The Tor Browser: security/nss/lib/freebl/mpi/hppa20.s@b8a032363ba2

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 ; This Source Code Form is subject to the terms of the Mozilla Public

     2 ; License, v. 2.0. If a copy of the MPL was not distributed with this

     3 ; file, You can obtain one at http://mozilla.org/MPL/2.0/.

     ; Assembler preamble.  The .LEVEL directive is chosen by the preprocessor:
     ; 2.0W when __LP64__ is defined, plain 2.0 otherwise (the older 1.1 / .ALLOW
     ; variant is left commented out).  The code is then placed in the $CODE$
     ; subspace of the $TEXT$ space.
     5 #ifdef __LP64__

     6         .LEVEL   2.0W

     7 #else

     8 ;       .LEVEL   1.1

     9 ;       .ALLOW   2.0N

    10         .LEVEL   2.0

    11 #endif

    12         .SPACE   $TEXT$,SORT=8

    13         .SUBSPA  $CODE$,QUAD=0,ALIGN=4,ACCESS=0x2c,CODE_ONLY,SORT=24

    15 ; ***************************************************************

    16 ;

    17 ;                 maxpy_[little/big]

    18 ;

    19 ; ***************************************************************

    21 ; There is no default -- you must specify one or the other.
    ; NOTE(review): the #define immediately below hard-wires LITTLE_WORDIAN in
    ; this copy, so the sentence above no longer describes this file.  If
    ; BIG_WORDIAN is also defined on the command line, both #ifdef sections
    ; below are processed and every offset macro is defined twice -- confirm
    ; that only the little-wordian build is intended.

    22 #define LITTLE_WORDIAN 1

    ; Byte offsets used by both routines.  EIGHT / SIXTEEN / THIRTY_TWO step
    ; one, two and four doublewords toward more significant words; the UN_*
    ; macros step the same distances back toward less significant words.
    24 #ifdef LITTLE_WORDIAN

    25 #define EIGHT 8

    26 #define SIXTEEN 16

    27 #define THIRTY_TWO 32

    28 #define UN_EIGHT -8

    29 #define UN_SIXTEEN -16

    30 #define UN_TWENTY_FOUR -24

    31 #endif

    ; Big-wordian: the same macros with the signs reversed.
    33 #ifdef BIG_WORDIAN

    34 #define EIGHT -8

    35 #define SIXTEEN -16

    36 #define THIRTY_TWO -32

    37 #define UN_EIGHT 8

    38 #define UN_SIXTEEN 16

    39 #define UN_TWENTY_FOUR 24

    40 #endif

    42 ; This performs a multiple-precision integer version of "daxpy",

    43 ; Using the selected addressing direction.  "Little-wordian" means that

    44 ; the least significant word of a number is stored at the lowest address.

    45 ; "Big-wordian" means that the most significant word is at the lowest

    46 ; address.  Either way, the incoming address of the vector is that

    47 ; of the least significant word.  That means that, for little-wordian

    48 ; addressing, we move the address upward as we propagate carries

    49 ; from the least significant word to the most significant.  For

    50 ; big-wordian we move the address downward.

    52 ; We use the following registers:

    53 ;

    54 ;     r2   return PC, of course

    55 ;     r26 = arg1 =  length

    56 ;     r25 = arg2 =  address of scalar

    57 ;     r24 = arg3 =  multiplicand vector

    58 ;     r23 = arg4 =  result vector

    59 ;

    60 ;     fr9 = scalar loaded once only from r25

    62 ; The cycle counts shown in the bodies below are simply the result of a

    63 ; scheduling by hand.  The actual PCX-U hardware does it differently.

    64 ; The intention is that the overall speed is the same.

    66 ; The pipeline startup and shutdown code is constructed in the usual way,

    67 ; by taking the loop bodies and removing unnecessary instructions.

    68 ; We have left the comments describing cycle numbers in the code.

    69 ; These are intended for reference when comparing with the main loop,

    70 ; and have no particular relationship to actual cycle numbers.

    ; maxpy_little / maxpy_big
    ;
    ; Multiplies the vector at r24 (r26 doublewords long) by the 64-bit scalar
    ; stored at 0(r25) and adds the product into the vector at r23, following
    ; the final carry as far as it goes.
    ;
    ; Each 64x64 product is assembled from four 32x32 XMPYU products of the
    ; L/R halves of fr9 (scalar) and the current multiplicand doubleword.
    ; The floating-point products are written to frame slots -104..-48(sp)
    ; with FSTD and read back into general registers with LDD, where the
    ; ADD / ADD,DC / SHRPD chain combines them.
    ;
    ; The routine is a hand-built software pipeline: two startup stages, a
    ; loop body that handles two multiplicand doublewords per iteration, and
    ; several shutdown paths.  Instructions that directly follow a branch sit
    ; in its delay slot.  Do not reorder instructions without re-deriving the
    ; carry chains.
    72 #ifdef LITTLE_WORDIAN

    73 maxpy_little

    74 #else

    75 maxpy_big

    76 #endif

    77         .PROC

    78         .CALLINFO FRAME=120,ENTRY_GR=4

    79         .ENTRY

    ; Prologue: save r3 (STW,MA also moves sp up by 128), then save r4.
    80         STW,MA  %r3,128(%sp)

    81         STW     %r4,-124(%sp)

    ; r26 becomes N-1 here.  The FLDD that follows is in the delay slot.
    83         ADDIB,< -1,%r26,$L0         ; If N = 0, exit immediately.

    84         FLDD    0(%r25),%fr9        ; fr9 = scalar

    86 ; First startup

    88         FLDD    0(%r24),%fr24       ; Cycle 1

    89         XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3

    90         XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4

    91         XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5

    92         CMPIB,> 3,%r26,$N_IS_SMALL  ; Pick out cases N = 1, 2, or 3

    93         XMPYU   %fr9L,%fr24R,%fr24  ; Cycle 6

    94         FLDD    EIGHT(%r24),%fr28   ; Cycle 8

    95         XMPYU   %fr9L,%fr28R,%fr31  ; Cycle 10

    96         FSTD    %fr24,-96(%sp)

    97         XMPYU   %fr9R,%fr28L,%fr30  ; Cycle 11

    98         FSTD    %fr25,-80(%sp)

    99         LDO     SIXTEEN(%r24),%r24  ; Cycle 12

   100         FSTD    %fr31,-64(%sp)

   101         XMPYU   %fr9R,%fr28R,%fr29  ; Cycle 13

   102         FSTD    %fr27,-48(%sp)

   104 ; Second startup

   106         XMPYU   %fr9L,%fr28L,%fr28  ; Cycle 1

   107         FSTD    %fr30,-56(%sp)

   108         FLDD    0(%r24),%fr24

   110         FSTD    %fr26,-88(%sp)      ; Cycle 2

   112         XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3

   113         FSTD    %fr28,-104(%sp)

   115         XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4

   116         LDD     -96(%sp),%r3

   117         FSTD    %fr29,-72(%sp)

   119         XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5

   120         LDD     -64(%sp),%r19

   121         LDD     -80(%sp),%r21

   123         XMPYU   %fr9L,%fr24R,%fr24  ; Cycle 6

   124         LDD     -56(%sp),%r20

   125         ADD     %r21,%r3,%r3

   127         ADD,DC  %r20,%r19,%r19      ; Cycle 7

   128         LDD     -88(%sp),%r4

   129         SHRPD   %r3,%r0,32,%r21

   130         LDD     -48(%sp),%r1

   132         FLDD    EIGHT(%r24),%fr28   ; Cycle 8

   133         LDD     -104(%sp),%r31

   134         ADD,DC  %r0,%r0,%r20

   135         SHRPD   %r19,%r3,32,%r3

   137         LDD     -72(%sp),%r29       ; Cycle 9

   138         SHRPD   %r20,%r19,32,%r20

   139         ADD     %r21,%r1,%r1

   141         XMPYU   %fr9L,%fr28R,%fr31  ; Cycle 10

   142         ADD,DC  %r3,%r4,%r4

   143         FSTD    %fr24,-96(%sp)

   145         XMPYU   %fr9R,%fr28L,%fr30  ; Cycle 11

   146         ADD,DC  %r0,%r20,%r20

   147         LDD     0(%r23),%r3

   148         FSTD    %fr25,-80(%sp)

   150         LDO     SIXTEEN(%r24),%r24  ; Cycle 12

   151         FSTD    %fr31,-64(%sp)

   153         XMPYU   %fr9R,%fr28R,%fr29  ; Cycle 13

   154         ADD     %r0,%r0,%r0         ; clear the carry bit

   155         ADDIB,<= -4,%r26,$ENDLOOP   ; actually happens in cycle 12

   156         FSTD    %fr27,-48(%sp)

   157 ;        MFCTL   %cr16,%r21         ; for timing

   158 ;        STD     %r21,-112(%sp)

   160 ; Here is the loop.
    ; Each pass consumes two multiplicand doublewords (r24 += SIXTEEN) and
    ; writes two result doublewords (r23 += SIXTEEN); r26 is stepped by -2.

   162 $LOOP   XMPYU   %fr9L,%fr28L,%fr28  ; Cycle 1

   163         ADD,DC  %r29,%r4,%r4

   164         FSTD    %fr30,-56(%sp)

   165         FLDD    0(%r24),%fr24

   167         LDO     SIXTEEN(%r23),%r23  ; Cycle 2

   168         ADD,DC  %r0,%r20,%r20

   169         FSTD    %fr26,-88(%sp)

   171         XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3

   172         ADD     %r3,%r1,%r1

   173         FSTD    %fr28,-104(%sp)

   174         LDD     UN_EIGHT(%r23),%r21

   176         XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4

   177         ADD,DC  %r21,%r4,%r28

   178         FSTD    %fr29,-72(%sp)

   179         LDD     -96(%sp),%r3

   181         XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5

   182         ADD,DC  %r20,%r31,%r22

   183         LDD     -64(%sp),%r19

   184         LDD     -80(%sp),%r21

   186         XMPYU   %fr9L,%fr24R,%fr24  ; Cycle 6

   187         ADD     %r21,%r3,%r3

   188         LDD     -56(%sp),%r20

   189         STD     %r1,UN_SIXTEEN(%r23)

   191         ADD,DC  %r20,%r19,%r19      ; Cycle 7

   192         SHRPD   %r3,%r0,32,%r21

   193         LDD     -88(%sp),%r4

   194         LDD     -48(%sp),%r1

   196         ADD,DC  %r0,%r0,%r20        ; Cycle 8

   197         SHRPD   %r19,%r3,32,%r3

   198         FLDD    EIGHT(%r24),%fr28

   199         LDD     -104(%sp),%r31

   201         SHRPD   %r20,%r19,32,%r20   ; Cycle 9

   202         ADD     %r21,%r1,%r1

   203         STD     %r28,UN_EIGHT(%r23)

   204         LDD     -72(%sp),%r29

   206         XMPYU   %fr9L,%fr28R,%fr31  ; Cycle 10

   207         ADD,DC  %r3,%r4,%r4

   208         FSTD    %fr24,-96(%sp)

   210         XMPYU   %fr9R,%fr28L,%fr30  ; Cycle 11

   211         ADD,DC  %r0,%r20,%r20

   212         FSTD    %fr25,-80(%sp)

   213         LDD     0(%r23),%r3

   215         LDO     SIXTEEN(%r24),%r24  ; Cycle 12

   216         FSTD    %fr31,-64(%sp)

   218         XMPYU   %fr9R,%fr28R,%fr29  ; Cycle 13

   219         ADD     %r22,%r1,%r1

   220         ADDIB,> -2,%r26,$LOOP       ; actually happens in cycle 12

   221         FSTD    %fr27,-48(%sp)

   223 $ENDLOOP

   225 ; Shutdown code, first stage.

   227 ;        MFCTL   %cr16,%r21         ; for timing

   228 ;        STD     %r21,UN_SIXTEEN(%r23)

   229 ;        LDD     -112(%sp),%r21

   230 ;        STD     %r21,UN_EIGHT(%r23)

   232         XMPYU   %fr9L,%fr28L,%fr28  ; Cycle 1

   233         ADD,DC  %r29,%r4,%r4

    ; r26 == 0 here means one more multiplicand doubleword is still pending.
   234         CMPIB,= 0,%r26,$ONEMORE

   235         FSTD    %fr30,-56(%sp)

   237         LDO     SIXTEEN(%r23),%r23  ; Cycle 2

   238         ADD,DC  %r0,%r20,%r20

   239         FSTD    %fr26,-88(%sp)

   241         ADD     %r3,%r1,%r1         ; Cycle 3

   242         FSTD    %fr28,-104(%sp)

   243         LDD     UN_EIGHT(%r23),%r21

   245         ADD,DC  %r21,%r4,%r28       ; Cycle 4

   246         FSTD    %fr29,-72(%sp)

   247         STD     %r28,UN_EIGHT(%r23) ; moved up from cycle 9

   248         LDD     -96(%sp),%r3

   250         ADD,DC  %r20,%r31,%r22      ; Cycle 5

   251         STD     %r1,UN_SIXTEEN(%r23)

    ; The N = 2 path joins here with r22 = 0.
   252 $JOIN4

   253         LDD     -64(%sp),%r19

   254         LDD     -80(%sp),%r21

   256         ADD     %r21,%r3,%r3        ; Cycle 6

   257         LDD     -56(%sp),%r20

   259         ADD,DC  %r20,%r19,%r19      ; Cycle 7

   260         SHRPD   %r3,%r0,32,%r21

   261         LDD     -88(%sp),%r4

   262         LDD     -48(%sp),%r1

   264         ADD,DC  %r0,%r0,%r20        ; Cycle 8

   265         SHRPD   %r19,%r3,32,%r3

   266         LDD     -104(%sp),%r31

   268         SHRPD   %r20,%r19,32,%r20   ; Cycle 9

   269         ADD     %r21,%r1,%r1

   270         LDD     -72(%sp),%r29

   272         ADD,DC  %r3,%r4,%r4         ; Cycle 10

   274         ADD,DC  %r0,%r20,%r20       ; Cycle 11

   275         LDD     0(%r23),%r3

   277         ADD     %r22,%r1,%r1        ; Cycle 13

   279 ; Shutdown code, second stage.

   281         ADD,DC  %r29,%r4,%r4        ; Cycle 1

   283         LDO     SIXTEEN(%r23),%r23  ; Cycle 2

   284         ADD,DC  %r0,%r20,%r20

   286         LDD     UN_EIGHT(%r23),%r21 ; Cycle 3

   287         ADD     %r3,%r1,%r1

   289         ADD,DC  %r21,%r4,%r28       ; Cycle 4

   291         ADD,DC  %r20,%r31,%r22      ; Cycle 5

   293         STD     %r1,UN_SIXTEEN(%r23); Cycle 6

   295         STD     %r28,UN_EIGHT(%r23) ; Cycle 9

   297         LDD     0(%r23),%r3         ; Cycle 11

   299 ; Shutdown code, third stage.

   301         LDO     SIXTEEN(%r23),%r23

   302         ADD     %r3,%r22,%r1

    ; r21 = carry out of the last add.  The STD in the delay slot writes the
    ; final sum word whether or not the branch is taken.
   303 $JOIN1  ADD,DC  %r0,%r0,%r21

   304         CMPIB,*= 0,%r21,$L0         ; if no overflow, exit

   305         STD     %r1,UN_SIXTEEN(%r23)

   307 ; Final carry propagation
    ; Add 1 to successive result doublewords, continuing for as long as the
    ; incremented doubleword comes out as zero.

   309 $FINAL1 LDO     EIGHT(%r23),%r23

   310         LDD     UN_SIXTEEN(%r23),%r21

   311         ADDI    1,%r21,%r21

   312         CMPIB,*= 0,%r21,$FINAL1     ; Keep looping if there is a carry.

   313         STD     %r21,UN_SIXTEEN(%r23)

   314         B       $L0

   315         NOP

   317 ; Here is the code that handles the difficult cases N=1, N=2, and N=3.

   318 ; We do the usual trick -- branch out of the startup code at appropriate

   319 ; points, and branch into the shutdown code.

    ; On entry r26 holds N-1 (0, 1 or 2).
   321 $N_IS_SMALL

   322         CMPIB,= 0,%r26,$N_IS_ONE

   323         FSTD    %fr24,-96(%sp)      ; Cycle 10

   324         FLDD    EIGHT(%r24),%fr28   ; Cycle 8

   325         XMPYU   %fr9L,%fr28R,%fr31  ; Cycle 10

   326         XMPYU   %fr9R,%fr28L,%fr30  ; Cycle 11

   327         FSTD    %fr25,-80(%sp)

   328         FSTD    %fr31,-64(%sp)      ; Cycle 12

   329         XMPYU   %fr9R,%fr28R,%fr29  ; Cycle 13

   330         FSTD    %fr27,-48(%sp)

   331         XMPYU   %fr9L,%fr28L,%fr28  ; Cycle 1

   332         CMPIB,= 2,%r26,$N_IS_THREE

   333         FSTD    %fr30,-56(%sp)

   335 ; N = 2

   336         FSTD    %fr26,-88(%sp)      ; Cycle 2

   337         FSTD    %fr28,-104(%sp)     ; Cycle 3

   338         LDD     -96(%sp),%r3        ; Cycle 4

   339         FSTD    %fr29,-72(%sp)

   340         B       $JOIN4

   341         ADD     %r0,%r0,%r22

   343 $N_IS_THREE

   344         FLDD    SIXTEEN(%r24),%fr24

   345         FSTD    %fr26,-88(%sp)      ; Cycle 2

   346         XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3

   347         FSTD    %fr28,-104(%sp)

   348         XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4

   349         LDD     -96(%sp),%r3

   350         FSTD    %fr29,-72(%sp)

   351         XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5

   352         LDD     -64(%sp),%r19

   353         LDD     -80(%sp),%r21

   354         B       $JOIN3

   355         ADD     %r0,%r0,%r22

   357 $N_IS_ONE

   358         FSTD    %fr25,-80(%sp)

   359         FSTD    %fr27,-48(%sp)

   360         FSTD    %fr26,-88(%sp)      ; Cycle 2

   361         B       $JOIN5

   362         ADD     %r0,%r0,%r22

   364 ; We came out of the unrolled loop with wrong parity.  Do one more

   365 ; single cycle.  This is quite tricky, because of the way the

   366 ; carry chains and SHRPD chains have been chopped up.

   368 $ONEMORE

   370         FLDD    0(%r24),%fr24

   372         LDO     SIXTEEN(%r23),%r23  ; Cycle 2

   373         ADD,DC  %r0,%r20,%r20

   374         FSTD    %fr26,-88(%sp)

   376         XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3

   377         FSTD    %fr28,-104(%sp)

   378         LDD     UN_EIGHT(%r23),%r21

   379         ADD     %r3,%r1,%r1

   381         XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4

   382         ADD,DC  %r21,%r4,%r28

   383         STD     %r28,UN_EIGHT(%r23) ; moved from cycle 9

   384         LDD     -96(%sp),%r3

   385         FSTD    %fr29,-72(%sp)

   387         XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5

   388         ADD,DC  %r20,%r31,%r22

   389         LDD     -64(%sp),%r19

   390         LDD     -80(%sp),%r21

   392         STD     %r1,UN_SIXTEEN(%r23); Cycle 6

    ; The N = 3 path joins here with r22 = 0.
   393 $JOIN3

   394         XMPYU   %fr9L,%fr24R,%fr24

   395         LDD     -56(%sp),%r20

   396         ADD     %r21,%r3,%r3

   398         ADD,DC  %r20,%r19,%r19      ; Cycle 7

   399         LDD     -88(%sp),%r4

   400         SHRPD   %r3,%r0,32,%r21

   401         LDD     -48(%sp),%r1

   403         LDD     -104(%sp),%r31      ; Cycle 8

   404         ADD,DC  %r0,%r0,%r20

   405         SHRPD   %r19,%r3,32,%r3

   407         LDD     -72(%sp),%r29       ; Cycle 9

   408         SHRPD   %r20,%r19,32,%r20

   409         ADD     %r21,%r1,%r1

   411         ADD,DC  %r3,%r4,%r4         ; Cycle 10

   412         FSTD    %fr24,-96(%sp)

   414         ADD,DC  %r0,%r20,%r20       ; Cycle 11

   415         LDD     0(%r23),%r3

   416         FSTD    %fr25,-80(%sp)

   418         ADD     %r22,%r1,%r1        ; Cycle 13

   419         FSTD    %fr27,-48(%sp)

   421 ; Shutdown code, stage 1-1/2.

   423         ADD,DC  %r29,%r4,%r4        ; Cycle 1

   425         LDO     SIXTEEN(%r23),%r23  ; Cycle 2

   426         ADD,DC  %r0,%r20,%r20

   427         FSTD    %fr26,-88(%sp)

   429         LDD     UN_EIGHT(%r23),%r21 ; Cycle 3

   430         ADD     %r3,%r1,%r1

   432         ADD,DC  %r21,%r4,%r28       ; Cycle 4

   433         STD     %r28,UN_EIGHT(%r23) ; moved from cycle 9

   435         ADD,DC  %r20,%r31,%r22      ; Cycle 5

   436         STD     %r1,UN_SIXTEEN(%r23)

    ; The N = 1 path joins here with r22 = 0.
   437 $JOIN5

   438         LDD     -96(%sp),%r3        ; moved from cycle 4

   439         LDD     -80(%sp),%r21

   440         ADD     %r21,%r3,%r3        ; Cycle 6

   441         ADD,DC  %r0,%r0,%r19        ; Cycle 7

   442         LDD     -88(%sp),%r4

   443         SHRPD   %r3,%r0,32,%r21

   444         LDD     -48(%sp),%r1

   445         SHRPD   %r19,%r3,32,%r3     ; Cycle 8

   446         ADD     %r21,%r1,%r1        ; Cycle 9

   447         ADD,DC  %r3,%r4,%r4         ; Cycle 10

   448         LDD     0(%r23),%r3         ; Cycle 11

   449         ADD     %r22,%r1,%r1        ; Cycle 13

   451 ; Shutdown code, stage 2-1/2.

   453         ADD,DC  %r0,%r4,%r4         ; Cycle 1

   454         LDO     SIXTEEN(%r23),%r23  ; Cycle 2

   455         LDD     UN_EIGHT(%r23),%r21 ; Cycle 3

   456         ADD     %r3,%r1,%r1

   457         STD     %r1,UN_SIXTEEN(%r23)

   458         ADD,DC  %r21,%r4,%r1

   459         B       $JOIN1

   460         LDO     EIGHT(%r23),%r23

   462 ; exit
    ; Epilogue: reload r4, return through r2, and in the delay slot of the
    ; BVE reload r3 while LDW,MB moves sp back down by 128.

   464 $L0

   465         LDW     -124(%sp),%r4

   466         BVE     (%r2)

   467         .EXIT

   468         LDW,MB  -128(%sp),%r3

   470         .PROCEND

   472 ; ***************************************************************

   473 ;

   474 ;                 add_diag_[little/big]

   475 ;

   476 ; ***************************************************************

   478 ; The arguments are as follows:

   479 ;     r2   return PC, of course

   480 ;     r26 = arg1 =  length

   481 ;     r25 = arg2 =  vector to square

   482 ;     r24 = arg3 =  result vector

   ; add_diag_little / add_diag_big
   ;
   ; For each of the r26 doublewords x of the vector at r25, forms x*x from
   ; three XMPYU products (R*R, L*R, L*L) and adds that 128-bit square into
   ; the result vector at r24.  The L*R cross term is moved left by 33 bits
   ; with the SHRPD-by-31 pair, i.e. doubled and aligned to the middle of the
   ; square.  The result pointer advances two doublewords for every input
   ; doubleword, and the final carry is propagated as far as it goes.
   ;
   ; As in maxpy, floating-point products travel to the general registers
   ; through frame slots (FSTD then LDD), and the instruction that follows a
   ; branch is in its delay slot.  The loop is unrolled twice ("alternate
   ; body" then "main body"), so there are two shutdown paths depending on
   ; parity.
   484 #ifdef LITTLE_WORDIAN

   485 add_diag_little

   486 #else

   487 add_diag_big

   488 #endif

   489         .PROC

   490         .CALLINFO FRAME=120,ENTRY_GR=4

   491         .ENTRY

   ; Prologue: save r3 (STW,MA also moves sp up by 128), then save r4.
   492         STW,MA  %r3,128(%sp)

   493         STW     %r4,-124(%sp)

   ; r26 becomes N-1 here.
   495         ADDIB,< -1,%r26,$Z0         ; If N=0, exit immediately.

   496         NOP

   498 ; Startup code

   500         FLDD    0(%r25),%fr7        ; Cycle 2 (alternate body)

   501         XMPYU   %fr7R,%fr7R,%fr29   ; Cycle 4

   502         XMPYU   %fr7L,%fr7R,%fr27   ; Cycle 5

   503         XMPYU   %fr7L,%fr7L,%fr30

   504         LDO     SIXTEEN(%r25),%r25  ; Cycle 6

   505         FSTD    %fr29,-88(%sp)

   506         FSTD    %fr27,-72(%sp)      ; Cycle 7

   507         CMPIB,= 0,%r26,$DIAG_N_IS_ONE ; Cycle 1 (main body)

   508         FSTD    %fr30,-96(%sp)

   509         FLDD    UN_EIGHT(%r25),%fr7 ; Cycle 2

   510         LDD     -88(%sp),%r22       ; Cycle 3

   511         LDD     -72(%sp),%r31       ; Cycle 4

   512         XMPYU   %fr7R,%fr7R,%fr28

   513         XMPYU   %fr7L,%fr7R,%fr24   ; Cycle 5

   514         XMPYU   %fr7L,%fr7L,%fr31

   515         LDD     -96(%sp),%r20       ; Cycle 6

   516         FSTD    %fr28,-80(%sp)

   517         ADD     %r0,%r0,%r0         ; clear the carry bit

   518         ADDIB,<= -2,%r26,$ENDDIAGLOOP ; Cycle 7

   519         FSTD    %fr24,-64(%sp)

   521 ; Here is the loop.  It is unrolled twice, modelled after the "alternate body" and then the "main body".
   ; Each pass consumes two input doublewords (r25 += SIXTEEN) and updates
   ; four result doublewords (r24 += THIRTY_TWO); r26 is stepped by -2.

   523 $DIAGLOOP

   524         SHRPD   %r31,%r0,31,%r3     ; Cycle 1 (alternate body)

   525         LDO     SIXTEEN(%r25),%r25

   526         LDD     0(%r24),%r1

   527         FSTD    %fr31,-104(%sp)

   528         SHRPD   %r0,%r31,31,%r4     ; Cycle 2

   529         ADD,DC  %r22,%r3,%r3

   530         FLDD    UN_SIXTEEN(%r25),%fr7

   531         ADD,DC  %r0,%r20,%r20       ; Cycle 3

   532         ADD     %r1,%r3,%r3

   533         XMPYU   %fr7R,%fr7R,%fr29   ; Cycle 4

   534         LDD     -80(%sp),%r21

   535         STD     %r3,0(%r24)

   536         XMPYU   %fr7L,%fr7R,%fr27   ; Cycle 5

   537         XMPYU   %fr7L,%fr7L,%fr30

   538         LDD     -64(%sp),%r29

   539         LDD     EIGHT(%r24),%r1

   540         ADD,DC  %r4,%r20,%r20       ; Cycle 6

   541         LDD     -104(%sp),%r19

   542         FSTD    %fr29,-88(%sp)

   543         ADD     %r20,%r1,%r1        ; Cycle 7

   544         FSTD    %fr27,-72(%sp)

   545         SHRPD   %r29,%r0,31,%r4     ; Cycle 1 (main body)

   546         LDO     THIRTY_TWO(%r24),%r24

   547         LDD     UN_SIXTEEN(%r24),%r28

   548         FSTD    %fr30,-96(%sp)

   549         SHRPD   %r0,%r29,31,%r3     ; Cycle 2

   550         ADD,DC  %r21,%r4,%r4

   551         FLDD    UN_EIGHT(%r25),%fr7

   552         STD     %r1,UN_TWENTY_FOUR(%r24)

   553         ADD,DC  %r0,%r19,%r19       ; Cycle 3

   554         ADD     %r28,%r4,%r4

   555         XMPYU   %fr7R,%fr7R,%fr28   ; Cycle 4

   556         LDD     -88(%sp),%r22

   557         STD     %r4,UN_SIXTEEN(%r24)

   558         XMPYU   %fr7L,%fr7R,%fr24   ; Cycle 5

   559         XMPYU   %fr7L,%fr7L,%fr31

   560         LDD     -72(%sp),%r31

   561         LDD     UN_EIGHT(%r24),%r28

   562         ADD,DC  %r3,%r19,%r19       ; Cycle 6

   563         LDD     -96(%sp),%r20

   564         FSTD    %fr28,-80(%sp)

   565         ADD     %r19,%r28,%r28      ; Cycle 7

   566         FSTD    %fr24,-64(%sp)

   567         ADDIB,> -2,%r26,$DIAGLOOP   ; Cycle 8

   568         STD     %r28,UN_EIGHT(%r24)

   570 $ENDDIAGLOOP

   572         ADD,DC  %r0,%r22,%r22

   ; r26 == 0 here means one more input doubleword is still pending.
   573         CMPIB,= 0,%r26,$ONEMOREDIAG

   574         SHRPD   %r31,%r0,31,%r3

   576 ; Shutdown code, first stage.

   578         FSTD    %fr31,-104(%sp)     ; Cycle 1 (alternate body)

   579         LDD     0(%r24),%r28

   580         SHRPD   %r0,%r31,31,%r4     ; Cycle 2

   581         ADD     %r3,%r22,%r3

   582         ADD,DC  %r0,%r20,%r20       ; Cycle 3

   583         LDD     -80(%sp),%r21

   584         ADD     %r3,%r28,%r3

   585         LDD     -64(%sp),%r29       ; Cycle 4

   586         STD     %r3,0(%r24)

   587         LDD     EIGHT(%r24),%r1     ; Cycle 5

   588         LDO     SIXTEEN(%r25),%r25  ; Cycle 6

   589         LDD     -104(%sp),%r19

   590         ADD,DC  %r4,%r20,%r20

   591         ADD     %r20,%r1,%r1        ; Cycle 7

   592         ADD,DC  %r0,%r21,%r21       ; Cycle 8

   593         STD     %r1,EIGHT(%r24)

   595 ; Shutdown code, second stage.

   597         SHRPD   %r29,%r0,31,%r4     ; Cycle 1 (main body)

   598         LDO     THIRTY_TWO(%r24),%r24

   599         LDD     UN_SIXTEEN(%r24),%r1

   600         SHRPD   %r0,%r29,31,%r3      ; Cycle 2

   601         ADD     %r4,%r21,%r4

   602         ADD,DC  %r0,%r19,%r19       ; Cycle 3

   603         ADD     %r4,%r1,%r4

   604         STD     %r4,UN_SIXTEEN(%r24); Cycle 4

   605         LDD     UN_EIGHT(%r24),%r28 ; Cycle 5

   606         ADD,DC  %r3,%r19,%r19       ; Cycle 6

   607         ADD     %r19,%r28,%r28      ; Cycle 7

   ; r22 = carry out of the last add.  The STD in the delay slot writes the
   ; final sum word whether or not the branch is taken.
   608         ADD,DC  %r0,%r0,%r22        ; Cycle 8

   609         CMPIB,*= 0,%r22,$Z0         ; if no overflow, exit

   610         STD     %r28,UN_EIGHT(%r24)

   612 ; Final carry propagation
   ; Add 1 to successive result doublewords, continuing for as long as the
   ; incremented doubleword comes out as zero.

   614 $FDIAG2

   615         LDO     EIGHT(%r24),%r24

   616         LDD     UN_EIGHT(%r24),%r26

   617         ADDI    1,%r26,%r26

   618         CMPIB,*= 0,%r26,$FDIAG2     ; Keep looping if there is a carry.

   619         STD     %r26,UN_EIGHT(%r24)

   621         B   $Z0

   622         NOP

   624 ; Here is the code that handles the difficult case N=1.

   625 ; We do the usual trick -- branch out of the startup code at appropriate

   626 ; points, and branch into the shutdown code.

   628 $DIAG_N_IS_ONE

   630         LDD     -88(%sp),%r22

   631         LDD     -72(%sp),%r31

   632         B       $JOINDIAG

   633         LDD     -96(%sp),%r20

   635 ; We came out of the unrolled loop with wrong parity.  Do one more

   636 ; single cycle.  This is the "alternate body".  It will, of course,

   637 ; give us opposite registers from the other case, so we need

   638 ; completely different shutdown code.

   640 $ONEMOREDIAG

   641         FSTD    %fr31,-104(%sp)     ; Cycle 1 (alternate body)

   642         LDD     0(%r24),%r28

   643         FLDD    0(%r25),%fr7        ; Cycle 2

   644         SHRPD   %r0,%r31,31,%r4

   645         ADD     %r3,%r22,%r3

   646         ADD,DC  %r0,%r20,%r20       ; Cycle 3

   647         LDD     -80(%sp),%r21

   648         ADD     %r3,%r28,%r3

   649         LDD     -64(%sp),%r29       ; Cycle 4

   650         STD     %r3,0(%r24)

   651         XMPYU   %fr7R,%fr7R,%fr29

   652         LDD     EIGHT(%r24),%r1     ; Cycle 5

   653         XMPYU   %fr7L,%fr7R,%fr27

   654         XMPYU   %fr7L,%fr7L,%fr30

   655         LDD     -104(%sp),%r19      ; Cycle 6

   656         FSTD    %fr29,-88(%sp)

   657         ADD,DC  %r4,%r20,%r20

   658         FSTD    %fr27,-72(%sp)      ; Cycle 7

   659         ADD     %r20,%r1,%r1

   660         ADD,DC  %r0,%r21,%r21       ; Cycle 8

   661         STD     %r1,EIGHT(%r24)

   663 ; Shutdown code, first stage.

   665         SHRPD   %r29,%r0,31,%r4     ; Cycle 1 (main body)

   666         LDO     THIRTY_TWO(%r24),%r24

   667         FSTD    %fr30,-96(%sp)

   668         LDD     UN_SIXTEEN(%r24),%r1

   669         SHRPD   %r0,%r29,31,%r3     ; Cycle 2

   670         ADD     %r4,%r21,%r4

   671         ADD,DC  %r0,%r19,%r19       ; Cycle 3

   672         LDD     -88(%sp),%r22

   673         ADD     %r4,%r1,%r4

   674         LDD     -72(%sp),%r31       ; Cycle 4

   675         STD     %r4,UN_SIXTEEN(%r24)

   676         LDD     UN_EIGHT(%r24),%r28 ; Cycle 5

   677         LDD     -96(%sp),%r20       ; Cycle 6

   678         ADD,DC  %r3,%r19,%r19

   679         ADD     %r19,%r28,%r28      ; Cycle 7

   680         ADD,DC  %r0,%r22,%r22       ; Cycle 8

   681         STD     %r28,UN_EIGHT(%r24)

   683 ; Shutdown code, second stage.

   ; The N = 1 path joins here with r22, r31 and r20 loaded from the frame.
   685 $JOINDIAG

   686         SHRPD   %r31,%r0,31,%r3     ; Cycle 1 (alternate body)

   687         LDD     0(%r24),%r28

   688         SHRPD   %r0,%r31,31,%r4     ; Cycle 2

   689         ADD     %r3,%r22,%r3

   690         ADD,DC  %r0,%r20,%r20       ; Cycle 3

   691         ADD     %r3,%r28,%r3

   692         STD     %r3,0(%r24)         ; Cycle 4

   693         LDD     EIGHT(%r24),%r1     ; Cycle 5

   694         ADD,DC  %r4,%r20,%r20

   695         ADD     %r20,%r1,%r1        ; Cycle 7

   ; r21 = carry out of the last add.  The STD in the delay slot writes the
   ; final sum word whether or not the branch is taken.
   696         ADD,DC  %r0,%r0,%r21        ; Cycle 8

   697         CMPIB,*= 0,%r21,$Z0         ; if no overflow, exit

   698         STD     %r1,EIGHT(%r24)

   700 ; Final carry propagation
   ; Same scheme as $FDIAG2, but addressed at EIGHT(r24) after the pointer
   ; bump; when the loop ends, execution falls through into $Z0.

   702 $FDIAG1

   703         LDO     EIGHT(%r24),%r24

   704         LDD     EIGHT(%r24),%r26

   705         ADDI    1,%r26,%r26

   706         CMPIB,*= 0,%r26,$FDIAG1    ; Keep looping if there is a carry.

   707         STD     %r26,EIGHT(%r24)

   ; Epilogue: reload r4, return through r2, and in the delay slot of the
   ; BVE reload r3 while LDW,MB moves sp back down by 128.
   709 $Z0

   710         LDW     -124(%sp),%r4

   711         BVE     (%r2)

   712         .EXIT

   713         LDW,MB  -128(%sp),%r3

   714         .PROCEND

   715 ;	.ALLOW

   ; Symbol exports.  Both entry points are exported with general-register
   ; argument words (four for maxpy_*, three for add_diag_*); the set of names
   ; exported follows LITTLE_WORDIAN, matching the labels used above.
   ; NOTE(review): only the little-wordian exports have a __GNUC__ variant
   ; without LONG_RETURN.  The big-wordian exports always use LONG_RETURN,
   ; which the comment below says GNU as does not accept -- confirm whether the
   ; big-wordian build is ever assembled with GNU as.
   717         .SPACE         $TEXT$

   718         .SUBSPA        $CODE$

   719 #ifdef LITTLE_WORDIAN

   720 #ifdef __GNUC__

   721 ; GNU-as (as of 2.19) does not support LONG_RETURN

   722         .EXPORT        maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR

   723         .EXPORT        add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR

   724 #else

   725         .EXPORT        maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN

   726         .EXPORT        add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN

   727 #endif

   728 #else

   729         .EXPORT        maxpy_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN

   730         .EXPORT        add_diag_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN

   731 #endif

   732         .END

   735 ; How to use "maxpy_PA20_little" and "maxpy_PA20_big" (exported from this file as "maxpy_little" and "maxpy_big")

   736 ;

   737 ; The routine "maxpy_PA20_little" or "maxpy_PA20_big"

   738 ; performs a 64-bit x any-size multiply, and adds the

   739 ; result to an area of memory.  That is, it performs

   740 ; something like

   741 ;

   742 ;      A B C D

   743 ;    *       Z

   744 ;   __________

   745 ;    P Q R S T

   746 ;

   747 ; and then adds the "PQRST" vector into an area of memory,

   748 ; handling all carries.

   749 ;

   750 ; Digression on nomenclature and endian-ness:

   751 ;

   752 ; Each of the capital letters in the above represents a 64-bit

   753 ; quantity.  That is, you could think of the discussion as

   754 ; being in terms of radix-16-quintillion arithmetic.  The data

   755 ; type being manipulated is "unsigned long long int".  This

   756 ; requires the 64-bit extension of the HP-UX C compiler,

   757 ; available at release 10.  You need these compiler flags to

   758 ; enable these extensions:

   759 ;

   760 ;       -Aa +e +DA2.0 +DS2.0

   761 ;

   762 ; (The first specifies ANSI C, the second enables the

   763 ; extensions, which are beyond ANSI C, and the third and

   764 ; fourth tell the compiler to use whatever features of the

   765 ; PA2.0 architecture it wishes, in order to make the code more

   766 ; efficient.  Since the presence of the assembly code will

   767 ; make the program unable to run on anything less than PA2.0,

   768 ; you might as well gain the performance enhancements in the C

   769 ; code as well.)

   770 ;

   771 ; Questions of "endian-ness" often come up, usually in the

   772 ; context of byte ordering in a word.  These routines have a

   773 ; similar issue, that could be called "wordian-ness".

   774 ; Independent of byte ordering (PA is always big-endian), one

   775 ; can make two choices when representing extremely large

   776 ; numbers as arrays of 64-bit doublewords in memory.

   777 ;

   778 ; "Little-wordian" layout means that the least significant

   779 ; word of a number is stored at the lowest address.

   780 ;

   781 ;   MSW     LSW

   782 ;    |       |

   783 ;    V       V

   784 ;

   785 ;    A B C D E

   786 ;

   787 ;    ^     ^ ^

   788 ;    |     | |____ address 0

   789 ;    |     |

   790 ;    |     |_______address 8

   791 ;    |

   792 ;    address 32

   793 ;

   794 ; "Big-wordian" means that the most significant word is at the

   795 ; lowest address.

   796 ;

   797 ;   MSW     LSW

   798 ;    |       |

   799 ;    V       V

   800 ;

   801 ;    A B C D E

   802 ;

   803 ;    ^     ^ ^

   804 ;    |     | |____ address 32

   805 ;    |     |

   806 ;    |     |_______address 24

   807 ;    |

   808 ;    address 0

   809 ;

   810 ; When you compile the file, you must specify one or the other, with

   811 ; a switch "-DLITTLE_WORDIAN" or "-DBIG_WORDIAN".  (This copy of the file already defines LITTLE_WORDIAN itself, near the top.)

   812 ;

   813 ;     Incidentally, you assemble this file as part of your

   814 ;     project with the same C compiler as the rest of the program.

   815 ;     My "makefile" for a superprecision arithmetic package has

   816 ;     the following stuff:

   817 ;

   818 ;     # definitions:

   819 ;     CC = cc -Aa +e -z +DA2.0 +DS2.0 +w1

   820 ;     CFLAGS = +O3

   821 ;     LDFLAGS = -L /usr/lib -Wl,-aarchive

   822 ;

   823 ;     # general build rule for ".s" files:

   824 ;     .s.o:

   825 ;             $(CC) $(CFLAGS) -c $< -DBIG_WORDIAN

   826 ;

   827 ;     # Now any bind step that calls for pa20.o will assemble pa20.s

   828 ;

   829 ; End of digression, back to arithmetic:

   830 ;

   831 ; The way we multiply two huge numbers is, of course, to multiply

   832 ; the "ABCD" vector by each of the "WXYZ" doublewords, adding

   833 ; the result vectors with increasing offsets, the way we learned

   834 ; in school, back before we all used calculators:

   835 ;

   836 ;            A B C D

   837 ;          * W X Y Z

   838 ;         __________

   839 ;          P Q R S T

   840 ;        E F G H I

   841 ;      M N O P Q

   842 ;  + R S T U V

   843 ;    _______________

   844 ;    F I N A L S U M

   845 ;

   846 ; So we call maxpy_PA20_big (in my case; my package is

   847 ; big-wordian) repeatedly, giving the W, X, Y, and Z arguments

   848 ; in turn as the "scalar", and giving the "ABCD" vector each

   849 ; time.  We direct it to add its result into an area of memory

   850 ; that we have cleared at the start.  We skew the exact

   851 ; location into that area with each call.

   852 ;

   853 ; The prototype for the function is

   854 ;

   855 ; extern void maxpy_PA20_big(

   856 ;    int length,        /* Number of doublewords in the multiplicand vector. */

   857 ;    const long long int *scalaraddr,    /* Address to fetch the scalar. */

   858 ;    const long long int *multiplicand,  /* The multiplicand vector. */

   859 ;    long long int *result);             /* Where to accumulate the result. */

   860 ;

   861 ; (You should place a copy of this prototype in an include file

   862 ; or in your C file.)

   863 ;

   864 ; Now, IN ALL CASES, the given address for the multiplicand or

   865 ; the result is that of the LEAST SIGNIFICANT DOUBLEWORD.

   866 ; That word is, of course, the word at which the routine

   867 ; starts processing.  "maxpy_PA20_little" then increases the

   868 ; addresses as it computes.  "maxpy_PA20_big" decreases them.

   869 ;

   870 ; In our example above, "length" would be 4 in each case.

   871 ; "multiplicand" would be the "ABCD" vector.  Specifically,

   872 ; the address of the element "D".  "scalaraddr" would be the

   873 ; address of "W", "X", "Y", or "Z" on the four calls that we

   874 ; would make.  (The order doesn't matter, of course.)

   875 ; "result" would be the appropriate address in the result

   876 ; area.  When multiplying by "Z", that would be the least

   877 ; significant word.  When multiplying by "Y", it would be the

   878 ; next higher word (8 bytes higher if little-wordian; 8 bytes

   879 ; lower if big-wordian), and so on.  The size of the result

   880 ; area must be the sum of the sizes of the multiplicand

   881 ; and multiplier vectors, and must be initialized to zero

   882 ; before we start.

   883 ;

   884 ; Whenever the routine adds its partial product into the result

   885 ; vector, it follows carry chains as far as they need to go.

   886 ;

   887 ; Here is the super-precision multiply routine that I use for

   888 ; my package.  The package is big-wordian.  I have taken out

   889 ; handling of exponents (it's a floating point package):

   890 ;

   891 ; static void mul_PA20(

   892 ;   int size,

   893 ;   const long long int *arg1,

   894 ;   const long long int *arg2,

   895 ;   long long int *result)

   896 ; {

   897 ;    int i;

   898 ;

   899 ;    for (i=0 ; i<2*size ; i++) result[i] = 0ULL;

   900 ;

   901 ;    for (i=0 ; i<size ; i++) {

   902 ;       maxpy_PA20_big(size, &arg2[i], &arg1[size-1], &result[size+i]);

   903 ;    }

   904 ; }

The Tor Browser / file revision

security/nss/lib/freebl/mpi/hppa20.s@b8a032363ba2

security/nss/lib/freebl/mpi/hppa20.s