security/nss/lib/freebl/mpi/hppa20.s

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 ; This Source Code Form is subject to the terms of the Mozilla Public
     2 ; License, v. 2.0. If a copy of the MPL was not distributed with this
     3 ; file, You can obtain one at http://mozilla.org/MPL/2.0/.
     5 #ifdef __LP64__
       ; Built with __LP64__ defined: assemble at architecture level 2.0W.
     6         .LEVEL   2.0W
     7 #else
       ; Otherwise assemble at level 2.0.  The two commented-out directives
       ; below are an earlier 1.1 / .ALLOW 2.0N setting that is not in effect.
     8 ;       .LEVEL   1.1
     9 ;       .ALLOW   2.0N
    10         .LEVEL   2.0
    11 #endif
       ; Everything that follows is emitted into the $CODE$ subspace of the
       ; $TEXT$ space, declared code-only with 4-byte alignment.
    12         .SPACE   $TEXT$,SORT=8
    13         .SUBSPA  $CODE$,QUAD=0,ALIGN=4,ACCESS=0x2c,CODE_ONLY,SORT=24
    15 ; ***************************************************************
    16 ;
    17 ;                 maxpy_[little/big]
    18 ;
    19 ; ***************************************************************
    21 ; Exactly one of LITTLE_WORDIAN / BIG_WORDIAN must be defined.  This copy of the file hard-codes LITTLE_WORDIAN on the next line.
    22 #define LITTLE_WORDIAN 1
       ; Byte offsets used to walk the vectors.  In the little-wordian case the
       ; walk goes upward in memory: the forward steps (8, 16, 32) are positive
       ; and the look-back offsets (the UN_ names) are negative.
    24 #ifdef LITTLE_WORDIAN
    25 #define EIGHT 8
    26 #define SIXTEEN 16
    27 #define THIRTY_TWO 32
    28 #define UN_EIGHT -8
    29 #define UN_SIXTEEN -16
    30 #define UN_TWENTY_FOUR -24
    31 #endif
       ; In the big-wordian case every sign is mirrored, so the same code walks
       ; downward in memory.
       ; NOTE(review): the little-wordian selector is forced on above, so also
       ; defining the big-wordian selector would redefine these macros with
       ; different values - confirm no build passes it.
    33 #ifdef BIG_WORDIAN
    34 #define EIGHT -8
    35 #define SIXTEEN -16
    36 #define THIRTY_TWO -32
    37 #define UN_EIGHT 8
    38 #define UN_SIXTEEN 16
    39 #define UN_TWENTY_FOUR 24
    40 #endif
    42 ; This performs a multiple-precision integer version of "daxpy",
    43 ; Using the selected addressing direction.  "Little-wordian" means that
    44 ; the least significant word of a number is stored at the lowest address.
    45 ; "Big-wordian" means that the most significant word is at the lowest
    46 ; address.  Either way, the incoming address of the vector is that
    47 ; of the least significant word.  That means that, for little-wordian
    48 ; addressing, we move the address upward as we propagate carries
    49 ; from the least significant word to the most significant.  For
    50 ; big-wordian we move the address downward.
    52 ; We use the following registers:
    53 ;
    54 ;     r2   return PC, of course
    55 ;     r26 = arg1 =  length
    56 ;     r25 = arg2 =  address of scalar
    57 ;     r24 = arg3 =  multiplicand vector
    58 ;     r23 = arg4 =  result vector
    59 ;
    60 ;     fr9 = scalar loaded once only from r25
    62 ; The cycle counts shown in the bodies below are simply the result of a
    63 ; scheduling by hand.  The actual PCX-U hardware does it differently.
    64 ; The intention is that the overall speed is the same.
    66 ; The pipeline startup and shutdown code is constructed in the usual way,
    67 ; by taking the loop bodies and removing unnecessary instructions.
    68 ; We have left the comments describing cycle numbers in the code.
    69 ; These are intended for reference when comparing with the main loop,
    70 ; and have no particular relationship to actual cycle numbers.
    72 #ifdef LITTLE_WORDIAN
    73 maxpy_little
    74 #else
    75 maxpy_big
    76 #endif
    77         .PROC
    78         .CALLINFO FRAME=120,ENTRY_GR=4
    79         .ENTRY
       ; Summary
       ; -------
       ; Multiplies the vector of r26 doublewords at r24 by the 64-bit scalar
       ; at 0(r25) and adds the product into the vector at r23.  If the last
       ; add carries out, the carry is rippled into further result words
       ; (see $FINAL1).
       ;
       ; Each 64x64 product is assembled from four 32x32 XMPYU partial
       ; products (the halves fr9L/fr9R of the scalar times the halves of a
       ; multiplicand word).  Every partial product is written to a stack
       ; slot with FSTD and read back into a general register with LDD.
       ;
       ; Scratch slots (w0 = word loaded from 0(r24), w1 = word loaded from
       ; the next vector position):
       ;     -48(sp)  fr9R * w0R        -72(sp)  fr9R * w1R
       ;     -80(sp)  fr9R * w0L        -56(sp)  fr9R * w1L
       ;     -96(sp)  fr9L * w0R        -64(sp)  fr9L * w1R
       ;     -88(sp)  fr9L * w0L       -104(sp)  fr9L * w1L
       ;
       ; None of the branches below use the ,N completer, so the instruction
       ; after each branch is its delay slot and is always executed.
       ;
       ; NOTE(review): r3 and r4 are saved and restored with 32-bit STW/LDW,
       ; but the body loads them with 64-bit LDD.  Confirm that the upper
       ; halves do not have to be preserved under the ABI in use (in
       ; particular for the 2.0W build).
    80         STW,MA  %r3,128(%sp)        ; save r3 at 0(sp), then sp += 128
    81         STW     %r4,-124(%sp)       ; save r4 in the following word
    83         ADDIB,< -1,%r26,$L0         ; If N = 0, exit immediately.
    84         FLDD    0(%r25),%fr9        ; fr9 = scalar
    86 ; First startup
       ; From here on r26 holds N-1.  Multiply word 0, and (when N > 3) start
       ; on word 1, parking the partial products in their stack slots.
    88         FLDD    0(%r24),%fr24       ; Cycle 1
    89         XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
    90         XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
    91         XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
    92         CMPIB,> 3,%r26,$N_IS_SMALL  ; Pick out cases N = 1, 2, or 3
    93         XMPYU   %fr9L,%fr24R,%fr24  ; Cycle 6
    94         FLDD    EIGHT(%r24),%fr28   ; Cycle 8
    95         XMPYU   %fr9L,%fr28R,%fr31  ; Cycle 10
    96         FSTD    %fr24,-96(%sp)
    97         XMPYU   %fr9R,%fr28L,%fr30  ; Cycle 11
    98         FSTD    %fr25,-80(%sp)
    99         LDO     SIXTEEN(%r24),%r24  ; Cycle 12
   100         FSTD    %fr31,-64(%sp)
   101         XMPYU   %fr9R,%fr28R,%fr29  ; Cycle 13
   102         FSTD    %fr27,-48(%sp)
   104 ; Second startup
       ; Begin the integer side for word 0 (sum the two cross products and
       ; split that sum at the 32-bit boundary with SHRPD) while the
       ; multiplies for the next pair of words are issued.
   106         XMPYU   %fr9L,%fr28L,%fr28  ; Cycle 1
   107         FSTD    %fr30,-56(%sp)
   108         FLDD    0(%r24),%fr24
   110         FSTD    %fr26,-88(%sp)      ; Cycle 2
   112         XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
   113         FSTD    %fr28,-104(%sp)
   115         XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
   116         LDD     -96(%sp),%r3
   117         FSTD    %fr29,-72(%sp)
   119         XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
   120         LDD     -64(%sp),%r19
   121         LDD     -80(%sp),%r21
   123         XMPYU   %fr9L,%fr24R,%fr24  ; Cycle 6
   124         LDD     -56(%sp),%r20
   125         ADD     %r21,%r3,%r3        ; r3 = sum of the two cross products of w0
   127         ADD,DC  %r20,%r19,%r19      ; Cycle 7
   128         LDD     -88(%sp),%r4
   129         SHRPD   %r3,%r0,32,%r21     ; r21 = low 32 bits of r3, moved to the upper half
   130         LDD     -48(%sp),%r1
   132         FLDD    EIGHT(%r24),%fr28   ; Cycle 8
   133         LDD     -104(%sp),%r31
   134         ADD,DC  %r0,%r0,%r20        ; r20 = carry out of the ADD,DC above
   135         SHRPD   %r19,%r3,32,%r3     ; r3 = bits 32..95 of the pair r19:r3
   137         LDD     -72(%sp),%r29       ; Cycle 9
   138         SHRPD   %r20,%r19,32,%r20
   139         ADD     %r21,%r1,%r1
   141         XMPYU   %fr9L,%fr28R,%fr31  ; Cycle 10
   142         ADD,DC  %r3,%r4,%r4
   143         FSTD    %fr24,-96(%sp)
   145         XMPYU   %fr9R,%fr28L,%fr30  ; Cycle 11
   146         ADD,DC  %r0,%r20,%r20
   147         LDD     0(%r23),%r3         ; r3 = first result word to accumulate into
   148         FSTD    %fr25,-80(%sp)
   150         LDO     SIXTEEN(%r24),%r24  ; Cycle 12
   151         FSTD    %fr31,-64(%sp)
   153         XMPYU   %fr9R,%fr28R,%fr29  ; Cycle 13
   154         ADD     %r0,%r0,%r0         ; clear the carry bit
   155         ADDIB,<= -4,%r26,$ENDLOOP   ; actually happens in cycle 12
   156         FSTD    %fr27,-48(%sp)
   157 ;        MFCTL   %cr16,%r21         ; for timing
   158 ;        STD     %r21,-112(%sp)
   160 ; Here is the loop.
       ; Each pass advances r24 and r23 by two doublewords, stores two
       ; finished result words, and decrements r26 by 2.
   162 $LOOP   XMPYU   %fr9L,%fr28L,%fr28  ; Cycle 1
   163         ADD,DC  %r29,%r4,%r4
   164         FSTD    %fr30,-56(%sp)
   165         FLDD    0(%r24),%fr24
   167         LDO     SIXTEEN(%r23),%r23  ; Cycle 2
   168         ADD,DC  %r0,%r20,%r20
   169         FSTD    %fr26,-88(%sp)
   171         XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
   172         ADD     %r3,%r1,%r1
   173         FSTD    %fr28,-104(%sp)
   174         LDD     UN_EIGHT(%r23),%r21
   176         XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
   177         ADD,DC  %r21,%r4,%r28
   178         FSTD    %fr29,-72(%sp)    
   179         LDD     -96(%sp),%r3
   181         XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
   182         ADD,DC  %r20,%r31,%r22
   183         LDD     -64(%sp),%r19
   184         LDD     -80(%sp),%r21
   186         XMPYU   %fr9L,%fr24R,%fr24  ; Cycle 6
   187         ADD     %r21,%r3,%r3
   188         LDD     -56(%sp),%r20
   189         STD     %r1,UN_SIXTEEN(%r23)
   191         ADD,DC  %r20,%r19,%r19      ; Cycle 7
   192         SHRPD   %r3,%r0,32,%r21
   193         LDD     -88(%sp),%r4
   194         LDD     -48(%sp),%r1
   196         ADD,DC  %r0,%r0,%r20        ; Cycle 8
   197         SHRPD   %r19,%r3,32,%r3
   198         FLDD    EIGHT(%r24),%fr28
   199         LDD     -104(%sp),%r31
   201         SHRPD   %r20,%r19,32,%r20   ; Cycle 9
   202         ADD     %r21,%r1,%r1
   203         STD     %r28,UN_EIGHT(%r23)
   204         LDD     -72(%sp),%r29
   206         XMPYU   %fr9L,%fr28R,%fr31  ; Cycle 10
   207         ADD,DC  %r3,%r4,%r4
   208         FSTD    %fr24,-96(%sp)
   210         XMPYU   %fr9R,%fr28L,%fr30  ; Cycle 11
   211         ADD,DC  %r0,%r20,%r20
   212         FSTD    %fr25,-80(%sp)
   213         LDD     0(%r23),%r3
   215         LDO     SIXTEEN(%r24),%r24  ; Cycle 12
   216         FSTD    %fr31,-64(%sp)
   218         XMPYU   %fr9R,%fr28R,%fr29  ; Cycle 13
   219         ADD     %r22,%r1,%r1
   220         ADDIB,> -2,%r26,$LOOP       ; actually happens in cycle 12
   221         FSTD    %fr27,-48(%sp)
   223 $ENDLOOP
   225 ; Shutdown code, first stage.
       ; The shutdown stages repeat the integer half of the loop body without
       ; issuing new loads or multiplies from the multiplicand.
   227 ;        MFCTL   %cr16,%r21         ; for timing
   228 ;        STD     %r21,UN_SIXTEEN(%r23)
   229 ;        LDD     -112(%sp),%r21
   230 ;        STD     %r21,UN_EIGHT(%r23)
   232         XMPYU   %fr9L,%fr28L,%fr28  ; Cycle 1
   233         ADD,DC  %r29,%r4,%r4
   234         CMPIB,= 0,%r26,$ONEMORE     ; r26 == 0: one more multiplicand word is left
   235         FSTD    %fr30,-56(%sp)
   237         LDO     SIXTEEN(%r23),%r23  ; Cycle 2
   238         ADD,DC  %r0,%r20,%r20
   239         FSTD    %fr26,-88(%sp)
   241         ADD     %r3,%r1,%r1         ; Cycle 3
   242         FSTD    %fr28,-104(%sp)
   243         LDD     UN_EIGHT(%r23),%r21
   245         ADD,DC  %r21,%r4,%r28       ; Cycle 4
   246         FSTD    %fr29,-72(%sp)    
   247         STD     %r28,UN_EIGHT(%r23) ; moved up from cycle 9
   248         LDD     -96(%sp),%r3
   250         ADD,DC  %r20,%r31,%r22      ; Cycle 5
   251         STD     %r1,UN_SIXTEEN(%r23)
       ; $JOIN4 is also entered from the N = 2 path with r22 = 0 and r3
       ; already loaded from -96(sp).
   252 $JOIN4
   253         LDD     -64(%sp),%r19
   254         LDD     -80(%sp),%r21
   256         ADD     %r21,%r3,%r3        ; Cycle 6
   257         LDD     -56(%sp),%r20
   259         ADD,DC  %r20,%r19,%r19      ; Cycle 7
   260         SHRPD   %r3,%r0,32,%r21
   261         LDD     -88(%sp),%r4
   262         LDD     -48(%sp),%r1
   264         ADD,DC  %r0,%r0,%r20        ; Cycle 8
   265         SHRPD   %r19,%r3,32,%r3
   266         LDD     -104(%sp),%r31
   268         SHRPD   %r20,%r19,32,%r20   ; Cycle 9
   269         ADD     %r21,%r1,%r1
   270         LDD     -72(%sp),%r29
   272         ADD,DC  %r3,%r4,%r4         ; Cycle 10
   274         ADD,DC  %r0,%r20,%r20       ; Cycle 11
   275         LDD     0(%r23),%r3
   277         ADD     %r22,%r1,%r1        ; Cycle 13
   279 ; Shutdown code, second stage.
   281         ADD,DC  %r29,%r4,%r4        ; Cycle 1
   283         LDO     SIXTEEN(%r23),%r23  ; Cycle 2
   284         ADD,DC  %r0,%r20,%r20
   286         LDD     UN_EIGHT(%r23),%r21 ; Cycle 3
   287         ADD     %r3,%r1,%r1
   289         ADD,DC  %r21,%r4,%r28       ; Cycle 4
   291         ADD,DC  %r20,%r31,%r22      ; Cycle 5
   293         STD     %r1,UN_SIXTEEN(%r23); Cycle 6
   295         STD     %r28,UN_EIGHT(%r23) ; Cycle 9
   297         LDD     0(%r23),%r3         ; Cycle 11
   299 ; Shutdown code, third stage.
   301         LDO     SIXTEEN(%r23),%r23
   302         ADD     %r3,%r22,%r1
       ; $JOIN1: r21 = carry out of the last add.  The delay-slot STD writes
       ; the last product word whether or not the exit branch is taken.
   303 $JOIN1  ADD,DC  %r0,%r0,%r21
   304         CMPIB,*= 0,%r21,$L0         ; if no overflow, exit
   305         STD     %r1,UN_SIXTEEN(%r23)
   307 ; Final carry propagation
       ; Add 1 to successive result words; repeat for as long as the
       ; incremented word comes out as zero.
   309 $FINAL1 LDO     EIGHT(%r23),%r23
   310         LDD     UN_SIXTEEN(%r23),%r21
   311         ADDI    1,%r21,%r21
   312         CMPIB,*= 0,%r21,$FINAL1     ; Keep looping if there is a carry.
   313         STD     %r21,UN_SIXTEEN(%r23)
   314         B       $L0
   315         NOP
   317 ; Here is the code that handles the difficult cases N=1, N=2, and N=3.
   318 ; We do the usual trick -- branch out of the startup code at appropriate
   319 ; points, and branch into the shutdown code.
   321 $N_IS_SMALL
   322         CMPIB,= 0,%r26,$N_IS_ONE    ; r26 holds N-1 here
   323         FSTD    %fr24,-96(%sp)      ; Cycle 10
   324         FLDD    EIGHT(%r24),%fr28   ; Cycle 8
   325         XMPYU   %fr9L,%fr28R,%fr31  ; Cycle 10
   326         XMPYU   %fr9R,%fr28L,%fr30  ; Cycle 11
   327         FSTD    %fr25,-80(%sp)
   328         FSTD    %fr31,-64(%sp)      ; Cycle 12
   329         XMPYU   %fr9R,%fr28R,%fr29  ; Cycle 13
   330         FSTD    %fr27,-48(%sp)
   331         XMPYU   %fr9L,%fr28L,%fr28  ; Cycle 1
   332         CMPIB,= 2,%r26,$N_IS_THREE
   333         FSTD    %fr30,-56(%sp)
   335 ; N = 2
   336         FSTD    %fr26,-88(%sp)      ; Cycle 2
   337         FSTD    %fr28,-104(%sp)     ; Cycle 3
   338         LDD     -96(%sp),%r3        ; Cycle 4
   339         FSTD    %fr29,-72(%sp)
   340         B       $JOIN4
   341         ADD     %r0,%r0,%r22        ; delay slot: r22 = 0, carry bit cleared
   343 $N_IS_THREE
   344         FLDD    SIXTEEN(%r24),%fr24 ; third multiplicand word (r24 was not advanced on this path)
   345         FSTD    %fr26,-88(%sp)      ; Cycle 2
   346         XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
   347         FSTD    %fr28,-104(%sp)
   348         XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
   349         LDD     -96(%sp),%r3
   350         FSTD    %fr29,-72(%sp)
   351         XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
   352         LDD     -64(%sp),%r19
   353         LDD     -80(%sp),%r21
   354         B       $JOIN3
   355         ADD     %r0,%r0,%r22        ; delay slot: r22 = 0, carry bit cleared
   357 $N_IS_ONE
   358         FSTD    %fr25,-80(%sp)
   359         FSTD    %fr27,-48(%sp)
   360         FSTD    %fr26,-88(%sp)      ; Cycle 2
   361         B       $JOIN5
   362         ADD     %r0,%r0,%r22        ; delay slot: r22 = 0, carry bit cleared
   364 ; We came out of the unrolled loop with wrong parity.  Do one more
   365 ; single cycle.  This is quite tricky, because of the way the
   366 ; carry chains and SHRPD chains have been chopped up.
   368 $ONEMORE
   370         FLDD    0(%r24),%fr24
   372         LDO     SIXTEEN(%r23),%r23  ; Cycle 2
   373         ADD,DC  %r0,%r20,%r20
   374         FSTD    %fr26,-88(%sp)
   376         XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
   377         FSTD    %fr28,-104(%sp)
   378         LDD     UN_EIGHT(%r23),%r21
   379         ADD     %r3,%r1,%r1
   381         XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
   382         ADD,DC  %r21,%r4,%r28
   383         STD     %r28,UN_EIGHT(%r23) ; moved from cycle 9
   384         LDD     -96(%sp),%r3
   385         FSTD    %fr29,-72(%sp)    
   387         XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
   388         ADD,DC  %r20,%r31,%r22
   389         LDD     -64(%sp),%r19
   390         LDD     -80(%sp),%r21
   392         STD     %r1,UN_SIXTEEN(%r23); Cycle 6
       ; $JOIN3 is also entered from the N = 3 path with r22 = 0.
   393 $JOIN3
   394         XMPYU   %fr9L,%fr24R,%fr24
   395         LDD     -56(%sp),%r20
   396         ADD     %r21,%r3,%r3
   398         ADD,DC  %r20,%r19,%r19      ; Cycle 7
   399         LDD     -88(%sp),%r4
   400         SHRPD   %r3,%r0,32,%r21
   401         LDD     -48(%sp),%r1
   403         LDD     -104(%sp),%r31      ; Cycle 8
   404         ADD,DC  %r0,%r0,%r20
   405         SHRPD   %r19,%r3,32,%r3
   407         LDD     -72(%sp),%r29       ; Cycle 9
   408         SHRPD   %r20,%r19,32,%r20
   409         ADD     %r21,%r1,%r1
   411         ADD,DC  %r3,%r4,%r4         ; Cycle 10
   412         FSTD    %fr24,-96(%sp)
   414         ADD,DC  %r0,%r20,%r20       ; Cycle 11
   415         LDD     0(%r23),%r3
   416         FSTD    %fr25,-80(%sp)
   418         ADD     %r22,%r1,%r1        ; Cycle 13
   419         FSTD    %fr27,-48(%sp)
   421 ; Shutdown code, stage 1-1/2.
   423         ADD,DC  %r29,%r4,%r4        ; Cycle 1
   425         LDO     SIXTEEN(%r23),%r23  ; Cycle 2
   426         ADD,DC  %r0,%r20,%r20     
   427         FSTD    %fr26,-88(%sp)
   429         LDD     UN_EIGHT(%r23),%r21 ; Cycle 3
   430         ADD     %r3,%r1,%r1
   432         ADD,DC  %r21,%r4,%r28       ; Cycle 4
   433         STD     %r28,UN_EIGHT(%r23) ; moved from cycle 9
   435         ADD,DC  %r20,%r31,%r22      ; Cycle 5
   436         STD     %r1,UN_SIXTEEN(%r23)
       ; $JOIN5 handles one remaining word (also entered from the N = 1 path
       ; with r22 = 0).  Only the four w0 slots -96, -80, -88 and -48 are read.
   437 $JOIN5
   438         LDD     -96(%sp),%r3        ; moved from cycle 4
   439         LDD     -80(%sp),%r21
   440         ADD     %r21,%r3,%r3        ; Cycle 6
   441         ADD,DC  %r0,%r0,%r19        ; Cycle 7
   442         LDD     -88(%sp),%r4
   443         SHRPD   %r3,%r0,32,%r21
   444         LDD     -48(%sp),%r1
   445         SHRPD   %r19,%r3,32,%r3     ; Cycle 8
   446         ADD     %r21,%r1,%r1        ; Cycle 9
   447         ADD,DC  %r3,%r4,%r4         ; Cycle 10
   448         LDD     0(%r23),%r3         ; Cycle 11
   449         ADD     %r22,%r1,%r1        ; Cycle 13
   451 ; Shutdown code, stage 2-1/2.
   453         ADD,DC  %r0,%r4,%r4         ; Cycle 1
   454         LDO     SIXTEEN(%r23),%r23  ; Cycle 2
   455         LDD     UN_EIGHT(%r23),%r21 ; Cycle 3
   456         ADD     %r3,%r1,%r1
   457         STD     %r1,UN_SIXTEEN(%r23)
   458         ADD,DC  %r21,%r4,%r1
   459         B       $JOIN1
   460         LDO     EIGHT(%r23),%r23    ; delay slot: step r23 so the store at $JOIN1 hits the word just summed
   462 ; exit
       ; Restore r4 and r3 from the words saved on entry; the LDW,MB in the
       ; delay slot of the return also moves sp back down by 128.
   464 $L0
   465         LDW     -124(%sp),%r4
   466         BVE     (%r2)
   467         .EXIT
   468         LDW,MB  -128(%sp),%r3
   470         .PROCEND
   472 ; ***************************************************************
   473 ;
   474 ;                 add_diag_[little/big]
   475 ;
   476 ; ***************************************************************
   478 ; The arguments are as follows:
   479 ;     r2   return PC, of course
   480 ;     r26 = arg1 =  length
   481 ;     r25 = arg2 =  vector to square
   482 ;     r24 = arg3 =  result vector
   484 #ifdef LITTLE_WORDIAN
   485 add_diag_little
   486 #else
   487 add_diag_big
   488 #endif
   489         .PROC
   490         .CALLINFO FRAME=120,ENTRY_GR=4
   491         .ENTRY
       ; Summary
       ; -------
       ; For each of the r26 doublewords x in the vector at r25, forms x*x
       ; and adds it into two consecutive doublewords of the vector at r24.
       ; A carry out of the last add is rippled into further result words
       ; (see $FDIAG2 and $FDIAG1).
       ;
       ; The square is built from three XMPYU products of the halves of x:
       ; R*R, L*R and L*L.  The cross product L*R is needed twice; the pair
       ; of SHRPD instructions with a shift count of 31 yields that doubled
       ; value already split at the 32-bit boundary (a low part in the upper
       ; half of one register and a high part in another).
       ;
       ; Products travel from FP to general registers through the stack
       ; slots at -104(sp) .. -64(sp), written with FSTD and read with LDD.
       ;
       ; None of the branches below use the ,N completer, so the instruction
       ; after each branch is its delay slot and is always executed.
       ;
       ; NOTE(review): r3 and r4 are saved and restored with 32-bit STW/LDW,
       ; but the body writes them with 64-bit results.  Confirm that the upper
       ; halves do not have to be preserved under the ABI in use.
   492         STW,MA  %r3,128(%sp)        ; save r3 at 0(sp), then sp += 128
   493         STW     %r4,-124(%sp)       ; save r4 in the following word
   495         ADDIB,< -1,%r26,$Z0         ; If N=0, exit immediately.
   496         NOP
   498 ; Startup code
       ; From here on r26 holds N-1.
   500         FLDD    0(%r25),%fr7        ; Cycle 2 (alternate body)
   501         XMPYU   %fr7R,%fr7R,%fr29   ; Cycle 4
   502         XMPYU   %fr7L,%fr7R,%fr27   ; Cycle 5
   503         XMPYU   %fr7L,%fr7L,%fr30
   504         LDO     SIXTEEN(%r25),%r25  ; Cycle 6
   505         FSTD    %fr29,-88(%sp)
   506         FSTD    %fr27,-72(%sp)      ; Cycle 7
   507         CMPIB,= 0,%r26,$DIAG_N_IS_ONE ; Cycle 1 (main body)
   508         FSTD    %fr30,-96(%sp)
   509         FLDD    UN_EIGHT(%r25),%fr7 ; Cycle 2
   510         LDD     -88(%sp),%r22       ; Cycle 3
   511         LDD     -72(%sp),%r31       ; Cycle 4
   512         XMPYU   %fr7R,%fr7R,%fr28
   513         XMPYU   %fr7L,%fr7R,%fr24   ; Cycle 5
   514         XMPYU   %fr7L,%fr7L,%fr31
   515         LDD     -96(%sp),%r20       ; Cycle 6
   516         FSTD    %fr28,-80(%sp)
   517         ADD     %r0,%r0,%r0         ; clear the carry bit
   518         ADDIB,<= -2,%r26,$ENDDIAGLOOP ; Cycle 7
   519         FSTD    %fr24,-64(%sp)
   521 ; Here is the loop.  It is unrolled twice, modelled after the "alternate body" and then the "main body".
       ; Each pass squares two input words: r25 advances by two doublewords,
       ; r24 by four, and r26 drops by 2.
   523 $DIAGLOOP
   524         SHRPD   %r31,%r0,31,%r3     ; Cycle 1 (alternate body)
   525         LDO     SIXTEEN(%r25),%r25
   526         LDD     0(%r24),%r1
   527         FSTD    %fr31,-104(%sp)
   528         SHRPD   %r0,%r31,31,%r4     ; Cycle 2
   529         ADD,DC  %r22,%r3,%r3
   530         FLDD    UN_SIXTEEN(%r25),%fr7   
   531         ADD,DC  %r0,%r20,%r20       ; Cycle 3
   532         ADD     %r1,%r3,%r3
   533         XMPYU   %fr7R,%fr7R,%fr29   ; Cycle 4
   534         LDD     -80(%sp),%r21
   535         STD     %r3,0(%r24)
   536         XMPYU   %fr7L,%fr7R,%fr27   ; Cycle 5
   537         XMPYU   %fr7L,%fr7L,%fr30
   538         LDD     -64(%sp),%r29       
   539         LDD     EIGHT(%r24),%r1  
   540         ADD,DC  %r4,%r20,%r20       ; Cycle 6
   541         LDD     -104(%sp),%r19
   542         FSTD    %fr29,-88(%sp)
   543         ADD     %r20,%r1,%r1        ; Cycle 7
   544         FSTD    %fr27,-72(%sp)
   545         SHRPD   %r29,%r0,31,%r4     ; Cycle 1 (main body)
   546         LDO     THIRTY_TWO(%r24),%r24
   547         LDD     UN_SIXTEEN(%r24),%r28
   548         FSTD    %fr30,-96(%sp)
   549         SHRPD   %r0,%r29,31,%r3     ; Cycle 2
   550         ADD,DC  %r21,%r4,%r4
   551         FLDD    UN_EIGHT(%r25),%fr7
   552         STD     %r1,UN_TWENTY_FOUR(%r24)
   553         ADD,DC  %r0,%r19,%r19       ; Cycle 3
   554         ADD     %r28,%r4,%r4
   555         XMPYU   %fr7R,%fr7R,%fr28   ; Cycle 4
   556         LDD     -88(%sp),%r22
   557         STD     %r4,UN_SIXTEEN(%r24)
   558         XMPYU   %fr7L,%fr7R,%fr24   ; Cycle 5
   559         XMPYU   %fr7L,%fr7L,%fr31
   560         LDD     -72(%sp),%r31
   561         LDD     UN_EIGHT(%r24),%r28
   562         ADD,DC  %r3,%r19,%r19       ; Cycle 6
   563         LDD     -96(%sp),%r20
   564         FSTD    %fr28,-80(%sp)
   565         ADD     %r19,%r28,%r28      ; Cycle 7
   566         FSTD    %fr24,-64(%sp)
   567         ADDIB,> -2,%r26,$DIAGLOOP   ; Cycle 8
   568         STD     %r28,UN_EIGHT(%r24)
   570 $ENDDIAGLOOP
   572         ADD,DC  %r0,%r22,%r22    
   573         CMPIB,= 0,%r26,$ONEMOREDIAG ; r26 == 0: one more input word is left
   574         SHRPD   %r31,%r0,31,%r3
   576 ; Shutdown code, first stage.
       ; The shutdown stages repeat the integer half of the loop bodies
       ; without loading or multiplying any further input words.
   578         FSTD    %fr31,-104(%sp)     ; Cycle 1 (alternate body)
   579         LDD     0(%r24),%r28
   580         SHRPD   %r0,%r31,31,%r4     ; Cycle 2
   581         ADD     %r3,%r22,%r3
   582         ADD,DC  %r0,%r20,%r20       ; Cycle 3
   583         LDD     -80(%sp),%r21
   584         ADD     %r3,%r28,%r3
   585         LDD     -64(%sp),%r29       ; Cycle 4
   586         STD     %r3,0(%r24)
   587         LDD     EIGHT(%r24),%r1     ; Cycle 5
   588         LDO     SIXTEEN(%r25),%r25  ; Cycle 6
   589         LDD     -104(%sp),%r19
   590         ADD,DC  %r4,%r20,%r20
   591         ADD     %r20,%r1,%r1        ; Cycle 7
   592         ADD,DC  %r0,%r21,%r21       ; Cycle 8
   593         STD     %r1,EIGHT(%r24)
   595 ; Shutdown code, second stage.
   597         SHRPD   %r29,%r0,31,%r4     ; Cycle 1 (main body)
   598         LDO     THIRTY_TWO(%r24),%r24
   599         LDD     UN_SIXTEEN(%r24),%r1
   600         SHRPD   %r0,%r29,31,%r3      ; Cycle 2
   601         ADD     %r4,%r21,%r4
   602         ADD,DC  %r0,%r19,%r19       ; Cycle 3
   603         ADD     %r4,%r1,%r4
   604         STD     %r4,UN_SIXTEEN(%r24); Cycle 4
   605         LDD     UN_EIGHT(%r24),%r28 ; Cycle 5
   606         ADD,DC  %r3,%r19,%r19       ; Cycle 6       
   607         ADD     %r19,%r28,%r28      ; Cycle 7
   608         ADD,DC  %r0,%r0,%r22        ; Cycle 8
   609         CMPIB,*= 0,%r22,$Z0         ; if no overflow, exit
   610         STD     %r28,UN_EIGHT(%r24)
   612 ; Final carry propagation
       ; Add 1 to successive result words; repeat for as long as the
       ; incremented word comes out as zero.
   614 $FDIAG2
   615         LDO     EIGHT(%r24),%r24
   616         LDD     UN_EIGHT(%r24),%r26
   617         ADDI    1,%r26,%r26
   618         CMPIB,*= 0,%r26,$FDIAG2     ; Keep looping if there is a carry.
   619         STD     %r26,UN_EIGHT(%r24)
   621         B   $Z0
   622         NOP
   624 ; Here is the code that handles the difficult case N=1.
   625 ; We do the usual trick -- branch out of the startup code at appropriate
   626 ; points, and branch into the shutdown code.
   628 $DIAG_N_IS_ONE
   630         LDD     -88(%sp),%r22
   631         LDD     -72(%sp),%r31
   632         B       $JOINDIAG
   633         LDD     -96(%sp),%r20       ; delay slot: executed before control reaches $JOINDIAG
   635 ; We came out of the unrolled loop with wrong parity.  Do one more
   636 ; single cycle.  This is the "alternate body".  It will, of course,
   637 ; give us opposite registers from the other case, so we need
   638 ; completely different shutdown code.
   640 $ONEMOREDIAG
   641         FSTD    %fr31,-104(%sp)     ; Cycle 1 (alternate body)
   642         LDD     0(%r24),%r28
   643         FLDD    0(%r25),%fr7        ; Cycle 2
   644         SHRPD   %r0,%r31,31,%r4
   645         ADD     %r3,%r22,%r3
   646         ADD,DC  %r0,%r20,%r20       ; Cycle 3
   647         LDD     -80(%sp),%r21
   648         ADD     %r3,%r28,%r3
   649         LDD     -64(%sp),%r29       ; Cycle 4
   650         STD     %r3,0(%r24)
   651         XMPYU   %fr7R,%fr7R,%fr29
   652         LDD     EIGHT(%r24),%r1     ; Cycle 5
   653         XMPYU   %fr7L,%fr7R,%fr27
   654         XMPYU   %fr7L,%fr7L,%fr30
   655         LDD     -104(%sp),%r19      ; Cycle 6
   656         FSTD    %fr29,-88(%sp)
   657         ADD,DC  %r4,%r20,%r20
   658         FSTD    %fr27,-72(%sp)      ; Cycle 7
   659         ADD     %r20,%r1,%r1
   660         ADD,DC  %r0,%r21,%r21       ; Cycle 8
   661         STD     %r1,EIGHT(%r24)
   663 ; Shutdown code, first stage.
   665         SHRPD   %r29,%r0,31,%r4     ; Cycle 1 (main body)
   666         LDO     THIRTY_TWO(%r24),%r24
   667         FSTD    %fr30,-96(%sp)
   668         LDD     UN_SIXTEEN(%r24),%r1
   669         SHRPD   %r0,%r29,31,%r3     ; Cycle 2
   670         ADD     %r4,%r21,%r4
   671         ADD,DC  %r0,%r19,%r19       ; Cycle 3
   672         LDD     -88(%sp),%r22
   673         ADD     %r4,%r1,%r4
   674         LDD     -72(%sp),%r31       ; Cycle 4
   675         STD     %r4,UN_SIXTEEN(%r24)
   676         LDD     UN_EIGHT(%r24),%r28 ; Cycle 5
   677         LDD     -96(%sp),%r20       ; Cycle 6
   678         ADD,DC  %r3,%r19,%r19
   679         ADD     %r19,%r28,%r28      ; Cycle 7
   680         ADD,DC  %r0,%r22,%r22       ; Cycle 8
   681         STD     %r28,UN_EIGHT(%r24)
   683 ; Shutdown code, second stage.
       ; $JOINDIAG adds the last square, held as r22 (R*R), r31 (L*R) and
       ; r20 (L*L), into 0(r24) and the following doubleword.  It is also the
       ; entry point for N = 1.
   685 $JOINDIAG
   686         SHRPD   %r31,%r0,31,%r3     ; Cycle 1 (alternate body)
   687         LDD     0(%r24),%r28        
   688         SHRPD   %r0,%r31,31,%r4     ; Cycle 2
   689         ADD     %r3,%r22,%r3
   690         ADD,DC  %r0,%r20,%r20       ; Cycle 3
   691         ADD     %r3,%r28,%r3
   692         STD     %r3,0(%r24)         ; Cycle 4
   693         LDD     EIGHT(%r24),%r1     ; Cycle 5
   694         ADD,DC  %r4,%r20,%r20
   695         ADD     %r20,%r1,%r1        ; Cycle 7
   696         ADD,DC  %r0,%r0,%r21        ; Cycle 8
   697         CMPIB,*= 0,%r21,$Z0         ; if no overflow, exit
   698         STD     %r1,EIGHT(%r24)
   700 ; Final carry propagation
       ; Same ripple as $FDIAG2, but r24 has not been stepped past the last
       ; pair on this path, so the word to bump is addressed one doubleword
       ; beyond the stepped pointer.  The loop falls through into $Z0.
   702 $FDIAG1
   703         LDO     EIGHT(%r24),%r24
   704         LDD     EIGHT(%r24),%r26
   705         ADDI    1,%r26,%r26
   706         CMPIB,*= 0,%r26,$FDIAG1    ; Keep looping if there is a carry.
   707         STD     %r26,EIGHT(%r24)
       ; Exit: restore r4 and r3 from the words saved on entry; the LDW,MB in
       ; the delay slot of the return also moves sp back down by 128.
   709 $Z0
   710         LDW     -124(%sp),%r4
   711         BVE     (%r2)
   712         .EXIT
   713         LDW,MB  -128(%sp),%r3
   714         .PROCEND
   715 ;	.ALLOW
   717         .SPACE         $TEXT$
   718         .SUBSPA        $CODE$
       ; Export the two entry points for whichever word order was selected.
       ; maxpy is declared with four general-register argument words
       ; (ARGW0..ARGW3), add_diag with three (ARGW0..ARGW2).
   719 #ifdef LITTLE_WORDIAN
   720 #ifdef __GNUC__
   721 ; GNU-as (as of 2.19) does not support LONG_RETURN
   722         .EXPORT        maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
   723         .EXPORT        add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR
   724 #else
   725         .EXPORT        maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN
   726         .EXPORT        add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN
   727 #endif
   728 #else
       ; NOTE(review): this branch keeps LONG_RETURN even under __GNUC__,
       ; unlike the branch above.  It is not assembled while the
       ; little-wordian selector is hard-coded near the top of the file;
       ; confirm before enabling a big-wordian build with GNU as.
   729         .EXPORT        maxpy_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN
   730         .EXPORT        add_diag_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN
   731 #endif
   732         .END
   735 ; How to use "maxpy_PA20_little" and "maxpy_PA20_big" (exported by this file as maxpy_little / maxpy_big)
   736 ; 
   737 ; The routine "maxpy_PA20_little" or "maxpy_PA20_big"
   738 ; performs a 64-bit x any-size multiply, and adds the
   739 ; result to an area of memory.  That is, it performs
   740 ; something like
   741 ; 
   742 ;      A B C D
   743 ;    *       Z
   744 ;   __________
   745 ;    P Q R S T
   746 ; 
   747 ; and then adds the "PQRST" vector into an area of memory,
   748 ; handling all carries.
   749 ; 
   750 ; Digression on nomenclature and endian-ness:
   751 ; 
   752 ; Each of the capital letters in the above represents a 64-bit
   753 ; quantity.  That is, you could think of the discussion as
   754 ; being in terms of radix-16-quintillion arithmetic.  The data
   755 ; type being manipulated is "unsigned long long int".  This
   756 ; requires the 64-bit extension of the HP-UX C compiler,
   757 ; available at release 10.  You need these compiler flags to
   758 ; enable these extensions:
   759 ; 
   760 ;       -Aa +e +DA2.0 +DS2.0
   761 ; 
   762 ; (The first specifies ANSI C, the second enables the
   763 ; extensions, which are beyond ANSI C, and the third and
   764 ; fourth tell the compiler to use whatever features of the
   765 ; PA2.0 architecture it wishes, in order to make the code more
   766 ; efficient.  Since the presence of the assembly code will
   767 ; make the program unable to run on anything less than PA2.0,
   768 ; you might as well gain the performance enhancements in the C
   769 ; code as well.)
   770 ; 
   771 ; Questions of "endian-ness" often come up, usually in the
   772 ; context of byte ordering in a word.  These routines have a
   773 ; similar issue, that could be called "wordian-ness".
   774 ; Independent of byte ordering (PA is always big-endian), one
   775 ; can make two choices when representing extremely large
   776 ; numbers as arrays of 64-bit doublewords in memory.
   777 ; 
   778 ; "Little-wordian" layout means that the least significant
   779 ; word of a number is stored at the lowest address.
   780 ; 
   781 ;   MSW     LSW
   782 ;    |       |
   783 ;    V       V
   784 ; 
   785 ;    A B C D E
   786 ; 
   787 ;    ^     ^ ^
   788 ;    |     | |____ address 0
   789 ;    |     |
   790 ;    |     |_______address 8
   791 ;    |
   792 ;    address 32
   793 ; 
   794 ; "Big-wordian" means that the most significant word is at the
   795 ; lowest address.
   796 ; 
   797 ;   MSW     LSW
   798 ;    |       |
   799 ;    V       V
   800 ; 
   801 ;    A B C D E
   802 ; 
   803 ;    ^     ^ ^
   804 ;    |     | |____ address 32
   805 ;    |     |
   806 ;    |     |_______address 24
   807 ;    |
   808 ;    address 0
   809 ; 
   810 ; When you compile the file, you must specify one or the other, with
   811 ; a switch "-DLITTLE_WORDIAN" or "-DBIG_WORDIAN".
   812 ; 
   813 ;     Incidentally, you assemble this file as part of your
   814 ;     project with the same C compiler as the rest of the program.
   815 ;     My "makefile" for a superprecision arithmetic package has
   816 ;     the following stuff:
   817 ; 
   818 ;     # definitions:
   819 ;     CC = cc -Aa +e -z +DA2.0 +DS2.0 +w1
   820 ;     CFLAGS = +O3
   821 ;     LDFLAGS = -L /usr/lib -Wl,-aarchive
   822 ; 
   823 ;     # general build rule for ".s" files:
   824 ;     .s.o:
   825 ;             $(CC) $(CFLAGS) -c $< -DBIG_WORDIAN
   826 ; 
   827 ;     # Now any bind step that calls for pa20.o will assemble pa20.s
   828 ; 
   829 ; End of digression, back to arithmetic:
   830 ; 
   831 ; The way we multiply two huge numbers is, of course, to multiply
   832 ; the "ABCD" vector by each of the "WXYZ" doublewords, adding
   833 ; the result vectors with increasing offsets, the way we learned
   834 ; in school, back before we all used calculators:
   835 ; 
   836 ;            A B C D
   837 ;          * W X Y Z
   838 ;         __________
   839 ;          P Q R S T
   840 ;        E F G H I
   841 ;      M N O P Q
   842 ;  + R S T U V
   843 ;    _______________
   844 ;    F I N A L S U M
   845 ; 
   846 ; So we call maxpy_PA20_big (in my case; my package is
   847 ; big-wordian) repeatedly, giving the W, X, Y, and Z arguments
   848 ; in turn as the "scalar", and giving the "ABCD" vector each
   849 ; time.  We direct it to add its result into an area of memory
   850 ; that we have cleared at the start.  We skew the exact
   851 ; location into that area with each call.
   852 ; 
   853 ; The prototype for the function is
   854 ; 
   855 ; extern void maxpy_PA20_big(
   856 ;    int length,        /* Number of doublewords in the multiplicand vector. */
   857 ;    const long long int *scalaraddr,    /* Address to fetch the scalar. */
   858 ;    const long long int *multiplicand,  /* The multiplicand vector. */
   859 ;    long long int *result);             /* Where to accumulate the result. */
   860 ; 
   861 ; (You should place a copy of this prototype in an include file
   862 ; or in your C file.)
   863 ; 
   864 ; Now, IN ALL CASES, the given address for the multiplicand or
   865 ; the result is that of the LEAST SIGNIFICANT DOUBLEWORD.
   866 ; That word is, of course, the word at which the routine
   867 ; starts processing.  "maxpy_PA20_little" then increases the
   868 ; addresses as it computes.  "maxpy_PA20_big" decreases them.
   869 ; 
   870 ; In our example above, "length" would be 4 in each case.
   871 ; "multiplicand" would be the "ABCD" vector.  Specifically,
   872 ; the address of the element "D".  "scalaraddr" would be the
   873 ; address of "W", "X", "Y", or "Z" on the four calls that we
   874 ; would make.  (The order doesn't matter, of course.)
   875 ; "result" would be the appropriate address in the result
   876 ; area.  When multiplying by "Z", that would be the least
   877 ; significant word.  When multiplying by "Y", it would be the
   878 ; next higher word (8 bytes higher if little-wordian; 8 bytes
   879 ; lower if big-wordian), and so on.  The size of the result
   880 ; area must be the sum of the sizes of the multiplicand
   881 ; and multiplier vectors, and must be initialized to zero
   882 ; before we start.
   883 ; 
   884 ; Whenever the routine adds its partial product into the result
   885 ; vector, it follows carry chains as far as they need to go.
   886 ; 
   887 ; Here is the super-precision multiply routine that I use for
   888 ; my package.  The package is big-wordian.  I have taken out
   889 ; handling of exponents (it's a floating point package):
   890 ; 
   891 ; static void mul_PA20(
   892 ;   int size,
   893 ;   const long long int *arg1,
   894 ;   const long long int *arg2,
   895 ;   long long int *result)
   896 ; {
   897 ;    int i;
   898 ; 
   899 ;    for (i=0 ; i<2*size ; i++) result[i] = 0ULL;
   900 ; 
   901 ;    for (i=0 ; i<size ; i++) {
   902 ;       maxpy_PA20_big(size, &arg2[i], &arg1[size-1], &result[size+i]);
   903 ;    }
   904 ; }

mercurial