security/nss/lib/freebl/mpi/hppa20.s

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/security/nss/lib/freebl/mpi/hppa20.s	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,904 @@
     1.4 +; This Source Code Form is subject to the terms of the Mozilla Public
     1.5 +; License, v. 2.0. If a copy of the MPL was not distributed with this
     1.6 +; file, You can obtain one at http://mozilla.org/MPL/2.0/.
     1.7 +
     1.8 +#ifdef __LP64__
     1.9 +        .LEVEL   2.0W               ; LP64 build
    1.10 +#else
    1.11 +;       .LEVEL   1.1                (disabled alternative)
    1.12 +;       .ALLOW   2.0N               (disabled alternative)
    1.13 +        .LEVEL   2.0                ; non-LP64 build
    1.14 +#endif
    1.15 +        .SPACE   $TEXT$,SORT=8
    1.16 +        .SUBSPA  $CODE$,QUAD=0,ALIGN=4,ACCESS=0x2c,CODE_ONLY,SORT=24
    1.17 +
    1.18 +; ***************************************************************
    1.19 +;
    1.20 +;                 maxpy_[little/big]
    1.21 +;
    1.22 +; ***************************************************************
    1.23 +
    1.24 +; Exactly one of the two layouts must be selected; this file hard-codes the little-wordian one on the next line.
    1.25 +#define LITTLE_WORDIAN 1
    1.26 +; Byte offsets for stepping through the vectors; little-wordian steps upward.
    1.27 +#ifdef LITTLE_WORDIAN
    1.28 +#define EIGHT 8
    1.29 +#define SIXTEEN 16
    1.30 +#define THIRTY_TWO 32
    1.31 +#define UN_EIGHT -8
    1.32 +#define UN_SIXTEEN -16
    1.33 +#define UN_TWENTY_FOUR -24
    1.34 +#endif
    1.35 +; Big-wordian uses the same offsets with the signs reversed (steps downward).
    1.36 +#ifdef BIG_WORDIAN
    1.37 +#define EIGHT -8
    1.38 +#define SIXTEEN -16
    1.39 +#define THIRTY_TWO -32
    1.40 +#define UN_EIGHT 8
    1.41 +#define UN_SIXTEEN 16
    1.42 +#define UN_TWENTY_FOUR 24
    1.43 +#endif
    1.44 +
    1.45 +; This performs a multiple-precision integer version of "daxpy",
    1.46 +; using the selected addressing direction.  "Little-wordian" means that
    1.47 +; the least significant word of a number is stored at the lowest address.
    1.48 +; "Big-wordian" means that the most significant word is at the lowest
    1.49 +; address.  Either way, the incoming address of the vector is that
    1.50 +; of the least significant word.  That means that, for little-wordian
    1.51 +; addressing, we move the address upward as we propagate carries
    1.52 +; from the least significant word to the most significant.  For
    1.53 +; big-wordian we move the address downward.
    1.54 +
    1.55 +; We use the following registers:
    1.56 +;
    1.57 +;     r2   return PC, of course
    1.58 +;     r26 = arg1 =  length
    1.59 +;     r25 = arg2 =  address of scalar
    1.60 +;     r24 = arg3 =  multiplicand vector
    1.61 +;     r23 = arg4 =  result vector
    1.62 +;
    1.63 +;     fr9 = scalar loaded once only from r25
    1.64 +
    1.65 +; The cycle counts shown in the bodies below are simply the result of a
    1.66 +; scheduling by hand.  The actual PCX-U hardware does it differently.
    1.67 +; The intention is that the overall speed is the same.
    1.68 +
    1.69 +; The pipeline startup and shutdown code is constructed in the usual way,
    1.70 +; by taking the loop bodies and removing unnecessary instructions.
    1.71 +; We have left the comments describing cycle numbers in the code.
    1.72 +; These are intended for reference when comparing with the main loop,
    1.73 +; and have no particular relationship to actual cycle numbers.
    1.74 +
    1.75 +#ifdef LITTLE_WORDIAN
    1.76 +maxpy_little
    1.77 +#else
    1.78 +maxpy_big
    1.79 +#endif
    1.80 +        .PROC                       ; result vector += scalar * multiplicand vector
    1.81 +        .CALLINFO FRAME=120,ENTRY_GR=4
    1.82 +        .ENTRY
    1.83 +        STW,MA  %r3,128(%sp)        ; save r3, then advance sp by 128
    1.84 +        STW     %r4,-124(%sp)       ; save r4 in the new frame
    1.85 +
    1.86 +        ADDIB,< -1,%r26,$L0         ; If N = 0, exit immediately.
    1.87 +        FLDD    0(%r25),%fr9        ; fr9 = scalar
    1.88 +
    1.89 +; First startup
    1.90 +
    1.91 +        FLDD    0(%r24),%fr24       ; Cycle 1
    1.92 +        XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
    1.93 +        XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
    1.94 +        XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
    1.95 +        CMPIB,> 3,%r26,$N_IS_SMALL  ; Pick out cases N = 1, 2, or 3
    1.96 +        XMPYU   %fr9L,%fr24R,%fr24  ; Cycle 6
    1.97 +        FLDD    EIGHT(%r24),%fr28   ; Cycle 8
    1.98 +        XMPYU   %fr9L,%fr28R,%fr31  ; Cycle 10
    1.99 +        FSTD    %fr24,-96(%sp)      ; spill product so it can be reloaded into a general register
   1.100 +        XMPYU   %fr9R,%fr28L,%fr30  ; Cycle 11
   1.101 +        FSTD    %fr25,-80(%sp)
   1.102 +        LDO     SIXTEEN(%r24),%r24  ; Cycle 12
   1.103 +        FSTD    %fr31,-64(%sp)
   1.104 +        XMPYU   %fr9R,%fr28R,%fr29  ; Cycle 13
   1.105 +        FSTD    %fr27,-48(%sp)
   1.106 +
   1.107 +; Second startup
   1.108 +
   1.109 +        XMPYU   %fr9L,%fr28L,%fr28  ; Cycle 1
   1.110 +        FSTD    %fr30,-56(%sp)
   1.111 +        FLDD    0(%r24),%fr24
   1.112 +
   1.113 +        FSTD    %fr26,-88(%sp)      ; Cycle 2
   1.114 +
   1.115 +        XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
   1.116 +        FSTD    %fr28,-104(%sp)
   1.117 +
   1.118 +        XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
   1.119 +        LDD     -96(%sp),%r3        ; reload spilled product into r3
   1.120 +        FSTD    %fr29,-72(%sp)
   1.121 +
   1.122 +        XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
   1.123 +        LDD     -64(%sp),%r19
   1.124 +        LDD     -80(%sp),%r21
   1.125 +
   1.126 +        XMPYU   %fr9L,%fr24R,%fr24  ; Cycle 6
   1.127 +        LDD     -56(%sp),%r20
   1.128 +        ADD     %r21,%r3,%r3        ; sum the two cross products of the first word
   1.129 +
   1.130 +        ADD,DC  %r20,%r19,%r19      ; Cycle 7
   1.131 +        LDD     -88(%sp),%r4
   1.132 +        SHRPD   %r3,%r0,32,%r21
   1.133 +        LDD     -48(%sp),%r1
   1.134 +
   1.135 +        FLDD    EIGHT(%r24),%fr28   ; Cycle 8
   1.136 +        LDD     -104(%sp),%r31
   1.137 +        ADD,DC  %r0,%r0,%r20
   1.138 +        SHRPD   %r19,%r3,32,%r3
   1.139 +
   1.140 +        LDD     -72(%sp),%r29       ; Cycle 9
   1.141 +        SHRPD   %r20,%r19,32,%r20
   1.142 +        ADD     %r21,%r1,%r1
   1.143 +
   1.144 +        XMPYU   %fr9L,%fr28R,%fr31  ; Cycle 10
   1.145 +        ADD,DC  %r3,%r4,%r4
   1.146 +        FSTD    %fr24,-96(%sp)
   1.147 +
   1.148 +        XMPYU   %fr9R,%fr28L,%fr30  ; Cycle 11
   1.149 +        ADD,DC  %r0,%r20,%r20
   1.150 +        LDD     0(%r23),%r3         ; fetch a result word
   1.151 +        FSTD    %fr25,-80(%sp)
   1.152 +
   1.153 +        LDO     SIXTEEN(%r24),%r24  ; Cycle 12
   1.154 +        FSTD    %fr31,-64(%sp)
   1.155 +
   1.156 +        XMPYU   %fr9R,%fr28R,%fr29  ; Cycle 13
   1.157 +        ADD     %r0,%r0,%r0         ; clear the carry bit
   1.158 +        ADDIB,<= -4,%r26,$ENDLOOP   ; actually happens in cycle 12
   1.159 +        FSTD    %fr27,-48(%sp)
   1.160 +;        MFCTL   %cr16,%r21         ; for timing
   1.161 +;        STD     %r21,-112(%sp)
   1.162 +
   1.163 +; Here is the loop.
   1.164 +
   1.165 +$LOOP   XMPYU   %fr9L,%fr28L,%fr28  ; Cycle 1
   1.166 +        ADD,DC  %r29,%r4,%r4
   1.167 +        FSTD    %fr30,-56(%sp)
   1.168 +        FLDD    0(%r24),%fr24
   1.169 +
   1.170 +        LDO     SIXTEEN(%r23),%r23  ; Cycle 2
   1.171 +        ADD,DC  %r0,%r20,%r20
   1.172 +        FSTD    %fr26,-88(%sp)
   1.173 +
   1.174 +        XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
   1.175 +        ADD     %r3,%r1,%r1
   1.176 +        FSTD    %fr28,-104(%sp)
   1.177 +        LDD     UN_EIGHT(%r23),%r21
   1.178 +
   1.179 +        XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
   1.180 +        ADD,DC  %r21,%r4,%r28
   1.181 +        FSTD    %fr29,-72(%sp)    
   1.182 +        LDD     -96(%sp),%r3
   1.183 +
   1.184 +        XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
   1.185 +        ADD,DC  %r20,%r31,%r22
   1.186 +        LDD     -64(%sp),%r19
   1.187 +        LDD     -80(%sp),%r21
   1.188 +
   1.189 +        XMPYU   %fr9L,%fr24R,%fr24  ; Cycle 6
   1.190 +        ADD     %r21,%r3,%r3
   1.191 +        LDD     -56(%sp),%r20
   1.192 +        STD     %r1,UN_SIXTEEN(%r23) ; write back an updated result word
   1.193 +
   1.194 +        ADD,DC  %r20,%r19,%r19      ; Cycle 7
   1.195 +        SHRPD   %r3,%r0,32,%r21
   1.196 +        LDD     -88(%sp),%r4
   1.197 +        LDD     -48(%sp),%r1
   1.198 +
   1.199 +        ADD,DC  %r0,%r0,%r20        ; Cycle 8
   1.200 +        SHRPD   %r19,%r3,32,%r3
   1.201 +        FLDD    EIGHT(%r24),%fr28
   1.202 +        LDD     -104(%sp),%r31
   1.203 +
   1.204 +        SHRPD   %r20,%r19,32,%r20   ; Cycle 9
   1.205 +        ADD     %r21,%r1,%r1
   1.206 +        STD     %r28,UN_EIGHT(%r23) ; write back an updated result word
   1.207 +        LDD     -72(%sp),%r29
   1.208 +
   1.209 +        XMPYU   %fr9L,%fr28R,%fr31  ; Cycle 10
   1.210 +        ADD,DC  %r3,%r4,%r4
   1.211 +        FSTD    %fr24,-96(%sp)
   1.212 +
   1.213 +        XMPYU   %fr9R,%fr28L,%fr30  ; Cycle 11
   1.214 +        ADD,DC  %r0,%r20,%r20
   1.215 +        FSTD    %fr25,-80(%sp)
   1.216 +        LDD     0(%r23),%r3
   1.217 +
   1.218 +        LDO     SIXTEEN(%r24),%r24  ; Cycle 12
   1.219 +        FSTD    %fr31,-64(%sp)
   1.220 +
   1.221 +        XMPYU   %fr9R,%fr28R,%fr29  ; Cycle 13
   1.222 +        ADD     %r22,%r1,%r1
   1.223 +        ADDIB,> -2,%r26,$LOOP       ; actually happens in cycle 12
   1.224 +        FSTD    %fr27,-48(%sp)
   1.225 +
   1.226 +$ENDLOOP                            ; pipeline drain starts here
   1.227 +
   1.228 +; Shutdown code, first stage.
   1.229 +
   1.230 +;        MFCTL   %cr16,%r21         ; for timing
   1.231 +;        STD     %r21,UN_SIXTEEN(%r23)
   1.232 +;        LDD     -112(%sp),%r21
   1.233 +;        STD     %r21,UN_EIGHT(%r23)
   1.234 +
   1.235 +        XMPYU   %fr9L,%fr28L,%fr28  ; Cycle 1
   1.236 +        ADD,DC  %r29,%r4,%r4
   1.237 +        CMPIB,= 0,%r26,$ONEMORE     ; wrong parity: one more single pass
   1.238 +        FSTD    %fr30,-56(%sp)
   1.239 +
   1.240 +        LDO     SIXTEEN(%r23),%r23  ; Cycle 2
   1.241 +        ADD,DC  %r0,%r20,%r20
   1.242 +        FSTD    %fr26,-88(%sp)
   1.243 +
   1.244 +        ADD     %r3,%r1,%r1         ; Cycle 3
   1.245 +        FSTD    %fr28,-104(%sp)
   1.246 +        LDD     UN_EIGHT(%r23),%r21
   1.247 +
   1.248 +        ADD,DC  %r21,%r4,%r28       ; Cycle 4
   1.249 +        FSTD    %fr29,-72(%sp)    
   1.250 +        STD     %r28,UN_EIGHT(%r23) ; moved up from cycle 9
   1.251 +        LDD     -96(%sp),%r3
   1.252 +
   1.253 +        ADD,DC  %r20,%r31,%r22      ; Cycle 5
   1.254 +        STD     %r1,UN_SIXTEEN(%r23)
   1.255 +$JOIN4                              ; the N = 2 path enters here
   1.256 +        LDD     -64(%sp),%r19
   1.257 +        LDD     -80(%sp),%r21
   1.258 +
   1.259 +        ADD     %r21,%r3,%r3        ; Cycle 6
   1.260 +        LDD     -56(%sp),%r20
   1.261 +
   1.262 +        ADD,DC  %r20,%r19,%r19      ; Cycle 7
   1.263 +        SHRPD   %r3,%r0,32,%r21
   1.264 +        LDD     -88(%sp),%r4
   1.265 +        LDD     -48(%sp),%r1
   1.266 +
   1.267 +        ADD,DC  %r0,%r0,%r20        ; Cycle 8
   1.268 +        SHRPD   %r19,%r3,32,%r3
   1.269 +        LDD     -104(%sp),%r31
   1.270 +
   1.271 +        SHRPD   %r20,%r19,32,%r20   ; Cycle 9
   1.272 +        ADD     %r21,%r1,%r1
   1.273 +        LDD     -72(%sp),%r29
   1.274 +
   1.275 +        ADD,DC  %r3,%r4,%r4         ; Cycle 10
   1.276 +
   1.277 +        ADD,DC  %r0,%r20,%r20       ; Cycle 11
   1.278 +        LDD     0(%r23),%r3
   1.279 +
   1.280 +        ADD     %r22,%r1,%r1        ; Cycle 13
   1.281 +
   1.282 +; Shutdown code, second stage.
   1.283 +
   1.284 +        ADD,DC  %r29,%r4,%r4        ; Cycle 1
   1.285 +
   1.286 +        LDO     SIXTEEN(%r23),%r23  ; Cycle 2
   1.287 +        ADD,DC  %r0,%r20,%r20
   1.288 +
   1.289 +        LDD     UN_EIGHT(%r23),%r21 ; Cycle 3
   1.290 +        ADD     %r3,%r1,%r1
   1.291 +
   1.292 +        ADD,DC  %r21,%r4,%r28       ; Cycle 4
   1.293 +
   1.294 +        ADD,DC  %r20,%r31,%r22      ; Cycle 5
   1.295 +
   1.296 +        STD     %r1,UN_SIXTEEN(%r23); Cycle 6
   1.297 +
   1.298 +        STD     %r28,UN_EIGHT(%r23) ; Cycle 9
   1.299 +
   1.300 +        LDD     0(%r23),%r3         ; Cycle 11
   1.301 +
   1.302 +; Shutdown code, third stage.
   1.303 +
   1.304 +        LDO     SIXTEEN(%r23),%r23
   1.305 +        ADD     %r3,%r22,%r1
   1.306 +$JOIN1  ADD,DC  %r0,%r0,%r21        ; r21 = carry out; stage 2-1/2 rejoins here
   1.307 +        CMPIB,*= 0,%r21,$L0         ; if no overflow, exit
   1.308 +        STD     %r1,UN_SIXTEEN(%r23) ; delay slot: store the last computed word
   1.309 +
   1.310 +; Final carry propagation
   1.311 +
   1.312 +$FINAL1 LDO     EIGHT(%r23),%r23    ; step to the next result word
   1.313 +        LDD     UN_SIXTEEN(%r23),%r21
   1.314 +        ADDI    1,%r21,%r21         ; add the carry
   1.315 +        CMPIB,*= 0,%r21,$FINAL1     ; Keep looping if there is a carry.
   1.316 +        STD     %r21,UN_SIXTEEN(%r23) ; delay slot: store the incremented word
   1.317 +        B       $L0                 ; carry absorbed: exit
   1.318 +        NOP
   1.319 +
   1.320 +; Here is the code that handles the difficult cases N=1, N=2, and N=3.
   1.321 +; We do the usual trick -- branch out of the startup code at appropriate
   1.322 +; points, and branch into the shutdown code.
   1.323 +
   1.324 +$N_IS_SMALL
   1.325 +        CMPIB,= 0,%r26,$N_IS_ONE    ; r26 holds N-1 here, so 0 means N = 1
   1.326 +        FSTD    %fr24,-96(%sp)      ; Cycle 10
   1.327 +        FLDD    EIGHT(%r24),%fr28   ; Cycle 8
   1.328 +        XMPYU   %fr9L,%fr28R,%fr31  ; Cycle 10
   1.329 +        XMPYU   %fr9R,%fr28L,%fr30  ; Cycle 11
   1.330 +        FSTD    %fr25,-80(%sp)
   1.331 +        FSTD    %fr31,-64(%sp)      ; Cycle 12
   1.332 +        XMPYU   %fr9R,%fr28R,%fr29  ; Cycle 13
   1.333 +        FSTD    %fr27,-48(%sp)
   1.334 +        XMPYU   %fr9L,%fr28L,%fr28  ; Cycle 1
   1.335 +        CMPIB,= 2,%r26,$N_IS_THREE  ; r26 = 2 means N = 3
   1.336 +        FSTD    %fr30,-56(%sp)
   1.337 +
   1.338 +; N = 2
   1.339 +        FSTD    %fr26,-88(%sp)      ; Cycle 2
   1.340 +        FSTD    %fr28,-104(%sp)     ; Cycle 3
   1.341 +        LDD     -96(%sp),%r3        ; Cycle 4
   1.342 +        FSTD    %fr29,-72(%sp)
   1.343 +        B       $JOIN4
   1.344 +        ADD     %r0,%r0,%r22        ; delay slot: r22 = 0, carry cleared
   1.345 +
   1.346 +$N_IS_THREE
   1.347 +        FLDD    SIXTEEN(%r24),%fr24
   1.348 +        FSTD    %fr26,-88(%sp)      ; Cycle 2
   1.349 +        XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
   1.350 +        FSTD    %fr28,-104(%sp)
   1.351 +        XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
   1.352 +        LDD     -96(%sp),%r3
   1.353 +        FSTD    %fr29,-72(%sp)
   1.354 +        XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
   1.355 +        LDD     -64(%sp),%r19
   1.356 +        LDD     -80(%sp),%r21
   1.357 +        B       $JOIN3
   1.358 +        ADD     %r0,%r0,%r22        ; delay slot: r22 = 0, carry cleared
   1.359 +
   1.360 +$N_IS_ONE
   1.361 +        FSTD    %fr25,-80(%sp)
   1.362 +        FSTD    %fr27,-48(%sp)
   1.363 +        FSTD    %fr26,-88(%sp)      ; Cycle 2
   1.364 +        B       $JOIN5
   1.365 +        ADD     %r0,%r0,%r22        ; delay slot: r22 = 0, carry cleared
   1.366 +
   1.367 +; We came out of the unrolled loop with wrong parity.  Do one more
   1.368 +; single cycle.  This is quite tricky, because of the way the
   1.369 +; carry chains and SHRPD chains have been chopped up.
   1.370 +
   1.371 +$ONEMORE
   1.372 +
   1.373 +        FLDD    0(%r24),%fr24
   1.374 +
   1.375 +        LDO     SIXTEEN(%r23),%r23  ; Cycle 2
   1.376 +        ADD,DC  %r0,%r20,%r20
   1.377 +        FSTD    %fr26,-88(%sp)
   1.378 +
   1.379 +        XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
   1.380 +        FSTD    %fr28,-104(%sp)
   1.381 +        LDD     UN_EIGHT(%r23),%r21
   1.382 +        ADD     %r3,%r1,%r1
   1.383 +
   1.384 +        XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
   1.385 +        ADD,DC  %r21,%r4,%r28
   1.386 +        STD     %r28,UN_EIGHT(%r23) ; moved from cycle 9
   1.387 +        LDD     -96(%sp),%r3
   1.388 +        FSTD    %fr29,-72(%sp)    
   1.389 +
   1.390 +        XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
   1.391 +        ADD,DC  %r20,%r31,%r22
   1.392 +        LDD     -64(%sp),%r19
   1.393 +        LDD     -80(%sp),%r21
   1.394 +
   1.395 +        STD     %r1,UN_SIXTEEN(%r23); Cycle 6
   1.396 +$JOIN3                              ; the N = 3 path enters here
   1.397 +        XMPYU   %fr9L,%fr24R,%fr24
   1.398 +        LDD     -56(%sp),%r20
   1.399 +        ADD     %r21,%r3,%r3
   1.400 +
   1.401 +        ADD,DC  %r20,%r19,%r19      ; Cycle 7
   1.402 +        LDD     -88(%sp),%r4
   1.403 +        SHRPD   %r3,%r0,32,%r21
   1.404 +        LDD     -48(%sp),%r1
   1.405 +
   1.406 +        LDD     -104(%sp),%r31      ; Cycle 8
   1.407 +        ADD,DC  %r0,%r0,%r20
   1.408 +        SHRPD   %r19,%r3,32,%r3
   1.409 +
   1.410 +        LDD     -72(%sp),%r29       ; Cycle 9
   1.411 +        SHRPD   %r20,%r19,32,%r20
   1.412 +        ADD     %r21,%r1,%r1
   1.413 +
   1.414 +        ADD,DC  %r3,%r4,%r4         ; Cycle 10
   1.415 +        FSTD    %fr24,-96(%sp)
   1.416 +
   1.417 +        ADD,DC  %r0,%r20,%r20       ; Cycle 11
   1.418 +        LDD     0(%r23),%r3
   1.419 +        FSTD    %fr25,-80(%sp)
   1.420 +
   1.421 +        ADD     %r22,%r1,%r1        ; Cycle 13
   1.422 +        FSTD    %fr27,-48(%sp)
   1.423 +
   1.424 +; Shutdown code, stage 1-1/2.
   1.425 +
   1.426 +        ADD,DC  %r29,%r4,%r4        ; Cycle 1
   1.427 +
   1.428 +        LDO     SIXTEEN(%r23),%r23  ; Cycle 2
   1.429 +        ADD,DC  %r0,%r20,%r20     
   1.430 +        FSTD    %fr26,-88(%sp)
   1.431 +
   1.432 +        LDD     UN_EIGHT(%r23),%r21 ; Cycle 3
   1.433 +        ADD     %r3,%r1,%r1
   1.434 +
   1.435 +        ADD,DC  %r21,%r4,%r28       ; Cycle 4
   1.436 +        STD     %r28,UN_EIGHT(%r23) ; moved from cycle 9
   1.437 +
   1.438 +        ADD,DC  %r20,%r31,%r22      ; Cycle 5
   1.439 +        STD     %r1,UN_SIXTEEN(%r23)
   1.440 +$JOIN5                              ; the N = 1 path enters here
   1.441 +        LDD     -96(%sp),%r3        ; moved from cycle 4
   1.442 +        LDD     -80(%sp),%r21
   1.443 +        ADD     %r21,%r3,%r3        ; Cycle 6
   1.444 +        ADD,DC  %r0,%r0,%r19        ; Cycle 7
   1.445 +        LDD     -88(%sp),%r4
   1.446 +        SHRPD   %r3,%r0,32,%r21
   1.447 +        LDD     -48(%sp),%r1
   1.448 +        SHRPD   %r19,%r3,32,%r3     ; Cycle 8
   1.449 +        ADD     %r21,%r1,%r1        ; Cycle 9
   1.450 +        ADD,DC  %r3,%r4,%r4         ; Cycle 10
   1.451 +        LDD     0(%r23),%r3         ; Cycle 11
   1.452 +        ADD     %r22,%r1,%r1        ; Cycle 13
   1.453 +
   1.454 +; Shutdown code, stage 2-1/2.
   1.455 +
   1.456 +        ADD,DC  %r0,%r4,%r4         ; Cycle 1
   1.457 +        LDO     SIXTEEN(%r23),%r23  ; Cycle 2
   1.458 +        LDD     UN_EIGHT(%r23),%r21 ; Cycle 3
   1.459 +        ADD     %r3,%r1,%r1
   1.460 +        STD     %r1,UN_SIXTEEN(%r23)
   1.461 +        ADD,DC  %r21,%r4,%r1
   1.462 +        B       $JOIN1              ; finish through the common carry check
   1.463 +        LDO     EIGHT(%r23),%r23    ; delay slot
   1.464 +
   1.465 +; exit
   1.466 +
   1.467 +$L0
   1.468 +        LDW     -124(%sp),%r4       ; restore r4
   1.469 +        BVE     (%r2)               ; return to the caller
   1.470 +        .EXIT
   1.471 +        LDW,MB  -128(%sp),%r3       ; delay slot: pop the frame and restore r3
   1.472 +
   1.473 +        .PROCEND
   1.474 +
   1.475 +; ***************************************************************
   1.476 +;
   1.477 +;                 add_diag_[little/big]
   1.478 +;
   1.479 +; ***************************************************************
   1.480 +
   1.481 +; The arguments are as follows:
   1.482 +;     r2   return PC, of course
   1.483 +;     r26 = arg1 =  length
   1.484 +;     r25 = arg2 =  vector to square
   1.485 +;     r24 = arg3 =  result vector
   1.486 +
   1.487 +#ifdef LITTLE_WORDIAN
   1.488 +add_diag_little
   1.489 +#else
   1.490 +add_diag_big
   1.491 +#endif
   1.492 +        .PROC                       ; add the square of each input word into the result vector
   1.493 +        .CALLINFO FRAME=120,ENTRY_GR=4
   1.494 +        .ENTRY
   1.495 +        STW,MA  %r3,128(%sp)        ; save r3, then advance sp by 128
   1.496 +        STW     %r4,-124(%sp)       ; save r4 in the new frame
   1.497 +
   1.498 +        ADDIB,< -1,%r26,$Z0         ; If N=0, exit immediately.
   1.499 +        NOP                         ; delay slot
   1.500 +
   1.501 +; Startup code
   1.502 +
   1.503 +        FLDD    0(%r25),%fr7        ; Cycle 2 (alternate body)
   1.504 +        XMPYU   %fr7R,%fr7R,%fr29   ; Cycle 4
   1.505 +        XMPYU   %fr7L,%fr7R,%fr27   ; Cycle 5
   1.506 +        XMPYU   %fr7L,%fr7L,%fr30
   1.507 +        LDO     SIXTEEN(%r25),%r25  ; Cycle 6
   1.508 +        FSTD    %fr29,-88(%sp)
   1.509 +        FSTD    %fr27,-72(%sp)      ; Cycle 7
   1.510 +        CMPIB,= 0,%r26,$DIAG_N_IS_ONE ; Cycle 1 (main body)
   1.511 +        FSTD    %fr30,-96(%sp)
   1.512 +        FLDD    UN_EIGHT(%r25),%fr7 ; Cycle 2
   1.513 +        LDD     -88(%sp),%r22       ; Cycle 3
   1.514 +        LDD     -72(%sp),%r31       ; Cycle 4
   1.515 +        XMPYU   %fr7R,%fr7R,%fr28
   1.516 +        XMPYU   %fr7L,%fr7R,%fr24   ; Cycle 5
   1.517 +        XMPYU   %fr7L,%fr7L,%fr31
   1.518 +        LDD     -96(%sp),%r20       ; Cycle 6
   1.519 +        FSTD    %fr28,-80(%sp)
   1.520 +        ADD     %r0,%r0,%r0         ; clear the carry bit
   1.521 +        ADDIB,<= -2,%r26,$ENDDIAGLOOP ; Cycle 7
   1.522 +        FSTD    %fr24,-64(%sp)
   1.523 +
   1.524 +; Here is the loop.  It is unrolled twice, modelled after the "alternate body" and then the "main body".
   1.525 +
   1.526 +$DIAGLOOP
   1.527 +        SHRPD   %r31,%r0,31,%r3     ; Cycle 1 (alternate body)
   1.528 +        LDO     SIXTEEN(%r25),%r25
   1.529 +        LDD     0(%r24),%r1
   1.530 +        FSTD    %fr31,-104(%sp)
   1.531 +        SHRPD   %r0,%r31,31,%r4     ; Cycle 2
   1.532 +        ADD,DC  %r22,%r3,%r3
   1.533 +        FLDD    UN_SIXTEEN(%r25),%fr7   
   1.534 +        ADD,DC  %r0,%r20,%r20       ; Cycle 3
   1.535 +        ADD     %r1,%r3,%r3
   1.536 +        XMPYU   %fr7R,%fr7R,%fr29   ; Cycle 4
   1.537 +        LDD     -80(%sp),%r21
   1.538 +        STD     %r3,0(%r24)         ; write back an updated result word
   1.539 +        XMPYU   %fr7L,%fr7R,%fr27   ; Cycle 5
   1.540 +        XMPYU   %fr7L,%fr7L,%fr30
   1.541 +        LDD     -64(%sp),%r29       
   1.542 +        LDD     EIGHT(%r24),%r1  
   1.543 +        ADD,DC  %r4,%r20,%r20       ; Cycle 6
   1.544 +        LDD     -104(%sp),%r19
   1.545 +        FSTD    %fr29,-88(%sp)
   1.546 +        ADD     %r20,%r1,%r1        ; Cycle 7
   1.547 +        FSTD    %fr27,-72(%sp)
   1.548 +        SHRPD   %r29,%r0,31,%r4     ; Cycle 1 (main body)
   1.549 +        LDO     THIRTY_TWO(%r24),%r24
   1.550 +        LDD     UN_SIXTEEN(%r24),%r28
   1.551 +        FSTD    %fr30,-96(%sp)
   1.552 +        SHRPD   %r0,%r29,31,%r3     ; Cycle 2
   1.553 +        ADD,DC  %r21,%r4,%r4
   1.554 +        FLDD    UN_EIGHT(%r25),%fr7
   1.555 +        STD     %r1,UN_TWENTY_FOUR(%r24)
   1.556 +        ADD,DC  %r0,%r19,%r19       ; Cycle 3
   1.557 +        ADD     %r28,%r4,%r4
   1.558 +        XMPYU   %fr7R,%fr7R,%fr28   ; Cycle 4
   1.559 +        LDD     -88(%sp),%r22
   1.560 +        STD     %r4,UN_SIXTEEN(%r24)
   1.561 +        XMPYU   %fr7L,%fr7R,%fr24   ; Cycle 5
   1.562 +        XMPYU   %fr7L,%fr7L,%fr31
   1.563 +        LDD     -72(%sp),%r31
   1.564 +        LDD     UN_EIGHT(%r24),%r28
   1.565 +        ADD,DC  %r3,%r19,%r19       ; Cycle 6
   1.566 +        LDD     -96(%sp),%r20
   1.567 +        FSTD    %fr28,-80(%sp)
   1.568 +        ADD     %r19,%r28,%r28      ; Cycle 7
   1.569 +        FSTD    %fr24,-64(%sp)
   1.570 +        ADDIB,> -2,%r26,$DIAGLOOP   ; Cycle 8
   1.571 +        STD     %r28,UN_EIGHT(%r24)
   1.572 +
   1.573 +$ENDDIAGLOOP                        ; pipeline drain starts here
   1.574 +
   1.575 +        ADD,DC  %r0,%r22,%r22    
   1.576 +        CMPIB,= 0,%r26,$ONEMOREDIAG ; wrong parity: one more single pass
   1.577 +        SHRPD   %r31,%r0,31,%r3
   1.578 +
   1.579 +; Shutdown code, first stage.
   1.580 +
   1.581 +        FSTD    %fr31,-104(%sp)     ; Cycle 1 (alternate body)
   1.582 +        LDD     0(%r24),%r28
   1.583 +        SHRPD   %r0,%r31,31,%r4     ; Cycle 2
   1.584 +        ADD     %r3,%r22,%r3
   1.585 +        ADD,DC  %r0,%r20,%r20       ; Cycle 3
   1.586 +        LDD     -80(%sp),%r21
   1.587 +        ADD     %r3,%r28,%r3
   1.588 +        LDD     -64(%sp),%r29       ; Cycle 4
   1.589 +        STD     %r3,0(%r24)
   1.590 +        LDD     EIGHT(%r24),%r1     ; Cycle 5
   1.591 +        LDO     SIXTEEN(%r25),%r25  ; Cycle 6
   1.592 +        LDD     -104(%sp),%r19
   1.593 +        ADD,DC  %r4,%r20,%r20
   1.594 +        ADD     %r20,%r1,%r1        ; Cycle 7
   1.595 +        ADD,DC  %r0,%r21,%r21       ; Cycle 8
   1.596 +        STD     %r1,EIGHT(%r24)
   1.597 +
   1.598 +; Shutdown code, second stage.
   1.599 +
   1.600 +        SHRPD   %r29,%r0,31,%r4     ; Cycle 1 (main body)
   1.601 +        LDO     THIRTY_TWO(%r24),%r24
   1.602 +        LDD     UN_SIXTEEN(%r24),%r1
   1.603 +        SHRPD   %r0,%r29,31,%r3      ; Cycle 2
   1.604 +        ADD     %r4,%r21,%r4
   1.605 +        ADD,DC  %r0,%r19,%r19       ; Cycle 3
   1.606 +        ADD     %r4,%r1,%r4
   1.607 +        STD     %r4,UN_SIXTEEN(%r24); Cycle 4
   1.608 +        LDD     UN_EIGHT(%r24),%r28 ; Cycle 5
   1.609 +        ADD,DC  %r3,%r19,%r19       ; Cycle 6       
   1.610 +        ADD     %r19,%r28,%r28      ; Cycle 7
   1.611 +        ADD,DC  %r0,%r0,%r22        ; Cycle 8
   1.612 +        CMPIB,*= 0,%r22,$Z0         ; if no overflow, exit
   1.613 +        STD     %r28,UN_EIGHT(%r24) ; delay slot: store the last computed word
   1.614 +
   1.615 +; Final carry propagation
   1.616 +
   1.617 +$FDIAG2
   1.618 +        LDO     EIGHT(%r24),%r24    ; step to the next result word
   1.619 +        LDD     UN_EIGHT(%r24),%r26
   1.620 +        ADDI    1,%r26,%r26         ; add the carry
   1.621 +        CMPIB,*= 0,%r26,$FDIAG2     ; Keep looping if there is a carry.
   1.622 +        STD     %r26,UN_EIGHT(%r24) ; delay slot: store the incremented word
   1.623 +
   1.624 +        B   $Z0                     ; carry absorbed: exit
   1.625 +        NOP                         ; delay slot
   1.626 +
   1.627 +; Here is the code that handles the difficult case N=1.
   1.628 +; We do the usual trick -- branch out of the startup code at appropriate
   1.629 +; points, and branch into the shutdown code.
   1.630 +
   1.631 +$DIAG_N_IS_ONE
   1.632 +
   1.633 +        LDD     -88(%sp),%r22
   1.634 +        LDD     -72(%sp),%r31
   1.635 +        B       $JOINDIAG
   1.636 +        LDD     -96(%sp),%r20       ; delay slot
   1.637 +
   1.638 +; We came out of the unrolled loop with wrong parity.  Do one more
   1.639 +; single cycle.  This is the "alternate body".  It will, of course,
   1.640 +; give us opposite registers from the other case, so we need
   1.641 +; completely different shutdown code.
   1.642 +
   1.643 +$ONEMOREDIAG
   1.644 +        FSTD    %fr31,-104(%sp)     ; Cycle 1 (alternate body)
   1.645 +        LDD     0(%r24),%r28
   1.646 +        FLDD    0(%r25),%fr7        ; Cycle 2
   1.647 +        SHRPD   %r0,%r31,31,%r4
   1.648 +        ADD     %r3,%r22,%r3
   1.649 +        ADD,DC  %r0,%r20,%r20       ; Cycle 3
   1.650 +        LDD     -80(%sp),%r21
   1.651 +        ADD     %r3,%r28,%r3
   1.652 +        LDD     -64(%sp),%r29       ; Cycle 4
   1.653 +        STD     %r3,0(%r24)
   1.654 +        XMPYU   %fr7R,%fr7R,%fr29
   1.655 +        LDD     EIGHT(%r24),%r1     ; Cycle 5
   1.656 +        XMPYU   %fr7L,%fr7R,%fr27
   1.657 +        XMPYU   %fr7L,%fr7L,%fr30
   1.658 +        LDD     -104(%sp),%r19      ; Cycle 6
   1.659 +        FSTD    %fr29,-88(%sp)
   1.660 +        ADD,DC  %r4,%r20,%r20
   1.661 +        FSTD    %fr27,-72(%sp)      ; Cycle 7
   1.662 +        ADD     %r20,%r1,%r1
   1.663 +        ADD,DC  %r0,%r21,%r21       ; Cycle 8
   1.664 +        STD     %r1,EIGHT(%r24)
   1.665 +
   1.666 +; Shutdown code, first stage.
   1.667 +
   1.668 +        SHRPD   %r29,%r0,31,%r4     ; Cycle 1 (main body)
   1.669 +        LDO     THIRTY_TWO(%r24),%r24
   1.670 +        FSTD    %fr30,-96(%sp)
   1.671 +        LDD     UN_SIXTEEN(%r24),%r1
   1.672 +        SHRPD   %r0,%r29,31,%r3     ; Cycle 2
   1.673 +        ADD     %r4,%r21,%r4
   1.674 +        ADD,DC  %r0,%r19,%r19       ; Cycle 3
   1.675 +        LDD     -88(%sp),%r22
   1.676 +        ADD     %r4,%r1,%r4
   1.677 +        LDD     -72(%sp),%r31       ; Cycle 4
   1.678 +        STD     %r4,UN_SIXTEEN(%r24)
   1.679 +        LDD     UN_EIGHT(%r24),%r28 ; Cycle 5
   1.680 +        LDD     -96(%sp),%r20       ; Cycle 6
   1.681 +        ADD,DC  %r3,%r19,%r19
   1.682 +        ADD     %r19,%r28,%r28      ; Cycle 7
   1.683 +        ADD,DC  %r0,%r22,%r22       ; Cycle 8
   1.684 +        STD     %r28,UN_EIGHT(%r24)
   1.685 +
   1.686 +; Shutdown code, second stage.
   1.687 +
   1.688 +$JOINDIAG                           ; the N = 1 path enters here
   1.689 +        SHRPD   %r31,%r0,31,%r3     ; Cycle 1 (alternate body)
   1.690 +        LDD     0(%r24),%r28        
   1.691 +        SHRPD   %r0,%r31,31,%r4     ; Cycle 2
   1.692 +        ADD     %r3,%r22,%r3
   1.693 +        ADD,DC  %r0,%r20,%r20       ; Cycle 3
   1.694 +        ADD     %r3,%r28,%r3
   1.695 +        STD     %r3,0(%r24)         ; Cycle 4
   1.696 +        LDD     EIGHT(%r24),%r1     ; Cycle 5
   1.697 +        ADD,DC  %r4,%r20,%r20
   1.698 +        ADD     %r20,%r1,%r1        ; Cycle 7
   1.699 +        ADD,DC  %r0,%r0,%r21        ; Cycle 8
   1.700 +        CMPIB,*= 0,%r21,$Z0         ; if no overflow, exit
   1.701 +        STD     %r1,EIGHT(%r24)     ; delay slot: store the last computed word
   1.702 +
   1.703 +; Final carry propagation
   1.704 +
   1.705 +$FDIAG1
   1.706 +        LDO     EIGHT(%r24),%r24    ; step to the next result word
   1.707 +        LDD     EIGHT(%r24),%r26
   1.708 +        ADDI    1,%r26,%r26         ; add the carry
   1.709 +        CMPIB,*= 0,%r26,$FDIAG1    ; Keep looping if there is a carry.
   1.710 +        STD     %r26,EIGHT(%r24)    ; delay slot: store the incremented word
   1.711 +
   1.712 +$Z0
   1.713 +        LDW     -124(%sp),%r4       ; restore r4
   1.714 +        BVE     (%r2)               ; return to the caller
   1.715 +        .EXIT
   1.716 +        LDW,MB  -128(%sp),%r3       ; delay slot: pop the frame and restore r3
   1.717 +        .PROCEND
   1.718 +;	.ALLOW
   1.719 +
   1.720 +        .SPACE         $TEXT$
   1.721 +        .SUBSPA        $CODE$
   1.722 +#ifdef LITTLE_WORDIAN
   1.723 +#ifdef __GNUC__
   1.724 +; GNU-as (as of 2.19) does not support LONG_RETURN
   1.725 +        .EXPORT        maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
   1.726 +        .EXPORT        add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR
   1.727 +#else
   1.728 +        .EXPORT        maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN
   1.729 +        .EXPORT        add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN
   1.730 +#endif
   1.731 +#else
   1.732 +        .EXPORT        maxpy_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN ; NOTE(review): no GNU-as guard on this branch, unlike the little-wordian exports above -- confirm
   1.733 +        .EXPORT        add_diag_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN
   1.734 +#endif
   1.735 +        .END
   1.736 +
   1.737 +
   1.738 +; How to use "maxpy_PA20_little" and "maxpy_PA20_big" (named "maxpy_little" and "maxpy_big" in this file)
   1.739 +; 
   1.740 +; The routine "maxpy_PA20_little" or "maxpy_PA20_big"
   1.741 +; performs a 64-bit x any-size multiply, and adds the
   1.742 +; result to an area of memory.  That is, it performs
   1.743 +; something like
   1.744 +; 
   1.745 +;      A B C D
   1.746 +;    *       Z
   1.747 +;   __________
   1.748 +;    P Q R S T
   1.749 +; 
   1.750 +; and then adds the "PQRST" vector into an area of memory,
   1.751 +; handling all carries.
   1.752 +; 
   1.753 +; Digression on nomenclature and endian-ness:
   1.754 +; 
   1.755 +; Each of the capital letters in the above represents a 64-bit
   1.756 +; quantity.  That is, you could think of the discussion as
   1.757 +; being in terms of radix-16-quintillion arithmetic.  The data
   1.758 +; type being manipulated is "unsigned long long int".  This
   1.759 +; requires the 64-bit extension of the HP-UX C compiler,
   1.760 +; available at release 10.  You need these compiler flags to
   1.761 +; enable these extensions:
   1.762 +; 
   1.763 +;       -Aa +e +DA2.0 +DS2.0
   1.764 +; 
   1.765 +; (The first specifies ANSI C, the second enables the
   1.766 +; extensions, which are beyond ANSI C, and the third and
   1.767 +; fourth tell the compiler to use whatever features of the
   1.768 +; PA2.0 architecture it wishes, in order to make the code more
   1.769 +; efficient.  Since the presence of the assembly code will
   1.770 +; make the program unable to run on anything less than PA2.0,
   1.771 +; you might as well gain the performance enhancements in the C
   1.772 +; code as well.)
   1.773 +; 
   1.774 +; Questions of "endian-ness" often come up, usually in the
   1.775 +; context of byte ordering in a word.  These routines have a
   1.776 +; similar issue, that could be called "wordian-ness".
   1.777 +; Independent of byte ordering (PA is always big-endian), one
   1.778 +; can make two choices when representing extremely large
   1.779 +; numbers as arrays of 64-bit doublewords in memory.
   1.780 +; 
   1.781 +; "Little-wordian" layout means that the least significant
   1.782 +; word of a number is stored at the lowest address.
   1.783 +; 
   1.784 +;   MSW     LSW
   1.785 +;    |       |
   1.786 +;    V       V
   1.787 +; 
   1.788 +;    A B C D E
   1.789 +; 
   1.790 +;    ^     ^ ^
   1.791 +;    |     | |____ address 0
   1.792 +;    |     |
   1.793 +;    |     |_______address 8
   1.794 +;    |
   1.795 +;    address 32
   1.796 +; 
   1.797 +; "Big-wordian" means that the most significant word is at the
   1.798 +; lowest address.
   1.799 +; 
   1.800 +;   MSW     LSW
   1.801 +;    |       |
   1.802 +;    V       V
   1.803 +; 
   1.804 +;    A B C D E
   1.805 +; 
   1.806 +;    ^     ^ ^
   1.807 +;    |     | |____ address 32
   1.808 +;    |     |
   1.809 +;    |     |_______address 24
   1.810 +;    |
   1.811 +;    address 0
   1.812 +; 
   1.813 +; When you compile the file, you must specify one or the other, with
   1.814 +; a switch "-DLITTLE_WORDIAN" or "-DBIG_WORDIAN".
   1.815 +; 
   1.816 +;     Incidentally, you assemble this file as part of your
   1.817 +;     project with the same C compiler as the rest of the program.
   1.818 +;     My "makefile" for a superprecision arithmetic package has
   1.819 +;     the following stuff:
   1.820 +; 
   1.821 +;     # definitions:
   1.822 +;     CC = cc -Aa +e -z +DA2.0 +DS2.0 +w1
   1.823 +;     CFLAGS = +O3
   1.824 +;     LDFLAGS = -L /usr/lib -Wl,-aarchive
   1.825 +; 
   1.826 +;     # general build rule for ".s" files:
   1.827 +;     .s.o:
   1.828 +;             $(CC) $(CFLAGS) -c $< -DBIG_WORDIAN
   1.829 +; 
   1.830 +;     # Now any bind step that calls for pa20.o will assemble pa20.s
   1.831 +; 
   1.832 +; End of digression, back to arithmetic:
   1.833 +; 
   1.834 +; The way we multiply two huge numbers is, of course, to multiply
   1.835 +; the "ABCD" vector by each of the "WXYZ" doublewords, adding
   1.836 +; the result vectors with increasing offsets, the way we learned
   1.837 +; in school, back before we all used calculators:
   1.838 +; 
   1.839 +;            A B C D
   1.840 +;          * W X Y Z
   1.841 +;         __________
   1.842 +;          P Q R S T
   1.843 +;        E F G H I
   1.844 +;      M N O P Q
   1.845 +;  + R S T U V
   1.846 +;    _______________
   1.847 +;    F I N A L S U M
   1.848 +; 
   1.849 +; So we call maxpy_PA20_big (in my case; my package is
   1.850 +; big-wordian) repeatedly, giving the W, X, Y, and Z arguments
   1.851 +; in turn as the "scalar", and giving the "ABCD" vector each
   1.852 +; time.  We direct it to add its result into an area of memory
   1.853 +; that we have cleared at the start.  We skew the exact
   1.854 +; location into that area with each call.
   1.855 +; 
   1.856 +; The prototype for the function is
   1.857 +; 
   1.858 +; extern void maxpy_PA20_big(
   1.859 +;    int length,        /* Number of doublewords in the multiplicand vector. */
   1.860 +;    const long long int *scalaraddr,    /* Address to fetch the scalar. */
   1.861 +;    const long long int *multiplicand,  /* The multiplicand vector. */
   1.862 +;    long long int *result);             /* Where to accumulate the result. */
   1.863 +; 
   1.864 +; (You should place a copy of this prototype in an include file
   1.865 +; or in your C file.)
   1.866 +; 
   1.867 +; Now, IN ALL CASES, the given address for the multiplicand or
   1.868 +; the result is that of the LEAST SIGNIFICANT DOUBLEWORD.
   1.869 +; That word is, of course, the word at which the routine
   1.870 +; starts processing.  "maxpy_PA20_little" then increases the
   1.871 +; addresses as it computes.  "maxpy_PA20_big" decreases them.
   1.872 +; 
   1.873 +; In our example above, "length" would be 4 in each case.
   1.874 +; "multiplicand" would be the "ABCD" vector.  Specifically,
   1.875 +; the address of the element "D".  "scalaraddr" would be the
   1.876 +; address of "W", "X", "Y", or "Z" on the four calls that we
   1.877 +; would make.  (The order doesn't matter, of course.)
   1.878 +; "result" would be the appropriate address in the result
   1.879 +; area.  When multiplying by "Z", that would be the least
   1.880 +; significant word.  When multiplying by "Y", it would be the
   1.881 +; next higher word (8 bytes higher if little-wordian; 8 bytes
   1.882 +; lower if big-wordian), and so on.  The size of the result
   1.883 +; area must be the sum of the sizes of the multiplicand
   1.884 +; and multiplier vectors, and must be initialized to zero
   1.885 +; before we start.
   1.886 +; 
   1.887 +; Whenever the routine adds its partial product into the result
   1.888 +; vector, it follows carry chains as far as they need to go.
   1.889 +; 
   1.890 +; Here is the super-precision multiply routine that I use for
   1.891 +; my package.  The package is big-wordian.  I have taken out
   1.892 +; handling of exponents (it's a floating point package):
   1.893 +; 
   1.894 +; static void mul_PA20(
   1.895 +;   int size,
   1.896 +;   const long long int *arg1,
   1.897 +;   const long long int *arg2,
   1.898 +;   long long int *result)
   1.899 +; {
   1.900 +;    int i;
   1.901 +; 
   1.902 +;    for (i=0 ; i<2*size ; i++) result[i] = 0ULL;
   1.903 +; 
   1.904 +;    for (i=0 ; i<size ; i++) {
   1.905 +;       maxpy_PA20_big(size, &arg2[i], &arg1[size-1], &result[size+i]);
   1.906 +;    }
   1.907 +; }

mercurial