security/nss/lib/freebl/mpi/hpma512.s

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/security/nss/lib/freebl/mpi/hpma512.s	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,615 @@
     1.4 +/* This Source Code Form is subject to the terms of the Mozilla Public
     1.5 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.6 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.7 +/*
     1.8 + *
     1.9 + *  This PA-RISC 2.0 function computes the product of two unsigned integers,
    1.10 + *  and adds the result to a previously computed integer.  The multiplicand
    1.11 + *  is a 512-bit (64-byte, eight doubleword) unsigned integer, stored in
    1.12 + *  memory in little-double-wordian order.  The multiplier is an unsigned
    1.13 + *  64-bit integer.  The previously computed integer to which the product is
    1.14 + *  added is located in the result ("res") area, and is assumed to be a
    1.15 + *  576-bit (72-byte, nine doubleword) unsigned integer, stored in memory
    1.16 + *  in little-double-wordian order.  This value normally will be the result
    1.17 + *  of a previously computed nine doubleword result.  It is not necessary
    1.18 + *  to pad the multiplicand with an additional 64-bit zero doubleword.
    1.19 + *
    1.20 + *  Multiplicand, multiplier, and addend ideally should be aligned at
    1.21 + *  16-byte boundaries for best performance.  The code will function
    1.22 + *  correctly for alignment at eight-byte boundaries which are not 16-byte
    1.23 + *  boundaries, but the execution may be slightly slower due to even/odd
    1.24 + *  bank conflicts on PA-RISC 8000 processors.
    1.25 + *
    1.26 + *  This function is designed to accept the same calling sequence as Bill
    1.27 + *  Ackerman's "maxpy_little" function.  The carry from the ninth doubleword
    1.28 + *  of the result is added to the tenth doubleword of the result, as is done
    1.29 + *  by Bill Ackerman's function, and ripples onward while doublewords wrap to
    1.30 + *  zero.  The value returned is zero when no such carry occurs; it may be
    1.31 + *  ignored.  The function prototype may be either of the following:
    1.32 + *
    1.33 + *      void multacc512( int l, chunk* m, const chunk* a, chunk* res );
    1.34 + *          or
    1.35 + *      int multacc512( int l, chunk* m, const chunk* a, chunk* res );
    1.36 + *
    1.37 + *  where:  "l" originally denoted vector lengths.  This parameter is
    1.38 + *      ignored.  This function always assumes a multiplicand length of
    1.39 + *      512 bits (eight doublewords), and addend and result lengths of
    1.40 + *      576 bits (nine doublewords).
    1.41 + *
    1.42 + *      "m" is a pointer to the doubleword multiplier, ideally aligned
    1.43 + *      on a 16-byte boundary.
    1.44 + *
    1.45 + *      "a" is a pointer to the eight-doubleword multiplicand, stored
    1.46 + *      in little-double-wordian order, and ideally aligned on a 16-byte
    1.47 + *      boundary.
    1.48 + *
    1.49 + *      "res" is a pointer to the nine doubleword addend, and to the
    1.50 + *      nine-doubleword product computed by this function.  The result
    1.51 + *      also is stored in little-double-wordian order, and ideally is
    1.52 + *      aligned on a 16-byte boundary. It is expected that the alignment
    1.53 + *      of the "res" area may alternate between even/odd doubleword
    1.54 + *      boundaries for successive calls for 512-bit x 512-bit
    1.55 + *      multiplications.
    1.56 + *
    1.57 + *  The code for this function has been scheduled to use the parallelism
    1.58 + *  of the PA-RISC 8000 series microprocessors as well as the author was
    1.59 + *  able.  Comments and/or suggestions for improvement are welcomed.
    1.60 + *
    1.61 + *  The code is "64-bit safe".  This means it may be called in either
    1.62 + *  the 32ILP context or the 64LP context.  All 64-bits of registers are
    1.63 + *  saved and restored.
    1.64 + *
    1.65 + *  This code is self-contained.  It requires no other header files in order
    1.66 + *  to compile and to be linkable on a PA-RISC 2.0 machine.  Symbolic
    1.67 + *  definitions for registers and stack offsets are included within this
    1.68 + *  one source file.
    1.69 + *
    1.70 + *  This is a leaf routine.  As such, minimal use is made of the stack area.
    1.71 + *  Of the 192 bytes allocated, 64 bytes are used for saving/restoring eight
    1.72 + *  general registers, and 128 bytes are used to move intermediate products
    1.73 + *  from the floating-point registers to the general registers.  Stack
    1.74 + *  protocols assure proper alignment of these areas.
    1.75 + *
    1.76 + */
    1.77 +
    1.78 +
    1.79 +/*  ====================================================================*/
    1.80 +/*      symbolic definitions for PA-RISC registers      */
    1.81 +/*      in the MIPS style, avoids lots of case shifts       */
    1.82 +/*      assignments (except t4) preserve register number parity */
    1.83 +/*  ====================================================================*/
    1.84 +
    1.85 +#define zero    %r0         /* permanent zero */
    1.86 +#define t5      %r1         /* temp register, altered by addil */
    1.87 +
    1.88 +#define rp      %r2         /* return pointer */
    1.89 +
    1.90 +#define s1      %r3         /* callee saves register (odd alias, odd %r) */
    1.91 +#define s0      %r4         /* callee saves register (even alias, even %r) */
    1.92 +#define s3      %r5         /* callee saves register */
    1.93 +#define s2      %r6         /* callee saves register */
    1.94 +#define s5      %r7         /* callee saves register */
    1.95 +#define s4      %r8         /* callee saves register */
    1.96 +#define s7      %r9         /* callee saves register */
    1.97 +#define s6      %r10        /* callee saves register */
    1.98 +
    1.99 +#define t1      %r19        /* caller saves register */
   1.100 +#define t0      %r20        /* caller saves register */
   1.101 +#define t3      %r21        /* caller saves register */
   1.102 +#define t2      %r22        /* caller saves register */
   1.103 +
   1.104 +#define a3      %r23        /* fourth argument register, high word */
   1.105 +#define a2      %r24        /* third argument register, low word */
   1.106 +#define a1      %r25        /* second argument register, high word */
   1.107 +#define a0      %r26        /* first argument register, low word */
   1.108 +
   1.109 +#define v0      %r28        /* high order return value */
   1.110 +#define v1      %r29        /* low order return value */
   1.111 +
   1.112 +#define sp      %r30        /* stack pointer */
   1.113 +#define t4      %r31        /* temporary register (the parity exception) */
   1.114 +
   1.115 +#define fa0     %fr4        /* first argument register */
   1.116 +#define fa1     %fr5        /* second argument register */
   1.117 +#define fa2     %fr6        /* third argument register */
   1.118 +#define fa3     %fr7        /* fourth argument register */
   1.119 +
   1.120 +#define fa0r    %fr4R       /* first argument register, right (low order) half */
   1.121 +#define fa1r    %fr5R       /* second argument register, right (low order) half */
   1.122 +#define fa2r    %fr6R       /* third argument register, right (low order) half */
   1.123 +#define fa3r    %fr7R       /* fourth argument register, right (low order) half */
   1.124 +
   1.125 +#define ft0     %fr8        /* caller saves register */
   1.126 +#define ft1     %fr9        /* caller saves register */
   1.127 +#define ft2     %fr10       /* caller saves register */
   1.128 +#define ft3     %fr11       /* caller saves register */
   1.129 +
   1.130 +#define ft0r    %fr8R       /* caller saves register, right (low order) half */
   1.131 +#define ft1r    %fr9R       /* caller saves register, right (low order) half */
   1.132 +#define ft2r    %fr10R      /* caller saves register, right (low order) half */
   1.133 +#define ft3r    %fr11R      /* caller saves register, right (low order) half */
   1.134 +
   1.135 +#define ft4     %fr22       /* caller saves register */
   1.136 +#define ft5     %fr23       /* caller saves register */
   1.137 +#define ft6     %fr24       /* caller saves register */
   1.138 +#define ft7     %fr25       /* caller saves register */
   1.139 +#define ft8     %fr26       /* caller saves register */
   1.140 +#define ft9     %fr27       /* caller saves register */
   1.141 +#define ft10    %fr28       /* caller saves register */
   1.142 +#define ft11    %fr29       /* caller saves register */
   1.143 +#define ft12    %fr30       /* caller saves register */
   1.144 +#define ft13    %fr31       /* caller saves register */
   1.145 +
   1.146 +#define ft4r    %fr22R      /* caller saves register, right (low order) half */
   1.147 +#define ft5r    %fr23R      /* caller saves register, right (low order) half */
   1.148 +#define ft6r    %fr24R      /* caller saves register, right (low order) half */
   1.149 +#define ft7r    %fr25R      /* caller saves register, right (low order) half */
   1.150 +#define ft8r    %fr26R      /* caller saves register, right (low order) half */
   1.151 +#define ft9r    %fr27R      /* caller saves register, right (low order) half */
   1.152 +#define ft10r   %fr28R      /* caller saves register, right (low order) half */
   1.153 +#define ft11r   %fr29R      /* caller saves register, right (low order) half */
   1.154 +#define ft12r   %fr30R      /* caller saves register, right (low order) half */
   1.155 +#define ft13r   %fr31R      /* caller saves register, right (low order) half */
   1.156 +
   1.157 +
   1.158 +
   1.159 +/*  ================================================================== */
   1.160 +/*      functional definitions for PA-RISC registers           */
   1.161 +/*  ================================================================== */
   1.162 +
   1.163 +/*              general registers           */
   1.164 +
   1.165 +#define T1      a0          /* temp, (length parameter ignored)             */
   1.166 +
   1.167 +#define pM      a1          /* -> 64-bit multiplier                         */
   1.168 +#define T2      a1          /* temp, (after fetching multiplier)            */
   1.169 +
   1.170 +#define pA      a2          /* -> multiplicand vector (8 64-bit doublewords) */
   1.171 +#define T3      a2          /* temp, (after fetching multiplicand)          */
   1.172 +
   1.173 +#define pR      a3          /* -> addend vector (9 64-bit doublewords),
   1.174 +                                  result vector (9 64-bit doublewords)      */
   1.175 +
   1.176 +#define S0      s0          /* callee saves summand registers               */
   1.177 +#define S1      s1
   1.178 +#define S2      s2
   1.179 +#define S3      s3
   1.180 +#define S4      s4
   1.181 +#define S5      s5
   1.182 +#define S6      s6
   1.183 +#define S7      s7
   1.184 +
   1.185 +#define S8      v0          /* caller saves summand registers               */
   1.186 +#define S9      v1
   1.187 +#define S10     t0
   1.188 +#define S11     t1
   1.189 +#define S12     t2
   1.190 +#define S13     t3
   1.191 +#define S14     t4
   1.192 +#define S15     t5
   1.193 +
   1.194 +
   1.195 +
   1.196 +/*              floating-point registers                                    */
   1.197 +
   1.198 +#define M       fa0         /* multiplier double word                       */
   1.199 +#define MR      fa0r        /* low order half of multiplier double word     */
   1.200 +#define ML      fa0         /* high order half of multiplier double word    */
   1.201 +
   1.202 +#define A0      fa2         /* multiplicand double word 0                   */
   1.203 +#define A0R     fa2r        /* low order half of multiplicand double word   */
   1.204 +#define A0L     fa2         /* high order half of multiplicand double word  */
   1.205 +
   1.206 +#define A1      fa3         /* multiplicand double word 1                   */
   1.207 +#define A1R     fa3r        /* low order half of multiplicand double word   */
   1.208 +#define A1L     fa3         /* high order half of multiplicand double word  */
   1.209 +
   1.210 +#define A2      ft0         /* multiplicand double word 2                   */
   1.211 +#define A2R     ft0r        /* low order half of multiplicand double word   */
   1.212 +#define A2L     ft0         /* high order half of multiplicand double word  */
   1.213 +
   1.214 +#define A3      ft1         /* multiplicand double word 3                   */
   1.215 +#define A3R     ft1r        /* low order half of multiplicand double word   */
   1.216 +#define A3L     ft1         /* high order half of multiplicand double word  */
   1.217 +
   1.218 +#define A4      ft2         /* multiplicand double word 4                   */
   1.219 +#define A4R     ft2r        /* low order half of multiplicand double word   */
   1.220 +#define A4L     ft2         /* high order half of multiplicand double word  */
   1.221 +
   1.222 +#define A5      ft3         /* multiplicand double word 5                   */
   1.223 +#define A5R     ft3r        /* low order half of multiplicand double word   */
   1.224 +#define A5L     ft3         /* high order half of multiplicand double word  */
   1.225 +
   1.226 +#define A6      ft4         /* multiplicand double word 6                   */
   1.227 +#define A6R     ft4r        /* low order half of multiplicand double word   */
   1.228 +#define A6L     ft4         /* high order half of multiplicand double word  */
   1.229 +
   1.230 +#define A7      ft5         /* multiplicand double word 7                   */
   1.231 +#define A7R     ft5r        /* low order half of multiplicand double word   */
   1.232 +#define A7L     ft5         /* high order half of multiplicand double word  */
   1.233 +
   1.234 +#define P0      ft6         /* product word 0                               */
   1.235 +#define P1      ft7         /* product word 1                               */
   1.236 +#define P2      ft8         /* product word 2                               */
   1.237 +#define P3      ft9         /* product word 3                               */
   1.238 +#define P4      ft10        /* product word 4                               */
   1.239 +#define P5      ft11        /* product word 5                               */
   1.240 +#define P6      ft12        /* product word 6                               */
   1.241 +#define P7      ft13        /* product word 7                               */
   1.242 +
   1.243 +
   1.244 +
   1.245 +
   1.246 +/*  ======================================================================  */
   1.247 +/*      symbolic definitions for HP-UX stack offsets                        */
   1.248 +/*      symbolic definitions for memory NOPs                                */
   1.249 +/*  ======================================================================  */
   1.250 +
   1.251 +#define ST_SZ       192         /* stack area total size (64 save + 128 xfer) */
   1.252 +
   1.253 +#define SV0         -192(sp)    /* general register save area (8 doublewords) */
   1.254 +#define SV1         -184(sp)    /* offsets are relative to sp after the push  */
   1.255 +#define SV2         -176(sp)
   1.256 +#define SV3         -168(sp)
   1.257 +#define SV4         -160(sp)
   1.258 +#define SV5         -152(sp)
   1.259 +#define SV6         -144(sp)
   1.260 +#define SV7         -136(sp)
   1.261 +
   1.262 +#define XF0         -128(sp)    /* data transfer area (16 doublewords)      */
   1.263 +#define XF1         -120(sp)    /* for floating-pt to integer regs          */
   1.264 +#define XF2         -112(sp)
   1.265 +#define XF3         -104(sp)
   1.266 +#define XF4         -96(sp)
   1.267 +#define XF5         -88(sp)
   1.268 +#define XF6         -80(sp)
   1.269 +#define XF7         -72(sp)
   1.270 +#define XF8         -64(sp)
   1.271 +#define XF9         -56(sp)
   1.272 +#define XF10        -48(sp)
   1.273 +#define XF11        -40(sp)
   1.274 +#define XF12        -32(sp)
   1.275 +#define XF13        -24(sp)
   1.276 +#define XF14        -16(sp)
   1.277 +#define XF15        -8(sp)
   1.278 +
   1.279 +#define mnop    proberi (sp),3,zero     /* memory NOP (unused in this file) */
   1.280 +
   1.281 +
   1.282 +
   1.283 +
   1.284 +/*  ======================================================================  */
   1.285 +/*      assembler formalities                                               */
   1.286 +/*  ======================================================================  */
   1.287 +
   1.288 +#ifdef __LP64__
   1.289 +                .level  2.0W              ; 64-bit (LP64) build: PA-RISC 2.0 wide
   1.290 +#else
   1.291 +                .level  2.0               ; otherwise: PA-RISC 2.0 narrow
   1.292 +#endif
   1.293 +                .space    $TEXT$          ; code goes in the text space,
   1.294 +                .subspa   $CODE$          ; code subspace
   1.295 +                .align    16              ; entry point on a 16-byte boundary
   1.296 +
   1.297 +/*  ======================================================================  */
   1.298 +/*      here to compute 64-bit x 512-bit product + 576-bit addend           */
   1.299 +/*  ======================================================================  */
   1.300 +
   1.301 +multacc512
   1.302 +        .PROC                       ; res[0..8] += m[0] * a[0..7], carry rippled upward
   1.303 +        .CALLINFO
   1.304 +        .ENTRY
   1.305 +    fldd    0(pM),M                 ; multiplier double word
   1.306 +    ldo     ST_SZ(sp),sp            ; push stack (sp += ST_SZ)
   1.307 +
   1.308 +    fldd    0(pA),A0                ; multiplicand double word 0
   1.309 +    std     S1,SV1                  ; save s1
   1.310 +
   1.311 +    fldd    16(pA),A2               ; multiplicand double word 2
   1.312 +    std     S3,SV3                  ; save s3
   1.313 +
   1.314 +    fldd    32(pA),A4               ; multiplicand double word 4
   1.315 +    std     S5,SV5                  ; save s5
   1.316 +
   1.317 +    fldd    48(pA),A6               ; multiplicand double word 6
   1.318 +    std     S7,SV7                  ; save s7
   1.319 +
   1.320 +
   1.321 +    std     S0,SV0                  ; save s0
   1.322 +    fldd    8(pA),A1                ; multiplicand double word 1
   1.323 +    xmpyu   MR,A0L,P0               ; A0 cross 32-bit word products
   1.324 +    xmpyu   ML,A0R,P2
   1.325 +
   1.326 +    std     S2,SV2                  ; save s2
   1.327 +    fldd    24(pA),A3               ; multiplicand double word 3
   1.328 +    xmpyu   MR,A2L,P4               ; A2 cross 32-bit word products
   1.329 +    xmpyu   ML,A2R,P6
   1.330 +
   1.331 +    std     S4,SV4                  ; save s4
   1.332 +    fldd    40(pA),A5               ; multiplicand double word 5
   1.333 +
   1.334 +    std     S6,SV6                  ; save s6
   1.335 +    fldd    56(pA),A7               ; multiplicand double word 7
   1.336 +
   1.337 +
   1.338 +    fstd    P0,XF0                  ; MR * A0L
   1.339 +    xmpyu   MR,A0R,P0               ; A0 right 32-bit word product
   1.340 +    xmpyu   MR,A1L,P1               ; A1 cross 32-bit word product
   1.341 +
   1.342 +    fstd    P2,XF2                  ; ML * A0R
   1.343 +    xmpyu   ML,A0L,P2               ; A0 left 32-bit word product
   1.344 +    xmpyu   ML,A1R,P3               ; A1 cross 32-bit word product
   1.345 +
   1.346 +    fstd    P4,XF4                  ; MR * A2L
   1.347 +    xmpyu   MR,A2R,P4               ; A2 right 32-bit word product
   1.348 +    xmpyu   MR,A3L,P5               ; A3 cross 32-bit word product
   1.349 +
   1.350 +    fstd    P6,XF6                  ; ML * A2R
   1.351 +    xmpyu   ML,A2L,P6               ; A2 parallel 32-bit word product
   1.352 +    xmpyu   ML,A3R,P7               ; A3 cross 32-bit word product
   1.353 +
   1.354 +
   1.355 +    ldd     XF0,S0                  ; MR * A0L
   1.356 +    fstd    P1,XF1                  ; MR * A1L
   1.357 +
   1.358 +    ldd     XF2,S2                  ; ML * A0R
   1.359 +    fstd    P3,XF3                  ; ML * A1R
   1.360 +
   1.361 +    ldd     XF4,S4                  ; MR * A2L
   1.362 +    fstd    P5,XF5                  ; MR * A3L
   1.363 +    xmpyu   MR,A1R,P1               ; A1 parallel 32-bit word products
   1.364 +    xmpyu   ML,A1L,P3
   1.365 +
   1.366 +    ldd     XF6,S6                  ; ML * A2R
   1.367 +    fstd    P7,XF7                  ; ML * A3R
   1.368 +    xmpyu   MR,A3R,P5               ; A3 parallel 32-bit word products
   1.369 +    xmpyu   ML,A3L,P7
   1.370 +
   1.371 +
   1.372 +    fstd    P0,XF0                  ; MR * A0R
   1.373 +    ldd     XF1,S1                  ; MR * A1L
   1.374 +    nop                             ; filler; presumably for scheduling -- TODO confirm
   1.375 +    add     S0,S2,T1                ; A0 cross product sum
   1.376 +
   1.377 +    fstd    P2,XF2                  ; ML * A0L
   1.378 +    ldd     XF3,S3                  ; ML * A1R
   1.379 +    add,dc  zero,zero,S0            ; A0 cross product sum carry
   1.380 +    depd,z  T1,31,32,S2             ; A0 cross product sum << 32
   1.381 +
   1.382 +    fstd    P4,XF4                  ; MR * A2R
   1.383 +    ldd     XF5,S5                  ; MR * A3L
   1.384 +    shrpd   S0,T1,32,S0             ; A0 carry | cross product sum >> 32
   1.385 +    add     S4,S6,T3                ; A2 cross product sum
   1.386 +
   1.387 +    fstd    P6,XF6                  ; ML * A2L
   1.388 +    ldd     XF7,S7                  ; ML * A3R
   1.389 +    add,dc  zero,zero,S4            ; A2 cross product sum carry
   1.390 +    depd,z  T3,31,32,S6             ; A2 cross product sum << 32
   1.391 +
   1.392 +
   1.393 +    ldd     XF0,S8                  ; MR * A0R
   1.394 +    fstd    P1,XF1                  ; MR * A1R
   1.395 +    xmpyu   MR,A4L,P0               ; A4 cross 32-bit word product
   1.396 +    xmpyu   MR,A5L,P1               ; A5 cross 32-bit word product
   1.397 +
   1.398 +    ldd     XF2,S10                 ; ML * A0L
   1.399 +    fstd    P3,XF3                  ; ML * A1L
   1.400 +    xmpyu   ML,A4R,P2               ; A4 cross 32-bit word product
   1.401 +    xmpyu   ML,A5R,P3               ; A5 cross 32-bit word product
   1.402 +
   1.403 +    ldd     XF4,S12                 ; MR * A2R
   1.404 +    fstd    P5,XF5                  ; MR * A3R
   1.405 +    xmpyu   MR,A6L,P4               ; A6 cross 32-bit word product
   1.406 +    xmpyu   MR,A7L,P5               ; A7 cross 32-bit word product
   1.407 +
   1.408 +    ldd     XF6,S14                 ; ML * A2L
   1.409 +    fstd    P7,XF7                  ; ML * A3L
   1.410 +    xmpyu   ML,A6R,P6               ; A6 cross 32-bit word product
   1.411 +    xmpyu   ML,A7R,P7               ; A7 cross 32-bit word product
   1.412 +
   1.413 +
   1.414 +    fstd    P0,XF0                  ; MR * A4L
   1.415 +    ldd     XF1,S9                  ; MR * A1R
   1.416 +    shrpd   S4,T3,32,S4             ; A2 carry | cross product sum >> 32
   1.417 +    add     S1,S3,T1                ; A1 cross product sum
   1.418 +
   1.419 +    fstd    P2,XF2                  ; ML * A4R
   1.420 +    ldd     XF3,S11                 ; ML * A1L
   1.421 +    add,dc  zero,zero,S1            ; A1 cross product sum carry
   1.422 +    depd,z  T1,31,32,S3             ; A1 cross product sum << 32
   1.423 +
   1.424 +    fstd    P4,XF4                  ; MR * A6L
   1.425 +    ldd     XF5,S13                 ; MR * A3R
   1.426 +    shrpd   S1,T1,32,S1             ; A1 carry | cross product sum >> 32
   1.427 +    add     S5,S7,T3                ; A3 cross product sum
   1.428 +
   1.429 +    fstd    P6,XF6                  ; ML * A6R
   1.430 +    ldd     XF7,S15                 ; ML * A3L
   1.431 +    add,dc  zero,zero,S5            ; A3 cross product sum carry
   1.432 +    depd,z  T3,31,32,S7             ; A3 cross product sum << 32
   1.433 +
   1.434 +
   1.435 +    shrpd   S5,T3,32,S5             ; A3 carry | cross product sum >> 32
   1.436 +    add     S2,S8,S8                ; M * A0 right doubleword, P0 doubleword
   1.437 +
   1.438 +    add,dc  S0,S10,S10              ; M * A0 left doubleword
   1.439 +    add     S3,S9,S9                ; M * A1 right doubleword
   1.440 +
   1.441 +    add,dc  S1,S11,S11              ; M * A1 left doubleword
   1.442 +    add     S6,S12,S12              ; M * A2 right doubleword
   1.443 +
   1.444 +
   1.445 +    ldd     24(pR),S3               ; Addend doubleword 3
   1.446 +    fstd    P1,XF1                  ; MR * A5L
   1.447 +    add,dc  S4,S14,S14              ; M * A2 left doubleword
   1.448 +    xmpyu   MR,A5R,P1               ; A5 right 32-bit word product
   1.449 +
   1.450 +    ldd     8(pR),S1                ; Addend doubleword 1
   1.451 +    fstd    P3,XF3                  ; ML * A5R
   1.452 +    add     S7,S13,S13              ; M * A3 right doubleword
   1.453 +    xmpyu   ML,A5L,P3               ; A5 left 32-bit word product
   1.454 +
   1.455 +    ldd     0(pR),S7                ; Addend doubleword 0
   1.456 +    fstd    P5,XF5                  ; MR * A7L
   1.457 +    add,dc  S5,S15,S15              ; M * A3 left doubleword
   1.458 +    xmpyu   MR,A7R,P5               ; A7 right 32-bit word product
   1.459 +
   1.460 +    ldd     16(pR),S5               ; Addend doubleword 2
   1.461 +    fstd    P7,XF7                  ; ML * A7R
   1.462 +    add     S10,S9,S9               ; P1 doubleword
   1.463 +    xmpyu   ML,A7L,P7               ; A7 left 32-bit word products
   1.464 +
   1.465 +
   1.466 +    ldd     XF0,S0                  ; MR * A4L
   1.467 +    fstd    P1,XF9                  ; MR * A5R
   1.468 +    add,dc  S11,S12,S12             ; P2 doubleword
   1.469 +    xmpyu   MR,A4R,P0               ; A4 right 32-bit word product
   1.470 +
   1.471 +    ldd     XF2,S2                  ; ML * A4R
   1.472 +    fstd    P3,XF11                 ; ML * A5L
   1.473 +    add,dc  S14,S13,S13             ; P3 doubleword
   1.474 +    xmpyu   ML,A4L,P2               ; A4 left 32-bit word product
   1.475 +
   1.476 +    ldd     XF6,S6                  ; ML * A6R
   1.477 +    fstd    P5,XF13                 ; MR * A7R
   1.478 +    add,dc  zero,S15,T2             ; P4 partial doubleword
   1.479 +    xmpyu   MR,A6R,P4               ; A6 right 32-bit word product
   1.480 +
   1.481 +    ldd     XF4,S4                  ; MR * A6L
   1.482 +    fstd    P7,XF15                 ; ML * A7L
   1.483 +    add     S7,S8,S8                ; R0 + P0, new R0 doubleword
   1.484 +    xmpyu   ML,A6L,P6               ; A6 left 32-bit word product
   1.485 +
   1.486 +
   1.487 +    fstd    P0,XF0                  ; MR * A4R
   1.488 +    ldd     XF7,S7                  ; ML * A7R
   1.489 +    add,dc  S1,S9,S9                ; c + R1 + P1, new R1 doubleword
   1.490 +
   1.491 +    fstd    P2,XF2                  ; ML * A4L
   1.492 +    ldd     XF1,S1                  ; MR * A5L
   1.493 +    add,dc  S5,S12,S12              ; c + R2 + P2, new R2 doubleword
   1.494 +
   1.495 +    fstd    P4,XF4                  ; MR * A6R
   1.496 +    ldd     XF5,S5                  ; MR * A7L
   1.497 +    add,dc  S3,S13,S13              ; c + R3 + P3, new R3 doubleword
   1.498 +
   1.499 +    fstd    P6,XF6                  ; ML * A6L
   1.500 +    ldd     XF3,S3                  ; ML * A5R
   1.501 +    add,dc  zero,T2,T2              ; c + partial P4
   1.502 +    add     S0,S2,T1                ; A4 cross product sum
   1.503 +
   1.504 +
   1.505 +    std     S8,0(pR)                ; save R0
   1.506 +    add,dc  zero,zero,S0            ; A4 cross product sum carry
   1.507 +    depd,z  T1,31,32,S2             ; A4 cross product sum << 32
   1.508 +
   1.509 +    std     S9,8(pR)                ; save R1
   1.510 +    shrpd   S0,T1,32,S0             ; A4 carry | cross product sum >> 32
   1.511 +    add     S4,S6,T3                ; A6 cross product sum
   1.512 +
   1.513 +    std     S12,16(pR)              ; save R2
   1.514 +    add,dc  zero,zero,S4            ; A6 cross product sum carry
   1.515 +    depd,z  T3,31,32,S6             ; A6 cross product sum << 32
   1.516 +
   1.517 +
   1.518 +    std     S13,24(pR)              ; save R3
   1.519 +    shrpd   S4,T3,32,S4             ; A6 carry | cross product sum >> 32
   1.520 +    add     S1,S3,T1                ; A5 cross product sum
   1.521 +
   1.522 +    ldd     XF0,S8                  ; MR * A4R
   1.523 +    add,dc  zero,zero,S1            ; A5 cross product sum carry
   1.524 +    depd,z  T1,31,32,S3             ; A5 cross product sum << 32
   1.525 +
   1.526 +    ldd     XF2,S10                 ; ML * A4L
   1.527 +    ldd     XF9,S9                  ; MR * A5R
   1.528 +    shrpd   S1,T1,32,S1             ; A5 carry | cross product sum >> 32
   1.529 +    add     S5,S7,T3                ; A7 cross product sum
   1.530 +
   1.531 +    ldd     XF4,S12                 ; MR * A6R
   1.532 +    ldd     XF11,S11                ; ML * A5L
   1.533 +    add,dc  zero,zero,S5            ; A7 cross product sum carry
   1.534 +    depd,z  T3,31,32,S7             ; A7 cross product sum << 32
   1.535 +
   1.536 +    ldd     XF6,S14                 ; ML * A6L
   1.537 +    ldd     XF13,S13                ; MR * A7R
   1.538 +    shrpd   S5,T3,32,S5             ; A7 carry | cross product sum >> 32
   1.539 +    add     S2,S8,S8                ; M * A4 right doubleword
   1.540 +
   1.541 +
   1.542 +    ldd     XF15,S15                ; ML * A7L
   1.543 +    add,dc  S0,S10,S10              ; M * A4 left doubleword
   1.544 +    add     S3,S9,S9                ; M * A5 right doubleword
   1.545 +
   1.546 +    add,dc  S1,S11,S11              ; M * A5 left doubleword
   1.547 +    add     S6,S12,S12              ; M * A6 right doubleword
   1.548 +
   1.549 +    ldd     32(pR),S0               ; Addend doubleword 4
   1.550 +    ldd     40(pR),S1               ; Addend doubleword 5
   1.551 +    add,dc  S4,S14,S14              ; M * A6 left doubleword
   1.552 +    add     S7,S13,S13              ; M * A7 right doubleword
   1.553 +
   1.554 +    ldd     48(pR),S2               ; Addend doubleword 6
   1.555 +    ldd     56(pR),S3               ; Addend doubleword 7
   1.556 +    add,dc  S5,S15,S15              ; M * A7 left doubleword
   1.557 +    add     S8,T2,S8                ; P4 doubleword
   1.558 +
   1.559 +    ldd     64(pR),S4               ; Addend doubleword 8
   1.560 +    ldd     SV5,s5                  ; restore s5
   1.561 +    add,dc  S10,S9,S9               ; P5 doubleword
   1.562 +    add,dc  S11,S12,S12             ; P6 doubleword
   1.563 +
   1.564 +
   1.565 +    ldd     SV6,s6                  ; restore s6
   1.566 +    ldd     SV7,s7                  ; restore s7
   1.567 +    add,dc  S14,S13,S13             ; P7 doubleword
   1.568 +    add,dc  zero,S15,S15            ; P8 doubleword
   1.569 +
   1.570 +    add     S0,S8,S8                ; new R4 doubleword
   1.571 +
   1.572 +    ldd     SV0,s0                  ; restore s0
   1.573 +    std     S8,32(pR)               ; save R4
   1.574 +    add,dc  S1,S9,S9                ; new R5 doubleword
   1.575 +
   1.576 +    ldd     SV1,s1                  ; restore s1
   1.577 +    std     S9,40(pR)               ; save R5
   1.578 +    add,dc  S2,S12,S12              ; new R6 doubleword
   1.579 +
   1.580 +    ldd     SV2,s2                  ; restore s2
   1.581 +    std     S12,48(pR)              ; save R6
   1.582 +    add,dc  S3,S13,S13              ; new R7 doubleword
   1.583 +
   1.584 +    ldd     SV3,s3                  ; restore s3
   1.585 +    std     S13,56(pR)              ; save R7
   1.586 +    add,dc  S4,S15,S15              ; new R8 doubleword
   1.587 +
   1.588 +    ldd     SV4,s4                  ; restore s4
   1.589 +    std     S15,64(pR)              ; save R8
   1.590 +    add,dc  zero,zero,v0            ; v0 = carry out of R8
   1.591 +
   1.592 +    CMPIB,*= 0,v0,$L0               ; if no overflow, exit
   1.593 +    LDO     8(pR),pR                ; (delay slot) pR += 8, so 64(pR) is result[9]
   1.594 +
   1.595 +$FINAL1                             ; Final carry propagation
   1.596 +    LDD     64(pR),v0               ; next higher result doubleword
   1.597 +    LDO     8(pR),pR                ; step pR to the following doubleword
   1.598 +    ADDI    1,v0,v0                 ; add the carry
   1.599 +    CMPIB,*= 0,v0,$FINAL1           ; Keep looping if there is a carry.
   1.600 +    STD     v0,56(pR)               ; (delay slot) store back to the slot just loaded
   1.601 +$L0
   1.602 +    bv      zero(rp)                ; -> caller
   1.603 +    ldo     -ST_SZ(sp),sp           ; (delay slot) pop stack
   1.604 +
   1.605 +/*  ======================================================================  */
   1.606 +/*      end of module                                                       */
   1.607 +/*  ======================================================================  */
   1.608 +
   1.609 +
   1.610 +        bve (rp)                    ; NOTE(review): appears unreachable, bv above returns
   1.611 +        .EXIT
   1.612 +        nop                         ; delay slot of the bve above
   1.613 +                .PROCEND
   1.614 +                .SPACE         $TEXT$
   1.615 +                .SUBSPA        $CODE$
   1.616 +                .EXPORT        multacc512,ENTRY
   1.617 +
   1.618 +        .end

mercurial