/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/*
 *
 *  This PA-RISC 2.0 function computes the product of two unsigned integers,
 *  and adds the result to a previously computed integer.  The multiplicand
 *  is a 512-bit (64-byte, eight doubleword) unsigned integer, stored in
 *  memory in little-double-wordian order.  The multiplier is an unsigned
 *  64-bit integer.  The previously computed integer to which the product is
 *  added is located in the result ("res") area, and is assumed to be a
 *  576-bit (72-byte, nine doubleword) unsigned integer, stored in memory
 *  in little-double-wordian order.  This value normally will be the result
 *  of a previously computed nine doubleword result.  It is not necessary
 *  to pad the multiplicand with an additional 64-bit zero doubleword.
 *
 *  Multiplicand, multiplier, and addend ideally should be aligned at
 *  16-byte boundaries for best performance.  The code will function
 *  correctly for alignment at eight-byte boundaries which are not 16-byte
 *  boundaries, but the execution may be slightly slower due to even/odd
 *  bank conflicts on PA-RISC 8000 processors.
 *
 *  This function is designed to accept the same calling sequence as Bill
 *  Ackerman's "maxpy_little" function.
 *
 *  Carry handling, as implemented below: when the addition into the ninth
 *  doubleword of the result carries out, the carry is ADDED to the tenth
 *  doubleword of the result (res[9]); if that doubleword wraps to zero the
 *  carry is propagated into res[10], and so on until a doubleword does not
 *  wrap.  The caller therefore must provide valid, initialized storage
 *  beyond the nine-doubleword result whenever a carry out is possible.
 *  The return register holds zero when there was no carry out of the ninth
 *  doubleword; otherwise it holds the last (non-zero) doubleword stored by
 *  the propagation loop.  The return value may be ignored.
 *
 *  The function prototype may be either of the following:
 *
 *      void multacc512( int l, chunk* m, const chunk* a, chunk* res );
 *  or
 *      int multacc512( int l, chunk* m, const chunk* a, chunk* res );
 *
 *  where:  "l" originally denoted vector lengths.  This parameter is
 *      ignored.  This function always assumes a multiplicand length of
 *      512 bits (eight doublewords), and addend and result lengths of
 *      576 bits (nine doublewords).
 *
 *      "m" is a pointer to the doubleword multiplier, ideally aligned
 *      on a 16-byte boundary.
 *
 *      "a" is a pointer to the eight-doubleword multiplicand, stored
 *      in little-double-wordian order, and ideally aligned on a 16-byte
 *      boundary.
 *
 *      "res" is a pointer to the nine doubleword addend, and to the
 *      nine-doubleword product computed by this function.  The result
 *      also is stored in little-double-wordian order, and ideally is
 *      aligned on a 16-byte boundary.  It is expected that the alignment
 *      of the "res" area may alternate between even/odd doubleword
 *      boundaries for successive calls for 512-bit x 512-bit
 *      multiplications.
 *
 *  The code for this function has been scheduled to use the parallelism
 *  of the PA-RISC 8000 series microprocessors as well as the author was
 *  able.  Comments and/or suggestions for improvement are welcomed.
 *
 *  The code is "64-bit safe".  This means it may be called in either
 *  the ILP32 context or the LP64 context.  All 64 bits of the saved
 *  registers are saved and restored (std/ldd are used throughout).
 *
 *  This code is self-contained.  It requires no other header files in order
 *  to compile and to be linkable on a PA-RISC 2.0 machine.  Symbolic
 *  definitions for registers and stack offsets are included within this
 *  one source file.
 *
 *  This is a leaf routine.  As such, minimal use is made of the stack area.
 *  Of the 192 bytes allocated, 64 bytes are used for saving/restoring eight
 *  general registers, and 128 bytes are used to move intermediate products
 *  from the floating-point registers to the general registers.  Stack
 *  protocols assure proper alignment of these areas.
 *
 */


/* ====================================================================*/
/*  symbolic definitions for PA-RISC registers                         */
/*      in the MIPS style, avoids lots of case shifts                  */
/*      assignments (except t4) preserve register number parity        */
/* ====================================================================*/

#define zero    %r0         /* permanent zero */
#define t5      %r1         /* temp register, altered by addil */

#define rp      %r2         /* return pointer */

#define s1      %r3         /* callee saves register */
#define s0      %r4         /* callee saves register */
#define s3      %r5         /* callee saves register */
#define s2      %r6         /* callee saves register */
#define s5      %r7         /* callee saves register */
#define s4      %r8         /* callee saves register */
#define s7      %r9         /* callee saves register */
#define s6      %r10        /* callee saves register */

#define t1      %r19        /* caller saves register */
#define t0      %r20        /* caller saves register */
#define t3      %r21        /* caller saves register */
#define t2      %r22        /* caller saves register */

#define a3      %r23        /* fourth argument register, high word */
#define a2      %r24        /* third argument register, low word */
#define a1      %r25        /* second argument register, high word */
#define a0      %r26        /* first argument register, low word */

#define v0      %r28        /* high order return value */
#define v1      %r29        /* low order return value */

#define sp      %r30        /* stack pointer */
#define t4      %r31        /* temporary register */

#define fa0     %fr4        /* first argument register */
#define fa1     %fr5        /* second argument register */
#define fa2     %fr6        /* third argument register */
#define fa3     %fr7        /* fourth argument register */

#define fa0r    %fr4R       /* first argument register */
#define fa1r    %fr5R       /* second argument register */
#define fa2r    %fr6R       /* third argument register */
#define fa3r    %fr7R       /* fourth argument register */

#define ft0     %fr8        /* caller saves register */
#define ft1     %fr9        /* caller saves register */
#define ft2     %fr10       /* caller saves register */
#define ft3     %fr11       /* caller saves register */

#define ft0r    %fr8R       /* caller saves register */
#define ft1r    %fr9R       /* caller saves register */
#define ft2r    %fr10R      /* caller saves register */
#define ft3r    %fr11R      /* caller saves register */

#define ft4     %fr22       /* caller saves register */
#define ft5     %fr23       /* caller saves register */
#define ft6     %fr24       /* caller saves register */
#define ft7     %fr25       /* caller saves register */
#define ft8     %fr26       /* caller saves register */
#define ft9     %fr27       /* caller saves register */
#define ft10    %fr28       /* caller saves register */
#define ft11    %fr29       /* caller saves register */
#define ft12    %fr30       /* caller saves register */
#define ft13    %fr31       /* caller saves register */

#define ft4r    %fr22R      /* caller saves register */
#define ft5r    %fr23R      /* caller saves register */
#define ft6r    %fr24R      /* caller saves register */
#define ft7r    %fr25R      /* caller saves register */
#define ft8r    %fr26R      /* caller saves register */
#define ft9r    %fr27R      /* caller saves register */
#define ft10r   %fr28R      /* caller saves register */
#define ft11r   %fr29R      /* caller saves register */
#define ft12r   %fr30R      /* caller saves register */
#define ft13r   %fr31R      /* caller saves register */



/* ================================================================== */
/*  functional definitions for PA-RISC registers                      */
/* ================================================================== */

/* general registers */

#define T1      a0          /* temp, (length parameter ignored) */

#define pM      a1          /* -> 64-bit multiplier */
#define T2      a1          /* temp, (after fetching multiplier) */

#define pA      a2          /* -> multiplicand vector (8 64-bit words) */
#define T3      a2          /* temp, (after fetching multiplicand) */

#define pR      a3          /* -> addend vector (9 64-bit doublewords) and
                               result vector (9 64-bit doublewords) */

#define S0      s0          /* callee saves summand registers */
#define S1      s1
#define S2      s2
#define S3      s3
#define S4      s4
#define S5      s5
#define S6      s6
#define S7      s7

#define S8      v0          /* caller saves summand registers */
#define S9      v1
#define S10     t0
#define S11     t1
#define S12     t2
#define S13     t3
#define S14     t4
#define S15     t5



/* floating-point registers */

#define M       fa0         /* multiplier double word */
#define MR      fa0r        /* low order half of multiplier double word */
#define ML      fa0         /* high order half of multiplier double word */

#define A0      fa2         /* multiplicand double word 0 */
#define A0R     fa2r        /* low order half of multiplicand double word */
#define A0L     fa2         /* high order half of multiplicand double word */

#define A1      fa3         /* multiplicand double word 1 */
#define A1R     fa3r        /* low order half of multiplicand double word */
#define A1L     fa3         /* high order half of multiplicand double word */

#define A2      ft0         /* multiplicand double word 2 */
#define A2R     ft0r        /* low order half of multiplicand double word */
#define A2L     ft0         /* high order half of multiplicand double word */

#define A3      ft1         /* multiplicand double word 3 */
#define A3R     ft1r        /* low order half of multiplicand double word */
#define A3L     ft1         /* high order half of multiplicand double word */

#define A4      ft2         /* multiplicand double word 4 */
#define A4R     ft2r        /* low order half of multiplicand double word */
#define A4L     ft2         /* high order half of multiplicand double word */

#define A5      ft3         /* multiplicand double word 5 */
#define A5R     ft3r        /* low order half of multiplicand double word */
#define A5L     ft3         /* high order half of multiplicand double word */

#define A6      ft4         /* multiplicand double word 6 */
#define A6R     ft4r        /* low order half of multiplicand double word */
#define A6L     ft4         /* high order half of multiplicand double word */

#define A7      ft5         /* multiplicand double word 7 */
#define A7R     ft5r        /* low order half of multiplicand double word */
#define A7L     ft5         /* high order half of multiplicand double word */

/* xmpyu result registers; each is reused for several 32x32-bit products */
#define P0      ft6         /* product register 0 */
#define P1      ft7         /* product register 1 */
#define P2      ft8         /* product register 2 */
#define P3      ft9         /* product register 3 */
#define P4      ft10        /* product register 4 */
#define P5      ft11        /* product register 5 */
#define P6      ft12        /* product register 6 */
#define P7      ft13        /* product register 7 */




/* ====================================================================== */
/*  symbolic definitions for HP-UX stack offsets                          */
/*  symbolic definitions for memory NOPs                                  */
/* ====================================================================== */

/* All offsets are relative to sp AFTER it has been advanced by ST_SZ.    */

#define ST_SZ   192         /* stack area total size */

#define SV0     -192(sp)    /* general register save area */
#define SV1     -184(sp)
#define SV2     -176(sp)
#define SV3     -168(sp)
#define SV4     -160(sp)
#define SV5     -152(sp)
#define SV6     -144(sp)
#define SV7     -136(sp)

#define XF0     -128(sp)    /* data transfer area */
#define XF1     -120(sp)    /* for floating-pt to integer regs */
#define XF2     -112(sp)
#define XF3     -104(sp)
#define XF4     -96(sp)
#define XF5     -88(sp)
#define XF6     -80(sp)
#define XF7     -72(sp)
#define XF8     -64(sp)
#define XF9     -56(sp)
#define XF10    -48(sp)
#define XF11    -40(sp)
#define XF12    -32(sp)
#define XF13    -24(sp)
#define XF14    -16(sp)
#define XF15    -8(sp)

#define mnop    proberi (sp),3,zero     /* memory NOP (not used in this file) */




/* ====================================================================== */
/*  assembler formalities                                                 */
/* ====================================================================== */

#ifdef __LP64__
        .level  2.0W
#else
        .level  2.0
#endif
        .space  $TEXT$
        .subspa $CODE$
        .align  16

/* ====================================================================== */
/*  here to compute 64-bit x 512-bit product + 576-bit addend             */
/* ====================================================================== */

/*
 *  multacc512:  res[0..8] += M * a[0..7]
 *
 *  On entry:  a0 = l (ignored; the register is reused as temp T1),
 *             a1 = pM, a2 = pA, a3 = pR.  pM and pA are reused as temps
 *             T2 and T3 once the multiplier and multiplicand are loaded.
 *
 *  Each 64 x 64-bit product M * Ai is assembled from four 32 x 32-bit
 *  xmpyu products: the "right" product MR*AiR, the "left" product ML*AiL,
 *  and the two "cross" products MR*AiL and ML*AiR.  Products are moved
 *  from the floating-point registers to the general registers through the
 *  XFn transfer area on the stack (fstd followed later by ldd).
 *
 *  The cross products are folded in with this recurring four-instruction
 *  idiom (x, y are the two cross products):
 *
 *      add     x,y,T           T  = cross product sum
 *      add,dc  zero,zero,c     c  = carry out of that sum
 *      depd,z  T,31,32,lo      lo = cross product sum << 32
 *      shrpd   c,T,32,hi       hi = carry | cross product sum >> 32
 *
 *  after which lo is added to the right product and hi (with carry) to the
 *  left product.  The instruction stream is interleaved for scheduling;
 *  do not reorder it without re-checking every carry chain (add / add,dc
 *  pairs rely on the carry bit surviving the intervening loads, stores
 *  and multiplies).
 */

multacc512
        .PROC
        .CALLINFO
        .ENTRY

/*  Push the frame, save s0-s7, load M and A0..A7, and start the cross    */
/*  products for A0 and A2.                                               */

        fldd    0(pM),M                 ; multiplier double word
        ldo     ST_SZ(sp),sp            ; push stack

        fldd    0(pA),A0                ; multiplicand double word 0
        std     S1,SV1                  ; save s1

        fldd    16(pA),A2               ; multiplicand double word 2
        std     S3,SV3                  ; save s3

        fldd    32(pA),A4               ; multiplicand double word 4
        std     S5,SV5                  ; save s5

        fldd    48(pA),A6               ; multiplicand double word 6
        std     S7,SV7                  ; save s7


        std     S0,SV0                  ; save s0
        fldd    8(pA),A1                ; multiplicand double word 1
        xmpyu   MR,A0L,P0               ; A0 cross 32-bit word products
        xmpyu   ML,A0R,P2

        std     S2,SV2                  ; save s2
        fldd    24(pA),A3               ; multiplicand double word 3
        xmpyu   MR,A2L,P4               ; A2 cross 32-bit word products
        xmpyu   ML,A2R,P6

        std     S4,SV4                  ; save s4
        fldd    40(pA),A5               ; multiplicand double word 5

        std     S6,SV6                  ; save s6
        fldd    56(pA),A7               ; multiplicand double word 7


/*  Park the A0/A2 cross products in the transfer area; start the A0/A2   */
/*  right and left products and the A1/A3 cross products.                 */

        fstd    P0,XF0                  ; MR * A0L
        xmpyu   MR,A0R,P0               ; A0 right 32-bit word product
        xmpyu   MR,A1L,P1               ; A1 cross 32-bit word product

        fstd    P2,XF2                  ; ML * A0R
        xmpyu   ML,A0L,P2               ; A0 left 32-bit word product
        xmpyu   ML,A1R,P3               ; A1 cross 32-bit word product

        fstd    P4,XF4                  ; MR * A2L
        xmpyu   MR,A2R,P4               ; A2 right 32-bit word product
        xmpyu   MR,A3L,P5               ; A3 cross 32-bit word product

        fstd    P6,XF6                  ; ML * A2R
        xmpyu   ML,A2L,P6               ; A2 parallel 32-bit word product
        xmpyu   ML,A3R,P7               ; A3 cross 32-bit word product


/*  Pull the A0/A2 cross products into general registers; park the A1/A3  */
/*  cross products; start the A1/A3 right and left products.              */

        ldd     XF0,S0                  ; MR * A0L
        fstd    P1,XF1                  ; MR * A1L

        ldd     XF2,S2                  ; ML * A0R
        fstd    P3,XF3                  ; ML * A1R

        ldd     XF4,S4                  ; MR * A2L
        fstd    P5,XF5                  ; MR * A3L
        xmpyu   MR,A1R,P1               ; A1 parallel 32-bit word products
        xmpyu   ML,A1L,P3

        ldd     XF6,S6                  ; ML * A2R
        fstd    P7,XF7                  ; ML * A3R
        xmpyu   MR,A3R,P5               ; A3 parallel 32-bit word products
        xmpyu   ML,A3L,P7


/*  Cross product sums for A0 and A2 (T1 and T3 are free: all of the      */
/*  multiplicand has been loaded).                                        */

        fstd    P0,XF0                  ; MR * A0R
        ldd     XF1,S1                  ; MR * A1L
        nop
        add     S0,S2,T1                ; A0 cross product sum

        fstd    P2,XF2                  ; ML * A0L
        ldd     XF3,S3                  ; ML * A1R
        add,dc  zero,zero,S0            ; A0 cross product sum carry
        depd,z  T1,31,32,S2             ; A0 cross product sum << 32

        fstd    P4,XF4                  ; MR * A2R
        ldd     XF5,S5                  ; MR * A3L
        shrpd   S0,T1,32,S0             ; A0 carry | cross product sum >> 32
        add     S4,S6,T3                ; A2 cross product sum

        fstd    P6,XF6                  ; ML * A2L
        ldd     XF7,S7                  ; ML * A3R
        add,dc  zero,zero,S4            ; A2 cross product sum carry
        depd,z  T3,31,32,S6             ; A2 cross product sum << 32


/*  Move the A0..A3 right/left products to general registers while the    */
/*  cross products for A4..A7 are started.                                */

        ldd     XF0,S8                  ; MR * A0R
        fstd    P1,XF1                  ; MR * A1R
        xmpyu   MR,A4L,P0               ; A4 cross 32-bit word product
        xmpyu   MR,A5L,P1               ; A5 cross 32-bit word product

        ldd     XF2,S10                 ; ML * A0L
        fstd    P3,XF3                  ; ML * A1L
        xmpyu   ML,A4R,P2               ; A4 cross 32-bit word product
        xmpyu   ML,A5R,P3               ; A5 cross 32-bit word product

        ldd     XF4,S12                 ; MR * A2R
        fstd    P5,XF5                  ; MR * A3R
        xmpyu   MR,A6L,P4               ; A6 cross 32-bit word product
        xmpyu   MR,A7L,P5               ; A7 cross 32-bit word product

        ldd     XF6,S14                 ; ML * A2L
        fstd    P7,XF7                  ; ML * A3L
        xmpyu   ML,A6R,P6               ; A6 cross 32-bit word product
        xmpyu   ML,A7R,P7               ; A7 cross 32-bit word product


/*  Cross product sums for A1 and A3.                                     */

        fstd    P0,XF0                  ; MR * A4L
        ldd     XF1,S9                  ; MR * A1R
        shrpd   S4,T3,32,S4             ; A2 carry | cross product sum >> 32
        add     S1,S3,T1                ; A1 cross product sum

        fstd    P2,XF2                  ; ML * A4R
        ldd     XF3,S11                 ; ML * A1L
        add,dc  zero,zero,S1            ; A1 cross product sum carry
        depd,z  T1,31,32,S3             ; A1 cross product sum << 32

        fstd    P4,XF4                  ; MR * A6L
        ldd     XF5,S13                 ; MR * A3R
        shrpd   S1,T1,32,S1             ; A1 carry | cross product sum >> 32
        add     S5,S7,T3                ; A3 cross product sum

        fstd    P6,XF6                  ; ML * A6R
        ldd     XF7,S15                 ; ML * A3L
        add,dc  zero,zero,S5            ; A3 cross product sum carry
        depd,z  T3,31,32,S7             ; A3 cross product sum << 32


/*  Assemble the 128-bit products M*A0 .. M*A3 (right, then left with     */
/*  carry).                                                               */

        shrpd   S5,T3,32,S5             ; A3 carry | cross product sum >> 32
        add     S2,S8,S8                ; M * A0 right doubleword, P0 doubleword

        add,dc  S0,S10,S10              ; M * A0 left doubleword
        add     S3,S9,S9                ; M * A1 right doubleword

        add,dc  S1,S11,S11              ; M * A1 left doubleword
        add     S6,S12,S12              ; M * A2 right doubleword


/*  Fetch addend words 0..3, chain the product doublewords P1..P3, and    */
/*  start the right/left products for A5 and A7.                          */

        ldd     24(pR),S3               ; Addend word 3
        fstd    P1,XF1                  ; MR * A5L
        add,dc  S4,S14,S14              ; M * A2 left doubleword
        xmpyu   MR,A5R,P1               ; A5 right 32-bit word product

        ldd     8(pR),S1                ; Addend word 1
        fstd    P3,XF3                  ; ML * A5R
        add     S7,S13,S13              ; M * A3 right doubleword
        xmpyu   ML,A5L,P3               ; A5 left 32-bit word product

        ldd     0(pR),S7                ; Addend word 0
        fstd    P5,XF5                  ; MR * A7L
        add,dc  S5,S15,S15              ; M * A3 left doubleword
        xmpyu   MR,A7R,P5               ; A7 right 32-bit word product

        ldd     16(pR),S5               ; Addend word 2
        fstd    P7,XF7                  ; ML * A7R
        add     S10,S9,S9               ; P1 doubleword
        xmpyu   ML,A7L,P7               ; A7 left 32-bit word products


/*  A5/A7 right/left products go to XF9/XF11/XF13/XF15 because XF1, XF3,  */
/*  XF5 and XF7 still hold the A5/A7 cross products.  T2 (formerly pM)    */
/*  receives the partial P4 doubleword.                                   */

        ldd     XF0,S0                  ; MR * A4L
        fstd    P1,XF9                  ; MR * A5R
        add,dc  S11,S12,S12             ; P2 doubleword
        xmpyu   MR,A4R,P0               ; A4 right 32-bit word product

        ldd     XF2,S2                  ; ML * A4R
        fstd    P3,XF11                 ; ML * A5L
        add,dc  S14,S13,S13             ; P3 doubleword
        xmpyu   ML,A4L,P2               ; A4 left 32-bit word product

        ldd     XF6,S6                  ; ML * A6R
        fstd    P5,XF13                 ; MR * A7R
        add,dc  zero,S15,T2             ; P4 partial doubleword
        xmpyu   MR,A6R,P4               ; A6 right 32-bit word product

        ldd     XF4,S4                  ; MR * A6L
        fstd    P7,XF15                 ; ML * A7L
        add     S7,S8,S8                ; R0 + P0, new R0 doubleword
        xmpyu   ML,A6L,P6               ; A6 left 32-bit word product


/*  Add addend words 1..3 into P1..P3 (carry chain continues from R0).    */

        fstd    P0,XF0                  ; MR * A4R
        ldd     XF7,S7                  ; ML * A7R
        add,dc  S1,S9,S9                ; c + R1 + P1, new R1 doubleword

        fstd    P2,XF2                  ; ML * A4L
        ldd     XF1,S1                  ; MR * A5L
        add,dc  S5,S12,S12              ; c + R2 + P2, new R2 doubleword

        fstd    P4,XF4                  ; MR * A6R
        ldd     XF5,S5                  ; MR * A7L
        add,dc  S3,S13,S13              ; c + R3 + P3, new R3 doubleword

        fstd    P6,XF6                  ; ML * A6L
        ldd     XF3,S3                  ; ML * A5R
        add,dc  zero,T2,T2              ; c + partial P4
        add     S0,S2,T1                ; A4 cross product sum


/*  Store R0..R3; cross product sums for A4, A6, A5 and A7.               */

        std     S8,0(pR)                ; save R0
        add,dc  zero,zero,S0            ; A4 cross product sum carry
        depd,z  T1,31,32,S2             ; A4 cross product sum << 32

        std     S9,8(pR)                ; save R1
        shrpd   S0,T1,32,S0             ; A4 carry | cross product sum >> 32
        add     S4,S6,T3                ; A6 cross product sum

        std     S12,16(pR)              ; save R2
        add,dc  zero,zero,S4            ; A6 cross product sum carry
        depd,z  T3,31,32,S6             ; A6 cross product sum << 32


        std     S13,24(pR)              ; save R3
        shrpd   S4,T3,32,S4             ; A6 carry | cross product sum >> 32
        add     S1,S3,T1                ; A5 cross product sum

        ldd     XF0,S8                  ; MR * A4R
        add,dc  zero,zero,S1            ; A5 cross product sum carry
        depd,z  T1,31,32,S3             ; A5 cross product sum << 32

        ldd     XF2,S10                 ; ML * A4L
        ldd     XF9,S9                  ; MR * A5R
        shrpd   S1,T1,32,S1             ; A5 carry | cross product sum >> 32
        add     S5,S7,T3                ; A7 cross product sum

        ldd     XF4,S12                 ; MR * A6R
        ldd     XF11,S11                ; ML * A5L
        add,dc  zero,zero,S5            ; A7 cross product sum carry
        depd,z  T3,31,32,S7             ; A7 cross product sum << 32

        ldd     XF6,S14                 ; ML * A6L
        ldd     XF13,S13                ; MR * A7R
        shrpd   S5,T3,32,S5             ; A7 carry | cross product sum >> 32
        add     S2,S8,S8                ; M * A4 right doubleword


/*  Assemble M*A4 .. M*A7, fetch addend words 4..8, chain P4..P8, and     */
/*  restore each callee-saves register once its summand is consumed.      */

        ldd     XF15,S15                ; ML * A7L
        add,dc  S0,S10,S10              ; M * A4 left doubleword
        add     S3,S9,S9                ; M * A5 right doubleword

        add,dc  S1,S11,S11              ; M * A5 left doubleword
        add     S6,S12,S12              ; M * A6 right doubleword

        ldd     32(pR),S0               ; Addend word 4
        ldd     40(pR),S1               ; Addend word 5
        add,dc  S4,S14,S14              ; M * A6 left doubleword
        add     S7,S13,S13              ; M * A7 right doubleword

        ldd     48(pR),S2               ; Addend word 6
        ldd     56(pR),S3               ; Addend word 7
        add,dc  S5,S15,S15              ; M * A7 left doubleword
        add     S8,T2,S8                ; P4 doubleword

        ldd     64(pR),S4               ; Addend word 8
        ldd     SV5,s5                  ; restore s5
        add,dc  S10,S9,S9               ; P5 doubleword
        add,dc  S11,S12,S12             ; P6 doubleword


        ldd     SV6,s6                  ; restore s6
        ldd     SV7,s7                  ; restore s7
        add,dc  S14,S13,S13             ; P7 doubleword
        add,dc  zero,S15,S15            ; P8 doubleword

        add     S0,S8,S8                ; new R4 doubleword

        ldd     SV0,s0                  ; restore s0
        std     S8,32(pR)               ; save R4
        add,dc  S1,S9,S9                ; new R5 doubleword

        ldd     SV1,s1                  ; restore s1
        std     S9,40(pR)               ; save R5
        add,dc  S2,S12,S12              ; new R6 doubleword

        ldd     SV2,s2                  ; restore s2
        std     S12,48(pR)              ; save R6
        add,dc  S3,S13,S13              ; new R7 doubleword

        ldd     SV3,s3                  ; restore s3
        std     S13,56(pR)              ; save R7
        add,dc  S4,S15,S15              ; new R8 doubleword

        ldd     SV4,s4                  ; restore s4
        std     S15,64(pR)              ; save result[8]
        add,dc  zero,zero,v0            ; return carry from R8

/*  Carry out of R8.  Neither CMPIB nullifies its delay slot, so the      */
/*  instruction after each branch always executes.  With pR advanced by   */
/*  8 ahead of each load, 64(pR) addresses res[9], res[10], ... on        */
/*  successive passes, and the delay-slot store at 56(pR) (after pR has   */
/*  advanced again) writes back to the doubleword just loaded.            */

        CMPIB,*= 0,v0,$L0               ; if no overflow, exit
        LDO     8(pR),pR                ; (delay slot) pR += 8

$FINAL1                                 ; Final carry propagation
        LDD     64(pR),v0               ; next result doubleword
        LDO     8(pR),pR
        ADDI    1,v0,v0                 ; add the carry
        CMPIB,*= 0,v0,$FINAL1           ; Keep looping if there is a carry.
        STD     v0,56(pR)               ; (delay slot) store incremented doubleword
$L0
        bv      zero(rp)                ; -> caller
        ldo     -ST_SZ(sp),sp           ; pop stack

/* ====================================================================== */
/*  end of module                                                         */
/* ====================================================================== */

/*  NOTE(review): the bv at $L0 has already returned to the caller, so    */
/*  the bve/nop pair below looks unreachable -- confirm before removing.  */

        bve     (rp)
        .EXIT
        nop
        .PROCEND
        .SPACE  $TEXT$
        .SUBSPA $CODE$
        .EXPORT multacc512,ENTRY

        .end