Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | /* This Source Code Form is subject to the terms of the Mozilla Public |
michael@0 | 2 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
michael@0 | 3 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
michael@0 | 4 | /* |
michael@0 | 5 | * |
michael@0 | 6 | * This PA-RISC 2.0 function computes the product of two unsigned integers, |
michael@0 | 7 | * and adds the result to a previously computed integer. The multiplicand |
michael@0 | 8 | * is a 512-bit (64-byte, eight doubleword) unsigned integer, stored in |
michael@0 | 9 | * memory in little-double-wordian order. The multiplier is an unsigned |
michael@0 | 10 | * 64-bit integer. The previously computed integer to which the product is |
michael@0 | 11 | * added is located in the result ("res") area, and is assumed to be a |
michael@0 | 12 | * 576-bit (72-byte, nine doubleword) unsigned integer, stored in memory |
michael@0 | 13 | * in little-double-wordian order. This value normally will be the result |
michael@0 | 14 | * of a previously computed nine doubleword result. It is not necessary |
michael@0 | 15 | * to pad the multiplicand with an additional 64-bit zero doubleword. |
michael@0 | 16 | * |
michael@0 | 17 | * Multiplicand, multiplier, and addend ideally should be aligned at |
michael@0 | 18 | * 16-byte boundaries for best performance. The code will function |
michael@0 | 19 | * correctly for alignment at eight-byte boundaries which are not 16-byte |
michael@0 | 20 | * boundaries, but the execution may be slightly slower due to even/odd |
michael@0 | 21 | * bank conflicts on PA-RISC 8000 processors. |
michael@0 | 22 | * |
michael@0 | 23 | * This function is designed to accept the same calling sequence as Bill |
michael@0 | 24 | * Ackerman's "maxpy_little" function. The carry from the ninth doubleword |
michael@0 | 25 | * of the result is written to the tenth word of the result, as is done by |
michael@0 | 26 | * Bill Ackerman's function. The final carry also is returned as an |
michael@0 | 27 | * integer, which may be ignored. The function prototype may be either |
michael@0 | 28 | * of the following: |
michael@0 | 29 | * |
michael@0 | 30 | * void multacc512( int l, chunk* m, const chunk* a, chunk* res ); |
michael@0 | 31 | * or |
michael@0 | 32 | * int multacc512( int l, chunk* m, const chunk* a, chunk* res ); |
michael@0 | 33 | * |
michael@0 | 34 | * where: "l" originally denoted vector lengths. This parameter is |
michael@0 | 35 | * ignored. This function always assumes a multiplicand length of |
michael@0 | 36 | * 512 bits (eight doublewords), and addend and result lengths of |
michael@0 | 37 | * 576 bits (nine doublewords). |
michael@0 | 38 | * |
michael@0 | 39 | * "m" is a pointer to the doubleword multiplier, ideally aligned |
michael@0 | 40 | * on a 16-byte boundary. |
michael@0 | 41 | * |
michael@0 | 42 | * "a" is a pointer to the eight-doubleword multiplicand, stored |
michael@0 | 43 | * in little-double-wordian order, and ideally aligned on a 16-byte |
michael@0 | 44 | * boundary. |
michael@0 | 45 | * |
michael@0 | 46 | * "res" is a pointer to the nine doubleword addend, and to the |
michael@0 | 47 | * nine-doubleword product computed by this function. The result |
michael@0 | 48 | * also is stored in little-double-wordian order, and ideally is |
michael@0 | 49 | * aligned on a 16-byte boundary. It is expected that the alignment |
michael@0 | 50 | * of the "res" area may alternate between even/odd doubleword |
michael@0 | 51 | * boundaries for successive calls for 512-bit x 512-bit |
michael@0 | 52 | * multiplications. |
michael@0 | 53 | * |
michael@0 | 54 | * The code for this function has been scheduled to use the parallelism |
michael@0 | 55 | * of the PA-RISC 8000 series microprocessors as well as the author was |
michael@0 | 56 | * able. Comments and/or suggestions for improvement are welcomed. |
michael@0 | 57 | * |
michael@0 | 58 | * The code is "64-bit safe". This means it may be called in either |
michael@0 | 59 | * the 32ILP context or the 64LP context. All 64-bits of registers are |
michael@0 | 60 | * saved and restored. |
michael@0 | 61 | * |
michael@0 | 62 | * This code is self-contained. It requires no other header files in order |
michael@0 | 63 | * to compile and to be linkable on a PA-RISC 2.0 machine. Symbolic |
michael@0 | 64 | * definitions for registers and stack offsets are included within this |
michael@0 | 65 | * one source file. |
michael@0 | 66 | * |
michael@0 | 67 | * This is a leaf routine. As such, minimal use is made of the stack area. |
michael@0 | 68 | * Of the 192 bytes allocated, 64 bytes are used for saving/restoring eight |
michael@0 | 69 | * general registers, and 128 bytes are used to move intermediate products |
michael@0 | 70 | * from the floating-point registers to the general registers. Stack |
michael@0 | 71 | * protocols assure proper alignment of these areas. |
michael@0 | 72 | * |
michael@0 | 73 | */ |
michael@0 | 74 | |
michael@0 | 75 | |
michael@0 | 76 | /* ====================================================================*/ |
michael@0 | 77 | /* symbolic definitions for PA-RISC registers */ |
michael@0 | 78 | /* in the MIPS style, avoids lots of case shifts */ |
michael@0 | 79 | /* assignments (except t4) preserve register number parity */ |
michael@0 | 80 | /* ====================================================================*/ |
michael@0 | 81 | |
michael@0 | 82 | #define zero %r0 /* permanent zero */ |
michael@0 | 83 | #define t5 %r1 /* temp register, altered by addil */ |
michael@0 | 84 | |
michael@0 | 85 | #define rp %r2 /* return pointer */ |
michael@0 | 86 | |
michael@0 | 87 | #define s1 %r3 /* callee saves register*/ |
michael@0 | 88 | #define s0 %r4 /* callee saves register*/ |
michael@0 | 89 | #define s3 %r5 /* callee saves register*/ |
michael@0 | 90 | #define s2 %r6 /* callee saves register*/ |
michael@0 | 91 | #define s5 %r7 /* callee saves register*/ |
michael@0 | 92 | #define s4 %r8 /* callee saves register*/ |
michael@0 | 93 | #define s7 %r9 /* callee saves register*/ |
michael@0 | 94 | #define s6 %r10 /* callee saves register*/ |
michael@0 | 95 | |
michael@0 | 96 | #define t1 %r19 /* caller saves register*/ |
michael@0 | 97 | #define t0 %r20 /* caller saves register*/ |
michael@0 | 98 | #define t3 %r21 /* caller saves register*/ |
michael@0 | 99 | #define t2 %r22 /* caller saves register*/ |
michael@0 | 100 | |
michael@0 | 101 | #define a3 %r23 /* fourth argument register, high word */ |
michael@0 | 102 | #define a2 %r24 /* third argument register, low word*/ |
michael@0 | 103 | #define a1 %r25 /* second argument register, high word*/ |
michael@0 | 104 | #define a0 %r26 /* first argument register, low word*/ |
michael@0 | 105 | |
michael@0 | 106 | #define v0 %r28 /* high order return value*/ |
michael@0 | 107 | #define v1 %r29 /* low order return value*/ |
michael@0 | 108 | |
michael@0 | 109 | #define sp %r30 /* stack pointer*/ |
michael@0 | 110 | #define t4 %r31 /* temporary register */ |
michael@0 | 111 | |
michael@0 | 112 | #define fa0 %fr4 /* first argument register*/ |
michael@0 | 113 | #define fa1 %fr5 /* second argument register*/ |
michael@0 | 114 | #define fa2 %fr6 /* third argument register*/ |
michael@0 | 115 | #define fa3 %fr7 /* fourth argument register*/ |
michael@0 | 116 | |
michael@0 | 117 | #define fa0r %fr4R /* first argument register, right (low order) half */ |
michael@0 | 118 | #define fa1r %fr5R /* second argument register, right (low order) half */ |
michael@0 | 119 | #define fa2r %fr6R /* third argument register, right (low order) half */ |
michael@0 | 120 | #define fa3r %fr7R /* fourth argument register, right (low order) half */ |
michael@0 | 121 | |
michael@0 | 122 | #define ft0 %fr8 /* caller saves register*/ |
michael@0 | 123 | #define ft1 %fr9 /* caller saves register*/ |
michael@0 | 124 | #define ft2 %fr10 /* caller saves register*/ |
michael@0 | 125 | #define ft3 %fr11 /* caller saves register*/ |
michael@0 | 126 | |
michael@0 | 127 | #define ft0r %fr8R /* caller saves register, right half */ |
michael@0 | 128 | #define ft1r %fr9R /* caller saves register, right half */ |
michael@0 | 129 | #define ft2r %fr10R /* caller saves register, right half */ |
michael@0 | 130 | #define ft3r %fr11R /* caller saves register, right half */ |
michael@0 | 131 | |
michael@0 | 132 | #define ft4 %fr22 /* caller saves register*/ |
michael@0 | 133 | #define ft5 %fr23 /* caller saves register*/ |
michael@0 | 134 | #define ft6 %fr24 /* caller saves register*/ |
michael@0 | 135 | #define ft7 %fr25 /* caller saves register*/ |
michael@0 | 136 | #define ft8 %fr26 /* caller saves register*/ |
michael@0 | 137 | #define ft9 %fr27 /* caller saves register*/ |
michael@0 | 138 | #define ft10 %fr28 /* caller saves register*/ |
michael@0 | 139 | #define ft11 %fr29 /* caller saves register*/ |
michael@0 | 140 | #define ft12 %fr30 /* caller saves register*/ |
michael@0 | 141 | #define ft13 %fr31 /* caller saves register*/ |
michael@0 | 142 | |
michael@0 | 143 | #define ft4r %fr22R /* caller saves register, right half */ |
michael@0 | 144 | #define ft5r %fr23R /* caller saves register, right half */ |
michael@0 | 145 | #define ft6r %fr24R /* caller saves register, right half */ |
michael@0 | 146 | #define ft7r %fr25R /* caller saves register, right half */ |
michael@0 | 147 | #define ft8r %fr26R /* caller saves register, right half */ |
michael@0 | 148 | #define ft9r %fr27R /* caller saves register, right half */ |
michael@0 | 149 | #define ft10r %fr28R /* caller saves register, right half */ |
michael@0 | 150 | #define ft11r %fr29R /* caller saves register, right half */ |
michael@0 | 151 | #define ft12r %fr30R /* caller saves register, right half */ |
michael@0 | 152 | #define ft13r %fr31R /* caller saves register, right half */ |
michael@0 | 153 | |
michael@0 | 154 | |
michael@0 | 155 | |
michael@0 | 156 | /* ================================================================== */ |
michael@0 | 157 | /* functional definitions for PA-RISC registers */ |
michael@0 | 158 | /* ================================================================== */ |
michael@0 | 159 | |
michael@0 | 160 | /* general registers */ |
michael@0 | 161 | |
michael@0 | 162 | #define T1 a0 /* temp, (length parameter ignored) */ |
michael@0 | 163 | |
michael@0 | 164 | #define pM a1 /* -> 64-bit multiplier */ |
michael@0 | 165 | #define T2 a1 /* temp, (after fetching multiplier) */ |
michael@0 | 166 | |
michael@0 | 167 | #define pA a2 /* -> multiplicand vector (8 64-bit words) */ |
michael@0 | 168 | #define T3 a2 /* temp, (after fetching multiplicand) */ |
michael@0 | 169 | |
michael@0 | 170 | #define pR a3 /* -> addend vector (9 64-bit doublewords), |
michael@0 | 171 | result vector (9 64-bit doublewords) */ |
michael@0 | 172 | |
michael@0 | 173 | #define S0 s0 /* callee saves summand registers */ |
michael@0 | 174 | #define S1 s1 |
michael@0 | 175 | #define S2 s2 |
michael@0 | 176 | #define S3 s3 |
michael@0 | 177 | #define S4 s4 |
michael@0 | 178 | #define S5 s5 |
michael@0 | 179 | #define S6 s6 |
michael@0 | 180 | #define S7 s7 |
michael@0 | 181 | |
michael@0 | 182 | #define S8 v0 /* caller saves summand registers */ |
michael@0 | 183 | #define S9 v1 |
michael@0 | 184 | #define S10 t0 |
michael@0 | 185 | #define S11 t1 |
michael@0 | 186 | #define S12 t2 |
michael@0 | 187 | #define S13 t3 |
michael@0 | 188 | #define S14 t4 |
michael@0 | 189 | #define S15 t5 |
michael@0 | 190 | |
michael@0 | 191 | |
michael@0 | 192 | |
michael@0 | 193 | /* floating-point registers */ |
michael@0 | 194 | |
michael@0 | 195 | #define M fa0 /* multiplier double word */ |
michael@0 | 196 | #define MR fa0r /* low order half of multiplier double word */ |
michael@0 | 197 | #define ML fa0 /* high order half of multiplier double word */ |
michael@0 | 198 | |
michael@0 | 199 | #define A0 fa2 /* multiplicand double word 0 */ |
michael@0 | 200 | #define A0R fa2r /* low order half of multiplicand double word */ |
michael@0 | 201 | #define A0L fa2 /* high order half of multiplicand double word */ |
michael@0 | 202 | |
michael@0 | 203 | #define A1 fa3 /* multiplicand double word 1 */ |
michael@0 | 204 | #define A1R fa3r /* low order half of multiplicand double word */ |
michael@0 | 205 | #define A1L fa3 /* high order half of multiplicand double word */ |
michael@0 | 206 | |
michael@0 | 207 | #define A2 ft0 /* multiplicand double word 2 */ |
michael@0 | 208 | #define A2R ft0r /* low order half of multiplicand double word */ |
michael@0 | 209 | #define A2L ft0 /* high order half of multiplicand double word */ |
michael@0 | 210 | |
michael@0 | 211 | #define A3 ft1 /* multiplicand double word 3 */ |
michael@0 | 212 | #define A3R ft1r /* low order half of multiplicand double word */ |
michael@0 | 213 | #define A3L ft1 /* high order half of multiplicand double word */ |
michael@0 | 214 | |
michael@0 | 215 | #define A4 ft2 /* multiplicand double word 4 */ |
michael@0 | 216 | #define A4R ft2r /* low order half of multiplicand double word */ |
michael@0 | 217 | #define A4L ft2 /* high order half of multiplicand double word */ |
michael@0 | 218 | |
michael@0 | 219 | #define A5 ft3 /* multiplicand double word 5 */ |
michael@0 | 220 | #define A5R ft3r /* low order half of multiplicand double word */ |
michael@0 | 221 | #define A5L ft3 /* high order half of multiplicand double word */ |
michael@0 | 222 | |
michael@0 | 223 | #define A6 ft4 /* multiplicand double word 6 */ |
michael@0 | 224 | #define A6R ft4r /* low order half of multiplicand double word */ |
michael@0 | 225 | #define A6L ft4 /* high order half of multiplicand double word */ |
michael@0 | 226 | |
michael@0 | 227 | #define A7 ft5 /* multiplicand double word 7 */ |
michael@0 | 228 | #define A7R ft5r /* low order half of multiplicand double word */ |
michael@0 | 229 | #define A7L ft5 /* high order half of multiplicand double word */ |
michael@0 | 230 | |
michael@0 | 231 | #define P0 ft6 /* product word 0 */ |
michael@0 | 232 | #define P1 ft7 /* product word 1 */ |
michael@0 | 233 | #define P2 ft8 /* product word 2 */ |
michael@0 | 234 | #define P3 ft9 /* product word 3 */ |
michael@0 | 235 | #define P4 ft10 /* product word 4 */ |
michael@0 | 236 | #define P5 ft11 /* product word 5 */ |
michael@0 | 237 | #define P6 ft12 /* product word 6 */ |
michael@0 | 238 | #define P7 ft13 /* product word 7 */ |
michael@0 | 239 | |
michael@0 | 240 | |
michael@0 | 241 | |
michael@0 | 242 | |
michael@0 | 243 | /* ====================================================================== */ |
michael@0 | 244 | /* symbolic definitions for HP-UX stack offsets */ |
michael@0 | 245 | /* symbolic definitions for memory NOPs */ |
michael@0 | 246 | /* ====================================================================== */ |
michael@0 | 247 | |
michael@0 | 248 | #define ST_SZ 192 /* stack area total size: 64 bytes save + 128 bytes transfer */ |
michael@0 | 249 | |
michael@0 | 250 | #define SV0 -192(sp) /* general register save area, slot for s0 */ |
michael@0 | 251 | #define SV1 -184(sp) /* slot for s1 */ |
michael@0 | 252 | #define SV2 -176(sp) /* slot for s2 */ |
michael@0 | 253 | #define SV3 -168(sp) /* slot for s3 */ |
michael@0 | 254 | #define SV4 -160(sp) /* slot for s4 */ |
michael@0 | 255 | #define SV5 -152(sp) /* slot for s5 */ |
michael@0 | 256 | #define SV6 -144(sp) /* slot for s6 */ |
michael@0 | 257 | #define SV7 -136(sp) /* slot for s7 */ |
michael@0 | 258 | |
michael@0 | 259 | #define XF0 -128(sp) /* data transfer area */ |
michael@0 | 260 | #define XF1 -120(sp) /* for floating-pt to integer regs */ |
michael@0 | 261 | #define XF2 -112(sp) |
michael@0 | 262 | #define XF3 -104(sp) |
michael@0 | 263 | #define XF4 -96(sp) |
michael@0 | 264 | #define XF5 -88(sp) |
michael@0 | 265 | #define XF6 -80(sp) |
michael@0 | 266 | #define XF7 -72(sp) |
michael@0 | 267 | #define XF8 -64(sp) /* not referenced by multacc512 */ |
michael@0 | 268 | #define XF9 -56(sp) |
michael@0 | 269 | #define XF10 -48(sp) /* not referenced by multacc512 */ |
michael@0 | 270 | #define XF11 -40(sp) |
michael@0 | 271 | #define XF12 -32(sp) /* not referenced by multacc512 */ |
michael@0 | 272 | #define XF13 -24(sp) |
michael@0 | 273 | #define XF14 -16(sp) /* not referenced by multacc512 */ |
michael@0 | 274 | #define XF15 -8(sp) |
michael@0 | 275 | |
michael@0 | 276 | #define mnop proberi (sp),3,zero /* memory NOP (not referenced by multacc512) */ |
michael@0 | 277 | |
michael@0 | 278 | |
michael@0 | 279 | |
michael@0 | 280 | |
michael@0 | 281 | /* ====================================================================== */ |
michael@0 | 282 | /* assembler formalities */ |
michael@0 | 283 | /* ====================================================================== */ |
michael@0 | 284 | |
michael@0 | 285 | #ifdef __LP64__ /* 64LP build */ |
michael@0 | 286 | .level 2.0W |
michael@0 | 287 | #else /* 32ILP build */ |
michael@0 | 288 | .level 2.0 |
michael@0 | 289 | #endif /* __LP64__ */ |
michael@0 | 290 | .space $TEXT$ |
michael@0 | 291 | .subspa $CODE$ |
michael@0 | 292 | .align 16 ; align the entry point that follows to 16 bytes |
michael@0 | 293 | |
michael@0 | 294 | /* ====================================================================== */ |
michael@0 | 295 | /* here to compute 64-bit x 512-bit product + 576-bit addend */ |
michael@0 | 296 | /* ====================================================================== */ |
michael@0 | 297 | |
michael@0 | 298 | multacc512 |
michael@0 | 299 | .PROC |
michael@0 | 300 | .CALLINFO |
michael@0 | 301 | .ENTRY |
michael@0 | 302 | fldd 0(pM),M ; multiplier double word |
michael@0 | 303 | ldo ST_SZ(sp),sp ; push stack |
michael@0 | 304 | |
michael@0 | 305 | fldd 0(pA),A0 ; multiplicand double word 0 |
michael@0 | 306 | std S1,SV1 ; save s1 |
michael@0 | 307 | |
michael@0 | 308 | fldd 16(pA),A2 ; multiplicand double word 2 |
michael@0 | 309 | std S3,SV3 ; save s3 |
michael@0 | 310 | |
michael@0 | 311 | fldd 32(pA),A4 ; multiplicand double word 4 |
michael@0 | 312 | std S5,SV5 ; save s5 |
michael@0 | 313 | |
michael@0 | 314 | fldd 48(pA),A6 ; multiplicand double word 6 |
michael@0 | 315 | std S7,SV7 ; save s7 |
michael@0 | 316 | |
michael@0 | 317 | |
michael@0 | 318 | std S0,SV0 ; save s0 |
michael@0 | 319 | fldd 8(pA),A1 ; multiplicand double word 1 |
michael@0 | 320 | xmpyu MR,A0L,P0 ; A0 cross 32-bit word products |
michael@0 | 321 | xmpyu ML,A0R,P2 ; ML * A0R |
michael@0 | 322 | |
michael@0 | 323 | std S2,SV2 ; save s2 |
michael@0 | 324 | fldd 24(pA),A3 ; multiplicand double word 3 |
michael@0 | 325 | xmpyu MR,A2L,P4 ; A2 cross 32-bit word products |
michael@0 | 326 | xmpyu ML,A2R,P6 ; ML * A2R |
michael@0 | 327 | |
michael@0 | 328 | std S4,SV4 ; save s4 |
michael@0 | 329 | fldd 40(pA),A5 ; multiplicand double word 5 |
michael@0 | 330 | |
michael@0 | 331 | std S6,SV6 ; save s6 |
michael@0 | 332 | fldd 56(pA),A7 ; multiplicand double word 7 |
michael@0 | 333 | |
michael@0 | 334 | |
michael@0 | 335 | fstd P0,XF0 ; MR * A0L |
michael@0 | 336 | xmpyu MR,A0R,P0 ; A0 right 32-bit word product |
michael@0 | 337 | xmpyu MR,A1L,P1 ; A1 cross 32-bit word product |
michael@0 | 338 | |
michael@0 | 339 | fstd P2,XF2 ; ML * A0R |
michael@0 | 340 | xmpyu ML,A0L,P2 ; A0 left 32-bit word product |
michael@0 | 341 | xmpyu ML,A1R,P3 ; A1 cross 32-bit word product |
michael@0 | 342 | |
michael@0 | 343 | fstd P4,XF4 ; MR * A2L |
michael@0 | 344 | xmpyu MR,A2R,P4 ; A2 right 32-bit word product |
michael@0 | 345 | xmpyu MR,A3L,P5 ; A3 cross 32-bit word product |
michael@0 | 346 | |
michael@0 | 347 | fstd P6,XF6 ; ML * A2R |
michael@0 | 348 | xmpyu ML,A2L,P6 ; A2 left 32-bit word product |
michael@0 | 349 | xmpyu ML,A3R,P7 ; A3 cross 32-bit word product |
michael@0 | 350 | |
michael@0 | 351 | |
michael@0 | 352 | ldd XF0,S0 ; MR * A0L |
michael@0 | 353 | fstd P1,XF1 ; MR * A1L |
michael@0 | 354 | |
michael@0 | 355 | ldd XF2,S2 ; ML * A0R |
michael@0 | 356 | fstd P3,XF3 ; ML * A1R |
michael@0 | 357 | |
michael@0 | 358 | ldd XF4,S4 ; MR * A2L |
michael@0 | 359 | fstd P5,XF5 ; MR * A3L |
michael@0 | 360 | xmpyu MR,A1R,P1 ; A1 parallel 32-bit word products |
michael@0 | 361 | xmpyu ML,A1L,P3 ; ML * A1L |
michael@0 | 362 | |
michael@0 | 363 | ldd XF6,S6 ; ML * A2R |
michael@0 | 364 | fstd P7,XF7 ; ML * A3R |
michael@0 | 365 | xmpyu MR,A3R,P5 ; A3 parallel 32-bit word products |
michael@0 | 366 | xmpyu ML,A3L,P7 ; ML * A3L |
michael@0 | 367 | |
michael@0 | 368 | |
michael@0 | 369 | fstd P0,XF0 ; MR * A0R |
michael@0 | 370 | ldd XF1,S1 ; MR * A1L |
michael@0 | 371 | nop |
michael@0 | 372 | add S0,S2,T1 ; A0 cross product sum |
michael@0 | 373 | |
michael@0 | 374 | fstd P2,XF2 ; ML * A0L |
michael@0 | 375 | ldd XF3,S3 ; ML * A1R |
michael@0 | 376 | add,dc zero,zero,S0 ; A0 cross product sum carry |
michael@0 | 377 | depd,z T1,31,32,S2 ; A0 cross product sum << 32 |
michael@0 | 378 | |
michael@0 | 379 | fstd P4,XF4 ; MR * A2R |
michael@0 | 380 | ldd XF5,S5 ; MR * A3L |
michael@0 | 381 | shrpd S0,T1,32,S0 ; A0 carry | cross product sum >> 32 |
michael@0 | 382 | add S4,S6,T3 ; A2 cross product sum |
michael@0 | 383 | |
michael@0 | 384 | fstd P6,XF6 ; ML * A2L |
michael@0 | 385 | ldd XF7,S7 ; ML * A3R |
michael@0 | 386 | add,dc zero,zero,S4 ; A2 cross product sum carry |
michael@0 | 387 | depd,z T3,31,32,S6 ; A2 cross product sum << 32 |
michael@0 | 388 | |
michael@0 | 389 | |
michael@0 | 390 | ldd XF0,S8 ; MR * A0R |
michael@0 | 391 | fstd P1,XF1 ; MR * A1R |
michael@0 | 392 | xmpyu MR,A4L,P0 ; A4 cross 32-bit word product |
michael@0 | 393 | xmpyu MR,A5L,P1 ; A5 cross 32-bit word product |
michael@0 | 394 | |
michael@0 | 395 | ldd XF2,S10 ; ML * A0L |
michael@0 | 396 | fstd P3,XF3 ; ML * A1L |
michael@0 | 397 | xmpyu ML,A4R,P2 ; A4 cross 32-bit word product |
michael@0 | 398 | xmpyu ML,A5R,P3 ; A5 cross 32-bit word product |
michael@0 | 399 | |
michael@0 | 400 | ldd XF4,S12 ; MR * A2R |
michael@0 | 401 | fstd P5,XF5 ; MR * A3R |
michael@0 | 402 | xmpyu MR,A6L,P4 ; A6 cross 32-bit word product |
michael@0 | 403 | xmpyu MR,A7L,P5 ; A7 cross 32-bit word product |
michael@0 | 404 | |
michael@0 | 405 | ldd XF6,S14 ; ML * A2L |
michael@0 | 406 | fstd P7,XF7 ; ML * A3L |
michael@0 | 407 | xmpyu ML,A6R,P6 ; A6 cross 32-bit word product |
michael@0 | 408 | xmpyu ML,A7R,P7 ; A7 cross 32-bit word product |
michael@0 | 409 | |
michael@0 | 410 | |
michael@0 | 411 | fstd P0,XF0 ; MR * A4L |
michael@0 | 412 | ldd XF1,S9 ; MR * A1R |
michael@0 | 413 | shrpd S4,T3,32,S4 ; A2 carry | cross product sum >> 32 |
michael@0 | 414 | add S1,S3,T1 ; A1 cross product sum |
michael@0 | 415 | |
michael@0 | 416 | fstd P2,XF2 ; ML * A4R |
michael@0 | 417 | ldd XF3,S11 ; ML * A1L |
michael@0 | 418 | add,dc zero,zero,S1 ; A1 cross product sum carry |
michael@0 | 419 | depd,z T1,31,32,S3 ; A1 cross product sum << 32 |
michael@0 | 420 | |
michael@0 | 421 | fstd P4,XF4 ; MR * A6L |
michael@0 | 422 | ldd XF5,S13 ; MR * A3R |
michael@0 | 423 | shrpd S1,T1,32,S1 ; A1 carry | cross product sum >> 32 |
michael@0 | 424 | add S5,S7,T3 ; A3 cross product sum |
michael@0 | 425 | |
michael@0 | 426 | fstd P6,XF6 ; ML * A6R |
michael@0 | 427 | ldd XF7,S15 ; ML * A3L |
michael@0 | 428 | add,dc zero,zero,S5 ; A3 cross product sum carry |
michael@0 | 429 | depd,z T3,31,32,S7 ; A3 cross product sum << 32 |
michael@0 | 430 | |
michael@0 | 431 | |
michael@0 | 432 | shrpd S5,T3,32,S5 ; A3 carry | cross product sum >> 32 |
michael@0 | 433 | add S2,S8,S8 ; M * A0 right doubleword, P0 doubleword |
michael@0 | 434 | |
michael@0 | 435 | add,dc S0,S10,S10 ; M * A0 left doubleword |
michael@0 | 436 | add S3,S9,S9 ; M * A1 right doubleword |
michael@0 | 437 | |
michael@0 | 438 | add,dc S1,S11,S11 ; M * A1 left doubleword |
michael@0 | 439 | add S6,S12,S12 ; M * A2 right doubleword |
michael@0 | 440 | |
michael@0 | 441 | |
michael@0 | 442 | ldd 24(pR),S3 ; Addend word 3 |
michael@0 | 443 | fstd P1,XF1 ; MR * A5L |
michael@0 | 444 | add,dc S4,S14,S14 ; M * A2 left doubleword |
michael@0 | 445 | xmpyu MR,A5R,P1 ; A5 right 32-bit word product |
michael@0 | 446 | |
michael@0 | 447 | ldd 8(pR),S1 ; Addend word 1 |
michael@0 | 448 | fstd P3,XF3 ; ML * A5R |
michael@0 | 449 | add S7,S13,S13 ; M * A3 right doubleword |
michael@0 | 450 | xmpyu ML,A5L,P3 ; A5 left 32-bit word product |
michael@0 | 451 | |
michael@0 | 452 | ldd 0(pR),S7 ; Addend word 0 |
michael@0 | 453 | fstd P5,XF5 ; MR * A7L |
michael@0 | 454 | add,dc S5,S15,S15 ; M * A3 left doubleword |
michael@0 | 455 | xmpyu MR,A7R,P5 ; A7 right 32-bit word product |
michael@0 | 456 | |
michael@0 | 457 | ldd 16(pR),S5 ; Addend word 2 |
michael@0 | 458 | fstd P7,XF7 ; ML * A7R |
michael@0 | 459 | add S10,S9,S9 ; P1 doubleword |
michael@0 | 460 | xmpyu ML,A7L,P7 ; A7 left 32-bit word products |
michael@0 | 461 | |
michael@0 | 462 | |
michael@0 | 463 | ldd XF0,S0 ; MR * A4L |
michael@0 | 464 | fstd P1,XF9 ; MR * A5R |
michael@0 | 465 | add,dc S11,S12,S12 ; P2 doubleword |
michael@0 | 466 | xmpyu MR,A4R,P0 ; A4 right 32-bit word product |
michael@0 | 467 | |
michael@0 | 468 | ldd XF2,S2 ; ML * A4R |
michael@0 | 469 | fstd P3,XF11 ; ML * A5L |
michael@0 | 470 | add,dc S14,S13,S13 ; P3 doubleword |
michael@0 | 471 | xmpyu ML,A4L,P2 ; A4 left 32-bit word product |
michael@0 | 472 | |
michael@0 | 473 | ldd XF6,S6 ; ML * A6R |
michael@0 | 474 | fstd P5,XF13 ; MR * A7R |
michael@0 | 475 | add,dc zero,S15,T2 ; P4 partial doubleword |
michael@0 | 476 | xmpyu MR,A6R,P4 ; A6 right 32-bit word product |
michael@0 | 477 | |
michael@0 | 478 | ldd XF4,S4 ; MR * A6L |
michael@0 | 479 | fstd P7,XF15 ; ML * A7L |
michael@0 | 480 | add S7,S8,S8 ; R0 + P0, new R0 doubleword |
michael@0 | 481 | xmpyu ML,A6L,P6 ; A6 left 32-bit word product |
michael@0 | 482 | |
michael@0 | 483 | |
michael@0 | 484 | fstd P0,XF0 ; MR * A4R |
michael@0 | 485 | ldd XF7,S7 ; ML * A7R |
michael@0 | 486 | add,dc S1,S9,S9 ; c + R1 + P1, new R1 doubleword |
michael@0 | 487 | |
michael@0 | 488 | fstd P2,XF2 ; ML * A4L |
michael@0 | 489 | ldd XF1,S1 ; MR * A5L |
michael@0 | 490 | add,dc S5,S12,S12 ; c + R2 + P2, new R2 doubleword |
michael@0 | 491 | |
michael@0 | 492 | fstd P4,XF4 ; MR * A6R |
michael@0 | 493 | ldd XF5,S5 ; MR * A7L |
michael@0 | 494 | add,dc S3,S13,S13 ; c + R3 + P3, new R3 doubleword |
michael@0 | 495 | |
michael@0 | 496 | fstd P6,XF6 ; ML * A6L |
michael@0 | 497 | ldd XF3,S3 ; ML * A5R |
michael@0 | 498 | add,dc zero,T2,T2 ; c + partial P4 |
michael@0 | 499 | add S0,S2,T1 ; A4 cross product sum |
michael@0 | 500 | |
michael@0 | 501 | |
michael@0 | 502 | std S8,0(pR) ; save R0 |
michael@0 | 503 | add,dc zero,zero,S0 ; A4 cross product sum carry |
michael@0 | 504 | depd,z T1,31,32,S2 ; A4 cross product sum << 32 |
michael@0 | 505 | |
michael@0 | 506 | std S9,8(pR) ; save R1 |
michael@0 | 507 | shrpd S0,T1,32,S0 ; A4 carry | cross product sum >> 32 |
michael@0 | 508 | add S4,S6,T3 ; A6 cross product sum |
michael@0 | 509 | |
michael@0 | 510 | std S12,16(pR) ; save R2 |
michael@0 | 511 | add,dc zero,zero,S4 ; A6 cross product sum carry |
michael@0 | 512 | depd,z T3,31,32,S6 ; A6 cross product sum << 32 |
michael@0 | 513 | |
michael@0 | 514 | |
michael@0 | 515 | std S13,24(pR) ; save R3 |
michael@0 | 516 | shrpd S4,T3,32,S4 ; A6 carry | cross product sum >> 32 |
michael@0 | 517 | add S1,S3,T1 ; A5 cross product sum |
michael@0 | 518 | |
michael@0 | 519 | ldd XF0,S8 ; MR * A4R |
michael@0 | 520 | add,dc zero,zero,S1 ; A5 cross product sum carry |
michael@0 | 521 | depd,z T1,31,32,S3 ; A5 cross product sum << 32 |
michael@0 | 522 | |
michael@0 | 523 | ldd XF2,S10 ; ML * A4L |
michael@0 | 524 | ldd XF9,S9 ; MR * A5R |
michael@0 | 525 | shrpd S1,T1,32,S1 ; A5 carry | cross product sum >> 32 |
michael@0 | 526 | add S5,S7,T3 ; A7 cross product sum |
michael@0 | 527 | |
michael@0 | 528 | ldd XF4,S12 ; MR * A6R |
michael@0 | 529 | ldd XF11,S11 ; ML * A5L |
michael@0 | 530 | add,dc zero,zero,S5 ; A7 cross product sum carry |
michael@0 | 531 | depd,z T3,31,32,S7 ; A7 cross product sum << 32 |
michael@0 | 532 | |
michael@0 | 533 | ldd XF6,S14 ; ML * A6L |
michael@0 | 534 | ldd XF13,S13 ; MR * A7R |
michael@0 | 535 | shrpd S5,T3,32,S5 ; A7 carry | cross product sum >> 32 |
michael@0 | 536 | add S2,S8,S8 ; M * A4 right doubleword |
michael@0 | 537 | |
michael@0 | 538 | |
michael@0 | 539 | ldd XF15,S15 ; ML * A7L |
michael@0 | 540 | add,dc S0,S10,S10 ; M * A4 left doubleword |
michael@0 | 541 | add S3,S9,S9 ; M * A5 right doubleword |
michael@0 | 542 | |
michael@0 | 543 | add,dc S1,S11,S11 ; M * A5 left doubleword |
michael@0 | 544 | add S6,S12,S12 ; M * A6 right doubleword |
michael@0 | 545 | |
michael@0 | 546 | ldd 32(pR),S0 ; Addend word 4 |
michael@0 | 547 | ldd 40(pR),S1 ; Addend word 5 |
michael@0 | 548 | add,dc S4,S14,S14 ; M * A6 left doubleword |
michael@0 | 549 | add S7,S13,S13 ; M * A7 right doubleword |
michael@0 | 550 | |
michael@0 | 551 | ldd 48(pR),S2 ; Addend word 6 |
michael@0 | 552 | ldd 56(pR),S3 ; Addend word 7 |
michael@0 | 553 | add,dc S5,S15,S15 ; M * A7 left doubleword |
michael@0 | 554 | add S8,T2,S8 ; P4 doubleword |
michael@0 | 555 | |
michael@0 | 556 | ldd 64(pR),S4 ; Addend word 8 |
michael@0 | 557 | ldd SV5,s5 ; restore s5 |
michael@0 | 558 | add,dc S10,S9,S9 ; P5 doubleword |
michael@0 | 559 | add,dc S11,S12,S12 ; P6 doubleword |
michael@0 | 560 | |
michael@0 | 561 | |
michael@0 | 562 | ldd SV6,s6 ; restore s6 |
michael@0 | 563 | ldd SV7,s7 ; restore s7 |
michael@0 | 564 | add,dc S14,S13,S13 ; P7 doubleword |
michael@0 | 565 | add,dc zero,S15,S15 ; P8 doubleword |
michael@0 | 566 | |
michael@0 | 567 | add S0,S8,S8 ; new R4 doubleword |
michael@0 | 568 | |
michael@0 | 569 | ldd SV0,s0 ; restore s0 |
michael@0 | 570 | std S8,32(pR) ; save R4 |
michael@0 | 571 | add,dc S1,S9,S9 ; new R5 doubleword |
michael@0 | 572 | |
michael@0 | 573 | ldd SV1,s1 ; restore s1 |
michael@0 | 574 | std S9,40(pR) ; save R5 |
michael@0 | 575 | add,dc S2,S12,S12 ; new R6 doubleword |
michael@0 | 576 | |
michael@0 | 577 | ldd SV2,s2 ; restore s2 |
michael@0 | 578 | std S12,48(pR) ; save R6 |
michael@0 | 579 | add,dc S3,S13,S13 ; new R7 doubleword |
michael@0 | 580 | |
michael@0 | 581 | ldd SV3,s3 ; restore s3 |
michael@0 | 582 | std S13,56(pR) ; save R7 |
michael@0 | 583 | add,dc S4,S15,S15 ; new R8 doubleword |
michael@0 | 584 | |
michael@0 | 585 | ldd SV4,s4 ; restore s4 |
michael@0 | 586 | std S15,64(pR) ; save result[8] |
michael@0 | 587 | add,dc zero,zero,v0 ; return carry from R8 |
michael@0 | 588 | |
michael@0 | 589 | CMPIB,*= 0,v0,$L0 ; if no overflow, exit |
michael@0 | 590 | LDO 8(pR),pR ; delay slot: advance pR by one doubleword on both paths |
michael@0 | 591 | |
michael@0 | 592 | $FINAL1 ; Final carry propagation into the doublewords above result[8] |
michael@0 | 593 | LDD 64(pR),v0 ; fetch the next higher result doubleword |
michael@0 | 594 | LDO 8(pR),pR ; advance pR by one doubleword |
michael@0 | 595 | ADDI 1,v0,v0 ; add the carry |
michael@0 | 596 | CMPIB,*= 0,v0,$FINAL1 ; Keep looping if there is a carry. |
michael@0 | 597 | STD v0,56(pR) ; delay slot: store the incremented doubleword back where it was loaded |
michael@0 | 598 | $L0 |
michael@0 | 599 | bv zero(rp) ; -> caller |
michael@0 | 600 | ldo -ST_SZ(sp),sp ; pop stack |
michael@0 | 601 | |
michael@0 | 602 | /* ====================================================================== */ |
michael@0 | 603 | /* end of module */ |
michael@0 | 604 | /* ====================================================================== */ |
michael@0 | 605 | |
michael@0 | 606 | |
michael@0 | 607 | bve (rp) ; NOTE(review): appears unreachable, it follows the bv return above - confirm |
michael@0 | 608 | .EXIT |
michael@0 | 609 | nop |
michael@0 | 610 | .PROCEND |
michael@0 | 611 | .SPACE $TEXT$ |
michael@0 | 612 | .SUBSPA $CODE$ |
michael@0 | 613 | .EXPORT multacc512,ENTRY |
michael@0 | 614 | |
michael@0 | 615 | .end |