|
1 /* This Source Code Form is subject to the terms of the Mozilla Public |
|
2 * License, v. 2.0. If a copy of the MPL was not distributed with this |
|
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
|
4 /* |
|
5 * |
|
6 * This PA-RISC 2.0 function computes the product of two unsigned integers, |
|
7 * and adds the result to a previously computed integer. The multiplicand |
|
8 * is a 512-bit (64-byte, eight doubleword) unsigned integer, stored in |
|
9 * memory in little-double-wordian order. The multiplier is an unsigned |
|
10 * 64-bit integer. The previously computed integer to which the product is |
|
11 * added is located in the result ("res") area, and is assumed to be a |
|
12 * 576-bit (72-byte, nine doubleword) unsigned integer, stored in memory |
|
13 * in little-double-wordian order. This value normally will be the result |
|
14 * of a previously computed nine doubleword result. It is not necessary |
|
15 * to pad the multiplicand with an additional 64-bit zero doubleword. |
|
16 * |
|
17 * Multiplicand, multiplier, and addend ideally should be aligned at |
|
18 * 16-byte boundaries for best performance. The code will function |
|
19 * correctly for alignment at eight-byte boundaries which are not 16-byte |
|
20 * boundaries, but the execution may be slightly slower due to even/odd |
|
21 * bank conflicts on PA-RISC 8000 processors. |
|
22 * |
|
23 * This function is designed to accept the same calling sequence as Bill |
|
24 * Ackerman's "maxpy_little" function. The carry from the ninth doubleword |
|
25 * of the result is written to the tenth word of the result, as is done by |
|
26 * Bill Ackerman's function. The final carry also is returned as an |
|
27 * integer, which may be ignored. The function prototype may be either |
|
28 * of the following: |
|
29 * |
|
30 * void multacc512( int l, chunk* m, const chunk* a, chunk* res ); |
|
31 * or |
|
32 * int multacc512( int l, chunk* m, const chunk* a, chunk* res ); |
|
33 * |
|
34 * where: "l" originally denoted vector lengths. This parameter is |
|
35 * ignored. This function always assumes a multiplicand length of |
|
36 * 512 bits (eight doublewords), and addend and result lengths of |
|
37 * 576 bits (nine doublewords). |
|
38 * |
|
39 * "m" is a pointer to the doubleword multiplier, ideally aligned |
|
40 * on a 16-byte boundary. |
|
41 * |
|
42 * "a" is a pointer to the eight-doubleword multiplicand, stored |
|
43 * in little-double-wordian order, and ideally aligned on a 16-byte |
|
44 * boundary. |
|
45 * |
|
46 * "res" is a pointer to the nine doubleword addend, and to the |
|
47 * nine-doubleword product computed by this function. The result |
|
48 * also is stored in little-double-wordian order, and ideally is |
|
49 * aligned on a 16-byte boundary. It is expected that the alignment |
|
50 * of the "res" area may alternate between even/odd doubleword |
|
51 * boundaries for successive calls for 512-bit x 512-bit |
|
52 * multiplications. |
|
53 * |
|
54 * The code for this function has been scheduled to use the parallelism |
|
55 * of the PA-RISC 8000 series microprocessors as well as the author was |
|
56 * able. Comments and/or suggestions for improvement are welcomed. |
|
57 * |
|
58 * The code is "64-bit safe". This means it may be called in either |
|
59 * the ILP32 context or the LP64 context. All 64 bits of registers are |
|
60 * saved and restored. |
|
61 * |
|
62 * This code is self-contained. It requires no other header files in order |
|
63 * to compile and to be linkable on a PA-RISC 2.0 machine. Symbolic |
|
64 * definitions for registers and stack offsets are included within this |
|
65 * one source file. |
|
66 * |
|
67 * This is a leaf routine. As such, minimal use is made of the stack area. |
|
68 * Of the 192 bytes allocated, 64 bytes are used for saving/restoring eight |
|
69 * general registers, and 128 bytes are used to move intermediate products |
|
70 * from the floating-point registers to the general registers. Stack |
|
71 * protocols assure proper alignment of these areas. |
|
72 * |
|
73 */ |
|
74 |
|
75 |
|
76 /* ====================================================================*/ |
|
77 /* symbolic definitions for PA-RISC registers */ |
|
78 /* in the MIPS style, avoids lots of case shifts */ |
|
79 /* assignments (except t4) preserve register number parity: an alias with an odd index names an odd-numbered hardware register (s1 = %r3) and an alias with an even index names an even-numbered one (s0 = %r4); t4 = %r31 is the one exception */ |
|
80 /* ====================================================================*/ |
|
81 |
|
82 #define zero %r0 /* permanent zero */ |
|
83 #define t5 %r1 /* temp register, altered by addil */ |
|
84 |
|
85 #define rp %r2 /* return pointer */ |
|
86 |
|
87 #define s1 %r3 /* callee saves register (spilled to SV1 by multacc512) */ |
|
88 #define s0 %r4 /* callee saves register (spilled to SV0 by multacc512) */ |
|
89 #define s3 %r5 /* callee saves register (spilled to SV3 by multacc512) */ |
|
90 #define s2 %r6 /* callee saves register (spilled to SV2 by multacc512) */ |
|
91 #define s5 %r7 /* callee saves register (spilled to SV5 by multacc512) */ |
|
92 #define s4 %r8 /* callee saves register (spilled to SV4 by multacc512) */ |
|
93 #define s7 %r9 /* callee saves register (spilled to SV7 by multacc512) */ |
|
94 #define s6 %r10 /* callee saves register (spilled to SV6 by multacc512) */ |
|
95 |
|
96 #define t1 %r19 /* caller saves register */ |
|
97 #define t0 %r20 /* caller saves register */ |
|
98 #define t3 %r21 /* caller saves register */ |
|
99 #define t2 %r22 /* caller saves register */ |
|
100 |
|
101 #define a3 %r23 /* fourth argument register, high word ("res" pointer in multacc512) */ |
|
102 #define a2 %r24 /* third argument register, low word ("a" pointer in multacc512) */ |
|
103 #define a1 %r25 /* second argument register, high word ("m" pointer in multacc512) */ |
|
104 #define a0 %r26 /* first argument register, low word ("l", ignored by multacc512) */ |
|
105 |
|
106 #define v0 %r28 /* high order return value */ |
|
107 #define v1 %r29 /* low order return value */ |
|
108 |
|
109 #define sp %r30 /* stack pointer */ |
|
110 #define t4 %r31 /* temporary register (the parity exception noted above) */ |
|
111 |
|
112 #define fa0 %fr4 /* first floating-point argument register */ |
|
113 #define fa1 %fr5 /* second floating-point argument register */ |
|
114 #define fa2 %fr6 /* third floating-point argument register */ |
|
115 #define fa3 %fr7 /* fourth floating-point argument register */ |
|
116 |
|
117 #define fa0r %fr4R /* right half of fa0 (used below as the low order 32 bits) */ |
|
118 #define fa1r %fr5R /* right half of fa1 */ |
|
119 #define fa2r %fr6R /* right half of fa2 */ |
|
120 #define fa3r %fr7R /* right half of fa3 */ |
|
121 |
|
122 #define ft0 %fr8 /* caller saves register */ |
|
123 #define ft1 %fr9 /* caller saves register */ |
|
124 #define ft2 %fr10 /* caller saves register */ |
|
125 #define ft3 %fr11 /* caller saves register */ |
|
126 |
|
127 #define ft0r %fr8R /* right half of ft0 (caller saves) */ |
|
128 #define ft1r %fr9R /* right half of ft1 (caller saves) */ |
|
129 #define ft2r %fr10R /* right half of ft2 (caller saves) */ |
|
130 #define ft3r %fr11R /* right half of ft3 (caller saves) */ |
|
131 |
|
132 #define ft4 %fr22 /* caller saves register */ |
|
133 #define ft5 %fr23 /* caller saves register */ |
|
134 #define ft6 %fr24 /* caller saves register */ |
|
135 #define ft7 %fr25 /* caller saves register */ |
|
136 #define ft8 %fr26 /* caller saves register */ |
|
137 #define ft9 %fr27 /* caller saves register */ |
|
138 #define ft10 %fr28 /* caller saves register */ |
|
139 #define ft11 %fr29 /* caller saves register */ |
|
140 #define ft12 %fr30 /* caller saves register */ |
|
141 #define ft13 %fr31 /* caller saves register */ |
|
142 |
|
143 #define ft4r %fr22R /* right half of ft4 (caller saves) */ |
|
144 #define ft5r %fr23R /* right half of ft5 (caller saves) */ |
|
145 #define ft6r %fr24R /* right half of ft6 (caller saves) */ |
|
146 #define ft7r %fr25R /* right half of ft7 (caller saves) */ |
|
147 #define ft8r %fr26R /* right half of ft8 (caller saves) */ |
|
148 #define ft9r %fr27R /* right half of ft9 (caller saves) */ |
|
149 #define ft10r %fr28R /* right half of ft10 (caller saves) */ |
|
150 #define ft11r %fr29R /* right half of ft11 (caller saves) */ |
|
151 #define ft12r %fr30R /* right half of ft12 (caller saves) */ |
|
152 #define ft13r %fr31R /* right half of ft13 (caller saves) */ |
|
153 |
|
154 |
|
155 |
|
156 /* ================================================================== */ |
|
157 /* functional definitions for PA-RISC registers */ |
|
158 /* ================================================================== */ |
|
159 |
|
160 /* general registers */ |
|
161 |
|
162 #define T1 a0 /* temp for cross product sums (the length parameter it carries on entry is ignored) */ |
|
163 |
|
164 #define pM a1 /* -> 64-bit multiplier */ |
|
165 #define T2 a1 /* temp, (after fetching multiplier); holds the partial P4 doubleword */ |
|
166 |
|
167 #define pA a2 /* -> multiplicand vector (8 64-bit doublewords) */ |
|
168 #define T3 a2 /* temp for cross product sums, (after fetching multiplicand) */ |
|
169 |
|
170 #define pR a3 /* -> addend vector (9 64-bit doublewords), |
|
171 result vector (9 64-bit doublewords) */ |
|
172 |
|
173 #define S0 s0 /* callee saves summand registers */ |
|
174 #define S1 s1 /* (S0-S7 are saved on entry and restored before return) */ |
|
175 #define S2 s2 |
|
176 #define S3 s3 |
|
177 #define S4 s4 |
|
178 #define S5 s5 |
|
179 #define S6 s6 |
|
180 #define S7 s7 |
|
181 |
|
182 #define S8 v0 /* caller saves summand registers; v0 also returns the final carry */ |
|
183 #define S9 v1 |
|
184 #define S10 t0 |
|
185 #define S11 t1 |
|
186 #define S12 t2 |
|
187 #define S13 t3 |
|
188 #define S14 t4 |
|
189 #define S15 t5 |
|
190 |
|
191 |
|
192 |
|
193 /* floating-point registers */ |
/* For each doubleword X: the xR alias is the right (low order) 32-bit half, */
/* and the xL alias names the same register as X and is used as the left     */
/* (high order) 32-bit operand of xmpyu.                                     */
|
194 |
|
195 #define M fa0 /* multiplier double word */ |
|
196 #define MR fa0r /* low order half of multiplier double word */ |
|
197 #define ML fa0 /* high order half of multiplier double word */ |
|
198 |
|
199 #define A0 fa2 /* multiplicand double word 0 */ |
|
200 #define A0R fa2r /* low order half of multiplicand double word 0 */ |
|
201 #define A0L fa2 /* high order half of multiplicand double word 0 */ |
|
202 |
|
203 #define A1 fa3 /* multiplicand double word 1 */ |
|
204 #define A1R fa3r /* low order half of multiplicand double word 1 */ |
|
205 #define A1L fa3 /* high order half of multiplicand double word 1 */ |
|
206 |
|
207 #define A2 ft0 /* multiplicand double word 2 */ |
|
208 #define A2R ft0r /* low order half of multiplicand double word 2 */ |
|
209 #define A2L ft0 /* high order half of multiplicand double word 2 */ |
|
210 |
|
211 #define A3 ft1 /* multiplicand double word 3 */ |
|
212 #define A3R ft1r /* low order half of multiplicand double word 3 */ |
|
213 #define A3L ft1 /* high order half of multiplicand double word 3 */ |
|
214 |
|
215 #define A4 ft2 /* multiplicand double word 4 */ |
|
216 #define A4R ft2r /* low order half of multiplicand double word 4 */ |
|
217 #define A4L ft2 /* high order half of multiplicand double word 4 */ |
|
218 |
|
219 #define A5 ft3 /* multiplicand double word 5 */ |
|
220 #define A5R ft3r /* low order half of multiplicand double word 5 */ |
|
221 #define A5L ft3 /* high order half of multiplicand double word 5 */ |
|
222 |
|
223 #define A6 ft4 /* multiplicand double word 6 */ |
|
224 #define A6R ft4r /* low order half of multiplicand double word 6 */ |
|
225 #define A6L ft4 /* high order half of multiplicand double word 6 */ |
|
226 |
|
227 #define A7 ft5 /* multiplicand double word 7 */ |
|
228 #define A7R ft5r /* low order half of multiplicand double word 7 */ |
|
229 #define A7L ft5 /* high order half of multiplicand double word 7 */ |
|
230 |
|
231 #define P0 ft6 /* xmpyu product register 0 (reused for successive partial products) */ |
|
232 #define P1 ft7 /* xmpyu product register 1 */ |
|
233 #define P2 ft8 /* xmpyu product register 2 */ |
|
234 #define P3 ft9 /* xmpyu product register 3 */ |
|
235 #define P4 ft10 /* xmpyu product register 4 */ |
|
236 #define P5 ft11 /* xmpyu product register 5 */ |
|
237 #define P6 ft12 /* xmpyu product register 6 */ |
|
238 #define P7 ft13 /* xmpyu product register 7 */ |
|
239 |
|
240 |
|
241 |
|
242 |
|
243 /* ====================================================================== */ |
|
244 /* symbolic definitions for HP-UX stack offsets */ |
|
245 /* symbolic definitions for memory NOPs */ |
/* All offsets are relative to sp after multacc512 has advanced it by ST_SZ: */
/* eight 8-byte save slots (SV0-SV7) followed by sixteen 8-byte transfer     */
/* slots (XF0-XF15), 64 + 128 = 192 bytes in total.                          */
|
246 /* ====================================================================== */ |
|
247 |
|
248 #define ST_SZ 192 /* stack area total size, in bytes */ |
|
249 |
|
250 #define SV0 -192(sp) /* general register save area: SVn holds callee-saved sn */ |
|
251 #define SV1 -184(sp) |
|
252 #define SV2 -176(sp) |
|
253 #define SV3 -168(sp) |
|
254 #define SV4 -160(sp) |
|
255 #define SV5 -152(sp) |
|
256 #define SV6 -144(sp) |
|
257 #define SV7 -136(sp) |
|
258 |
|
259 #define XF0 -128(sp) /* data transfer area */ |
|
260 #define XF1 -120(sp) /* for floating-pt to integer regs */ |
|
261 #define XF2 -112(sp) |
|
262 #define XF3 -104(sp) |
|
263 #define XF4 -96(sp) |
|
264 #define XF5 -88(sp) |
|
265 #define XF6 -80(sp) |
|
266 #define XF7 -72(sp) |
|
267 #define XF8 -64(sp) /* XF8, XF10, XF12 and XF14 are not referenced by the code visible in this file */ |
|
268 #define XF9 -56(sp) |
|
269 #define XF10 -48(sp) |
|
270 #define XF11 -40(sp) |
|
271 #define XF12 -32(sp) |
|
272 #define XF13 -24(sp) |
|
273 #define XF14 -16(sp) |
|
274 #define XF15 -8(sp) |
|
275 |
|
276 #define mnop proberi (sp),3,zero /* memory NOP (result discarded into the zero register; not used by the code visible in this file) */ |
|
277 |
|
278 |
|
279 |
|
280 |
|
281 /* ====================================================================== */ |
|
282 /* assembler formalities */ |
/* Select the architecture level from the build mode (2.0W when __LP64__ is  */
/* defined, presumably the wide 64-bit mode; plain 2.0 otherwise), then      */
/* place the routine in the $TEXT$ space, $CODE$ subspace, 16-byte aligned.  */
|
283 /* ====================================================================== */ |
|
284 |
|
285 #ifdef __LP64__ |
|
286 .level 2.0W |
|
287 #else |
|
288 .level 2.0 |
|
289 #endif |
|
290 .space $TEXT$ |
|
291 .subspa $CODE$ |
|
292 .align 16 |
|
293 |
|
294 /* ====================================================================== */ |
|
295 /* here to compute 64-bit x 512-bit product + 576-bit addend */ |
/*                                                                          */
/* multacc512(l, m, a, res):  res[0..8] += (*m) * a[0..7]                   */
/*   a0 (l)   ignored on entry, reused as scratch T1                        */
/*   a1 (pM)  -> 64-bit multiplier, reused as scratch T2 once loaded        */
/*   a2 (pA)  -> eight-doubleword multiplicand, reused as scratch T3        */
/*   a3 (pR)  -> nine-doubleword addend, overwritten with the result        */
/* Every 64x64 product is built from four 32x32 xmpyu products, which are   */
/* moved from the FP registers to the general registers through the XF      */
/* stack slots (fstd followed by ldd).  A carry out of res[8] is propagated */
/* into res[9] and, while it keeps carrying, into the following doublewords.*/
/* On return v0 is zero if res[8] produced no carry; otherwise it holds the */
/* last doubleword written by the propagation loop, which is non-zero.      */
|
296 /* ====================================================================== */ |
|
297 |
|
298 multacc512 |
|
299 .PROC |
|
300 .CALLINFO |
|
301 .ENTRY |
/* Load the multiplier and the eight multiplicand doublewords, allocate the */
/* ST_SZ-byte frame, and spill callee-saved s0-s7.  The cross products for  */
/* A0 and A2 are issued as soon as their operands have been loaded.         */
|
302 fldd 0(pM),M ; multiplier double word |
|
303 ldo ST_SZ(sp),sp ; push stack |
|
304 |
|
305 fldd 0(pA),A0 ; multiplicand double word 0 |
|
306 std S1,SV1 ; save s1 |
|
307 |
|
308 fldd 16(pA),A2 ; multiplicand double word 2 |
|
309 std S3,SV3 ; save s3 |
|
310 |
|
311 fldd 32(pA),A4 ; multiplicand double word 4 |
|
312 std S5,SV5 ; save s5 |
|
313 |
|
314 fldd 48(pA),A6 ; multiplicand double word 6 |
|
315 std S7,SV7 ; save s7 |
|
316 |
|
317 |
|
318 std S0,SV0 ; save s0 |
|
319 fldd 8(pA),A1 ; multiplicand double word 1 |
|
320 xmpyu MR,A0L,P0 ; A0 cross 32-bit word products |
|
321 xmpyu ML,A0R,P2 |
|
322 |
|
323 std S2,SV2 ; save s2 |
|
324 fldd 24(pA),A3 ; multiplicand double word 3 |
|
325 xmpyu MR,A2L,P4 ; A2 cross 32-bit word products |
|
326 xmpyu ML,A2R,P6 |
|
327 |
|
328 std S4,SV4 ; save s4 |
|
329 fldd 40(pA),A5 ; multiplicand double word 5 |
|
330 |
|
331 std S6,SV6 ; save s6 |
|
332 fldd 56(pA),A7 ; multiplicand double word 7 |
|
333 |
|
334 |
/* Spill finished products to the XF transfer slots while issuing the       */
/* remaining 32x32 products for A0..A3.                                     */
|
335 fstd P0,XF0 ; MR * A0L |
|
336 xmpyu MR,A0R,P0 ; A0 right 32-bit word product |
|
337 xmpyu MR,A1L,P1 ; A1 cross 32-bit word product |
|
338 |
|
339 fstd P2,XF2 ; ML * A0R |
|
340 xmpyu ML,A0L,P2 ; A0 left 32-bit word product |
|
341 xmpyu ML,A1R,P3 ; A1 cross 32-bit word product |
|
342 |
|
343 fstd P4,XF4 ; MR * A2L |
|
344 xmpyu MR,A2R,P4 ; A2 right 32-bit word product |
|
345 xmpyu MR,A3L,P5 ; A3 cross 32-bit word product |
|
346 |
|
347 fstd P6,XF6 ; ML * A2R |
|
348 xmpyu ML,A2L,P6 ; A2 left 32-bit word product |
|
349 xmpyu ML,A3R,P7 ; A3 cross 32-bit word product |
|
350 |
|
351 |
|
352 ldd XF0,S0 ; MR * A0L |
|
353 fstd P1,XF1 ; MR * A1L |
|
354 |
|
355 ldd XF2,S2 ; ML * A0R |
|
356 fstd P3,XF3 ; ML * A1R |
|
357 |
|
358 ldd XF4,S4 ; MR * A2L |
|
359 fstd P5,XF5 ; MR * A3L |
|
360 xmpyu MR,A1R,P1 ; A1 parallel 32-bit word products |
|
361 xmpyu ML,A1L,P3 |
|
362 |
|
363 ldd XF6,S6 ; ML * A2R |
|
364 fstd P7,XF7 ; ML * A3R |
|
365 xmpyu MR,A3R,P5 ; A3 parallel 32-bit word products |
|
366 xmpyu ML,A3L,P7 |
|
367 |
|
368 |
/* Idiom applied to every multiplicand doubleword Ai:                       */
/*   T     = MR*AiL + ML*AiR           (cross product sum)                  */
/*   carry = add,dc zero,zero          (carry out of that sum)              */
/*   lo    = depd,z T,31,32            (cross product sum << 32)            */
/*   hi    = shrpd carry,T,32          (carry | cross product sum >> 32)    */
/* lo is later added to MR*AiR (right doubleword of M*Ai) and hi, with the  */
/* carry of that addition, to ML*AiL (left doubleword of M*Ai).             */
|
369 fstd P0,XF0 ; MR * A0R |
|
370 ldd XF1,S1 ; MR * A1L |
|
371 nop |
|
372 add S0,S2,T1 ; A0 cross product sum |
|
373 |
|
374 fstd P2,XF2 ; ML * A0L |
|
375 ldd XF3,S3 ; ML * A1R |
|
376 add,dc zero,zero,S0 ; A0 cross product sum carry |
|
377 depd,z T1,31,32,S2 ; A0 cross product sum << 32 |
|
378 |
|
379 fstd P4,XF4 ; MR * A2R |
|
380 ldd XF5,S5 ; MR * A3L |
|
381 shrpd S0,T1,32,S0 ; A0 carry | cross product sum >> 32 |
|
382 add S4,S6,T3 ; A2 cross product sum |
|
383 |
|
384 fstd P6,XF6 ; ML * A2L |
|
385 ldd XF7,S7 ; ML * A3R |
|
386 add,dc zero,zero,S4 ; A2 cross product sum carry |
|
387 depd,z T3,31,32,S6 ; A2 cross product sum << 32 |
|
388 |
|
389 |
|
390 ldd XF0,S8 ; MR * A0R |
|
391 fstd P1,XF1 ; MR * A1R |
|
392 xmpyu MR,A4L,P0 ; A4 cross 32-bit word product |
|
393 xmpyu MR,A5L,P1 ; A5 cross 32-bit word product |
|
394 |
|
395 ldd XF2,S10 ; ML * A0L |
|
396 fstd P3,XF3 ; ML * A1L |
|
397 xmpyu ML,A4R,P2 ; A4 cross 32-bit word product |
|
398 xmpyu ML,A5R,P3 ; A5 cross 32-bit word product |
|
399 |
|
400 ldd XF4,S12 ; MR * A2R |
|
401 fstd P5,XF5 ; MR * A3R |
|
402 xmpyu MR,A6L,P4 ; A6 cross 32-bit word product |
|
403 xmpyu MR,A7L,P5 ; A7 cross 32-bit word product |
|
404 |
|
405 ldd XF6,S14 ; ML * A2L |
|
406 fstd P7,XF7 ; ML * A3L |
|
407 xmpyu ML,A6R,P6 ; A6 cross 32-bit word product |
|
408 xmpyu ML,A7R,P7 ; A7 cross 32-bit word product |
|
409 |
|
410 |
|
411 fstd P0,XF0 ; MR * A4L |
|
412 ldd XF1,S9 ; MR * A1R |
|
413 shrpd S4,T3,32,S4 ; A2 carry | cross product sum >> 32 |
|
414 add S1,S3,T1 ; A1 cross product sum |
|
415 |
|
416 fstd P2,XF2 ; ML * A4R |
|
417 ldd XF3,S11 ; ML * A1L |
|
418 add,dc zero,zero,S1 ; A1 cross product sum carry |
|
419 depd,z T1,31,32,S3 ; A1 cross product sum << 32 |
|
420 |
|
421 fstd P4,XF4 ; MR * A6L |
|
422 ldd XF5,S13 ; MR * A3R |
|
423 shrpd S1,T1,32,S1 ; A1 carry | cross product sum >> 32 |
|
424 add S5,S7,T3 ; A3 cross product sum |
|
425 |
|
426 fstd P6,XF6 ; ML * A6R |
|
427 ldd XF7,S15 ; ML * A3L |
|
428 add,dc zero,zero,S5 ; A3 cross product sum carry |
|
429 depd,z T3,31,32,S7 ; A3 cross product sum << 32 |
|
430 |
|
431 |
|
432 shrpd S5,T3,32,S5 ; A3 carry | cross product sum >> 32 |
|
433 add S2,S8,S8 ; M * A0 right doubleword, P0 doubleword |
|
434 |
|
435 add,dc S0,S10,S10 ; M * A0 left doubleword |
|
436 add S3,S9,S9 ; M * A1 right doubleword |
|
437 |
|
438 add,dc S1,S11,S11 ; M * A1 left doubleword |
|
439 add S6,S12,S12 ; M * A2 right doubleword |
|
440 |
|
441 |
/* Fetch addend doublewords 0..3 and issue the remaining A4..A7 products    */
/* while the M*A0..A3 halves are chained into product doublewords P1..P3    */
/* and a partial P4, which is kept in T2.                                   */
|
442 ldd 24(pR),S3 ; Addend word 3 |
|
443 fstd P1,XF1 ; MR * A5L |
|
444 add,dc S4,S14,S14 ; M * A2 left doubleword |
|
445 xmpyu MR,A5R,P1 ; A5 right 32-bit word product |
|
446 |
|
447 ldd 8(pR),S1 ; Addend word 1 |
|
448 fstd P3,XF3 ; ML * A5R |
|
449 add S7,S13,S13 ; M * A3 right doubleword |
|
450 xmpyu ML,A5L,P3 ; A5 left 32-bit word product |
|
451 |
|
452 ldd 0(pR),S7 ; Addend word 0 |
|
453 fstd P5,XF5 ; MR * A7L |
|
454 add,dc S5,S15,S15 ; M * A3 left doubleword |
|
455 xmpyu MR,A7R,P5 ; A7 right 32-bit word product |
|
456 |
|
457 ldd 16(pR),S5 ; Addend word 2 |
|
458 fstd P7,XF7 ; ML * A7R |
|
459 add S10,S9,S9 ; P1 doubleword |
|
460 xmpyu ML,A7L,P7 ; A7 left 32-bit word products |
|
461 |
|
462 |
|
463 ldd XF0,S0 ; MR * A4L |
|
464 fstd P1,XF9 ; MR * A5R |
|
465 add,dc S11,S12,S12 ; P2 doubleword |
|
466 xmpyu MR,A4R,P0 ; A4 right 32-bit word product |
|
467 |
|
468 ldd XF2,S2 ; ML * A4R |
|
469 fstd P3,XF11 ; ML * A5L |
|
470 add,dc S14,S13,S13 ; P3 doubleword |
|
471 xmpyu ML,A4L,P2 ; A4 left 32-bit word product |
|
472 |
|
473 ldd XF6,S6 ; ML * A6R |
|
474 fstd P5,XF13 ; MR * A7R |
|
475 add,dc zero,S15,T2 ; P4 partial doubleword |
|
476 xmpyu MR,A6R,P4 ; A6 right 32-bit word product |
|
477 |
|
478 ldd XF4,S4 ; MR * A6L |
|
479 fstd P7,XF15 ; ML * A7L |
|
480 add S7,S8,S8 ; R0 + P0, new R0 doubleword |
|
481 xmpyu ML,A6L,P6 ; A6 left 32-bit word product |
|
482 |
|
483 |
|
484 fstd P0,XF0 ; MR * A4R |
|
485 ldd XF7,S7 ; ML * A7R |
|
486 add,dc S1,S9,S9 ; c + R1 + P1, new R1 doubleword |
|
487 |
|
488 fstd P2,XF2 ; ML * A4L |
|
489 ldd XF1,S1 ; MR * A5L |
|
490 add,dc S5,S12,S12 ; c + R2 + P2, new R2 doubleword |
|
491 |
|
492 fstd P4,XF4 ; MR * A6R |
|
493 ldd XF5,S5 ; MR * A7L |
|
494 add,dc S3,S13,S13 ; c + R3 + P3, new R3 doubleword |
|
495 |
|
496 fstd P6,XF6 ; ML * A6L |
|
497 ldd XF3,S3 ; ML * A5R |
|
498 add,dc zero,T2,T2 ; c + partial P4 |
|
499 add S0,S2,T1 ; A4 cross product sum |
|
500 |
|
501 |
/* Store R0..R3 and combine the cross products for A4..A7 with the same     */
/* idiom as used for A0..A3.                                                */
|
502 std S8,0(pR) ; save R0 |
|
503 add,dc zero,zero,S0 ; A4 cross product sum carry |
|
504 depd,z T1,31,32,S2 ; A4 cross product sum << 32 |
|
505 |
|
506 std S9,8(pR) ; save R1 |
|
507 shrpd S0,T1,32,S0 ; A4 carry | cross product sum >> 32 |
|
508 add S4,S6,T3 ; A6 cross product sum |
|
509 |
|
510 std S12,16(pR) ; save R2 |
|
511 add,dc zero,zero,S4 ; A6 cross product sum carry |
|
512 depd,z T3,31,32,S6 ; A6 cross product sum << 32 |
|
513 |
|
514 |
|
515 std S13,24(pR) ; save R3 |
|
516 shrpd S4,T3,32,S4 ; A6 carry | cross product sum >> 32 |
|
517 add S1,S3,T1 ; A5 cross product sum |
|
518 |
|
519 ldd XF0,S8 ; MR * A4R |
|
520 add,dc zero,zero,S1 ; A5 cross product sum carry |
|
521 depd,z T1,31,32,S3 ; A5 cross product sum << 32 |
|
522 |
|
523 ldd XF2,S10 ; ML * A4L |
|
524 ldd XF9,S9 ; MR * A5R |
|
525 shrpd S1,T1,32,S1 ; A5 carry | cross product sum >> 32 |
|
526 add S5,S7,T3 ; A7 cross product sum |
|
527 |
|
528 ldd XF4,S12 ; MR * A6R |
|
529 ldd XF11,S11 ; ML * A5L |
|
530 add,dc zero,zero,S5 ; A7 cross product sum carry |
|
531 depd,z T3,31,32,S7 ; A7 cross product sum << 32 |
|
532 |
|
533 ldd XF6,S14 ; ML * A6L |
|
534 ldd XF13,S13 ; MR * A7R |
|
535 shrpd S5,T3,32,S5 ; A7 carry | cross product sum >> 32 |
|
536 add S2,S8,S8 ; M * A4 right doubleword |
|
537 |
|
538 |
|
539 ldd XF15,S15 ; ML * A7L |
|
540 add,dc S0,S10,S10 ; M * A4 left doubleword |
|
541 add S3,S9,S9 ; M * A5 right doubleword |
|
542 |
|
543 add,dc S1,S11,S11 ; M * A5 left doubleword |
|
544 add S6,S12,S12 ; M * A6 right doubleword |
|
545 |
/* Upper half: addend doublewords 4..8 are loaded and added to P4..P8;      */
/* s0-s7 are reloaded from the save area in between.                        */
|
546 ldd 32(pR),S0 ; Addend word 4 |
|
547 ldd 40(pR),S1 ; Addend word 5 |
|
548 add,dc S4,S14,S14 ; M * A6 left doubleword |
|
549 add S7,S13,S13 ; M * A7 right doubleword |
|
550 |
|
551 ldd 48(pR),S2 ; Addend word 6 |
|
552 ldd 56(pR),S3 ; Addend word 7 |
|
553 add,dc S5,S15,S15 ; M * A7 left doubleword |
|
554 add S8,T2,S8 ; P4 doubleword |
|
555 |
|
556 ldd 64(pR),S4 ; Addend word 8 |
|
557 ldd SV5,s5 ; restore s5 |
|
558 add,dc S10,S9,S9 ; P5 doubleword |
|
559 add,dc S11,S12,S12 ; P6 doubleword |
|
560 |
|
561 |
|
562 ldd SV6,s6 ; restore s6 |
|
563 ldd SV7,s7 ; restore s7 |
|
564 add,dc S14,S13,S13 ; P7 doubleword |
|
565 add,dc zero,S15,S15 ; P8 doubleword |
|
566 |
|
567 add S0,S8,S8 ; new R4 doubleword |
|
568 |
|
569 ldd SV0,s0 ; restore s0 |
|
570 std S8,32(pR) ; save R4 |
|
571 add,dc S1,S9,S9 ; new R5 doubleword |
|
572 |
|
573 ldd SV1,s1 ; restore s1 |
|
574 std S9,40(pR) ; save R5 |
|
575 add,dc S2,S12,S12 ; new R6 doubleword |
|
576 |
|
577 ldd SV2,s2 ; restore s2 |
|
578 std S12,48(pR) ; save R6 |
|
579 add,dc S3,S13,S13 ; new R7 doubleword |
|
580 |
|
581 ldd SV3,s3 ; restore s3 |
|
582 std S13,56(pR) ; save R7 |
|
583 add,dc S4,S15,S15 ; new R8 doubleword |
|
584 |
|
585 ldd SV4,s4 ; restore s4 |
|
586 std S15,64(pR) ; save result[8] |
|
587 add,dc zero,zero,v0 ; return carry from R8 |
|
588 |
/* Carry out of R8.  The LDO after the first CMPIB sits in its delay slot,  */
/* so pR is advanced by 8 whether or not the branch is taken.  Each pass    */
/* of $FINAL1 then loads the doubleword at 64(pR) (res[9] on the first      */
/* pass), advances pR by another 8, adds 1, and stores the sum back to the  */
/* same doubleword (now at 56(pR)) from the delay slot of the loop branch.  */
/* The loop repeats only while the incremented value wrapped to zero.       */
|
589 CMPIB,*= 0,v0,$L0 ; if no overflow, exit |
|
590 LDO 8(pR),pR |
|
591 |
|
592 $FINAL1 ; Final carry propagation |
|
593 LDD 64(pR),v0 |
|
594 LDO 8(pR),pR |
|
595 ADDI 1,v0,v0 |
|
596 CMPIB,*= 0,v0,$FINAL1 ; Keep looping if there is a carry. |
|
597 STD v0,56(pR) |
|
598 $L0 |
|
599 bv zero(rp) ; -> caller |
|
600 ldo -ST_SZ(sp),sp ; pop stack |
|
601 |
|
602 /* ====================================================================== */ |
|
603 /* end of module */ |
|
604 /* ====================================================================== */ |
|
605 |
|
606 |
/* NOTE(review): the bv at $L0 above is the return actually taken; the bve  */
/* below follows an unconditional branch and appears to be unreachable,     */
/* presumably kept only to carry the .EXIT marker -- confirm before removing.*/
|
607 bve (rp) |
|
608 .EXIT |
|
609 nop |
|
610 .PROCEND |
|
611 .SPACE $TEXT$ |
|
612 .SUBSPA $CODE$ |
|
613 .EXPORT multacc512,ENTRY |
|
614 |
|
615 .end