Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
1 ; This Source Code Form is subject to the terms of the Mozilla Public
2 ; License, v. 2.0. If a copy of the MPL was not distributed with this
3 ; file, You can obtain one at http://mozilla.org/MPL/2.0/.
; Select the assembler architecture level: 2.0W when the compiler defines
; __LP64__, plain 2.0 otherwise. The two commented-out directives are an
; earlier alternative for the non-LP64 case and are not assembled.
5 #ifdef __LP64__
6 .LEVEL 2.0W
7 #else
8 ; .LEVEL 1.1
9 ; .ALLOW 2.0N
10 .LEVEL 2.0
11 #endif
; Everything that follows is placed in the $CODE$ subspace of $TEXT$.
12 .SPACE $TEXT$,SORT=8
13 .SUBSPA $CODE$,QUAD=0,ALIGN=4,ACCESS=0x2c,CODE_ONLY,SORT=24
15 ; ***************************************************************
16 ;
17 ; maxpy_[little/big]
18 ;
19 ; ***************************************************************
21 ; Exactly one of LITTLE_WORDIAN and BIG_WORDIAN may be defined.
; As written, LITTLE_WORDIAN is defined unconditionally on the next line.
; Both branches below define the same macro names, so BIG_WORDIAN must not
; also be defined (for example on the command line) unless that line is
; removed.
22 #define LITTLE_WORDIAN 1
; Byte offsets used for all vector addressing. EIGHT, SIXTEEN and
; THIRTY_TWO step toward more significant doublewords; the UN_ forms step
; back toward less significant ones. Little-wordian: forward is upward.
24 #ifdef LITTLE_WORDIAN
25 #define EIGHT 8
26 #define SIXTEEN 16
27 #define THIRTY_TWO 32
28 #define UN_EIGHT -8
29 #define UN_SIXTEEN -16
30 #define UN_TWENTY_FOUR -24
31 #endif
; Big-wordian: the same names with the signs reversed, so forward is downward.
33 #ifdef BIG_WORDIAN
34 #define EIGHT -8
35 #define SIXTEEN -16
36 #define THIRTY_TWO -32
37 #define UN_EIGHT 8
38 #define UN_SIXTEEN 16
39 #define UN_TWENTY_FOUR 24
40 #endif
42 ; This performs a multiple-precision integer version of "daxpy",
43 ; Using the selected addressing direction. "Little-wordian" means that
44 ; the least significant word of a number is stored at the lowest address.
45 ; "Big-wordian" means that the most significant word is at the lowest
46 ; address. Either way, the incoming address of the vector is that
47 ; of the least significant word. That means that, for little-wordian
48 ; addressing, we move the address upward as we propagate carries
49 ; from the least significant word to the most significant. For
50 ; big-wordian we move the address downward.
52 ; We use the following registers:
53 ;
54 ; r2 return PC, of course
55 ; r26 = arg1 = length
56 ; r25 = arg2 = address of scalar
57 ; r24 = arg3 = multiplicand vector
58 ; r23 = arg4 = result vector
59 ;
60 ; fr9 = scalar loaded once only from r25
62 ; The cycle counts shown in the bodies below are simply the result of a
63 ; scheduling by hand. The actual PCX-U hardware does it differently.
64 ; The intention is that the overall speed is the same.
66 ; The pipeline startup and shutdown code is constructed in the usual way,
67 ; by taking the loop bodies and removing unnecessary instructions.
68 ; We have left the comments describing cycle numbers in the code.
69 ; These are intended for reference when comparing with the main loop,
70 ; and have no particular relationship to actual cycle numbers.
72 #ifdef LITTLE_WORDIAN
73 maxpy_little
74 #else
75 maxpy_big
76 #endif
; ---------------------------------------------------------------
; maxpy: result += scalar * multiplicand, with the final carry
; propagated into the following result doublewords.
;
; In:   r26 = N, number of doublewords in the multiplicand
;       r25 = address of the 64-bit scalar
;       r24 = address of the least significant multiplicand doubleword
;       r23 = address of the least significant result doubleword
; Uses: r1, r3, r4, r19-r22, r28, r29, r31, fr9, fr24-fr31.
;       r3 and r4 are saved on entry and restored at $L0.
;
; Each 64x64 product is built from four 32x32 XMPYU partial products
; (R = low half, L = high half of a floating-point register). The
; products are spilled to the frame with FSTD and reloaded into general
; registers with LDD, where ADD / ADD,DC and SHRPD combine them.
; Two multiplicand doublewords are in flight at once; their frame slots:
;
;                               first dword    second dword
;   scalar low  x mult low      -48(sp) fr27   -72(sp)  fr29
;   scalar high x mult low      -96(sp) fr24   -64(sp)  fr31
;   scalar low  x mult high     -80(sp) fr25   -56(sp)  fr30
;   scalar high x mult high     -88(sp) fr26   -104(sp) fr28
;
; The instruction that follows each branch is in its delay slot.
;
; NOTE(review): r3 and r4 are saved and restored with 32-bit STW/LDW,
; while the body loads 64-bit values into them with LDD. Confirm that
; this is sufficient for the __LP64__ (.LEVEL 2.0W) build.
; ---------------------------------------------------------------
77 .PROC
78 .CALLINFO FRAME=120,ENTRY_GR=4
79 .ENTRY
80 STW,MA %r3,128(%sp) ; save r3 at 0(sp), then sp += 128
81 STW %r4,-124(%sp) ; save r4 in the word after the r3 slot
83 ADDIB,< -1,%r26,$L0 ; If N = 0, exit immediately.
84 FLDD 0(%r25),%fr9 ; fr9 = scalar
86 ; First startup
88 FLDD 0(%r24),%fr24 ; Cycle 1
89 XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3
90 XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4
91 XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5
92 CMPIB,> 3,%r26,$N_IS_SMALL ; Pick out cases N = 1, 2, or 3
93 XMPYU %fr9L,%fr24R,%fr24 ; Cycle 6
94 FLDD EIGHT(%r24),%fr28 ; Cycle 8
95 XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10
96 FSTD %fr24,-96(%sp)
97 XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11
98 FSTD %fr25,-80(%sp)
99 LDO SIXTEEN(%r24),%r24 ; Cycle 12
100 FSTD %fr31,-64(%sp)
101 XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13
102 FSTD %fr27,-48(%sp)
104 ; Second startup
106 XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1
107 FSTD %fr30,-56(%sp)
108 FLDD 0(%r24),%fr24
110 FSTD %fr26,-88(%sp) ; Cycle 2
112 XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3
113 FSTD %fr28,-104(%sp)
115 XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4
116 LDD -96(%sp),%r3
117 FSTD %fr29,-72(%sp)
119 XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5
120 LDD -64(%sp),%r19
121 LDD -80(%sp),%r21
123 XMPYU %fr9L,%fr24R,%fr24 ; Cycle 6
124 LDD -56(%sp),%r20
125 ADD %r21,%r3,%r3 ; sum of the two cross products, first dword
127 ADD,DC %r20,%r19,%r19 ; Cycle 7
128 LDD -88(%sp),%r4
129 SHRPD %r3,%r0,32,%r21 ; low 32 bits of the cross sum, moved to the upper half
130 LDD -48(%sp),%r1
132 FLDD EIGHT(%r24),%fr28 ; Cycle 8
133 LDD -104(%sp),%r31
134 ADD,DC %r0,%r0,%r20 ; r20 = carry out of the cross-sum chain
135 SHRPD %r19,%r3,32,%r3
137 LDD -72(%sp),%r29 ; Cycle 9
138 SHRPD %r20,%r19,32,%r20
139 ADD %r21,%r1,%r1
141 XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10
142 ADD,DC %r3,%r4,%r4
143 FSTD %fr24,-96(%sp)
145 XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11
146 ADD,DC %r0,%r20,%r20
147 LDD 0(%r23),%r3 ; r3 = current result doubleword
148 FSTD %fr25,-80(%sp)
150 LDO SIXTEEN(%r24),%r24 ; Cycle 12
151 FSTD %fr31,-64(%sp)
153 XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13
154 ADD %r0,%r0,%r0 ; clear the carry bit
155 ADDIB,<= -4,%r26,$ENDLOOP ; actually happens in cycle 12
156 FSTD %fr27,-48(%sp)
157 ; MFCTL %cr16,%r21 ; for timing
158 ; STD %r21,-112(%sp)
160 ; Here is the loop.
162 $LOOP XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1
163 ADD,DC %r29,%r4,%r4
164 FSTD %fr30,-56(%sp)
165 FLDD 0(%r24),%fr24
167 LDO SIXTEEN(%r23),%r23 ; Cycle 2
168 ADD,DC %r0,%r20,%r20
169 FSTD %fr26,-88(%sp)
171 XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3
172 ADD %r3,%r1,%r1
173 FSTD %fr28,-104(%sp)
174 LDD UN_EIGHT(%r23),%r21
176 XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4
177 ADD,DC %r21,%r4,%r28
178 FSTD %fr29,-72(%sp)
179 LDD -96(%sp),%r3
181 XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5
182 ADD,DC %r20,%r31,%r22
183 LDD -64(%sp),%r19
184 LDD -80(%sp),%r21
186 XMPYU %fr9L,%fr24R,%fr24 ; Cycle 6
187 ADD %r21,%r3,%r3
188 LDD -56(%sp),%r20
189 STD %r1,UN_SIXTEEN(%r23)
191 ADD,DC %r20,%r19,%r19 ; Cycle 7
192 SHRPD %r3,%r0,32,%r21
193 LDD -88(%sp),%r4
194 LDD -48(%sp),%r1
196 ADD,DC %r0,%r0,%r20 ; Cycle 8
197 SHRPD %r19,%r3,32,%r3
198 FLDD EIGHT(%r24),%fr28
199 LDD -104(%sp),%r31
201 SHRPD %r20,%r19,32,%r20 ; Cycle 9
202 ADD %r21,%r1,%r1
203 STD %r28,UN_EIGHT(%r23)
204 LDD -72(%sp),%r29
206 XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10
207 ADD,DC %r3,%r4,%r4
208 FSTD %fr24,-96(%sp)
210 XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11
211 ADD,DC %r0,%r20,%r20
212 FSTD %fr25,-80(%sp)
213 LDD 0(%r23),%r3
215 LDO SIXTEEN(%r24),%r24 ; Cycle 12
216 FSTD %fr31,-64(%sp)
218 XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13
219 ADD %r22,%r1,%r1
220 ADDIB,> -2,%r26,$LOOP ; actually happens in cycle 12
221 FSTD %fr27,-48(%sp)
223 $ENDLOOP
225 ; Shutdown code, first stage.
227 ; MFCTL %cr16,%r21 ; for timing
228 ; STD %r21,UN_SIXTEEN(%r23)
229 ; LDD -112(%sp),%r21
230 ; STD %r21,UN_EIGHT(%r23)
232 XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1
233 ADD,DC %r29,%r4,%r4
234 CMPIB,= 0,%r26,$ONEMORE ; counter is 0: take the odd-parity path
235 FSTD %fr30,-56(%sp)
237 LDO SIXTEEN(%r23),%r23 ; Cycle 2
238 ADD,DC %r0,%r20,%r20
239 FSTD %fr26,-88(%sp)
241 ADD %r3,%r1,%r1 ; Cycle 3
242 FSTD %fr28,-104(%sp)
243 LDD UN_EIGHT(%r23),%r21
245 ADD,DC %r21,%r4,%r28 ; Cycle 4
246 FSTD %fr29,-72(%sp)
247 STD %r28,UN_EIGHT(%r23) ; moved up from cycle 9
248 LDD -96(%sp),%r3
250 ADD,DC %r20,%r31,%r22 ; Cycle 5
251 STD %r1,UN_SIXTEEN(%r23)
252 $JOIN4
253 LDD -64(%sp),%r19
254 LDD -80(%sp),%r21
256 ADD %r21,%r3,%r3 ; Cycle 6
257 LDD -56(%sp),%r20
259 ADD,DC %r20,%r19,%r19 ; Cycle 7
260 SHRPD %r3,%r0,32,%r21
261 LDD -88(%sp),%r4
262 LDD -48(%sp),%r1
264 ADD,DC %r0,%r0,%r20 ; Cycle 8
265 SHRPD %r19,%r3,32,%r3
266 LDD -104(%sp),%r31
268 SHRPD %r20,%r19,32,%r20 ; Cycle 9
269 ADD %r21,%r1,%r1
270 LDD -72(%sp),%r29
272 ADD,DC %r3,%r4,%r4 ; Cycle 10
274 ADD,DC %r0,%r20,%r20 ; Cycle 11
275 LDD 0(%r23),%r3
277 ADD %r22,%r1,%r1 ; Cycle 13
279 ; Shutdown code, second stage.
281 ADD,DC %r29,%r4,%r4 ; Cycle 1
283 LDO SIXTEEN(%r23),%r23 ; Cycle 2
284 ADD,DC %r0,%r20,%r20
286 LDD UN_EIGHT(%r23),%r21 ; Cycle 3
287 ADD %r3,%r1,%r1
289 ADD,DC %r21,%r4,%r28 ; Cycle 4
291 ADD,DC %r20,%r31,%r22 ; Cycle 5
293 STD %r1,UN_SIXTEEN(%r23); Cycle 6
295 STD %r28,UN_EIGHT(%r23) ; Cycle 9
297 LDD 0(%r23),%r3 ; Cycle 11
299 ; Shutdown code, third stage.
301 LDO SIXTEEN(%r23),%r23
302 ADD %r3,%r22,%r1
303 $JOIN1 ADD,DC %r0,%r0,%r21 ; r21 = carry out of the last add
304 CMPIB,*= 0,%r21,$L0 ; if no overflow, exit
305 STD %r1,UN_SIXTEEN(%r23) ; delay slot: store the last sum
307 ; Final carry propagation
309 $FINAL1 LDO EIGHT(%r23),%r23 ; step to the next more significant doubleword
310 LDD UN_SIXTEEN(%r23),%r21
311 ADDI 1,%r21,%r21 ; add the carry
312 CMPIB,*= 0,%r21,$FINAL1 ; Keep looping if there is a carry.
313 STD %r21,UN_SIXTEEN(%r23) ; delay slot: write back the incremented doubleword
314 B $L0
315 NOP
317 ; Here is the code that handles the difficult cases N=1, N=2, and N=3.
318 ; We do the usual trick -- branch out of the startup code at appropriate
319 ; points, and branch into the shutdown code.
321 $N_IS_SMALL
322 CMPIB,= 0,%r26,$N_IS_ONE ; r26 holds N-1 here
323 FSTD %fr24,-96(%sp) ; Cycle 10
324 FLDD EIGHT(%r24),%fr28 ; Cycle 8
325 XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10
326 XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11
327 FSTD %fr25,-80(%sp)
328 FSTD %fr31,-64(%sp) ; Cycle 12
329 XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13
330 FSTD %fr27,-48(%sp)
331 XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1
332 CMPIB,= 2,%r26,$N_IS_THREE ; N-1 = 2
333 FSTD %fr30,-56(%sp)
335 ; N = 2
336 FSTD %fr26,-88(%sp) ; Cycle 2
337 FSTD %fr28,-104(%sp) ; Cycle 3
338 LDD -96(%sp),%r3 ; Cycle 4
339 FSTD %fr29,-72(%sp)
340 B $JOIN4
341 ADD %r0,%r0,%r22 ; delay slot: r22 = 0 and the carry bit is cleared
343 $N_IS_THREE
344 FLDD SIXTEEN(%r24),%fr24
345 FSTD %fr26,-88(%sp) ; Cycle 2
346 XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3
347 FSTD %fr28,-104(%sp)
348 XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4
349 LDD -96(%sp),%r3
350 FSTD %fr29,-72(%sp)
351 XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5
352 LDD -64(%sp),%r19
353 LDD -80(%sp),%r21
354 B $JOIN3
355 ADD %r0,%r0,%r22 ; delay slot: r22 = 0 and the carry bit is cleared
357 $N_IS_ONE
358 FSTD %fr25,-80(%sp)
359 FSTD %fr27,-48(%sp)
360 FSTD %fr26,-88(%sp) ; Cycle 2
361 B $JOIN5
362 ADD %r0,%r0,%r22 ; delay slot: r22 = 0 and the carry bit is cleared
364 ; We came out of the unrolled loop with wrong parity. Do one more
365 ; single cycle. This is quite tricky, because of the way the
366 ; carry chains and SHRPD chains have been chopped up.
368 $ONEMORE
370 FLDD 0(%r24),%fr24
372 LDO SIXTEEN(%r23),%r23 ; Cycle 2
373 ADD,DC %r0,%r20,%r20
374 FSTD %fr26,-88(%sp)
376 XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3
377 FSTD %fr28,-104(%sp)
378 LDD UN_EIGHT(%r23),%r21
379 ADD %r3,%r1,%r1
381 XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4
382 ADD,DC %r21,%r4,%r28
383 STD %r28,UN_EIGHT(%r23) ; moved from cycle 9
384 LDD -96(%sp),%r3
385 FSTD %fr29,-72(%sp)
387 XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5
388 ADD,DC %r20,%r31,%r22
389 LDD -64(%sp),%r19
390 LDD -80(%sp),%r21
392 STD %r1,UN_SIXTEEN(%r23); Cycle 6
393 $JOIN3
394 XMPYU %fr9L,%fr24R,%fr24
395 LDD -56(%sp),%r20
396 ADD %r21,%r3,%r3
398 ADD,DC %r20,%r19,%r19 ; Cycle 7
399 LDD -88(%sp),%r4
400 SHRPD %r3,%r0,32,%r21
401 LDD -48(%sp),%r1
403 LDD -104(%sp),%r31 ; Cycle 8
404 ADD,DC %r0,%r0,%r20
405 SHRPD %r19,%r3,32,%r3
407 LDD -72(%sp),%r29 ; Cycle 9
408 SHRPD %r20,%r19,32,%r20
409 ADD %r21,%r1,%r1
411 ADD,DC %r3,%r4,%r4 ; Cycle 10
412 FSTD %fr24,-96(%sp)
414 ADD,DC %r0,%r20,%r20 ; Cycle 11
415 LDD 0(%r23),%r3
416 FSTD %fr25,-80(%sp)
418 ADD %r22,%r1,%r1 ; Cycle 13
419 FSTD %fr27,-48(%sp)
421 ; Shutdown code, stage 1-1/2.
423 ADD,DC %r29,%r4,%r4 ; Cycle 1
425 LDO SIXTEEN(%r23),%r23 ; Cycle 2
426 ADD,DC %r0,%r20,%r20
427 FSTD %fr26,-88(%sp)
429 LDD UN_EIGHT(%r23),%r21 ; Cycle 3
430 ADD %r3,%r1,%r1
432 ADD,DC %r21,%r4,%r28 ; Cycle 4
433 STD %r28,UN_EIGHT(%r23) ; moved from cycle 9
435 ADD,DC %r20,%r31,%r22 ; Cycle 5
436 STD %r1,UN_SIXTEEN(%r23)
437 $JOIN5
438 LDD -96(%sp),%r3 ; moved from cycle 4
439 LDD -80(%sp),%r21
440 ADD %r21,%r3,%r3 ; Cycle 6
441 ADD,DC %r0,%r0,%r19 ; Cycle 7
442 LDD -88(%sp),%r4
443 SHRPD %r3,%r0,32,%r21
444 LDD -48(%sp),%r1
445 SHRPD %r19,%r3,32,%r3 ; Cycle 8
446 ADD %r21,%r1,%r1 ; Cycle 9
447 ADD,DC %r3,%r4,%r4 ; Cycle 10
448 LDD 0(%r23),%r3 ; Cycle 11
449 ADD %r22,%r1,%r1 ; Cycle 13
451 ; Shutdown code, stage 2-1/2.
453 ADD,DC %r0,%r4,%r4 ; Cycle 1
454 LDO SIXTEEN(%r23),%r23 ; Cycle 2
455 LDD UN_EIGHT(%r23),%r21 ; Cycle 3
456 ADD %r3,%r1,%r1
457 STD %r1,UN_SIXTEEN(%r23)
458 ADD,DC %r21,%r4,%r1
459 B $JOIN1
460 LDO EIGHT(%r23),%r23 ; delay slot: advance r23 before joining the common tail
462 ; exit
464 $L0
465 LDW -124(%sp),%r4 ; restore r4
466 BVE (%r2)
467 .EXIT
468 LDW,MB -128(%sp),%r3 ; delay slot: sp -= 128, then restore r3
470 .PROCEND
472 ; ***************************************************************
473 ;
474 ; add_diag_[little/big]
475 ;
476 ; ***************************************************************
478 ; The arguments are as follows:
479 ; r2 return PC, of course
480 ; r26 = arg1 = length
481 ; r25 = arg2 = vector to square
482 ; r24 = arg3 = result vector
484 #ifdef LITTLE_WORDIAN
485 add_diag_little
486 #else
487 add_diag_big
488 #endif
; ---------------------------------------------------------------
; add_diag: squares each doubleword of the input vector and adds the
; squares into the result vector, two result doublewords per input
; doubleword, then propagates the final carry.
;
; In:   r26 = N, number of doublewords in the input vector
;       r25 = address of the least significant input doubleword
;       r24 = address of the least significant result doubleword
; Uses: r1, r3, r4, r19-r22, r26, r28, r29, r31, fr7, fr24, fr27-fr31.
;       r3 and r4 are saved on entry and restored at $Z0.
;
; Each square needs three XMPYU products (R = low half, L = high half of
; fr7). The cross product is needed twice and must be aligned at bit 32,
; so it is shifted left by 33 bits across a register pair with two
; SHRPD ...,31 instructions. Frame slots:
;
;                    alternate body   main body
;   low  x low       -88(sp) fr29     -80(sp)  fr28
;   high x low       -72(sp) fr27     -64(sp)  fr24
;   high x high      -96(sp) fr30     -104(sp) fr31
;
; The instruction that follows each branch is in its delay slot.
; ---------------------------------------------------------------
489 .PROC
490 .CALLINFO FRAME=120,ENTRY_GR=4
491 .ENTRY
492 STW,MA %r3,128(%sp) ; save r3 at 0(sp), then sp += 128
493 STW %r4,-124(%sp) ; save r4 in the word after the r3 slot
495 ADDIB,< -1,%r26,$Z0 ; If N=0, exit immediately.
496 NOP
498 ; Startup code
500 FLDD 0(%r25),%fr7 ; Cycle 2 (alternate body)
501 XMPYU %fr7R,%fr7R,%fr29 ; Cycle 4
502 XMPYU %fr7L,%fr7R,%fr27 ; Cycle 5
503 XMPYU %fr7L,%fr7L,%fr30
504 LDO SIXTEEN(%r25),%r25 ; Cycle 6
505 FSTD %fr29,-88(%sp)
506 FSTD %fr27,-72(%sp) ; Cycle 7
507 CMPIB,= 0,%r26,$DIAG_N_IS_ONE ; Cycle 1 (main body)
508 FSTD %fr30,-96(%sp)
509 FLDD UN_EIGHT(%r25),%fr7 ; Cycle 2
510 LDD -88(%sp),%r22 ; Cycle 3
511 LDD -72(%sp),%r31 ; Cycle 4
512 XMPYU %fr7R,%fr7R,%fr28
513 XMPYU %fr7L,%fr7R,%fr24 ; Cycle 5
514 XMPYU %fr7L,%fr7L,%fr31
515 LDD -96(%sp),%r20 ; Cycle 6
516 FSTD %fr28,-80(%sp)
517 ADD %r0,%r0,%r0 ; clear the carry bit
518 ADDIB,<= -2,%r26,$ENDDIAGLOOP ; Cycle 7
519 FSTD %fr24,-64(%sp)
521 ; Here is the loop. It is unrolled twice, modelled after the "alternate body" and then the "main body".
523 $DIAGLOOP
524 SHRPD %r31,%r0,31,%r3 ; Cycle 1 (alternate body)
525 LDO SIXTEEN(%r25),%r25
526 LDD 0(%r24),%r1
527 FSTD %fr31,-104(%sp)
528 SHRPD %r0,%r31,31,%r4 ; Cycle 2
529 ADD,DC %r22,%r3,%r3
530 FLDD UN_SIXTEEN(%r25),%fr7
531 ADD,DC %r0,%r20,%r20 ; Cycle 3
532 ADD %r1,%r3,%r3
533 XMPYU %fr7R,%fr7R,%fr29 ; Cycle 4
534 LDD -80(%sp),%r21
535 STD %r3,0(%r24)
536 XMPYU %fr7L,%fr7R,%fr27 ; Cycle 5
537 XMPYU %fr7L,%fr7L,%fr30
538 LDD -64(%sp),%r29
539 LDD EIGHT(%r24),%r1
540 ADD,DC %r4,%r20,%r20 ; Cycle 6
541 LDD -104(%sp),%r19
542 FSTD %fr29,-88(%sp)
543 ADD %r20,%r1,%r1 ; Cycle 7
544 FSTD %fr27,-72(%sp)
545 SHRPD %r29,%r0,31,%r4 ; Cycle 1 (main body)
546 LDO THIRTY_TWO(%r24),%r24
547 LDD UN_SIXTEEN(%r24),%r28
548 FSTD %fr30,-96(%sp)
549 SHRPD %r0,%r29,31,%r3 ; Cycle 2
550 ADD,DC %r21,%r4,%r4
551 FLDD UN_EIGHT(%r25),%fr7
552 STD %r1,UN_TWENTY_FOUR(%r24)
553 ADD,DC %r0,%r19,%r19 ; Cycle 3
554 ADD %r28,%r4,%r4
555 XMPYU %fr7R,%fr7R,%fr28 ; Cycle 4
556 LDD -88(%sp),%r22
557 STD %r4,UN_SIXTEEN(%r24)
558 XMPYU %fr7L,%fr7R,%fr24 ; Cycle 5
559 XMPYU %fr7L,%fr7L,%fr31
560 LDD -72(%sp),%r31
561 LDD UN_EIGHT(%r24),%r28
562 ADD,DC %r3,%r19,%r19 ; Cycle 6
563 LDD -96(%sp),%r20
564 FSTD %fr28,-80(%sp)
565 ADD %r19,%r28,%r28 ; Cycle 7
566 FSTD %fr24,-64(%sp)
567 ADDIB,> -2,%r26,$DIAGLOOP ; Cycle 8
568 STD %r28,UN_EIGHT(%r24)
570 $ENDDIAGLOOP
572 ADD,DC %r0,%r22,%r22 ; fold the pending carry into the low x low product
573 CMPIB,= 0,%r26,$ONEMOREDIAG ; counter is 0: take the odd-parity path
574 SHRPD %r31,%r0,31,%r3
576 ; Shutdown code, first stage.
578 FSTD %fr31,-104(%sp) ; Cycle 1 (alternate body)
579 LDD 0(%r24),%r28
580 SHRPD %r0,%r31,31,%r4 ; Cycle 2
581 ADD %r3,%r22,%r3
582 ADD,DC %r0,%r20,%r20 ; Cycle 3
583 LDD -80(%sp),%r21
584 ADD %r3,%r28,%r3
585 LDD -64(%sp),%r29 ; Cycle 4
586 STD %r3,0(%r24)
587 LDD EIGHT(%r24),%r1 ; Cycle 5
588 LDO SIXTEEN(%r25),%r25 ; Cycle 6
589 LDD -104(%sp),%r19
590 ADD,DC %r4,%r20,%r20
591 ADD %r20,%r1,%r1 ; Cycle 7
592 ADD,DC %r0,%r21,%r21 ; Cycle 8
593 STD %r1,EIGHT(%r24)
595 ; Shutdown code, second stage.
597 SHRPD %r29,%r0,31,%r4 ; Cycle 1 (main body)
598 LDO THIRTY_TWO(%r24),%r24
599 LDD UN_SIXTEEN(%r24),%r1
600 SHRPD %r0,%r29,31,%r3 ; Cycle 2
601 ADD %r4,%r21,%r4
602 ADD,DC %r0,%r19,%r19 ; Cycle 3
603 ADD %r4,%r1,%r4
604 STD %r4,UN_SIXTEEN(%r24); Cycle 4
605 LDD UN_EIGHT(%r24),%r28 ; Cycle 5
606 ADD,DC %r3,%r19,%r19 ; Cycle 6
607 ADD %r19,%r28,%r28 ; Cycle 7
608 ADD,DC %r0,%r0,%r22 ; Cycle 8
609 CMPIB,*= 0,%r22,$Z0 ; if no overflow, exit
610 STD %r28,UN_EIGHT(%r24)
612 ; Final carry propagation
614 $FDIAG2
615 LDO EIGHT(%r24),%r24 ; step to the next more significant doubleword
616 LDD UN_EIGHT(%r24),%r26
617 ADDI 1,%r26,%r26 ; add the carry
618 CMPIB,*= 0,%r26,$FDIAG2 ; Keep looping if there is a carry.
619 STD %r26,UN_EIGHT(%r24) ; delay slot: write back the incremented doubleword
621 B $Z0
622 NOP
624 ; Here is the code that handles the difficult case N=1.
625 ; We do the usual trick -- branch out of the startup code at appropriate
626 ; points, and branch into the shutdown code.
628 $DIAG_N_IS_ONE
630 LDD -88(%sp),%r22
631 LDD -72(%sp),%r31
632 B $JOINDIAG
633 LDD -96(%sp),%r20 ; delay slot: r20 = high x high product
635 ; We came out of the unrolled loop with wrong parity. Do one more
636 ; single cycle. This is the "alternate body". It will, of course,
637 ; give us opposite registers from the other case, so we need
638 ; completely different shutdown code.
640 $ONEMOREDIAG
641 FSTD %fr31,-104(%sp) ; Cycle 1 (alternate body)
642 LDD 0(%r24),%r28
643 FLDD 0(%r25),%fr7 ; Cycle 2
644 SHRPD %r0,%r31,31,%r4
645 ADD %r3,%r22,%r3
646 ADD,DC %r0,%r20,%r20 ; Cycle 3
647 LDD -80(%sp),%r21
648 ADD %r3,%r28,%r3
649 LDD -64(%sp),%r29 ; Cycle 4
650 STD %r3,0(%r24)
651 XMPYU %fr7R,%fr7R,%fr29
652 LDD EIGHT(%r24),%r1 ; Cycle 5
653 XMPYU %fr7L,%fr7R,%fr27
654 XMPYU %fr7L,%fr7L,%fr30
655 LDD -104(%sp),%r19 ; Cycle 6
656 FSTD %fr29,-88(%sp)
657 ADD,DC %r4,%r20,%r20
658 FSTD %fr27,-72(%sp) ; Cycle 7
659 ADD %r20,%r1,%r1
660 ADD,DC %r0,%r21,%r21 ; Cycle 8
661 STD %r1,EIGHT(%r24)
663 ; Shutdown code, first stage.
665 SHRPD %r29,%r0,31,%r4 ; Cycle 1 (main body)
666 LDO THIRTY_TWO(%r24),%r24
667 FSTD %fr30,-96(%sp)
668 LDD UN_SIXTEEN(%r24),%r1
669 SHRPD %r0,%r29,31,%r3 ; Cycle 2
670 ADD %r4,%r21,%r4
671 ADD,DC %r0,%r19,%r19 ; Cycle 3
672 LDD -88(%sp),%r22
673 ADD %r4,%r1,%r4
674 LDD -72(%sp),%r31 ; Cycle 4
675 STD %r4,UN_SIXTEEN(%r24)
676 LDD UN_EIGHT(%r24),%r28 ; Cycle 5
677 LDD -96(%sp),%r20 ; Cycle 6
678 ADD,DC %r3,%r19,%r19
679 ADD %r19,%r28,%r28 ; Cycle 7
680 ADD,DC %r0,%r22,%r22 ; Cycle 8
681 STD %r28,UN_EIGHT(%r24)
683 ; Shutdown code, second stage.
685 $JOINDIAG
686 SHRPD %r31,%r0,31,%r3 ; Cycle 1 (alternate body)
687 LDD 0(%r24),%r28
688 SHRPD %r0,%r31,31,%r4 ; Cycle 2
689 ADD %r3,%r22,%r3
690 ADD,DC %r0,%r20,%r20 ; Cycle 3
691 ADD %r3,%r28,%r3
692 STD %r3,0(%r24) ; Cycle 4
693 LDD EIGHT(%r24),%r1 ; Cycle 5
694 ADD,DC %r4,%r20,%r20
695 ADD %r20,%r1,%r1 ; Cycle 7
696 ADD,DC %r0,%r0,%r21 ; Cycle 8
697 CMPIB,*= 0,%r21,$Z0 ; if no overflow, exit
698 STD %r1,EIGHT(%r24)
700 ; Final carry propagation
702 $FDIAG1
703 LDO EIGHT(%r24),%r24 ; step to the next more significant doubleword
704 LDD EIGHT(%r24),%r26
705 ADDI 1,%r26,%r26 ; add the carry
706 CMPIB,*= 0,%r26,$FDIAG1 ; Keep looping if there is a carry.
707 STD %r26,EIGHT(%r24) ; delay slot: write back the incremented doubleword
709 $Z0
710 LDW -124(%sp),%r4 ; restore r4
711 BVE (%r2)
712 .EXIT
713 LDW,MB -128(%sp),%r3 ; delay slot: sp -= 128, then restore r3
714 .PROCEND
715 ; .ALLOW
717 .SPACE $TEXT$
718 .SUBSPA $CODE$
; Export the two entry points under the names selected by the wordian
; setting. maxpy takes four general-register argument words and add_diag
; takes three. LONG_RETURN is left out when __GNUC__ is defined.
; NOTE(review): the big-wordian branch has no __GNUC__ variant and always
; passes LONG_RETURN. Confirm whether that configuration is ever built
; with GNU as.
719 #ifdef LITTLE_WORDIAN
720 #ifdef __GNUC__
721 ; GNU-as (as of 2.19) does not support LONG_RETURN
722 .EXPORT maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
723 .EXPORT add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR
724 #else
725 .EXPORT maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN
726 .EXPORT add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN
727 #endif
728 #else
729 .EXPORT maxpy_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN
730 .EXPORT add_diag_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN
731 #endif
732 .END
735 ; How to use "maxpy_PA20_little" and "maxpy_PA20_big"
736 ;
737 ; The routine "maxpy_PA20_little" or "maxpy_PA20_big"
738 ; performs a 64-bit x any-size multiply, and adds the
739 ; result to an area of memory. That is, it performs
740 ; something like
741 ;
742 ;                       A B C D
743 ;                     *       Z
744 ;                    __________
745 ;                     P Q R S T
746 ;
747 ; and then adds the "PQRST" vector into an area of memory,
748 ; handling all carries.
749 ;
750 ; Digression on nomenclature and endian-ness:
751 ;
752 ; Each of the capital letters in the above represents a 64-bit
753 ; quantity. That is, you could think of the discussion as
754 ; being in terms of radix-16-quintillion arithmetic. The data
755 ; type being manipulated is "unsigned long long int". This
756 ; requires the 64-bit extension of the HP-UX C compiler,
757 ; available at release 10. You need these compiler flags to
758 ; enable these extensions:
759 ;
760 ; -Aa +e +DA2.0 +DS2.0
761 ;
762 ; (The first specifies ANSI C, the second enables the
763 ; extensions, which are beyond ANSI C, and the third and
764 ; fourth tell the compiler to use whatever features of the
765 ; PA2.0 architecture it wishes, in order to make the code more
766 ; efficient. Since the presence of the assembly code will
767 ; make the program unable to run on anything less than PA2.0,
768 ; you might as well gain the performance enhancements in the C
769 ; code as well.)
770 ;
771 ; Questions of "endian-ness" often come up, usually in the
772 ; context of byte ordering in a word. These routines have a
773 ; similar issue, that could be called "wordian-ness".
774 ; Independent of byte ordering (PA is always big-endian), one
775 ; can make two choices when representing extremely large
776 ; numbers as arrays of 64-bit doublewords in memory.
777 ;
778 ; "Little-wordian" layout means that the least significant
779 ; word of a number is stored at the lowest address.
780 ;
781 ;        MSW                 LSW
782 ;         |                   |
783 ;         V                   V
784 ;
785 ;         A    B    C    D    E
786 ;
787 ;         ^              ^    ^
788 ;         |              |    |____ address 0
789 ;         |              |
790 ;         |              |_______address 8
791 ;         |
792 ;      address 32
793 ;
794 ; "Big-wordian" means that the most significant word is at the
795 ; lowest address.
796 ;
797 ;        MSW                 LSW
798 ;         |                   |
799 ;         V                   V
800 ;
801 ;         A    B    C    D    E
802 ;
803 ;         ^              ^    ^
804 ;         |              |    |____ address 32
805 ;         |              |
806 ;         |              |_______address 24
807 ;         |
808 ;      address 0
809 ;
810 ; When you compile the file, you must specify one or the other, with
811 ; a switch "-DLITTLE_WORDIAN" or "-DBIG_WORDIAN".
812 ;
813 ; Incidentally, you assemble this file as part of your
814 ; project with the same C compiler as the rest of the program.
815 ; My "makefile" for a superprecision arithmetic package has
816 ; the following stuff:
817 ;
818 ; # definitions:
819 ; CC = cc -Aa +e -z +DA2.0 +DS2.0 +w1
820 ; CFLAGS = +O3
821 ; LDFLAGS = -L /usr/lib -Wl,-aarchive
822 ;
823 ; # general build rule for ".s" files:
824 ; .s.o:
825 ; $(CC) $(CFLAGS) -c $< -DBIG_WORDIAN
826 ;
827 ; # Now any bind step that calls for pa20.o will assemble pa20.s
828 ;
829 ; End of digression, back to arithmetic:
830 ;
831 ; The way we multiply two huge numbers is, of course, to multiply
832 ; the "ABCD" vector by each of the "WXYZ" doublewords, adding
833 ; the result vectors with increasing offsets, the way we learned
834 ; in school, back before we all used calculators:
835 ;
836 ;                           A B C D
837 ;                         * W X Y Z
838 ;                        __________
839 ;                         P Q R S T
840 ;                       E F G H I
841 ;                     M N O P Q
842 ;                 + R S T U V
843 ;                 _______________
844 ;                   F I N A L S U M
845 ;
846 ; So we call maxpy_PA20_big (in my case; my package is
847 ; big-wordian) repeatedly, giving the W, X, Y, and Z arguments
848 ; in turn as the "scalar", and giving the "ABCD" vector each
849 ; time. We direct it to add its result into an area of memory
850 ; that we have cleared at the start. We skew the exact
851 ; location into that area with each call.
852 ;
853 ; The prototype for the function is
854 ;
855 ; extern void maxpy_PA20_big(
856 ; int length, /* Number of doublewords in the multiplicand vector. */
857 ; const long long int *scalaraddr, /* Address to fetch the scalar. */
858 ; const long long int *multiplicand, /* The multiplicand vector. */
859 ; long long int *result); /* Where to accumulate the result. */
860 ;
861 ; (You should place a copy of this prototype in an include file
862 ; or in your C file.)
863 ;
864 ; Now, IN ALL CASES, the given address for the multiplicand or
865 ; the result is that of the LEAST SIGNIFICANT DOUBLEWORD.
866 ; That word is, of course, the word at which the routine
867 ; starts processing. "maxpy_PA20_little" then increases the
868 ; addresses as it computes. "maxpy_PA20_big" decreases them.
869 ;
870 ; In our example above, "length" would be 4 in each case.
871 ; "multiplicand" would be the "ABCD" vector. Specifically,
872 ; the address of the element "D". "scalaraddr" would be the
873 ; address of "W", "X", "Y", or "Z" on the four calls that we
874 ; would make. (The order doesn't matter, of course.)
875 ; "result" would be the appropriate address in the result
876 ; area. When multiplying by "Z", that would be the least
877 ; significant word. When multiplying by "Y", it would be the
878 ; next higher word (8 bytes higher if little-wordian; 8 bytes
879 ; lower if big-wordian), and so on. The size of the result
880 ; area must be the sum of the sizes of the multiplicand
881 ; and multiplier vectors, and must be initialized to zero
882 ; before we start.
883 ;
884 ; Whenever the routine adds its partial product into the result
885 ; vector, it follows carry chains as far as they need to go.
886 ;
887 ; Here is the super-precision multiply routine that I use for
888 ; my package. The package is big-wordian. I have taken out
889 ; handling of exponents (it's a floating point package):
890 ;
891 ; static void mul_PA20(
892 ; int size,
893 ; const long long int *arg1,
894 ; const long long int *arg2,
895 ; long long int *result)
896 ; {
897 ; int i;
898 ;
899 ; for (i=0 ; i<2*size ; i++) result[i] = 0ULL;
900 ;
901 ; for (i=0 ; i<size ; i++) {
902 ; maxpy_PA20_big(size, &arg2[i], &arg1[size-1], &result[size+i]);
903 ; }
904 ; }