|
1 ; This Source Code Form is subject to the terms of the Mozilla Public |
|
2 ; License, v. 2.0. If a copy of the MPL was not distributed with this |
|
3 ; file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
4 |
|
; Assembler setup.  The architecture level is 2.0W when __LP64__ is defined
; and 2.0 otherwise; the code that follows is placed in the $CODE$ subspace
; of the $TEXT$ space.
5 #ifdef __LP64__ |
|
6 .LEVEL 2.0W |
|
7 #else |
|
8 ; .LEVEL 1.1 |
|
9 ; .ALLOW 2.0N |
|
10 .LEVEL 2.0 |
|
11 #endif |
|
12 .SPACE $TEXT$,SORT=8 |
|
13 .SUBSPA $CODE$,QUAD=0,ALIGN=4,ACCESS=0x2c,CODE_ONLY,SORT=24 |
|
14 |
|
15 ; *************************************************************** |
|
16 ; |
|
17 ; maxpy_[little/big] |
|
18 ; |
|
19 ; *************************************************************** |
|
20 |
|
; Word order selection.  Build with -DLITTLE_WORDIAN or -DBIG_WORDIAN.
; When neither is given, little-wordian is assumed.  Defining both is
; rejected, because the offset macros below would otherwise be defined
; twice with conflicting values while the little entry point names are used.
#ifndef BIG_WORDIAN
#ifndef LITTLE_WORDIAN
#define LITTLE_WORDIAN 1
#endif
#endif

#ifdef LITTLE_WORDIAN
#ifdef BIG_WORDIAN
#error Define only one of LITTLE_WORDIAN and BIG_WORDIAN
#endif
#endif

; Byte displacements used to step through the vectors.  EIGHT, SIXTEEN and
; THIRTY_TWO move toward more significant doublewords; the UN_ forms move
; back toward less significant ones.  The signs flip with the word order.
#ifdef LITTLE_WORDIAN
#define EIGHT 8
#define SIXTEEN 16
#define THIRTY_TWO 32
#define UN_EIGHT -8
#define UN_SIXTEEN -16
#define UN_TWENTY_FOUR -24
#endif

#ifdef BIG_WORDIAN
#define EIGHT -8
#define SIXTEEN -16
#define THIRTY_TWO -32
#define UN_EIGHT 8
#define UN_SIXTEEN 16
#define UN_TWENTY_FOUR 24
#endif
|
41 |
|
42 ; This performs a multiple-precision integer version of "daxpy", |
|
43 ; Using the selected addressing direction. "Little-wordian" means that |
|
44 ; the least significant word of a number is stored at the lowest address. |
|
45 ; "Big-wordian" means that the most significant word is at the lowest |
|
46 ; address. Either way, the incoming address of the vector is that |
|
47 ; of the least significant word. That means that, for little-wordian |
|
48 ; addressing, we move the address upward as we propagate carries |
|
49 ; from the least significant word to the most significant. For |
|
50 ; big-wordian we move the address downward. |
|
51 |
|
52 ; We use the following registers: |
|
53 ; |
|
54 ; r2 return PC, of course |
|
55 ; r26 = arg1 = length |
|
56 ; r25 = arg2 = address of scalar |
|
57 ; r24 = arg3 = multiplicand vector |
|
58 ; r23 = arg4 = result vector |
|
59 ; |
|
60 ; fr9 = scalar loaded once only from r25 |
|
61 |
|
62 ; The cycle counts shown in the bodies below are simply the result of a |
|
63 ; scheduling by hand. The actual PCX-U hardware does it differently. |
|
64 ; The intention is that the overall speed is the same. |
|
65 |
|
66 ; The pipeline startup and shutdown code is constructed in the usual way, |
|
67 ; by taking the loop bodies and removing unnecessary instructions. |
|
68 ; We have left the comments describing cycle numbers in the code. |
|
69 ; These are intended for reference when comparing with the main loop, |
|
70 ; and have no particular relationship to actual cycle numbers. |
|
71 |
|
; maxpy: multiply the vector at r24 by the 64-bit scalar at 0(r25) and add
; the product into the vector at r23 (see the register list above).
; Notes on how the code below is organised:
;  - r26 is decremented on entry and afterwards serves as the loop counter.
;  - XMPYU forms the 32x32 partial products of the scalar halves (fr9L/fr9R)
;    with the halves of each multiplicand doubleword.  They are written with
;    FSTD to the scratch slots -104(%sp)..-48(%sp) and read back with LDD
;    into general registers.
;  - The integer side combines them with ADD / ADD,DC and SHRPD by 32.
;  - The instruction that follows each branch sits in its delay slot.
72 #ifdef LITTLE_WORDIAN |
|
73 maxpy_little |
|
74 #else |
|
75 maxpy_big |
|
76 #endif |
|
77 .PROC |
|
78 .CALLINFO FRAME=120,ENTRY_GR=4 |
|
79 .ENTRY |
|
; Prologue: STW,MA saves r3 at the incoming %sp and then advances %sp by 128;
; r4 is saved at -124(%sp).  Both are restored at $L0.
80 STW,MA %r3,128(%sp) |
|
81 STW %r4,-124(%sp) |
|
82 |
|
83 ADDIB,< -1,%r26,$L0 ; If N = 0, exit immediately. |
|
84 FLDD 0(%r25),%fr9 ; fr9 = scalar |
|
85 |
|
86 ; First startup |
|
87 |
|
88 FLDD 0(%r24),%fr24 ; Cycle 1 |
|
89 XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3 |
|
90 XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4 |
|
91 XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5 |
|
92 CMPIB,> 3,%r26,$N_IS_SMALL ; Pick out cases N = 1, 2, or 3 |
|
93 XMPYU %fr9L,%fr24R,%fr24 ; Cycle 6 |
|
94 FLDD EIGHT(%r24),%fr28 ; Cycle 8 |
|
95 XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10 |
|
96 FSTD %fr24,-96(%sp) |
|
97 XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11 |
|
98 FSTD %fr25,-80(%sp) |
|
99 LDO SIXTEEN(%r24),%r24 ; Cycle 12 |
|
100 FSTD %fr31,-64(%sp) |
|
101 XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13 |
|
102 FSTD %fr27,-48(%sp) |
|
103 |
|
104 ; Second startup |
|
105 |
|
106 XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1 |
|
107 FSTD %fr30,-56(%sp) |
|
108 FLDD 0(%r24),%fr24 |
|
109 |
|
110 FSTD %fr26,-88(%sp) ; Cycle 2 |
|
111 |
|
112 XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3 |
|
113 FSTD %fr28,-104(%sp) |
|
114 |
|
115 XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4 |
|
116 LDD -96(%sp),%r3 |
|
117 FSTD %fr29,-72(%sp) |
|
118 |
|
119 XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5 |
|
120 LDD -64(%sp),%r19 |
|
121 LDD -80(%sp),%r21 |
|
122 |
|
123 XMPYU %fr9L,%fr24R,%fr24 ; Cycle 6 |
|
124 LDD -56(%sp),%r20 |
|
125 ADD %r21,%r3,%r3 |
|
126 |
|
127 ADD,DC %r20,%r19,%r19 ; Cycle 7 |
|
128 LDD -88(%sp),%r4 |
|
129 SHRPD %r3,%r0,32,%r21 |
|
130 LDD -48(%sp),%r1 |
|
131 |
|
132 FLDD EIGHT(%r24),%fr28 ; Cycle 8 |
|
133 LDD -104(%sp),%r31 |
|
134 ADD,DC %r0,%r0,%r20 |
|
135 SHRPD %r19,%r3,32,%r3 |
|
136 |
|
137 LDD -72(%sp),%r29 ; Cycle 9 |
|
138 SHRPD %r20,%r19,32,%r20 |
|
139 ADD %r21,%r1,%r1 |
|
140 |
|
141 XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10 |
|
142 ADD,DC %r3,%r4,%r4 |
|
143 FSTD %fr24,-96(%sp) |
|
144 |
|
145 XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11 |
|
146 ADD,DC %r0,%r20,%r20 |
|
147 LDD 0(%r23),%r3 |
|
148 FSTD %fr25,-80(%sp) |
|
149 |
|
150 LDO SIXTEEN(%r24),%r24 ; Cycle 12 |
|
151 FSTD %fr31,-64(%sp) |
|
152 |
|
153 XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13 |
|
154 ADD %r0,%r0,%r0 ; clear the carry bit |
|
155 ADDIB,<= -4,%r26,$ENDLOOP ; actually happens in cycle 12 |
|
156 FSTD %fr27,-48(%sp) |
|
157 ; MFCTL %cr16,%r21 ; for timing |
|
158 ; STD %r21,-112(%sp) |
|
159 |
|
160 ; Here is the loop. |
|
161 |
|
; Each pass loads two multiplicand doublewords (0 and EIGHT from r24),
; advances r24 and r23 by SIXTEEN and stores two result doublewords.
; The ADDIB at the bottom subtracts 2 from r26 and repeats while the
; result is positive.
162 $LOOP XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1 |
|
163 ADD,DC %r29,%r4,%r4 |
|
164 FSTD %fr30,-56(%sp) |
|
165 FLDD 0(%r24),%fr24 |
|
166 |
|
167 LDO SIXTEEN(%r23),%r23 ; Cycle 2 |
|
168 ADD,DC %r0,%r20,%r20 |
|
169 FSTD %fr26,-88(%sp) |
|
170 |
|
171 XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3 |
|
172 ADD %r3,%r1,%r1 |
|
173 FSTD %fr28,-104(%sp) |
|
174 LDD UN_EIGHT(%r23),%r21 |
|
175 |
|
176 XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4 |
|
177 ADD,DC %r21,%r4,%r28 |
|
178 FSTD %fr29,-72(%sp) |
|
179 LDD -96(%sp),%r3 |
|
180 |
|
181 XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5 |
|
182 ADD,DC %r20,%r31,%r22 |
|
183 LDD -64(%sp),%r19 |
|
184 LDD -80(%sp),%r21 |
|
185 |
|
186 XMPYU %fr9L,%fr24R,%fr24 ; Cycle 6 |
|
187 ADD %r21,%r3,%r3 |
|
188 LDD -56(%sp),%r20 |
|
189 STD %r1,UN_SIXTEEN(%r23) |
|
190 |
|
191 ADD,DC %r20,%r19,%r19 ; Cycle 7 |
|
192 SHRPD %r3,%r0,32,%r21 |
|
193 LDD -88(%sp),%r4 |
|
194 LDD -48(%sp),%r1 |
|
195 |
|
196 ADD,DC %r0,%r0,%r20 ; Cycle 8 |
|
197 SHRPD %r19,%r3,32,%r3 |
|
198 FLDD EIGHT(%r24),%fr28 |
|
199 LDD -104(%sp),%r31 |
|
200 |
|
201 SHRPD %r20,%r19,32,%r20 ; Cycle 9 |
|
202 ADD %r21,%r1,%r1 |
|
203 STD %r28,UN_EIGHT(%r23) |
|
204 LDD -72(%sp),%r29 |
|
205 |
|
206 XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10 |
|
207 ADD,DC %r3,%r4,%r4 |
|
208 FSTD %fr24,-96(%sp) |
|
209 |
|
210 XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11 |
|
211 ADD,DC %r0,%r20,%r20 |
|
212 FSTD %fr25,-80(%sp) |
|
213 LDD 0(%r23),%r3 |
|
214 |
|
215 LDO SIXTEEN(%r24),%r24 ; Cycle 12 |
|
216 FSTD %fr31,-64(%sp) |
|
217 |
|
218 XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13 |
|
219 ADD %r22,%r1,%r1 |
|
220 ADDIB,> -2,%r26,$LOOP ; actually happens in cycle 12 |
|
221 FSTD %fr27,-48(%sp) |
|
222 |
|
223 $ENDLOOP |
|
224 |
|
225 ; Shutdown code, first stage. |
|
226 |
|
227 ; MFCTL %cr16,%r21 ; for timing |
|
228 ; STD %r21,UN_SIXTEEN(%r23) |
|
229 ; LDD -112(%sp),%r21 |
|
230 ; STD %r21,UN_EIGHT(%r23) |
|
231 |
|
; r26 equal to zero here selects the $ONEMORE path (one extra doubleword);
; otherwise the pipeline is drained by the stages that follow.
232 XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1 |
|
233 ADD,DC %r29,%r4,%r4 |
|
234 CMPIB,= 0,%r26,$ONEMORE |
|
235 FSTD %fr30,-56(%sp) |
|
236 |
|
237 LDO SIXTEEN(%r23),%r23 ; Cycle 2 |
|
238 ADD,DC %r0,%r20,%r20 |
|
239 FSTD %fr26,-88(%sp) |
|
240 |
|
241 ADD %r3,%r1,%r1 ; Cycle 3 |
|
242 FSTD %fr28,-104(%sp) |
|
243 LDD UN_EIGHT(%r23),%r21 |
|
244 |
|
245 ADD,DC %r21,%r4,%r28 ; Cycle 4 |
|
246 FSTD %fr29,-72(%sp) |
|
247 STD %r28,UN_EIGHT(%r23) ; moved up from cycle 9 |
|
248 LDD -96(%sp),%r3 |
|
249 |
|
250 ADD,DC %r20,%r31,%r22 ; Cycle 5 |
|
251 STD %r1,UN_SIXTEEN(%r23) |
|
; $JOIN4 is also entered from the N = 2 case, with r22 set to zero.
252 $JOIN4 |
|
253 LDD -64(%sp),%r19 |
|
254 LDD -80(%sp),%r21 |
|
255 |
|
256 ADD %r21,%r3,%r3 ; Cycle 6 |
|
257 LDD -56(%sp),%r20 |
|
258 |
|
259 ADD,DC %r20,%r19,%r19 ; Cycle 7 |
|
260 SHRPD %r3,%r0,32,%r21 |
|
261 LDD -88(%sp),%r4 |
|
262 LDD -48(%sp),%r1 |
|
263 |
|
264 ADD,DC %r0,%r0,%r20 ; Cycle 8 |
|
265 SHRPD %r19,%r3,32,%r3 |
|
266 LDD -104(%sp),%r31 |
|
267 |
|
268 SHRPD %r20,%r19,32,%r20 ; Cycle 9 |
|
269 ADD %r21,%r1,%r1 |
|
270 LDD -72(%sp),%r29 |
|
271 |
|
272 ADD,DC %r3,%r4,%r4 ; Cycle 10 |
|
273 |
|
274 ADD,DC %r0,%r20,%r20 ; Cycle 11 |
|
275 LDD 0(%r23),%r3 |
|
276 |
|
277 ADD %r22,%r1,%r1 ; Cycle 13 |
|
278 |
|
279 ; Shutdown code, second stage. |
|
280 |
|
281 ADD,DC %r29,%r4,%r4 ; Cycle 1 |
|
282 |
|
283 LDO SIXTEEN(%r23),%r23 ; Cycle 2 |
|
284 ADD,DC %r0,%r20,%r20 |
|
285 |
|
286 LDD UN_EIGHT(%r23),%r21 ; Cycle 3 |
|
287 ADD %r3,%r1,%r1 |
|
288 |
|
289 ADD,DC %r21,%r4,%r28 ; Cycle 4 |
|
290 |
|
291 ADD,DC %r20,%r31,%r22 ; Cycle 5 |
|
292 |
|
293 STD %r1,UN_SIXTEEN(%r23); Cycle 6 |
|
294 |
|
295 STD %r28,UN_EIGHT(%r23) ; Cycle 9 |
|
296 |
|
297 LDD 0(%r23),%r3 ; Cycle 11 |
|
298 |
|
299 ; Shutdown code, third stage. |
|
300 |
|
301 LDO SIXTEEN(%r23),%r23 |
|
302 ADD %r3,%r22,%r1 |
|
; At $JOIN1, r21 receives the carry out of the preceding addition.  Zero
; means nothing is left to propagate; the STD in the delay slot stores the
; last sum either way.
303 $JOIN1 ADD,DC %r0,%r0,%r21 |
|
304 CMPIB,*= 0,%r21,$L0 ; if no overflow, exit |
|
305 STD %r1,UN_SIXTEEN(%r23) |
|
306 |
|
307 ; Final carry propagation |
|
308 |
|
; Add 1 to successive result doublewords until one does not wrap to zero.
309 $FINAL1 LDO EIGHT(%r23),%r23 |
|
310 LDD UN_SIXTEEN(%r23),%r21 |
|
311 ADDI 1,%r21,%r21 |
|
312 CMPIB,*= 0,%r21,$FINAL1 ; Keep looping if there is a carry. |
|
313 STD %r21,UN_SIXTEEN(%r23) |
|
314 B $L0 |
|
315 NOP |
|
316 |
|
317 ; Here is the code that handles the difficult cases N=1, N=2, and N=3. |
|
318 ; We do the usual trick -- branch out of the startup code at appropriate |
|
319 ; points, and branch into the shutdown code. |
|
320 |
|
; r26 holds N-1 here: 0 selects N=1, 2 selects N=3, anything else is N=2.
321 $N_IS_SMALL |
|
322 CMPIB,= 0,%r26,$N_IS_ONE |
|
323 FSTD %fr24,-96(%sp) ; Cycle 10 |
|
324 FLDD EIGHT(%r24),%fr28 ; Cycle 8 |
|
325 XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10 |
|
326 XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11 |
|
327 FSTD %fr25,-80(%sp) |
|
328 FSTD %fr31,-64(%sp) ; Cycle 12 |
|
329 XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13 |
|
330 FSTD %fr27,-48(%sp) |
|
331 XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1 |
|
332 CMPIB,= 2,%r26,$N_IS_THREE |
|
333 FSTD %fr30,-56(%sp) |
|
334 |
|
335 ; N = 2 |
|
336 FSTD %fr26,-88(%sp) ; Cycle 2 |
|
337 FSTD %fr28,-104(%sp) ; Cycle 3 |
|
338 LDD -96(%sp),%r3 ; Cycle 4 |
|
339 FSTD %fr29,-72(%sp) |
|
340 B $JOIN4 |
|
341 ADD %r0,%r0,%r22 |
|
342 |
|
343 $N_IS_THREE |
|
344 FLDD SIXTEEN(%r24),%fr24 |
|
345 FSTD %fr26,-88(%sp) ; Cycle 2 |
|
346 XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3 |
|
347 FSTD %fr28,-104(%sp) |
|
348 XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4 |
|
349 LDD -96(%sp),%r3 |
|
350 FSTD %fr29,-72(%sp) |
|
351 XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5 |
|
352 LDD -64(%sp),%r19 |
|
353 LDD -80(%sp),%r21 |
|
354 B $JOIN3 |
|
355 ADD %r0,%r0,%r22 |
|
356 |
|
357 $N_IS_ONE |
|
358 FSTD %fr25,-80(%sp) |
|
359 FSTD %fr27,-48(%sp) |
|
360 FSTD %fr26,-88(%sp) ; Cycle 2 |
|
361 B $JOIN5 |
|
362 ADD %r0,%r0,%r22 |
|
363 |
|
364 ; We came out of the unrolled loop with wrong parity. Do one more |
|
365 ; single cycle. This is quite tricky, because of the way the |
|
366 ; carry chains and SHRPD chains have been chopped up. |
|
367 |
|
368 $ONEMORE |
|
369 |
|
370 FLDD 0(%r24),%fr24 |
|
371 |
|
372 LDO SIXTEEN(%r23),%r23 ; Cycle 2 |
|
373 ADD,DC %r0,%r20,%r20 |
|
374 FSTD %fr26,-88(%sp) |
|
375 |
|
376 XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3 |
|
377 FSTD %fr28,-104(%sp) |
|
378 LDD UN_EIGHT(%r23),%r21 |
|
379 ADD %r3,%r1,%r1 |
|
380 |
|
381 XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4 |
|
382 ADD,DC %r21,%r4,%r28 |
|
383 STD %r28,UN_EIGHT(%r23) ; moved from cycle 9 |
|
384 LDD -96(%sp),%r3 |
|
385 FSTD %fr29,-72(%sp) |
|
386 |
|
387 XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5 |
|
388 ADD,DC %r20,%r31,%r22 |
|
389 LDD -64(%sp),%r19 |
|
390 LDD -80(%sp),%r21 |
|
391 |
|
392 STD %r1,UN_SIXTEEN(%r23); Cycle 6 |
|
; $JOIN3 is also entered from the N = 3 case, with r22 set to zero.
393 $JOIN3 |
|
394 XMPYU %fr9L,%fr24R,%fr24 |
|
395 LDD -56(%sp),%r20 |
|
396 ADD %r21,%r3,%r3 |
|
397 |
|
398 ADD,DC %r20,%r19,%r19 ; Cycle 7 |
|
399 LDD -88(%sp),%r4 |
|
400 SHRPD %r3,%r0,32,%r21 |
|
401 LDD -48(%sp),%r1 |
|
402 |
|
403 LDD -104(%sp),%r31 ; Cycle 8 |
|
404 ADD,DC %r0,%r0,%r20 |
|
405 SHRPD %r19,%r3,32,%r3 |
|
406 |
|
407 LDD -72(%sp),%r29 ; Cycle 9 |
|
408 SHRPD %r20,%r19,32,%r20 |
|
409 ADD %r21,%r1,%r1 |
|
410 |
|
411 ADD,DC %r3,%r4,%r4 ; Cycle 10 |
|
412 FSTD %fr24,-96(%sp) |
|
413 |
|
414 ADD,DC %r0,%r20,%r20 ; Cycle 11 |
|
415 LDD 0(%r23),%r3 |
|
416 FSTD %fr25,-80(%sp) |
|
417 |
|
418 ADD %r22,%r1,%r1 ; Cycle 13 |
|
419 FSTD %fr27,-48(%sp) |
|
420 |
|
421 ; Shutdown code, stage 1-1/2. |
|
422 |
|
423 ADD,DC %r29,%r4,%r4 ; Cycle 1 |
|
424 |
|
425 LDO SIXTEEN(%r23),%r23 ; Cycle 2 |
|
426 ADD,DC %r0,%r20,%r20 |
|
427 FSTD %fr26,-88(%sp) |
|
428 |
|
429 LDD UN_EIGHT(%r23),%r21 ; Cycle 3 |
|
430 ADD %r3,%r1,%r1 |
|
431 |
|
432 ADD,DC %r21,%r4,%r28 ; Cycle 4 |
|
433 STD %r28,UN_EIGHT(%r23) ; moved from cycle 9 |
|
434 |
|
435 ADD,DC %r20,%r31,%r22 ; Cycle 5 |
|
436 STD %r1,UN_SIXTEEN(%r23) |
|
; $JOIN5 is also entered from the N = 1 case, with r22 set to zero.
437 $JOIN5 |
|
438 LDD -96(%sp),%r3 ; moved from cycle 4 |
|
439 LDD -80(%sp),%r21 |
|
440 ADD %r21,%r3,%r3 ; Cycle 6 |
|
441 ADD,DC %r0,%r0,%r19 ; Cycle 7 |
|
442 LDD -88(%sp),%r4 |
|
443 SHRPD %r3,%r0,32,%r21 |
|
444 LDD -48(%sp),%r1 |
|
445 SHRPD %r19,%r3,32,%r3 ; Cycle 8 |
|
446 ADD %r21,%r1,%r1 ; Cycle 9 |
|
447 ADD,DC %r3,%r4,%r4 ; Cycle 10 |
|
448 LDD 0(%r23),%r3 ; Cycle 11 |
|
449 ADD %r22,%r1,%r1 ; Cycle 13 |
|
450 |
|
451 ; Shutdown code, stage 2-1/2. |
|
452 |
|
453 ADD,DC %r0,%r4,%r4 ; Cycle 1 |
|
454 LDO SIXTEEN(%r23),%r23 ; Cycle 2 |
|
455 LDD UN_EIGHT(%r23),%r21 ; Cycle 3 |
|
456 ADD %r3,%r1,%r1 |
|
457 STD %r1,UN_SIXTEEN(%r23) |
|
458 ADD,DC %r21,%r4,%r1 |
|
459 B $JOIN1 |
|
460 LDO EIGHT(%r23),%r23 |
|
461 |
|
462 ; exit |
|
463 |
|
; Epilogue: restore r4 and return through r2.  The LDW,MB in the delay slot
; moves %sp back down by 128 and reloads r3 from there.
464 $L0 |
|
465 LDW -124(%sp),%r4 |
|
466 BVE (%r2) |
|
467 .EXIT |
|
468 LDW,MB -128(%sp),%r3 |
|
469 |
|
470 .PROCEND |
|
471 |
|
472 ; *************************************************************** |
|
473 ; |
|
474 ; add_diag_[little/big] |
|
475 ; |
|
476 ; *************************************************************** |
|
477 |
|
478 ; The arguments are as follows: |
|
479 ; r2 return PC, of course |
|
480 ; r26 = arg1 = length |
|
481 ; r25 = arg2 = vector to square |
|
482 ; r24 = arg3 = result vector |
|
483 |
|
; add_diag: for each doubleword of the vector at r25, form its square and
; add it into two consecutive doublewords of the vector at r24 (see the
; argument list above).
; Notes on how the code below is organised:
;  - r26 is decremented on entry and afterwards serves as the loop counter.
;  - XMPYU forms R*R, L*R and L*L from the two halves of fr7.  The products
;    are written with FSTD to scratch slots -104(%sp)..-64(%sp) and read
;    back with LDD into general registers.
;  - The pair of SHRPD-by-31 instructions splits the L*R product into the
;    low and high doublewords of that product shifted left by 33 bits, that
;    is, doubled and aligned 32 bits up.
;  - r25 advances by one doubleword per element and r24 by two.
;  - The instruction that follows each branch sits in its delay slot.
484 #ifdef LITTLE_WORDIAN |
|
485 add_diag_little |
|
486 #else |
|
487 add_diag_big |
|
488 #endif |
|
489 .PROC |
|
490 .CALLINFO FRAME=120,ENTRY_GR=4 |
|
491 .ENTRY |
|
; Prologue: STW,MA saves r3 at the incoming %sp and then advances %sp by 128;
; r4 is saved at -124(%sp).  Both are restored at $Z0.
492 STW,MA %r3,128(%sp) |
|
493 STW %r4,-124(%sp) |
|
494 |
|
495 ADDIB,< -1,%r26,$Z0 ; If N=0, exit immediately. |
|
496 NOP |
|
497 |
|
498 ; Startup code |
|
499 |
|
500 FLDD 0(%r25),%fr7 ; Cycle 2 (alternate body) |
|
501 XMPYU %fr7R,%fr7R,%fr29 ; Cycle 4 |
|
502 XMPYU %fr7L,%fr7R,%fr27 ; Cycle 5 |
|
503 XMPYU %fr7L,%fr7L,%fr30 |
|
504 LDO SIXTEEN(%r25),%r25 ; Cycle 6 |
|
505 FSTD %fr29,-88(%sp) |
|
506 FSTD %fr27,-72(%sp) ; Cycle 7 |
|
507 CMPIB,= 0,%r26,$DIAG_N_IS_ONE ; Cycle 1 (main body) |
|
508 FSTD %fr30,-96(%sp) |
|
509 FLDD UN_EIGHT(%r25),%fr7 ; Cycle 2 |
|
510 LDD -88(%sp),%r22 ; Cycle 3 |
|
511 LDD -72(%sp),%r31 ; Cycle 4 |
|
512 XMPYU %fr7R,%fr7R,%fr28 |
|
513 XMPYU %fr7L,%fr7R,%fr24 ; Cycle 5 |
|
514 XMPYU %fr7L,%fr7L,%fr31 |
|
515 LDD -96(%sp),%r20 ; Cycle 6 |
|
516 FSTD %fr28,-80(%sp) |
|
517 ADD %r0,%r0,%r0 ; clear the carry bit |
|
518 ADDIB,<= -2,%r26,$ENDDIAGLOOP ; Cycle 7 |
|
519 FSTD %fr24,-64(%sp) |
|
520 |
|
521 ; Here is the loop. It is unrolled twice, modelled after the "alternate body" and then the "main body". |
|
522 |
|
; Each pass handles two input doublewords and stores four result
; doublewords.  The ADDIB at the bottom subtracts 2 from r26 and repeats
; while the result is positive.
523 $DIAGLOOP |
|
524 SHRPD %r31,%r0,31,%r3 ; Cycle 1 (alternate body) |
|
525 LDO SIXTEEN(%r25),%r25 |
|
526 LDD 0(%r24),%r1 |
|
527 FSTD %fr31,-104(%sp) |
|
528 SHRPD %r0,%r31,31,%r4 ; Cycle 2 |
|
529 ADD,DC %r22,%r3,%r3 |
|
530 FLDD UN_SIXTEEN(%r25),%fr7 |
|
531 ADD,DC %r0,%r20,%r20 ; Cycle 3 |
|
532 ADD %r1,%r3,%r3 |
|
533 XMPYU %fr7R,%fr7R,%fr29 ; Cycle 4 |
|
534 LDD -80(%sp),%r21 |
|
535 STD %r3,0(%r24) |
|
536 XMPYU %fr7L,%fr7R,%fr27 ; Cycle 5 |
|
537 XMPYU %fr7L,%fr7L,%fr30 |
|
538 LDD -64(%sp),%r29 |
|
539 LDD EIGHT(%r24),%r1 |
|
540 ADD,DC %r4,%r20,%r20 ; Cycle 6 |
|
541 LDD -104(%sp),%r19 |
|
542 FSTD %fr29,-88(%sp) |
|
543 ADD %r20,%r1,%r1 ; Cycle 7 |
|
544 FSTD %fr27,-72(%sp) |
|
545 SHRPD %r29,%r0,31,%r4 ; Cycle 1 (main body) |
|
546 LDO THIRTY_TWO(%r24),%r24 |
|
547 LDD UN_SIXTEEN(%r24),%r28 |
|
548 FSTD %fr30,-96(%sp) |
|
549 SHRPD %r0,%r29,31,%r3 ; Cycle 2 |
|
550 ADD,DC %r21,%r4,%r4 |
|
551 FLDD UN_EIGHT(%r25),%fr7 |
|
552 STD %r1,UN_TWENTY_FOUR(%r24) |
|
553 ADD,DC %r0,%r19,%r19 ; Cycle 3 |
|
554 ADD %r28,%r4,%r4 |
|
555 XMPYU %fr7R,%fr7R,%fr28 ; Cycle 4 |
|
556 LDD -88(%sp),%r22 |
|
557 STD %r4,UN_SIXTEEN(%r24) |
|
558 XMPYU %fr7L,%fr7R,%fr24 ; Cycle 5 |
|
559 XMPYU %fr7L,%fr7L,%fr31 |
|
560 LDD -72(%sp),%r31 |
|
561 LDD UN_EIGHT(%r24),%r28 |
|
562 ADD,DC %r3,%r19,%r19 ; Cycle 6 |
|
563 LDD -96(%sp),%r20 |
|
564 FSTD %fr28,-80(%sp) |
|
565 ADD %r19,%r28,%r28 ; Cycle 7 |
|
566 FSTD %fr24,-64(%sp) |
|
567 ADDIB,> -2,%r26,$DIAGLOOP ; Cycle 8 |
|
568 STD %r28,UN_EIGHT(%r24) |
|
569 |
|
; r26 equal to zero here selects $ONEMOREDIAG (one extra element);
; otherwise the two shutdown stages below drain the pipeline.
570 $ENDDIAGLOOP |
|
571 |
|
572 ADD,DC %r0,%r22,%r22 |
|
573 CMPIB,= 0,%r26,$ONEMOREDIAG |
|
574 SHRPD %r31,%r0,31,%r3 |
|
575 |
|
576 ; Shutdown code, first stage. |
|
577 |
|
578 FSTD %fr31,-104(%sp) ; Cycle 1 (alternate body) |
|
579 LDD 0(%r24),%r28 |
|
580 SHRPD %r0,%r31,31,%r4 ; Cycle 2 |
|
581 ADD %r3,%r22,%r3 |
|
582 ADD,DC %r0,%r20,%r20 ; Cycle 3 |
|
583 LDD -80(%sp),%r21 |
|
584 ADD %r3,%r28,%r3 |
|
585 LDD -64(%sp),%r29 ; Cycle 4 |
|
586 STD %r3,0(%r24) |
|
587 LDD EIGHT(%r24),%r1 ; Cycle 5 |
|
588 LDO SIXTEEN(%r25),%r25 ; Cycle 6 |
|
589 LDD -104(%sp),%r19 |
|
590 ADD,DC %r4,%r20,%r20 |
|
591 ADD %r20,%r1,%r1 ; Cycle 7 |
|
592 ADD,DC %r0,%r21,%r21 ; Cycle 8 |
|
593 STD %r1,EIGHT(%r24) |
|
594 |
|
595 ; Shutdown code, second stage. |
|
596 |
|
597 SHRPD %r29,%r0,31,%r4 ; Cycle 1 (main body) |
|
598 LDO THIRTY_TWO(%r24),%r24 |
|
599 LDD UN_SIXTEEN(%r24),%r1 |
|
600 SHRPD %r0,%r29,31,%r3 ; Cycle 2 |
|
601 ADD %r4,%r21,%r4 |
|
602 ADD,DC %r0,%r19,%r19 ; Cycle 3 |
|
603 ADD %r4,%r1,%r4 |
|
604 STD %r4,UN_SIXTEEN(%r24); Cycle 4 |
|
605 LDD UN_EIGHT(%r24),%r28 ; Cycle 5 |
|
606 ADD,DC %r3,%r19,%r19 ; Cycle 6 |
|
607 ADD %r19,%r28,%r28 ; Cycle 7 |
|
608 ADD,DC %r0,%r0,%r22 ; Cycle 8 |
|
609 CMPIB,*= 0,%r22,$Z0 ; if no overflow, exit |
|
610 STD %r28,UN_EIGHT(%r24) |
|
611 |
|
612 ; Final carry propagation |
|
613 |
|
; Add 1 to successive result doublewords until one does not wrap to zero.
614 $FDIAG2 |
|
615 LDO EIGHT(%r24),%r24 |
|
616 LDD UN_EIGHT(%r24),%r26 |
|
617 ADDI 1,%r26,%r26 |
|
618 CMPIB,*= 0,%r26,$FDIAG2 ; Keep looping if there is a carry. |
|
619 STD %r26,UN_EIGHT(%r24) |
|
620 |
|
621 B $Z0 |
|
622 NOP |
|
623 |
|
624 ; Here is the code that handles the difficult case N=1. |
|
625 ; We do the usual trick -- branch out of the startup code at appropriate |
|
626 ; points, and branch into the shutdown code. |
|
627 |
|
628 $DIAG_N_IS_ONE |
|
629 |
|
630 LDD -88(%sp),%r22 |
|
631 LDD -72(%sp),%r31 |
|
632 B $JOINDIAG |
|
633 LDD -96(%sp),%r20 |
|
634 |
|
635 ; We came out of the unrolled loop with wrong parity. Do one more |
|
636 ; single cycle. This is the "alternate body". It will, of course, |
|
637 ; give us opposite registers from the other case, so we need |
|
638 ; completely different shutdown code. |
|
639 |
|
640 $ONEMOREDIAG |
|
641 FSTD %fr31,-104(%sp) ; Cycle 1 (alternate body) |
|
642 LDD 0(%r24),%r28 |
|
643 FLDD 0(%r25),%fr7 ; Cycle 2 |
|
644 SHRPD %r0,%r31,31,%r4 |
|
645 ADD %r3,%r22,%r3 |
|
646 ADD,DC %r0,%r20,%r20 ; Cycle 3 |
|
647 LDD -80(%sp),%r21 |
|
648 ADD %r3,%r28,%r3 |
|
649 LDD -64(%sp),%r29 ; Cycle 4 |
|
650 STD %r3,0(%r24) |
|
651 XMPYU %fr7R,%fr7R,%fr29 |
|
652 LDD EIGHT(%r24),%r1 ; Cycle 5 |
|
653 XMPYU %fr7L,%fr7R,%fr27 |
|
654 XMPYU %fr7L,%fr7L,%fr30 |
|
655 LDD -104(%sp),%r19 ; Cycle 6 |
|
656 FSTD %fr29,-88(%sp) |
|
657 ADD,DC %r4,%r20,%r20 |
|
658 FSTD %fr27,-72(%sp) ; Cycle 7 |
|
659 ADD %r20,%r1,%r1 |
|
660 ADD,DC %r0,%r21,%r21 ; Cycle 8 |
|
661 STD %r1,EIGHT(%r24) |
|
662 |
|
663 ; Shutdown code, first stage. |
|
664 |
|
665 SHRPD %r29,%r0,31,%r4 ; Cycle 1 (main body) |
|
666 LDO THIRTY_TWO(%r24),%r24 |
|
667 FSTD %fr30,-96(%sp) |
|
668 LDD UN_SIXTEEN(%r24),%r1 |
|
669 SHRPD %r0,%r29,31,%r3 ; Cycle 2 |
|
670 ADD %r4,%r21,%r4 |
|
671 ADD,DC %r0,%r19,%r19 ; Cycle 3 |
|
672 LDD -88(%sp),%r22 |
|
673 ADD %r4,%r1,%r4 |
|
674 LDD -72(%sp),%r31 ; Cycle 4 |
|
675 STD %r4,UN_SIXTEEN(%r24) |
|
676 LDD UN_EIGHT(%r24),%r28 ; Cycle 5 |
|
677 LDD -96(%sp),%r20 ; Cycle 6 |
|
678 ADD,DC %r3,%r19,%r19 |
|
679 ADD %r19,%r28,%r28 ; Cycle 7 |
|
680 ADD,DC %r0,%r22,%r22 ; Cycle 8 |
|
681 STD %r28,UN_EIGHT(%r24) |
|
682 |
|
683 ; Shutdown code, second stage. |
|
684 |
|
; $JOINDIAG is also entered from the N = 1 case, with r22, r31 and r20
; reloaded from the scratch slots.
685 $JOINDIAG |
|
686 SHRPD %r31,%r0,31,%r3 ; Cycle 1 (alternate body) |
|
687 LDD 0(%r24),%r28 |
|
688 SHRPD %r0,%r31,31,%r4 ; Cycle 2 |
|
689 ADD %r3,%r22,%r3 |
|
690 ADD,DC %r0,%r20,%r20 ; Cycle 3 |
|
691 ADD %r3,%r28,%r3 |
|
692 STD %r3,0(%r24) ; Cycle 4 |
|
693 LDD EIGHT(%r24),%r1 ; Cycle 5 |
|
694 ADD,DC %r4,%r20,%r20 |
|
695 ADD %r20,%r1,%r1 ; Cycle 7 |
|
696 ADD,DC %r0,%r0,%r21 ; Cycle 8 |
|
697 CMPIB,*= 0,%r21,$Z0 ; if no overflow, exit |
|
698 STD %r1,EIGHT(%r24) |
|
699 |
|
700 ; Final carry propagation |
|
701 |
|
; Same carry walk as $FDIAG2, with displacements relative to this path's r24.
; It falls through into $Z0 when the carry stops.
702 $FDIAG1 |
|
703 LDO EIGHT(%r24),%r24 |
|
704 LDD EIGHT(%r24),%r26 |
|
705 ADDI 1,%r26,%r26 |
|
706 CMPIB,*= 0,%r26,$FDIAG1 ; Keep looping if there is a carry. |
|
707 STD %r26,EIGHT(%r24) |
|
708 |
|
; Epilogue: restore r4 and return through r2.  The LDW,MB in the delay slot
; moves %sp back down by 128 and reloads r3 from there.
709 $Z0 |
|
710 LDW -124(%sp),%r4 |
|
711 BVE (%r2) |
|
712 .EXIT |
|
713 LDW,MB -128(%sp),%r3 |
|
714 .PROCEND |
|
715 ; .ALLOW |
|
716 |
|
; Export the two entry points for the selected word order.
; GNU-as (as of 2.19) does not support LONG_RETURN, so that attribute is
; left off under __GNUC__ for both word orders, not only little-wordian.
	.SPACE	$TEXT$
	.SUBSPA	$CODE$
#ifdef LITTLE_WORDIAN
#ifdef __GNUC__
	.EXPORT	maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
	.EXPORT	add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR
#else
	.EXPORT	maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN
	.EXPORT	add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN
#endif
#else
#ifdef __GNUC__
	.EXPORT	maxpy_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
	.EXPORT	add_diag_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR
#else
	.EXPORT	maxpy_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN
	.EXPORT	add_diag_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN
#endif
#endif
	.END
|
733 |
|
734 |
|
735 ; How to use "maxpy_PA20_little" and "maxpy_PA20_big" (exported by this file as "maxpy_little" and "maxpy_big") |
|
736 ; |
|
737 ; The routine "maxpy_PA20_little" or "maxpy_PA20_big" |
|
738 ; performs a 64-bit x any-size multiply, and adds the |
|
739 ; result to an area of memory. That is, it performs |
|
740 ; something like |
|
741 ; |
|
742 ;         A B C D |
|
743 ;       *       Z |
|
744 ;      __________ |
|
745 ;       P Q R S T |
|
746 ; |
|
747 ; and then adds the "PQRST" vector into an area of memory, |
|
748 ; handling all carries. |
|
749 ; |
|
750 ; Digression on nomenclature and endian-ness: |
|
751 ; |
|
752 ; Each of the capital letters in the above represents a 64-bit |
|
753 ; quantity. That is, you could think of the discussion as |
|
754 ; being in terms of radix-2^64 (roughly 18 quintillion) arithmetic. The data |
|
755 ; type being manipulated is "unsigned long long int". This |
|
756 ; requires the 64-bit extension of the HP-UX C compiler, |
|
757 ; available at release 10. You need these compiler flags to |
|
758 ; enable these extensions: |
|
759 ; |
|
760 ; -Aa +e +DA2.0 +DS2.0 |
|
761 ; |
|
762 ; (The first specifies ANSI C, the second enables the |
|
763 ; extensions, which are beyond ANSI C, and the third and |
|
764 ; fourth tell the compiler to use whatever features of the |
|
765 ; PA2.0 architecture it wishes, in order to make the code more |
|
766 ; efficient. Since the presence of the assembly code will |
|
767 ; make the program unable to run on anything less than PA2.0, |
|
768 ; you might as well gain the performance enhancements in the C |
|
769 ; code as well.) |
|
770 ; |
|
771 ; Questions of "endian-ness" often come up, usually in the |
|
772 ; context of byte ordering in a word. These routines have a |
|
773 ; similar issue, that could be called "wordian-ness". |
|
774 ; Independent of byte ordering (PA is always big-endian), one |
|
775 ; can make two choices when representing extremely large |
|
776 ; numbers as arrays of 64-bit doublewords in memory. |
|
777 ; |
|
778 ; "Little-wordian" layout means that the least significant |
|
779 ; word of a number is stored at the lowest address. |
|
780 ; |
|
781 ;      MSW         LSW |
|
782 ;       |           | |
|
783 ;       V           V |
|
784 ; |
|
785 ;       A  B  C  D  E |
|
786 ; |
|
787 ;       ^        ^  ^ |
|
788 ;       |        |  |____ address 0 |
|
789 ;       |        | |
|
790 ;       |        |_______address 8 |
|
791 ;       | |
|
792 ;   address 32 |
|
793 ; |
|
794 ; "Big-wordian" means that the most significant word is at the |
|
795 ; lowest address. |
|
796 ; |
|
797 ;      MSW         LSW |
|
798 ;       |           | |
|
799 ;       V           V |
|
800 ; |
|
801 ;       A  B  C  D  E |
|
802 ; |
|
803 ;       ^        ^  ^ |
|
804 ;       |        |  |____ address 32 |
|
805 ;       |        | |
|
806 ;       |        |_______address 24 |
|
807 ;       | |
|
808 ;   address 0 |
|
809 ; |
|
810 ; When you compile the file, you must specify one or the other, with |
|
811 ; a switch "-DLITTLE_WORDIAN" or "-DBIG_WORDIAN". |
|
812 ; |
|
813 ; Incidentally, you assemble this file as part of your |
|
814 ; project with the same C compiler as the rest of the program. |
|
815 ; My "makefile" for a superprecision arithmetic package has |
|
816 ; the following stuff: |
|
817 ; |
|
818 ; # definitions: |
|
819 ; CC = cc -Aa +e -z +DA2.0 +DS2.0 +w1 |
|
820 ; CFLAGS = +O3 |
|
821 ; LDFLAGS = -L /usr/lib -Wl,-aarchive |
|
822 ; |
|
823 ; # general build rule for ".s" files: |
|
824 ; .s.o: |
|
825 ; $(CC) $(CFLAGS) -c $< -DBIG_WORDIAN |
|
826 ; |
|
827 ; # Now any bind step that calls for pa20.o will assemble pa20.s |
|
828 ; |
|
829 ; End of digression, back to arithmetic: |
|
830 ; |
|
831 ; The way we multiply two huge numbers is, of course, to multiply |
|
832 ; the "ABCD" vector by each of the "WXYZ" doublewords, adding |
|
833 ; the result vectors with increasing offsets, the way we learned |
|
834 ; in school, back before we all used calculators: |
|
835 ; |
|
836 ;               A B C D |
|
837 ;             * W X Y Z |
|
838 ;            __________ |
|
839 ;             P Q R S T |
|
840 ;           E F G H I |
|
841 ;         M N O P Q |
|
842 ;     + R S T U V |
|
843 ;       _______________ |
|
844 ;       F I N A L S U M |
|
845 ; |
|
846 ; So we call maxpy_PA20_big (in my case; my package is |
|
847 ; big-wordian) repeatedly, giving the W, X, Y, and Z arguments |
|
848 ; in turn as the "scalar", and giving the "ABCD" vector each |
|
849 ; time. We direct it to add its result into an area of memory |
|
850 ; that we have cleared at the start. We skew the exact |
|
851 ; location into that area with each call. |
|
852 ; |
|
853 ; The prototype for the function is |
|
854 ; |
|
855 ; extern void maxpy_PA20_big( |
|
856 ; int length, /* Number of doublewords in the multiplicand vector. */ |
|
857 ; const long long int *scalaraddr, /* Address to fetch the scalar. */ |
|
858 ; const long long int *multiplicand, /* The multiplicand vector. */ |
|
859 ; long long int *result); /* Where to accumulate the result. */ |
|
860 ; |
|
861 ; (You should place a copy of this prototype in an include file |
|
862 ; or in your C file.) |
|
863 ; |
|
864 ; Now, IN ALL CASES, the given address for the multiplicand or |
|
865 ; the result is that of the LEAST SIGNIFICANT DOUBLEWORD. |
|
866 ; That word is, of course, the word at which the routine |
|
867 ; starts processing. "maxpy_PA20_little" then increases the |
|
868 ; addresses as it computes. "maxpy_PA20_big" decreases them. |
|
869 ; |
|
870 ; In our example above, "length" would be 4 in each case. |
|
871 ; "multiplicand" would be the "ABCD" vector. Specifically, |
|
872 ; the address of the element "D". "scalaraddr" would be the |
|
873 ; address of "W", "X", "Y", or "Z" on the four calls that we |
|
874 ; would make. (The order doesn't matter, of course.) |
|
875 ; "result" would be the appropriate address in the result |
|
876 ; area. When multiplying by "Z", that would be the least |
|
877 ; significant word. When multiplying by "Y", it would be the |
|
878 ; next higher word (8 bytes higher if little-wordian; 8 bytes |
|
879 ; lower if big-wordian), and so on. The size of the result |
|
880 ; area must be the sum of the sizes of the multiplicand |
|
881 ; and multiplier vectors, and must be initialized to zero |
|
882 ; before we start. |
|
883 ; |
|
884 ; Whenever the routine adds its partial product into the result |
|
885 ; vector, it follows carry chains as far as they need to go. |
|
886 ; |
|
887 ; Here is the super-precision multiply routine that I use for |
|
888 ; my package. The package is big-wordian. I have taken out |
|
889 ; handling of exponents (it's a floating point package): |
|
890 ; |
|
891 ; static void mul_PA20( |
|
892 ; int size, |
|
893 ; const long long int *arg1, |
|
894 ; const long long int *arg2, |
|
895 ; long long int *result) |
|
896 ; { |
|
897 ; int i; |
|
898 ; |
|
899 ; for (i=0 ; i<2*size ; i++) result[i] = 0ULL; |
|
900 ; |
|
901 ; for (i=0 ; i<size ; i++) { |
|
902 ; maxpy_PA20_big(size, &arg2[i], &arg1[size-1], &result[size+i]); |
|
903 ; } |
|
904 ; } |