1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/security/nss/lib/freebl/mpi/hppa20.s Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,904 @@ 1.4 +; This Source Code Form is subject to the terms of the Mozilla Public 1.5 +; License, v. 2.0. If a copy of the MPL was not distributed with this 1.6 +; file, You can obtain one at http://mozilla.org/MPL/2.0/. 1.7 + 1.8 +#ifdef __LP64__ 1.9 + .LEVEL 2.0W 1.10 +#else 1.11 +; .LEVEL 1.1 1.12 +; .ALLOW 2.0N 1.13 + .LEVEL 2.0 1.14 +#endif 1.15 + .SPACE $TEXT$,SORT=8 1.16 + .SUBSPA $CODE$,QUAD=0,ALIGN=4,ACCESS=0x2c,CODE_ONLY,SORT=24 1.17 + 1.18 +; *************************************************************** 1.19 +; 1.20 +; maxpy_[little/big] 1.21 +; 1.22 +; *************************************************************** 1.23 + 1.24 +; There is no default -- you must specify one or the other. (This copy of the file hard-codes LITTLE_WORDIAN on the next line.) The EIGHT/SIXTEEN/THIRTY_TWO offsets defined below step toward more significant doublewords; the UN_* offsets step back toward less significant ones. 1.25 +#define LITTLE_WORDIAN 1 1.26 + 1.27 +#ifdef LITTLE_WORDIAN 1.28 +#define EIGHT 8 1.29 +#define SIXTEEN 16 1.30 +#define THIRTY_TWO 32 1.31 +#define UN_EIGHT -8 1.32 +#define UN_SIXTEEN -16 1.33 +#define UN_TWENTY_FOUR -24 1.34 +#endif 1.35 + 1.36 +#ifdef BIG_WORDIAN 1.37 +#define EIGHT -8 1.38 +#define SIXTEEN -16 1.39 +#define THIRTY_TWO -32 1.40 +#define UN_EIGHT 8 1.41 +#define UN_SIXTEEN 16 1.42 +#define UN_TWENTY_FOUR 24 1.43 +#endif 1.44 + 1.45 +; This performs a multiple-precision integer version of "daxpy", 1.46 +; using the selected addressing direction. "Little-wordian" means that 1.47 +; the least significant word of a number is stored at the lowest address. 1.48 +; "Big-wordian" means that the most significant word is at the lowest 1.49 +; address. Either way, the incoming address of the vector is that 1.50 +; of the least significant word. That means that, for little-wordian 1.51 +; addressing, we move the address upward as we propagate carries 1.52 +; from the least significant word to the most significant. For 1.53 +; big-wordian we move the address downward. 
1.54 + 1.55 +; We use the following registers: 1.56 +; 1.57 +; r2 return PC, of course 1.58 +; r26 = arg1 = length 1.59 +; r25 = arg2 = address of scalar 1.60 +; r24 = arg3 = multiplicand vector 1.61 +; r23 = arg4 = result vector 1.62 +; 1.63 +; fr9 = scalar loaded once only from r25; r3 and r4 are saved in the frame on entry and restored at $L0 1.64 + 1.65 +; The cycle counts shown in the bodies below are simply the result of a 1.66 +; scheduling by hand. The actual PCX-U hardware does it differently. 1.67 +; The intention is that the overall speed is the same. 1.68 + 1.69 +; The pipeline startup and shutdown code is constructed in the usual way, 1.70 +; by taking the loop bodies and removing unnecessary instructions. 1.71 +; We have left the comments describing cycle numbers in the code. 1.72 +; These are intended for reference when comparing with the main loop, 1.73 +; and have no particular relationship to actual cycle numbers. 1.74 + 1.75 +#ifdef LITTLE_WORDIAN 1.76 +maxpy_little 1.77 +#else 1.78 +maxpy_big 1.79 +#endif 1.80 + .PROC 1.81 + .CALLINFO FRAME=120,ENTRY_GR=4 1.82 + .ENTRY 1.83 + STW,MA %r3,128(%sp) 1.84 + STW %r4,-124(%sp) 1.85 + 1.86 + ADDIB,< -1,%r26,$L0 ; r26 = N-1; exit immediately if that is negative (N = 0). 
1.87 + FLDD 0(%r25),%fr9 ; fr9 = scalar 1.88 + 1.89 +; First startup 1.90 + 1.91 + FLDD 0(%r24),%fr24 ; Cycle 1 1.92 + XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3 1.93 + XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4 1.94 + XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5 1.95 + CMPIB,> 3,%r26,$N_IS_SMALL ; Pick out cases N = 1, 2, or 3 1.96 + XMPYU %fr9L,%fr24R,%fr24 ; Cycle 6 1.97 + FLDD EIGHT(%r24),%fr28 ; Cycle 8 1.98 + XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10 1.99 + FSTD %fr24,-96(%sp) 1.100 + XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11 1.101 + FSTD %fr25,-80(%sp) 1.102 + LDO SIXTEEN(%r24),%r24 ; Cycle 12 1.103 + FSTD %fr31,-64(%sp) 1.104 + XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13 1.105 + FSTD %fr27,-48(%sp) 1.106 + 1.107 +; Second startup 1.108 + 1.109 + XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1 1.110 + FSTD %fr30,-56(%sp) 1.111 + FLDD 0(%r24),%fr24 1.112 + 1.113 + FSTD %fr26,-88(%sp) ; Cycle 2 1.114 + 1.115 + XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3 1.116 + FSTD %fr28,-104(%sp) 1.117 + 1.118 + XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4 1.119 + LDD -96(%sp),%r3 1.120 + FSTD %fr29,-72(%sp) 1.121 + 1.122 + XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5 1.123 + LDD -64(%sp),%r19 1.124 + LDD -80(%sp),%r21 1.125 + 1.126 + XMPYU %fr9L,%fr24R,%fr24 ; Cycle 6 1.127 + LDD -56(%sp),%r20 1.128 + ADD %r21,%r3,%r3 1.129 + 1.130 + ADD,DC %r20,%r19,%r19 ; Cycle 7 1.131 + LDD -88(%sp),%r4 1.132 + SHRPD %r3,%r0,32,%r21 1.133 + LDD -48(%sp),%r1 1.134 + 1.135 + FLDD EIGHT(%r24),%fr28 ; Cycle 8 1.136 + LDD -104(%sp),%r31 1.137 + ADD,DC %r0,%r0,%r20 1.138 + SHRPD %r19,%r3,32,%r3 1.139 + 1.140 + LDD -72(%sp),%r29 ; Cycle 9 1.141 + SHRPD %r20,%r19,32,%r20 1.142 + ADD %r21,%r1,%r1 1.143 + 1.144 + XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10 1.145 + ADD,DC %r3,%r4,%r4 1.146 + FSTD %fr24,-96(%sp) 1.147 + 1.148 + XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11 1.149 + ADD,DC %r0,%r20,%r20 1.150 + LDD 0(%r23),%r3 1.151 + FSTD %fr25,-80(%sp) 1.152 + 1.153 + LDO SIXTEEN(%r24),%r24 ; Cycle 12 1.154 + FSTD %fr31,-64(%sp) 1.155 + 1.156 + XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13 1.157 + ADD %r0,%r0,%r0 
; clear the carry bit 1.158 + ADDIB,<= -4,%r26,$ENDLOOP ; actually happens in cycle 12 1.159 + FSTD %fr27,-48(%sp) 1.160 +; MFCTL %cr16,%r21 ; for timing 1.161 +; STD %r21,-112(%sp) 1.162 + 1.163 +; Here is the loop. 1.164 + 1.165 +$LOOP XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1 1.166 + ADD,DC %r29,%r4,%r4 1.167 + FSTD %fr30,-56(%sp) 1.168 + FLDD 0(%r24),%fr24 1.169 + 1.170 + LDO SIXTEEN(%r23),%r23 ; Cycle 2 1.171 + ADD,DC %r0,%r20,%r20 1.172 + FSTD %fr26,-88(%sp) 1.173 + 1.174 + XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3 1.175 + ADD %r3,%r1,%r1 1.176 + FSTD %fr28,-104(%sp) 1.177 + LDD UN_EIGHT(%r23),%r21 1.178 + 1.179 + XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4 1.180 + ADD,DC %r21,%r4,%r28 1.181 + FSTD %fr29,-72(%sp) 1.182 + LDD -96(%sp),%r3 1.183 + 1.184 + XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5 1.185 + ADD,DC %r20,%r31,%r22 1.186 + LDD -64(%sp),%r19 1.187 + LDD -80(%sp),%r21 1.188 + 1.189 + XMPYU %fr9L,%fr24R,%fr24 ; Cycle 6 1.190 + ADD %r21,%r3,%r3 1.191 + LDD -56(%sp),%r20 1.192 + STD %r1,UN_SIXTEEN(%r23) 1.193 + 1.194 + ADD,DC %r20,%r19,%r19 ; Cycle 7 1.195 + SHRPD %r3,%r0,32,%r21 1.196 + LDD -88(%sp),%r4 1.197 + LDD -48(%sp),%r1 1.198 + 1.199 + ADD,DC %r0,%r0,%r20 ; Cycle 8 1.200 + SHRPD %r19,%r3,32,%r3 1.201 + FLDD EIGHT(%r24),%fr28 1.202 + LDD -104(%sp),%r31 1.203 + 1.204 + SHRPD %r20,%r19,32,%r20 ; Cycle 9 1.205 + ADD %r21,%r1,%r1 1.206 + STD %r28,UN_EIGHT(%r23) 1.207 + LDD -72(%sp),%r29 1.208 + 1.209 + XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10 1.210 + ADD,DC %r3,%r4,%r4 1.211 + FSTD %fr24,-96(%sp) 1.212 + 1.213 + XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11 1.214 + ADD,DC %r0,%r20,%r20 1.215 + FSTD %fr25,-80(%sp) 1.216 + LDD 0(%r23),%r3 1.217 + 1.218 + LDO SIXTEEN(%r24),%r24 ; Cycle 12 1.219 + FSTD %fr31,-64(%sp) 1.220 + 1.221 + XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13 1.222 + ADD %r22,%r1,%r1 1.223 + ADDIB,> -2,%r26,$LOOP ; actually happens in cycle 12 1.224 + FSTD %fr27,-48(%sp) 1.225 + 1.226 +$ENDLOOP 1.227 + 1.228 +; Shutdown code, first stage. 
1.229 + 1.230 +; MFCTL %cr16,%r21 ; for timing 1.231 +; STD %r21,UN_SIXTEEN(%r23) 1.232 +; LDD -112(%sp),%r21 1.233 +; STD %r21,UN_EIGHT(%r23) 1.234 + 1.235 + XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1 1.236 + ADD,DC %r29,%r4,%r4 1.237 + CMPIB,= 0,%r26,$ONEMORE 1.238 + FSTD %fr30,-56(%sp) 1.239 + 1.240 + LDO SIXTEEN(%r23),%r23 ; Cycle 2 1.241 + ADD,DC %r0,%r20,%r20 1.242 + FSTD %fr26,-88(%sp) 1.243 + 1.244 + ADD %r3,%r1,%r1 ; Cycle 3 1.245 + FSTD %fr28,-104(%sp) 1.246 + LDD UN_EIGHT(%r23),%r21 1.247 + 1.248 + ADD,DC %r21,%r4,%r28 ; Cycle 4 1.249 + FSTD %fr29,-72(%sp) 1.250 + STD %r28,UN_EIGHT(%r23) ; moved up from cycle 9 1.251 + LDD -96(%sp),%r3 1.252 + 1.253 + ADD,DC %r20,%r31,%r22 ; Cycle 5 1.254 + STD %r1,UN_SIXTEEN(%r23) 1.255 +$JOIN4 1.256 + LDD -64(%sp),%r19 1.257 + LDD -80(%sp),%r21 1.258 + 1.259 + ADD %r21,%r3,%r3 ; Cycle 6 1.260 + LDD -56(%sp),%r20 1.261 + 1.262 + ADD,DC %r20,%r19,%r19 ; Cycle 7 1.263 + SHRPD %r3,%r0,32,%r21 1.264 + LDD -88(%sp),%r4 1.265 + LDD -48(%sp),%r1 1.266 + 1.267 + ADD,DC %r0,%r0,%r20 ; Cycle 8 1.268 + SHRPD %r19,%r3,32,%r3 1.269 + LDD -104(%sp),%r31 1.270 + 1.271 + SHRPD %r20,%r19,32,%r20 ; Cycle 9 1.272 + ADD %r21,%r1,%r1 1.273 + LDD -72(%sp),%r29 1.274 + 1.275 + ADD,DC %r3,%r4,%r4 ; Cycle 10 1.276 + 1.277 + ADD,DC %r0,%r20,%r20 ; Cycle 11 1.278 + LDD 0(%r23),%r3 1.279 + 1.280 + ADD %r22,%r1,%r1 ; Cycle 13 1.281 + 1.282 +; Shutdown code, second stage. 1.283 + 1.284 + ADD,DC %r29,%r4,%r4 ; Cycle 1 1.285 + 1.286 + LDO SIXTEEN(%r23),%r23 ; Cycle 2 1.287 + ADD,DC %r0,%r20,%r20 1.288 + 1.289 + LDD UN_EIGHT(%r23),%r21 ; Cycle 3 1.290 + ADD %r3,%r1,%r1 1.291 + 1.292 + ADD,DC %r21,%r4,%r28 ; Cycle 4 1.293 + 1.294 + ADD,DC %r20,%r31,%r22 ; Cycle 5 1.295 + 1.296 + STD %r1,UN_SIXTEEN(%r23); Cycle 6 1.297 + 1.298 + STD %r28,UN_EIGHT(%r23) ; Cycle 9 1.299 + 1.300 + LDD 0(%r23),%r3 ; Cycle 11 1.301 + 1.302 +; Shutdown code, third stage. 
1.303 + 1.304 + LDO SIXTEEN(%r23),%r23 1.305 + ADD %r3,%r22,%r1 1.306 +$JOIN1 ADD,DC %r0,%r0,%r21 1.307 + CMPIB,*= 0,%r21,$L0 ; if no overflow, exit 1.308 + STD %r1,UN_SIXTEEN(%r23) 1.309 + 1.310 +; Final carry propagation 1.311 + 1.312 +$FINAL1 LDO EIGHT(%r23),%r23 1.313 + LDD UN_SIXTEEN(%r23),%r21 1.314 + ADDI 1,%r21,%r21 1.315 + CMPIB,*= 0,%r21,$FINAL1 ; Keep looping if there is a carry. 1.316 + STD %r21,UN_SIXTEEN(%r23) 1.317 + B $L0 1.318 + NOP 1.319 + 1.320 +; Here is the code that handles the difficult cases N=1, N=2, and N=3. 1.321 +; We do the usual trick -- branch out of the startup code at appropriate 1.322 +; points, and branch into the shutdown code. 1.323 + 1.324 +$N_IS_SMALL 1.325 + CMPIB,= 0,%r26,$N_IS_ONE 1.326 + FSTD %fr24,-96(%sp) ; Cycle 10 1.327 + FLDD EIGHT(%r24),%fr28 ; Cycle 8 1.328 + XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10 1.329 + XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11 1.330 + FSTD %fr25,-80(%sp) 1.331 + FSTD %fr31,-64(%sp) ; Cycle 12 1.332 + XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13 1.333 + FSTD %fr27,-48(%sp) 1.334 + XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1 1.335 + CMPIB,= 2,%r26,$N_IS_THREE 1.336 + FSTD %fr30,-56(%sp) 1.337 + 1.338 +; N = 2 1.339 + FSTD %fr26,-88(%sp) ; Cycle 2 1.340 + FSTD %fr28,-104(%sp) ; Cycle 3 1.341 + LDD -96(%sp),%r3 ; Cycle 4 1.342 + FSTD %fr29,-72(%sp) 1.343 + B $JOIN4 1.344 + ADD %r0,%r0,%r22 1.345 + 1.346 +$N_IS_THREE 1.347 + FLDD SIXTEEN(%r24),%fr24 1.348 + FSTD %fr26,-88(%sp) ; Cycle 2 1.349 + XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3 1.350 + FSTD %fr28,-104(%sp) 1.351 + XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4 1.352 + LDD -96(%sp),%r3 1.353 + FSTD %fr29,-72(%sp) 1.354 + XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5 1.355 + LDD -64(%sp),%r19 1.356 + LDD -80(%sp),%r21 1.357 + B $JOIN3 1.358 + ADD %r0,%r0,%r22 1.359 + 1.360 +$N_IS_ONE 1.361 + FSTD %fr25,-80(%sp) 1.362 + FSTD %fr27,-48(%sp) 1.363 + FSTD %fr26,-88(%sp) ; Cycle 2 1.364 + B $JOIN5 1.365 + ADD %r0,%r0,%r22 1.366 + 1.367 +; We came out of the unrolled loop with wrong parity. 
Do one more 1.368 +; single cycle. This is quite tricky, because of the way the 1.369 +; carry chains and SHRPD chains have been chopped up. 1.370 + 1.371 +$ONEMORE 1.372 + 1.373 + FLDD 0(%r24),%fr24 1.374 + 1.375 + LDO SIXTEEN(%r23),%r23 ; Cycle 2 1.376 + ADD,DC %r0,%r20,%r20 1.377 + FSTD %fr26,-88(%sp) 1.378 + 1.379 + XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3 1.380 + FSTD %fr28,-104(%sp) 1.381 + LDD UN_EIGHT(%r23),%r21 1.382 + ADD %r3,%r1,%r1 1.383 + 1.384 + XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4 1.385 + ADD,DC %r21,%r4,%r28 1.386 + STD %r28,UN_EIGHT(%r23) ; moved from cycle 9 1.387 + LDD -96(%sp),%r3 1.388 + FSTD %fr29,-72(%sp) 1.389 + 1.390 + XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5 1.391 + ADD,DC %r20,%r31,%r22 1.392 + LDD -64(%sp),%r19 1.393 + LDD -80(%sp),%r21 1.394 + 1.395 + STD %r1,UN_SIXTEEN(%r23); Cycle 6 1.396 +$JOIN3 1.397 + XMPYU %fr9L,%fr24R,%fr24 1.398 + LDD -56(%sp),%r20 1.399 + ADD %r21,%r3,%r3 1.400 + 1.401 + ADD,DC %r20,%r19,%r19 ; Cycle 7 1.402 + LDD -88(%sp),%r4 1.403 + SHRPD %r3,%r0,32,%r21 1.404 + LDD -48(%sp),%r1 1.405 + 1.406 + LDD -104(%sp),%r31 ; Cycle 8 1.407 + ADD,DC %r0,%r0,%r20 1.408 + SHRPD %r19,%r3,32,%r3 1.409 + 1.410 + LDD -72(%sp),%r29 ; Cycle 9 1.411 + SHRPD %r20,%r19,32,%r20 1.412 + ADD %r21,%r1,%r1 1.413 + 1.414 + ADD,DC %r3,%r4,%r4 ; Cycle 10 1.415 + FSTD %fr24,-96(%sp) 1.416 + 1.417 + ADD,DC %r0,%r20,%r20 ; Cycle 11 1.418 + LDD 0(%r23),%r3 1.419 + FSTD %fr25,-80(%sp) 1.420 + 1.421 + ADD %r22,%r1,%r1 ; Cycle 13 1.422 + FSTD %fr27,-48(%sp) 1.423 + 1.424 +; Shutdown code, stage 1-1/2. 
1.425 + 1.426 + ADD,DC %r29,%r4,%r4 ; Cycle 1 1.427 + 1.428 + LDO SIXTEEN(%r23),%r23 ; Cycle 2 1.429 + ADD,DC %r0,%r20,%r20 1.430 + FSTD %fr26,-88(%sp) 1.431 + 1.432 + LDD UN_EIGHT(%r23),%r21 ; Cycle 3 1.433 + ADD %r3,%r1,%r1 1.434 + 1.435 + ADD,DC %r21,%r4,%r28 ; Cycle 4 1.436 + STD %r28,UN_EIGHT(%r23) ; moved from cycle 9 1.437 + 1.438 + ADD,DC %r20,%r31,%r22 ; Cycle 5 1.439 + STD %r1,UN_SIXTEEN(%r23) 1.440 +$JOIN5 1.441 + LDD -96(%sp),%r3 ; moved from cycle 4 1.442 + LDD -80(%sp),%r21 1.443 + ADD %r21,%r3,%r3 ; Cycle 6 1.444 + ADD,DC %r0,%r0,%r19 ; Cycle 7 1.445 + LDD -88(%sp),%r4 1.446 + SHRPD %r3,%r0,32,%r21 1.447 + LDD -48(%sp),%r1 1.448 + SHRPD %r19,%r3,32,%r3 ; Cycle 8 1.449 + ADD %r21,%r1,%r1 ; Cycle 9 1.450 + ADD,DC %r3,%r4,%r4 ; Cycle 10 1.451 + LDD 0(%r23),%r3 ; Cycle 11 1.452 + ADD %r22,%r1,%r1 ; Cycle 13 1.453 + 1.454 +; Shutdown code, stage 2-1/2. 1.455 + 1.456 + ADD,DC %r0,%r4,%r4 ; Cycle 1 1.457 + LDO SIXTEEN(%r23),%r23 ; Cycle 2 1.458 + LDD UN_EIGHT(%r23),%r21 ; Cycle 3 1.459 + ADD %r3,%r1,%r1 1.460 + STD %r1,UN_SIXTEEN(%r23) 1.461 + ADD,DC %r21,%r4,%r1 1.462 + B $JOIN1 1.463 + LDO EIGHT(%r23),%r23 1.464 + 1.465 +; exit 1.466 + 1.467 +$L0 1.468 + LDW -124(%sp),%r4 1.469 + BVE (%r2) 1.470 + .EXIT 1.471 + LDW,MB -128(%sp),%r3 1.472 + 1.473 + .PROCEND 1.474 + 1.475 +; *************************************************************** 1.476 +; 1.477 +; add_diag_[little/big] 1.478 +; 1.479 +; *************************************************************** 1.480 + 1.481 +; Squares each doubleword of the input vector (XMPYU of the word's halves with themselves) and adds the 128-bit square into the result vector, two result doublewords per input doubleword, then propagates any final carry. 
The arguments are as follows: 1.482 +; r2 return PC, of course 1.483 +; r26 = arg1 = length 1.484 +; r25 = arg2 = vector to square 1.485 +; r24 = arg3 = result vector 1.486 + 1.487 +#ifdef LITTLE_WORDIAN 1.488 +add_diag_little 1.489 +#else 1.490 +add_diag_big 1.491 +#endif 1.492 + .PROC 1.493 + .CALLINFO FRAME=120,ENTRY_GR=4 1.494 + .ENTRY 1.495 + STW,MA %r3,128(%sp) 1.496 + STW %r4,-124(%sp) 1.497 + 1.498 + ADDIB,< -1,%r26,$Z0 ; r26 = N-1; exit immediately if that is negative (N=0). 
1.499 + NOP 1.500 + 1.501 +; Startup code 1.502 + 1.503 + FLDD 0(%r25),%fr7 ; Cycle 2 (alternate body) 1.504 + XMPYU %fr7R,%fr7R,%fr29 ; Cycle 4 1.505 + XMPYU %fr7L,%fr7R,%fr27 ; Cycle 5 1.506 + XMPYU %fr7L,%fr7L,%fr30 1.507 + LDO SIXTEEN(%r25),%r25 ; Cycle 6 1.508 + FSTD %fr29,-88(%sp) 1.509 + FSTD %fr27,-72(%sp) ; Cycle 7 1.510 + CMPIB,= 0,%r26,$DIAG_N_IS_ONE ; Cycle 1 (main body) 1.511 + FSTD %fr30,-96(%sp) 1.512 + FLDD UN_EIGHT(%r25),%fr7 ; Cycle 2 1.513 + LDD -88(%sp),%r22 ; Cycle 3 1.514 + LDD -72(%sp),%r31 ; Cycle 4 1.515 + XMPYU %fr7R,%fr7R,%fr28 1.516 + XMPYU %fr7L,%fr7R,%fr24 ; Cycle 5 1.517 + XMPYU %fr7L,%fr7L,%fr31 1.518 + LDD -96(%sp),%r20 ; Cycle 6 1.519 + FSTD %fr28,-80(%sp) 1.520 + ADD %r0,%r0,%r0 ; clear the carry bit 1.521 + ADDIB,<= -2,%r26,$ENDDIAGLOOP ; Cycle 7 1.522 + FSTD %fr24,-64(%sp) 1.523 + 1.524 +; Here is the loop. It is unrolled twice, modelled after the "alternate body" and then the "main body". 1.525 + 1.526 +$DIAGLOOP 1.527 + SHRPD %r31,%r0,31,%r3 ; Cycle 1 (alternate body) 1.528 + LDO SIXTEEN(%r25),%r25 1.529 + LDD 0(%r24),%r1 1.530 + FSTD %fr31,-104(%sp) 1.531 + SHRPD %r0,%r31,31,%r4 ; Cycle 2 1.532 + ADD,DC %r22,%r3,%r3 1.533 + FLDD UN_SIXTEEN(%r25),%fr7 1.534 + ADD,DC %r0,%r20,%r20 ; Cycle 3 1.535 + ADD %r1,%r3,%r3 1.536 + XMPYU %fr7R,%fr7R,%fr29 ; Cycle 4 1.537 + LDD -80(%sp),%r21 1.538 + STD %r3,0(%r24) 1.539 + XMPYU %fr7L,%fr7R,%fr27 ; Cycle 5 1.540 + XMPYU %fr7L,%fr7L,%fr30 1.541 + LDD -64(%sp),%r29 1.542 + LDD EIGHT(%r24),%r1 1.543 + ADD,DC %r4,%r20,%r20 ; Cycle 6 1.544 + LDD -104(%sp),%r19 1.545 + FSTD %fr29,-88(%sp) 1.546 + ADD %r20,%r1,%r1 ; Cycle 7 1.547 + FSTD %fr27,-72(%sp) 1.548 + SHRPD %r29,%r0,31,%r4 ; Cycle 1 (main body) 1.549 + LDO THIRTY_TWO(%r24),%r24 1.550 + LDD UN_SIXTEEN(%r24),%r28 1.551 + FSTD %fr30,-96(%sp) 1.552 + SHRPD %r0,%r29,31,%r3 ; Cycle 2 1.553 + ADD,DC %r21,%r4,%r4 1.554 + FLDD UN_EIGHT(%r25),%fr7 1.555 + STD %r1,UN_TWENTY_FOUR(%r24) 1.556 + ADD,DC %r0,%r19,%r19 ; Cycle 3 1.557 + ADD %r28,%r4,%r4 
1.558 + XMPYU %fr7R,%fr7R,%fr28 ; Cycle 4 1.559 + LDD -88(%sp),%r22 1.560 + STD %r4,UN_SIXTEEN(%r24) 1.561 + XMPYU %fr7L,%fr7R,%fr24 ; Cycle 5 1.562 + XMPYU %fr7L,%fr7L,%fr31 1.563 + LDD -72(%sp),%r31 1.564 + LDD UN_EIGHT(%r24),%r28 1.565 + ADD,DC %r3,%r19,%r19 ; Cycle 6 1.566 + LDD -96(%sp),%r20 1.567 + FSTD %fr28,-80(%sp) 1.568 + ADD %r19,%r28,%r28 ; Cycle 7 1.569 + FSTD %fr24,-64(%sp) 1.570 + ADDIB,> -2,%r26,$DIAGLOOP ; Cycle 8 1.571 + STD %r28,UN_EIGHT(%r24) 1.572 + 1.573 +$ENDDIAGLOOP 1.574 + 1.575 + ADD,DC %r0,%r22,%r22 1.576 + CMPIB,= 0,%r26,$ONEMOREDIAG 1.577 + SHRPD %r31,%r0,31,%r3 1.578 + 1.579 +; Shutdown code, first stage. 1.580 + 1.581 + FSTD %fr31,-104(%sp) ; Cycle 1 (alternate body) 1.582 + LDD 0(%r24),%r28 1.583 + SHRPD %r0,%r31,31,%r4 ; Cycle 2 1.584 + ADD %r3,%r22,%r3 1.585 + ADD,DC %r0,%r20,%r20 ; Cycle 3 1.586 + LDD -80(%sp),%r21 1.587 + ADD %r3,%r28,%r3 1.588 + LDD -64(%sp),%r29 ; Cycle 4 1.589 + STD %r3,0(%r24) 1.590 + LDD EIGHT(%r24),%r1 ; Cycle 5 1.591 + LDO SIXTEEN(%r25),%r25 ; Cycle 6 1.592 + LDD -104(%sp),%r19 1.593 + ADD,DC %r4,%r20,%r20 1.594 + ADD %r20,%r1,%r1 ; Cycle 7 1.595 + ADD,DC %r0,%r21,%r21 ; Cycle 8 1.596 + STD %r1,EIGHT(%r24) 1.597 + 1.598 +; Shutdown code, second stage. 1.599 + 1.600 + SHRPD %r29,%r0,31,%r4 ; Cycle 1 (main body) 1.601 + LDO THIRTY_TWO(%r24),%r24 1.602 + LDD UN_SIXTEEN(%r24),%r1 1.603 + SHRPD %r0,%r29,31,%r3 ; Cycle 2 1.604 + ADD %r4,%r21,%r4 1.605 + ADD,DC %r0,%r19,%r19 ; Cycle 3 1.606 + ADD %r4,%r1,%r4 1.607 + STD %r4,UN_SIXTEEN(%r24); Cycle 4 1.608 + LDD UN_EIGHT(%r24),%r28 ; Cycle 5 1.609 + ADD,DC %r3,%r19,%r19 ; Cycle 6 1.610 + ADD %r19,%r28,%r28 ; Cycle 7 1.611 + ADD,DC %r0,%r0,%r22 ; Cycle 8 1.612 + CMPIB,*= 0,%r22,$Z0 ; if no overflow, exit 1.613 + STD %r28,UN_EIGHT(%r24) 1.614 + 1.615 +; Final carry propagation 1.616 + 1.617 +$FDIAG2 1.618 + LDO EIGHT(%r24),%r24 1.619 + LDD UN_EIGHT(%r24),%r26 1.620 + ADDI 1,%r26,%r26 1.621 + CMPIB,*= 0,%r26,$FDIAG2 ; Keep looping if there is a carry. 
1.622 + STD %r26,UN_EIGHT(%r24) 1.623 + 1.624 + B $Z0 1.625 + NOP 1.626 + 1.627 +; Here is the code that handles the difficult case N=1. 1.628 +; We do the usual trick -- branch out of the startup code at appropriate 1.629 +; points, and branch into the shutdown code. 1.630 + 1.631 +$DIAG_N_IS_ONE 1.632 + 1.633 + LDD -88(%sp),%r22 1.634 + LDD -72(%sp),%r31 1.635 + B $JOINDIAG 1.636 + LDD -96(%sp),%r20 1.637 + 1.638 +; We came out of the unrolled loop with wrong parity. Do one more 1.639 +; single cycle. This is the "alternate body". It will, of course, 1.640 +; give us opposite registers from the other case, so we need 1.641 +; completely different shutdown code. 1.642 + 1.643 +$ONEMOREDIAG 1.644 + FSTD %fr31,-104(%sp) ; Cycle 1 (alternate body) 1.645 + LDD 0(%r24),%r28 1.646 + FLDD 0(%r25),%fr7 ; Cycle 2 1.647 + SHRPD %r0,%r31,31,%r4 1.648 + ADD %r3,%r22,%r3 1.649 + ADD,DC %r0,%r20,%r20 ; Cycle 3 1.650 + LDD -80(%sp),%r21 1.651 + ADD %r3,%r28,%r3 1.652 + LDD -64(%sp),%r29 ; Cycle 4 1.653 + STD %r3,0(%r24) 1.654 + XMPYU %fr7R,%fr7R,%fr29 1.655 + LDD EIGHT(%r24),%r1 ; Cycle 5 1.656 + XMPYU %fr7L,%fr7R,%fr27 1.657 + XMPYU %fr7L,%fr7L,%fr30 1.658 + LDD -104(%sp),%r19 ; Cycle 6 1.659 + FSTD %fr29,-88(%sp) 1.660 + ADD,DC %r4,%r20,%r20 1.661 + FSTD %fr27,-72(%sp) ; Cycle 7 1.662 + ADD %r20,%r1,%r1 1.663 + ADD,DC %r0,%r21,%r21 ; Cycle 8 1.664 + STD %r1,EIGHT(%r24) 1.665 + 1.666 +; Shutdown code, first stage. 
1.667 + 1.668 + SHRPD %r29,%r0,31,%r4 ; Cycle 1 (main body) 1.669 + LDO THIRTY_TWO(%r24),%r24 1.670 + FSTD %fr30,-96(%sp) 1.671 + LDD UN_SIXTEEN(%r24),%r1 1.672 + SHRPD %r0,%r29,31,%r3 ; Cycle 2 1.673 + ADD %r4,%r21,%r4 1.674 + ADD,DC %r0,%r19,%r19 ; Cycle 3 1.675 + LDD -88(%sp),%r22 1.676 + ADD %r4,%r1,%r4 1.677 + LDD -72(%sp),%r31 ; Cycle 4 1.678 + STD %r4,UN_SIXTEEN(%r24) 1.679 + LDD UN_EIGHT(%r24),%r28 ; Cycle 5 1.680 + LDD -96(%sp),%r20 ; Cycle 6 1.681 + ADD,DC %r3,%r19,%r19 1.682 + ADD %r19,%r28,%r28 ; Cycle 7 1.683 + ADD,DC %r0,%r22,%r22 ; Cycle 8 1.684 + STD %r28,UN_EIGHT(%r24) 1.685 + 1.686 +; Shutdown code, second stage. 1.687 + 1.688 +$JOINDIAG 1.689 + SHRPD %r31,%r0,31,%r3 ; Cycle 1 (alternate body) 1.690 + LDD 0(%r24),%r28 1.691 + SHRPD %r0,%r31,31,%r4 ; Cycle 2 1.692 + ADD %r3,%r22,%r3 1.693 + ADD,DC %r0,%r20,%r20 ; Cycle 3 1.694 + ADD %r3,%r28,%r3 1.695 + STD %r3,0(%r24) ; Cycle 4 1.696 + LDD EIGHT(%r24),%r1 ; Cycle 5 1.697 + ADD,DC %r4,%r20,%r20 1.698 + ADD %r20,%r1,%r1 ; Cycle 7 1.699 + ADD,DC %r0,%r0,%r21 ; Cycle 8 1.700 + CMPIB,*= 0,%r21,$Z0 ; if no overflow, exit 1.701 + STD %r1,EIGHT(%r24) 1.702 + 1.703 +; Final carry propagation 1.704 + 1.705 +$FDIAG1 1.706 + LDO EIGHT(%r24),%r24 1.707 + LDD EIGHT(%r24),%r26 1.708 + ADDI 1,%r26,%r26 1.709 + CMPIB,*= 0,%r26,$FDIAG1 ; Keep looping if there is a carry. 
1.710 + STD %r26,EIGHT(%r24) 1.711 + 1.712 +$Z0 1.713 + LDW -124(%sp),%r4 1.714 + BVE (%r2) 1.715 + .EXIT 1.716 + LDW,MB -128(%sp),%r3 1.717 + .PROCEND 1.718 +; .ALLOW 1.719 + 1.720 + .SPACE $TEXT$ 1.721 + .SUBSPA $CODE$ 1.722 +#ifdef LITTLE_WORDIAN 1.723 +#ifdef __GNUC__ 1.724 +; GNU-as (as of 2.19) does not support LONG_RETURN 1.725 + .EXPORT maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR 1.726 + .EXPORT add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR 1.727 +#else 1.728 + .EXPORT maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN 1.729 + .EXPORT add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN 1.730 +#endif 1.731 +#else 1.732 + .EXPORT maxpy_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN 1.733 + .EXPORT add_diag_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN 1.734 +#endif 1.735 + .END 1.736 + 1.737 + 1.738 +; How to use "maxpy_PA20_little" and "maxpy_PA20_big" (exported by this file as "maxpy_little" or "maxpy_big") 1.739 +; 1.740 +; The routine "maxpy_PA20_little" or "maxpy_PA20_big" 1.741 +; performs a 64-bit x any-size multiply, and adds the 1.742 +; result to an area of memory. That is, it performs 1.743 +; something like 1.744 +; 1.745 +; A B C D 1.746 +; * Z 1.747 +; __________ 1.748 +; P Q R S T 1.749 +; 1.750 +; and then adds the "PQRST" vector into an area of memory, 1.751 +; handling all carries. 1.752 +; 1.753 +; Digression on nomenclature and endian-ness: 1.754 +; 1.755 +; Each of the capital letters in the above represents a 64-bit 1.756 +; quantity. That is, you could think of the discussion as 1.757 +; being in terms of radix-16-quintillion arithmetic. The data 1.758 +; type being manipulated is "unsigned long long int". This 1.759 +; requires the 64-bit extension of the HP-UX C compiler, 1.760 +; available at release 10. 
You need these compiler flags to 1.761 +; enable these extensions: 1.762 +; 1.763 +; -Aa +e +DA2.0 +DS2.0 1.764 +; 1.765 +; (The first specifies ANSI C, the second enables the 1.766 +; extensions, which are beyond ANSI C, and the third and 1.767 +; fourth tell the compiler to use whatever features of the 1.768 +; PA2.0 architecture it wishes, in order to make the code more 1.769 +; efficient. Since the presence of the assembly code will 1.770 +; make the program unable to run on anything less than PA2.0, 1.771 +; you might as well gain the performance enhancements in the C 1.772 +; code as well.) 1.773 +; 1.774 +; Questions of "endian-ness" often come up, usually in the 1.775 +; context of byte ordering in a word. These routines have a 1.776 +; similar issue, that could be called "wordian-ness". 1.777 +; Independent of byte ordering (PA is always big-endian), one 1.778 +; can make two choices when representing extremely large 1.779 +; numbers as arrays of 64-bit doublewords in memory. 1.780 +; 1.781 +; "Little-wordian" layout means that the least significant 1.782 +; word of a number is stored at the lowest address. 1.783 +; 1.784 +; MSW LSW 1.785 +; | | 1.786 +; V V 1.787 +; 1.788 +; A B C D E 1.789 +; 1.790 +; ^ ^ ^ 1.791 +; | | |____ address 0 1.792 +; | | 1.793 +; | |_______address 8 1.794 +; | 1.795 +; address 32 1.796 +; 1.797 +; "Big-wordian" means that the most significant word is at the 1.798 +; lowest address. 1.799 +; 1.800 +; MSW LSW 1.801 +; | | 1.802 +; V V 1.803 +; 1.804 +; A B C D E 1.805 +; 1.806 +; ^ ^ ^ 1.807 +; | | |____ address 32 1.808 +; | | 1.809 +; | |_______address 24 1.810 +; | 1.811 +; address 0 1.812 +; 1.813 +; When you compile the file, you must specify one or the other, with 1.814 +; a switch "-DLITTLE_WORDIAN" or "-DBIG_WORDIAN". 1.815 +; 1.816 +; Incidentally, you assemble this file as part of your 1.817 +; project with the same C compiler as the rest of the program. 
1.818 +; My "makefile" for a superprecision arithmetic package has 1.819 +; the following stuff: 1.820 +; 1.821 +; # definitions: 1.822 +; CC = cc -Aa +e -z +DA2.0 +DS2.0 +w1 1.823 +; CFLAGS = +O3 1.824 +; LDFLAGS = -L /usr/lib -Wl,-aarchive 1.825 +; 1.826 +; # general build rule for ".s" files: 1.827 +; .s.o: 1.828 +; $(CC) $(CFLAGS) -c $< -DBIG_WORDIAN 1.829 +; 1.830 +; # Now any bind step that calls for pa20.o will assemble pa20.s 1.831 +; 1.832 +; End of digression, back to arithmetic: 1.833 +; 1.834 +; The way we multiply two huge numbers is, of course, to multiply 1.835 +; the "ABCD" vector by each of the "WXYZ" doublewords, adding 1.836 +; the result vectors with increasing offsets, the way we learned 1.837 +; in school, back before we all used calculators: 1.838 +; 1.839 +; A B C D 1.840 +; * W X Y Z 1.841 +; __________ 1.842 +; P Q R S T 1.843 +; E F G H I 1.844 +; M N O P Q 1.845 +; + R S T U V 1.846 +; _______________ 1.847 +; F I N A L S U M 1.848 +; 1.849 +; So we call maxpy_PA20_big (in my case; my package is 1.850 +; big-wordian) repeatedly, giving the W, X, Y, and Z arguments 1.851 +; in turn as the "scalar", and giving the "ABCD" vector each 1.852 +; time. We direct it to add its result into an area of memory 1.853 +; that we have cleared at the start. We skew the exact 1.854 +; location into that area with each call. 1.855 +; 1.856 +; The prototype for the function is 1.857 +; 1.858 +; extern void maxpy_PA20_big( 1.859 +; int length, /* Number of doublewords in the multiplicand vector. */ 1.860 +; const long long int *scalaraddr, /* Address to fetch the scalar. */ 1.861 +; const long long int *multiplicand, /* The multiplicand vector. */ 1.862 +; long long int *result); /* Where to accumulate the result. */ 1.863 +; 1.864 +; (You should place a copy of this prototype in an include file 1.865 +; or in your C file.) 
1.866 +; 1.867 +; Now, IN ALL CASES, the given address for the multiplicand or 1.868 +; the result is that of the LEAST SIGNIFICANT DOUBLEWORD. 1.869 +; That word is, of course, the word at which the routine 1.870 +; starts processing. "maxpy_PA20_little" then increases the 1.871 +; addresses as it computes. "maxpy_PA20_big" decreases them. 1.872 +; 1.873 +; In our example above, "length" would be 4 in each case. 1.874 +; "multiplicand" would be the "ABCD" vector. Specifically, 1.875 +; the address of the element "D". "scalaraddr" would be the 1.876 +; address of "W", "X", "Y", or "Z" on the four calls that we 1.877 +; would make. (The order doesn't matter, of course.) 1.878 +; "result" would be the appropriate address in the result 1.879 +; area. When multiplying by "Z", that would be the least 1.880 +; significant word. When multiplying by "Y", it would be the 1.881 +; next higher word (8 bytes higher if little-wordian; 8 bytes 1.882 +; lower if big-wordian), and so on. The size of the result 1.883 +; area must be the sum of the sizes of the multiplicand 1.884 +; and multiplier vectors, and must be initialized to zero 1.885 +; before we start. 1.886 +; 1.887 +; Whenever the routine adds its partial product into the result 1.888 +; vector, it follows carry chains as far as they need to go. 1.889 +; 1.890 +; Here is the super-precision multiply routine that I use for 1.891 +; my package. The package is big-wordian. I have taken out 1.892 +; handling of exponents (it's a floating point package): 1.893 +; 1.894 +; static void mul_PA20( 1.895 +; int size, 1.896 +; const long long int *arg1, 1.897 +; const long long int *arg2, 1.898 +; long long int *result) 1.899 +; { 1.900 +; int i; 1.901 +; 1.902 +; for (i=0 ; i<2*size ; i++) result[i] = 0ULL; 1.903 +; 1.904 +; for (i=0 ; i<size ; i++) { 1.905 +; maxpy_PA20_big(size, &arg2[i], &arg1[size-1], &result[size+i]); 1.906 +; } 1.907 +; }