security/nss/lib/freebl/mpi/hppa20.s

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 ; This Source Code Form is subject to the terms of the Mozilla Public
michael@0 2 ; License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 3 ; file, You can obtain one at http://mozilla.org/MPL/2.0/.
michael@0 4
michael@0 5 #ifdef __LP64__
michael@0 6 .LEVEL 2.0W
michael@0 7 #else
michael@0 8 ; .LEVEL 1.1
michael@0 9 ; .ALLOW 2.0N
michael@0 10 .LEVEL 2.0
michael@0 11 #endif
michael@0 12 .SPACE $TEXT$,SORT=8
michael@0 13 .SUBSPA $CODE$,QUAD=0,ALIGN=4,ACCESS=0x2c,CODE_ONLY,SORT=24
michael@0 14
michael@0 15 ; ***************************************************************
michael@0 16 ;
michael@0 17 ; maxpy_[little/big]
michael@0 18 ;
michael@0 19 ; ***************************************************************
michael@0 20
michael@0 21 ; Exactly one of LITTLE_WORDIAN or BIG_WORDIAN must be defined; this file selects the little-wordian variant on the next line.
michael@0 22 #define LITTLE_WORDIAN 1
michael@0 23 ; The names below are byte offsets used to step through the vectors: positive for little-wordian, negated for big-wordian. The UN_ names carry the opposite sign.
michael@0 24 #ifdef LITTLE_WORDIAN
michael@0 25 #define EIGHT 8
michael@0 26 #define SIXTEEN 16
michael@0 27 #define THIRTY_TWO 32
michael@0 28 #define UN_EIGHT -8
michael@0 29 #define UN_SIXTEEN -16
michael@0 30 #define UN_TWENTY_FOUR -24
michael@0 31 #endif
michael@0 32
michael@0 33 #ifdef BIG_WORDIAN
michael@0 34 #define EIGHT -8
michael@0 35 #define SIXTEEN -16
michael@0 36 #define THIRTY_TWO -32
michael@0 37 #define UN_EIGHT 8
michael@0 38 #define UN_SIXTEEN 16
michael@0 39 #define UN_TWENTY_FOUR 24
michael@0 40 #endif
michael@0 41
michael@0 42 ; This performs a multiple-precision integer version of "daxpy",
michael@0 43 ; Using the selected addressing direction. "Little-wordian" means that
michael@0 44 ; the least significant word of a number is stored at the lowest address.
michael@0 45 ; "Big-wordian" means that the most significant word is at the lowest
michael@0 46 ; address. Either way, the incoming address of the vector is that
michael@0 47 ; of the least significant word. That means that, for little-wordian
michael@0 48 ; addressing, we move the address upward as we propagate carries
michael@0 49 ; from the least significant word to the most significant. For
michael@0 50 ; big-wordian we move the address downward.
michael@0 51
michael@0 52 ; We use the following registers:
michael@0 53 ;
michael@0 54 ; r2 return PC, of course
michael@0 55 ; r26 = arg1 = length
michael@0 56 ; r25 = arg2 = address of scalar
michael@0 57 ; r24 = arg3 = multiplicand vector
michael@0 58 ; r23 = arg4 = result vector
michael@0 59 ;
michael@0 60 ; fr9 = scalar loaded once only from r25
michael@0 61
michael@0 62 ; The cycle counts shown in the bodies below are simply the result of a
michael@0 63 ; scheduling by hand. The actual PCX-U hardware does it differently.
michael@0 64 ; The intention is that the overall speed is the same.
michael@0 65
michael@0 66 ; The pipeline startup and shutdown code is constructed in the usual way,
michael@0 67 ; by taking the loop bodies and removing unnecessary instructions.
michael@0 68 ; We have left the comments describing cycle numbers in the code.
michael@0 69 ; These are intended for reference when comparing with the main loop,
michael@0 70 ; and have no particular relationship to actual cycle numbers.
michael@0 71
michael@0 72 #ifdef LITTLE_WORDIAN
michael@0 73 maxpy_little
michael@0 74 #else
michael@0 75 maxpy_big
michael@0 76 #endif
michael@0 77 .PROC
michael@0 78 .CALLINFO FRAME=120,ENTRY_GR=4
michael@0 79 .ENTRY
michael@0 80 STW,MA %r3,128(%sp) ; save r3, then advance sp by 128 (undone by the LDW,MB at exit)
michael@0 81 STW %r4,-124(%sp) ; save r4 (reloaded at $L0)
michael@0 82
michael@0 83 ADDIB,< -1,%r26,$L0 ; r26 = N-1. If N = 0, exit immediately.
michael@0 84 FLDD 0(%r25),%fr9 ; fr9 = scalar (branch delay slot)
michael@0 85
michael@0 86 ; First startup
michael@0 87
michael@0 88 FLDD 0(%r24),%fr24 ; Cycle 1
michael@0 89 XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3
michael@0 90 XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4
michael@0 91 XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5
michael@0 92 CMPIB,> 3,%r26,$N_IS_SMALL ; Pick out cases N = 1, 2, or 3 (r26 holds N-1 here)
michael@0 93 XMPYU %fr9L,%fr24R,%fr24 ; Cycle 6
michael@0 94 FLDD EIGHT(%r24),%fr28 ; Cycle 8
michael@0 95 XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10
michael@0 96 FSTD %fr24,-96(%sp)
michael@0 97 XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11
michael@0 98 FSTD %fr25,-80(%sp)
michael@0 99 LDO SIXTEEN(%r24),%r24 ; Cycle 12
michael@0 100 FSTD %fr31,-64(%sp)
michael@0 101 XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13
michael@0 102 FSTD %fr27,-48(%sp)
michael@0 103
michael@0 104 ; Second startup
michael@0 105
michael@0 106 XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1
michael@0 107 FSTD %fr30,-56(%sp)
michael@0 108 FLDD 0(%r24),%fr24
michael@0 109
michael@0 110 FSTD %fr26,-88(%sp) ; Cycle 2
michael@0 111
michael@0 112 XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3
michael@0 113 FSTD %fr28,-104(%sp)
michael@0 114
michael@0 115 XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4
michael@0 116 LDD -96(%sp),%r3
michael@0 117 FSTD %fr29,-72(%sp)
michael@0 118
michael@0 119 XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5
michael@0 120 LDD -64(%sp),%r19
michael@0 121 LDD -80(%sp),%r21
michael@0 122
michael@0 123 XMPYU %fr9L,%fr24R,%fr24 ; Cycle 6
michael@0 124 LDD -56(%sp),%r20
michael@0 125 ADD %r21,%r3,%r3
michael@0 126
michael@0 127 ADD,DC %r20,%r19,%r19 ; Cycle 7
michael@0 128 LDD -88(%sp),%r4
michael@0 129 SHRPD %r3,%r0,32,%r21
michael@0 130 LDD -48(%sp),%r1
michael@0 131
michael@0 132 FLDD EIGHT(%r24),%fr28 ; Cycle 8
michael@0 133 LDD -104(%sp),%r31
michael@0 134 ADD,DC %r0,%r0,%r20
michael@0 135 SHRPD %r19,%r3,32,%r3
michael@0 136
michael@0 137 LDD -72(%sp),%r29 ; Cycle 9
michael@0 138 SHRPD %r20,%r19,32,%r20
michael@0 139 ADD %r21,%r1,%r1
michael@0 140
michael@0 141 XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10
michael@0 142 ADD,DC %r3,%r4,%r4
michael@0 143 FSTD %fr24,-96(%sp)
michael@0 144
michael@0 145 XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11
michael@0 146 ADD,DC %r0,%r20,%r20
michael@0 147 LDD 0(%r23),%r3
michael@0 148 FSTD %fr25,-80(%sp)
michael@0 149
michael@0 150 LDO SIXTEEN(%r24),%r24 ; Cycle 12
michael@0 151 FSTD %fr31,-64(%sp)
michael@0 152
michael@0 153 XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13
michael@0 154 ADD %r0,%r0,%r0 ; clear the carry bit
michael@0 155 ADDIB,<= -4,%r26,$ENDLOOP ; r26 -= 4; actually happens in cycle 12
michael@0 156 FSTD %fr27,-48(%sp)
michael@0 157 ; MFCTL %cr16,%r21 ; for timing
michael@0 158 ; STD %r21,-112(%sp)
michael@0 159
michael@0 160 ; Here is the loop.
michael@0 161
michael@0 162 $LOOP XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1
michael@0 163 ADD,DC %r29,%r4,%r4
michael@0 164 FSTD %fr30,-56(%sp)
michael@0 165 FLDD 0(%r24),%fr24
michael@0 166
michael@0 167 LDO SIXTEEN(%r23),%r23 ; Cycle 2
michael@0 168 ADD,DC %r0,%r20,%r20
michael@0 169 FSTD %fr26,-88(%sp)
michael@0 170
michael@0 171 XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3
michael@0 172 ADD %r3,%r1,%r1
michael@0 173 FSTD %fr28,-104(%sp)
michael@0 174 LDD UN_EIGHT(%r23),%r21
michael@0 175
michael@0 176 XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4
michael@0 177 ADD,DC %r21,%r4,%r28
michael@0 178 FSTD %fr29,-72(%sp)
michael@0 179 LDD -96(%sp),%r3
michael@0 180
michael@0 181 XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5
michael@0 182 ADD,DC %r20,%r31,%r22
michael@0 183 LDD -64(%sp),%r19
michael@0 184 LDD -80(%sp),%r21
michael@0 185
michael@0 186 XMPYU %fr9L,%fr24R,%fr24 ; Cycle 6
michael@0 187 ADD %r21,%r3,%r3
michael@0 188 LDD -56(%sp),%r20
michael@0 189 STD %r1,UN_SIXTEEN(%r23)
michael@0 190
michael@0 191 ADD,DC %r20,%r19,%r19 ; Cycle 7
michael@0 192 SHRPD %r3,%r0,32,%r21
michael@0 193 LDD -88(%sp),%r4
michael@0 194 LDD -48(%sp),%r1
michael@0 195
michael@0 196 ADD,DC %r0,%r0,%r20 ; Cycle 8
michael@0 197 SHRPD %r19,%r3,32,%r3
michael@0 198 FLDD EIGHT(%r24),%fr28
michael@0 199 LDD -104(%sp),%r31
michael@0 200
michael@0 201 SHRPD %r20,%r19,32,%r20 ; Cycle 9
michael@0 202 ADD %r21,%r1,%r1
michael@0 203 STD %r28,UN_EIGHT(%r23)
michael@0 204 LDD -72(%sp),%r29
michael@0 205
michael@0 206 XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10
michael@0 207 ADD,DC %r3,%r4,%r4
michael@0 208 FSTD %fr24,-96(%sp)
michael@0 209
michael@0 210 XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11
michael@0 211 ADD,DC %r0,%r20,%r20
michael@0 212 FSTD %fr25,-80(%sp)
michael@0 213 LDD 0(%r23),%r3
michael@0 214
michael@0 215 LDO SIXTEEN(%r24),%r24 ; Cycle 12
michael@0 216 FSTD %fr31,-64(%sp)
michael@0 217
michael@0 218 XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13
michael@0 219 ADD %r22,%r1,%r1
michael@0 220 ADDIB,> -2,%r26,$LOOP ; r26 -= 2; actually happens in cycle 12
michael@0 221 FSTD %fr27,-48(%sp)
michael@0 222
michael@0 223 $ENDLOOP
michael@0 224
michael@0 225 ; Shutdown code, first stage.
michael@0 226
michael@0 227 ; MFCTL %cr16,%r21 ; for timing
michael@0 228 ; STD %r21,UN_SIXTEEN(%r23)
michael@0 229 ; LDD -112(%sp),%r21
michael@0 230 ; STD %r21,UN_EIGHT(%r23)
michael@0 231
michael@0 232 XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1
michael@0 233 ADD,DC %r29,%r4,%r4
michael@0 234 CMPIB,= 0,%r26,$ONEMORE ; r26 = 0 means wrong parity: do one more single cycle
michael@0 235 FSTD %fr30,-56(%sp)
michael@0 236
michael@0 237 LDO SIXTEEN(%r23),%r23 ; Cycle 2
michael@0 238 ADD,DC %r0,%r20,%r20
michael@0 239 FSTD %fr26,-88(%sp)
michael@0 240
michael@0 241 ADD %r3,%r1,%r1 ; Cycle 3
michael@0 242 FSTD %fr28,-104(%sp)
michael@0 243 LDD UN_EIGHT(%r23),%r21
michael@0 244
michael@0 245 ADD,DC %r21,%r4,%r28 ; Cycle 4
michael@0 246 FSTD %fr29,-72(%sp)
michael@0 247 STD %r28,UN_EIGHT(%r23) ; moved up from cycle 9
michael@0 248 LDD -96(%sp),%r3
michael@0 249
michael@0 250 ADD,DC %r20,%r31,%r22 ; Cycle 5
michael@0 251 STD %r1,UN_SIXTEEN(%r23)
michael@0 252 $JOIN4
michael@0 253 LDD -64(%sp),%r19
michael@0 254 LDD -80(%sp),%r21
michael@0 255
michael@0 256 ADD %r21,%r3,%r3 ; Cycle 6
michael@0 257 LDD -56(%sp),%r20
michael@0 258
michael@0 259 ADD,DC %r20,%r19,%r19 ; Cycle 7
michael@0 260 SHRPD %r3,%r0,32,%r21
michael@0 261 LDD -88(%sp),%r4
michael@0 262 LDD -48(%sp),%r1
michael@0 263
michael@0 264 ADD,DC %r0,%r0,%r20 ; Cycle 8
michael@0 265 SHRPD %r19,%r3,32,%r3
michael@0 266 LDD -104(%sp),%r31
michael@0 267
michael@0 268 SHRPD %r20,%r19,32,%r20 ; Cycle 9
michael@0 269 ADD %r21,%r1,%r1
michael@0 270 LDD -72(%sp),%r29
michael@0 271
michael@0 272 ADD,DC %r3,%r4,%r4 ; Cycle 10
michael@0 273
michael@0 274 ADD,DC %r0,%r20,%r20 ; Cycle 11
michael@0 275 LDD 0(%r23),%r3
michael@0 276
michael@0 277 ADD %r22,%r1,%r1 ; Cycle 13
michael@0 278
michael@0 279 ; Shutdown code, second stage.
michael@0 280
michael@0 281 ADD,DC %r29,%r4,%r4 ; Cycle 1
michael@0 282
michael@0 283 LDO SIXTEEN(%r23),%r23 ; Cycle 2
michael@0 284 ADD,DC %r0,%r20,%r20
michael@0 285
michael@0 286 LDD UN_EIGHT(%r23),%r21 ; Cycle 3
michael@0 287 ADD %r3,%r1,%r1
michael@0 288
michael@0 289 ADD,DC %r21,%r4,%r28 ; Cycle 4
michael@0 290
michael@0 291 ADD,DC %r20,%r31,%r22 ; Cycle 5
michael@0 292
michael@0 293 STD %r1,UN_SIXTEEN(%r23); Cycle 6
michael@0 294
michael@0 295 STD %r28,UN_EIGHT(%r23) ; Cycle 9
michael@0 296
michael@0 297 LDD 0(%r23),%r3 ; Cycle 11
michael@0 298
michael@0 299 ; Shutdown code, third stage.
michael@0 300
michael@0 301 LDO SIXTEEN(%r23),%r23
michael@0 302 ADD %r3,%r22,%r1
michael@0 303 $JOIN1 ADD,DC %r0,%r0,%r21 ; r21 = carry out of the preceding add
michael@0 304 CMPIB,*= 0,%r21,$L0 ; if no overflow, exit
michael@0 305 STD %r1,UN_SIXTEEN(%r23) ; store the last sum (branch delay slot)
michael@0 306
michael@0 307 ; Final carry propagation
michael@0 308
michael@0 309 $FINAL1 LDO EIGHT(%r23),%r23
michael@0 310 LDD UN_SIXTEEN(%r23),%r21
michael@0 311 ADDI 1,%r21,%r21 ; add the carry into the next result doubleword
michael@0 312 CMPIB,*= 0,%r21,$FINAL1 ; Keep looping if there is a carry.
michael@0 313 STD %r21,UN_SIXTEEN(%r23)
michael@0 314 B $L0
michael@0 315 NOP
michael@0 316
michael@0 317 ; Here is the code that handles the difficult cases N=1, N=2, and N=3.
michael@0 318 ; We do the usual trick -- branch out of the startup code at appropriate
michael@0 319 ; points, and branch into the shutdown code.
michael@0 320
michael@0 321 $N_IS_SMALL
michael@0 322 CMPIB,= 0,%r26,$N_IS_ONE
michael@0 323 FSTD %fr24,-96(%sp) ; Cycle 10
michael@0 324 FLDD EIGHT(%r24),%fr28 ; Cycle 8
michael@0 325 XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10
michael@0 326 XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11
michael@0 327 FSTD %fr25,-80(%sp)
michael@0 328 FSTD %fr31,-64(%sp) ; Cycle 12
michael@0 329 XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13
michael@0 330 FSTD %fr27,-48(%sp)
michael@0 331 XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1
michael@0 332 CMPIB,= 2,%r26,$N_IS_THREE
michael@0 333 FSTD %fr30,-56(%sp)
michael@0 334
michael@0 335 ; N = 2
michael@0 336 FSTD %fr26,-88(%sp) ; Cycle 2
michael@0 337 FSTD %fr28,-104(%sp) ; Cycle 3
michael@0 338 LDD -96(%sp),%r3 ; Cycle 4
michael@0 339 FSTD %fr29,-72(%sp)
michael@0 340 B $JOIN4
michael@0 341 ADD %r0,%r0,%r22
michael@0 342
michael@0 343 $N_IS_THREE
michael@0 344 FLDD SIXTEEN(%r24),%fr24
michael@0 345 FSTD %fr26,-88(%sp) ; Cycle 2
michael@0 346 XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3
michael@0 347 FSTD %fr28,-104(%sp)
michael@0 348 XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4
michael@0 349 LDD -96(%sp),%r3
michael@0 350 FSTD %fr29,-72(%sp)
michael@0 351 XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5
michael@0 352 LDD -64(%sp),%r19
michael@0 353 LDD -80(%sp),%r21
michael@0 354 B $JOIN3
michael@0 355 ADD %r0,%r0,%r22
michael@0 356
michael@0 357 $N_IS_ONE
michael@0 358 FSTD %fr25,-80(%sp)
michael@0 359 FSTD %fr27,-48(%sp)
michael@0 360 FSTD %fr26,-88(%sp) ; Cycle 2
michael@0 361 B $JOIN5
michael@0 362 ADD %r0,%r0,%r22
michael@0 363
michael@0 364 ; We came out of the unrolled loop with wrong parity. Do one more
michael@0 365 ; single cycle. This is quite tricky, because of the way the
michael@0 366 ; carry chains and SHRPD chains have been chopped up.
michael@0 367
michael@0 368 $ONEMORE
michael@0 369
michael@0 370 FLDD 0(%r24),%fr24
michael@0 371
michael@0 372 LDO SIXTEEN(%r23),%r23 ; Cycle 2
michael@0 373 ADD,DC %r0,%r20,%r20
michael@0 374 FSTD %fr26,-88(%sp)
michael@0 375
michael@0 376 XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3
michael@0 377 FSTD %fr28,-104(%sp)
michael@0 378 LDD UN_EIGHT(%r23),%r21
michael@0 379 ADD %r3,%r1,%r1
michael@0 380
michael@0 381 XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4
michael@0 382 ADD,DC %r21,%r4,%r28
michael@0 383 STD %r28,UN_EIGHT(%r23) ; moved from cycle 9
michael@0 384 LDD -96(%sp),%r3
michael@0 385 FSTD %fr29,-72(%sp)
michael@0 386
michael@0 387 XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5
michael@0 388 ADD,DC %r20,%r31,%r22
michael@0 389 LDD -64(%sp),%r19
michael@0 390 LDD -80(%sp),%r21
michael@0 391
michael@0 392 STD %r1,UN_SIXTEEN(%r23); Cycle 6
michael@0 393 $JOIN3
michael@0 394 XMPYU %fr9L,%fr24R,%fr24
michael@0 395 LDD -56(%sp),%r20
michael@0 396 ADD %r21,%r3,%r3
michael@0 397
michael@0 398 ADD,DC %r20,%r19,%r19 ; Cycle 7
michael@0 399 LDD -88(%sp),%r4
michael@0 400 SHRPD %r3,%r0,32,%r21
michael@0 401 LDD -48(%sp),%r1
michael@0 402
michael@0 403 LDD -104(%sp),%r31 ; Cycle 8
michael@0 404 ADD,DC %r0,%r0,%r20
michael@0 405 SHRPD %r19,%r3,32,%r3
michael@0 406
michael@0 407 LDD -72(%sp),%r29 ; Cycle 9
michael@0 408 SHRPD %r20,%r19,32,%r20
michael@0 409 ADD %r21,%r1,%r1
michael@0 410
michael@0 411 ADD,DC %r3,%r4,%r4 ; Cycle 10
michael@0 412 FSTD %fr24,-96(%sp)
michael@0 413
michael@0 414 ADD,DC %r0,%r20,%r20 ; Cycle 11
michael@0 415 LDD 0(%r23),%r3
michael@0 416 FSTD %fr25,-80(%sp)
michael@0 417
michael@0 418 ADD %r22,%r1,%r1 ; Cycle 13
michael@0 419 FSTD %fr27,-48(%sp)
michael@0 420
michael@0 421 ; Shutdown code, stage 1-1/2.
michael@0 422
michael@0 423 ADD,DC %r29,%r4,%r4 ; Cycle 1
michael@0 424
michael@0 425 LDO SIXTEEN(%r23),%r23 ; Cycle 2
michael@0 426 ADD,DC %r0,%r20,%r20
michael@0 427 FSTD %fr26,-88(%sp)
michael@0 428
michael@0 429 LDD UN_EIGHT(%r23),%r21 ; Cycle 3
michael@0 430 ADD %r3,%r1,%r1
michael@0 431
michael@0 432 ADD,DC %r21,%r4,%r28 ; Cycle 4
michael@0 433 STD %r28,UN_EIGHT(%r23) ; moved from cycle 9
michael@0 434
michael@0 435 ADD,DC %r20,%r31,%r22 ; Cycle 5
michael@0 436 STD %r1,UN_SIXTEEN(%r23)
michael@0 437 $JOIN5
michael@0 438 LDD -96(%sp),%r3 ; moved from cycle 4
michael@0 439 LDD -80(%sp),%r21
michael@0 440 ADD %r21,%r3,%r3 ; Cycle 6
michael@0 441 ADD,DC %r0,%r0,%r19 ; Cycle 7
michael@0 442 LDD -88(%sp),%r4
michael@0 443 SHRPD %r3,%r0,32,%r21
michael@0 444 LDD -48(%sp),%r1
michael@0 445 SHRPD %r19,%r3,32,%r3 ; Cycle 8
michael@0 446 ADD %r21,%r1,%r1 ; Cycle 9
michael@0 447 ADD,DC %r3,%r4,%r4 ; Cycle 10
michael@0 448 LDD 0(%r23),%r3 ; Cycle 11
michael@0 449 ADD %r22,%r1,%r1 ; Cycle 13
michael@0 450
michael@0 451 ; Shutdown code, stage 2-1/2.
michael@0 452
michael@0 453 ADD,DC %r0,%r4,%r4 ; Cycle 1
michael@0 454 LDO SIXTEEN(%r23),%r23 ; Cycle 2
michael@0 455 LDD UN_EIGHT(%r23),%r21 ; Cycle 3
michael@0 456 ADD %r3,%r1,%r1
michael@0 457 STD %r1,UN_SIXTEEN(%r23)
michael@0 458 ADD,DC %r21,%r4,%r1
michael@0 459 B $JOIN1
michael@0 460 LDO EIGHT(%r23),%r23
michael@0 461
michael@0 462 ; exit
michael@0 463
michael@0 464 $L0
michael@0 465 LDW -124(%sp),%r4 ; restore r4 saved at entry
michael@0 466 BVE (%r2) ; return through r2
michael@0 467 .EXIT
michael@0 468 LDW,MB -128(%sp),%r3 ; move sp back by 128, then restore r3 (branch delay slot)
michael@0 469
michael@0 470 .PROCEND
michael@0 471
michael@0 472 ; ***************************************************************
michael@0 473 ;
michael@0 474 ; add_diag_[little/big]
michael@0 475 ;
michael@0 476 ; ***************************************************************
michael@0 477
michael@0 478 ; The arguments are as follows:
michael@0 479 ; r2 return PC, of course
michael@0 480 ; r26 = arg1 = length
michael@0 481 ; r25 = arg2 = vector to square
michael@0 482 ; r24 = arg3 = result vector
michael@0 483
michael@0 484 #ifdef LITTLE_WORDIAN
michael@0 485 add_diag_little
michael@0 486 #else
michael@0 487 add_diag_big
michael@0 488 #endif
michael@0 489 .PROC
michael@0 490 .CALLINFO FRAME=120,ENTRY_GR=4
michael@0 491 .ENTRY
michael@0 492 STW,MA %r3,128(%sp) ; save r3, then advance sp by 128 (undone by the LDW,MB at exit)
michael@0 493 STW %r4,-124(%sp) ; save r4 (reloaded at $Z0)
michael@0 494
michael@0 495 ADDIB,< -1,%r26,$Z0 ; r26 = N-1. If N=0, exit immediately.
michael@0 496 NOP
michael@0 497
michael@0 498 ; Startup code
michael@0 499
michael@0 500 FLDD 0(%r25),%fr7 ; Cycle 2 (alternate body)
michael@0 501 XMPYU %fr7R,%fr7R,%fr29 ; Cycle 4
michael@0 502 XMPYU %fr7L,%fr7R,%fr27 ; Cycle 5
michael@0 503 XMPYU %fr7L,%fr7L,%fr30
michael@0 504 LDO SIXTEEN(%r25),%r25 ; Cycle 6
michael@0 505 FSTD %fr29,-88(%sp)
michael@0 506 FSTD %fr27,-72(%sp) ; Cycle 7
michael@0 507 CMPIB,= 0,%r26,$DIAG_N_IS_ONE ; Cycle 1 (main body); r26 = 0 here means N = 1
michael@0 508 FSTD %fr30,-96(%sp)
michael@0 509 FLDD UN_EIGHT(%r25),%fr7 ; Cycle 2
michael@0 510 LDD -88(%sp),%r22 ; Cycle 3
michael@0 511 LDD -72(%sp),%r31 ; Cycle 4
michael@0 512 XMPYU %fr7R,%fr7R,%fr28
michael@0 513 XMPYU %fr7L,%fr7R,%fr24 ; Cycle 5
michael@0 514 XMPYU %fr7L,%fr7L,%fr31
michael@0 515 LDD -96(%sp),%r20 ; Cycle 6
michael@0 516 FSTD %fr28,-80(%sp)
michael@0 517 ADD %r0,%r0,%r0 ; clear the carry bit
michael@0 518 ADDIB,<= -2,%r26,$ENDDIAGLOOP ; Cycle 7; r26 -= 2
michael@0 519 FSTD %fr24,-64(%sp)
michael@0 520
michael@0 521 ; Here is the loop. It is unrolled twice, modelled after the "alternate body" and then the "main body".
michael@0 522
michael@0 523 $DIAGLOOP
michael@0 524 SHRPD %r31,%r0,31,%r3 ; Cycle 1 (alternate body)
michael@0 525 LDO SIXTEEN(%r25),%r25
michael@0 526 LDD 0(%r24),%r1
michael@0 527 FSTD %fr31,-104(%sp)
michael@0 528 SHRPD %r0,%r31,31,%r4 ; Cycle 2
michael@0 529 ADD,DC %r22,%r3,%r3
michael@0 530 FLDD UN_SIXTEEN(%r25),%fr7
michael@0 531 ADD,DC %r0,%r20,%r20 ; Cycle 3
michael@0 532 ADD %r1,%r3,%r3
michael@0 533 XMPYU %fr7R,%fr7R,%fr29 ; Cycle 4
michael@0 534 LDD -80(%sp),%r21
michael@0 535 STD %r3,0(%r24)
michael@0 536 XMPYU %fr7L,%fr7R,%fr27 ; Cycle 5
michael@0 537 XMPYU %fr7L,%fr7L,%fr30
michael@0 538 LDD -64(%sp),%r29
michael@0 539 LDD EIGHT(%r24),%r1
michael@0 540 ADD,DC %r4,%r20,%r20 ; Cycle 6
michael@0 541 LDD -104(%sp),%r19
michael@0 542 FSTD %fr29,-88(%sp)
michael@0 543 ADD %r20,%r1,%r1 ; Cycle 7
michael@0 544 FSTD %fr27,-72(%sp)
michael@0 545 SHRPD %r29,%r0,31,%r4 ; Cycle 1 (main body)
michael@0 546 LDO THIRTY_TWO(%r24),%r24
michael@0 547 LDD UN_SIXTEEN(%r24),%r28
michael@0 548 FSTD %fr30,-96(%sp)
michael@0 549 SHRPD %r0,%r29,31,%r3 ; Cycle 2
michael@0 550 ADD,DC %r21,%r4,%r4
michael@0 551 FLDD UN_EIGHT(%r25),%fr7
michael@0 552 STD %r1,UN_TWENTY_FOUR(%r24)
michael@0 553 ADD,DC %r0,%r19,%r19 ; Cycle 3
michael@0 554 ADD %r28,%r4,%r4
michael@0 555 XMPYU %fr7R,%fr7R,%fr28 ; Cycle 4
michael@0 556 LDD -88(%sp),%r22
michael@0 557 STD %r4,UN_SIXTEEN(%r24)
michael@0 558 XMPYU %fr7L,%fr7R,%fr24 ; Cycle 5
michael@0 559 XMPYU %fr7L,%fr7L,%fr31
michael@0 560 LDD -72(%sp),%r31
michael@0 561 LDD UN_EIGHT(%r24),%r28
michael@0 562 ADD,DC %r3,%r19,%r19 ; Cycle 6
michael@0 563 LDD -96(%sp),%r20
michael@0 564 FSTD %fr28,-80(%sp)
michael@0 565 ADD %r19,%r28,%r28 ; Cycle 7
michael@0 566 FSTD %fr24,-64(%sp)
michael@0 567 ADDIB,> -2,%r26,$DIAGLOOP ; Cycle 8; r26 -= 2
michael@0 568 STD %r28,UN_EIGHT(%r24)
michael@0 569
michael@0 570 $ENDDIAGLOOP
michael@0 571
michael@0 572 ADD,DC %r0,%r22,%r22
michael@0 573 CMPIB,= 0,%r26,$ONEMOREDIAG ; r26 = 0 means wrong parity: do one more alternate body
michael@0 574 SHRPD %r31,%r0,31,%r3
michael@0 575
michael@0 576 ; Shutdown code, first stage.
michael@0 577
michael@0 578 FSTD %fr31,-104(%sp) ; Cycle 1 (alternate body)
michael@0 579 LDD 0(%r24),%r28
michael@0 580 SHRPD %r0,%r31,31,%r4 ; Cycle 2
michael@0 581 ADD %r3,%r22,%r3
michael@0 582 ADD,DC %r0,%r20,%r20 ; Cycle 3
michael@0 583 LDD -80(%sp),%r21
michael@0 584 ADD %r3,%r28,%r3
michael@0 585 LDD -64(%sp),%r29 ; Cycle 4
michael@0 586 STD %r3,0(%r24)
michael@0 587 LDD EIGHT(%r24),%r1 ; Cycle 5
michael@0 588 LDO SIXTEEN(%r25),%r25 ; Cycle 6
michael@0 589 LDD -104(%sp),%r19
michael@0 590 ADD,DC %r4,%r20,%r20
michael@0 591 ADD %r20,%r1,%r1 ; Cycle 7
michael@0 592 ADD,DC %r0,%r21,%r21 ; Cycle 8
michael@0 593 STD %r1,EIGHT(%r24)
michael@0 594
michael@0 595 ; Shutdown code, second stage.
michael@0 596
michael@0 597 SHRPD %r29,%r0,31,%r4 ; Cycle 1 (main body)
michael@0 598 LDO THIRTY_TWO(%r24),%r24
michael@0 599 LDD UN_SIXTEEN(%r24),%r1
michael@0 600 SHRPD %r0,%r29,31,%r3 ; Cycle 2
michael@0 601 ADD %r4,%r21,%r4
michael@0 602 ADD,DC %r0,%r19,%r19 ; Cycle 3
michael@0 603 ADD %r4,%r1,%r4
michael@0 604 STD %r4,UN_SIXTEEN(%r24); Cycle 4
michael@0 605 LDD UN_EIGHT(%r24),%r28 ; Cycle 5
michael@0 606 ADD,DC %r3,%r19,%r19 ; Cycle 6
michael@0 607 ADD %r19,%r28,%r28 ; Cycle 7
michael@0 608 ADD,DC %r0,%r0,%r22 ; Cycle 8; r22 = carry out of the preceding add
michael@0 609 CMPIB,*= 0,%r22,$Z0 ; if no overflow, exit
michael@0 610 STD %r28,UN_EIGHT(%r24)
michael@0 611
michael@0 612 ; Final carry propagation
michael@0 613
michael@0 614 $FDIAG2
michael@0 615 LDO EIGHT(%r24),%r24
michael@0 616 LDD UN_EIGHT(%r24),%r26
michael@0 617 ADDI 1,%r26,%r26 ; add the carry into the next result doubleword
michael@0 618 CMPIB,*= 0,%r26,$FDIAG2 ; Keep looping if there is a carry.
michael@0 619 STD %r26,UN_EIGHT(%r24)
michael@0 620
michael@0 621 B $Z0
michael@0 622 NOP
michael@0 623
michael@0 624 ; Here is the code that handles the difficult case N=1.
michael@0 625 ; We do the usual trick -- branch out of the startup code at appropriate
michael@0 626 ; points, and branch into the shutdown code.
michael@0 627
michael@0 628 $DIAG_N_IS_ONE
michael@0 629
michael@0 630 LDD -88(%sp),%r22
michael@0 631 LDD -72(%sp),%r31
michael@0 632 B $JOINDIAG
michael@0 633 LDD -96(%sp),%r20
michael@0 634
michael@0 635 ; We came out of the unrolled loop with wrong parity. Do one more
michael@0 636 ; single cycle. This is the "alternate body". It will, of course,
michael@0 637 ; give us opposite registers from the other case, so we need
michael@0 638 ; completely different shutdown code.
michael@0 639
michael@0 640 $ONEMOREDIAG
michael@0 641 FSTD %fr31,-104(%sp) ; Cycle 1 (alternate body)
michael@0 642 LDD 0(%r24),%r28
michael@0 643 FLDD 0(%r25),%fr7 ; Cycle 2
michael@0 644 SHRPD %r0,%r31,31,%r4
michael@0 645 ADD %r3,%r22,%r3
michael@0 646 ADD,DC %r0,%r20,%r20 ; Cycle 3
michael@0 647 LDD -80(%sp),%r21
michael@0 648 ADD %r3,%r28,%r3
michael@0 649 LDD -64(%sp),%r29 ; Cycle 4
michael@0 650 STD %r3,0(%r24)
michael@0 651 XMPYU %fr7R,%fr7R,%fr29
michael@0 652 LDD EIGHT(%r24),%r1 ; Cycle 5
michael@0 653 XMPYU %fr7L,%fr7R,%fr27
michael@0 654 XMPYU %fr7L,%fr7L,%fr30
michael@0 655 LDD -104(%sp),%r19 ; Cycle 6
michael@0 656 FSTD %fr29,-88(%sp)
michael@0 657 ADD,DC %r4,%r20,%r20
michael@0 658 FSTD %fr27,-72(%sp) ; Cycle 7
michael@0 659 ADD %r20,%r1,%r1
michael@0 660 ADD,DC %r0,%r21,%r21 ; Cycle 8
michael@0 661 STD %r1,EIGHT(%r24)
michael@0 662
michael@0 663 ; Shutdown code, first stage.
michael@0 664
michael@0 665 SHRPD %r29,%r0,31,%r4 ; Cycle 1 (main body)
michael@0 666 LDO THIRTY_TWO(%r24),%r24
michael@0 667 FSTD %fr30,-96(%sp)
michael@0 668 LDD UN_SIXTEEN(%r24),%r1
michael@0 669 SHRPD %r0,%r29,31,%r3 ; Cycle 2
michael@0 670 ADD %r4,%r21,%r4
michael@0 671 ADD,DC %r0,%r19,%r19 ; Cycle 3
michael@0 672 LDD -88(%sp),%r22
michael@0 673 ADD %r4,%r1,%r4
michael@0 674 LDD -72(%sp),%r31 ; Cycle 4
michael@0 675 STD %r4,UN_SIXTEEN(%r24)
michael@0 676 LDD UN_EIGHT(%r24),%r28 ; Cycle 5
michael@0 677 LDD -96(%sp),%r20 ; Cycle 6
michael@0 678 ADD,DC %r3,%r19,%r19
michael@0 679 ADD %r19,%r28,%r28 ; Cycle 7
michael@0 680 ADD,DC %r0,%r22,%r22 ; Cycle 8
michael@0 681 STD %r28,UN_EIGHT(%r24)
michael@0 682
michael@0 683 ; Shutdown code, second stage.
michael@0 684
michael@0 685 $JOINDIAG
michael@0 686 SHRPD %r31,%r0,31,%r3 ; Cycle 1 (alternate body)
michael@0 687 LDD 0(%r24),%r28
michael@0 688 SHRPD %r0,%r31,31,%r4 ; Cycle 2
michael@0 689 ADD %r3,%r22,%r3
michael@0 690 ADD,DC %r0,%r20,%r20 ; Cycle 3
michael@0 691 ADD %r3,%r28,%r3
michael@0 692 STD %r3,0(%r24) ; Cycle 4
michael@0 693 LDD EIGHT(%r24),%r1 ; Cycle 5
michael@0 694 ADD,DC %r4,%r20,%r20
michael@0 695 ADD %r20,%r1,%r1 ; Cycle 7
michael@0 696 ADD,DC %r0,%r0,%r21 ; Cycle 8; r21 = carry out of the preceding add
michael@0 697 CMPIB,*= 0,%r21,$Z0 ; if no overflow, exit
michael@0 698 STD %r1,EIGHT(%r24)
michael@0 699
michael@0 700 ; Final carry propagation
michael@0 701
michael@0 702 $FDIAG1
michael@0 703 LDO EIGHT(%r24),%r24
michael@0 704 LDD EIGHT(%r24),%r26
michael@0 705 ADDI 1,%r26,%r26 ; add the carry into the next result doubleword
michael@0 706 CMPIB,*= 0,%r26,$FDIAG1 ; Keep looping if there is a carry.
michael@0 707 STD %r26,EIGHT(%r24)
michael@0 708
michael@0 709 $Z0
michael@0 710 LDW -124(%sp),%r4 ; restore r4 saved at entry
michael@0 711 BVE (%r2) ; return through r2
michael@0 712 .EXIT
michael@0 713 LDW,MB -128(%sp),%r3 ; move sp back by 128, then restore r3 (branch delay slot)
michael@0 714 .PROCEND
michael@0 715 ; .ALLOW
michael@0 716
michael@0 717 .SPACE $TEXT$
michael@0 718 .SUBSPA $CODE$
michael@0 719 #ifdef LITTLE_WORDIAN
michael@0 720 #ifdef __GNUC__
michael@0 721 ; GNU-as (as of 2.19) does not support LONG_RETURN, so it is left off in this branch. NOTE(review): the big-wordian exports below pass LONG_RETURN with no __GNUC__ guard -- confirm that is intended.
michael@0 722 .EXPORT maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
michael@0 723 .EXPORT add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR
michael@0 724 #else
michael@0 725 .EXPORT maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN
michael@0 726 .EXPORT add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN
michael@0 727 #endif
michael@0 728 #else
michael@0 729 .EXPORT maxpy_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN
michael@0 730 .EXPORT add_diag_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN
michael@0 731 #endif
michael@0 732 .END
michael@0 733
michael@0 734
michael@0 735 ; How to use "maxpy_PA20_little" and "maxpy_PA20_big" (exported by this file as "maxpy_little" and "maxpy_big")
michael@0 736 ;
michael@0 737 ; The routine "maxpy_PA20_little" or "maxpy_PA20_big"
michael@0 738 ; performs a 64-bit x any-size multiply, and adds the
michael@0 739 ; result to an area of memory. That is, it performs
michael@0 740 ; something like
michael@0 741 ;
michael@0 742 ; A B C D
michael@0 743 ; * Z
michael@0 744 ; __________
michael@0 745 ; P Q R S T
michael@0 746 ;
michael@0 747 ; and then adds the "PQRST" vector into an area of memory,
michael@0 748 ; handling all carries.
michael@0 749 ;
michael@0 750 ; Digression on nomenclature and endian-ness:
michael@0 751 ;
michael@0 752 ; Each of the capital letters in the above represents a 64-bit
michael@0 753 ; quantity. That is, you could think of the discussion as
michael@0 754 ; being in terms of radix-16-quintillion arithmetic. The data
michael@0 755 ; type being manipulated is "unsigned long long int". This
michael@0 756 ; requires the 64-bit extension of the HP-UX C compiler,
michael@0 757 ; available at release 10. You need these compiler flags to
michael@0 758 ; enable these extensions:
michael@0 759 ;
michael@0 760 ; -Aa +e +DA2.0 +DS2.0
michael@0 761 ;
michael@0 762 ; (The first specifies ANSI C, the second enables the
michael@0 763 ; extensions, which are beyond ANSI C, and the third and
michael@0 764 ; fourth tell the compiler to use whatever features of the
michael@0 765 ; PA2.0 architecture it wishes, in order to make the code more
michael@0 766 ; efficient. Since the presence of the assembly code will
michael@0 767 ; make the program unable to run on anything less than PA2.0,
michael@0 768 ; you might as well gain the performance enhancements in the C
michael@0 769 ; code as well.)
michael@0 770 ;
michael@0 771 ; Questions of "endian-ness" often come up, usually in the
michael@0 772 ; context of byte ordering in a word. These routines have a
michael@0 773 ; similar issue, that could be called "wordian-ness".
michael@0 774 ; Independent of byte ordering (PA is always big-endian), one
michael@0 775 ; can make two choices when representing extremely large
michael@0 776 ; numbers as arrays of 64-bit doublewords in memory.
michael@0 777 ;
michael@0 778 ; "Little-wordian" layout means that the least significant
michael@0 779 ; word of a number is stored at the lowest address.
michael@0 780 ;
michael@0 781 ; MSW LSW
michael@0 782 ; | |
michael@0 783 ; V V
michael@0 784 ;
michael@0 785 ; A B C D E
michael@0 786 ;
michael@0 787 ; ^ ^ ^
michael@0 788 ; | | |____ address 0
michael@0 789 ; | |
michael@0 790 ; | |_______address 8
michael@0 791 ; |
michael@0 792 ; address 32
michael@0 793 ;
michael@0 794 ; "Big-wordian" means that the most significant word is at the
michael@0 795 ; lowest address.
michael@0 796 ;
michael@0 797 ; MSW LSW
michael@0 798 ; | |
michael@0 799 ; V V
michael@0 800 ;
michael@0 801 ; A B C D E
michael@0 802 ;
michael@0 803 ; ^ ^ ^
michael@0 804 ; | | |____ address 32
michael@0 805 ; | |
michael@0 806 ; | |_______address 24
michael@0 807 ; |
michael@0 808 ; address 0
michael@0 809 ;
michael@0 810 ; When you compile the file, you must specify one or the other, with
michael@0 811 ; a switch "-DLITTLE_WORDIAN" or "-DBIG_WORDIAN".
michael@0 812 ;
michael@0 813 ; Incidentally, you assemble this file as part of your
michael@0 814 ; project with the same C compiler as the rest of the program.
michael@0 815 ; My "makefile" for a superprecision arithmetic package has
michael@0 816 ; the following stuff:
michael@0 817 ;
michael@0 818 ; # definitions:
michael@0 819 ; CC = cc -Aa +e -z +DA2.0 +DS2.0 +w1
michael@0 820 ; CFLAGS = +O3
michael@0 821 ; LDFLAGS = -L /usr/lib -Wl,-aarchive
michael@0 822 ;
michael@0 823 ; # general build rule for ".s" files:
michael@0 824 ; .s.o:
michael@0 825 ; $(CC) $(CFLAGS) -c $< -DBIG_WORDIAN
michael@0 826 ;
michael@0 827 ; # Now any bind step that calls for pa20.o will assemble pa20.s
michael@0 828 ;
michael@0 829 ; End of digression, back to arithmetic:
michael@0 830 ;
michael@0 831 ; The way we multiply two huge numbers is, of course, to multiply
michael@0 832 ; the "ABCD" vector by each of the "WXYZ" doublewords, adding
michael@0 833 ; the result vectors with increasing offsets, the way we learned
michael@0 834 ; in school, back before we all used calculators:
michael@0 835 ;
michael@0 836 ; A B C D
michael@0 837 ; * W X Y Z
michael@0 838 ; __________
michael@0 839 ; P Q R S T
michael@0 840 ; E F G H I
michael@0 841 ; M N O P Q
michael@0 842 ; + R S T U V
michael@0 843 ; _______________
michael@0 844 ; F I N A L S U M
michael@0 845 ;
michael@0 846 ; So we call maxpy_PA20_big (in my case; my package is
michael@0 847 ; big-wordian) repeatedly, giving the W, X, Y, and Z arguments
michael@0 848 ; in turn as the "scalar", and giving the "ABCD" vector each
michael@0 849 ; time. We direct it to add its result into an area of memory
michael@0 850 ; that we have cleared at the start. We skew the exact
michael@0 851 ; location into that area with each call.
michael@0 852 ;
michael@0 853 ; The prototype for the function is
michael@0 854 ;
michael@0 855 ; extern void maxpy_PA20_big(
michael@0 856 ; int length, /* Number of doublewords in the multiplicand vector. */
michael@0 857 ; const long long int *scalaraddr, /* Address to fetch the scalar. */
michael@0 858 ; const long long int *multiplicand, /* The multiplicand vector. */
michael@0 859 ; long long int *result); /* Where to accumulate the result. */
michael@0 860 ;
michael@0 861 ; (You should place a copy of this prototype in an include file
michael@0 862 ; or in your C file.)
michael@0 863 ;
michael@0 864 ; Now, IN ALL CASES, the given address for the multiplicand or
michael@0 865 ; the result is that of the LEAST SIGNIFICANT DOUBLEWORD.
michael@0 866 ; That word is, of course, the word at which the routine
michael@0 867 ; starts processing. "maxpy_PA20_little" then increases the
michael@0 868 ; addresses as it computes. "maxpy_PA20_big" decreases them.
michael@0 869 ;
michael@0 870 ; In our example above, "length" would be 4 in each case.
michael@0 871 ; "multiplicand" would be the "ABCD" vector. Specifically,
michael@0 872 ; the address of the element "D". "scalaraddr" would be the
michael@0 873 ; address of "W", "X", "Y", or "Z" on the four calls that we
michael@0 874 ; would make. (The order doesn't matter, of course.)
michael@0 875 ; "result" would be the appropriate address in the result
michael@0 876 ; area. When multiplying by "Z", that would be the least
michael@0 877 ; significant word. When multiplying by "Y", it would be the
michael@0 878 ; next higher word (8 bytes higher if little-wordian; 8 bytes
michael@0 879 ; lower if big-wordian), and so on. The size of the result
michael@0 880 ; area must be the sum of the sizes of the multiplicand
michael@0 881 ; and multiplier vectors, and must be initialized to zero
michael@0 882 ; before we start.
michael@0 883 ;
michael@0 884 ; Whenever the routine adds its partial product into the result
michael@0 885 ; vector, it follows carry chains as far as they need to go.
michael@0 886 ;
michael@0 887 ; Here is the super-precision multiply routine that I use for
michael@0 888 ; my package. The package is big-wordian. I have taken out
michael@0 889 ; handling of exponents (it's a floating point package):
michael@0 890 ;
michael@0 891 ; static void mul_PA20(
michael@0 892 ; int size,
michael@0 893 ; const long long int *arg1,
michael@0 894 ; const long long int *arg2,
michael@0 895 ; long long int *result)
michael@0 896 ; {
michael@0 897 ; int i;
michael@0 898 ;
michael@0 899 ; for (i=0 ; i<2*size ; i++) result[i] = 0ULL;
michael@0 900 ;
michael@0 901 ; for (i=0 ; i<size ; i++) {
michael@0 902 ; maxpy_PA20_big(size, &arg2[i], &arg1[size-1], &result[size+i]);
michael@0 903 ; }
michael@0 904 ; }

mercurial