michael@0: ; This Source Code Form is subject to the terms of the Mozilla Public
michael@0: ; License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0: ; file, You can obtain one at http://mozilla.org/MPL/2.0/.
michael@0: 
michael@0: #ifdef __LP64__
michael@0:         .LEVEL   2.0W               ; LP64 build: PA-RISC 2.0 wide (64-bit) mode
michael@0: #else
michael@0: ;       .LEVEL   1.1                (disabled alternative: PA 1.1 level ...)
michael@0: ;       .ALLOW   2.0N               (... with 2.0 narrow instructions allowed)
michael@0:         .LEVEL   2.0                ; otherwise: PA-RISC 2.0 level
michael@0: #endif
michael@0:         .SPACE   $TEXT$,SORT=8      ; everything below goes in the $TEXT$ space ...
michael@0:         .SUBSPA  $CODE$,QUAD=0,ALIGN=4,ACCESS=0x2c,CODE_ONLY,SORT=24 ; ... in its $CODE$ subspace
michael@0: 
michael@0: ; ***************************************************************
michael@0: ;
michael@0: ;                 maxpy_[little/big]
michael@0: ;
michael@0: ; ***************************************************************
michael@0: 
michael@0: ; Select the word order.  LITTLE_WORDIAN is the default; build with
michael@0: ; -DBIG_WORDIAN to get the big-wordian variants instead.  The two symbols
michael@0: ; are mutually exclusive: they give the offset macros below opposite signs.
michael@0: #if !defined(LITTLE_WORDIAN) && !defined(BIG_WORDIAN)
michael@0: #define LITTLE_WORDIAN 1
michael@0: #endif
michael@0: 
michael@0: #if defined(LITTLE_WORDIAN) && defined(BIG_WORDIAN)
michael@0: #error "Define only one of LITTLE_WORDIAN and BIG_WORDIAN"
michael@0: #endif
michael@0: 
michael@0: ; Byte offsets used to step through the vectors.  The plain names move
michael@0: ; toward more significant words, the UN_ names toward less significant ones.
michael@0: #ifdef LITTLE_WORDIAN
michael@0: #define EIGHT 8
michael@0: #define SIXTEEN 16
michael@0: #define THIRTY_TWO 32
michael@0: #define UN_EIGHT -8
michael@0: #define UN_SIXTEEN -16
michael@0: #define UN_TWENTY_FOUR -24
michael@0: #endif
michael@0: 
michael@0: #ifdef BIG_WORDIAN
michael@0: #define EIGHT -8
michael@0: #define SIXTEEN -16
michael@0: #define THIRTY_TWO -32
michael@0: #define UN_EIGHT 8
michael@0: #define UN_SIXTEEN 16
michael@0: #define UN_TWENTY_FOUR 24
michael@0: #endif
michael@0: 
michael@0: ; This performs a multiple-precision integer version of "daxpy",
michael@0: ; using the selected addressing direction.  "Little-wordian" means that
michael@0: ; the least significant word of a number is stored at the lowest address.
michael@0: ; "Big-wordian" means that the most significant word is at the lowest
michael@0: ; address.  Either way, the incoming address of the vector is that
michael@0: ; of the least significant word.  That means that, for little-wordian
michael@0: ; addressing, we move the address upward as we propagate carries
michael@0: ; from the least significant word to the most significant.  For
michael@0: ; big-wordian we move the address downward.
michael@0: 
michael@0: ; We use the following registers:
michael@0: ;
michael@0: ;     r2   return PC, of course
michael@0: ;     r26 = arg1 =  length
michael@0: ;     r25 = arg2 =  address of scalar
michael@0: ;     r24 = arg3 =  multiplicand vector
michael@0: ;     r23 = arg4 =  result vector
michael@0: ;
michael@0: ;     fr9 = scalar loaded once only from r25
michael@0: 
michael@0: ; The cycle counts shown in the bodies below are simply the result of a
michael@0: ; scheduling by hand.  The actual PCX-U hardware does it differently.
michael@0: ; The intention is that the overall speed is the same.
michael@0: 
michael@0: ; The pipeline startup and shutdown code is constructed in the usual way,
michael@0: ; by taking the loop bodies and removing unnecessary instructions.
michael@0: ; We have left the comments describing cycle numbers in the code.
michael@0: ; These are intended for reference when comparing with the main loop,
michael@0: ; and have no particular relationship to actual cycle numbers.
michael@0: ; maxpy_[little/big]: multiply the vector at r24 (r26 words of 64 bits) by the 64-bit scalar at 0(r25) and add the product into the vector at r23, propagating carries.
michael@0: #ifdef LITTLE_WORDIAN
michael@0: maxpy_little
michael@0: #else
michael@0: maxpy_big
michael@0: #endif
michael@0:         .PROC
michael@0:         .CALLINFO FRAME=120,ENTRY_GR=4
michael@0:         .ENTRY
michael@0:         STW,MA  %r3,128(%sp)        ; save r3 at old sp, then sp += 128
michael@0:         STW     %r4,-124(%sp)       ; save r4 at old sp + 4
michael@0: 
michael@0:         ADDIB,< -1,%r26,$L0         ; r26 = N-1.  If N = 0, exit immediately.
michael@0:         FLDD    0(%r25),%fr9        ; fr9 = scalar (branch delay slot)
michael@0: 
michael@0: ; First startup: begin the 32x32 partial products for the first two words and spill them to the frame.
michael@0: 
michael@0:         FLDD    0(%r24),%fr24       ; Cycle 1
michael@0:         XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3: lo(scalar) * lo(word)
michael@0:         XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4: lo(scalar) * hi(word)
michael@0:         XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5: hi(scalar) * hi(word)
michael@0:         CMPIB,> 3,%r26,$N_IS_SMALL  ; Pick out cases N = 1, 2, or 3 (r26 = N-1)
michael@0:         XMPYU   %fr9L,%fr24R,%fr24  ; Cycle 6: hi(scalar) * lo(word)
michael@0:         FLDD    EIGHT(%r24),%fr28   ; Cycle 8
michael@0:         XMPYU   %fr9L,%fr28R,%fr31  ; Cycle 10
michael@0:         FSTD    %fr24,-96(%sp)      ; products are spilled so they can be reloaded into general registers
michael@0:         XMPYU   %fr9R,%fr28L,%fr30  ; Cycle 11
michael@0:         FSTD    %fr25,-80(%sp)
michael@0:         LDO     SIXTEEN(%r24),%r24  ; Cycle 12
michael@0:         FSTD    %fr31,-64(%sp)
michael@0:         XMPYU   %fr9R,%fr28R,%fr29  ; Cycle 13
michael@0:         FSTD    %fr27,-48(%sp)
michael@0: 
michael@0: ; Second startup: finish the second word, start products for the next two, and begin summing spilled products in general registers.
michael@0: 
michael@0:         XMPYU   %fr9L,%fr28L,%fr28  ; Cycle 1
michael@0:         FSTD    %fr30,-56(%sp)
michael@0:         FLDD    0(%r24),%fr24
michael@0: 
michael@0:         FSTD    %fr26,-88(%sp)      ; Cycle 2
michael@0: 
michael@0:         XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
michael@0:         FSTD    %fr28,-104(%sp)
michael@0: 
michael@0:         XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
michael@0:         LDD     -96(%sp),%r3
michael@0:         FSTD    %fr29,-72(%sp)
michael@0: 
michael@0:         XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
michael@0:         LDD     -64(%sp),%r19
michael@0:         LDD     -80(%sp),%r21
michael@0: 
michael@0:         XMPYU   %fr9L,%fr24R,%fr24  ; Cycle 6
michael@0:         LDD     -56(%sp),%r20
michael@0:         ADD     %r21,%r3,%r3
michael@0: 
michael@0:         ADD,DC  %r20,%r19,%r19      ; Cycle 7
michael@0:         LDD     -88(%sp),%r4
michael@0:         SHRPD   %r3,%r0,32,%r21
michael@0:         LDD     -48(%sp),%r1
michael@0: 
michael@0:         FLDD    EIGHT(%r24),%fr28   ; Cycle 8
michael@0:         LDD     -104(%sp),%r31
michael@0:         ADD,DC  %r0,%r0,%r20
michael@0:         SHRPD   %r19,%r3,32,%r3
michael@0: 
michael@0:         LDD     -72(%sp),%r29       ; Cycle 9
michael@0:         SHRPD   %r20,%r19,32,%r20
michael@0:         ADD     %r21,%r1,%r1
michael@0: 
michael@0:         XMPYU   %fr9L,%fr28R,%fr31  ; Cycle 10
michael@0:         ADD,DC  %r3,%r4,%r4
michael@0:         FSTD    %fr24,-96(%sp)
michael@0: 
michael@0:         XMPYU   %fr9R,%fr28L,%fr30  ; Cycle 11
michael@0:         ADD,DC  %r0,%r20,%r20
michael@0:         LDD     0(%r23),%r3         ; load least significant result word
michael@0:         FSTD    %fr25,-80(%sp)
michael@0: 
michael@0:         LDO     SIXTEEN(%r24),%r24  ; Cycle 12
michael@0:         FSTD    %fr31,-64(%sp)
michael@0: 
michael@0:         XMPYU   %fr9R,%fr28R,%fr29  ; Cycle 13
michael@0:         ADD     %r0,%r0,%r0         ; clear the carry bit
michael@0:         ADDIB,<= -4,%r26,$ENDLOOP   ; actually happens in cycle 12; r26 = N-5, skip loop if <= 0
michael@0:         FSTD    %fr27,-48(%sp)
michael@0: ;        MFCTL   %cr16,%r21         ; for timing
michael@0: ;        STD     %r21,-112(%sp)
michael@0: 
michael@0: ; Here is the loop.  Each pass loads two multiplicand words and stores two result words (r24 and r23 each advance by SIXTEEN).
michael@0: 
michael@0: $LOOP   XMPYU   %fr9L,%fr28L,%fr28  ; Cycle 1
michael@0:         ADD,DC  %r29,%r4,%r4
michael@0:         FSTD    %fr30,-56(%sp)
michael@0:         FLDD    0(%r24),%fr24
michael@0: 
michael@0:         LDO     SIXTEEN(%r23),%r23  ; Cycle 2
michael@0:         ADD,DC  %r0,%r20,%r20
michael@0:         FSTD    %fr26,-88(%sp)
michael@0: 
michael@0:         XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
michael@0:         ADD     %r3,%r1,%r1
michael@0:         FSTD    %fr28,-104(%sp)
michael@0:         LDD     UN_EIGHT(%r23),%r21
michael@0: 
michael@0:         XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
michael@0:         ADD,DC  %r21,%r4,%r28
michael@0:         FSTD    %fr29,-72(%sp)    
michael@0:         LDD     -96(%sp),%r3
michael@0: 
michael@0:         XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
michael@0:         ADD,DC  %r20,%r31,%r22
michael@0:         LDD     -64(%sp),%r19
michael@0:         LDD     -80(%sp),%r21
michael@0: 
michael@0:         XMPYU   %fr9L,%fr24R,%fr24  ; Cycle 6
michael@0:         ADD     %r21,%r3,%r3
michael@0:         LDD     -56(%sp),%r20
michael@0:         STD     %r1,UN_SIXTEEN(%r23)
michael@0: 
michael@0:         ADD,DC  %r20,%r19,%r19      ; Cycle 7
michael@0:         SHRPD   %r3,%r0,32,%r21
michael@0:         LDD     -88(%sp),%r4
michael@0:         LDD     -48(%sp),%r1
michael@0: 
michael@0:         ADD,DC  %r0,%r0,%r20        ; Cycle 8
michael@0:         SHRPD   %r19,%r3,32,%r3
michael@0:         FLDD    EIGHT(%r24),%fr28
michael@0:         LDD     -104(%sp),%r31
michael@0: 
michael@0:         SHRPD   %r20,%r19,32,%r20   ; Cycle 9
michael@0:         ADD     %r21,%r1,%r1
michael@0:         STD     %r28,UN_EIGHT(%r23)
michael@0:         LDD     -72(%sp),%r29
michael@0: 
michael@0:         XMPYU   %fr9L,%fr28R,%fr31  ; Cycle 10
michael@0:         ADD,DC  %r3,%r4,%r4
michael@0:         FSTD    %fr24,-96(%sp)
michael@0: 
michael@0:         XMPYU   %fr9R,%fr28L,%fr30  ; Cycle 11
michael@0:         ADD,DC  %r0,%r20,%r20
michael@0:         FSTD    %fr25,-80(%sp)
michael@0:         LDD     0(%r23),%r3
michael@0: 
michael@0:         LDO     SIXTEEN(%r24),%r24  ; Cycle 12
michael@0:         FSTD    %fr31,-64(%sp)
michael@0: 
michael@0:         XMPYU   %fr9R,%fr28R,%fr29  ; Cycle 13
michael@0:         ADD     %r22,%r1,%r1
michael@0:         ADDIB,> -2,%r26,$LOOP       ; actually happens in cycle 12
michael@0:         FSTD    %fr27,-48(%sp)
michael@0: 
michael@0: $ENDLOOP
michael@0: 
michael@0: ; Shutdown code, first stage.
michael@0: 
michael@0: ;        MFCTL   %cr16,%r21         ; for timing
michael@0: ;        STD     %r21,UN_SIXTEEN(%r23)
michael@0: ;        LDD     -112(%sp),%r21
michael@0: ;        STD     %r21,UN_EIGHT(%r23)
michael@0: 
michael@0:         XMPYU   %fr9L,%fr28L,%fr28  ; Cycle 1
michael@0:         ADD,DC  %r29,%r4,%r4
michael@0:         CMPIB,= 0,%r26,$ONEMORE     ; r26 = 0: one more multiplicand word remains
michael@0:         FSTD    %fr30,-56(%sp)
michael@0: 
michael@0:         LDO     SIXTEEN(%r23),%r23  ; Cycle 2
michael@0:         ADD,DC  %r0,%r20,%r20
michael@0:         FSTD    %fr26,-88(%sp)
michael@0: 
michael@0:         ADD     %r3,%r1,%r1         ; Cycle 3
michael@0:         FSTD    %fr28,-104(%sp)
michael@0:         LDD     UN_EIGHT(%r23),%r21
michael@0: 
michael@0:         ADD,DC  %r21,%r4,%r28       ; Cycle 4
michael@0:         FSTD    %fr29,-72(%sp)    
michael@0:         STD     %r28,UN_EIGHT(%r23) ; moved up from cycle 9
michael@0:         LDD     -96(%sp),%r3
michael@0: 
michael@0:         ADD,DC  %r20,%r31,%r22      ; Cycle 5
michael@0:         STD     %r1,UN_SIXTEEN(%r23)
michael@0: $JOIN4
michael@0:         LDD     -64(%sp),%r19
michael@0:         LDD     -80(%sp),%r21
michael@0: 
michael@0:         ADD     %r21,%r3,%r3        ; Cycle 6
michael@0:         LDD     -56(%sp),%r20
michael@0: 
michael@0:         ADD,DC  %r20,%r19,%r19      ; Cycle 7
michael@0:         SHRPD   %r3,%r0,32,%r21
michael@0:         LDD     -88(%sp),%r4
michael@0:         LDD     -48(%sp),%r1
michael@0: 
michael@0:         ADD,DC  %r0,%r0,%r20        ; Cycle 8
michael@0:         SHRPD   %r19,%r3,32,%r3
michael@0:         LDD     -104(%sp),%r31
michael@0: 
michael@0:         SHRPD   %r20,%r19,32,%r20   ; Cycle 9
michael@0:         ADD     %r21,%r1,%r1
michael@0:         LDD     -72(%sp),%r29
michael@0: 
michael@0:         ADD,DC  %r3,%r4,%r4         ; Cycle 10
michael@0: 
michael@0:         ADD,DC  %r0,%r20,%r20       ; Cycle 11
michael@0:         LDD     0(%r23),%r3
michael@0: 
michael@0:         ADD     %r22,%r1,%r1        ; Cycle 13
michael@0: 
michael@0: ; Shutdown code, second stage.
michael@0: 
michael@0:         ADD,DC  %r29,%r4,%r4        ; Cycle 1
michael@0: 
michael@0:         LDO     SIXTEEN(%r23),%r23  ; Cycle 2
michael@0:         ADD,DC  %r0,%r20,%r20
michael@0: 
michael@0:         LDD     UN_EIGHT(%r23),%r21 ; Cycle 3
michael@0:         ADD     %r3,%r1,%r1
michael@0: 
michael@0:         ADD,DC  %r21,%r4,%r28       ; Cycle 4
michael@0: 
michael@0:         ADD,DC  %r20,%r31,%r22      ; Cycle 5
michael@0: 
michael@0:         STD     %r1,UN_SIXTEEN(%r23); Cycle 6
michael@0: 
michael@0:         STD     %r28,UN_EIGHT(%r23) ; Cycle 9
michael@0: 
michael@0:         LDD     0(%r23),%r3         ; Cycle 11
michael@0: 
michael@0: ; Shutdown code, third stage.
michael@0: 
michael@0:         LDO     SIXTEEN(%r23),%r23
michael@0:         ADD     %r3,%r22,%r1
michael@0: $JOIN1  ADD,DC  %r0,%r0,%r21        ; r21 = carry out of the preceding add
michael@0:         CMPIB,*= 0,%r21,$L0         ; if no overflow, exit
michael@0:         STD     %r1,UN_SIXTEEN(%r23) ; store the last summed word (delay slot)
michael@0: 
michael@0: ; Final carry propagation: add 1 to successive result words until one does not wrap to zero.
michael@0: 
michael@0: $FINAL1 LDO     EIGHT(%r23),%r23
michael@0:         LDD     UN_SIXTEEN(%r23),%r21
michael@0:         ADDI    1,%r21,%r21
michael@0:         CMPIB,*= 0,%r21,$FINAL1     ; Keep looping if there is a carry.
michael@0:         STD     %r21,UN_SIXTEEN(%r23)
michael@0:         B       $L0
michael@0:         NOP
michael@0: 
michael@0: ; Here is the code that handles the difficult cases N=1, N=2, and N=3.
michael@0: ; We do the usual trick -- branch out of the startup code at appropriate
michael@0: ; points, and branch into the shutdown code.
michael@0: 
michael@0: $N_IS_SMALL
michael@0:         CMPIB,= 0,%r26,$N_IS_ONE    ; r26 = N-1 here
michael@0:         FSTD    %fr24,-96(%sp)      ; Cycle 10
michael@0:         FLDD    EIGHT(%r24),%fr28   ; Cycle 8
michael@0:         XMPYU   %fr9L,%fr28R,%fr31  ; Cycle 10
michael@0:         XMPYU   %fr9R,%fr28L,%fr30  ; Cycle 11
michael@0:         FSTD    %fr25,-80(%sp)
michael@0:         FSTD    %fr31,-64(%sp)      ; Cycle 12
michael@0:         XMPYU   %fr9R,%fr28R,%fr29  ; Cycle 13
michael@0:         FSTD    %fr27,-48(%sp)
michael@0:         XMPYU   %fr9L,%fr28L,%fr28  ; Cycle 1
michael@0:         CMPIB,= 2,%r26,$N_IS_THREE
michael@0:         FSTD    %fr30,-56(%sp)
michael@0: 
michael@0: ; N = 2
michael@0:         FSTD    %fr26,-88(%sp)      ; Cycle 2
michael@0:         FSTD    %fr28,-104(%sp)     ; Cycle 3
michael@0:         LDD     -96(%sp),%r3        ; Cycle 4
michael@0:         FSTD    %fr29,-72(%sp)
michael@0:         B       $JOIN4
michael@0:         ADD     %r0,%r0,%r22        ; r22 = 0 (delay slot)
michael@0: 
michael@0: $N_IS_THREE
michael@0:         FLDD    SIXTEEN(%r24),%fr24
michael@0:         FSTD    %fr26,-88(%sp)      ; Cycle 2
michael@0:         XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
michael@0:         FSTD    %fr28,-104(%sp)
michael@0:         XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
michael@0:         LDD     -96(%sp),%r3
michael@0:         FSTD    %fr29,-72(%sp)
michael@0:         XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
michael@0:         LDD     -64(%sp),%r19
michael@0:         LDD     -80(%sp),%r21
michael@0:         B       $JOIN3
michael@0:         ADD     %r0,%r0,%r22        ; r22 = 0 (delay slot)
michael@0: 
michael@0: $N_IS_ONE
michael@0:         FSTD    %fr25,-80(%sp)
michael@0:         FSTD    %fr27,-48(%sp)
michael@0:         FSTD    %fr26,-88(%sp)      ; Cycle 2
michael@0:         B       $JOIN5
michael@0:         ADD     %r0,%r0,%r22        ; r22 = 0 (delay slot)
michael@0: 
michael@0: ; We came out of the unrolled loop with wrong parity.  Do one more
michael@0: ; single cycle.  This is quite tricky, because of the way the
michael@0: ; carry chains and SHRPD chains have been chopped up.
michael@0: 
michael@0: $ONEMORE
michael@0: 
michael@0:         FLDD    0(%r24),%fr24
michael@0: 
michael@0:         LDO     SIXTEEN(%r23),%r23  ; Cycle 2
michael@0:         ADD,DC  %r0,%r20,%r20
michael@0:         FSTD    %fr26,-88(%sp)
michael@0: 
michael@0:         XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
michael@0:         FSTD    %fr28,-104(%sp)
michael@0:         LDD     UN_EIGHT(%r23),%r21
michael@0:         ADD     %r3,%r1,%r1
michael@0: 
michael@0:         XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
michael@0:         ADD,DC  %r21,%r4,%r28
michael@0:         STD     %r28,UN_EIGHT(%r23) ; moved from cycle 9
michael@0:         LDD     -96(%sp),%r3
michael@0:         FSTD    %fr29,-72(%sp)    
michael@0: 
michael@0:         XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
michael@0:         ADD,DC  %r20,%r31,%r22
michael@0:         LDD     -64(%sp),%r19
michael@0:         LDD     -80(%sp),%r21
michael@0: 
michael@0:         STD     %r1,UN_SIXTEEN(%r23); Cycle 6
michael@0: $JOIN3
michael@0:         XMPYU   %fr9L,%fr24R,%fr24
michael@0:         LDD     -56(%sp),%r20
michael@0:         ADD     %r21,%r3,%r3
michael@0: 
michael@0:         ADD,DC  %r20,%r19,%r19      ; Cycle 7
michael@0:         LDD     -88(%sp),%r4
michael@0:         SHRPD   %r3,%r0,32,%r21
michael@0:         LDD     -48(%sp),%r1
michael@0: 
michael@0:         LDD     -104(%sp),%r31      ; Cycle 8
michael@0:         ADD,DC  %r0,%r0,%r20
michael@0:         SHRPD   %r19,%r3,32,%r3
michael@0: 
michael@0:         LDD     -72(%sp),%r29       ; Cycle 9
michael@0:         SHRPD   %r20,%r19,32,%r20
michael@0:         ADD     %r21,%r1,%r1
michael@0: 
michael@0:         ADD,DC  %r3,%r4,%r4         ; Cycle 10
michael@0:         FSTD    %fr24,-96(%sp)
michael@0: 
michael@0:         ADD,DC  %r0,%r20,%r20       ; Cycle 11
michael@0:         LDD     0(%r23),%r3
michael@0:         FSTD    %fr25,-80(%sp)
michael@0: 
michael@0:         ADD     %r22,%r1,%r1        ; Cycle 13
michael@0:         FSTD    %fr27,-48(%sp)
michael@0: 
michael@0: ; Shutdown code, stage 1-1/2.
michael@0: 
michael@0:         ADD,DC  %r29,%r4,%r4        ; Cycle 1
michael@0: 
michael@0:         LDO     SIXTEEN(%r23),%r23  ; Cycle 2
michael@0:         ADD,DC  %r0,%r20,%r20     
michael@0:         FSTD    %fr26,-88(%sp)
michael@0: 
michael@0:         LDD     UN_EIGHT(%r23),%r21 ; Cycle 3
michael@0:         ADD     %r3,%r1,%r1
michael@0: 
michael@0:         ADD,DC  %r21,%r4,%r28       ; Cycle 4
michael@0:         STD     %r28,UN_EIGHT(%r23) ; moved from cycle 9
michael@0: 
michael@0:         ADD,DC  %r20,%r31,%r22      ; Cycle 5
michael@0:         STD     %r1,UN_SIXTEEN(%r23)
michael@0: $JOIN5
michael@0:         LDD     -96(%sp),%r3        ; moved from cycle 4
michael@0:         LDD     -80(%sp),%r21
michael@0:         ADD     %r21,%r3,%r3        ; Cycle 6
michael@0:         ADD,DC  %r0,%r0,%r19        ; Cycle 7
michael@0:         LDD     -88(%sp),%r4
michael@0:         SHRPD   %r3,%r0,32,%r21
michael@0:         LDD     -48(%sp),%r1
michael@0:         SHRPD   %r19,%r3,32,%r3     ; Cycle 8
michael@0:         ADD     %r21,%r1,%r1        ; Cycle 9
michael@0:         ADD,DC  %r3,%r4,%r4         ; Cycle 10
michael@0:         LDD     0(%r23),%r3         ; Cycle 11
michael@0:         ADD     %r22,%r1,%r1        ; Cycle 13
michael@0: 
michael@0: ; Shutdown code, stage 2-1/2.
michael@0: 
michael@0:         ADD,DC  %r0,%r4,%r4         ; Cycle 1
michael@0:         LDO     SIXTEEN(%r23),%r23  ; Cycle 2
michael@0:         LDD     UN_EIGHT(%r23),%r21 ; Cycle 3
michael@0:         ADD     %r3,%r1,%r1
michael@0:         STD     %r1,UN_SIXTEEN(%r23)
michael@0:         ADD,DC  %r21,%r4,%r1
michael@0:         B       $JOIN1
michael@0:         LDO     EIGHT(%r23),%r23
michael@0: 
michael@0: ; exit: restore r3 and r4, release the frame, and return through r2.
michael@0: 
michael@0: $L0
michael@0:         LDW     -124(%sp),%r4       ; restore r4
michael@0:         BVE     (%r2)               ; return to caller
michael@0:         .EXIT
michael@0:         LDW,MB  -128(%sp),%r3       ; delay slot: sp -= 128, then restore r3
michael@0: 
michael@0:         .PROCEND
michael@0: 
michael@0: ; ***************************************************************
michael@0: ;
michael@0: ;                 add_diag_[little/big]
michael@0: ;
michael@0: ; ***************************************************************
michael@0: 
michael@0: ; The arguments are as follows:
michael@0: ;     r2   return PC, of course
michael@0: ;     r26 = arg1 =  length
michael@0: ;     r25 = arg2 =  vector to square
michael@0: ;     r24 = arg3 =  result vector
michael@0: ; add_diag_[little/big]: for each 64-bit word of the vector at r25 (r26 words), form its 128-bit square and add it into the next two words of the vector at r24, propagating carries.
michael@0: #ifdef LITTLE_WORDIAN
michael@0: add_diag_little
michael@0: #else
michael@0: add_diag_big
michael@0: #endif
michael@0:         .PROC
michael@0:         .CALLINFO FRAME=120,ENTRY_GR=4
michael@0:         .ENTRY
michael@0:         STW,MA  %r3,128(%sp)        ; save r3 at old sp, then sp += 128
michael@0:         STW     %r4,-124(%sp)       ; save r4 at old sp + 4
michael@0: 
michael@0:         ADDIB,< -1,%r26,$Z0         ; r26 = N-1.  If N=0, exit immediately.
michael@0:         NOP
michael@0: 
michael@0: ; Startup code
michael@0: 
michael@0:         FLDD    0(%r25),%fr7        ; Cycle 2 (alternate body)
michael@0:         XMPYU   %fr7R,%fr7R,%fr29   ; Cycle 4: lo * lo
michael@0:         XMPYU   %fr7L,%fr7R,%fr27   ; Cycle 5: hi * lo (cross product)
michael@0:         XMPYU   %fr7L,%fr7L,%fr30   ; hi * hi
michael@0:         LDO     SIXTEEN(%r25),%r25  ; Cycle 6
michael@0:         FSTD    %fr29,-88(%sp)
michael@0:         FSTD    %fr27,-72(%sp)      ; Cycle 7
michael@0:         CMPIB,= 0,%r26,$DIAG_N_IS_ONE ; Cycle 1 (main body)
michael@0:         FSTD    %fr30,-96(%sp)
michael@0:         FLDD    UN_EIGHT(%r25),%fr7 ; Cycle 2
michael@0:         LDD     -88(%sp),%r22       ; Cycle 3
michael@0:         LDD     -72(%sp),%r31       ; Cycle 4
michael@0:         XMPYU   %fr7R,%fr7R,%fr28
michael@0:         XMPYU   %fr7L,%fr7R,%fr24   ; Cycle 5
michael@0:         XMPYU   %fr7L,%fr7L,%fr31
michael@0:         LDD     -96(%sp),%r20       ; Cycle 6
michael@0:         FSTD    %fr28,-80(%sp)
michael@0:         ADD     %r0,%r0,%r0         ; clear the carry bit
michael@0:         ADDIB,<= -2,%r26,$ENDDIAGLOOP ; Cycle 7: r26 = N-3, skip loop if <= 0
michael@0:         FSTD    %fr24,-64(%sp)
michael@0: 
michael@0: ; Here is the loop.  It is unrolled twice, modelled after the "alternate body" and then the "main body".
michael@0: 
michael@0: $DIAGLOOP
michael@0:         SHRPD   %r31,%r0,31,%r3     ; Cycle 1 (alternate body): r3 = cross << 33
michael@0:         LDO     SIXTEEN(%r25),%r25
michael@0:         LDD     0(%r24),%r1
michael@0:         FSTD    %fr31,-104(%sp)
michael@0:         SHRPD   %r0,%r31,31,%r4     ; Cycle 2: r4 = cross >> 31
michael@0:         ADD,DC  %r22,%r3,%r3
michael@0:         FLDD    UN_SIXTEEN(%r25),%fr7   
michael@0:         ADD,DC  %r0,%r20,%r20       ; Cycle 3
michael@0:         ADD     %r1,%r3,%r3
michael@0:         XMPYU   %fr7R,%fr7R,%fr29   ; Cycle 4
michael@0:         LDD     -80(%sp),%r21
michael@0:         STD     %r3,0(%r24)
michael@0:         XMPYU   %fr7L,%fr7R,%fr27   ; Cycle 5
michael@0:         XMPYU   %fr7L,%fr7L,%fr30
michael@0:         LDD     -64(%sp),%r29       
michael@0:         LDD     EIGHT(%r24),%r1  
michael@0:         ADD,DC  %r4,%r20,%r20       ; Cycle 6
michael@0:         LDD     -104(%sp),%r19
michael@0:         FSTD    %fr29,-88(%sp)
michael@0:         ADD     %r20,%r1,%r1        ; Cycle 7
michael@0:         FSTD    %fr27,-72(%sp)
michael@0:         SHRPD   %r29,%r0,31,%r4     ; Cycle 1 (main body)
michael@0:         LDO     THIRTY_TWO(%r24),%r24
michael@0:         LDD     UN_SIXTEEN(%r24),%r28
michael@0:         FSTD    %fr30,-96(%sp)
michael@0:         SHRPD   %r0,%r29,31,%r3     ; Cycle 2
michael@0:         ADD,DC  %r21,%r4,%r4
michael@0:         FLDD    UN_EIGHT(%r25),%fr7
michael@0:         STD     %r1,UN_TWENTY_FOUR(%r24)
michael@0:         ADD,DC  %r0,%r19,%r19       ; Cycle 3
michael@0:         ADD     %r28,%r4,%r4
michael@0:         XMPYU   %fr7R,%fr7R,%fr28   ; Cycle 4
michael@0:         LDD     -88(%sp),%r22
michael@0:         STD     %r4,UN_SIXTEEN(%r24)
michael@0:         XMPYU   %fr7L,%fr7R,%fr24   ; Cycle 5
michael@0:         XMPYU   %fr7L,%fr7L,%fr31
michael@0:         LDD     -72(%sp),%r31
michael@0:         LDD     UN_EIGHT(%r24),%r28
michael@0:         ADD,DC  %r3,%r19,%r19       ; Cycle 6
michael@0:         LDD     -96(%sp),%r20
michael@0:         FSTD    %fr28,-80(%sp)
michael@0:         ADD     %r19,%r28,%r28      ; Cycle 7
michael@0:         FSTD    %fr24,-64(%sp)
michael@0:         ADDIB,> -2,%r26,$DIAGLOOP   ; Cycle 8
michael@0:         STD     %r28,UN_EIGHT(%r24)
michael@0: 
michael@0: $ENDDIAGLOOP
michael@0: 
michael@0:         ADD,DC  %r0,%r22,%r22    
michael@0:         CMPIB,= 0,%r26,$ONEMOREDIAG ; r26 = 0: one more input word remains
michael@0:         SHRPD   %r31,%r0,31,%r3
michael@0: 
michael@0: ; Shutdown code, first stage.
michael@0: 
michael@0:         FSTD    %fr31,-104(%sp)     ; Cycle 1 (alternate body)
michael@0:         LDD     0(%r24),%r28
michael@0:         SHRPD   %r0,%r31,31,%r4     ; Cycle 2
michael@0:         ADD     %r3,%r22,%r3
michael@0:         ADD,DC  %r0,%r20,%r20       ; Cycle 3
michael@0:         LDD     -80(%sp),%r21
michael@0:         ADD     %r3,%r28,%r3
michael@0:         LDD     -64(%sp),%r29       ; Cycle 4
michael@0:         STD     %r3,0(%r24)
michael@0:         LDD     EIGHT(%r24),%r1     ; Cycle 5
michael@0:         LDO     SIXTEEN(%r25),%r25  ; Cycle 6
michael@0:         LDD     -104(%sp),%r19
michael@0:         ADD,DC  %r4,%r20,%r20
michael@0:         ADD     %r20,%r1,%r1        ; Cycle 7
michael@0:         ADD,DC  %r0,%r21,%r21       ; Cycle 8
michael@0:         STD     %r1,EIGHT(%r24)
michael@0: 
michael@0: ; Shutdown code, second stage.
michael@0: 
michael@0:         SHRPD   %r29,%r0,31,%r4     ; Cycle 1 (main body)
michael@0:         LDO     THIRTY_TWO(%r24),%r24
michael@0:         LDD     UN_SIXTEEN(%r24),%r1
michael@0:         SHRPD   %r0,%r29,31,%r3      ; Cycle 2
michael@0:         ADD     %r4,%r21,%r4
michael@0:         ADD,DC  %r0,%r19,%r19       ; Cycle 3
michael@0:         ADD     %r4,%r1,%r4
michael@0:         STD     %r4,UN_SIXTEEN(%r24); Cycle 4
michael@0:         LDD     UN_EIGHT(%r24),%r28 ; Cycle 5
michael@0:         ADD,DC  %r3,%r19,%r19       ; Cycle 6       
michael@0:         ADD     %r19,%r28,%r28      ; Cycle 7
michael@0:         ADD,DC  %r0,%r0,%r22        ; Cycle 8: r22 = carry out of the preceding add
michael@0:         CMPIB,*= 0,%r22,$Z0         ; if no overflow, exit
michael@0:         STD     %r28,UN_EIGHT(%r24)
michael@0: 
michael@0: ; Final carry propagation: add 1 to successive result words until one does not wrap to zero.
michael@0: 
michael@0: $FDIAG2
michael@0:         LDO     EIGHT(%r24),%r24
michael@0:         LDD     UN_EIGHT(%r24),%r26
michael@0:         ADDI    1,%r26,%r26
michael@0:         CMPIB,*= 0,%r26,$FDIAG2     ; Keep looping if there is a carry.
michael@0:         STD     %r26,UN_EIGHT(%r24)
michael@0: 
michael@0:         B   $Z0
michael@0:         NOP
michael@0: 
michael@0: ; Here is the code that handles the difficult case N=1.
michael@0: ; We do the usual trick -- branch out of the startup code at appropriate
michael@0: ; points, and branch into the shutdown code.
michael@0: 
michael@0: $DIAG_N_IS_ONE
michael@0: 
michael@0:         LDD     -88(%sp),%r22
michael@0:         LDD     -72(%sp),%r31
michael@0:         B       $JOINDIAG
michael@0:         LDD     -96(%sp),%r20
michael@0: 
michael@0: ; We came out of the unrolled loop with wrong parity.  Do one more
michael@0: ; single cycle.  This is the "alternate body".  It will, of course,
michael@0: ; give us opposite registers from the other case, so we need
michael@0: ; completely different shutdown code.
michael@0: 
michael@0: $ONEMOREDIAG
michael@0:         FSTD    %fr31,-104(%sp)     ; Cycle 1 (alternate body)
michael@0:         LDD     0(%r24),%r28
michael@0:         FLDD    0(%r25),%fr7        ; Cycle 2
michael@0:         SHRPD   %r0,%r31,31,%r4
michael@0:         ADD     %r3,%r22,%r3
michael@0:         ADD,DC  %r0,%r20,%r20       ; Cycle 3
michael@0:         LDD     -80(%sp),%r21
michael@0:         ADD     %r3,%r28,%r3
michael@0:         LDD     -64(%sp),%r29       ; Cycle 4
michael@0:         STD     %r3,0(%r24)
michael@0:         XMPYU   %fr7R,%fr7R,%fr29
michael@0:         LDD     EIGHT(%r24),%r1     ; Cycle 5
michael@0:         XMPYU   %fr7L,%fr7R,%fr27
michael@0:         XMPYU   %fr7L,%fr7L,%fr30
michael@0:         LDD     -104(%sp),%r19      ; Cycle 6
michael@0:         FSTD    %fr29,-88(%sp)
michael@0:         ADD,DC  %r4,%r20,%r20
michael@0:         FSTD    %fr27,-72(%sp)      ; Cycle 7
michael@0:         ADD     %r20,%r1,%r1
michael@0:         ADD,DC  %r0,%r21,%r21       ; Cycle 8
michael@0:         STD     %r1,EIGHT(%r24)
michael@0: 
michael@0: ; Shutdown code, first stage.
michael@0: 
michael@0:         SHRPD   %r29,%r0,31,%r4     ; Cycle 1 (main body)
michael@0:         LDO     THIRTY_TWO(%r24),%r24
michael@0:         FSTD    %fr30,-96(%sp)
michael@0:         LDD     UN_SIXTEEN(%r24),%r1
michael@0:         SHRPD   %r0,%r29,31,%r3     ; Cycle 2
michael@0:         ADD     %r4,%r21,%r4
michael@0:         ADD,DC  %r0,%r19,%r19       ; Cycle 3
michael@0:         LDD     -88(%sp),%r22
michael@0:         ADD     %r4,%r1,%r4
michael@0:         LDD     -72(%sp),%r31       ; Cycle 4
michael@0:         STD     %r4,UN_SIXTEEN(%r24)
michael@0:         LDD     UN_EIGHT(%r24),%r28 ; Cycle 5
michael@0:         LDD     -96(%sp),%r20       ; Cycle 6
michael@0:         ADD,DC  %r3,%r19,%r19
michael@0:         ADD     %r19,%r28,%r28      ; Cycle 7
michael@0:         ADD,DC  %r0,%r22,%r22       ; Cycle 8
michael@0:         STD     %r28,UN_EIGHT(%r24)
michael@0: 
michael@0: ; Shutdown code, second stage.
michael@0: 
michael@0: $JOINDIAG
michael@0:         SHRPD   %r31,%r0,31,%r3     ; Cycle 1 (alternate body)
michael@0:         LDD     0(%r24),%r28        
michael@0:         SHRPD   %r0,%r31,31,%r4     ; Cycle 2
michael@0:         ADD     %r3,%r22,%r3
michael@0:         ADD,DC  %r0,%r20,%r20       ; Cycle 3
michael@0:         ADD     %r3,%r28,%r3
michael@0:         STD     %r3,0(%r24)         ; Cycle 4
michael@0:         LDD     EIGHT(%r24),%r1     ; Cycle 5
michael@0:         ADD,DC  %r4,%r20,%r20
michael@0:         ADD     %r20,%r1,%r1        ; Cycle 7
michael@0:         ADD,DC  %r0,%r0,%r21        ; Cycle 8: r21 = carry out of the preceding add
michael@0:         CMPIB,*= 0,%r21,$Z0         ; if no overflow, exit
michael@0:         STD     %r1,EIGHT(%r24)
michael@0: 
michael@0: ; Final carry propagation: add 1 to successive result words until one does not wrap to zero.
michael@0: 
michael@0: $FDIAG1
michael@0:         LDO     EIGHT(%r24),%r24
michael@0:         LDD     EIGHT(%r24),%r26
michael@0:         ADDI    1,%r26,%r26
michael@0:         CMPIB,*= 0,%r26,$FDIAG1    ; Keep looping if there is a carry.
michael@0:         STD     %r26,EIGHT(%r24)
michael@0: 
michael@0: $Z0
michael@0:         LDW     -124(%sp),%r4       ; restore r4
michael@0:         BVE     (%r2)               ; return to caller
michael@0:         .EXIT
michael@0:         LDW,MB  -128(%sp),%r3       ; delay slot: sp -= 128, then restore r3
michael@0:         .PROCEND
michael@0: ;	.ALLOW
michael@0: 
michael@0:         .SPACE         $TEXT$
michael@0:         .SUBSPA        $CODE$
michael@0: ; Export the two entry points for whichever word order was assembled.
michael@0: ; GNU-as (as of 2.19) does not support LONG_RETURN, so that attribute is
michael@0: ; left off whenever __GNUC__ is defined, for both naming variants.
michael@0: #ifdef LITTLE_WORDIAN
michael@0: #ifdef __GNUC__
michael@0:         .EXPORT        maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
michael@0:         .EXPORT        add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR
michael@0: #else
michael@0:         .EXPORT        maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN
michael@0:         .EXPORT        add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN
michael@0: #endif
michael@0: #else
michael@0: #ifdef __GNUC__
michael@0:         .EXPORT        maxpy_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
michael@0:         .EXPORT        add_diag_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR
michael@0: #else
michael@0:         .EXPORT        maxpy_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN
michael@0:         .EXPORT        add_diag_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN
michael@0: #endif
michael@0: #endif
michael@0:         .END
michael@0: 
michael@0: 
michael@0: ; How to use "maxpy_PA20_little" and "maxpy_PA20_big"
michael@0: ; 
michael@0: ; The routine "maxpy_PA20_little" or "maxpy_PA20_big" (exported by
michael@0: ; this file under the names "maxpy_little" and "maxpy_big")
michael@0: ; performs a 64-bit x any-size multiply, and adds the
michael@0: ; result to an area of memory.  That is, it performs
michael@0: ; something like
michael@0: ; 
michael@0: ;      A B C D
michael@0: ;    *       Z
michael@0: ;   __________
michael@0: ;    P Q R S T
michael@0: ; 
michael@0: ; and then adds the "PQRST" vector into an area of memory,
michael@0: ; handling all carries.
michael@0: ; 
michael@0: ; Digression on nomenclature and endian-ness:
michael@0: ; 
michael@0: ; Each of the capital letters in the above represents a 64-bit
michael@0: ; quantity.  That is, you could think of the discussion as
michael@0: ; being in terms of radix-16-quintillion arithmetic.  The data
michael@0: ; type being manipulated is "unsigned long long int".  This
michael@0: ; requires the 64-bit extension of the HP-UX C compiler,
michael@0: ; available at release 10.  You need these compiler flags to
michael@0: ; enable these extensions:
michael@0: ; 
michael@0: ;       -Aa +e +DA2.0 +DS2.0
michael@0: ; 
michael@0: ; (The first specifies ANSI C, the second enables the
michael@0: ; extensions, which are beyond ANSI C, and the third and
michael@0: ; fourth tell the compiler to use whatever features of the
michael@0: ; PA2.0 architecture it wishes, in order to make the code more
michael@0: ; efficient.  Since the presence of the assembly code will
michael@0: ; make the program unable to run on anything less than PA2.0,
michael@0: ; you might as well gain the performance enhancements in the C
michael@0: ; code as well.)
michael@0: ; 
michael@0: ; Questions of "endian-ness" often come up, usually in the
michael@0: ; context of byte ordering in a word.  These routines have a
michael@0: ; similar issue, that could be called "wordian-ness".
michael@0: ; Independent of byte ordering (PA is always big-endian), one
michael@0: ; can make two choices when representing extremely large
michael@0: ; numbers as arrays of 64-bit doublewords in memory.
michael@0: ; 
michael@0: ; "Little-wordian" layout means that the least significant
michael@0: ; word of a number is stored at the lowest address.
michael@0: ; 
michael@0: ;   MSW     LSW
michael@0: ;    |       |
michael@0: ;    V       V
michael@0: ; 
michael@0: ;    A B C D E
michael@0: ; 
michael@0: ;    ^     ^ ^
michael@0: ;    |     | |____ address 0
michael@0: ;    |     |
michael@0: ;    |     |_______address 8
michael@0: ;    |
michael@0: ;    address 32
michael@0: ; 
michael@0: ; "Big-wordian" means that the most significant word is at the
michael@0: ; lowest address.
michael@0: ; 
michael@0: ;   MSW     LSW
michael@0: ;    |       |
michael@0: ;    V       V
michael@0: ; 
michael@0: ;    A B C D E
michael@0: ; 
michael@0: ;    ^     ^ ^
michael@0: ;    |     | |____ address 32
michael@0: ;    |     |
michael@0: ;    |     |_______address 24
michael@0: ;    |
michael@0: ;    address 0
michael@0: ; 
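michael@0: ; When you compile the file, you must specify one or the other, with
michael@0: ; a switch "-DLITTLE_WORDIAN" or "-DBIG_WORDIAN".  (NOTE: this copy of
michael@0: ; the file already contains "#define LITTLE_WORDIAN 1" near the top, so
michael@0: ; that line has to be removed before "-DBIG_WORDIAN" can be used.)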
michael@0: ; 
michael@0: ;     Incidentally, you assemble this file as part of your
michael@0: ;     project with the same C compiler as the rest of the program.
michael@0: ;     My "makefile" for a superprecision arithmetic package has
michael@0: ;     the following stuff:
michael@0: ; 
michael@0: ;     # definitions:
michael@0: ;     CC = cc -Aa +e -z +DA2.0 +DS2.0 +w1
michael@0: ;     CFLAGS = +O3
michael@0: ;     LDFLAGS = -L /usr/lib -Wl,-aarchive
michael@0: ; 
michael@0: ;     # general build rule for ".s" files:
michael@0: ;     .s.o:
michael@0: ;             $(CC) $(CFLAGS) -c $< -DBIG_WORDIAN
michael@0: ; 
michael@0: ;     # Now any bind step that calls for pa20.o will assemble pa20.s
michael@0: ; 
michael@0: ; End of digression, back to arithmetic:
michael@0: ; 
michael@0: ; The way we multiply two huge numbers is, of course, to multiply
michael@0: ; the "ABCD" vector by each of the "WXYZ" doublewords, adding
michael@0: ; the result vectors with increasing offsets, the way we learned
michael@0: ; in school, back before we all used calculators:
michael@0: ; 
michael@0: ;            A B C D
michael@0: ;          * W X Y Z
michael@0: ;         __________
michael@0: ;          P Q R S T
michael@0: ;        E F G H I
michael@0: ;      M N O P Q
michael@0: ;  + R S T U V
michael@0: ;    _______________
michael@0: ;    F I N A L S U M
michael@0: ; 
michael@0: ; So we call maxpy_PA20_big (in my case; my package is
michael@0: ; big-wordian) repeatedly, giving the W, X, Y, and Z arguments
michael@0: ; in turn as the "scalar", and giving the "ABCD" vector each
michael@0: ; time.  We direct it to add its result into an area of memory
michael@0: ; that we have cleared at the start.  We skew the exact
michael@0: ; location into that area with each call.
michael@0: ; 
michael@0: ; The prototype for the function is
michael@0: ; 
michael@0: ; extern void maxpy_PA20_big(
michael@0: ;    int length,        /* Number of doublewords in the multiplicand vector. */
michael@0: ;    const long long int *scalaraddr,    /* Address to fetch the scalar. */
michael@0: ;    const long long int *multiplicand,  /* The multiplicand vector. */
michael@0: ;    long long int *result);             /* Where to accumulate the result. */
michael@0: ; 
michael@0: ; (You should place a copy of this prototype in an include file
michael@0: ; or in your C file.)
michael@0: ; 
michael@0: ; Now, IN ALL CASES, the given address for the multiplicand or
michael@0: ; the result is that of the LEAST SIGNIFICANT DOUBLEWORD.
michael@0: ; That word is, of course, the word at which the routine
michael@0: ; starts processing.  "maxpy_PA20_little" then increases the
michael@0: ; addresses as it computes.  "maxpy_PA20_big" decreases them.
michael@0: ; 
michael@0: ; In our example above, "length" would be 4 in each case.
michael@0: ; "multiplicand" would be the "ABCD" vector.  Specifically,
michael@0: ; the address of the element "D".  "scalaraddr" would be the
michael@0: ; address of "W", "X", "Y", or "Z" on the four calls that we
michael@0: ; would make.  (The order doesn't matter, of course.)
michael@0: ; "result" would be the appropriate address in the result
michael@0: ; area.  When multiplying by "Z", that would be the least
michael@0: ; significant word.  When multiplying by "Y", it would be the
michael@0: ; next higher word (8 bytes higher if little-wordian; 8 bytes
michael@0: ; lower if big-wordian), and so on.  The size of the result
michael@0: ; area must be the sum of the sizes of the multiplicand
michael@0: ; and multiplier vectors, and must be initialized to zero
michael@0: ; before we start.
michael@0: ; 
michael@0: ; Whenever the routine adds its partial product into the result
michael@0: ; vector, it follows carry chains as far as they need to go.
michael@0: ; 
michael@0: ; Here is the super-precision multiply routine that I use for
michael@0: ; my package.  The package is big-wordian.  I have taken out
michael@0: ; handling of exponents (it's a floating point package):
michael@0: ; 
michael@0: ; static void mul_PA20(
michael@0: ;   int size,
michael@0: ;   const long long int *arg1,
michael@0: ;   const long long int *arg2,
michael@0: ;   long long int *result)
michael@0: ; {
michael@0: ;    int i;
michael@0: ; 
michael@0: ;    for (i=0 ; i<2*size ; i++) result[i] = 0ULL;
michael@0: ; 
michael@0: ;    for (i=0 ; i<size ; i++) {
michael@0: ;       maxpy_PA20_big(size, &arg2[i], &arg1[size-1], &result[size+i]);
michael@0: ;    }
michael@0: ; }