security/nss/lib/freebl/arcfour-amd64-masm.asm

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/security/nss/lib/freebl/arcfour-amd64-masm.asm	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,107 @@
     1.4 +; This Source Code Form is subject to the terms of the Mozilla Public
     1.5 +; License, v. 2.0. If a copy of the MPL was not distributed with this
     1.6 +; file, You can obtain one at http://mozilla.org/MPL/2.0/.
     1.7 +
     1.8 +; ** ARCFOUR implementation optimized for AMD64.
     1.9 +; **
    1.10 +; ** The throughput achieved by this code is about 320 MBytes/sec, on
    1.11 +; ** a 1.8 GHz AMD Opteron (rev C0) processor.
    1.12 +
    1.13 +.CODE
    1.14 +
    1.15 +; extern void ARCFOUR(RC4Context *cx, unsigned long long inputLen, 
    1.16 +;                     const unsigned char *input, unsigned char *output);
    1.17 +
    1.18 +
    1.19 +ARCFOUR PROC
    1.20 +
    1.21 +        push    rbp
    1.22 +        push    rbx
    1.23 +        push    rsi
    1.24 +        push    rdi
    1.25 +
    1.26 +        mov     rbp, rcx                        ; key = ARG(key)
    1.27 +        mov     rbx, rdx                        ; rbx = ARG(len)
    1.28 +        mov     rsi, r8                         ; in = ARG(in)
    1.29 +        mov     rdi, r9                         ; out = ARG(out)
    1.30 +        mov     rcx, [rbp]                      ; x = key->x
    1.31 +        mov     rdx, [rbp+8]                    ; y = key->y
    1.32 +        add     rbp, 16                         ; d = key->data
    1.33 +        inc     rcx                             ; x++
    1.34 +        and     rcx, 0ffh                       ; x &= 0xff
    1.35 +        lea     rbx, [rbx+rsi-8]                ; rbx = in+len-8
    1.36 +        mov     r9, rbx                         ; tmp = in+len-8
    1.37 +        mov     rax, [rbp+rcx*8]                ; tx = d[x]
    1.38 +        cmp     rbx, rsi                        ; cmp in with in+len-8
    1.39 +        jl      Lend                            ; jump if (in+len-8 < in)
    1.40 +
    1.41 +Lstart:
    1.42 +        add     rsi, 8                          ; increment in
    1.43 +        add     rdi, 8                          ; increment out
    1.44 +
    1.45 +        ;
    1.46 +        ; generate the next 8 bytes of the rc4 stream into r8
    1.47 +        ;
    1.48 +
    1.49 +        mov     r11, 8                          ; byte counter
    1.50 +
    1.51 +@@:
    1.52 +        add     dl, al                          ; y += tx
    1.53 +        mov     ebx, [rbp+rdx*8]                ; ty = d[y]
    1.54 +        mov     [rbp+rcx*8], ebx                ; d[x] = ty
    1.55 +        add     bl, al                          ; val = ty + tx
    1.56 +        mov     [rbp+rdx*8], eax                ; d[y] = tx
    1.57 +        inc     cl                              ; x++ (NEXT ROUND)
    1.58 +        mov     eax, [rbp+rcx*8]                ; tx = d[x] (NEXT ROUND)
    1.59 +        mov     r8b, [rbp+rbx*8]                ; val = d[val]
    1.60 +        dec     r11b
    1.61 +        ror     r8, 8                           ; (ror does not change ZF)
    1.62 +        jnz     @b
    1.63 +
    1.64 +        ;
    1.65 +        ; xor 8 bytes
    1.66 +        ;
    1.67 +
    1.68 +        xor     r8, [rsi-8]
    1.69 +        cmp     rsi, r9                         ; cmp in+len-8 with in
    1.70 +        mov     [rdi-8], r8
    1.71 +        jle     Lstart
    1.72 +
    1.73 +Lend:
    1.74 +        add     r9, 8                           ; tmp = in+len
    1.75 +
    1.76 +        ;
    1.77 +        ; handle the last bytes, one by one
    1.78 +        ;
    1.79 +
    1.80 +@@:
    1.81 +        cmp     r9, rsi                         ; cmp in with in+len
    1.82 +        jle     Lfinished                       ; jump if (in+len <= in)
    1.83 +        add     dl, al                          ; y += tx
    1.84 +        mov     ebx, [rbp+rdx*8]                ; ty = d[y]
    1.85 +        mov     [rbp+rcx*8], ebx                ; d[x] = ty
    1.86 +        add     bl, al                          ; val = ty + tx
    1.87 +        mov     [rbp+rdx*8], eax                ; d[y] = tx
    1.88 +        inc     cl                              ; x++ (NEXT ROUND)
    1.89 +        mov     eax, [rbp+rcx*8]                ; tx = d[x] (NEXT ROUND)
    1.90 +        mov     r8b, [rbp+rbx*8]                ; val = d[val]
    1.91 +        xor     r8b, [rsi]                      ; xor 1 byte
    1.92 +        mov     [rdi], r8b
    1.93 +        inc     rsi                             ; in++
    1.94 +        inc     rdi
    1.95 +        jmp     @b
    1.96 +
    1.97 +Lfinished:
    1.98 +        dec     rcx                             ; x--
    1.99 +        mov     [rbp-8], dl                     ; key->y = y
   1.100 +        mov     [rbp-16], cl                    ; key->x = x
   1.101 +
   1.102 +        pop     rdi
   1.103 +        pop     rsi
   1.104 +        pop     rbx
   1.105 +        pop     rbp
   1.106 +        ret
   1.107 +
   1.108 +ARCFOUR ENDP
   1.109 +
   1.110 +END

mercurial