michael@0: ; This Source Code Form is subject to the terms of the Mozilla Public
michael@0: ; License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0: ; file, You can obtain one at http://mozilla.org/MPL/2.0/.
michael@0: 
michael@0: ; ** ARCFOUR implementation optimized for AMD64.
michael@0: ; **
michael@0: ; ** The throughput achieved by this code is about 320 MBytes/sec, on
michael@0: ; ** a 1.8 GHz AMD Opteron (rev C0) processor.
michael@0: 
michael@0: .CODE
michael@0: 
michael@0: ;-----------------------------------------------------------------------
michael@0: ; extern void ARCFOUR(RC4Context *cx, unsigned long long inputLen,
michael@0: ;                     const unsigned char *input, unsigned char *output);
michael@0: ;
michael@0: ; XORs inputLen bytes of input with the RC4 keystream, writes the result
michael@0: ; to output and advances the cipher state stored in *cx.
michael@0: ;
michael@0: ; Syntax:  MASM (ml64)
michael@0: ; ABI:     Microsoft x64
michael@0: ; In:      rcx = cx, rdx = inputLen, r8 = input, r9 = output
michael@0: ; Out:     nothing (rax is left holding a state entry)
michael@0: ; Clobb:   rax, rcx, rdx, r8, r9, r11, flags
michael@0: ; Saved:   rbp, rbx, rsi, rdi (pushed in the prolog, described by the
michael@0: ;          .pushreg unwind directives, popped before ret)
michael@0: ;
michael@0: ; Context layout, as accessed by this routine:
michael@0: ;          [cx]        = x   (only the low byte is written back)
michael@0: ;          [cx+8]      = y   (only the low byte is written back)
michael@0: ;          [cx+16+8*i] = d[i], one 8-byte slot per state entry.  The
michael@0: ;          loops read and write the low 32 bits of a slot and use the
michael@0: ;          loaded value as a table index, so every entry is expected
michael@0: ;          to be < 256 with the remaining bits of its slot zero.
michael@0: ;
michael@0: ; Register roles in the body:
michael@0: ;          rbp = d (cx + 16)          rsi = in      rdi = out
michael@0: ;          rcx = x, kept one step ahead (index of the NEXT byte)
michael@0: ;          rdx = y                    rax = tx = d[x]
michael@0: ;          rbx = scratch (ty, then the index ty + tx)
michael@0: ;          r8  = keystream accumulator, r11 = byte counter
michael@0: ;          r9  = in+len-8 during the 8-byte loop, in+len in the tail
michael@0: ;-----------------------------------------------------------------------
michael@0: 
michael@0: ARCFOUR PROC FRAME
michael@0: 
michael@0:         ; The pushes move rsp, so this is not a leaf function for the
michael@0:         ; Windows unwinder; each push is described for the unwind data.
michael@0:         push    rbp
michael@0:         .pushreg rbp
michael@0:         push    rbx
michael@0:         .pushreg rbx
michael@0:         push    rsi
michael@0:         .pushreg rsi
michael@0:         push    rdi
michael@0:         .pushreg rdi
michael@0:         .endprolog
michael@0: 
michael@0:         mov     rbp, rcx                        ; key = ARG(key)
michael@0:         mov     rbx, rdx                        ; rbx = ARG(len)
michael@0:         mov     rsi, r8                         ; in = ARG(in)
michael@0:         mov     rdi, r9                         ; out = ARG(out)
michael@0:         mov     rcx, [rbp]                      ; x = key->x
michael@0:         mov     rdx, [rbp+8]                    ; y = key->y
michael@0:         add     rbp, 16                         ; d = key->data
michael@0:         inc     rcx                             ; x++
michael@0:         and     rcx, 0ffh                       ; x &= 0xff
michael@0:         and     rdx, 0ffh                       ; y &= 0xff: y indexes d[] and only
michael@0:                                                 ; dl is ever updated, so drop any
michael@0:                                                 ; stale upper bits like for x
michael@0:         lea     rbx, [rbx+rsi-8]                ; rbx = in+len-8
michael@0:         mov     r9, rbx                         ; tmp = in+len-8
michael@0:         mov     rax, [rbp+rcx*8]                ; tx = d[x]
michael@0: 
michael@0:         ; Signed compare, keep it signed: with len < 8 the value in+len-8
michael@0:         ; lies below in, and for in = NULL, len = 0 it is -8, which only a
michael@0:         ; signed compare orders below in.  An unsigned jb would send that
michael@0:         ; call into the 8-byte loop.
michael@0:         cmp     rbx, rsi                        ; cmp in with in+len-8
michael@0:         jl      Lend                            ; jump if (in+len-8 < in)
michael@0: 
michael@0: Lstart:
michael@0:         ; Invariant on entry: at least 8 input bytes remain at rsi.
michael@0:         add     rsi, 8                          ; increment in
michael@0:         add     rdi, 8                          ; increment out
michael@0: 
michael@0:         ;
michael@0:         ; generate the next 8 bytes of the rc4 stream into r8
michael@0:         ; (each new byte goes into r8b and is rotated to the top, so
michael@0:         ; after 8 rounds the first byte sits in bits 0-7 and all of the
michael@0:         ; previous contents of r8 have been replaced)
michael@0:         ;
michael@0: 
michael@0:         mov     r11, 8                          ; byte counter
michael@0: 
michael@0: @@:
michael@0:         add     dl, al                          ; y += tx
michael@0:         mov     ebx, [rbp+rdx*8]                ; ty = d[y]
michael@0:         mov     [rbp+rcx*8], ebx                ; d[x] = ty
michael@0:         add     bl, al                          ; val = ty + tx
michael@0:         mov     [rbp+rdx*8], eax                ; d[y] = tx
michael@0:         inc     cl                              ; x++ (NEXT ROUND)
michael@0:         mov     eax, [rbp+rcx*8]                ; tx = d[x] (NEXT ROUND)
michael@0:         mov     r8b, [rbp+rbx*8]                ; val = d[val]
michael@0:         dec     r11b
michael@0:         ror     r8, 8                           ; (ror does not change ZF)
michael@0:         jnz     @b                              ; ZF still from dec r11b
michael@0: 
michael@0:         ;
michael@0:         ; xor 8 bytes
michael@0:         ;
michael@0: 
michael@0:         xor     r8, [rsi-8]
michael@0:         cmp     rsi, r9                         ; cmp in+len-8 with in
michael@0:         mov     [rdi-8], r8                     ; mov leaves the cmp flags intact
michael@0:         jle     Lstart                          ; loop while 8 more bytes remain
michael@0: 
michael@0: Lend:
michael@0:         add     r9, 8                           ; tmp = in+len
michael@0: 
michael@0:         ;
michael@0:         ; handle the last bytes, one by one
michael@0:         ;
michael@0: 
michael@0: @@:
michael@0:         cmp     r9, rsi                         ; cmp in with in+len
michael@0:         jle     Lfinished                       ; jump if (in+len <= in)
michael@0:         add     dl, al                          ; y += tx
michael@0:         mov     ebx, [rbp+rdx*8]                ; ty = d[y]
michael@0:         mov     [rbp+rcx*8], ebx                ; d[x] = ty
michael@0:         add     bl, al                          ; val = ty + tx
michael@0:         mov     [rbp+rdx*8], eax                ; d[y] = tx
michael@0:         inc     cl                              ; x++ (NEXT ROUND)
michael@0:         mov     eax, [rbp+rcx*8]                ; tx = d[x] (NEXT ROUND)
michael@0:         mov     r8b, [rbp+rbx*8]                ; val = d[val]
michael@0:         xor     r8b, [rsi]                      ; xor 1 byte
michael@0:         mov     [rdi], r8b
michael@0:         inc     rsi                             ; in++
michael@0:         inc     rdi                             ; out++
michael@0:         jmp     @b
michael@0: 
michael@0: Lfinished:
michael@0:         ; x was kept one step ahead of the last byte produced; undo that
michael@0:         ; before saving.  Only the low byte of x and of y is stored.
michael@0:         dec     rcx                             ; x--
michael@0:         mov     [rbp-8], dl                     ; key->y = y
michael@0:         mov     [rbp-16], cl                    ; key->x = x
michael@0: 
michael@0:         ; Epilogue: pops of the saved registers in reverse order, then ret.
michael@0:         pop     rdi
michael@0:         pop     rsi
michael@0:         pop     rbx
michael@0:         pop     rbp
michael@0:         ret
michael@0: 
michael@0: ARCFOUR ENDP
michael@0: 
michael@0: END