; This Source Code Form is subject to the terms of the Mozilla Public
; License, v. 2.0. If a copy of the MPL was not distributed with this
; file, You can obtain one at http://mozilla.org/MPL/2.0/.

; ** ARCFOUR implementation optimized for AMD64.
; **
; ** The throughput achieved by this code is about 320 MBytes/sec, on
; ** a 1.8 GHz AMD Opteron (rev C0) processor.

; Syntax: MASM (ml64).  ABI: Microsoft x64.

.CODE

;-----------------------------------------------------------------------
; extern void ARCFOUR(RC4Context *cx, unsigned long long inputLen,
;                     const unsigned char *input, unsigned char *output);
;
; XORs inputLen bytes of input with the RC4 keystream generated from the
; state in cx, writes the result to output, and stores the updated x and
; y indices back into cx.
;
; Context layout as accessed by this routine:
;   [cx+0]        x   (one 8-byte slot; only the low byte is used)
;   [cx+8]        y   (one 8-byte slot; only the low byte is used)
;   [cx+16+8*i]   d[i], i = 0..255 (8-byte slots; accessed as dwords)
;
; In:    rcx = cx, rdx = inputLen, r8 = input, r9 = output
; Out:   none
; Saved: rbp, rbx, rsi, rdi (pushed in the prolog, popped before ret)
; Clobb: rax, rcx, rdx, r8, r9, r11, flags
;
; Register roles in the body:
;   rbp = d (cx+16)           rcx = x, kept one step ahead
;   rdx = y                   rax = tx = d[x]
;   rbx = ty, then ty+tx      rsi = in, rdi = out
;   r8  = keystream bytes     r9  = in+len-8, later in+len
;   r11 = per-block byte counter
;
; The procedure pushes non-volatile registers, so it is declared with
; FRAME and describes its prolog with .pushreg/.endprolog; this gives the
; OS the unwind data it needs if a fault occurs while the buffers are
; being read or written.
;-----------------------------------------------------------------------

ARCFOUR PROC FRAME

        push    rbp
        .pushreg rbp
        push    rbx
        .pushreg rbx
        push    rsi
        .pushreg rsi
        push    rdi
        .pushreg rdi
        .endprolog

        mov     rbp, rcx                ; key = ARG(key)
        mov     rbx, rdx                ; rbx = ARG(len)
        mov     rsi, r8                 ; in  = ARG(in)
        mov     rdi, r9                 ; out = ARG(out)

        ; x and y are stored back as single bytes at the end, so load
        ; them as zero-extended bytes: both are used as table indices
        ; and must stay within 0..255.
        movzx   ecx, byte ptr [rbp]     ; x = key->x
        movzx   edx, byte ptr [rbp+8]   ; y = key->y
        add     rbp, 16                 ; d = key->data
        inc     rcx                     ; x++
        and     rcx, 0ffh               ; x &= 0xff
        lea     rbx, [rbx+rsi-8]        ; rbx = in+len-8
        mov     r9, rbx                 ; tmp = in+len-8
        mov     rax, [rbp+rcx*8]        ; tx = d[x]

        ; Signed compare is deliberate: with len < 8 and a very small
        ; `in`, in+len-8 wraps below zero and must still take the branch.
        cmp     rbx, rsi                ; cmp in with in+len-8
        jl      Lend                    ; fewer than 8 bytes: tail only

Lstart:
        add     rsi, 8                  ; increment in
        add     rdi, 8                  ; increment out

        ;
        ; generate the next 8 bytes of the rc4 stream into r8
        ;

        mov     r11, 8                  ; byte counter

@@:
        add     dl, al                  ; y += tx
        mov     ebx, [rbp+rdx*8]        ; ty = d[y]
        mov     [rbp+rcx*8], ebx        ; d[x] = ty
        add     bl, al                  ; val = ty + tx
        mov     [rbp+rdx*8], eax        ; d[y] = tx
        inc     cl                      ; x++ (NEXT ROUND)
        mov     eax, [rbp+rcx*8]        ; tx = d[x] (NEXT ROUND)
        mov     r8b, [rbp+rbx*8]        ; val = d[val]
        dec     r11b
        ror     r8, 8                   ; (ror does not change ZF)
        jnz     @b                      ; ZF still comes from dec r11b

        ;
        ; xor 8 bytes
        ;

        xor     r8, [rsi-8]
        cmp     rsi, r9                 ; cmp in+len-8 with in
        mov     [rdi-8], r8             ; mov leaves the cmp flags intact
        jle     Lstart                  ; at least 8 more bytes remain

Lend:
        add     r9, 8                   ; tmp = in+len

        ;
        ; handle the last bytes, one by one
        ;

@@:
        cmp     r9, rsi                 ; cmp in with in+len
        jle     Lfinished               ; jump if (in+len <= in)
        add     dl, al                  ; y += tx
        mov     ebx, [rbp+rdx*8]        ; ty = d[y]
        mov     [rbp+rcx*8], ebx        ; d[x] = ty
        add     bl, al                  ; val = ty + tx
        mov     [rbp+rdx*8], eax        ; d[y] = tx
        inc     cl                      ; x++ (NEXT ROUND)
        mov     eax, [rbp+rcx*8]        ; tx = d[x] (NEXT ROUND)
        mov     r8b, [rbp+rbx*8]        ; val = d[val]
        xor     r8b, [rsi]              ; xor 1 byte
        mov     [rdi], r8b
        inc     rsi                     ; in++
        inc     rdi                     ; out++
        jmp     @b

Lfinished:
        dec     rcx                     ; x-- (undo the look-ahead increment)
        mov     [rbp-8], dl             ; key->y = y
        mov     [rbp-16], cl            ; key->x = x

        pop     rdi
        pop     rsi
        pop     rbx
        pop     rbp
        ret

ARCFOUR ENDP

END