; This Source Code Form is subject to the terms of the Mozilla Public
; License, v. 2.0. If a copy of the MPL was not distributed with this
; file, You can obtain one at http://mozilla.org/MPL/2.0/.

; File: security/nss/lib/freebl/arcfour-amd64-masm.asm
;
; ** ARCFOUR implementation optimized for AMD64.
; **
; ** The throughput achieved by this code is about 320 MBytes/sec, on
; ** a 1.8 GHz AMD Opteron (rev C0) processor.
;
; Syntax: MASM (ml64), Intel operand order.

.CODE

;-----------------------------------------------------------------------
; extern void ARCFOUR(RC4Context *cx, unsigned long long inputLen,
;                     const unsigned char *input, unsigned char *output);
;
; XORs inputLen bytes of `input` with the RC4 keystream generated from
; the state in `cx`, writes the result to `output`, and stores the
; updated x/y indices back into `cx`.
;
; ABI:   Microsoft x64
; In:    rcx = cx, rdx = inputLen, r8 = input, r9 = output
; Out:   nothing (void)
; Clobb: rax, rcx, rdx, r8, r9, r11, flags
;        (rbp, rbx, rsi, rdi are used but saved/restored)
;
; Context layout as accessed by this routine:
;        [cx + 0]        x index (read as a qword, low byte written back)
;        [cx + 8]        y index (read as a qword, low byte written back)
;        [cx + 16 + i*8] state entry i (8-byte stride, dword stores)
;
; Register roles in the body:
;        rbp = d  (cx + 16, base of the state table)
;        rcx = x, rdx = y, rax = tx (d[x] for the next round)
;        rbx = ty / val scratch, r8 = keystream accumulator
;        rsi = in, rdi = out, r9 = in + len - 8 (later in + len)
;        r11 = per-block byte counter
;
; NOTE(review): x is masked to 8 bits after loading but y is not; the
; code relies on the stored y having zero upper bytes - confirm that
; the context initialisation guarantees this.
;
; The procedure pushes nonvolatile registers, so it is declared with
; FRAME and describes its prologue (.pushreg/.endprolog) so that the
; OS can unwind through it if a fault occurs in the body.
;-----------------------------------------------------------------------

ARCFOUR PROC FRAME

        push    rbp
        .pushreg rbp
        push    rbx
        .pushreg rbx
        push    rsi
        .pushreg rsi
        push    rdi
        .pushreg rdi
        .endprolog

        mov     rbp, rcx                ; key = ARG(key)
        mov     rbx, rdx                ; rbx = ARG(len)
        mov     rsi, r8                 ; in  = ARG(in)
        mov     rdi, r9                 ; out = ARG(out)
        mov     rcx, [rbp]              ; x = key->x
        mov     rdx, [rbp+8]            ; y = key->y
        add     rbp, 16                 ; d = key->data
        inc     rcx                     ; x++
        and     rcx, 0ffh               ; x &= 0xff
        lea     rbx, [rbx+rsi-8]        ; rbx = in+len-8
        mov     r9, rbx                 ; tmp = in+len-8 (rbx is reused below)
        mov     rax, [rbp+rcx*8]        ; tx = d[x]
        cmp     rbx, rsi                ; cmp in with in+len-8
        jl      Lend                    ; fewer than 8 bytes: skip block loop

Lstart:
        add     rsi, 8                  ; increment in
        add     rdi, 8                  ; increment out

        ;
        ; generate the next 8 bytes of the rc4 stream into r8
        ;

        mov     r11, 8                  ; byte counter

Lgen8:
        add     dl, al                  ; y += tx
        mov     ebx, [rbp+rdx*8]        ; ty = d[y]
        mov     [rbp+rcx*8], ebx        ; d[x] = ty
        add     bl, al                  ; val = ty + tx
        mov     [rbp+rdx*8], eax        ; d[y] = tx
        inc     cl                      ; x++ (NEXT ROUND)
        mov     eax, [rbp+rcx*8]        ; tx = d[x] (NEXT ROUND)
        mov     r8b, [rbp+rbx*8]        ; val = d[val]
        dec     r11b
        ror     r8, 8                   ; (ror does not change ZF)
        jnz     Lgen8                   ; ZF still reflects dec r11b

        ;
        ; xor 8 bytes
        ;

        xor     r8, [rsi-8]
        cmp     rsi, r9                 ; cmp in+len-8 with in
        mov     [rdi-8], r8             ; (mov does not change flags)
        jle     Lstart                  ; loop while at least 8 bytes remain

Lend:
        add     r9, 8                   ; tmp = in+len

        ;
        ; handle the last bytes, one by one
        ;

Ltail:
        cmp     r9, rsi                 ; cmp in with in+len
        jle     Lfinished               ; jump if (in+len <= in)
        add     dl, al                  ; y += tx
        mov     ebx, [rbp+rdx*8]        ; ty = d[y]
        mov     [rbp+rcx*8], ebx        ; d[x] = ty
        add     bl, al                  ; val = ty + tx
        mov     [rbp+rdx*8], eax        ; d[y] = tx
        inc     cl                      ; x++ (NEXT ROUND)
        mov     eax, [rbp+rcx*8]        ; tx = d[x] (NEXT ROUND)
        mov     r8b, [rbp+rbx*8]        ; val = d[val]
        xor     r8b, [rsi]              ; xor 1 byte
        mov     [rdi], r8b
        inc     rsi                     ; in++
        inc     rdi                     ; out++
        jmp     Ltail

Lfinished:
        dec     rcx                     ; x-- (undo the look-ahead increment)
        mov     [rbp-8], dl             ; key->y = y (low byte only)
        mov     [rbp-16], cl            ; key->x = x (low byte only)

        pop     rdi
        pop     rsi
        pop     rbx
        pop     rbp
        ret

ARCFOUR ENDP

END