|
1 ; This Source Code Form is subject to the terms of the Mozilla Public |
|
2 ; License, v. 2.0. If a copy of the MPL was not distributed with this |
|
3 ; file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
4 |
|
5 ; ** ARCFOUR implementation optimized for AMD64. |
|
6 ; ** |
|
7 ; ** The throughput achieved by this code is about 320 MBytes/sec, on |
|
8 ; ** a 1.8 GHz AMD Opteron (rev C0) processor. |
|
9 |
|
10 .CODE |
|
11 |
|
;-----------------------------------------------------------------------
; extern void ARCFOUR(RC4Context *cx, unsigned long long inputLen,
;                     const unsigned char *input, unsigned char *output);
;
; Produces inputLen bytes of ARCFOUR keystream from the state held in cx,
; XORs each keystream byte with the matching byte of input and stores the
; result to output.  The x/y indices in cx are written back on exit.
;
; In:    rcx = cx, rdx = inputLen, r8 = input, r9 = output
; Out:   no meaningful return value (rax is left holding a table entry)
; Saves: rbx, rbp, rsi, rdi (pushed in the prolog, popped in the epilog)
; Clobb: rax, rcx, rdx, r8, r9, r11, flags
;
; Context layout, as accessed by this routine:
;   [cx]      x index  (read as a qword, low byte written back)
;   [cx+8]    y index  (read as a qword, low byte written back)
;   [cx+16]   state table d[], one entry every 8 bytes
;
; Register roles after the setup code:
;   rbp = d        rcx = x          rdx = y         rax = tx
;   rbx = ty/val   rsi = in         rdi = out
;   r9  = in+len-8 (in+len for the tail loop)
;   r8  = keystream accumulator     r11 = byte counter
;
; The procedure is declared FRAME with .pushreg/.endprolog because it
; pushes nonvolatile registers; this only adds unwind data so the stack
; can be walked through this routine, the instructions are unchanged.
;-----------------------------------------------------------------------

ARCFOUR PROC FRAME

        push    rbp
        .pushreg rbp
        push    rbx
        .pushreg rbx
        push    rsi
        .pushreg rsi
        push    rdi
        .pushreg rdi
        .endprolog

        mov     rbp, rcx                ; key = ARG(key)
        mov     rbx, rdx                ; rbx = ARG(len)
        mov     rsi, r8                 ; in = ARG(in)
        mov     rdi, r9                 ; out = ARG(out)
        mov     rcx, [rbp]              ; x = key->x
        ; NOTE(review): x is masked to 8 bits below, y is not; y is later
        ; used as a table index, so this presumably relies on the stored y
        ; always fitting in one byte - confirm against the context setup.
        mov     rdx, [rbp+8]            ; y = key->y
        add     rbp, 16                 ; d = key->data
        inc     rcx                     ; x++
        and     rcx, 0ffh               ; x &= 0xff
        lea     rbx, [rbx+rsi-8]        ; rbx = in+len-8
        mov     r9, rbx                 ; tmp = in+len-8 (rbx is reused as ty)
        mov     rax, [rbp+rcx*8]        ; tx = d[x]
        cmp     rbx, rsi                ; cmp in with in+len-8
        jl      Lend                    ; fewer than 8 bytes: tail loop only

Lstart:
        ; in/out are advanced first, so this block is addressed at [-8].
        add     rsi, 8                  ; increment in
        add     rdi, 8                  ; increment out

        ;
        ; generate the next 8 bytes of the rc4 stream into r8
        ;

        mov     r11, 8                  ; byte counter

@@:
        add     dl, al                  ; y += tx (byte add, wraps mod 256)
        mov     ebx, [rbp+rdx*8]        ; ty = d[y]
        mov     [rbp+rcx*8], ebx        ; d[x] = ty
        add     bl, al                  ; val = ty + tx (mod 256)
        mov     [rbp+rdx*8], eax        ; d[y] = tx
        inc     cl                      ; x++ (NEXT ROUND)
        mov     eax, [rbp+rcx*8]        ; tx = d[x] (NEXT ROUND)
        mov     r8b, [rbp+rbx*8]        ; val = d[val]
        dec     r11b                    ; sets ZF after the 8th byte
        ror     r8, 8                   ; (ror does not change ZF)
        jnz     @b                      ; branches on ZF from dec r11b

        ;
        ; xor 8 bytes
        ;

        xor     r8, [rsi-8]
        cmp     rsi, r9                 ; cmp in+len-8 with in
        mov     [rdi-8], r8             ; mov leaves the cmp flags intact
        jle     Lstart                  ; at least 8 more bytes remain

Lend:
        add     r9, 8                   ; tmp = in+len

        ;
        ; handle the last bytes, one by one
        ;

@@:
        cmp     r9, rsi                 ; cmp in with in+len
        jle     Lfinished               ; jump if (in+len <= in)
        add     dl, al                  ; y += tx
        mov     ebx, [rbp+rdx*8]        ; ty = d[y]
        mov     [rbp+rcx*8], ebx        ; d[x] = ty
        add     bl, al                  ; val = ty + tx
        mov     [rbp+rdx*8], eax        ; d[y] = tx
        inc     cl                      ; x++ (NEXT ROUND)
        mov     eax, [rbp+rcx*8]        ; tx = d[x] (NEXT ROUND)
        mov     r8b, [rbp+rbx*8]        ; val = d[val]
        xor     r8b, [rsi]              ; xor 1 byte
        mov     [rdi], r8b
        inc     rsi                     ; in++
        inc     rdi                     ; out++
        jmp     @b

Lfinished:
        dec     rcx                     ; x-- (undo the look-ahead increment)
        mov     [rbp-8], dl             ; key->y = y
        mov     [rbp-16], cl            ; key->x = x

        pop     rdi
        pop     rsi
        pop     rbx
        pop     rbp
        ret

ARCFOUR ENDP
|
106 |
|
107 END |