# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# ** ARCFOUR implementation optimized for AMD64.
# **
# ** The throughput achieved by this code is about 320 MBytes/sec, on
# ** a 1.8 GHz AMD Opteron (rev C0) processor.

#-----------------------------------------------------------------------
# ARCFOUR(key, len, in, out)
#
# Generates the RC4 keystream from the state in `key` and XORs it over
# `len` bytes of `in`, writing the result to `out`.  Full groups of
# eight bytes are handled with one 64-bit XOR; the remainder is handled
# one byte at a time.
#
# Syntax: AT&T / GNU as.   ABI: System V AMD64.
#
# In:    %rdi = key   state block laid out as
#                       offset  0 : x   (only the low byte is significant)
#                       offset  8 : y   (only the low byte is significant)
#                       offset 16 : table, one entry per 8-byte slot,
#                                   indexed by byte values 0..255
#        %rsi = len   number of bytes to process
#        %rdx = in    source buffer
#        %rcx = out   destination buffer
# Out:   no return value is set up; the low bytes of key->x and key->y
#        and the table entries are updated in place.
# Clobb: %rax, %rcx, %rdx, %rsi, %rdi, %r8, %r9, %r11, flags.
#        %rbx and %rbp are used but saved/restored on the stack.
#
# Register roles after the prologue:
#        %rbp = d   (table base, key + 16)
#        %rcx = x   %rdx = y   %rax = tx (d[x], prefetched for next round)
#        %rbx = scratch (ty / val index)   %r8 = keystream accumulator
#        %rsi = in  %rdi = out %r9 = in+len-8 (later in+len)
#        %r11 = per-group byte counter
#
# Table entries are read and written as 32-bit values (movl) and then
# used as indices, so every slot is assumed to hold a value below 256.
#-----------------------------------------------------------------------
        .text
        .align  16
        .globl  ARCFOUR
        .type   ARCFOUR,@function
ARCFOUR:
        pushq   %rbp
        pushq   %rbx
        movq    %rdi, %rbp              # key = ARG(key)
        movq    %rsi, %rbx              # rbx = ARG(len)
        movq    %rdx, %rsi              # in  = ARG(in)
        movq    %rcx, %rdi              # out = ARG(out)
        movq    (%rbp), %rcx            # x = key->x
        movzbl  8(%rbp), %edx           # y = low byte of key->y; zero-extended
                                        # so stale high bytes of the slot can
                                        # never reach the table index (only the
                                        # low byte is written back on exit)
        addq    $16, %rbp               # d = key->data
        incq    %rcx                    # x++
        andq    $255, %rcx              # x &= 0xff
        leaq    -8(%rbx,%rsi), %rbx     # rbx = in+len-8
        movq    %rbx, %r9               # tmp = in+len-8 (rbx is reused below)
        movq    0(%rbp,%rcx,8), %rax    # tx = d[x]
        # Signed compare on purpose: with len == 0 and in == NULL the
        # value in+len-8 is -8, which must still take the early exit.
        cmpq    %rsi, %rbx              # cmp in with in+len-8
        jl      .Lend                   # fewer than 8 bytes: tail only

.Lstart:
        addq    $8, %rsi                # increment in
        addq    $8, %rdi                # increment out

        # generate the next 8 bytes of the rc4 stream into %r8
        movq    $8, %r11                # byte counter
.Lgen8:
        addb    %al, %dl                # y += tx
        movl    0(%rbp,%rdx,8), %ebx    # ty = d[y]
        movl    %ebx, 0(%rbp,%rcx,8)    # d[x] = ty
        addb    %al, %bl                # val = ty + tx
        movl    %eax, 0(%rbp,%rdx,8)    # d[y] = tx
        incb    %cl                     # x++ (NEXT ROUND)
        movl    0(%rbp,%rcx,8), %eax    # tx = d[x] (NEXT ROUND)
        movb    0(%rbp,%rbx,8), %r8b    # val = d[val]
        decb    %r11b                   # sets ZF for the jnz below
        rorq    $8, %r8                 # shift byte into place; after eight
                                        # rounds byte 0 is the first generated
                                        # (ror does not change ZF)
        jnz     .Lgen8

        # xor 8 bytes
        xorq    -8(%rsi), %r8
        cmpq    %r9, %rsi               # cmp in+len-8 with in
        movq    %r8, -8(%rdi)           # store; mov leaves the flags intact
        jle     .Lstart                 # jump if (in <= in+len-8)

.Lend:
        addq    $8, %r9                 # tmp = in+len

        # handle the last bytes, one by one
.Ltail:
        cmpq    %rsi, %r9               # cmp in with in+len
        jle     .Lfinished              # jump if (in+len <= in)
        addb    %al, %dl                # y += tx
        movl    0(%rbp,%rdx,8), %ebx    # ty = d[y]
        movl    %ebx, 0(%rbp,%rcx,8)    # d[x] = ty
        addb    %al, %bl                # val = ty + tx
        movl    %eax, 0(%rbp,%rdx,8)    # d[y] = tx
        incb    %cl                     # x++ (NEXT ROUND)
        movl    0(%rbp,%rcx,8), %eax    # tx = d[x] (NEXT ROUND)
        movb    0(%rbp,%rbx,8), %r8b    # val = d[val]
        xorb    (%rsi), %r8b            # xor 1 byte
        movb    %r8b, (%rdi)
        incq    %rsi                    # in++
        incq    %rdi                    # out++
        jmp     .Ltail

.Lfinished:
        decq    %rcx                    # x-- (undo the look-ahead increment)
        movb    %dl, -8(%rbp)           # key->y = y (low byte only)
        movb    %cl, -16(%rbp)          # key->x = x (low byte only)
        popq    %rbx
        popq    %rbp
        ret
.L_ARCFOUR_end:
        .size   ARCFOUR,.L_ARCFOUR_end-ARCFOUR

# Magic indicating no need for an executable stack
        .section .note.GNU-stack,"",@progbits
        .previous