Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | / This Source Code Form is subject to the terms of the Mozilla Public |
michael@0 | 2 | / License, v. 2.0. If a copy of the MPL was not distributed with this |
michael@0 | 3 | / file, You can obtain one at http://mozilla.org/MPL/2.0/. |
michael@0 | 4 | |
michael@0 | 5 | / ** ARCFOUR implementation optimized for AMD64. |
michael@0 | 6 | / ** |
michael@0 | 7 | / ** The throughput achieved by this code is about 320 MBytes/sec, on |
michael@0 | 8 | / ** a 1.8 GHz AMD Opteron (rev C0) processor. |
michael@0 | 9 | |
michael@0 | 10 | .text |
michael@0 | 11 | .align 16 |
michael@0 | 12 | .globl ARCFOUR |
michael@0 | 13 | .type ARCFOUR,@function |
michael@0 | 14 | ARCFOUR: / in: %rdi=key, %rsi=len, %rdx=in, %rcx=out; writes out[i] = in[i] ^ stream byte for len bytes, then stores x/y back into key |
michael@0 | 15 | pushq %rbp / save %rbp; reused below as the key/data pointer |
michael@0 | 16 | pushq %rbx / save %rbx; reused below as scratch (len, then ty/val) |
michael@0 | 17 | movq %rdi, %rbp / rbp = ARG(key) |
michael@0 | 18 | movq %rsi, %rbx / rbx = ARG(len) |
michael@0 | 19 | movq %rdx, %rsi / rsi = in = ARG(in) |
michael@0 | 20 | movq %rcx, %rdi / rdi = out = ARG(out) |
michael@0 | 21 | movq (%rbp), %rcx / rcx = x = key->x (8-byte load from offset 0) |
michael@0 | 22 | movq 8(%rbp), %rdx / rdx = y = key->y (8-byte load from offset 8) |
michael@0 | 23 | addq $16, %rbp / rbp = d = key->data (offset 16); x and y are now at -16(%rbp) and -8(%rbp) |
michael@0 | 24 | incq %rcx / x++ |
michael@0 | 25 | andq $255, %rcx / x &= 0xff (keep the index in 0..255) |
michael@0 | 26 | leaq -8(%rbx,%rsi), %rbx / rbx = in+len-8, the last address at which a full 8-byte block can start |
michael@0 | 27 | movq %rbx, %r9 / r9 = in+len-8 (kept here because rbx is overwritten in the loops) |
michael@0 | 28 | movq 0(%rbp,%rcx,8), %rax / rax = tx = d[x] (table entries are 8 bytes apart) |
michael@0 | 29 | cmpq %rsi, %rbx / compare in+len-8 against in (flags = rbx - rsi) |
michael@0 | 30 | jl .Lend / signed: if (in+len-8 < in) no full block fits, go to the byte-wise tail |
michael@0 | 31 | |
michael@0 | 32 | .Lstart: / main loop: one 8-byte block per iteration |
michael@0 | 33 | addq $8, %rsi / in += 8 (this block is read at -8(%rsi) below) |
michael@0 | 34 | addq $8, %rdi / out += 8 (this block is written at -8(%rdi) below) |
michael@0 | 35 | |
michael@0 | 36 | / generate the next 8 bytes of the rc4 stream into %r8 |
michael@0 | 37 | movq $8, %r11 / r11 = number of stream bytes still to generate |
michael@0 | 38 | 1: addb %al, %dl / y += tx (8-bit add, wraps mod 256); NOTE(review): d[y] below indexes with all of %rdx, so key->y is presumably < 256 -- confirm |
michael@0 | 39 | movl 0(%rbp,%rdx,8), %ebx / ebx = ty = d[y] (32-bit load) |
michael@0 | 40 | movl %ebx, 0(%rbp,%rcx,8) / d[x] = ty |
michael@0 | 41 | addb %al, %bl / bl = val = ty + tx (8-bit add, wraps mod 256) |
michael@0 | 42 | movl %eax, 0(%rbp,%rdx,8) / d[y] = tx (completes the d[x] <-> d[y] swap) |
michael@0 | 43 | incb %cl / x++ mod 256 (for the NEXT ROUND) |
michael@0 | 44 | movl 0(%rbp,%rcx,8), %eax / tx = d[x] (loaded early for the NEXT ROUND) |
michael@0 | 45 | movb 0(%rbp,%rbx,8), %r8b / r8b = d[val], the stream byte; NOTE(review): indexes with all of %rbx, so d[] entries are presumably < 256 -- confirm |
michael@0 | 46 | decb %r11b / counter--; ZF is set once the 8th byte has been generated |
michael@0 | 47 | rorq $8, %r8 / rotate the new byte out of the low byte; after 8 rotations the first byte is lowest again (ror does not change ZF) |
michael@0 | 48 | jnz 1b / loop on the ZF produced by decb above |
michael@0 | 49 | |
michael@0 | 50 | / xor 8 stream bytes with 8 input bytes and store the result |
michael@0 | 51 | xorq -8(%rsi), %r8 / r8 ^= the 8 input bytes of this block |
michael@0 | 52 | cmpq %r9, %rsi / compare in against in+len-8 (flags = rsi - r9) |
michael@0 | 53 | movq %r8, -8(%rdi) / store the 8 output bytes; sits between cmpq and jle, which still sees the cmpq flags |
michael@0 | 54 | jle .Lstart / signed: another full block fits if (in <= in+len-8) |
michael@0 | 55 | |
michael@0 | 56 | .Lend: / fewer than 8 bytes remain |
michael@0 | 57 | addq $8, %r9 / r9 = in+len, one past the last input byte |
michael@0 | 58 | |
michael@0 | 59 | / handle the last bytes, one by one |
michael@0 | 60 | 1: cmpq %rsi, %r9 / compare in+len against in (flags = r9 - rsi) |
michael@0 | 61 | jle .Lfinished / signed: done when (in+len <= in) |
michael@0 | 62 | addb %al, %dl / y += tx (8-bit add, wraps mod 256) |
michael@0 | 63 | movl 0(%rbp,%rdx,8), %ebx / ebx = ty = d[y] |
michael@0 | 64 | movl %ebx, 0(%rbp,%rcx,8) / d[x] = ty |
michael@0 | 65 | addb %al, %bl / bl = val = ty + tx (8-bit add, wraps mod 256) |
michael@0 | 66 | movl %eax, 0(%rbp,%rdx,8) / d[y] = tx (completes the d[x] <-> d[y] swap) |
michael@0 | 67 | incb %cl / x++ mod 256 (for the NEXT ROUND) |
michael@0 | 68 | movl 0(%rbp,%rcx,8), %eax / tx = d[x] (loaded early for the NEXT ROUND) |
michael@0 | 69 | movb 0(%rbp,%rbx,8), %r8b / r8b = d[val], the stream byte |
michael@0 | 70 | xorb (%rsi), %r8b / r8b ^= *in (xor 1 byte) |
michael@0 | 71 | movb %r8b, (%rdi) / *out = r8b |
michael@0 | 72 | incq %rsi / in++ |
michael@0 | 73 | incq %rdi / out++ |
michael@0 | 74 | jmp 1b / back to the remaining-length test |
michael@0 | 75 | |
michael@0 | 76 | .Lfinished: / write the updated indices back into the key |
michael@0 | 77 | decq %rcx / x--: undo the look-ahead increment so the stored x is the last index used |
michael@0 | 78 | movb %dl, -8(%rbp) / key->y = y (only the low byte is written) |
michael@0 | 79 | movb %cl, -16(%rbp) / key->x = x (only the low byte is written) |
michael@0 | 80 | popq %rbx / restore the saved registers in reverse push order |
michael@0 | 81 | popq %rbp |
michael@0 | 82 | ret |
michael@0 | 83 | .L_ARCFOUR_end: |
michael@0 | 84 | .size ARCFOUR,.L_ARCFOUR_end-ARCFOUR |