# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# ** ARCFOUR implementation optimized for AMD64.
# **
# ** The throughput achieved by this code is about 320 MBytes/sec, on
# ** a 1.8 GHz AMD Opteron (rev C0) processor.

# Syntax: GNU as, AT&T operand order (source first, destination last).

# Byte offsets inside the key block, as used by the loads and stores below.
        .equ    RC4_X, 0                        # x index
        .equ    RC4_Y, 8                        # y index
        .equ    RC4_S, 16                       # first table entry; entries are 8 bytes apart

#-----------------------------------------------------------------------
# ARCFOUR
#
# Produces `len` keystream bytes from the state in `key`, XORs them with
# the bytes at `in` and writes the result to `out`.  Input is consumed
# 8 bytes at a time while at least 8 bytes remain, then byte by byte.
#
# In (registers on entry):
#       %rdi = key      state block: x at +0, y at +8, table at +16
#       %rsi = len      number of bytes to process
#       %rdx = in       source bytes
#       %rcx = out      destination bytes
# Out:  no return value is set up; key->x, key->y and the table are
#       updated in place (only the low byte of x and of y is stored).
# Saved/restored: %rbx, %rbp.
# Overwritten:    %rax, %rcx, %rdx, %rsi, %rdi, %r8, %r9, %r11, flags.
#
# Register roles in the body:
#       %rbp = table base (key + 16)    %rsi = in cursor
#       %rcx = x                        %rdi = out cursor
#       %rdx = y                        %r8  = keystream accumulator
#       %rax = tx (d[x], preloaded)     %r9  = end marker
#       %rbx = ty, then output index    %r11 = bytes left in this qword
#
# Apart from the very first 64-bit load of tx, only the low 32 bits of
# a table entry are ever read or written.
# NOTE(review): y is used as an index without masking (only x is
# masked), so key->y is presumably always < 256 on entry; confirm
# against the code that initialises the key block.
#-----------------------------------------------------------------------
        .text
        .align  16
        .globl  ARCFOUR
        .type   ARCFOUR,@function
ARCFOUR:
        pushq   %rbp
        pushq   %rbx

        # Move the arguments out of the way of the working registers.
        movq    %rdi, %rbp                      # rbp = key
        leaq    -8(%rdx,%rsi), %r9              # r9  = in + len - 8 (start of last full qword)
        movq    %rdx, %rsi                      # rsi = in
        movq    %rcx, %rdi                      # rdi = out

        # Load the state and step x once so tx can be fetched ahead of use.
        movq    RC4_X(%rbp), %rcx               # x = key->x
        movq    RC4_Y(%rbp), %rdx               # y = key->y
        addq    $RC4_S, %rbp                    # rbp = d = &key->data[0]
        incq    %rcx
        andq    $255, %rcx                      # x = (x + 1) & 0xff
        movq    0(%rbp,%rcx,8), %rax            # tx = d[x]

        cmpq    %rsi, %r9
        jl      .Ltail                          # fewer than 8 bytes: skip the qword loop

        # ---- 8 bytes per pass --------------------------------------------
.Lqword:
        addq    $8, %rdi                        # cursors are advanced first, so the
        addq    $8, %rsi                        # data of this pass lives at -8(cursor)
        movq    $8, %r11                        # rounds left for this qword

.Lqword_round:
        addb    %al, %dl                        # y = (y + tx) & 0xff
        movl    0(%rbp,%rdx,8), %ebx            # ty = d[y]
        movl    %ebx, 0(%rbp,%rcx,8)            # d[x] = ty
        addb    %al, %bl                        # bl = (ty + tx) & 0xff
        movl    %eax, 0(%rbp,%rdx,8)            # d[y] = tx
        incb    %cl                             # x for the following round
        movl    0(%rbp,%rcx,8), %eax            # tx for the following round
        movb    0(%rbp,%rbx,8), %r8b            # keystream byte = d[ty + tx]
        decb    %r11b                           # sets ZF for the jnz below
        rorq    $8, %r8                         # queue the byte; rotate leaves ZF alone
        jnz     .Lqword_round

        # After 8 rotations the first byte generated is the lowest byte of r8.
        xorq    -8(%rsi), %r8
        movq    %r8, -8(%rdi)
        cmpq    %r9, %rsi
        jle     .Lqword                         # another full qword still fits

        # ---- remaining bytes, one per pass -------------------------------
.Ltail:
        addq    $8, %r9                         # r9 = in + len

.Ltail_round:
        cmpq    %r9, %rsi
        jge     .Ldone                          # in cursor reached in + len
        addb    %al, %dl                        # y = (y + tx) & 0xff
        movl    0(%rbp,%rdx,8), %ebx            # ty = d[y]
        movl    %ebx, 0(%rbp,%rcx,8)            # d[x] = ty
        addb    %al, %bl                        # bl = (ty + tx) & 0xff
        movl    %eax, 0(%rbp,%rdx,8)            # d[y] = tx
        incb    %cl                             # x for the following round
        movl    0(%rbp,%rcx,8), %eax            # tx for the following round
        movb    0(%rbp,%rbx,8), %r8b            # keystream byte = d[ty + tx]
        xorb    (%rsi), %r8b
        movb    %r8b, (%rdi)
        incq    %rsi
        incq    %rdi
        jmp     .Ltail_round

        # ---- write the indices back --------------------------------------
.Ldone:
        decq    %rcx                            # undo the look-ahead step of x
        movb    %cl, RC4_X-RC4_S(%rbp)          # key->x = x (low byte only)
        movb    %dl, RC4_Y-RC4_S(%rbp)          # key->y = y (low byte only)
        popq    %rbx
        popq    %rbp
        ret
.L_ARCFOUR_end:
        .size   ARCFOUR,.L_ARCFOUR_end-ARCFOUR

# Magic indicating no need for an executable stack
        .section .note.GNU-stack,"",@progbits
        .previous