# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# File: security/nss/lib/freebl/arcfour-amd64-gas.s
#
# ** ARCFOUR implementation optimized for AMD64.
# **
# ** The throughput achieved by this code is about 320 MBytes/sec, on
# ** a 1.8 GHz AMD Opteron (rev C0) processor.
#
# Syntax: GNU as, AT&T operand order (source first, destination last).

# ----------------------------------------------------------------------
# Key-state layout as this routine accesses it.  Every field is read as
# a 64-bit slot; the table is indexed with a scale of 8, i.e. one 8-byte
# slot per entry.
# ----------------------------------------------------------------------
        .equ    RC4_KEY_X,    0         # index x
        .equ    RC4_KEY_Y,    8         # index y
        .equ    RC4_KEY_DATA, 16        # first slot of the state table d[]

# ----------------------------------------------------------------------
# RC4_STEP: produce one keystream byte and pre-load tx for the next one.
#
#   In:    %rbp = d (table base)    %rcx = x    %rdx = y    %rax = tx = d[x]
#   Out:   %r8b = keystream byte (upper 56 bits of %r8 are untouched)
#          %cl  = x + 1 (mod 256)   %dl = updated y
#          %eax = d[x] for the new x
#   Clobb: %rbx, flags
#
# The load of the next tx is issued before the keystream byte is fetched,
# exactly as in the open-coded sequence this macro replaces.
# ----------------------------------------------------------------------
        .macro  RC4_STEP
        addb    %al, %dl                # y += tx (wraps at 256)
        movl    (%rbp,%rdx,8), %ebx     # ty = d[y]
        movl    %ebx, (%rbp,%rcx,8)     # d[x] = ty
        addb    %al, %bl                # idx = ty + tx (wraps at 256)
        movl    %eax, (%rbp,%rdx,8)     # d[y] = tx
        incb    %cl                     # advance x for the next round
        movl    (%rbp,%rcx,8), %eax     # tx = d[x] for the next round
        movb    (%rbp,%rbx,8), %r8b     # keystream byte = d[idx]
        .endm

        .text
        .p2align 4
        .globl  ARCFOUR
        .type   ARCFOUR,@function

# ----------------------------------------------------------------------
# ARCFOUR(key, len, in, out)
#
# XORs `len` bytes read from `in` with the RC4 keystream generated from
# `key` and stores the result to `out`, then writes the advanced x and y
# indices back into `key`.
#
#   In:    %rdi = key   (x at +0, y at +8, table d[] from +16)
#          %rsi = len   (byte count)
#          %rdx = in    (source bytes)
#          %rcx = out   (destination bytes)
#   Out:   nothing is deliberately placed in %rax
#          NOTE(review): presumably declared as returning void in C;
#          confirm against the prototype.
#   Saved: %rbx, %rbp (pushed on entry, popped on exit)
#   Clobb: %rax, %rcx, %rdx, %rsi, %rdi, %r8, %r9, %r11, flags
#
# Register roles after the prologue:
#   %rbp = d      %rcx = x      %rdx = y      %rax = tx
#   %rsi = in cursor            %rdi = out cursor
#   %r9  = in+len-8 during the 8-byte loop, in+len during the tail loop
#
# Only the low byte of x and of y is written back on exit, while y is
# loaded as a full quadword and used unmasked as an index.
# NOTE(review): this assumes the upper bytes of key->y, and of every
# table slot, are zero - confirm with the key-setup code.
#
# NOTE(review): the three pointer comparisons use signed conditions
# (jl / jle).  They are kept on purpose: with in == 0 and len == 0 the
# value in+len-8 is negative, so the signed test skips the 8-byte loop,
# whereas an unsigned test would enter it.
# ----------------------------------------------------------------------
ARCFOUR:
        pushq   %rbp
        pushq   %rbx

        # Shuffle the incoming arguments into their working registers.
        movq    %rdi, %rbp                      # rbp = key
        movq    %rsi, %rbx                      # rbx = len
        movq    %rdx, %rsi                      # rsi = in
        movq    %rcx, %rdi                      # rdi = out

        # Load the indices; x is pre-incremented and kept one step ahead.
        movq    RC4_KEY_X(%rbp), %rcx           # x = key->x
        movq    RC4_KEY_Y(%rbp), %rdx           # y = key->y
        addq    $RC4_KEY_DATA, %rbp             # rbp = d = key->data
        incq    %rcx                            # x++
        andq    $0xff, %rcx                     # x &= 0xff

        leaq    -8(%rbx,%rsi), %rbx             # rbx = in + len - 8
        movq    %rbx, %r9                       # keep a copy: rbx is reused as scratch
        movq    (%rbp,%rcx,8), %rax             # tx = d[x]
        cmpq    %rsi, %rbx
        jl      .Ltail_setup                    # in+len-8 < in: no full 8-byte group

        # ---- Bulk path: eight bytes per iteration ----------------------
.Lbulk_loop:
        addq    $8, %rsi                        # cursors now point past this group
        addq    $8, %rdi

        # Build the next eight keystream bytes in %r8.  Each new byte lands
        # in the low byte and is rotated up, so after eight rounds the first
        # byte produced sits in bits 0-7 again.
        movq    $8, %r11                        # rounds remaining
.Lkeystream_byte:
        RC4_STEP
        decb    %r11b
        rorq    $8, %r8                         # ror leaves ZF alone ...
        jnz     .Lkeystream_byte                # ... so this tests the decb result

        xorq    -8(%rsi), %r8                   # combine with eight input bytes
        cmpq    %r9, %rsi
        movq    %r8, -8(%rdi)                   # store (mov leaves the flags intact)
        jle     .Lbulk_loop                     # in <= in+len-8: another full group

        # ---- Tail path: remaining bytes, one at a time -----------------
.Ltail_setup:
        addq    $8, %r9                         # r9 = in + len (end of input)

.Ltail_loop:
        cmpq    %rsi, %r9
        jle     .Lstore_state                   # end <= in: nothing left
        RC4_STEP
        xorb    (%rsi), %r8b                    # combine with one input byte
        movb    %r8b, (%rdi)
        incq    %rsi                            # in++
        incq    %rdi                            # out++
        jmp     .Ltail_loop

        # ---- Epilogue: persist the indices and return ------------------
.Lstore_state:
        decq    %rcx                            # undo the one-ahead increment of x
        movb    %dl, RC4_KEY_Y-RC4_KEY_DATA(%rbp)   # key->y = y (low byte only)
        movb    %cl, RC4_KEY_X-RC4_KEY_DATA(%rbp)   # key->x = x (low byte only)
        popq    %rbx
        popq    %rbp
        ret
.LARCFOUR_end:
        .size   ARCFOUR,.LARCFOUR_end-ARCFOUR

# Magic indicating no need for an executable stack
        .section .note.GNU-stack,"",@progbits
        .previous