/ This Source Code Form is subject to the terms of the Mozilla Public
/ License, v. 2.0. If a copy of the MPL was not distributed with this
/ file, You can obtain one at http://mozilla.org/MPL/2.0/.

/ security/nss/lib/freebl/arcfour-amd64-sun.s
/
/ ** ARCFOUR implementation optimized for AMD64.
/ **
/ ** The throughput achieved by this code is about 320 MBytes/sec, on
/ ** a 1.8 GHz AMD Opteron (rev C0) processor.
/
/ ----------------------------------------------------------------------
/ ARCFOUR(key, len, in, out)
/
/ Syntax: AT&T operand order (src, dst); '/' starts a comment.
/
/ In:    %rdi = key   state block, laid out as used below:
/                       key+0   x index (only the low byte is written back)
/                       key+8   y index (only the low byte is written back)
/                       key+16  d[256], one entry per 8-byte slot
/        %rsi = len   number of bytes to process
/        %rdx = in    source buffer
/        %rcx = out   destination buffer
/ Out:   out[0..len) = in[0..len) XOR keystream; key->x, key->y and d[]
/        are advanced by len steps.
/ Saved: %rbx, %rbp (pushed on entry, popped on exit)
/ Clobb: %rax, %rcx, %rdx, %rsi, %rdi, %r8, %r9, %r11, flags
/
/ Register roles inside the function:
/        %rbp = d (key+16)       %rcx = x          %rdx = y
/        %rax = tx = d[x]        %rbx = ty / val   %r8  = keystream bytes
/        %rsi = in cursor        %rdi = out cursor
/        %r9  = in+len-8 during the 8-byte loop, in+len during the tail
/        %r11 = per-block byte counter
/
/ Invariant: on entry to every step, x has already been incremented and
/ %rax already holds d[x] for that step ("NEXT ROUND" loads).  The
/ epilogue undoes the one surplus increment before storing x.
/
/ Table entries are moved with 32-bit loads/stores; x, y and val are
/ updated with 8-bit arithmetic so they wrap modulo 256 in place.
/ ----------------------------------------------------------------------

.text
.align 16
.globl ARCFOUR
.type ARCFOUR,@function
ARCFOUR:
        pushq   %rbp
        pushq   %rbx

        / Move the arguments out of the way so %rcx / %rdx can hold x / y.
        movq    %rdi, %rbp                  / rbp = key
        movq    %rsi, %rbx                  / rbx = len
        movq    %rdx, %rsi                  / rsi = in
        movq    %rcx, %rdi                  / rdi = out
        movq    8(%rbp), %rdx               / y = key->y
        movq    (%rbp), %rcx                / x = key->x
        addq    $16, %rbp                   / rbp = d = key->data

        / Pre-advance x and pre-load tx for the first step.
        incq    %rcx                        / x = (x + 1) ...
        andq    $255, %rcx                  / ... & 0xff
        leaq    -8(%rbx,%rsi), %rbx         / rbx = in + len - 8
        movq    0(%rbp,%rcx,8), %rax        / tx = d[x]
        movq    %rbx, %r9                   / r9 = last position a full block may start at
        cmpq    %rsi, %rbx
        jl      .Ltail                      / fewer than 8 bytes: (in+len-8) < in

        / ---- bulk path: 8 bytes per iteration ------------------------
.Lblock:
        movq    $8, %r11                    / bytes still to generate for this block
        addq    $8, %rdi                    / out cursor now points past the block
        addq    $8, %rsi                    / in cursor now points past the block

        / Build 8 keystream bytes in %r8.  Each byte lands in the low
        / byte and is rotated right by 8; after eight rotations the
        / first byte is back in the low byte (memory order).
.Lblock_byte:
        addb    %al, %dl                    / y += tx            (mod 256)
        movl    0(%rbp,%rdx,8), %ebx        / ty = d[y]
        movl    %ebx, 0(%rbp,%rcx,8)        / d[x] = ty
        addb    %al, %bl                    / val = ty + tx      (mod 256)
        movl    %eax, 0(%rbp,%rdx,8)        / d[y] = tx
        incb    %cl                         / advance x for the following step
        movl    0(%rbp,%rcx,8), %eax        / tx = d[x] for the following step
        movb    0(%rbp,%rbx,8), %r8b        / keystream byte = d[val]
        decb    %r11b                       / sets ZF when the block is complete
        rorq    $8, %r8                     / ror leaves ZF from decb untouched
        jnz     .Lblock_byte

        xorq    -8(%rsi), %r8               / combine with 8 input bytes
        cmpq    %r9, %rsi                   / another full block available?
        movq    %r8, -8(%rdi)               / store result (mov keeps the flags)
        jle     .Lblock                     / loop while in <= in_end - 8

        / ---- tail path: remaining 0..7 bytes, one at a time ----------
.Ltail:
        addq    $8, %r9                     / r9 = in + len (one past the end)

.Ltail_byte:
        cmpq    %rsi, %r9
        jle     .Ldone                      / stop once in_end <= in
        addb    %al, %dl                    / y += tx            (mod 256)
        movl    0(%rbp,%rdx,8), %ebx        / ty = d[y]
        movl    %ebx, 0(%rbp,%rcx,8)        / d[x] = ty
        addb    %al, %bl                    / val = ty + tx      (mod 256)
        movl    %eax, 0(%rbp,%rdx,8)        / d[y] = tx
        incb    %cl                         / advance x for the following step
        movl    0(%rbp,%rcx,8), %eax        / tx = d[x] for the following step
        movb    0(%rbp,%rbx,8), %r8b        / keystream byte = d[val]
        xorb    (%rsi), %r8b                / combine with one input byte
        movb    %r8b, (%rdi)
        incq    %rdi                        / out++
        incq    %rsi                        / in++
        jmp     .Ltail_byte

        / ---- write the indices back and return -----------------------
.Ldone:
        decq    %rcx                        / drop the surplus pre-increment of x
        movb    %cl, -16(%rbp)              / key->x = x   (rbp = key + 16)
        movb    %dl, -8(%rbp)               / key->y = y
        popq    %rbx
        popq    %rbp
        ret
.L_ARCFOUR_end:
.size ARCFOUR,.L_ARCFOUR_end-ARCFOUR