security/nss/lib/freebl/arcfour-amd64-gas.s

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/security/nss/lib/freebl/arcfour-amd64-gas.s	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,88 @@
     1.4 +# This Source Code Form is subject to the terms of the Mozilla Public
     1.5 +# License, v. 2.0. If a copy of the MPL was not distributed with this
     1.6 +# file, You can obtain one at http://mozilla.org/MPL/2.0/.
     1.7 +
     1.8 +# ** ARCFOUR implementation optimized for AMD64.
     1.9 +# ** Register use on entry: %rdi = key, %rsi = len, %rdx = in, %rcx = out.
    1.10 +# ** The throughput achieved by this code is about 320 MBytes/sec, on
    1.11 +# ** a 1.8 GHz AMD Opteron (rev C0) processor.
    1.12 +# ** key is accessed as: x at +0, y at +8 (loaded as 64-bit, stored back as one byte each), table of 8-byte slots at +16.
    1.13 +.text
    1.14 +.align 16
    1.15 +.globl ARCFOUR
    1.16 +.type ARCFOUR,@function
    1.17 +ARCFOUR:
    1.18 +	pushq	%rbp				# saved: rbp becomes the key/table pointer
    1.19 +	pushq	%rbx				# saved: rbx holds len, then per-byte scratch
    1.20 +	movq	%rdi,		%rbp	# rbp = key = ARG(key)
    1.21 +	movq	%rsi,		%rbx	# rbx = len = ARG(len)
    1.22 +	movq	%rdx,		%rsi	# rsi = in = ARG(in)
    1.23 +	movq	%rcx,		%rdi	# rdi = out = ARG(out)
    1.24 +	movq	(%rbp),		%rcx	# rcx = x = key->x (64-bit load, offset 0)
    1.25 +	movq	8(%rbp),	%rdx	# rdx = y = key->y (64-bit load, offset 8)
    1.26 +	addq	$16,		%rbp	# rbp = d = key->data; x/y are now at -16/-8(%rbp)
    1.27 +	incq	%rcx			# x++ (x runs one step ahead until .Lfinished)
    1.28 +	andq	$255,		%rcx	# x &= 0xff (indices are 8-bit)
    1.29 +	leaq	-8(%rbx,%rsi),	%rbx	# rbx = in+len-8 (start of the last possible 8-byte block)
    1.30 +	movq	%rbx,		%r9	# r9 = in+len-8, kept because rbx is reused as scratch
    1.31 +	movq	0(%rbp,%rcx,8),	%rax	# rax = tx = d[x] (table slots are 8 bytes apart)
    1.32 +	cmpq	%rsi,		%rbx	# flags = (in+len-8) - in
    1.33 +	jl	.Lend			# signed test: skip the block loop if (in+len-8 < in)
    1.34 +	# Block loop: each pass produces 8 keystream bytes and xors one 8-byte word.
    1.35 +.Lstart:
    1.36 +	addq	$8,		%rsi		# in += 8 up front; current block is at -8(%rsi)
    1.37 +	addq	$8,		%rdi		# out += 8 up front; current block is at -8(%rdi)
    1.38 +	# NOTE(review): d[] is indexed with all of %rdx/%rbx while only %dl/%bl are updated; presumably key->y and every table entry stay below 256 - confirm in the key setup code.
    1.39 +	# generate the next 8 bytes of the rc4 stream into %r8 (first byte ends in bits 0-7)
    1.40 +	movq	$8,		%r11		# r11 = bytes still to generate for this block
    1.41 +1:	addb	%al,		%dl		# y += tx (8-bit add, wraps mod 256)
    1.42 +	movl	0(%rbp,%rdx,8),	%ebx		# ebx = ty = d[y] (32-bit load zeroes rbx bits 63:32)
    1.43 +	movl	%ebx,		0(%rbp,%rcx,8)	# d[x] = ty (low 32 bits of the slot)
    1.44 +	addb	%al,		%bl		# bl = (ty + tx) mod 256
    1.45 +	movl	%eax,		0(%rbp,%rdx,8)	# d[y] = tx (swap of d[x], d[y] complete)
    1.46 +	incb	%cl				# x++		(NEXT ROUND)
    1.47 +	movl	0(%rbp,%rcx,8),	%eax		# tx = d[x]	(NEXT ROUND), read after both swap stores
    1.48 +	movb	0(%rbp,%rbx,8),	%r8b		# r8b = d[ty + tx] = keystream byte
    1.49 +	decb	%r11b				# ZF set once the 8th byte has been produced
    1.50 +	rorq	$8,		%r8		# rotate the new byte up (ror does not change ZF)
    1.51 +	jnz 	1b				# branches on ZF from the decb above
    1.52 +
    1.53 +	# xor 8 bytes of input with the keystream word and store them
    1.54 +	xorq	-8(%rsi),	%r8		# r8 ^= the 8 input bytes of this block
    1.55 +	cmpq	%r9,		%rsi		# flags = in - (in+len-8); the movq below leaves them intact
    1.56 +	movq	%r8,		-8(%rdi)	# write the 8 output bytes
    1.57 +	jle	.Lstart				# signed test: loop if (in <= in+len-8)
    1.58 +
    1.59 +.Lend:
    1.60 +	addq	$8,		%r9		# r9 = in+len (one past the last input byte)
    1.61 +
    1.62 +	# handle the last bytes, one by one (same round as above, no 8-byte batching)
    1.63 +1:	cmpq	%rsi,		%r9		# flags = (in+len) - in
    1.64 +	jle	.Lfinished			# signed test: done if (in+len <= in)
    1.65 +	addb	%al,		%dl		# y += tx (8-bit add, wraps mod 256)
    1.66 +	movl	0(%rbp,%rdx,8),	%ebx		# ebx = ty = d[y]
    1.67 +	movl	%ebx,		0(%rbp,%rcx,8)	# d[x] = ty
    1.68 +	addb	%al,		%bl		# bl = (ty + tx) mod 256
    1.69 +	movl	%eax,		0(%rbp,%rdx,8)	# d[y] = tx
    1.70 +	incb	%cl				# x++		(NEXT ROUND)
    1.71 +	movl	0(%rbp,%rcx,8),	%eax		# tx = d[x]	(NEXT ROUND), read after both swap stores
    1.72 +	movb	0(%rbp,%rbx,8),	%r8b		# r8b = d[ty + tx] = keystream byte
    1.73 +	xorb	(%rsi),		%r8b		# xor 1 byte of input into it
    1.74 +	movb	%r8b,		(%rdi)		# store 1 output byte
    1.75 +	incq	%rsi				# in++
    1.76 +	incq	%rdi				# out++
    1.77 +	jmp 1b					# back to the end-of-input test
    1.78 +
    1.79 +.Lfinished:
    1.80 +	decq	%rcx				# x-- (undo the look-ahead increment)
    1.81 +	movb	%dl,		-8(%rbp)	# key->y = y (only the low byte is written)
    1.82 +	movb	%cl,		-16(%rbp)	# key->x = x (only the low byte is written)
    1.83 +	popq	%rbx				# restore in reverse push order
    1.84 +	popq	%rbp
    1.85 +	ret					# no return value is set up; rax still holds the prefetched tx
    1.86 +.L_ARCFOUR_end:
    1.87 +.size ARCFOUR,.L_ARCFOUR_end-ARCFOUR
    1.88 +
    1.89 +# Empty .note.GNU-stack section: marks this object as not needing an executable stack
    1.90 +.section .note.GNU-stack,"",@progbits
    1.91 +.previous	# switch back to the section that was active before the .section above

mercurial