security/nss/lib/freebl/arcfour-amd64-gas.s

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 # This Source Code Form is subject to the terms of the Mozilla Public
     2 # License, v. 2.0. If a copy of the MPL was not distributed with this
     3 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
     5 # ** ARCFOUR implementation optimized for AMD64.
     6 # **
     7 # ** The throughput achieved by this code is about 320 MBytes/sec, on
     8 # ** a 1.8 GHz AMD Opteron (rev C0) processor.
    10 .text
    11 .align 16
    12 .globl ARCFOUR
    13 .type ARCFOUR,@function
       # ---------------------------------------------------------------------
       # ARCFOUR(key, len, in, out)
       #
       # XORs len bytes read from in with generated stream bytes and writes the
       # result to out: 8 bytes per pass of .Lstart while at least 8 bytes
       # remain, then one byte at a time after .Lend.
       #
       # In (as read by the moves at the top of the function):
       #   %rdi = key, %rsi = len, %rdx = in, %rcx = out
       #   NOTE(review): this matches the System V AMD64 order of the first four
       #   integer arguments -- confirm against the C prototype of the caller.
       #
       # Layout of *key as accessed here:
       #   offset  0: x (loaded as 8 bytes, stored back as 1 byte)
       #   offset  8: y (loaded as 8 bytes, stored back as 1 byte)
       #   offset 16: table d, one entry every 8 bytes (scale factor 8)
       #
       # Register roles after the set-up code:
       #   %rbp = d          %rcx = x          %rdx = y         %rax = tx = d[x]
       #   %rsi = in         %rdi = out        %r9  = end marker
       #   %rbx = scratch (ty, then ty+tx)     %r8  = stream bytes
       #   %r11 = per-block byte counter
       #
       # %rbx and %rbp are saved on the stack and restored before ret; %rax,
       # %rcx, %rdx, %rsi, %rdi, %r8, %r9, %r11 and the flags are overwritten.
       # No return value is set up.
       #
       # The byte-sized adds/increments keep x, y and ty+tx modulo 256, but the
       # table is indexed through the full 64-bit registers. x is masked with
       # 255 on entry; y is not masked, and table entries are used as indices
       # as loaded. NOTE(review): this assumes the stored y and every table
       # entry are below 256 -- verify against the code that sets up the key.
       #
       # NOTE(review): jl/jle are signed conditions applied to address
       # comparisons; that is equivalent to an unsigned test only while the
       # buffer does not straddle the sign boundary -- confirm this is intended.
       # ---------------------------------------------------------------------
    14 ARCFOUR:
       	# save the two callee-saved registers this routine uses
    15 	pushq	%rbp
    16 	pushq	%rbx
    17 	movq	%rdi,		%rbp	# rbp = key (1st argument)
    18 	movq	%rsi,		%rbx	# rbx = len (2nd argument)
    19 	movq	%rdx,		%rsi	# rsi = in  (3rd argument)
    20 	movq	%rcx,		%rdi	# rdi = out (4th argument)
    21 	movq	(%rbp),		%rcx	# rcx = x = key->x (8-byte load, offset 0)
    22 	movq	8(%rbp),	%rdx	# rdx = y = key->y (8-byte load, offset 8)
    23 	addq	$16,		%rbp	# rbp = d, the table that starts at key+16
    24 	incq	%rcx			# x++ (index used for the first output byte)
    25 	andq	$255,		%rcx	# x &= 0xff
    26 	leaq	-8(%rbx,%rsi),	%rbx	# rbx = in+len-8
    27 	movq	%rbx,		%r9	# r9 = in+len-8 (rbx becomes scratch below)
    28 	movq	0(%rbp,%rcx,8),	%rax	# rax = tx = d[x] (full 8-byte load)
    29 	cmpq	%rsi,		%rbx	# compare in+len-8 with in
    30 	jl	.Lend			# in+len-8 < in (signed): fewer than 8 bytes, go byte-wise
    32 .Lstart:
    33 	addq	$8,		%rsi		# in += 8; the current block is now at -8(%rsi)
    34 	addq	$8,		%rdi		# out += 8; the current block is now at -8(%rdi)
    36 	# generate the next 8 stream bytes into %r8. %r8 needs no initial value:
       	# each pass writes the low byte and rotates right by 8, so after 8 passes
       	# all 8 bytes are new and the first generated byte is the lowest one.
    37 	movq	$8,		%r11		# r11 = bytes still to generate for this block
    38 1:	addb	%al,		%dl		# y += tx (byte add, wraps at 256)
    39 	movl	0(%rbp,%rdx,8),	%ebx		# ebx = ty = d[y] (32-bit load zero-extends rbx)
    40 	movl	%ebx,		0(%rbp,%rcx,8)	# d[x] = ty (low 32 bits of the entry)
    41 	addb	%al,		%bl		# bl = ty + tx (wraps at 256): index of the output byte
    42 	movl	%eax,		0(%rbp,%rdx,8)	# d[y] = tx (low 32 bits of the entry)
    43 	incb	%cl				# x++ (wraps at 256), already for the next byte
    44 	movl	0(%rbp,%rcx,8),	%eax		# tx = d[x], loaded ahead for the next byte
    45 	movb	0(%rbp,%rbx,8),	%r8b		# r8b = d[ty + tx] = stream byte
       	# count down; the ZF produced here is what jnz tests below
    46 	decb	%r11b
    47 	rorq	$8,		%r8		# rotate the new byte upward (ror does not change ZF)
       	# repeat until the counter reached zero
    48 	jnz 	1b
    50 	# xor the 8 stream bytes with the input block; the store to the output
       	# block sits between cmpq and jle and does not modify the flags
    51 	xorq	-8(%rsi),	%r8
    52 	cmpq	%r9,		%rsi		# compare in with in+len-8
    53 	movq	%r8,		-8(%rdi)
    54 	jle	.Lstart				# in <= in+len-8 (signed): 8 or more bytes remain
    56 .Lend:
    57 	addq	$8,		%r9		# r9 = in+len (end of the input)
    59 	# handle the remaining bytes one by one; same update sequence as the
       	# inner loop above, but each stream byte is consumed immediately
    60 1:	cmpq	%rsi,		%r9		# compare in+len with in
    61 	jle	.Lfinished			# in+len <= in (signed): nothing left
    62 	addb	%al,		%dl		# y += tx (byte add, wraps at 256)
    63 	movl	0(%rbp,%rdx,8),	%ebx		# ebx = ty = d[y] (32-bit load zero-extends rbx)
    64 	movl	%ebx,		0(%rbp,%rcx,8)	# d[x] = ty (low 32 bits of the entry)
    65 	addb	%al,		%bl		# bl = ty + tx (wraps at 256): index of the output byte
    66 	movl	%eax,		0(%rbp,%rdx,8)	# d[y] = tx (low 32 bits of the entry)
    67 	incb	%cl				# x++ (wraps at 256), already for the next byte
    68 	movl	0(%rbp,%rcx,8),	%eax		# tx = d[x], loaded ahead for the next byte
    69 	movb	0(%rbp,%rbx,8),	%r8b		# r8b = d[ty + tx] = stream byte
    70 	xorb	(%rsi),		%r8b		# r8b ^= *in
       	# *out = r8b
    71 	movb	%r8b,		(%rdi)
    72 	incq	%rsi				# in++
    73 	incq	%rdi				# out++
       	# back to the end-of-input test
    74 	jmp 1b
    76 .Lfinished:
    77 	decq	%rcx				# x--: undo the look-ahead increment before saving
    78 	movb	%dl,		-8(%rbp)	# key->y = y (rbp is key+16; only the low byte is stored)
    79 	movb	%cl,		-16(%rbp)	# key->x = x (only the low byte is stored)
       	# restore the saved registers in reverse push order and return
    80 	popq	%rbx
    81 	popq	%rbp
    82 	ret
       # end marker, used only to compute the symbol size below
    83 .L_ARCFOUR_end:
    84 .size ARCFOUR,.L_ARCFOUR_end-ARCFOUR
    86 # Marker indicating no need for an executable stack:
       # an empty .note.GNU-stack section declared with no flags ("") is the
       # convention GNU toolchains read as "this object does not require an
       # executable stack". .previous then switches back to the section that
       # was current before the .section directive.
    87 .section .note.GNU-stack,"",@progbits
    88 .previous

mercurial