security/nss/lib/freebl/intel-aes.s

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/security/nss/lib/freebl/intel-aes.s	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,2488 @@
     1.4 +/* This Source Code Form is subject to the terms of the Mozilla Public
     1.5 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.6 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.7 +
     1.8 +	.text
     1.9 +
    1.10 +#define IV_OFFSET 16
    1.11 +#define EXPANDED_KEY_OFFSET 48
    1.12 +
    1.13 +
    1.14 +/* in %rdi : the key
    1.15 +   in %rsi : buffer for expanded key
    1.16 +*/
    1.17 +	.type intel_aes_encrypt_init_128,@function
    1.18 +	.globl intel_aes_encrypt_init_128
    1.19 +	.align	16
    1.20 +intel_aes_encrypt_init_128:	/* expand a 16-byte AES key into 11 round keys (176 bytes) written at %rsi */
    1.21 +	movups	(%rdi), %xmm1	/* xmm1 = the 16-byte user key (unaligned load) */
    1.22 +	movups	%xmm1, (%rsi)	/* round key 0 is the user key itself */
    1.23 +	leaq	16(%rsi), %rsi	/* %rsi -> slot for round key 1 */
    1.24 +	xorl	%eax, %eax	/* key_expansion128 seeds %xmm3 from %eax; nothing below writes %eax, so 0 is also what is returned */
    1.25 +
    1.26 +	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x01	/* aeskeygenassist $0x01, %xmm1, %xmm2 */
    1.27 +	call key_expansion128	/* round key 1: xmm1 updated, stored at (%rsi), %rsi += 16 */
    1.28 +	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x02	/* aeskeygenassist $0x02, %xmm1, %xmm2 */
    1.29 +	call key_expansion128	/* round key 2 */
    1.30 +	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x04	/* aeskeygenassist $0x04, %xmm1, %xmm2 */
    1.31 +	call key_expansion128	/* round key 3 */
    1.32 +	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x08	/* aeskeygenassist $0x08, %xmm1, %xmm2 */
    1.33 +	call key_expansion128	/* round key 4 */
    1.34 +	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x10	/* aeskeygenassist $0x10, %xmm1, %xmm2 */
    1.35 +	call key_expansion128	/* round key 5 */
    1.36 +	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x20	/* aeskeygenassist $0x20, %xmm1, %xmm2 */
    1.37 +	call key_expansion128	/* round key 6 */
    1.38 +	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x40	/* aeskeygenassist $0x40, %xmm1, %xmm2 */
    1.39 +	call key_expansion128	/* round key 7 */
    1.40 +	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x80	/* aeskeygenassist $0x80, %xmm1, %xmm2 */
    1.41 +	call key_expansion128	/* round key 8 */
    1.42 +	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x1b	/* aeskeygenassist $0x1b, %xmm1, %xmm2 */
    1.43 +	call key_expansion128	/* round key 9 */
    1.44 +	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x36	/* aeskeygenassist $0x36, %xmm1, %xmm2 */
    1.45 +	call key_expansion128	/* round key 10 (last) */
    1.46 +
    1.47 +	ret
    1.48 +	.size intel_aes_encrypt_init_128, .-intel_aes_encrypt_init_128
    1.49 +
    1.50 +
    1.51 +/* in %rdi : the key
    1.52 +   in %rsi : buffer for expanded key
    1.53 +*/
    1.54 +	.type intel_aes_decrypt_init_128,@function
    1.55 +	.globl intel_aes_decrypt_init_128
    1.56 +	.align	16
    1.57 +intel_aes_decrypt_init_128:	/* same expansion as the encrypt variant, but round keys 1-9 are stored after aesimc */
    1.58 +	movups	(%rdi), %xmm1	/* xmm1 = the 16-byte user key (unaligned load) */
    1.59 +	movups	%xmm1, (%rsi)	/* round key 0 is stored untransformed */
    1.60 +	leaq	16(%rsi), %rsi	/* %rsi -> slot for round key 1 */
    1.61 +	xorl	%eax, %eax	/* key_expansion128 seeds %xmm3 from %eax; nothing below writes %eax, so 0 is also what is returned */
    1.62 +
    1.63 +	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x01	/* aeskeygenassist $0x01, %xmm1, %xmm2 */
    1.64 +	call key_expansion128	/* round key 1 in xmm1, stored at -16(%rsi) after the call's %rsi += 16 */
    1.65 +	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
    1.66 +	movups	%xmm2, -16(%rsi)	/* overwrite the stored round key 1 with its aesimc form; xmm1 keeps the plain key for the next step */
    1.67 +	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x02	/* aeskeygenassist $0x02, %xmm1, %xmm2 */
    1.68 +	call key_expansion128	/* round key 2 */
    1.69 +	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
    1.70 +	movups	%xmm2, -16(%rsi)	/* replace stored round key 2 */
    1.71 +	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x04	/* aeskeygenassist $0x04, %xmm1, %xmm2 */
    1.72 +	call key_expansion128	/* round key 3 */
    1.73 +	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
    1.74 +	movups	%xmm2, -16(%rsi)	/* replace stored round key 3 */
    1.75 +	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x08	/* aeskeygenassist $0x08, %xmm1, %xmm2 */
    1.76 +	call key_expansion128	/* round key 4 */
    1.77 +	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
    1.78 +	movups	%xmm2, -16(%rsi)	/* replace stored round key 4 */
    1.79 +	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x10	/* aeskeygenassist $0x10, %xmm1, %xmm2 */
    1.80 +	call key_expansion128	/* round key 5 */
    1.81 +	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
    1.82 +	movups	%xmm2, -16(%rsi)	/* replace stored round key 5 */
    1.83 +	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x20	/* aeskeygenassist $0x20, %xmm1, %xmm2 */
    1.84 +	call key_expansion128	/* round key 6 */
    1.85 +	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
    1.86 +	movups	%xmm2, -16(%rsi)	/* replace stored round key 6 */
    1.87 +	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x40	/* aeskeygenassist $0x40, %xmm1, %xmm2 */
    1.88 +	call key_expansion128	/* round key 7 */
    1.89 +	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
    1.90 +	movups	%xmm2, -16(%rsi)	/* replace stored round key 7 */
    1.91 +	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x80	/* aeskeygenassist $0x80, %xmm1, %xmm2 */
    1.92 +	call key_expansion128	/* round key 8 */
    1.93 +	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
    1.94 +	movups	%xmm2, -16(%rsi)	/* replace stored round key 8 */
    1.95 +	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x1b	/* aeskeygenassist $0x1b, %xmm1, %xmm2 */
    1.96 +	call key_expansion128	/* round key 9 */
    1.97 +	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
    1.98 +	movups	%xmm2, -16(%rsi)	/* replace stored round key 9 */
    1.99 +	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x36	/* aeskeygenassist $0x36, %xmm1, %xmm2 */
   1.100 +	call key_expansion128	/* round key 10: stored as-is, no aesimc is applied to the last key */
   1.101 +
   1.102 +	ret
   1.103 +	.size intel_aes_decrypt_init_128, .-intel_aes_decrypt_init_128
   1.104 +
   1.105 +
   1.106 +	.type key_expansion128,@function
   1.107 +	.align	16
   1.108 +key_expansion128:	/* file-local helper (no .globl): in xmm1 = previous round key, xmm2 = aeskeygenassist result, %eax = 0, %rsi = output slot */
   1.109 +	movd	%eax, %xmm3	/* xmm3 = {eax,0,0,0}; both callers in this file zero %eax first, so xmm3 = 0 */
   1.110 +	pshufd	$0xff, %xmm2, %xmm2	/* broadcast dword 3 of the aeskeygenassist result to all four lanes */
   1.111 +	shufps	$0x10, %xmm1, %xmm3	/* xmm3 = {0, 0, k1, k0} where k0..k3 are the dwords of xmm1 */
   1.112 +	pxor	%xmm3, %xmm1	/* xmm1 = {k0, k1, k2^k1, k3^k0} */
   1.113 +	shufps	$0x8c, %xmm1, %xmm3	/* xmm3 = {0, k0, k0, k1^k2} */
   1.114 +	pxor	%xmm2, %xmm1	/* fold the broadcast keygen word into every lane */
   1.115 +	pxor	%xmm3, %xmm1	/* each lane now holds the XOR of itself and all lower lanes, plus the keygen word */
   1.116 +	movdqu	%xmm1, (%rsi)	/* store the new round key (unaligned store) */
   1.117 +	addq	$16, %rsi	/* advance to the next round-key slot */
   1.118 +	ret	/* modified: xmm1 (new round key), xmm2, xmm3, %rsi, flags */
   1.119 +	.size key_expansion128, .-key_expansion128
   1.120 +
   1.121 +
   1.122 +/* in %rdi : cx - context
   1.123 +   in %rsi : output - pointer to output buffer
   1.124 +   in %rdx : outputLen - pointer to variable for length of output
   1.125 +             (filled by caller)
   1.126 +   in %rcx : maxOutputLen - length of output buffer
   1.127 +   in %r8  : input - pointer to input buffer
   1.128 +   in %r9  : inputLen - length of input buffer
   1.129 +   on stack: blocksize - AES blocksize (always 16, unused)
   1.130 +*/
   1.131 +	.type intel_aes_encrypt_ecb_128,@function
   1.132 +	.globl intel_aes_encrypt_ecb_128
   1.133 +	.align	16
   1.134 +intel_aes_encrypt_ecb_128:	/* ECB-encrypt inputLen bytes from input to output: 8 blocks per pass, then 1 block per pass; returns 0 */
   1.135 +//	leaq	EXPANDED_KEY_OFFSET(%rdi), %rdi
   1.136 +	leaq	48(%rdi), %rdi	/* %rdi -> expanded key inside cx (48 is the value given for EXPANDED_KEY_OFFSET) */
   1.137 +
   1.138 +	movdqu	(%rdi), %xmm2	/* xmm2 = round key 0 */
   1.139 +	movdqu	160(%rdi), %xmm12	/* xmm12 = round key 10 (final round) */
   1.140 +	xor	%eax, %eax	/* %rax = byte offset into input and output */
   1.141 +//	cmpq	$8*16, %r9
   1.142 +	cmpq	$128, %r9	/* fewer than 8 blocks in total? */
   1.143 +	jb	1f	/* yes: skip the 8-block loop */
   1.144 +//	leaq	-8*16(%r9), %r11
   1.145 +	leaq	-128(%r9), %r11	/* %r11 = largest offset at which 8 whole blocks still remain */
   1.146 +2:	movdqu	(%r8, %rax), %xmm3	/* load 8 consecutive input blocks into xmm3..xmm10 */
   1.147 +	movdqu	16(%r8, %rax), %xmm4
   1.148 +	movdqu	32(%r8, %rax), %xmm5
   1.149 +	movdqu	48(%r8, %rax), %xmm6
   1.150 +	movdqu	64(%r8, %rax), %xmm7
   1.151 +	movdqu	80(%r8, %rax), %xmm8
   1.152 +	movdqu	96(%r8, %rax), %xmm9
   1.153 +	movdqu	112(%r8, %rax), %xmm10
   1.154 +	pxor	%xmm2, %xmm3	/* initial step: XOR every block with round key 0 */
   1.155 +	pxor	%xmm2, %xmm4
   1.156 +	pxor	%xmm2, %xmm5
   1.157 +	pxor	%xmm2, %xmm6
   1.158 +	pxor	%xmm2, %xmm7
   1.159 +	pxor	%xmm2, %xmm8
   1.160 +	pxor	%xmm2, %xmm9
   1.161 +	pxor	%xmm2, %xmm10
   1.162 +
   1.163 +// complete loop unrolling
   1.164 +	movdqu 16(%rdi), %xmm1	/* round key 1 */
   1.165 +	movdqu 32(%rdi), %xmm11	/* round key 2 */
   1.166 +	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
   1.167 +	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
   1.168 +	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
   1.169 +	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
   1.170 +	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
   1.171 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
   1.172 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
   1.173 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
   1.174 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
   1.175 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
   1.176 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
   1.177 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
   1.178 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
   1.179 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
   1.180 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
   1.181 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
   1.182 +
   1.183 +	movdqu 48(%rdi), %xmm1	/* round key 3 */
   1.184 +	movdqu 64(%rdi), %xmm11	/* round key 4 */
   1.185 +	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
   1.186 +	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
   1.187 +	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
   1.188 +	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
   1.189 +	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
   1.190 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
   1.191 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
   1.192 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
   1.193 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
   1.194 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
   1.195 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
   1.196 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
   1.197 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
   1.198 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
   1.199 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
   1.200 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
   1.201 +
   1.202 +	movdqu 80(%rdi), %xmm1	/* round key 5 */
   1.203 +	movdqu 96(%rdi), %xmm11	/* round key 6 */
   1.204 +	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
   1.205 +	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
   1.206 +	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
   1.207 +	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
   1.208 +	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
   1.209 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
   1.210 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
   1.211 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
   1.212 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
   1.213 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
   1.214 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
   1.215 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
   1.216 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
   1.217 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
   1.218 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
   1.219 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
   1.220 +
   1.221 +	movdqu 112(%rdi), %xmm1	/* round key 7 */
   1.222 +	movdqu 128(%rdi), %xmm11	/* round key 8 */
   1.223 +	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
   1.224 +	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
   1.225 +	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
   1.226 +	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
   1.227 +	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
   1.228 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
   1.229 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
   1.230 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
   1.231 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
   1.232 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
   1.233 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
   1.234 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
   1.235 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
   1.236 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
   1.237 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
   1.238 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
   1.239 +
   1.240 +	movdqu 144(%rdi), %xmm1	/* round key 9; the final round below uses round key 10 held in xmm12 */
   1.241 +	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
   1.242 +	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
   1.243 +	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
   1.244 +	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
   1.245 +	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
   1.246 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
   1.247 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
   1.248 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
   1.249 +	.byte 0x66,0x41,0x0f,0x38,0xdd,0xdc 	/* aesenclast 	%xmm12, %xmm3 */
   1.250 +	.byte 0x66,0x41,0x0f,0x38,0xdd,0xe4 	/* aesenclast 	%xmm12, %xmm4 */
   1.251 +	.byte 0x66,0x41,0x0f,0x38,0xdd,0xec 	/* aesenclast 	%xmm12, %xmm5 */
   1.252 +	.byte 0x66,0x41,0x0f,0x38,0xdd,0xf4 	/* aesenclast 	%xmm12, %xmm6 */
   1.253 +	.byte 0x66,0x41,0x0f,0x38,0xdd,0xfc 	/* aesenclast 	%xmm12, %xmm7 */
   1.254 +	.byte 0x66,0x45,0x0f,0x38,0xdd,0xc4 	/* aesenclast 	%xmm12, %xmm8 */
   1.255 +	.byte 0x66,0x45,0x0f,0x38,0xdd,0xcc 	/* aesenclast 	%xmm12, %xmm9 */
   1.256 +	.byte 0x66,0x45,0x0f,0x38,0xdd,0xd4 	/* aesenclast 	%xmm12, %xmm10 */
   1.257 +
   1.258 +	movdqu	%xmm3, (%rsi, %rax)	/* store the 8 encrypted blocks at the same offset in output */
   1.259 +	movdqu	%xmm4, 16(%rsi, %rax)
   1.260 +	movdqu	%xmm5, 32(%rsi, %rax)
   1.261 +	movdqu	%xmm6, 48(%rsi, %rax)
   1.262 +	movdqu	%xmm7, 64(%rsi, %rax)
   1.263 +	movdqu	%xmm8, 80(%rsi, %rax)
   1.264 +	movdqu	%xmm9, 96(%rsi, %rax)
   1.265 +	movdqu	%xmm10, 112(%rsi, %rax)
   1.266 +//	addq	$8*16, %rax
   1.267 +	addq	$128, %rax	/* advance by 8 blocks */
   1.268 +	cmpq	%r11, %rax
   1.269 +	jbe	2b	/* unsigned compare: repeat while %rax <= inputLen - 128 */
   1.270 +1:	cmpq	%rax, %r9	/* everything consumed already? */
   1.271 +	je	5f
   1.272 +
   1.273 +	movdqu	16(%rdi), %xmm3	/* preload round keys 1..9 into xmm3..xmm11 for the single-block loop */
   1.274 +	movdqu	32(%rdi), %xmm4
   1.275 +	movdqu	48(%rdi), %xmm5
   1.276 +	movdqu	64(%rdi), %xmm6
   1.277 +	movdqu	80(%rdi), %xmm7
   1.278 +	movdqu	96(%rdi), %xmm8
   1.279 +	movdqu	112(%rdi), %xmm9
   1.280 +	movdqu	128(%rdi), %xmm10
   1.281 +	movdqu	144(%rdi), %xmm11
   1.282 +
   1.283 +4:	movdqu	(%r8, %rax), %xmm1	/* one input block per iteration */
   1.284 +	pxor	%xmm2, %xmm1	/* XOR with round key 0 */
   1.285 +	.byte 0x66,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm3, %xmm1 */
   1.286 +	.byte 0x66,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm4, %xmm1 */
   1.287 +	.byte 0x66,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm5, %xmm1 */
   1.288 +	.byte 0x66,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm6, %xmm1 */
   1.289 +	.byte 0x66,0x0f,0x38,0xdc,0xcf	/* aesenc	%xmm7, %xmm1 */
   1.290 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc8	/* aesenc	%xmm8, %xmm1 */
   1.291 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm9, %xmm1 */
   1.292 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm10, %xmm1 */
   1.293 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm1 */
   1.294 +	.byte 0x66,0x41,0x0f,0x38,0xdd,0xcc	/* aesenclast %xmm12, %xmm1 */
   1.295 +	movdqu	%xmm1, (%rsi, %rax)	/* store the encrypted block */
   1.296 +	addq	$16, %rax
   1.297 +	cmpq	%rax, %r9
   1.298 +	jne	4b	/* exits only when %rax == inputLen exactly; NOTE(review): relies on inputLen being a multiple of 16 - confirm the caller guarantees it */
   1.299 +
   1.300 +5:	xor	%eax, %eax	/* return 0 */
   1.301 +	ret
   1.302 +	.size intel_aes_encrypt_ecb_128, .-intel_aes_encrypt_ecb_128
   1.303 +
   1.304 +
   1.305 +/* in %rdi : cx - context
   1.306 +   in %rsi : output - pointer to output buffer
   1.307 +   in %rdx : outputLen - pointer to variable for length of output
   1.308 +             (filled by caller)
   1.309 +   in %rcx : maxOutputLen - length of output buffer
   1.310 +   in %r8  : input - pointer to input buffer
   1.311 +   in %r9  : inputLen - length of input buffer
   1.312 +   on stack: blocksize - AES blocksize (always 16, unused)
   1.313 +*/
   1.314 +	.type intel_aes_decrypt_ecb_128,@function
   1.315 +	.globl intel_aes_decrypt_ecb_128
   1.316 +	.align	16
   1.317 +intel_aes_decrypt_ecb_128:	/* ECB-decrypt inputLen bytes from input to output: 8 blocks per pass, then 1 block per pass; returns 0 */
   1.318 +//	leaq	EXPANDED_KEY_OFFSET(%rdi), %rdi
   1.319 +	leaq	48(%rdi), %rdi	/* %rdi -> expanded key inside cx (48 is the value given for EXPANDED_KEY_OFFSET) */
   1.320 +
   1.321 +	movdqu	(%rdi), %xmm2	/* xmm2 = round key 0 (used last, by aesdeclast) */
   1.322 +	movdqu	160(%rdi), %xmm12	/* xmm12 = round key 10 (used first) */
   1.323 +	xorl	%eax, %eax	/* %rax = byte offset into input and output */
   1.324 +//	cmpq	$8*16, %r9
   1.325 +	cmpq	$128, %r9	/* fewer than 8 blocks in total? */
   1.326 +	jb	1f	/* yes: skip the 8-block loop */
   1.327 +//	leaq	-8*16(%r9), %r11
   1.328 +	leaq	-128(%r9), %r11	/* %r11 = largest offset at which 8 whole blocks still remain */
   1.329 +2:	movdqu	(%r8, %rax), %xmm3	/* load 8 consecutive input blocks into xmm3..xmm10 */
   1.330 +	movdqu	16(%r8, %rax), %xmm4
   1.331 +	movdqu	32(%r8, %rax), %xmm5
   1.332 +	movdqu	48(%r8, %rax), %xmm6
   1.333 +	movdqu	64(%r8, %rax), %xmm7
   1.334 +	movdqu	80(%r8, %rax), %xmm8
   1.335 +	movdqu	96(%r8, %rax), %xmm9
   1.336 +	movdqu	112(%r8, %rax), %xmm10
   1.337 +	pxor	%xmm12, %xmm3	/* initial step: XOR every block with round key 10 */
   1.338 +	pxor	%xmm12, %xmm4
   1.339 +	pxor	%xmm12, %xmm5
   1.340 +	pxor	%xmm12, %xmm6
   1.341 +	pxor	%xmm12, %xmm7
   1.342 +	pxor	%xmm12, %xmm8
   1.343 +	pxor	%xmm12, %xmm9
   1.344 +	pxor	%xmm12, %xmm10
   1.345 +
   1.346 +// complete loop unrolling
   1.347 +	movdqu 144(%rdi), %xmm1	/* round key 9; presumably stored in aesimc form by intel_aes_decrypt_init_128 - verify against caller */
   1.348 +	movdqu 128(%rdi), %xmm11	/* round key 8 */
   1.349 +	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   1.350 +	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   1.351 +	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   1.352 +	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   1.353 +	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   1.354 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   1.355 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   1.356 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   1.357 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   1.358 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   1.359 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   1.360 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   1.361 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   1.362 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   1.363 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   1.364 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   1.365 +
   1.366 +	movdqu 112(%rdi), %xmm1	/* round key 7 */
   1.367 +	movdqu 96(%rdi), %xmm11	/* round key 6 */
   1.368 +	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   1.369 +	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   1.370 +	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   1.371 +	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   1.372 +	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   1.373 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   1.374 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   1.375 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   1.376 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   1.377 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   1.378 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   1.379 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   1.380 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   1.381 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   1.382 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   1.383 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   1.384 +
   1.385 +	movdqu 80(%rdi), %xmm1	/* round key 5 */
   1.386 +	movdqu 64(%rdi), %xmm11	/* round key 4 */
   1.387 +	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   1.388 +	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   1.389 +	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   1.390 +	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   1.391 +	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   1.392 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   1.393 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   1.394 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   1.395 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   1.396 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   1.397 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   1.398 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   1.399 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   1.400 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   1.401 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   1.402 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   1.403 +
   1.404 +	movdqu 48(%rdi), %xmm1	/* round key 3 */
   1.405 +	movdqu 32(%rdi), %xmm11	/* round key 2 */
   1.406 +	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   1.407 +	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   1.408 +	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   1.409 +	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   1.410 +	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   1.411 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   1.412 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   1.413 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   1.414 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   1.415 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   1.416 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   1.417 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   1.418 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   1.419 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   1.420 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   1.421 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   1.422 +
   1.423 +	movdqu 16(%rdi), %xmm1	/* round key 1; the final round below uses round key 0 held in xmm2 */
   1.424 +	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   1.425 +	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   1.426 +	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   1.427 +	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   1.428 +	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   1.429 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   1.430 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   1.431 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   1.432 +	.byte 0x66,0x0f,0x38,0xdf,0xda		/* aesdeclast 	%xmm2, %xmm3 */
   1.433 +	.byte 0x66,0x0f,0x38,0xdf,0xe2		/* aesdeclast 	%xmm2, %xmm4 */
   1.434 +	.byte 0x66,0x0f,0x38,0xdf,0xea		/* aesdeclast 	%xmm2, %xmm5 */
   1.435 +	.byte 0x66,0x0f,0x38,0xdf,0xf2		/* aesdeclast 	%xmm2, %xmm6 */
   1.436 +	.byte 0x66,0x0f,0x38,0xdf,0xfa		/* aesdeclast 	%xmm2, %xmm7 */
   1.437 +	.byte 0x66,0x44,0x0f,0x38,0xdf,0xc2	/* aesdeclast 	%xmm2, %xmm8 */
   1.438 +	.byte 0x66,0x44,0x0f,0x38,0xdf,0xca	/* aesdeclast 	%xmm2, %xmm9 */
   1.439 +	.byte 0x66,0x44,0x0f,0x38,0xdf,0xd2	/* aesdeclast 	%xmm2, %xmm10 */
   1.440 +
   1.441 +	movdqu	%xmm3, (%rsi, %rax)	/* store the 8 decrypted blocks at the same offset in output */
   1.442 +	movdqu	%xmm4, 16(%rsi, %rax)
   1.443 +	movdqu	%xmm5, 32(%rsi, %rax)
   1.444 +	movdqu	%xmm6, 48(%rsi, %rax)
   1.445 +	movdqu	%xmm7, 64(%rsi, %rax)
   1.446 +	movdqu	%xmm8, 80(%rsi, %rax)
   1.447 +	movdqu	%xmm9, 96(%rsi, %rax)
   1.448 +	movdqu	%xmm10, 112(%rsi, %rax)
   1.449 +//	addq	$8*16, %rax
   1.450 +	addq	$128, %rax	/* advance by 8 blocks */
   1.451 +	cmpq	%r11, %rax
   1.452 +	jbe	2b	/* unsigned compare: repeat while %rax <= inputLen - 128 */
   1.453 +1:	cmpq	%rax, %r9	/* everything consumed already? */
   1.454 +	je	5f
   1.455 +
   1.456 +	movdqu	16(%rdi), %xmm3	/* preload round keys 1..9 into xmm3..xmm11; the loop applies them in reverse order */
   1.457 +	movdqu	32(%rdi), %xmm4
   1.458 +	movdqu	48(%rdi), %xmm5
   1.459 +	movdqu	64(%rdi), %xmm6
   1.460 +	movdqu	80(%rdi), %xmm7
   1.461 +	movdqu	96(%rdi), %xmm8
   1.462 +	movdqu	112(%rdi), %xmm9
   1.463 +	movdqu	128(%rdi), %xmm10
   1.464 +	movdqu	144(%rdi), %xmm11
   1.465 +
   1.466 +4:	movdqu	(%r8, %rax), %xmm1	/* one input block per iteration */
   1.467 +	pxor	%xmm12, %xmm1	/* XOR with round key 10 */
   1.468 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm1 */
   1.469 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xca	/* aesdec	%xmm10, %xmm1 */
   1.470 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm9, %xmm1 */
   1.471 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xc8	/* aesdec	%xmm8, %xmm1 */
   1.472 +	.byte 0x66,0x0f,0x38,0xde,0xcf	/* aesdec	%xmm7, %xmm1 */
   1.473 +	.byte 0x66,0x0f,0x38,0xde,0xce	/* aesdec	%xmm6, %xmm1 */
   1.474 +	.byte 0x66,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm5, %xmm1 */
   1.475 +	.byte 0x66,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm4, %xmm1 */
   1.476 +	.byte 0x66,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm3, %xmm1 */
   1.477 +	.byte 0x66,0x0f,0x38,0xdf,0xca	/* aesdeclast %xmm2, %xmm1 */
   1.478 +	movdqu	%xmm1, (%rsi, %rax)	/* store the decrypted block */
   1.479 +	addq	$16, %rax
   1.480 +	cmpq	%rax, %r9
   1.481 +	jne	4b	/* exits only when %rax == inputLen exactly; NOTE(review): relies on inputLen being a multiple of 16 - confirm the caller guarantees it */
   1.482 +
   1.483 +5:	xor	%eax, %eax	/* return 0 */
   1.484 +	ret
   1.485 +	.size intel_aes_decrypt_ecb_128, .-intel_aes_decrypt_ecb_128
   1.486 +
   1.487 +
   1.488 +/* in %rdi : cx - context
   1.489 +   in %rsi : output - pointer to output buffer
   1.490 +   in %rdx : outputLen - pointer to variable for length of output
   1.491 +             (filled by caller)
   1.492 +   in %rcx : maxOutputLen - length of output buffer
   1.493 +   in %r8  : input - pointer to input buffer
   1.494 +   in %r9  : inputLen - length of input buffer
   1.495 +   on stack: blocksize - AES blocksize (always 16, unused)
   1.496 +*/
   1.497 +	.type intel_aes_encrypt_cbc_128,@function
   1.498 +	.globl intel_aes_encrypt_cbc_128
   1.499 +	.align	16
   1.500 +intel_aes_encrypt_cbc_128:	/* CBC-encrypt inputLen bytes one block at a time, chaining through the IV kept in cx; returns 0 */
   1.501 +	testq	%r9, %r9	/* inputLen == 0? */
   1.502 +	je	2f	/* nothing to do; the stored IV is left untouched */
   1.503 +
   1.504 +//	leaq	IV_OFFSET(%rdi), %rdx
   1.505 +//	leaq	EXPANDED_KEY_OFFSET(%rdi), %rdi
   1.506 +	leaq	16(%rdi), %rdx	/* %rdx -> IV inside cx (16 is the value given for IV_OFFSET); the incoming outputLen pointer in %rdx is discarded */
   1.507 +	leaq	48(%rdi), %rdi	/* %rdi -> expanded key inside cx (48 is the value given for EXPANDED_KEY_OFFSET) */
   1.508 +
   1.509 +	movdqu	(%rdx), %xmm0	/* xmm0 = chaining value, initially the stored IV */
   1.510 +	movdqu	(%rdi), %xmm2	/* xmm2..xmm12 = round keys 0..10 */
   1.511 +	movdqu	16(%rdi), %xmm3
   1.512 +	movdqu	32(%rdi), %xmm4
   1.513 +	movdqu	48(%rdi), %xmm5
   1.514 +	movdqu	64(%rdi), %xmm6
   1.515 +	movdqu	80(%rdi), %xmm7
   1.516 +	movdqu	96(%rdi), %xmm8
   1.517 +	movdqu	112(%rdi), %xmm9
   1.518 +	movdqu	128(%rdi), %xmm10
   1.519 +	movdqu	144(%rdi), %xmm11
   1.520 +	movdqu	160(%rdi), %xmm12
   1.521 +
   1.522 +	xorl	%eax, %eax	/* %rax = byte offset into input and output */
   1.523 +1:	movdqu	(%r8, %rax), %xmm1	/* one input block per iteration */
   1.524 +	pxor	%xmm0, %xmm1	/* XOR with the previous output block (or the IV on the first pass) */
   1.525 +	pxor	%xmm2, %xmm1	/* XOR with round key 0 */
   1.526 +	.byte 0x66,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm3, %xmm1 */
   1.527 +	.byte 0x66,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm4, %xmm1 */
   1.528 +	.byte 0x66,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm5, %xmm1 */
   1.529 +	.byte 0x66,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm6, %xmm1 */
   1.530 +	.byte 0x66,0x0f,0x38,0xdc,0xcf	/* aesenc	%xmm7, %xmm1 */
   1.531 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc8	/* aesenc	%xmm8, %xmm1 */
   1.532 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm9, %xmm1 */
   1.533 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm10, %xmm1 */
   1.534 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm1 */
   1.535 +	.byte 0x66,0x41,0x0f,0x38,0xdd,0xcc	/* aesenclast %xmm12, %xmm1 */
   1.536 +	movdqu	%xmm1, (%rsi, %rax)	/* store the encrypted block */
   1.537 +	movdqa	%xmm1, %xmm0	/* it becomes the chaining value for the next block */
   1.538 +	addq	$16, %rax
   1.539 +	cmpq	%rax, %r9
   1.540 +	jne	1b	/* exits only when %rax == inputLen exactly; NOTE(review): relies on inputLen being a multiple of 16 - confirm the caller guarantees it */
   1.541 +
   1.542 +	movdqu	%xmm0, (%rdx)	/* write the last output block back into cx as the new IV */
   1.543 +
   1.544 +2:	xor	%eax, %eax	/* return 0 */
   1.545 +	ret
   1.546 +	.size intel_aes_encrypt_cbc_128, .-intel_aes_encrypt_cbc_128
   1.547 +
   1.548 +
   1.549 +/* in %rdi : cx - context
   1.550 +   in %rsi : output - pointer to output buffer
   1.551 +   in %rdx : outputLen - pointer to variable for length of output
   1.552 +             (filled by caller)
   1.553 +   in %rcx : maxOutputLen - length of output buffer
   1.554 +   in %r8  : input - pointer to input buffer
   1.555 +   in %r9  : inputLen - length of input buffer
   1.556 +   on stack: blocksize - AES blocksize (always 16, unused)
   1.557 +*/
   1.558 +	.type intel_aes_decrypt_cbc_128,@function
   1.559 +	.globl intel_aes_decrypt_cbc_128
   1.560 +	.align	16
   1.561 +intel_aes_decrypt_cbc_128:
   1.562 +//	leaq	IV_OFFSET(%rdi), %rdx
   1.563 +//	leaq	EXPANDED_KEY_OFFSET(%rdi), %rdi
   1.564 +	leaq	16(%rdi), %rdx
   1.565 +	leaq	48(%rdi), %rdi
   1.566 +
   1.567 +	movdqu	(%rdx), %xmm0   /* iv */
   1.568 +	movdqu	(%rdi), %xmm2   /* first key block */
   1.569 +	movdqu	160(%rdi), %xmm12 /* last key block */
   1.570 +	xorl	%eax, %eax
   1.571 +	cmpq	$128, %r9
   1.572 +	jb	1f
   1.573 +	leaq	-128(%r9), %r11
   1.574 +2:	movdqu	(%r8, %rax), %xmm3 /* 1st data block */
   1.575 +	movdqu	16(%r8, %rax), %xmm4 /* 2d data block */
   1.576 +	movdqu	32(%r8, %rax), %xmm5
   1.577 +	movdqu	48(%r8, %rax), %xmm6
   1.578 +	movdqu	64(%r8, %rax), %xmm7
   1.579 +	movdqu	80(%r8, %rax), %xmm8
   1.580 +	movdqu	96(%r8, %rax), %xmm9
   1.581 +	movdqu	112(%r8, %rax), %xmm10
   1.582 +	pxor	%xmm12, %xmm3
   1.583 +	pxor	%xmm12, %xmm4
   1.584 +	pxor	%xmm12, %xmm5
   1.585 +	pxor	%xmm12, %xmm6
   1.586 +	pxor	%xmm12, %xmm7
   1.587 +	pxor	%xmm12, %xmm8
   1.588 +	pxor	%xmm12, %xmm9
   1.589 +	pxor	%xmm12, %xmm10
   1.590 +
   1.591 +// complete loop unrolling
   1.592 +	movdqu 144(%rdi), %xmm1
   1.593 +	movdqu 128(%rdi), %xmm11
   1.594 +	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   1.595 +	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   1.596 +	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   1.597 +	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   1.598 +	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   1.599 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   1.600 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   1.601 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   1.602 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   1.603 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   1.604 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   1.605 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   1.606 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   1.607 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   1.608 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   1.609 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   1.610 +
   1.611 +	movdqu 112(%rdi), %xmm1
   1.612 +	movdqu 96(%rdi), %xmm11
   1.613 +	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   1.614 +	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   1.615 +	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   1.616 +	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   1.617 +	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   1.618 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   1.619 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   1.620 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   1.621 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   1.622 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   1.623 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   1.624 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   1.625 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   1.626 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   1.627 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   1.628 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   1.629 +
   1.630 +	movdqu 80(%rdi), %xmm1
   1.631 +	movdqu 64(%rdi), %xmm11
   1.632 +	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   1.633 +	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   1.634 +	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   1.635 +	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   1.636 +	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   1.637 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   1.638 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   1.639 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   1.640 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   1.641 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   1.642 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   1.643 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   1.644 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   1.645 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   1.646 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   1.647 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   1.648 +
   1.649 +	movdqu 48(%rdi), %xmm1
   1.650 +	movdqu 32(%rdi), %xmm11
   1.651 +	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   1.652 +	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   1.653 +	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   1.654 +	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   1.655 +	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   1.656 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   1.657 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   1.658 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   1.659 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   1.660 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   1.661 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   1.662 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   1.663 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   1.664 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   1.665 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   1.666 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   1.667 +
   1.668 +	movdqu 16(%rdi), %xmm1
   1.669 +	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   1.670 +	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   1.671 +	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   1.672 +	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   1.673 +	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   1.674 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   1.675 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   1.676 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   1.677 +	.byte 0x66,0x0f,0x38,0xdf,0xda		/* aesdeclast 	%xmm2, %xmm3 */
   1.678 +	.byte 0x66,0x0f,0x38,0xdf,0xe2		/* aesdeclast 	%xmm2, %xmm4 */
   1.679 +	.byte 0x66,0x0f,0x38,0xdf,0xea		/* aesdeclast 	%xmm2, %xmm5 */
   1.680 +	.byte 0x66,0x0f,0x38,0xdf,0xf2		/* aesdeclast 	%xmm2, %xmm6 */
   1.681 +	.byte 0x66,0x0f,0x38,0xdf,0xfa		/* aesdeclast 	%xmm2, %xmm7 */
   1.682 +	.byte 0x66,0x44,0x0f,0x38,0xdf,0xc2	/* aesdeclast 	%xmm2, %xmm8 */
   1.683 +	.byte 0x66,0x44,0x0f,0x38,0xdf,0xca	/* aesdeclast 	%xmm2, %xmm9 */
   1.684 +	.byte 0x66,0x44,0x0f,0x38,0xdf,0xd2	/* aesdeclast 	%xmm2, %xmm10 */
   1.685 +
   1.686 + 	pxor	%xmm0, %xmm3
   1.687 +	movdqu	(%r8, %rax), %xmm0
   1.688 +	pxor	%xmm0, %xmm4
   1.689 +	movdqu	16(%r8, %rax), %xmm0
   1.690 +	pxor	%xmm0, %xmm5
   1.691 +	movdqu	32(%r8, %rax), %xmm0
   1.692 +	pxor	%xmm0, %xmm6
   1.693 +	movdqu	48(%r8, %rax), %xmm0
   1.694 +	pxor	%xmm0, %xmm7
   1.695 +	movdqu	64(%r8, %rax), %xmm0
   1.696 +	pxor	%xmm0, %xmm8
   1.697 +	movdqu	80(%r8, %rax), %xmm0
   1.698 +	pxor	%xmm0, %xmm9
   1.699 +	movdqu	96(%r8, %rax), %xmm0
   1.700 +	pxor	%xmm0, %xmm10
   1.701 +	movdqu	112(%r8, %rax), %xmm0
   1.702 +	movdqu	%xmm3, (%rsi, %rax)
   1.703 +	movdqu	%xmm4, 16(%rsi, %rax)
   1.704 +	movdqu	%xmm5, 32(%rsi, %rax)
   1.705 +	movdqu	%xmm6, 48(%rsi, %rax)
   1.706 +	movdqu	%xmm7, 64(%rsi, %rax)
   1.707 +	movdqu	%xmm8, 80(%rsi, %rax)
   1.708 +	movdqu	%xmm9, 96(%rsi, %rax)
   1.709 +	movdqu	%xmm10, 112(%rsi, %rax)
   1.710 +	addq	$128, %rax
   1.711 +	cmpq	%r11, %rax
   1.712 +	jbe	2b
   1.713 +1:	cmpq	%rax, %r9
   1.714 +	je	5f
   1.715 +
   1.716 +	movdqu	16(%rdi), %xmm3
   1.717 +	movdqu	32(%rdi), %xmm4
   1.718 +	movdqu	48(%rdi), %xmm5
   1.719 +	movdqu	64(%rdi), %xmm6
   1.720 +	movdqu	80(%rdi), %xmm7
   1.721 +	movdqu	96(%rdi), %xmm8
   1.722 +	movdqu	112(%rdi), %xmm9
   1.723 +	movdqu	128(%rdi), %xmm10
   1.724 +	movdqu	144(%rdi), %xmm11
   1.725 +
   1.726 +4:	movdqu	(%r8, %rax), %xmm1
   1.727 +	movdqa	%xmm1, %xmm13
   1.728 +	pxor	%xmm12, %xmm1
   1.729 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm1 */
   1.730 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xca	/* aesdec	%xmm10, %xmm1 */
   1.731 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm9, %xmm1 */
   1.732 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xc8	/* aesdec	%xmm8, %xmm1 */
   1.733 +	.byte 0x66,0x0f,0x38,0xde,0xcf	/* aesdec	%xmm7, %xmm1 */
   1.734 +	.byte 0x66,0x0f,0x38,0xde,0xce	/* aesdec	%xmm6, %xmm1 */
   1.735 +	.byte 0x66,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm5, %xmm1 */
   1.736 +	.byte 0x66,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm4, %xmm1 */
   1.737 +	.byte 0x66,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm3, %xmm1 */
   1.738 +	.byte 0x66,0x0f,0x38,0xdf,0xca	/* aesdeclast %xmm2, %xmm1 */
   1.739 +	pxor	%xmm0, %xmm1
   1.740 +	movdqu	%xmm1, (%rsi, %rax)
   1.741 +	movdqa	%xmm13, %xmm0
   1.742 +	addq	$16, %rax
   1.743 +	cmpq	%rax, %r9
   1.744 +	jne	4b
   1.745 +
   1.746 +5:	movdqu	%xmm0, (%rdx)
   1.747 +
   1.748 +	xor	%eax, %eax
   1.749 +	ret
   1.750 +	.size intel_aes_decrypt_cbc_128, .-intel_aes_decrypt_cbc_128
   1.751 +        
   1.752 +/* in %rdi : the key (24 bytes are read)
   1.753 +   in %rsi : buffer for expanded key (the 24 key bytes plus 8 x 24 generated bytes are written)
   1.754 +*/
   1.755 +	.type intel_aes_encrypt_init_192,@function
   1.756 +	.globl intel_aes_encrypt_init_192
   1.757 +	.align	16
   1.758 +intel_aes_encrypt_init_192:	/* fill the buffer at %rsi with the expanded form of the 24-byte key at %rdi */
   1.759 +	movdqu	(%rdi), %xmm1	/* xmm1 = key bytes 0..15 */
   1.760 +	movq	16(%rdi), %xmm3	/* xmm3 (low qword) = key bytes 16..23 */
   1.761 +	movdqu	%xmm1, (%rsi)	/* the buffer starts with the unmodified key */
   1.762 +	movq	%xmm3, 16(%rsi)
   1.763 +	leaq	24(%rsi), %rsi	/* rsi = write cursor; each key_expansion192 call advances it by 24 */
   1.764 +
   1.765 +	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x01	/* aeskeygenassist $0x01, %xmm3, %xmm2 */
   1.766 +	call key_expansion192	/* consumes xmm2; updates xmm1/xmm3 and appends them at (%rsi) */
   1.767 +	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x02	/* aeskeygenassist $0x02, %xmm3, %xmm2 */
   1.768 +	call key_expansion192
   1.769 +	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x04	/* aeskeygenassist $0x04, %xmm3, %xmm2 */
   1.770 +	call key_expansion192
   1.771 +	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x08	/* aeskeygenassist $0x08, %xmm3, %xmm2 */
   1.772 +	call key_expansion192
   1.773 +	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x10	/* aeskeygenassist $0x10, %xmm3, %xmm2 */
   1.774 +	call key_expansion192
   1.775 +	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x20	/* aeskeygenassist $0x20, %xmm3, %xmm2 */
   1.776 +	call key_expansion192
   1.777 +	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x40	/* aeskeygenassist $0x40, %xmm3, %xmm2 */
   1.778 +	call key_expansion192
   1.779 +	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x80	/* aeskeygenassist $0x80, %xmm3, %xmm2 */
   1.780 +	call key_expansion192	/* 8th call: the final 24 bytes end at offset 216 of the buffer */
   1.781 +
   1.782 +	ret	/* %eax is 0 here: key_expansion192 clears it on every call */
   1.783 +	.size intel_aes_encrypt_init_192, .-intel_aes_encrypt_init_192
   1.784 +
   1.785 +
   1.786 +/* in %rdi : the key (24 bytes are read)
   1.787 +   in %rsi : buffer for expanded key (216 bytes are written; the 16-byte blocks at offsets 16..176 are passed through aesimc)
   1.788 +*/
   1.789 +	.type intel_aes_decrypt_init_192,@function
   1.790 +	.globl intel_aes_decrypt_init_192
   1.791 +	.align	16
   1.792 +intel_aes_decrypt_init_192:	/* same expansion as the encrypt variant, plus aesimc on the stored blocks 1..11 */
   1.793 +	movdqu	(%rdi), %xmm1	/* xmm1 = key bytes 0..15 */
   1.794 +	movq	16(%rdi), %xmm3	/* xmm3 (low qword) = key bytes 16..23 */
   1.795 +	movdqu	%xmm1, (%rsi)	/* block 0 is stored as-is and never transformed */
   1.796 +	movq	%xmm3, 16(%rsi)
   1.797 +	leaq	24(%rsi), %rsi	/* rsi = write cursor (buffer + 24) */
   1.798 +
   1.799 +	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x01	/* aeskeygenassist $0x01, %xmm3, %xmm2 */
   1.800 +	call key_expansion192	/* cursor is now buffer + 48 */
   1.801 +	movups	-32(%rsi), %xmm2	/* 16-byte block at buffer + 16 */
   1.802 +	movups	-16(%rsi), %xmm4	/* 16-byte block at buffer + 32 */
   1.803 +	.byte 0x66,0x0f,0x38,0xdb,0xd2	/* aesimc	%xmm2, %xmm2 */
   1.804 +	.byte 0x66,0x0f,0x38,0xdb,0xe4	/* aesimc	%xmm4, %xmm4 */
   1.805 +	movups	%xmm2, -32(%rsi)	/* only memory is rewritten; the expansion state stays in xmm1/xmm3 */
   1.806 +	movups	%xmm4, -16(%rsi)
   1.807 +	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x02	/* aeskeygenassist $0x02, %xmm3, %xmm2 */
   1.808 +	call key_expansion192	/* cursor is now buffer + 72 */
   1.809 +	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
   1.810 +	movups	%xmm2, -24(%rsi)	/* overwrite the block at buffer + 48, which the call just stored from xmm1 */
   1.811 +	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x04	/* aeskeygenassist $0x04, %xmm3, %xmm2 */
   1.812 +	call key_expansion192	/* cursor is now buffer + 96 */
   1.813 +	movups	-32(%rsi), %xmm2	/* block at buffer + 64 */
   1.814 +	movups	-16(%rsi), %xmm4	/* block at buffer + 80 */
   1.815 +	.byte 0x66,0x0f,0x38,0xdb,0xd2	/* aesimc	%xmm2, %xmm2 */
   1.816 +	.byte 0x66,0x0f,0x38,0xdb,0xe4	/* aesimc	%xmm4, %xmm4 */
   1.817 +	movups	%xmm2, -32(%rsi)
   1.818 +	movups	%xmm4, -16(%rsi)
   1.819 +	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x08	/* aeskeygenassist $0x08, %xmm3, %xmm2 */
   1.820 +	call key_expansion192	/* cursor is now buffer + 120 */
   1.821 +	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
   1.822 +	movups	%xmm2, -24(%rsi)	/* block at buffer + 96 */
   1.823 +	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x10	/* aeskeygenassist $0x10, %xmm3, %xmm2 */
   1.824 +	call key_expansion192	/* cursor is now buffer + 144 */
   1.825 +	movups	-32(%rsi), %xmm2	/* block at buffer + 112 */
   1.826 +	movups	-16(%rsi), %xmm4	/* block at buffer + 128 */
   1.827 +	.byte 0x66,0x0f,0x38,0xdb,0xd2	/* aesimc	%xmm2, %xmm2 */
   1.828 +	.byte 0x66,0x0f,0x38,0xdb,0xe4	/* aesimc	%xmm4, %xmm4 */
   1.829 +	movups	%xmm2, -32(%rsi)
   1.830 +	movups	%xmm4, -16(%rsi)
   1.831 +	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x20	/* aeskeygenassist $0x20, %xmm3, %xmm2 */
   1.832 +	call key_expansion192	/* cursor is now buffer + 168 */
   1.833 +	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
   1.834 +	movups	%xmm2, -24(%rsi)	/* block at buffer + 144 */
   1.835 +	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x40	/* aeskeygenassist $0x40, %xmm3, %xmm2 */
   1.836 +	call key_expansion192	/* cursor is now buffer + 192 */
   1.837 +	movups	-32(%rsi), %xmm2	/* block at buffer + 160 */
   1.838 +	movups	-16(%rsi), %xmm4	/* block at buffer + 176 */
   1.839 +	.byte 0x66,0x0f,0x38,0xdb,0xd2	/* aesimc	%xmm2, %xmm2 */
   1.840 +	.byte 0x66,0x0f,0x38,0xdb,0xe4	/* aesimc	%xmm4, %xmm4 */
   1.841 +	movups	%xmm2, -32(%rsi)
   1.842 +	movups	%xmm4, -16(%rsi)
   1.843 +	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x80	/* aeskeygenassist $0x80, %xmm3, %xmm2 */
   1.844 +	call key_expansion192	/* stores the block at buffer + 192, which is left without aesimc */
   1.845 +
   1.846 +	ret	/* %eax is 0 here: key_expansion192 clears it on every call */
   1.847 +	.size intel_aes_decrypt_init_192, .-intel_aes_decrypt_init_192
   1.848 +
   1.849 +
   1.850 +	.type key_expansion192,@function	/* local helper (no .globl): one 24-byte step of the 192-bit key expansion */
   1.851 +	.align	16	/* in: xmm1 = previous 4 words, xmm3 low qword = previous 2 words, xmm2 = aeskeygenassist result, rsi = write cursor */
   1.852 +key_expansion192:	/* out: xmm1/xmm3 updated, 24 bytes stored, rsi += 24; overwrites xmm2, xmm4, xmm5, eax (left 0) and flags */
   1.853 +	pshufd	$0x55, %xmm2, %xmm2	/* broadcast dword 1 of xmm2 to all four lanes (call it t) */
   1.854 +	xor	%eax, %eax	/* zero source for the movd instructions below */
   1.855 +	movd	%eax, %xmm4	/* xmm4 = 0 */
   1.856 +	shufps	$0x10, %xmm1, %xmm4	/* xmm4 = { 0, 0, x1, x0 } where x0..x3 are the lanes of xmm1 */
   1.857 +	pxor	%xmm4, %xmm1	/* xmm1 = { x0, x1, x2^x1, x3^x0 } */
   1.858 +	shufps	$0x8c, %xmm1, %xmm4	/* xmm4 = { 0, x0, x0, x1^x2 } */
   1.859 +	pxor	%xmm2, %xmm1
   1.860 +	pxor	%xmm4, %xmm1	/* xmm1 = { x0^t, x0^x1^t, x0^x1^x2^t, x0^x1^x2^x3^t } */
   1.861 +	movdqu	%xmm1, (%rsi)	/* store the four new words */
   1.862 +	addq	$16, %rsi
   1.863 +
   1.864 +	pshufd	$0xff, %xmm1, %xmm4	/* broadcast dword 3 of the new xmm1 (call it w) */
   1.865 +	movd	%eax, %xmm5	/* xmm5 = 0 (eax is still 0) */
   1.866 +	shufps	$0x00, %xmm3, %xmm5	/* xmm5 = { 0, 0, y0, y0 } where y0, y1 are the low lanes of xmm3 */
   1.867 +	shufps	$0x08, %xmm3, %xmm5	/* xmm5 = { 0, y0, y0, y0 } */
   1.868 +	pxor	%xmm4, %xmm3
   1.869 +	pxor	%xmm5, %xmm3	/* low lanes of xmm3 = { y0^w, y0^y1^w }; the upper lanes are never stored */
   1.870 +	movq	%xmm3, (%rsi)	/* store the two new words */
   1.871 +	addq	$8, %rsi
   1.872 +	ret
   1.873 +	.size key_expansion192, .-key_expansion192
   1.874 +
   1.875 +
   1.876 +/* in %rdi : cx - context (round keys are read starting at cx + 48)
   1.877 +   in %rsi : output - pointer to output buffer
   1.878 +   in %rdx : outputLen - pointer to variable for length of output
   1.879 +             (filled by caller; not referenced in this routine)
   1.880 +   in %rcx : maxOutputLen - length of output buffer (not referenced in this routine)
   1.881 +   in %r8  : input - pointer to input buffer
   1.882 +   in %r9  : inputLen - length of input buffer
   1.883 +   on stack: blocksize - AES blocksize (always 16, unused)
   1.884 +*/
   1.885 +	.type intel_aes_encrypt_ecb_192,@function
   1.886 +	.globl intel_aes_encrypt_ecb_192
   1.887 +	.align	16
   1.888 +intel_aes_encrypt_ecb_192:	/* encrypts 8 blocks per iteration while possible, then one block at a time; returns 0 in %eax */
   1.889 +//	leaq	EXPANDED_KEY_OFFSET(%rdi), %rdi
   1.890 +	leaq	48(%rdi), %rdi	/* rdi = cx + 48, the literal form of the commented-out line above */
   1.891 +
   1.892 +	movdqu	(%rdi), %xmm2	/* xmm2 = round key 0, XORed into every input block */
   1.893 +	movdqu	192(%rdi), %xmm14	/* xmm14 = round key 12, used by aesenclast */
   1.894 +	xorl	%eax, %eax	/* rax = byte offset into both input and output */
   1.895 +//	cmpq	$8*16, %r9
   1.896 +	cmpq	$128, %r9
   1.897 +	jb	1f	/* fewer than 8 blocks: go straight to the single-block loop */
   1.898 +//	leaq	-8*16(%r9), %r11
   1.899 +	leaq	-128(%r9), %r11	/* r11 = largest offset at which 8 whole blocks still fit */
   1.900 +2:	movdqu	(%r8, %rax), %xmm3	/* load 8 input blocks into xmm3..xmm10 */
   1.901 +	movdqu	16(%r8, %rax), %xmm4
   1.902 +	movdqu	32(%r8, %rax), %xmm5
   1.903 +	movdqu	48(%r8, %rax), %xmm6
   1.904 +	movdqu	64(%r8, %rax), %xmm7
   1.905 +	movdqu	80(%r8, %rax), %xmm8
   1.906 +	movdqu	96(%r8, %rax), %xmm9
   1.907 +	movdqu	112(%r8, %rax), %xmm10
   1.908 +	pxor	%xmm2, %xmm3	/* XOR round key 0 into each block */
   1.909 +	pxor	%xmm2, %xmm4
   1.910 +	pxor	%xmm2, %xmm5
   1.911 +	pxor	%xmm2, %xmm6
   1.912 +	pxor	%xmm2, %xmm7
   1.913 +	pxor	%xmm2, %xmm8
   1.914 +	pxor	%xmm2, %xmm9
   1.915 +	pxor	%xmm2, %xmm10
   1.916 +
   1.917 +// complete loop unrolling
   1.918 +	movdqu 16(%rdi), %xmm1	/* round key 1 */
   1.919 +	movdqu 32(%rdi), %xmm11	/* round key 2 */
   1.920 +	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
   1.921 +	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
   1.922 +	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
   1.923 +	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
   1.924 +	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
   1.925 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
   1.926 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
   1.927 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
   1.928 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
   1.929 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
   1.930 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
   1.931 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
   1.932 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
   1.933 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
   1.934 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
   1.935 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
   1.936 +
   1.937 +	movdqu 48(%rdi), %xmm1	/* round key 3 */
   1.938 +	movdqu 64(%rdi), %xmm11	/* round key 4 */
   1.939 +	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
   1.940 +	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
   1.941 +	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
   1.942 +	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
   1.943 +	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
   1.944 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
   1.945 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
   1.946 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
   1.947 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
   1.948 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
   1.949 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
   1.950 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
   1.951 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
   1.952 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
   1.953 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
   1.954 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
   1.955 +
   1.956 +	movdqu 80(%rdi), %xmm1	/* round key 5 */
   1.957 +	movdqu 96(%rdi), %xmm11	/* round key 6 */
   1.958 +	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
   1.959 +	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
   1.960 +	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
   1.961 +	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
   1.962 +	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
   1.963 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
   1.964 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
   1.965 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
   1.966 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
   1.967 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
   1.968 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
   1.969 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
   1.970 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
   1.971 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
   1.972 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
   1.973 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
   1.974 +
   1.975 +	movdqu 112(%rdi), %xmm1	/* round key 7 */
   1.976 +	movdqu 128(%rdi), %xmm11	/* round key 8 */
   1.977 +	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
   1.978 +	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
   1.979 +	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
   1.980 +	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
   1.981 +	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
   1.982 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
   1.983 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
   1.984 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
   1.985 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
   1.986 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
   1.987 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
   1.988 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
   1.989 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
   1.990 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
   1.991 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
   1.992 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
   1.993 +
   1.994 +	movdqu 144(%rdi), %xmm1	/* round key 9 */
   1.995 +	movdqu 160(%rdi), %xmm11	/* round key 10 */
   1.996 +	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
   1.997 +	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
   1.998 +	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
   1.999 +	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
  1.1000 +	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
  1.1001 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
  1.1002 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
  1.1003 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
  1.1004 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
  1.1005 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
  1.1006 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
  1.1007 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
  1.1008 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
  1.1009 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
  1.1010 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
  1.1011 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
  1.1012 +
  1.1013 +	movdqu 176(%rdi), %xmm1	/* round key 11 */
  1.1014 +	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
  1.1015 +	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
  1.1016 +	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
  1.1017 +	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
  1.1018 +	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
  1.1019 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
  1.1020 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
  1.1021 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
  1.1022 +	.byte 0x66,0x41,0x0f,0x38,0xdd,0xde	/* aesenclast 	%xmm14, %xmm3 */
  1.1023 +	.byte 0x66,0x41,0x0f,0x38,0xdd,0xe6	/* aesenclast 	%xmm14, %xmm4 */
  1.1024 +	.byte 0x66,0x41,0x0f,0x38,0xdd,0xee	/* aesenclast 	%xmm14, %xmm5 */
  1.1025 +	.byte 0x66,0x41,0x0f,0x38,0xdd,0xf6	/* aesenclast 	%xmm14, %xmm6 */
  1.1026 +	.byte 0x66,0x41,0x0f,0x38,0xdd,0xfe	/* aesenclast 	%xmm14, %xmm7 */
  1.1027 +	.byte 0x66,0x45,0x0f,0x38,0xdd,0xc6	/* aesenclast 	%xmm14, %xmm8 */
  1.1028 +	.byte 0x66,0x45,0x0f,0x38,0xdd,0xce	/* aesenclast 	%xmm14, %xmm9 */
  1.1029 +	.byte 0x66,0x45,0x0f,0x38,0xdd,0xd6	/* aesenclast 	%xmm14, %xmm10 */
  1.1030 +
  1.1031 +	movdqu	%xmm3, (%rsi, %rax)	/* store the 8 encrypted blocks at the same offset in the output */
  1.1032 +	movdqu	%xmm4, 16(%rsi, %rax)
  1.1033 +	movdqu	%xmm5, 32(%rsi, %rax)
  1.1034 +	movdqu	%xmm6, 48(%rsi, %rax)
  1.1035 +	movdqu	%xmm7, 64(%rsi, %rax)
  1.1036 +	movdqu	%xmm8, 80(%rsi, %rax)
  1.1037 +	movdqu	%xmm9, 96(%rsi, %rax)
  1.1038 +	movdqu	%xmm10, 112(%rsi, %rax)
  1.1039 +//	addq	$8*16, %rax
  1.1040 +	addq	$128, %rax
  1.1041 +	cmpq	%r11, %rax
  1.1042 +	jbe	2b	/* loop while rax <= inputLen - 128 (unsigned) */
  1.1043 +1:	cmpq	%rax, %r9
  1.1044 +	je	5f	/* everything consumed: nothing left for the single-block loop */
  1.1045 +
  1.1046 +	movdqu	16(%rdi), %xmm3	/* keep round keys 1..11 in xmm3..xmm13 for the single-block loop */
  1.1047 +	movdqu	32(%rdi), %xmm4
  1.1048 +	movdqu	48(%rdi), %xmm5
  1.1049 +	movdqu	64(%rdi), %xmm6
  1.1050 +	movdqu	80(%rdi), %xmm7
  1.1051 +	movdqu	96(%rdi), %xmm8
  1.1052 +	movdqu	112(%rdi), %xmm9
  1.1053 +	movdqu	128(%rdi), %xmm10
  1.1054 +	movdqu	144(%rdi), %xmm11
  1.1055 +	movdqu	160(%rdi), %xmm12
  1.1056 +	movdqu	176(%rdi), %xmm13
  1.1057 +
  1.1058 +4:	movdqu	(%r8, %rax), %xmm1	/* one 16-byte block per iteration */
  1.1059 +	pxor	%xmm2, %xmm1	/* XOR round key 0 */
  1.1060 +	.byte 0x66,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm3, %xmm1 */
  1.1061 +	.byte 0x66,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm4, %xmm1 */
  1.1062 +	.byte 0x66,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm5, %xmm1 */
  1.1063 +	.byte 0x66,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm6, %xmm1 */
  1.1064 +	.byte 0x66,0x0f,0x38,0xdc,0xcf	/* aesenc	%xmm7, %xmm1 */
  1.1065 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc8	/* aesenc	%xmm8, %xmm1 */
  1.1066 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm9, %xmm1 */
  1.1067 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm10, %xmm1 */
  1.1068 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm1 */
  1.1069 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm12, %xmm1 */
  1.1070 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm13, %xmm1 */
  1.1071 +	.byte 0x66,0x41,0x0f,0x38,0xdd,0xce	/* aesenclast %xmm14, %xmm1 */
  1.1072 +	movdqu	%xmm1, (%rsi, %rax)
  1.1073 +	addq	$16, %rax
  1.1074 +	cmpq	%rax, %r9
  1.1075 +	jne	4b	/* ends only when rax == inputLen exactly; NOTE(review): presumably callers pass a multiple of 16 - confirm */
  1.1076 +
  1.1077 +5:	xor	%eax, %eax	/* return 0 */
  1.1078 +	ret
  1.1079 +	.size intel_aes_encrypt_ecb_192, .-intel_aes_encrypt_ecb_192
  1.1080 +
  1.1081 +
  1.1082 +/* in %rdi : cx - context (round keys are read starting at cx + 48)
  1.1083 +   in %rsi : output - pointer to output buffer
  1.1084 +   in %rdx : outputLen - pointer to variable for length of output
  1.1085 +             (filled by caller; not referenced in this routine)
  1.1086 +   in %rcx : maxOutputLen - length of output buffer (not referenced in this routine)
  1.1087 +   in %r8  : input - pointer to input buffer
  1.1088 +   in %r9  : inputLen - length of input buffer
  1.1089 +   on stack: blocksize - AES blocksize (always 16, unused)
  1.1090 +*/
  1.1091 +	.type intel_aes_decrypt_ecb_192,@function
  1.1092 +	.globl intel_aes_decrypt_ecb_192
  1.1093 +	.align	16
  1.1094 +intel_aes_decrypt_ecb_192:	/* decrypts 8 blocks per iteration while possible, then one block at a time; returns 0 in %eax */
  1.1095 +//	leaq	EXPANDED_KEY_OFFSET(%rdi), %rdi
  1.1096 +	leaq	48(%rdi), %rdi	/* rdi = cx + 48, the literal form of the commented-out line above */
  1.1097 +
  1.1098 +	movdqu	(%rdi), %xmm2	/* xmm2 = round key 0, used by aesdeclast */
  1.1099 +	movdqu	192(%rdi), %xmm14	/* xmm14 = round key 12, XORed into every input block */
  1.1100 +	xorl	%eax, %eax	/* rax = byte offset into both input and output */
  1.1101 +//	cmpq	$8*16, %r9
  1.1102 +	cmpq	$128, %r9
  1.1103 +	jb	1f	/* fewer than 8 blocks: go straight to the single-block loop */
  1.1104 +//	leaq	-8*16(%r9), %r11
  1.1105 +	leaq	-128(%r9), %r11	/* r11 = largest offset at which 8 whole blocks still fit */
  1.1106 +2:	movdqu	(%r8, %rax), %xmm3	/* load 8 input blocks into xmm3..xmm10 */
  1.1107 +	movdqu	16(%r8, %rax), %xmm4
  1.1108 +	movdqu	32(%r8, %rax), %xmm5
  1.1109 +	movdqu	48(%r8, %rax), %xmm6
  1.1110 +	movdqu	64(%r8, %rax), %xmm7
  1.1111 +	movdqu	80(%r8, %rax), %xmm8
  1.1112 +	movdqu	96(%r8, %rax), %xmm9
  1.1113 +	movdqu	112(%r8, %rax), %xmm10
  1.1114 +	pxor	%xmm14, %xmm3	/* XOR round key 12 into each block */
  1.1115 +	pxor	%xmm14, %xmm4
  1.1116 +	pxor	%xmm14, %xmm5
  1.1117 +	pxor	%xmm14, %xmm6
  1.1118 +	pxor	%xmm14, %xmm7
  1.1119 +	pxor	%xmm14, %xmm8
  1.1120 +	pxor	%xmm14, %xmm9
  1.1121 +	pxor	%xmm14, %xmm10
  1.1122 +
  1.1123 +// complete loop unrolling
  1.1124 +	movdqu 176(%rdi), %xmm1	/* round key 11 */
  1.1125 +	movdqu 160(%rdi), %xmm11	/* round key 10 */
  1.1126 +	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  1.1127 +	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  1.1128 +	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  1.1129 +	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  1.1130 +	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  1.1131 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  1.1132 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  1.1133 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  1.1134 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  1.1135 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  1.1136 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  1.1137 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  1.1138 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  1.1139 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  1.1140 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  1.1141 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  1.1142 +
  1.1143 +	movdqu 144(%rdi), %xmm1	/* round key 9 */
  1.1144 +	movdqu 128(%rdi), %xmm11	/* round key 8 */
  1.1145 +	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  1.1146 +	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  1.1147 +	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  1.1148 +	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  1.1149 +	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  1.1150 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  1.1151 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  1.1152 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  1.1153 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  1.1154 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  1.1155 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  1.1156 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  1.1157 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  1.1158 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  1.1159 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  1.1160 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  1.1161 +
  1.1162 +	movdqu 112(%rdi), %xmm1	/* round key 7 */
  1.1163 +	movdqu 96(%rdi), %xmm11	/* round key 6 */
  1.1164 +	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  1.1165 +	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  1.1166 +	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  1.1167 +	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  1.1168 +	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  1.1169 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  1.1170 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  1.1171 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  1.1172 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  1.1173 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  1.1174 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  1.1175 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  1.1176 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  1.1177 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  1.1178 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  1.1179 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  1.1180 +
  1.1181 +	movdqu 80(%rdi), %xmm1	/* round key 5 */
  1.1182 +	movdqu 64(%rdi), %xmm11	/* round key 4 */
  1.1183 +	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  1.1184 +	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  1.1185 +	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  1.1186 +	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  1.1187 +	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  1.1188 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  1.1189 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  1.1190 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  1.1191 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  1.1192 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  1.1193 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  1.1194 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  1.1195 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  1.1196 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  1.1197 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  1.1198 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  1.1199 +
  1.1200 +	movdqu 48(%rdi), %xmm1	/* round key 3 */
  1.1201 +	movdqu 32(%rdi), %xmm11	/* round key 2 */
  1.1202 +	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  1.1203 +	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  1.1204 +	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  1.1205 +	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  1.1206 +	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  1.1207 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  1.1208 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  1.1209 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  1.1210 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  1.1211 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  1.1212 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  1.1213 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  1.1214 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  1.1215 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  1.1216 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  1.1217 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  1.1218 +
  1.1219 +	movdqu 16(%rdi), %xmm1	/* round key 1 */
  1.1220 +	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  1.1221 +	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  1.1222 +	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  1.1223 +	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  1.1224 +	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  1.1225 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  1.1226 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  1.1227 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  1.1228 +	.byte 0x66,0x0f,0x38,0xdf,0xda		/* aesdeclast 	%xmm2, %xmm3 */
  1.1229 +	.byte 0x66,0x0f,0x38,0xdf,0xe2		/* aesdeclast 	%xmm2, %xmm4 */
  1.1230 +	.byte 0x66,0x0f,0x38,0xdf,0xea		/* aesdeclast 	%xmm2, %xmm5 */
  1.1231 +	.byte 0x66,0x0f,0x38,0xdf,0xf2		/* aesdeclast 	%xmm2, %xmm6 */
  1.1232 +	.byte 0x66,0x0f,0x38,0xdf,0xfa		/* aesdeclast 	%xmm2, %xmm7 */
  1.1233 +	.byte 0x66,0x44,0x0f,0x38,0xdf,0xc2	/* aesdeclast 	%xmm2, %xmm8 */
  1.1234 +	.byte 0x66,0x44,0x0f,0x38,0xdf,0xca	/* aesdeclast 	%xmm2, %xmm9 */
  1.1235 +	.byte 0x66,0x44,0x0f,0x38,0xdf,0xd2	/* aesdeclast 	%xmm2, %xmm10 */
  1.1236 +
  1.1237 +	movdqu	%xmm3, (%rsi, %rax)	/* store the 8 decrypted blocks at the same offset in the output */
  1.1238 +	movdqu	%xmm4, 16(%rsi, %rax)
  1.1239 +	movdqu	%xmm5, 32(%rsi, %rax)
  1.1240 +	movdqu	%xmm6, 48(%rsi, %rax)
  1.1241 +	movdqu	%xmm7, 64(%rsi, %rax)
  1.1242 +	movdqu	%xmm8, 80(%rsi, %rax)
  1.1243 +	movdqu	%xmm9, 96(%rsi, %rax)
  1.1244 +	movdqu	%xmm10, 112(%rsi, %rax)
  1.1245 +//	addq	$8*16, %rax
  1.1246 +	addq	$128, %rax
  1.1247 +	cmpq	%r11, %rax
  1.1248 +	jbe	2b	/* loop while rax <= inputLen - 128 (unsigned) */
  1.1249 +1:	cmpq	%rax, %r9
  1.1250 +	je	5f	/* everything consumed: nothing left for the single-block loop */
  1.1251 +
  1.1252 +	movdqu	16(%rdi), %xmm3	/* keep round keys 1..11 in xmm3..xmm13 for the single-block loop */
  1.1253 +	movdqu	32(%rdi), %xmm4
  1.1254 +	movdqu	48(%rdi), %xmm5
  1.1255 +	movdqu	64(%rdi), %xmm6
  1.1256 +	movdqu	80(%rdi), %xmm7
  1.1257 +	movdqu	96(%rdi), %xmm8
  1.1258 +	movdqu	112(%rdi), %xmm9
  1.1259 +	movdqu	128(%rdi), %xmm10
  1.1260 +	movdqu	144(%rdi), %xmm11
  1.1261 +	movdqu	160(%rdi), %xmm12
  1.1262 +	movdqu	176(%rdi), %xmm13
  1.1263 +
  1.1264 +4:	movdqu	(%r8, %rax), %xmm1	/* one 16-byte block per iteration */
  1.1265 +	pxor	%xmm14, %xmm1	/* XOR round key 12, then apply keys 11 down to 1 */
  1.1266 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm13, %xmm1 */
  1.1267 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm12, %xmm1 */
  1.1268 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm1 */
  1.1269 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xca	/* aesdec	%xmm10, %xmm1 */
  1.1270 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm9, %xmm1 */
  1.1271 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xc8	/* aesdec	%xmm8, %xmm1 */
  1.1272 +	.byte 0x66,0x0f,0x38,0xde,0xcf	/* aesdec	%xmm7, %xmm1 */
  1.1273 +	.byte 0x66,0x0f,0x38,0xde,0xce	/* aesdec	%xmm6, %xmm1 */
  1.1274 +	.byte 0x66,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm5, %xmm1 */
  1.1275 +	.byte 0x66,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm4, %xmm1 */
  1.1276 +	.byte 0x66,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm3, %xmm1 */
  1.1277 +	.byte 0x66,0x0f,0x38,0xdf,0xca	/* aesdeclast %xmm2, %xmm1 */
  1.1278 +	movdqu	%xmm1, (%rsi, %rax)
  1.1279 +	addq	$16, %rax
  1.1280 +	cmpq	%rax, %r9
  1.1281 +	jne	4b	/* ends only when rax == inputLen exactly; NOTE(review): presumably callers pass a multiple of 16 - confirm */
  1.1282 +
  1.1283 +5:	xor	%eax, %eax	/* return 0 */
  1.1284 +	ret
  1.1285 +	.size intel_aes_decrypt_ecb_192, .-intel_aes_decrypt_ecb_192
  1.1286 +
  1.1287 +/* CBC-mode AES encryption with a 13-round-key (192-bit key) schedule. */
  1.1288 +/* in %rdi : cx - context
  1.1289 +   in %rsi : output - pointer to output buffer
  1.1290 +   in %rdx : outputLen - pointer to variable for length of output
  1.1291 +             (filled by caller; not read here, %rdx is reused as IV pointer)
  1.1292 +   in %rcx : maxOutputLen - length of output buffer (not checked here)
  1.1293 +   in %r8  : input - pointer to input buffer
  1.1294 +   in %r9  : inputLen - length of input buffer (must be a multiple of 16)
  1.1295 +   on stack: blocksize - AES blocksize (always 16, unused)
  1.1296 +   Returns 0 in %eax; uses %xmm0-%xmm14; updates the IV held in cx. */
  1.1297 +	.type intel_aes_encrypt_cbc_192,@function
  1.1298 +	.globl intel_aes_encrypt_cbc_192
  1.1299 +	.align	16
  1.1300 +intel_aes_encrypt_cbc_192:
  1.1301 +	testq	%r9, %r9	/* empty input: nothing to encrypt, IV left untouched */
  1.1302 +	je	2f
  1.1303 +
  1.1304 +//	leaq	IV_OFFSET(%rdi), %rdx
  1.1305 +//	leaq	EXPANDED_KEY_OFFSET(%rdi), %rdi
  1.1306 +	leaq	16(%rdi), %rdx	/* %rdx = address of the IV inside cx */
  1.1307 +	leaq	48(%rdi), %rdi	/* %rdi = address of the expanded key inside cx */
  1.1308 +
  1.1309 +	movdqu	(%rdx), %xmm0	/* %xmm0 = chaining value, initially the IV */
  1.1310 +	movdqu	(%rdi), %xmm2	/* %xmm2..%xmm14 = round keys 0..12 */
  1.1311 +	movdqu	16(%rdi), %xmm3
  1.1312 +	movdqu	32(%rdi), %xmm4
  1.1313 +	movdqu	48(%rdi), %xmm5
  1.1314 +	movdqu	64(%rdi), %xmm6
  1.1315 +	movdqu	80(%rdi), %xmm7
  1.1316 +	movdqu	96(%rdi), %xmm8
  1.1317 +	movdqu	112(%rdi), %xmm9
  1.1318 +	movdqu	128(%rdi), %xmm10
  1.1319 +	movdqu	144(%rdi), %xmm11
  1.1320 +	movdqu	160(%rdi), %xmm12
  1.1321 +	movdqu	176(%rdi), %xmm13
  1.1322 +	movdqu	192(%rdi), %xmm14
  1.1323 +
  1.1324 +	xorl	%eax, %eax	/* %rax = byte offset into input and output */
  1.1325 +1:	movdqu	(%r8, %rax), %xmm1	/* next plaintext block */
  1.1326 +	pxor	%xmm0, %xmm1	/* CBC: mix in previous ciphertext block (or the IV) */
  1.1327 +	pxor	%xmm2, %xmm1	/* initial AddRoundKey with round key 0 */
  1.1328 +	.byte 0x66,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm3, %xmm1 */
  1.1329 +	.byte 0x66,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm4, %xmm1 */
  1.1330 +	.byte 0x66,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm5, %xmm1 */
  1.1331 +	.byte 0x66,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm6, %xmm1 */
  1.1332 +	.byte 0x66,0x0f,0x38,0xdc,0xcf	/* aesenc	%xmm7, %xmm1 */
  1.1333 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc8	/* aesenc	%xmm8, %xmm1 */
  1.1334 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm9, %xmm1 */
  1.1335 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm10, %xmm1 */
  1.1336 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm1 */
  1.1337 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm12, %xmm1 */
  1.1338 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm13, %xmm1 */
  1.1339 +	.byte 0x66,0x41,0x0f,0x38,0xdd,0xce	/* aesenclast %xmm14, %xmm1 */
  1.1340 +	movdqu	%xmm1, (%rsi, %rax)	/* store the ciphertext block */
  1.1341 +	movdqa	%xmm1, %xmm0	/* ...which is also the next chaining value */
  1.1342 +	addq	$16, %rax
  1.1343 +	cmpq	%rax, %r9	/* loop ends only when offset == inputLen exactly */
  1.1344 +	jne	1b
  1.1345 +
  1.1346 +	movdqu	%xmm0, (%rdx)	/* save last ciphertext block as the IV for the next call */
  1.1347 +
  1.1348 +2:	xor	%eax, %eax	/* return 0 */
  1.1349 +	ret
  1.1350 +	.size intel_aes_encrypt_cbc_192, .-intel_aes_encrypt_cbc_192
  1.1351 +
  1.1352 +/* CBC-mode AES decryption with a 13-round-key (192-bit key) schedule. */
  1.1353 +/* in %rdi : cx - context
  1.1354 +   in %rsi : output - pointer to output buffer
  1.1355 +   in %rdx : outputLen - pointer to variable for length of output
  1.1356 +             (filled by caller; not read here, %rdx is reused as IV pointer)
  1.1357 +   in %rcx : maxOutputLen - length of output buffer (not checked here)
  1.1358 +   in %r8  : input - pointer to input buffer
  1.1359 +   in %r9  : inputLen - length of input buffer (must be a multiple of 16)
  1.1360 +   on stack: blocksize - AES blocksize (always 16, unused)
  1.1361 +   Returns 0 in %eax; uses %r11 and %xmm0-%xmm15; updates the IV held in cx. */
  1.1362 +	.type intel_aes_decrypt_cbc_192,@function
  1.1363 +	.globl intel_aes_decrypt_cbc_192
  1.1364 +	.align	16
  1.1365 +intel_aes_decrypt_cbc_192:
  1.1366 +	leaq	16(%rdi), %rdx	/* %rdx = address of the IV inside cx */
  1.1367 +	leaq	48(%rdi), %rdi	/* %rdi = address of the expanded key inside cx */
  1.1368 +
  1.1369 +	movdqu	(%rdx), %xmm0	/* %xmm0 = chaining value, initially the IV */
  1.1370 +	movdqu	(%rdi), %xmm2	/* %xmm2 = round key 0, applied last (aesdeclast) */
  1.1371 +	movdqu	192(%rdi), %xmm14	/* %xmm14 = round key 12, applied first */
  1.1372 +	xorl	%eax, %eax	/* %rax = byte offset into input and output */
  1.1373 +	cmpq	$128, %r9	/* fewer than 8 blocks: skip the 8-way loop */
  1.1374 +	jb	1f
  1.1375 +	leaq	-128(%r9), %r11	/* %r11 = largest offset that still has 8 blocks after it */
  1.1376 +2:	movdqu	(%r8, %rax), %xmm3	/* load 8 ciphertext blocks into %xmm3..%xmm10 */
  1.1377 +	movdqu	16(%r8, %rax), %xmm4
  1.1378 +	movdqu	32(%r8, %rax), %xmm5
  1.1379 +	movdqu	48(%r8, %rax), %xmm6
  1.1380 +	movdqu	64(%r8, %rax), %xmm7
  1.1381 +	movdqu	80(%r8, %rax), %xmm8
  1.1382 +	movdqu	96(%r8, %rax), %xmm9
  1.1383 +	movdqu	112(%r8, %rax), %xmm10
  1.1384 +	pxor	%xmm14, %xmm3	/* initial AddRoundKey with round key 12 on all 8 blocks */
  1.1385 +	pxor	%xmm14, %xmm4
  1.1386 +	pxor	%xmm14, %xmm5
  1.1387 +	pxor	%xmm14, %xmm6
  1.1388 +	pxor	%xmm14, %xmm7
  1.1389 +	pxor	%xmm14, %xmm8
  1.1390 +	pxor	%xmm14, %xmm9
  1.1391 +	pxor	%xmm14, %xmm10
  1.1392 +
  1.1393 +// complete loop unrolling
  1.1394 +	movdqu 176(%rdi), %xmm1	/* round key 11 */
  1.1395 +	movdqu 160(%rdi), %xmm11	/* round key 10 */
  1.1396 +	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  1.1397 +	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  1.1398 +	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  1.1399 +	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  1.1400 +	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  1.1401 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  1.1402 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  1.1403 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  1.1404 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  1.1405 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  1.1406 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  1.1407 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  1.1408 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  1.1409 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  1.1410 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  1.1411 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  1.1412 +
  1.1413 +	movdqu 144(%rdi), %xmm1	/* round key 9 */
  1.1414 +	movdqu 128(%rdi), %xmm11	/* round key 8 */
  1.1415 +	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  1.1416 +	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  1.1417 +	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  1.1418 +	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  1.1419 +	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  1.1420 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  1.1421 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  1.1422 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  1.1423 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  1.1424 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  1.1425 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  1.1426 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  1.1427 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  1.1428 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  1.1429 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  1.1430 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  1.1431 +
  1.1432 +	movdqu 112(%rdi), %xmm1	/* round key 7 */
  1.1433 +	movdqu 96(%rdi), %xmm11	/* round key 6 */
  1.1434 +	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  1.1435 +	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  1.1436 +	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  1.1437 +	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  1.1438 +	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  1.1439 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  1.1440 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  1.1441 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  1.1442 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  1.1443 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  1.1444 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  1.1445 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  1.1446 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  1.1447 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  1.1448 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  1.1449 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  1.1450 +
  1.1451 +	movdqu 80(%rdi), %xmm1	/* round key 5 */
  1.1452 +	movdqu 64(%rdi), %xmm11	/* round key 4 */
  1.1453 +	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  1.1454 +	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  1.1455 +	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  1.1456 +	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  1.1457 +	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  1.1458 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  1.1459 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  1.1460 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  1.1461 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  1.1462 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  1.1463 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  1.1464 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  1.1465 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  1.1466 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  1.1467 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  1.1468 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  1.1469 +
  1.1470 +	movdqu 48(%rdi), %xmm1	/* round key 3 */
  1.1471 +	movdqu 32(%rdi), %xmm11	/* round key 2 */
  1.1472 +	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  1.1473 +	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  1.1474 +	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  1.1475 +	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  1.1476 +	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  1.1477 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  1.1478 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  1.1479 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  1.1480 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  1.1481 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  1.1482 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  1.1483 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  1.1484 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  1.1485 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  1.1486 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  1.1487 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  1.1488 +
  1.1489 +	movdqu 16(%rdi), %xmm1	/* round key 1 */
  1.1490 +	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  1.1491 +	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  1.1492 +	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  1.1493 +	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  1.1494 +	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  1.1495 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  1.1496 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  1.1497 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  1.1498 +	.byte 0x66,0x0f,0x38,0xdf,0xda		/* aesdeclast 	%xmm2, %xmm3 */
  1.1499 +	.byte 0x66,0x0f,0x38,0xdf,0xe2		/* aesdeclast 	%xmm2, %xmm4 */
  1.1500 +	.byte 0x66,0x0f,0x38,0xdf,0xea		/* aesdeclast 	%xmm2, %xmm5 */
  1.1501 +	.byte 0x66,0x0f,0x38,0xdf,0xf2		/* aesdeclast 	%xmm2, %xmm6 */
  1.1502 +	.byte 0x66,0x0f,0x38,0xdf,0xfa		/* aesdeclast 	%xmm2, %xmm7 */
  1.1503 +	.byte 0x66,0x44,0x0f,0x38,0xdf,0xc2	/* aesdeclast 	%xmm2, %xmm8 */
  1.1504 +	.byte 0x66,0x44,0x0f,0x38,0xdf,0xca	/* aesdeclast 	%xmm2, %xmm9 */
  1.1505 +	.byte 0x66,0x44,0x0f,0x38,0xdf,0xd2	/* aesdeclast 	%xmm2, %xmm10 */
  1.1506 +
  1.1507 + 	pxor	%xmm0, %xmm3	/* block 0 ^= chaining value carried in from before this group */
  1.1508 +	movdqu	(%r8, %rax), %xmm0	/* re-read ciphertext block i; it chains into block i+1 */
  1.1509 +	pxor	%xmm0, %xmm4
  1.1510 +	movdqu	16(%r8, %rax), %xmm0
  1.1511 +	pxor	%xmm0, %xmm5
  1.1512 +	movdqu	32(%r8, %rax), %xmm0
  1.1513 +	pxor	%xmm0, %xmm6
  1.1514 +	movdqu	48(%r8, %rax), %xmm0
  1.1515 +	pxor	%xmm0, %xmm7
  1.1516 +	movdqu	64(%r8, %rax), %xmm0
  1.1517 +	pxor	%xmm0, %xmm8
  1.1518 +	movdqu	80(%r8, %rax), %xmm0
  1.1519 +	pxor	%xmm0, %xmm9
  1.1520 +	movdqu	96(%r8, %rax), %xmm0
  1.1521 +	pxor	%xmm0, %xmm10
  1.1522 +	movdqu	112(%r8, %rax), %xmm0	/* last ciphertext block of the group = next chaining value */
  1.1523 +	movdqu	%xmm3, (%rsi, %rax)	/* stores happen only after every ciphertext re-read above */
  1.1524 +	movdqu	%xmm4, 16(%rsi, %rax)
  1.1525 +	movdqu	%xmm5, 32(%rsi, %rax)
  1.1526 +	movdqu	%xmm6, 48(%rsi, %rax)
  1.1527 +	movdqu	%xmm7, 64(%rsi, %rax)
  1.1528 +	movdqu	%xmm8, 80(%rsi, %rax)
  1.1529 +	movdqu	%xmm9, 96(%rsi, %rax)
  1.1530 +	movdqu	%xmm10, 112(%rsi, %rax)
  1.1531 +	addq	$128, %rax
  1.1532 +	cmpq	%r11, %rax	/* unsigned: repeat while at least 8 more blocks remain */
  1.1533 +	jbe	2b
  1.1534 +1:	cmpq	%rax, %r9	/* all input consumed? */
  1.1535 +	je	5f
  1.1536 +
  1.1537 +	movdqu	16(%rdi), %xmm3	/* %xmm3..%xmm13 = round keys 1..11 for the one-block loop */
  1.1538 +	movdqu	32(%rdi), %xmm4
  1.1539 +	movdqu	48(%rdi), %xmm5
  1.1540 +	movdqu	64(%rdi), %xmm6
  1.1541 +	movdqu	80(%rdi), %xmm7
  1.1542 +	movdqu	96(%rdi), %xmm8
  1.1543 +	movdqu	112(%rdi), %xmm9
  1.1544 +	movdqu	128(%rdi), %xmm10
  1.1545 +	movdqu	144(%rdi), %xmm11
  1.1546 +	movdqu	160(%rdi), %xmm12
  1.1547 +	movdqu	176(%rdi), %xmm13
  1.1548 +
  1.1549 +4:	movdqu	(%r8, %rax), %xmm1	/* next ciphertext block */
  1.1550 +	movdqa	%xmm1, %xmm15	/* keep a copy: it is the chaining value for the next block */
  1.1551 +	pxor	%xmm14, %xmm1	/* initial AddRoundKey with round key 12 */
  1.1552 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm13, %xmm1 */
  1.1553 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm12, %xmm1 */
  1.1554 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm1 */
  1.1555 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xca	/* aesdec	%xmm10, %xmm1 */
  1.1556 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm9, %xmm1 */
  1.1557 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xc8	/* aesdec	%xmm8, %xmm1 */
  1.1558 +	.byte 0x66,0x0f,0x38,0xde,0xcf	/* aesdec	%xmm7, %xmm1 */
  1.1559 +	.byte 0x66,0x0f,0x38,0xde,0xce	/* aesdec	%xmm6, %xmm1 */
  1.1560 +	.byte 0x66,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm5, %xmm1 */
  1.1561 +	.byte 0x66,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm4, %xmm1 */
  1.1562 +	.byte 0x66,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm3, %xmm1 */
  1.1563 +	.byte 0x66,0x0f,0x38,0xdf,0xca	/* aesdeclast %xmm2, %xmm1 */
  1.1564 +	pxor	%xmm0, %xmm1	/* undo CBC chaining with the previous ciphertext block (or IV) */
  1.1565 +	movdqu	%xmm1, (%rsi, %rax)
  1.1566 +	movdqa	%xmm15, %xmm0	/* this ciphertext block chains into the next one */
  1.1567 +	addq	$16, %rax
  1.1568 +	cmpq	%rax, %r9	/* loop ends only when offset == inputLen exactly */
  1.1569 +	jne	4b
  1.1570 +
  1.1571 +5:	movdqu	%xmm0, (%rdx)	/* write back chaining value (the unchanged IV if inputLen == 0) */
  1.1572 +
  1.1573 +	xor	%eax, %eax	/* return 0 */
  1.1574 +	ret
  1.1575 +	.size intel_aes_decrypt_cbc_192, .-intel_aes_decrypt_cbc_192
  1.1576 +/* Expands a 32-byte AES key into 15 round keys (240 bytes) for encryption. */
  1.1577 +/* in %rdi : the key
  1.1578 +   in %rsi : buffer for expanded key
  1.1579 +   Returns 0 in %eax; uses %xmm1-%xmm4 and %xmm6 (partly via key_expansion256). */
  1.1580 +	.type intel_aes_encrypt_init_256,@function
  1.1581 +	.globl intel_aes_encrypt_init_256
  1.1582 +	.align	16
  1.1583 +intel_aes_encrypt_init_256:
  1.1584 +	movdqu	(%rdi), %xmm1	/* %xmm1 = key bytes 0..15  = round key 0 */
  1.1585 +	movdqu	16(%rdi), %xmm3	/* %xmm3 = key bytes 16..31 = round key 1 */
  1.1586 +	movdqu	%xmm1, (%rsi)
  1.1587 +	movdqu	%xmm3, 16(%rsi)
  1.1588 +	leaq	32(%rsi), %rsi	/* %rsi = slot for round key 2 */
  1.1589 +	xor	%eax, %eax	/* %eax = 0: read by key_expansion256 to zero %xmm6; also the return value */
  1.1590 +
  1.1591 +	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x01	/* aeskeygenassist $0x01, %xmm3, %xmm2 */
  1.1592 +	call key_expansion256	/* each call stores two more round keys and advances %rsi by 32 */
  1.1593 +	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x02	/* aeskeygenassist $0x02, %xmm3, %xmm2 */
  1.1594 +	call key_expansion256
  1.1595 +	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x04	/* aeskeygenassist $0x04, %xmm3, %xmm2 */
  1.1596 +	call key_expansion256
  1.1597 +	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x08	/* aeskeygenassist $0x08, %xmm3, %xmm2 */
  1.1598 +	call key_expansion256
  1.1599 +	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x10	/* aeskeygenassist $0x10, %xmm3, %xmm2 */
  1.1600 +	call key_expansion256
  1.1601 +	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x20	/* aeskeygenassist $0x20, %xmm3, %xmm2 */
  1.1602 +	call key_expansion256
  1.1603 +	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x40	/* aeskeygenassist $0x40, %xmm3, %xmm2 */
  1.1604 +	pxor	%xmm6, %xmm6	/* only round key 14 is left: first half of key_expansion256, inlined */
  1.1605 +	pshufd	$0xff, %xmm2, %xmm2	/* broadcast dword 3 of the aeskeygenassist result */
  1.1606 +	shufps	$0x10, %xmm1, %xmm6
  1.1607 +	pxor	%xmm6, %xmm1
  1.1608 +	shufps	$0x8c, %xmm1, %xmm6
  1.1609 +	pxor	%xmm2, %xmm1
  1.1610 +	pxor	%xmm6, %xmm1
  1.1611 +	movdqu	%xmm1, (%rsi)	/* round key 14 */
  1.1612 +
  1.1613 +	ret
  1.1614 +	.size intel_aes_encrypt_init_256, .-intel_aes_encrypt_init_256
  1.1615 +
  1.1616 +/* Expands a 32-byte AES key into 15 round keys for decryption: round keys 1..13 are stored after aesimc, keys 0 and 14 as is. */
  1.1617 +/* in %rdi : the key
  1.1618 +   in %rsi : buffer for expanded key
  1.1619 +   Returns 0 in %eax; uses %xmm1-%xmm6 (partly via key_expansion256). */
  1.1620 +	.type intel_aes_decrypt_init_256,@function
  1.1621 +	.globl intel_aes_decrypt_init_256
  1.1622 +	.align	16
  1.1623 +intel_aes_decrypt_init_256:
  1.1624 +	movdqu	(%rdi), %xmm1	/* %xmm1 = key bytes 0..15  = round key 0 */
  1.1625 +	movdqu	16(%rdi), %xmm3	/* %xmm3 = key bytes 16..31 = round key 1 */
  1.1626 +	movdqu	%xmm1, (%rsi)	/* round key 0 is stored unmodified */
  1.1627 +	.byte 0x66,0x0f,0x38,0xdb,0xe3	/* aesimc	%xmm3, %xmm4 */
  1.1628 +	movdqu	%xmm4, 16(%rsi)	/* round key 1 stored transformed; %xmm3 keeps the raw key for expansion */
  1.1629 +	leaq	32(%rsi), %rsi	/* %rsi = slot for round key 2 */
  1.1630 +	xor	%eax, %eax	/* %eax = 0: read by key_expansion256 to zero %xmm6; also the return value */
  1.1631 +
  1.1632 +	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x01	/* aeskeygenassist $0x01, %xmm3, %xmm2 */
  1.1633 +	call key_expansion256	/* stores raw keys in the two slots below %rsi, leaves them in %xmm1/%xmm3 */
  1.1634 +	.byte 0x66,0x0f,0x38,0xdb,0xe1	/* aesimc	%xmm1, %xmm4 */
  1.1635 +	.byte 0x66,0x0f,0x38,0xdb,0xeb	/* aesimc	%xmm3, %xmm5 */
  1.1636 +	movdqu	%xmm4, -32(%rsi)	/* overwrite the two raw keys just stored with their aesimc forms */
  1.1637 +	movdqu	%xmm5, -16(%rsi)
  1.1638 +	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x02	/* aeskeygenassist $0x02, %xmm3, %xmm2 */
  1.1639 +	call key_expansion256
  1.1640 +	.byte 0x66,0x0f,0x38,0xdb,0xe1	/* aesimc	%xmm1, %xmm4 */
  1.1641 +	.byte 0x66,0x0f,0x38,0xdb,0xeb	/* aesimc	%xmm3, %xmm5 */
  1.1642 +	movdqu	%xmm4, -32(%rsi)
  1.1643 +	movdqu	%xmm5, -16(%rsi)
  1.1644 +	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x04	/* aeskeygenassist $0x04, %xmm3, %xmm2 */
  1.1645 +	call key_expansion256
  1.1646 +	.byte 0x66,0x0f,0x38,0xdb,0xe1	/* aesimc	%xmm1, %xmm4 */
  1.1647 +	.byte 0x66,0x0f,0x38,0xdb,0xeb	/* aesimc	%xmm3, %xmm5 */
  1.1648 +	movdqu	%xmm4, -32(%rsi)
  1.1649 +	movdqu	%xmm5, -16(%rsi)
  1.1650 +	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x08	/* aeskeygenassist $0x08, %xmm3, %xmm2 */
  1.1651 +	call key_expansion256
  1.1652 +	.byte 0x66,0x0f,0x38,0xdb,0xe1	/* aesimc	%xmm1, %xmm4 */
  1.1653 +	.byte 0x66,0x0f,0x38,0xdb,0xeb	/* aesimc	%xmm3, %xmm5 */
  1.1654 +	movdqu	%xmm4, -32(%rsi)
  1.1655 +	movdqu	%xmm5, -16(%rsi)
  1.1656 +	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x10	/* aeskeygenassist $0x10, %xmm3, %xmm2 */
  1.1657 +	call key_expansion256
  1.1658 +	.byte 0x66,0x0f,0x38,0xdb,0xe1	/* aesimc	%xmm1, %xmm4 */
  1.1659 +	.byte 0x66,0x0f,0x38,0xdb,0xeb	/* aesimc	%xmm3, %xmm5 */
  1.1660 +	movdqu	%xmm4, -32(%rsi)
  1.1661 +	movdqu	%xmm5, -16(%rsi)
  1.1662 +	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x20	/* aeskeygenassist $0x20, %xmm3, %xmm2 */
  1.1663 +	call key_expansion256
  1.1664 +	.byte 0x66,0x0f,0x38,0xdb,0xe1	/* aesimc	%xmm1, %xmm4 */
  1.1665 +	.byte 0x66,0x0f,0x38,0xdb,0xeb	/* aesimc	%xmm3, %xmm5 */
  1.1666 +	movdqu	%xmm4, -32(%rsi)
  1.1667 +	movdqu	%xmm5, -16(%rsi)
  1.1668 +	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x40	/* aeskeygenassist $0x40, %xmm3, %xmm2 */
  1.1669 +	pxor	%xmm6, %xmm6	/* only round key 14 is left: first half of key_expansion256, inlined */
  1.1670 +	pshufd	$0xff, %xmm2, %xmm2	/* broadcast dword 3 of the aeskeygenassist result */
  1.1671 +	shufps	$0x10, %xmm1, %xmm6
  1.1672 +	pxor	%xmm6, %xmm1
  1.1673 +	shufps	$0x8c, %xmm1, %xmm6
  1.1674 +	pxor	%xmm2, %xmm1
  1.1675 +	pxor	%xmm6, %xmm1
  1.1676 +	movdqu	%xmm1, (%rsi)	/* round key 14 is stored without aesimc */
  1.1677 +
  1.1678 +	ret
  1.1679 +	.size intel_aes_decrypt_init_256, .-intel_aes_decrypt_init_256
  1.1680 +/* File-local helper (non-standard register interface): given %xmm1 = older and %xmm3 = newer of the last two round keys, %xmm2 = aeskeygenassist of %xmm3, and %eax = 0,
  1.1681 +   computes the next two round keys, stores them at (%rsi) and 16(%rsi), leaves them in %xmm1/%xmm3 and advances %rsi by 32.  Clobbers %xmm2, %xmm4, %xmm6. */
  1.1682 +	.type key_expansion256,@function
  1.1683 +	.align	16
  1.1684 +key_expansion256:
  1.1685 +	movd	%eax, %xmm6	/* %xmm6 = 0, because both callers set %eax = 0 beforehand */
  1.1686 +	pshufd	$0xff, %xmm2, %xmm2	/* broadcast dword 3 of the aeskeygenassist result to all lanes */
  1.1687 +	shufps	$0x10, %xmm1, %xmm6	/* the shufps/pxor pairs turn %xmm1 into the running XOR of its dwords */
  1.1688 +	pxor	%xmm6, %xmm1
  1.1689 +	shufps	$0x8c, %xmm1, %xmm6
  1.1690 +	pxor	%xmm2, %xmm1	/* fold the broadcast key-schedule word into every lane */
  1.1691 +	pxor	%xmm6, %xmm1
  1.1692 +	movdqu	%xmm1, (%rsi)	/* first new round key */
  1.1693 +
  1.1694 +	addq	$16, %rsi
  1.1695 +	.byte 0x66,0x0f,0x3a,0xdf,0xe1,0x00	/* aeskeygenassist $0, %xmm1, %xmm4 */
  1.1696 +	pshufd	$0xaa, %xmm4, %xmm4	/* broadcast dword 2 this time */
  1.1697 +	shufps	$0x10, %xmm3, %xmm6	/* %xmm6 is not re-zeroed: only its dword 0 is read here, and that is still 0 */
  1.1698 +	pxor	%xmm6, %xmm3
  1.1699 +	shufps	$0x8c, %xmm3, %xmm6
  1.1700 +	pxor	%xmm4, %xmm3
  1.1701 +	pxor	%xmm6, %xmm3
  1.1702 +	movdqu	%xmm3, (%rsi)	/* second new round key */
  1.1703 +	addq	$16, %rsi
  1.1704 +	ret
  1.1705 +	.size key_expansion256, .-key_expansion256
  1.1706 +
  1.1707 +/* ECB-mode AES encryption with a 15-round-key (256-bit key) schedule: 8 blocks at a time, then one block at a time for the remainder. */
  1.1708 +/* in %rdi : cx - context
  1.1709 +   in %rsi : output - pointer to output buffer
  1.1710 +   in %rdx : outputLen - pointer to variable for length of output
  1.1711 +             (filled by caller; not used here)
  1.1712 +   in %rcx : maxOutputLen - length of output buffer (not checked here)
  1.1713 +   in %r8  : input - pointer to input buffer
  1.1714 +   in %r9  : inputLen - length of input buffer (must be a multiple of 16)
  1.1715 +   on stack: blocksize - AES blocksize (always 16, unused)
  1.1716 +   Returns 0 in %eax; uses %r11 and %xmm0-%xmm15. */
  1.1717 +	.type intel_aes_encrypt_ecb_256,@function
  1.1718 +	.globl intel_aes_encrypt_ecb_256
  1.1719 +	.align	16
  1.1720 +intel_aes_encrypt_ecb_256:
  1.1721 +//	leaq	EXPANDED_KEY_OFFSET(%rdi), %rdi
  1.1722 +	leaq	48(%rdi), %rdi	/* %rdi = address of the expanded key inside cx */
  1.1723 +
  1.1724 +	movdqu	(%rdi), %xmm2	/* %xmm2 = round key 0 */
  1.1725 +	movdqu	224(%rdi), %xmm15	/* %xmm15 = round key 14 (last), kept for both loops */
  1.1726 +	xorl	%eax, %eax	/* %rax = byte offset into input and output */
  1.1727 +//	cmpq	$8*16, %r9
  1.1728 +	cmpq	$128, %r9	/* fewer than 8 blocks: skip the 8-way loop */
  1.1729 +	jb	1f
  1.1730 +//	leaq	-8*16(%r9), %r11
  1.1731 +	leaq	-128(%r9), %r11	/* %r11 = largest offset that still has 8 blocks after it */
  1.1732 +2:	movdqu	(%r8, %rax), %xmm3	/* load 8 plaintext blocks into %xmm3..%xmm10 */
  1.1733 +	movdqu	16(%r8, %rax), %xmm4
  1.1734 +	movdqu	32(%r8, %rax), %xmm5
  1.1735 +	movdqu	48(%r8, %rax), %xmm6
  1.1736 +	movdqu	64(%r8, %rax), %xmm7
  1.1737 +	movdqu	80(%r8, %rax), %xmm8
  1.1738 +	movdqu	96(%r8, %rax), %xmm9
  1.1739 +	movdqu	112(%r8, %rax), %xmm10
  1.1740 +	pxor	%xmm2, %xmm3	/* initial AddRoundKey with round key 0 on all 8 blocks */
  1.1741 +	pxor	%xmm2, %xmm4
  1.1742 +	pxor	%xmm2, %xmm5
  1.1743 +	pxor	%xmm2, %xmm6
  1.1744 +	pxor	%xmm2, %xmm7
  1.1745 +	pxor	%xmm2, %xmm8
  1.1746 +	pxor	%xmm2, %xmm9
  1.1747 +	pxor	%xmm2, %xmm10
  1.1748 +
  1.1749 +// complete loop unrolling
  1.1750 +	movdqu 16(%rdi), %xmm1	/* round key 1 */
  1.1751 +	movdqu 32(%rdi), %xmm11	/* round key 2 */
  1.1752 +	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
  1.1753 +	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
  1.1754 +	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
  1.1755 +	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
  1.1756 +	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
  1.1757 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
  1.1758 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
  1.1759 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
  1.1760 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
  1.1761 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
  1.1762 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
  1.1763 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
  1.1764 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
  1.1765 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
  1.1766 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
  1.1767 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
  1.1768 +
  1.1769 +	movdqu 48(%rdi), %xmm1	/* round key 3 */
  1.1770 +	movdqu 64(%rdi), %xmm11	/* round key 4 */
  1.1771 +	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
  1.1772 +	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
  1.1773 +	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
  1.1774 +	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
  1.1775 +	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
  1.1776 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
  1.1777 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
  1.1778 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
  1.1779 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
  1.1780 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
  1.1781 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
  1.1782 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
  1.1783 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
  1.1784 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
  1.1785 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
  1.1786 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
  1.1787 +
  1.1788 +	movdqu 80(%rdi), %xmm1	/* round key 5 */
  1.1789 +	movdqu 96(%rdi), %xmm11	/* round key 6 */
  1.1790 +	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
  1.1791 +	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
  1.1792 +	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
  1.1793 +	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
  1.1794 +	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
  1.1795 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
  1.1796 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
  1.1797 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
  1.1798 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
  1.1799 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
  1.1800 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
  1.1801 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
  1.1802 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
  1.1803 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
  1.1804 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
  1.1805 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
  1.1806 +
  1.1807 +	movdqu 112(%rdi), %xmm1	/* round key 7 */
  1.1808 +	movdqu 128(%rdi), %xmm11	/* round key 8 */
  1.1809 +	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
  1.1810 +	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
  1.1811 +	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
  1.1812 +	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
  1.1813 +	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
  1.1814 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
  1.1815 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
  1.1816 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
  1.1817 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
  1.1818 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
  1.1819 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
  1.1820 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
  1.1821 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
  1.1822 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
  1.1823 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
  1.1824 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
  1.1825 +
  1.1826 +	movdqu 144(%rdi), %xmm1	/* round key 9 */
  1.1827 +	movdqu 160(%rdi), %xmm11	/* round key 10 */
  1.1828 +	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
  1.1829 +	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
  1.1830 +	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
  1.1831 +	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
  1.1832 +	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
  1.1833 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
  1.1834 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
  1.1835 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
  1.1836 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
  1.1837 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
  1.1838 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
  1.1839 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
  1.1840 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
  1.1841 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
  1.1842 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
  1.1843 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
  1.1844 +
  1.1845 +	movdqu 176(%rdi), %xmm1	/* round key 11 */
  1.1846 +	movdqu 192(%rdi), %xmm11	/* round key 12 */
  1.1847 +	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
  1.1848 +	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
  1.1849 +	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
  1.1850 +	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
  1.1851 +	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
  1.1852 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
  1.1853 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
  1.1854 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
  1.1855 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
  1.1856 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
  1.1857 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
  1.1858 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
  1.1859 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
  1.1860 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
  1.1861 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
  1.1862 +	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
  1.1863 +
  1.1864 +	movdqu 208(%rdi), %xmm1	/* round key 13 */
  1.1865 +	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
  1.1866 +	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
  1.1867 +	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
  1.1868 +	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
  1.1869 +	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
  1.1870 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
  1.1871 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
  1.1872 +	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
  1.1873 +	.byte 0x66,0x41,0x0f,0x38,0xdd,0xdf	/* aesenclast 	%xmm15, %xmm3 */
  1.1874 +	.byte 0x66,0x41,0x0f,0x38,0xdd,0xe7	/* aesenclast 	%xmm15, %xmm4 */
  1.1875 +	.byte 0x66,0x41,0x0f,0x38,0xdd,0xef	/* aesenclast 	%xmm15, %xmm5 */
  1.1876 +	.byte 0x66,0x41,0x0f,0x38,0xdd,0xf7	/* aesenclast 	%xmm15, %xmm6 */
  1.1877 +	.byte 0x66,0x41,0x0f,0x38,0xdd,0xff	/* aesenclast 	%xmm15, %xmm7 */
  1.1878 +	.byte 0x66,0x45,0x0f,0x38,0xdd,0xc7	/* aesenclast 	%xmm15, %xmm8 */
  1.1879 +	.byte 0x66,0x45,0x0f,0x38,0xdd,0xcf	/* aesenclast 	%xmm15, %xmm9 */
  1.1880 +	.byte 0x66,0x45,0x0f,0x38,0xdd,0xd7	/* aesenclast 	%xmm15, %xmm10 */
  1.1881 +
  1.1882 +	movdqu	%xmm3, (%rsi, %rax)
  1.1883 +	movdqu	%xmm4, 16(%rsi, %rax)
  1.1884 +	movdqu	%xmm5, 32(%rsi, %rax)
  1.1885 +	movdqu	%xmm6, 48(%rsi, %rax)
  1.1886 +	movdqu	%xmm7, 64(%rsi, %rax)
  1.1887 +	movdqu	%xmm8, 80(%rsi, %rax)
  1.1888 +	movdqu	%xmm9, 96(%rsi, %rax)
  1.1889 +	movdqu	%xmm10, 112(%rsi, %rax)
  1.1890 +//	addq	$8*16, %rax
  1.1891 +	addq	$128, %rax
  1.1892 +	cmpq	%r11, %rax	/* unsigned: repeat while at least 8 more blocks remain */
  1.1893 +	jbe	2b
  1.1894 +1:	cmpq	%rax, %r9	/* all input consumed? */
  1.1895 +	je	5f
  1.1896 +
  1.1897 +	movdqu	(%rdi), %xmm8	/* one-block loop: %xmm8 = round key 0 */
  1.1898 +	movdqu	16(%rdi), %xmm2	/* %xmm2..%xmm7 = round keys 1..6 */
  1.1899 +	movdqu	32(%rdi), %xmm3
  1.1900 +	movdqu	48(%rdi), %xmm4
  1.1901 +	movdqu	64(%rdi), %xmm5
  1.1902 +	movdqu	80(%rdi), %xmm6
  1.1903 +	movdqu	96(%rdi), %xmm7
  1.1904 +	movdqu	128(%rdi), %xmm9	/* %xmm9..%xmm14 = round keys 8..13 */
  1.1905 +	movdqu	144(%rdi), %xmm10
  1.1906 +	movdqu	160(%rdi), %xmm11
  1.1907 +	movdqu	176(%rdi), %xmm12
  1.1908 +	movdqu	192(%rdi), %xmm13
  1.1909 +	movdqu	208(%rdi), %xmm14
  1.1910 +	movdqu	112(%rdi), %xmm0	/* %xmm0 = round key 7, loaded once so the loop below touches no key memory */
  1.1911 +4:	movdqu	(%r8, %rax), %xmm1	/* next plaintext block */
  1.1912 +	pxor	%xmm8, %xmm1	/* initial AddRoundKey with round key 0 */
  1.1914 +	.byte 0x66,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm2, %xmm1 */
  1.1915 +	.byte 0x66,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm3, %xmm1 */
  1.1916 +	.byte 0x66,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm4, %xmm1 */
  1.1917 +	.byte 0x66,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm5, %xmm1 */
  1.1918 +	.byte 0x66,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm6, %xmm1 */
  1.1919 +	.byte 0x66,0x0f,0x38,0xdc,0xcf	/* aesenc	%xmm7, %xmm1 */
  1.1920 +	.byte 0x66,0x0f,0x38,0xdc,0xc8	/* aesenc	%xmm0, %xmm1 */
  1.1922 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm9, %xmm1 */
  1.1923 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm10, %xmm1 */
  1.1924 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm1 */
  1.1925 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm12, %xmm1 */
  1.1926 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm13, %xmm1 */
  1.1927 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm14, %xmm1 */
  1.1928 +	.byte 0x66,0x41,0x0f,0x38,0xdd,0xcf	/* aesenclast %xmm15, %xmm1 */
  1.1929 +	movdqu	%xmm1, (%rsi, %rax)
  1.1930 +	addq	$16, %rax
  1.1931 +	cmpq	%rax, %r9	/* loop ends only when offset == inputLen exactly */
  1.1932 +	jne	4b
  1.1933 +
  1.1934 +5:	xor	%eax, %eax	/* return 0 */
  1.1935 +	ret
  1.1936 +	.size intel_aes_encrypt_ecb_256, .-intel_aes_encrypt_ecb_256
  1.1937 +
  1.1938 +
  1.1939 +/* in %rdi : cx - context
  1.1940 +   in %rsi : output - pointer to output buffer
  1.1941 +   in %rdx : outputLen - pointer to variable for length of output
  1.1942 +             (filled by caller)
  1.1943 +   in %rcx : maxOutputLen - length of output buffer
  1.1944 +   in %r8  : input - pointer to input buffer
  1.1945 +   in %r9  : inputLen - length of input buffer
  1.1946 +   on stack: blocksize - AES blocksize (always 16, unused)
  1.1947 +*/
/* Summary: decrypts inputLen bytes from input into output, 16 bytes at a
 * time, with no chaining between blocks (ECB).  Each block is XORed with
 * the round key at offset 224 of the expanded key, run through 13 aesdec
 * rounds using the keys at offsets 208 down to 16, and finished with
 * aesdeclast using the key at offset 0 (14 rounds in total).  The expanded
 * key is read starting at cx + 48, the value of EXPANDED_KEY_OFFSET
 * defined at the top of this file.
 *
 * While at least 128 bytes remain, eight blocks are processed per loop
 * iteration; whatever is left is processed one block per iteration.
 *
 * Returns 0 in %eax.  %rdx and %rcx are not read.
 * May modify %rax, %rdi, %r11 (only written when inputLen >= 128),
 * %xmm1-%xmm15 and the flags.
 *
 * NOTE(review): the single-block loop exits only when the byte offset is
 * exactly equal to inputLen, so inputLen is assumed to be a multiple of
 * 16 -- confirm that callers guarantee this.
 * NOTE(review): aesdec implements the equivalent inverse cipher, so the
 * key schedule at cx + 48 is presumably stored in the form that requires
 * -- confirm against the decrypt key-setup routine.
 */
  1.1948 +	.type intel_aes_decrypt_ecb_256,@function
  1.1949 +	.globl intel_aes_decrypt_ecb_256
  1.1950 +	.align	16
  1.1951 +intel_aes_decrypt_ecb_256:
  1.1952 +//	leaq	EXPANDED_KEY_OFFSET(%rdi), %rdi
  1.1953 +	leaq	48(%rdi), %rdi	/* %rdi = cx + 48 = start of the expanded key */
  1.1954 +
  1.1955 +	movdqu	(%rdi), %xmm2	/* key at offset 0: aesdeclast operand in the 8-block loop */
  1.1956 +	movdqu	224(%rdi), %xmm15	/* key at offset 224: XORed into every input block first */
  1.1957 +	xorl	%eax, %eax	/* %rax = byte offset into input and output */
  1.1958 +//	cmpq	$8*16, %r9
  1.1959 +	cmpq	$128, %r9
  1.1960 +	jb	1f	/* fewer than 8 blocks: skip the 8-block loop */
  1.1961 +//	leaq	-8*16(%r9), %r11
  1.1962 +	leaq	-128(%r9), %r11	/* %r11 = inputLen - 128, highest offset with 8 blocks left */
	/* 8-block loop: load eight input blocks into %xmm3-%xmm10. */
  1.1963 +2:	movdqu	(%r8, %rax), %xmm3
  1.1964 +	movdqu	16(%r8, %rax), %xmm4
  1.1965 +	movdqu	32(%r8, %rax), %xmm5
  1.1966 +	movdqu	48(%r8, %rax), %xmm6
  1.1967 +	movdqu	64(%r8, %rax), %xmm7
  1.1968 +	movdqu	80(%r8, %rax), %xmm8
  1.1969 +	movdqu	96(%r8, %rax), %xmm9
  1.1970 +	movdqu	112(%r8, %rax), %xmm10
	/* Initial XOR of all eight blocks with the key at offset 224. */
  1.1971 +	pxor	%xmm15, %xmm3
  1.1972 +	pxor	%xmm15, %xmm4
  1.1973 +	pxor	%xmm15, %xmm5
  1.1974 +	pxor	%xmm15, %xmm6
  1.1975 +	pxor	%xmm15, %xmm7
  1.1976 +	pxor	%xmm15, %xmm8
  1.1977 +	pxor	%xmm15, %xmm9
  1.1978 +	pxor	%xmm15, %xmm10
  1.1979 +
  1.1980 +// complete loop unrolling
	/* The rounds are unrolled two at a time: %xmm1 and %xmm11 are reloaded
	 * with the next pair of round keys (descending offsets) and each key is
	 * applied to all eight blocks before moving on. */
  1.1981 +	movdqu 208(%rdi), %xmm1
  1.1982 +	movdqu 192(%rdi), %xmm11
  1.1983 +	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  1.1984 +	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  1.1985 +	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  1.1986 +	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  1.1987 +	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  1.1988 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  1.1989 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  1.1990 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  1.1991 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  1.1992 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  1.1993 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  1.1994 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  1.1995 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  1.1996 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  1.1997 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  1.1998 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  1.1999 +
  1.2000 +	movdqu 176(%rdi), %xmm1
  1.2001 +	movdqu 160(%rdi), %xmm11
  1.2002 +	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  1.2003 +	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  1.2004 +	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  1.2005 +	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  1.2006 +	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  1.2007 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  1.2008 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  1.2009 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  1.2010 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  1.2011 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  1.2012 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  1.2013 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  1.2014 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  1.2015 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  1.2016 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  1.2017 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  1.2018 +
  1.2019 +	movdqu 144(%rdi), %xmm1
  1.2020 +	movdqu 128(%rdi), %xmm11
  1.2021 +	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  1.2022 +	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  1.2023 +	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  1.2024 +	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  1.2025 +	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  1.2026 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  1.2027 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  1.2028 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  1.2029 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  1.2030 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  1.2031 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  1.2032 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  1.2033 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  1.2034 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  1.2035 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  1.2036 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  1.2037 +
  1.2038 +	movdqu 112(%rdi), %xmm1
  1.2039 +	movdqu 96(%rdi), %xmm11
  1.2040 +	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  1.2041 +	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  1.2042 +	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  1.2043 +	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  1.2044 +	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  1.2045 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  1.2046 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  1.2047 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  1.2048 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  1.2049 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  1.2050 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  1.2051 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  1.2052 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  1.2053 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  1.2054 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  1.2055 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  1.2056 +
  1.2057 +	movdqu 80(%rdi), %xmm1
  1.2058 +	movdqu 64(%rdi), %xmm11
  1.2059 +	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  1.2060 +	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  1.2061 +	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  1.2062 +	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  1.2063 +	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  1.2064 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  1.2065 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  1.2066 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  1.2067 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  1.2068 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  1.2069 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  1.2070 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  1.2071 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  1.2072 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  1.2073 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  1.2074 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  1.2075 +
  1.2076 +	movdqu 48(%rdi), %xmm1
  1.2077 +	movdqu 32(%rdi), %xmm11
  1.2078 +	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  1.2079 +	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  1.2080 +	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  1.2081 +	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  1.2082 +	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  1.2083 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  1.2084 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  1.2085 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  1.2086 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  1.2087 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  1.2088 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  1.2089 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  1.2090 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  1.2091 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  1.2092 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  1.2093 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  1.2094 +
	/* Last aesdec round (key at offset 16), then aesdeclast with the key
	 * at offset 0 that has been sitting in %xmm2 since function entry. */
  1.2095 +	movdqu 16(%rdi), %xmm1
  1.2096 +	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  1.2097 +	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  1.2098 +	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  1.2099 +	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  1.2100 +	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  1.2101 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  1.2102 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  1.2103 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  1.2104 +	.byte 0x66,0x0f,0x38,0xdf,0xda		/* aesdeclast 	%xmm2, %xmm3 */
  1.2105 +	.byte 0x66,0x0f,0x38,0xdf,0xe2		/* aesdeclast 	%xmm2, %xmm4 */
  1.2106 +	.byte 0x66,0x0f,0x38,0xdf,0xea		/* aesdeclast 	%xmm2, %xmm5 */
  1.2107 +	.byte 0x66,0x0f,0x38,0xdf,0xf2		/* aesdeclast 	%xmm2, %xmm6 */
  1.2108 +	.byte 0x66,0x0f,0x38,0xdf,0xfa		/* aesdeclast 	%xmm2, %xmm7 */
  1.2109 +	.byte 0x66,0x44,0x0f,0x38,0xdf,0xc2	/* aesdeclast 	%xmm2, %xmm8 */
  1.2110 +	.byte 0x66,0x44,0x0f,0x38,0xdf,0xca	/* aesdeclast 	%xmm2, %xmm9 */
  1.2111 +	.byte 0x66,0x44,0x0f,0x38,0xdf,0xd2	/* aesdeclast 	%xmm2, %xmm10 */
  1.2112 +
	/* Store the eight decrypted blocks and advance by 128 bytes. */
  1.2113 +	movdqu	%xmm3, (%rsi, %rax)
  1.2114 +	movdqu	%xmm4, 16(%rsi, %rax)
  1.2115 +	movdqu	%xmm5, 32(%rsi, %rax)
  1.2116 +	movdqu	%xmm6, 48(%rsi, %rax)
  1.2117 +	movdqu	%xmm7, 64(%rsi, %rax)
  1.2118 +	movdqu	%xmm8, 80(%rsi, %rax)
  1.2119 +	movdqu	%xmm9, 96(%rsi, %rax)
  1.2120 +	movdqu	%xmm10, 112(%rsi, %rax)
  1.2121 +//	addq	$8*16, %rax
  1.2122 +	addq	$128, %rax
  1.2123 +	cmpq	%r11, %rax
  1.2124 +	jbe	2b	/* repeat while offset <= inputLen - 128 (unsigned) */
  1.2125 +1:	cmpq	%rax, %r9
  1.2126 +	je	5f	/* offset == inputLen: nothing left to do */
  1.2127 +
	/* Single-block path: keep the keys at offsets 16..208 in %xmm2-%xmm14
	 * (%xmm15 still holds the key at offset 224).  There is no spare
	 * register for the key at offset 0, so %xmm8 is shared between the
	 * keys at offsets 112 and 0 and reloaded inside the loop. */
  1.2128 +	movdqu	16(%rdi), %xmm2
  1.2129 +	movdqu	32(%rdi), %xmm3
  1.2130 +	movdqu	48(%rdi), %xmm4
  1.2131 +	movdqu	64(%rdi), %xmm5
  1.2132 +	movdqu	80(%rdi), %xmm6
  1.2133 +	movdqu	96(%rdi), %xmm7
  1.2134 +	movdqu	112(%rdi), %xmm8
  1.2135 +	movdqu	128(%rdi), %xmm9
  1.2136 +	movdqu	144(%rdi), %xmm10
  1.2137 +	movdqu	160(%rdi), %xmm11
  1.2138 +	movdqu	176(%rdi), %xmm12
  1.2139 +	movdqu	192(%rdi), %xmm13
  1.2140 +	movdqu	208(%rdi), %xmm14
  1.2141 +
  1.2142 +4:	movdqu	(%r8, %rax), %xmm1
  1.2143 +	pxor	%xmm15, %xmm1	/* initial XOR with the key at offset 224 */
  1.2144 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xce	/* aesdec	%xmm14, %xmm1 */
  1.2145 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm13, %xmm1 */
  1.2146 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm12, %xmm1 */
  1.2147 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm1 */
  1.2148 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xca	/* aesdec	%xmm10, %xmm1 */
  1.2149 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm9, %xmm1 */
  1.2150 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xc8	/* aesdec	%xmm8, %xmm1 */
  1.2151 +	movdqu	(%rdi), %xmm8	/* key 112 is used up; load the key at offset 0 for aesdeclast */
  1.2152 +	.byte 0x66,0x0f,0x38,0xde,0xcf	/* aesdec	%xmm7, %xmm1 */
  1.2153 +	.byte 0x66,0x0f,0x38,0xde,0xce	/* aesdec	%xmm6, %xmm1 */
  1.2154 +	.byte 0x66,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm5, %xmm1 */
  1.2155 +	.byte 0x66,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm4, %xmm1 */
  1.2156 +	.byte 0x66,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm3, %xmm1 */
  1.2157 +	.byte 0x66,0x0f,0x38,0xde,0xca	/* aesdec	%xmm2, %xmm1 */
  1.2158 +	.byte 0x66,0x41,0x0f,0x38,0xdf,0xc8	/* aesdeclast %xmm8, %xmm1 */
  1.2159 +	movdqu	112(%rdi), %xmm8	/* put the key at offset 112 back for the next iteration */
  1.2160 +	movdqu	%xmm1, (%rsi, %rax)
  1.2161 +	addq	$16, %rax
  1.2162 +	cmpq	%rax, %r9
  1.2163 +	jne	4b	/* exits only on exact equality with inputLen */
  1.2164 +
  1.2165 +5:	xor	%eax, %eax	/* return value 0 */
  1.2166 +	ret
  1.2167 +	.size intel_aes_decrypt_ecb_256, .-intel_aes_decrypt_ecb_256
  1.2168 +
  1.2169 +
  1.2170 +/* in %rdi : cx - context
  1.2171 +   in %rsi : output - pointer to output buffer
  1.2172 +   in %rdx : outputLen - pointer to variable for length of output
  1.2173 +             (filled by caller)
  1.2174 +   in %rcx : maxOutputLen - length of output buffer
  1.2175 +   in %r8  : input - pointer to input buffer
  1.2176 +   in %r9  : inputLen - length of input buffer
  1.2177 +   on stack: blocksize - AES blocksize (always 16, unused)
  1.2178 +*/
/* Summary: encrypts inputLen bytes from input into output in CBC fashion,
 * one 16-byte block per loop iteration.  Each input block is XORed with
 * the previous ciphertext block (for the first block, the 16 bytes at
 * cx + 16, i.e. IV_OFFSET), then with the round key at offset 0 of the
 * expanded key at cx + 48 (EXPANDED_KEY_OFFSET), followed by 13 aesenc
 * rounds with the keys at offsets 16..208 and aesenclast with the key at
 * offset 224.  After the last block, the final ciphertext block is stored
 * back at cx + 16.
 *
 * If inputLen is 0 the function returns immediately without touching the
 * context or the buffers.
 *
 * Returns 0 in %eax.  The incoming %rdx (outputLen) is overwritten with
 * the IV address without being read; %rcx is not read.
 * May modify %rax, %rdx, %rdi, %xmm0-%xmm15 and the flags.
 *
 * NOTE(review): the loop exits only when the byte offset is exactly equal
 * to inputLen, so inputLen is assumed to be a multiple of 16 -- confirm
 * that callers guarantee this.
 */
  1.2179 +	.type intel_aes_encrypt_cbc_256,@function
  1.2180 +	.globl intel_aes_encrypt_cbc_256
  1.2181 +	.align	16
  1.2182 +intel_aes_encrypt_cbc_256:
  1.2183 +	testq	%r9, %r9
  1.2184 +	je	2f	/* inputLen == 0: nothing to encrypt */
  1.2185 +
  1.2186 +//	leaq	IV_OFFSET(%rdi), %rdx
  1.2187 +//	leaq	EXPANDED_KEY_OFFSET(%rdi), %rdi
  1.2188 +	leaq	16(%rdi), %rdx	/* %rdx = cx + 16 = address of the IV */
  1.2189 +	leaq	48(%rdi), %rdi	/* %rdi = cx + 48 = start of the expanded key */
  1.2190 +
	/* %xmm0 carries the chaining value.  The 15 round keys need 15
	 * registers but only %xmm2-%xmm15 are free, so the key at offset 112
	 * is not preloaded: %xmm8 normally holds the key at offset 0 and is
	 * temporarily reloaded with the key at offset 112 inside the loop. */
  1.2191 +	movdqu	(%rdx), %xmm0
  1.2192 +	movdqu	(%rdi), %xmm8
  1.2193 +	movdqu	16(%rdi), %xmm2
  1.2194 +	movdqu	32(%rdi), %xmm3
  1.2195 +	movdqu	48(%rdi), %xmm4
  1.2196 +	movdqu	64(%rdi), %xmm5
  1.2197 +	movdqu	80(%rdi), %xmm6
  1.2198 +	movdqu	96(%rdi), %xmm7
  1.2199 +	movdqu	128(%rdi), %xmm9
  1.2200 +	movdqu	144(%rdi), %xmm10
  1.2201 +	movdqu	160(%rdi), %xmm11
  1.2202 +	movdqu	176(%rdi), %xmm12
  1.2203 +	movdqu	192(%rdi), %xmm13
  1.2204 +	movdqu	208(%rdi), %xmm14
  1.2205 +	movdqu	224(%rdi), %xmm15
  1.2206 +
  1.2207 +	xorl	%eax, %eax	/* %rax = byte offset into input and output */
  1.2208 +1:	movdqu	(%r8, %rax), %xmm1
  1.2209 +	pxor	%xmm0, %xmm1	/* XOR with the previous ciphertext block (or the IV) */
  1.2210 +	pxor	%xmm8, %xmm1	/* XOR with the round key at offset 0 */
  1.2211 +	movdqu	112(%rdi), %xmm8	/* borrow %xmm8 for the key at offset 112 */
  1.2212 +	.byte 0x66,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm2, %xmm1 */
  1.2213 +	.byte 0x66,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm3, %xmm1 */
  1.2214 +	.byte 0x66,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm4, %xmm1 */
  1.2215 +	.byte 0x66,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm5, %xmm1 */
  1.2216 +	.byte 0x66,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm6, %xmm1 */
  1.2217 +	.byte 0x66,0x0f,0x38,0xdc,0xcf	/* aesenc	%xmm7, %xmm1 */
  1.2218 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc8	/* aesenc	%xmm8, %xmm1 */
  1.2219 +	movdqu	(%rdi), %xmm8	/* put the key at offset 0 back for the next block */
  1.2220 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm9, %xmm1 */
  1.2221 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm10, %xmm1 */
  1.2222 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm1 */
  1.2223 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm12, %xmm1 */
  1.2224 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm13, %xmm1 */
  1.2225 +	.byte 0x66,0x41,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm14, %xmm1 */
  1.2226 +	.byte 0x66,0x41,0x0f,0x38,0xdd,0xcf	/* aesenclast %xmm15, %xmm1 */
  1.2227 +	movdqu	%xmm1, (%rsi, %rax)
  1.2228 +	movdqa	%xmm1, %xmm0	/* this ciphertext block is the chaining value for the next one */
  1.2229 +	addq	$16, %rax
  1.2230 +	cmpq	%rax, %r9
  1.2231 +	jne	1b	/* exits only on exact equality with inputLen */
  1.2232 +
  1.2233 +	movdqu	%xmm0, (%rdx)	/* store the last ciphertext block back at cx + 16 */
  1.2234 +
  1.2235 +2:	xor	%eax, %eax	/* return value 0 */
  1.2236 +	ret
  1.2237 +	.size intel_aes_encrypt_cbc_256, .-intel_aes_encrypt_cbc_256
  1.2238 +
  1.2239 +
  1.2240 +/* in %rdi : cx - context
  1.2241 +   in %rsi : output - pointer to output buffer
  1.2242 +   in %rdx : outputLen - pointer to variable for length of output
  1.2243 +             (filled by caller)
  1.2244 +   in %rcx : maxOutputLen - length of output buffer
  1.2245 +   in %r8  : input - pointer to input buffer
  1.2246 +   in %r9  : inputLen - length of input buffer
  1.2247 +   on stack: blocksize - AES blocksize (always 16, unused)
  1.2248 +*/
/* Summary: decrypts inputLen bytes from input into output in CBC fashion.
 * Each 16-byte input block is XORed with the round key at offset 224 of
 * the expanded key at cx + 48 (EXPANDED_KEY_OFFSET), run through 13 aesdec
 * rounds with the keys at offsets 208 down to 16, finished with aesdeclast
 * using the key at offset 0, and finally XORed with the previous input
 * (ciphertext) block -- for the first block, the 16 bytes at cx + 16
 * (IV_OFFSET).  On exit the last input block processed is stored back at
 * cx + 16; when inputLen is 0 the value read from cx + 16 is written back
 * unchanged.
 *
 * While at least 128 bytes remain, eight blocks are processed per loop
 * iteration; whatever is left is processed one block per iteration.  In
 * both paths every input block is re-read into %xmm0 before the matching
 * output is stored, which is what allows input == output.
 *
 * Returns 0 in %eax.  The incoming %rdx (outputLen) is overwritten with
 * the IV address without being read; %rcx is not read.
 * May modify %rax, %rdx, %rdi, %r11 (only written when inputLen >= 128),
 * %xmm0-%xmm15 and the flags.
 *
 * NOTE(review): the single-block loop exits only when the byte offset is
 * exactly equal to inputLen, so inputLen is assumed to be a multiple of
 * 16 -- confirm that callers guarantee this.
 * NOTE(review): aesdec implements the equivalent inverse cipher, so the
 * key schedule at cx + 48 is presumably stored in the form that requires
 * -- confirm against the decrypt key-setup routine.
 */
  1.2249 +	.type intel_aes_decrypt_cbc_256,@function
  1.2250 +	.globl intel_aes_decrypt_cbc_256
  1.2251 +	.align	16
  1.2252 +intel_aes_decrypt_cbc_256:
  1.2253 +//	leaq	IV_OFFSET(%rdi), %rdx
  1.2254 +//	leaq	EXPANDED_KEY_OFFSET(%rdi), %rdi
  1.2255 +	leaq	16(%rdi), %rdx	/* %rdx = cx + 16 = address of the IV */
  1.2256 +	leaq	48(%rdi), %rdi	/* %rdi = cx + 48 = start of the expanded key */
  1.2257 +
  1.2258 +	movdqu	(%rdx), %xmm0	/* %xmm0 = chaining value, initially the IV */
  1.2259 +	movdqu	(%rdi), %xmm2	/* key at offset 0: aesdeclast operand in the 8-block loop */
  1.2260 +	movdqu	224(%rdi), %xmm15	/* key at offset 224: XORed into every input block first */
  1.2261 +	xorl	%eax, %eax	/* %rax = byte offset into input and output */
  1.2262 +//	cmpq	$8*16, %r9
  1.2263 +	cmpq	$128, %r9
  1.2264 +	jb	1f	/* fewer than 8 blocks: skip the 8-block loop */
  1.2265 +//	leaq	-8*16(%r9), %r11
  1.2266 +	leaq	-128(%r9), %r11	/* %r11 = inputLen - 128, highest offset with 8 blocks left */
	/* 8-block loop: load eight input blocks into %xmm3-%xmm10. */
  1.2267 +2:	movdqu  (%r8, %rax), %xmm3
  1.2268 +	movdqu	16(%r8, %rax), %xmm4
  1.2269 +	movdqu	32(%r8, %rax), %xmm5
  1.2270 +	movdqu	48(%r8, %rax), %xmm6
  1.2271 +	movdqu	64(%r8, %rax), %xmm7
  1.2272 +	movdqu	80(%r8, %rax), %xmm8
  1.2273 +	movdqu	96(%r8, %rax), %xmm9
  1.2274 +	movdqu	112(%r8, %rax), %xmm10
	/* Initial XOR of all eight blocks with the key at offset 224. */
  1.2275 +	pxor	%xmm15, %xmm3
  1.2276 +	pxor	%xmm15, %xmm4
  1.2277 +	pxor	%xmm15, %xmm5
  1.2278 +	pxor	%xmm15, %xmm6
  1.2279 +	pxor	%xmm15, %xmm7
  1.2280 +	pxor	%xmm15, %xmm8
  1.2281 +	pxor	%xmm15, %xmm9
  1.2282 +	pxor	%xmm15, %xmm10
  1.2283 +
  1.2284 +// complete loop unrolling
	/* The rounds are unrolled two at a time: %xmm1 and %xmm11 are reloaded
	 * with the next pair of round keys (descending offsets) and each key is
	 * applied to all eight blocks before moving on. */
  1.2285 +	movdqu 208(%rdi), %xmm1
  1.2286 +	movdqu 192(%rdi), %xmm11
  1.2287 +	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  1.2288 +	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  1.2289 +	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  1.2290 +	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  1.2291 +	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  1.2292 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  1.2293 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  1.2294 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  1.2295 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  1.2296 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  1.2297 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  1.2298 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  1.2299 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  1.2300 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  1.2301 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  1.2302 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  1.2303 +
  1.2304 +	movdqu 176(%rdi), %xmm1
  1.2305 +	movdqu 160(%rdi), %xmm11
  1.2306 +	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  1.2307 +	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  1.2308 +	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  1.2309 +	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  1.2310 +	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  1.2311 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  1.2312 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  1.2313 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  1.2314 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  1.2315 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  1.2316 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  1.2317 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  1.2318 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  1.2319 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  1.2320 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  1.2321 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  1.2322 +
  1.2323 +	movdqu 144(%rdi), %xmm1
  1.2324 +	movdqu 128(%rdi), %xmm11
  1.2325 +	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  1.2326 +	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  1.2327 +	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  1.2328 +	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  1.2329 +	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  1.2330 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  1.2331 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  1.2332 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  1.2333 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  1.2334 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  1.2335 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  1.2336 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  1.2337 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  1.2338 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  1.2339 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  1.2340 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  1.2341 +
  1.2342 +	movdqu 112(%rdi), %xmm1
  1.2343 +	movdqu 96(%rdi), %xmm11
  1.2344 +	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  1.2345 +	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  1.2346 +	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  1.2347 +	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  1.2348 +	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  1.2349 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  1.2350 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  1.2351 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  1.2352 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  1.2353 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  1.2354 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  1.2355 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  1.2356 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  1.2357 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  1.2358 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  1.2359 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  1.2360 +
  1.2361 +	movdqu 80(%rdi), %xmm1
  1.2362 +	movdqu 64(%rdi), %xmm11
  1.2363 +	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  1.2364 +	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  1.2365 +	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  1.2366 +	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  1.2367 +	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  1.2368 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  1.2369 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  1.2370 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  1.2371 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  1.2372 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  1.2373 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  1.2374 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  1.2375 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  1.2376 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  1.2377 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  1.2378 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  1.2379 +
  1.2380 +	movdqu 48(%rdi), %xmm1
  1.2381 +	movdqu 32(%rdi), %xmm11
  1.2382 +	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  1.2383 +	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  1.2384 +	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  1.2385 +	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  1.2386 +	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  1.2387 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  1.2388 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  1.2389 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  1.2390 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  1.2391 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  1.2392 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  1.2393 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  1.2394 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  1.2395 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  1.2396 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  1.2397 +	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  1.2398 +
	/* Last aesdec round (key at offset 16), then aesdeclast with the key
	 * at offset 0 that has been sitting in %xmm2 since function entry. */
  1.2399 +	movdqu 16(%rdi), %xmm1
  1.2400 +	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  1.2401 +	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  1.2402 +	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  1.2403 +	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  1.2404 +	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  1.2405 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  1.2406 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  1.2407 +	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  1.2408 +	.byte 0x66,0x0f,0x38,0xdf,0xda		/* aesdeclast 	%xmm2, %xmm3 */
  1.2409 +	.byte 0x66,0x0f,0x38,0xdf,0xe2		/* aesdeclast 	%xmm2, %xmm4 */
  1.2410 +	.byte 0x66,0x0f,0x38,0xdf,0xea		/* aesdeclast 	%xmm2, %xmm5 */
  1.2411 +	.byte 0x66,0x0f,0x38,0xdf,0xf2		/* aesdeclast 	%xmm2, %xmm6 */
  1.2412 +	.byte 0x66,0x0f,0x38,0xdf,0xfa		/* aesdeclast 	%xmm2, %xmm7 */
  1.2413 +	.byte 0x66,0x44,0x0f,0x38,0xdf,0xc2	/* aesdeclast 	%xmm2, %xmm8 */
  1.2414 +	.byte 0x66,0x44,0x0f,0x38,0xdf,0xca	/* aesdeclast 	%xmm2, %xmm9 */
  1.2415 +	.byte 0x66,0x44,0x0f,0x38,0xdf,0xd2	/* aesdeclast 	%xmm2, %xmm10 */
  1.2416 +
	/* CBC un-chaining: block k is XORed with input block k-1.  The
	 * %xmm3-%xmm10 copies of the input were consumed by the rounds above,
	 * so each ciphertext block is read from the input again into %xmm0.
	 * All of these reads come before the first store below. */
  1.2417 + 	pxor	%xmm0, %xmm3	/* first block uses the chaining value carried in %xmm0 */
  1.2418 +	movdqu	(%r8, %rax), %xmm0
  1.2419 +	pxor	%xmm0, %xmm4
  1.2420 +	movdqu	16(%r8, %rax), %xmm0
  1.2421 +	pxor	%xmm0, %xmm5
  1.2422 +	movdqu	32(%r8, %rax), %xmm0
  1.2423 +	pxor	%xmm0, %xmm6
  1.2424 +	movdqu	48(%r8, %rax), %xmm0
  1.2425 +	pxor	%xmm0, %xmm7
  1.2426 +	movdqu	64(%r8, %rax), %xmm0
  1.2427 +	pxor	%xmm0, %xmm8
  1.2428 +	movdqu	80(%r8, %rax), %xmm0
  1.2429 +	pxor	%xmm0, %xmm9
  1.2430 +	movdqu	96(%r8, %rax), %xmm0
  1.2431 +	pxor	%xmm0, %xmm10
  1.2432 +	movdqu	112(%r8, %rax), %xmm0	/* eighth input block: chaining value for what follows */
  1.2433 +	movdqu	%xmm3, (%rsi, %rax)
  1.2434 +	movdqu	%xmm4, 16(%rsi, %rax)
  1.2435 +	movdqu	%xmm5, 32(%rsi, %rax)
  1.2436 +	movdqu	%xmm6, 48(%rsi, %rax)
  1.2437 +	movdqu	%xmm7, 64(%rsi, %rax)
  1.2438 +	movdqu	%xmm8, 80(%rsi, %rax)
  1.2439 +	movdqu	%xmm9, 96(%rsi, %rax)
  1.2440 +	movdqu	%xmm10, 112(%rsi, %rax)
  1.2441 +//	addq	$8*16, %rax
  1.2442 +	addq	$128, %rax
  1.2443 +	cmpq	%r11, %rax
  1.2444 +	jbe	2b	/* repeat while offset <= inputLen - 128 (unsigned) */
  1.2445 +1:	cmpq	%rax, %r9
  1.2446 +	je	5f	/* offset == inputLen: only the IV write-back is left */
  1.2447 +
	/* Single-block path: keep the keys at offsets 16..208 in %xmm2-%xmm14
	 * (%xmm15 still holds the key at offset 224, %xmm0 the chaining
	 * value).  There is no spare register for the key at offset 0, so
	 * %xmm8 is shared between the keys at offsets 112 and 0 and reloaded
	 * inside the loop. */
  1.2448 +	movdqu	16(%rdi), %xmm2
  1.2449 +	movdqu	32(%rdi), %xmm3
  1.2450 +	movdqu	48(%rdi), %xmm4
  1.2451 +	movdqu	64(%rdi), %xmm5
  1.2452 +	movdqu	80(%rdi), %xmm6
  1.2453 +	movdqu	96(%rdi), %xmm7
  1.2454 +	movdqu	112(%rdi), %xmm8
  1.2455 +	movdqu	128(%rdi), %xmm9
  1.2456 +	movdqu	144(%rdi), %xmm10
  1.2457 +	movdqu	160(%rdi), %xmm11
  1.2458 +	movdqu	176(%rdi), %xmm12
  1.2459 +	movdqu	192(%rdi), %xmm13
  1.2460 +	movdqu	208(%rdi), %xmm14
  1.2461 +
  1.2462 +4:	movdqu	(%r8, %rax), %xmm1
  1.2463 +	pxor	%xmm15, %xmm1	/* initial XOR with the key at offset 224 */
  1.2464 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xce	/* aesdec	%xmm14, %xmm1 */
  1.2465 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm13, %xmm1 */
  1.2466 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm12, %xmm1 */
  1.2467 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm1 */
  1.2468 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xca	/* aesdec	%xmm10, %xmm1 */
  1.2469 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm9, %xmm1 */
  1.2470 +	.byte 0x66,0x41,0x0f,0x38,0xde,0xc8	/* aesdec	%xmm8, %xmm1 */
  1.2471 +	movdqu	(%rdi), %xmm8	/* key 112 is used up; load the key at offset 0 for aesdeclast */
  1.2472 +	.byte 0x66,0x0f,0x38,0xde,0xcf	/* aesdec	%xmm7, %xmm1 */
  1.2473 +	.byte 0x66,0x0f,0x38,0xde,0xce	/* aesdec	%xmm6, %xmm1 */
  1.2474 +	.byte 0x66,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm5, %xmm1 */
  1.2475 +	.byte 0x66,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm4, %xmm1 */
  1.2476 +	.byte 0x66,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm3, %xmm1 */
  1.2477 +	.byte 0x66,0x0f,0x38,0xde,0xca	/* aesdec	%xmm2, %xmm1 */
  1.2478 +	.byte 0x66,0x41,0x0f,0x38,0xdf,0xc8	/* aesdeclast %xmm8, %xmm1 */
  1.2479 +	movdqu	112(%rdi), %xmm8	/* put the key at offset 112 back for the next iteration */
  1.2480 +	pxor	%xmm0, %xmm1	/* un-chain: XOR with the previous input block (or the IV) */
  1.2481 +	movdqu	(%r8, %rax), %xmm0  /* fetch the IV before we store the block */
  1.2482 +	movdqu	%xmm1, (%rsi, %rax) /* in case input buf = output buf */
  1.2483 +	addq	$16, %rax
  1.2484 +	cmpq	%rax, %r9
  1.2485 +	jne	4b	/* exits only on exact equality with inputLen */
  1.2486 +
  1.2487 +5:	movdqu	%xmm0, (%rdx)	/* store the chaining value back at cx + 16 */
  1.2488 +
  1.2489 +	xor	%eax, %eax	/* return value 0 */
  1.2490 +	ret
  1.2491 +	.size intel_aes_decrypt_cbc_256, .-intel_aes_decrypt_cbc_256

mercurial