security/nss/lib/freebl/intel-aes.s

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /* This Source Code Form is subject to the terms of the Mozilla Public
     2  * License, v. 2.0. If a copy of the MPL was not distributed with this
     3  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     5 	.text
     7 #define IV_OFFSET 16
     8 #define EXPANDED_KEY_OFFSET 48
    11 /* in %rdi : the key (16 bytes, read with an unaligned load)
    12    in %rsi : buffer for expanded key (176 bytes are written: the key plus ten derived round keys)
    13    out %eax: 0 (cleared before the first expansion step and not modified afterwards) */
    14 	.type intel_aes_encrypt_init_128,@function
    15 	.globl intel_aes_encrypt_init_128
    16 	.align	16
    17 intel_aes_encrypt_init_128:
    18 	movups	(%rdi), %xmm1	/* xmm1 = the 16-byte key */
    19 	movups	%xmm1, (%rsi)	/* round key 0 is the key itself */
    20 	leaq	16(%rsi), %rsi	/* rsi = where key_expansion128 stores the next round key */
    21 	xorl	%eax, %eax	/* eax = 0; key_expansion128 copies it into xmm3 */
    23 	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x01	/* aeskeygenassist $0x01, %xmm1, %xmm2 */
    24 	call key_expansion128	/* xmm1 = round key 1, stored at (%rsi); rsi advances by 16 */
    25 	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x02	/* aeskeygenassist $0x02, %xmm1, %xmm2 */
    26 	call key_expansion128	/* round key 2 */
    27 	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x04	/* aeskeygenassist $0x04, %xmm1, %xmm2 */
    28 	call key_expansion128	/* round key 3 */
    29 	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x08	/* aeskeygenassist $0x08, %xmm1, %xmm2 */
    30 	call key_expansion128	/* round key 4 */
    31 	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x10	/* aeskeygenassist $0x10, %xmm1, %xmm2 */
    32 	call key_expansion128	/* round key 5 */
    33 	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x20	/* aeskeygenassist $0x20, %xmm1, %xmm2 */
    34 	call key_expansion128	/* round key 6 */
    35 	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x40	/* aeskeygenassist $0x40, %xmm1, %xmm2 */
    36 	call key_expansion128	/* round key 7 */
    37 	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x80	/* aeskeygenassist $0x80, %xmm1, %xmm2 */
    38 	call key_expansion128	/* round key 8 */
    39 	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x1b	/* aeskeygenassist $0x1b, %xmm1, %xmm2 */
    40 	call key_expansion128	/* round key 9 */
    41 	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x36	/* aeskeygenassist $0x36, %xmm1, %xmm2 */
    42 	call key_expansion128	/* round key 10 (last) */
    44 	ret	/* eax is still 0 here */
    45 	.size intel_aes_encrypt_init_128, .-intel_aes_encrypt_init_128
    48 /* in %rdi : the key (16 bytes, read with an unaligned load)
    49    in %rsi : buffer for expanded key (176 bytes are written; round keys 1..9 are stored in their aesimc form, keys 0 and 10 unchanged)
    50    out %eax: 0 (cleared before the first expansion step and not modified afterwards) */
    51 	.type intel_aes_decrypt_init_128,@function
    52 	.globl intel_aes_decrypt_init_128
    53 	.align	16
    54 intel_aes_decrypt_init_128:
    55 	movups	(%rdi), %xmm1	/* xmm1 = the 16-byte key */
    56 	movups	%xmm1, (%rsi)	/* round key 0 is the key itself (no aesimc) */
    57 	leaq	16(%rsi), %rsi	/* rsi = where key_expansion128 stores the next round key */
    58 	xorl	%eax, %eax	/* eax = 0; key_expansion128 copies it into xmm3 */
    60 	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x01	/* aeskeygenassist $0x01, %xmm1, %xmm2 */
    61 	call key_expansion128	/* xmm1 = round key 1, stored; rsi now points past it */
    62 	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
    63 	movups	%xmm2, -16(%rsi)	/* overwrite the round key just stored with its aesimc form; xmm1 keeps the plain key for the next step */
    64 	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x02	/* aeskeygenassist $0x02, %xmm1, %xmm2 */
    65 	call key_expansion128	/* round key 2 */
    66 	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
    67 	movups	%xmm2, -16(%rsi)
    68 	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x04	/* aeskeygenassist $0x04, %xmm1, %xmm2 */
    69 	call key_expansion128	/* round key 3 */
    70 	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
    71 	movups	%xmm2, -16(%rsi)
    72 	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x08	/* aeskeygenassist $0x08, %xmm1, %xmm2 */
    73 	call key_expansion128	/* round key 4 */
    74 	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
    75 	movups	%xmm2, -16(%rsi)
    76 	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x10	/* aeskeygenassist $0x10, %xmm1, %xmm2 */
    77 	call key_expansion128	/* round key 5 */
    78 	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
    79 	movups	%xmm2, -16(%rsi)
    80 	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x20	/* aeskeygenassist $0x20, %xmm1, %xmm2 */
    81 	call key_expansion128	/* round key 6 */
    82 	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
    83 	movups	%xmm2, -16(%rsi)
    84 	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x40	/* aeskeygenassist $0x40, %xmm1, %xmm2 */
    85 	call key_expansion128	/* round key 7 */
    86 	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
    87 	movups	%xmm2, -16(%rsi)
    88 	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x80	/* aeskeygenassist $0x80, %xmm1, %xmm2 */
    89 	call key_expansion128	/* round key 8 */
    90 	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
    91 	movups	%xmm2, -16(%rsi)
    92 	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x1b	/* aeskeygenassist $0x1b, %xmm1, %xmm2 */
    93 	call key_expansion128	/* round key 9 */
    94 	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
    95 	movups	%xmm2, -16(%rsi)
    96 	.byte 0x66,0x0f,0x3a,0xdf,0xd1,0x36	/* aeskeygenassist $0x36, %xmm1, %xmm2 */
    97 	call key_expansion128	/* round key 10 (last) is stored as is: no aesimc follows */
    99 	ret	/* eax is still 0 here */
   100 	.size intel_aes_decrypt_init_128, .-intel_aes_decrypt_init_128
   103 	.type key_expansion128,@function	/* file-local helper (no .globl): one AES-128 key-schedule step */
   104 	.align	16	/* in: xmm1 = previous round key {k0,k1,k2,k3} (dwords, low to high), xmm2 = aeskeygenassist result, eax = 0 in both callers in this file, rsi = store cursor */
   105 key_expansion128:	/* out: xmm1 = next round key, also stored at (%rsi); rsi += 16; xmm2 and xmm3 are overwritten */
   106 	movd	%eax, %xmm3	/* xmm3 = {eax,0,0,0}; all zero when eax = 0 */
   107 	pshufd	$0xff, %xmm2, %xmm2	/* xmm2 = {t,t,t,t}, t = dword 3 of the aeskeygenassist result */
   108 	shufps	$0x10, %xmm1, %xmm3	/* xmm3 = {0,0,k1,k0} (given eax = 0) */
   109 	pxor	%xmm3, %xmm1	/* xmm1 = {k0, k1, k2^k1, k3^k0} */
   110 	shufps	$0x8c, %xmm1, %xmm3	/* xmm3 = {0, k0, k0, k2^k1} */
   111 	pxor	%xmm2, %xmm1	/* fold t into every dword */
   112 	pxor	%xmm3, %xmm1	/* xmm1 = {k0^t, k1^k0^t, k2^k1^k0^t, k3^k2^k1^k0^t} */
   113 	movdqu	%xmm1, (%rsi)	/* store the new round key (unaligned store) */
   114 	addq	$16, %rsi	/* advance the cursor to the next round-key slot */
   115 	ret
   116 	.size key_expansion128, .-key_expansion128
   119 /* in %rdi : cx - context (eleven 16-byte round keys are read starting at cx + 48)
   120    in %rsi : output - pointer to output buffer
   121    in %rdx : outputLen - pointer to variable for length of output
   122              (filled by caller)
   123    in %rcx : maxOutputLen - length of output buffer
   124    in %r8  : input - pointer to input buffer
   125    in %r9  : inputLen - length of input buffer
   126    on stack: blocksize - AES blocksize (always 16, unused)
   127    out %eax: 0. %rdx and %rcx are never read by this routine. */
   128 	.type intel_aes_encrypt_ecb_128,@function
   129 	.globl intel_aes_encrypt_ecb_128
   130 	.align	16
   131 intel_aes_encrypt_ecb_128:
   132 //	leaq	EXPANDED_KEY_OFFSET(%rdi), %rdi
   133 	leaq	48(%rdi), %rdi	/* rdi = cx + 48 = first round key (the literal form of the line above) */
   135 	movdqu	(%rdi), %xmm2	/* xmm2 = round key 0 */
   136 	movdqu	160(%rdi), %xmm12	/* xmm12 = round key 10 (last) */
   137 	xor	%eax, %eax	/* rax = byte offset into input and output */
   138 //	cmpq	$8*16, %r9
   139 	cmpq	$128, %r9
   140 	jb	1f	/* fewer than 8 blocks: skip the unrolled loop */
   141 //	leaq	-8*16(%r9), %r11
   142 	leaq	-128(%r9), %r11	/* r11 = largest offset at which 8 whole blocks still fit */
   143 2:	movdqu	(%r8, %rax), %xmm3	/* load 8 input blocks into xmm3..xmm10 */
   144 	movdqu	16(%r8, %rax), %xmm4
   145 	movdqu	32(%r8, %rax), %xmm5
   146 	movdqu	48(%r8, %rax), %xmm6
   147 	movdqu	64(%r8, %rax), %xmm7
   148 	movdqu	80(%r8, %rax), %xmm8
   149 	movdqu	96(%r8, %rax), %xmm9
   150 	movdqu	112(%r8, %rax), %xmm10
   151 	pxor	%xmm2, %xmm3	/* xor round key 0 into every block */
   152 	pxor	%xmm2, %xmm4
   153 	pxor	%xmm2, %xmm5
   154 	pxor	%xmm2, %xmm6
   155 	pxor	%xmm2, %xmm7
   156 	pxor	%xmm2, %xmm8
   157 	pxor	%xmm2, %xmm9
   158 	pxor	%xmm2, %xmm10
   160 // complete loop unrolling
   161 	movdqu 16(%rdi), %xmm1	/* round key 1 */
   162 	movdqu 32(%rdi), %xmm11	/* round key 2 */
   163 	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
   164 	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
   165 	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
   166 	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
   167 	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
   168 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
   169 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
   170 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
   171 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
   172 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
   173 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
   174 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
   175 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
   176 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
   177 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
   178 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
   180 	movdqu 48(%rdi), %xmm1	/* round key 3 */
   181 	movdqu 64(%rdi), %xmm11	/* round key 4 */
   182 	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
   183 	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
   184 	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
   185 	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
   186 	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
   187 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
   188 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
   189 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
   190 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
   191 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
   192 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
   193 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
   194 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
   195 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
   196 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
   197 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
   199 	movdqu 80(%rdi), %xmm1	/* round key 5 */
   200 	movdqu 96(%rdi), %xmm11	/* round key 6 */
   201 	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
   202 	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
   203 	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
   204 	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
   205 	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
   206 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
   207 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
   208 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
   209 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
   210 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
   211 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
   212 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
   213 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
   214 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
   215 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
   216 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
   218 	movdqu 112(%rdi), %xmm1	/* round key 7 */
   219 	movdqu 128(%rdi), %xmm11	/* round key 8 */
   220 	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
   221 	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
   222 	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
   223 	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
   224 	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
   225 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
   226 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
   227 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
   228 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
   229 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
   230 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
   231 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
   232 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
   233 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
   234 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
   235 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
   237 	movdqu 144(%rdi), %xmm1	/* round key 9 */
   238 	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
   239 	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
   240 	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
   241 	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
   242 	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
   243 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
   244 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
   245 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
   246 	.byte 0x66,0x41,0x0f,0x38,0xdd,0xdc 	/* aesenclast 	%xmm12, %xmm3 */
   247 	.byte 0x66,0x41,0x0f,0x38,0xdd,0xe4 	/* aesenclast 	%xmm12, %xmm4 */
   248 	.byte 0x66,0x41,0x0f,0x38,0xdd,0xec 	/* aesenclast 	%xmm12, %xmm5 */
   249 	.byte 0x66,0x41,0x0f,0x38,0xdd,0xf4 	/* aesenclast 	%xmm12, %xmm6 */
   250 	.byte 0x66,0x41,0x0f,0x38,0xdd,0xfc 	/* aesenclast 	%xmm12, %xmm7 */
   251 	.byte 0x66,0x45,0x0f,0x38,0xdd,0xc4 	/* aesenclast 	%xmm12, %xmm8 */
   252 	.byte 0x66,0x45,0x0f,0x38,0xdd,0xcc 	/* aesenclast 	%xmm12, %xmm9 */
   253 	.byte 0x66,0x45,0x0f,0x38,0xdd,0xd4 	/* aesenclast 	%xmm12, %xmm10 */
   255 	movdqu	%xmm3, (%rsi, %rax)	/* store the 8 result blocks at the same offset in output */
   256 	movdqu	%xmm4, 16(%rsi, %rax)
   257 	movdqu	%xmm5, 32(%rsi, %rax)
   258 	movdqu	%xmm6, 48(%rsi, %rax)
   259 	movdqu	%xmm7, 64(%rsi, %rax)
   260 	movdqu	%xmm8, 80(%rsi, %rax)
   261 	movdqu	%xmm9, 96(%rsi, %rax)
   262 	movdqu	%xmm10, 112(%rsi, %rax)
   263 //	addq	$8*16, %rax
   264 	addq	$128, %rax
   265 	cmpq	%r11, %rax
   266 	jbe	2b	/* unsigned: repeat while rax <= inputLen - 128 */
   267 1:	cmpq	%rax, %r9
   268 	je	5f	/* nothing left (also covers inputLen == 0) */
   270 	movdqu	16(%rdi), %xmm3	/* tail: keep round keys 1..9 in xmm3..xmm11 */
   271 	movdqu	32(%rdi), %xmm4
   272 	movdqu	48(%rdi), %xmm5
   273 	movdqu	64(%rdi), %xmm6
   274 	movdqu	80(%rdi), %xmm7
   275 	movdqu	96(%rdi), %xmm8
   276 	movdqu	112(%rdi), %xmm9
   277 	movdqu	128(%rdi), %xmm10
   278 	movdqu	144(%rdi), %xmm11
   280 4:	movdqu	(%r8, %rax), %xmm1	/* one block per iteration */
   281 	pxor	%xmm2, %xmm1	/* xor round key 0 */
   282 	.byte 0x66,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm3, %xmm1 */
   283 	.byte 0x66,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm4, %xmm1 */
   284 	.byte 0x66,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm5, %xmm1 */
   285 	.byte 0x66,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm6, %xmm1 */
   286 	.byte 0x66,0x0f,0x38,0xdc,0xcf	/* aesenc	%xmm7, %xmm1 */
   287 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc8	/* aesenc	%xmm8, %xmm1 */
   288 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm9, %xmm1 */
   289 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm10, %xmm1 */
   290 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm1 */
   291 	.byte 0x66,0x41,0x0f,0x38,0xdd,0xcc	/* aesenclast %xmm12, %xmm1 */
   292 	movdqu	%xmm1, (%rsi, %rax)
   293 	addq	$16, %rax
   294 	cmpq	%rax, %r9
   295 	jne	4b	/* exits only when rax == inputLen exactly. NOTE(review): presumably the caller guarantees a multiple of 16 - confirm */
   297 5:	xor	%eax, %eax	/* return 0 */
   298 	ret
   299 	.size intel_aes_encrypt_ecb_128, .-intel_aes_encrypt_ecb_128
   302 /* in %rdi : cx - context (eleven 16-byte round keys are read starting at cx + 48; presumably the schedule written by intel_aes_decrypt_init_128 - confirm against caller)
   303    in %rsi : output - pointer to output buffer
   304    in %rdx : outputLen - pointer to variable for length of output
   305              (filled by caller)
   306    in %rcx : maxOutputLen - length of output buffer
   307    in %r8  : input - pointer to input buffer
   308    in %r9  : inputLen - length of input buffer
   309    on stack: blocksize - AES blocksize (always 16, unused)
   310    out %eax: 0. %rdx and %rcx are never read by this routine. */
   311 	.type intel_aes_decrypt_ecb_128,@function
   312 	.globl intel_aes_decrypt_ecb_128
   313 	.align	16
   314 intel_aes_decrypt_ecb_128:
   315 //	leaq	EXPANDED_KEY_OFFSET(%rdi), %rdi
   316 	leaq	48(%rdi), %rdi	/* rdi = cx + 48 = first round key (the literal form of the line above) */
   318 	movdqu	(%rdi), %xmm2	/* xmm2 = round key 0, consumed last by aesdeclast */
   319 	movdqu	160(%rdi), %xmm12	/* xmm12 = round key 10, xored in first */
   320 	xorl	%eax, %eax	/* rax = byte offset into input and output */
   321 //	cmpq	$8*16, %r9
   322 	cmpq	$128, %r9
   323 	jb	1f	/* fewer than 8 blocks: skip the unrolled loop */
   324 //	leaq	-8*16(%r9), %r11
   325 	leaq	-128(%r9), %r11	/* r11 = largest offset at which 8 whole blocks still fit */
   326 2:	movdqu	(%r8, %rax), %xmm3	/* load 8 input blocks into xmm3..xmm10 */
   327 	movdqu	16(%r8, %rax), %xmm4
   328 	movdqu	32(%r8, %rax), %xmm5
   329 	movdqu	48(%r8, %rax), %xmm6
   330 	movdqu	64(%r8, %rax), %xmm7
   331 	movdqu	80(%r8, %rax), %xmm8
   332 	movdqu	96(%r8, %rax), %xmm9
   333 	movdqu	112(%r8, %rax), %xmm10
   334 	pxor	%xmm12, %xmm3	/* xor round key 10 into every block */
   335 	pxor	%xmm12, %xmm4
   336 	pxor	%xmm12, %xmm5
   337 	pxor	%xmm12, %xmm6
   338 	pxor	%xmm12, %xmm7
   339 	pxor	%xmm12, %xmm8
   340 	pxor	%xmm12, %xmm9
   341 	pxor	%xmm12, %xmm10
   343 // complete loop unrolling
   344 	movdqu 144(%rdi), %xmm1	/* round key 9 */
   345 	movdqu 128(%rdi), %xmm11	/* round key 8 */
   346 	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   347 	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   348 	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   349 	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   350 	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   351 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   352 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   353 	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   354 	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   355 	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   356 	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   357 	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   358 	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   359 	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   360 	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   361 	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   363 	movdqu 112(%rdi), %xmm1	/* round key 7 */
   364 	movdqu 96(%rdi), %xmm11	/* round key 6 */
   365 	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   366 	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   367 	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   368 	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   369 	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   370 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   371 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   372 	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   373 	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   374 	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   375 	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   376 	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   377 	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   378 	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   379 	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   380 	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   382 	movdqu 80(%rdi), %xmm1	/* round key 5 */
   383 	movdqu 64(%rdi), %xmm11	/* round key 4 */
   384 	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   385 	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   386 	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   387 	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   388 	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   389 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   390 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   391 	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   392 	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   393 	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   394 	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   395 	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   396 	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   397 	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   398 	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   399 	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   401 	movdqu 48(%rdi), %xmm1	/* round key 3 */
   402 	movdqu 32(%rdi), %xmm11	/* round key 2 */
   403 	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   404 	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   405 	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   406 	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   407 	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   408 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   409 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   410 	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   411 	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   412 	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   413 	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   414 	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   415 	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   416 	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   417 	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   418 	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   420 	movdqu 16(%rdi), %xmm1	/* round key 1 */
   421 	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   422 	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   423 	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   424 	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   425 	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   426 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   427 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   428 	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   429 	.byte 0x66,0x0f,0x38,0xdf,0xda		/* aesdeclast 	%xmm2, %xmm3 */
   430 	.byte 0x66,0x0f,0x38,0xdf,0xe2		/* aesdeclast 	%xmm2, %xmm4 */
   431 	.byte 0x66,0x0f,0x38,0xdf,0xea		/* aesdeclast 	%xmm2, %xmm5 */
   432 	.byte 0x66,0x0f,0x38,0xdf,0xf2		/* aesdeclast 	%xmm2, %xmm6 */
   433 	.byte 0x66,0x0f,0x38,0xdf,0xfa		/* aesdeclast 	%xmm2, %xmm7 */
   434 	.byte 0x66,0x44,0x0f,0x38,0xdf,0xc2	/* aesdeclast 	%xmm2, %xmm8 */
   435 	.byte 0x66,0x44,0x0f,0x38,0xdf,0xca	/* aesdeclast 	%xmm2, %xmm9 */
   436 	.byte 0x66,0x44,0x0f,0x38,0xdf,0xd2	/* aesdeclast 	%xmm2, %xmm10 */
   438 	movdqu	%xmm3, (%rsi, %rax)	/* store the 8 result blocks at the same offset in output */
   439 	movdqu	%xmm4, 16(%rsi, %rax)
   440 	movdqu	%xmm5, 32(%rsi, %rax)
   441 	movdqu	%xmm6, 48(%rsi, %rax)
   442 	movdqu	%xmm7, 64(%rsi, %rax)
   443 	movdqu	%xmm8, 80(%rsi, %rax)
   444 	movdqu	%xmm9, 96(%rsi, %rax)
   445 	movdqu	%xmm10, 112(%rsi, %rax)
   446 //	addq	$8*16, %rax
   447 	addq	$128, %rax
   448 	cmpq	%r11, %rax
   449 	jbe	2b	/* unsigned: repeat while rax <= inputLen - 128 */
   450 1:	cmpq	%rax, %r9
   451 	je	5f	/* nothing left (also covers inputLen == 0) */
   453 	movdqu	16(%rdi), %xmm3	/* tail: keep round keys 1..9 in xmm3..xmm11 */
   454 	movdqu	32(%rdi), %xmm4
   455 	movdqu	48(%rdi), %xmm5
   456 	movdqu	64(%rdi), %xmm6
   457 	movdqu	80(%rdi), %xmm7
   458 	movdqu	96(%rdi), %xmm8
   459 	movdqu	112(%rdi), %xmm9
   460 	movdqu	128(%rdi), %xmm10
   461 	movdqu	144(%rdi), %xmm11
   463 4:	movdqu	(%r8, %rax), %xmm1	/* one block per iteration; keys are applied from 9 (xmm11) down to 1 (xmm3) */
   464 	pxor	%xmm12, %xmm1	/* xor round key 10 */
   465 	.byte 0x66,0x41,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm1 */
   466 	.byte 0x66,0x41,0x0f,0x38,0xde,0xca	/* aesdec	%xmm10, %xmm1 */
   467 	.byte 0x66,0x41,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm9, %xmm1 */
   468 	.byte 0x66,0x41,0x0f,0x38,0xde,0xc8	/* aesdec	%xmm8, %xmm1 */
   469 	.byte 0x66,0x0f,0x38,0xde,0xcf	/* aesdec	%xmm7, %xmm1 */
   470 	.byte 0x66,0x0f,0x38,0xde,0xce	/* aesdec	%xmm6, %xmm1 */
   471 	.byte 0x66,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm5, %xmm1 */
   472 	.byte 0x66,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm4, %xmm1 */
   473 	.byte 0x66,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm3, %xmm1 */
   474 	.byte 0x66,0x0f,0x38,0xdf,0xca	/* aesdeclast %xmm2, %xmm1 */
   475 	movdqu	%xmm1, (%rsi, %rax)
   476 	addq	$16, %rax
   477 	cmpq	%rax, %r9
   478 	jne	4b	/* exits only when rax == inputLen exactly. NOTE(review): presumably the caller guarantees a multiple of 16 - confirm */
   480 5:	xor	%eax, %eax	/* return 0 */
   481 	ret
   482 	.size intel_aes_decrypt_ecb_128, .-intel_aes_decrypt_ecb_128
   485 /* in %rdi : cx - context (IV is read from and written back to cx + 16; round keys are read from cx + 48)
   486    in %rsi : output - pointer to output buffer
   487    in %rdx : outputLen - pointer to variable for length of output
   488              (filled by caller)
   489    in %rcx : maxOutputLen - length of output buffer
   490    in %r8  : input - pointer to input buffer
   491    in %r9  : inputLen - length of input buffer
   492    on stack: blocksize - AES blocksize (always 16, unused)
   493    out %eax: 0. %rdx is never read as outputLen: it is overwritten with cx + 16 when inputLen != 0. %rcx is never read. */
   494 	.type intel_aes_encrypt_cbc_128,@function
   495 	.globl intel_aes_encrypt_cbc_128
   496 	.align	16
   497 intel_aes_encrypt_cbc_128:
   498 	testq	%r9, %r9
   499 	je	2f	/* inputLen == 0: return 0 without touching the IV */
   501 //	leaq	IV_OFFSET(%rdi), %rdx
   502 //	leaq	EXPANDED_KEY_OFFSET(%rdi), %rdi
   503 	leaq	16(%rdi), %rdx	/* rdx = cx + 16 = IV (the literal form of the IV_OFFSET line above) */
   504 	leaq	48(%rdi), %rdi	/* rdi = cx + 48 = first round key */
   506 	movdqu	(%rdx), %xmm0	/* xmm0 = chaining value, initially the IV */
   507 	movdqu	(%rdi), %xmm2	/* xmm2..xmm12 = round keys 0..10 */
   508 	movdqu	16(%rdi), %xmm3
   509 	movdqu	32(%rdi), %xmm4
   510 	movdqu	48(%rdi), %xmm5
   511 	movdqu	64(%rdi), %xmm6
   512 	movdqu	80(%rdi), %xmm7
   513 	movdqu	96(%rdi), %xmm8
   514 	movdqu	112(%rdi), %xmm9
   515 	movdqu	128(%rdi), %xmm10
   516 	movdqu	144(%rdi), %xmm11
   517 	movdqu	160(%rdi), %xmm12
   519 	xorl	%eax, %eax	/* rax = byte offset into input and output */
   520 1:	movdqu	(%r8, %rax), %xmm1	/* one block per iteration */
   521 	pxor	%xmm0, %xmm1	/* xor in the chaining value (IV or previous output block) */
   522 	pxor	%xmm2, %xmm1	/* xor round key 0 */
   523 	.byte 0x66,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm3, %xmm1 */
   524 	.byte 0x66,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm4, %xmm1 */
   525 	.byte 0x66,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm5, %xmm1 */
   526 	.byte 0x66,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm6, %xmm1 */
   527 	.byte 0x66,0x0f,0x38,0xdc,0xcf	/* aesenc	%xmm7, %xmm1 */
   528 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc8	/* aesenc	%xmm8, %xmm1 */
   529 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm9, %xmm1 */
   530 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm10, %xmm1 */
   531 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm1 */
   532 	.byte 0x66,0x41,0x0f,0x38,0xdd,0xcc	/* aesenclast %xmm12, %xmm1 */
   533 	movdqu	%xmm1, (%rsi, %rax)
   534 	movdqa	%xmm1, %xmm0	/* this output block becomes the next chaining value */
   535 	addq	$16, %rax
   536 	cmpq	%rax, %r9
   537 	jne	1b	/* exits only when rax == inputLen exactly. NOTE(review): presumably the caller guarantees a multiple of 16 - confirm */
   539 	movdqu	%xmm0, (%rdx)	/* write the last output block back to cx + 16 as the new IV */
   541 2:	xor	%eax, %eax	/* return 0 */
   542 	ret
   543 	.size intel_aes_encrypt_cbc_128, .-intel_aes_encrypt_cbc_128
   546 /* in %rdi : cx - context
   547    in %rsi : output - pointer to output buffer
   548    in %rdx : outputLen - pointer to variable for length of output
   549              (filled by caller)
   550    in %rcx : maxOutputLen - length of output buffer
   551    in %r8  : input - pointer to input buffer
   552    in %r9  : inputLen - length of input buffer
   553    on stack: blocksize - AES blocksize (always 16, unused)
   554 */
   555 	.type intel_aes_decrypt_cbc_128,@function
   556 	.globl intel_aes_decrypt_cbc_128
   557 	.align	16
   558 intel_aes_decrypt_cbc_128:
   559 //	leaq	IV_OFFSET(%rdi), %rdx
   560 //	leaq	EXPANDED_KEY_OFFSET(%rdi), %rdi
   561 	leaq	16(%rdi), %rdx
   562 	leaq	48(%rdi), %rdi
   564 	movdqu	(%rdx), %xmm0   /* iv */
   565 	movdqu	(%rdi), %xmm2   /* first key block */
   566 	movdqu	160(%rdi), %xmm12 /* last key block */
   567 	xorl	%eax, %eax
   568 	cmpq	$128, %r9
   569 	jb	1f
   570 	leaq	-128(%r9), %r11
   571 2:	movdqu	(%r8, %rax), %xmm3 /* 1st data block */
   572 	movdqu	16(%r8, %rax), %xmm4 /* 2d data block */
   573 	movdqu	32(%r8, %rax), %xmm5
   574 	movdqu	48(%r8, %rax), %xmm6
   575 	movdqu	64(%r8, %rax), %xmm7
   576 	movdqu	80(%r8, %rax), %xmm8
   577 	movdqu	96(%r8, %rax), %xmm9
   578 	movdqu	112(%r8, %rax), %xmm10
   579 	pxor	%xmm12, %xmm3
   580 	pxor	%xmm12, %xmm4
   581 	pxor	%xmm12, %xmm5
   582 	pxor	%xmm12, %xmm6
   583 	pxor	%xmm12, %xmm7
   584 	pxor	%xmm12, %xmm8
   585 	pxor	%xmm12, %xmm9
   586 	pxor	%xmm12, %xmm10
   588 // complete loop unrolling
   589 	movdqu 144(%rdi), %xmm1
   590 	movdqu 128(%rdi), %xmm11
   591 	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   592 	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   593 	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   594 	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   595 	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   596 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   597 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   598 	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   599 	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   600 	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   601 	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   602 	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   603 	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   604 	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   605 	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   606 	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   608 	movdqu 112(%rdi), %xmm1
   609 	movdqu 96(%rdi), %xmm11
   610 	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   611 	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   612 	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   613 	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   614 	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   615 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   616 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   617 	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   618 	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   619 	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   620 	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   621 	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   622 	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   623 	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   624 	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   625 	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   627 	movdqu 80(%rdi), %xmm1
   628 	movdqu 64(%rdi), %xmm11
   629 	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   630 	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   631 	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   632 	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   633 	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   634 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   635 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   636 	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   637 	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   638 	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   639 	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   640 	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   641 	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   642 	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   643 	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   644 	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   646 	movdqu 48(%rdi), %xmm1
   647 	movdqu 32(%rdi), %xmm11
   648 	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   649 	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   650 	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   651 	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   652 	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   653 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   654 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   655 	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   656 	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   657 	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   658 	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   659 	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   660 	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   661 	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   662 	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   663 	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   665 	movdqu 16(%rdi), %xmm1
   666 	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   667 	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   668 	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   669 	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   670 	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   671 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   672 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   673 	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   674 	.byte 0x66,0x0f,0x38,0xdf,0xda		/* aesdeclast 	%xmm2, %xmm3 */
   675 	.byte 0x66,0x0f,0x38,0xdf,0xe2		/* aesdeclast 	%xmm2, %xmm4 */
   676 	.byte 0x66,0x0f,0x38,0xdf,0xea		/* aesdeclast 	%xmm2, %xmm5 */
   677 	.byte 0x66,0x0f,0x38,0xdf,0xf2		/* aesdeclast 	%xmm2, %xmm6 */
   678 	.byte 0x66,0x0f,0x38,0xdf,0xfa		/* aesdeclast 	%xmm2, %xmm7 */
   679 	.byte 0x66,0x44,0x0f,0x38,0xdf,0xc2	/* aesdeclast 	%xmm2, %xmm8 */
   680 	.byte 0x66,0x44,0x0f,0x38,0xdf,0xca	/* aesdeclast 	%xmm2, %xmm9 */
   681 	.byte 0x66,0x44,0x0f,0x38,0xdf,0xd2	/* aesdeclast 	%xmm2, %xmm10 */
   683  	pxor	%xmm0, %xmm3
   684 	movdqu	(%r8, %rax), %xmm0
   685 	pxor	%xmm0, %xmm4
   686 	movdqu	16(%r8, %rax), %xmm0
   687 	pxor	%xmm0, %xmm5
   688 	movdqu	32(%r8, %rax), %xmm0
   689 	pxor	%xmm0, %xmm6
   690 	movdqu	48(%r8, %rax), %xmm0
   691 	pxor	%xmm0, %xmm7
   692 	movdqu	64(%r8, %rax), %xmm0
   693 	pxor	%xmm0, %xmm8
   694 	movdqu	80(%r8, %rax), %xmm0
   695 	pxor	%xmm0, %xmm9
   696 	movdqu	96(%r8, %rax), %xmm0
   697 	pxor	%xmm0, %xmm10
   698 	movdqu	112(%r8, %rax), %xmm0
   699 	movdqu	%xmm3, (%rsi, %rax)
   700 	movdqu	%xmm4, 16(%rsi, %rax)
   701 	movdqu	%xmm5, 32(%rsi, %rax)
   702 	movdqu	%xmm6, 48(%rsi, %rax)
   703 	movdqu	%xmm7, 64(%rsi, %rax)
   704 	movdqu	%xmm8, 80(%rsi, %rax)
   705 	movdqu	%xmm9, 96(%rsi, %rax)
   706 	movdqu	%xmm10, 112(%rsi, %rax)
   707 	addq	$128, %rax
   708 	cmpq	%r11, %rax
   709 	jbe	2b
   710 1:	cmpq	%rax, %r9
   711 	je	5f
   713 	movdqu	16(%rdi), %xmm3
   714 	movdqu	32(%rdi), %xmm4
   715 	movdqu	48(%rdi), %xmm5
   716 	movdqu	64(%rdi), %xmm6
   717 	movdqu	80(%rdi), %xmm7
   718 	movdqu	96(%rdi), %xmm8
   719 	movdqu	112(%rdi), %xmm9
   720 	movdqu	128(%rdi), %xmm10
   721 	movdqu	144(%rdi), %xmm11
   723 4:	movdqu	(%r8, %rax), %xmm1
   724 	movdqa	%xmm1, %xmm13
   725 	pxor	%xmm12, %xmm1
   726 	.byte 0x66,0x41,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm1 */
   727 	.byte 0x66,0x41,0x0f,0x38,0xde,0xca	/* aesdec	%xmm10, %xmm1 */
   728 	.byte 0x66,0x41,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm9, %xmm1 */
   729 	.byte 0x66,0x41,0x0f,0x38,0xde,0xc8	/* aesdec	%xmm8, %xmm1 */
   730 	.byte 0x66,0x0f,0x38,0xde,0xcf	/* aesdec	%xmm7, %xmm1 */
   731 	.byte 0x66,0x0f,0x38,0xde,0xce	/* aesdec	%xmm6, %xmm1 */
   732 	.byte 0x66,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm5, %xmm1 */
   733 	.byte 0x66,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm4, %xmm1 */
   734 	.byte 0x66,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm3, %xmm1 */
   735 	.byte 0x66,0x0f,0x38,0xdf,0xca	/* aesdeclast %xmm2, %xmm1 */
   736 	pxor	%xmm0, %xmm1
   737 	movdqu	%xmm1, (%rsi, %rax)
   738 	movdqa	%xmm13, %xmm0
   739 	addq	$16, %rax
   740 	cmpq	%rax, %r9
   741 	jne	4b
   743 5:	movdqu	%xmm0, (%rdx)
   745 	xor	%eax, %eax
   746 	ret
   747 	.size intel_aes_decrypt_cbc_128, .-intel_aes_decrypt_cbc_128
    749 /* in %rdi : the key
    750    in %rsi : buffer for expanded key

   Expands a 24-byte (192-bit) key into the encryption key schedule.

   The 24 key bytes are copied to the start of the buffer, then
   key_expansion192 is called eight times; each call appends 24 more
   bytes.  216 bytes are therefore written in total, so the buffer must
   have room for at least 216 bytes.

   Register use: %xmm1 (schedule bytes 0..15 of the current 24-byte
   group) and the low qword of %xmm3 (bytes 16..23) carry state from one
   key_expansion192 call to the next.  %rsi is advanced past the written
   data and is not restored.
   Clobbers: %rsi, %eax, %xmm1-%xmm5, flags (%eax, %xmm4, %xmm5 and the
   flags through key_expansion192).
    751 */
    752 	.type intel_aes_encrypt_init_192,@function
    753 	.globl intel_aes_encrypt_init_192
    754 	.align	16
    755 intel_aes_encrypt_init_192:
    756 	movdqu	(%rdi), %xmm1	/* xmm1 = key bytes 0..15 */
    757 	movq	16(%rdi), %xmm3	/* xmm3 low qword = key bytes 16..23 */
    758 	movdqu	%xmm1, (%rsi)	/* schedule starts with the raw key */
    759 	movq	%xmm3, 16(%rsi)
    760 	leaq	24(%rsi), %rsi	/* rsi = next write position */
    762 	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x01	/* aeskeygenassist $0x01, %xmm3, %xmm2 */
    763 	call key_expansion192
    764 	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x02	/* aeskeygenassist $0x02, %xmm3, %xmm2 */
    765 	call key_expansion192
    766 	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x04	/* aeskeygenassist $0x04, %xmm3, %xmm2 */
    767 	call key_expansion192
    768 	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x08	/* aeskeygenassist $0x08, %xmm3, %xmm2 */
    769 	call key_expansion192
    770 	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x10	/* aeskeygenassist $0x10, %xmm3, %xmm2 */
    771 	call key_expansion192
    772 	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x20	/* aeskeygenassist $0x20, %xmm3, %xmm2 */
    773 	call key_expansion192
    774 	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x40	/* aeskeygenassist $0x40, %xmm3, %xmm2 */
    775 	call key_expansion192
    776 	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x80	/* aeskeygenassist $0x80, %xmm3, %xmm2 */
    777 	call key_expansion192
    779 	ret	/* %eax is 0 here: key_expansion192 zeroes it on every call */
    780 	.size intel_aes_encrypt_init_192, .-intel_aes_encrypt_init_192
    783 /* in %rdi : the key
    784    in %rsi : buffer for expanded key

   Expands a 24-byte (192-bit) key into the decryption key schedule.

   The expansion is the same as in intel_aes_encrypt_init_192 (the key
   itself followed by eight 24-byte key_expansion192 steps, 216 bytes
   written in total).  In addition, as soon as a 16-byte word of the
   schedule is complete it is replaced in place by its aesimc
   (InvMixColumns) transform.  This is done for the words at buffer
   offsets 16, 32, ..., 176; the words at offsets 0 and 192 are left
   untransformed.

   The transform only touches memory (and the scratch registers %xmm2 /
   %xmm4): %xmm1 and %xmm3, which feed the next expansion step, always
   keep the untransformed values.

   Clobbers: %rsi, %eax, %xmm1-%xmm5, flags (%eax, %xmm5 and the flags
   through key_expansion192).
    785 */
    786 	.type intel_aes_decrypt_init_192,@function
    787 	.globl intel_aes_decrypt_init_192
    788 	.align	16
    789 intel_aes_decrypt_init_192:
    790 	movdqu	(%rdi), %xmm1	/* xmm1 = key bytes 0..15 */
    791 	movq	16(%rdi), %xmm3	/* xmm3 low qword = key bytes 16..23 */
    792 	movdqu	%xmm1, (%rsi)	/* schedule starts with the raw key */
    793 	movq	%xmm3, 16(%rsi)
    794 	leaq	24(%rsi), %rsi	/* rsi = next write position */
    796 	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x01	/* aeskeygenassist $0x01, %xmm3, %xmm2 */
    797 	call key_expansion192
    798 	movups	-32(%rsi), %xmm2	/* rsi = buffer+48: word at offset 16 */
    799 	movups	-16(%rsi), %xmm4	/* word at offset 32 */
    800 	.byte 0x66,0x0f,0x38,0xdb,0xd2	/* aesimc	%xmm2, %xmm2 */
    801 	.byte 0x66,0x0f,0x38,0xdb,0xe4	/* aesimc	%xmm4, %xmm4 */
    802 	movups	%xmm2, -32(%rsi)
    803 	movups	%xmm4, -16(%rsi)
    804 	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x02	/* aeskeygenassist $0x02, %xmm3, %xmm2 */
    805 	call key_expansion192
    806 	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
    807 	movups	%xmm2, -24(%rsi)	/* rsi = buffer+72: word at offset 48 (xmm1 still holds it untransformed) */
    808 	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x04	/* aeskeygenassist $0x04, %xmm3, %xmm2 */
    809 	call key_expansion192
    810 	movups	-32(%rsi), %xmm2	/* rsi = buffer+96: word at offset 64 */
    811 	movups	-16(%rsi), %xmm4	/* word at offset 80 */
    812 	.byte 0x66,0x0f,0x38,0xdb,0xd2	/* aesimc	%xmm2, %xmm2 */
    813 	.byte 0x66,0x0f,0x38,0xdb,0xe4	/* aesimc	%xmm4, %xmm4 */
    814 	movups	%xmm2, -32(%rsi)
    815 	movups	%xmm4, -16(%rsi)
    816 	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x08	/* aeskeygenassist $0x08, %xmm3, %xmm2 */
    817 	call key_expansion192
    818 	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
    819 	movups	%xmm2, -24(%rsi)	/* rsi = buffer+120: word at offset 96 */
    820 	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x10	/* aeskeygenassist $0x10, %xmm3, %xmm2 */
    821 	call key_expansion192
    822 	movups	-32(%rsi), %xmm2	/* rsi = buffer+144: word at offset 112 */
    823 	movups	-16(%rsi), %xmm4	/* word at offset 128 */
    824 	.byte 0x66,0x0f,0x38,0xdb,0xd2	/* aesimc	%xmm2, %xmm2 */
    825 	.byte 0x66,0x0f,0x38,0xdb,0xe4	/* aesimc	%xmm4, %xmm4 */
    826 	movups	%xmm2, -32(%rsi)
    827 	movups	%xmm4, -16(%rsi)
    828 	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x20	/* aeskeygenassist $0x20, %xmm3, %xmm2 */
    829 	call key_expansion192
    830 	.byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
    831 	movups	%xmm2, -24(%rsi)	/* rsi = buffer+168: word at offset 144 */
    832 	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x40	/* aeskeygenassist $0x40, %xmm3, %xmm2 */
    833 	call key_expansion192
    834 	movups	-32(%rsi), %xmm2	/* rsi = buffer+192: word at offset 160 */
    835 	movups	-16(%rsi), %xmm4	/* word at offset 176 */
    836 	.byte 0x66,0x0f,0x38,0xdb,0xd2	/* aesimc	%xmm2, %xmm2 */
    837 	.byte 0x66,0x0f,0x38,0xdb,0xe4	/* aesimc	%xmm4, %xmm4 */
    838 	movups	%xmm2, -32(%rsi)
    839 	movups	%xmm4, -16(%rsi)
    840 	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x80	/* aeskeygenassist $0x80, %xmm3, %xmm2 */
    841 	call key_expansion192	/* writes offsets 192..215; the word at 192 is not transformed */
    843 	ret
    844 	.size intel_aes_decrypt_init_192, .-intel_aes_decrypt_init_192
    847 	.type key_expansion192,@function
    848 	.align	16
/* Internal helper (not .globl): produce the next 24 bytes of the 192-bit
   key schedule.  Uses a private register contract, not the C ABI.

   in  %xmm1 : previous schedule dwords a0..a3
   in  %xmm3 : previous schedule dwords b0,b1 in the low qword
   in  %xmm2 : result of aeskeygenassist on %xmm3; only dword 1 is used
   in  %rsi  : write position in the expanded-key buffer

   out %xmm1 : the four new dwords (also stored at the old (%rsi))
   out %xmm3 : the two new dwords in the low qword (also stored at the
               old 16(%rsi)); the upper qword holds leftover values
   out %rsi  : advanced by 24

   Clobbers: %eax (set to 0), %xmm2, %xmm4, %xmm5, flags.
*/
    849 key_expansion192:
    850 	pshufd	$0x55, %xmm2, %xmm2	/* broadcast dword 1 (call it t) to all lanes */
    851 	xor	%eax, %eax
    852 	movd	%eax, %xmm4	/* xmm4 = 0 */
    853 	shufps	$0x10, %xmm1, %xmm4	/* xmm4 = (0, 0, a1, a0) */
    854 	pxor	%xmm4, %xmm1	/* xmm1 = (a0, a1, a2^a1, a3^a0) */
    855 	shufps	$0x8c, %xmm1, %xmm4	/* xmm4 = (0, a0, a0, a1^a2) */
    856 	pxor	%xmm2, %xmm1
    857 	pxor	%xmm4, %xmm1	/* xmm1 = (a0^t, a0^a1^t, a0^a1^a2^t, a0^a1^a2^a3^t) */
    858 	movdqu	%xmm1, (%rsi)
    859 	addq	$16, %rsi
    861 	pshufd	$0xff, %xmm1, %xmm4	/* broadcast the last new dword (call it w) */
    862 	movd	%eax, %xmm5	/* xmm5 = 0 (eax is still 0) */
    863 	shufps	$0x00, %xmm3, %xmm5	/* xmm5 = (0, 0, b0, b0) */
    864 	shufps	$0x08, %xmm3, %xmm5	/* xmm5 = (0, b0, b0, b0) */
    865 	pxor	%xmm4, %xmm3
    866 	pxor	%xmm5, %xmm3	/* low qword of xmm3 = (b0^w, b0^b1^w) */
    867 	movq	%xmm3, (%rsi)	/* only the low 8 bytes are part of the schedule */
    868 	addq	$8, %rsi
    869 	ret
    870 	.size key_expansion192, .-key_expansion192
    873 /* in %rdi : cx - context
    874    in %rsi : output - pointer to output buffer
    875    in %rdx : outputLen - pointer to variable for length of output
    876              (filled by caller)
    877    in %rcx : maxOutputLen - length of output buffer
    878    in %r8  : input - pointer to input buffer
    879    in %r9  : inputLen - length of input buffer
    880    on stack: blocksize - AES blocksize (always 16, unused)

   AES-192 ECB encryption of inputLen bytes from input to output.

   The expanded key is read from cx + 48: the 16-byte word at offset 0
   is XORed into the input, the words at offsets 16..176 are used with
   aesenc and the word at offset 192 with aesenclast.

   While at least 128 bytes remain, eight blocks are processed per
   iteration; the rest is processed one 16-byte block at a time.  The
   single-block loop stops only when the running offset equals inputLen
   exactly, so inputLen has to be a multiple of 16.

   %rdx and %rcx are not read.  Always returns 0 in %eax.
   Clobbers: %rax, %rdi, %r11, %xmm1-%xmm14, flags.
    881 */
    882 	.type intel_aes_encrypt_ecb_192,@function
    883 	.globl intel_aes_encrypt_ecb_192
    884 	.align	16
    885 intel_aes_encrypt_ecb_192:
    886 //	leaq	EXPANDED_KEY_OFFSET(%rdi), %rdi
    887 	leaq	48(%rdi), %rdi	/* rdi = start of the expanded key inside cx */
    889 	movdqu	(%rdi), %xmm2	/* xmm2 = first round key (initial XOR) */
    890 	movdqu	192(%rdi), %xmm14	/* xmm14 = last round key (aesenclast) */
    891 	xorl	%eax, %eax	/* rax = byte offset into input/output */
    892 //	cmpq	$8*16, %r9
    893 	cmpq	$128, %r9
    894 	jb	1f	/* fewer than 8 blocks: skip the 8-way loop */
    895 //	leaq	-8*16(%r9), %r11
    896 	leaq	-128(%r9), %r11	/* r11 = last offset at which 8 blocks still fit */
    897 2:	movdqu	(%r8, %rax), %xmm3
    898 	movdqu	16(%r8, %rax), %xmm4
    899 	movdqu	32(%r8, %rax), %xmm5
    900 	movdqu	48(%r8, %rax), %xmm6
    901 	movdqu	64(%r8, %rax), %xmm7
    902 	movdqu	80(%r8, %rax), %xmm8
    903 	movdqu	96(%r8, %rax), %xmm9
    904 	movdqu	112(%r8, %rax), %xmm10
    905 	pxor	%xmm2, %xmm3
    906 	pxor	%xmm2, %xmm4
    907 	pxor	%xmm2, %xmm5
    908 	pxor	%xmm2, %xmm6
    909 	pxor	%xmm2, %xmm7
    910 	pxor	%xmm2, %xmm8
    911 	pxor	%xmm2, %xmm9
    912 	pxor	%xmm2, %xmm10
    914 // complete loop unrolling
    915 	movdqu 16(%rdi), %xmm1
    916 	movdqu 32(%rdi), %xmm11
    917 	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
    918 	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
    919 	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
    920 	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
    921 	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
    922 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
    923 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
    924 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
    925 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
    926 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
    927 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
    928 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
    929 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
    930 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
    931 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
    932 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
    934 	movdqu 48(%rdi), %xmm1
    935 	movdqu 64(%rdi), %xmm11
    936 	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
    937 	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
    938 	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
    939 	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
    940 	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
    941 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
    942 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
    943 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
    944 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
    945 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
    946 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
    947 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
    948 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
    949 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
    950 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
    951 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
    953 	movdqu 80(%rdi), %xmm1
    954 	movdqu 96(%rdi), %xmm11
    955 	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
    956 	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
    957 	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
    958 	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
    959 	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
    960 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
    961 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
    962 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
    963 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
    964 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
    965 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
    966 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
    967 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
    968 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
    969 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
    970 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
    972 	movdqu 112(%rdi), %xmm1
    973 	movdqu 128(%rdi), %xmm11
    974 	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
    975 	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
    976 	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
    977 	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
    978 	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
    979 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
    980 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
    981 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
    982 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
    983 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
    984 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
    985 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
    986 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
    987 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
    988 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
    989 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
    991 	movdqu 144(%rdi), %xmm1
    992 	movdqu 160(%rdi), %xmm11
    993 	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
    994 	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
    995 	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
    996 	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
    997 	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
    998 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
    999 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
   1000 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
   1001 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
   1002 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
   1003 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
   1004 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
   1005 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
   1006 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
   1007 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
   1008 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
   1010 	movdqu 176(%rdi), %xmm1
   1011 	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
   1012 	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
   1013 	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
   1014 	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
   1015 	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
   1016 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
   1017 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
   1018 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
   1019 	.byte 0x66,0x41,0x0f,0x38,0xdd,0xde	/* aesenclast 	%xmm14, %xmm3 */
   1020 	.byte 0x66,0x41,0x0f,0x38,0xdd,0xe6	/* aesenclast 	%xmm14, %xmm4 */
   1021 	.byte 0x66,0x41,0x0f,0x38,0xdd,0xee	/* aesenclast 	%xmm14, %xmm5 */
   1022 	.byte 0x66,0x41,0x0f,0x38,0xdd,0xf6	/* aesenclast 	%xmm14, %xmm6 */
   1023 	.byte 0x66,0x41,0x0f,0x38,0xdd,0xfe	/* aesenclast 	%xmm14, %xmm7 */
   1024 	.byte 0x66,0x45,0x0f,0x38,0xdd,0xc6	/* aesenclast 	%xmm14, %xmm8 */
   1025 	.byte 0x66,0x45,0x0f,0x38,0xdd,0xce	/* aesenclast 	%xmm14, %xmm9 */
   1026 	.byte 0x66,0x45,0x0f,0x38,0xdd,0xd6	/* aesenclast 	%xmm14, %xmm10 */
   1028 	movdqu	%xmm3, (%rsi, %rax)
   1029 	movdqu	%xmm4, 16(%rsi, %rax)
   1030 	movdqu	%xmm5, 32(%rsi, %rax)
   1031 	movdqu	%xmm6, 48(%rsi, %rax)
   1032 	movdqu	%xmm7, 64(%rsi, %rax)
   1033 	movdqu	%xmm8, 80(%rsi, %rax)
   1034 	movdqu	%xmm9, 96(%rsi, %rax)
   1035 	movdqu	%xmm10, 112(%rsi, %rax)
   1036 //	addq	$8*16, %rax
   1037 	addq	$128, %rax
   1038 	cmpq	%r11, %rax
   1039 	jbe	2b	/* repeat while another full 128-byte group fits */
   1040 1:	cmpq	%rax, %r9
   1041 	je	5f	/* nothing left */
   1043 	movdqu	16(%rdi), %xmm3	/* keep the middle round keys in xmm3..xmm13 for the tail loop */
   1044 	movdqu	32(%rdi), %xmm4
   1045 	movdqu	48(%rdi), %xmm5
   1046 	movdqu	64(%rdi), %xmm6
   1047 	movdqu	80(%rdi), %xmm7
   1048 	movdqu	96(%rdi), %xmm8
   1049 	movdqu	112(%rdi), %xmm9
   1050 	movdqu	128(%rdi), %xmm10
   1051 	movdqu	144(%rdi), %xmm11
   1052 	movdqu	160(%rdi), %xmm12
   1053 	movdqu	176(%rdi), %xmm13
   1055 4:	movdqu	(%r8, %rax), %xmm1	/* tail loop: one block per iteration */
   1056 	pxor	%xmm2, %xmm1
   1057 	.byte 0x66,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm3, %xmm1 */
   1058 	.byte 0x66,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm4, %xmm1 */
   1059 	.byte 0x66,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm5, %xmm1 */
   1060 	.byte 0x66,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm6, %xmm1 */
   1061 	.byte 0x66,0x0f,0x38,0xdc,0xcf	/* aesenc	%xmm7, %xmm1 */
   1062 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc8	/* aesenc	%xmm8, %xmm1 */
   1063 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm9, %xmm1 */
   1064 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm10, %xmm1 */
   1065 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm1 */
   1066 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm12, %xmm1 */
   1067 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm13, %xmm1 */
   1068 	.byte 0x66,0x41,0x0f,0x38,0xdd,0xce	/* aesenclast %xmm14, %xmm1 */
   1069 	movdqu	%xmm1, (%rsi, %rax)
   1070 	addq	$16, %rax
   1071 	cmpq	%rax, %r9
   1072 	jne	4b	/* exits only on exact equality with inputLen */
   1074 5:	xor	%eax, %eax	/* return 0 */
   1075 	ret
   1076 	.size intel_aes_encrypt_ecb_192, .-intel_aes_encrypt_ecb_192
   1079 /* in %rdi : cx - context
   1080    in %rsi : output - pointer to output buffer
   1081    in %rdx : outputLen - pointer to variable for length of output
   1082              (filled by caller)
   1083    in %rcx : maxOutputLen - length of output buffer
   1084    in %r8  : input - pointer to input buffer
   1085    in %r9  : inputLen - length of input buffer
   1086    on stack: blocksize - AES blocksize (always 16, unused)

   AES-192 ECB decryption of inputLen bytes from input to output.

   The expanded key is read from cx + 48 and walked backwards: the
   16-byte word at offset 192 is XORed into the input, the words at
   offsets 176 down to 16 are used with aesdec and the word at offset 0
   with aesdeclast.  aesdec needs the InvMixColumns-transformed form of
   those middle words; presumably the schedule comes from
   intel_aes_decrypt_init_192, which stores them that way - confirm
   against the caller.

   While at least 128 bytes remain, eight blocks are processed per
   iteration; the rest is processed one 16-byte block at a time.  The
   single-block loop stops only when the running offset equals inputLen
   exactly, so inputLen has to be a multiple of 16.

   %rdx and %rcx are not read.  Always returns 0 in %eax.
   Clobbers: %rax, %rdi, %r11, %xmm1-%xmm14, flags.
   1087 */
   1088 	.type intel_aes_decrypt_ecb_192,@function
   1089 	.globl intel_aes_decrypt_ecb_192
   1090 	.align	16
   1091 intel_aes_decrypt_ecb_192:
   1092 //	leaq	EXPANDED_KEY_OFFSET(%rdi), %rdi
   1093 	leaq	48(%rdi), %rdi	/* rdi = start of the expanded key inside cx */
   1095 	movdqu	(%rdi), %xmm2	/* xmm2 = word at offset 0 (used by aesdeclast) */
   1096 	movdqu	192(%rdi), %xmm14	/* xmm14 = word at offset 192 (initial XOR) */
   1097 	xorl	%eax, %eax	/* rax = byte offset into input/output */
   1098 //	cmpq	$8*16, %r9
   1099 	cmpq	$128, %r9
   1100 	jb	1f	/* fewer than 8 blocks: skip the 8-way loop */
   1101 //	leaq	-8*16(%r9), %r11
   1102 	leaq	-128(%r9), %r11	/* r11 = last offset at which 8 blocks still fit */
   1103 2:	movdqu	(%r8, %rax), %xmm3
   1104 	movdqu	16(%r8, %rax), %xmm4
   1105 	movdqu	32(%r8, %rax), %xmm5
   1106 	movdqu	48(%r8, %rax), %xmm6
   1107 	movdqu	64(%r8, %rax), %xmm7
   1108 	movdqu	80(%r8, %rax), %xmm8
   1109 	movdqu	96(%r8, %rax), %xmm9
   1110 	movdqu	112(%r8, %rax), %xmm10
   1111 	pxor	%xmm14, %xmm3
   1112 	pxor	%xmm14, %xmm4
   1113 	pxor	%xmm14, %xmm5
   1114 	pxor	%xmm14, %xmm6
   1115 	pxor	%xmm14, %xmm7
   1116 	pxor	%xmm14, %xmm8
   1117 	pxor	%xmm14, %xmm9
   1118 	pxor	%xmm14, %xmm10
   1120 // complete loop unrolling
   1121 	movdqu 176(%rdi), %xmm1
   1122 	movdqu 160(%rdi), %xmm11
   1123 	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   1124 	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   1125 	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   1126 	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   1127 	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   1128 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   1129 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   1130 	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   1131 	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   1132 	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   1133 	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   1134 	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   1135 	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   1136 	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   1137 	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   1138 	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   1140 	movdqu 144(%rdi), %xmm1
   1141 	movdqu 128(%rdi), %xmm11
   1142 	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   1143 	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   1144 	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   1145 	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   1146 	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   1147 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   1148 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   1149 	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   1150 	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   1151 	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   1152 	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   1153 	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   1154 	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   1155 	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   1156 	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   1157 	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   1159 	movdqu 112(%rdi), %xmm1
   1160 	movdqu 96(%rdi), %xmm11
   1161 	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   1162 	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   1163 	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   1164 	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   1165 	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   1166 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   1167 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   1168 	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   1169 	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   1170 	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   1171 	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   1172 	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   1173 	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   1174 	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   1175 	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   1176 	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   1178 	movdqu 80(%rdi), %xmm1
   1179 	movdqu 64(%rdi), %xmm11
   1180 	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   1181 	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   1182 	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   1183 	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   1184 	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   1185 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   1186 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   1187 	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   1188 	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   1189 	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   1190 	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   1191 	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   1192 	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   1193 	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   1194 	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   1195 	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   1197 	movdqu 48(%rdi), %xmm1
   1198 	movdqu 32(%rdi), %xmm11
   1199 	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   1200 	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   1201 	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   1202 	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   1203 	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   1204 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   1205 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   1206 	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   1207 	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   1208 	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   1209 	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   1210 	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   1211 	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   1212 	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   1213 	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   1214 	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   1216 	movdqu 16(%rdi), %xmm1
   1217 	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   1218 	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   1219 	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   1220 	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   1221 	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   1222 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   1223 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   1224 	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   1225 	.byte 0x66,0x0f,0x38,0xdf,0xda		/* aesdeclast 	%xmm2, %xmm3 */
   1226 	.byte 0x66,0x0f,0x38,0xdf,0xe2		/* aesdeclast 	%xmm2, %xmm4 */
   1227 	.byte 0x66,0x0f,0x38,0xdf,0xea		/* aesdeclast 	%xmm2, %xmm5 */
   1228 	.byte 0x66,0x0f,0x38,0xdf,0xf2		/* aesdeclast 	%xmm2, %xmm6 */
   1229 	.byte 0x66,0x0f,0x38,0xdf,0xfa		/* aesdeclast 	%xmm2, %xmm7 */
   1230 	.byte 0x66,0x44,0x0f,0x38,0xdf,0xc2	/* aesdeclast 	%xmm2, %xmm8 */
   1231 	.byte 0x66,0x44,0x0f,0x38,0xdf,0xca	/* aesdeclast 	%xmm2, %xmm9 */
   1232 	.byte 0x66,0x44,0x0f,0x38,0xdf,0xd2	/* aesdeclast 	%xmm2, %xmm10 */
   1234 	movdqu	%xmm3, (%rsi, %rax)
   1235 	movdqu	%xmm4, 16(%rsi, %rax)
   1236 	movdqu	%xmm5, 32(%rsi, %rax)
   1237 	movdqu	%xmm6, 48(%rsi, %rax)
   1238 	movdqu	%xmm7, 64(%rsi, %rax)
   1239 	movdqu	%xmm8, 80(%rsi, %rax)
   1240 	movdqu	%xmm9, 96(%rsi, %rax)
   1241 	movdqu	%xmm10, 112(%rsi, %rax)
   1242 //	addq	$8*16, %rax
   1243 	addq	$128, %rax
   1244 	cmpq	%r11, %rax
   1245 	jbe	2b	/* repeat while another full 128-byte group fits */
   1246 1:	cmpq	%rax, %r9
   1247 	je	5f	/* nothing left */
   1249 	movdqu	16(%rdi), %xmm3	/* keep the middle round keys in xmm3..xmm13 for the tail loop */
   1250 	movdqu	32(%rdi), %xmm4
   1251 	movdqu	48(%rdi), %xmm5
   1252 	movdqu	64(%rdi), %xmm6
   1253 	movdqu	80(%rdi), %xmm7
   1254 	movdqu	96(%rdi), %xmm8
   1255 	movdqu	112(%rdi), %xmm9
   1256 	movdqu	128(%rdi), %xmm10
   1257 	movdqu	144(%rdi), %xmm11
   1258 	movdqu	160(%rdi), %xmm12
   1259 	movdqu	176(%rdi), %xmm13
   1261 4:	movdqu	(%r8, %rax), %xmm1	/* tail loop: one block per iteration */
   1262 	pxor	%xmm14, %xmm1
   1263 	.byte 0x66,0x41,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm13, %xmm1 */
   1264 	.byte 0x66,0x41,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm12, %xmm1 */
   1265 	.byte 0x66,0x41,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm1 */
   1266 	.byte 0x66,0x41,0x0f,0x38,0xde,0xca	/* aesdec	%xmm10, %xmm1 */
   1267 	.byte 0x66,0x41,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm9, %xmm1 */
   1268 	.byte 0x66,0x41,0x0f,0x38,0xde,0xc8	/* aesdec	%xmm8, %xmm1 */
   1269 	.byte 0x66,0x0f,0x38,0xde,0xcf	/* aesdec	%xmm7, %xmm1 */
   1270 	.byte 0x66,0x0f,0x38,0xde,0xce	/* aesdec	%xmm6, %xmm1 */
   1271 	.byte 0x66,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm5, %xmm1 */
   1272 	.byte 0x66,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm4, %xmm1 */
   1273 	.byte 0x66,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm3, %xmm1 */
   1274 	.byte 0x66,0x0f,0x38,0xdf,0xca	/* aesdeclast %xmm2, %xmm1 */
   1275 	movdqu	%xmm1, (%rsi, %rax)
   1276 	addq	$16, %rax
   1277 	cmpq	%rax, %r9
   1278 	jne	4b	/* exits only on exact equality with inputLen */
   1280 5:	xor	%eax, %eax	/* return 0 */
   1281 	ret
   1282 	.size intel_aes_decrypt_ecb_192, .-intel_aes_decrypt_ecb_192
   1285 /* in %rdi : cx - context
   1286    in %rsi : output - pointer to output buffer
   1287    in %rdx : outputLen - pointer to variable for length of output
   1288              (filled by caller)
   1289    in %rcx : maxOutputLen - length of output buffer
   1290    in %r8  : input - pointer to input buffer
   1291    in %r9  : inputLen - length of input buffer
   1292    on stack: blocksize - AES blocksize (always 16, unused)

   AES-192 CBC encryption of inputLen bytes from input to output.

   The chaining value (IV) is the 16 bytes at cx + 16 and the expanded
   key starts at cx + 48.  Each input block is XORed with the chaining
   value, encrypted, written to output, and becomes the next chaining
   value.  After the last block the chaining value is written back to
   cx + 16.

   If inputLen is 0 the function returns at once and leaves the stored
   IV untouched.  The loop stops only when the running offset equals
   inputLen exactly, so inputLen has to be a multiple of 16.

   The incoming %rdx value is never read; the register is reused as the
   pointer to the IV.  %rcx is not read.  Always returns 0 in %eax.
   Clobbers: %rax, %rdx, %rdi, %xmm0-%xmm14, flags.
   1293 */
   1294 	.type intel_aes_encrypt_cbc_192,@function
   1295 	.globl intel_aes_encrypt_cbc_192
   1296 	.align	16
   1297 intel_aes_encrypt_cbc_192:
   1298 	testq	%r9, %r9
   1299 	je	2f	/* empty input: nothing to do */
   1301 //	leaq	IV_OFFSET(%rdi), %rdx
   1302 //	leaq	EXPANDED_KEY_OFFSET(%rdi), %rdi
   1303 	leaq	16(%rdi), %rdx	/* rdx = address of the IV inside cx */
   1304 	leaq	48(%rdi), %rdi	/* rdi = start of the expanded key inside cx */
   1306 	movdqu	(%rdx), %xmm0	/* xmm0 = chaining value */
   1307 	movdqu	(%rdi), %xmm2	/* xmm2..xmm14 = the 13 round keys */
   1308 	movdqu	16(%rdi), %xmm3
   1309 	movdqu	32(%rdi), %xmm4
   1310 	movdqu	48(%rdi), %xmm5
   1311 	movdqu	64(%rdi), %xmm6
   1312 	movdqu	80(%rdi), %xmm7
   1313 	movdqu	96(%rdi), %xmm8
   1314 	movdqu	112(%rdi), %xmm9
   1315 	movdqu	128(%rdi), %xmm10
   1316 	movdqu	144(%rdi), %xmm11
   1317 	movdqu	160(%rdi), %xmm12
   1318 	movdqu	176(%rdi), %xmm13
   1319 	movdqu	192(%rdi), %xmm14
   1321 	xorl	%eax, %eax	/* rax = byte offset into input/output */
   1322 1:	movdqu	(%r8, %rax), %xmm1
   1323 	pxor	%xmm0, %xmm1	/* mix in the previous ciphertext block (or the IV) */
   1324 	pxor	%xmm2, %xmm1	/* first round key */
   1325 	.byte 0x66,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm3, %xmm1 */
   1326 	.byte 0x66,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm4, %xmm1 */
   1327 	.byte 0x66,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm5, %xmm1 */
   1328 	.byte 0x66,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm6, %xmm1 */
   1329 	.byte 0x66,0x0f,0x38,0xdc,0xcf	/* aesenc	%xmm7, %xmm1 */
   1330 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc8	/* aesenc	%xmm8, %xmm1 */
   1331 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm9, %xmm1 */
   1332 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm10, %xmm1 */
   1333 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm1 */
   1334 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm12, %xmm1 */
   1335 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm13, %xmm1 */
   1336 	.byte 0x66,0x41,0x0f,0x38,0xdd,0xce	/* aesenclast %xmm14, %xmm1 */
   1337 	movdqu	%xmm1, (%rsi, %rax)
   1338 	movdqa	%xmm1, %xmm0	/* this ciphertext block chains into the next one */
   1339 	addq	$16, %rax
   1340 	cmpq	%rax, %r9
   1341 	jne	1b	/* exits only on exact equality with inputLen */
   1343 	movdqu	%xmm0, (%rdx)	/* store the last ciphertext block back as the IV */
   1345 2:	xor	%eax, %eax	/* return 0 */
   1346 	ret
   1347 	.size intel_aes_encrypt_cbc_192, .-intel_aes_encrypt_cbc_192
  1350 /* in %rdi : cx - context
  1351    in %rsi : output - pointer to output buffer
  1352    in %rdx : outputLen - pointer to variable for length of output
  1353              (filled by caller)
  1354    in %rcx : maxOutputLen - length of output buffer
  1355    in %r8  : input - pointer to input buffer
  1356    in %r9  : inputLen - length of input buffer
  1357    on stack: blocksize - AES blocksize (always 16, unused)

          CBC decryption using the 13 round-key slots (0..12) stored at
          cx+48 (EXPANDED_KEY_OFFSET).  The chaining block is read from
          cx+16 (IV_OFFSET) and the final chaining value is written back
          there on exit.  Eight blocks per iteration are processed while at
          least 128 bytes remain, then one block at a time.
          %rdx is overwritten on entry and %rcx is never read, so outputLen
          and maxOutputLen are ignored here.  Returns 0 in %eax.
          NOTE(review): the single-block loop ends only when the offset
          equals inputLen exactly, so inputLen is presumably always a
          multiple of 16 - confirm against the callers.
  1358 */
  1359 	.type intel_aes_decrypt_cbc_192,@function
  1360 	.globl intel_aes_decrypt_cbc_192
  1361 	.align	16
  1362 intel_aes_decrypt_cbc_192:
  1363 	leaq	16(%rdi), %rdx	/* rdx = cx + 16 (IV_OFFSET); the outputLen argument is discarded */
  1364 	leaq	48(%rdi), %rdi	/* rdi = cx + 48 (EXPANDED_KEY_OFFSET): base of the round-key slots */
  1366 	movdqu	(%rdx), %xmm0	/* xmm0 = chaining block: the IV first, later the previous ciphertext */
  1367 	movdqu	(%rdi), %xmm2	/* xmm2 = slot 0, consumed by aesdeclast */
  1368 	movdqu	192(%rdi), %xmm14	/* xmm14 = slot 12, xored in before the first aesdec */
  1369 	xorl	%eax, %eax	/* rax = byte offset into input and output */
  1370 	cmpq	$128, %r9
  1371 	jb	1f	/* fewer than 8 blocks: skip the unrolled loop */
  1372 	leaq	-128(%r9), %r11	/* r11 = largest offset at which 8 whole blocks still fit */
  1373 2:	movdqu	(%r8, %rax), %xmm3	/* load 8 ciphertext blocks into xmm3..xmm10 */
  1374 	movdqu	16(%r8, %rax), %xmm4
  1375 	movdqu	32(%r8, %rax), %xmm5
  1376 	movdqu	48(%r8, %rax), %xmm6
  1377 	movdqu	64(%r8, %rax), %xmm7
  1378 	movdqu	80(%r8, %rax), %xmm8
  1379 	movdqu	96(%r8, %rax), %xmm9
  1380 	movdqu	112(%r8, %rax), %xmm10
  1381 	pxor	%xmm14, %xmm3	/* xor slot 12 into all 8 blocks */
  1382 	pxor	%xmm14, %xmm4
  1383 	pxor	%xmm14, %xmm5
  1384 	pxor	%xmm14, %xmm6
  1385 	pxor	%xmm14, %xmm7
  1386 	pxor	%xmm14, %xmm8
  1387 	pxor	%xmm14, %xmm9
  1388 	pxor	%xmm14, %xmm10
  1390 // complete loop unrolling
  1391 	movdqu 176(%rdi), %xmm1	/* slot 11 */
  1392 	movdqu 160(%rdi), %xmm11	/* slot 10 */
  1393 	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  1394 	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  1395 	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  1396 	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  1397 	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  1398 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  1399 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  1400 	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  1401 	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  1402 	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  1403 	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  1404 	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  1405 	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  1406 	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  1407 	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  1408 	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  1410 	movdqu 144(%rdi), %xmm1	/* slot 9 */
  1411 	movdqu 128(%rdi), %xmm11	/* slot 8 */
  1412 	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  1413 	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  1414 	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  1415 	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  1416 	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  1417 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  1418 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  1419 	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  1420 	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  1421 	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  1422 	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  1423 	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  1424 	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  1425 	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  1426 	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  1427 	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  1429 	movdqu 112(%rdi), %xmm1	/* slot 7 */
  1430 	movdqu 96(%rdi), %xmm11	/* slot 6 */
  1431 	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  1432 	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  1433 	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  1434 	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  1435 	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  1436 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  1437 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  1438 	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  1439 	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  1440 	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  1441 	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  1442 	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  1443 	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  1444 	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  1445 	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  1446 	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  1448 	movdqu 80(%rdi), %xmm1	/* slot 5 */
  1449 	movdqu 64(%rdi), %xmm11	/* slot 4 */
  1450 	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  1451 	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  1452 	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  1453 	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  1454 	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  1455 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  1456 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  1457 	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  1458 	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  1459 	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  1460 	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  1461 	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  1462 	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  1463 	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  1464 	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  1465 	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  1467 	movdqu 48(%rdi), %xmm1	/* slot 3 */
  1468 	movdqu 32(%rdi), %xmm11	/* slot 2 */
  1469 	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  1470 	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  1471 	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  1472 	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  1473 	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  1474 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  1475 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  1476 	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  1477 	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  1478 	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  1479 	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  1480 	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  1481 	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  1482 	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  1483 	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  1484 	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  1486 	movdqu 16(%rdi), %xmm1	/* slot 1 */
  1487 	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  1488 	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  1489 	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  1490 	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  1491 	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  1492 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  1493 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  1494 	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  1495 	.byte 0x66,0x0f,0x38,0xdf,0xda		/* aesdeclast 	%xmm2, %xmm3 */
  1496 	.byte 0x66,0x0f,0x38,0xdf,0xe2		/* aesdeclast 	%xmm2, %xmm4 */
  1497 	.byte 0x66,0x0f,0x38,0xdf,0xea		/* aesdeclast 	%xmm2, %xmm5 */
  1498 	.byte 0x66,0x0f,0x38,0xdf,0xf2		/* aesdeclast 	%xmm2, %xmm6 */
  1499 	.byte 0x66,0x0f,0x38,0xdf,0xfa		/* aesdeclast 	%xmm2, %xmm7 */
  1500 	.byte 0x66,0x44,0x0f,0x38,0xdf,0xc2	/* aesdeclast 	%xmm2, %xmm8 */
  1501 	.byte 0x66,0x44,0x0f,0x38,0xdf,0xca	/* aesdeclast 	%xmm2, %xmm9 */
  1502 	.byte 0x66,0x44,0x0f,0x38,0xdf,0xd2	/* aesdeclast 	%xmm2, %xmm10 */
  1504  	pxor	%xmm0, %xmm3	/* CBC: block 0 is xored with the chaining block carried in xmm0 */
  1505 	movdqu	(%r8, %rax), %xmm0	/* re-read each ciphertext block from input: it chains into the next block */
  1506 	pxor	%xmm0, %xmm4
  1507 	movdqu	16(%r8, %rax), %xmm0
  1508 	pxor	%xmm0, %xmm5
  1509 	movdqu	32(%r8, %rax), %xmm0
  1510 	pxor	%xmm0, %xmm6
  1511 	movdqu	48(%r8, %rax), %xmm0
  1512 	pxor	%xmm0, %xmm7
  1513 	movdqu	64(%r8, %rax), %xmm0
  1514 	pxor	%xmm0, %xmm8
  1515 	movdqu	80(%r8, %rax), %xmm0
  1516 	pxor	%xmm0, %xmm9
  1517 	movdqu	96(%r8, %rax), %xmm0
  1518 	pxor	%xmm0, %xmm10
  1519 	movdqu	112(%r8, %rax), %xmm0	/* xmm0 = last ciphertext block of this batch, carried forward */
  1520 	movdqu	%xmm3, (%rsi, %rax)	/* stores come only after every input re-read above */
  1521 	movdqu	%xmm4, 16(%rsi, %rax)
  1522 	movdqu	%xmm5, 32(%rsi, %rax)
  1523 	movdqu	%xmm6, 48(%rsi, %rax)
  1524 	movdqu	%xmm7, 64(%rsi, %rax)
  1525 	movdqu	%xmm8, 80(%rsi, %rax)
  1526 	movdqu	%xmm9, 96(%rsi, %rax)
  1527 	movdqu	%xmm10, 112(%rsi, %rax)
  1528 	addq	$128, %rax
  1529 	cmpq	%r11, %rax
  1530 	jbe	2b	/* unsigned: repeat while rax <= inputLen - 128 */
  1531 1:	cmpq	%rax, %r9
  1532 	je	5f	/* nothing left for the single-block loop */
  1534 	movdqu	16(%rdi), %xmm3	/* cache slots 1..11 in xmm3..xmm13 for the single-block loop */
  1535 	movdqu	32(%rdi), %xmm4
  1536 	movdqu	48(%rdi), %xmm5
  1537 	movdqu	64(%rdi), %xmm6
  1538 	movdqu	80(%rdi), %xmm7
  1539 	movdqu	96(%rdi), %xmm8
  1540 	movdqu	112(%rdi), %xmm9
  1541 	movdqu	128(%rdi), %xmm10
  1542 	movdqu	144(%rdi), %xmm11
  1543 	movdqu	160(%rdi), %xmm12
  1544 	movdqu	176(%rdi), %xmm13
  1546 4:	movdqu	(%r8, %rax), %xmm1
  1547 	movdqa	%xmm1, %xmm15	/* keep the ciphertext: it becomes the next chaining block */
  1548 	pxor	%xmm14, %xmm1	/* xor slot 12 */
  1549 	.byte 0x66,0x41,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm13, %xmm1 */
  1550 	.byte 0x66,0x41,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm12, %xmm1 */
  1551 	.byte 0x66,0x41,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm1 */
  1552 	.byte 0x66,0x41,0x0f,0x38,0xde,0xca	/* aesdec	%xmm10, %xmm1 */
  1553 	.byte 0x66,0x41,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm9, %xmm1 */
  1554 	.byte 0x66,0x41,0x0f,0x38,0xde,0xc8	/* aesdec	%xmm8, %xmm1 */
  1555 	.byte 0x66,0x0f,0x38,0xde,0xcf	/* aesdec	%xmm7, %xmm1 */
  1556 	.byte 0x66,0x0f,0x38,0xde,0xce	/* aesdec	%xmm6, %xmm1 */
  1557 	.byte 0x66,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm5, %xmm1 */
  1558 	.byte 0x66,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm4, %xmm1 */
  1559 	.byte 0x66,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm3, %xmm1 */
  1560 	.byte 0x66,0x0f,0x38,0xdf,0xca	/* aesdeclast %xmm2, %xmm1 */
  1561 	pxor	%xmm0, %xmm1	/* CBC: xor with the previous ciphertext block (or the IV) */
  1562 	movdqu	%xmm1, (%rsi, %rax)
  1563 	movdqa	%xmm15, %xmm0	/* this ciphertext chains into the next block */
  1564 	addq	$16, %rax
  1565 	cmpq	%rax, %r9
  1566 	jne	4b	/* leaves the loop only when rax == inputLen exactly */
  1568 5:	movdqu	%xmm0, (%rdx)	/* write the final chaining block back to cx + 16 */
  1570 	xor	%eax, %eax	/* return 0 */
  1571 	ret
  1572 	.size intel_aes_decrypt_cbc_192, .-intel_aes_decrypt_cbc_192
  1574 /* in %rdi : the key
  1575    in %rsi : buffer for expanded key

          Expands the 32-byte key at (%rdi) into 15 consecutive 16-byte
          round keys (240 bytes) written at (%rsi): the two key halves are
          copied as round keys 0 and 1, six key_expansion256 calls append
          round keys 2..13, and an inlined half step appends round key 14.
          %rsi is advanced while writing, %eax is 0 on return, and
          %xmm1..%xmm4 and %xmm6 are overwritten.
  1576 */
  1577 	.type intel_aes_encrypt_init_256,@function
  1578 	.globl intel_aes_encrypt_init_256
  1579 	.align	16
  1580 intel_aes_encrypt_init_256:
  1581 	movdqu	(%rdi), %xmm1	/* xmm1 = key bytes 0..15  (round key 0) */
  1582 	movdqu	16(%rdi), %xmm3	/* xmm3 = key bytes 16..31 (round key 1) */
  1583 	movdqu	%xmm1, (%rsi)
  1584 	movdqu	%xmm3, 16(%rsi)
  1585 	leaq	32(%rsi), %rsi	/* rsi = where round key 2 goes */
  1586 	xor	%eax, %eax	/* key_expansion256 builds its zero register from %eax */
  1588 	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x01	/* aeskeygenassist $0x01, %xmm3, %xmm2 */
  1589 	call key_expansion256
  1590 	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x02	/* aeskeygenassist $0x02, %xmm3, %xmm2 */
  1591 	call key_expansion256
  1592 	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x04	/* aeskeygenassist $0x04, %xmm3, %xmm2 */
  1593 	call key_expansion256
  1594 	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x08	/* aeskeygenassist $0x08, %xmm3, %xmm2 */
  1595 	call key_expansion256
  1596 	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x10	/* aeskeygenassist $0x10, %xmm3, %xmm2 */
  1597 	call key_expansion256
  1598 	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x20	/* aeskeygenassist $0x20, %xmm3, %xmm2 */
  1599 	call key_expansion256
  1600 	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x40	/* aeskeygenassist $0x40, %xmm3, %xmm2 */
  1601 	pxor	%xmm6, %xmm6	/* inlined first half of key_expansion256: only round key 14 is still missing */
  1602 	pshufd	$0xff, %xmm2, %xmm2	/* broadcast dword 3 of the keygenassist result */
  1603 	shufps	$0x10, %xmm1, %xmm6
  1604 	pxor	%xmm6, %xmm1
  1605 	shufps	$0x8c, %xmm1, %xmm6
  1606 	pxor	%xmm2, %xmm1
  1607 	pxor	%xmm6, %xmm1
  1608 	movdqu	%xmm1, (%rsi)	/* round key 14, at offset 224 of the caller's buffer */
  1610 	ret
  1611 	.size intel_aes_encrypt_init_256, .-intel_aes_encrypt_init_256
  1614 /* in %rdi : the key
  1615    in %rsi : buffer for expanded key

          Same expansion as intel_aes_encrypt_init_256, but round keys
          1..13 are stored after being passed through aesimc, the form the
          aesdec instruction expects.  Round keys 0 and 14 are stored
          unmodified.  The untransformed keys stay in %xmm1/%xmm3 so the
          next key_expansion256 call continues from them.
          %rsi is advanced while writing, %eax is 0 on return, and
          %xmm1..%xmm6 are overwritten.
  1616 */
  1617 	.type intel_aes_decrypt_init_256,@function
  1618 	.globl intel_aes_decrypt_init_256
  1619 	.align	16
  1620 intel_aes_decrypt_init_256:
  1621 	movdqu	(%rdi), %xmm1	/* xmm1 = key bytes 0..15  (round key 0) */
  1622 	movdqu	16(%rdi), %xmm3	/* xmm3 = key bytes 16..31 (round key 1) */
  1623 	movdqu	%xmm1, (%rsi)	/* round key 0 is stored as is */
  1624 	.byte 0x66,0x0f,0x38,0xdb,0xe3	/* aesimc	%xmm3, %xmm4 */
  1625 	movdqu	%xmm4, 16(%rsi)	/* round key 1 is stored in aesimc form */
  1626 	leaq	32(%rsi), %rsi	/* rsi = where round key 2 goes */
  1627 	xor	%eax, %eax	/* key_expansion256 builds its zero register from %eax */
  1629 	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x01	/* aeskeygenassist $0x01, %xmm3, %xmm2 */
  1630 	call key_expansion256
  1631 	.byte 0x66,0x0f,0x38,0xdb,0xe1	/* aesimc	%xmm1, %xmm4 */
  1632 	.byte 0x66,0x0f,0x38,0xdb,0xeb	/* aesimc	%xmm3, %xmm5 */
  1633 	movdqu	%xmm4, -32(%rsi)	/* overwrite the two keys key_expansion256 just stored ... */
  1634 	movdqu	%xmm5, -16(%rsi)	/* ... with their aesimc forms (rsi already points past them) */
  1635 	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x02	/* aeskeygenassist $0x02, %xmm3, %xmm2 */
  1636 	call key_expansion256
  1637 	.byte 0x66,0x0f,0x38,0xdb,0xe1	/* aesimc	%xmm1, %xmm4 */
  1638 	.byte 0x66,0x0f,0x38,0xdb,0xeb	/* aesimc	%xmm3, %xmm5 */
  1639 	movdqu	%xmm4, -32(%rsi)
  1640 	movdqu	%xmm5, -16(%rsi)
  1641 	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x04	/* aeskeygenassist $0x04, %xmm3, %xmm2 */
  1642 	call key_expansion256
  1643 	.byte 0x66,0x0f,0x38,0xdb,0xe1	/* aesimc	%xmm1, %xmm4 */
  1644 	.byte 0x66,0x0f,0x38,0xdb,0xeb	/* aesimc	%xmm3, %xmm5 */
  1645 	movdqu	%xmm4, -32(%rsi)
  1646 	movdqu	%xmm5, -16(%rsi)
  1647 	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x08	/* aeskeygenassist $0x08, %xmm3, %xmm2 */
  1648 	call key_expansion256
  1649 	.byte 0x66,0x0f,0x38,0xdb,0xe1	/* aesimc	%xmm1, %xmm4 */
  1650 	.byte 0x66,0x0f,0x38,0xdb,0xeb	/* aesimc	%xmm3, %xmm5 */
  1651 	movdqu	%xmm4, -32(%rsi)
  1652 	movdqu	%xmm5, -16(%rsi)
  1653 	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x10	/* aeskeygenassist $0x10, %xmm3, %xmm2 */
  1654 	call key_expansion256
  1655 	.byte 0x66,0x0f,0x38,0xdb,0xe1	/* aesimc	%xmm1, %xmm4 */
  1656 	.byte 0x66,0x0f,0x38,0xdb,0xeb	/* aesimc	%xmm3, %xmm5 */
  1657 	movdqu	%xmm4, -32(%rsi)
  1658 	movdqu	%xmm5, -16(%rsi)
  1659 	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x20	/* aeskeygenassist $0x20, %xmm3, %xmm2 */
  1660 	call key_expansion256
  1661 	.byte 0x66,0x0f,0x38,0xdb,0xe1	/* aesimc	%xmm1, %xmm4 */
  1662 	.byte 0x66,0x0f,0x38,0xdb,0xeb	/* aesimc	%xmm3, %xmm5 */
  1663 	movdqu	%xmm4, -32(%rsi)
  1664 	movdqu	%xmm5, -16(%rsi)
  1665 	.byte 0x66,0x0f,0x3a,0xdf,0xd3,0x40	/* aeskeygenassist $0x40, %xmm3, %xmm2 */
  1666 	pxor	%xmm6, %xmm6	/* inlined first half of key_expansion256: only round key 14 is still missing */
  1667 	pshufd	$0xff, %xmm2, %xmm2	/* broadcast dword 3 of the keygenassist result */
  1668 	shufps	$0x10, %xmm1, %xmm6
  1669 	pxor	%xmm6, %xmm1
  1670 	shufps	$0x8c, %xmm1, %xmm6
  1671 	pxor	%xmm2, %xmm1
  1672 	pxor	%xmm6, %xmm1
  1673 	movdqu	%xmm1, (%rsi)	/* round key 14 is stored without aesimc */
  1675 	ret
  1676 	.size intel_aes_decrypt_init_256, .-intel_aes_decrypt_init_256
  1679 	.type key_expansion256,@function
  1680 	.align	16
  1681 key_expansion256:
       /* File-local helper (no .globl): one AES-256 key-schedule step that
          produces the next two round keys.
          in  %xmm1 : round key two positions back (updated in place)
          in  %xmm3 : previous round key (updated in place)
          in  %xmm2 : aeskeygenassist result for %xmm3 with this step's rcon
          in  %eax  : must be 0, it seeds the zero register %xmm6
          in  %rsi  : destination; both new keys are stored there and %rsi
                      is advanced by 32
          Overwrites %xmm2, %xmm4, %xmm6 and the flags.
          With %xmm1 = [x0,x1,x2,x3] and t the broadcast word, the
          shufps/pxor sequence yields
          [x0^t, x0^x1^t, x0^x1^x2^t, x0^x1^x2^x3^t]. */
  1682 	movd	%eax, %xmm6	/* xmm6 = 0, given the caller's zeroed %eax */
  1683 	pshufd	$0xff, %xmm2, %xmm2	/* broadcast dword 3 of the keygenassist result */
  1684 	shufps	$0x10, %xmm1, %xmm6	/* xmm6 = [0, 0, x1, x0] */
  1685 	pxor	%xmm6, %xmm1
  1686 	shufps	$0x8c, %xmm1, %xmm6	/* xmm6 = [0, x0, x0, x1^x2] */
  1687 	pxor	%xmm2, %xmm1
  1688 	pxor	%xmm6, %xmm1
  1689 	movdqu	%xmm1, (%rsi)	/* store the first new round key */
  1691 	addq	$16, %rsi
  1692 	.byte 0x66,0x0f,0x3a,0xdf,0xe1,0x00	/* aeskeygenassist $0, %xmm1, %xmm4 */
  1693 	pshufd	$0xaa, %xmm4, %xmm4	/* broadcast dword 2 of the keygenassist result */
  1694 	shufps	$0x10, %xmm3, %xmm6	/* takes only dword 0 of xmm6, which is still zero */
  1695 	pxor	%xmm6, %xmm3
  1696 	shufps	$0x8c, %xmm3, %xmm6
  1697 	pxor	%xmm4, %xmm3
  1698 	pxor	%xmm6, %xmm3
  1699 	movdqu	%xmm3, (%rsi)	/* store the second new round key */
  1700 	addq	$16, %rsi
  1701 	ret
  1702 	.size key_expansion256, .-key_expansion256
  1705 /* in %rdi : cx - context
  1706    in %rsi : output - pointer to output buffer
  1707    in %rdx : outputLen - pointer to variable for length of output
  1708              (filled by caller)
  1709    in %rcx : maxOutputLen - length of output buffer
  1710    in %r8  : input - pointer to input buffer
  1711    in %r9  : inputLen - length of input buffer
  1712    on stack: blocksize - AES blocksize (always 16, unused)

          ECB encryption using the 15 round-key slots (0..14) stored at
          cx+48 (EXPANDED_KEY_OFFSET).  Eight blocks per iteration are
          processed while at least 128 bytes remain, then one block at a
          time.  %rdx and %rcx are never touched, so outputLen and
          maxOutputLen are ignored here.  Returns 0 in %eax.
          NOTE(review): the single-block loop ends only when the offset
          equals inputLen exactly, so inputLen is presumably always a
          multiple of 16 - confirm against the callers.
          NOTE(review): %xmm0 is not used anywhere in this routine, so
          slot 7 could presumably live there instead of being swapped
          through %xmm8 on every single-block iteration - confirm before
          changing.
  1713 */
  1714 	.type intel_aes_encrypt_ecb_256,@function
  1715 	.globl intel_aes_encrypt_ecb_256
  1716 	.align	16
  1717 intel_aes_encrypt_ecb_256:
  1718 //	leaq	EXPANDED_KEY_OFFSET(%rdi), %rdi
  1719 	leaq	48(%rdi), %rdi	/* rdi = cx + 48: base of the round-key slots */
  1721 	movdqu	(%rdi), %xmm2	/* xmm2 = slot 0, xored in before the first aesenc */
  1722 	movdqu	224(%rdi), %xmm15	/* xmm15 = slot 14, consumed by aesenclast */
  1723 	xorl	%eax, %eax	/* rax = byte offset into input and output */
  1724 //	cmpq	$8*16, %r9
  1725 	cmpq	$128, %r9
  1726 	jb	1f	/* fewer than 8 blocks: skip the unrolled loop */
  1727 //	leaq	-8*16(%r9), %r11
  1728 	leaq	-128(%r9), %r11	/* r11 = largest offset at which 8 whole blocks still fit */
  1729 2:	movdqu	(%r8, %rax), %xmm3	/* load 8 plaintext blocks into xmm3..xmm10 */
  1730 	movdqu	16(%r8, %rax), %xmm4
  1731 	movdqu	32(%r8, %rax), %xmm5
  1732 	movdqu	48(%r8, %rax), %xmm6
  1733 	movdqu	64(%r8, %rax), %xmm7
  1734 	movdqu	80(%r8, %rax), %xmm8
  1735 	movdqu	96(%r8, %rax), %xmm9
  1736 	movdqu	112(%r8, %rax), %xmm10
  1737 	pxor	%xmm2, %xmm3	/* xor slot 0 into all 8 blocks */
  1738 	pxor	%xmm2, %xmm4
  1739 	pxor	%xmm2, %xmm5
  1740 	pxor	%xmm2, %xmm6
  1741 	pxor	%xmm2, %xmm7
  1742 	pxor	%xmm2, %xmm8
  1743 	pxor	%xmm2, %xmm9
  1744 	pxor	%xmm2, %xmm10
  1746 // complete loop unrolling
  1747 	movdqu 16(%rdi), %xmm1	/* slot 1 */
  1748 	movdqu 32(%rdi), %xmm11	/* slot 2 */
  1749 	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
  1750 	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
  1751 	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
  1752 	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
  1753 	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
  1754 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
  1755 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
  1756 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
  1757 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
  1758 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
  1759 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
  1760 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
  1761 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
  1762 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
  1763 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
  1764 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
  1766 	movdqu 48(%rdi), %xmm1	/* slot 3 */
  1767 	movdqu 64(%rdi), %xmm11	/* slot 4 */
  1768 	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
  1769 	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
  1770 	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
  1771 	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
  1772 	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
  1773 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
  1774 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
  1775 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
  1776 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
  1777 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
  1778 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
  1779 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
  1780 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
  1781 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
  1782 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
  1783 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
  1785 	movdqu 80(%rdi), %xmm1	/* slot 5 */
  1786 	movdqu 96(%rdi), %xmm11	/* slot 6 */
  1787 	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
  1788 	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
  1789 	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
  1790 	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
  1791 	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
  1792 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
  1793 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
  1794 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
  1795 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
  1796 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
  1797 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
  1798 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
  1799 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
  1800 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
  1801 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
  1802 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
  1804 	movdqu 112(%rdi), %xmm1	/* slot 7 */
  1805 	movdqu 128(%rdi), %xmm11	/* slot 8 */
  1806 	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
  1807 	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
  1808 	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
  1809 	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
  1810 	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
  1811 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
  1812 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
  1813 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
  1814 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
  1815 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
  1816 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
  1817 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
  1818 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
  1819 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
  1820 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
  1821 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
  1823 	movdqu 144(%rdi), %xmm1	/* slot 9 */
  1824 	movdqu 160(%rdi), %xmm11	/* slot 10 */
  1825 	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
  1826 	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
  1827 	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
  1828 	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
  1829 	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
  1830 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
  1831 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
  1832 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
  1833 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
  1834 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
  1835 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
  1836 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
  1837 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
  1838 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
  1839 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
  1840 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
  1842 	movdqu 176(%rdi), %xmm1	/* slot 11 */
  1843 	movdqu 192(%rdi), %xmm11	/* slot 12 */
  1844 	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
  1845 	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
  1846 	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
  1847 	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
  1848 	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
  1849 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
  1850 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
  1851 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
  1852 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
  1853 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
  1854 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
  1855 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
  1856 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
  1857 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
  1858 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
  1859 	.byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
  1861 	movdqu 208(%rdi), %xmm1	/* slot 13 */
  1862 	.byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
  1863 	.byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
  1864 	.byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
  1865 	.byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
  1866 	.byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
  1867 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
  1868 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
  1869 	.byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
  1870 	.byte 0x66,0x41,0x0f,0x38,0xdd,0xdf	/* aesenclast 	%xmm15, %xmm3 */
  1871 	.byte 0x66,0x41,0x0f,0x38,0xdd,0xe7	/* aesenclast 	%xmm15, %xmm4 */
  1872 	.byte 0x66,0x41,0x0f,0x38,0xdd,0xef	/* aesenclast 	%xmm15, %xmm5 */
  1873 	.byte 0x66,0x41,0x0f,0x38,0xdd,0xf7	/* aesenclast 	%xmm15, %xmm6 */
  1874 	.byte 0x66,0x41,0x0f,0x38,0xdd,0xff	/* aesenclast 	%xmm15, %xmm7 */
  1875 	.byte 0x66,0x45,0x0f,0x38,0xdd,0xc7	/* aesenclast 	%xmm15, %xmm8 */
  1876 	.byte 0x66,0x45,0x0f,0x38,0xdd,0xcf	/* aesenclast 	%xmm15, %xmm9 */
  1877 	.byte 0x66,0x45,0x0f,0x38,0xdd,0xd7	/* aesenclast 	%xmm15, %xmm10 */
  1879 	movdqu	%xmm3, (%rsi, %rax)	/* store the 8 ciphertext blocks */
  1880 	movdqu	%xmm4, 16(%rsi, %rax)
  1881 	movdqu	%xmm5, 32(%rsi, %rax)
  1882 	movdqu	%xmm6, 48(%rsi, %rax)
  1883 	movdqu	%xmm7, 64(%rsi, %rax)
  1884 	movdqu	%xmm8, 80(%rsi, %rax)
  1885 	movdqu	%xmm9, 96(%rsi, %rax)
  1886 	movdqu	%xmm10, 112(%rsi, %rax)
  1887 //	addq	$8*16, %rax
  1888 	addq	$128, %rax
  1889 	cmpq	%r11, %rax
  1890 	jbe	2b	/* unsigned: repeat while rax <= inputLen - 128 */
  1891 1:	cmpq	%rax, %r9
  1892 	je	5f	/* nothing left for the single-block loop */
  1894 	movdqu	(%rdi), %xmm8	/* xmm8 = slot 0 */
  1895 	movdqu	16(%rdi), %xmm2	/* slots 1..6 go to xmm2..xmm7 */
  1896 	movdqu	32(%rdi), %xmm3
  1897 	movdqu	48(%rdi), %xmm4
  1898 	movdqu	64(%rdi), %xmm5
  1899 	movdqu	80(%rdi), %xmm6
  1900 	movdqu	96(%rdi), %xmm7
  1901 	movdqu	128(%rdi), %xmm9	/* slots 8..13 go to xmm9..xmm14; slot 7 is loaded inside the loop */
  1902 	movdqu	144(%rdi), %xmm10
  1903 	movdqu	160(%rdi), %xmm11
  1904 	movdqu	176(%rdi), %xmm12
  1905 	movdqu	192(%rdi), %xmm13
  1906 	movdqu	208(%rdi), %xmm14
  1908 4:	movdqu	(%r8, %rax), %xmm1
  1909 	pxor	%xmm8, %xmm1	/* xor slot 0 */
  1910 	movdqu	112(%rdi), %xmm8	/* xmm8 temporarily holds slot 7 */
  1911 	.byte 0x66,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm2, %xmm1 */
  1912 	.byte 0x66,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm3, %xmm1 */
  1913 	.byte 0x66,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm4, %xmm1 */
  1914 	.byte 0x66,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm5, %xmm1 */
  1915 	.byte 0x66,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm6, %xmm1 */
  1916 	.byte 0x66,0x0f,0x38,0xdc,0xcf	/* aesenc	%xmm7, %xmm1 */
  1917 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc8	/* aesenc	%xmm8, %xmm1 */
  1918 	movdqu	(%rdi), %xmm8	/* put slot 0 back for the next iteration */
  1919 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm9, %xmm1 */
  1920 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm10, %xmm1 */
  1921 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm1 */
  1922 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm12, %xmm1 */
  1923 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm13, %xmm1 */
  1924 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm14, %xmm1 */
  1925 	.byte 0x66,0x41,0x0f,0x38,0xdd,0xcf	/* aesenclast %xmm15, %xmm1 */
  1926 	movdqu	%xmm1, (%rsi, %rax)
  1927 	addq	$16, %rax
  1928 	cmpq	%rax, %r9
  1929 	jne	4b	/* leaves the loop only when rax == inputLen exactly */
  1931 5:	xor	%eax, %eax	/* return 0 */
  1932 	ret
  1933 	.size intel_aes_encrypt_ecb_256, .-intel_aes_encrypt_ecb_256
  1936 /* in %rdi : cx - context
  1937    in %rsi : output - pointer to output buffer
  1938    in %rdx : outputLen - pointer to variable for length of output
  1939              (filled by caller)
  1940    in %rcx : maxOutputLen - length of output buffer
  1941    in %r8  : input - pointer to input buffer
  1942    in %r9  : inputLen - length of input buffer
  1943    on stack: blocksize - AES blocksize (always 16, unused)
  1944 */
  1945 	.type intel_aes_decrypt_ecb_256,@function
  1946 	.globl intel_aes_decrypt_ecb_256
  1947 	.align	16
  1948 intel_aes_decrypt_ecb_256:
  1949 //	leaq	EXPANDED_KEY_OFFSET(%rdi), %rdi
  1950 	leaq	48(%rdi), %rdi
  1952 	movdqu	(%rdi), %xmm2
  1953 	movdqu	224(%rdi), %xmm15
  1954 	xorl	%eax, %eax
  1955 //	cmpq	$8*16, %r9
  1956 	cmpq	$128, %r9
  1957 	jb	1f
  1958 //	leaq	-8*16(%r9), %r11
  1959 	leaq	-128(%r9), %r11
  1960 2:	movdqu	(%r8, %rax), %xmm3
  1961 	movdqu	16(%r8, %rax), %xmm4
  1962 	movdqu	32(%r8, %rax), %xmm5
  1963 	movdqu	48(%r8, %rax), %xmm6
  1964 	movdqu	64(%r8, %rax), %xmm7
  1965 	movdqu	80(%r8, %rax), %xmm8
  1966 	movdqu	96(%r8, %rax), %xmm9
  1967 	movdqu	112(%r8, %rax), %xmm10
  1968 	pxor	%xmm15, %xmm3
  1969 	pxor	%xmm15, %xmm4
  1970 	pxor	%xmm15, %xmm5
  1971 	pxor	%xmm15, %xmm6
  1972 	pxor	%xmm15, %xmm7
  1973 	pxor	%xmm15, %xmm8
  1974 	pxor	%xmm15, %xmm9
  1975 	pxor	%xmm15, %xmm10
  1977 // complete loop unrolling
  1978 	movdqu 208(%rdi), %xmm1
  1979 	movdqu 192(%rdi), %xmm11
  1980 	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  1981 	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  1982 	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  1983 	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  1984 	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  1985 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  1986 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  1987 	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  1988 	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  1989 	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  1990 	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  1991 	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  1992 	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  1993 	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  1994 	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  1995 	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  1997 	movdqu 176(%rdi), %xmm1
  1998 	movdqu 160(%rdi), %xmm11
  1999 	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  2000 	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  2001 	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  2002 	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  2003 	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  2004 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  2005 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  2006 	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  2007 	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  2008 	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  2009 	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  2010 	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  2011 	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  2012 	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  2013 	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  2014 	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  2016 	movdqu 144(%rdi), %xmm1
  2017 	movdqu 128(%rdi), %xmm11
  2018 	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  2019 	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  2020 	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  2021 	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  2022 	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  2023 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  2024 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  2025 	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  2026 	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  2027 	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  2028 	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  2029 	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  2030 	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  2031 	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  2032 	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  2033 	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  2035 	movdqu 112(%rdi), %xmm1
  2036 	movdqu 96(%rdi), %xmm11
  2037 	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  2038 	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  2039 	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  2040 	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  2041 	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  2042 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  2043 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  2044 	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  2045 	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  2046 	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  2047 	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  2048 	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  2049 	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  2050 	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  2051 	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  2052 	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  2054 	movdqu 80(%rdi), %xmm1
  2055 	movdqu 64(%rdi), %xmm11
  2056 	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  2057 	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  2058 	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  2059 	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  2060 	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  2061 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  2062 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  2063 	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  2064 	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  2065 	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  2066 	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  2067 	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  2068 	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  2069 	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  2070 	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  2071 	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  2073 	movdqu 48(%rdi), %xmm1
  2074 	movdqu 32(%rdi), %xmm11
  2075 	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  2076 	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  2077 	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  2078 	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  2079 	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  2080 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  2081 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  2082 	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  2083 	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  2084 	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  2085 	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  2086 	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  2087 	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  2088 	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  2089 	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  2090 	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  2092 	movdqu 16(%rdi), %xmm1
  2093 	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  2094 	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  2095 	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  2096 	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  2097 	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  2098 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  2099 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  2100 	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  2101 	.byte 0x66,0x0f,0x38,0xdf,0xda		/* aesdeclast 	%xmm2, %xmm3 */
  2102 	.byte 0x66,0x0f,0x38,0xdf,0xe2		/* aesdeclast 	%xmm2, %xmm4 */
  2103 	.byte 0x66,0x0f,0x38,0xdf,0xea		/* aesdeclast 	%xmm2, %xmm5 */
  2104 	.byte 0x66,0x0f,0x38,0xdf,0xf2		/* aesdeclast 	%xmm2, %xmm6 */
  2105 	.byte 0x66,0x0f,0x38,0xdf,0xfa		/* aesdeclast 	%xmm2, %xmm7 */
  2106 	.byte 0x66,0x44,0x0f,0x38,0xdf,0xc2	/* aesdeclast 	%xmm2, %xmm8 */
  2107 	.byte 0x66,0x44,0x0f,0x38,0xdf,0xca	/* aesdeclast 	%xmm2, %xmm9 */
  2108 	.byte 0x66,0x44,0x0f,0x38,0xdf,0xd2	/* aesdeclast 	%xmm2, %xmm10 */
  2110 	movdqu	%xmm3, (%rsi, %rax)
  2111 	movdqu	%xmm4, 16(%rsi, %rax)
  2112 	movdqu	%xmm5, 32(%rsi, %rax)
  2113 	movdqu	%xmm6, 48(%rsi, %rax)
  2114 	movdqu	%xmm7, 64(%rsi, %rax)
  2115 	movdqu	%xmm8, 80(%rsi, %rax)
  2116 	movdqu	%xmm9, 96(%rsi, %rax)
  2117 	movdqu	%xmm10, 112(%rsi, %rax)
  2118 //	addq	$8*16, %rax
  2119 	addq	$128, %rax
  2120 	cmpq	%r11, %rax
  2121 	jbe	2b
  2122 1:	cmpq	%rax, %r9
  2123 	je	5f
  2125 	movdqu	16(%rdi), %xmm2
  2126 	movdqu	32(%rdi), %xmm3
  2127 	movdqu	48(%rdi), %xmm4
  2128 	movdqu	64(%rdi), %xmm5
  2129 	movdqu	80(%rdi), %xmm6
  2130 	movdqu	96(%rdi), %xmm7
  2131 	movdqu	112(%rdi), %xmm8
  2132 	movdqu	128(%rdi), %xmm9
  2133 	movdqu	144(%rdi), %xmm10
  2134 	movdqu	160(%rdi), %xmm11
  2135 	movdqu	176(%rdi), %xmm12
  2136 	movdqu	192(%rdi), %xmm13
  2137 	movdqu	208(%rdi), %xmm14
  2139 4:	movdqu	(%r8, %rax), %xmm1
  2140 	pxor	%xmm15, %xmm1
  2141 	.byte 0x66,0x41,0x0f,0x38,0xde,0xce	/* aesdec	%xmm14, %xmm1 */
  2142 	.byte 0x66,0x41,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm13, %xmm1 */
  2143 	.byte 0x66,0x41,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm12, %xmm1 */
  2144 	.byte 0x66,0x41,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm1 */
  2145 	.byte 0x66,0x41,0x0f,0x38,0xde,0xca	/* aesdec	%xmm10, %xmm1 */
  2146 	.byte 0x66,0x41,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm9, %xmm1 */
  2147 	.byte 0x66,0x41,0x0f,0x38,0xde,0xc8	/* aesdec	%xmm8, %xmm1 */
  2148 	movdqu	(%rdi), %xmm8
  2149 	.byte 0x66,0x0f,0x38,0xde,0xcf	/* aesdec	%xmm7, %xmm1 */
  2150 	.byte 0x66,0x0f,0x38,0xde,0xce	/* aesdec	%xmm6, %xmm1 */
  2151 	.byte 0x66,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm5, %xmm1 */
  2152 	.byte 0x66,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm4, %xmm1 */
  2153 	.byte 0x66,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm3, %xmm1 */
  2154 	.byte 0x66,0x0f,0x38,0xde,0xca	/* aesdec	%xmm2, %xmm1 */
  2155 	.byte 0x66,0x41,0x0f,0x38,0xdf,0xc8	/* aesdeclast %xmm8, %xmm1 */
  2156 	movdqu	112(%rdi), %xmm8
  2157 	movdqu	%xmm1, (%rsi, %rax)
  2158 	addq	$16, %rax
  2159 	cmpq	%rax, %r9
  2160 	jne	4b
  2162 5:	xor	%eax, %eax
  2163 	ret
  2164 	.size intel_aes_decrypt_ecb_256, .-intel_aes_decrypt_ecb_256
  2167 /* in %rdi : cx - context
  2168    in %rsi : output - pointer to output buffer
  2169    in %rdx : outputLen - pointer to variable for length of output
  2170              (filled by caller)
  2171    in %rcx : maxOutputLen - length of output buffer
  2172    in %r8  : input - pointer to input buffer
  2173    in %r9  : inputLen - length of input buffer
  2174    on stack: blocksize - AES blocksize (always 16, unused)
  2175 */
  2176 	.type intel_aes_encrypt_cbc_256,@function
  2177 	.globl intel_aes_encrypt_cbc_256
  2178 	.align	16
  2179 intel_aes_encrypt_cbc_256:
       	/*
       	 * CBC-encrypts inputLen (%r9) bytes from input (%r8) into output
       	 * (%rsi), one 16-byte block per loop iteration. Each block gets an
       	 * initial round-key xor, 13 aesenc rounds and one aesenclast.
       	 * Always returns 0 in %eax.
       	 *
       	 * The code below never reads maxOutputLen (%rcx) or the stack
       	 * argument, and the incoming %rdx (outputLen) is overwritten before
       	 * use, so nothing is written through outputLen here.
       	 *
       	 * The AES-NI instructions are emitted as raw .byte sequences; the
       	 * trailing comment on each one gives the intended mnemonic.
       	 *
       	 * NOTE(review): the loop only exits when %rax == %r9 exactly, so
       	 * inputLen is presumably always a multiple of 16 - confirm against
       	 * the callers.
       	 */
  2180 	testq	%r9, %r9
  2181 	je	2f			/* inputLen == 0: return without touching the IV */
  2183 //	leaq	IV_OFFSET(%rdi), %rdx
  2184 //	leaq	EXPANDED_KEY_OFFSET(%rdi), %rdi
  2185 	leaq	16(%rdi), %rdx		/* rdx = cx + 16 (IV_OFFSET): address of the IV */
  2186 	leaq	48(%rdi), %rdi		/* rdi = cx + 48 (EXPANDED_KEY_OFFSET): round keys */
       	/*
       	 * xmm0 = IV (later: previous ciphertext block).
       	 * Round keys at offsets 0..96 go to xmm8, xmm2..xmm7; offsets
       	 * 128..224 go to xmm9..xmm15. The key at offset 112 has no register
       	 * of its own: inside the loop it is loaded into xmm8 after the
       	 * offset-0 key has been consumed, and xmm8 is then reloaded with the
       	 * offset-0 key for the next iteration.
       	 */
  2188 	movdqu	(%rdx), %xmm0
  2189 	movdqu	(%rdi), %xmm8
  2190 	movdqu	16(%rdi), %xmm2
  2191 	movdqu	32(%rdi), %xmm3
  2192 	movdqu	48(%rdi), %xmm4
  2193 	movdqu	64(%rdi), %xmm5
  2194 	movdqu	80(%rdi), %xmm6
  2195 	movdqu	96(%rdi), %xmm7
  2196 	movdqu	128(%rdi), %xmm9
  2197 	movdqu	144(%rdi), %xmm10
  2198 	movdqu	160(%rdi), %xmm11
  2199 	movdqu	176(%rdi), %xmm12
  2200 	movdqu	192(%rdi), %xmm13
  2201 	movdqu	208(%rdi), %xmm14
  2202 	movdqu	224(%rdi), %xmm15
  2204 	xorl	%eax, %eax		/* rax = byte offset into input/output */
  2205 1:	movdqu	(%r8, %rax), %xmm1	/* xmm1 = next plaintext block */
  2206 	pxor	%xmm0, %xmm1		/* CBC chaining: xor with IV / previous ciphertext */
  2207 	pxor	%xmm8, %xmm1		/* initial round-key xor (key at offset 0) */
  2208 	movdqu	112(%rdi), %xmm8	/* xmm8 now holds the key at offset 112 */
  2209 	.byte 0x66,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm2, %xmm1 */
  2210 	.byte 0x66,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm3, %xmm1 */
  2211 	.byte 0x66,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm4, %xmm1 */
  2212 	.byte 0x66,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm5, %xmm1 */
  2213 	.byte 0x66,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm6, %xmm1 */
  2214 	.byte 0x66,0x0f,0x38,0xdc,0xcf	/* aesenc	%xmm7, %xmm1 */
  2215 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc8	/* aesenc	%xmm8, %xmm1 */
  2216 	movdqu	(%rdi), %xmm8		/* restore the offset-0 key for the next iteration */
  2217 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm9, %xmm1 */
  2218 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm10, %xmm1 */
  2219 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm1 */
  2220 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm12, %xmm1 */
  2221 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm13, %xmm1 */
  2222 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm14, %xmm1 */
  2223 	.byte 0x66,0x41,0x0f,0x38,0xdd,0xcf	/* aesenclast %xmm15, %xmm1 */
  2224 	movdqu	%xmm1, (%rsi, %rax)	/* store the ciphertext block */
  2225 	movdqa	%xmm1, %xmm0		/* it becomes the chaining value for the next block */
  2226 	addq	$16, %rax
  2227 	cmpq	%rax, %r9
  2228 	jne	1b			/* loop until rax == inputLen */
  2230 	movdqu	%xmm0, (%rdx)		/* write the last ciphertext block back as the new IV */
  2232 2:	xor	%eax, %eax		/* return value 0 */
  2233 	ret
  2234 	.size intel_aes_encrypt_cbc_256, .-intel_aes_encrypt_cbc_256
  2237 /* in %rdi : cx - context
  2238    in %rsi : output - pointer to output buffer
  2239    in %rdx : outputLen - pointer to variable for length of output
  2240              (filled by caller)
  2241    in %rcx : maxOutputLen - length of output buffer
  2242    in %r8  : input - pointer to input buffer
  2243    in %r9  : inputLen - length of input buffer
  2244    on stack: blocksize - AES blocksize (always 16, unused)
  2245 */
  2246 	.type intel_aes_decrypt_cbc_256,@function
  2247 	.globl intel_aes_decrypt_cbc_256
  2248 	.align	16
  2249 intel_aes_decrypt_cbc_256:
       	/*
       	 * CBC-decrypts inputLen (%r9) bytes from input (%r8) into output
       	 * (%rsi). Groups of 8 blocks (128 bytes) are handled by the unrolled
       	 * loop at label 2; any remaining blocks go through the single-block
       	 * loop at label 4. The last ciphertext block seen is written back to
       	 * cx + 16 as the new IV. Always returns 0 in %eax.
       	 *
       	 * The code below never reads maxOutputLen (%rcx) or the stack
       	 * argument, and the incoming %rdx (outputLen) is overwritten before
       	 * use, so nothing is written through outputLen here.
       	 *
       	 * The AES-NI instructions are emitted as raw .byte sequences; the
       	 * trailing comment on each one gives the intended mnemonic.
       	 *
       	 * NOTE(review): the tail loop only exits when %rax == %r9 exactly,
       	 * so inputLen is presumably always a multiple of 16 - confirm
       	 * against the callers.
       	 * NOTE(review): round keys are consumed from offset 224 down to 0
       	 * with aesdec/aesdeclast; presumably the schedule at cx + 48 was
       	 * prepared for that by the matching decrypt init routine (not
       	 * visible here) - confirm.
       	 */
  2250 //	leaq	IV_OFFSET(%rdi), %rdx
  2251 //	leaq	EXPANDED_KEY_OFFSET(%rdi), %rdi
  2252 	leaq	16(%rdi), %rdx		/* rdx = cx + 16 (IV_OFFSET): address of the IV */
  2253 	leaq	48(%rdi), %rdi		/* rdi = cx + 48 (EXPANDED_KEY_OFFSET): round keys */
  2255 	movdqu	(%rdx), %xmm0		/* xmm0 = IV (later: previous ciphertext block) */
  2256 	movdqu	(%rdi), %xmm2		/* xmm2 = key at offset 0, used by aesdeclast in the 8-block loop */
  2257 	movdqu	224(%rdi), %xmm15	/* xmm15 = key at offset 224, xored into the ciphertext first */
  2258 	xorl	%eax, %eax		/* rax = byte offset into input/output */
  2259 //	cmpq	$8*16, %r9
  2260 	cmpq	$128, %r9
  2261 	jb	1f			/* fewer than 8 blocks: go straight to the single-block path */
  2262 //	leaq	-8*16(%r9), %r11
  2263 	leaq	-128(%r9), %r11		/* r11 = largest offset at which a full 8-block group still fits */
  2264 2:	movdqu  (%r8, %rax), %xmm3
  2265 	movdqu	16(%r8, %rax), %xmm4
  2266 	movdqu	32(%r8, %rax), %xmm5
  2267 	movdqu	48(%r8, %rax), %xmm6
  2268 	movdqu	64(%r8, %rax), %xmm7
  2269 	movdqu	80(%r8, %rax), %xmm8
  2270 	movdqu	96(%r8, %rax), %xmm9
  2271 	movdqu	112(%r8, %rax), %xmm10
  2272 	pxor	%xmm15, %xmm3
  2273 	pxor	%xmm15, %xmm4
  2274 	pxor	%xmm15, %xmm5
  2275 	pxor	%xmm15, %xmm6
  2276 	pxor	%xmm15, %xmm7
  2277 	pxor	%xmm15, %xmm8
  2278 	pxor	%xmm15, %xmm9
  2279 	pxor	%xmm15, %xmm10
       	/*
       	 * The 13 aesdec rounds follow, fully unrolled. The round keys are
       	 * fetched two at a time into xmm1 and xmm11 (offsets 208/192,
       	 * 176/160, ..., 48/32, then 16 alone into xmm1) and each key is
       	 * applied to all eight blocks in xmm3..xmm10 before moving on.
       	 */
  2281 // complete loop unrolling
  2282 	movdqu 208(%rdi), %xmm1
  2283 	movdqu 192(%rdi), %xmm11
  2284 	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  2285 	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  2286 	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  2287 	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  2288 	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  2289 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  2290 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  2291 	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  2292 	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  2293 	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  2294 	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  2295 	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  2296 	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  2297 	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  2298 	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  2299 	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  2301 	movdqu 176(%rdi), %xmm1
  2302 	movdqu 160(%rdi), %xmm11
  2303 	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  2304 	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  2305 	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  2306 	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  2307 	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  2308 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  2309 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  2310 	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  2311 	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  2312 	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  2313 	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  2314 	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  2315 	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  2316 	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  2317 	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  2318 	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  2320 	movdqu 144(%rdi), %xmm1
  2321 	movdqu 128(%rdi), %xmm11
  2322 	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  2323 	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  2324 	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  2325 	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  2326 	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  2327 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  2328 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  2329 	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  2330 	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  2331 	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  2332 	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  2333 	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  2334 	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  2335 	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  2336 	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  2337 	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  2339 	movdqu 112(%rdi), %xmm1
  2340 	movdqu 96(%rdi), %xmm11
  2341 	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  2342 	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  2343 	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  2344 	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  2345 	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  2346 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  2347 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  2348 	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  2349 	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  2350 	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  2351 	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  2352 	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  2353 	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  2354 	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  2355 	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  2356 	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  2358 	movdqu 80(%rdi), %xmm1
  2359 	movdqu 64(%rdi), %xmm11
  2360 	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  2361 	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  2362 	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  2363 	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  2364 	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  2365 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  2366 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  2367 	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  2368 	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  2369 	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  2370 	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  2371 	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  2372 	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  2373 	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  2374 	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  2375 	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  2377 	movdqu 48(%rdi), %xmm1
  2378 	movdqu 32(%rdi), %xmm11
  2379 	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  2380 	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  2381 	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  2382 	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  2383 	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  2384 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  2385 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  2386 	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  2387 	.byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
  2388 	.byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
  2389 	.byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
  2390 	.byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
  2391 	.byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
  2392 	.byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
  2393 	.byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
  2394 	.byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
  2396 	movdqu 16(%rdi), %xmm1
  2397 	.byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
  2398 	.byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
  2399 	.byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
  2400 	.byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
  2401 	.byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
  2402 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
  2403 	.byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
  2404 	.byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
  2405 	.byte 0x66,0x0f,0x38,0xdf,0xda		/* aesdeclast 	%xmm2, %xmm3 */
  2406 	.byte 0x66,0x0f,0x38,0xdf,0xe2		/* aesdeclast 	%xmm2, %xmm4 */
  2407 	.byte 0x66,0x0f,0x38,0xdf,0xea		/* aesdeclast 	%xmm2, %xmm5 */
  2408 	.byte 0x66,0x0f,0x38,0xdf,0xf2		/* aesdeclast 	%xmm2, %xmm6 */
  2409 	.byte 0x66,0x0f,0x38,0xdf,0xfa		/* aesdeclast 	%xmm2, %xmm7 */
  2410 	.byte 0x66,0x44,0x0f,0x38,0xdf,0xc2	/* aesdeclast 	%xmm2, %xmm8 */
  2411 	.byte 0x66,0x44,0x0f,0x38,0xdf,0xca	/* aesdeclast 	%xmm2, %xmm9 */
  2412 	.byte 0x66,0x44,0x0f,0x38,0xdf,0xd2	/* aesdeclast 	%xmm2, %xmm10 */
       	/*
       	 * CBC chaining: the first block is xored with xmm0 (the IV or the
       	 * last ciphertext block of the previous group); every later block is
       	 * xored with the ciphertext block that precedes it, re-read from the
       	 * input buffer. All of these re-reads come before the stores to the
       	 * output below, so the data is still intact when the input and
       	 * output buffers are the same. xmm0 ends up holding the group's last
       	 * ciphertext block (offset 112) as the chaining value for what
       	 * follows.
       	 */
  2414  	pxor	%xmm0, %xmm3
  2415 	movdqu	(%r8, %rax), %xmm0
  2416 	pxor	%xmm0, %xmm4
  2417 	movdqu	16(%r8, %rax), %xmm0
  2418 	pxor	%xmm0, %xmm5
  2419 	movdqu	32(%r8, %rax), %xmm0
  2420 	pxor	%xmm0, %xmm6
  2421 	movdqu	48(%r8, %rax), %xmm0
  2422 	pxor	%xmm0, %xmm7
  2423 	movdqu	64(%r8, %rax), %xmm0
  2424 	pxor	%xmm0, %xmm8
  2425 	movdqu	80(%r8, %rax), %xmm0
  2426 	pxor	%xmm0, %xmm9
  2427 	movdqu	96(%r8, %rax), %xmm0
  2428 	pxor	%xmm0, %xmm10
  2429 	movdqu	112(%r8, %rax), %xmm0
  2430 	movdqu	%xmm3, (%rsi, %rax)
  2431 	movdqu	%xmm4, 16(%rsi, %rax)
  2432 	movdqu	%xmm5, 32(%rsi, %rax)
  2433 	movdqu	%xmm6, 48(%rsi, %rax)
  2434 	movdqu	%xmm7, 64(%rsi, %rax)
  2435 	movdqu	%xmm8, 80(%rsi, %rax)
  2436 	movdqu	%xmm9, 96(%rsi, %rax)
  2437 	movdqu	%xmm10, 112(%rsi, %rax)
  2438 //	addq	$8*16, %rax
  2439 	addq	$128, %rax
  2440 	cmpq	%r11, %rax
  2441 	jbe	2b			/* unsigned: repeat while rax <= inputLen - 128 */
  2442 1:	cmpq	%rax, %r9
  2443 	je	5f			/* nothing left over: skip the single-block path */
       	/*
       	 * Single-block path: keep the keys at offsets 16..208 resident in
       	 * xmm2..xmm14 (xmm2 no longer holds the offset-0 key from here on).
       	 * xmm8 is shared: it holds the offset-112 key for the middle round,
       	 * is reloaded with the offset-0 key for aesdeclast, and is then
       	 * restored to the offset-112 key for the next iteration.
       	 */
  2445 	movdqu	16(%rdi), %xmm2
  2446 	movdqu	32(%rdi), %xmm3
  2447 	movdqu	48(%rdi), %xmm4
  2448 	movdqu	64(%rdi), %xmm5
  2449 	movdqu	80(%rdi), %xmm6
  2450 	movdqu	96(%rdi), %xmm7
  2451 	movdqu	112(%rdi), %xmm8
  2452 	movdqu	128(%rdi), %xmm9
  2453 	movdqu	144(%rdi), %xmm10
  2454 	movdqu	160(%rdi), %xmm11
  2455 	movdqu	176(%rdi), %xmm12
  2456 	movdqu	192(%rdi), %xmm13
  2457 	movdqu	208(%rdi), %xmm14
  2459 4:	movdqu	(%r8, %rax), %xmm1	/* xmm1 = next ciphertext block */
  2460 	pxor	%xmm15, %xmm1		/* initial xor with the key at offset 224 */
  2461 	.byte 0x66,0x41,0x0f,0x38,0xde,0xce	/* aesdec	%xmm14, %xmm1 */
  2462 	.byte 0x66,0x41,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm13, %xmm1 */
  2463 	.byte 0x66,0x41,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm12, %xmm1 */
  2464 	.byte 0x66,0x41,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm1 */
  2465 	.byte 0x66,0x41,0x0f,0x38,0xde,0xca	/* aesdec	%xmm10, %xmm1 */
  2466 	.byte 0x66,0x41,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm9, %xmm1 */
  2467 	.byte 0x66,0x41,0x0f,0x38,0xde,0xc8	/* aesdec	%xmm8, %xmm1 */
  2468 	movdqu	(%rdi), %xmm8		/* xmm8 = key at offset 0, for aesdeclast below */
  2469 	.byte 0x66,0x0f,0x38,0xde,0xcf	/* aesdec	%xmm7, %xmm1 */
  2470 	.byte 0x66,0x0f,0x38,0xde,0xce	/* aesdec	%xmm6, %xmm1 */
  2471 	.byte 0x66,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm5, %xmm1 */
  2472 	.byte 0x66,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm4, %xmm1 */
  2473 	.byte 0x66,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm3, %xmm1 */
  2474 	.byte 0x66,0x0f,0x38,0xde,0xca	/* aesdec	%xmm2, %xmm1 */
  2475 	.byte 0x66,0x41,0x0f,0x38,0xdf,0xc8	/* aesdeclast %xmm8, %xmm1 */
  2476 	movdqu	112(%rdi), %xmm8	/* restore the offset-112 key for the next iteration */
  2477 	pxor	%xmm0, %xmm1		/* CBC chaining: xor with IV / previous ciphertext */
  2478 	movdqu	(%r8, %rax), %xmm0  /* fetch the IV before we store the block */
  2479 	movdqu	%xmm1, (%rsi, %rax) /* in case input buf = output buf */
  2480 	addq	$16, %rax
  2481 	cmpq	%rax, %r9
  2482 	jne	4b			/* loop until rax == inputLen */
  2484 5:	movdqu	%xmm0, (%rdx)		/* write the last ciphertext block (or the unchanged IV) back to cx + 16 */
  2486 	xor	%eax, %eax		/* return value 0 */
  2487 	ret
  2488 	.size intel_aes_decrypt_cbc_256, .-intel_aes_decrypt_cbc_256

mercurial