|
1 ; LICENSE: |
|
2 ; This submission to NSS is to be made available under the terms of the |
|
3 ; Mozilla Public License, v. 2.0. You can obtain one at http: |
|
4 ; //mozilla.org/MPL/2.0/. |
|
5 ;############################################################################### |
|
6 ; Copyright(c) 2014, Intel Corp. |
|
7 ; Developers and authors: |
|
8 ; Shay Gueron and Vlad Krasnov |
|
9 ; Intel Corporation, Israel Development Centre, Haifa, Israel |
|
10 ; Please send feedback directly to crypto.feedback.alias@intel.com |
|
11 |
|
12 |
|
; 32-bit flat memory model, C calling convention (cdecl: args on stack,
; right to left; eax/ecx/edx caller-saved). Enable SSE/AVX mnemonics.
.MODEL FLAT, C
.XMM

.DATA
ALIGN 16
; 128-bit little-endian constants used to step the CTR-mode counter by 1 and 2.
Lone        dq 1,0
Ltwo        dq 2,0
; Byte-reversal mask for vpshufb: converts 16-byte blocks between the wire
; byte order and the reflected bit/byte order GHASH arithmetic uses here.
Lbswap_mask db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
; Nibble mask; NOTE(review): not referenced anywhere in the code visible in
; this file — possibly kept for parity with a sibling implementation.
Lshuff_mask dq 0f0f0f0f0f0f0f0fh, 0f0f0f0f0f0f0f0fh
; GHASH reduction constant (the GCM polynomial in the reflected
; representation); used by every vpclmulqdq-based reduction step below.
Lpoly       dq 01h, 0c200000000000000h

.CODE
|
25 |
|
26 |
|
;-------------------------------------------------------------------------------
; GFMUL DST, SRC1, SRC2, TMP1..TMP4
; DST = SRC1 * SRC2 in GF(2^128), reduced modulo the GCM polynomial (Lpoly).
; One-block Karatsuba multiply: three 64x64 carry-less products instead of
; four, followed by a two-step folding reduction.
; All of TMP1..TMP4 are clobbered; DST may alias SRC1.
;-------------------------------------------------------------------------------
GFMUL MACRO DST, SRC1, SRC2, TMP1, TMP2, TMP3, TMP4
    vpclmulqdq  TMP1, SRC2, SRC1, 0h        ; TMP1 = lo(SRC1)*lo(SRC2)
    vpclmulqdq  TMP4, SRC2, SRC1, 011h      ; TMP4 = hi(SRC1)*hi(SRC2)

    vpshufd     TMP2, SRC2, 78              ; 78 = 01001110b: swap 64-bit halves
    vpshufd     TMP3, SRC1, 78
    vpxor       TMP2, TMP2, SRC2            ; TMP2 = hi(SRC2) ^ lo(SRC2)
    vpxor       TMP3, TMP3, SRC1            ; TMP3 = hi(SRC1) ^ lo(SRC1)

    vpclmulqdq  TMP2, TMP2, TMP3, 0h        ; middle Karatsuba product
    vpxor       TMP2, TMP2, TMP1            ; subtract (xor) the outer products
    vpxor       TMP2, TMP2, TMP4            ; TMP2 = cross terms only

    vpslldq     TMP3, TMP2, 8               ; split cross terms into the
    vpsrldq     TMP2, TMP2, 8               ; low and high 128-bit halves

    vpxor       TMP1, TMP1, TMP3            ; 256-bit product now in TMP4:TMP1
    vpxor       TMP4, TMP4, TMP2

    ; Reduce the 256-bit product modulo Lpoly: two identical fold steps,
    ; each multiplying the low half by the poly constant and rotating.
    vpclmulqdq  TMP2, TMP1, [Lpoly], 010h
    vpshufd     TMP3, TMP1, 78
    vpxor       TMP1, TMP2, TMP3

    vpclmulqdq  TMP2, TMP1, [Lpoly], 010h
    vpshufd     TMP3, TMP1, 78
    vpxor       TMP1, TMP2, TMP3

    vpxor       DST, TMP1, TMP4             ; combine with high half

ENDM
|
57 |
|
58 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
|
59 ; |
|
60 ; Generates the final GCM tag |
|
61 ; void intel_aes_gcmTAG(unsigned char Htbl[16*16], |
|
62 ; unsigned char *Tp, |
|
63 ; unsigned int Mlen, |
|
64 ; unsigned int Alen, |
|
65 ; unsigned char* X0, |
|
66 ; unsigned char* TAG); |
|
67 ; |
|
68 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
|
69 |
|
ALIGN 16
; void intel_aes_gcmTAG(Htbl, Tp, Mlen, Alen, X0, TAG)
; Folds the bit lengths of the message (Mlen) and AAD (Alen) into the running
; GHASH value *Tp, multiplies once more by H (Htbl[0]), byte-swaps, xors with
; the encrypted initial counter block *X0, and stores the 16-byte tag to *TAG.
intel_aes_gcmTAG PROC

; Register roles (cdecl scratch regs + ebx, which is saved below).
Htbl textequ <eax>
Tp   textequ <ecx>
X0   textequ <edx>
TAG  textequ <ebx>

T    textequ <xmm0>
TMP0 textequ <xmm1>

    push    ebx                             ; callee-saved; shifts args by 4

    ; Args are at [esp + 2*4 + n*4]: return address + saved ebx precede them.
    mov     Htbl, [esp + 2*4 + 0*4]
    mov     Tp,   [esp + 2*4 + 1*4]
    mov     X0,   [esp + 2*4 + 4*4]
    mov     TAG,  [esp + 2*4 + 5*4]

    vzeroupper
    vmovdqu T, XMMWORD PTR[Tp]              ; current GHASH state

    ; Build the length block: qword0 = Mlen, qword1 = Alen, both in BYTES,
    ; then shift left by 3 to convert to bit counts as GCM requires.
    vpxor   TMP0, TMP0, TMP0
    vpinsrd TMP0, TMP0, DWORD PTR[esp + 2*4 + 2*4], 0
    vpinsrd TMP0, TMP0, DWORD PTR[esp + 2*4 + 3*4], 2
    vpsllq  TMP0, TMP0, 3                   ; bytes -> bits

    vpxor   T, T, TMP0                      ; absorb length block
    vmovdqu TMP0, XMMWORD PTR[Htbl]         ; H (first H-table entry)
    GFMUL   T, T, TMP0, xmm2, xmm3, xmm4, xmm5

    vpshufb T, T, [Lbswap_mask]             ; back to wire byte order
    vpxor   T, T, [X0]                      ; xor E(K, Y0)
    vmovdqu XMMWORD PTR[TAG], T
    vzeroupper

    pop     ebx

    ret

intel_aes_gcmTAG ENDP
|
110 |
|
111 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
|
112 ; |
|
113 ; Generates the H table |
|
114 ; void intel_aes_gcmINIT(unsigned char Htbl[16*16], unsigned char *KS, int NR); |
|
115 ; |
|
116 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
|
117 |
|
ALIGN 16
; void intel_aes_gcmINIT(unsigned char Htbl[16*16], unsigned char *KS, int NR)
; Computes H = AES-ENC(K, 0) and fills the 16-entry hash table:
;   Htbl[0..7]        = H^1 .. H^8      (in the reflected/doubled domain)
;   Htbl[8+i] (i=0..7) = hi64(H^(i+1)) ^ lo64(H^(i+1)) replicated — the
;                        precomputed Karatsuba "middle" operands.
intel_aes_gcmINIT PROC

Htbl textequ <eax>
KS   textequ <ecx>                          ; AES key schedule (expanded)
NR   textequ <edx>                          ; number of AES rounds (10/12/14)

T    textequ <xmm0>
TMP0 textequ <xmm1>

    ; No registers pushed yet: args start at [esp + 4].
    mov     Htbl, [esp + 4*1 + 0*4]
    mov     KS,   [esp + 4*1 + 1*4]
    mov     NR,   [esp + 4*1 + 2*4]

    vzeroupper
    ; AES-ENC(0): encrypt the all-zero block to obtain the hash key H.
    vmovdqu T, XMMWORD PTR[KS]              ; whitening key (round 0)
    lea     KS, [16 + KS]
    dec     NR
Lenc_loop:
    vaesenc T, T, [KS]
    lea     KS, [16 + KS]
    dec     NR
    jnz     Lenc_loop

    vaesenclast T, T, [KS]
    vpshufb T, T, [Lbswap_mask]             ; into GHASH byte order

    ; Calculate H` = GFMUL(H, 2): multiply by x (a GF doubling) so later
    ; products can use the cheaper reflected reduction. Carry propagation
    ; across the 128-bit value is done with shift/permute/mask.
    vpsrad  xmm3, T, 31                     ; sign-extend each dword
    vpshufd xmm3, xmm3, 0ffh                ; broadcast top bit of T
    vpand   xmm5, xmm3, [Lpoly]             ; conditional poly addend
    vpsrld  xmm3, T, 31                     ; inter-dword carry bits
    vpslld  xmm4, T, 1                      ; shift left by one bit
    vpslldq xmm3, xmm3, 4                   ; move carries up one dword
    vpxor   T, xmm4, xmm3
    vpxor   T, T, xmm5                      ; fold in poly if top bit was set

    vmovdqu TMP0, T                         ; TMP0 = H (doubled), multiplier
    vmovdqu XMMWORD PTR[Htbl + 0*16], T     ; Htbl[0] = H^1

    ; Karatsuba helper for H^1: (hi ^ lo) stored in the second table half.
    vpshufd xmm2, T, 78
    vpxor   xmm2, xmm2, T
    vmovdqu XMMWORD PTR[Htbl + 8*16 + 0*16], xmm2

    ; Assembly-time loop: unrolled 7 times, producing H^2 .. H^8 and their
    ; Karatsuba helpers.
i = 1
WHILE i LT 8
    GFMUL   T, T, TMP0, xmm2, xmm3, xmm4, xmm5
    vmovdqu XMMWORD PTR[Htbl + i*16], T     ; Htbl[i] = H^(i+1)
    vpshufd xmm2, T, 78
    vpxor   xmm2, xmm2, T
    vmovdqu XMMWORD PTR[Htbl + 8*16 + i*16], xmm2
i = i+1
ENDM
    vzeroupper
    ret
intel_aes_gcmINIT ENDP
|
175 |
|
176 |
|
177 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
|
178 ; |
|
179 ; Authenticate only |
|
180 ; void intel_aes_gcmAAD(unsigned char Htbl[16*16], unsigned char *AAD, unsigned int Alen, unsigned char *Tp); |
|
181 ; |
|
182 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
|
183 |
|
ALIGN 16
; void intel_aes_gcmAAD(unsigned char Htbl[16*16], unsigned char *AAD,
;                       unsigned int Alen, unsigned char *Tp)
; Hashes Alen bytes of additional authenticated data into the GHASH state
; *Tp using 8-block aggregation: first the Alen%128 prefix blocks, then the
; remaining multiple-of-128 bulk with powers H^1..H^8 from Htbl.
; NOTE(review): Alen is assumed to be a multiple of 16 here — the byte loops
; handle no partial block; confirm against the caller.
intel_aes_gcmAAD PROC

Htbl textequ <eax>
inp  textequ <ecx>
len  textequ <edx>
Tp   textequ <ebx>
hlp0 textequ <esi>

DATA textequ <xmm0>
T    textequ <xmm1>                         ; running GHASH accumulator (low)
TMP0 textequ <xmm2>                         ; aggregated lo*lo products
TMP1 textequ <xmm3>                         ; aggregated hi*hi products
TMP2 textequ <xmm4>                         ; aggregated middle products
TMP3 textequ <xmm5>
TMP4 textequ <xmm6>
Xhi  textequ <xmm7>                         ; high half awaiting reduction

; Accumulate one block (DATA) times H-table entry i into TMP0/TMP1/TMP2.
KARATSUBA_AAD MACRO i
    vpclmulqdq  TMP3, DATA, [Htbl + i*16], 0h
    vpxor       TMP0, TMP0, TMP3
    vpclmulqdq  TMP3, DATA, [Htbl + i*16], 011h
    vpxor       TMP1, TMP1, TMP3
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP3, TMP3, [Htbl + 8*16 + i*16], 0h
    vpxor       TMP2, TMP2, TMP3
ENDM

    ; Early-out when Alen == 0. No registers are pushed yet, so the third
    ; argument lives at [esp + 1*4 + 2*4] (return address + two args before
    ; it). BUGFIX: this previously read [esp + 1*3 + 2*4] = esp+11, a
    ; misaligned dword mixing the AAD pointer's top byte with Alen, so the
    ; early-out could be skipped (corrupting *Tp) or taken incorrectly.
    cmp     DWORD PTR[esp + 1*4 + 2*4], 0
    jnz     LbeginAAD
    ret

LbeginAAD:
    push    ebx
    push    esi

    ; Args now sit beyond return address + 2 saved registers.
    mov     Htbl, [esp + 4*3 + 0*4]
    mov     inp,  [esp + 4*3 + 1*4]
    mov     len,  [esp + 4*3 + 2*4]
    mov     Tp,   [esp + 4*3 + 3*4]

    vzeroupper

    vpxor   Xhi, Xhi, Xhi

    vmovdqu T, XMMWORD PTR[Tp]
    ; We hash 8 blocks each iteration; if the total number of blocks is not
    ; a multiple of 8, hash the first len%8 blocks first.
    mov     hlp0, len
    and     hlp0, 128-1
    jz      Lmod_loop                       ; already a multiple of 128 bytes

    and     len, -128                       ; bulk byte count
    sub     hlp0, 16                        ; hlp0 = offset of H^(k) to use

    ; Prefix block #0: absorbed into T first, multiplied by the highest
    ; needed power so all prefix products align at Lred1.
    vmovdqu DATA, XMMWORD PTR[inp]
    vpshufb DATA, DATA, [Lbswap_mask]
    vpxor   DATA, DATA, T

    vpclmulqdq  TMP0, DATA, XMMWORD PTR[Htbl + hlp0], 0h
    vpclmulqdq  TMP1, DATA, XMMWORD PTR[Htbl + hlp0], 011h
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP2, TMP3, XMMWORD PTR[Htbl + 8*16 + hlp0], 0h

    lea     inp, [inp+16]
    test    hlp0, hlp0
    jnz     Lpre_loop
    jmp     Lred1

; Hash remaining prefix blocks (up to 7 total prefix blocks).
Lpre_loop:

    sub     hlp0, 16                        ; next (lower) power of H

    vmovdqu DATA, XMMWORD PTR[inp]
    vpshufb DATA, DATA, [Lbswap_mask]

    vpclmulqdq  TMP3, DATA, XMMWORD PTR[Htbl + hlp0], 0h
    vpxor       TMP0, TMP0, TMP3
    vpclmulqdq  TMP3, DATA, XMMWORD PTR[Htbl + hlp0], 011h
    vpxor       TMP1, TMP1, TMP3
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP3, TMP3, XMMWORD PTR[Htbl + 8*16 + hlp0], 0h
    vpxor       TMP2, TMP2, TMP3

    test    hlp0, hlp0
    lea     inp, [inp+16]
    jnz     Lpre_loop

Lred1:

    ; Karatsuba fixup: fold the middle products into low (T) and high (Xhi).
    vpxor   TMP2, TMP2, TMP0
    vpxor   TMP2, TMP2, TMP1
    vpsrldq TMP3, TMP2, 8
    vpslldq TMP2, TMP2, 8

    vpxor   Xhi, TMP1, TMP3
    vpxor   T, TMP0, TMP2

Lmod_loop:

    sub     len, 16*8
    jb      Ldone
    ; Block #0 (last block of the group, multiplied by H^1).
    vmovdqu DATA, XMMWORD PTR[inp + 16*7]
    vpshufb DATA, DATA, XMMWORD PTR[Lbswap_mask]

    vpclmulqdq  TMP0, DATA, XMMWORD PTR[Htbl + 0*16], 0h
    vpclmulqdq  TMP1, DATA, XMMWORD PTR[Htbl + 0*16], 011h
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP2, TMP3, XMMWORD PTR[Htbl + 8*16 + 0*16], 0h

    ; Block #1
    vmovdqu DATA, XMMWORD PTR[inp + 16*6]
    vpshufb DATA, DATA, [Lbswap_mask]
    KARATSUBA_AAD 1

    ; Block #2 — the reduction of the PREVIOUS group's T is interleaved
    ; with the multiplies to hide vpclmulqdq latency.
    vmovdqu DATA, XMMWORD PTR[inp + 16*5]
    vpshufb DATA, DATA, [Lbswap_mask]

    vpclmulqdq  TMP4, T, [Lpoly], 010h      ; reduction stage 1a
    vpalignr    T, T, T, 8

    KARATSUBA_AAD 2

    vpxor   T, T, TMP4                      ; reduction stage 1b

    ; Block #3
    vmovdqu DATA, XMMWORD PTR[inp + 16*4]
    vpshufb DATA, DATA, [Lbswap_mask]
    KARATSUBA_AAD 3
    ; Block #4
    vmovdqu DATA, XMMWORD PTR[inp + 16*3]
    vpshufb DATA, DATA, [Lbswap_mask]

    vpclmulqdq  TMP4, T, [Lpoly], 010h      ; reduction stage 2a
    vpalignr    T, T, T, 8

    KARATSUBA_AAD 4

    vpxor   T, T, TMP4                      ; reduction stage 2b
    ; Block #5
    vmovdqu DATA, XMMWORD PTR[inp + 16*2]
    vpshufb DATA, DATA, [Lbswap_mask]
    KARATSUBA_AAD 5

    vpxor   T, T, Xhi                       ; reduction finalize
    ; Block #6
    vmovdqu DATA, XMMWORD PTR[inp + 16*1]
    vpshufb DATA, DATA, [Lbswap_mask]
    KARATSUBA_AAD 6
    ; Block #7 (first block of the group: absorb reduced T, times H^8).
    vmovdqu DATA, XMMWORD PTR[inp + 16*0]
    vpshufb DATA, DATA, [Lbswap_mask]
    vpxor   DATA, DATA, T
    KARATSUBA_AAD 7
    ; Aggregated 8 blocks, now karatsuba fixup.
    vpxor   TMP2, TMP2, TMP0
    vpxor   TMP2, TMP2, TMP1
    vpsrldq TMP3, TMP2, 8
    vpslldq TMP2, TMP2, 8

    vpxor   Xhi, TMP1, TMP3
    vpxor   T, TMP0, TMP2

    lea     inp, [inp + 16*8]
    jmp     Lmod_loop

Ldone:
    ; Final reduction of the outstanding 256-bit value (T, Xhi).
    vpclmulqdq  TMP4, T, [Lpoly], 010h
    vpalignr    T, T, T, 8
    vpxor       T, T, TMP4

    vpclmulqdq  TMP4, T, [Lpoly], 010h
    vpalignr    T, T, T, 8
    vpxor       T, T, TMP4

    vpxor   T, T, Xhi
    vmovdqu XMMWORD PTR[Tp], T              ; write back updated GHASH state
    vzeroupper

    pop     esi
    pop     ebx
    ret

intel_aes_gcmAAD ENDP
|
375 |
|
376 |
|
377 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
|
378 ; |
|
379 ; Encrypt and Authenticate |
|
380 ; void intel_aes_gcmENC(unsigned char* PT, unsigned char* CT, void *Gctx, unsigned int len); |
|
381 ; |
|
382 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
|
383 |
|
ALIGN 16
; void intel_aes_gcmENC(unsigned char* PT, unsigned char* CT, void *Gctx,
;                       unsigned int len)
; CTR-mode encrypt PT -> CT and fold the ciphertext into the GHASH state,
; 7 blocks per iteration (AES of group N interleaved with GHASH of group N-1).
; Gctx layout, as accessed by this code:
;   +0            : Htbl[16*16]  (H powers + Karatsuba helpers)
;   +16*16+1*16   : T, the GHASH state
;   +16*16+2*16   : current counter block (big-endian wire format)
;   +16*16+3*16   : pointer to the AES key-schedule structure; the schedule
;                   itself is at +44 from that pointer and the round count
;                   (NR) is a dword at -40 from the schedule.
intel_aes_gcmENC PROC

PT   textequ <eax>
CT   textequ <ecx>
Htbl textequ <edx>                          ; aliases Gctx (Htbl is at offset 0)
Gctx textequ <edx>
len  textequ <DWORD PTR[ebp + 5*4 + 3*4]>   ; 4th arg, read in place on stack
KS   textequ <esi>
NR   textequ <DWORD PTR[-40 + KS]>

aluCTR textequ <ebx>                        ; counter value, host byte order
aluTMP textequ <edi>

T    textequ <XMMWORD PTR[16*16 + 1*16 + Gctx]>
TMP0 textequ <xmm1>
TMP1 textequ <xmm2>
TMP2 textequ <xmm3>
TMP3 textequ <xmm4>
TMP4 textequ <xmm5>
TMP5 textequ <xmm6>

; CTRn intentionally alias xmm0-xmm6: the counter registers and the GHASH
; temporaries are never live at the same time.
CTR0 textequ <xmm0>
CTR1 textequ <xmm1>
CTR2 textequ <xmm2>
CTR3 textequ <xmm3>
CTR4 textequ <xmm4>
CTR5 textequ <xmm5>
CTR6 textequ <xmm6>

; One AES round over all 7 counter blocks with round key i.
ROUND MACRO i
    vmovdqu xmm7, XMMWORD PTR[i*16 + KS]
    vaesenc CTR0, CTR0, xmm7
    vaesenc CTR1, CTR1, xmm7
    vaesenc CTR2, CTR2, xmm7
    vaesenc CTR3, CTR3, xmm7
    vaesenc CTR4, CTR4, xmm7
    vaesenc CTR5, CTR5, xmm7
    vaesenc CTR6, CTR6, xmm7
ENDM

; Accumulate TMP5 (one byte-swapped ciphertext block) times H-table entry i
; into the Karatsuba accumulators TMP0 (middle), TMP1 (hi), TMP2 (lo).
KARATSUBA MACRO i
    vpshufd     TMP4, TMP5, 78
    vpxor       TMP4, TMP4, TMP5
    vpclmulqdq  TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
    vpxor       TMP0, TMP0, TMP3
    vmovdqu     TMP4, XMMWORD PTR[i*16 + Htbl]
    vpclmulqdq  TMP3, TMP5, TMP4, 011h
    vpxor       TMP1, TMP1, TMP3
    vpclmulqdq  TMP3, TMP5, TMP4, 000h
    vpxor       TMP2, TMP2, TMP3
ENDM

; Increment the counter in ALU registers and patch only the last dword of
; the pre-whitened counter block i stored on the stack (the first 12 bytes
; never change). The xor with round key 0 ([3*4+KS]) pre-applies whitening.
NEXTCTR MACRO i
    add     aluCTR, 1
    mov     aluTMP, aluCTR
    bswap   aluTMP
    xor     aluTMP, [3*4 + KS]
    mov     [3*4 + 8*16 + i*16 + esp], aluTMP
ENDM

    ; Early-out on len == 0 (4th arg; only the return address precedes args).
    cmp     DWORD PTR[1*4 + 3*4 + esp], 0
    jne     LbeginENC
    ret

LbeginENC:

    vzeroupper
    push    ebp
    push    ebx
    push    esi
    push    edi

    mov     ebp, esp
    sub     esp, 16*16                      ; 8 ghash slots + 8 counter slots
    and     esp, -16                        ; align for vmovdqa

    mov     PT,   [ebp + 5*4 + 0*4]
    mov     CT,   [ebp + 5*4 + 1*4]
    mov     Gctx, [ebp + 5*4 + 2*4]

    mov     KS, [16*16 + 3*16 + Gctx]
    lea     KS, [44 + KS]                   ; skip to the round keys

    mov     aluCTR, [16*16 + 2*16 + 3*4 + Gctx] ; low dword of counter
    bswap   aluCTR                          ; to host order for arithmetic


    ; Counter slot 0 = whitened current counter block.
    vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
    vpxor   TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    vmovdqu XMMWORD PTR[8*16 + 0*16 + esp], TMP0

    cmp     len, 16*7
    jb      LEncDataSingles
    ; Prepare the "top" counters (slots 1..6 start as copies; NEXTCTR later
    ; rewrites only their low dword).
    vmovdqu XMMWORD PTR[8*16 + 1*16 + esp], TMP0
    vmovdqu XMMWORD PTR[8*16 + 2*16 + esp], TMP0
    vmovdqu XMMWORD PTR[8*16 + 3*16 + esp], TMP0
    vmovdqu XMMWORD PTR[8*16 + 4*16 + esp], TMP0
    vmovdqu XMMWORD PTR[8*16 + 5*16 + esp], TMP0
    vmovdqu XMMWORD PTR[8*16 + 6*16 + esp], TMP0

    vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
    ; Encrypt the initial 7 blocks; counters built by vector adds here.
    sub     len, 16*7
    vpaddd  CTR1, CTR0, XMMWORD PTR[Lone]
    vpaddd  CTR2, CTR0, XMMWORD PTR[Ltwo]
    vpaddd  CTR3, CTR2, XMMWORD PTR[Lone]
    vpaddd  CTR4, CTR2, XMMWORD PTR[Ltwo]
    vpaddd  CTR5, CTR4, XMMWORD PTR[Lone]
    vpaddd  CTR6, CTR4, XMMWORD PTR[Ltwo]

    vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR6, CTR6, XMMWORD PTR[Lbswap_mask]

    vmovdqu xmm7, XMMWORD PTR[0*16 + KS]    ; round-0 whitening
    vpxor   CTR0, CTR0, xmm7
    vpxor   CTR1, CTR1, xmm7
    vpxor   CTR2, CTR2, xmm7
    vpxor   CTR3, CTR3, xmm7
    vpxor   CTR4, CTR4, xmm7
    vpxor   CTR5, CTR5, xmm7
    vpxor   CTR6, CTR6, xmm7

    ROUND   1

    ; Advance the stored counter slots past the 7 in-flight blocks;
    ; slot 0 gets aluCTR+7 (inlined NEXTCTR 0), slots 1..6 follow.
    add     aluCTR, 7
    mov     aluTMP, aluCTR
    bswap   aluTMP
    xor     aluTMP, [KS + 3*4]
    mov     [8*16 + 0*16 + 3*4 + esp], aluTMP

    ROUND   2
    NEXTCTR 1
    ROUND   3
    NEXTCTR 2
    ROUND   4
    NEXTCTR 3
    ROUND   5
    NEXTCTR 4
    ROUND   6
    NEXTCTR 5
    ROUND   7
    NEXTCTR 6
    ROUND   8
    ROUND   9
    vmovdqu xmm7, XMMWORD PTR[10*16 + KS]
    cmp     NR, 10                          ; AES-128 ends here
    je      @f

    ROUND   10
    ROUND   11
    vmovdqu xmm7, XMMWORD PTR[12*16 + KS]
    cmp     NR, 12                          ; AES-192 ends here
    je      @f

    ROUND   12
    ROUND   13
    vmovdqu xmm7, XMMWORD PTR[14*16 + KS]   ; AES-256
@@:
    vaesenclast CTR0, CTR0, xmm7
    vaesenclast CTR1, CTR1, xmm7
    vaesenclast CTR2, CTR2, xmm7
    vaesenclast CTR3, CTR3, xmm7
    vaesenclast CTR4, CTR4, xmm7
    vaesenclast CTR5, CTR5, xmm7
    vaesenclast CTR6, CTR6, xmm7

    vpxor   CTR0, CTR0, XMMWORD PTR[0*16 + PT]
    vpxor   CTR1, CTR1, XMMWORD PTR[1*16 + PT]
    vpxor   CTR2, CTR2, XMMWORD PTR[2*16 + PT]
    vpxor   CTR3, CTR3, XMMWORD PTR[3*16 + PT]
    vpxor   CTR4, CTR4, XMMWORD PTR[4*16 + PT]
    vpxor   CTR5, CTR5, XMMWORD PTR[5*16 + PT]
    vpxor   CTR6, CTR6, XMMWORD PTR[6*16 + PT]

    vmovdqu XMMWORD PTR[0*16 + CT], CTR0
    vmovdqu XMMWORD PTR[1*16 + CT], CTR1
    vmovdqu XMMWORD PTR[2*16 + CT], CTR2
    vmovdqu XMMWORD PTR[3*16 + CT], CTR3
    vmovdqu XMMWORD PTR[4*16 + CT], CTR4
    vmovdqu XMMWORD PTR[5*16 + CT], CTR5
    vmovdqu XMMWORD PTR[6*16 + CT], CTR6

    ; Byte-swap ciphertext for GHASH; stash blocks 0..5 on the stack in
    ; reverse order so the next iteration can hash them against H^2..H^7.
    ; TMP5 keeps the last block (hashed against H^1 first).
    vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask]
    vpshufb TMP5, CTR6, XMMWORD PTR[Lbswap_mask]

    vmovdqa XMMWORD PTR[1*16 + esp], CTR5
    vmovdqa XMMWORD PTR[2*16 + esp], CTR4
    vmovdqa XMMWORD PTR[3*16 + esp], CTR3
    vmovdqa XMMWORD PTR[4*16 + esp], CTR2
    vmovdqa XMMWORD PTR[5*16 + esp], CTR1
    vmovdqa XMMWORD PTR[6*16 + esp], CTR0

    lea     CT, [7*16 + CT]
    lea     PT, [7*16 + PT]
    jmp     LEncData7

; Main loop: hash the previous 7 ciphertext blocks while (further down)
; encrypting the next 7.
LEncData7:
    cmp     len, 16*7
    jb      LEndEnc7
    sub     len, 16*7

    ; First Karatsuba term (TMP5 = last ciphertext block, times H^1);
    ; initializes the accumulators rather than xoring into them.
    vpshufd TMP4, TMP5, 78
    vpxor   TMP4, TMP4, TMP5
    vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
    vmovdqu TMP4, XMMWORD PTR[0*16 + Htbl]
    vpclmulqdq  TMP1, TMP5, TMP4, 011h
    vpclmulqdq  TMP2, TMP5, TMP4, 000h

    vmovdqu TMP5, XMMWORD PTR[1*16 + esp]
    KARATSUBA 1
    vmovdqu TMP5, XMMWORD PTR[2*16 + esp]
    KARATSUBA 2
    vmovdqu TMP5, XMMWORD PTR[3*16 + esp]
    KARATSUBA 3
    vmovdqu TMP5, XMMWORD PTR[4*16 + esp]
    KARATSUBA 4
    vmovdqu TMP5, XMMWORD PTR[5*16 + esp]
    KARATSUBA 5
    vmovdqu TMP5, XMMWORD PTR[6*16 + esp]
    vpxor   TMP5, TMP5, T                   ; fold GHASH state into 1st block
    KARATSUBA 6

    ; Karatsuba fixup + full reduction into T (in Gctx memory).
    vpxor   TMP0, TMP0, TMP1
    vpxor   TMP0, TMP0, TMP2
    vpsrldq TMP3, TMP0, 8
    vpxor   TMP4, TMP1, TMP3
    vpslldq TMP3, TMP0, 8
    vpxor   TMP5, TMP2, TMP3

    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    vpalignr    TMP5,TMP5,TMP5,8
    vpxor       TMP5, TMP5, TMP1

    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    vpalignr    TMP5,TMP5,TMP5,8
    vpxor       TMP5, TMP5, TMP1

    vpxor   TMP5, TMP5, TMP4
    vmovdqu T, TMP5

    ; Load the 7 pre-whitened counter blocks prepared by NEXTCTR.
    vmovdqa CTR0, XMMWORD PTR[8*16 + 0*16 + esp]
    vmovdqa CTR1, XMMWORD PTR[8*16 + 1*16 + esp]
    vmovdqa CTR2, XMMWORD PTR[8*16 + 2*16 + esp]
    vmovdqa CTR3, XMMWORD PTR[8*16 + 3*16 + esp]
    vmovdqa CTR4, XMMWORD PTR[8*16 + 4*16 + esp]
    vmovdqa CTR5, XMMWORD PTR[8*16 + 5*16 + esp]
    vmovdqa CTR6, XMMWORD PTR[8*16 + 6*16 + esp]

    ROUND   1
    NEXTCTR 0
    ROUND   2
    NEXTCTR 1
    ROUND   3
    NEXTCTR 2
    ROUND   4
    NEXTCTR 3
    ROUND   5
    NEXTCTR 4
    ROUND   6
    NEXTCTR 5
    ROUND   7
    NEXTCTR 6

    ROUND   8
    ROUND   9

    vmovdqu xmm7, XMMWORD PTR[10*16 + KS]
    cmp     NR, 10
    je      @f

    ROUND   10
    ROUND   11
    vmovdqu xmm7, XMMWORD PTR[12*16 + KS]
    cmp     NR, 12
    je      @f

    ROUND   12
    ROUND   13
    vmovdqu xmm7, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast CTR0, CTR0, xmm7
    vaesenclast CTR1, CTR1, xmm7
    vaesenclast CTR2, CTR2, xmm7
    vaesenclast CTR3, CTR3, xmm7
    vaesenclast CTR4, CTR4, xmm7
    vaesenclast CTR5, CTR5, xmm7
    vaesenclast CTR6, CTR6, xmm7

    vpxor   CTR0, CTR0, XMMWORD PTR[0*16 + PT]
    vpxor   CTR1, CTR1, XMMWORD PTR[1*16 + PT]
    vpxor   CTR2, CTR2, XMMWORD PTR[2*16 + PT]
    vpxor   CTR3, CTR3, XMMWORD PTR[3*16 + PT]
    vpxor   CTR4, CTR4, XMMWORD PTR[4*16 + PT]
    vpxor   CTR5, CTR5, XMMWORD PTR[5*16 + PT]
    vpxor   CTR6, CTR6, XMMWORD PTR[6*16 + PT]

    vmovdqu XMMWORD PTR[0*16 + CT], CTR0
    vmovdqu XMMWORD PTR[1*16 + CT], CTR1
    vmovdqu XMMWORD PTR[2*16 + CT], CTR2
    vmovdqu XMMWORD PTR[3*16 + CT], CTR3
    vmovdqu XMMWORD PTR[4*16 + CT], CTR4
    vmovdqu XMMWORD PTR[5*16 + CT], CTR5
    vmovdqu XMMWORD PTR[6*16 + CT], CTR6

    vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask]
    vpshufb TMP5, CTR6, XMMWORD PTR[Lbswap_mask]

    vmovdqa XMMWORD PTR[1*16 + esp], CTR5
    vmovdqa XMMWORD PTR[2*16 + esp], CTR4
    vmovdqa XMMWORD PTR[3*16 + esp], CTR3
    vmovdqa XMMWORD PTR[4*16 + esp], CTR2
    vmovdqa XMMWORD PTR[5*16 + esp], CTR1
    vmovdqa XMMWORD PTR[6*16 + esp], CTR0

    lea     CT, [7*16 + CT]
    lea     PT, [7*16 + PT]
    jmp     LEncData7

LEndEnc7:

    ; Hash the final buffered group of 7 ciphertext blocks (same sequence
    ; as the loop body's GHASH half).
    vpshufd TMP4, TMP5, 78
    vpxor   TMP4, TMP4, TMP5
    vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
    vmovdqu TMP4, XMMWORD PTR[0*16 + Htbl]
    vpclmulqdq  TMP1, TMP5, TMP4, 011h
    vpclmulqdq  TMP2, TMP5, TMP4, 000h

    vmovdqu TMP5, XMMWORD PTR[1*16 + esp]
    KARATSUBA 1
    vmovdqu TMP5, XMMWORD PTR[2*16 + esp]
    KARATSUBA 2
    vmovdqu TMP5, XMMWORD PTR[3*16 + esp]
    KARATSUBA 3
    vmovdqu TMP5, XMMWORD PTR[4*16 + esp]
    KARATSUBA 4
    vmovdqu TMP5, XMMWORD PTR[5*16 + esp]
    KARATSUBA 5
    vmovdqu TMP5, XMMWORD PTR[6*16 + esp]
    vpxor   TMP5, TMP5, T
    KARATSUBA 6

    vpxor   TMP0, TMP0, TMP1
    vpxor   TMP0, TMP0, TMP2
    vpsrldq TMP3, TMP0, 8
    vpxor   TMP4, TMP1, TMP3
    vpslldq TMP3, TMP0, 8
    vpxor   TMP5, TMP2, TMP3

    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    vpalignr    TMP5,TMP5,TMP5,8
    vpxor       TMP5, TMP5, TMP1

    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    vpalignr    TMP5,TMP5,TMP5,8
    vpxor       TMP5, TMP5, TMP1

    vpxor   TMP5, TMP5, TMP4
    vmovdqu T, TMP5

    ; aluCTR was advanced for 7 in-flight blocks that were never consumed
    ; by the singles path; rewind to the next unused counter.
    sub     aluCTR, 6

; One block at a time: encrypt then hash, using counter slot 0 only.
LEncDataSingles:

    cmp     len, 16
    jb      LEncDataTail
    sub     len, 16

    vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + esp]
    NEXTCTR 0

    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
    cmp     NR, 10
    je      @f
    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
    cmp     NR, 12
    je      @f
    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast TMP1, TMP1, TMP2
    vpxor   TMP1, TMP1, XMMWORD PTR[PT]
    vmovdqu XMMWORD PTR[CT], TMP1

    lea     PT, [16+PT]
    lea     CT, [16+CT]

    ; GHASH the single ciphertext block into T.
    vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
    vpxor   TMP1, TMP1, T

    vmovdqu TMP0, XMMWORD PTR[Htbl]
    GFMUL   TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
    vmovdqu T, TMP1

    jmp     LEncDataSingles

; Final partial block (len%16 bytes): encrypt via a zero-padded stack copy.
LEncDataTail:

    cmp     len, 0
    je      LEncDataEnd

    vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + esp]

    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
    cmp     NR, 10
    je      @f
    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
    cmp     NR, 12
    je      @f
    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast TMP1, TMP1, TMP2
    ; Zero a temp location (one stack slot used as a padded block buffer).
    vpxor   TMP2, TMP2, TMP2
    vmovdqa XMMWORD PTR[esp], TMP2
    ; Copy as many bytes as needed. KS (esi) is reused as a byte index —
    ; the key schedule is no longer needed; edx (aliases len/Gctx) is
    ; preserved around the dl-based byte copies.
    xor     KS, KS
    mov     aluTMP, edx
@@:
    cmp     len, KS
    je      @f
    mov     dl, BYTE PTR[PT + KS]
    mov     BYTE PTR[esp + KS], dl
    inc     KS
    jmp     @b
@@:
    vpxor   TMP1, TMP1, XMMWORD PTR[esp]    ; keystream xor padded plaintext
    vmovdqa XMMWORD PTR[esp], TMP1
    xor     KS, KS
@@:
    cmp     len, KS
    je      @f
    mov     dl, BYTE PTR[esp + KS]
    mov     BYTE PTR[CT + KS], dl           ; emit only len ciphertext bytes
    inc     KS
    jmp     @b
@@:
    cmp     KS, 16
    je      @f
    mov     BYTE PTR[esp + KS], 0           ; zero-pad the block for GHASH
    inc     KS
    jmp     @b
@@:
    mov     edx, aluTMP
    vmovdqa TMP1, XMMWORD PTR[esp]
    vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
    vpxor   TMP1, TMP1, T

    vmovdqu TMP0, XMMWORD PTR[Htbl]
    GFMUL   TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
    vmovdqu T, TMP1

LEncDataEnd:
    ; Store the next counter value back into Gctx (wire byte order).
    inc     aluCTR
    bswap   aluCTR
    mov     [16*16 + 2*16 + 3*4 + Gctx], aluCTR

    mov     esp, ebp
    pop     edi
    pop     esi
    pop     ebx
    pop     ebp


    vzeroupper

    ret
intel_aes_gcmENC ENDP
|
894 |
|
895 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
|
896 ; |
|
897 ; Decrypt and Authenticate |
|
898 ; void intel_aes_gcmDEC(uint8_t* PT, uint8_t* CT, void *Gctx, unsigned int len); |
|
899 ; |
|
900 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
|
901 |
|
902 |
|
; Redefinition of NEXTCTR for intel_aes_gcmDEC: identical to the ENC variant
; except the pre-whitened counter blocks live at the BOTTOM of the stack
; frame (DEC only allocates 8*16 bytes), so there is no 8*16 offset.
; Patches only the low counter dword of stack slot i, pre-xored with the
; matching dword of round key 0.
NEXTCTR MACRO i
    add     aluCTR, 1
    mov     aluTMP, aluCTR
    bswap   aluTMP
    xor     aluTMP, [3*4 + KS]
    mov     [3*4 + i*16 + esp], aluTMP
ENDM
|
910 |
|
; void intel_aes_gcmDEC(uint8_t* PT, uint8_t* CT, void *Gctx, unsigned int len)
; CTR-mode decrypt CT -> PT while hashing the CIPHERTEXT into the GHASH
; state. Reuses the textequs/macros defined for intel_aes_gcmENC above
; (PT=eax, CT=ecx, Gctx/Htbl=edx, KS=esi, aluCTR=ebx, aluTMP=edi, CTRn/TMPn),
; plus the DEC-specific NEXTCTR redefined just above. Note the first two
; pointer args are loaded swapped relative to ENC (prototype is PT, CT but
; data flows CT -> PT). Since the hashed data (ciphertext) is the input,
; GHASH and AES can run fully in parallel with no buffering of outputs.
intel_aes_gcmDEC PROC

    ; Early-out on len == 0 (4th arg; only the return address precedes args).
    cmp     DWORD PTR[1*4 + 3*4 + esp], 0
    jne     LbeginDEC
    ret

LbeginDEC:

    vzeroupper
    push    ebp
    push    ebx
    push    esi
    push    edi

    mov     ebp, esp
    sub     esp, 8*16                       ; 8 counter slots (no GHASH buffer)
    and     esp, -16

    ; Note swapped load order: arg0 is the destination (plaintext).
    mov     CT,   [ebp + 5*4 + 0*4]
    mov     PT,   [ebp + 5*4 + 1*4]
    mov     Gctx, [ebp + 5*4 + 2*4]

    mov     KS, [16*16 + 3*16 + Gctx]
    lea     KS, [44 + KS]                   ; skip to the round keys

    mov     aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
    bswap   aluCTR


    ; Counter slot 0 = whitened current counter block.
    vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
    vpxor   TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    vmovdqu XMMWORD PTR[0*16 + esp], TMP0

    cmp     len, 16*7
    jb      LDecDataSingles
    vmovdqu XMMWORD PTR[1*16 + esp], TMP0
    vmovdqu XMMWORD PTR[2*16 + esp], TMP0
    vmovdqu XMMWORD PTR[3*16 + esp], TMP0
    vmovdqu XMMWORD PTR[4*16 + esp], TMP0
    vmovdqu XMMWORD PTR[5*16 + esp], TMP0
    vmovdqu XMMWORD PTR[6*16 + esp], TMP0
    ; NEXTCTR pre-increments; back off by one so slot 0 gets aluCTR+... right.
    dec     aluCTR

LDecData7:
    cmp     len, 16*7
    jb      LDecData7End
    sub     len, 16*7

    ; GHASH the 7 ciphertext blocks directly from CT: block 0 (earliest)
    ; absorbs T and multiplies by H^7 ([6*16+Htbl]); later blocks use
    ; descending powers via KARATSUBA 5..0. NEXTCTR calls are interleaved
    ; to overlap the ALU counter updates with the clmul work.
    vmovdqu TMP5, XMMWORD PTR[0*16 + CT]
    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    vpxor   TMP5, TMP5, T
    vpshufd TMP4, TMP5, 78
    vpxor   TMP4, TMP4, TMP5
    vpclmulqdq  TMP0, TMP4, XMMWORD PTR[6*16 + 8*16 + Htbl], 000h
    vmovdqu TMP4, XMMWORD PTR[6*16 + Htbl]
    vpclmulqdq  TMP1, TMP5, TMP4, 011h
    vpclmulqdq  TMP2, TMP5, TMP4, 000h

    NEXTCTR 0
    vmovdqu TMP5, XMMWORD PTR[1*16 + CT]
    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    KARATSUBA 5
    NEXTCTR 1
    vmovdqu TMP5, XMMWORD PTR[2*16 + CT]
    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    KARATSUBA 4
    NEXTCTR 2
    vmovdqu TMP5, XMMWORD PTR[3*16 + CT]
    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    KARATSUBA 3
    NEXTCTR 3
    vmovdqu TMP5, XMMWORD PTR[4*16 + CT]
    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    KARATSUBA 2
    NEXTCTR 4
    vmovdqu TMP5, XMMWORD PTR[5*16 + CT]
    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    KARATSUBA 1
    NEXTCTR 5
    vmovdqu TMP5, XMMWORD PTR[6*16 + CT]
    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    KARATSUBA 0
    NEXTCTR 6

    ; Karatsuba fixup + full reduction into T.
    vpxor   TMP0, TMP0, TMP1
    vpxor   TMP0, TMP0, TMP2
    vpsrldq TMP3, TMP0, 8
    vpxor   TMP4, TMP1, TMP3
    vpslldq TMP3, TMP0, 8
    vpxor   TMP5, TMP2, TMP3

    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    vpalignr    TMP5,TMP5,TMP5,8
    vpxor       TMP5, TMP5, TMP1

    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    vpalignr    TMP5,TMP5,TMP5,8
    vpxor       TMP5, TMP5, TMP1

    vpxor   TMP5, TMP5, TMP4
    vmovdqu T, TMP5

    ; Now run AES over the 7 pre-whitened counter blocks.
    vmovdqa CTR0, XMMWORD PTR[0*16 + esp]
    vmovdqa CTR1, XMMWORD PTR[1*16 + esp]
    vmovdqa CTR2, XMMWORD PTR[2*16 + esp]
    vmovdqa CTR3, XMMWORD PTR[3*16 + esp]
    vmovdqa CTR4, XMMWORD PTR[4*16 + esp]
    vmovdqa CTR5, XMMWORD PTR[5*16 + esp]
    vmovdqa CTR6, XMMWORD PTR[6*16 + esp]

    ROUND   1
    ROUND   2
    ROUND   3
    ROUND   4
    ROUND   5
    ROUND   6
    ROUND   7
    ROUND   8
    ROUND   9
    vmovdqu xmm7, XMMWORD PTR[10*16 + KS]
    cmp     NR, 10                          ; AES-128 ends here
    je      @f

    ROUND   10
    ROUND   11
    vmovdqu xmm7, XMMWORD PTR[12*16 + KS]
    cmp     NR, 12                          ; AES-192 ends here
    je      @f

    ROUND   12
    ROUND   13
    vmovdqu xmm7, XMMWORD PTR[14*16 + KS]   ; AES-256
@@:
    vaesenclast CTR0, CTR0, xmm7
    vaesenclast CTR1, CTR1, xmm7
    vaesenclast CTR2, CTR2, xmm7
    vaesenclast CTR3, CTR3, xmm7
    vaesenclast CTR4, CTR4, xmm7
    vaesenclast CTR5, CTR5, xmm7
    vaesenclast CTR6, CTR6, xmm7

    vpxor   CTR0, CTR0, XMMWORD PTR[0*16 + CT]
    vpxor   CTR1, CTR1, XMMWORD PTR[1*16 + CT]
    vpxor   CTR2, CTR2, XMMWORD PTR[2*16 + CT]
    vpxor   CTR3, CTR3, XMMWORD PTR[3*16 + CT]
    vpxor   CTR4, CTR4, XMMWORD PTR[4*16 + CT]
    vpxor   CTR5, CTR5, XMMWORD PTR[5*16 + CT]
    vpxor   CTR6, CTR6, XMMWORD PTR[6*16 + CT]

    vmovdqu XMMWORD PTR[0*16 + PT], CTR0
    vmovdqu XMMWORD PTR[1*16 + PT], CTR1
    vmovdqu XMMWORD PTR[2*16 + PT], CTR2
    vmovdqu XMMWORD PTR[3*16 + PT], CTR3
    vmovdqu XMMWORD PTR[4*16 + PT], CTR4
    vmovdqu XMMWORD PTR[5*16 + PT], CTR5
    vmovdqu XMMWORD PTR[6*16 + PT], CTR6

    lea     CT, [7*16 + CT]
    lea     PT, [7*16 + PT]
    jmp     LDecData7

LDecData7End:

    NEXTCTR 0                               ; re-sync slot 0 for singles path

; One block at a time: hash the ciphertext block, then decrypt it.
LDecDataSingles:

    cmp     len, 16
    jb      LDecDataTail
    sub     len, 16

    vmovdqu TMP1, XMMWORD PTR[CT]
    vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
    vpxor   TMP1, TMP1, T

    vmovdqu TMP0, XMMWORD PTR[Htbl]
    GFMUL   TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
    vmovdqu T, TMP1

    vmovdqa TMP1, XMMWORD PTR[0*16 + esp]
    NEXTCTR 0

    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
    cmp     NR, 10
    je      @f
    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
    cmp     NR, 12
    je      @f
    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast TMP1, TMP1, TMP2
    vpxor   TMP1, TMP1, XMMWORD PTR[CT]
    vmovdqu XMMWORD PTR[PT], TMP1

    lea     PT, [16+PT]
    lea     CT, [16+CT]
    jmp     LDecDataSingles

; Final partial block (len%16 bytes): pad the ciphertext on the stack,
; hash it, then emit only len plaintext bytes.
LDecDataTail:

    cmp     len, 0
    je      LDecDataEnd

    vmovdqa TMP1, XMMWORD PTR[0*16 + esp]
    inc     aluCTR                          ; account for this last block
    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
    cmp     NR, 10
    je      @f
    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
    cmp     NR, 12
    je      @f
    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast xmm7, TMP1, TMP2            ; keystream kept in xmm7

    ; Copy as many bytes as needed. KS (esi) is reused as a byte index —
    ; the key schedule is no longer needed; edx (aliases len) is preserved
    ; around the dl-based byte copies.
    xor     KS, KS
    mov     aluTMP, edx
@@:
    cmp     len, KS
    je      @f
    mov     dl, BYTE PTR[CT + KS]
    mov     BYTE PTR[esp + KS], dl
    inc     KS
    jmp     @b
@@:
    cmp     KS, 16
    je      @f
    mov     BYTE PTR[esp + KS], 0           ; zero-pad ciphertext for GHASH
    inc     KS
    jmp     @b
@@:
    mov     edx, aluTMP
    vmovdqa TMP1, XMMWORD PTR[esp]
    vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
    vpxor   TMP1, TMP1, T

    vmovdqu TMP0, XMMWORD PTR[Htbl]
    GFMUL   TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
    vmovdqu T, TMP1

    ; Decrypt: xor the padded ciphertext with the keystream, then copy out
    ; only the first len bytes of plaintext.
    vpxor   xmm7, xmm7, XMMWORD PTR[esp]
    vmovdqa XMMWORD PTR[esp], xmm7
    xor     KS, KS
    mov     aluTMP, edx
@@:
    cmp     len, KS
    je      @f
    mov     dl, BYTE PTR[esp + KS]
    mov     BYTE PTR[PT + KS], dl
    inc     KS
    jmp     @b
@@:
    mov     edx, aluTMP

LDecDataEnd:

    ; Store the next counter value back into Gctx (wire byte order).
    bswap   aluCTR
    mov     [16*16 + 2*16 + 3*4 + Gctx], aluCTR

    mov     esp, ebp
    pop     edi
    pop     esi
    pop     ebx
    pop     ebp

    vzeroupper

    ret
intel_aes_gcmDEC ENDP
|
1207 |
|
1208 |
|
1209 END |