|
; LICENSE:

; This submission to NSS is to be made available under the terms of the

; Mozilla Public License, v. 2.0. You can obtain one at

; https://mozilla.org/MPL/2.0/.

;###############################################################################

; Copyright(c) 2014, Intel Corp.

; Developers and authors:

; Shay Gueron and Vlad Krasnov

; Intel Corporation, Israel Development Centre, Haifa, Israel

; Please send feedback directly to crypto.feedback.alias@intel.com
|
13 .DATA |
|
14 ALIGN 16 |
|
15 Lone dq 1,0 |
|
16 Ltwo dq 2,0 |
|
17 Lbswap_mask db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 |
|
18 Lshuff_mask dq 0f0f0f0f0f0f0f0fh, 0f0f0f0f0f0f0f0fh |
|
19 Lpoly dq 01h, 0c200000000000000h |
|
20 |
|
21 .CODE |
|
22 |
|
23 |
|
24 GFMUL MACRO DST, SRC1, SRC2, TMP1, TMP2, TMP3, TMP4 |
|
25 vpclmulqdq TMP1, SRC2, SRC1, 0h |
|
26 vpclmulqdq TMP4, SRC2, SRC1, 011h |
|
27 |
|
28 vpshufd TMP2, SRC2, 78 |
|
29 vpshufd TMP3, SRC1, 78 |
|
30 vpxor TMP2, TMP2, SRC2 |
|
31 vpxor TMP3, TMP3, SRC1 |
|
32 |
|
33 vpclmulqdq TMP2, TMP2, TMP3, 0h |
|
34 vpxor TMP2, TMP2, TMP1 |
|
35 vpxor TMP2, TMP2, TMP4 |
|
36 |
|
37 vpslldq TMP3, TMP2, 8 |
|
38 vpsrldq TMP2, TMP2, 8 |
|
39 |
|
40 vpxor TMP1, TMP1, TMP3 |
|
41 vpxor TMP4, TMP4, TMP2 |
|
42 |
|
43 vpclmulqdq TMP2, TMP1, [Lpoly], 010h |
|
44 vpshufd TMP3, TMP1, 78 |
|
45 vpxor TMP1, TMP2, TMP3 |
|
46 |
|
47 vpclmulqdq TMP2, TMP1, [Lpoly], 010h |
|
48 vpshufd TMP3, TMP1, 78 |
|
49 vpxor TMP1, TMP2, TMP3 |
|
50 |
|
51 vpxor DST, TMP1, TMP4 |
|
52 |
|
53 ENDM |
|
54 |
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Generates the final GCM tag
; void intel_aes_gcmTAG(unsigned char Htbl[16*16],
;                       unsigned char *Tp,
;                       unsigned int Mlen,
;                       unsigned int Alen,
;                       unsigned char *X0,
;                       unsigned char *TAG);
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
66 |
|
67 ALIGN 16 |
|
68 intel_aes_gcmTAG PROC |
|
69 |
|
70 Htbl textequ <rcx> |
|
71 Tp textequ <rdx> |
|
72 Mlen textequ <r8> |
|
73 Alen textequ <r9> |
|
74 X0 textequ <r10> |
|
75 TAG textequ <r11> |
|
76 |
|
77 T textequ <xmm0> |
|
78 TMP0 textequ <xmm1> |
|
79 |
|
80 mov X0, [rsp + 1*8 + 4*8] |
|
81 mov TAG, [rsp + 1*8 + 5*8] |
|
82 |
|
83 vzeroupper |
|
84 vmovdqu T, XMMWORD PTR[Tp] |
|
85 vpxor TMP0, TMP0, TMP0 |
|
86 |
|
87 shl Mlen, 3 |
|
88 shl Alen, 3 |
|
89 |
|
90 ;vpinsrq TMP0, TMP0, Mlen, 0 |
|
91 ;vpinsrq TMP0, TMP0, Alen, 1 |
|
92 ; workaround the ml64.exe vpinsrq issue |
|
93 vpinsrd TMP0, TMP0, r8d, 0 |
|
94 vpinsrd TMP0, TMP0, r9d, 2 |
|
95 shr Mlen, 32 |
|
96 shr Alen, 32 |
|
97 vpinsrd TMP0, TMP0, r8d, 1 |
|
98 vpinsrd TMP0, TMP0, r9d, 3 |
|
99 |
|
100 vpxor T, T, TMP0 |
|
101 vmovdqu TMP0, XMMWORD PTR[Htbl] |
|
102 GFMUL T, T, TMP0, xmm2, xmm3, xmm4, xmm5 |
|
103 |
|
104 vpshufb T, T, [Lbswap_mask] |
|
105 vpxor T, T, [X0] |
|
106 vmovdqu XMMWORD PTR[TAG], T |
|
107 vzeroupper |
|
108 |
|
109 ret |
|
110 |
|
111 intel_aes_gcmTAG ENDP |
|
112 |
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Generates the H table
; void intel_aes_gcmINIT(unsigned char Htbl[16*16], unsigned char *KS, int NR);
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
119 |
|
120 ALIGN 16 |
|
121 intel_aes_gcmINIT PROC |
|
122 |
|
123 Htbl textequ <rcx> |
|
124 KS textequ <rdx> |
|
125 NR textequ <r8d> |
|
126 |
|
127 T textequ <xmm0> |
|
128 TMP0 textequ <xmm1> |
|
129 |
|
130 vzeroupper |
|
131 ; AES-ENC(0) |
|
132 vmovdqu T, XMMWORD PTR[KS] |
|
133 lea KS, [16 + KS] |
|
134 dec NR |
|
135 Lenc_loop: |
|
136 vaesenc T, T, [KS] |
|
137 lea KS, [16 + KS] |
|
138 dec NR |
|
139 jnz Lenc_loop |
|
140 |
|
141 vaesenclast T, T, [KS] |
|
142 vpshufb T, T, [Lbswap_mask] |
|
143 |
|
144 ;Calculate H` = GFMUL(H, 2) |
|
145 vpsrad xmm3, T, 31 |
|
146 vpshufd xmm3, xmm3, 0ffh |
|
147 vpand xmm5, xmm3, [Lpoly] |
|
148 vpsrld xmm3, T, 31 |
|
149 vpslld xmm4, T, 1 |
|
150 vpslldq xmm3, xmm3, 4 |
|
151 vpxor T, xmm4, xmm3 |
|
152 vpxor T, T, xmm5 |
|
153 |
|
154 vmovdqu TMP0, T |
|
155 vmovdqu XMMWORD PTR[Htbl + 0*16], T |
|
156 |
|
157 vpshufd xmm2, T, 78 |
|
158 vpxor xmm2, xmm2, T |
|
159 vmovdqu XMMWORD PTR[Htbl + 8*16 + 0*16], xmm2 |
|
160 |
|
161 i = 1 |
|
162 WHILE i LT 8 |
|
163 GFMUL T, T, TMP0, xmm2, xmm3, xmm4, xmm5 |
|
164 vmovdqu XMMWORD PTR[Htbl + i*16], T |
|
165 vpshufd xmm2, T, 78 |
|
166 vpxor xmm2, xmm2, T |
|
167 vmovdqu XMMWORD PTR[Htbl + 8*16 + i*16], xmm2 |
|
168 i = i+1 |
|
169 ENDM |
|
170 vzeroupper |
|
171 ret |
|
172 intel_aes_gcmINIT ENDP |
|
173 |
|
174 |
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Authenticate only
; void intel_aes_gcmAAD(unsigned char Htbl[16*16], unsigned char *AAD, unsigned int Alen, unsigned char *Tp);
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
181 |
|
182 ALIGN 16 |
|
183 intel_aes_gcmAAD PROC |
|
184 |
|
185 Htbl textequ <rcx> |
|
186 inp textequ <rdx> |
|
187 len textequ <r8> |
|
188 Tp textequ <r9> |
|
189 hlp0 textequ <r10> |
|
190 |
|
191 DATA textequ <xmm0> |
|
192 T textequ <xmm1> |
|
193 TMP0 textequ <xmm2> |
|
194 TMP1 textequ <xmm3> |
|
195 TMP2 textequ <xmm4> |
|
196 TMP3 textequ <xmm5> |
|
197 TMP4 textequ <xmm6> |
|
198 Xhi textequ <xmm7> |
|
199 |
|
200 KARATSUBA_AAD MACRO i |
|
201 vpclmulqdq TMP3, DATA, [Htbl + i*16], 0h |
|
202 vpxor TMP0, TMP0, TMP3 |
|
203 vpclmulqdq TMP3, DATA, [Htbl + i*16], 011h |
|
204 vpxor TMP1, TMP1, TMP3 |
|
205 vpshufd TMP3, DATA, 78 |
|
206 vpxor TMP3, TMP3, DATA |
|
207 vpclmulqdq TMP3, TMP3, [Htbl + 8*16 + i*16], 0h |
|
208 vpxor TMP2, TMP2, TMP3 |
|
209 ENDM |
|
210 |
|
211 test len, len |
|
212 jnz LbeginAAD |
|
213 ret |
|
214 |
|
215 LbeginAAD: |
|
216 vzeroupper |
|
217 |
|
218 sub rsp, 2*16 |
|
219 vmovdqu XMMWORD PTR[rsp + 0*16], xmm6 |
|
220 vmovdqu XMMWORD PTR[rsp + 1*16], xmm7 |
|
221 |
|
222 vpxor Xhi, Xhi, Xhi |
|
223 |
|
224 vmovdqu T, XMMWORD PTR[Tp] |
|
225 ;we hash 8 block each iteration, if the total amount of blocks is not a multiple of 8, we hash the first n%8 blocks first |
|
226 mov hlp0, len |
|
227 and hlp0, 128-1 |
|
228 jz Lmod_loop |
|
229 |
|
230 and len, -128 |
|
231 sub hlp0, 16 |
|
232 |
|
233 ; Prefix block |
|
234 vmovdqu DATA, XMMWORD PTR[inp] |
|
235 vpshufb DATA, DATA, [Lbswap_mask] |
|
236 vpxor DATA, DATA, T |
|
237 |
|
238 vpclmulqdq TMP0, DATA, [Htbl + hlp0], 0h |
|
239 vpclmulqdq TMP1, DATA, [Htbl + hlp0], 011h |
|
240 vpshufd TMP3, DATA, 78 |
|
241 vpxor TMP3, TMP3, DATA |
|
242 vpclmulqdq TMP2, TMP3, [Htbl + 8*16 + hlp0], 0h |
|
243 |
|
244 lea inp, [inp+16] |
|
245 test hlp0, hlp0 |
|
246 jnz Lpre_loop |
|
247 jmp Lred1 |
|
248 |
|
249 ;hash remaining prefix bocks (up to 7 total prefix blocks) |
|
250 Lpre_loop: |
|
251 |
|
252 sub hlp0, 16 |
|
253 |
|
254 vmovdqu DATA, XMMWORD PTR[inp] |
|
255 vpshufb DATA, DATA, [Lbswap_mask] |
|
256 |
|
257 vpclmulqdq TMP3, DATA, [Htbl + hlp0], 0h |
|
258 vpxor TMP0, TMP0, TMP3 |
|
259 vpclmulqdq TMP3, DATA, [Htbl + hlp0], 011h |
|
260 vpxor TMP1, TMP1, TMP3 |
|
261 vpshufd TMP3, DATA, 78 |
|
262 vpxor TMP3, TMP3, DATA |
|
263 vpclmulqdq TMP3, TMP3, [Htbl + 8*16 + hlp0], 0h |
|
264 vpxor TMP2, TMP2, TMP3 |
|
265 |
|
266 test hlp0, hlp0 |
|
267 lea inp, [inp+16] |
|
268 jnz Lpre_loop |
|
269 |
|
270 Lred1: |
|
271 |
|
272 vpxor TMP2, TMP2, TMP0 |
|
273 vpxor TMP2, TMP2, TMP1 |
|
274 vpsrldq TMP3, TMP2, 8 |
|
275 vpslldq TMP2, TMP2, 8 |
|
276 |
|
277 vpxor Xhi, TMP1, TMP3 |
|
278 vpxor T, TMP0, TMP2 |
|
279 |
|
280 |
|
281 Lmod_loop: |
|
282 |
|
283 sub len, 16*8 |
|
284 jb Ldone |
|
285 ; Block #0 |
|
286 vmovdqu DATA, XMMWORD PTR[inp + 16*7] |
|
287 vpshufb DATA, DATA, [Lbswap_mask] |
|
288 |
|
289 vpclmulqdq TMP0, DATA, [Htbl + 0*16], 0h |
|
290 vpclmulqdq TMP1, DATA, [Htbl + 0*16], 011h |
|
291 vpshufd TMP3, DATA, 78 |
|
292 vpxor TMP3, TMP3, DATA |
|
293 vpclmulqdq TMP2, TMP3, [Htbl + 8*16 + 0*16], 0h |
|
294 |
|
295 ; Block #1 |
|
296 vmovdqu DATA, XMMWORD PTR[inp + 16*6] |
|
297 vpshufb DATA, DATA, [Lbswap_mask] |
|
298 KARATSUBA_AAD 1 |
|
299 |
|
300 ; Block #2 |
|
301 vmovdqu DATA, XMMWORD PTR[inp + 16*5] |
|
302 vpshufb DATA, DATA, [Lbswap_mask] |
|
303 |
|
304 vpclmulqdq TMP4, T, [Lpoly], 010h ;reduction stage 1a |
|
305 vpalignr T, T, T, 8 |
|
306 |
|
307 KARATSUBA_AAD 2 |
|
308 |
|
309 vpxor T, T, TMP4 ;reduction stage 1b |
|
310 |
|
311 ; Block #3 |
|
312 vmovdqu DATA, XMMWORD PTR[inp + 16*4] |
|
313 vpshufb DATA, DATA, [Lbswap_mask] |
|
314 KARATSUBA_AAD 3 |
|
315 ; Block #4 |
|
316 vmovdqu DATA, XMMWORD PTR[inp + 16*3] |
|
317 vpshufb DATA, DATA, [Lbswap_mask] |
|
318 |
|
319 vpclmulqdq TMP4, T, [Lpoly], 010h ;reduction stage 2a |
|
320 vpalignr T, T, T, 8 |
|
321 |
|
322 KARATSUBA_AAD 4 |
|
323 |
|
324 vpxor T, T, TMP4 ;reduction stage 2b |
|
325 ; Block #5 |
|
326 vmovdqu DATA, XMMWORD PTR[inp + 16*2] |
|
327 vpshufb DATA, DATA, [Lbswap_mask] |
|
328 KARATSUBA_AAD 5 |
|
329 |
|
330 vpxor T, T, Xhi ;reduction finalize |
|
331 ; Block #6 |
|
332 vmovdqu DATA, XMMWORD PTR[inp + 16*1] |
|
333 vpshufb DATA, DATA, [Lbswap_mask] |
|
334 KARATSUBA_AAD 6 |
|
335 ; Block #7 |
|
336 vmovdqu DATA, XMMWORD PTR[inp + 16*0] |
|
337 vpshufb DATA, DATA, [Lbswap_mask] |
|
338 vpxor DATA, DATA, T |
|
339 KARATSUBA_AAD 7 |
|
340 ; Aggregated 8 blocks, now karatsuba fixup |
|
341 vpxor TMP2, TMP2, TMP0 |
|
342 vpxor TMP2, TMP2, TMP1 |
|
343 vpsrldq TMP3, TMP2, 8 |
|
344 vpslldq TMP2, TMP2, 8 |
|
345 |
|
346 vpxor Xhi, TMP1, TMP3 |
|
347 vpxor T, TMP0, TMP2 |
|
348 |
|
349 lea inp, [inp + 16*8] |
|
350 jmp Lmod_loop |
|
351 |
|
352 Ldone: |
|
353 vpclmulqdq TMP4, T, [Lpoly], 010h |
|
354 vpalignr T, T, T, 8 |
|
355 vpxor T, T, TMP4 |
|
356 |
|
357 vpclmulqdq TMP4, T, [Lpoly], 010h |
|
358 vpalignr T, T, T, 8 |
|
359 vpxor T, T, TMP4 |
|
360 |
|
361 vpxor T, T, Xhi |
|
362 vmovdqu XMMWORD PTR[Tp], T |
|
363 vzeroupper |
|
364 |
|
365 vmovdqu xmm6, XMMWORD PTR[rsp + 0*16] |
|
366 vmovdqu xmm7, XMMWORD PTR[rsp + 1*16] |
|
367 add rsp, 16*2 |
|
368 |
|
369 ret |
|
370 |
|
371 intel_aes_gcmAAD ENDP |
|
372 |
|
373 |
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Encrypt and Authenticate
; void intel_aes_gcmENC(unsigned char* PT, unsigned char* CT, void *Gctx, unsigned int len);
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
380 |
|
381 ALIGN 16 |
|
382 intel_aes_gcmENC PROC |
|
383 |
|
384 PT textequ <rcx> |
|
385 CT textequ <rdx> |
|
386 Htbl textequ <r8> |
|
387 Gctx textequ <r8> |
|
388 len textequ <r9> |
|
389 KS textequ <r10> |
|
390 NR textequ <eax> |
|
391 |
|
392 aluCTR textequ <r11d> |
|
393 aluKSl textequ <r12d> |
|
394 aluTMP textequ <r13d> |
|
395 |
|
396 T textequ <xmm0> |
|
397 TMP0 textequ <xmm1> |
|
398 TMP1 textequ <xmm2> |
|
399 TMP2 textequ <xmm3> |
|
400 TMP3 textequ <xmm4> |
|
401 TMP4 textequ <xmm5> |
|
402 TMP5 textequ <xmm6> |
|
403 CTR0 textequ <xmm7> |
|
404 CTR1 textequ <xmm8> |
|
405 CTR2 textequ <xmm9> |
|
406 CTR3 textequ <xmm10> |
|
407 CTR4 textequ <xmm11> |
|
408 CTR5 textequ <xmm12> |
|
409 CTR6 textequ <xmm13> |
|
410 CTR7 textequ <xmm14> |
|
411 BSWAPMASK textequ <xmm15> |
|
412 |
|
413 ROUND MACRO i |
|
414 vmovdqu TMP3, XMMWORD PTR[i*16 + KS] |
|
415 vaesenc CTR0, CTR0, TMP3 |
|
416 vaesenc CTR1, CTR1, TMP3 |
|
417 vaesenc CTR2, CTR2, TMP3 |
|
418 vaesenc CTR3, CTR3, TMP3 |
|
419 vaesenc CTR4, CTR4, TMP3 |
|
420 vaesenc CTR5, CTR5, TMP3 |
|
421 vaesenc CTR6, CTR6, TMP3 |
|
422 vaesenc CTR7, CTR7, TMP3 |
|
423 ENDM |
|
424 ROUNDMUL MACRO i |
|
425 vmovdqu TMP3, XMMWORD PTR[i*16 + KS] |
|
426 |
|
427 vaesenc CTR0, CTR0, TMP3 |
|
428 vaesenc CTR1, CTR1, TMP3 |
|
429 vaesenc CTR2, CTR2, TMP3 |
|
430 vaesenc CTR3, CTR3, TMP3 |
|
431 |
|
432 vpshufd TMP4, TMP5, 78 |
|
433 vpxor TMP4, TMP4, TMP5 |
|
434 |
|
435 vaesenc CTR4, CTR4, TMP3 |
|
436 vaesenc CTR5, CTR5, TMP3 |
|
437 vaesenc CTR6, CTR6, TMP3 |
|
438 vaesenc CTR7, CTR7, TMP3 |
|
439 |
|
440 vpclmulqdq TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h |
|
441 vpxor TMP0, TMP0, TMP3 |
|
442 vmovdqu TMP4, XMMWORD PTR[i*16 + Htbl] |
|
443 vpclmulqdq TMP3, TMP5, TMP4, 011h |
|
444 vpxor TMP1, TMP1, TMP3 |
|
445 vpclmulqdq TMP3, TMP5, TMP4, 000h |
|
446 vpxor TMP2, TMP2, TMP3 |
|
447 ENDM |
|
448 KARATSUBA MACRO i |
|
449 vpshufd TMP4, TMP5, 78 |
|
450 vpxor TMP4, TMP4, TMP5 |
|
451 vpclmulqdq TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h |
|
452 vpxor TMP0, TMP0, TMP3 |
|
453 vmovdqu TMP4, XMMWORD PTR[i*16 + Htbl] |
|
454 vpclmulqdq TMP3, TMP5, TMP4, 011h |
|
455 vpxor TMP1, TMP1, TMP3 |
|
456 vpclmulqdq TMP3, TMP5, TMP4, 000h |
|
457 vpxor TMP2, TMP2, TMP3 |
|
458 ENDM |
|
459 NEXTCTR MACRO i |
|
460 add aluCTR, 1 |
|
461 mov aluTMP, aluCTR |
|
462 xor aluTMP, aluKSl |
|
463 bswap aluTMP |
|
464 mov [3*4 + 8*16 + i*16 + rsp], aluTMP |
|
465 ENDM |
|
466 |
|
467 |
|
468 test len, len |
|
469 jnz LbeginENC |
|
470 ret |
|
471 |
|
472 LbeginENC: |
|
473 |
|
474 vzeroupper |
|
475 push r11 |
|
476 push r12 |
|
477 push r13 |
|
478 push rbp |
|
479 sub rsp, 10*16 |
|
480 vmovdqu XMMWORD PTR[rsp + 0*16], xmm6 |
|
481 vmovdqu XMMWORD PTR[rsp + 1*16], xmm7 |
|
482 vmovdqu XMMWORD PTR[rsp + 2*16], xmm8 |
|
483 vmovdqu XMMWORD PTR[rsp + 3*16], xmm9 |
|
484 vmovdqu XMMWORD PTR[rsp + 4*16], xmm10 |
|
485 vmovdqu XMMWORD PTR[rsp + 5*16], xmm11 |
|
486 vmovdqu XMMWORD PTR[rsp + 6*16], xmm12 |
|
487 vmovdqu XMMWORD PTR[rsp + 7*16], xmm13 |
|
488 vmovdqu XMMWORD PTR[rsp + 8*16], xmm14 |
|
489 vmovdqu XMMWORD PTR[rsp + 9*16], xmm15 |
|
490 |
|
491 mov rbp, rsp |
|
492 sub rsp, 16*16 |
|
493 and rsp, -16 |
|
494 |
|
495 vmovdqu T, XMMWORD PTR[16*16 + 1*16 + Gctx] |
|
496 vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx] |
|
497 vmovdqu BSWAPMASK, XMMWORD PTR[Lbswap_mask] |
|
498 mov KS, [16*16 + 3*16 + Gctx] |
|
499 mov NR, [4 + KS] |
|
500 lea KS, [48 + KS] |
|
501 |
|
502 vpshufb CTR0, CTR0, BSWAPMASK |
|
503 |
|
504 mov aluCTR, [16*16 + 2*16 + 3*4 + Gctx] |
|
505 mov aluKSl, [3*4 + KS] |
|
506 bswap aluCTR |
|
507 bswap aluKSl |
|
508 |
|
509 vmovdqu TMP0, XMMWORD PTR[0*16 + KS] |
|
510 vpxor TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx] |
|
511 vmovdqu XMMWORD PTR[8*16 + 0*16 + rsp], TMP0 |
|
512 |
|
513 cmp len, 128 |
|
514 jb LEncDataSingles |
|
515 ; Prepare the "top" counters |
|
516 vmovdqu XMMWORD PTR[8*16 + 1*16 + rsp], TMP0 |
|
517 vmovdqu XMMWORD PTR[8*16 + 2*16 + rsp], TMP0 |
|
518 vmovdqu XMMWORD PTR[8*16 + 3*16 + rsp], TMP0 |
|
519 vmovdqu XMMWORD PTR[8*16 + 4*16 + rsp], TMP0 |
|
520 vmovdqu XMMWORD PTR[8*16 + 5*16 + rsp], TMP0 |
|
521 vmovdqu XMMWORD PTR[8*16 + 6*16 + rsp], TMP0 |
|
522 vmovdqu XMMWORD PTR[8*16 + 7*16 + rsp], TMP0 |
|
523 |
|
524 ; Encrypt the initial 8 blocks |
|
525 sub len, 128 |
|
526 vpaddd CTR1, CTR0, XMMWORD PTR[Lone] |
|
527 vpaddd CTR2, CTR0, XMMWORD PTR[Ltwo] |
|
528 vpaddd CTR3, CTR2, XMMWORD PTR[Lone] |
|
529 vpaddd CTR4, CTR2, XMMWORD PTR[Ltwo] |
|
530 vpaddd CTR5, CTR4, XMMWORD PTR[Lone] |
|
531 vpaddd CTR6, CTR4, XMMWORD PTR[Ltwo] |
|
532 vpaddd CTR7, CTR6, XMMWORD PTR[Lone] |
|
533 |
|
534 vpshufb CTR0, CTR0, BSWAPMASK |
|
535 vpshufb CTR1, CTR1, BSWAPMASK |
|
536 vpshufb CTR2, CTR2, BSWAPMASK |
|
537 vpshufb CTR3, CTR3, BSWAPMASK |
|
538 vpshufb CTR4, CTR4, BSWAPMASK |
|
539 vpshufb CTR5, CTR5, BSWAPMASK |
|
540 vpshufb CTR6, CTR6, BSWAPMASK |
|
541 vpshufb CTR7, CTR7, BSWAPMASK |
|
542 |
|
543 vmovdqu TMP3, XMMWORD PTR[0*16 + KS] |
|
544 vpxor CTR0, CTR0, TMP3 |
|
545 vpxor CTR1, CTR1, TMP3 |
|
546 vpxor CTR2, CTR2, TMP3 |
|
547 vpxor CTR3, CTR3, TMP3 |
|
548 vpxor CTR4, CTR4, TMP3 |
|
549 vpxor CTR5, CTR5, TMP3 |
|
550 vpxor CTR6, CTR6, TMP3 |
|
551 vpxor CTR7, CTR7, TMP3 |
|
552 |
|
553 ROUND 1 |
|
554 |
|
555 add aluCTR, 8 |
|
556 mov aluTMP, aluCTR |
|
557 xor aluTMP, aluKSl |
|
558 bswap aluTMP |
|
559 mov [8*16 + 0*16 + 3*4 + rsp], aluTMP |
|
560 |
|
561 ROUND 2 |
|
562 NEXTCTR 1 |
|
563 ROUND 3 |
|
564 NEXTCTR 2 |
|
565 ROUND 4 |
|
566 NEXTCTR 3 |
|
567 ROUND 5 |
|
568 NEXTCTR 4 |
|
569 ROUND 6 |
|
570 NEXTCTR 5 |
|
571 ROUND 7 |
|
572 NEXTCTR 6 |
|
573 ROUND 8 |
|
574 NEXTCTR 7 |
|
575 ROUND 9 |
|
576 vmovdqu TMP5, XMMWORD PTR[10*16 + KS] |
|
577 cmp NR, 10 |
|
578 je @f |
|
579 |
|
580 ROUND 10 |
|
581 ROUND 11 |
|
582 vmovdqu TMP5, XMMWORD PTR[12*16 + KS] |
|
583 cmp NR, 12 |
|
584 je @f |
|
585 |
|
586 ROUND 12 |
|
587 ROUND 13 |
|
588 vmovdqu TMP5, XMMWORD PTR[14*16 + KS] |
|
589 @@: |
|
590 vpxor TMP3, TMP5, XMMWORD PTR[0*16 + PT] |
|
591 vaesenclast CTR0, CTR0, TMP3 |
|
592 vpxor TMP3, TMP5, XMMWORD PTR[1*16 + PT] |
|
593 vaesenclast CTR1, CTR1, TMP3 |
|
594 vpxor TMP3, TMP5, XMMWORD PTR[2*16 + PT] |
|
595 vaesenclast CTR2, CTR2, TMP3 |
|
596 vpxor TMP3, TMP5, XMMWORD PTR[3*16 + PT] |
|
597 vaesenclast CTR3, CTR3, TMP3 |
|
598 vpxor TMP3, TMP5, XMMWORD PTR[4*16 + PT] |
|
599 vaesenclast CTR4, CTR4, TMP3 |
|
600 vpxor TMP3, TMP5, XMMWORD PTR[5*16 + PT] |
|
601 vaesenclast CTR5, CTR5, TMP3 |
|
602 vpxor TMP3, TMP5, XMMWORD PTR[6*16 + PT] |
|
603 vaesenclast CTR6, CTR6, TMP3 |
|
604 vpxor TMP3, TMP5, XMMWORD PTR[7*16 + PT] |
|
605 vaesenclast CTR7, CTR7, TMP3 |
|
606 |
|
607 vmovdqu XMMWORD PTR[0*16 + CT], CTR0 |
|
608 vpshufb CTR0, CTR0, BSWAPMASK |
|
609 vmovdqu XMMWORD PTR[1*16 + CT], CTR1 |
|
610 vpshufb CTR1, CTR1, BSWAPMASK |
|
611 vmovdqu XMMWORD PTR[2*16 + CT], CTR2 |
|
612 vpshufb CTR2, CTR2, BSWAPMASK |
|
613 vmovdqu XMMWORD PTR[3*16 + CT], CTR3 |
|
614 vpshufb CTR3, CTR3, BSWAPMASK |
|
615 vmovdqu XMMWORD PTR[4*16 + CT], CTR4 |
|
616 vpshufb CTR4, CTR4, BSWAPMASK |
|
617 vmovdqu XMMWORD PTR[5*16 + CT], CTR5 |
|
618 vpshufb CTR5, CTR5, BSWAPMASK |
|
619 vmovdqu XMMWORD PTR[6*16 + CT], CTR6 |
|
620 vpshufb CTR6, CTR6, BSWAPMASK |
|
621 vmovdqu XMMWORD PTR[7*16 + CT], CTR7 |
|
622 vpshufb TMP5, CTR7, BSWAPMASK |
|
623 |
|
624 vmovdqa XMMWORD PTR[1*16 + rsp], CTR6 |
|
625 vmovdqa XMMWORD PTR[2*16 + rsp], CTR5 |
|
626 vmovdqa XMMWORD PTR[3*16 + rsp], CTR4 |
|
627 vmovdqa XMMWORD PTR[4*16 + rsp], CTR3 |
|
628 vmovdqa XMMWORD PTR[5*16 + rsp], CTR2 |
|
629 vmovdqa XMMWORD PTR[6*16 + rsp], CTR1 |
|
630 vmovdqa XMMWORD PTR[7*16 + rsp], CTR0 |
|
631 |
|
632 lea CT, [8*16 + CT] |
|
633 lea PT, [8*16 + PT] |
|
634 jmp LEncDataOctets |
|
635 |
|
636 LEncDataOctets: |
|
637 cmp len, 128 |
|
638 jb LEndEncOctets |
|
639 sub len, 128 |
|
640 |
|
641 vmovdqa CTR0, XMMWORD PTR[8*16 + 0*16 + rsp] |
|
642 vmovdqa CTR1, XMMWORD PTR[8*16 + 1*16 + rsp] |
|
643 vmovdqa CTR2, XMMWORD PTR[8*16 + 2*16 + rsp] |
|
644 vmovdqa CTR3, XMMWORD PTR[8*16 + 3*16 + rsp] |
|
645 vmovdqa CTR4, XMMWORD PTR[8*16 + 4*16 + rsp] |
|
646 vmovdqa CTR5, XMMWORD PTR[8*16 + 5*16 + rsp] |
|
647 vmovdqa CTR6, XMMWORD PTR[8*16 + 6*16 + rsp] |
|
648 vmovdqa CTR7, XMMWORD PTR[8*16 + 7*16 + rsp] |
|
649 |
|
650 vpshufd TMP4, TMP5, 78 |
|
651 vpxor TMP4, TMP4, TMP5 |
|
652 vpclmulqdq TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h |
|
653 vmovdqu TMP4, XMMWORD PTR[0*16 + Htbl] |
|
654 vpclmulqdq TMP1, TMP5, TMP4, 011h |
|
655 vpclmulqdq TMP2, TMP5, TMP4, 000h |
|
656 |
|
657 vmovdqu TMP5, XMMWORD PTR[1*16 + rsp] |
|
658 ROUNDMUL 1 |
|
659 NEXTCTR 0 |
|
660 vmovdqu TMP5, XMMWORD PTR[2*16 + rsp] |
|
661 ROUNDMUL 2 |
|
662 NEXTCTR 1 |
|
663 vmovdqu TMP5, XMMWORD PTR[3*16 + rsp] |
|
664 ROUNDMUL 3 |
|
665 NEXTCTR 2 |
|
666 vmovdqu TMP5, XMMWORD PTR[4*16 + rsp] |
|
667 ROUNDMUL 4 |
|
668 NEXTCTR 3 |
|
669 vmovdqu TMP5, XMMWORD PTR[5*16 + rsp] |
|
670 ROUNDMUL 5 |
|
671 NEXTCTR 4 |
|
672 vmovdqu TMP5, XMMWORD PTR[6*16 + rsp] |
|
673 ROUNDMUL 6 |
|
674 NEXTCTR 5 |
|
675 vpxor TMP5, T, XMMWORD PTR[7*16 + rsp] |
|
676 ROUNDMUL 7 |
|
677 NEXTCTR 6 |
|
678 |
|
679 ROUND 8 |
|
680 NEXTCTR 7 |
|
681 |
|
682 vpxor TMP0, TMP0, TMP1 |
|
683 vpxor TMP0, TMP0, TMP2 |
|
684 vpsrldq TMP3, TMP0, 8 |
|
685 vpxor TMP4, TMP1, TMP3 |
|
686 vpslldq TMP3, TMP0, 8 |
|
687 vpxor T, TMP2, TMP3 |
|
688 |
|
689 vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h |
|
690 vpalignr T,T,T,8 |
|
691 vpxor T, T, TMP1 |
|
692 |
|
693 ROUND 9 |
|
694 |
|
695 vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h |
|
696 vpalignr T,T,T,8 |
|
697 vpxor T, T, TMP1 |
|
698 |
|
699 vmovdqu TMP5, XMMWORD PTR[10*16 + KS] |
|
700 cmp NR, 10 |
|
701 je @f |
|
702 |
|
703 ROUND 10 |
|
704 ROUND 11 |
|
705 vmovdqu TMP5, XMMWORD PTR[12*16 + KS] |
|
706 cmp NR, 12 |
|
707 je @f |
|
708 |
|
709 ROUND 12 |
|
710 ROUND 13 |
|
711 vmovdqu TMP5, XMMWORD PTR[14*16 + KS] |
|
712 @@: |
|
713 vpxor TMP3, TMP5, XMMWORD PTR[0*16 + PT] |
|
714 vaesenclast CTR0, CTR0, TMP3 |
|
715 vpxor TMP3, TMP5, XMMWORD PTR[1*16 + PT] |
|
716 vaesenclast CTR1, CTR1, TMP3 |
|
717 vpxor TMP3, TMP5, XMMWORD PTR[2*16 + PT] |
|
718 vaesenclast CTR2, CTR2, TMP3 |
|
719 vpxor TMP3, TMP5, XMMWORD PTR[3*16 + PT] |
|
720 vaesenclast CTR3, CTR3, TMP3 |
|
721 vpxor TMP3, TMP5, XMMWORD PTR[4*16 + PT] |
|
722 vaesenclast CTR4, CTR4, TMP3 |
|
723 vpxor TMP3, TMP5, XMMWORD PTR[5*16 + PT] |
|
724 vaesenclast CTR5, CTR5, TMP3 |
|
725 vpxor TMP3, TMP5, XMMWORD PTR[6*16 + PT] |
|
726 vaesenclast CTR6, CTR6, TMP3 |
|
727 vpxor TMP3, TMP5, XMMWORD PTR[7*16 + PT] |
|
728 vaesenclast CTR7, CTR7, TMP3 |
|
729 |
|
730 vmovdqu XMMWORD PTR[0*16 + CT], CTR0 |
|
731 vpshufb CTR0, CTR0, BSWAPMASK |
|
732 vmovdqu XMMWORD PTR[1*16 + CT], CTR1 |
|
733 vpshufb CTR1, CTR1, BSWAPMASK |
|
734 vmovdqu XMMWORD PTR[2*16 + CT], CTR2 |
|
735 vpshufb CTR2, CTR2, BSWAPMASK |
|
736 vmovdqu XMMWORD PTR[3*16 + CT], CTR3 |
|
737 vpshufb CTR3, CTR3, BSWAPMASK |
|
738 vmovdqu XMMWORD PTR[4*16 + CT], CTR4 |
|
739 vpshufb CTR4, CTR4, BSWAPMASK |
|
740 vmovdqu XMMWORD PTR[5*16 + CT], CTR5 |
|
741 vpshufb CTR5, CTR5, BSWAPMASK |
|
742 vmovdqu XMMWORD PTR[6*16 + CT], CTR6 |
|
743 vpshufb CTR6, CTR6, BSWAPMASK |
|
744 vmovdqu XMMWORD PTR[7*16 + CT], CTR7 |
|
745 vpshufb TMP5, CTR7, BSWAPMASK |
|
746 |
|
747 vmovdqa XMMWORD PTR[1*16 + rsp], CTR6 |
|
748 vmovdqa XMMWORD PTR[2*16 + rsp], CTR5 |
|
749 vmovdqa XMMWORD PTR[3*16 + rsp], CTR4 |
|
750 vmovdqa XMMWORD PTR[4*16 + rsp], CTR3 |
|
751 vmovdqa XMMWORD PTR[5*16 + rsp], CTR2 |
|
752 vmovdqa XMMWORD PTR[6*16 + rsp], CTR1 |
|
753 vmovdqa XMMWORD PTR[7*16 + rsp], CTR0 |
|
754 |
|
755 vpxor T, T, TMP4 |
|
756 |
|
757 lea CT, [8*16 + CT] |
|
758 lea PT, [8*16 + PT] |
|
759 jmp LEncDataOctets |
|
760 |
|
761 LEndEncOctets: |
|
762 |
|
763 vpshufd TMP4, TMP5, 78 |
|
764 vpxor TMP4, TMP4, TMP5 |
|
765 vpclmulqdq TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h |
|
766 vmovdqu TMP4, XMMWORD PTR[0*16 + Htbl] |
|
767 vpclmulqdq TMP1, TMP5, TMP4, 011h |
|
768 vpclmulqdq TMP2, TMP5, TMP4, 000h |
|
769 |
|
770 vmovdqu TMP5, XMMWORD PTR[1*16 + rsp] |
|
771 KARATSUBA 1 |
|
772 vmovdqu TMP5, XMMWORD PTR[2*16 + rsp] |
|
773 KARATSUBA 2 |
|
774 vmovdqu TMP5, XMMWORD PTR[3*16 + rsp] |
|
775 KARATSUBA 3 |
|
776 vmovdqu TMP5, XMMWORD PTR[4*16 + rsp] |
|
777 KARATSUBA 4 |
|
778 vmovdqu TMP5, XMMWORD PTR[5*16 + rsp] |
|
779 KARATSUBA 5 |
|
780 vmovdqu TMP5, XMMWORD PTR[6*16 + rsp] |
|
781 KARATSUBA 6 |
|
782 vpxor TMP5, T, XMMWORD PTR[7*16 + rsp] |
|
783 KARATSUBA 7 |
|
784 |
|
785 vpxor TMP0, TMP0, TMP1 |
|
786 vpxor TMP0, TMP0, TMP2 |
|
787 vpsrldq TMP3, TMP0, 8 |
|
788 vpxor TMP4, TMP1, TMP3 |
|
789 vpslldq TMP3, TMP0, 8 |
|
790 vpxor T, TMP2, TMP3 |
|
791 |
|
792 vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h |
|
793 vpalignr T,T,T,8 |
|
794 vpxor T, T, TMP1 |
|
795 |
|
796 vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h |
|
797 vpalignr T,T,T,8 |
|
798 vpxor T, T, TMP1 |
|
799 |
|
800 vpxor T, T, TMP4 |
|
801 |
|
802 sub aluCTR, 7 |
|
803 |
|
804 LEncDataSingles: |
|
805 |
|
806 cmp len, 16 |
|
807 jb LEncDataTail |
|
808 sub len, 16 |
|
809 |
|
810 vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + rsp] |
|
811 NEXTCTR 0 |
|
812 |
|
813 vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS] |
|
814 vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS] |
|
815 vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS] |
|
816 vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS] |
|
817 vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS] |
|
818 vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS] |
|
819 vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS] |
|
820 vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS] |
|
821 vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS] |
|
822 vmovdqu TMP2, XMMWORD PTR[10*16 + KS] |
|
823 cmp NR, 10 |
|
824 je @f |
|
825 vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS] |
|
826 vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS] |
|
827 vmovdqu TMP2, XMMWORD PTR[12*16 + KS] |
|
828 cmp NR, 12 |
|
829 je @f |
|
830 vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS] |
|
831 vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS] |
|
832 vmovdqu TMP2, XMMWORD PTR[14*16 + KS] |
|
833 @@: |
|
834 vaesenclast TMP1, TMP1, TMP2 |
|
835 vpxor TMP1, TMP1, XMMWORD PTR[PT] |
|
836 vmovdqu XMMWORD PTR[CT], TMP1 |
|
837 |
|
838 lea PT, [16+PT] |
|
839 lea CT, [16+CT] |
|
840 |
|
841 vpshufb TMP1, TMP1, BSWAPMASK |
|
842 vpxor T, T, TMP1 |
|
843 vmovdqu TMP0, XMMWORD PTR[Htbl] |
|
844 GFMUL T, T, TMP0, TMP1, TMP2, TMP3, TMP4 |
|
845 |
|
846 jmp LEncDataSingles |
|
847 |
|
848 LEncDataTail: |
|
849 |
|
850 test len, len |
|
851 jz LEncDataEnd |
|
852 |
|
853 vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + rsp] |
|
854 |
|
855 vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS] |
|
856 vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS] |
|
857 vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS] |
|
858 vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS] |
|
859 vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS] |
|
860 vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS] |
|
861 vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS] |
|
862 vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS] |
|
863 vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS] |
|
864 vmovdqu TMP2, XMMWORD PTR[10*16 + KS] |
|
865 cmp NR, 10 |
|
866 je @f |
|
867 vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS] |
|
868 vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS] |
|
869 vmovdqu TMP2, XMMWORD PTR[12*16 + KS] |
|
870 cmp NR, 12 |
|
871 je @f |
|
872 vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS] |
|
873 vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS] |
|
874 vmovdqu TMP2, XMMWORD PTR[14*16 + KS] |
|
875 @@: |
|
876 vaesenclast TMP1, TMP1, TMP2 |
|
877 ; zero a temp location |
|
878 vpxor TMP2, TMP2, TMP2 |
|
879 vmovdqa XMMWORD PTR[rsp], TMP2 |
|
880 ; copy as many bytes as needed |
|
881 xor KS, KS |
|
882 |
|
883 @@: |
|
884 cmp len, KS |
|
885 je @f |
|
886 mov al, [PT + KS] |
|
887 mov [rsp + KS], al |
|
888 inc KS |
|
889 jmp @b |
|
890 @@: |
|
891 vpxor TMP1, TMP1, XMMWORD PTR[rsp] |
|
892 vmovdqa XMMWORD PTR[rsp], TMP1 |
|
893 xor KS, KS |
|
894 @@: |
|
895 cmp len, KS |
|
896 je @f |
|
897 mov al, [rsp + KS] |
|
898 mov [CT + KS], al |
|
899 inc KS |
|
900 jmp @b |
|
901 @@: |
|
902 cmp KS, 16 |
|
903 je @f |
|
904 mov BYTE PTR[rsp + KS], 0 |
|
905 inc KS |
|
906 jmp @b |
|
907 @@: |
|
908 BAIL: |
|
909 vmovdqa TMP1, XMMWORD PTR[rsp] |
|
910 vpshufb TMP1, TMP1, BSWAPMASK |
|
911 vpxor T, T, TMP1 |
|
912 vmovdqu TMP0, XMMWORD PTR[Htbl] |
|
913 GFMUL T, T, TMP0, TMP1, TMP2, TMP3, TMP4 |
|
914 |
|
915 LEncDataEnd: |
|
916 |
|
917 vmovdqu XMMWORD PTR[16*16 + 1*16 + Gctx], T |
|
918 bswap aluCTR |
|
919 mov [16*16 + 2*16 + 3*4 + Gctx], aluCTR |
|
920 |
|
921 mov rsp, rbp |
|
922 |
|
923 vmovdqu xmm6, XMMWORD PTR[rsp + 0*16] |
|
924 vmovdqu xmm7, XMMWORD PTR[rsp + 1*16] |
|
925 vmovdqu xmm8, XMMWORD PTR[rsp + 2*16] |
|
926 vmovdqu xmm9, XMMWORD PTR[rsp + 3*16] |
|
927 vmovdqu xmm10, XMMWORD PTR[rsp + 4*16] |
|
928 vmovdqu xmm11, XMMWORD PTR[rsp + 5*16] |
|
929 vmovdqu xmm12, XMMWORD PTR[rsp + 6*16] |
|
930 vmovdqu xmm13, XMMWORD PTR[rsp + 7*16] |
|
931 vmovdqu xmm14, XMMWORD PTR[rsp + 8*16] |
|
932 vmovdqu xmm15, XMMWORD PTR[rsp + 9*16] |
|
933 |
|
934 add rsp, 10*16 |
|
935 pop rbp |
|
936 pop r13 |
|
937 pop r12 |
|
938 pop r11 |
|
939 |
|
940 vzeroupper |
|
941 |
|
942 ret |
|
943 intel_aes_gcmENC ENDP |
|
944 |
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Decrypt and Authenticate
; void intel_aes_gcmDEC(uint8_t* PT, uint8_t* CT, void *Gctx, unsigned int len);
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
951 |
|
952 ALIGN 16 |
|
953 intel_aes_gcmDEC PROC |
|
954 |
|
955 NEXTCTR MACRO i |
|
956 add aluCTR, 1 |
|
957 mov aluTMP, aluCTR |
|
958 xor aluTMP, aluKSl |
|
959 bswap aluTMP |
|
960 mov [3*4 + i*16 + rsp], aluTMP |
|
961 ENDM |
|
962 |
|
963 PT textequ <rdx> |
|
964 CT textequ <rcx> |
|
965 |
|
966 test len, len |
|
967 jnz LbeginDEC |
|
968 ret |
|
969 |
|
970 LbeginDEC: |
|
971 |
|
972 vzeroupper |
|
973 push r11 |
|
974 push r12 |
|
975 push r13 |
|
976 push rbp |
|
977 sub rsp, 10*16 |
|
978 vmovdqu XMMWORD PTR[rsp + 0*16], xmm6 |
|
979 vmovdqu XMMWORD PTR[rsp + 1*16], xmm7 |
|
980 vmovdqu XMMWORD PTR[rsp + 2*16], xmm8 |
|
981 vmovdqu XMMWORD PTR[rsp + 3*16], xmm9 |
|
982 vmovdqu XMMWORD PTR[rsp + 4*16], xmm10 |
|
983 vmovdqu XMMWORD PTR[rsp + 5*16], xmm11 |
|
984 vmovdqu XMMWORD PTR[rsp + 6*16], xmm12 |
|
985 vmovdqu XMMWORD PTR[rsp + 7*16], xmm13 |
|
986 vmovdqu XMMWORD PTR[rsp + 8*16], xmm14 |
|
987 vmovdqu XMMWORD PTR[rsp + 9*16], xmm15 |
|
988 |
|
989 mov rbp, rsp |
|
990 sub rsp, 8*16 |
|
991 and rsp, -16 |
|
992 |
|
993 vmovdqu T, XMMWORD PTR[16*16 + 1*16 + Gctx] |
|
994 vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx] |
|
995 vmovdqu BSWAPMASK, XMMWORD PTR[Lbswap_mask] |
|
996 mov KS, [16*16 + 3*16 + Gctx] |
|
997 mov NR, [4 + KS] |
|
998 lea KS, [48 + KS] |
|
999 |
|
1000 vpshufb CTR0, CTR0, BSWAPMASK |
|
1001 |
|
1002 mov aluCTR, [16*16 + 2*16 + 3*4 + Gctx] |
|
1003 mov aluKSl, [3*4 + KS] |
|
1004 bswap aluCTR |
|
1005 bswap aluKSl |
|
1006 |
|
1007 vmovdqu TMP0, XMMWORD PTR[0*16 + KS] |
|
1008 vpxor TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx] |
|
1009 vmovdqu XMMWORD PTR[0*16 + rsp], TMP0 |
|
1010 |
|
1011 cmp len, 128 |
|
1012 jb LDecDataSingles |
|
1013 ; Prepare the "top" counters |
|
1014 vmovdqu XMMWORD PTR[1*16 + rsp], TMP0 |
|
1015 vmovdqu XMMWORD PTR[2*16 + rsp], TMP0 |
|
1016 vmovdqu XMMWORD PTR[3*16 + rsp], TMP0 |
|
1017 vmovdqu XMMWORD PTR[4*16 + rsp], TMP0 |
|
1018 vmovdqu XMMWORD PTR[5*16 + rsp], TMP0 |
|
1019 vmovdqu XMMWORD PTR[6*16 + rsp], TMP0 |
|
1020 vmovdqu XMMWORD PTR[7*16 + rsp], TMP0 |
|
1021 |
|
1022 NEXTCTR 1 |
|
1023 NEXTCTR 2 |
|
1024 NEXTCTR 3 |
|
1025 NEXTCTR 4 |
|
1026 NEXTCTR 5 |
|
1027 NEXTCTR 6 |
|
1028 NEXTCTR 7 |
|
1029 |
|
;-------------------------------------------------------------------------------
; Main decrypt loop: processes 8 x 16-byte blocks per iteration.
; Because GCM decryption GHASHes the *ciphertext*, the GHASH of the 8 input
; blocks can be computed fully in parallel with the AES-CTR keystream
; generation for those same blocks; the ROUNDMUL i macro (defined earlier,
; outside this view) interleaves one AES round on CTR0..CTR7 with one
; Karatsuba multiply-accumulate against a power of H from Htbl.
; Accumulators: TMP1 = high products, TMP2 = low products, TMP0 = middle
; (Karatsuba) products; T carries the GHASH state across iterations.
;-------------------------------------------------------------------------------
LDecDataOctets:
        cmp     len, 128
        jb      LEndDecOctets           ; fewer than 8 blocks left -> singles
        sub     len, 128

        ; Load the 8 precomputed counter blocks from the stack cache.
        vmovdqa CTR0, XMMWORD PTR[0*16 + rsp]
        vmovdqa CTR1, XMMWORD PTR[1*16 + rsp]
        vmovdqa CTR2, XMMWORD PTR[2*16 + rsp]
        vmovdqa CTR3, XMMWORD PTR[3*16 + rsp]
        vmovdqa CTR4, XMMWORD PTR[4*16 + rsp]
        vmovdqa CTR5, XMMWORD PTR[5*16 + rsp]
        vmovdqa CTR6, XMMWORD PTR[6*16 + rsp]
        vmovdqa CTR7, XMMWORD PTR[7*16 + rsp]

        ; First multiply done inline: last ciphertext block (index 7) times the
        ; highest power of H. vpshufd ..,78 swaps the two qwords; XORing the
        ; swap with the value forms the Karatsuba middle operand (hi^lo).
        vmovdqu TMP5, XMMWORD PTR[7*16 + CT]
        vpshufb TMP5, TMP5, BSWAPMASK   ; to big-endian GHASH bit order
        vpshufd TMP4, TMP5, 78
        vpxor   TMP4, TMP4, TMP5
        vpclmulqdq TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h  ; middle
        vmovdqu TMP4, XMMWORD PTR[0*16 + Htbl]
        vpclmulqdq TMP1, TMP5, TMP4, 011h                             ; high
        vpclmulqdq TMP2, TMP5, TMP4, 000h                             ; low

        ; Blocks 6..0: each ROUNDMUL folds one ciphertext block into the
        ; accumulators while performing one AES round on all 8 counters;
        ; NEXTCTR refreshes the stack slot for the next loop iteration.
        vmovdqu TMP5, XMMWORD PTR[6*16 + CT]
        vpshufb TMP5, TMP5, BSWAPMASK
        ROUNDMUL 1
        NEXTCTR 0
        vmovdqu TMP5, XMMWORD PTR[5*16 + CT]
        vpshufb TMP5, TMP5, BSWAPMASK
        ROUNDMUL 2
        NEXTCTR 1
        vmovdqu TMP5, XMMWORD PTR[4*16 + CT]
        vpshufb TMP5, TMP5, BSWAPMASK
        ROUNDMUL 3
        NEXTCTR 2
        vmovdqu TMP5, XMMWORD PTR[3*16 + CT]
        vpshufb TMP5, TMP5, BSWAPMASK
        ROUNDMUL 4
        NEXTCTR 3
        vmovdqu TMP5, XMMWORD PTR[2*16 + CT]
        vpshufb TMP5, TMP5, BSWAPMASK
        ROUNDMUL 5
        NEXTCTR 4
        vmovdqu TMP5, XMMWORD PTR[1*16 + CT]
        vpshufb TMP5, TMP5, BSWAPMASK
        ROUNDMUL 6
        NEXTCTR 5
        ; First block also absorbs the running GHASH state T before the multiply.
        vmovdqu TMP5, XMMWORD PTR[0*16 + CT]
        vpshufb TMP5, TMP5, BSWAPMASK
        vpxor   TMP5, TMP5, T
        ROUNDMUL 7
        NEXTCTR 6

        ROUND 8                         ; plain AES round (no multiply left)
        NEXTCTR 7

        ; Aggregate the Karatsuba terms: complete the middle term, then split
        ; the 256-bit product into TMP4 (high 128) and T (low 128).
        vpxor   TMP0, TMP0, TMP1
        vpxor   TMP0, TMP0, TMP2
        vpsrldq TMP3, TMP0, 8
        vpxor   TMP4, TMP1, TMP3
        vpslldq TMP3, TMP0, 8
        vpxor   T, TMP2, TMP3

        ; First of two reduction steps modulo the GCM polynomial (Lpoly);
        ; vpalignr ..,8 rotates the qwords to line up the folded halves.
        vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h
        vpalignr T,T,T,8
        vpxor   T, T, TMP1

        ROUND 9                         ; AES round interleaved to hide latency

        ; Second reduction step.
        vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h
        vpalignr T,T,T,8
        vpxor   T, T, TMP1

        ; Select the final round key by key size (NR = 10/12/14 rounds).
        vmovdqu TMP5, XMMWORD PTR[10*16 + KS]
        cmp     NR, 10
        je      @f

        ROUND 10
        ROUND 11
        vmovdqu TMP5, XMMWORD PTR[12*16 + KS]
        cmp     NR, 12
        je      @f

        ROUND 12
        ROUND 13
        vmovdqu TMP5, XMMWORD PTR[14*16 + KS]
@@:
        ; vaesenclast with (last key ^ ciphertext) merges the final AES round
        ; and the CTR-mode XOR into a single instruction per block.
        vpxor   TMP3, TMP5, XMMWORD PTR[0*16 + CT]
        vaesenclast CTR0, CTR0, TMP3
        vpxor   TMP3, TMP5, XMMWORD PTR[1*16 + CT]
        vaesenclast CTR1, CTR1, TMP3
        vpxor   TMP3, TMP5, XMMWORD PTR[2*16 + CT]
        vaesenclast CTR2, CTR2, TMP3
        vpxor   TMP3, TMP5, XMMWORD PTR[3*16 + CT]
        vaesenclast CTR3, CTR3, TMP3
        vpxor   TMP3, TMP5, XMMWORD PTR[4*16 + CT]
        vaesenclast CTR4, CTR4, TMP3
        vpxor   TMP3, TMP5, XMMWORD PTR[5*16 + CT]
        vaesenclast CTR5, CTR5, TMP3
        vpxor   TMP3, TMP5, XMMWORD PTR[6*16 + CT]
        vaesenclast CTR6, CTR6, TMP3
        vpxor   TMP3, TMP5, XMMWORD PTR[7*16 + CT]
        vaesenclast CTR7, CTR7, TMP3

        ; Write out the 8 recovered plaintext blocks.
        vmovdqu XMMWORD PTR[0*16 + PT], CTR0
        vmovdqu XMMWORD PTR[1*16 + PT], CTR1
        vmovdqu XMMWORD PTR[2*16 + PT], CTR2
        vmovdqu XMMWORD PTR[3*16 + PT], CTR3
        vmovdqu XMMWORD PTR[4*16 + PT], CTR4
        vmovdqu XMMWORD PTR[5*16 + PT], CTR5
        vmovdqu XMMWORD PTR[6*16 + PT], CTR6
        vmovdqu XMMWORD PTR[7*16 + PT], CTR7

        vpxor   T, T, TMP4              ; fold the high half into the GHASH state

        lea     CT, [8*16 + CT]
        lea     PT, [8*16 + PT]
        jmp     LDecDataOctets
|
1148 |
|
LEndDecOctets:

        ; The octet loop left the scalar counter 7 ahead of the next unused
        ; counter block (slots were pre-incremented); pull it back in line.
        sub     aluCTR, 7

;-------------------------------------------------------------------------------
; Single-block loop: decrypt one 16-byte block at a time until < 16 bytes remain.
;-------------------------------------------------------------------------------
LDecDataSingles:

        cmp     len, 16
        jb      LDecDataTail
        sub     len, 16

        ; Keystream: encrypt the cached counter block from slot 0, then advance it.
        vmovdqa TMP1, XMMWORD PTR[0*16 + rsp]
        NEXTCTR 0

        vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
        ; Dispatch on key size: NR is 10 (AES-128), 12 (AES-192) or 14 (AES-256);
        ; TMP2 ends up holding the final round key in every case.
        vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
        cmp     NR, 10
        je      @f
        vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
        vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
        cmp     NR, 12
        je      @f
        vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
        vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
        vaesenclast TMP1, TMP1, TMP2

        ; Decrypt: plaintext = keystream ^ ciphertext.
        vmovdqu TMP2, XMMWORD PTR[CT]
        vpxor   TMP1, TMP1, TMP2
        vmovdqu XMMWORD PTR[PT], TMP1

        lea     PT, [16+PT]
        lea     CT, [16+CT]

        ; GHASH the ciphertext block (TMP2 still holds it) into T:
        ; T = (T ^ byteswap(C)) * H mod Lpoly, via the GFMUL macro.
        vpshufb TMP2, TMP2, BSWAPMASK
        vpxor   T, T, TMP2
        vmovdqu TMP0, XMMWORD PTR[Htbl]
        GFMUL   T, T, TMP0, TMP1, TMP2, TMP3, TMP4

        jmp     LDecDataSingles
|
1198 |
|
;-------------------------------------------------------------------------------
; Tail: handle the final partial block (1..15 bytes). The leftover ciphertext
; is copied into a zero-padded 16-byte stack buffer, GHASHed as a full block,
; XORed with the keystream, and only `len` plaintext bytes are written out.
;-------------------------------------------------------------------------------
LDecDataTail:

        test    len, len
        jz      LDecDataEnd             ; no partial block -> done

        ; Generate one more keystream block from counter slot 0.
        vmovdqa TMP1, XMMWORD PTR[0*16 + rsp]
        inc     aluCTR                  ; account for the consumed counter
        vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
        ; Same NR = 10/12/14 dispatch as the singles loop.
        vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
        cmp     NR, 10
        je      @f
        vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
        vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
        cmp     NR, 12
        je      @f
        vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
        vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
        vaesenclast TMP1, TMP1, TMP2
; copy as many bytes as needed
        ; The key schedule is no longer needed, so the KS register is reused
        ; as a byte index for the copy loops below.
        xor     KS, KS
@@:
        ; Copy the `len` remaining ciphertext bytes into the stack buffer...
        cmp     len, KS
        je      @f
        mov     al, [CT + KS]
        mov     [rsp + KS], al
        inc     KS
        jmp     @b
@@:
        ; ...and zero-pad it to a full 16 bytes, as GHASH requires.
        cmp     KS, 16
        je      @f
        mov     BYTE PTR[rsp + KS], 0
        inc     KS
        jmp     @b
@@:
        ; GHASH the padded ciphertext block. Note: GFMUL's temporaries here are
        ; TMP5/TMP2/TMP3/TMP4 so the keystream in TMP1 survives the call.
        vmovdqa TMP2, XMMWORD PTR[rsp]
        vpshufb TMP2, TMP2, BSWAPMASK
        vpxor   T, T, TMP2
        vmovdqu TMP0, XMMWORD PTR[Htbl]
        GFMUL   T, T, TMP0, TMP5, TMP2, TMP3, TMP4


        ; Decrypt in place on the stack, then copy only `len` plaintext bytes out.
        vpxor   TMP1, TMP1, XMMWORD PTR[rsp]
        vmovdqa XMMWORD PTR[rsp], TMP1
        xor     KS, KS
@@:
        cmp     len, KS
        je      @f
        mov     al, [rsp + KS]
        mov     [PT + KS], al
        inc     KS
        jmp     @b
@@:
|
1262 |
|
;-------------------------------------------------------------------------------
; Epilogue: persist the GCM state back into the context, restore callee-saved
; registers, and return.
; Fix: the original ended with two consecutive `ret` instructions; the second
; was unreachable dead code and has been removed.
;-------------------------------------------------------------------------------
LDecDataEnd:

        ; Store the GHASH accumulator T and the updated 32-bit big-endian
        ; counter back into the gcm context (offsets match the layout used by
        ; the rest of this file: T at Gctx+16*16+1*16, counter in the last
        ; dword of the counter block at Gctx+16*16+2*16+3*4).
        vmovdqu XMMWORD PTR[16*16 + 1*16 + Gctx], T
        bswap   aluCTR
        mov     [16*16 + 2*16 + 3*4 + Gctx], aluCTR

        mov     rsp, rbp                ; drop the counter scratch area

        ; Restore xmm6-xmm15, callee-saved under the Windows x64 ABI, from the
        ; spill area set up in the prologue (outside this view).
        vmovdqu xmm6, XMMWORD PTR[rsp + 0*16]
        vmovdqu xmm7, XMMWORD PTR[rsp + 1*16]
        vmovdqu xmm8, XMMWORD PTR[rsp + 2*16]
        vmovdqu xmm9, XMMWORD PTR[rsp + 3*16]
        vmovdqu xmm10, XMMWORD PTR[rsp + 4*16]
        vmovdqu xmm11, XMMWORD PTR[rsp + 5*16]
        vmovdqu xmm12, XMMWORD PTR[rsp + 6*16]
        vmovdqu xmm13, XMMWORD PTR[rsp + 7*16]
        vmovdqu xmm14, XMMWORD PTR[rsp + 8*16]
        vmovdqu xmm15, XMMWORD PTR[rsp + 9*16]

        add     rsp, 10*16
        ; Pops mirror the prologue's pushes (r11/r12/r13/rbp, presumably --
        ; prologue is outside this view), in reverse order.
        pop     rbp
        pop     r13
        pop     r12
        pop     r11

        vzeroupper                      ; avoid AVX->SSE transition penalties in the caller

        ret

intel_aes_gcmDEC ENDP


END