|
1 ; LICENSE: |
|
; This submission to NSS is to be made available under the terms of the
; Mozilla Public License, v. 2.0. You can obtain one at
; http://mozilla.org/MPL/2.0/.
|
5 ;############################################################################### |
|
6 ; Copyright(c) 2014, Intel Corp. |
|
7 ; Developers and authors: |
|
8 ; Shay Gueron and Vlad Krasnov |
|
9 ; Intel Corporation, Israel Development Centre, Haifa, Israel |
|
10 ; Please send feedback directly to crypto.feedback.alias@intel.com |
|
11 |
|
12 |
|
13 .DATA |
|
14 ALIGN 16 |
|
15 Lmask dd 0c0f0e0dh,0c0f0e0dh,0c0f0e0dh,0c0f0e0dh |
|
16 Lmask192 dd 004070605h, 004070605h, 004070605h, 004070605h |
|
17 Lmask256 dd 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh |
|
18 Lcon1 dd 1,1,1,1 |
|
19 Lcon2 dd 1bh,1bh,1bh,1bh |
|
20 |
|
21 .CODE |
|
22 |
|
23 ctx textequ <rcx> |
|
24 output textequ <rdx> |
|
25 input textequ <r8> |
|
26 inputLen textequ <r9d> |
|
27 |
|
28 |
|
29 aes_rnd MACRO i |
|
30 movdqu xmm8, [i*16 + ctx] |
|
31 aesenc xmm0, xmm8 |
|
32 aesenc xmm1, xmm8 |
|
33 aesenc xmm2, xmm8 |
|
34 aesenc xmm3, xmm8 |
|
35 aesenc xmm4, xmm8 |
|
36 aesenc xmm5, xmm8 |
|
37 aesenc xmm6, xmm8 |
|
38 aesenc xmm7, xmm8 |
|
39 ENDM |
|
40 |
|
41 aes_last_rnd MACRO i |
|
42 movdqu xmm8, [i*16 + ctx] |
|
43 aesenclast xmm0, xmm8 |
|
44 aesenclast xmm1, xmm8 |
|
45 aesenclast xmm2, xmm8 |
|
46 aesenclast xmm3, xmm8 |
|
47 aesenclast xmm4, xmm8 |
|
48 aesenclast xmm5, xmm8 |
|
49 aesenclast xmm6, xmm8 |
|
50 aesenclast xmm7, xmm8 |
|
51 ENDM |
|
52 |
|
53 aes_dec_rnd MACRO i |
|
54 movdqu xmm8, [i*16 + ctx] |
|
55 aesdec xmm0, xmm8 |
|
56 aesdec xmm1, xmm8 |
|
57 aesdec xmm2, xmm8 |
|
58 aesdec xmm3, xmm8 |
|
59 aesdec xmm4, xmm8 |
|
60 aesdec xmm5, xmm8 |
|
61 aesdec xmm6, xmm8 |
|
62 aesdec xmm7, xmm8 |
|
63 ENDM |
|
64 |
|
65 aes_dec_last_rnd MACRO i |
|
66 movdqu xmm8, [i*16 + ctx] |
|
67 aesdeclast xmm0, xmm8 |
|
68 aesdeclast xmm1, xmm8 |
|
69 aesdeclast xmm2, xmm8 |
|
70 aesdeclast xmm3, xmm8 |
|
71 aesdeclast xmm4, xmm8 |
|
72 aesdeclast xmm5, xmm8 |
|
73 aesdeclast xmm6, xmm8 |
|
74 aesdeclast xmm7, xmm8 |
|
75 ENDM |
|
76 |
|
77 |
|
78 gen_aes_ecb_func MACRO enc, rnds |
|
79 |
|
80 LOCAL loop8 |
|
81 LOCAL loop1 |
|
82 LOCAL bail |
|
83 |
|
84 xor inputLen, inputLen |
|
85 mov input, [rsp + 1*8 + 8*4] |
|
86 mov inputLen, [rsp + 1*8 + 8*5] |
|
87 |
|
88 sub rsp, 3*16 |
|
89 |
|
90 movdqu [rsp + 0*16], xmm6 |
|
91 movdqu [rsp + 1*16], xmm7 |
|
92 movdqu [rsp + 2*16], xmm8 |
|
93 |
|
94 lea ctx, [48+ctx] |
|
95 |
|
96 loop8: |
|
97 cmp inputLen, 8*16 |
|
98 jb loop1 |
|
99 |
|
100 movdqu xmm0, [0*16 + input] |
|
101 movdqu xmm1, [1*16 + input] |
|
102 movdqu xmm2, [2*16 + input] |
|
103 movdqu xmm3, [3*16 + input] |
|
104 movdqu xmm4, [4*16 + input] |
|
105 movdqu xmm5, [5*16 + input] |
|
106 movdqu xmm6, [6*16 + input] |
|
107 movdqu xmm7, [7*16 + input] |
|
108 |
|
109 movdqu xmm8, [0*16 + ctx] |
|
110 pxor xmm0, xmm8 |
|
111 pxor xmm1, xmm8 |
|
112 pxor xmm2, xmm8 |
|
113 pxor xmm3, xmm8 |
|
114 pxor xmm4, xmm8 |
|
115 pxor xmm5, xmm8 |
|
116 pxor xmm6, xmm8 |
|
117 pxor xmm7, xmm8 |
|
118 |
|
119 IF enc eq 1 |
|
120 rnd textequ <aes_rnd> |
|
121 lastrnd textequ <aes_last_rnd> |
|
122 aesinst textequ <aesenc> |
|
123 aeslastinst textequ <aesenclast> |
|
124 ELSE |
|
125 rnd textequ <aes_dec_rnd> |
|
126 lastrnd textequ <aes_dec_last_rnd> |
|
127 aesinst textequ <aesdec> |
|
128 aeslastinst textequ <aesdeclast> |
|
129 ENDIF |
|
130 |
|
131 i = 1 |
|
132 WHILE i LT rnds |
|
133 rnd i |
|
134 i = i+1 |
|
135 ENDM |
|
136 lastrnd rnds |
|
137 |
|
138 movdqu [0*16 + output], xmm0 |
|
139 movdqu [1*16 + output], xmm1 |
|
140 movdqu [2*16 + output], xmm2 |
|
141 movdqu [3*16 + output], xmm3 |
|
142 movdqu [4*16 + output], xmm4 |
|
143 movdqu [5*16 + output], xmm5 |
|
144 movdqu [6*16 + output], xmm6 |
|
145 movdqu [7*16 + output], xmm7 |
|
146 |
|
147 lea input, [8*16 + input] |
|
148 lea output, [8*16 + output] |
|
149 sub inputLen, 8*16 |
|
150 jmp loop8 |
|
151 |
|
152 loop1: |
|
153 cmp inputLen, 1*16 |
|
154 jb bail |
|
155 |
|
156 movdqu xmm0, [input] |
|
157 movdqu xmm7, [0*16 + ctx] |
|
158 pxor xmm0, xmm7 |
|
159 |
|
160 i = 1 |
|
161 WHILE i LT rnds |
|
162 movdqu xmm7, [i*16 + ctx] |
|
163 aesinst xmm0, xmm7 |
|
164 i = i+1 |
|
165 ENDM |
|
166 movdqu xmm7, [rnds*16 + ctx] |
|
167 aeslastinst xmm0, xmm7 |
|
168 |
|
169 movdqu [output], xmm0 |
|
170 |
|
171 lea input, [1*16 + input] |
|
172 lea output, [1*16 + output] |
|
173 sub inputLen, 1*16 |
|
174 jmp loop1 |
|
175 |
|
176 bail: |
|
177 xor rax, rax |
|
178 |
|
179 movdqu xmm6, [rsp + 0*16] |
|
180 movdqu xmm7, [rsp + 1*16] |
|
181 movdqu xmm8, [rsp + 2*16] |
|
182 add rsp, 3*16 |
|
183 ret |
|
184 ENDM |
|
185 |
|
186 intel_aes_encrypt_ecb_128 PROC |
|
187 gen_aes_ecb_func 1, 10 |
|
188 intel_aes_encrypt_ecb_128 ENDP |
|
189 |
|
190 intel_aes_encrypt_ecb_192 PROC |
|
191 gen_aes_ecb_func 1, 12 |
|
192 intel_aes_encrypt_ecb_192 ENDP |
|
193 |
|
194 intel_aes_encrypt_ecb_256 PROC |
|
195 gen_aes_ecb_func 1, 14 |
|
196 intel_aes_encrypt_ecb_256 ENDP |
|
197 |
|
198 intel_aes_decrypt_ecb_128 PROC |
|
199 gen_aes_ecb_func 0, 10 |
|
200 intel_aes_decrypt_ecb_128 ENDP |
|
201 |
|
202 intel_aes_decrypt_ecb_192 PROC |
|
203 gen_aes_ecb_func 0, 12 |
|
204 intel_aes_decrypt_ecb_192 ENDP |
|
205 |
|
206 intel_aes_decrypt_ecb_256 PROC |
|
207 gen_aes_ecb_func 0, 14 |
|
208 intel_aes_decrypt_ecb_256 ENDP |
|
209 |
|
210 |
|
211 KEY textequ <rcx> |
|
212 KS textequ <rdx> |
|
213 ITR textequ <r8> |
|
214 |
|
215 intel_aes_encrypt_init_128 PROC |
|
216 |
|
217 movdqu xmm1, [KEY] |
|
218 movdqu [KS], xmm1 |
|
219 movdqa xmm2, xmm1 |
|
220 |
|
221 lea ITR, Lcon1 |
|
222 movdqa xmm0, [ITR] |
|
223 lea ITR, Lmask |
|
224 movdqa xmm4, [ITR] |
|
225 |
|
226 mov ITR, 8 |
|
227 |
|
228 Lenc_128_ks_loop: |
|
229 lea KS, [16 + KS] |
|
230 dec ITR |
|
231 |
|
232 pshufb xmm2, xmm4 |
|
233 aesenclast xmm2, xmm0 |
|
234 pslld xmm0, 1 |
|
235 movdqa xmm3, xmm1 |
|
236 pslldq xmm3, 4 |
|
237 pxor xmm1, xmm3 |
|
238 pslldq xmm3, 4 |
|
239 pxor xmm1, xmm3 |
|
240 pslldq xmm3, 4 |
|
241 pxor xmm1, xmm3 |
|
242 pxor xmm1, xmm2 |
|
243 movdqu [KS], xmm1 |
|
244 movdqa xmm2, xmm1 |
|
245 |
|
246 jne Lenc_128_ks_loop |
|
247 |
|
248 lea ITR, Lcon2 |
|
249 movdqa xmm0, [ITR] |
|
250 |
|
251 pshufb xmm2, xmm4 |
|
252 aesenclast xmm2, xmm0 |
|
253 pslld xmm0, 1 |
|
254 movdqa xmm3, xmm1 |
|
255 pslldq xmm3, 4 |
|
256 pxor xmm1, xmm3 |
|
257 pslldq xmm3, 4 |
|
258 pxor xmm1, xmm3 |
|
259 pslldq xmm3, 4 |
|
260 pxor xmm1, xmm3 |
|
261 pxor xmm1, xmm2 |
|
262 movdqu [16 + KS], xmm1 |
|
263 movdqa xmm2, xmm1 |
|
264 |
|
265 pshufb xmm2, xmm4 |
|
266 aesenclast xmm2, xmm0 |
|
267 movdqa xmm3, xmm1 |
|
268 pslldq xmm3, 4 |
|
269 pxor xmm1, xmm3 |
|
270 pslldq xmm3, 4 |
|
271 pxor xmm1, xmm3 |
|
272 pslldq xmm3, 4 |
|
273 pxor xmm1, xmm3 |
|
274 pxor xmm1, xmm2 |
|
275 movdqu [32 + KS], xmm1 |
|
276 movdqa xmm2, xmm1 |
|
277 |
|
278 ret |
|
279 intel_aes_encrypt_init_128 ENDP |
|
280 |
|
281 |
|
282 intel_aes_decrypt_init_128 PROC |
|
283 |
|
284 push KS |
|
285 push KEY |
|
286 |
|
287 call intel_aes_encrypt_init_128 |
|
288 |
|
289 pop KEY |
|
290 pop KS |
|
291 |
|
292 movdqu xmm0, [0*16 + KS] |
|
293 movdqu xmm1, [10*16 + KS] |
|
294 movdqu [10*16 + KS], xmm0 |
|
295 movdqu [0*16 + KS], xmm1 |
|
296 |
|
297 i = 1 |
|
298 WHILE i LT 5 |
|
299 movdqu xmm0, [i*16 + KS] |
|
300 movdqu xmm1, [(10-i)*16 + KS] |
|
301 |
|
302 aesimc xmm0, xmm0 |
|
303 aesimc xmm1, xmm1 |
|
304 |
|
305 movdqu [(10-i)*16 + KS], xmm0 |
|
306 movdqu [i*16 + KS], xmm1 |
|
307 |
|
308 i = i+1 |
|
309 ENDM |
|
310 |
|
311 movdqu xmm0, [5*16 + KS] |
|
312 aesimc xmm0, xmm0 |
|
313 movdqu [5*16 + KS], xmm0 |
|
314 ret |
|
315 intel_aes_decrypt_init_128 ENDP |
|
316 |
|
317 |
|
318 intel_aes_encrypt_init_192 PROC |
|
319 |
|
320 sub rsp, 16*2 |
|
321 movdqu [16*0 + rsp], xmm6 |
|
322 movdqu [16*1 + rsp], xmm7 |
|
323 |
|
324 movdqu xmm1, [KEY] |
|
325 mov ITR, [16 + KEY] |
|
326 movd xmm3, ITR |
|
327 |
|
328 movdqu [KS], xmm1 |
|
329 movdqa xmm5, xmm3 |
|
330 |
|
331 lea ITR, Lcon1 |
|
332 movdqu xmm0, [ITR] |
|
333 lea ITR, Lmask192 |
|
334 movdqu xmm4, [ITR] |
|
335 |
|
336 mov ITR, 4 |
|
337 |
|
338 Lenc_192_ks_loop: |
|
339 movdqa xmm2, xmm3 |
|
340 pshufb xmm2, xmm4 |
|
341 aesenclast xmm2, xmm0 |
|
342 pslld xmm0, 1 |
|
343 |
|
344 movdqa xmm6, xmm1 |
|
345 movdqa xmm7, xmm3 |
|
346 pslldq xmm6, 4 |
|
347 pslldq xmm7, 4 |
|
348 pxor xmm1, xmm6 |
|
349 pxor xmm3, xmm7 |
|
350 pslldq xmm6, 4 |
|
351 pxor xmm1, xmm6 |
|
352 pslldq xmm6, 4 |
|
353 pxor xmm1, xmm6 |
|
354 pxor xmm1, xmm2 |
|
355 pshufd xmm2, xmm1, 0ffh |
|
356 pxor xmm3, xmm2 |
|
357 |
|
358 movdqa xmm6, xmm1 |
|
359 shufpd xmm5, xmm1, 00h |
|
360 shufpd xmm6, xmm3, 01h |
|
361 |
|
362 movdqu [16 + KS], xmm5 |
|
363 movdqu [32 + KS], xmm6 |
|
364 |
|
365 movdqa xmm2, xmm3 |
|
366 pshufb xmm2, xmm4 |
|
367 aesenclast xmm2, xmm0 |
|
368 pslld xmm0, 1 |
|
369 |
|
370 movdqa xmm6, xmm1 |
|
371 movdqa xmm7, xmm3 |
|
372 pslldq xmm6, 4 |
|
373 pslldq xmm7, 4 |
|
374 pxor xmm1, xmm6 |
|
375 pxor xmm3, xmm7 |
|
376 pslldq xmm6, 4 |
|
377 pxor xmm1, xmm6 |
|
378 pslldq xmm6, 4 |
|
379 pxor xmm1, xmm6 |
|
380 pxor xmm1, xmm2 |
|
381 pshufd xmm2, xmm1, 0ffh |
|
382 pxor xmm3, xmm2 |
|
383 |
|
384 movdqu [48 + KS], xmm1 |
|
385 movdqa xmm5, xmm3 |
|
386 |
|
387 lea KS, [48 + KS] |
|
388 |
|
389 dec ITR |
|
390 jnz Lenc_192_ks_loop |
|
391 |
|
392 movdqu [16 + KS], xmm5 |
|
393 |
|
394 movdqu xmm7, [16*1 + rsp] |
|
395 movdqu xmm6, [16*0 + rsp] |
|
396 add rsp, 16*2 |
|
397 ret |
|
398 intel_aes_encrypt_init_192 ENDP |
|
399 |
|
400 intel_aes_decrypt_init_192 PROC |
|
401 push KS |
|
402 push KEY |
|
403 |
|
404 call intel_aes_encrypt_init_192 |
|
405 |
|
406 pop KEY |
|
407 pop KS |
|
408 |
|
409 movdqu xmm0, [0*16 + KS] |
|
410 movdqu xmm1, [12*16 + KS] |
|
411 movdqu [12*16 + KS], xmm0 |
|
412 movdqu [0*16 + KS], xmm1 |
|
413 |
|
414 i = 1 |
|
415 WHILE i LT 6 |
|
416 movdqu xmm0, [i*16 + KS] |
|
417 movdqu xmm1, [(12-i)*16 + KS] |
|
418 |
|
419 aesimc xmm0, xmm0 |
|
420 aesimc xmm1, xmm1 |
|
421 |
|
422 movdqu [(12-i)*16 + KS], xmm0 |
|
423 movdqu [i*16 + KS], xmm1 |
|
424 |
|
425 i = i+1 |
|
426 ENDM |
|
427 |
|
428 movdqu xmm0, [6*16 + KS] |
|
429 aesimc xmm0, xmm0 |
|
430 movdqu [6*16 + KS], xmm0 |
|
431 ret |
|
432 intel_aes_decrypt_init_192 ENDP |
|
433 |
|
434 |
|
435 intel_aes_encrypt_init_256 PROC |
|
436 sub rsp, 16*2 |
|
437 movdqu [16*0 + rsp], xmm6 |
|
438 movdqu [16*1 + rsp], xmm7 |
|
439 |
|
440 movdqu xmm1, [16*0 + KEY] |
|
441 movdqu xmm3, [16*1 + KEY] |
|
442 |
|
443 movdqu [16*0 + KS], xmm1 |
|
444 movdqu [16*1 + KS], xmm3 |
|
445 |
|
446 lea ITR, Lcon1 |
|
447 movdqu xmm0, [ITR] |
|
448 lea ITR, Lmask256 |
|
449 movdqu xmm5, [ITR] |
|
450 |
|
451 pxor xmm6, xmm6 |
|
452 |
|
453 mov ITR, 6 |
|
454 |
|
455 Lenc_256_ks_loop: |
|
456 |
|
457 movdqa xmm2, xmm3 |
|
458 pshufb xmm2, xmm5 |
|
459 aesenclast xmm2, xmm0 |
|
460 pslld xmm0, 1 |
|
461 movdqa xmm4, xmm1 |
|
462 pslldq xmm4, 4 |
|
463 pxor xmm1, xmm4 |
|
464 pslldq xmm4, 4 |
|
465 pxor xmm1, xmm4 |
|
466 pslldq xmm4, 4 |
|
467 pxor xmm1, xmm4 |
|
468 pxor xmm1, xmm2 |
|
469 movdqu [16*2 + KS], xmm1 |
|
470 |
|
471 pshufd xmm2, xmm1, 0ffh |
|
472 aesenclast xmm2, xmm6 |
|
473 movdqa xmm4, xmm3 |
|
474 pslldq xmm4, 4 |
|
475 pxor xmm3, xmm4 |
|
476 pslldq xmm4, 4 |
|
477 pxor xmm3, xmm4 |
|
478 pslldq xmm4, 4 |
|
479 pxor xmm3, xmm4 |
|
480 pxor xmm3, xmm2 |
|
481 movdqu [16*3 + KS], xmm3 |
|
482 |
|
483 lea KS, [32 + KS] |
|
484 dec ITR |
|
485 jnz Lenc_256_ks_loop |
|
486 |
|
487 movdqa xmm2, xmm3 |
|
488 pshufb xmm2, xmm5 |
|
489 aesenclast xmm2, xmm0 |
|
490 movdqa xmm4, xmm1 |
|
491 pslldq xmm4, 4 |
|
492 pxor xmm1, xmm4 |
|
493 pslldq xmm4, 4 |
|
494 pxor xmm1, xmm4 |
|
495 pslldq xmm4, 4 |
|
496 pxor xmm1, xmm4 |
|
497 pxor xmm1, xmm2 |
|
498 movdqu [16*2 + KS], xmm1 |
|
499 |
|
500 movdqu xmm7, [16*1 + rsp] |
|
501 movdqu xmm6, [16*0 + rsp] |
|
502 add rsp, 16*2 |
|
503 ret |
|
504 |
|
505 intel_aes_encrypt_init_256 ENDP |
|
506 |
|
507 |
|
508 intel_aes_decrypt_init_256 PROC |
|
509 push KS |
|
510 push KEY |
|
511 |
|
512 call intel_aes_encrypt_init_256 |
|
513 |
|
514 pop KEY |
|
515 pop KS |
|
516 |
|
517 movdqu xmm0, [0*16 + KS] |
|
518 movdqu xmm1, [14*16 + KS] |
|
519 movdqu [14*16 + KS], xmm0 |
|
520 movdqu [0*16 + KS], xmm1 |
|
521 |
|
522 i = 1 |
|
523 WHILE i LT 7 |
|
524 movdqu xmm0, [i*16 + KS] |
|
525 movdqu xmm1, [(14-i)*16 + KS] |
|
526 |
|
527 aesimc xmm0, xmm0 |
|
528 aesimc xmm1, xmm1 |
|
529 |
|
530 movdqu [(14-i)*16 + KS], xmm0 |
|
531 movdqu [i*16 + KS], xmm1 |
|
532 |
|
533 i = i+1 |
|
534 ENDM |
|
535 |
|
536 movdqu xmm0, [7*16 + KS] |
|
537 aesimc xmm0, xmm0 |
|
538 movdqu [7*16 + KS], xmm0 |
|
539 ret |
|
540 intel_aes_decrypt_init_256 ENDP |
|
541 |
|
542 |
|
543 |
|
544 gen_aes_cbc_enc_func MACRO rnds |
|
545 |
|
546 LOCAL loop1 |
|
547 LOCAL bail |
|
548 |
|
549 mov input, [rsp + 1*8 + 8*4] |
|
550 mov inputLen, [rsp + 1*8 + 8*5] |
|
551 |
|
552 sub rsp, 3*16 |
|
553 |
|
554 movdqu [rsp + 0*16], xmm6 |
|
555 movdqu [rsp + 1*16], xmm7 |
|
556 movdqu [rsp + 2*16], xmm8 |
|
557 |
|
558 lea ctx, [48+ctx] |
|
559 |
|
560 movdqu xmm0, [-32+ctx] |
|
561 |
|
562 movdqu xmm2, [0*16 + ctx] |
|
563 movdqu xmm3, [1*16 + ctx] |
|
564 movdqu xmm4, [2*16 + ctx] |
|
565 movdqu xmm5, [3*16 + ctx] |
|
566 movdqu xmm6, [4*16 + ctx] |
|
567 movdqu xmm7, [5*16 + ctx] |
|
568 |
|
569 loop1: |
|
570 cmp inputLen, 1*16 |
|
571 jb bail |
|
572 |
|
573 movdqu xmm1, [input] |
|
574 pxor xmm1, xmm2 |
|
575 pxor xmm0, xmm1 |
|
576 |
|
577 aesenc xmm0, xmm3 |
|
578 aesenc xmm0, xmm4 |
|
579 aesenc xmm0, xmm5 |
|
580 aesenc xmm0, xmm6 |
|
581 aesenc xmm0, xmm7 |
|
582 |
|
583 i = 6 |
|
584 WHILE i LT rnds |
|
585 movdqu xmm8, [i*16 + ctx] |
|
586 aesenc xmm0, xmm8 |
|
587 i = i+1 |
|
588 ENDM |
|
589 movdqu xmm8, [rnds*16 + ctx] |
|
590 aesenclast xmm0, xmm8 |
|
591 |
|
592 movdqu [output], xmm0 |
|
593 |
|
594 lea input, [1*16 + input] |
|
595 lea output, [1*16 + output] |
|
596 sub inputLen, 1*16 |
|
597 jmp loop1 |
|
598 |
|
599 bail: |
|
600 movdqu [-32+ctx], xmm0 |
|
601 |
|
602 xor rax, rax |
|
603 |
|
604 movdqu xmm6, [rsp + 0*16] |
|
605 movdqu xmm7, [rsp + 1*16] |
|
606 movdqu xmm8, [rsp + 2*16] |
|
607 add rsp, 3*16 |
|
608 ret |
|
609 |
|
610 ENDM |
|
611 |
|
612 gen_aes_cbc_dec_func MACRO rnds |
|
613 |
|
614 LOCAL loop8 |
|
615 LOCAL loop1 |
|
616 LOCAL dec1 |
|
617 LOCAL bail |
|
618 |
|
619 mov input, [rsp + 1*8 + 8*4] |
|
620 mov inputLen, [rsp + 1*8 + 8*5] |
|
621 |
|
622 sub rsp, 3*16 |
|
623 |
|
624 movdqu [rsp + 0*16], xmm6 |
|
625 movdqu [rsp + 1*16], xmm7 |
|
626 movdqu [rsp + 2*16], xmm8 |
|
627 |
|
628 lea ctx, [48+ctx] |
|
629 |
|
630 loop8: |
|
631 cmp inputLen, 8*16 |
|
632 jb dec1 |
|
633 |
|
634 movdqu xmm0, [0*16 + input] |
|
635 movdqu xmm1, [1*16 + input] |
|
636 movdqu xmm2, [2*16 + input] |
|
637 movdqu xmm3, [3*16 + input] |
|
638 movdqu xmm4, [4*16 + input] |
|
639 movdqu xmm5, [5*16 + input] |
|
640 movdqu xmm6, [6*16 + input] |
|
641 movdqu xmm7, [7*16 + input] |
|
642 |
|
643 movdqu xmm8, [0*16 + ctx] |
|
644 pxor xmm0, xmm8 |
|
645 pxor xmm1, xmm8 |
|
646 pxor xmm2, xmm8 |
|
647 pxor xmm3, xmm8 |
|
648 pxor xmm4, xmm8 |
|
649 pxor xmm5, xmm8 |
|
650 pxor xmm6, xmm8 |
|
651 pxor xmm7, xmm8 |
|
652 |
|
653 i = 1 |
|
654 WHILE i LT rnds |
|
655 aes_dec_rnd i |
|
656 i = i+1 |
|
657 ENDM |
|
658 aes_dec_last_rnd rnds |
|
659 |
|
660 movdqu xmm8, [-32 + ctx] |
|
661 pxor xmm0, xmm8 |
|
662 movdqu xmm8, [0*16 + input] |
|
663 pxor xmm1, xmm8 |
|
664 movdqu xmm8, [1*16 + input] |
|
665 pxor xmm2, xmm8 |
|
666 movdqu xmm8, [2*16 + input] |
|
667 pxor xmm3, xmm8 |
|
668 movdqu xmm8, [3*16 + input] |
|
669 pxor xmm4, xmm8 |
|
670 movdqu xmm8, [4*16 + input] |
|
671 pxor xmm5, xmm8 |
|
672 movdqu xmm8, [5*16 + input] |
|
673 pxor xmm6, xmm8 |
|
674 movdqu xmm8, [6*16 + input] |
|
675 pxor xmm7, xmm8 |
|
676 movdqu xmm8, [7*16 + input] |
|
677 |
|
678 movdqu [0*16 + output], xmm0 |
|
679 movdqu [1*16 + output], xmm1 |
|
680 movdqu [2*16 + output], xmm2 |
|
681 movdqu [3*16 + output], xmm3 |
|
682 movdqu [4*16 + output], xmm4 |
|
683 movdqu [5*16 + output], xmm5 |
|
684 movdqu [6*16 + output], xmm6 |
|
685 movdqu [7*16 + output], xmm7 |
|
686 movdqu [-32 + ctx], xmm8 |
|
687 |
|
688 lea input, [8*16 + input] |
|
689 lea output, [8*16 + output] |
|
690 sub inputLen, 8*16 |
|
691 jmp loop8 |
|
692 dec1: |
|
693 |
|
694 movdqu xmm3, [-32 + ctx] |
|
695 |
|
696 loop1: |
|
697 cmp inputLen, 1*16 |
|
698 jb bail |
|
699 |
|
700 movdqu xmm0, [input] |
|
701 movdqa xmm4, xmm0 |
|
702 movdqu xmm7, [0*16 + ctx] |
|
703 pxor xmm0, xmm7 |
|
704 |
|
705 i = 1 |
|
706 WHILE i LT rnds |
|
707 movdqu xmm7, [i*16 + ctx] |
|
708 aesdec xmm0, xmm7 |
|
709 i = i+1 |
|
710 ENDM |
|
711 movdqu xmm7, [rnds*16 + ctx] |
|
712 aesdeclast xmm0, xmm7 |
|
713 pxor xmm3, xmm0 |
|
714 |
|
715 movdqu [output], xmm3 |
|
716 movdqa xmm3, xmm4 |
|
717 |
|
718 lea input, [1*16 + input] |
|
719 lea output, [1*16 + output] |
|
720 sub inputLen, 1*16 |
|
721 jmp loop1 |
|
722 |
|
723 bail: |
|
724 movdqu [-32 + ctx], xmm3 |
|
725 xor rax, rax |
|
726 |
|
727 movdqu xmm6, [rsp + 0*16] |
|
728 movdqu xmm7, [rsp + 1*16] |
|
729 movdqu xmm8, [rsp + 2*16] |
|
730 add rsp, 3*16 |
|
731 ret |
|
732 ENDM |
|
733 |
|
734 intel_aes_encrypt_cbc_128 PROC |
|
735 gen_aes_cbc_enc_func 10 |
|
736 intel_aes_encrypt_cbc_128 ENDP |
|
737 |
|
738 intel_aes_encrypt_cbc_192 PROC |
|
739 gen_aes_cbc_enc_func 12 |
|
740 intel_aes_encrypt_cbc_192 ENDP |
|
741 |
|
742 intel_aes_encrypt_cbc_256 PROC |
|
743 gen_aes_cbc_enc_func 14 |
|
744 intel_aes_encrypt_cbc_256 ENDP |
|
745 |
|
746 intel_aes_decrypt_cbc_128 PROC |
|
747 gen_aes_cbc_dec_func 10 |
|
748 intel_aes_decrypt_cbc_128 ENDP |
|
749 |
|
750 intel_aes_decrypt_cbc_192 PROC |
|
751 gen_aes_cbc_dec_func 12 |
|
752 intel_aes_decrypt_cbc_192 ENDP |
|
753 |
|
754 intel_aes_decrypt_cbc_256 PROC |
|
755 gen_aes_cbc_dec_func 14 |
|
756 intel_aes_decrypt_cbc_256 ENDP |
|
757 |
|
758 |
|
759 |
|
760 ctrCtx textequ <r10> |
|
761 CTR textequ <r11d> |
|
762 CTRSave textequ <eax> |
|
763 |
|
764 gen_aes_ctr_func MACRO rnds |
|
765 |
|
766 LOCAL loop8 |
|
767 LOCAL loop1 |
|
768 LOCAL enc1 |
|
769 LOCAL bail |
|
770 |
|
771 mov input, [rsp + 8*1 + 4*8] |
|
772 mov inputLen, [rsp + 8*1 + 5*8] |
|
773 |
|
774 mov ctrCtx, ctx |
|
775 mov ctx, [8+ctrCtx] |
|
776 lea ctx, [48+ctx] |
|
777 |
|
778 sub rsp, 3*16 |
|
779 movdqu [rsp + 0*16], xmm6 |
|
780 movdqu [rsp + 1*16], xmm7 |
|
781 movdqu [rsp + 2*16], xmm8 |
|
782 |
|
783 |
|
784 push rbp |
|
785 mov rbp, rsp |
|
786 sub rsp, 8*16 |
|
787 and rsp, -16 |
|
788 |
|
789 |
|
790 movdqu xmm0, [16+ctrCtx] |
|
791 mov CTRSave, DWORD PTR [ctrCtx + 16 + 3*4] |
|
792 bswap CTRSave |
|
793 movdqu xmm1, [ctx + 0*16] |
|
794 |
|
795 pxor xmm0, xmm1 |
|
796 |
|
797 movdqa [rsp + 0*16], xmm0 |
|
798 movdqa [rsp + 1*16], xmm0 |
|
799 movdqa [rsp + 2*16], xmm0 |
|
800 movdqa [rsp + 3*16], xmm0 |
|
801 movdqa [rsp + 4*16], xmm0 |
|
802 movdqa [rsp + 5*16], xmm0 |
|
803 movdqa [rsp + 6*16], xmm0 |
|
804 movdqa [rsp + 7*16], xmm0 |
|
805 |
|
806 inc CTRSave |
|
807 mov CTR, CTRSave |
|
808 bswap CTR |
|
809 xor CTR, DWORD PTR [ctx + 3*4] |
|
810 mov DWORD PTR [rsp + 1*16 + 3*4], CTR |
|
811 |
|
812 inc CTRSave |
|
813 mov CTR, CTRSave |
|
814 bswap CTR |
|
815 xor CTR, DWORD PTR [ctx + 3*4] |
|
816 mov DWORD PTR [rsp + 2*16 + 3*4], CTR |
|
817 |
|
818 inc CTRSave |
|
819 mov CTR, CTRSave |
|
820 bswap CTR |
|
821 xor CTR, DWORD PTR [ctx + 3*4] |
|
822 mov DWORD PTR [rsp + 3*16 + 3*4], CTR |
|
823 |
|
824 inc CTRSave |
|
825 mov CTR, CTRSave |
|
826 bswap CTR |
|
827 xor CTR, DWORD PTR [ctx + 3*4] |
|
828 mov DWORD PTR [rsp + 4*16 + 3*4], CTR |
|
829 |
|
830 inc CTRSave |
|
831 mov CTR, CTRSave |
|
832 bswap CTR |
|
833 xor CTR, DWORD PTR [ctx + 3*4] |
|
834 mov DWORD PTR [rsp + 5*16 + 3*4], CTR |
|
835 |
|
836 inc CTRSave |
|
837 mov CTR, CTRSave |
|
838 bswap CTR |
|
839 xor CTR, DWORD PTR [ctx + 3*4] |
|
840 mov DWORD PTR [rsp + 6*16 + 3*4], CTR |
|
841 |
|
842 inc CTRSave |
|
843 mov CTR, CTRSave |
|
844 bswap CTR |
|
845 xor CTR, DWORD PTR [ctx + 3*4] |
|
846 mov DWORD PTR [rsp + 7*16 + 3*4], CTR |
|
847 |
|
848 |
|
849 loop8: |
|
850 cmp inputLen, 8*16 |
|
851 jb loop1 |
|
852 |
|
853 movdqu xmm0, [0*16 + rsp] |
|
854 movdqu xmm1, [1*16 + rsp] |
|
855 movdqu xmm2, [2*16 + rsp] |
|
856 movdqu xmm3, [3*16 + rsp] |
|
857 movdqu xmm4, [4*16 + rsp] |
|
858 movdqu xmm5, [5*16 + rsp] |
|
859 movdqu xmm6, [6*16 + rsp] |
|
860 movdqu xmm7, [7*16 + rsp] |
|
861 |
|
862 i = 1 |
|
863 WHILE i LE 8 |
|
864 aes_rnd i |
|
865 |
|
866 inc CTRSave |
|
867 mov CTR, CTRSave |
|
868 bswap CTR |
|
869 xor CTR, DWORD PTR [ctx + 3*4] |
|
870 mov DWORD PTR [rsp + (i-1)*16 + 3*4], CTR |
|
871 |
|
872 i = i+1 |
|
873 ENDM |
|
874 WHILE i LT rnds |
|
875 aes_rnd i |
|
876 i = i+1 |
|
877 ENDM |
|
878 aes_last_rnd rnds |
|
879 |
|
880 movdqu xmm8, [0*16 + input] |
|
881 pxor xmm0, xmm8 |
|
882 movdqu xmm8, [1*16 + input] |
|
883 pxor xmm1, xmm8 |
|
884 movdqu xmm8, [2*16 + input] |
|
885 pxor xmm2, xmm8 |
|
886 movdqu xmm8, [3*16 + input] |
|
887 pxor xmm3, xmm8 |
|
888 movdqu xmm8, [4*16 + input] |
|
889 pxor xmm4, xmm8 |
|
890 movdqu xmm8, [5*16 + input] |
|
891 pxor xmm5, xmm8 |
|
892 movdqu xmm8, [6*16 + input] |
|
893 pxor xmm6, xmm8 |
|
894 movdqu xmm8, [7*16 + input] |
|
895 pxor xmm7, xmm8 |
|
896 |
|
897 movdqu [0*16 + output], xmm0 |
|
898 movdqu [1*16 + output], xmm1 |
|
899 movdqu [2*16 + output], xmm2 |
|
900 movdqu [3*16 + output], xmm3 |
|
901 movdqu [4*16 + output], xmm4 |
|
902 movdqu [5*16 + output], xmm5 |
|
903 movdqu [6*16 + output], xmm6 |
|
904 movdqu [7*16 + output], xmm7 |
|
905 |
|
906 lea input, [8*16 + input] |
|
907 lea output, [8*16 + output] |
|
908 sub inputLen, 8*16 |
|
909 jmp loop8 |
|
910 |
|
911 |
|
912 loop1: |
|
913 cmp inputLen, 1*16 |
|
914 jb bail |
|
915 |
|
916 movdqu xmm0, [rsp] |
|
917 add rsp, 16 |
|
918 |
|
919 i = 1 |
|
920 WHILE i LT rnds |
|
921 movdqu xmm7, [i*16 + ctx] |
|
922 aesenc xmm0, xmm7 |
|
923 i = i+1 |
|
924 ENDM |
|
925 movdqu xmm7, [rnds*16 + ctx] |
|
926 aesenclast xmm0, xmm7 |
|
927 |
|
928 movdqu xmm7, [input] |
|
929 pxor xmm0, xmm7 |
|
930 movdqu [output], xmm0 |
|
931 |
|
932 lea input, [1*16 + input] |
|
933 lea output, [1*16 + output] |
|
934 sub inputLen, 1*16 |
|
935 jmp loop1 |
|
936 |
|
937 bail: |
|
938 |
|
939 movdqu xmm0, [rsp] |
|
940 movdqu xmm1, [ctx + 0*16] |
|
941 pxor xmm0, xmm1 |
|
942 movdqu [16+ctrCtx], xmm0 |
|
943 |
|
944 |
|
945 xor rax, rax |
|
946 mov rsp, rbp |
|
947 pop rbp |
|
948 |
|
949 movdqu xmm6, [rsp + 0*16] |
|
950 movdqu xmm7, [rsp + 1*16] |
|
951 movdqu xmm8, [rsp + 2*16] |
|
952 add rsp, 3*16 |
|
953 |
|
954 ret |
|
955 ENDM |
|
956 |
|
957 |
|
958 intel_aes_encrypt_ctr_128 PROC |
|
959 gen_aes_ctr_func 10 |
|
960 intel_aes_encrypt_ctr_128 ENDP |
|
961 |
|
962 intel_aes_encrypt_ctr_192 PROC |
|
963 gen_aes_ctr_func 12 |
|
964 intel_aes_encrypt_ctr_192 ENDP |
|
965 |
|
966 intel_aes_encrypt_ctr_256 PROC |
|
967 gen_aes_ctr_func 14 |
|
968 intel_aes_encrypt_ctr_256 ENDP |
|
969 |
|
970 |
|
971 END |