# LICENSE:
# This submission to NSS is to be made available under the terms of the
# Mozilla Public License, v. 2.0. You can obtain one at http://mozilla.org/MPL/2.0/.
################################################################################
# Copyright(c) 2012, Intel Corp.

.align 16
.Lone:
.quad 1,0
.Ltwo:
.quad 2,0
.Lbswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lshuff_mask:
.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
.Lpoly:
.quad 0x1, 0xc200000000000000


################################################################################
# Generates the final GCM tag
# void intel_aes_gcmTAG(uint8_t Htbl[16*16], uint8_t *Tp, uint64_t Mlen, uint64_t Alen, uint8_t* X0, uint8_t* TAG);
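#
# For reference only: ignoring the byte swapping done below, this routine
# performs the textbook GCM finalization
#
#   TAG = X0 xor GFMUL( T xor ( lengths block in bits ), H )
#
# A rough C sketch of the same step (be128 and gf128_mul refer to the
# hypothetical reference helpers sketched in a comment before GFMUL at the
# end of this file; the exact byte/bit placement of the lengths block is
# handled by the vpinsrq/vpshufb sequence below and is not reproduced here):
#
#   ghash.lo ^= Mlen * 8;  ghash.hi ^= Alen * 8;    /* lengths in bits */
#   ghash     = gf128_mul(ghash, H);
#   tag.hi    = ghash.hi ^ X0.hi;  tag.lo = ghash.lo ^ X0.lo;
#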
.type intel_aes_gcmTAG,@function
.globl intel_aes_gcmTAG
.align 16
intel_aes_gcmTAG:

.set Htbl, %rdi
.set Tp, %rsi
.set Mlen, %rdx
.set Alen, %rcx
.set X0, %r8
.set TAG, %r9

.set T,%xmm0
.set TMP0,%xmm1

vmovdqu (Tp), T
vpshufb .Lbswap_mask(%rip), T, T
vpxor TMP0, TMP0, TMP0
shl $3, Mlen
shl $3, Alen
vpinsrq $0, Mlen, TMP0, TMP0
vpinsrq $1, Alen, TMP0, TMP0
vpxor TMP0, T, T
vmovdqu (Htbl), TMP0
call GFMUL
vpshufb .Lbswap_mask(%rip), T, T
vpxor (X0), T, T
vmovdqu T, (TAG)

ret
.size intel_aes_gcmTAG, .-intel_aes_gcmTAG
################################################################################
# Generates the H table
# void intel_aes_gcmINIT(uint8_t Htbl[16*16], uint8_t *KS, int NR);
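#
# Layout of the 256-byte Htbl this routine fills in (in the byte-reflected,
# "times 2" representation used throughout this file):
#
#   Htbl +   0 .. +112 : H^1*2, H^2*2, ..., H^8*2          (16 bytes each)
#   Htbl + 128 .. +240 : for each power above, the XOR of its high and low
#                        64-bit halves, precomputed for the Karatsuba
#                        middle multiplication ("reduce 4" step below)
#
# Schematic sketch of the work done below (illustrative names, not code
# that exists in this file):
#
#   Hpow[0] = gfmul_by_x(AES_K(0^128));                 /* H*2           */
#   for (i = 1; i < 8; i++) Hpow[i] = GFMUL(Hpow[i-1], Hpow[0]);
#   for (i = 0; i < 8; i++) {
#       store(Htbl + 16*i,       Hpow[i]);              /* H^(i+1)*2     */
#       store(Htbl + 128 + 16*i, hi64(Hpow[i]) ^ lo64(Hpow[i]));
#   }
#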
.type intel_aes_gcmINIT,@function
.globl intel_aes_gcmINIT
.align 16
intel_aes_gcmINIT:

.set Htbl, %rdi
.set KS, %rsi
.set NR, %edx

.set T,%xmm0
.set TMP0,%xmm1

CALCULATE_POWERS_OF_H:
vmovdqu 16*0(KS), T
vaesenc 16*1(KS), T, T
vaesenc 16*2(KS), T, T
vaesenc 16*3(KS), T, T
vaesenc 16*4(KS), T, T
vaesenc 16*5(KS), T, T
vaesenc 16*6(KS), T, T
vaesenc 16*7(KS), T, T
vaesenc 16*8(KS), T, T
vaesenc 16*9(KS), T, T
vmovdqu 16*10(KS), TMP0
cmp $10, NR
je .LH0done
vaesenc 16*10(KS), T, T
vaesenc 16*11(KS), T, T
vmovdqu 16*12(KS), TMP0
cmp $12, NR
je .LH0done
vaesenc 16*12(KS), T, T
vaesenc 16*13(KS), T, T
vmovdqu 16*14(KS), TMP0

.LH0done:
vaesenclast TMP0, T, T

vpshufb .Lbswap_mask(%rip), T, T

vmovdqu T, TMP0
# Calculate H' = GFMUL(H, 2)
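# The doubling is a left shift of the 128-bit value by one bit with a
# conditional reduction: if the bit shifted out of the top is set, the
# constant at .Lpoly is XORed back in. A sketch on two 64-bit halves
# (illustrative only; the code below does the same thing with SSE/AVX
# shifts and a shuffle-generated mask):
#
#   carry = H.hi >> 63;
#   H2.hi = (H.hi << 1) | (H.lo >> 63);
#   H2.lo = (H.lo << 1);
#   if (carry) { H2.lo ^= 0x1; H2.hi ^= 0xc200000000000000; }  /* .Lpoly */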
vpsrld $7 , T , %xmm3
vmovdqu .Lshuff_mask(%rip), %xmm4
vpshufb %xmm4, %xmm3 , %xmm3
movq $0xff00 , %rax
vmovq %rax, %xmm4
vpshufb %xmm3, %xmm4 , %xmm4
vmovdqu .Lpoly(%rip), %xmm5
vpand %xmm4, %xmm5, %xmm5
vpsrld $31, T, %xmm3
vpslld $1, T, %xmm4
vpslldq $4, %xmm3, %xmm3
vpxor %xmm3, %xmm4, T # T now holds p(x)<<1

# add p(x)<<1 to xmm5
vpxor %xmm5, T , T
vmovdqu T, TMP0
vmovdqu T, (Htbl) # H * 2
call GFMUL
vmovdqu T, 16(Htbl) # H^2 * 2
call GFMUL
vmovdqu T, 32(Htbl) # H^3 * 2
call GFMUL
vmovdqu T, 48(Htbl) # H^4 * 2
call GFMUL
vmovdqu T, 64(Htbl) # H^5 * 2
call GFMUL
vmovdqu T, 80(Htbl) # H^6 * 2
call GFMUL
vmovdqu T, 96(Htbl) # H^7 * 2
call GFMUL
vmovdqu T, 112(Htbl) # H^8 * 2

# Precalculations for the reduce 4 step
vpshufd $78, (Htbl), %xmm8
vpshufd $78, 16(Htbl), %xmm9
vpshufd $78, 32(Htbl), %xmm10
vpshufd $78, 48(Htbl), %xmm11
vpshufd $78, 64(Htbl), %xmm12
vpshufd $78, 80(Htbl), %xmm13
vpshufd $78, 96(Htbl), %xmm14
vpshufd $78, 112(Htbl), %xmm15

vpxor (Htbl), %xmm8, %xmm8
vpxor 16(Htbl), %xmm9, %xmm9
vpxor 32(Htbl), %xmm10, %xmm10
vpxor 48(Htbl), %xmm11, %xmm11
vpxor 64(Htbl), %xmm12, %xmm12
vpxor 80(Htbl), %xmm13, %xmm13
vpxor 96(Htbl), %xmm14, %xmm14
vpxor 112(Htbl), %xmm15, %xmm15

vmovdqu %xmm8, 128(Htbl)
vmovdqu %xmm9, 144(Htbl)
vmovdqu %xmm10, 160(Htbl)
vmovdqu %xmm11, 176(Htbl)
vmovdqu %xmm12, 192(Htbl)
vmovdqu %xmm13, 208(Htbl)
vmovdqu %xmm14, 224(Htbl)
vmovdqu %xmm15, 240(Htbl)

ret
.size intel_aes_gcmINIT, .-intel_aes_gcmINIT
################################################################################
# Authenticate only
# void intel_aes_gcmAAD(uint8_t Htbl[16*16], uint8_t *AAD, uint64_t Alen, uint8_t *Tp);
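#
# What this routine computes, stated one block at a time (gf128_mul refers
# to the reference sketch before GFMUL at the end of this file):
#
#   for (i = 0; i < nblocks; i++)
#       T = gf128_mul(T ^ byteswap(AAD[i]), H);
#
# The 8-way main loop below evaluates the same thing eight blocks per
# iteration using the precomputed powers of H,
#
#   T = (T^A0)*H^8 ^ A1*H^7 ^ ... ^ A7*H^1
#
# which needs only one reduction per eight blocks.
#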

.globl intel_aes_gcmAAD
.type intel_aes_gcmAAD,@function
.align 16
intel_aes_gcmAAD:

.set DATA, %xmm0
.set T, %xmm1
.set BSWAP_MASK, %xmm2
.set TMP0, %xmm3
.set TMP1, %xmm4
.set TMP2, %xmm5
.set TMP3, %xmm6
.set TMP4, %xmm7
.set Xhi, %xmm9

.set Htbl, %rdi
.set inp, %rsi
.set len, %rdx
.set Tp, %rcx

.set hlp0, %r11

.macro KARATSUBA_AAD i
vpclmulqdq $0x00, 16*\i(Htbl), DATA, TMP3
vpxor TMP3, TMP0, TMP0
vpclmulqdq $0x11, 16*\i(Htbl), DATA, TMP3
vpxor TMP3, TMP1, TMP1
vpshufd $78, DATA, TMP3
vpxor DATA, TMP3, TMP3
vpclmulqdq $0x00, 16*(\i+8)(Htbl), TMP3, TMP3
vpxor TMP3, TMP2, TMP2
.endm
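# KARATSUBA_AAD accumulates one block into three running sums without
# reducing: for a 128-bit block DATA and key power Hi = 16*\i(Htbl), the
# 256-bit carry-less product is rebuilt from three 64x64 products
# (Karatsuba), and the single reduction is deferred to .Lred1/.Ldone.
# Schematically, with Hi_x = 16*(\i+8)(Htbl) = hi64(Hi)^lo64(Hi):
#
#   TMP0 ^= clmul(DATA.lo, Hi.lo)                  /* low halves           */
#   TMP1 ^= clmul(DATA.hi, Hi.hi)                  /* high halves          */
#   TMP2 ^= clmul(DATA.lo ^ DATA.hi, Hi_x)         /* Karatsuba middle term */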

test len, len
jnz .LbeginAAD
ret

.LbeginAAD:

push hlp0
vzeroupper

vmovdqa .Lbswap_mask(%rip), BSWAP_MASK

vpxor Xhi, Xhi, Xhi

vmovdqu (Tp),T
vpshufb BSWAP_MASK,T,T

# we hash 8 blocks each iteration; if the total number of blocks is not a multiple of 8, we hash the first n%8 blocks first
mov len, hlp0
and $~-128, hlp0

jz .Lmod_loop

sub hlp0, len
sub $16, hlp0

# hash the first prefix block
vmovdqu (inp), DATA
vpshufb BSWAP_MASK, DATA, DATA
vpxor T, DATA, DATA

vpclmulqdq $0x00, (Htbl, hlp0), DATA, TMP0
vpclmulqdq $0x11, (Htbl, hlp0), DATA, TMP1
vpshufd $78, DATA, TMP2
vpxor DATA, TMP2, TMP2
vpclmulqdq $0x00, 16*8(Htbl, hlp0), TMP2, TMP2

lea 16(inp), inp
test hlp0, hlp0
jnz .Lpre_loop
jmp .Lred1

# hash the remaining prefix blocks (up to 7 total prefix blocks)
.align 64
.Lpre_loop:

sub $16, hlp0

vmovdqu (inp),DATA # next data block
vpshufb BSWAP_MASK,DATA,DATA

vpclmulqdq $0x00, (Htbl,hlp0), DATA, TMP3
vpxor TMP3, TMP0, TMP0
vpclmulqdq $0x11, (Htbl,hlp0), DATA, TMP3
vpxor TMP3, TMP1, TMP1
vpshufd $78, DATA, TMP3
vpxor DATA, TMP3, TMP3
vpclmulqdq $0x00, 16*8(Htbl,hlp0), TMP3, TMP3
vpxor TMP3, TMP2, TMP2

test hlp0, hlp0

lea 16(inp), inp

jnz .Lpre_loop

.Lred1:
vpxor TMP0, TMP2, TMP2
vpxor TMP1, TMP2, TMP2
vpsrldq $8, TMP2, TMP3
vpslldq $8, TMP2, TMP2

vpxor TMP3, TMP1, Xhi
vpxor TMP2, TMP0, T

.align 64
.Lmod_loop:
sub $0x80, len
jb .Ldone

vmovdqu 16*7(inp),DATA # Ii
vpshufb BSWAP_MASK,DATA,DATA

vpclmulqdq $0x00, (Htbl), DATA, TMP0
vpclmulqdq $0x11, (Htbl), DATA, TMP1
vpshufd $78, DATA, TMP2
vpxor DATA, TMP2, TMP2
vpclmulqdq $0x00, 16*8(Htbl), TMP2, TMP2
#########################################################
vmovdqu 16*6(inp),DATA
vpshufb BSWAP_MASK,DATA,DATA
KARATSUBA_AAD 1
#########################################################
vmovdqu 16*5(inp),DATA
vpshufb BSWAP_MASK,DATA,DATA

vpclmulqdq $0x10, .Lpoly(%rip), T, TMP4 # reduction stage 1a
vpalignr $8, T, T, T

KARATSUBA_AAD 2

vpxor TMP4, T, T # reduction stage 1b
#########################################################
vmovdqu 16*4(inp),DATA
vpshufb BSWAP_MASK,DATA,DATA

KARATSUBA_AAD 3
#########################################################
vmovdqu 16*3(inp),DATA
vpshufb BSWAP_MASK,DATA,DATA

vpclmulqdq $0x10, .Lpoly(%rip), T, TMP4 # reduction stage 2a
vpalignr $8, T, T, T

KARATSUBA_AAD 4

vpxor TMP4, T, T # reduction stage 2b
#########################################################
vmovdqu 16*2(inp),DATA
vpshufb BSWAP_MASK,DATA,DATA

KARATSUBA_AAD 5

vpxor Xhi, T, T # reduction finalize
#########################################################
vmovdqu 16*1(inp),DATA
vpshufb BSWAP_MASK,DATA,DATA

KARATSUBA_AAD 6
#########################################################
vmovdqu 16*0(inp),DATA
vpshufb BSWAP_MASK,DATA,DATA
vpxor T,DATA,DATA

KARATSUBA_AAD 7
#########################################################
vpxor TMP0, TMP2, TMP2 # karatsuba fixup
vpxor TMP1, TMP2, TMP2
vpsrldq $8, TMP2, TMP3
vpslldq $8, TMP2, TMP2

vpxor TMP3, TMP1, Xhi
vpxor TMP2, TMP0, T

lea 16*8(inp), inp
jmp .Lmod_loop
#########################################################

.Ldone:
vpclmulqdq $0x10, .Lpoly(%rip), T, TMP3
vpalignr $8, T, T, T
vpxor TMP3, T, T

vpclmulqdq $0x10, .Lpoly(%rip), T, TMP3
vpalignr $8, T, T, T
vpxor TMP3, T, T

vpxor Xhi, T, T

.Lsave:
vpshufb BSWAP_MASK,T, T
vmovdqu T,(Tp)
vzeroupper

pop hlp0
ret
.size intel_aes_gcmAAD,.-intel_aes_gcmAAD

################################################################################
# Encrypt and Authenticate
# void intel_aes_gcmENC(uint8_t* PT, uint8_t* CT, void *Gctx,uint64_t len);
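#
# Overall flow (state is read from and written back to the Gctx offsets
# used below: 272 = GHASH state T, 288 = counter block, 304 = pointer to
# the key schedule, whose round count sits at +4 and round keys at +48).
# A sketch of the steady state of the main 8-block loop:
#
#   while (len >= 128) {
#       for (j = 0; j < 8; j++) CT[j] = PT[j] ^ AES_K(CTR + j);  /* 8 streams */
#       GHASH-accumulate the 8 ciphertext blocks of the *previous*
#       iteration (ROUNDMUL/KARATSUBA below), one reduction per 8 blocks;
#   }
#   /* then whole blocks one at a time, then a zero-padded partial block */
#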
.type intel_aes_gcmENC,@function
.globl intel_aes_gcmENC
.align 16
intel_aes_gcmENC:

.set PT,%rdi
.set CT,%rsi
.set Htbl, %rdx
.set len, %rcx
.set KS,%r9
.set NR,%r10d

.set Gctx, %rdx

.set T,%xmm0
.set TMP0,%xmm1
.set TMP1,%xmm2
.set TMP2,%xmm3
.set TMP3,%xmm4
.set TMP4,%xmm5
.set TMP5,%xmm6
.set CTR0,%xmm7
.set CTR1,%xmm8
.set CTR2,%xmm9
.set CTR3,%xmm10
.set CTR4,%xmm11
.set CTR5,%xmm12
.set CTR6,%xmm13
.set CTR7,%xmm14
.set CTR,%xmm15

.macro ROUND i
vmovdqu \i*16(KS), TMP3
vaesenc TMP3, CTR0, CTR0
vaesenc TMP3, CTR1, CTR1
vaesenc TMP3, CTR2, CTR2
vaesenc TMP3, CTR3, CTR3
vaesenc TMP3, CTR4, CTR4
vaesenc TMP3, CTR5, CTR5
vaesenc TMP3, CTR6, CTR6
vaesenc TMP3, CTR7, CTR7
.endm

.macro ROUNDMUL i

vmovdqu \i*16(%rsp), TMP5
vmovdqu \i*16(KS), TMP3

vaesenc TMP3, CTR0, CTR0
vaesenc TMP3, CTR1, CTR1
vaesenc TMP3, CTR2, CTR2
vaesenc TMP3, CTR3, CTR3

vpshufd $78, TMP5, TMP4
vpxor TMP5, TMP4, TMP4

vaesenc TMP3, CTR4, CTR4
vaesenc TMP3, CTR5, CTR5
vaesenc TMP3, CTR6, CTR6
vaesenc TMP3, CTR7, CTR7

vpclmulqdq $0x00, 128+\i*16(Htbl), TMP4, TMP3
vpxor TMP3, TMP0, TMP0
vmovdqa \i*16(Htbl), TMP4
vpclmulqdq $0x11, TMP4, TMP5, TMP3
vpxor TMP3, TMP1, TMP1
vpclmulqdq $0x00, TMP4, TMP5, TMP3
vpxor TMP3, TMP2, TMP2

.endm

.macro KARATSUBA i
vmovdqu \i*16(%rsp), TMP5

vpclmulqdq $0x11, 16*\i(Htbl), TMP5, TMP3
vpxor TMP3, TMP1, TMP1
vpclmulqdq $0x00, 16*\i(Htbl), TMP5, TMP3
vpxor TMP3, TMP2, TMP2
vpshufd $78, TMP5, TMP3
vpxor TMP5, TMP3, TMP5
vpclmulqdq $0x00, 128+\i*16(Htbl), TMP5, TMP3
vpxor TMP3, TMP0, TMP0
.endm
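# The three macros above let the AES and GHASH work interleave so AESENC
# and PCLMULQDQ can issue in parallel: ROUND applies one round key to all
# eight counter blocks; KARATSUBA folds one saved ciphertext block (spilled
# to the stack by the previous iteration) into the three accumulators; and
# ROUNDMUL does both. Per saved block C and power Hi = \i*16(Htbl), the
# accumulators here are, schematically:
#
#   TMP0 ^= clmul(C.lo ^ C.hi, 128+\i*16(Htbl))    /* Karatsuba middle term */
#   TMP1 ^= clmul(C.hi, Hi.hi)
#   TMP2 ^= clmul(C.lo, Hi.lo)
#
# (note that the roles of TMP0 and TMP2 are swapped relative to the
# KARATSUBA_AAD macro used by intel_aes_gcmAAD above).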

test len, len
jnz .Lbegin
ret

.Lbegin:

vzeroupper
push %rbp
push %rbx

movq %rsp, %rbp
sub $128, %rsp
andq $-16, %rsp

vmovdqu 288(Gctx), CTR
vmovdqu 272(Gctx), T
mov 304(Gctx), KS
mov 4(KS), NR
lea 48(KS), KS

vpshufb .Lbswap_mask(%rip), CTR, CTR
vpshufb .Lbswap_mask(%rip), T, T

cmp $128, len
jb .LDataSingles

# Encrypt the first eight blocks
sub $128, len
vmovdqa CTR, CTR0
vpaddd .Lone(%rip), CTR0, CTR1
vpaddd .Ltwo(%rip), CTR0, CTR2
vpaddd .Lone(%rip), CTR2, CTR3
vpaddd .Ltwo(%rip), CTR2, CTR4
vpaddd .Lone(%rip), CTR4, CTR5
vpaddd .Ltwo(%rip), CTR4, CTR6
vpaddd .Lone(%rip), CTR6, CTR7
vpaddd .Ltwo(%rip), CTR6, CTR

vpshufb .Lbswap_mask(%rip), CTR0, CTR0
vpshufb .Lbswap_mask(%rip), CTR1, CTR1
vpshufb .Lbswap_mask(%rip), CTR2, CTR2
vpshufb .Lbswap_mask(%rip), CTR3, CTR3
vpshufb .Lbswap_mask(%rip), CTR4, CTR4
vpshufb .Lbswap_mask(%rip), CTR5, CTR5
vpshufb .Lbswap_mask(%rip), CTR6, CTR6
vpshufb .Lbswap_mask(%rip), CTR7, CTR7

vpxor (KS), CTR0, CTR0
vpxor (KS), CTR1, CTR1
vpxor (KS), CTR2, CTR2
vpxor (KS), CTR3, CTR3
vpxor (KS), CTR4, CTR4
vpxor (KS), CTR5, CTR5
vpxor (KS), CTR6, CTR6
vpxor (KS), CTR7, CTR7

ROUND 1
ROUND 2
ROUND 3
ROUND 4
ROUND 5
ROUND 6
ROUND 7
ROUND 8
ROUND 9

vmovdqu 160(KS), TMP5
cmp $12, NR
jb .LLast1

ROUND 10
ROUND 11

vmovdqu 192(KS), TMP5
cmp $14, NR
jb .LLast1

ROUND 12
ROUND 13

vmovdqu 224(KS), TMP5

.LLast1:

vpxor (PT), TMP5, TMP3
vaesenclast TMP3, CTR0, CTR0
vpxor 16(PT), TMP5, TMP3
vaesenclast TMP3, CTR1, CTR1
vpxor 32(PT), TMP5, TMP3
vaesenclast TMP3, CTR2, CTR2
vpxor 48(PT), TMP5, TMP3
vaesenclast TMP3, CTR3, CTR3
vpxor 64(PT), TMP5, TMP3
vaesenclast TMP3, CTR4, CTR4
vpxor 80(PT), TMP5, TMP3
vaesenclast TMP3, CTR5, CTR5
vpxor 96(PT), TMP5, TMP3
vaesenclast TMP3, CTR6, CTR6
vpxor 112(PT), TMP5, TMP3
vaesenclast TMP3, CTR7, CTR7

vmovdqu .Lbswap_mask(%rip), TMP3

vmovdqu CTR0, (CT)
vpshufb TMP3, CTR0, CTR0
vmovdqu CTR1, 16(CT)
vpshufb TMP3, CTR1, CTR1
vmovdqu CTR2, 32(CT)
vpshufb TMP3, CTR2, CTR2
vmovdqu CTR3, 48(CT)
vpshufb TMP3, CTR3, CTR3
vmovdqu CTR4, 64(CT)
vpshufb TMP3, CTR4, CTR4
vmovdqu CTR5, 80(CT)
vpshufb TMP3, CTR5, CTR5
vmovdqu CTR6, 96(CT)
vpshufb TMP3, CTR6, CTR6
vmovdqu CTR7, 112(CT)
vpshufb TMP3, CTR7, CTR7

lea 128(CT), CT
lea 128(PT), PT
jmp .LDataOctets

# Encrypt 8 blocks each time while hashing previous 8 blocks
.align 64
.LDataOctets:
cmp $128, len
jb .LEndOctets
sub $128, len

vmovdqa CTR7, TMP5
vmovdqa CTR6, 1*16(%rsp)
vmovdqa CTR5, 2*16(%rsp)
vmovdqa CTR4, 3*16(%rsp)
vmovdqa CTR3, 4*16(%rsp)
vmovdqa CTR2, 5*16(%rsp)
vmovdqa CTR1, 6*16(%rsp)
vmovdqa CTR0, 7*16(%rsp)

vmovdqa CTR, CTR0
vpaddd .Lone(%rip), CTR0, CTR1
vpaddd .Ltwo(%rip), CTR0, CTR2
vpaddd .Lone(%rip), CTR2, CTR3
vpaddd .Ltwo(%rip), CTR2, CTR4
vpaddd .Lone(%rip), CTR4, CTR5
vpaddd .Ltwo(%rip), CTR4, CTR6
vpaddd .Lone(%rip), CTR6, CTR7
vpaddd .Ltwo(%rip), CTR6, CTR

vmovdqu (KS), TMP4
vpshufb TMP3, CTR0, CTR0
vpxor TMP4, CTR0, CTR0
vpshufb TMP3, CTR1, CTR1
vpxor TMP4, CTR1, CTR1
vpshufb TMP3, CTR2, CTR2
vpxor TMP4, CTR2, CTR2
vpshufb TMP3, CTR3, CTR3
vpxor TMP4, CTR3, CTR3
vpshufb TMP3, CTR4, CTR4
vpxor TMP4, CTR4, CTR4
vpshufb TMP3, CTR5, CTR5
vpxor TMP4, CTR5, CTR5
vpshufb TMP3, CTR6, CTR6
vpxor TMP4, CTR6, CTR6
vpshufb TMP3, CTR7, CTR7
vpxor TMP4, CTR7, CTR7

vmovdqu 16*0(Htbl), TMP3
vpclmulqdq $0x11, TMP3, TMP5, TMP1
vpclmulqdq $0x00, TMP3, TMP5, TMP2
vpshufd $78, TMP5, TMP3
vpxor TMP5, TMP3, TMP5
vmovdqu 128+0*16(Htbl), TMP3
vpclmulqdq $0x00, TMP3, TMP5, TMP0

ROUNDMUL 1

ROUNDMUL 2

ROUNDMUL 3

ROUNDMUL 4

ROUNDMUL 5

ROUNDMUL 6

vpxor 7*16(%rsp), T, TMP5
vmovdqu 7*16(KS), TMP3

vaesenc TMP3, CTR0, CTR0
vaesenc TMP3, CTR1, CTR1
vaesenc TMP3, CTR2, CTR2
vaesenc TMP3, CTR3, CTR3

vpshufd $78, TMP5, TMP4
vpxor TMP5, TMP4, TMP4

vaesenc TMP3, CTR4, CTR4
vaesenc TMP3, CTR5, CTR5
vaesenc TMP3, CTR6, CTR6
vaesenc TMP3, CTR7, CTR7

vpclmulqdq $0x11, 7*16(Htbl), TMP5, TMP3
vpxor TMP3, TMP1, TMP1
vpclmulqdq $0x00, 7*16(Htbl), TMP5, TMP3
vpxor TMP3, TMP2, TMP2
vpclmulqdq $0x00, 128+7*16(Htbl), TMP4, TMP3
vpxor TMP3, TMP0, TMP0

ROUND 8
vmovdqa .Lpoly(%rip), TMP5

vpxor TMP1, TMP0, TMP0
vpxor TMP2, TMP0, TMP0
vpsrldq $8, TMP0, TMP3
vpxor TMP3, TMP1, TMP4
vpslldq $8, TMP0, TMP3
vpxor TMP3, TMP2, T

vpclmulqdq $0x10, TMP5, T, TMP1
vpalignr $8, T, T, T
vpxor T, TMP1, T

ROUND 9

vpclmulqdq $0x10, TMP5, T, TMP1
vpalignr $8, T, T, T
vpxor T, TMP1, T

vmovdqu 160(KS), TMP5
cmp $10, NR
jbe .LLast2

ROUND 10
ROUND 11

vmovdqu 192(KS), TMP5
cmp $12, NR
jbe .LLast2

ROUND 12
ROUND 13

vmovdqu 224(KS), TMP5

.LLast2:

vpxor (PT), TMP5, TMP3
vaesenclast TMP3, CTR0, CTR0
vpxor 16(PT), TMP5, TMP3
vaesenclast TMP3, CTR1, CTR1
vpxor 32(PT), TMP5, TMP3
vaesenclast TMP3, CTR2, CTR2
vpxor 48(PT), TMP5, TMP3
vaesenclast TMP3, CTR3, CTR3
vpxor 64(PT), TMP5, TMP3
vaesenclast TMP3, CTR4, CTR4
vpxor 80(PT), TMP5, TMP3
vaesenclast TMP3, CTR5, CTR5
vpxor 96(PT), TMP5, TMP3
vaesenclast TMP3, CTR6, CTR6
vpxor 112(PT), TMP5, TMP3
vaesenclast TMP3, CTR7, CTR7

vmovdqu .Lbswap_mask(%rip), TMP3

vmovdqu CTR0, (CT)
vpshufb TMP3, CTR0, CTR0
vmovdqu CTR1, 16(CT)
vpshufb TMP3, CTR1, CTR1
vmovdqu CTR2, 32(CT)
vpshufb TMP3, CTR2, CTR2
vmovdqu CTR3, 48(CT)
vpshufb TMP3, CTR3, CTR3
vmovdqu CTR4, 64(CT)
vpshufb TMP3, CTR4, CTR4
vmovdqu CTR5, 80(CT)
vpshufb TMP3, CTR5, CTR5
vmovdqu CTR6, 96(CT)
vpshufb TMP3, CTR6, CTR6
vmovdqu CTR7,112(CT)
vpshufb TMP3, CTR7, CTR7

vpxor TMP4, T, T

lea 128(CT), CT
lea 128(PT), PT
jmp .LDataOctets

.LEndOctets:

vmovdqa CTR7, TMP5
vmovdqa CTR6, 1*16(%rsp)
vmovdqa CTR5, 2*16(%rsp)
vmovdqa CTR4, 3*16(%rsp)
vmovdqa CTR3, 4*16(%rsp)
vmovdqa CTR2, 5*16(%rsp)
vmovdqa CTR1, 6*16(%rsp)
vmovdqa CTR0, 7*16(%rsp)

vmovdqu 16*0(Htbl), TMP3
vpclmulqdq $0x11, TMP3, TMP5, TMP1
vpclmulqdq $0x00, TMP3, TMP5, TMP2
vpshufd $78, TMP5, TMP3
vpxor TMP5, TMP3, TMP5
vmovdqu 128+0*16(Htbl), TMP3
vpclmulqdq $0x00, TMP3, TMP5, TMP0

KARATSUBA 1
KARATSUBA 2
KARATSUBA 3
KARATSUBA 4
KARATSUBA 5
KARATSUBA 6

vmovdqu 7*16(%rsp), TMP5
vpxor T, TMP5, TMP5
vmovdqu 16*7(Htbl), TMP4
vpclmulqdq $0x11, TMP4, TMP5, TMP3
vpxor TMP3, TMP1, TMP1
vpclmulqdq $0x00, TMP4, TMP5, TMP3
vpxor TMP3, TMP2, TMP2
vpshufd $78, TMP5, TMP3
vpxor TMP5, TMP3, TMP5
vmovdqu 128+7*16(Htbl), TMP4
vpclmulqdq $0x00, TMP4, TMP5, TMP3
vpxor TMP3, TMP0, TMP0

vpxor TMP1, TMP0, TMP0
vpxor TMP2, TMP0, TMP0

vpsrldq $8, TMP0, TMP3
vpxor TMP3, TMP1, TMP4
vpslldq $8, TMP0, TMP3
vpxor TMP3, TMP2, T

vmovdqa .Lpoly(%rip), TMP2

vpalignr $8, T, T, TMP1
vpclmulqdq $0x10, TMP2, T, T
vpxor T, TMP1, T

vpalignr $8, T, T, TMP1
vpclmulqdq $0x10, TMP2, T, T
vpxor T, TMP1, T

vpxor TMP4, T, T

# Here we encrypt any remaining whole block
.LDataSingles:

cmp $16, len
jb .LDataTail
sub $16, len

vpshufb .Lbswap_mask(%rip), CTR, TMP1
vpaddd .Lone(%rip), CTR, CTR

vpxor (KS), TMP1, TMP1
vaesenc 16*1(KS), TMP1, TMP1
vaesenc 16*2(KS), TMP1, TMP1
vaesenc 16*3(KS), TMP1, TMP1
vaesenc 16*4(KS), TMP1, TMP1
vaesenc 16*5(KS), TMP1, TMP1
vaesenc 16*6(KS), TMP1, TMP1
vaesenc 16*7(KS), TMP1, TMP1
vaesenc 16*8(KS), TMP1, TMP1
vaesenc 16*9(KS), TMP1, TMP1
vmovdqu 16*10(KS), TMP2
cmp $10, NR
je .LLast3
vaesenc 16*10(KS), TMP1, TMP1
vaesenc 16*11(KS), TMP1, TMP1
vmovdqu 16*12(KS), TMP2
cmp $12, NR
je .LLast3
vaesenc 16*12(KS), TMP1, TMP1
vaesenc 16*13(KS), TMP1, TMP1
vmovdqu 16*14(KS), TMP2

.LLast3:
vaesenclast TMP2, TMP1, TMP1

vpxor (PT), TMP1, TMP1
vmovdqu TMP1, (CT)
addq $16, CT
addq $16, PT

vpshufb .Lbswap_mask(%rip), TMP1, TMP1
vpxor TMP1, T, T
vmovdqu (Htbl), TMP0
call GFMUL

jmp .LDataSingles

# Here we encrypt the final partial block, if there is one
.LDataTail:

test len, len
jz DATA_END
# First prepare the counter block
vpshufb .Lbswap_mask(%rip), CTR, TMP1
vpaddd .Lone(%rip), CTR, CTR

vpxor (KS), TMP1, TMP1
vaesenc 16*1(KS), TMP1, TMP1
vaesenc 16*2(KS), TMP1, TMP1
vaesenc 16*3(KS), TMP1, TMP1
vaesenc 16*4(KS), TMP1, TMP1
vaesenc 16*5(KS), TMP1, TMP1
vaesenc 16*6(KS), TMP1, TMP1
vaesenc 16*7(KS), TMP1, TMP1
vaesenc 16*8(KS), TMP1, TMP1
vaesenc 16*9(KS), TMP1, TMP1
vmovdqu 16*10(KS), TMP2
cmp $10, NR
je .LLast4
vaesenc 16*10(KS), TMP1, TMP1
vaesenc 16*11(KS), TMP1, TMP1
vmovdqu 16*12(KS), TMP2
cmp $12, NR
je .LLast4
vaesenc 16*12(KS), TMP1, TMP1
vaesenc 16*13(KS), TMP1, TMP1
vmovdqu 16*14(KS), TMP2

.LLast4:
vaesenclast TMP2, TMP1, TMP1
# Zero a temp location
vpxor TMP2, TMP2, TMP2
vmovdqa TMP2, (%rsp)

# Copy the required bytes only (could probably use rep movsb)
xor KS, KS
.LEncCpy:
cmp KS, len
je .LEncCpyEnd
movb (PT, KS, 1), %r8b
movb %r8b, (%rsp, KS, 1)
inc KS
jmp .LEncCpy
.LEncCpyEnd:
# Xor with the counter block
vpxor (%rsp), TMP1, TMP0
# Again, store at temp location
vmovdqa TMP0, (%rsp)
# Copy only the required bytes to CT, and zero the rest for the hash
xor KS, KS
.LEncCpy2:
cmp KS, len
je .LEncCpy3
movb (%rsp, KS, 1), %r8b
movb %r8b, (CT, KS, 1)
inc KS
jmp .LEncCpy2
.LEncCpy3:
cmp $16, KS
je .LEndCpy3
movb $0, (%rsp, KS, 1)
inc KS
jmp .LEncCpy3
.LEndCpy3:
vmovdqa (%rsp), TMP0

vpshufb .Lbswap_mask(%rip), TMP0, TMP0
vpxor TMP0, T, T
vmovdqu (Htbl), TMP0
call GFMUL

DATA_END:

vpshufb .Lbswap_mask(%rip), T, T
vpshufb .Lbswap_mask(%rip), CTR, CTR
vmovdqu T, 272(Gctx)
vmovdqu CTR, 288(Gctx)

movq %rbp, %rsp

popq %rbx
popq %rbp
ret
.size intel_aes_gcmENC, .-intel_aes_gcmENC

#########################
# Decrypt and Authenticate
# void intel_aes_gcmDEC(uint8_t* PT, uint8_t* CT, void *Gctx,uint64_t len);
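#
# Same structure as intel_aes_gcmENC, with one difference: GCM hashes the
# ciphertext, which on decryption is available up front, so each 8-block
# group is hashed and decrypted in the same pass (DEC_KARATSUBA below reads
# the ciphertext directly instead of spilled registers). Sketch:
#
#   while (len >= 128) {
#       GHASH-accumulate CT[0..7] with H^8..H^1;
#       for (j = 0; j < 8; j++) PT[j] = CT[j] ^ AES_K(CTR + j);
#   }
#   /* then whole blocks one at a time, then a zero-padded partial block */
#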
.type intel_aes_gcmDEC,@function
.globl intel_aes_gcmDEC
.align 16
intel_aes_gcmDEC:
# parameter 1: CT # input
# parameter 2: PT # output
# parameter 3: %rdx # Gctx
# parameter 4: %rcx # len

.macro DEC_KARATSUBA i
vmovdqu (7-\i)*16(CT), TMP5
vpshufb .Lbswap_mask(%rip), TMP5, TMP5

vpclmulqdq $0x11, 16*\i(Htbl), TMP5, TMP3
vpxor TMP3, TMP1, TMP1
vpclmulqdq $0x00, 16*\i(Htbl), TMP5, TMP3
vpxor TMP3, TMP2, TMP2
vpshufd $78, TMP5, TMP3
vpxor TMP5, TMP3, TMP5
vpclmulqdq $0x00, 128+\i*16(Htbl), TMP5, TMP3
vpxor TMP3, TMP0, TMP0
.endm

.set PT,%rsi
.set CT,%rdi
.set Htbl, %rdx
.set len, %rcx
.set KS,%r9
.set NR,%r10d

.set Gctx, %rdx

.set T,%xmm0
.set TMP0,%xmm1
.set TMP1,%xmm2
.set TMP2,%xmm3
.set TMP3,%xmm4
.set TMP4,%xmm5
.set TMP5,%xmm6
.set CTR0,%xmm7
.set CTR1,%xmm8
.set CTR2,%xmm9
.set CTR3,%xmm10
.set CTR4,%xmm11
.set CTR5,%xmm12
.set CTR6,%xmm13
.set CTR7,%xmm14
.set CTR,%xmm15

test len, len
jnz .LbeginDec
ret

.LbeginDec:

pushq %rbp
pushq %rbx
movq %rsp, %rbp
sub $128, %rsp
andq $-16, %rsp
vmovdqu 288(Gctx), CTR
vmovdqu 272(Gctx), T
mov 304(Gctx), KS
mov 4(KS), NR
lea 48(KS), KS

vpshufb .Lbswap_mask(%rip), CTR, CTR
vpshufb .Lbswap_mask(%rip), T, T

vmovdqu .Lbswap_mask(%rip), TMP3
jmp .LDECOctets

# Decrypt 8 blocks each time while hashing them at the same time
.align 64
.LDECOctets:

cmp $128, len
jb .LDECSingles
sub $128, len

vmovdqa CTR, CTR0
vpaddd .Lone(%rip), CTR0, CTR1
vpaddd .Ltwo(%rip), CTR0, CTR2
vpaddd .Lone(%rip), CTR2, CTR3
vpaddd .Ltwo(%rip), CTR2, CTR4
vpaddd .Lone(%rip), CTR4, CTR5
vpaddd .Ltwo(%rip), CTR4, CTR6
vpaddd .Lone(%rip), CTR6, CTR7
vpaddd .Ltwo(%rip), CTR6, CTR

vpshufb TMP3, CTR0, CTR0
vpshufb TMP3, CTR1, CTR1
vpshufb TMP3, CTR2, CTR2
vpshufb TMP3, CTR3, CTR3
vpshufb TMP3, CTR4, CTR4
vpshufb TMP3, CTR5, CTR5
vpshufb TMP3, CTR6, CTR6
vpshufb TMP3, CTR7, CTR7

vmovdqu (KS), TMP3
vpxor TMP3, CTR0, CTR0
vpxor TMP3, CTR1, CTR1
vpxor TMP3, CTR2, CTR2
vpxor TMP3, CTR3, CTR3
vpxor TMP3, CTR4, CTR4
vpxor TMP3, CTR5, CTR5
vpxor TMP3, CTR6, CTR6
vpxor TMP3, CTR7, CTR7

vmovdqu 7*16(CT), TMP5
vpshufb .Lbswap_mask(%rip), TMP5, TMP5
vmovdqu 16*0(Htbl), TMP3
vpclmulqdq $0x11, TMP3, TMP5, TMP1
vpclmulqdq $0x00, TMP3, TMP5, TMP2
vpshufd $78, TMP5, TMP3
vpxor TMP5, TMP3, TMP5
vmovdqu 128+0*16(Htbl), TMP3
vpclmulqdq $0x00, TMP3, TMP5, TMP0

ROUND 1
DEC_KARATSUBA 1

ROUND 2
DEC_KARATSUBA 2

ROUND 3
DEC_KARATSUBA 3

ROUND 4
DEC_KARATSUBA 4

ROUND 5
DEC_KARATSUBA 5

ROUND 6
DEC_KARATSUBA 6

ROUND 7

vmovdqu 0*16(CT), TMP5
vpshufb .Lbswap_mask(%rip), TMP5, TMP5
vpxor T, TMP5, TMP5
vmovdqu 16*7(Htbl), TMP4

vpclmulqdq $0x11, TMP4, TMP5, TMP3
vpxor TMP3, TMP1, TMP1
vpclmulqdq $0x00, TMP4, TMP5, TMP3
vpxor TMP3, TMP2, TMP2

vpshufd $78, TMP5, TMP3
vpxor TMP5, TMP3, TMP5
vmovdqu 128+7*16(Htbl), TMP4

vpclmulqdq $0x00, TMP4, TMP5, TMP3
vpxor TMP3, TMP0, TMP0

ROUND 8

vpxor TMP1, TMP0, TMP0
vpxor TMP2, TMP0, TMP0

vpsrldq $8, TMP0, TMP3
vpxor TMP3, TMP1, TMP4
vpslldq $8, TMP0, TMP3
vpxor TMP3, TMP2, T
vmovdqa .Lpoly(%rip), TMP2

vpalignr $8, T, T, TMP1
vpclmulqdq $0x10, TMP2, T, T
vpxor T, TMP1, T

ROUND 9

vpalignr $8, T, T, TMP1
vpclmulqdq $0x10, TMP2, T, T
vpxor T, TMP1, T

vmovdqu 160(KS), TMP5
cmp $10, NR

jbe .LDECLast1

ROUND 10
ROUND 11

vmovdqu 192(KS), TMP5
cmp $12, NR

jbe .LDECLast1

ROUND 12
ROUND 13

vmovdqu 224(KS), TMP5

.LDECLast1:

vpxor (CT), TMP5, TMP3
vaesenclast TMP3, CTR0, CTR0
vpxor 16(CT), TMP5, TMP3
vaesenclast TMP3, CTR1, CTR1
vpxor 32(CT), TMP5, TMP3
vaesenclast TMP3, CTR2, CTR2
vpxor 48(CT), TMP5, TMP3
vaesenclast TMP3, CTR3, CTR3
vpxor 64(CT), TMP5, TMP3
vaesenclast TMP3, CTR4, CTR4
vpxor 80(CT), TMP5, TMP3
vaesenclast TMP3, CTR5, CTR5
vpxor 96(CT), TMP5, TMP3
vaesenclast TMP3, CTR6, CTR6
vpxor 112(CT), TMP5, TMP3
vaesenclast TMP3, CTR7, CTR7

vmovdqu .Lbswap_mask(%rip), TMP3

vmovdqu CTR0, (PT)
vmovdqu CTR1, 16(PT)
vmovdqu CTR2, 32(PT)
vmovdqu CTR3, 48(PT)
vmovdqu CTR4, 64(PT)
vmovdqu CTR5, 80(PT)
vmovdqu CTR6, 96(PT)
vmovdqu CTR7,112(PT)

vpxor TMP4, T, T

lea 128(CT), CT
lea 128(PT), PT
jmp .LDECOctets

# Here we decrypt and hash any remaining whole block
.LDECSingles:

cmp $16, len
jb .LDECTail
sub $16, len

vmovdqu (CT), TMP1
vpshufb .Lbswap_mask(%rip), TMP1, TMP1
vpxor TMP1, T, T
vmovdqu (Htbl), TMP0
call GFMUL


vpshufb .Lbswap_mask(%rip), CTR, TMP1
vpaddd .Lone(%rip), CTR, CTR

vpxor (KS), TMP1, TMP1
vaesenc 16*1(KS), TMP1, TMP1
vaesenc 16*2(KS), TMP1, TMP1
vaesenc 16*3(KS), TMP1, TMP1
vaesenc 16*4(KS), TMP1, TMP1
vaesenc 16*5(KS), TMP1, TMP1
vaesenc 16*6(KS), TMP1, TMP1
vaesenc 16*7(KS), TMP1, TMP1
vaesenc 16*8(KS), TMP1, TMP1
vaesenc 16*9(KS), TMP1, TMP1
vmovdqu 16*10(KS), TMP2
cmp $10, NR
je .LDECLast2
vaesenc 16*10(KS), TMP1, TMP1
vaesenc 16*11(KS), TMP1, TMP1
vmovdqu 16*12(KS), TMP2
cmp $12, NR
je .LDECLast2
vaesenc 16*12(KS), TMP1, TMP1
vaesenc 16*13(KS), TMP1, TMP1
vmovdqu 16*14(KS), TMP2
.LDECLast2:
vaesenclast TMP2, TMP1, TMP1

vpxor (CT), TMP1, TMP1
vmovdqu TMP1, (PT)
addq $16, CT
addq $16, PT
jmp .LDECSingles

# Here we decrypt the final partial block, if there is one
.LDECTail:
test len, len
jz .LDEC_END

vpshufb .Lbswap_mask(%rip), CTR, TMP1
vpaddd .Lone(%rip), CTR, CTR

vpxor (KS), TMP1, TMP1
vaesenc 16*1(KS), TMP1, TMP1
vaesenc 16*2(KS), TMP1, TMP1
vaesenc 16*3(KS), TMP1, TMP1
vaesenc 16*4(KS), TMP1, TMP1
vaesenc 16*5(KS), TMP1, TMP1
vaesenc 16*6(KS), TMP1, TMP1
vaesenc 16*7(KS), TMP1, TMP1
vaesenc 16*8(KS), TMP1, TMP1
vaesenc 16*9(KS), TMP1, TMP1
vmovdqu 16*10(KS), TMP2
cmp $10, NR
je .LDECLast3
vaesenc 16*10(KS), TMP1, TMP1
vaesenc 16*11(KS), TMP1, TMP1
vmovdqu 16*12(KS), TMP2
cmp $12, NR
je .LDECLast3
vaesenc 16*12(KS), TMP1, TMP1
vaesenc 16*13(KS), TMP1, TMP1
vmovdqu 16*14(KS), TMP2

.LDECLast3:
vaesenclast TMP2, TMP1, TMP1

vpxor TMP2, TMP2, TMP2
vmovdqa TMP2, (%rsp)
# Copy the required bytes only (could probably use rep movsb)
xor KS, KS
.LDecCpy:
cmp KS, len
je .LDecCpy2
movb (CT, KS, 1), %r8b
movb %r8b, (%rsp, KS, 1)
inc KS
jmp .LDecCpy
.LDecCpy2:
cmp $16, KS
je .LDecCpyEnd
movb $0, (%rsp, KS, 1)
inc KS
jmp .LDecCpy2
.LDecCpyEnd:
# Xor with the counter block
vmovdqa (%rsp), TMP0
vpxor TMP0, TMP1, TMP1
# Again, store at temp location
vmovdqa TMP1, (%rsp)
# Copy only the required bytes to PT, and zero the rest for the hash
xor KS, KS
.LDecCpy3:
cmp KS, len
je .LDecCpyEnd3
movb (%rsp, KS, 1), %r8b
movb %r8b, (PT, KS, 1)
inc KS
jmp .LDecCpy3
.LDecCpyEnd3:
vpshufb .Lbswap_mask(%rip), TMP0, TMP0
vpxor TMP0, T, T
vmovdqu (Htbl), TMP0
call GFMUL
.LDEC_END:

vpshufb .Lbswap_mask(%rip), T, T
vpshufb .Lbswap_mask(%rip), CTR, CTR
vmovdqu T, 272(Gctx)
vmovdqu CTR, 288(Gctx)

movq %rbp, %rsp

popq %rbx
popq %rbp
ret
.size intel_aes_gcmDEC, .-intel_aes_gcmDEC
#########################
# a = T
# b = TMP0 - remains unchanged
# res = T
# uses also TMP1,TMP2,TMP3,TMP4
# __m128i GFMUL(__m128i A, __m128i B);
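#
# Reference for checking results: GFMUL implements multiplication in
# GF(2^128) modulo x^128 + x^7 + x^2 + x + 1. The commented C below is the
# straightforward bit-at-a-time version in the NIST SP 800-38D bit order;
# it is a sketch for testing against byte-swapped inputs/outputs and does
# NOT reproduce the byte-reflected, "times 2" internal representation or
# the Karatsuba/folded reduction used by the code below.
#
#   #include <stdint.h>
#   typedef struct { uint64_t hi, lo; } be128;   /* hi = bits 127..64 */
#
#   static be128 gf128_mul(be128 x, be128 y) {
#       be128 z = {0, 0}, v = y;
#       for (int i = 0; i < 128; i++) {
#           uint64_t xbit = (i < 64) ? (x.hi >> (63 - i)) & 1
#                                    : (x.lo >> (127 - i)) & 1;
#           if (xbit) { z.hi ^= v.hi; z.lo ^= v.lo; }
#           uint64_t lsb = v.lo & 1;                 /* multiply v by x:  */
#           v.lo = (v.lo >> 1) | (v.hi << 63);       /*   shift right     */
#           v.hi >>= 1;
#           if (lsb) v.hi ^= 0xE100000000000000ULL;  /*   reduce          */
#       }
#       return z;
#   }
#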
.type GFMUL,@function
.globl GFMUL
GFMUL:
vpclmulqdq $0x00, TMP0, T, TMP1
vpclmulqdq $0x11, TMP0, T, TMP4

vpshufd $78, T, TMP2
vpshufd $78, TMP0, TMP3
vpxor T, TMP2, TMP2
vpxor TMP0, TMP3, TMP3

vpclmulqdq $0x00, TMP3, TMP2, TMP2
vpxor TMP1, TMP2, TMP2
vpxor TMP4, TMP2, TMP2

vpslldq $8, TMP2, TMP3
vpsrldq $8, TMP2, TMP2

vpxor TMP3, TMP1, TMP1
vpxor TMP2, TMP4, TMP4

vpclmulqdq $0x10, .Lpoly(%rip), TMP1, TMP2
vpshufd $78, TMP1, TMP3
vpxor TMP3, TMP2, TMP1

vpclmulqdq $0x10, .Lpoly(%rip), TMP1, TMP2
vpshufd $78, TMP1, TMP3
vpxor TMP3, TMP2, TMP1

vpxor TMP4, TMP1, T
ret
.size GFMUL, .-GFMUL