Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purposes.
1 # LICENSE:
2 # This submission to NSS is to be made available under the terms of the
3 # Mozilla Public License, v. 2.0. You can obtain one at
4 # http://mozilla.org/MPL/2.0/.
5 ################################################################################
6 # Copyright (c) 2012, Intel Corp.
8 .align 16
9 .Lone:
10 .quad 1,0
11 .Ltwo:
12 .quad 2,0
13 .Lbswap_mask:
14 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
15 .Lshuff_mask:
16 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
17 .Lpoly:
18 .quad 0x1, 0xc200000000000000
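# Constants: .Lone/.Ltwo step the 32-bit counter field of a counter block,
# .Lbswap_mask reverses the byte order of a 16-byte block, .Lshuff_mask is
# used to broadcast the top byte of a register during key setup, and .Lpoly
# holds the constant the reduction steps fold against.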
21 ################################################################################
22 # Generates the final GCM tag
23 # void intel_aes_gcmTAG(uint8_t Htbl[16*16], uint8_t *Tp, uint64_t Mlen, uint64_t Alen, uint8_t* X0, uint8_t* TAG);
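#
# Rough C-level sketch of the computation below (illustration only; names
# follow the .set aliases, and Htbl[0] is the hash key entry prepared by
# intel_aes_gcmINIT):
#   T    = byteswap(*Tp);                  /* running GHASH state          */
#   T   ^= (Mlen*8) | ((Alen*8) << 64);    /* lengths block, in bits       */
#   T    = GFMUL(T, Htbl[0]);              /* final GHASH multiplication   */
#   *TAG = byteswap(T) ^ *X0;              /* X0 = encrypted first counter */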
24 .type intel_aes_gcmTAG,@function
25 .globl intel_aes_gcmTAG
26 .align 16
27 intel_aes_gcmTAG:
29 .set Htbl, %rdi
30 .set Tp, %rsi
31 .set Mlen, %rdx
32 .set Alen, %rcx
33 .set X0, %r8
34 .set TAG, %r9
36 .set T,%xmm0
37 .set TMP0,%xmm1
39 vmovdqu (Tp), T
40 vpshufb .Lbswap_mask(%rip), T, T
41 vpxor TMP0, TMP0, TMP0
42 shl $3, Mlen
43 shl $3, Alen
44 vpinsrq $0, Mlen, TMP0, TMP0
45 vpinsrq $1, Alen, TMP0, TMP0
46 vpxor TMP0, T, T
47 vmovdqu (Htbl), TMP0
48 call GFMUL
49 vpshufb .Lbswap_mask(%rip), T, T
50 vpxor (X0), T, T
51 vmovdqu T, (TAG)
53 ret
54 .size intel_aes_gcmTAG, .-intel_aes_gcmTAG
55 ################################################################################
56 # Generates the H table
57 # void intel_aes_gcmINIT(uint8_t Htbl[16*16], uint8_t *KS, int NR);
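#
# Rough map of what the code below builds (offsets in bytes; Htbl is the
# 16*16-byte table from the prototype above):
#   Htbl+0   .. Htbl+127 : H*2, (H^2)*2, ..., (H^8)*2, one 16-byte entry each,
#                          where H = AES_K(0^128) and "*2" is one doubling in
#                          GF(2^128) (see the H' calculation further down)
#   Htbl+128 .. Htbl+255 : for each entry above, the XOR of its two 64-bit
#                          halves, stored ready to use as the middle operand
#                          of the Karatsuba multiplications in the bulk loops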
58 .type intel_aes_gcmINIT,@function
59 .globl intel_aes_gcmINIT
60 .align 16
61 intel_aes_gcmINIT:
63 .set Htbl, %rdi
64 .set KS, %rsi
65 .set NR, %edx
67 .set T,%xmm0
68 .set TMP0,%xmm1
70 CALCULATE_POWERS_OF_H:
71 vmovdqu 16*0(KS), T
72 vaesenc 16*1(KS), T, T
73 vaesenc 16*2(KS), T, T
74 vaesenc 16*3(KS), T, T
75 vaesenc 16*4(KS), T, T
76 vaesenc 16*5(KS), T, T
77 vaesenc 16*6(KS), T, T
78 vaesenc 16*7(KS), T, T
79 vaesenc 16*8(KS), T, T
80 vaesenc 16*9(KS), T, T
81 vmovdqu 16*10(KS), TMP0
82 cmp $10, NR
83 je .LH0done
84 vaesenc 16*10(KS), T, T
85 vaesenc 16*11(KS), T, T
86 vmovdqu 16*12(KS), TMP0
87 cmp $12, NR
88 je .LH0done
89 vaesenc 16*12(KS), T, T
90 vaesenc 16*13(KS), T, T
91 vmovdqu 16*14(KS), TMP0
93 .LH0done:
94 vaesenclast TMP0, T, T
96 vpshufb .Lbswap_mask(%rip), T, T
98 vmovdqu T, TMP0
99 # Calculate H' = GFMUL(H, 2)
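# Branchless doubling: the top bit of H is broadcast into a byte mask (via
# .Lshuff_mask and the 0xff00 constant below), the mask selects either .Lpoly
# or zero, H is shifted left one bit across the whole 128-bit lane, and the
# selected value is XORed in as the conditional reduction.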
100 vpsrld $7 , T , %xmm3
101 vmovdqu .Lshuff_mask(%rip), %xmm4
102 vpshufb %xmm4, %xmm3 , %xmm3
103 movq $0xff00 , %rax
104 vmovq %rax, %xmm4
105 vpshufb %xmm3, %xmm4 , %xmm4
106 vmovdqu .Lpoly(%rip), %xmm5
107 vpand %xmm4, %xmm5, %xmm5
108 vpsrld $31, T, %xmm3
109 vpslld $1, T, %xmm4
110 vpslldq $4, %xmm3, %xmm3
111 vpxor %xmm3, %xmm4, T # T now holds p(x)<<1
113 # add the conditional reduction term held in xmm5 to p(x)<<1
114 vpxor %xmm5, T, T
115 vmovdqu T, TMP0
116 vmovdqu T, (Htbl) # H * 2
117 call GFMUL
118 vmovdqu T, 16(Htbl) # H^2 * 2
119 call GFMUL
120 vmovdqu T, 32(Htbl) # H^3 * 2
121 call GFMUL
122 vmovdqu T, 48(Htbl) # H^4 * 2
123 call GFMUL
124 vmovdqu T, 64(Htbl) # H^5 * 2
125 call GFMUL
126 vmovdqu T, 80(Htbl) # H^6 * 2
127 call GFMUL
128 vmovdqu T, 96(Htbl) # H^7 * 2
129 call GFMUL
130 vmovdqu T, 112(Htbl) # H^8 * 2
132 # Precompute the Karatsuba middle operands: for each stored power of H, the XOR of its two 64-bit halves
133 vpshufd $78, (Htbl), %xmm8
134 vpshufd $78, 16(Htbl), %xmm9
135 vpshufd $78, 32(Htbl), %xmm10
136 vpshufd $78, 48(Htbl), %xmm11
137 vpshufd $78, 64(Htbl), %xmm12
138 vpshufd $78, 80(Htbl), %xmm13
139 vpshufd $78, 96(Htbl), %xmm14
140 vpshufd $78, 112(Htbl), %xmm15
142 vpxor (Htbl), %xmm8, %xmm8
143 vpxor 16(Htbl), %xmm9, %xmm9
144 vpxor 32(Htbl), %xmm10, %xmm10
145 vpxor 48(Htbl), %xmm11, %xmm11
146 vpxor 64(Htbl), %xmm12, %xmm12
147 vpxor 80(Htbl), %xmm13, %xmm13
148 vpxor 96(Htbl), %xmm14, %xmm14
149 vpxor 112(Htbl), %xmm15, %xmm15
151 vmovdqu %xmm8, 128(Htbl)
152 vmovdqu %xmm9, 144(Htbl)
153 vmovdqu %xmm10, 160(Htbl)
154 vmovdqu %xmm11, 176(Htbl)
155 vmovdqu %xmm12, 192(Htbl)
156 vmovdqu %xmm13, 208(Htbl)
157 vmovdqu %xmm14, 224(Htbl)
158 vmovdqu %xmm15, 240(Htbl)
160 ret
161 .size intel_aes_gcmINIT, .-intel_aes_gcmINIT
162 ################################################################################
163 # Authenticate only
164 # void intel_aes_gcmAAD(uint8_t Htbl[16*16], uint8_t *AAD, uint64_t Alen, uint8_t *Tp);
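#
# Outline of the code below: the AAD is hashed 8 blocks per iteration with a
# single deferred reduction; within a group, block j (j = 0..7) is multiplied
# by H^(8-j), so the eight products simply XOR together.  If the total length
# is not a multiple of 8 blocks, the leading len mod 8 blocks are folded in
# first against the correspondingly lower powers of H.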
166 .globl intel_aes_gcmAAD
167 .type intel_aes_gcmAAD,@function
168 .align 16
169 intel_aes_gcmAAD:
171 .set DATA, %xmm0
172 .set T, %xmm1
173 .set BSWAP_MASK, %xmm2
174 .set TMP0, %xmm3
175 .set TMP1, %xmm4
176 .set TMP2, %xmm5
177 .set TMP3, %xmm6
178 .set TMP4, %xmm7
179 .set Xhi, %xmm9
181 .set Htbl, %rdi
182 .set inp, %rsi
183 .set len, %rdx
184 .set Tp, %rcx
186 .set hlp0, %r11
188 .macro KARATSUBA_AAD i
189 vpclmulqdq $0x00, 16*\i(Htbl), DATA, TMP3
190 vpxor TMP3, TMP0, TMP0
191 vpclmulqdq $0x11, 16*\i(Htbl), DATA, TMP3
192 vpxor TMP3, TMP1, TMP1
193 vpshufd $78, DATA, TMP3
194 vpxor DATA, TMP3, TMP3
195 vpclmulqdq $0x00, 16*(\i+8)(Htbl), TMP3, TMP3
196 vpxor TMP3, TMP2, TMP2
197 .endm
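# KARATSUBA_AAD i accumulates the three carry-less partial products of DATA
# against table entry i (H^(i+1)*2): low*low into TMP0, high*high into TMP1,
# and (hi^lo)*(hi^lo) into TMP2 using the precomputed operand at Htbl+128+16*i,
# so a whole 8-block group needs only one reduction at the end.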
199 test len, len
200 jnz .LbeginAAD
201 ret
203 .LbeginAAD:
205 push hlp0
206 vzeroupper
208 vmovdqa .Lbswap_mask(%rip), BSWAP_MASK
210 vpxor Xhi, Xhi, Xhi
212 vmovdqu (Tp),T
213 vpshufb BSWAP_MASK,T,T
215 # we hash 8 blocks per iteration; if the total number of blocks is not a multiple of 8, we hash the first n%8 blocks first
216 mov len, hlp0
217 and $~-128, hlp0
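# ($~-128 is 127, so hlp0 = len mod 128: the number of bytes that do not fill
# a complete 8-block group)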
219 jz .Lmod_loop
221 sub hlp0, len
222 sub $16, hlp0
224 #hash first prefix block
225 vmovdqu (inp), DATA
226 vpshufb BSWAP_MASK, DATA, DATA
227 vpxor T, DATA, DATA
229 vpclmulqdq $0x00, (Htbl, hlp0), DATA, TMP0
230 vpclmulqdq $0x11, (Htbl, hlp0), DATA, TMP1
231 vpshufd $78, DATA, TMP2
232 vpxor DATA, TMP2, TMP2
233 vpclmulqdq $0x00, 16*8(Htbl, hlp0), TMP2, TMP2
235 lea 16(inp), inp
236 test hlp0, hlp0
237 jnz .Lpre_loop
238 jmp .Lred1
240 #hash remaining prefix blocks (up to 7 total prefix blocks)
241 .align 64
242 .Lpre_loop:
244 sub $16, hlp0
246 vmovdqu (inp),DATA # next data block
247 vpshufb BSWAP_MASK,DATA,DATA
249 vpclmulqdq $0x00, (Htbl,hlp0), DATA, TMP3
250 vpxor TMP3, TMP0, TMP0
251 vpclmulqdq $0x11, (Htbl,hlp0), DATA, TMP3
252 vpxor TMP3, TMP1, TMP1
253 vpshufd $78, DATA, TMP3
254 vpxor DATA, TMP3, TMP3
255 vpclmulqdq $0x00, 16*8(Htbl,hlp0), TMP3, TMP3
256 vpxor TMP3, TMP2, TMP2
258 test hlp0, hlp0
260 lea 16(inp), inp
262 jnz .Lpre_loop
264 .Lred1:
265 vpxor TMP0, TMP2, TMP2
266 vpxor TMP1, TMP2, TMP2
267 vpsrldq $8, TMP2, TMP3
268 vpslldq $8, TMP2, TMP2
270 vpxor TMP3, TMP1, Xhi
271 vpxor TMP2, TMP0, T
273 .align 64
274 .Lmod_loop:
275 sub $0x80, len
276 jb .Ldone
278 vmovdqu 16*7(inp),DATA # Ii
279 vpshufb BSWAP_MASK,DATA,DATA
281 vpclmulqdq $0x00, (Htbl), DATA, TMP0
282 vpclmulqdq $0x11, (Htbl), DATA, TMP1
283 vpshufd $78, DATA, TMP2
284 vpxor DATA, TMP2, TMP2
285 vpclmulqdq $0x00, 16*8(Htbl), TMP2, TMP2
286 #########################################################
287 vmovdqu 16*6(inp),DATA
288 vpshufb BSWAP_MASK,DATA,DATA
289 KARATSUBA_AAD 1
290 #########################################################
291 vmovdqu 16*5(inp),DATA
292 vpshufb BSWAP_MASK,DATA,DATA
294 vpclmulqdq $0x10, .Lpoly(%rip), T, TMP4 #reduction stage 1a
295 vpalignr $8, T, T, T
297 KARATSUBA_AAD 2
299 vpxor TMP4, T, T #reduction stage 1b
300 #########################################################
301 vmovdqu 16*4(inp),DATA
302 vpshufb BSWAP_MASK,DATA,DATA
304 KARATSUBA_AAD 3
305 #########################################################
306 vmovdqu 16*3(inp),DATA
307 vpshufb BSWAP_MASK,DATA,DATA
309 vpclmulqdq $0x10, .Lpoly(%rip), T, TMP4 #reduction stage 2a
310 vpalignr $8, T, T, T
312 KARATSUBA_AAD 4
314 vpxor TMP4, T, T #reduction stage 2b
315 #########################################################
316 vmovdqu 16*2(inp),DATA
317 vpshufb BSWAP_MASK,DATA,DATA
319 KARATSUBA_AAD 5
321 vpxor Xhi, T, T #reduction finalize
322 #########################################################
323 vmovdqu 16*1(inp),DATA
324 vpshufb BSWAP_MASK,DATA,DATA
326 KARATSUBA_AAD 6
327 #########################################################
328 vmovdqu 16*0(inp),DATA
329 vpshufb BSWAP_MASK,DATA,DATA
330 vpxor T,DATA,DATA
332 KARATSUBA_AAD 7
333 #########################################################
334 vpxor TMP0, TMP2, TMP2 # karatsuba fixup
335 vpxor TMP1, TMP2, TMP2
336 vpsrldq $8, TMP2, TMP3
337 vpslldq $8, TMP2, TMP2
339 vpxor TMP3, TMP1, Xhi
340 vpxor TMP2, TMP0, T
342 lea 16*8(inp), inp
343 jmp .Lmod_loop
344 #########################################################
346 .Ldone:
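# Fold the 256-bit accumulator Xhi:T back below 128 bits: two folding steps,
# each a vpclmulqdq against .Lpoly followed by a 64-bit rotation of T and an
# XOR; finally the high half Xhi is XORed in.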
347 vpclmulqdq $0x10, .Lpoly(%rip), T, TMP3
348 vpalignr $8, T, T, T
349 vpxor TMP3, T, T
351 vpclmulqdq $0x10, .Lpoly(%rip), T, TMP3
352 vpalignr $8, T, T, T
353 vpxor TMP3, T, T
355 vpxor Xhi, T, T
357 .Lsave:
358 vpshufb BSWAP_MASK,T, T
359 vmovdqu T,(Tp)
360 vzeroupper
362 pop hlp0
363 ret
364 .size intel_aes_gcmAAD,.-intel_aes_gcmAAD
366 ################################################################################
367 # Encrypt and Authenticate
368 # void intel_aes_gcmENC(uint8_t* PT, uint8_t* CT, void *Gctx,uint64_t len);
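#
# Context layout as read by the code below (these offsets are simply what the
# loads use; the owning C structure is not shown here): Gctx+272 holds the
# running GHASH value, Gctx+288 the counter block, and Gctx+304 a pointer to
# the key schedule, whose offset 4 holds the round count NR and whose round
# keys start at offset 48.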
369 .type intel_aes_gcmENC,@function
370 .globl intel_aes_gcmENC
371 .align 16
372 intel_aes_gcmENC:
374 .set PT,%rdi
375 .set CT,%rsi
376 .set Htbl, %rdx
377 .set len, %rcx
378 .set KS,%r9
379 .set NR,%r10d
381 .set Gctx, %rdx
383 .set T,%xmm0
384 .set TMP0,%xmm1
385 .set TMP1,%xmm2
386 .set TMP2,%xmm3
387 .set TMP3,%xmm4
388 .set TMP4,%xmm5
389 .set TMP5,%xmm6
390 .set CTR0,%xmm7
391 .set CTR1,%xmm8
392 .set CTR2,%xmm9
393 .set CTR3,%xmm10
394 .set CTR4,%xmm11
395 .set CTR5,%xmm12
396 .set CTR6,%xmm13
397 .set CTR7,%xmm14
398 .set CTR,%xmm15
400 .macro ROUND i
401 vmovdqu \i*16(KS), TMP3
402 vaesenc TMP3, CTR0, CTR0
403 vaesenc TMP3, CTR1, CTR1
404 vaesenc TMP3, CTR2, CTR2
405 vaesenc TMP3, CTR3, CTR3
406 vaesenc TMP3, CTR4, CTR4
407 vaesenc TMP3, CTR5, CTR5
408 vaesenc TMP3, CTR6, CTR6
409 vaesenc TMP3, CTR7, CTR7
410 .endm
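# ROUND i: apply AES round key i to all eight counter blocks.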
412 .macro ROUNDMUL i
414 vmovdqu \i*16(%rsp), TMP5
415 vmovdqu \i*16(KS), TMP3
417 vaesenc TMP3, CTR0, CTR0
418 vaesenc TMP3, CTR1, CTR1
419 vaesenc TMP3, CTR2, CTR2
420 vaesenc TMP3, CTR3, CTR3
422 vpshufd $78, TMP5, TMP4
423 vpxor TMP5, TMP4, TMP4
425 vaesenc TMP3, CTR4, CTR4
426 vaesenc TMP3, CTR5, CTR5
427 vaesenc TMP3, CTR6, CTR6
428 vaesenc TMP3, CTR7, CTR7
430 vpclmulqdq $0x00, 128+\i*16(Htbl), TMP4, TMP3
431 vpxor TMP3, TMP0, TMP0
432 vmovdqa \i*16(Htbl), TMP4
433 vpclmulqdq $0x11, TMP4, TMP5, TMP3
434 vpxor TMP3, TMP1, TMP1
435 vpclmulqdq $0x00, TMP4, TMP5, TMP3
436 vpxor TMP3, TMP2, TMP2
438 .endm
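# ROUNDMUL i: one AES round on the eight counter blocks, interleaved with the
# Karatsuba partial products of a previously produced ciphertext block
# (stashed at i*16(%rsp)) against table entry i, accumulating into TMP0
# (middle), TMP1 (high) and TMP2 (low).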
440 .macro KARATSUBA i
441 vmovdqu \i*16(%rsp), TMP5
443 vpclmulqdq $0x11, 16*\i(Htbl), TMP5, TMP3
444 vpxor TMP3, TMP1, TMP1
445 vpclmulqdq $0x00, 16*\i(Htbl), TMP5, TMP3
446 vpxor TMP3, TMP2, TMP2
447 vpshufd $78, TMP5, TMP3
448 vpxor TMP5, TMP3, TMP5
449 vpclmulqdq $0x00, 128+\i*16(Htbl), TMP5, TMP3
450 vpxor TMP3, TMP0, TMP0
451 .endm
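# KARATSUBA i: the same accumulation as ROUNDMUL i, without the AES rounds
# (used to hash the final group once no more blocks remain to be encrypted).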
453 test len, len
454 jnz .Lbegin
455 ret
457 .Lbegin:
459 vzeroupper
460 push %rbp
461 push %rbx
463 movq %rsp, %rbp
464 sub $128, %rsp
465 andq $-16, %rsp
467 vmovdqu 288(Gctx), CTR
468 vmovdqu 272(Gctx), T
469 mov 304(Gctx), KS
470 mov 4(KS), NR
471 lea 48(KS), KS
473 vpshufb .Lbswap_mask(%rip), CTR, CTR
474 vpshufb .Lbswap_mask(%rip), T, T
476 cmp $128, len
477 jb .LDataSingles
479 # Encrypt the first eight blocks
480 sub $128, len
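# The counter is kept byte-reflected in CTR so its 32-bit counter field sits
# in the low dword and can be stepped with vpaddd .Lone/.Ltwo; each block is
# byte-swapped back to wire order before being encrypted.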
481 vmovdqa CTR, CTR0
482 vpaddd .Lone(%rip), CTR0, CTR1
483 vpaddd .Ltwo(%rip), CTR0, CTR2
484 vpaddd .Lone(%rip), CTR2, CTR3
485 vpaddd .Ltwo(%rip), CTR2, CTR4
486 vpaddd .Lone(%rip), CTR4, CTR5
487 vpaddd .Ltwo(%rip), CTR4, CTR6
488 vpaddd .Lone(%rip), CTR6, CTR7
489 vpaddd .Ltwo(%rip), CTR6, CTR
491 vpshufb .Lbswap_mask(%rip), CTR0, CTR0
492 vpshufb .Lbswap_mask(%rip), CTR1, CTR1
493 vpshufb .Lbswap_mask(%rip), CTR2, CTR2
494 vpshufb .Lbswap_mask(%rip), CTR3, CTR3
495 vpshufb .Lbswap_mask(%rip), CTR4, CTR4
496 vpshufb .Lbswap_mask(%rip), CTR5, CTR5
497 vpshufb .Lbswap_mask(%rip), CTR6, CTR6
498 vpshufb .Lbswap_mask(%rip), CTR7, CTR7
500 vpxor (KS), CTR0, CTR0
501 vpxor (KS), CTR1, CTR1
502 vpxor (KS), CTR2, CTR2
503 vpxor (KS), CTR3, CTR3
504 vpxor (KS), CTR4, CTR4
505 vpxor (KS), CTR5, CTR5
506 vpxor (KS), CTR6, CTR6
507 vpxor (KS), CTR7, CTR7
509 ROUND 1
510 ROUND 2
511 ROUND 3
512 ROUND 4
513 ROUND 5
514 ROUND 6
515 ROUND 7
516 ROUND 8
517 ROUND 9
519 vmovdqu 160(KS), TMP5
520 cmp $12, NR
521 jb .LLast1
523 ROUND 10
524 ROUND 11
526 vmovdqu 192(KS), TMP5
527 cmp $14, NR
528 jb .LLast1
530 ROUND 12
531 ROUND 13
533 vmovdqu 224(KS), TMP5
535 .LLast1:
537 vpxor (PT), TMP5, TMP3
538 vaesenclast TMP3, CTR0, CTR0
539 vpxor 16(PT), TMP5, TMP3
540 vaesenclast TMP3, CTR1, CTR1
541 vpxor 32(PT), TMP5, TMP3
542 vaesenclast TMP3, CTR2, CTR2
543 vpxor 48(PT), TMP5, TMP3
544 vaesenclast TMP3, CTR3, CTR3
545 vpxor 64(PT), TMP5, TMP3
546 vaesenclast TMP3, CTR4, CTR4
547 vpxor 80(PT), TMP5, TMP3
548 vaesenclast TMP3, CTR5, CTR5
549 vpxor 96(PT), TMP5, TMP3
550 vaesenclast TMP3, CTR6, CTR6
551 vpxor 112(PT), TMP5, TMP3
552 vaesenclast TMP3, CTR7, CTR7
554 vmovdqu .Lbswap_mask(%rip), TMP3
556 vmovdqu CTR0, (CT)
557 vpshufb TMP3, CTR0, CTR0
558 vmovdqu CTR1, 16(CT)
559 vpshufb TMP3, CTR1, CTR1
560 vmovdqu CTR2, 32(CT)
561 vpshufb TMP3, CTR2, CTR2
562 vmovdqu CTR3, 48(CT)
563 vpshufb TMP3, CTR3, CTR3
564 vmovdqu CTR4, 64(CT)
565 vpshufb TMP3, CTR4, CTR4
566 vmovdqu CTR5, 80(CT)
567 vpshufb TMP3, CTR5, CTR5
568 vmovdqu CTR6, 96(CT)
569 vpshufb TMP3, CTR6, CTR6
570 vmovdqu CTR7, 112(CT)
571 vpshufb TMP3, CTR7, CTR7
573 lea 128(CT), CT
574 lea 128(PT), PT
575 jmp .LDataOctets
577 # Encrypt 8 blocks per iteration while hashing the previous 8 blocks
578 .align 64
579 .LDataOctets:
580 cmp $128, len
581 jb .LEndOctets
582 sub $128, len
584 vmovdqa CTR7, TMP5
585 vmovdqa CTR6, 1*16(%rsp)
586 vmovdqa CTR5, 2*16(%rsp)
587 vmovdqa CTR4, 3*16(%rsp)
588 vmovdqa CTR3, 4*16(%rsp)
589 vmovdqa CTR2, 5*16(%rsp)
590 vmovdqa CTR1, 6*16(%rsp)
591 vmovdqa CTR0, 7*16(%rsp)
593 vmovdqa CTR, CTR0
594 vpaddd .Lone(%rip), CTR0, CTR1
595 vpaddd .Ltwo(%rip), CTR0, CTR2
596 vpaddd .Lone(%rip), CTR2, CTR3
597 vpaddd .Ltwo(%rip), CTR2, CTR4
598 vpaddd .Lone(%rip), CTR4, CTR5
599 vpaddd .Ltwo(%rip), CTR4, CTR6
600 vpaddd .Lone(%rip), CTR6, CTR7
601 vpaddd .Ltwo(%rip), CTR6, CTR
603 vmovdqu (KS), TMP4
604 vpshufb TMP3, CTR0, CTR0
605 vpxor TMP4, CTR0, CTR0
606 vpshufb TMP3, CTR1, CTR1
607 vpxor TMP4, CTR1, CTR1
608 vpshufb TMP3, CTR2, CTR2
609 vpxor TMP4, CTR2, CTR2
610 vpshufb TMP3, CTR3, CTR3
611 vpxor TMP4, CTR3, CTR3
612 vpshufb TMP3, CTR4, CTR4
613 vpxor TMP4, CTR4, CTR4
614 vpshufb TMP3, CTR5, CTR5
615 vpxor TMP4, CTR5, CTR5
616 vpshufb TMP3, CTR6, CTR6
617 vpxor TMP4, CTR6, CTR6
618 vpshufb TMP3, CTR7, CTR7
619 vpxor TMP4, CTR7, CTR7
621 vmovdqu 16*0(Htbl), TMP3
622 vpclmulqdq $0x11, TMP3, TMP5, TMP1
623 vpclmulqdq $0x00, TMP3, TMP5, TMP2
624 vpshufd $78, TMP5, TMP3
625 vpxor TMP5, TMP3, TMP5
626 vmovdqu 128+0*16(Htbl), TMP3
627 vpclmulqdq $0x00, TMP3, TMP5, TMP0
629 ROUNDMUL 1
631 ROUNDMUL 2
633 ROUNDMUL 3
635 ROUNDMUL 4
637 ROUNDMUL 5
639 ROUNDMUL 6
641 vpxor 7*16(%rsp), T, TMP5
642 vmovdqu 7*16(KS), TMP3
644 vaesenc TMP3, CTR0, CTR0
645 vaesenc TMP3, CTR1, CTR1
646 vaesenc TMP3, CTR2, CTR2
647 vaesenc TMP3, CTR3, CTR3
649 vpshufd $78, TMP5, TMP4
650 vpxor TMP5, TMP4, TMP4
652 vaesenc TMP3, CTR4, CTR4
653 vaesenc TMP3, CTR5, CTR5
654 vaesenc TMP3, CTR6, CTR6
655 vaesenc TMP3, CTR7, CTR7
657 vpclmulqdq $0x11, 7*16(Htbl), TMP5, TMP3
658 vpxor TMP3, TMP1, TMP1
659 vpclmulqdq $0x00, 7*16(Htbl), TMP5, TMP3
660 vpxor TMP3, TMP2, TMP2
661 vpclmulqdq $0x00, 128+7*16(Htbl), TMP4, TMP3
662 vpxor TMP3, TMP0, TMP0
664 ROUND 8
665 vmovdqa .Lpoly(%rip), TMP5
667 vpxor TMP1, TMP0, TMP0
668 vpxor TMP2, TMP0, TMP0
669 vpsrldq $8, TMP0, TMP3
670 vpxor TMP3, TMP1, TMP4
671 vpslldq $8, TMP0, TMP3
672 vpxor TMP3, TMP2, T
674 vpclmulqdq $0x10, TMP5, T, TMP1
675 vpalignr $8, T, T, T
676 vpxor T, TMP1, T
678 ROUND 9
680 vpclmulqdq $0x10, TMP5, T, TMP1
681 vpalignr $8, T, T, T
682 vpxor T, TMP1, T
684 vmovdqu 160(KS), TMP5
685 cmp $10, NR
686 jbe .LLast2
688 ROUND 10
689 ROUND 11
691 vmovdqu 192(KS), TMP5
692 cmp $12, NR
693 jbe .LLast2
695 ROUND 12
696 ROUND 13
698 vmovdqu 224(KS), TMP5
700 .LLast2:
702 vpxor (PT), TMP5, TMP3
703 vaesenclast TMP3, CTR0, CTR0
704 vpxor 16(PT), TMP5, TMP3
705 vaesenclast TMP3, CTR1, CTR1
706 vpxor 32(PT), TMP5, TMP3
707 vaesenclast TMP3, CTR2, CTR2
708 vpxor 48(PT), TMP5, TMP3
709 vaesenclast TMP3, CTR3, CTR3
710 vpxor 64(PT), TMP5, TMP3
711 vaesenclast TMP3, CTR4, CTR4
712 vpxor 80(PT), TMP5, TMP3
713 vaesenclast TMP3, CTR5, CTR5
714 vpxor 96(PT), TMP5, TMP3
715 vaesenclast TMP3, CTR6, CTR6
716 vpxor 112(PT), TMP5, TMP3
717 vaesenclast TMP3, CTR7, CTR7
719 vmovdqu .Lbswap_mask(%rip), TMP3
721 vmovdqu CTR0, (CT)
722 vpshufb TMP3, CTR0, CTR0
723 vmovdqu CTR1, 16(CT)
724 vpshufb TMP3, CTR1, CTR1
725 vmovdqu CTR2, 32(CT)
726 vpshufb TMP3, CTR2, CTR2
727 vmovdqu CTR3, 48(CT)
728 vpshufb TMP3, CTR3, CTR3
729 vmovdqu CTR4, 64(CT)
730 vpshufb TMP3, CTR4, CTR4
731 vmovdqu CTR5, 80(CT)
732 vpshufb TMP3, CTR5, CTR5
733 vmovdqu CTR6, 96(CT)
734 vpshufb TMP3, CTR6, CTR6
735 vmovdqu CTR7,112(CT)
736 vpshufb TMP3, CTR7, CTR7
738 vpxor TMP4, T, T
740 lea 128(CT), CT
741 lea 128(PT), PT
742 jmp .LDataOctets
744 .LEndOctets:
746 vmovdqa CTR7, TMP5
747 vmovdqa CTR6, 1*16(%rsp)
748 vmovdqa CTR5, 2*16(%rsp)
749 vmovdqa CTR4, 3*16(%rsp)
750 vmovdqa CTR3, 4*16(%rsp)
751 vmovdqa CTR2, 5*16(%rsp)
752 vmovdqa CTR1, 6*16(%rsp)
753 vmovdqa CTR0, 7*16(%rsp)
755 vmovdqu 16*0(Htbl), TMP3
756 vpclmulqdq $0x11, TMP3, TMP5, TMP1
757 vpclmulqdq $0x00, TMP3, TMP5, TMP2
758 vpshufd $78, TMP5, TMP3
759 vpxor TMP5, TMP3, TMP5
760 vmovdqu 128+0*16(Htbl), TMP3
761 vpclmulqdq $0x00, TMP3, TMP5, TMP0
763 KARATSUBA 1
764 KARATSUBA 2
765 KARATSUBA 3
766 KARATSUBA 4
767 KARATSUBA 5
768 KARATSUBA 6
770 vmovdqu 7*16(%rsp), TMP5
771 vpxor T, TMP5, TMP5
772 vmovdqu 16*7(Htbl), TMP4
773 vpclmulqdq $0x11, TMP4, TMP5, TMP3
774 vpxor TMP3, TMP1, TMP1
775 vpclmulqdq $0x00, TMP4, TMP5, TMP3
776 vpxor TMP3, TMP2, TMP2
777 vpshufd $78, TMP5, TMP3
778 vpxor TMP5, TMP3, TMP5
779 vmovdqu 128+7*16(Htbl), TMP4
780 vpclmulqdq $0x00, TMP4, TMP5, TMP3
781 vpxor TMP3, TMP0, TMP0
783 vpxor TMP1, TMP0, TMP0
784 vpxor TMP2, TMP0, TMP0
786 vpsrldq $8, TMP0, TMP3
787 vpxor TMP3, TMP1, TMP4
788 vpslldq $8, TMP0, TMP3
789 vpxor TMP3, TMP2, T
791 vmovdqa .Lpoly(%rip), TMP2
793 vpalignr $8, T, T, TMP1
794 vpclmulqdq $0x10, TMP2, T, T
795 vpxor T, TMP1, T
797 vpalignr $8, T, T, TMP1
798 vpclmulqdq $0x10, TMP2, T, T
799 vpxor T, TMP1, T
801 vpxor TMP4, T, T
803 #Here we encrypt any remaining whole blocks, one at a time
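# Per block: run one counter block through AES, XOR the result with the
# plaintext, store the ciphertext, then fold the byte-swapped ciphertext into
# the GHASH state with a single GFMUL against Htbl[0].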
804 .LDataSingles:
806 cmp $16, len
807 jb .LDataTail
808 sub $16, len
810 vpshufb .Lbswap_mask(%rip), CTR, TMP1
811 vpaddd .Lone(%rip), CTR, CTR
813 vpxor (KS), TMP1, TMP1
814 vaesenc 16*1(KS), TMP1, TMP1
815 vaesenc 16*2(KS), TMP1, TMP1
816 vaesenc 16*3(KS), TMP1, TMP1
817 vaesenc 16*4(KS), TMP1, TMP1
818 vaesenc 16*5(KS), TMP1, TMP1
819 vaesenc 16*6(KS), TMP1, TMP1
820 vaesenc 16*7(KS), TMP1, TMP1
821 vaesenc 16*8(KS), TMP1, TMP1
822 vaesenc 16*9(KS), TMP1, TMP1
823 vmovdqu 16*10(KS), TMP2
824 cmp $10, NR
825 je .LLast3
826 vaesenc 16*10(KS), TMP1, TMP1
827 vaesenc 16*11(KS), TMP1, TMP1
828 vmovdqu 16*12(KS), TMP2
829 cmp $12, NR
830 je .LLast3
831 vaesenc 16*12(KS), TMP1, TMP1
832 vaesenc 16*13(KS), TMP1, TMP1
833 vmovdqu 16*14(KS), TMP2
835 .LLast3:
836 vaesenclast TMP2, TMP1, TMP1
838 vpxor (PT), TMP1, TMP1
839 vmovdqu TMP1, (CT)
840 addq $16, CT
841 addq $16, PT
843 vpshufb .Lbswap_mask(%rip), TMP1, TMP1
844 vpxor TMP1, T, T
845 vmovdqu (Htbl), TMP0
846 call GFMUL
848 jmp .LDataSingles
850 #Here we encrypt the final partial block, if there is one
851 .LDataTail:
853 test len, len
854 jz DATA_END
855 # First prepare the counter block
856 vpshufb .Lbswap_mask(%rip), CTR, TMP1
857 vpaddd .Lone(%rip), CTR, CTR
859 vpxor (KS), TMP1, TMP1
860 vaesenc 16*1(KS), TMP1, TMP1
861 vaesenc 16*2(KS), TMP1, TMP1
862 vaesenc 16*3(KS), TMP1, TMP1
863 vaesenc 16*4(KS), TMP1, TMP1
864 vaesenc 16*5(KS), TMP1, TMP1
865 vaesenc 16*6(KS), TMP1, TMP1
866 vaesenc 16*7(KS), TMP1, TMP1
867 vaesenc 16*8(KS), TMP1, TMP1
868 vaesenc 16*9(KS), TMP1, TMP1
869 vmovdqu 16*10(KS), TMP2
870 cmp $10, NR
871 je .LLast4
872 vaesenc 16*10(KS), TMP1, TMP1
873 vaesenc 16*11(KS), TMP1, TMP1
874 vmovdqu 16*12(KS), TMP2
875 cmp $12, NR
876 je .LLast4
877 vaesenc 16*12(KS), TMP1, TMP1
878 vaesenc 16*13(KS), TMP1, TMP1
879 vmovdqu 16*14(KS), TMP2
881 .LLast4:
882 vaesenclast TMP2, TMP1, TMP1
883 #Zero a temp location
884 vpxor TMP2, TMP2, TMP2
885 vmovdqa TMP2, (%rsp)
887 # Copy the required bytes only (could probably use rep movsb)
888 xor KS, KS
889 .LEncCpy:
890 cmp KS, len
891 je .LEncCpyEnd
892 movb (PT, KS, 1), %r8b
893 movb %r8b, (%rsp, KS, 1)
894 inc KS
895 jmp .LEncCpy
896 .LEncCpyEnd:
897 # Xor with the counter block
898 vpxor (%rsp), TMP1, TMP0
899 # Again, store at temp location
900 vmovdqa TMP0, (%rsp)
901 # Copy only the required bytes to CT, and zero the rest for the hash
902 xor KS, KS
903 .LEncCpy2:
904 cmp KS, len
905 je .LEncCpy3
906 movb (%rsp, KS, 1), %r8b
907 movb %r8b, (CT, KS, 1)
908 inc KS
909 jmp .LEncCpy2
910 .LEncCpy3:
911 cmp $16, KS
912 je .LEndCpy3
913 movb $0, (%rsp, KS, 1)
914 inc KS
915 jmp .LEncCpy3
916 .LEndCpy3:
917 vmovdqa (%rsp), TMP0
919 vpshufb .Lbswap_mask(%rip), TMP0, TMP0
920 vpxor TMP0, T, T
921 vmovdqu (Htbl), TMP0
922 call GFMUL
924 DATA_END:
926 vpshufb .Lbswap_mask(%rip), T, T
927 vpshufb .Lbswap_mask(%rip), CTR, CTR
928 vmovdqu T, 272(Gctx)
929 vmovdqu CTR, 288(Gctx)
931 movq %rbp, %rsp
933 popq %rbx
934 popq %rbp
935 ret
936 .size intel_aes_gcmENC, .-intel_aes_gcmENC
938 #########################
939 # Decrypt and Authenticate
940 # void intel_aes_gcmDEC(uint8_t* PT, uint8_t* CT, void *Gctx,uint64_t len);
941 .type intel_aes_gcmDEC,@function
942 .globl intel_aes_gcmDEC
943 .align 16
944 intel_aes_gcmDEC:
945 # parameter 1: CT # input
946 # parameter 2: PT # output
947 # parameter 3: %rdx # Gctx
948 # parameter 4: %rcx # len
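#
# Decryption hashes the ciphertext, which is already available in the input
# buffer, so unlike the encrypt path nothing needs to be stashed on the
# stack: DEC_KARATSUBA below reads the blocks directly from CT while the
# counter blocks are being encrypted.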
950 .macro DEC_KARATSUBA i
951 vmovdqu (7-\i)*16(CT), TMP5
952 vpshufb .Lbswap_mask(%rip), TMP5, TMP5
954 vpclmulqdq $0x11, 16*\i(Htbl), TMP5, TMP3
955 vpxor TMP3, TMP1, TMP1
956 vpclmulqdq $0x00, 16*\i(Htbl), TMP5, TMP3
957 vpxor TMP3, TMP2, TMP2
958 vpshufd $78, TMP5, TMP3
959 vpxor TMP5, TMP3, TMP5
960 vpclmulqdq $0x00, 128+\i*16(Htbl), TMP5, TMP3
961 vpxor TMP3, TMP0, TMP0
962 .endm
964 .set PT,%rsi
965 .set CT,%rdi
966 .set Htbl, %rdx
967 .set len, %rcx
968 .set KS,%r9
969 .set NR,%r10d
971 .set Gctx, %rdx
973 .set T,%xmm0
974 .set TMP0,%xmm1
975 .set TMP1,%xmm2
976 .set TMP2,%xmm3
977 .set TMP3,%xmm4
978 .set TMP4,%xmm5
979 .set TMP5,%xmm6
980 .set CTR0,%xmm7
981 .set CTR1,%xmm8
982 .set CTR2,%xmm9
983 .set CTR3,%xmm10
984 .set CTR4,%xmm11
985 .set CTR5,%xmm12
986 .set CTR6,%xmm13
987 .set CTR7,%xmm14
988 .set CTR,%xmm15
990 test len, len
991 jnz .LbeginDec
992 ret
994 .LbeginDec:
996 pushq %rbp
997 pushq %rbx
998 movq %rsp, %rbp
999 sub $128, %rsp
1000 andq $-16, %rsp
1001 vmovdqu 288(Gctx), CTR
1002 vmovdqu 272(Gctx), T
1003 mov 304(Gctx), KS
1004 mov 4(KS), NR
1005 lea 48(KS), KS
1007 vpshufb .Lbswap_mask(%rip), CTR, CTR
1008 vpshufb .Lbswap_mask(%rip), T, T
1010 vmovdqu .Lbswap_mask(%rip), TMP3
1011 jmp .LDECOctets
1013 # Decrypt 8 blocks per iteration, hashing them as they are read
1014 .align 64
1015 .LDECOctets:
1017 cmp $128, len
1018 jb .LDECSingles
1019 sub $128, len
1021 vmovdqa CTR, CTR0
1022 vpaddd .Lone(%rip), CTR0, CTR1
1023 vpaddd .Ltwo(%rip), CTR0, CTR2
1024 vpaddd .Lone(%rip), CTR2, CTR3
1025 vpaddd .Ltwo(%rip), CTR2, CTR4
1026 vpaddd .Lone(%rip), CTR4, CTR5
1027 vpaddd .Ltwo(%rip), CTR4, CTR6
1028 vpaddd .Lone(%rip), CTR6, CTR7
1029 vpaddd .Ltwo(%rip), CTR6, CTR
1031 vpshufb TMP3, CTR0, CTR0
1032 vpshufb TMP3, CTR1, CTR1
1033 vpshufb TMP3, CTR2, CTR2
1034 vpshufb TMP3, CTR3, CTR3
1035 vpshufb TMP3, CTR4, CTR4
1036 vpshufb TMP3, CTR5, CTR5
1037 vpshufb TMP3, CTR6, CTR6
1038 vpshufb TMP3, CTR7, CTR7
1040 vmovdqu (KS), TMP3
1041 vpxor TMP3, CTR0, CTR0
1042 vpxor TMP3, CTR1, CTR1
1043 vpxor TMP3, CTR2, CTR2
1044 vpxor TMP3, CTR3, CTR3
1045 vpxor TMP3, CTR4, CTR4
1046 vpxor TMP3, CTR5, CTR5
1047 vpxor TMP3, CTR6, CTR6
1048 vpxor TMP3, CTR7, CTR7
1050 vmovdqu 7*16(CT), TMP5
1051 vpshufb .Lbswap_mask(%rip), TMP5, TMP5
1052 vmovdqu 16*0(Htbl), TMP3
1053 vpclmulqdq $0x11, TMP3, TMP5, TMP1
1054 vpclmulqdq $0x00, TMP3, TMP5, TMP2
1055 vpshufd $78, TMP5, TMP3
1056 vpxor TMP5, TMP3, TMP5
1057 vmovdqu 128+0*16(Htbl), TMP3
1058 vpclmulqdq $0x00, TMP3, TMP5, TMP0
1060 ROUND 1
1061 DEC_KARATSUBA 1
1063 ROUND 2
1064 DEC_KARATSUBA 2
1066 ROUND 3
1067 DEC_KARATSUBA 3
1069 ROUND 4
1070 DEC_KARATSUBA 4
1072 ROUND 5
1073 DEC_KARATSUBA 5
1075 ROUND 6
1076 DEC_KARATSUBA 6
1078 ROUND 7
1080 vmovdqu 0*16(CT), TMP5
1081 vpshufb .Lbswap_mask(%rip), TMP5, TMP5
1082 vpxor T, TMP5, TMP5
1083 vmovdqu 16*7(Htbl), TMP4
1085 vpclmulqdq $0x11, TMP4, TMP5, TMP3
1086 vpxor TMP3, TMP1, TMP1
1087 vpclmulqdq $0x00, TMP4, TMP5, TMP3
1088 vpxor TMP3, TMP2, TMP2
1090 vpshufd $78, TMP5, TMP3
1091 vpxor TMP5, TMP3, TMP5
1092 vmovdqu 128+7*16(Htbl), TMP4
1094 vpclmulqdq $0x00, TMP4, TMP5, TMP3
1095 vpxor TMP3, TMP0, TMP0
1097 ROUND 8
1099 vpxor TMP1, TMP0, TMP0
1100 vpxor TMP2, TMP0, TMP0
1102 vpsrldq $8, TMP0, TMP3
1103 vpxor TMP3, TMP1, TMP4
1104 vpslldq $8, TMP0, TMP3
1105 vpxor TMP3, TMP2, T
1106 vmovdqa .Lpoly(%rip), TMP2
1108 vpalignr $8, T, T, TMP1
1109 vpclmulqdq $0x10, TMP2, T, T
1110 vpxor T, TMP1, T
1112 ROUND 9
1114 vpalignr $8, T, T, TMP1
1115 vpclmulqdq $0x10, TMP2, T, T
1116 vpxor T, TMP1, T
1118 vmovdqu 160(KS), TMP5
1119 cmp $10, NR
1121 jbe .LDECLast1
1123 ROUND 10
1124 ROUND 11
1126 vmovdqu 192(KS), TMP5
1127 cmp $12, NR
1129 jbe .LDECLast1
1131 ROUND 12
1132 ROUND 13
1134 vmovdqu 224(KS), TMP5
1136 .LDECLast1:
1138 vpxor (CT), TMP5, TMP3
1139 vaesenclast TMP3, CTR0, CTR0
1140 vpxor 16(CT), TMP5, TMP3
1141 vaesenclast TMP3, CTR1, CTR1
1142 vpxor 32(CT), TMP5, TMP3
1143 vaesenclast TMP3, CTR2, CTR2
1144 vpxor 48(CT), TMP5, TMP3
1145 vaesenclast TMP3, CTR3, CTR3
1146 vpxor 64(CT), TMP5, TMP3
1147 vaesenclast TMP3, CTR4, CTR4
1148 vpxor 80(CT), TMP5, TMP3
1149 vaesenclast TMP3, CTR5, CTR5
1150 vpxor 96(CT), TMP5, TMP3
1151 vaesenclast TMP3, CTR6, CTR6
1152 vpxor 112(CT), TMP5, TMP3
1153 vaesenclast TMP3, CTR7, CTR7
1155 vmovdqu .Lbswap_mask(%rip), TMP3
1157 vmovdqu CTR0, (PT)
1158 vmovdqu CTR1, 16(PT)
1159 vmovdqu CTR2, 32(PT)
1160 vmovdqu CTR3, 48(PT)
1161 vmovdqu CTR4, 64(PT)
1162 vmovdqu CTR5, 80(PT)
1163 vmovdqu CTR6, 96(PT)
1164 vmovdqu CTR7,112(PT)
1166 vpxor TMP4, T, T
1168 lea 128(CT), CT
1169 lea 128(PT), PT
1170 jmp .LDECOctets
1172 #Here we decrypt and hash any remaining whole blocks, one at a time
1173 .LDECSingles:
1175 cmp $16, len
1176 jb .LDECTail
1177 sub $16, len
1179 vmovdqu (CT), TMP1
1180 vpshufb .Lbswap_mask(%rip), TMP1, TMP1
1181 vpxor TMP1, T, T
1182 vmovdqu (Htbl), TMP0
1183 call GFMUL
1186 vpshufb .Lbswap_mask(%rip), CTR, TMP1
1187 vpaddd .Lone(%rip), CTR, CTR
1189 vpxor (KS), TMP1, TMP1
1190 vaesenc 16*1(KS), TMP1, TMP1
1191 vaesenc 16*2(KS), TMP1, TMP1
1192 vaesenc 16*3(KS), TMP1, TMP1
1193 vaesenc 16*4(KS), TMP1, TMP1
1194 vaesenc 16*5(KS), TMP1, TMP1
1195 vaesenc 16*6(KS), TMP1, TMP1
1196 vaesenc 16*7(KS), TMP1, TMP1
1197 vaesenc 16*8(KS), TMP1, TMP1
1198 vaesenc 16*9(KS), TMP1, TMP1
1199 vmovdqu 16*10(KS), TMP2
1200 cmp $10, NR
1201 je .LDECLast2
1202 vaesenc 16*10(KS), TMP1, TMP1
1203 vaesenc 16*11(KS), TMP1, TMP1
1204 vmovdqu 16*12(KS), TMP2
1205 cmp $12, NR
1206 je .LDECLast2
1207 vaesenc 16*12(KS), TMP1, TMP1
1208 vaesenc 16*13(KS), TMP1, TMP1
1209 vmovdqu 16*14(KS), TMP2
1210 .LDECLast2:
1211 vaesenclast TMP2, TMP1, TMP1
1213 vpxor (CT), TMP1, TMP1
1214 vmovdqu TMP1, (PT)
1215 addq $16, CT
1216 addq $16, PT
1217 jmp .LDECSingles
1219 #Here we decrypt the final partial block, if there is one
1220 .LDECTail:
1221 test len, len
1222 jz .LDEC_END
1224 vpshufb .Lbswap_mask(%rip), CTR, TMP1
1225 vpaddd .Lone(%rip), CTR, CTR
1227 vpxor (KS), TMP1, TMP1
1228 vaesenc 16*1(KS), TMP1, TMP1
1229 vaesenc 16*2(KS), TMP1, TMP1
1230 vaesenc 16*3(KS), TMP1, TMP1
1231 vaesenc 16*4(KS), TMP1, TMP1
1232 vaesenc 16*5(KS), TMP1, TMP1
1233 vaesenc 16*6(KS), TMP1, TMP1
1234 vaesenc 16*7(KS), TMP1, TMP1
1235 vaesenc 16*8(KS), TMP1, TMP1
1236 vaesenc 16*9(KS), TMP1, TMP1
1237 vmovdqu 16*10(KS), TMP2
1238 cmp $10, NR
1239 je .LDECLast3
1240 vaesenc 16*10(KS), TMP1, TMP1
1241 vaesenc 16*11(KS), TMP1, TMP1
1242 vmovdqu 16*12(KS), TMP2
1243 cmp $12, NR
1244 je .LDECLast3
1245 vaesenc 16*12(KS), TMP1, TMP1
1246 vaesenc 16*13(KS), TMP1, TMP1
1247 vmovdqu 16*14(KS), TMP2
1249 .LDECLast3:
1250 vaesenclast TMP2, TMP1, TMP1
1252 vpxor TMP2, TMP2, TMP2
1253 vmovdqa TMP2, (%rsp)
1254 # Copy the required bytes only (could probably use rep movsb)
1255 xor KS, KS
1256 .LDecCpy:
1257 cmp KS, len
1258 je .LDecCpy2
1259 movb (CT, KS, 1), %r8b
1260 movb %r8b, (%rsp, KS, 1)
1261 inc KS
1262 jmp .LDecCpy
1263 .LDecCpy2:
1264 cmp $16, KS
1265 je .LDecCpyEnd
1266 movb $0, (%rsp, KS, 1)
1267 inc KS
1268 jmp .LDecCpy2
1269 .LDecCpyEnd:
1270 # Xor with the counter block
1271 vmovdqa (%rsp), TMP0
1272 vpxor TMP0, TMP1, TMP1
1273 # Again, store at temp location
1274 vmovdqa TMP1, (%rsp)
1275 # Copy only the required bytes to PT, and zero the rest for the hash
1276 xor KS, KS
1277 .LDecCpy3:
1278 cmp KS, len
1279 je .LDecCpyEnd3
1280 movb (%rsp, KS, 1), %r8b
1281 movb %r8b, (PT, KS, 1)
1282 inc KS
1283 jmp .LDecCpy3
1284 .LDecCpyEnd3:
1285 vpshufb .Lbswap_mask(%rip), TMP0, TMP0
1286 vpxor TMP0, T, T
1287 vmovdqu (Htbl), TMP0
1288 call GFMUL
1289 .LDEC_END:
1291 vpshufb .Lbswap_mask(%rip), T, T
1292 vpshufb .Lbswap_mask(%rip), CTR, CTR
1293 vmovdqu T, 272(Gctx)
1294 vmovdqu CTR, 288(Gctx)
1296 movq %rbp, %rsp
1298 popq %rbx
1299 popq %rbp
1300 ret
1301 .size intel_aes_gcmDEC, .-intel_aes_gcmDEC
1302 #########################
1303 # a = T
1304 # b = TMP0 - remains unchanged
1305 # res = T
1306 # also uses TMP1, TMP2, TMP3, TMP4
1307 # __m128i GFMUL(__m128i A, __m128i B);
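#
# In outline: compute the 256-bit carry-less product of T and TMP0 with three
# vpclmulqdq products (Karatsuba: low*low, high*high, (hi^lo)*(hi^lo)),
# recombine them, then fold twice against the .Lpoly constant and XOR in the
# high half, leaving the reduced 128-bit result in T.  Operands are in the
# byte-swapped form the rest of this file keeps the GHASH state in.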
1308 .type GFMUL,@function
1309 .globl GFMUL
1310 GFMUL:
1311 vpclmulqdq $0x00, TMP0, T, TMP1
1312 vpclmulqdq $0x11, TMP0, T, TMP4
1314 vpshufd $78, T, TMP2
1315 vpshufd $78, TMP0, TMP3
1316 vpxor T, TMP2, TMP2
1317 vpxor TMP0, TMP3, TMP3
1319 vpclmulqdq $0x00, TMP3, TMP2, TMP2
1320 vpxor TMP1, TMP2, TMP2
1321 vpxor TMP4, TMP2, TMP2
1323 vpslldq $8, TMP2, TMP3
1324 vpsrldq $8, TMP2, TMP2
1326 vpxor TMP3, TMP1, TMP1
1327 vpxor TMP2, TMP4, TMP4
1329 vpclmulqdq $0x10, .Lpoly(%rip), TMP1, TMP2
1330 vpshufd $78, TMP1, TMP3
1331 vpxor TMP3, TMP2, TMP1
1333 vpclmulqdq $0x10, .Lpoly(%rip), TMP1, TMP2
1334 vpshufd $78, TMP1, TMP3
1335 vpxor TMP3, TMP2, TMP1
1337 vpxor TMP4, TMP1, T
1338 ret
1339 .size GFMUL, .-GFMUL