# security/nss/lib/freebl/intel-gcm.s
#
# LICENSE:
# This submission to NSS is to be made available under the terms of the
# Mozilla Public License, v. 2.0. You can obtain one at http:
# //mozilla.org/MPL/2.0/.
################################################################################
# Copyright(c) 2012, Intel Corp.

.align 16
.Lone:
.quad 1,0
.Ltwo:
.quad 2,0
.Lbswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lshuff_mask:
.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
.Lpoly:
.quad 0x1, 0xc200000000000000


################################################################################
# Generates the final GCM tag
# void intel_aes_gcmTAG(uint8_t Htbl[16*16], uint8_t *Tp, uint64_t Mlen, uint64_t Alen, uint8_t *X0, uint8_t *TAG);
.type intel_aes_gcmTAG,@function
.globl intel_aes_gcmTAG
.align 16
intel_aes_gcmTAG:

.set Htbl, %rdi
.set Tp, %rsi
.set Mlen, %rdx
.set Alen, %rcx
.set X0, %r8
.set TAG, %r9

.set T, %xmm0
.set TMP0, %xmm1

    vmovdqu (Tp), T
    vpshufb .Lbswap_mask(%rip), T, T
    vpxor   TMP0, TMP0, TMP0
    shl     $3, Mlen
    shl     $3, Alen
    vpinsrq $0, Mlen, TMP0, TMP0
    vpinsrq $1, Alen, TMP0, TMP0
    vpxor   TMP0, T, T
    vmovdqu (Htbl), TMP0
    call    GFMUL
    vpshufb .Lbswap_mask(%rip), T, T
    vpxor   (X0), T, T
    vmovdqu T, (TAG)

    ret
.size intel_aes_gcmTAG, .-intel_aes_gcmTAG
################################################################################
# Generates the H table
# void intel_aes_gcmINIT(uint8_t Htbl[16*16], uint8_t *KS, int NR);
.type intel_aes_gcmINIT,@function
.globl intel_aes_gcmINIT
.align 16
intel_aes_gcmINIT:

.set Htbl, %rdi
.set KS, %rsi
.set NR, %edx

.set T, %xmm0
.set TMP0, %xmm1

CALCULATE_POWERS_OF_H:
    # Encrypt an all-zero block with the key schedule to obtain H
    vmovdqu 16*0(KS), T
    vaesenc 16*1(KS), T, T
    vaesenc 16*2(KS), T, T
    vaesenc 16*3(KS), T, T
    vaesenc 16*4(KS), T, T
    vaesenc 16*5(KS), T, T
    vaesenc 16*6(KS), T, T
    vaesenc 16*7(KS), T, T
    vaesenc 16*8(KS), T, T
    vaesenc 16*9(KS), T, T
    vmovdqu 16*10(KS), TMP0
    cmp     $10, NR
    je      .LH0done
    vaesenc 16*10(KS), T, T
    vaesenc 16*11(KS), T, T
    vmovdqu 16*12(KS), TMP0
    cmp     $12, NR
    je      .LH0done
    vaesenc 16*12(KS), T, T
    vaesenc 16*13(KS), T, T
    vmovdqu 16*14(KS), TMP0

.LH0done:
    vaesenclast TMP0, T, T

    vpshufb .Lbswap_mask(%rip), T, T

    vmovdqu T, TMP0
    # Calculate H' = GFMUL(H, 2)
    vpsrld  $7, T, %xmm3
    vmovdqu .Lshuff_mask(%rip), %xmm4
    vpshufb %xmm4, %xmm3, %xmm3
    movq    $0xff00, %rax
    vmovq   %rax, %xmm4
    vpshufb %xmm3, %xmm4, %xmm4
    vmovdqu .Lpoly(%rip), %xmm5
    vpand   %xmm4, %xmm5, %xmm5
    vpsrld  $31, T, %xmm3
    vpslld  $1, T, %xmm4
    vpslldq $4, %xmm3, %xmm3
    vpxor   %xmm3, %xmm4, T        # T now holds p(x)<<1

    # add p(x)<<1 to the masked polynomial in xmm5
    vpxor   %xmm5, T, T
    vmovdqu T, TMP0
    vmovdqu T, (Htbl)              # H * 2
    call    GFMUL
    vmovdqu T, 16(Htbl)            # H^2 * 2
    call    GFMUL
    vmovdqu T, 32(Htbl)            # H^3 * 2
    call    GFMUL
    vmovdqu T, 48(Htbl)            # H^4 * 2
    call    GFMUL
    vmovdqu T, 64(Htbl)            # H^5 * 2
    call    GFMUL
    vmovdqu T, 80(Htbl)            # H^6 * 2
    call    GFMUL
    vmovdqu T, 96(Htbl)            # H^7 * 2
    call    GFMUL
    vmovdqu T, 112(Htbl)           # H^8 * 2

    # Precalculations for the reduce-4 step
    vpshufd $78, (Htbl), %xmm8
    vpshufd $78, 16(Htbl), %xmm9
    vpshufd $78, 32(Htbl), %xmm10
    vpshufd $78, 48(Htbl), %xmm11
    vpshufd $78, 64(Htbl), %xmm12
    vpshufd $78, 80(Htbl), %xmm13
    vpshufd $78, 96(Htbl), %xmm14
    vpshufd $78, 112(Htbl), %xmm15

    vpxor   (Htbl), %xmm8, %xmm8
    vpxor   16(Htbl), %xmm9, %xmm9
    vpxor   32(Htbl), %xmm10, %xmm10
    vpxor   48(Htbl), %xmm11, %xmm11
    vpxor   64(Htbl), %xmm12, %xmm12
    vpxor   80(Htbl), %xmm13, %xmm13
    vpxor   96(Htbl), %xmm14, %xmm14
    vpxor   112(Htbl), %xmm15, %xmm15

    vmovdqu %xmm8, 128(Htbl)
    vmovdqu %xmm9, 144(Htbl)
    vmovdqu %xmm10, 160(Htbl)
    vmovdqu %xmm11, 176(Htbl)
    vmovdqu %xmm12, 192(Htbl)
    vmovdqu %xmm13, 208(Htbl)
    vmovdqu %xmm14, 224(Htbl)
    vmovdqu %xmm15, 240(Htbl)

    ret
.size intel_aes_gcmINIT, .-intel_aes_gcmINIT
################################################################################
# Authenticate only
# void intel_aes_gcmAAD(uint8_t Htbl[16*16], uint8_t *AAD, uint64_t Alen, uint8_t *Tp);

.globl intel_aes_gcmAAD
.type intel_aes_gcmAAD,@function
.align 16
intel_aes_gcmAAD:

.set DATA, %xmm0
.set T, %xmm1
.set BSWAP_MASK, %xmm2
.set TMP0, %xmm3
.set TMP1, %xmm4
.set TMP2, %xmm5
.set TMP3, %xmm6
.set TMP4, %xmm7
.set Xhi, %xmm9

.set Htbl, %rdi
.set inp, %rsi
.set len, %rdx
.set Tp, %rcx

.set hlp0, %r11

.macro KARATSUBA_AAD i
    vpclmulqdq $0x00, 16*\i(Htbl), DATA, TMP3
    vpxor      TMP3, TMP0, TMP0
    vpclmulqdq $0x11, 16*\i(Htbl), DATA, TMP3
    vpxor      TMP3, TMP1, TMP1
    vpshufd    $78, DATA, TMP3
    vpxor      DATA, TMP3, TMP3
    vpclmulqdq $0x00, 16*(\i+8)(Htbl), TMP3, TMP3
    vpxor      TMP3, TMP2, TMP2
.endm

    test len, len
    jnz  .LbeginAAD
    ret

.LbeginAAD:

    push hlp0
    vzeroupper

    vmovdqa .Lbswap_mask(%rip), BSWAP_MASK

    vpxor   Xhi, Xhi, Xhi

    vmovdqu (Tp), T
    vpshufb BSWAP_MASK, T, T

    # We hash 8 blocks each iteration; if the total number of blocks is not a
    # multiple of 8, hash the first n % 8 blocks first.
    mov  len, hlp0
    and  $~-128, hlp0

    jz   .Lmod_loop

    sub  hlp0, len
    sub  $16, hlp0

    # hash the first prefix block
    vmovdqu (inp), DATA
    vpshufb BSWAP_MASK, DATA, DATA
    vpxor   T, DATA, DATA

    vpclmulqdq $0x00, (Htbl, hlp0), DATA, TMP0
    vpclmulqdq $0x11, (Htbl, hlp0), DATA, TMP1
    vpshufd    $78, DATA, TMP2
    vpxor      DATA, TMP2, TMP2
    vpclmulqdq $0x00, 16*8(Htbl, hlp0), TMP2, TMP2

    lea  16(inp), inp
    test hlp0, hlp0
    jnz  .Lpre_loop
    jmp  .Lred1

    # hash the remaining prefix blocks (up to 7 total prefix blocks)
.align 64
.Lpre_loop:

    sub  $16, hlp0
    vmovdqu (inp), DATA            # next data block
    vpshufb BSWAP_MASK, DATA, DATA

    vpclmulqdq $0x00, (Htbl, hlp0), DATA, TMP3
    vpxor      TMP3, TMP0, TMP0
    vpclmulqdq $0x11, (Htbl, hlp0), DATA, TMP3
    vpxor      TMP3, TMP1, TMP1
    vpshufd    $78, DATA, TMP3
    vpxor      DATA, TMP3, TMP3
    vpclmulqdq $0x00, 16*8(Htbl, hlp0), TMP3, TMP3
    vpxor      TMP3, TMP2, TMP2

    test hlp0, hlp0

    lea  16(inp), inp

    jnz  .Lpre_loop

.Lred1:
    vpxor   TMP0, TMP2, TMP2
    vpxor   TMP1, TMP2, TMP2
    vpsrldq $8, TMP2, TMP3
    vpslldq $8, TMP2, TMP2

    vpxor   TMP3, TMP1, Xhi
    vpxor   TMP2, TMP0, T

.align 64
.Lmod_loop:
    sub  $0x80, len
    jb   .Ldone

    vmovdqu 16*7(inp), DATA        # Ii
    vpshufb BSWAP_MASK, DATA, DATA

    vpclmulqdq $0x00, (Htbl), DATA, TMP0
    vpclmulqdq $0x11, (Htbl), DATA, TMP1
    vpshufd    $78, DATA, TMP2
    vpxor      DATA, TMP2, TMP2
    vpclmulqdq $0x00, 16*8(Htbl), TMP2, TMP2
    #########################################################
    vmovdqu 16*6(inp), DATA
    vpshufb BSWAP_MASK, DATA, DATA
    KARATSUBA_AAD 1
    #########################################################
    vmovdqu 16*5(inp), DATA
    vpshufb BSWAP_MASK, DATA, DATA

    vpclmulqdq $0x10, .Lpoly(%rip), T, TMP4    # reduction stage 1a
    vpalignr   $8, T, T, T

    KARATSUBA_AAD 2

    vpxor TMP4, T, T                           # reduction stage 1b
    #########################################################
    vmovdqu 16*4(inp), DATA
    vpshufb BSWAP_MASK, DATA, DATA

    KARATSUBA_AAD 3
    #########################################################
    vmovdqu 16*3(inp), DATA
    vpshufb BSWAP_MASK, DATA, DATA

    vpclmulqdq $0x10, .Lpoly(%rip), T, TMP4    # reduction stage 2a
    vpalignr   $8, T, T, T

    KARATSUBA_AAD 4

    vpxor TMP4, T, T                           # reduction stage 2b
    #########################################################
    vmovdqu 16*2(inp), DATA
    vpshufb BSWAP_MASK, DATA, DATA

    KARATSUBA_AAD 5

    vpxor Xhi, T, T                            # reduction finalize
    #########################################################
    vmovdqu 16*1(inp), DATA
    vpshufb BSWAP_MASK, DATA, DATA

    KARATSUBA_AAD 6
    #########################################################
    vmovdqu 16*0(inp), DATA
    vpshufb BSWAP_MASK, DATA, DATA
    vpxor   T, DATA, DATA

    KARATSUBA_AAD 7
    #########################################################
    vpxor   TMP0, TMP2, TMP2                   # karatsuba fixup
    vpxor   TMP1, TMP2, TMP2
    vpsrldq $8, TMP2, TMP3
    vpslldq $8, TMP2, TMP2

    vpxor   TMP3, TMP1, Xhi
    vpxor   TMP2, TMP0, T

    lea  16*8(inp), inp
    jmp  .Lmod_loop
    #########################################################

.Ldone:
    vpclmulqdq $0x10, .Lpoly(%rip), T, TMP3
    vpalignr   $8, T, T, T
    vpxor      TMP3, T, T

    vpclmulqdq $0x10, .Lpoly(%rip), T, TMP3
    vpalignr   $8, T, T, T
    vpxor      TMP3, T, T

    vpxor      Xhi, T, T

.Lsave:
    vpshufb BSWAP_MASK, T, T
    vmovdqu T, (Tp)
    vzeroupper

    pop  hlp0
    ret
.size intel_aes_gcmAAD, .-intel_aes_gcmAAD

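# Note on the layout used by the bulk loops: the AAD loop above and the
# encrypt/decrypt loops below compute GHASH with eight-block aggregation,
#
#     T_new = (T ^ B1)*H^8  ^  B2*H^7  ^ ... ^  B8*H        (mod g(x)),
#
# doing one Karatsuba carry-less multiply per block and folding the polynomial
# reduction in only twice per iteration.  intel_aes_gcmINIT prepares this:
# Htbl + 16*(i-1) holds H^i (in the shifted form the comments call "H^i * 2"),
# and Htbl + 128 + 16*(i-1) holds the xor of the two 64-bit halves of that
# power, the precomputed "middle" operand for the Karatsuba step.  g(x) is the
# GCM polynomial x^128 + x^7 + x^2 + x + 1 in bit-reflected form, which is what
# the 0xc200000000000000 constant in .Lpoly encodes.
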
################################################################################
# Encrypt and Authenticate
# void intel_aes_gcmENC(uint8_t* PT, uint8_t* CT, void *Gctx, uint64_t len);
.type intel_aes_gcmENC,@function
.globl intel_aes_gcmENC
.align 16
intel_aes_gcmENC:

.set PT, %rdi
.set CT, %rsi
.set Htbl, %rdx
.set len, %rcx
.set KS, %r9
.set NR, %r10d

.set Gctx, %rdx

.set T, %xmm0
.set TMP0, %xmm1
.set TMP1, %xmm2
.set TMP2, %xmm3
.set TMP3, %xmm4
.set TMP4, %xmm5
.set TMP5, %xmm6
.set CTR0, %xmm7
.set CTR1, %xmm8
.set CTR2, %xmm9
.set CTR3, %xmm10
.set CTR4, %xmm11
.set CTR5, %xmm12
.set CTR6, %xmm13
.set CTR7, %xmm14
.set CTR, %xmm15

.macro ROUND i
    vmovdqu \i*16(KS), TMP3
    vaesenc TMP3, CTR0, CTR0
    vaesenc TMP3, CTR1, CTR1
    vaesenc TMP3, CTR2, CTR2
    vaesenc TMP3, CTR3, CTR3
    vaesenc TMP3, CTR4, CTR4
    vaesenc TMP3, CTR5, CTR5
    vaesenc TMP3, CTR6, CTR6
    vaesenc TMP3, CTR7, CTR7
.endm

.macro ROUNDMUL i

    vmovdqu \i*16(%rsp), TMP5
    vmovdqu \i*16(KS), TMP3

    vaesenc TMP3, CTR0, CTR0
    vaesenc TMP3, CTR1, CTR1
    vaesenc TMP3, CTR2, CTR2
    vaesenc TMP3, CTR3, CTR3

    vpshufd $78, TMP5, TMP4
    vpxor   TMP5, TMP4, TMP4

    vaesenc TMP3, CTR4, CTR4
    vaesenc TMP3, CTR5, CTR5
    vaesenc TMP3, CTR6, CTR6
    vaesenc TMP3, CTR7, CTR7

    vpclmulqdq $0x00, 128+\i*16(Htbl), TMP4, TMP3
    vpxor      TMP3, TMP0, TMP0
    vmovdqa    \i*16(Htbl), TMP4
    vpclmulqdq $0x11, TMP4, TMP5, TMP3
    vpxor      TMP3, TMP1, TMP1
    vpclmulqdq $0x00, TMP4, TMP5, TMP3
    vpxor      TMP3, TMP2, TMP2

.endm

.macro KARATSUBA i
    vmovdqu \i*16(%rsp), TMP5

    vpclmulqdq $0x11, 16*\i(Htbl), TMP5, TMP3
    vpxor      TMP3, TMP1, TMP1
    vpclmulqdq $0x00, 16*\i(Htbl), TMP5, TMP3
    vpxor      TMP3, TMP2, TMP2
    vpshufd    $78, TMP5, TMP3
    vpxor      TMP5, TMP3, TMP5
    vpclmulqdq $0x00, 128+\i*16(Htbl), TMP5, TMP3
    vpxor      TMP3, TMP0, TMP0
.endm

    test len, len
    jnz  .Lbegin
    ret

.Lbegin:

    vzeroupper
    push %rbp
    push %rbx

    movq %rsp, %rbp
    sub  $128, %rsp
    andq $-16, %rsp

    vmovdqu 288(Gctx), CTR
    vmovdqu 272(Gctx), T
    mov     304(Gctx), KS
    mov     4(KS), NR
    lea     48(KS), KS

    vpshufb .Lbswap_mask(%rip), CTR, CTR
    vpshufb .Lbswap_mask(%rip), T, T

    cmp  $128, len
    jb   .LDataSingles

# Encrypt the first eight blocks
    sub  $128, len
    vmovdqa CTR, CTR0
    vpaddd  .Lone(%rip), CTR0, CTR1
    vpaddd  .Ltwo(%rip), CTR0, CTR2
    vpaddd  .Lone(%rip), CTR2, CTR3
    vpaddd  .Ltwo(%rip), CTR2, CTR4
    vpaddd  .Lone(%rip), CTR4, CTR5
    vpaddd  .Ltwo(%rip), CTR4, CTR6
    vpaddd  .Lone(%rip), CTR6, CTR7
    vpaddd  .Ltwo(%rip), CTR6, CTR

    vpshufb .Lbswap_mask(%rip), CTR0, CTR0
    vpshufb .Lbswap_mask(%rip), CTR1, CTR1
    vpshufb .Lbswap_mask(%rip), CTR2, CTR2
    vpshufb .Lbswap_mask(%rip), CTR3, CTR3
    vpshufb .Lbswap_mask(%rip), CTR4, CTR4
    vpshufb .Lbswap_mask(%rip), CTR5, CTR5
    vpshufb .Lbswap_mask(%rip), CTR6, CTR6
    vpshufb .Lbswap_mask(%rip), CTR7, CTR7

    vpxor (KS), CTR0, CTR0
    vpxor (KS), CTR1, CTR1
    vpxor (KS), CTR2, CTR2
    vpxor (KS), CTR3, CTR3
    vpxor (KS), CTR4, CTR4
    vpxor (KS), CTR5, CTR5
    vpxor (KS), CTR6, CTR6
    vpxor (KS), CTR7, CTR7

    ROUND 1
    ROUND 2
    ROUND 3
    ROUND 4
    ROUND 5
    ROUND 6
    ROUND 7
    ROUND 8
    ROUND 9

    vmovdqu 160(KS), TMP5
    cmp     $12, NR
    jb      .LLast1

    ROUND 10
    ROUND 11

    vmovdqu 192(KS), TMP5
    cmp     $14, NR
    jb      .LLast1

    ROUND 12
    ROUND 13

    vmovdqu 224(KS), TMP5

.LLast1:

    vpxor       (PT), TMP5, TMP3
    vaesenclast TMP3, CTR0, CTR0
    vpxor       16(PT), TMP5, TMP3
    vaesenclast TMP3, CTR1, CTR1
    vpxor       32(PT), TMP5, TMP3
    vaesenclast TMP3, CTR2, CTR2
    vpxor       48(PT), TMP5, TMP3
    vaesenclast TMP3, CTR3, CTR3
    vpxor       64(PT), TMP5, TMP3
    vaesenclast TMP3, CTR4, CTR4
    vpxor       80(PT), TMP5, TMP3
    vaesenclast TMP3, CTR5, CTR5
    vpxor       96(PT), TMP5, TMP3
    vaesenclast TMP3, CTR6, CTR6
    vpxor       112(PT), TMP5, TMP3
    vaesenclast TMP3, CTR7, CTR7

    vmovdqu .Lbswap_mask(%rip), TMP3

    vmovdqu CTR0, (CT)
    vpshufb TMP3, CTR0, CTR0
    vmovdqu CTR1, 16(CT)
    vpshufb TMP3, CTR1, CTR1
    vmovdqu CTR2, 32(CT)
    vpshufb TMP3, CTR2, CTR2
    vmovdqu CTR3, 48(CT)
    vpshufb TMP3, CTR3, CTR3
    vmovdqu CTR4, 64(CT)
    vpshufb TMP3, CTR4, CTR4
    vmovdqu CTR5, 80(CT)
    vpshufb TMP3, CTR5, CTR5
    vmovdqu CTR6, 96(CT)
    vpshufb TMP3, CTR6, CTR6
    vmovdqu CTR7, 112(CT)
    vpshufb TMP3, CTR7, CTR7

    lea  128(CT), CT
    lea  128(PT), PT
    jmp  .LDataOctets

# Encrypt 8 blocks each time while hashing previous 8 blocks
.align 64
.LDataOctets:
    cmp  $128, len
    jb   .LEndOctets
    sub  $128, len

    vmovdqa CTR7, TMP5
    vmovdqa CTR6, 1*16(%rsp)
    vmovdqa CTR5, 2*16(%rsp)
    vmovdqa CTR4, 3*16(%rsp)
    vmovdqa CTR3, 4*16(%rsp)
    vmovdqa CTR2, 5*16(%rsp)
    vmovdqa CTR1, 6*16(%rsp)
    vmovdqa CTR0, 7*16(%rsp)

    vmovdqa CTR, CTR0
    vpaddd  .Lone(%rip), CTR0, CTR1
    vpaddd  .Ltwo(%rip), CTR0, CTR2
    vpaddd  .Lone(%rip), CTR2, CTR3
    vpaddd  .Ltwo(%rip), CTR2, CTR4
    vpaddd  .Lone(%rip), CTR4, CTR5
    vpaddd  .Ltwo(%rip), CTR4, CTR6
    vpaddd  .Lone(%rip), CTR6, CTR7
    vpaddd  .Ltwo(%rip), CTR6, CTR

    vmovdqu (KS), TMP4
    vpshufb TMP3, CTR0, CTR0
    vpxor   TMP4, CTR0, CTR0
    vpshufb TMP3, CTR1, CTR1
    vpxor   TMP4, CTR1, CTR1
    vpshufb TMP3, CTR2, CTR2
    vpxor   TMP4, CTR2, CTR2
    vpshufb TMP3, CTR3, CTR3
    vpxor   TMP4, CTR3, CTR3
    vpshufb TMP3, CTR4, CTR4
    vpxor   TMP4, CTR4, CTR4
    vpshufb TMP3, CTR5, CTR5
    vpxor   TMP4, CTR5, CTR5
    vpshufb TMP3, CTR6, CTR6
    vpxor   TMP4, CTR6, CTR6
    vpshufb TMP3, CTR7, CTR7
    vpxor   TMP4, CTR7, CTR7

    vmovdqu 16*0(Htbl), TMP3
    vpclmulqdq $0x11, TMP3, TMP5, TMP1
    vpclmulqdq $0x00, TMP3, TMP5, TMP2
    vpshufd    $78, TMP5, TMP3
    vpxor      TMP5, TMP3, TMP5
    vmovdqu 128+0*16(Htbl), TMP3
    vpclmulqdq $0x00, TMP3, TMP5, TMP0

    ROUNDMUL 1

    ROUNDMUL 2

    ROUNDMUL 3

    ROUNDMUL 4

    ROUNDMUL 5

    ROUNDMUL 6

    vpxor   7*16(%rsp), T, TMP5
    vmovdqu 7*16(KS), TMP3

    vaesenc TMP3, CTR0, CTR0
    vaesenc TMP3, CTR1, CTR1
    vaesenc TMP3, CTR2, CTR2
    vaesenc TMP3, CTR3, CTR3

    vpshufd $78, TMP5, TMP4
    vpxor   TMP5, TMP4, TMP4

    vaesenc TMP3, CTR4, CTR4
    vaesenc TMP3, CTR5, CTR5
    vaesenc TMP3, CTR6, CTR6
    vaesenc TMP3, CTR7, CTR7

    vpclmulqdq $0x11, 7*16(Htbl), TMP5, TMP3
    vpxor      TMP3, TMP1, TMP1
    vpclmulqdq $0x00, 7*16(Htbl), TMP5, TMP3
    vpxor      TMP3, TMP2, TMP2
    vpclmulqdq $0x00, 128+7*16(Htbl), TMP4, TMP3
    vpxor      TMP3, TMP0, TMP0

    ROUND 8
    vmovdqa .Lpoly(%rip), TMP5

    vpxor   TMP1, TMP0, TMP0
    vpxor   TMP2, TMP0, TMP0
    vpsrldq $8, TMP0, TMP3
    vpxor   TMP3, TMP1, TMP4
    vpslldq $8, TMP0, TMP3
    vpxor   TMP3, TMP2, T

    vpclmulqdq $0x10, TMP5, T, TMP1
    vpalignr   $8, T, T, T
    vpxor      T, TMP1, T

    ROUND 9

    vpclmulqdq $0x10, TMP5, T, TMP1
    vpalignr   $8, T, T, T
    vpxor      T, TMP1, T

    vmovdqu 160(KS), TMP5
    cmp     $10, NR
    jbe     .LLast2

    ROUND 10
    ROUND 11

    vmovdqu 192(KS), TMP5
    cmp     $12, NR
    jbe     .LLast2

    ROUND 12
    ROUND 13

    vmovdqu 224(KS), TMP5

.LLast2:

    vpxor       (PT), TMP5, TMP3
    vaesenclast TMP3, CTR0, CTR0
    vpxor       16(PT), TMP5, TMP3
    vaesenclast TMP3, CTR1, CTR1
    vpxor       32(PT), TMP5, TMP3
    vaesenclast TMP3, CTR2, CTR2
    vpxor       48(PT), TMP5, TMP3
    vaesenclast TMP3, CTR3, CTR3
    vpxor       64(PT), TMP5, TMP3
    vaesenclast TMP3, CTR4, CTR4
    vpxor       80(PT), TMP5, TMP3
    vaesenclast TMP3, CTR5, CTR5
    vpxor       96(PT), TMP5, TMP3
    vaesenclast TMP3, CTR6, CTR6
    vpxor       112(PT), TMP5, TMP3
    vaesenclast TMP3, CTR7, CTR7

    vmovdqu .Lbswap_mask(%rip), TMP3

    vmovdqu CTR0, (CT)
    vpshufb TMP3, CTR0, CTR0
    vmovdqu CTR1, 16(CT)
    vpshufb TMP3, CTR1, CTR1
    vmovdqu CTR2, 32(CT)
    vpshufb TMP3, CTR2, CTR2
    vmovdqu CTR3, 48(CT)
    vpshufb TMP3, CTR3, CTR3
    vmovdqu CTR4, 64(CT)
    vpshufb TMP3, CTR4, CTR4
    vmovdqu CTR5, 80(CT)
    vpshufb TMP3, CTR5, CTR5
    vmovdqu CTR6, 96(CT)
    vpshufb TMP3, CTR6, CTR6
    vmovdqu CTR7, 112(CT)
    vpshufb TMP3, CTR7, CTR7

    vpxor TMP4, T, T

    lea  128(CT), CT
    lea  128(PT), PT
    jmp  .LDataOctets

.LEndOctets:

    vmovdqa CTR7, TMP5
    vmovdqa CTR6, 1*16(%rsp)
    vmovdqa CTR5, 2*16(%rsp)
    vmovdqa CTR4, 3*16(%rsp)
    vmovdqa CTR3, 4*16(%rsp)
    vmovdqa CTR2, 5*16(%rsp)
    vmovdqa CTR1, 6*16(%rsp)
    vmovdqa CTR0, 7*16(%rsp)

    vmovdqu 16*0(Htbl), TMP3
    vpclmulqdq $0x11, TMP3, TMP5, TMP1
    vpclmulqdq $0x00, TMP3, TMP5, TMP2
    vpshufd    $78, TMP5, TMP3
    vpxor      TMP5, TMP3, TMP5
    vmovdqu 128+0*16(Htbl), TMP3
    vpclmulqdq $0x00, TMP3, TMP5, TMP0

    KARATSUBA 1
    KARATSUBA 2
    KARATSUBA 3
    KARATSUBA 4
    KARATSUBA 5
    KARATSUBA 6

    vmovdqu 7*16(%rsp), TMP5
    vpxor   T, TMP5, TMP5
    vmovdqu 16*7(Htbl), TMP4
    vpclmulqdq $0x11, TMP4, TMP5, TMP3
    vpxor      TMP3, TMP1, TMP1
    vpclmulqdq $0x00, TMP4, TMP5, TMP3
    vpxor      TMP3, TMP2, TMP2
    vpshufd    $78, TMP5, TMP3
    vpxor      TMP5, TMP3, TMP5
    vmovdqu 128+7*16(Htbl), TMP4
    vpclmulqdq $0x00, TMP4, TMP5, TMP3
    vpxor      TMP3, TMP0, TMP0

    vpxor   TMP1, TMP0, TMP0
    vpxor   TMP2, TMP0, TMP0

    vpsrldq $8, TMP0, TMP3
    vpxor   TMP3, TMP1, TMP4
    vpslldq $8, TMP0, TMP3
    vpxor   TMP3, TMP2, T

    vmovdqa .Lpoly(%rip), TMP2

    vpalignr   $8, T, T, TMP1
    vpclmulqdq $0x10, TMP2, T, T
    vpxor      T, TMP1, T

    vpalignr   $8, T, T, TMP1
    vpclmulqdq $0x10, TMP2, T, T
    vpxor      T, TMP1, T

    vpxor TMP4, T, T

# Here we encrypt any remaining whole blocks
.LDataSingles:

    cmp  $16, len
    jb   .LDataTail
    sub  $16, len

    vpshufb .Lbswap_mask(%rip), CTR, TMP1
    vpaddd  .Lone(%rip), CTR, CTR

    vpxor   (KS), TMP1, TMP1
    vaesenc 16*1(KS), TMP1, TMP1
    vaesenc 16*2(KS), TMP1, TMP1
    vaesenc 16*3(KS), TMP1, TMP1
    vaesenc 16*4(KS), TMP1, TMP1
    vaesenc 16*5(KS), TMP1, TMP1
    vaesenc 16*6(KS), TMP1, TMP1
    vaesenc 16*7(KS), TMP1, TMP1
    vaesenc 16*8(KS), TMP1, TMP1
    vaesenc 16*9(KS), TMP1, TMP1
    vmovdqu 16*10(KS), TMP2
    cmp     $10, NR
    je      .LLast3
    vaesenc 16*10(KS), TMP1, TMP1
    vaesenc 16*11(KS), TMP1, TMP1
    vmovdqu 16*12(KS), TMP2
    cmp     $12, NR
    je      .LLast3
    vaesenc 16*12(KS), TMP1, TMP1
    vaesenc 16*13(KS), TMP1, TMP1
    vmovdqu 16*14(KS), TMP2

.LLast3:
    vaesenclast TMP2, TMP1, TMP1

    vpxor   (PT), TMP1, TMP1
    vmovdqu TMP1, (CT)
    addq    $16, CT
    addq    $16, PT

    vpshufb .Lbswap_mask(%rip), TMP1, TMP1
    vpxor   TMP1, T, T
    vmovdqu (Htbl), TMP0
    call    GFMUL

    jmp  .LDataSingles

# Here we encrypt the final partial block, if there is one
.LDataTail:

    test len, len
    jz   DATA_END
# First prepare the counter block
    vpshufb .Lbswap_mask(%rip), CTR, TMP1
    vpaddd  .Lone(%rip), CTR, CTR

    vpxor   (KS), TMP1, TMP1
    vaesenc 16*1(KS), TMP1, TMP1
    vaesenc 16*2(KS), TMP1, TMP1
    vaesenc 16*3(KS), TMP1, TMP1
    vaesenc 16*4(KS), TMP1, TMP1
    vaesenc 16*5(KS), TMP1, TMP1
    vaesenc 16*6(KS), TMP1, TMP1
    vaesenc 16*7(KS), TMP1, TMP1
    vaesenc 16*8(KS), TMP1, TMP1
    vaesenc 16*9(KS), TMP1, TMP1
    vmovdqu 16*10(KS), TMP2
    cmp     $10, NR
    je      .LLast4
    vaesenc 16*10(KS), TMP1, TMP1
    vaesenc 16*11(KS), TMP1, TMP1
    vmovdqu 16*12(KS), TMP2
    cmp     $12, NR
    je      .LLast4
    vaesenc 16*12(KS), TMP1, TMP1
    vaesenc 16*13(KS), TMP1, TMP1
    vmovdqu 16*14(KS), TMP2

.LLast4:
    vaesenclast TMP2, TMP1, TMP1
# Zero a temp location
    vpxor   TMP2, TMP2, TMP2
    vmovdqa TMP2, (%rsp)

# Copy the required bytes only (could probably use rep movsb)
    xor  KS, KS
.LEncCpy:
    cmp  KS, len
    je   .LEncCpyEnd
    movb (PT, KS, 1), %r8b
    movb %r8b, (%rsp, KS, 1)
    inc  KS
    jmp  .LEncCpy
.LEncCpyEnd:
# Xor with the counter block
    vpxor   (%rsp), TMP1, TMP0
# Again, store at temp location
    vmovdqa TMP0, (%rsp)
# Copy only the required bytes to CT, and zero the rest for the hash
    xor  KS, KS
.LEncCpy2:
    cmp  KS, len
    je   .LEncCpy3
    movb (%rsp, KS, 1), %r8b
    movb %r8b, (CT, KS, 1)
    inc  KS
    jmp  .LEncCpy2
.LEncCpy3:
    cmp  $16, KS
    je   .LEndCpy3
    movb $0, (%rsp, KS, 1)
    inc  KS
    jmp  .LEncCpy3
.LEndCpy3:
    vmovdqa (%rsp), TMP0

    vpshufb .Lbswap_mask(%rip), TMP0, TMP0
    vpxor   TMP0, T, T
    vmovdqu (Htbl), TMP0
    call    GFMUL

DATA_END:

    vpshufb .Lbswap_mask(%rip), T, T
    vpshufb .Lbswap_mask(%rip), CTR, CTR
    vmovdqu T, 272(Gctx)
    vmovdqu CTR, 288(Gctx)

    movq %rbp, %rsp

    popq %rbx
    popq %rbp
    ret
.size intel_aes_gcmENC, .-intel_aes_gcmENC

################################################################################
# Decrypt and Authenticate
# void intel_aes_gcmDEC(uint8_t* PT, uint8_t* CT, void *Gctx, uint64_t len);
.type intel_aes_gcmDEC,@function
.globl intel_aes_gcmDEC
.align 16
intel_aes_gcmDEC:
# parameter 1: CT    # input
# parameter 2: PT    # output
# parameter 3: %rdx  # Gctx
# parameter 4: %rcx  # len

.macro DEC_KARATSUBA i
    vmovdqu (7-\i)*16(CT), TMP5
    vpshufb .Lbswap_mask(%rip), TMP5, TMP5

    vpclmulqdq $0x11, 16*\i(Htbl), TMP5, TMP3
    vpxor      TMP3, TMP1, TMP1
    vpclmulqdq $0x00, 16*\i(Htbl), TMP5, TMP3
    vpxor      TMP3, TMP2, TMP2
    vpshufd    $78, TMP5, TMP3
    vpxor      TMP5, TMP3, TMP5
    vpclmulqdq $0x00, 128+\i*16(Htbl), TMP5, TMP3
    vpxor      TMP3, TMP0, TMP0
.endm

.set PT, %rsi
.set CT, %rdi
.set Htbl, %rdx
.set len, %rcx
.set KS, %r9
.set NR, %r10d

.set Gctx, %rdx

.set T, %xmm0
.set TMP0, %xmm1
.set TMP1, %xmm2
.set TMP2, %xmm3
.set TMP3, %xmm4
.set TMP4, %xmm5
.set TMP5, %xmm6
.set CTR0, %xmm7
.set CTR1, %xmm8
.set CTR2, %xmm9
.set CTR3, %xmm10
.set CTR4, %xmm11
.set CTR5, %xmm12
.set CTR6, %xmm13
.set CTR7, %xmm14
.set CTR, %xmm15

    test len, len
    jnz  .LbeginDec
    ret

.LbeginDec:

    pushq %rbp
    pushq %rbx
    movq  %rsp, %rbp
    sub   $128, %rsp
    andq  $-16, %rsp
    vmovdqu 288(Gctx), CTR
    vmovdqu 272(Gctx), T
    mov     304(Gctx), KS
    mov     4(KS), NR
    lea     48(KS), KS

    vpshufb .Lbswap_mask(%rip), CTR, CTR
    vpshufb .Lbswap_mask(%rip), T, T

    vmovdqu .Lbswap_mask(%rip), TMP3
    jmp  .LDECOctets

# Decrypt 8 blocks each time while hashing them at the same time
.align 64
.LDECOctets:

    cmp  $128, len
    jb   .LDECSingles
    sub  $128, len

    vmovdqa CTR, CTR0
    vpaddd  .Lone(%rip), CTR0, CTR1
    vpaddd  .Ltwo(%rip), CTR0, CTR2
    vpaddd  .Lone(%rip), CTR2, CTR3
    vpaddd  .Ltwo(%rip), CTR2, CTR4
    vpaddd  .Lone(%rip), CTR4, CTR5
    vpaddd  .Ltwo(%rip), CTR4, CTR6
    vpaddd  .Lone(%rip), CTR6, CTR7
    vpaddd  .Ltwo(%rip), CTR6, CTR

    vpshufb TMP3, CTR0, CTR0
    vpshufb TMP3, CTR1, CTR1
    vpshufb TMP3, CTR2, CTR2
    vpshufb TMP3, CTR3, CTR3
    vpshufb TMP3, CTR4, CTR4
    vpshufb TMP3, CTR5, CTR5
    vpshufb TMP3, CTR6, CTR6
    vpshufb TMP3, CTR7, CTR7

    vmovdqu (KS), TMP3
    vpxor   TMP3, CTR0, CTR0
    vpxor   TMP3, CTR1, CTR1
    vpxor   TMP3, CTR2, CTR2
    vpxor   TMP3, CTR3, CTR3
    vpxor   TMP3, CTR4, CTR4
    vpxor   TMP3, CTR5, CTR5
    vpxor   TMP3, CTR6, CTR6
    vpxor   TMP3, CTR7, CTR7

    vmovdqu 7*16(CT), TMP5
    vpshufb .Lbswap_mask(%rip), TMP5, TMP5
    vmovdqu 16*0(Htbl), TMP3
    vpclmulqdq $0x11, TMP3, TMP5, TMP1
    vpclmulqdq $0x00, TMP3, TMP5, TMP2
    vpshufd    $78, TMP5, TMP3
    vpxor      TMP5, TMP3, TMP5
    vmovdqu 128+0*16(Htbl), TMP3
    vpclmulqdq $0x00, TMP3, TMP5, TMP0

    ROUND 1
    DEC_KARATSUBA 1

    ROUND 2
    DEC_KARATSUBA 2

    ROUND 3
    DEC_KARATSUBA 3

    ROUND 4
    DEC_KARATSUBA 4

    ROUND 5
    DEC_KARATSUBA 5

    ROUND 6
    DEC_KARATSUBA 6

    ROUND 7

    vmovdqu 0*16(CT), TMP5
    vpshufb .Lbswap_mask(%rip), TMP5, TMP5
    vpxor   T, TMP5, TMP5
    vmovdqu 16*7(Htbl), TMP4

    vpclmulqdq $0x11, TMP4, TMP5, TMP3
    vpxor      TMP3, TMP1, TMP1
    vpclmulqdq $0x00, TMP4, TMP5, TMP3
    vpxor      TMP3, TMP2, TMP2

    vpshufd $78, TMP5, TMP3
    vpxor   TMP5, TMP3, TMP5
    vmovdqu 128+7*16(Htbl), TMP4

    vpclmulqdq $0x00, TMP4, TMP5, TMP3
    vpxor      TMP3, TMP0, TMP0

    ROUND 8

    vpxor   TMP1, TMP0, TMP0
    vpxor   TMP2, TMP0, TMP0

    vpsrldq $8, TMP0, TMP3
    vpxor   TMP3, TMP1, TMP4
    vpslldq $8, TMP0, TMP3
    vpxor   TMP3, TMP2, T
    vmovdqa .Lpoly(%rip), TMP2

    vpalignr   $8, T, T, TMP1
    vpclmulqdq $0x10, TMP2, T, T
    vpxor      T, TMP1, T

    ROUND 9

    vpalignr   $8, T, T, TMP1
    vpclmulqdq $0x10, TMP2, T, T
    vpxor      T, TMP1, T

    vmovdqu 160(KS), TMP5
    cmp     $10, NR

    jbe     .LDECLast1

    ROUND 10
    ROUND 11

    vmovdqu 192(KS), TMP5
    cmp     $12, NR

    jbe     .LDECLast1

    ROUND 12
    ROUND 13

    vmovdqu 224(KS), TMP5

.LDECLast1:

    vpxor       (CT), TMP5, TMP3
    vaesenclast TMP3, CTR0, CTR0
    vpxor       16(CT), TMP5, TMP3
    vaesenclast TMP3, CTR1, CTR1
    vpxor       32(CT), TMP5, TMP3
    vaesenclast TMP3, CTR2, CTR2
    vpxor       48(CT), TMP5, TMP3
    vaesenclast TMP3, CTR3, CTR3
    vpxor       64(CT), TMP5, TMP3
    vaesenclast TMP3, CTR4, CTR4
    vpxor       80(CT), TMP5, TMP3
    vaesenclast TMP3, CTR5, CTR5
    vpxor       96(CT), TMP5, TMP3
    vaesenclast TMP3, CTR6, CTR6
    vpxor       112(CT), TMP5, TMP3
    vaesenclast TMP3, CTR7, CTR7

    vmovdqu .Lbswap_mask(%rip), TMP3

    vmovdqu CTR0, (PT)
    vmovdqu CTR1, 16(PT)
    vmovdqu CTR2, 32(PT)
    vmovdqu CTR3, 48(PT)
    vmovdqu CTR4, 64(PT)
    vmovdqu CTR5, 80(PT)
    vmovdqu CTR6, 96(PT)
    vmovdqu CTR7, 112(PT)

    vpxor TMP4, T, T

    lea  128(CT), CT
    lea  128(PT), PT
    jmp  .LDECOctets

# Here we decrypt and hash any remaining whole blocks
.LDECSingles:

    cmp  $16, len
    jb   .LDECTail
    sub  $16, len

    vmovdqu (CT), TMP1
    vpshufb .Lbswap_mask(%rip), TMP1, TMP1
    vpxor   TMP1, T, T
    vmovdqu (Htbl), TMP0
    call    GFMUL

    vpshufb .Lbswap_mask(%rip), CTR, TMP1
    vpaddd  .Lone(%rip), CTR, CTR

    vpxor   (KS), TMP1, TMP1
    vaesenc 16*1(KS), TMP1, TMP1
    vaesenc 16*2(KS), TMP1, TMP1
    vaesenc 16*3(KS), TMP1, TMP1
    vaesenc 16*4(KS), TMP1, TMP1
    vaesenc 16*5(KS), TMP1, TMP1
    vaesenc 16*6(KS), TMP1, TMP1
    vaesenc 16*7(KS), TMP1, TMP1
    vaesenc 16*8(KS), TMP1, TMP1
    vaesenc 16*9(KS), TMP1, TMP1
    vmovdqu 16*10(KS), TMP2
    cmp     $10, NR
    je      .LDECLast2
    vaesenc 16*10(KS), TMP1, TMP1
    vaesenc 16*11(KS), TMP1, TMP1
    vmovdqu 16*12(KS), TMP2
    cmp     $12, NR
    je      .LDECLast2
    vaesenc 16*12(KS), TMP1, TMP1
    vaesenc 16*13(KS), TMP1, TMP1
    vmovdqu 16*14(KS), TMP2
.LDECLast2:
    vaesenclast TMP2, TMP1, TMP1

    vpxor   (CT), TMP1, TMP1
    vmovdqu TMP1, (PT)
    addq    $16, CT
    addq    $16, PT
    jmp  .LDECSingles

# Here we decrypt the final partial block, if there is one
.LDECTail:
    test len, len
    jz   .LDEC_END

    vpshufb .Lbswap_mask(%rip), CTR, TMP1
    vpaddd  .Lone(%rip), CTR, CTR

    vpxor   (KS), TMP1, TMP1
    vaesenc 16*1(KS), TMP1, TMP1
    vaesenc 16*2(KS), TMP1, TMP1
    vaesenc 16*3(KS), TMP1, TMP1
    vaesenc 16*4(KS), TMP1, TMP1
    vaesenc 16*5(KS), TMP1, TMP1
    vaesenc 16*6(KS), TMP1, TMP1
    vaesenc 16*7(KS), TMP1, TMP1
    vaesenc 16*8(KS), TMP1, TMP1
    vaesenc 16*9(KS), TMP1, TMP1
    vmovdqu 16*10(KS), TMP2
    cmp     $10, NR
    je      .LDECLast3
    vaesenc 16*10(KS), TMP1, TMP1
    vaesenc 16*11(KS), TMP1, TMP1
    vmovdqu 16*12(KS), TMP2
    cmp     $12, NR
    je      .LDECLast3
    vaesenc 16*12(KS), TMP1, TMP1
    vaesenc 16*13(KS), TMP1, TMP1
    vmovdqu 16*14(KS), TMP2

.LDECLast3:
    vaesenclast TMP2, TMP1, TMP1
# Zero a temp location
    vpxor   TMP2, TMP2, TMP2
    vmovdqa TMP2, (%rsp)
# Copy the required bytes only (could probably use rep movsb)
    xor  KS, KS
.LDecCpy:
    cmp  KS, len
    je   .LDecCpy2
    movb (CT, KS, 1), %r8b
    movb %r8b, (%rsp, KS, 1)
    inc  KS
    jmp  .LDecCpy
.LDecCpy2:
    cmp  $16, KS
    je   .LDecCpyEnd
    movb $0, (%rsp, KS, 1)
    inc  KS
    jmp  .LDecCpy2
.LDecCpyEnd:
# Xor with the counter block
    vmovdqa (%rsp), TMP0
    vpxor   TMP0, TMP1, TMP1
# Again, store at temp location
    vmovdqa TMP1, (%rsp)
# Copy only the required bytes to PT, and zero the rest for the hash
    xor  KS, KS
.LDecCpy3:
    cmp  KS, len
    je   .LDecCpyEnd3
    movb (%rsp, KS, 1), %r8b
    movb %r8b, (PT, KS, 1)
    inc  KS
    jmp  .LDecCpy3
.LDecCpyEnd3:
    vpshufb .Lbswap_mask(%rip), TMP0, TMP0
    vpxor   TMP0, T, T
    vmovdqu (Htbl), TMP0
    call    GFMUL
.LDEC_END:

    vpshufb .Lbswap_mask(%rip), T, T
    vpshufb .Lbswap_mask(%rip), CTR, CTR
    vmovdqu T, 272(Gctx)
    vmovdqu CTR, 288(Gctx)

    movq %rbp, %rsp

    popq %rbx
    popq %rbp
    ret
.size intel_aes_gcmDEC, .-intel_aes_gcmDEC
#########################
# a = T
# b = TMP0 - remains unchanged
# res = T
# uses also TMP1, TMP2, TMP3, TMP4
# __m128i GFMUL(__m128i A, __m128i B);
.type GFMUL,@function
.globl GFMUL
GFMUL:
    vpclmulqdq $0x00, TMP0, T, TMP1
    vpclmulqdq $0x11, TMP0, T, TMP4

    vpshufd $78, T, TMP2
    vpshufd $78, TMP0, TMP3
    vpxor   T, TMP2, TMP2
    vpxor   TMP0, TMP3, TMP3

    vpclmulqdq $0x00, TMP3, TMP2, TMP2
    vpxor      TMP1, TMP2, TMP2
    vpxor      TMP4, TMP2, TMP2

    vpslldq $8, TMP2, TMP3
    vpsrldq $8, TMP2, TMP2

    vpxor   TMP3, TMP1, TMP1
    vpxor   TMP2, TMP4, TMP4

    vpclmulqdq $0x10, .Lpoly(%rip), TMP1, TMP2
    vpshufd    $78, TMP1, TMP3
    vpxor      TMP3, TMP2, TMP1

    vpclmulqdq $0x10, .Lpoly(%rip), TMP1, TMP2
    vpshufd    $78, TMP1, TMP3
    vpxor      TMP3, TMP2, TMP1

    vpxor   TMP4, TMP1, T
    ret
.size GFMUL, .-GFMUL
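
# For reference, a rough C-intrinsics sketch of the Karatsuba multiply and the
# two-fold reduction performed by GFMUL above (illustrative only, not part of
# the original source; assumes the PCLMUL/SSE2 intrinsics from <wmmintrin.h>):
#
#   static __m128i gfmul_sketch(__m128i a, __m128i b)   /* a = T, b = TMP0 */
#   {
#       const __m128i poly = _mm_set_epi64x((long long)0xc200000000000000ULL, 1);
#       __m128i lo  = _mm_clmulepi64_si128(a, b, 0x00);           /* a.lo * b.lo */
#       __m128i hi  = _mm_clmulepi64_si128(a, b, 0x11);           /* a.hi * b.hi */
#       __m128i t1  = _mm_xor_si128(_mm_shuffle_epi32(a, 78), a); /* a.hi ^ a.lo */
#       __m128i t2  = _mm_xor_si128(_mm_shuffle_epi32(b, 78), b); /* b.hi ^ b.lo */
#       __m128i mid = _mm_clmulepi64_si128(t1, t2, 0x00);         /* Karatsuba   */
#       mid = _mm_xor_si128(mid, _mm_xor_si128(lo, hi));          /* fixup       */
#       lo  = _mm_xor_si128(lo, _mm_slli_si128(mid, 8));
#       hi  = _mm_xor_si128(hi, _mm_srli_si128(mid, 8));
#       for (int i = 0; i < 2; i++) {                    /* two reduction folds */
#           __m128i f = _mm_clmulepi64_si128(lo, poly, 0x10);
#           lo = _mm_xor_si128(_mm_shuffle_epi32(lo, 78), f);
#       }
#       return _mm_xor_si128(hi, lo);                    /* the new T           */
#   }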