; LICENSE:
; This submission to NSS is to be made available under the terms of the
; Mozilla Public License, v. 2.0. You can obtain one at
; http://mozilla.org/MPL/2.0/.
;###############################################################################
; Copyright(c) 2014, Intel Corp.
; Developers and authors:
; Shay Gueron and Vlad Krasnov
; Intel Corporation, Israel Development Centre, Haifa, Israel
; Please send feedback directly to crypto.feedback.alias@intel.com
;
; Syntax: MASM, 32-bit x86, flat model.
; ABI:    C calling convention (.MODEL FLAT, C); every routine reads its
;         arguments from the stack relative to esp/ebp.
; ISA:    uses VEX-encoded AES (vaesenc*), carry-less multiply (vpclmulqdq)
;         and AVX integer instructions.
;
; Context layout as used by the code below (Gctx points at its start):
;   Gctx + 0              Htbl: 8 powers of H (16 bytes each), followed at
;                         +8*16 by 8 entries holding (hi64 xor lo64) of each
;                         power, used for the Karatsuba middle product
;   Gctx + 16*16 + 1*16   T   : running GHASH value
;   Gctx + 16*16 + 2*16   CTR : current counter block
;   Gctx + 16*16 + 3*16   pointer to the AES context; the round count is read
;                         at offset 4 of that context and the round keys at
;                         offset 44


.MODEL FLAT, C
.XMM

.DATA
ALIGN 16
Lone            dq 1,0
Ltwo            dq 2,0
Lbswap_mask     db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
Lshuff_mask     dq 0f0f0f0f0f0f0f0fh, 0f0f0f0f0f0f0f0fh
Lpoly           dq 01h, 0c200000000000000h

.CODE


;-------------------------------------------------------------------------------
; GFMUL DST, SRC1, SRC2, TMP1, TMP2, TMP3, TMP4
;
; DST = SRC1 * SRC2 in the GHASH field: one Karatsuba 128x128 carry-less
; multiply (three vpclmulqdq) followed by a two-step fold of the low 128 bits
; with Lpoly and an xor with the high 128 bits.
; Clobbers TMP1..TMP4. DST may alias SRC1 (it is written last).
;-------------------------------------------------------------------------------
GFMUL MACRO DST, SRC1, SRC2, TMP1, TMP2, TMP3, TMP4
    vpclmulqdq  TMP1, SRC2, SRC1, 0h        ; lo64 * lo64
    vpclmulqdq  TMP4, SRC2, SRC1, 011h      ; hi64 * hi64

    vpshufd     TMP2, SRC2, 78              ; 78 = swap the two 64-bit halves
    vpshufd     TMP3, SRC1, 78
    vpxor       TMP2, TMP2, SRC2            ; hi xor lo of SRC2
    vpxor       TMP3, TMP3, SRC1            ; hi xor lo of SRC1

    vpclmulqdq  TMP2, TMP2, TMP3, 0h        ; Karatsuba middle product
    vpxor       TMP2, TMP2, TMP1
    vpxor       TMP2, TMP2, TMP4

    vpslldq     TMP3, TMP2, 8               ; split the middle term across
    vpsrldq     TMP2, TMP2, 8               ; the low and high halves

    vpxor       TMP1, TMP1, TMP3            ; TMP1 = low 128 bits of product
    vpxor       TMP4, TMP4, TMP2            ; TMP4 = high 128 bits of product

    vpclmulqdq  TMP2, TMP1, [Lpoly], 010h   ; fold step 1
    vpshufd     TMP3, TMP1, 78
    vpxor       TMP1, TMP2, TMP3

    vpclmulqdq  TMP2, TMP1, [Lpoly], 010h   ; fold step 2
    vpshufd     TMP3, TMP1, 78
    vpxor       TMP1, TMP2, TMP3

    vpxor       DST, TMP1, TMP4             ; reduced = folded low xor high

    ENDM

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Generates the final GCM tag
; void intel_aes_gcmTAG(unsigned char Htbl[16*16],
;                       unsigned char *Tp,
;                       unsigned int Mlen,
;                       unsigned int Alen,
;                       unsigned char* X0,
;                       unsigned char* TAG);
;
; Reads the 16-byte hash at Tp, xors in the two lengths converted to bits,
; multiplies by Htbl[0], byte-reverses the result, xors it with the 16 bytes
; at X0 and stores 16 bytes at TAG.
; Saves/restores ebx. Clobbers eax, ecx, edx, xmm0-xmm5, flags.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmTAG PROC

Htbl    textequ <eax>
Tp      textequ <ecx>
X0      textequ <edx>
TAG     textequ <ebx>

T       textequ <xmm0>
TMP0    textequ <xmm1>

    push    ebx

    ; arguments sit above the return address and the saved ebx (2*4 bytes)
    mov     Htbl,   [esp + 2*4 + 0*4]
    mov     Tp,     [esp + 2*4 + 1*4]
    mov     X0,     [esp + 2*4 + 4*4]
    mov     TAG,    [esp + 2*4 + 5*4]

    vzeroupper
    vmovdqu T, XMMWORD PTR[Tp]

    ; length block: Mlen in the low qword, Alen in the high qword,
    ; both multiplied by 8 (bytes -> bits)
    vpxor   TMP0, TMP0, TMP0
    vpinsrd TMP0, TMP0, DWORD PTR[esp + 2*4 + 2*4], 0
    vpinsrd TMP0, TMP0, DWORD PTR[esp + 2*4 + 3*4], 2
    vpsllq  TMP0, TMP0, 3

    vpxor   T, T, TMP0
    vmovdqu TMP0, XMMWORD PTR[Htbl]
    GFMUL   T, T, TMP0, xmm2, xmm3, xmm4, xmm5

    vpshufb T, T, [Lbswap_mask]
    vpxor   T, T, [X0]
    vmovdqu XMMWORD PTR[TAG], T
    vzeroupper

    pop     ebx

    ret

intel_aes_gcmTAG ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Generates the H table
; void intel_aes_gcmINIT(unsigned char Htbl[16*16], unsigned char *KS, int NR);
;
; Encrypts the all-zero block with the NR-round key schedule at KS to get H,
; byte-reverses it, multiplies it by 2 in the field and stores eight
; successive powers at Htbl[0..7], plus (hi64 xor lo64) of each power at
; Htbl[8..15].
; NR must be at least 2: the loop decrements before testing for zero.
; Clobbers eax, ecx, edx, xmm0-xmm5, flags.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmINIT PROC

Htbl    textequ <eax>
KS      textequ <ecx>
NR      textequ <edx>

T       textequ <xmm0>
TMP0    textequ <xmm1>

    mov     Htbl,   [esp + 4*1 + 0*4]
    mov     KS,     [esp + 4*1 + 1*4]
    mov     NR,     [esp + 4*1 + 2*4]

    vzeroupper
    ; AES-ENC(0): the zero block xor round key 0 is just round key 0
    vmovdqu T, XMMWORD PTR[KS]
    lea     KS, [16 + KS]
    dec     NR
Lenc_loop:
        vaesenc T, T, [KS]
        lea     KS, [16 + KS]           ; lea keeps the flags untouched
        dec     NR
        jnz     Lenc_loop

    vaesenclast T, T, [KS]
    vpshufb T, T, [Lbswap_mask]

    ;Calculate H` = GFMUL(H, 2)
    vpsrad  xmm3, T, 31
    vpshufd xmm3, xmm3, 0ffh            ; broadcast the sign of the top dword
    vpand   xmm5, xmm3, [Lpoly]         ; polynomial, only if the top bit was set
    vpsrld  xmm3, T, 31                 ; per-dword carry bits
    vpslld  xmm4, T, 1
    vpslldq xmm3, xmm3, 4               ; move each carry to the next dword
    vpxor   T, xmm4, xmm3               ; T = H << 1 (128-bit)
    vpxor   T, T, xmm5

    vmovdqu TMP0, T
    vmovdqu XMMWORD PTR[Htbl + 0*16], T

    vpshufd xmm2, T, 78
    vpxor   xmm2, xmm2, T
    vmovdqu XMMWORD PTR[Htbl + 8*16 + 0*16], xmm2

    ; remaining seven powers, unrolled at assembly time
    i = 1
    WHILE i LT 8
        GFMUL   T, T, TMP0, xmm2, xmm3, xmm4, xmm5
        vmovdqu XMMWORD PTR[Htbl + i*16], T
        vpshufd xmm2, T, 78
        vpxor   xmm2, xmm2, T
        vmovdqu XMMWORD PTR[Htbl + 8*16 + i*16], xmm2
        i = i+1
        ENDM
    vzeroupper
    ret
intel_aes_gcmINIT ENDP


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Authenticate only
; void intel_aes_gcmAAD(unsigned char Htbl[16*16], unsigned char *AAD, unsigned int Alen, unsigned char *Tp);
;
; Folds Alen bytes at AAD into the 16-byte running hash at Tp (updated in
; place). Returns immediately when Alen is 0.
; Alen is expected to be a multiple of 16: the prefix loop steps a byte offset
; down by 16 and only stops when it reaches exactly zero.
; Saves/restores ebx, esi. Clobbers eax, ecx, edx, xmm0-xmm7, flags.
;
; Inside the routine the hash is carried unreduced as the pair (Xhi:T); its
; value is Xhi xor fold2(T), where fold2 is the two-step fold with Lpoly.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmAAD PROC

Htbl    textequ <eax>
inp     textequ <ecx>
len     textequ <edx>
Tp      textequ <ebx>
hlp0    textequ <esi>

DATA    textequ <xmm0>
T       textequ <xmm1>
TMP0    textequ <xmm2>
TMP1    textequ <xmm3>
TMP2    textequ <xmm4>
TMP3    textequ <xmm5>
TMP4    textequ <xmm6>
Xhi     textequ <xmm7>

; Accumulate DATA * Htbl[i]: low product into TMP0, high product into TMP1,
; Karatsuba middle product into TMP2. Clobbers TMP3.
KARATSUBA_AAD MACRO i
    vpclmulqdq  TMP3, DATA, [Htbl + i*16], 0h
    vpxor       TMP0, TMP0, TMP3
    vpclmulqdq  TMP3, DATA, [Htbl + i*16], 011h
    vpxor       TMP1, TMP1, TMP3
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP3, TMP3, [Htbl + 8*16 + i*16], 0h
    vpxor       TMP2, TMP2, TMP3
ENDM

    ; Nothing to do for an empty input. Alen is the third argument: return
    ; address (1*4) plus two preceding arguments (2*4).
    cmp   DWORD PTR[esp + 1*4 + 2*4], 0
    jnz   LbeginAAD
    ret

LbeginAAD:
    push  ebx
    push  esi

    ; arguments now sit above two saved registers and the return address
    mov   Htbl, [esp + 4*3 + 0*4]
    mov   inp,  [esp + 4*3 + 1*4]
    mov   len,  [esp + 4*3 + 2*4]
    mov   Tp,   [esp + 4*3 + 3*4]

    vzeroupper

    vmovdqu T, XMMWORD PTR[Tp]
    ;we hash 8 blocks each iteration, if the total amount of blocks is not a multiple of 8, we hash the first n%8 blocks first
    mov     hlp0, len
    and     hlp0, 128-1
    jnz     LprefixAAD

    ; No prefix blocks. The main loop folds T and then xors in Xhi, so the
    ; already-reduced running hash has to enter as the high half (Xhi) with a
    ; zero low half; otherwise it would be folded a second time.
    vmovdqu Xhi, T
    vpxor   T, T, T
    jmp     Lmod_loop

LprefixAAD:
    and     len, -128                   ; bytes left for the 8-block loop
    sub     hlp0, 16                    ; byte offset of the H power to use

    ; Prefix block: the running hash is xored into the first block
    vmovdqu DATA, XMMWORD PTR[inp]
    vpshufb DATA, DATA, [Lbswap_mask]
    vpxor   DATA, DATA, T

    vpclmulqdq  TMP0, DATA, XMMWORD PTR[Htbl + hlp0], 0h
    vpclmulqdq  TMP1, DATA, XMMWORD PTR[Htbl + hlp0], 011h
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP2, TMP3, XMMWORD PTR[Htbl + 8*16 + hlp0], 0h

    lea     inp, [inp+16]
    test    hlp0, hlp0
    jnz     Lpre_loop
    jmp     Lred1

    ;hash remaining prefix blocks (up to 7 total prefix blocks)
Lpre_loop:

        sub     hlp0, 16

        vmovdqu DATA, XMMWORD PTR[inp]
        vpshufb DATA, DATA, [Lbswap_mask]

        vpclmulqdq  TMP3, DATA, XMMWORD PTR[Htbl + hlp0], 0h
        vpxor       TMP0, TMP0, TMP3
        vpclmulqdq  TMP3, DATA, XMMWORD PTR[Htbl + hlp0], 011h
        vpxor       TMP1, TMP1, TMP3
        vpshufd     TMP3, DATA, 78
        vpxor       TMP3, TMP3, DATA
        vpclmulqdq  TMP3, TMP3, XMMWORD PTR[Htbl + 8*16 + hlp0], 0h
        vpxor       TMP2, TMP2, TMP3

        test    hlp0, hlp0
        lea     inp, [inp+16]           ; lea keeps the flags from test
        jnz     Lpre_loop

Lred1:

    ; Karatsuba fixup: leave the unreduced product in (Xhi:T)
    vpxor       TMP2, TMP2, TMP0
    vpxor       TMP2, TMP2, TMP1
    vpsrldq     TMP3, TMP2, 8
    vpslldq     TMP2, TMP2, 8

    vpxor       Xhi, TMP1, TMP3
    vpxor       T, TMP0, TMP2

Lmod_loop:

        sub     len, 16*8
        jb      Ldone
        ; The eight blocks are multiplied by descending powers of H: the last
        ; block in memory pairs with Htbl[0], the first with Htbl[7]. The
        ; reduction of the previous (Xhi:T) is interleaved with the multiplies.
        ; Block #0
        vmovdqu DATA, XMMWORD PTR[inp + 16*7]
        vpshufb DATA, DATA, XMMWORD PTR[Lbswap_mask]

        vpclmulqdq  TMP0, DATA, XMMWORD PTR[Htbl + 0*16], 0h
        vpclmulqdq  TMP1, DATA, XMMWORD PTR[Htbl + 0*16], 011h
        vpshufd     TMP3, DATA, 78
        vpxor       TMP3, TMP3, DATA
        vpclmulqdq  TMP2, TMP3, XMMWORD PTR[Htbl + 8*16 + 0*16], 0h

        ; Block #1
        vmovdqu DATA, XMMWORD PTR[inp + 16*6]
        vpshufb DATA, DATA, [Lbswap_mask]
        KARATSUBA_AAD 1

        ; Block #2
        vmovdqu DATA, XMMWORD PTR[inp + 16*5]
        vpshufb DATA, DATA, [Lbswap_mask]

        vpclmulqdq  TMP4, T, [Lpoly], 010h          ;reduction stage 1a
        vpalignr    T, T, T, 8

        KARATSUBA_AAD 2

        vpxor       T, T, TMP4                      ;reduction stage 1b

        ; Block #3
        vmovdqu DATA, XMMWORD PTR[inp + 16*4]
        vpshufb DATA, DATA, [Lbswap_mask]
        KARATSUBA_AAD 3
        ; Block #4
        vmovdqu DATA, XMMWORD PTR[inp + 16*3]
        vpshufb DATA, DATA, [Lbswap_mask]

        vpclmulqdq  TMP4, T, [Lpoly], 010h          ;reduction stage 2a
        vpalignr    T, T, T, 8

        KARATSUBA_AAD 4

        vpxor       T, T, TMP4                      ;reduction stage 2b
        ; Block #5
        vmovdqu DATA, XMMWORD PTR[inp + 16*2]
        vpshufb DATA, DATA, [Lbswap_mask]
        KARATSUBA_AAD 5

        vpxor   T, T, Xhi                           ;reduction finalize
        ; Block #6
        vmovdqu DATA, XMMWORD PTR[inp + 16*1]
        vpshufb DATA, DATA, [Lbswap_mask]
        KARATSUBA_AAD 6
        ; Block #7: the reduced running hash is xored into the first block
        vmovdqu DATA, XMMWORD PTR[inp + 16*0]
        vpshufb DATA, DATA, [Lbswap_mask]
        vpxor   DATA, DATA, T
        KARATSUBA_AAD 7
        ; Aggregated 8 blocks, now karatsuba fixup
        vpxor   TMP2, TMP2, TMP0
        vpxor   TMP2, TMP2, TMP1
        vpsrldq TMP3, TMP2, 8
        vpslldq TMP2, TMP2, 8

        vpxor   Xhi, TMP1, TMP3
        vpxor   T, TMP0, TMP2

        lea     inp, [inp + 16*8]
        jmp     Lmod_loop

Ldone:
    ; final reduction of (Xhi:T), then store the hash back
    vpclmulqdq  TMP4, T, [Lpoly], 010h
    vpalignr    T, T, T, 8
    vpxor       T, T, TMP4

    vpclmulqdq  TMP4, T, [Lpoly], 010h
    vpalignr    T, T, T, 8
    vpxor       T, T, TMP4

    vpxor       T, T, Xhi
    vmovdqu     XMMWORD PTR[Tp], T
    vzeroupper

    pop esi
    pop ebx
    ret

intel_aes_gcmAAD ENDP


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Encrypt and Authenticate
; void intel_aes_gcmENC(unsigned char* PT, unsigned char* CT, void *Gctx, unsigned int len);
;
; CTR-encrypts len bytes from PT to CT and folds the produced ciphertext into
; the running hash kept in Gctx; the counter in Gctx is updated on exit.
; Returns immediately when len is 0. Input is processed 7 blocks at a time,
; then single blocks, then a final partial block (zero-padded for hashing).
; Saves/restores ebp, ebx, esi, edi. Clobbers eax, ecx, edx, xmm0-xmm7, flags.
;
; Stack frame (esp aligned to 16, 16*16 bytes):
;   [0*16 + esp]            scratch for the partial tail block
;   [1*16 .. 6*16 + esp]    byte-reversed ciphertext blocks awaiting hashing
;   [8*16 + i*16 + esp]     counter block i already xored with round key 0
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmENC PROC

PT      textequ <eax>
CT      textequ <ecx>
Htbl    textequ <edx>
Gctx    textequ <edx>
len     textequ <DWORD PTR[ebp + 5*4 + 3*4]>
KS      textequ <esi>
NR      textequ <DWORD PTR[-40 + KS]>

aluCTR  textequ <ebx>
aluTMP  textequ <edi>

T       textequ <XMMWORD PTR[16*16 + 1*16 + Gctx]>
TMP0    textequ <xmm1>
TMP1    textequ <xmm2>
TMP2    textequ <xmm3>
TMP3    textequ <xmm4>
TMP4    textequ <xmm5>
TMP5    textequ <xmm6>

CTR0    textequ <xmm0>
CTR1    textequ <xmm1>
CTR2    textequ <xmm2>
CTR3    textequ <xmm3>
CTR4    textequ <xmm4>
CTR5    textequ <xmm5>
CTR6    textequ <xmm6>

; One AES round (round key i) on all seven counter blocks. Clobbers xmm7.
ROUND MACRO i
    vmovdqu xmm7, XMMWORD PTR[i*16 + KS]
    vaesenc CTR0, CTR0, xmm7
    vaesenc CTR1, CTR1, xmm7
    vaesenc CTR2, CTR2, xmm7
    vaesenc CTR3, CTR3, xmm7
    vaesenc CTR4, CTR4, xmm7
    vaesenc CTR5, CTR5, xmm7
    vaesenc CTR6, CTR6, xmm7
ENDM

; Accumulate TMP5 * Htbl[i]: middle product into TMP0, high product into
; TMP1, low product into TMP2. Clobbers TMP3, TMP4.
KARATSUBA MACRO i
    vpshufd TMP4, TMP5, 78
    vpxor   TMP4, TMP4, TMP5
    vpclmulqdq  TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
    vpxor       TMP0, TMP0, TMP3
    vmovdqu     TMP4, XMMWORD PTR[i*16 + Htbl]
    vpclmulqdq  TMP3, TMP5, TMP4, 011h
    vpxor       TMP1, TMP1, TMP3
    vpclmulqdq  TMP3, TMP5, TMP4, 000h
    vpxor       TMP2, TMP2, TMP3
ENDM

; Advance the scalar counter and patch it (byte-swapped and xored with the
; last dword of round key 0) into the last dword of counter slot i.
NEXTCTR MACRO i
    add aluCTR, 1
    mov aluTMP, aluCTR
    bswap   aluTMP
    xor aluTMP, [3*4 + KS]
    mov [3*4 + 8*16 + i*16 + esp], aluTMP
ENDM

    ; len is the fourth argument: return address (1*4) + three arguments
    cmp DWORD PTR[1*4 + 3*4 + esp], 0
    jne LbeginENC
    ret

LbeginENC:

    vzeroupper
    push    ebp
    push    ebx
    push    esi
    push    edi

    mov ebp, esp
    sub esp, 16*16
    and esp, -16                        ; vmovdqa is used on this frame

    mov PT, [ebp + 5*4 + 0*4]
    mov CT, [ebp + 5*4 + 1*4]
    mov Gctx, [ebp + 5*4 + 2*4]

    mov     KS, [16*16 + 3*16 + Gctx]
    lea     KS, [44 + KS]

    mov     aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
    bswap   aluCTR


    ; slot 0 = counter block xor round key 0
    vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
    vpxor   TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    vmovdqu XMMWORD PTR[8*16 + 0*16 + esp], TMP0

    cmp len, 16*7
    jb  LEncDataSingles
; Prepare the "top" counters
    vmovdqu XMMWORD PTR[8*16 + 1*16 + esp], TMP0
    vmovdqu XMMWORD PTR[8*16 + 2*16 + esp], TMP0
    vmovdqu XMMWORD PTR[8*16 + 3*16 + esp], TMP0
    vmovdqu XMMWORD PTR[8*16 + 4*16 + esp], TMP0
    vmovdqu XMMWORD PTR[8*16 + 5*16 + esp], TMP0
    vmovdqu XMMWORD PTR[8*16 + 6*16 + esp], TMP0

    vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
; Encrypt the initial 7 blocks
    sub len, 16*7
    vpaddd  CTR1, CTR0, XMMWORD PTR[Lone]
    vpaddd  CTR2, CTR0, XMMWORD PTR[Ltwo]
    vpaddd  CTR3, CTR2, XMMWORD PTR[Lone]
    vpaddd  CTR4, CTR2, XMMWORD PTR[Ltwo]
    vpaddd  CTR5, CTR4, XMMWORD PTR[Lone]
    vpaddd  CTR6, CTR4, XMMWORD PTR[Ltwo]

    vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR6, CTR6, XMMWORD PTR[Lbswap_mask]

    vmovdqu xmm7, XMMWORD PTR[0*16 + KS]
    vpxor   CTR0, CTR0, xmm7
    vpxor   CTR1, CTR1, xmm7
    vpxor   CTR2, CTR2, xmm7
    vpxor   CTR3, CTR3, xmm7
    vpxor   CTR4, CTR4, xmm7
    vpxor   CTR5, CTR5, xmm7
    vpxor   CTR6, CTR6, xmm7

    ROUND   1

    ; counter slots for the NEXT group of seven are prepared between rounds;
    ; slot 0 gets counter+7 here, NEXTCTR 1..6 fill the rest
    add aluCTR, 7
    mov aluTMP, aluCTR
    bswap   aluTMP
    xor aluTMP, [KS + 3*4]
    mov [8*16 + 0*16 + 3*4 + esp], aluTMP

    ROUND   2
    NEXTCTR 1
    ROUND   3
    NEXTCTR 2
    ROUND   4
    NEXTCTR 3
    ROUND   5
    NEXTCTR 4
    ROUND   6
    NEXTCTR 5
    ROUND   7
    NEXTCTR 6
    ROUND   8
    ROUND   9
    ; xmm7 = last round key, selected by the round count (10 / 12 / else 14)
    vmovdqu xmm7, XMMWORD PTR[10*16 + KS]
    cmp     NR, 10
    je      @f

    ROUND   10
    ROUND   11
    vmovdqu xmm7, XMMWORD PTR[12*16 + KS]
    cmp     NR, 12
    je      @f

    ROUND   12
    ROUND   13
    vmovdqu xmm7, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast CTR0, CTR0, xmm7
    vaesenclast CTR1, CTR1, xmm7
    vaesenclast CTR2, CTR2, xmm7
    vaesenclast CTR3, CTR3, xmm7
    vaesenclast CTR4, CTR4, xmm7
    vaesenclast CTR5, CTR5, xmm7
    vaesenclast CTR6, CTR6, xmm7

    vpxor   CTR0, CTR0, XMMWORD PTR[0*16 + PT]
    vpxor   CTR1, CTR1, XMMWORD PTR[1*16 + PT]
    vpxor   CTR2, CTR2, XMMWORD PTR[2*16 + PT]
    vpxor   CTR3, CTR3, XMMWORD PTR[3*16 + PT]
    vpxor   CTR4, CTR4, XMMWORD PTR[4*16 + PT]
    vpxor   CTR5, CTR5, XMMWORD PTR[5*16 + PT]
    vpxor   CTR6, CTR6, XMMWORD PTR[6*16 + PT]

    vmovdqu XMMWORD PTR[0*16 + CT], CTR0
    vmovdqu XMMWORD PTR[1*16 + CT], CTR1
    vmovdqu XMMWORD PTR[2*16 + CT], CTR2
    vmovdqu XMMWORD PTR[3*16 + CT], CTR3
    vmovdqu XMMWORD PTR[4*16 + CT], CTR4
    vmovdqu XMMWORD PTR[5*16 + CT], CTR5
    vmovdqu XMMWORD PTR[6*16 + CT], CTR6

    ; keep the byte-reversed ciphertext for hashing: the last block stays in
    ; TMP5, the others are stashed in reverse order in slots 1..6
    vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask]
    vpshufb TMP5, CTR6, XMMWORD PTR[Lbswap_mask]

    vmovdqa XMMWORD PTR[1*16 + esp], CTR5
    vmovdqa XMMWORD PTR[2*16 + esp], CTR4
    vmovdqa XMMWORD PTR[3*16 + esp], CTR3
    vmovdqa XMMWORD PTR[4*16 + esp], CTR2
    vmovdqa XMMWORD PTR[5*16 + esp], CTR1
    vmovdqa XMMWORD PTR[6*16 + esp], CTR0

    lea CT, [7*16 + CT]
    lea PT, [7*16 + PT]
    ; falls through into the 7-block loop

LEncData7:
        cmp len, 16*7
        jb  LEndEnc7
        sub len, 16*7

        ; hash the seven ciphertext blocks of the previous group
        vpshufd TMP4, TMP5, 78
        vpxor   TMP4, TMP4, TMP5
        vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
        vmovdqu     TMP4, XMMWORD PTR[0*16 + Htbl]
        vpclmulqdq  TMP1, TMP5, TMP4, 011h
        vpclmulqdq  TMP2, TMP5, TMP4, 000h

        vmovdqu TMP5, XMMWORD PTR[1*16 + esp]
        KARATSUBA 1
        vmovdqu TMP5, XMMWORD PTR[2*16 + esp]
        KARATSUBA 2
        vmovdqu TMP5, XMMWORD PTR[3*16 + esp]
        KARATSUBA 3
        vmovdqu TMP5, XMMWORD PTR[4*16 + esp]
        KARATSUBA 4
        vmovdqu TMP5, XMMWORD PTR[5*16 + esp]
        KARATSUBA 5
        vmovdqu TMP5, XMMWORD PTR[6*16 + esp]
        vpxor   TMP5, TMP5, T           ; running hash joins the oldest block
        KARATSUBA 6

        ; Karatsuba fixup: TMP4 = high 128 bits, TMP5 = low 128 bits
        vpxor   TMP0, TMP0, TMP1
        vpxor   TMP0, TMP0, TMP2
        vpsrldq TMP3, TMP0, 8
        vpxor   TMP4, TMP1, TMP3
        vpslldq TMP3, TMP0, 8
        vpxor   TMP5, TMP2, TMP3

        vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
        vpalignr    TMP5,TMP5,TMP5,8
        vpxor       TMP5, TMP5, TMP1

        vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
        vpalignr    TMP5,TMP5,TMP5,8
        vpxor       TMP5, TMP5, TMP1

        vpxor       TMP5, TMP5, TMP4
        vmovdqu     T, TMP5

        ; encrypt the next seven counters
        vmovdqa CTR0, XMMWORD PTR[8*16 + 0*16 + esp]
        vmovdqa CTR1, XMMWORD PTR[8*16 + 1*16 + esp]
        vmovdqa CTR2, XMMWORD PTR[8*16 + 2*16 + esp]
        vmovdqa CTR3, XMMWORD PTR[8*16 + 3*16 + esp]
        vmovdqa CTR4, XMMWORD PTR[8*16 + 4*16 + esp]
        vmovdqa CTR5, XMMWORD PTR[8*16 + 5*16 + esp]
        vmovdqa CTR6, XMMWORD PTR[8*16 + 6*16 + esp]

        ROUND   1
        NEXTCTR 0
        ROUND   2
        NEXTCTR 1
        ROUND   3
        NEXTCTR 2
        ROUND   4
        NEXTCTR 3
        ROUND   5
        NEXTCTR 4
        ROUND   6
        NEXTCTR 5
        ROUND   7
        NEXTCTR 6

        ROUND   8
        ROUND   9

        vmovdqu xmm7, XMMWORD PTR[10*16 + KS]
        cmp     NR, 10
        je      @f

        ROUND   10
        ROUND   11
        vmovdqu xmm7, XMMWORD PTR[12*16 + KS]
        cmp     NR, 12
        je      @f

        ROUND   12
        ROUND   13
        vmovdqu xmm7, XMMWORD PTR[14*16 + KS]
@@:
        vaesenclast CTR0, CTR0, xmm7
        vaesenclast CTR1, CTR1, xmm7
        vaesenclast CTR2, CTR2, xmm7
        vaesenclast CTR3, CTR3, xmm7
        vaesenclast CTR4, CTR4, xmm7
        vaesenclast CTR5, CTR5, xmm7
        vaesenclast CTR6, CTR6, xmm7

        vpxor   CTR0, CTR0, XMMWORD PTR[0*16 + PT]
        vpxor   CTR1, CTR1, XMMWORD PTR[1*16 + PT]
        vpxor   CTR2, CTR2, XMMWORD PTR[2*16 + PT]
        vpxor   CTR3, CTR3, XMMWORD PTR[3*16 + PT]
        vpxor   CTR4, CTR4, XMMWORD PTR[4*16 + PT]
        vpxor   CTR5, CTR5, XMMWORD PTR[5*16 + PT]
        vpxor   CTR6, CTR6, XMMWORD PTR[6*16 + PT]

        vmovdqu XMMWORD PTR[0*16 + CT], CTR0
        vmovdqu XMMWORD PTR[1*16 + CT], CTR1
        vmovdqu XMMWORD PTR[2*16 + CT], CTR2
        vmovdqu XMMWORD PTR[3*16 + CT], CTR3
        vmovdqu XMMWORD PTR[4*16 + CT], CTR4
        vmovdqu XMMWORD PTR[5*16 + CT], CTR5
        vmovdqu XMMWORD PTR[6*16 + CT], CTR6

        vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
        vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask]
        vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask]
        vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask]
        vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask]
        vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask]
        vpshufb TMP5, CTR6, XMMWORD PTR[Lbswap_mask]

        vmovdqa XMMWORD PTR[1*16 + esp], CTR5
        vmovdqa XMMWORD PTR[2*16 + esp], CTR4
        vmovdqa XMMWORD PTR[3*16 + esp], CTR3
        vmovdqa XMMWORD PTR[4*16 + esp], CTR2
        vmovdqa XMMWORD PTR[5*16 + esp], CTR1
        vmovdqa XMMWORD PTR[6*16 + esp], CTR0

        lea CT, [7*16 + CT]
        lea PT, [7*16 + PT]
        jmp LEncData7

LEndEnc7:

    ; hash the last group of seven ciphertext blocks
    vpshufd TMP4, TMP5, 78
    vpxor   TMP4, TMP4, TMP5
    vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
    vmovdqu     TMP4, XMMWORD PTR[0*16 + Htbl]
    vpclmulqdq  TMP1, TMP5, TMP4, 011h
    vpclmulqdq  TMP2, TMP5, TMP4, 000h

    vmovdqu TMP5, XMMWORD PTR[1*16 + esp]
    KARATSUBA 1
    vmovdqu TMP5, XMMWORD PTR[2*16 + esp]
    KARATSUBA 2
    vmovdqu TMP5, XMMWORD PTR[3*16 + esp]
    KARATSUBA 3
    vmovdqu TMP5, XMMWORD PTR[4*16 + esp]
    KARATSUBA 4
    vmovdqu TMP5, XMMWORD PTR[5*16 + esp]
    KARATSUBA 5
    vmovdqu TMP5, XMMWORD PTR[6*16 + esp]
    vpxor   TMP5, TMP5, T
    KARATSUBA 6

    vpxor   TMP0, TMP0, TMP1
    vpxor   TMP0, TMP0, TMP2
    vpsrldq TMP3, TMP0, 8
    vpxor   TMP4, TMP1, TMP3
    vpslldq TMP3, TMP0, 8
    vpxor   TMP5, TMP2, TMP3

    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    vpalignr    TMP5,TMP5,TMP5,8
    vpxor       TMP5, TMP5, TMP1

    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    vpalignr    TMP5,TMP5,TMP5,8
    vpxor       TMP5, TMP5, TMP1

    vpxor       TMP5, TMP5, TMP4
    vmovdqu     T, TMP5

    ; slots 0..6 were pre-filled for a group that will not run; step the
    ; scalar counter back so that it matches the value held in slot 0
    sub aluCTR, 6

LEncDataSingles:

        cmp len, 16
        jb  LEncDataTail
        sub len, 16

        vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + esp]
        NEXTCTR 0

        vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
        vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
        cmp NR, 10
        je  @f
        vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
        vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
        cmp NR, 12
        je  @f
        vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
        vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
        vaesenclast TMP1, TMP1, TMP2
        vpxor   TMP1, TMP1, XMMWORD PTR[PT]
        vmovdqu XMMWORD PTR[CT], TMP1

        lea PT, [16+PT]
        lea CT, [16+CT]

        ; fold the new ciphertext block into the hash
        vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
        vpxor   TMP1, TMP1, T

        vmovdqu TMP0, XMMWORD PTR[Htbl]
        GFMUL   TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
        vmovdqu T, TMP1

        jmp LEncDataSingles

LEncDataTail:

    cmp len, 0
    je  LEncDataEnd

    vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + esp]

    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
    cmp NR, 10
    je  @f
    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
    cmp NR, 12
    je  @f
    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast TMP1, TMP1, TMP2
; zero a temp location
    vpxor   TMP2, TMP2, TMP2
    vmovdqa XMMWORD PTR[esp], TMP2
; copy as many bytes as needed
    ; KS (esi) is reused as the byte index from here on; edx (Gctx) is parked
    ; in aluTMP because dl is the byte shuttle
    xor KS, KS
    mov aluTMP, edx
@@:
        cmp len, KS
        je  @f
        mov dl, BYTE PTR[PT + KS]
        mov BYTE PTR[esp + KS], dl
        inc KS
        jmp @b
@@:
    vpxor   TMP1, TMP1, XMMWORD PTR[esp]
    vmovdqa XMMWORD PTR[esp], TMP1
    xor KS, KS
@@:
        cmp len, KS
        je  @f
        mov dl, BYTE PTR[esp + KS]
        mov BYTE PTR[CT + KS], dl
        inc KS
        jmp @b
@@:
        ; zero the bytes past len so the hashed block is zero-padded
        cmp KS, 16
        je  @f
        mov BYTE PTR[esp + KS], 0
        inc KS
        jmp @b
@@:
    mov edx, aluTMP                     ; restore Gctx
    vmovdqa TMP1, XMMWORD PTR[esp]
    vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
    vpxor   TMP1, TMP1, T

    vmovdqu TMP0, XMMWORD PTR[Htbl]
    GFMUL   TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
    vmovdqu T, TMP1

LEncDataEnd:
    ; NOTE(review): this increment also runs when there was no partial tail
    ; block (len a multiple of 16), in which case aluCTR already names the
    ; next unused counter. intel_aes_gcmDEC increments only inside its tail
    ; path, so the two routines store different counters for block-multiple
    ; lengths. Left unchanged because the stored counter is caller-visible
    ; state - confirm against the callers before changing.
    inc     aluCTR
    bswap   aluCTR
    mov     [16*16 + 2*16 + 3*4 + Gctx], aluCTR

    mov esp, ebp
    pop edi
    pop esi
    pop ebx
    pop ebp


    vzeroupper

    ret
intel_aes_gcmENC ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Decrypt and Authenticate
; void intel_aes_gcmDEC(uint8_t* CT, uint8_t* PT, void *Gctx, unsigned int len);
;
; The first argument is the ciphertext that is read and hashed, the second is
; the buffer the plaintext is written to. The counter in Gctx is updated on
; exit. Returns immediately when len is 0.
; Reuses the register/text equates, ROUND and KARATSUBA from
; intel_aes_gcmENC above; NEXTCTR is redefined because the counter slots live
; at [i*16 + esp] in this routine's smaller (8*16 byte) frame.
; Saves/restores ebp, ebx, esi, edi. Clobbers eax, ecx, edx, xmm0-xmm7, flags.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


NEXTCTR MACRO i
    add aluCTR, 1
    mov aluTMP, aluCTR
    bswap   aluTMP
    xor aluTMP, [3*4 + KS]
    mov [3*4 + i*16 + esp], aluTMP
ENDM

ALIGN 16
intel_aes_gcmDEC PROC

    cmp DWORD PTR[1*4 + 3*4 + esp], 0
    jne LbeginDEC
    ret

LbeginDEC:

    vzeroupper
    push    ebp
    push    ebx
    push    esi
    push    edi

    mov ebp, esp
    sub esp, 8*16
    and esp, -16

    mov CT, [ebp + 5*4 + 0*4]
    mov PT, [ebp + 5*4 + 1*4]
    mov Gctx, [ebp + 5*4 + 2*4]

    mov     KS, [16*16 + 3*16 + Gctx]
    lea     KS, [44 + KS]

    mov     aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
    bswap   aluCTR


    ; slot 0 = counter block xor round key 0
    vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
    vpxor   TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    vmovdqu XMMWORD PTR[0*16 + esp], TMP0

    cmp len, 16*7
    jb  LDecDataSingles
    vmovdqu XMMWORD PTR[1*16 + esp], TMP0
    vmovdqu XMMWORD PTR[2*16 + esp], TMP0
    vmovdqu XMMWORD PTR[3*16 + esp], TMP0
    vmovdqu XMMWORD PTR[4*16 + esp], TMP0
    vmovdqu XMMWORD PTR[5*16 + esp], TMP0
    vmovdqu XMMWORD PTR[6*16 + esp], TMP0
    ; the loop's first NEXTCTR 0 adds 1, bringing slot 0 back to the
    ; current counter
    dec aluCTR

LDecData7:
        cmp len, 16*7
        jb  LDecData7End
        sub len, 16*7

        ; hash the seven incoming ciphertext blocks (first block with the
        ; highest power of H) while the counter slots are filled in
        vmovdqu TMP5, XMMWORD PTR[0*16 + CT]
        vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
        vpxor   TMP5, TMP5, T
        vpshufd TMP4, TMP5, 78
        vpxor   TMP4, TMP4, TMP5
        vpclmulqdq  TMP0, TMP4, XMMWORD PTR[6*16 + 8*16 + Htbl], 000h
        vmovdqu     TMP4, XMMWORD PTR[6*16 + Htbl]
        vpclmulqdq  TMP1, TMP5, TMP4, 011h
        vpclmulqdq  TMP2, TMP5, TMP4, 000h

        NEXTCTR 0
        vmovdqu TMP5, XMMWORD PTR[1*16 + CT]
        vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
        KARATSUBA 5
        NEXTCTR 1
        vmovdqu TMP5, XMMWORD PTR[2*16 + CT]
        vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
        KARATSUBA 4
        NEXTCTR 2
        vmovdqu TMP5, XMMWORD PTR[3*16 + CT]
        vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
        KARATSUBA 3
        NEXTCTR 3
        vmovdqu TMP5, XMMWORD PTR[4*16 + CT]
        vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
        KARATSUBA 2
        NEXTCTR 4
        vmovdqu TMP5, XMMWORD PTR[5*16 + CT]
        vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
        KARATSUBA 1
        NEXTCTR 5
        vmovdqu TMP5, XMMWORD PTR[6*16 + CT]
        vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
        KARATSUBA 0
        NEXTCTR 6

        ; Karatsuba fixup and reduction
        vpxor   TMP0, TMP0, TMP1
        vpxor   TMP0, TMP0, TMP2
        vpsrldq TMP3, TMP0, 8
        vpxor   TMP4, TMP1, TMP3
        vpslldq TMP3, TMP0, 8
        vpxor   TMP5, TMP2, TMP3

        vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
        vpalignr    TMP5,TMP5,TMP5,8
        vpxor       TMP5, TMP5, TMP1

        vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
        vpalignr    TMP5,TMP5,TMP5,8
        vpxor       TMP5, TMP5, TMP1

        vpxor       TMP5, TMP5, TMP4
        vmovdqu     T, TMP5

        vmovdqa CTR0, XMMWORD PTR[0*16 + esp]
        vmovdqa CTR1, XMMWORD PTR[1*16 + esp]
        vmovdqa CTR2, XMMWORD PTR[2*16 + esp]
        vmovdqa CTR3, XMMWORD PTR[3*16 + esp]
        vmovdqa CTR4, XMMWORD PTR[4*16 + esp]
        vmovdqa CTR5, XMMWORD PTR[5*16 + esp]
        vmovdqa CTR6, XMMWORD PTR[6*16 + esp]

        ROUND   1
        ROUND   2
        ROUND   3
        ROUND   4
        ROUND   5
        ROUND   6
        ROUND   7
        ROUND   8
        ROUND   9
        vmovdqu xmm7, XMMWORD PTR[10*16 + KS]
        cmp     NR, 10
        je      @f

        ROUND   10
        ROUND   11
        vmovdqu xmm7, XMMWORD PTR[12*16 + KS]
        cmp     NR, 12
        je      @f

        ROUND   12
        ROUND   13
        vmovdqu xmm7, XMMWORD PTR[14*16 + KS]
@@:
        vaesenclast CTR0, CTR0, xmm7
        vaesenclast CTR1, CTR1, xmm7
        vaesenclast CTR2, CTR2, xmm7
        vaesenclast CTR3, CTR3, xmm7
        vaesenclast CTR4, CTR4, xmm7
        vaesenclast CTR5, CTR5, xmm7
        vaesenclast CTR6, CTR6, xmm7

        vpxor   CTR0, CTR0, XMMWORD PTR[0*16 + CT]
        vpxor   CTR1, CTR1, XMMWORD PTR[1*16 + CT]
        vpxor   CTR2, CTR2, XMMWORD PTR[2*16 + CT]
        vpxor   CTR3, CTR3, XMMWORD PTR[3*16 + CT]
        vpxor   CTR4, CTR4, XMMWORD PTR[4*16 + CT]
        vpxor   CTR5, CTR5, XMMWORD PTR[5*16 + CT]
        vpxor   CTR6, CTR6, XMMWORD PTR[6*16 + CT]

        vmovdqu XMMWORD PTR[0*16 + PT], CTR0
        vmovdqu XMMWORD PTR[1*16 + PT], CTR1
        vmovdqu XMMWORD PTR[2*16 + PT], CTR2
        vmovdqu XMMWORD PTR[3*16 + PT], CTR3
        vmovdqu XMMWORD PTR[4*16 + PT], CTR4
        vmovdqu XMMWORD PTR[5*16 + PT], CTR5
        vmovdqu XMMWORD PTR[6*16 + PT], CTR6

        lea CT, [7*16 + CT]
        lea PT, [7*16 + PT]
        jmp LDecData7

LDecData7End:

    ; refresh slot 0 with the next unused counter
    NEXTCTR 0

LDecDataSingles:

        cmp len, 16
        jb  LDecDataTail
        sub len, 16

        ; hash the ciphertext block first, then decrypt it
        vmovdqu TMP1, XMMWORD PTR[CT]
        vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
        vpxor   TMP1, TMP1, T

        vmovdqu TMP0, XMMWORD PTR[Htbl]
        GFMUL   TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
        vmovdqu T, TMP1

        vmovdqa TMP1, XMMWORD PTR[0*16 + esp]
        NEXTCTR 0

        vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
        vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
        cmp NR, 10
        je  @f
        vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
        vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
        cmp NR, 12
        je  @f
        vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
        vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
        vaesenclast TMP1, TMP1, TMP2
        vpxor   TMP1, TMP1, XMMWORD PTR[CT]
        vmovdqu XMMWORD PTR[PT], TMP1

        lea PT, [16+PT]
        lea CT, [16+CT]
        jmp LDecDataSingles

LDecDataTail:

    cmp len, 0
    je  LDecDataEnd

    vmovdqa TMP1, XMMWORD PTR[0*16 + esp]
    inc aluCTR                          ; the tail block consumes a counter
    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
    cmp NR, 10
    je  @f
    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
    cmp NR, 12
    je  @f
    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast xmm7, TMP1, TMP2        ; keystream block, kept in xmm7

; copy as many bytes as needed
    ; KS (esi) becomes the byte index; edx (Gctx) is parked in aluTMP because
    ; dl is the byte shuttle
    xor KS, KS
    mov aluTMP, edx
@@:
        cmp len, KS
        je  @f
        mov dl, BYTE PTR[CT + KS]
        mov BYTE PTR[esp + KS], dl
        inc KS
        jmp @b
@@:
        ; zero-pad the ciphertext block up to 16 bytes
        cmp KS, 16
        je  @f
        mov BYTE PTR[esp + KS], 0
        inc KS
        jmp @b
@@:
    mov edx, aluTMP                     ; restore Gctx
    vmovdqa TMP1, XMMWORD PTR[esp]
    vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
    vpxor   TMP1, TMP1, T

    vmovdqu TMP0, XMMWORD PTR[Htbl]
    GFMUL   TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
    vmovdqu T, TMP1

    ; decrypt the padded block in place, then copy out only len bytes
    vpxor   xmm7, xmm7, XMMWORD PTR[esp]
    vmovdqa XMMWORD PTR[esp], xmm7
    xor KS, KS
    mov aluTMP, edx                     ; edx was just restored; same value
@@:
        cmp len, KS
        je  @f
        mov dl, BYTE PTR[esp + KS]
        mov BYTE PTR[PT + KS], dl
        inc KS
        jmp @b
@@:
    mov edx, aluTMP

LDecDataEnd:

    bswap   aluCTR
    mov     [16*16 + 2*16 + 3*4 + Gctx], aluCTR

    mov esp, ebp
    pop edi
    pop esi
    pop ebx
    pop ebp

    vzeroupper

    ret
intel_aes_gcmDEC ENDP


END