; File: security/nss/lib/freebl/intel-gcm-x64-masm.asm
;
; LICENSE:
; This submission to NSS is to be made available under the terms of the
; Mozilla Public License, v. 2.0. You can obtain one at
; http://mozilla.org/MPL/2.0/.
;###############################################################################
; Copyright(c) 2014, Intel Corp.
; Developers and authors:
; Shay Gueron and Vlad Krasnov
; Intel Corporation, Israel Development Centre, Haifa, Israel
; Please send feedback directly to crypto.feedback.alias@intel.com
;
; Syntax : MASM (ml64), Intel operand order.
; ABI    : Microsoft x64. Integer args arrive in rcx, rdx, r8, r9, further
;          args on the stack above the 32-byte shadow space. xmm6-xmm15 and
;          r12, r13, rbp are callee-saved and are preserved by every routine
;          here that touches them.
; ISA    : the code uses VEX-encoded AES (vaesenc) and carry-less multiply
;          (vpclmulqdq) instructions.


.DATA
ALIGN 16
; 128-bit constants used to step the counter block (added with vpaddd).
Lone            dq 1,0
Ltwo            dq 2,0
; vpshufb mask that reverses the 16 bytes of an xmm register.
Lbswap_mask     db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
Lshuff_mask     dq 0f0f0f0f0f0f0f0fh, 0f0f0f0f0f0f0f0fh
; Reduction constant used by every GHASH folding step below.
Lpoly           dq 01h, 0c200000000000000h

.CODE


;-------------------------------------------------------------------------------
; GFMUL DST, SRC1, SRC2, TMP1, TMP2, TMP3, TMP4
;
; DST = SRC1 * SRC2, carry-less multiplied (Karatsuba: three vpclmulqdq) and
; then folded back to 128 bits with two Lpoly steps.
; DST is written only by the last instruction, so DST may be the same register
; as SRC1 (every caller in this file does that).
; Clobbers TMP1..TMP4. SRC1 and SRC2 are only read.
;-------------------------------------------------------------------------------
GFMUL MACRO DST, SRC1, SRC2, TMP1, TMP2, TMP3, TMP4
    vpclmulqdq  TMP1, SRC2, SRC1, 0h
    vpclmulqdq  TMP4, SRC2, SRC1, 011h

    vpshufd     TMP2, SRC2, 78
    vpshufd     TMP3, SRC1, 78
    vpxor       TMP2, TMP2, SRC2
    vpxor       TMP3, TMP3, SRC1

    vpclmulqdq  TMP2, TMP2, TMP3, 0h
    vpxor       TMP2, TMP2, TMP1
    vpxor       TMP2, TMP2, TMP4

    vpslldq     TMP3, TMP2, 8
    vpsrldq     TMP2, TMP2, 8

    vpxor       TMP1, TMP1, TMP3
    vpxor       TMP4, TMP4, TMP2

    vpclmulqdq  TMP2, TMP1, [Lpoly], 010h
    vpshufd     TMP3, TMP1, 78
    vpxor       TMP1, TMP2, TMP3

    vpclmulqdq  TMP2, TMP1, [Lpoly], 010h
    vpshufd     TMP3, TMP1, 78
    vpxor       TMP1, TMP2, TMP3

    vpxor       DST, TMP1, TMP4

    ENDM

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Generates the final GCM tag
; void intel_aes_gcmTAG(unsigned char Htbl[16*16],
;                       unsigned char *Tp,
;                       unsigned int Mlen,
;                       unsigned int Alen,
;                       unsigned char *X0,
;                       unsigned char *TAG);
;
; In   : rcx = Htbl, rdx = Tp, r8d = Mlen (bytes), r9d = Alen (bytes),
;        [rsp+40] = X0, [rsp+48] = TAG  (return address + 32-byte shadow space)
; Out  : 16 bytes written to TAG:
;        byteswap( (T xor (Alen*8 : Mlen*8)) * Htbl[0] ) xor X0
; Clobb: r8, r9, r10, r11, xmm0-xmm5, flags
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmTAG PROC

Htbl    textequ <rcx>
Tp      textequ <rdx>
Mlen    textequ <r8>
Alen    textequ <r9>
X0      textequ <r10>
TAG     textequ <r11>

T       textequ <xmm0>
TMP0    textequ <xmm1>

    ; 5th and 6th arguments live above the return address and shadow space.
    mov         X0, [rsp + 1*8 + 4*8]
    mov         TAG, [rsp + 1*8 + 5*8]

    ; Mlen / Alen are 32-bit arguments. The Win64 ABI leaves the upper half
    ; of r8 / r9 unspecified, so zero-extend before the 64-bit shifts below.
    mov         r8d, r8d
    mov         r9d, r9d

    vzeroupper
    vmovdqu     T, XMMWORD PTR[Tp]
    vpxor       TMP0, TMP0, TMP0

    ; byte counts -> bit counts
    shl         Mlen, 3
    shl         Alen, 3

    ;vpinsrq TMP0, TMP0, Mlen, 0
    ;vpinsrq TMP0, TMP0, Alen, 1
    ; workaround the ml64.exe vpinsrq issue: build the two qwords from dwords
    vpinsrd     TMP0, TMP0, r8d, 0
    vpinsrd     TMP0, TMP0, r9d, 2
    shr         Mlen, 32
    shr         Alen, 32
    vpinsrd     TMP0, TMP0, r8d, 1
    vpinsrd     TMP0, TMP0, r9d, 3

    ; fold the length block into the running hash and multiply by Htbl[0]
    vpxor       T, T, TMP0
    vmovdqu     TMP0, XMMWORD PTR[Htbl]
    GFMUL       T, T, TMP0, xmm2, xmm3, xmm4, xmm5

    vpshufb     T, T, [Lbswap_mask]
    vpxor       T, T, [X0]
    vmovdqu     XMMWORD PTR[TAG], T
    vzeroupper

    ret

intel_aes_gcmTAG ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Generates the H table
; void intel_aes_gcmINIT(unsigned char Htbl[16*16], unsigned char *KS, int NR);
;
; In   : rcx = Htbl (output, 16 entries of 16 bytes), rdx = KS (round keys,
;        16 bytes each, round key 0 first), r8d = NR (number of AES rounds)
; Out  : Htbl[0]       = H' = encrypted zero block, byte-reversed, doubled
;        Htbl[i], i<8  = Htbl[i-1] * H'      (GFMUL)
;        Htbl[8+i]     = (high qword xor low qword) of Htbl[i], in both qword
;                        lanes (the operand of the Karatsuba middle product)
; Clobb: rdx, r8d, xmm0-xmm5, flags
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmINIT PROC

Htbl    textequ <rcx>
KS      textequ <rdx>
NR      textequ <r8d>

T       textequ <xmm0>
TMP0    textequ <xmm1>

    vzeroupper
    ; AES-ENC(0): the state after round 0 of an all-zero block is round key 0
    vmovdqu     T, XMMWORD PTR[KS]
    lea         KS, [16 + KS]
    dec         NR
    ; NR-1 full rounds ...
Lenc_loop:
    vaesenc     T, T, [KS]
    lea         KS, [16 + KS]
    dec         NR
    jnz         Lenc_loop

    ; ... followed by the final round
    vaesenclast T, T, [KS]
    vpshufb     T, T, [Lbswap_mask]

    ;Calculate H` = GFMUL(H, 2): 128-bit left shift by one bit, then xor in
    ;Lpoly when the bit shifted out of the top was set
    vpsrad      xmm3, T, 31
    vpshufd     xmm3, xmm3, 0ffh
    vpand       xmm5, xmm3, [Lpoly]
    vpsrld      xmm3, T, 31
    vpslld      xmm4, T, 1
    vpslldq     xmm3, xmm3, 4
    vpxor       T, xmm4, xmm3
    vpxor       T, T, xmm5

    vmovdqu     TMP0, T
    vmovdqu     XMMWORD PTR[Htbl + 0*16], T

    vpshufd     xmm2, T, 78
    vpxor       xmm2, xmm2, T
    vmovdqu     XMMWORD PTR[Htbl + 8*16 + 0*16], xmm2

    ; assembly-time unrolled: entries 1..7 and their Karatsuba helpers
    i = 1
    WHILE i LT 8
    GFMUL       T, T, TMP0, xmm2, xmm3, xmm4, xmm5
    vmovdqu     XMMWORD PTR[Htbl + i*16], T
    vpshufd     xmm2, T, 78
    vpxor       xmm2, xmm2, T
    vmovdqu     XMMWORD PTR[Htbl + 8*16 + i*16], xmm2
    i = i+1
    ENDM
    vzeroupper
    ret
intel_aes_gcmINIT ENDP


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Authenticate only
; void intel_aes_gcmAAD(unsigned char Htbl[16*16], unsigned char *AAD, unsigned int Alen, unsigned char *Tp);
;
; In   : rcx = Htbl, rdx = AAD, r8d = Alen (bytes), r9 = Tp (running hash,
;        16 bytes, read and written in place)
; Out  : *Tp updated with the hash of Alen/16 blocks; no-op when Alen == 0
; Pre  : Alen must be a multiple of 16. The prefix loop counts down in steps
;        of 16 and only stops when the count reaches exactly zero.
; Clobb: rdx, r8, r10, xmm0-xmm5, flags (xmm6, xmm7 are saved and restored)
;
; While hashing, the state is kept as an unreduced pair (Xhi : T): T is the
; low half that still has to go through two Lpoly folding steps, Xhi is the
; high half that is xor-ed in afterwards.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmAAD PROC

Htbl    textequ <rcx>
inp     textequ <rdx>
len     textequ <r8>
Tp      textequ <r9>
hlp0    textequ <r10>

DATA    textequ <xmm0>
T       textequ <xmm1>
TMP0    textequ <xmm2>
TMP1    textequ <xmm3>
TMP2    textequ <xmm4>
TMP3    textequ <xmm5>
TMP4    textequ <xmm6>
Xhi     textequ <xmm7>

; Accumulate DATA * Htbl[i] into TMP0 (low), TMP1 (high), TMP2 (middle).
; Clobbers TMP3.
KARATSUBA_AAD MACRO i
    vpclmulqdq  TMP3, DATA, [Htbl + i*16], 0h
    vpxor       TMP0, TMP0, TMP3
    vpclmulqdq  TMP3, DATA, [Htbl + i*16], 011h
    vpxor       TMP1, TMP1, TMP3
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP3, TMP3, [Htbl + 8*16 + i*16], 0h
    vpxor       TMP2, TMP2, TMP3
ENDM

    ; Alen is a 32-bit argument; the upper half of r8 is unspecified on entry.
    mov         r8d, r8d

    test        len, len
    jnz         LbeginAAD
    ret

LbeginAAD:
    vzeroupper

    ; xmm6 / xmm7 are callee-saved in the Win64 ABI
    sub         rsp, 2*16
    vmovdqu     XMMWORD PTR[rsp + 0*16], xmm6
    vmovdqu     XMMWORD PTR[rsp + 1*16], xmm7

    vmovdqu     T, XMMWORD PTR[Tp]
    ; We hash 8 blocks each iteration. If the total amount of blocks is not a
    ; multiple of 8, we hash the first n%8 blocks first.
    mov         hlp0, len
    and         hlp0, 128-1
    jnz         LprefixAAD

    ; No prefix blocks. The loop below folds T twice and then xors Xhi into
    ; it, so the already reduced hash loaded from Tp must enter as the high
    ; half: (Xhi : T) = (*Tp : 0). Entering with (0 : *Tp) would fold a value
    ; that is already reduced and corrupt the hash whenever *Tp is non-zero.
    vmovdqa     Xhi, T
    vpxor       T, T, T
    jmp         Lmod_loop

LprefixAAD:
    and         len, -128
    sub         hlp0, 16                ; hlp0 = offset of the power of H' for this block

    ; Prefix block: the only one that absorbs the incoming hash value
    vmovdqu     DATA, XMMWORD PTR[inp]
    vpshufb     DATA, DATA, [Lbswap_mask]
    vpxor       DATA, DATA, T

    vpclmulqdq  TMP0, DATA, [Htbl + hlp0], 0h
    vpclmulqdq  TMP1, DATA, [Htbl + hlp0], 011h
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP2, TMP3, [Htbl + 8*16 + hlp0], 0h

    lea         inp, [inp+16]
    test        hlp0, hlp0
    jnz         Lpre_loop
    jmp         Lred1

    ; hash remaining prefix blocks (up to 7 total prefix blocks)
Lpre_loop:

    sub         hlp0, 16

    vmovdqu     DATA, XMMWORD PTR[inp]
    vpshufb     DATA, DATA, [Lbswap_mask]

    vpclmulqdq  TMP3, DATA, [Htbl + hlp0], 0h
    vpxor       TMP0, TMP0, TMP3
    vpclmulqdq  TMP3, DATA, [Htbl + hlp0], 011h
    vpxor       TMP1, TMP1, TMP3
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP3, TMP3, [Htbl + 8*16 + hlp0], 0h
    vpxor       TMP2, TMP2, TMP3

    test        hlp0, hlp0
    lea         inp, [inp+16]           ; lea leaves the flags of the test intact
    jnz         Lpre_loop

Lred1:

    ; Karatsuba fixup: spread the middle product over the two halves
    vpxor       TMP2, TMP2, TMP0
    vpxor       TMP2, TMP2, TMP1
    vpsrldq     TMP3, TMP2, 8
    vpslldq     TMP2, TMP2, 8

    vpxor       Xhi, TMP1, TMP3
    vpxor       T, TMP0, TMP2


Lmod_loop:

    sub         len, 16*8
    jb          Ldone
    ; Blocks are consumed last-to-first so that the first block of the group,
    ; which also absorbs the previous hash, meets the highest power (Htbl[7]).
    ; The reduction of the previous (Xhi : T) is interleaved with the
    ; multiplications of blocks #2..#5.

    ; Block #0
    vmovdqu     DATA, XMMWORD PTR[inp + 16*7]
    vpshufb     DATA, DATA, [Lbswap_mask]

    vpclmulqdq  TMP0, DATA, [Htbl + 0*16], 0h
    vpclmulqdq  TMP1, DATA, [Htbl + 0*16], 011h
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP2, TMP3, [Htbl + 8*16 + 0*16], 0h

    ; Block #1
    vmovdqu     DATA, XMMWORD PTR[inp + 16*6]
    vpshufb     DATA, DATA, [Lbswap_mask]
    KARATSUBA_AAD 1

    ; Block #2
    vmovdqu     DATA, XMMWORD PTR[inp + 16*5]
    vpshufb     DATA, DATA, [Lbswap_mask]

    vpclmulqdq  TMP4, T, [Lpoly], 010h  ;reduction stage 1a
    vpalignr    T, T, T, 8

    KARATSUBA_AAD 2

    vpxor       T, T, TMP4              ;reduction stage 1b

    ; Block #3
    vmovdqu     DATA, XMMWORD PTR[inp + 16*4]
    vpshufb     DATA, DATA, [Lbswap_mask]
    KARATSUBA_AAD 3
    ; Block #4
    vmovdqu     DATA, XMMWORD PTR[inp + 16*3]
    vpshufb     DATA, DATA, [Lbswap_mask]

    vpclmulqdq  TMP4, T, [Lpoly], 010h  ;reduction stage 2a
    vpalignr    T, T, T, 8

    KARATSUBA_AAD 4

    vpxor       T, T, TMP4              ;reduction stage 2b
    ; Block #5
    vmovdqu     DATA, XMMWORD PTR[inp + 16*2]
    vpshufb     DATA, DATA, [Lbswap_mask]
    KARATSUBA_AAD 5

    vpxor       T, T, Xhi               ;reduction finalize
    ; Block #6
    vmovdqu     DATA, XMMWORD PTR[inp + 16*1]
    vpshufb     DATA, DATA, [Lbswap_mask]
    KARATSUBA_AAD 6
    ; Block #7
    vmovdqu     DATA, XMMWORD PTR[inp + 16*0]
    vpshufb     DATA, DATA, [Lbswap_mask]
    vpxor       DATA, DATA, T
    KARATSUBA_AAD 7
    ; Aggregated 8 blocks, now karatsuba fixup
    vpxor       TMP2, TMP2, TMP0
    vpxor       TMP2, TMP2, TMP1
    vpsrldq     TMP3, TMP2, 8
    vpslldq     TMP2, TMP2, 8

    vpxor       Xhi, TMP1, TMP3
    vpxor       T, TMP0, TMP2

    lea         inp, [inp + 16*8]
    jmp         Lmod_loop

Ldone:
    ; final reduction of (Xhi : T)
    vpclmulqdq  TMP4, T, [Lpoly], 010h
    vpalignr    T, T, T, 8
    vpxor       T, T, TMP4

    vpclmulqdq  TMP4, T, [Lpoly], 010h
    vpalignr    T, T, T, 8
    vpxor       T, T, TMP4

    vpxor       T, T, Xhi
    vmovdqu     XMMWORD PTR[Tp], T
    vzeroupper

    vmovdqu     xmm6, XMMWORD PTR[rsp + 0*16]
    vmovdqu     xmm7, XMMWORD PTR[rsp + 1*16]
    add         rsp, 16*2

    ret

intel_aes_gcmAAD ENDP


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Encrypt and Authenticate
; void intel_aes_gcmENC(unsigned char* PT, unsigned char* CT, void *Gctx, unsigned int len);
;
; In   : rcx = PT, rdx = CT, r8 = Gctx, r9d = len (bytes); no-op when len == 0
;
; Context fields read or written here (byte offsets from Gctx):
;   0                  Htbl, addressed through the same register as Gctx
;   16*16 + 1*16       T, running hash (read at entry, written at exit)
;   16*16 + 2*16       counter block; its last dword is the block counter and
;                      is written back at exit
;   16*16 + 3*16       pointer to the key structure: dword at +4 is the round
;                      count NR, round keys start at +48
;
; Aligned scratch frame (rsp after alignment):
;   [rsp + 0*16]            scratch block for the partial tail
;   [rsp + k*16], k=1..7    byte-reversed ciphertext block (7-k) of the last
;                           group of 8, waiting to be hashed
;   [rsp + 8*16 + i*16]     counter block i, already xor-ed with round key 0
;
; Clobb: rax, rcx, rdx, r9, r10, xmm0-xmm5, flags
;        (r11, r12, r13, rbp and xmm6-xmm15 are saved and restored)
;
; The macros and text equates defined in this procedure stay defined for the
; rest of the file; intel_aes_gcmDEC uses ROUND, ROUNDMUL and the aliases.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmENC PROC

PT      textequ <rcx>
CT      textequ <rdx>
Htbl    textequ <r8>
Gctx    textequ <r8>
len     textequ <r9>
KS      textequ <r10>
NR      textequ <eax>

aluCTR  textequ <r11d>
aluKSl  textequ <r12d>
aluTMP  textequ <r13d>

T       textequ <xmm0>
TMP0    textequ <xmm1>
TMP1    textequ <xmm2>
TMP2    textequ <xmm3>
TMP3    textequ <xmm4>
TMP4    textequ <xmm5>
TMP5    textequ <xmm6>
CTR0    textequ <xmm7>
CTR1    textequ <xmm8>
CTR2    textequ <xmm9>
CTR3    textequ <xmm10>
CTR4    textequ <xmm11>
CTR5    textequ <xmm12>
CTR6    textequ <xmm13>
CTR7    textequ <xmm14>
BSWAPMASK textequ <xmm15>

; One AES round with round key i on all eight counter blocks. Clobbers TMP3.
ROUND MACRO i
    vmovdqu     TMP3, XMMWORD PTR[i*16 + KS]
    vaesenc     CTR0, CTR0, TMP3
    vaesenc     CTR1, CTR1, TMP3
    vaesenc     CTR2, CTR2, TMP3
    vaesenc     CTR3, CTR3, TMP3
    vaesenc     CTR4, CTR4, TMP3
    vaesenc     CTR5, CTR5, TMP3
    vaesenc     CTR6, CTR6, TMP3
    vaesenc     CTR7, CTR7, TMP3
ENDM
; AES round i on all eight blocks, interleaved with accumulating
; TMP5 * Htbl[i] into TMP0 (middle), TMP1 (high), TMP2 (low).
; Clobbers TMP3 and TMP4.
ROUNDMUL MACRO i
    vmovdqu     TMP3, XMMWORD PTR[i*16 + KS]

    vaesenc     CTR0, CTR0, TMP3
    vaesenc     CTR1, CTR1, TMP3
    vaesenc     CTR2, CTR2, TMP3
    vaesenc     CTR3, CTR3, TMP3

    vpshufd     TMP4, TMP5, 78
    vpxor       TMP4, TMP4, TMP5

    vaesenc     CTR4, CTR4, TMP3
    vaesenc     CTR5, CTR5, TMP3
    vaesenc     CTR6, CTR6, TMP3
    vaesenc     CTR7, CTR7, TMP3

    vpclmulqdq  TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
    vpxor       TMP0, TMP0, TMP3
    vmovdqu     TMP4, XMMWORD PTR[i*16 + Htbl]
    vpclmulqdq  TMP3, TMP5, TMP4, 011h
    vpxor       TMP1, TMP1, TMP3
    vpclmulqdq  TMP3, TMP5, TMP4, 000h
    vpxor       TMP2, TMP2, TMP3
ENDM
; The multiply-accumulate half of ROUNDMUL without the AES round.
; Clobbers TMP3 and TMP4.
KARATSUBA MACRO i
    vpshufd     TMP4, TMP5, 78
    vpxor       TMP4, TMP4, TMP5
    vpclmulqdq  TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
    vpxor       TMP0, TMP0, TMP3
    vmovdqu     TMP4, XMMWORD PTR[i*16 + Htbl]
    vpclmulqdq  TMP3, TMP5, TMP4, 011h
    vpxor       TMP1, TMP1, TMP3
    vpclmulqdq  TMP3, TMP5, TMP4, 000h
    vpxor       TMP2, TMP2, TMP3
ENDM
; Advance the scalar counter and patch the last dword of counter slot i.
; aluKSl holds the byte-swapped last dword of round key 0, so the slot stays
; equal to (counter block xor round key 0).
NEXTCTR MACRO i
    add         aluCTR, 1
    mov         aluTMP, aluCTR
    xor         aluTMP, aluKSl
    bswap       aluTMP
    mov         [3*4 + 8*16 + i*16 + rsp], aluTMP
ENDM


    ; len is a 32-bit argument; the upper half of r9 is unspecified on entry.
    mov         r9d, r9d

    test        len, len
    jnz         LbeginENC
    ret

LbeginENC:

    vzeroupper
    push        r11
    push        r12
    push        r13
    push        rbp
    sub         rsp, 10*16
    vmovdqu     XMMWORD PTR[rsp + 0*16], xmm6
    vmovdqu     XMMWORD PTR[rsp + 1*16], xmm7
    vmovdqu     XMMWORD PTR[rsp + 2*16], xmm8
    vmovdqu     XMMWORD PTR[rsp + 3*16], xmm9
    vmovdqu     XMMWORD PTR[rsp + 4*16], xmm10
    vmovdqu     XMMWORD PTR[rsp + 5*16], xmm11
    vmovdqu     XMMWORD PTR[rsp + 6*16], xmm12
    vmovdqu     XMMWORD PTR[rsp + 7*16], xmm13
    vmovdqu     XMMWORD PTR[rsp + 8*16], xmm14
    vmovdqu     XMMWORD PTR[rsp + 9*16], xmm15

    ; rbp remembers the save area; below it, carve a 16-byte aligned frame so
    ; that the vmovdqa accesses to it are legal
    mov         rbp, rsp
    sub         rsp, 16*16
    and         rsp, -16

    vmovdqu     T, XMMWORD PTR[16*16 + 1*16 + Gctx]
    vmovdqu     CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    vmovdqu     BSWAPMASK, XMMWORD PTR[Lbswap_mask]
    mov         KS, [16*16 + 3*16 + Gctx]
    mov         NR, [4 + KS]
    lea         KS, [48 + KS]

    vpshufb     CTR0, CTR0, BSWAPMASK

    mov         aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
    mov         aluKSl, [3*4 + KS]
    bswap       aluCTR
    bswap       aluKSl

    ; counter slot 0 = counter block xor round key 0
    vmovdqu     TMP0, XMMWORD PTR[0*16 + KS]
    vpxor       TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    vmovdqu     XMMWORD PTR[8*16 + 0*16 + rsp], TMP0

    cmp         len, 128
    jb          LEncDataSingles
; Prepare the "top" counters
    vmovdqu     XMMWORD PTR[8*16 + 1*16 + rsp], TMP0
    vmovdqu     XMMWORD PTR[8*16 + 2*16 + rsp], TMP0
    vmovdqu     XMMWORD PTR[8*16 + 3*16 + rsp], TMP0
    vmovdqu     XMMWORD PTR[8*16 + 4*16 + rsp], TMP0
    vmovdqu     XMMWORD PTR[8*16 + 5*16 + rsp], TMP0
    vmovdqu     XMMWORD PTR[8*16 + 6*16 + rsp], TMP0
    vmovdqu     XMMWORD PTR[8*16 + 7*16 + rsp], TMP0

; Encrypt the initial 8 blocks (nothing to hash yet)
    sub         len, 128
    vpaddd      CTR1, CTR0, XMMWORD PTR[Lone]
    vpaddd      CTR2, CTR0, XMMWORD PTR[Ltwo]
    vpaddd      CTR3, CTR2, XMMWORD PTR[Lone]
    vpaddd      CTR4, CTR2, XMMWORD PTR[Ltwo]
    vpaddd      CTR5, CTR4, XMMWORD PTR[Lone]
    vpaddd      CTR6, CTR4, XMMWORD PTR[Ltwo]
    vpaddd      CTR7, CTR6, XMMWORD PTR[Lone]

    vpshufb     CTR0, CTR0, BSWAPMASK
    vpshufb     CTR1, CTR1, BSWAPMASK
    vpshufb     CTR2, CTR2, BSWAPMASK
    vpshufb     CTR3, CTR3, BSWAPMASK
    vpshufb     CTR4, CTR4, BSWAPMASK
    vpshufb     CTR5, CTR5, BSWAPMASK
    vpshufb     CTR6, CTR6, BSWAPMASK
    vpshufb     CTR7, CTR7, BSWAPMASK

    vmovdqu     TMP3, XMMWORD PTR[0*16 + KS]
    vpxor       CTR0, CTR0, TMP3
    vpxor       CTR1, CTR1, TMP3
    vpxor       CTR2, CTR2, TMP3
    vpxor       CTR3, CTR3, TMP3
    vpxor       CTR4, CTR4, TMP3
    vpxor       CTR5, CTR5, TMP3
    vpxor       CTR6, CTR6, TMP3
    vpxor       CTR7, CTR7, TMP3

    ROUND 1

    ; while the rounds run, prepare the counter slots for the next 8 blocks
    add         aluCTR, 8
    mov         aluTMP, aluCTR
    xor         aluTMP, aluKSl
    bswap       aluTMP
    mov         [8*16 + 0*16 + 3*4 + rsp], aluTMP

    ROUND 2
    NEXTCTR 1
    ROUND 3
    NEXTCTR 2
    ROUND 4
    NEXTCTR 3
    ROUND 5
    NEXTCTR 4
    ROUND 6
    NEXTCTR 5
    ROUND 7
    NEXTCTR 6
    ROUND 8
    NEXTCTR 7
    ROUND 9
    ; TMP5 = last round key, selected by the round count
    vmovdqu     TMP5, XMMWORD PTR[10*16 + KS]
    cmp         NR, 10
    je          @f

    ROUND 10
    ROUND 11
    vmovdqu     TMP5, XMMWORD PTR[12*16 + KS]
    cmp         NR, 12
    je          @f

    ROUND 12
    ROUND 13
    vmovdqu     TMP5, XMMWORD PTR[14*16 + KS]
@@:
    ; last round key xor plaintext goes in as the final-round key, so
    ; vaesenclast directly yields the ciphertext
    vpxor       TMP3, TMP5, XMMWORD PTR[0*16 + PT]
    vaesenclast CTR0, CTR0, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[1*16 + PT]
    vaesenclast CTR1, CTR1, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[2*16 + PT]
    vaesenclast CTR2, CTR2, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[3*16 + PT]
    vaesenclast CTR3, CTR3, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[4*16 + PT]
    vaesenclast CTR4, CTR4, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[5*16 + PT]
    vaesenclast CTR5, CTR5, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[6*16 + PT]
    vaesenclast CTR6, CTR6, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[7*16 + PT]
    vaesenclast CTR7, CTR7, TMP3

    vmovdqu     XMMWORD PTR[0*16 + CT], CTR0
    vpshufb     CTR0, CTR0, BSWAPMASK
    vmovdqu     XMMWORD PTR[1*16 + CT], CTR1
    vpshufb     CTR1, CTR1, BSWAPMASK
    vmovdqu     XMMWORD PTR[2*16 + CT], CTR2
    vpshufb     CTR2, CTR2, BSWAPMASK
    vmovdqu     XMMWORD PTR[3*16 + CT], CTR3
    vpshufb     CTR3, CTR3, BSWAPMASK
    vmovdqu     XMMWORD PTR[4*16 + CT], CTR4
    vpshufb     CTR4, CTR4, BSWAPMASK
    vmovdqu     XMMWORD PTR[5*16 + CT], CTR5
    vpshufb     CTR5, CTR5, BSWAPMASK
    vmovdqu     XMMWORD PTR[6*16 + CT], CTR6
    vpshufb     CTR6, CTR6, BSWAPMASK
    vmovdqu     XMMWORD PTR[7*16 + CT], CTR7
    vpshufb     TMP5, CTR7, BSWAPMASK

    ; park the byte-reversed ciphertext for hashing; the last block stays in TMP5
    vmovdqa     XMMWORD PTR[1*16 + rsp], CTR6
    vmovdqa     XMMWORD PTR[2*16 + rsp], CTR5
    vmovdqa     XMMWORD PTR[3*16 + rsp], CTR4
    vmovdqa     XMMWORD PTR[4*16 + rsp], CTR3
    vmovdqa     XMMWORD PTR[5*16 + rsp], CTR2
    vmovdqa     XMMWORD PTR[6*16 + rsp], CTR1
    vmovdqa     XMMWORD PTR[7*16 + rsp], CTR0

    lea         CT, [8*16 + CT]
    lea         PT, [8*16 + PT]
    ; falls through into the steady-state loop

LEncDataOctets:
    ; Each pass encrypts 8 new blocks while hashing the previous 8.
    cmp         len, 128
    jb          LEndEncOctets
    sub         len, 128

    vmovdqa     CTR0, XMMWORD PTR[8*16 + 0*16 + rsp]
    vmovdqa     CTR1, XMMWORD PTR[8*16 + 1*16 + rsp]
    vmovdqa     CTR2, XMMWORD PTR[8*16 + 2*16 + rsp]
    vmovdqa     CTR3, XMMWORD PTR[8*16 + 3*16 + rsp]
    vmovdqa     CTR4, XMMWORD PTR[8*16 + 4*16 + rsp]
    vmovdqa     CTR5, XMMWORD PTR[8*16 + 5*16 + rsp]
    vmovdqa     CTR6, XMMWORD PTR[8*16 + 6*16 + rsp]
    vmovdqa     CTR7, XMMWORD PTR[8*16 + 7*16 + rsp]

    ; last block of the previous group times Htbl[0] starts the accumulators
    vpshufd     TMP4, TMP5, 78
    vpxor       TMP4, TMP4, TMP5
    vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
    vmovdqu     TMP4, XMMWORD PTR[0*16 + Htbl]
    vpclmulqdq  TMP1, TMP5, TMP4, 011h
    vpclmulqdq  TMP2, TMP5, TMP4, 000h

    vmovdqu     TMP5, XMMWORD PTR[1*16 + rsp]
    ROUNDMUL 1
    NEXTCTR 0
    vmovdqu     TMP5, XMMWORD PTR[2*16 + rsp]
    ROUNDMUL 2
    NEXTCTR 1
    vmovdqu     TMP5, XMMWORD PTR[3*16 + rsp]
    ROUNDMUL 3
    NEXTCTR 2
    vmovdqu     TMP5, XMMWORD PTR[4*16 + rsp]
    ROUNDMUL 4
    NEXTCTR 3
    vmovdqu     TMP5, XMMWORD PTR[5*16 + rsp]
    ROUNDMUL 5
    NEXTCTR 4
    vmovdqu     TMP5, XMMWORD PTR[6*16 + rsp]
    ROUNDMUL 6
    NEXTCTR 5
    ; the first block of the previous group also absorbs the running hash
    vpxor       TMP5, T, XMMWORD PTR[7*16 + rsp]
    ROUNDMUL 7
    NEXTCTR 6

    ROUND 8
    NEXTCTR 7

    ; Karatsuba fixup: TMP4 = high half (kept until the end of the pass),
    ; T = low half, which is folded twice below
    vpxor       TMP0, TMP0, TMP1
    vpxor       TMP0, TMP0, TMP2
    vpsrldq     TMP3, TMP0, 8
    vpxor       TMP4, TMP1, TMP3
    vpslldq     TMP3, TMP0, 8
    vpxor       T, TMP2, TMP3

    vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h
    vpalignr    T,T,T,8
    vpxor       T, T, TMP1

    ROUND 9

    vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h
    vpalignr    T,T,T,8
    vpxor       T, T, TMP1

    vmovdqu     TMP5, XMMWORD PTR[10*16 + KS]
    cmp         NR, 10
    je          @f

    ROUND 10
    ROUND 11
    vmovdqu     TMP5, XMMWORD PTR[12*16 + KS]
    cmp         NR, 12
    je          @f

    ROUND 12
    ROUND 13
    vmovdqu     TMP5, XMMWORD PTR[14*16 + KS]
@@:
    vpxor       TMP3, TMP5, XMMWORD PTR[0*16 + PT]
    vaesenclast CTR0, CTR0, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[1*16 + PT]
    vaesenclast CTR1, CTR1, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[2*16 + PT]
    vaesenclast CTR2, CTR2, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[3*16 + PT]
    vaesenclast CTR3, CTR3, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[4*16 + PT]
    vaesenclast CTR4, CTR4, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[5*16 + PT]
    vaesenclast CTR5, CTR5, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[6*16 + PT]
    vaesenclast CTR6, CTR6, TMP3
    vpxor       TMP3, TMP5, XMMWORD PTR[7*16 + PT]
    vaesenclast CTR7, CTR7, TMP3

    vmovdqu     XMMWORD PTR[0*16 + CT], CTR0
    vpshufb     CTR0, CTR0, BSWAPMASK
    vmovdqu     XMMWORD PTR[1*16 + CT], CTR1
    vpshufb     CTR1, CTR1, BSWAPMASK
    vmovdqu     XMMWORD PTR[2*16 + CT], CTR2
    vpshufb     CTR2, CTR2, BSWAPMASK
    vmovdqu     XMMWORD PTR[3*16 + CT], CTR3
    vpshufb     CTR3, CTR3, BSWAPMASK
    vmovdqu     XMMWORD PTR[4*16 + CT], CTR4
    vpshufb     CTR4, CTR4, BSWAPMASK
    vmovdqu     XMMWORD PTR[5*16 + CT], CTR5
    vpshufb     CTR5, CTR5, BSWAPMASK
    vmovdqu     XMMWORD PTR[6*16 + CT], CTR6
    vpshufb     CTR6, CTR6, BSWAPMASK
    vmovdqu     XMMWORD PTR[7*16 + CT], CTR7
    vpshufb     TMP5, CTR7, BSWAPMASK

    vmovdqa     XMMWORD PTR[1*16 + rsp], CTR6
    vmovdqa     XMMWORD PTR[2*16 + rsp], CTR5
    vmovdqa     XMMWORD PTR[3*16 + rsp], CTR4
    vmovdqa     XMMWORD PTR[4*16 + rsp], CTR3
    vmovdqa     XMMWORD PTR[5*16 + rsp], CTR2
    vmovdqa     XMMWORD PTR[6*16 + rsp], CTR1
    vmovdqa     XMMWORD PTR[7*16 + rsp], CTR0

    ; finish the reduction: T is fully reduced at the end of every pass
    vpxor       T, T, TMP4

    lea         CT, [8*16 + CT]
    lea         PT, [8*16 + PT]
    jmp         LEncDataOctets

LEndEncOctets:

    ; hash the last group of 8 ciphertext blocks (no AES work left to overlap)
    vpshufd     TMP4, TMP5, 78
    vpxor       TMP4, TMP4, TMP5
    vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
    vmovdqu     TMP4, XMMWORD PTR[0*16 + Htbl]
    vpclmulqdq  TMP1, TMP5, TMP4, 011h
    vpclmulqdq  TMP2, TMP5, TMP4, 000h

    vmovdqu     TMP5, XMMWORD PTR[1*16 + rsp]
    KARATSUBA 1
    vmovdqu     TMP5, XMMWORD PTR[2*16 + rsp]
    KARATSUBA 2
    vmovdqu     TMP5, XMMWORD PTR[3*16 + rsp]
    KARATSUBA 3
    vmovdqu     TMP5, XMMWORD PTR[4*16 + rsp]
    KARATSUBA 4
    vmovdqu     TMP5, XMMWORD PTR[5*16 + rsp]
    KARATSUBA 5
    vmovdqu     TMP5, XMMWORD PTR[6*16 + rsp]
    KARATSUBA 6
    vpxor       TMP5, T, XMMWORD PTR[7*16 + rsp]
    KARATSUBA 7

    vpxor       TMP0, TMP0, TMP1
    vpxor       TMP0, TMP0, TMP2
    vpsrldq     TMP3, TMP0, 8
    vpxor       TMP4, TMP1, TMP3
    vpslldq     TMP3, TMP0, 8
    vpxor       T, TMP2, TMP3

    vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h
    vpalignr    T,T,T,8
    vpxor       T, T, TMP1

    vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h
    vpalignr    T,T,T,8
    vpxor       T, T, TMP1

    vpxor       T, T, TMP4

    ; slots 1..7 were prepared ahead but never used; step the scalar counter
    ; back so that it matches slot 0, the next block to be encrypted
    sub         aluCTR, 7

LEncDataSingles:

    ; one full block per pass: encrypt, store, hash
    cmp         len, 16
    jb          LEncDataTail
    sub         len, 16

    vmovdqa     TMP1, XMMWORD PTR[8*16 + 0*16 + rsp]
    NEXTCTR 0

    vaesenc     TMP1, TMP1, XMMWORD PTR[1*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[2*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[3*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[4*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[5*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[6*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[7*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[8*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[9*16 + KS]
    vmovdqu     TMP2, XMMWORD PTR[10*16 + KS]
    cmp         NR, 10
    je          @f
    vaesenc     TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    vmovdqu     TMP2, XMMWORD PTR[12*16 + KS]
    cmp         NR, 12
    je          @f
    vaesenc     TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    vmovdqu     TMP2, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast TMP1, TMP1, TMP2
    vpxor       TMP1, TMP1, XMMWORD PTR[PT]
    vmovdqu     XMMWORD PTR[CT], TMP1

    lea         PT, [16+PT]
    lea         CT, [16+CT]

    vpshufb     TMP1, TMP1, BSWAPMASK
    vpxor       T, T, TMP1
    vmovdqu     TMP0, XMMWORD PTR[Htbl]
    GFMUL       T, T, TMP0, TMP1, TMP2, TMP3, TMP4

    jmp         LEncDataSingles

LEncDataTail:

    ; 0 < len < 16: final partial block
    test        len, len
    jz          LEncDataEnd

    vmovdqa     TMP1, XMMWORD PTR[8*16 + 0*16 + rsp]
    ; This counter block is consumed here. Advance the counter that is written
    ; back to Gctx, as intel_aes_gcmDEC does on its tail path, so the saved
    ; counter never names a block whose keystream was already used.
    add         aluCTR, 1

    vaesenc     TMP1, TMP1, XMMWORD PTR[1*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[2*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[3*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[4*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[5*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[6*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[7*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[8*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[9*16 + KS]
    vmovdqu     TMP2, XMMWORD PTR[10*16 + KS]
    cmp         NR, 10
    je          @f
    vaesenc     TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    vmovdqu     TMP2, XMMWORD PTR[12*16 + KS]
    cmp         NR, 12
    je          @f
    vaesenc     TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    vaesenc     TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    vmovdqu     TMP2, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast TMP1, TMP1, TMP2
; zero a temp location
    vpxor       TMP2, TMP2, TMP2
    vmovdqa     XMMWORD PTR[rsp], TMP2
; copy as many bytes as needed
; From here on KS (r10) is reused as a byte index and al as a byte temporary;
; the key pointer and NR (eax) are not needed any more.
    xor         KS, KS

@@:
    cmp         len, KS
    je          @f
    mov         al, [PT + KS]
    mov         [rsp + KS], al
    inc         KS
    jmp         @b
@@:
    ; keystream xor zero-padded plaintext, then copy out only len bytes
    vpxor       TMP1, TMP1, XMMWORD PTR[rsp]
    vmovdqa     XMMWORD PTR[rsp], TMP1
    xor         KS, KS
@@:
    cmp         len, KS
    je          @f
    mov         al, [rsp + KS]
    mov         [CT + KS], al
    inc         KS
    jmp         @b
@@:
    ; clear the bytes past len so the hash sees zero-padded ciphertext
    cmp         KS, 16
    je          @f
    mov         BYTE PTR[rsp + KS], 0
    inc         KS
    jmp         @b
@@:
    vmovdqa     TMP1, XMMWORD PTR[rsp]
    vpshufb     TMP1, TMP1, BSWAPMASK
    vpxor       T, T, TMP1
    vmovdqu     TMP0, XMMWORD PTR[Htbl]
    GFMUL       T, T, TMP0, TMP1, TMP2, TMP3, TMP4

LEncDataEnd:

    ; write the running hash and the next counter value back to the context
    vmovdqu     XMMWORD PTR[16*16 + 1*16 + Gctx], T
    bswap       aluCTR
    mov         [16*16 + 2*16 + 3*4 + Gctx], aluCTR

    mov         rsp, rbp

    vmovdqu     xmm6, XMMWORD PTR[rsp + 0*16]
    vmovdqu     xmm7, XMMWORD PTR[rsp + 1*16]
    vmovdqu     xmm8, XMMWORD PTR[rsp + 2*16]
    vmovdqu     xmm9, XMMWORD PTR[rsp + 3*16]
    vmovdqu     xmm10, XMMWORD PTR[rsp + 4*16]
    vmovdqu     xmm11, XMMWORD PTR[rsp + 5*16]
    vmovdqu     xmm12, XMMWORD PTR[rsp + 6*16]
    vmovdqu     xmm13, XMMWORD PTR[rsp + 7*16]
    vmovdqu     xmm14, XMMWORD PTR[rsp + 8*16]
    vmovdqu     xmm15, XMMWORD PTR[rsp + 9*16]

    add         rsp, 10*16
    pop         rbp
    pop         r13
    pop         r12
    pop         r11

    vzeroupper

    ret
intel_aes_gcmENC ENDP

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1.949 +; 1.950 +; Decrypt and Authenticate 1.951 +; void intel_aes_gcmDEC(uint8_t* PT, uint8_t* CT, void *Gctx, unsigned int len); 1.952 +; 1.953 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1.954 + 1.955 +ALIGN 16 1.956 +intel_aes_gcmDEC PROC 1.957 + 1.958 +NEXTCTR MACRO i 1.959 + add aluCTR, 1 1.960 + mov aluTMP, aluCTR 1.961 + xor aluTMP, aluKSl 1.962 + bswap aluTMP 1.963 + mov [3*4 + i*16 + rsp], aluTMP 1.964 +ENDM 1.965 + 1.966 +PT textequ <rdx> 1.967 +CT textequ <rcx> 1.968 + 1.969 + test len, len 1.970 + jnz LbeginDEC 1.971 + ret 1.972 + 1.973 +LbeginDEC: 1.974 + 1.975 + vzeroupper 1.976 + push r11 1.977 + push r12 1.978 + push r13 1.979 + push rbp 1.980 + sub rsp, 10*16 1.981 + vmovdqu XMMWORD PTR[rsp + 0*16], xmm6 1.982 + vmovdqu XMMWORD PTR[rsp + 1*16], xmm7 1.983 + vmovdqu XMMWORD PTR[rsp + 2*16], xmm8 1.984 + vmovdqu XMMWORD PTR[rsp + 3*16], xmm9 1.985 + vmovdqu XMMWORD PTR[rsp + 4*16], xmm10 1.986 + vmovdqu XMMWORD PTR[rsp + 5*16], xmm11 1.987 + vmovdqu XMMWORD PTR[rsp + 6*16], xmm12 1.988 + vmovdqu XMMWORD PTR[rsp + 7*16], xmm13 1.989 + vmovdqu XMMWORD PTR[rsp + 8*16], xmm14 1.990 + vmovdqu XMMWORD PTR[rsp + 9*16], xmm15 1.991 + 1.992 + mov rbp, rsp 1.993 + sub rsp, 8*16 1.994 + and rsp, -16 1.995 + 1.996 + vmovdqu T, XMMWORD PTR[16*16 + 1*16 + Gctx] 1.997 + vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx] 1.998 + vmovdqu BSWAPMASK, XMMWORD PTR[Lbswap_mask] 1.999 + mov KS, [16*16 + 3*16 + Gctx] 1.1000 + mov NR, [4 + KS] 1.1001 + lea KS, [48 + KS] 1.1002 + 1.1003 + vpshufb CTR0, CTR0, BSWAPMASK 1.1004 + 1.1005 + mov aluCTR, [16*16 + 2*16 + 3*4 + Gctx] 1.1006 + mov aluKSl, [3*4 + KS] 1.1007 + bswap aluCTR 1.1008 + bswap aluKSl 1.1009 + 1.1010 + vmovdqu TMP0, XMMWORD PTR[0*16 + KS] 1.1011 + vpxor TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx] 1.1012 + vmovdqu XMMWORD PTR[0*16 + rsp], TMP0 1.1013 + 1.1014 + cmp len, 128 1.1015 + jb 
LDecDataSingles 1.1016 +; Prepare the "top" counters 1.1017 + vmovdqu XMMWORD PTR[1*16 + rsp], TMP0 1.1018 + vmovdqu XMMWORD PTR[2*16 + rsp], TMP0 1.1019 + vmovdqu XMMWORD PTR[3*16 + rsp], TMP0 1.1020 + vmovdqu XMMWORD PTR[4*16 + rsp], TMP0 1.1021 + vmovdqu XMMWORD PTR[5*16 + rsp], TMP0 1.1022 + vmovdqu XMMWORD PTR[6*16 + rsp], TMP0 1.1023 + vmovdqu XMMWORD PTR[7*16 + rsp], TMP0 1.1024 + 1.1025 + NEXTCTR 1 1.1026 + NEXTCTR 2 1.1027 + NEXTCTR 3 1.1028 + NEXTCTR 4 1.1029 + NEXTCTR 5 1.1030 + NEXTCTR 6 1.1031 + NEXTCTR 7 1.1032 + 1.1033 +LDecDataOctets: 1.1034 + cmp len, 128 1.1035 + jb LEndDecOctets 1.1036 + sub len, 128 1.1037 + 1.1038 + vmovdqa CTR0, XMMWORD PTR[0*16 + rsp] 1.1039 + vmovdqa CTR1, XMMWORD PTR[1*16 + rsp] 1.1040 + vmovdqa CTR2, XMMWORD PTR[2*16 + rsp] 1.1041 + vmovdqa CTR3, XMMWORD PTR[3*16 + rsp] 1.1042 + vmovdqa CTR4, XMMWORD PTR[4*16 + rsp] 1.1043 + vmovdqa CTR5, XMMWORD PTR[5*16 + rsp] 1.1044 + vmovdqa CTR6, XMMWORD PTR[6*16 + rsp] 1.1045 + vmovdqa CTR7, XMMWORD PTR[7*16 + rsp] 1.1046 + 1.1047 + vmovdqu TMP5, XMMWORD PTR[7*16 + CT] 1.1048 + vpshufb TMP5, TMP5, BSWAPMASK 1.1049 + vpshufd TMP4, TMP5, 78 1.1050 + vpxor TMP4, TMP4, TMP5 1.1051 + vpclmulqdq TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h 1.1052 + vmovdqu TMP4, XMMWORD PTR[0*16 + Htbl] 1.1053 + vpclmulqdq TMP1, TMP5, TMP4, 011h 1.1054 + vpclmulqdq TMP2, TMP5, TMP4, 000h 1.1055 + 1.1056 + vmovdqu TMP5, XMMWORD PTR[6*16 + CT] 1.1057 + vpshufb TMP5, TMP5, BSWAPMASK 1.1058 + ROUNDMUL 1 1.1059 + NEXTCTR 0 1.1060 + vmovdqu TMP5, XMMWORD PTR[5*16 + CT] 1.1061 + vpshufb TMP5, TMP5, BSWAPMASK 1.1062 + ROUNDMUL 2 1.1063 + NEXTCTR 1 1.1064 + vmovdqu TMP5, XMMWORD PTR[4*16 + CT] 1.1065 + vpshufb TMP5, TMP5, BSWAPMASK 1.1066 + ROUNDMUL 3 1.1067 + NEXTCTR 2 1.1068 + vmovdqu TMP5, XMMWORD PTR[3*16 + CT] 1.1069 + vpshufb TMP5, TMP5, BSWAPMASK 1.1070 + ROUNDMUL 4 1.1071 + NEXTCTR 3 1.1072 + vmovdqu TMP5, XMMWORD PTR[2*16 + CT] 1.1073 + vpshufb TMP5, TMP5, BSWAPMASK 1.1074 + ROUNDMUL 5 1.1075 + 
NEXTCTR 4 1.1076 + vmovdqu TMP5, XMMWORD PTR[1*16 + CT] 1.1077 + vpshufb TMP5, TMP5, BSWAPMASK 1.1078 + ROUNDMUL 6 1.1079 + NEXTCTR 5 1.1080 + vmovdqu TMP5, XMMWORD PTR[0*16 + CT] 1.1081 + vpshufb TMP5, TMP5, BSWAPMASK 1.1082 + vpxor TMP5, TMP5, T 1.1083 + ROUNDMUL 7 1.1084 + NEXTCTR 6 1.1085 + 1.1086 + ROUND 8 1.1087 + NEXTCTR 7 1.1088 + 1.1089 + vpxor TMP0, TMP0, TMP1 1.1090 + vpxor TMP0, TMP0, TMP2 1.1091 + vpsrldq TMP3, TMP0, 8 1.1092 + vpxor TMP4, TMP1, TMP3 1.1093 + vpslldq TMP3, TMP0, 8 1.1094 + vpxor T, TMP2, TMP3 1.1095 + 1.1096 + vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h 1.1097 + vpalignr T,T,T,8 1.1098 + vpxor T, T, TMP1 1.1099 + 1.1100 + ROUND 9 1.1101 + 1.1102 + vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h 1.1103 + vpalignr T,T,T,8 1.1104 + vpxor T, T, TMP1 1.1105 + 1.1106 + vmovdqu TMP5, XMMWORD PTR[10*16 + KS] 1.1107 + cmp NR, 10 1.1108 + je @f 1.1109 + 1.1110 + ROUND 10 1.1111 + ROUND 11 1.1112 + vmovdqu TMP5, XMMWORD PTR[12*16 + KS] 1.1113 + cmp NR, 12 1.1114 + je @f 1.1115 + 1.1116 + ROUND 12 1.1117 + ROUND 13 1.1118 + vmovdqu TMP5, XMMWORD PTR[14*16 + KS] 1.1119 +@@: 1.1120 + vpxor TMP3, TMP5, XMMWORD PTR[0*16 + CT] 1.1121 + vaesenclast CTR0, CTR0, TMP3 1.1122 + vpxor TMP3, TMP5, XMMWORD PTR[1*16 + CT] 1.1123 + vaesenclast CTR1, CTR1, TMP3 1.1124 + vpxor TMP3, TMP5, XMMWORD PTR[2*16 + CT] 1.1125 + vaesenclast CTR2, CTR2, TMP3 1.1126 + vpxor TMP3, TMP5, XMMWORD PTR[3*16 + CT] 1.1127 + vaesenclast CTR3, CTR3, TMP3 1.1128 + vpxor TMP3, TMP5, XMMWORD PTR[4*16 + CT] 1.1129 + vaesenclast CTR4, CTR4, TMP3 1.1130 + vpxor TMP3, TMP5, XMMWORD PTR[5*16 + CT] 1.1131 + vaesenclast CTR5, CTR5, TMP3 1.1132 + vpxor TMP3, TMP5, XMMWORD PTR[6*16 + CT] 1.1133 + vaesenclast CTR6, CTR6, TMP3 1.1134 + vpxor TMP3, TMP5, XMMWORD PTR[7*16 + CT] 1.1135 + vaesenclast CTR7, CTR7, TMP3 1.1136 + 1.1137 + vmovdqu XMMWORD PTR[0*16 + PT], CTR0 1.1138 + vmovdqu XMMWORD PTR[1*16 + PT], CTR1 1.1139 + vmovdqu XMMWORD PTR[2*16 + PT], CTR2 1.1140 + vmovdqu XMMWORD PTR[3*16 + PT], 
CTR3 1.1141 + vmovdqu XMMWORD PTR[4*16 + PT], CTR4 1.1142 + vmovdqu XMMWORD PTR[5*16 + PT], CTR5 1.1143 + vmovdqu XMMWORD PTR[6*16 + PT], CTR6 1.1144 + vmovdqu XMMWORD PTR[7*16 + PT], CTR7 1.1145 + 1.1146 + vpxor T, T, TMP4 1.1147 + 1.1148 + lea CT, [8*16 + CT] 1.1149 + lea PT, [8*16 + PT] 1.1150 + jmp LDecDataOctets 1.1151 + 1.1152 +LEndDecOctets: 1.1153 + 1.1154 + sub aluCTR, 7 1.1155 + 1.1156 +LDecDataSingles: 1.1157 + 1.1158 + cmp len, 16 1.1159 + jb LDecDataTail 1.1160 + sub len, 16 1.1161 + 1.1162 + vmovdqa TMP1, XMMWORD PTR[0*16 + rsp] 1.1163 + NEXTCTR 0 1.1164 + 1.1165 + vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS] 1.1166 + vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS] 1.1167 + vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS] 1.1168 + vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS] 1.1169 + vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS] 1.1170 + vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS] 1.1171 + vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS] 1.1172 + vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS] 1.1173 + vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS] 1.1174 + vmovdqu TMP2, XMMWORD PTR[10*16 + KS] 1.1175 + cmp NR, 10 1.1176 + je @f 1.1177 + vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS] 1.1178 + vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS] 1.1179 + vmovdqu TMP2, XMMWORD PTR[12*16 + KS] 1.1180 + cmp NR, 12 1.1181 + je @f 1.1182 + vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS] 1.1183 + vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS] 1.1184 + vmovdqu TMP2, XMMWORD PTR[14*16 + KS] 1.1185 +@@: 1.1186 + vaesenclast TMP1, TMP1, TMP2 1.1187 + 1.1188 + vmovdqu TMP2, XMMWORD PTR[CT] 1.1189 + vpxor TMP1, TMP1, TMP2 1.1190 + vmovdqu XMMWORD PTR[PT], TMP1 1.1191 + 1.1192 + lea PT, [16+PT] 1.1193 + lea CT, [16+CT] 1.1194 + 1.1195 + vpshufb TMP2, TMP2, BSWAPMASK 1.1196 + vpxor T, T, TMP2 1.1197 + vmovdqu TMP0, XMMWORD PTR[Htbl] 1.1198 + GFMUL T, T, TMP0, TMP1, TMP2, TMP3, TMP4 1.1199 + 1.1200 + jmp LDecDataSingles 1.1201 + 1.1202 +LDecDataTail: 1.1203 + 1.1204 + test len, len 1.1205 + jz LDecDataEnd 1.1206 + 
1.1207 + vmovdqa TMP1, XMMWORD PTR[0*16 + rsp] 1.1208 + inc aluCTR 1.1209 + vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS] 1.1210 + vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS] 1.1211 + vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS] 1.1212 + vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS] 1.1213 + vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS] 1.1214 + vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS] 1.1215 + vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS] 1.1216 + vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS] 1.1217 + vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS] 1.1218 + vmovdqu TMP2, XMMWORD PTR[10*16 + KS] 1.1219 + cmp NR, 10 1.1220 + je @f 1.1221 + vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS] 1.1222 + vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS] 1.1223 + vmovdqu TMP2, XMMWORD PTR[12*16 + KS] 1.1224 + cmp NR, 12 1.1225 + je @f 1.1226 + vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS] 1.1227 + vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS] 1.1228 + vmovdqu TMP2, XMMWORD PTR[14*16 + KS] 1.1229 +@@: 1.1230 + vaesenclast TMP1, TMP1, TMP2 1.1231 +; copy as many bytes as needed 1.1232 + xor KS, KS 1.1233 +@@: 1.1234 + cmp len, KS 1.1235 + je @f 1.1236 + mov al, [CT + KS] 1.1237 + mov [rsp + KS], al 1.1238 + inc KS 1.1239 + jmp @b 1.1240 +@@: 1.1241 + cmp KS, 16 1.1242 + je @f 1.1243 + mov BYTE PTR[rsp + KS], 0 1.1244 + inc KS 1.1245 + jmp @b 1.1246 +@@: 1.1247 + vmovdqa TMP2, XMMWORD PTR[rsp] 1.1248 + vpshufb TMP2, TMP2, BSWAPMASK 1.1249 + vpxor T, T, TMP2 1.1250 + vmovdqu TMP0, XMMWORD PTR[Htbl] 1.1251 + GFMUL T, T, TMP0, TMP5, TMP2, TMP3, TMP4 1.1252 + 1.1253 + 1.1254 + vpxor TMP1, TMP1, XMMWORD PTR[rsp] 1.1255 + vmovdqa XMMWORD PTR[rsp], TMP1 1.1256 + xor KS, KS 1.1257 +@@: 1.1258 + cmp len, KS 1.1259 + je @f 1.1260 + mov al, [rsp + KS] 1.1261 + mov [PT + KS], al 1.1262 + inc KS 1.1263 + jmp @b 1.1264 +@@: 1.1265 + 1.1266 +LDecDataEnd: 1.1267 + 1.1268 + vmovdqu XMMWORD PTR[16*16 + 1*16 + Gctx], T 1.1269 + bswap aluCTR 1.1270 + mov [16*16 + 2*16 + 3*4 + Gctx], aluCTR 1.1271 + 1.1272 + mov rsp, rbp 1.1273 + 
1.1274 + vmovdqu xmm6, XMMWORD PTR[rsp + 0*16] 1.1275 + vmovdqu xmm7, XMMWORD PTR[rsp + 1*16] 1.1276 + vmovdqu xmm8, XMMWORD PTR[rsp + 2*16] 1.1277 + vmovdqu xmm9, XMMWORD PTR[rsp + 3*16] 1.1278 + vmovdqu xmm10, XMMWORD PTR[rsp + 4*16] 1.1279 + vmovdqu xmm11, XMMWORD PTR[rsp + 5*16] 1.1280 + vmovdqu xmm12, XMMWORD PTR[rsp + 6*16] 1.1281 + vmovdqu xmm13, XMMWORD PTR[rsp + 7*16] 1.1282 + vmovdqu xmm14, XMMWORD PTR[rsp + 8*16] 1.1283 + vmovdqu xmm15, XMMWORD PTR[rsp + 9*16] 1.1284 + 1.1285 + add rsp, 10*16 1.1286 + pop rbp 1.1287 + pop r13 1.1288 + pop r12 1.1289 + pop r11 1.1290 + 1.1291 + vzeroupper 1.1292 + 1.1293 + ret 1.1294 +ret 1.1295 +intel_aes_gcmDEC ENDP 1.1296 + 1.1297 + 1.1298 +END