; LICENSE:
; This submission to NSS is to be made available under the terms of the
; Mozilla Public License, v. 2.0. You can obtain one at http:
; //mozilla.org/MPL/2.0/.
;###############################################################################
; Copyright(c) 2014, Intel Corp.
; Developers and authors:
; Shay Gueron and Vlad Krasnov
; Intel Corporation, Israel Development Centre, Haifa, Israel
; Please send feedback directly to crypto.feedback.alias@intel.com

;-------------------------------------------------------------------------------
; AES-NI implementations of AES key expansion and of ECB / CBC / CTR bulk
; processing for 128-, 192- and 256-bit keys.
;
; Syntax : MASM (ml64), Intel operand order.
; ABI    : Microsoft x64.  Arguments 1-4 arrive in rcx, rdx, r8, r9; arguments
;          5 and 6 are read from [rsp + 8 + 32] and [rsp + 8 + 40] on entry
;          (return address + 32 bytes of shadow space).  xmm6-xmm15 are
;          callee-saved, so every routine that touches xmm6/xmm7/xmm8 spills
;          them to the stack first.
;
; Context layout as used by this file (offsets from the context pointer):
;   +16  16-byte CBC chaining block (read and written by the CBC routines)
;   +48  expanded key schedule, one 16-byte round key per slot
; CTR context layout as used by this file:
;   +8   pointer to the cipher context described above
;   +16  16-byte counter block; the last dword is a big-endian 32-bit counter
;
; NOTE(review): none of the procedures that adjust rsp declare unwind data
; (PROC FRAME / .allocstack / .savexmm128).  Confirm whether structured
; exception unwinding through these routines is required.
;-------------------------------------------------------------------------------


.DATA
ALIGN 16
; pshufb masks.  Each dword selects bytes (n+1, n+2, n+3, n) of one source
; dword, i.e. a RotWord of that dword, replicated into all four lanes.
; Lmask / Lmask256 rotate dword 3; Lmask192 rotates dword 1.
Lmask dd 0c0f0e0dh,0c0f0e0dh,0c0f0e0dh,0c0f0e0dh
Lmask192 dd 004070605h, 004070605h, 004070605h, 004070605h
Lmask256 dd 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh
; Round constants, replicated per lane.  Lcon1 is doubled with pslld after each
; use; Lcon2 (1bh, then 36h) supplies the last two AES-128 round constants.
Lcon1 dd 1,1,1,1
Lcon2 dd 1bh,1bh,1bh,1bh

.CODE

; Register roles for the bulk (ECB / CBC / CTR) routines.
; r8 and r9 are overwritten with arguments 5 and 6 without being read first.
ctx textequ <rcx>
output textequ <rdx>
input textequ <r8>
inputLen textequ <r9d>


;-------------------------------------------------------------------------------
; Eight-block round macros.  Each loads round key number i (relative to ctx)
; into xmm8 and applies one AES round to xmm0-xmm7.
; Clobbers: xmm8.
;-------------------------------------------------------------------------------
aes_rnd MACRO i
    movdqu  xmm8, [i*16 + ctx]
    aesenc  xmm0, xmm8
    aesenc  xmm1, xmm8
    aesenc  xmm2, xmm8
    aesenc  xmm3, xmm8
    aesenc  xmm4, xmm8
    aesenc  xmm5, xmm8
    aesenc  xmm6, xmm8
    aesenc  xmm7, xmm8
    ENDM

aes_last_rnd MACRO i
    movdqu  xmm8, [i*16 + ctx]
    aesenclast  xmm0, xmm8
    aesenclast  xmm1, xmm8
    aesenclast  xmm2, xmm8
    aesenclast  xmm3, xmm8
    aesenclast  xmm4, xmm8
    aesenclast  xmm5, xmm8
    aesenclast  xmm6, xmm8
    aesenclast  xmm7, xmm8
    ENDM

aes_dec_rnd MACRO i
    movdqu  xmm8, [i*16 + ctx]
    aesdec  xmm0, xmm8
    aesdec  xmm1, xmm8
    aesdec  xmm2, xmm8
    aesdec  xmm3, xmm8
    aesdec  xmm4, xmm8
    aesdec  xmm5, xmm8
    aesdec  xmm6, xmm8
    aesdec  xmm7, xmm8
    ENDM

aes_dec_last_rnd MACRO i
    movdqu  xmm8, [i*16 + ctx]
    aesdeclast  xmm0, xmm8
    aesdeclast  xmm1, xmm8
    aesdeclast  xmm2, xmm8
    aesdeclast  xmm3, xmm8
    aesdeclast  xmm4, xmm8
    aesdeclast  xmm5, xmm8
    aesdeclast  xmm6, xmm8
    aesdeclast  xmm7, xmm8
    ENDM


;-------------------------------------------------------------------------------
; gen_aes_ecb_func enc, rnds
;   Emits the body of an ECB routine.  enc = 1 selects encryption, anything
;   else decryption; rnds is the round count (10 / 12 / 14).
; In   : rcx = context, rdx = output, stack arg 5 = input pointer,
;        stack arg 6 = input length in bytes (32 bits are read).
; Out  : rax = 0.
; Notes: processes eight blocks per iteration while at least 128 bytes remain,
;        then one block at a time.  A trailing fragment shorter than 16 bytes
;        is left untouched.
; Clobb: rcx, rdx, r8, r9, xmm0-xmm5, flags (xmm6-xmm8 are saved / restored).
;-------------------------------------------------------------------------------
gen_aes_ecb_func MACRO enc, rnds

LOCAL   loop8
LOCAL   loop1
LOCAL   bail

        ;; The xor is redundant (the 32-bit load below zero-extends into r9);
        ;; it is kept so the emitted instruction stream stays unchanged.
        xor     inputLen, inputLen
        mov     input,      [rsp + 1*8 + 8*4]
        mov     inputLen,   [rsp + 1*8 + 8*5]

        sub     rsp, 3*16

        movdqu  [rsp + 0*16], xmm6
        movdqu  [rsp + 1*16], xmm7
        movdqu  [rsp + 2*16], xmm8

        ;; ctx now points at round key 0
        lea     ctx, [48+ctx]

loop8:
        cmp     inputLen, 8*16
        jb      loop1

        movdqu  xmm0, [0*16 + input]
        movdqu  xmm1, [1*16 + input]
        movdqu  xmm2, [2*16 + input]
        movdqu  xmm3, [3*16 + input]
        movdqu  xmm4, [4*16 + input]
        movdqu  xmm5, [5*16 + input]
        movdqu  xmm6, [6*16 + input]
        movdqu  xmm7, [7*16 + input]

        ;; initial whitening with round key 0
        movdqu  xmm8, [0*16 + ctx]
        pxor    xmm0, xmm8
        pxor    xmm1, xmm8
        pxor    xmm2, xmm8
        pxor    xmm3, xmm8
        pxor    xmm4, xmm8
        pxor    xmm5, xmm8
        pxor    xmm6, xmm8
        pxor    xmm7, xmm8

;; Select the round macros and instructions for the requested direction.
IF enc eq 1
        rnd         textequ <aes_rnd>
        lastrnd     textequ <aes_last_rnd>
        aesinst     textequ <aesenc>
        aeslastinst textequ <aesenclast>
ELSE
        rnd         textequ <aes_dec_rnd>
        lastrnd     textequ <aes_dec_last_rnd>
        aesinst     textequ <aesdec>
        aeslastinst textequ <aesdeclast>
ENDIF

        i = 1
        WHILE i LT rnds
            rnd i
            i = i+1
            ENDM
        lastrnd rnds

        movdqu  [0*16 + output], xmm0
        movdqu  [1*16 + output], xmm1
        movdqu  [2*16 + output], xmm2
        movdqu  [3*16 + output], xmm3
        movdqu  [4*16 + output], xmm4
        movdqu  [5*16 + output], xmm5
        movdqu  [6*16 + output], xmm6
        movdqu  [7*16 + output], xmm7

        lea     input, [8*16 + input]
        lea     output, [8*16 + output]
        sub     inputLen, 8*16
        jmp     loop8

loop1:
        cmp     inputLen, 1*16
        jb      bail

        movdqu  xmm0, [input]
        movdqu  xmm7, [0*16 + ctx]
        pxor    xmm0, xmm7

        i = 1
        WHILE i LT rnds
            movdqu  xmm7, [i*16 + ctx]
            aesinst  xmm0, xmm7
            i = i+1
            ENDM
        movdqu  xmm7, [rnds*16 + ctx]
        aeslastinst xmm0, xmm7

        movdqu  [output], xmm0

        lea     input, [1*16 + input]
        lea     output, [1*16 + output]
        sub     inputLen, 1*16
        jmp     loop1

bail:
        xor     rax, rax

        movdqu  xmm6, [rsp + 0*16]
        movdqu  xmm7, [rsp + 1*16]
        movdqu  xmm8, [rsp + 2*16]
        add     rsp, 3*16
        ret
ENDM

intel_aes_encrypt_ecb_128 PROC
gen_aes_ecb_func 1, 10
intel_aes_encrypt_ecb_128 ENDP

intel_aes_encrypt_ecb_192 PROC
gen_aes_ecb_func 1, 12
intel_aes_encrypt_ecb_192 ENDP

intel_aes_encrypt_ecb_256 PROC
gen_aes_ecb_func 1, 14
intel_aes_encrypt_ecb_256 ENDP

intel_aes_decrypt_ecb_128 PROC
gen_aes_ecb_func 0, 10
intel_aes_decrypt_ecb_128 ENDP

intel_aes_decrypt_ecb_192 PROC
gen_aes_ecb_func 0, 12
intel_aes_decrypt_ecb_192 ENDP

intel_aes_decrypt_ecb_256 PROC
gen_aes_ecb_func 0, 14
intel_aes_decrypt_ecb_256 ENDP


; Register roles for the key-schedule routines.
;   KEY = raw key bytes (arg 1), KS = destination key schedule (arg 2),
;   ITR = scratch: constant address, then loop counter.
KEY textequ <rcx>
KS  textequ <rdx>
ITR textequ <r8>

;-------------------------------------------------------------------------------
; intel_aes_encrypt_init_128
;   Expands a 16-byte key at [KEY] into 11 round keys (176 bytes) at [KS].
; Technique: pshufb broadcasts RotWord of the last key word into every lane,
;   aesenclast against the replicated round constant then yields
;   SubWord(RotWord(w)) xor rcon in every lane (ShiftRows has no effect when all
;   four columns are equal).  Three pslldq/pxor steps form the running xor of
;   the previous round key words.
; Clobb: rdx (KS), r8 (ITR), xmm0-xmm4, flags.
;-------------------------------------------------------------------------------
intel_aes_encrypt_init_128 PROC

    movdqu  xmm1, [KEY]
    movdqu  [KS], xmm1                  ; round key 0 = raw key
    movdqa  xmm2, xmm1

    lea ITR, Lcon1
    movdqa  xmm0, [ITR]                 ; xmm0 = rcon, starts at 1
    lea ITR, Lmask
    movdqa  xmm4, [ITR]                 ; xmm4 = RotWord broadcast mask

    mov ITR, 8                          ; round keys 1..8 (rcon 01h..80h)

Lenc_128_ks_loop:
        lea KS, [16 + KS]
        dec ITR                         ; ZF consumed by jne below; none of the
                                        ; SSE/AES instructions in between
                                        ; modify flags

        pshufb  xmm2, xmm4
        aesenclast  xmm2, xmm0
        pslld   xmm0, 1                 ; rcon *= 2
        movdqa  xmm3, xmm1
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
        pxor    xmm1, xmm2
        movdqu  [KS], xmm1
        movdqa  xmm2, xmm1

        jne Lenc_128_ks_loop

    ; Round key 9 uses rcon 1bh, round key 10 uses rcon 36h.
    lea ITR, Lcon2
    movdqa  xmm0, [ITR]

    pshufb  xmm2, xmm4
    aesenclast  xmm2, xmm0
    pslld   xmm0, 1
    movdqa  xmm3, xmm1
    pslldq  xmm3, 4
    pxor    xmm1, xmm3
    pslldq  xmm3, 4
    pxor    xmm1, xmm3
    pslldq  xmm3, 4
    pxor    xmm1, xmm3
    pxor    xmm1, xmm2
    movdqu  [16 + KS], xmm1
    movdqa  xmm2, xmm1

    pshufb  xmm2, xmm4
    aesenclast  xmm2, xmm0
    movdqa  xmm3, xmm1
    pslldq  xmm3, 4
    pxor    xmm1, xmm3
    pslldq  xmm3, 4
    pxor    xmm1, xmm3
    pslldq  xmm3, 4
    pxor    xmm1, xmm3
    pxor    xmm1, xmm2
    movdqu  [32 + KS], xmm1
    movdqa  xmm2, xmm1

    ret
intel_aes_encrypt_init_128 ENDP


;-------------------------------------------------------------------------------
; intel_aes_decrypt_init_128
;   Builds the decryption schedule at [KS]: runs the encryption expansion,
;   reverses the order of the 11 round keys and applies aesimc to every key
;   except the first and last.
; Notes: KS and KEY are pushed around the call because the encryption routine
;   advances KS.  The call is made without shadow space; the callee does not
;   use any.
; Clobb: r8, xmm0-xmm4, flags.
;-------------------------------------------------------------------------------
intel_aes_decrypt_init_128 PROC

    push    KS
    push    KEY

    call    intel_aes_encrypt_init_128

    pop     KEY
    pop     KS

    ; swap first and last round keys (no aesimc on these two)
    movdqu  xmm0, [0*16 + KS]
    movdqu  xmm1, [10*16 + KS]
    movdqu  [10*16 + KS], xmm0
    movdqu  [0*16 + KS], xmm1

    ; swap keys i and 10-i, applying aesimc to both
    i = 1
    WHILE i LT 5
        movdqu  xmm0, [i*16 + KS]
        movdqu  xmm1, [(10-i)*16 + KS]

        aesimc  xmm0, xmm0
        aesimc  xmm1, xmm1

        movdqu  [(10-i)*16 + KS], xmm0
        movdqu  [i*16 + KS], xmm1

        i = i+1
    ENDM

    ; middle key stays in place
    movdqu  xmm0, [5*16 + KS]
    aesimc  xmm0, xmm0
    movdqu  [5*16 + KS], xmm0
    ret
intel_aes_decrypt_init_128 ENDP


;-------------------------------------------------------------------------------
; intel_aes_encrypt_init_192
;   Expands a 24-byte key at [KEY] into 13 round keys (208 bytes) at [KS].
;   xmm1 holds key words 0-3, the low qword of xmm3 holds words 4-5.  Each loop
;   iteration performs two expansion half-steps (six new words each) and packs
;   the result into three 16-byte slots with shufpd.
; Notes: the final store after the loop writes 16 bytes at offset 208, i.e. one
;   slot past round key 12, so the destination must have room for at least
;   224 bytes.
; Clobb: rdx (KS), r8 (ITR), xmm0-xmm5, flags (xmm6/xmm7 saved / restored).
;-------------------------------------------------------------------------------
intel_aes_encrypt_init_192 PROC

    sub     rsp, 16*2
    movdqu  [16*0 + rsp], xmm6
    movdqu  [16*1 + rsp], xmm7

    movdqu  xmm1, [KEY]
    mov     ITR, [16 + KEY]             ; upper 8 key bytes
    movd    xmm3, ITR

    movdqu  [KS], xmm1
    movdqa  xmm5, xmm3

    lea     ITR, Lcon1
    movdqu  xmm0, [ITR]
    lea     ITR, Lmask192
    movdqu  xmm4, [ITR]

    mov     ITR, 4

Lenc_192_ks_loop:
        ; first half-step
        movdqa  xmm2, xmm3
        pshufb  xmm2, xmm4
        aesenclast xmm2, xmm0
        pslld   xmm0, 1

        movdqa  xmm6, xmm1
        movdqa  xmm7, xmm3
        pslldq  xmm6, 4
        pslldq  xmm7, 4
        pxor    xmm1, xmm6
        pxor    xmm3, xmm7
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pxor    xmm1, xmm2
        pshufd  xmm2, xmm1, 0ffh        ; broadcast newest word of xmm1
        pxor    xmm3, xmm2

        ; pack: slot 1 = {old xmm3.lo, xmm1.lo}, slot 2 = {xmm1.hi, xmm3.lo}
        movdqa  xmm6, xmm1
        shufpd  xmm5, xmm1, 00h
        shufpd  xmm6, xmm3, 01h

        movdqu  [16 + KS], xmm5
        movdqu  [32 + KS], xmm6

        ; second half-step
        movdqa  xmm2, xmm3
        pshufb  xmm2, xmm4
        aesenclast  xmm2, xmm0
        pslld   xmm0, 1

        movdqa  xmm6, xmm1
        movdqa  xmm7, xmm3
        pslldq  xmm6, 4
        pslldq  xmm7, 4
        pxor    xmm1, xmm6
        pxor    xmm3, xmm7
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pxor    xmm1, xmm2
        pshufd  xmm2, xmm1, 0ffh
        pxor    xmm3, xmm2

        movdqu  [48 + KS], xmm1
        movdqa  xmm5, xmm3

        lea     KS, [48 + KS]

        dec     ITR
        jnz     Lenc_192_ks_loop

    movdqu  [16 + KS], xmm5             ; lands at offset 208 (see Notes)

    movdqu  xmm7, [16*1 + rsp]
    movdqu  xmm6, [16*0 + rsp]
    add rsp, 16*2
    ret
intel_aes_encrypt_init_192 ENDP

;-------------------------------------------------------------------------------
; intel_aes_decrypt_init_192
;   Decryption schedule for a 24-byte key: encryption expansion, then the
;   13 round keys are reversed and aesimc is applied to all but the first and
;   last.  Same calling pattern as intel_aes_decrypt_init_128.
; Clobb: r8, xmm0-xmm5, flags.
;-------------------------------------------------------------------------------
intel_aes_decrypt_init_192 PROC
    push    KS
    push    KEY

    call    intel_aes_encrypt_init_192

    pop     KEY
    pop     KS

    movdqu  xmm0, [0*16 + KS]
    movdqu  xmm1, [12*16 + KS]
    movdqu  [12*16 + KS], xmm0
    movdqu  [0*16 + KS], xmm1

    i = 1
    WHILE i LT 6
        movdqu  xmm0, [i*16 + KS]
        movdqu  xmm1, [(12-i)*16 + KS]

        aesimc  xmm0, xmm0
        aesimc  xmm1, xmm1

        movdqu  [(12-i)*16 + KS], xmm0
        movdqu  [i*16 + KS], xmm1

        i = i+1
    ENDM

    movdqu  xmm0, [6*16 + KS]
    aesimc  xmm0, xmm0
    movdqu  [6*16 + KS], xmm0
    ret
intel_aes_decrypt_init_192 ENDP


;-------------------------------------------------------------------------------
; intel_aes_encrypt_init_256
;   Expands a 32-byte key at [KEY] into 15 round keys (240 bytes) at [KS].
;   xmm1 / xmm3 hold the even / odd round key being extended.  Even keys use
;   RotWord + SubWord + rcon; odd keys use SubWord only (aesenclast against the
;   all-zero xmm6 after a pshufd broadcast).
; Notes: xmm7 is saved and restored although this routine never modifies it.
; Clobb: rdx (KS), r8 (ITR), xmm0-xmm5, flags (xmm6/xmm7 saved / restored).
;-------------------------------------------------------------------------------
intel_aes_encrypt_init_256 PROC
    sub     rsp, 16*2
    movdqu  [16*0 + rsp], xmm6
    movdqu  [16*1 + rsp], xmm7

    movdqu  xmm1, [16*0 + KEY]
    movdqu  xmm3, [16*1 + KEY]

    movdqu  [16*0 + KS], xmm1
    movdqu  [16*1 + KS], xmm3

    lea     ITR, Lcon1
    movdqu  xmm0, [ITR]
    lea     ITR, Lmask256
    movdqu  xmm5, [ITR]

    pxor    xmm6, xmm6                  ; zero round key for the SubWord step

    mov     ITR, 6                      ; six pairs: round keys 2..13

Lenc_256_ks_loop:

        ; even round key
        movdqa  xmm2, xmm3
        pshufb  xmm2, xmm5
        aesenclast  xmm2, xmm0
        pslld   xmm0, 1
        movdqa  xmm4, xmm1
        pslldq  xmm4, 4
        pxor    xmm1, xmm4
        pslldq  xmm4, 4
        pxor    xmm1, xmm4
        pslldq  xmm4, 4
        pxor    xmm1, xmm4
        pxor    xmm1, xmm2
        movdqu  [16*2 + KS], xmm1

        ; odd round key
        pshufd  xmm2, xmm1, 0ffh
        aesenclast  xmm2, xmm6
        movdqa  xmm4, xmm3
        pslldq  xmm4, 4
        pxor    xmm3, xmm4
        pslldq  xmm4, 4
        pxor    xmm3, xmm4
        pslldq  xmm4, 4
        pxor    xmm3, xmm4
        pxor    xmm3, xmm2
        movdqu  [16*3 + KS], xmm3

        lea KS, [32 + KS]
        dec ITR
        jnz Lenc_256_ks_loop

    ; round key 14
    movdqa  xmm2, xmm3
    pshufb  xmm2, xmm5
    aesenclast  xmm2, xmm0
    movdqa  xmm4, xmm1
    pslldq  xmm4, 4
    pxor    xmm1, xmm4
    pslldq  xmm4, 4
    pxor    xmm1, xmm4
    pslldq  xmm4, 4
    pxor    xmm1, xmm4
    pxor    xmm1, xmm2
    movdqu  [16*2 + KS], xmm1

    movdqu  xmm7, [16*1 + rsp]
    movdqu  xmm6, [16*0 + rsp]
    add rsp, 16*2
    ret

intel_aes_encrypt_init_256 ENDP


;-------------------------------------------------------------------------------
; intel_aes_decrypt_init_256
;   Decryption schedule for a 32-byte key: encryption expansion, then the
;   15 round keys are reversed and aesimc is applied to all but the first and
;   last.  Same calling pattern as intel_aes_decrypt_init_128.
; Clobb: r8, xmm0-xmm5, flags.
;-------------------------------------------------------------------------------
intel_aes_decrypt_init_256 PROC
    push    KS
    push    KEY

    call    intel_aes_encrypt_init_256

    pop     KEY
    pop     KS

    movdqu  xmm0, [0*16 + KS]
    movdqu  xmm1, [14*16 + KS]
    movdqu  [14*16 + KS], xmm0
    movdqu  [0*16 + KS], xmm1

    i = 1
    WHILE i LT 7
        movdqu  xmm0, [i*16 + KS]
        movdqu  xmm1, [(14-i)*16 + KS]

        aesimc  xmm0, xmm0
        aesimc  xmm1, xmm1

        movdqu  [(14-i)*16 + KS], xmm0
        movdqu  [i*16 + KS], xmm1

        i = i+1
    ENDM

    movdqu  xmm0, [7*16 + KS]
    aesimc  xmm0, xmm0
    movdqu  [7*16 + KS], xmm0
    ret
intel_aes_decrypt_init_256 ENDP



;-------------------------------------------------------------------------------
; gen_aes_cbc_enc_func rnds
;   Emits the body of a CBC encryption routine for the given round count.
; In   : rcx = context, rdx = output, stack arg 5 = input pointer,
;        stack arg 6 = input length in bytes (32 bits are read).
; Out  : rax = 0; the chaining block at context + 16 is replaced by the last
;        ciphertext block produced (unchanged if no full block was processed).
; Notes: CBC encryption is serial, so one block is processed per iteration.
;        Round keys 0-5 stay resident in xmm2-xmm7; the rest are reloaded via
;        xmm8.  A trailing fragment shorter than 16 bytes is left untouched.
; Clobb: rcx, rdx, r8, r9, xmm0-xmm5, flags (xmm6-xmm8 are saved / restored).
;-------------------------------------------------------------------------------
gen_aes_cbc_enc_func MACRO rnds

LOCAL   loop1
LOCAL   bail

        mov     input,      [rsp + 1*8 + 8*4]
        mov     inputLen,   [rsp + 1*8 + 8*5]

        sub     rsp, 3*16

        movdqu  [rsp + 0*16], xmm6
        movdqu  [rsp + 1*16], xmm7
        movdqu  [rsp + 2*16], xmm8

        lea     ctx, [48+ctx]

        ;; xmm0 = chaining block (context + 16)
        movdqu  xmm0, [-32+ctx]

        movdqu  xmm2, [0*16 + ctx]
        movdqu  xmm3, [1*16 + ctx]
        movdqu  xmm4, [2*16 + ctx]
        movdqu  xmm5, [3*16 + ctx]
        movdqu  xmm6, [4*16 + ctx]
        movdqu  xmm7, [5*16 + ctx]

loop1:
        cmp     inputLen, 1*16
        jb      bail

        ;; xmm0 = plaintext xor round key 0 xor previous ciphertext
        movdqu  xmm1, [input]
        pxor    xmm1, xmm2
        pxor    xmm0, xmm1

        aesenc  xmm0, xmm3
        aesenc  xmm0, xmm4
        aesenc  xmm0, xmm5
        aesenc  xmm0, xmm6
        aesenc  xmm0, xmm7

        i = 6
    WHILE i LT rnds
        movdqu  xmm8, [i*16 + ctx]
        aesenc  xmm0, xmm8
        i = i+1
    ENDM
        movdqu  xmm8, [rnds*16 + ctx]
        aesenclast xmm0, xmm8

        movdqu  [output], xmm0

        lea     input, [1*16 + input]
        lea     output, [1*16 + output]
        sub     inputLen, 1*16
        jmp     loop1

bail:
        movdqu  [-32+ctx], xmm0

        xor     rax, rax

        movdqu  xmm6, [rsp + 0*16]
        movdqu  xmm7, [rsp + 1*16]
        movdqu  xmm8, [rsp + 2*16]
        add     rsp, 3*16
        ret

ENDM

;-------------------------------------------------------------------------------
; gen_aes_cbc_dec_func rnds
;   Emits the body of a CBC decryption routine for the given round count.
; In   : rcx = context, rdx = output, stack arg 5 = input pointer,
;        stack arg 6 = input length in bytes (32 bits are read).
; Out  : rax = 0; the chaining block at context + 16 is replaced by the last
;        ciphertext block consumed.
; Notes: eight blocks per iteration while at least 128 bytes remain, then one
;        block at a time.  In both loops the ciphertext needed for chaining is
;        read before the corresponding output is stored, so output == input is
;        handled.  A trailing fragment shorter than 16 bytes is left untouched.
; Clobb: rcx, rdx, r8, r9, xmm0-xmm5, flags (xmm6-xmm8 are saved / restored).
;-------------------------------------------------------------------------------
gen_aes_cbc_dec_func MACRO rnds

LOCAL   loop8
LOCAL   loop1
LOCAL   dec1
LOCAL   bail

        mov     input,      [rsp + 1*8 + 8*4]
        mov     inputLen,   [rsp + 1*8 + 8*5]

        sub     rsp, 3*16

        movdqu  [rsp + 0*16], xmm6
        movdqu  [rsp + 1*16], xmm7
        movdqu  [rsp + 2*16], xmm8

        lea     ctx, [48+ctx]

loop8:
        cmp     inputLen, 8*16
        jb      dec1

        movdqu  xmm0, [0*16 + input]
        movdqu  xmm1, [1*16 + input]
        movdqu  xmm2, [2*16 + input]
        movdqu  xmm3, [3*16 + input]
        movdqu  xmm4, [4*16 + input]
        movdqu  xmm5, [5*16 + input]
        movdqu  xmm6, [6*16 + input]
        movdqu  xmm7, [7*16 + input]

        movdqu  xmm8, [0*16 + ctx]
        pxor    xmm0, xmm8
        pxor    xmm1, xmm8
        pxor    xmm2, xmm8
        pxor    xmm3, xmm8
        pxor    xmm4, xmm8
        pxor    xmm5, xmm8
        pxor    xmm6, xmm8
        pxor    xmm7, xmm8

        i = 1
        WHILE i LT rnds
            aes_dec_rnd i
            i = i+1
            ENDM
        aes_dec_last_rnd rnds

        ;; undo chaining: block 0 uses the saved chaining block, block k uses
        ;; ciphertext block k-1; ciphertext block 7 becomes the new chain value
        movdqu  xmm8, [-32 + ctx]
        pxor    xmm0, xmm8
        movdqu  xmm8, [0*16 + input]
        pxor    xmm1, xmm8
        movdqu  xmm8, [1*16 + input]
        pxor    xmm2, xmm8
        movdqu  xmm8, [2*16 + input]
        pxor    xmm3, xmm8
        movdqu  xmm8, [3*16 + input]
        pxor    xmm4, xmm8
        movdqu  xmm8, [4*16 + input]
        pxor    xmm5, xmm8
        movdqu  xmm8, [5*16 + input]
        pxor    xmm6, xmm8
        movdqu  xmm8, [6*16 + input]
        pxor    xmm7, xmm8
        movdqu  xmm8, [7*16 + input]

        movdqu  [0*16 + output], xmm0
        movdqu  [1*16 + output], xmm1
        movdqu  [2*16 + output], xmm2
        movdqu  [3*16 + output], xmm3
        movdqu  [4*16 + output], xmm4
        movdqu  [5*16 + output], xmm5
        movdqu  [6*16 + output], xmm6
        movdqu  [7*16 + output], xmm7
        movdqu  [-32 + ctx], xmm8

        lea     input, [8*16 + input]
        lea     output, [8*16 + output]
        sub     inputLen, 8*16
        jmp     loop8
dec1:

        ;; xmm3 = chaining block for the single-block loop
        movdqu  xmm3, [-32 + ctx]

loop1:
        cmp     inputLen, 1*16
        jb      bail

        movdqu  xmm0, [input]
        movdqa  xmm4, xmm0              ;; keep ciphertext for the next block
        movdqu  xmm7, [0*16 + ctx]
        pxor    xmm0, xmm7

        i = 1
    WHILE i LT rnds
        movdqu  xmm7, [i*16 + ctx]
        aesdec  xmm0, xmm7
        i = i+1
    ENDM
        movdqu  xmm7, [rnds*16 + ctx]
        aesdeclast xmm0, xmm7
        pxor    xmm3, xmm0

        movdqu  [output], xmm3
        movdqa  xmm3, xmm4

        lea     input, [1*16 + input]
        lea     output, [1*16 + output]
        sub     inputLen, 1*16
        jmp     loop1

bail:
        movdqu  [-32 + ctx], xmm3
        xor     rax, rax

        movdqu  xmm6, [rsp + 0*16]
        movdqu  xmm7, [rsp + 1*16]
        movdqu  xmm8, [rsp + 2*16]
        add     rsp, 3*16
        ret
ENDM

intel_aes_encrypt_cbc_128 PROC
gen_aes_cbc_enc_func  10
intel_aes_encrypt_cbc_128 ENDP

intel_aes_encrypt_cbc_192 PROC
gen_aes_cbc_enc_func  12
intel_aes_encrypt_cbc_192 ENDP

intel_aes_encrypt_cbc_256 PROC
gen_aes_cbc_enc_func  14
intel_aes_encrypt_cbc_256 ENDP

intel_aes_decrypt_cbc_128 PROC
gen_aes_cbc_dec_func  10
intel_aes_decrypt_cbc_128 ENDP

intel_aes_decrypt_cbc_192 PROC
gen_aes_cbc_dec_func  12
intel_aes_decrypt_cbc_192 ENDP

intel_aes_decrypt_cbc_256 PROC
gen_aes_cbc_dec_func  14
intel_aes_decrypt_cbc_256 ENDP



; Additional register roles for the CTR routines.
;   ctrCtx  = CTR context pointer (copied from rcx on entry)
;   CTR     = scratch for one byte-swapped, whitened counter word
;   CTRSave = host-order value of the 32-bit counter most recently issued
ctrCtx  textequ <r10>
CTR     textequ <r11d>
CTRSave textequ <eax>

;-------------------------------------------------------------------------------
; gen_aes_ctr_func rnds
;   Emits the body of a CTR routine for the given round count.
; In   : rcx = CTR context, rdx = output, stack arg 5 = input pointer,
;        stack arg 6 = input length in bytes (32 bits are read).
; Out  : rax = 0; the counter block at CTR context + 16 is advanced by the
;        number of full blocks processed.
; Stack: below the xmm6-xmm8 spill area and the saved rbp, a 16-byte aligned
;        128-byte area holds the next eight counter blocks, each already xored
;        with round key 0.  The eight-block loop refreshes that area while the
;        AES rounds run; the single-block loop pops one entry per block by
;        advancing rsp (at most seven times, since fewer than 128 bytes remain
;        by then).  rbp restores rsp on exit.
; Notes: only the last dword of the counter block is incremented (big-endian);
;        no carry is propagated into the other 96 bits.
;        NOTE(review): confirm the caller splits requests at the 32-bit wrap.
;        A trailing fragment shorter than 16 bytes is left untouched.
; Clobb: rax, rcx, rdx, r8, r9, r10, r11, xmm0-xmm5, flags
;        (rbp and xmm6-xmm8 are saved / restored).
;-------------------------------------------------------------------------------
gen_aes_ctr_func MACRO rnds

;; enc1 is declared but never defined or referenced in this macro.
LOCAL   loop8
LOCAL   loop1
LOCAL   enc1
LOCAL   bail

        mov     input,      [rsp + 8*1 + 4*8]
        mov     inputLen,   [rsp + 8*1 + 5*8]

        mov     ctrCtx, ctx
        mov     ctx, [8+ctrCtx]
        lea     ctx, [48+ctx]

        sub     rsp, 3*16
        movdqu  [rsp + 0*16], xmm6
        movdqu  [rsp + 1*16], xmm7
        movdqu  [rsp + 2*16], xmm8


        push    rbp
        mov     rbp, rsp
        sub     rsp, 8*16
        and     rsp, -16


        movdqu  xmm0, [16+ctrCtx]
        mov     CTRSave, DWORD PTR [ctrCtx + 16 + 3*4]
        bswap   CTRSave
        movdqu  xmm1, [ctx + 0*16]

        ;; xmm0 = counter block xor round key 0
        pxor    xmm0, xmm1

        movdqa  [rsp + 0*16], xmm0
        movdqa  [rsp + 1*16], xmm0
        movdqa  [rsp + 2*16], xmm0
        movdqa  [rsp + 3*16], xmm0
        movdqa  [rsp + 4*16], xmm0
        movdqa  [rsp + 5*16], xmm0
        movdqa  [rsp + 6*16], xmm0
        movdqa  [rsp + 7*16], xmm0

        ;; slots 1..7: patch in counter+1 .. counter+7, whitened with the last
        ;; dword of round key 0
        inc     CTRSave
        mov     CTR, CTRSave
        bswap   CTR
        xor     CTR, DWORD PTR [ctx + 3*4]
        mov     DWORD PTR [rsp + 1*16 + 3*4], CTR

        inc     CTRSave
        mov     CTR, CTRSave
        bswap   CTR
        xor     CTR, DWORD PTR [ctx + 3*4]
        mov     DWORD PTR [rsp + 2*16 + 3*4], CTR

        inc     CTRSave
        mov     CTR, CTRSave
        bswap   CTR
        xor     CTR, DWORD PTR [ctx + 3*4]
        mov     DWORD PTR [rsp + 3*16 + 3*4], CTR

        inc     CTRSave
        mov     CTR, CTRSave
        bswap   CTR
        xor     CTR, DWORD PTR [ctx + 3*4]
        mov     DWORD PTR [rsp + 4*16 + 3*4], CTR

        inc     CTRSave
        mov     CTR, CTRSave
        bswap   CTR
        xor     CTR, DWORD PTR [ctx + 3*4]
        mov     DWORD PTR [rsp + 5*16 + 3*4], CTR

        inc     CTRSave
        mov     CTR, CTRSave
        bswap   CTR
        xor     CTR, DWORD PTR [ctx + 3*4]
        mov     DWORD PTR [rsp + 6*16 + 3*4], CTR

        inc     CTRSave
        mov     CTR, CTRSave
        bswap   CTR
        xor     CTR, DWORD PTR [ctx + 3*4]
        mov     DWORD PTR [rsp + 7*16 + 3*4], CTR


loop8:
        cmp     inputLen, 8*16
        jb      loop1

        ;; the slots are pre-whitened, so no round key 0 xor is needed here
        movdqu  xmm0, [0*16 + rsp]
        movdqu  xmm1, [1*16 + rsp]
        movdqu  xmm2, [2*16 + rsp]
        movdqu  xmm3, [3*16 + rsp]
        movdqu  xmm4, [4*16 + rsp]
        movdqu  xmm5, [5*16 + rsp]
        movdqu  xmm6, [6*16 + rsp]
        movdqu  xmm7, [7*16 + rsp]

        ;; rounds 1..8, each interleaved with the refresh of one counter slot
        i = 1
        WHILE i LE 8
            aes_rnd i

            inc     CTRSave
            mov     CTR, CTRSave
            bswap   CTR
            xor     CTR, DWORD PTR [ctx + 3*4]
            mov     DWORD PTR [rsp + (i-1)*16 + 3*4], CTR

            i = i+1
        ENDM
        ;; remaining rounds
        WHILE i LT rnds
            aes_rnd i
            i = i+1
            ENDM
        aes_last_rnd rnds

        movdqu  xmm8, [0*16 + input]
        pxor    xmm0, xmm8
        movdqu  xmm8, [1*16 + input]
        pxor    xmm1, xmm8
        movdqu  xmm8, [2*16 + input]
        pxor    xmm2, xmm8
        movdqu  xmm8, [3*16 + input]
        pxor    xmm3, xmm8
        movdqu  xmm8, [4*16 + input]
        pxor    xmm4, xmm8
        movdqu  xmm8, [5*16 + input]
        pxor    xmm5, xmm8
        movdqu  xmm8, [6*16 + input]
        pxor    xmm6, xmm8
        movdqu  xmm8, [7*16 + input]
        pxor    xmm7, xmm8

        movdqu  [0*16 + output], xmm0
        movdqu  [1*16 + output], xmm1
        movdqu  [2*16 + output], xmm2
        movdqu  [3*16 + output], xmm3
        movdqu  [4*16 + output], xmm4
        movdqu  [5*16 + output], xmm5
        movdqu  [6*16 + output], xmm6
        movdqu  [7*16 + output], xmm7

        lea     input, [8*16 + input]
        lea     output, [8*16 + output]
        sub     inputLen, 8*16
        jmp     loop8


loop1:
        cmp     inputLen, 1*16
        jb      bail

        ;; take the next pre-whitened counter block and pop it off the area
        movdqu  xmm0, [rsp]
        add     rsp, 16

        i = 1
    WHILE i LT rnds
        movdqu  xmm7, [i*16 + ctx]
        aesenc  xmm0, xmm7
        i = i+1
    ENDM
        movdqu  xmm7, [rnds*16 + ctx]
        aesenclast xmm0, xmm7

        movdqu  xmm7, [input]
        pxor    xmm0, xmm7
        movdqu  [output], xmm0

        lea     input, [1*16 + input]
        lea     output, [1*16 + output]
        sub     inputLen, 1*16
        jmp     loop1

bail:

        ;; [rsp] is the next unused counter block; remove the round key 0
        ;; whitening and store it back into the CTR context
        movdqu  xmm0, [rsp]
        movdqu  xmm1, [ctx + 0*16]
        pxor    xmm0, xmm1
        movdqu  [16+ctrCtx], xmm0


        xor     rax, rax
        mov     rsp, rbp
        pop     rbp

        movdqu  xmm6, [rsp + 0*16]
        movdqu  xmm7, [rsp + 1*16]
        movdqu  xmm8, [rsp + 2*16]
        add     rsp, 3*16

        ret
ENDM


intel_aes_encrypt_ctr_128 PROC
gen_aes_ctr_func  10
intel_aes_encrypt_ctr_128 ENDP

intel_aes_encrypt_ctr_192 PROC
gen_aes_ctr_func  12
intel_aes_encrypt_ctr_192 ENDP

intel_aes_encrypt_ctr_256 PROC
gen_aes_ctr_func  14
intel_aes_encrypt_ctr_256 ENDP


END