; LICENSE:
; This submission to NSS is to be made available under the terms of the
; Mozilla Public License, v. 2.0. You can obtain one at
; http://mozilla.org/MPL/2.0/.
;###############################################################################
; Copyright(c) 2014, Intel Corp.
; Developers and authors:
; Shay Gueron and Vlad Krasnov
; Intel Corporation, Israel Development Centre, Haifa, Israel
; Please send feedback directly to crypto.feedback.alias@intel.com

;###############################################################################
; AES-NI primitives for 32-bit x86, MASM (Intel) syntax.
;
; ABI: flat model, C calling convention (.MODEL FLAT, C). All arguments are
; read from the stack; every routine returns 0 in eax (key-schedule routines
; leave eax unspecified). Callee-saved registers that are used (edi, esi, ebx,
; ebp) are pushed/popped by the routine that uses them.
;
; Block-cipher entry points (ECB / CBC / CTR) read these stack arguments:
;     arg 0 : context pointer
;     arg 1 : output pointer
;     arg 4 : input pointer
;     arg 5 : input length in bytes
; Arguments 2 and 3 (and anything after 5) are never read by this file.
; Only whole 16-byte blocks are processed; trailing bytes are ignored.
;
; Context layout as used by this file (NOTE(review): presumably mirrors the C
; AESContext struct on 32-bit builds -- confirm against the C definition):
;     ctx + 12 : 16-byte CBC chaining value (accessed as [-32 + (ctx+44)])
;     ctx + 44 : round keys, 16 bytes each, round key i at (ctx+44) + i*16
; CTR context layout as used by this file:
;     ctrCtx + 4 : pointer to the AES context above
;     ctrCtx + 8 : 16-byte counter block; only its last dword (big-endian) is
;                  incremented here, with no carry into the other 12 bytes.
;###############################################################################


.MODEL FLAT, C
.XMM

.DATA
ALIGN 16
; pshufb masks: each dword selects source bytes (n+1, n+2, n+3, n) of one
; dword, i.e. a byte-rotated copy of that dword broadcast to all four lanes.
; Lmask / Lmask256 rotate dword 3 (bytes 12..15); Lmask192 rotates dword 1
; (bytes 4..7). Lmask and Lmask256 hold the same value.
Lmask dd 0c0f0e0dh,0c0f0e0dh,0c0f0e0dh,0c0f0e0dh
Lmask192 dd 004070605h, 004070605h, 004070605h, 004070605h
Lmask256 dd 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh
; Round constants, broadcast to all lanes. Lcon1 is doubled with pslld each
; round (1, 2, 4, ...); Lcon2 (1bh) is loaded for the last two AES-128 rounds.
Lcon1 dd 1,1,1,1
Lcon2 dd 1bh,1bh,1bh,1bh

.CODE

; Register roles for the ECB / CBC / CTR routines.
ctx textequ <ecx>
output textequ <edx>
input textequ <eax>
inputLen textequ <edi>


; aes_rnd i: apply encryption round i (round key at [i*16 + ctx]) to the seven
; blocks held in xmm0..xmm6. Clobbers xmm7.
aes_rnd MACRO i
    movdqu xmm7, [i*16 + ctx]
    aesenc xmm0, xmm7
    aesenc xmm1, xmm7
    aesenc xmm2, xmm7
    aesenc xmm3, xmm7
    aesenc xmm4, xmm7
    aesenc xmm5, xmm7
    aesenc xmm6, xmm7
    ENDM

; aes_last_rnd i: final encryption round for xmm0..xmm6. Clobbers xmm7.
aes_last_rnd MACRO i
    movdqu xmm7, [i*16 + ctx]
    aesenclast xmm0, xmm7
    aesenclast xmm1, xmm7
    aesenclast xmm2, xmm7
    aesenclast xmm3, xmm7
    aesenclast xmm4, xmm7
    aesenclast xmm5, xmm7
    aesenclast xmm6, xmm7
    ENDM

; aes_dec_rnd i: apply decryption round i to xmm0..xmm6. Clobbers xmm7.
aes_dec_rnd MACRO i
    movdqu xmm7, [i*16 + ctx]
    aesdec xmm0, xmm7
    aesdec xmm1, xmm7
    aesdec xmm2, xmm7
    aesdec xmm3, xmm7
    aesdec xmm4, xmm7
    aesdec xmm5, xmm7
    aesdec xmm6, xmm7
    ENDM

; aes_dec_last_rnd i: final decryption round for xmm0..xmm6. Clobbers xmm7.
aes_dec_last_rnd MACRO i
    movdqu xmm7, [i*16 + ctx]
    aesdeclast xmm0, xmm7
    aesdeclast xmm1, xmm7
    aesdeclast xmm2, xmm7
    aesdeclast xmm3, xmm7
    aesdeclast xmm4, xmm7
    aesdeclast xmm5, xmm7
    aesdeclast xmm6, xmm7
    ENDM


;-------------------------------------------------------------------------------
; gen_aes_ecb_func enc, rnds
; Emits the body of an ECB routine.
;   enc  : 1 selects the aesenc family, anything else the aesdec family.
;   rnds : number of AES rounds (10, 12 or 14 at the call sites below).
; Processes 7 blocks per iteration while at least 7*16 bytes remain, then one
; block at a time. Returns 0 in eax.
; Clobbers: eax, ecx, edx, xmm0..xmm7, flags (edi is saved/restored).
;-------------------------------------------------------------------------------
gen_aes_ecb_func MACRO enc, rnds

LOCAL loop7
LOCAL loop1
LOCAL bail

    push inputLen

    ; After one push, stack argument k lives at [esp + 2*4 + k*4].
    mov ctx, [esp + 2*4 + 0*4]
    mov output, [esp + 2*4 + 1*4]
    mov input, [esp + 2*4 + 4*4]
    mov inputLen, [esp + 2*4 + 5*4]

    ; ctx now points at round key 0.
    lea ctx, [44+ctx]

loop7:
    cmp inputLen, 7*16
    jb loop1

    movdqu xmm0, [0*16 + input]
    movdqu xmm1, [1*16 + input]
    movdqu xmm2, [2*16 + input]
    movdqu xmm3, [3*16 + input]
    movdqu xmm4, [4*16 + input]
    movdqu xmm5, [5*16 + input]
    movdqu xmm6, [6*16 + input]

    ; Initial whitening with round key 0.
    movdqu xmm7, [0*16 + ctx]
    pxor xmm0, xmm7
    pxor xmm1, xmm7
    pxor xmm2, xmm7
    pxor xmm3, xmm7
    pxor xmm4, xmm7
    pxor xmm5, xmm7
    pxor xmm6, xmm7

; Select the round macros / instructions for this expansion.
IF enc eq 1
    rnd textequ <aes_rnd>
    lastrnd textequ <aes_last_rnd>
    aesinst textequ <aesenc>
    aeslastinst textequ <aesenclast>
ELSE
    rnd textequ <aes_dec_rnd>
    lastrnd textequ <aes_dec_last_rnd>
    aesinst textequ <aesdec>
    aeslastinst textequ <aesdeclast>
ENDIF

    i = 1
    WHILE i LT rnds
        rnd i
        i = i+1
    ENDM
    lastrnd rnds

    movdqu [0*16 + output], xmm0
    movdqu [1*16 + output], xmm1
    movdqu [2*16 + output], xmm2
    movdqu [3*16 + output], xmm3
    movdqu [4*16 + output], xmm4
    movdqu [5*16 + output], xmm5
    movdqu [6*16 + output], xmm6

    lea input, [7*16 + input]
    lea output, [7*16 + output]
    sub inputLen, 7*16
    jmp loop7

loop1:
    ; Fewer than 7 blocks remain: handle them one at a time.
    cmp inputLen, 1*16
    jb bail

    movdqu xmm0, [input]
    movdqu xmm7, [0*16 + ctx]
    pxor xmm0, xmm7

    i = 1
    WHILE i LT rnds
        movdqu xmm7, [i*16 + ctx]
        aesinst xmm0, xmm7
        i = i+1
    ENDM
    movdqu xmm7, [rnds*16 + ctx]
    aeslastinst xmm0, xmm7

    movdqu [output], xmm0

    lea input, [1*16 + input]
    lea output, [1*16 + output]
    sub inputLen, 1*16
    jmp loop1

bail:
    xor eax, eax                ; return 0
    pop inputLen
    ret

ENDM

ALIGN 16
intel_aes_encrypt_ecb_128 PROC
gen_aes_ecb_func 1, 10
intel_aes_encrypt_ecb_128 ENDP

ALIGN 16
intel_aes_encrypt_ecb_192 PROC
gen_aes_ecb_func 1, 12
intel_aes_encrypt_ecb_192 ENDP

ALIGN 16
intel_aes_encrypt_ecb_256 PROC
gen_aes_ecb_func 1, 14
intel_aes_encrypt_ecb_256 ENDP

ALIGN 16
intel_aes_decrypt_ecb_128 PROC
gen_aes_ecb_func 0, 10
intel_aes_decrypt_ecb_128 ENDP

ALIGN 16
intel_aes_decrypt_ecb_192 PROC
gen_aes_ecb_func 0, 12
intel_aes_decrypt_ecb_192 ENDP

ALIGN 16
intel_aes_decrypt_ecb_256 PROC
gen_aes_ecb_func 0, 14
intel_aes_decrypt_ecb_256 ENDP


; Register roles for the key-schedule routines.
KEY textequ <ecx>
KS textequ <edx>
ITR textequ <eax>

;-------------------------------------------------------------------------------
; intel_aes_encrypt_init_128
; Stack args: arg 0 = pointer to the 16-byte key, arg 1 = pointer to the key
; schedule buffer. Writes 11 round keys (176 bytes) to the buffer.
; Clobbers: eax, ecx, edx, xmm0..xmm4, flags.
;-------------------------------------------------------------------------------
ALIGN 16
intel_aes_encrypt_init_128 PROC

    mov KEY, [esp + 1*4 + 0*4]
    mov KS, [esp + 1*4 + 1*4]


    movdqu xmm1, [KEY]
    movdqu [KS], xmm1           ; round key 0 = the cipher key
    movdqa xmm2, xmm1

    lea ITR, Lcon1
    movdqa xmm0, [ITR]          ; xmm0 = round constant (starts at 1)
    lea ITR, Lmask
    movdqa xmm4, [ITR]          ; xmm4 = rotate-and-broadcast shuffle mask

    mov ITR, 8                  ; rounds 1..8 use constants 1..80h

Lenc_128_ks_loop:
    lea KS, [16 + KS]
    ; The flags set by dec are consumed by the jne at the bottom of the loop;
    ; none of the SSE/AES instructions in between modify EFLAGS.
    dec ITR

    ; pshufb + aesenclast on a broadcast word yields the substituted, rotated
    ; last word of the previous round key XORed with the round constant.
    pshufb xmm2, xmm4
    aesenclast xmm2, xmm0
    pslld xmm0, 1               ; next round constant
    ; Prefix-XOR of the four words of the previous round key.
    movdqa xmm3, xmm1
    pslldq xmm3, 4
    pxor xmm1, xmm3
    pslldq xmm3, 4
    pxor xmm1, xmm3
    pslldq xmm3, 4
    pxor xmm1, xmm3
    pxor xmm1, xmm2
    movdqu [KS], xmm1
    movdqa xmm2, xmm1

    jne Lenc_128_ks_loop

    ; Rounds 9 and 10 use constants 1bh and 36h.
    lea ITR, Lcon2
    movdqa xmm0, [ITR]

    pshufb xmm2, xmm4
    aesenclast xmm2, xmm0
    pslld xmm0, 1
    movdqa xmm3, xmm1
    pslldq xmm3, 4
    pxor xmm1, xmm3
    pslldq xmm3, 4
    pxor xmm1, xmm3
    pslldq xmm3, 4
    pxor xmm1, xmm3
    pxor xmm1, xmm2
    movdqu [16 + KS], xmm1
    movdqa xmm2, xmm1

    pshufb xmm2, xmm4
    aesenclast xmm2, xmm0
    movdqa xmm3, xmm1
    pslldq xmm3, 4
    pxor xmm1, xmm3
    pslldq xmm3, 4
    pxor xmm1, xmm3
    pslldq xmm3, 4
    pxor xmm1, xmm3
    pxor xmm1, xmm2
    movdqu [32 + KS], xmm1
    movdqa xmm2, xmm1

    ret
intel_aes_encrypt_init_128 ENDP


;-------------------------------------------------------------------------------
; intel_aes_decrypt_init_128
; Same stack args as intel_aes_encrypt_init_128. Builds the encryption
; schedule, then reverses the order of the 11 round keys and applies aesimc to
; every key except the first and last, producing the schedule consumed by the
; aesdec-based routines.
; Clobbers: eax, ecx, edx, xmm0..xmm4, flags.
;-------------------------------------------------------------------------------
ALIGN 16
intel_aes_decrypt_init_128 PROC

    mov KEY, [esp + 1*4 + 0*4]
    mov KS, [esp + 1*4 + 1*4]

    ; Re-push the two arguments for the callee; popping them afterwards also
    ; restores KS, which the callee advances.
    push KS
    push KEY

    call intel_aes_encrypt_init_128

    pop KEY
    pop KS

    ; Swap first and last round keys (no aesimc on these two).
    movdqu xmm0, [0*16 + KS]
    movdqu xmm1, [10*16 + KS]
    movdqu [10*16 + KS], xmm0
    movdqu [0*16 + KS], xmm1

    i = 1
    WHILE i LT 5
        movdqu xmm0, [i*16 + KS]
        movdqu xmm1, [(10-i)*16 + KS]

        aesimc xmm0, xmm0
        aesimc xmm1, xmm1

        movdqu [(10-i)*16 + KS], xmm0
        movdqu [i*16 + KS], xmm1

        i = i+1
    ENDM

    ; Middle key stays in place.
    movdqu xmm0, [5*16 + KS]
    aesimc xmm0, xmm0
    movdqu [5*16 + KS], xmm0
    ret
intel_aes_decrypt_init_128 ENDP


;-------------------------------------------------------------------------------
; intel_aes_encrypt_init_192
; Stack args: arg 0 = pointer to the 24-byte key, arg 1 = pointer to the key
; schedule buffer. Each loop iteration emits three 16-byte round keys.
; NOTE(review): the final store writes 16 bytes at offset 13*16 of the buffer,
; i.e. past the 13 round keys (208 bytes); the buffer must therefore be at
; least 224 bytes -- confirm against the caller's allocation.
; Clobbers: eax, ecx, edx, xmm0..xmm7, flags.
;-------------------------------------------------------------------------------
ALIGN 16
intel_aes_encrypt_init_192 PROC

    mov KEY, [esp + 1*4 + 0*4]
    mov KS, [esp + 1*4 + 1*4]

    ; xmm1 = key words 0..3, xmm3 = key words 4..5 in its low half.
    pxor xmm3, xmm3
    movdqu xmm1, [KEY]
    pinsrd xmm3, DWORD PTR [16 + KEY], 0
    pinsrd xmm3, DWORD PTR [20 + KEY], 1

    movdqu [KS], xmm1
    movdqa xmm5, xmm3

    lea ITR, Lcon1
    movdqu xmm0, [ITR]
    lea ITR, Lmask192
    movdqu xmm4, [ITR]

    mov ITR, 4

Lenc_192_ks_loop:
    ; First expansion step of this iteration.
    movdqa xmm2, xmm3
    pshufb xmm2, xmm4
    aesenclast xmm2, xmm0
    pslld xmm0, 1

    movdqa xmm6, xmm1
    movdqa xmm7, xmm3
    pslldq xmm6, 4
    pslldq xmm7, 4
    pxor xmm1, xmm6
    pxor xmm3, xmm7
    pslldq xmm6, 4
    pxor xmm1, xmm6
    pslldq xmm6, 4
    pxor xmm1, xmm6
    pxor xmm1, xmm2
    pshufd xmm2, xmm1, 0ffh
    pxor xmm3, xmm2

    ; Repack the 6-word groups into 16-byte round keys.
    movdqa xmm6, xmm1
    shufpd xmm5, xmm1, 00h
    shufpd xmm6, xmm3, 01h

    movdqu [16 + KS], xmm5
    movdqu [32 + KS], xmm6

    ; Second expansion step of this iteration.
    movdqa xmm2, xmm3
    pshufb xmm2, xmm4
    aesenclast xmm2, xmm0
    pslld xmm0, 1

    movdqa xmm6, xmm1
    movdqa xmm7, xmm3
    pslldq xmm6, 4
    pslldq xmm7, 4
    pxor xmm1, xmm6
    pxor xmm3, xmm7
    pslldq xmm6, 4
    pxor xmm1, xmm6
    pslldq xmm6, 4
    pxor xmm1, xmm6
    pxor xmm1, xmm2
    pshufd xmm2, xmm1, 0ffh
    pxor xmm3, xmm2

    movdqu [48 + KS], xmm1
    movdqa xmm5, xmm3

    lea KS, [48 + KS]

    dec ITR
    jnz Lenc_192_ks_loop

    movdqu [16 + KS], xmm5
    ret
intel_aes_encrypt_init_192 ENDP

;-------------------------------------------------------------------------------
; intel_aes_decrypt_init_192
; Same stack args as intel_aes_encrypt_init_192. Builds the encryption
; schedule, reverses the 13 round keys and applies aesimc to all but the first
; and last.
; Clobbers: eax, ecx, edx, xmm0..xmm7, flags.
;-------------------------------------------------------------------------------
ALIGN 16
intel_aes_decrypt_init_192 PROC
    mov KEY, [esp + 1*4 + 0*4]
    mov KS, [esp + 1*4 + 1*4]

    push KS
    push KEY

    call intel_aes_encrypt_init_192

    pop KEY
    pop KS

    movdqu xmm0, [0*16 + KS]
    movdqu xmm1, [12*16 + KS]
    movdqu [12*16 + KS], xmm0
    movdqu [0*16 + KS], xmm1

    i = 1
    WHILE i LT 6
        movdqu xmm0, [i*16 + KS]
        movdqu xmm1, [(12-i)*16 + KS]

        aesimc xmm0, xmm0
        aesimc xmm1, xmm1

        movdqu [(12-i)*16 + KS], xmm0
        movdqu [i*16 + KS], xmm1

        i = i+1
    ENDM

    movdqu xmm0, [6*16 + KS]
    aesimc xmm0, xmm0
    movdqu [6*16 + KS], xmm0
    ret
intel_aes_decrypt_init_192 ENDP

;-------------------------------------------------------------------------------
; intel_aes_encrypt_init_256
; Stack args: arg 0 = pointer to the 32-byte key, arg 1 = pointer to the key
; schedule buffer. Writes 15 round keys (240 bytes) to the buffer.
; Clobbers: eax, ecx, edx, xmm0..xmm6, flags.
;-------------------------------------------------------------------------------
ALIGN 16
intel_aes_encrypt_init_256 PROC

    mov KEY, [esp + 1*4 + 0*4]
    mov KS, [esp + 1*4 + 1*4]
    movdqu xmm1, [16*0 + KEY]
    movdqu xmm3, [16*1 + KEY]

    ; Round keys 0 and 1 are the two halves of the cipher key.
    movdqu [16*0 + KS], xmm1
    movdqu [16*1 + KS], xmm3

    lea ITR, Lcon1
    movdqu xmm0, [ITR]
    lea ITR, Lmask256
    movdqu xmm5, [ITR]

    pxor xmm6, xmm6             ; zero round key: aesenclast then only substitutes

    mov ITR, 6

Lenc_256_ks_loop:

    ; Even round key: rotate + substitute + round constant.
    movdqa xmm2, xmm3
    pshufb xmm2, xmm5
    aesenclast xmm2, xmm0
    pslld xmm0, 1
    movdqa xmm4, xmm1
    pslldq xmm4, 4
    pxor xmm1, xmm4
    pslldq xmm4, 4
    pxor xmm1, xmm4
    pslldq xmm4, 4
    pxor xmm1, xmm4
    pxor xmm1, xmm2
    movdqu [16*2 + KS], xmm1

    ; Odd round key: substitute only (broadcast last word, zero round key).
    pshufd xmm2, xmm1, 0ffh
    aesenclast xmm2, xmm6
    movdqa xmm4, xmm3
    pslldq xmm4, 4
    pxor xmm3, xmm4
    pslldq xmm4, 4
    pxor xmm3, xmm4
    pslldq xmm4, 4
    pxor xmm3, xmm4
    pxor xmm3, xmm2
    movdqu [16*3 + KS], xmm3

    lea KS, [32 + KS]
    dec ITR
    jnz Lenc_256_ks_loop

    ; Final (15th) round key.
    movdqa xmm2, xmm3
    pshufb xmm2, xmm5
    aesenclast xmm2, xmm0
    movdqa xmm4, xmm1
    pslldq xmm4, 4
    pxor xmm1, xmm4
    pslldq xmm4, 4
    pxor xmm1, xmm4
    pslldq xmm4, 4
    pxor xmm1, xmm4
    pxor xmm1, xmm2
    movdqu [16*2 + KS], xmm1

    ret
intel_aes_encrypt_init_256 ENDP

;-------------------------------------------------------------------------------
; intel_aes_decrypt_init_256
; Same stack args as intel_aes_encrypt_init_256. Builds the encryption
; schedule, reverses the 15 round keys and applies aesimc to all but the first
; and last.
; Clobbers: eax, ecx, edx, xmm0..xmm6, flags.
;-------------------------------------------------------------------------------
ALIGN 16
intel_aes_decrypt_init_256 PROC
    mov KEY, [esp + 1*4 + 0*4]
    mov KS, [esp + 1*4 + 1*4]

    push KS
    push KEY

    call intel_aes_encrypt_init_256

    pop KEY
    pop KS

    movdqu xmm0, [0*16 + KS]
    movdqu xmm1, [14*16 + KS]
    movdqu [14*16 + KS], xmm0
    movdqu [0*16 + KS], xmm1

    i = 1
    WHILE i LT 7
        movdqu xmm0, [i*16 + KS]
        movdqu xmm1, [(14-i)*16 + KS]

        aesimc xmm0, xmm0
        aesimc xmm1, xmm1

        movdqu [(14-i)*16 + KS], xmm0
        movdqu [i*16 + KS], xmm1

        i = i+1
    ENDM

    movdqu xmm0, [7*16 + KS]
    aesimc xmm0, xmm0
    movdqu [7*16 + KS], xmm0
    ret
intel_aes_decrypt_init_256 ENDP



;-------------------------------------------------------------------------------
; gen_aes_cbc_enc_func rnds
; Emits the body of a CBC encryption routine. The chaining value is read from
; [-32 + ctx] on entry and the last ciphertext block is written back there on
; exit. CBC encryption is inherently serial, so blocks are handled one at a
; time; round keys 0..4 are cached in xmm2..xmm6. Returns 0 in eax.
; Clobbers: eax, ecx, edx, xmm0..xmm7, flags (edi is saved/restored).
;-------------------------------------------------------------------------------
gen_aes_cbc_enc_func MACRO rnds

LOCAL loop1
LOCAL bail

    push inputLen

    mov ctx, [esp + 2*4 + 0*4]
    mov output, [esp + 2*4 + 1*4]
    mov input, [esp + 2*4 + 4*4]
    mov inputLen, [esp + 2*4 + 5*4]

    lea ctx, [44+ctx]

    movdqu xmm0, [-32+ctx]      ; xmm0 = chaining value

    movdqu xmm2, [0*16 + ctx]
    movdqu xmm3, [1*16 + ctx]
    movdqu xmm4, [2*16 + ctx]
    movdqu xmm5, [3*16 + ctx]
    movdqu xmm6, [4*16 + ctx]

loop1:
    cmp inputLen, 1*16
    jb bail

    ; xmm0 = previous ciphertext XOR plaintext XOR round key 0.
    movdqu xmm1, [input]
    pxor xmm1, xmm2
    pxor xmm0, xmm1

    aesenc xmm0, xmm3
    aesenc xmm0, xmm4
    aesenc xmm0, xmm5
    aesenc xmm0, xmm6

    i = 5
    WHILE i LT rnds
        movdqu xmm7, [i*16 + ctx]
        aesenc xmm0, xmm7
        i = i+1
    ENDM
    movdqu xmm7, [rnds*16 + ctx]
    aesenclast xmm0, xmm7

    movdqu [output], xmm0

    lea input, [1*16 + input]
    lea output, [1*16 + output]
    sub inputLen, 1*16
    jmp loop1

bail:
    movdqu [-32+ctx], xmm0      ; save chaining value for the next call

    xor eax, eax
    pop inputLen
    ret

ENDM

;-------------------------------------------------------------------------------
; gen_aes_cbc_dec_func rnds
; Emits the body of a CBC decryption routine. Decrypts 7 blocks per iteration
; while at least 7*16 bytes remain, then one block at a time. The chaining
; value lives at [-32 + ctx] and is updated with the last ciphertext block.
; All ciphertext needed for chaining is loaded before the matching output
; stores, so input and output may be the same buffer. Returns 0 in eax.
; Clobbers: eax, ecx, edx, xmm0..xmm7, flags (edi is saved/restored).
;-------------------------------------------------------------------------------
gen_aes_cbc_dec_func MACRO rnds

LOCAL loop7
LOCAL loop1
LOCAL dec1
LOCAL bail

    push inputLen

    mov ctx, [esp + 2*4 + 0*4]
    mov output, [esp + 2*4 + 1*4]
    mov input, [esp + 2*4 + 4*4]
    mov inputLen, [esp + 2*4 + 5*4]

    lea ctx, [44+ctx]

loop7:
    cmp inputLen, 7*16
    jb dec1

    movdqu xmm0, [0*16 + input]
    movdqu xmm1, [1*16 + input]
    movdqu xmm2, [2*16 + input]
    movdqu xmm3, [3*16 + input]
    movdqu xmm4, [4*16 + input]
    movdqu xmm5, [5*16 + input]
    movdqu xmm6, [6*16 + input]

    movdqu xmm7, [0*16 + ctx]
    pxor xmm0, xmm7
    pxor xmm1, xmm7
    pxor xmm2, xmm7
    pxor xmm3, xmm7
    pxor xmm4, xmm7
    pxor xmm5, xmm7
    pxor xmm6, xmm7

    i = 1
    WHILE i LT rnds
        aes_dec_rnd i
        i = i+1
    ENDM
    aes_dec_last_rnd rnds

    ; XOR each decrypted block with the preceding ciphertext block (the
    ; stored chaining value for block 0).
    movdqu xmm7, [-32 + ctx]
    pxor xmm0, xmm7
    movdqu xmm7, [0*16 + input]
    pxor xmm1, xmm7
    movdqu xmm7, [1*16 + input]
    pxor xmm2, xmm7
    movdqu xmm7, [2*16 + input]
    pxor xmm3, xmm7
    movdqu xmm7, [3*16 + input]
    pxor xmm4, xmm7
    movdqu xmm7, [4*16 + input]
    pxor xmm5, xmm7
    movdqu xmm7, [5*16 + input]
    pxor xmm6, xmm7
    movdqu xmm7, [6*16 + input] ; last ciphertext block = new chaining value

    movdqu [0*16 + output], xmm0
    movdqu [1*16 + output], xmm1
    movdqu [2*16 + output], xmm2
    movdqu [3*16 + output], xmm3
    movdqu [4*16 + output], xmm4
    movdqu [5*16 + output], xmm5
    movdqu [6*16 + output], xmm6
    movdqu [-32 + ctx], xmm7

    lea input, [7*16 + input]
    lea output, [7*16 + output]
    sub inputLen, 7*16
    jmp loop7
dec1:

    movdqu xmm3, [-32 + ctx]    ; xmm3 = chaining value for the tail loop

loop1:
    cmp inputLen, 1*16
    jb bail

    movdqu xmm0, [input]
    movdqa xmm4, xmm0           ; keep the ciphertext for the next block
    movdqu xmm7, [0*16 + ctx]
    pxor xmm0, xmm7

    i = 1
    WHILE i LT rnds
        movdqu xmm7, [i*16 + ctx]
        aesdec xmm0, xmm7
        i = i+1
    ENDM
    movdqu xmm7, [rnds*16 + ctx]
    aesdeclast xmm0, xmm7
    pxor xmm3, xmm0

    movdqu [output], xmm3
    movdqa xmm3, xmm4

    lea input, [1*16 + input]
    lea output, [1*16 + output]
    sub inputLen, 1*16
    jmp loop1

bail:
    movdqu [-32 + ctx], xmm3
    xor eax, eax
    pop inputLen
    ret
ENDM

ALIGN 16
intel_aes_encrypt_cbc_128 PROC
gen_aes_cbc_enc_func 10
intel_aes_encrypt_cbc_128 ENDP

ALIGN 16
intel_aes_encrypt_cbc_192 PROC
gen_aes_cbc_enc_func 12
intel_aes_encrypt_cbc_192 ENDP

ALIGN 16
intel_aes_encrypt_cbc_256 PROC
gen_aes_cbc_enc_func 14
intel_aes_encrypt_cbc_256 ENDP

ALIGN 16
intel_aes_decrypt_cbc_128 PROC
gen_aes_cbc_dec_func 10
intel_aes_decrypt_cbc_128 ENDP

ALIGN 16
intel_aes_decrypt_cbc_192 PROC
gen_aes_cbc_dec_func 12
intel_aes_decrypt_cbc_192 ENDP

ALIGN 16
intel_aes_decrypt_cbc_256 PROC
gen_aes_cbc_dec_func 14
intel_aes_decrypt_cbc_256 ENDP



; Additional register roles for the CTR routines.
ctrCtx textequ <esi>
CTR textequ <ebx>

;-------------------------------------------------------------------------------
; gen_aes_ctr_func rnds
; Emits the body of a CTR-mode routine. Stack arg 0 is the CTR context; the
; AES context pointer is loaded from [4 + ctrCtx] and the 16-byte counter
; block from [8 + ctrCtx].
;
; A 16-byte-aligned scratch area of 7 slots is carved out of the stack. Each
; slot holds a counter block already XORed with round key 0, so the 7-block
; loop can start directly at round 1. Only the last dword of the counter is
; incremented (as a big-endian 32-bit value); there is no carry into the
; remaining 96 bits.
;
; Register use after setup: esi (ctrCtx) holds the host-order counter value of
; the most recently generated slot, ebx (CTR) is scratch, ebp holds the
; pre-allocation esp. On exit the next unused counter block is written back to
; [8 + ctrCtx] and 0 is returned in eax.
; Clobbers: eax, ecx, edx, xmm0..xmm7, flags (edi, esi, ebx, ebp are
; saved/restored).
;-------------------------------------------------------------------------------
gen_aes_ctr_func MACRO rnds

LOCAL loop7
LOCAL loop1
LOCAL enc1
LOCAL bail

    push inputLen
    push ctrCtx
    push CTR
    push ebp

    ; After four pushes, stack argument k lives at [esp + 4*5 + k*4].
    mov ctrCtx, [esp + 4*5 + 0*4]
    mov output, [esp + 4*5 + 1*4]
    mov input, [esp + 4*5 + 4*4]
    mov inputLen, [esp + 4*5 + 5*4]

    mov ctx, [4+ctrCtx]
    lea ctx, [44+ctx]

    ; Reserve and align the 7-slot counter scratch area.
    mov ebp, esp
    sub esp, 7*16
    and esp, -16

    movdqu xmm0, [8+ctrCtx]
    mov ctrCtx, [ctrCtx + 8 + 3*4]
    bswap ctrCtx                ; esi = counter value in host byte order
    movdqu xmm1, [ctx + 0*16]

    pxor xmm0, xmm1             ; pre-apply round key 0

    movdqa [esp + 0*16], xmm0
    movdqa [esp + 1*16], xmm0
    movdqa [esp + 2*16], xmm0
    movdqa [esp + 3*16], xmm0
    movdqa [esp + 4*16], xmm0
    movdqa [esp + 5*16], xmm0
    movdqa [esp + 6*16], xmm0

    ; Patch the last dword of slots 1..6 with counter+1 .. counter+6, each
    ; byte-swapped back and XORed with the last dword of round key 0.
    inc ctrCtx
    mov CTR, ctrCtx
    bswap CTR
    xor CTR, [ctx + 3*4]
    mov [esp + 1*16 + 3*4], CTR

    inc ctrCtx
    mov CTR, ctrCtx
    bswap CTR
    xor CTR, [ctx + 3*4]
    mov [esp + 2*16 + 3*4], CTR

    inc ctrCtx
    mov CTR, ctrCtx
    bswap CTR
    xor CTR, [ctx + 3*4]
    mov [esp + 3*16 + 3*4], CTR

    inc ctrCtx
    mov CTR, ctrCtx
    bswap CTR
    xor CTR, [ctx + 3*4]
    mov [esp + 4*16 + 3*4], CTR

    inc ctrCtx
    mov CTR, ctrCtx
    bswap CTR
    xor CTR, [ctx + 3*4]
    mov [esp + 5*16 + 3*4], CTR

    inc ctrCtx
    mov CTR, ctrCtx
    bswap CTR
    xor CTR, [ctx + 3*4]
    mov [esp + 6*16 + 3*4], CTR


loop7:
    cmp inputLen, 7*16
    jb loop1

    movdqu xmm0, [0*16 + esp]
    movdqu xmm1, [1*16 + esp]
    movdqu xmm2, [2*16 + esp]
    movdqu xmm3, [3*16 + esp]
    movdqu xmm4, [4*16 + esp]
    movdqu xmm5, [5*16 + esp]
    movdqu xmm6, [6*16 + esp]

    ; Rounds 1..7 are interleaved with refilling slots 0..6 with the next
    ; seven counter values for the following iteration.
    i = 1
    WHILE i LE 7
        aes_rnd i

        inc ctrCtx
        mov CTR, ctrCtx
        bswap CTR
        xor CTR, [ctx + 3*4]
        mov [esp + (i-1)*16 + 3*4], CTR

        i = i+1
    ENDM
    WHILE i LT rnds
        aes_rnd i
        i = i+1
    ENDM
    aes_last_rnd rnds

    ; XOR the key stream with the input.
    movdqu xmm7, [0*16 + input]
    pxor xmm0, xmm7
    movdqu xmm7, [1*16 + input]
    pxor xmm1, xmm7
    movdqu xmm7, [2*16 + input]
    pxor xmm2, xmm7
    movdqu xmm7, [3*16 + input]
    pxor xmm3, xmm7
    movdqu xmm7, [4*16 + input]
    pxor xmm4, xmm7
    movdqu xmm7, [5*16 + input]
    pxor xmm5, xmm7
    movdqu xmm7, [6*16 + input]
    pxor xmm6, xmm7

    movdqu [0*16 + output], xmm0
    movdqu [1*16 + output], xmm1
    movdqu [2*16 + output], xmm2
    movdqu [3*16 + output], xmm3
    movdqu [4*16 + output], xmm4
    movdqu [5*16 + output], xmm5
    movdqu [6*16 + output], xmm6

    lea input, [7*16 + input]
    lea output, [7*16 + output]
    sub inputLen, 7*16
    jmp loop7


loop1:
    ; Fewer than 7 blocks remain, so at most 6 of the 7 prepared slots are
    ; consumed here; esp walks forward through the scratch area.
    cmp inputLen, 1*16
    jb bail

    movdqu xmm0, [esp]
    add esp, 16

    i = 1
    WHILE i LT rnds
        movdqu xmm7, [i*16 + ctx]
        aesenc xmm0, xmm7
        i = i+1
    ENDM
    movdqu xmm7, [rnds*16 + ctx]
    aesenclast xmm0, xmm7

    movdqu xmm7, [input]
    pxor xmm0, xmm7
    movdqu [output], xmm0

    lea input, [1*16 + input]
    lea output, [1*16 + output]
    sub inputLen, 1*16
    jmp loop1

bail:

    ; [esp] is the next unused counter slot; undo the round-key-0 XOR and
    ; store it back as the context's counter block.
    mov ctrCtx, [ebp + 4*5 + 0*4]
    movdqu xmm0, [esp]
    movdqu xmm1, [ctx + 0*16]
    pxor xmm0, xmm1
    movdqu [8+ctrCtx], xmm0


    xor eax, eax
    mov esp, ebp                ; release the scratch area
    pop ebp
    pop CTR
    pop ctrCtx
    pop inputLen
    ret
ENDM


ALIGN 16
intel_aes_encrypt_ctr_128 PROC
gen_aes_ctr_func 10
intel_aes_encrypt_ctr_128 ENDP

ALIGN 16
intel_aes_encrypt_ctr_192 PROC
gen_aes_ctr_func 12
intel_aes_encrypt_ctr_192 ENDP

ALIGN 16
intel_aes_encrypt_ctr_256 PROC
gen_aes_ctr_func 14
intel_aes_encrypt_ctr_256 ENDP


END