; LICENSE:
; This submission to NSS is to be made available under the terms of the
; Mozilla Public License, v. 2.0. You can obtain one at http:
; //mozilla.org/MPL/2.0/.
;###############################################################################
; Copyright(c) 2014, Intel Corp.
; Developers and authors:
; Shay Gueron and Vlad Krasnov
; Intel Corporation, Israel Development Centre, Haifa, Israel
; Please send feedback directly to crypto.feedback.alias@intel.com


.MODEL FLAT, C
.XMM

.DATA
ALIGN 16
; PSHUFB masks. Each dword holds the byte indices (0d,0e,0f,0c) resp.
; (05,06,07,04), i.e. the selected source dword rotated by one byte and
; replicated into all four lanes. With four identical columns the ShiftRows
; step of AESENCLAST has no effect, so PSHUFB + AESENCLAST yields
; SubWord(RotWord(w)) XOR <second operand> in every lane.
Lmask       dd 0c0f0e0dh,0c0f0e0dh,0c0f0e0dh,0c0f0e0dh
Lmask192    dd 004070605h, 004070605h, 004070605h, 004070605h
Lmask256    dd 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh
; Round constants: Lcon1 is doubled with PSLLD after every use; Lcon2 (1bh)
; is loaded by the 128-bit schedule once the doubling sequence is exhausted.
Lcon1       dd 1,1,1,1
Lcon2       dd 1bh,1bh,1bh,1bh

.CODE

; Register bindings shared by the ECB/CBC/CTR generators.
; NOTE(review): these must stay in step with the push/pop pairs in the
; generator macros: inputLen is saved and restored by every generator, the
; other three are treated as scratch, and eax (input) is zeroed on exit to
; form the return value.
ctx         textequ <ecx>
output      textequ <edx>
input       textequ <eax>
inputLen    textequ <edi>


; ---------------------------------------------------------------------------
; Seven-way round helpers: load round key <i> from the schedule at ctx into
; xmm7 and apply one AES round to xmm0..xmm6.
; ---------------------------------------------------------------------------
aes_rnd MACRO i
    movdqu  xmm7, [i*16 + ctx]
    aesenc  xmm0, xmm7
    aesenc  xmm1, xmm7
    aesenc  xmm2, xmm7
    aesenc  xmm3, xmm7
    aesenc  xmm4, xmm7
    aesenc  xmm5, xmm7
    aesenc  xmm6, xmm7
    ENDM

aes_last_rnd MACRO i
    movdqu  xmm7, [i*16 + ctx]
    aesenclast  xmm0, xmm7
    aesenclast  xmm1, xmm7
    aesenclast  xmm2, xmm7
    aesenclast  xmm3, xmm7
    aesenclast  xmm4, xmm7
    aesenclast  xmm5, xmm7
    aesenclast  xmm6, xmm7
    ENDM

aes_dec_rnd MACRO i
    movdqu  xmm7, [i*16 + ctx]
    aesdec  xmm0, xmm7
    aesdec  xmm1, xmm7
    aesdec  xmm2, xmm7
    aesdec  xmm3, xmm7
    aesdec  xmm4, xmm7
    aesdec  xmm5, xmm7
    aesdec  xmm6, xmm7
    ENDM

aes_dec_last_rnd MACRO i
    movdqu  xmm7, [i*16 + ctx]
    aesdeclast  xmm0, xmm7
    aesdeclast  xmm1, xmm7
    aesdeclast  xmm2, xmm7
    aesdeclast  xmm3, xmm7
    aesdeclast  xmm4, xmm7
    aesdeclast  xmm5, xmm7
    aesdeclast  xmm6, xmm7
    ENDM


; ---------------------------------------------------------------------------
; gen_aes_ecb_func enc, rnds
;   Emits the body of an ECB routine.
;     enc  - 1 selects the encrypt instructions, anything else decrypt.
;     rnds - number of AES rounds (instantiated below with 10, 12 and 14).
;   Stack arguments used (index n is read from [esp + 2*4 + n*4], i.e. past
;   the saved inputLen register and the return address):
;     0: context pointer   1: output pointer
;     4: input pointer     5: input length in bytes
;   Processes 7 blocks per iteration while at least 7*16 bytes remain, then
;   one block at a time; a trailing partial block (< 16 bytes) is ignored.
;   Always returns 0 in eax.
; ---------------------------------------------------------------------------
gen_aes_ecb_func MACRO enc, rnds

LOCAL   loop7
LOCAL   loop1
LOCAL   bail

    push    inputLen

    mov     ctx,        [esp + 2*4 + 0*4]
    mov     output,     [esp + 2*4 + 1*4]
    mov     input,      [esp + 2*4 + 4*4]
    mov     inputLen,   [esp + 2*4 + 5*4]

    ; Point ctx at round key 0.
    ; NOTE(review): assumes the key schedule starts 44 bytes into the
    ; context structure - confirm against the C definition.
    lea     ctx, [44+ctx]

loop7:
    cmp     inputLen, 7*16
    jb      loop1

    movdqu  xmm0, [0*16 + input]
    movdqu  xmm1, [1*16 + input]
    movdqu  xmm2, [2*16 + input]
    movdqu  xmm3, [3*16 + input]
    movdqu  xmm4, [4*16 + input]
    movdqu  xmm5, [5*16 + input]
    movdqu  xmm6, [6*16 + input]

    ; Initial AddRoundKey with round key 0.
    movdqu  xmm7, [0*16 + ctx]
    pxor    xmm0, xmm7
    pxor    xmm1, xmm7
    pxor    xmm2, xmm7
    pxor    xmm3, xmm7
    pxor    xmm4, xmm7
    pxor    xmm5, xmm7
    pxor    xmm6, xmm7

    ; Assembly-time selection of the round helpers / instructions.
IF enc eq 1
    rnd         textequ <aes_rnd>
    lastrnd     textequ <aes_last_rnd>
    aesinst     textequ <aesenc>
    aeslastinst textequ <aesenclast>
ELSE
    rnd         textequ <aes_dec_rnd>
    lastrnd     textequ <aes_dec_last_rnd>
    aesinst     textequ <aesdec>
    aeslastinst textequ <aesdeclast>
ENDIF

    i = 1
    WHILE i LT rnds
        rnd i
        i = i+1
        ENDM
    lastrnd rnds

    movdqu  [0*16 + output], xmm0
    movdqu  [1*16 + output], xmm1
    movdqu  [2*16 + output], xmm2
    movdqu  [3*16 + output], xmm3
    movdqu  [4*16 + output], xmm4
    movdqu  [5*16 + output], xmm5
    movdqu  [6*16 + output], xmm6

    lea     input, [7*16 + input]
    lea     output, [7*16 + output]
    sub     inputLen, 7*16
    jmp     loop7

loop1:
    cmp     inputLen, 1*16
    jb      bail

    movdqu  xmm0, [input]
    movdqu  xmm7, [0*16 + ctx]
    pxor    xmm0, xmm7

    i = 1
    WHILE i LT rnds
        movdqu  xmm7, [i*16 + ctx]
        aesinst xmm0, xmm7
        i = i+1
        ENDM
    movdqu  xmm7, [rnds*16 + ctx]
    aeslastinst xmm0, xmm7

    movdqu  [output], xmm0

    lea     input, [1*16 + input]
    lea     output, [1*16 + output]
    sub     inputLen, 1*16
    jmp     loop1

bail:
    xor     eax, eax
    pop     inputLen
    ret

ENDM

ALIGN 16
intel_aes_encrypt_ecb_128 PROC
gen_aes_ecb_func 1, 10
intel_aes_encrypt_ecb_128 ENDP

ALIGN 16
intel_aes_encrypt_ecb_192 PROC
gen_aes_ecb_func 1, 12
intel_aes_encrypt_ecb_192 ENDP

ALIGN 16
intel_aes_encrypt_ecb_256 PROC
gen_aes_ecb_func 1, 14
intel_aes_encrypt_ecb_256 ENDP

ALIGN 16
intel_aes_decrypt_ecb_128 PROC
gen_aes_ecb_func 0, 10
intel_aes_decrypt_ecb_128 ENDP

ALIGN 16
intel_aes_decrypt_ecb_192 PROC
gen_aes_ecb_func 0, 12
intel_aes_decrypt_ecb_192 ENDP

ALIGN 16
intel_aes_decrypt_ecb_256 PROC
gen_aes_ecb_func 0, 14
intel_aes_decrypt_ecb_256 ENDP


; Register bindings for the key-schedule routines. None of them is saved by
; the routines below, so all three must be scratch registers.
KEY         textequ <ecx>
KS          textequ <edx>
ITR         textequ <eax>

; ---------------------------------------------------------------------------
; intel_aes_encrypt_init_128
;   Stack arguments: [esp+4] = pointer to the 16-byte key,
;                    [esp+8] = pointer to the key schedule to fill.
;   Writes 11 round keys (176 bytes) starting at the schedule pointer.
; ---------------------------------------------------------------------------
ALIGN 16
intel_aes_encrypt_init_128 PROC

    mov     KEY,    [esp + 1*4 + 0*4]
    mov     KS,     [esp + 1*4 + 1*4]


    ; Round key 0 is the cipher key itself.
    movdqu  xmm1, [KEY]
    movdqu  [KS], xmm1
    movdqa  xmm2, xmm1

    lea     ITR, Lcon1
    movdqa  xmm0, [ITR]             ; round constant, starts at 1
    lea     ITR, Lmask
    movdqa  xmm4, [ITR]

    mov     ITR, 8                  ; round keys 1..8 use constants 01h..80h

Lenc_128_ks_loop:
        lea     KS, [16 + KS]

        ; xmm2 = SubWord(RotWord(last word)) XOR rcon, in every lane.
        pshufb  xmm2, xmm4
        aesenclast  xmm2, xmm0
        pslld   xmm0, 1             ; next round constant
        ; xmm1 ^= (xmm1 << 32) ^ (xmm1 << 64) ^ (xmm1 << 96): running XOR
        ; of the previous round key's words.
        movdqa  xmm3, xmm1
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
        pslldq  xmm3, 4
        pxor    xmm1, xmm3
        pxor    xmm1, xmm2
        movdqu  [KS], xmm1
        movdqa  xmm2, xmm1

        ; Counter update kept adjacent to the branch (as in the 192/256
        ; loops) instead of relying on EFLAGS surviving the SSE block.
        dec     ITR
        jnz     Lenc_128_ks_loop

    ; Round keys 9 and 10 use constants 1bh and 36h.
    lea     ITR, Lcon2
    movdqa  xmm0, [ITR]

    pshufb  xmm2, xmm4
    aesenclast  xmm2, xmm0
    pslld   xmm0, 1
    movdqa  xmm3, xmm1
    pslldq  xmm3, 4
    pxor    xmm1, xmm3
    pslldq  xmm3, 4
    pxor    xmm1, xmm3
    pslldq  xmm3, 4
    pxor    xmm1, xmm3
    pxor    xmm1, xmm2
    movdqu  [16 + KS], xmm1
    movdqa  xmm2, xmm1

    pshufb  xmm2, xmm4
    aesenclast  xmm2, xmm0
    movdqa  xmm3, xmm1
    pslldq  xmm3, 4
    pxor    xmm1, xmm3
    pslldq  xmm3, 4
    pxor    xmm1, xmm3
    pslldq  xmm3, 4
    pxor    xmm1, xmm3
    pxor    xmm1, xmm2
    movdqu  [32 + KS], xmm1
    movdqa  xmm2, xmm1

    ret
intel_aes_encrypt_init_128 ENDP


; ---------------------------------------------------------------------------
; intel_aes_decrypt_init_128
;   Same arguments as intel_aes_encrypt_init_128. Builds the encryption
;   schedule, then reverses the order of the 11 round keys in place and
;   applies AESIMC to round keys 1..9 so the schedule can be consumed by
;   AESDEC/AESDECLAST.
; ---------------------------------------------------------------------------
ALIGN 16
intel_aes_decrypt_init_128 PROC

    mov     KEY,    [esp + 1*4 + 0*4]
    mov     KS,     [esp + 1*4 + 1*4]

    ; The pushes serve both as the callee's stack arguments and as saved
    ; copies of KEY/KS (the callee advances KS).
    push    KS
    push    KEY

    call    intel_aes_encrypt_init_128

    pop     KEY
    pop     KS

    ; First and last round keys are swapped without AESIMC.
    movdqu  xmm0, [0*16 + KS]
    movdqu  xmm1, [10*16 + KS]
    movdqu  [10*16 + KS], xmm0
    movdqu  [0*16 + KS], xmm1

    i = 1
    WHILE i LT 5
        movdqu  xmm0, [i*16 + KS]
        movdqu  xmm1, [(10-i)*16 + KS]

        aesimc  xmm0, xmm0
        aesimc  xmm1, xmm1

        movdqu  [(10-i)*16 + KS], xmm0
        movdqu  [i*16 + KS], xmm1

        i = i+1
    ENDM

    ; Middle round key stays in place.
    movdqu  xmm0, [5*16 + KS]
    aesimc  xmm0, xmm0
    movdqu  [5*16 + KS], xmm0
    ret
intel_aes_decrypt_init_128 ENDP


; ---------------------------------------------------------------------------
; intel_aes_encrypt_init_192
;   Stack arguments: [esp+4] = pointer to the 24-byte key,
;                    [esp+8] = pointer to the key schedule to fill.
;   xmm1 carries key words 0..3, the low 64 bits of xmm3 carry words 4..5.
;   Each loop iteration produces 12 new words (three 16-byte stores).
; ---------------------------------------------------------------------------
ALIGN 16
intel_aes_encrypt_init_192 PROC

    mov     KEY,    [esp + 1*4 + 0*4]
    mov     KS,     [esp + 1*4 + 1*4]

    pxor    xmm3, xmm3
    movdqu  xmm1, [KEY]
    pinsrd  xmm3, DWORD PTR [16 + KEY], 0
    pinsrd  xmm3, DWORD PTR [20 + KEY], 1

    movdqu  [KS], xmm1
    movdqa  xmm5, xmm3              ; pending half-block (words 4..5)

    lea     ITR, Lcon1
    movdqu  xmm0, [ITR]
    lea     ITR, Lmask192
    movdqu  xmm4, [ITR]

    mov     ITR, 4

Lenc_192_ks_loop:
        ; --- first six-word step ---
        movdqa  xmm2, xmm3
        pshufb  xmm2, xmm4
        aesenclast  xmm2, xmm0
        pslld   xmm0, 1

        movdqa  xmm6, xmm1
        movdqa  xmm7, xmm3
        pslldq  xmm6, 4
        pslldq  xmm7, 4
        pxor    xmm1, xmm6
        pxor    xmm3, xmm7
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pxor    xmm1, xmm2
        pshufd  xmm2, xmm1, 0ffh    ; broadcast the newest word of xmm1
        pxor    xmm3, xmm2

        ; Repack the 64-bit halves into two 16-byte round keys:
        ;   xmm5 = pending half | low half of xmm1
        ;   xmm6 = high half of xmm1 | low half of xmm3
        movdqa  xmm6, xmm1
        shufpd  xmm5, xmm1, 00h
        shufpd  xmm6, xmm3, 01h

        movdqu  [16 + KS], xmm5
        movdqu  [32 + KS], xmm6

        ; --- second six-word step ---
        movdqa  xmm2, xmm3
        pshufb  xmm2, xmm4
        aesenclast  xmm2, xmm0
        pslld   xmm0, 1

        movdqa  xmm6, xmm1
        movdqa  xmm7, xmm3
        pslldq  xmm6, 4
        pslldq  xmm7, 4
        pxor    xmm1, xmm6
        pxor    xmm3, xmm7
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pxor    xmm1, xmm2
        pshufd  xmm2, xmm1, 0ffh
        pxor    xmm3, xmm2

        movdqu  [48 + KS], xmm1
        movdqa  xmm5, xmm3

        lea     KS, [48 + KS]

        dec     ITR
        jnz     Lenc_192_ks_loop

    ; NOTE(review): after 4 iterations KS has advanced by 192 bytes, so this
    ; store covers bytes 208..223 of the schedule, i.e. the slot after round
    ; key 12. Presumably the caller's buffer is sized for the largest key
    ; schedule - confirm.
    movdqu  [16 + KS], xmm5
    ret
intel_aes_encrypt_init_192 ENDP

; ---------------------------------------------------------------------------
; intel_aes_decrypt_init_192
;   Same arguments as intel_aes_encrypt_init_192. Builds the encryption
;   schedule, reverses the 13 round keys in place and applies AESIMC to
;   round keys 1..11.
; ---------------------------------------------------------------------------
ALIGN 16
intel_aes_decrypt_init_192 PROC
    mov     KEY,    [esp + 1*4 + 0*4]
    mov     KS,     [esp + 1*4 + 1*4]

    ; Pushed values double as callee arguments and saved KEY/KS.
    push    KS
    push    KEY

    call    intel_aes_encrypt_init_192

    pop     KEY
    pop     KS

    movdqu  xmm0, [0*16 + KS]
    movdqu  xmm1, [12*16 + KS]
    movdqu  [12*16 + KS], xmm0
    movdqu  [0*16 + KS], xmm1

    i = 1
    WHILE i LT 6
        movdqu  xmm0, [i*16 + KS]
        movdqu  xmm1, [(12-i)*16 + KS]

        aesimc  xmm0, xmm0
        aesimc  xmm1, xmm1

        movdqu  [(12-i)*16 + KS], xmm0
        movdqu  [i*16 + KS], xmm1

        i = i+1
    ENDM

    movdqu  xmm0, [6*16 + KS]
    aesimc  xmm0, xmm0
    movdqu  [6*16 + KS], xmm0
    ret
intel_aes_decrypt_init_192 ENDP

; ---------------------------------------------------------------------------
; intel_aes_encrypt_init_256
;   Stack arguments: [esp+4] = pointer to the 32-byte key,
;                    [esp+8] = pointer to the key schedule to fill.
;   Writes 15 round keys (240 bytes). xmm1/xmm3 hold the two most recent
;   round keys; each loop iteration derives the next pair.
; ---------------------------------------------------------------------------
ALIGN 16
intel_aes_encrypt_init_256 PROC

    mov     KEY,    [esp + 1*4 + 0*4]
    mov     KS,     [esp + 1*4 + 1*4]
    movdqu  xmm1, [16*0 + KEY]
    movdqu  xmm3, [16*1 + KEY]

    movdqu  [16*0 + KS], xmm1
    movdqu  [16*1 + KS], xmm3

    lea     ITR, Lcon1
    movdqu  xmm0, [ITR]
    lea     ITR, Lmask256
    movdqu  xmm5, [ITR]

    pxor    xmm6, xmm6              ; zero "round key" for the SubWord-only step

    mov     ITR, 6

Lenc_256_ks_loop:

        ; Even round key: SubWord(RotWord(last word of xmm3)) XOR rcon.
        movdqa  xmm2, xmm3
        pshufb  xmm2, xmm5
        aesenclast  xmm2, xmm0
        pslld   xmm0, 1
        movdqa  xmm4, xmm1
        pslldq  xmm4, 4
        pxor    xmm1, xmm4
        pslldq  xmm4, 4
        pxor    xmm1, xmm4
        pslldq  xmm4, 4
        pxor    xmm1, xmm4
        pxor    xmm1, xmm2
        movdqu  [16*2 + KS], xmm1

        ; Odd round key: SubWord(last word of xmm1), no rotate, no rcon.
        pshufd  xmm2, xmm1, 0ffh
        aesenclast  xmm2, xmm6
        movdqa  xmm4, xmm3
        pslldq  xmm4, 4
        pxor    xmm3, xmm4
        pslldq  xmm4, 4
        pxor    xmm3, xmm4
        pslldq  xmm4, 4
        pxor    xmm3, xmm4
        pxor    xmm3, xmm2
        movdqu  [16*3 + KS], xmm3

        lea     KS, [32 + KS]
        dec     ITR
        jnz     Lenc_256_ks_loop

    ; Final (15th) round key.
    movdqa  xmm2, xmm3
    pshufb  xmm2, xmm5
    aesenclast  xmm2, xmm0
    movdqa  xmm4, xmm1
    pslldq  xmm4, 4
    pxor    xmm1, xmm4
    pslldq  xmm4, 4
    pxor    xmm1, xmm4
    pslldq  xmm4, 4
    pxor    xmm1, xmm4
    pxor    xmm1, xmm2
    movdqu  [16*2 + KS], xmm1

    ret
intel_aes_encrypt_init_256 ENDP

; ---------------------------------------------------------------------------
; intel_aes_decrypt_init_256
;   Same arguments as intel_aes_encrypt_init_256. Builds the encryption
;   schedule, reverses the 15 round keys in place and applies AESIMC to
;   round keys 1..13.
; ---------------------------------------------------------------------------
ALIGN 16
intel_aes_decrypt_init_256 PROC
    mov     KEY,    [esp + 1*4 + 0*4]
    mov     KS,     [esp + 1*4 + 1*4]

    ; Pushed values double as callee arguments and saved KEY/KS.
    push    KS
    push    KEY

    call    intel_aes_encrypt_init_256

    pop     KEY
    pop     KS

    movdqu  xmm0, [0*16 + KS]
    movdqu  xmm1, [14*16 + KS]
    movdqu  [14*16 + KS], xmm0
    movdqu  [0*16 + KS], xmm1

    i = 1
    WHILE i LT 7
        movdqu  xmm0, [i*16 + KS]
        movdqu  xmm1, [(14-i)*16 + KS]

        aesimc  xmm0, xmm0
        aesimc  xmm1, xmm1

        movdqu  [(14-i)*16 + KS], xmm0
        movdqu  [i*16 + KS], xmm1

        i = i+1
    ENDM

    movdqu  xmm0, [7*16 + KS]
    aesimc  xmm0, xmm0
    movdqu  [7*16 + KS], xmm0
    ret
intel_aes_decrypt_init_256 ENDP



; ---------------------------------------------------------------------------
; gen_aes_cbc_enc_func rnds
;   Emits the body of a CBC encryption routine (same stack arguments as
;   gen_aes_ecb_func). The chaining value is read from [-32+ctx] on entry and
;   written back on exit, where ctx points at round key 0.
;   NOTE(review): assumes the IV is stored 32 bytes before the key schedule
;   in the context structure - confirm against the C definition.
;   CBC encryption is inherently serial, so only a one-block loop exists;
;   round keys 0..4 are kept in xmm2..xmm6 for the whole call.
;   Always returns 0 in eax.
; ---------------------------------------------------------------------------
gen_aes_cbc_enc_func MACRO rnds

LOCAL   loop1
LOCAL   bail

    push    inputLen

    mov     ctx,        [esp + 2*4 + 0*4]
    mov     output,     [esp + 2*4 + 1*4]
    mov     input,      [esp + 2*4 + 4*4]
    mov     inputLen,   [esp + 2*4 + 5*4]

    lea     ctx, [44+ctx]

    movdqu  xmm0, [-32+ctx]         ; chaining value

    movdqu  xmm2, [0*16 + ctx]
    movdqu  xmm3, [1*16 + ctx]
    movdqu  xmm4, [2*16 + ctx]
    movdqu  xmm5, [3*16 + ctx]
    movdqu  xmm6, [4*16 + ctx]

loop1:
    cmp     inputLen, 1*16
    jb      bail

    ; xmm0 = previous ciphertext XOR plaintext XOR round key 0
    movdqu  xmm1, [input]
    pxor    xmm1, xmm2
    pxor    xmm0, xmm1

    aesenc  xmm0, xmm3
    aesenc  xmm0, xmm4
    aesenc  xmm0, xmm5
    aesenc  xmm0, xmm6

    i = 5
    WHILE i LT rnds
        movdqu  xmm7, [i*16 + ctx]
        aesenc  xmm0, xmm7
        i = i+1
        ENDM
    movdqu  xmm7, [rnds*16 + ctx]
    aesenclast xmm0, xmm7

    movdqu  [output], xmm0

    lea     input, [1*16 + input]
    lea     output, [1*16 + output]
    sub     inputLen, 1*16
    jmp     loop1

bail:
    ; Persist the last ciphertext block as the next chaining value.
    movdqu  [-32+ctx], xmm0

    xor     eax, eax
    pop     inputLen
    ret

ENDM

; ---------------------------------------------------------------------------
; gen_aes_cbc_dec_func rnds
;   Emits the body of a CBC decryption routine (same stack arguments and
;   context-layout assumptions as gen_aes_cbc_enc_func). Decrypts 7 blocks
;   per iteration while possible, then single blocks. In the 7-block path
;   all ciphertext reads needed for chaining happen before the output
;   stores. Always returns 0 in eax.
; ---------------------------------------------------------------------------
gen_aes_cbc_dec_func MACRO rnds

LOCAL   loop7
LOCAL   loop1
LOCAL   dec1
LOCAL   bail

    push    inputLen

    mov     ctx,        [esp + 2*4 + 0*4]
    mov     output,     [esp + 2*4 + 1*4]
    mov     input,      [esp + 2*4 + 4*4]
    mov     inputLen,   [esp + 2*4 + 5*4]

    lea     ctx, [44+ctx]

loop7:
    cmp     inputLen, 7*16
    jb      dec1

    movdqu  xmm0, [0*16 + input]
    movdqu  xmm1, [1*16 + input]
    movdqu  xmm2, [2*16 + input]
    movdqu  xmm3, [3*16 + input]
    movdqu  xmm4, [4*16 + input]
    movdqu  xmm5, [5*16 + input]
    movdqu  xmm6, [6*16 + input]

    movdqu  xmm7, [0*16 + ctx]
    pxor    xmm0, xmm7
    pxor    xmm1, xmm7
    pxor    xmm2, xmm7
    pxor    xmm3, xmm7
    pxor    xmm4, xmm7
    pxor    xmm5, xmm7
    pxor    xmm6, xmm7

    i = 1
    WHILE i LT rnds
        aes_dec_rnd i
        i = i+1
        ENDM
    aes_dec_last_rnd rnds

    ; XOR each decrypted block with the preceding ciphertext block (the
    ; stored chaining value for block 0).
    movdqu  xmm7, [-32 + ctx]
    pxor    xmm0, xmm7
    movdqu  xmm7, [0*16 + input]
    pxor    xmm1, xmm7
    movdqu  xmm7, [1*16 + input]
    pxor    xmm2, xmm7
    movdqu  xmm7, [2*16 + input]
    pxor    xmm3, xmm7
    movdqu  xmm7, [3*16 + input]
    pxor    xmm4, xmm7
    movdqu  xmm7, [4*16 + input]
    pxor    xmm5, xmm7
    movdqu  xmm7, [5*16 + input]
    pxor    xmm6, xmm7
    movdqu  xmm7, [6*16 + input]    ; becomes the new chaining value

    movdqu  [0*16 + output], xmm0
    movdqu  [1*16 + output], xmm1
    movdqu  [2*16 + output], xmm2
    movdqu  [3*16 + output], xmm3
    movdqu  [4*16 + output], xmm4
    movdqu  [5*16 + output], xmm5
    movdqu  [6*16 + output], xmm6
    movdqu  [-32 + ctx], xmm7

    lea     input, [7*16 + input]
    lea     output, [7*16 + output]
    sub     inputLen, 7*16
    jmp     loop7
dec1:

    movdqu  xmm3, [-32 + ctx]       ; chaining value for the one-block loop

loop1:
    cmp     inputLen, 1*16
    jb      bail

    movdqu  xmm0, [input]
    movdqa  xmm4, xmm0              ; keep the ciphertext for the next block
    movdqu  xmm7, [0*16 + ctx]
    pxor    xmm0, xmm7

    i = 1
    WHILE i LT rnds
        movdqu  xmm7, [i*16 + ctx]
        aesdec  xmm0, xmm7
        i = i+1
        ENDM
    movdqu  xmm7, [rnds*16 + ctx]
    aesdeclast xmm0, xmm7
    pxor    xmm3, xmm0

    movdqu  [output], xmm3
    movdqa  xmm3, xmm4

    lea     input, [1*16 + input]
    lea     output, [1*16 + output]
    sub     inputLen, 1*16
    jmp     loop1

bail:
    movdqu  [-32 + ctx], xmm3
    xor     eax, eax
    pop     inputLen
    ret
ENDM

ALIGN 16
intel_aes_encrypt_cbc_128 PROC
gen_aes_cbc_enc_func  10
intel_aes_encrypt_cbc_128 ENDP

ALIGN 16
intel_aes_encrypt_cbc_192 PROC
gen_aes_cbc_enc_func  12
intel_aes_encrypt_cbc_192 ENDP

ALIGN 16
intel_aes_encrypt_cbc_256 PROC
gen_aes_cbc_enc_func  14
intel_aes_encrypt_cbc_256 ENDP

ALIGN 16
intel_aes_decrypt_cbc_128 PROC
gen_aes_cbc_dec_func  10
intel_aes_decrypt_cbc_128 ENDP

ALIGN 16
intel_aes_decrypt_cbc_192 PROC
gen_aes_cbc_dec_func  12
intel_aes_decrypt_cbc_192 ENDP

ALIGN 16
intel_aes_decrypt_cbc_256 PROC
gen_aes_cbc_dec_func  14
intel_aes_decrypt_cbc_256 ENDP



; Extra register bindings for the CTR generator; both are pushed on entry
; and popped on exit by gen_aes_ctr_func.
ctrCtx      textequ <esi>
CTR         textequ <ebx>

; ---------------------------------------------------------------------------
; gen_aes_ctr_func rnds
;   Emits the body of a CTR routine. Stack arguments are read from
;   [esp + 4*5 + n*4] (four saved registers plus the return address):
;     0: CTR context pointer   1: output pointer
;     4: input pointer         5: input length in bytes
;   From the CTR context the code reads a pointer at offset 4 (the AES
;   context; its key schedule is taken to start at +44) and a 16-byte counter
;   block at offset 8.
;   NOTE(review): those offsets are assumptions about the C structures -
;   confirm against their definitions.
;   Seven counter blocks, already XORed with round key 0, are kept in a
;   16-byte-aligned scratch area on the stack. Only the last 32-bit word of
;   the counter block is incremented (as a big-endian value, via BSWAP); no
;   carry propagates into the upper 96 bits.
;   Always returns 0 in eax.
; ---------------------------------------------------------------------------
gen_aes_ctr_func MACRO rnds

LOCAL   loop7
LOCAL   loop1
LOCAL   enc1
LOCAL   bail

    push    inputLen
    push    ctrCtx
    push    CTR
    push    ebp

    mov     ctrCtx,     [esp + 4*5 + 0*4]
    mov     output,     [esp + 4*5 + 1*4]
    mov     input,      [esp + 4*5 + 4*4]
    mov     inputLen,   [esp + 4*5 + 5*4]

    mov     ctx, [4+ctrCtx]
    lea     ctx, [44+ctx]

    ; Reserve an aligned 7*16-byte scratch area; ebp remembers the
    ; pre-alignment stack pointer.
    mov     ebp, esp
    sub     esp, 7*16
    and     esp, -16

    movdqu  xmm0, [8+ctrCtx]
    ; From here on ctrCtx holds the host-order value of the counter's last
    ; 32-bit word, not the context pointer.
    mov     ctrCtx, [ctrCtx + 8 + 3*4]
    bswap   ctrCtx
    movdqu  xmm1, [ctx + 0*16]

    pxor    xmm0, xmm1              ; pre-apply round key 0

    movdqa  [esp + 0*16], xmm0
    movdqa  [esp + 1*16], xmm0
    movdqa  [esp + 2*16], xmm0
    movdqa  [esp + 3*16], xmm0
    movdqa  [esp + 4*16], xmm0
    movdqa  [esp + 5*16], xmm0
    movdqa  [esp + 6*16], xmm0

    ; Patch the last word of slots 1..6 with successive counter values,
    ; each XORed with the matching word of round key 0.
    inc     ctrCtx
    mov     CTR, ctrCtx
    bswap   CTR
    xor     CTR, [ctx + 3*4]
    mov     [esp + 1*16 + 3*4], CTR

    inc     ctrCtx
    mov     CTR, ctrCtx
    bswap   CTR
    xor     CTR, [ctx + 3*4]
    mov     [esp + 2*16 + 3*4], CTR

    inc     ctrCtx
    mov     CTR, ctrCtx
    bswap   CTR
    xor     CTR, [ctx + 3*4]
    mov     [esp + 3*16 + 3*4], CTR

    inc     ctrCtx
    mov     CTR, ctrCtx
    bswap   CTR
    xor     CTR, [ctx + 3*4]
    mov     [esp + 4*16 + 3*4], CTR

    inc     ctrCtx
    mov     CTR, ctrCtx
    bswap   CTR
    xor     CTR, [ctx + 3*4]
    mov     [esp + 5*16 + 3*4], CTR

    inc     ctrCtx
    mov     CTR, ctrCtx
    bswap   CTR
    xor     CTR, [ctx + 3*4]
    mov     [esp + 6*16 + 3*4], CTR


loop7:
    cmp     inputLen, 7*16
    jb      loop1

    movdqu  xmm0, [0*16 + esp]
    movdqu  xmm1, [1*16 + esp]
    movdqu  xmm2, [2*16 + esp]
    movdqu  xmm3, [3*16 + esp]
    movdqu  xmm4, [4*16 + esp]
    movdqu  xmm5, [5*16 + esp]
    movdqu  xmm6, [6*16 + esp]

    ; Rounds 1..7 are interleaved with refreshing scratch slot i-1 for the
    ; next batch of seven counters.
    i = 1
    WHILE i LE 7
        aes_rnd i

        inc     ctrCtx
        mov     CTR, ctrCtx
        bswap   CTR
        xor     CTR, [ctx + 3*4]
        mov     [esp + (i-1)*16 + 3*4], CTR

        i = i+1
        ENDM
    ; Remaining rounds (i continues from 8).
    WHILE i LT rnds
        aes_rnd i
        i = i+1
        ENDM
    aes_last_rnd rnds

    ; XOR the keystream with the input.
    movdqu  xmm7, [0*16 + input]
    pxor    xmm0, xmm7
    movdqu  xmm7, [1*16 + input]
    pxor    xmm1, xmm7
    movdqu  xmm7, [2*16 + input]
    pxor    xmm2, xmm7
    movdqu  xmm7, [3*16 + input]
    pxor    xmm3, xmm7
    movdqu  xmm7, [4*16 + input]
    pxor    xmm4, xmm7
    movdqu  xmm7, [5*16 + input]
    pxor    xmm5, xmm7
    movdqu  xmm7, [6*16 + input]
    pxor    xmm6, xmm7

    movdqu  [0*16 + output], xmm0
    movdqu  [1*16 + output], xmm1
    movdqu  [2*16 + output], xmm2
    movdqu  [3*16 + output], xmm3
    movdqu  [4*16 + output], xmm4
    movdqu  [5*16 + output], xmm5
    movdqu  [6*16 + output], xmm6

    lea     input, [7*16 + input]
    lea     output, [7*16 + output]
    sub     inputLen, 7*16
    jmp     loop7


loop1:
    cmp     inputLen, 1*16
    jb      bail

    ; Consume the next prepared counter block by advancing esp through the
    ; scratch area. At most six blocks reach this loop, so esp stays inside
    ; the seven reserved slots.
    movdqu  xmm0, [esp]
    add     esp, 16

    i = 1
    WHILE i LT rnds
        movdqu  xmm7, [i*16 + ctx]
        aesenc  xmm0, xmm7
        i = i+1
        ENDM
    movdqu  xmm7, [rnds*16 + ctx]
    aesenclast xmm0, xmm7

    movdqu  xmm7, [input]
    pxor    xmm0, xmm7
    movdqu  [output], xmm0

    lea     input, [1*16 + input]
    lea     output, [1*16 + output]
    sub     inputLen, 1*16
    jmp     loop1

bail:

    ; Reload the CTR context pointer (ebp still addresses the entry frame)
    ; and store the next unused counter block, undoing the round-key-0 XOR.
    mov     ctrCtx, [ebp + 4*5 + 0*4]
    movdqu  xmm0, [esp]
    movdqu  xmm1, [ctx + 0*16]
    pxor    xmm0, xmm1
    movdqu  [8+ctrCtx], xmm0


    xor     eax, eax
    mov     esp, ebp
    pop     ebp
    pop     CTR
    pop     ctrCtx
    pop     inputLen
    ret
ENDM


ALIGN 16
intel_aes_encrypt_ctr_128 PROC
gen_aes_ctr_func  10
intel_aes_encrypt_ctr_128 ENDP

ALIGN 16
intel_aes_encrypt_ctr_192 PROC
gen_aes_ctr_func  12
intel_aes_encrypt_ctr_192 ENDP

ALIGN 16
intel_aes_encrypt_ctr_256 PROC
gen_aes_ctr_func  14
intel_aes_encrypt_ctr_256 ENDP


END