; LICENSE:
; This submission to NSS is to be made available under the terms of the
; Mozilla Public License, v. 2.0. You can obtain one at http:
; //mozilla.org/MPL/2.0/.
;###############################################################################
; Copyright(c) 2014, Intel Corp.
; Developers and authors:
; Shay Gueron and Vlad Krasnov
; Intel Corporation, Israel Development Centre, Haifa, Israel
; Please send feedback directly to crypto.feedback.alias@intel.com

; AES-NI implementations of AES-128/192/256 key expansion and of ECB, CBC and
; CTR processing for x64 MASM.
;
; NOTE(review): this file was recovered from a line-collapsed annotate dump.
; Per-line annotation markers were removed, and every TEXTEQU operand (which
; was missing from the dump) was restored from how the alias is used. Those
; restorations are marked individually below and should be confirmed against
; the upstream file before merging.


.DATA
ALIGN 16
; pshufb control vectors and aesenclast operands used by the key schedules.
Lmask       dd 0c0f0e0dh, 0c0f0e0dh, 0c0f0e0dh, 0c0f0e0dh       ; 128-bit schedule shuffle
Lmask192    dd 004070605h, 004070605h, 004070605h, 004070605h   ; 192-bit schedule shuffle
Lmask256    dd 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh   ; 256-bit schedule shuffle
Lcon1       dd 1, 1, 1, 1               ; first round constant (doubled with pslld)
Lcon2       dd 1bh, 1bh, 1bh, 1bh       ; round constant loaded for the last two AES-128 keys

.CODE

; Register aliases for the ECB/CBC/CTR entry points.
; NOTE(review): operands restored. ctx/output are the 1st/2nd integer argument
; registers; input/inputLen are scratch registers that receive the 5th/6th
; arguments from the stack (see load_stack_args). Confirm against upstream.
ctx         textequ <rcx>
output      textequ <rdx>
input       textequ <r8>
inputLen    textequ <r9d>


;-------------------------------------------------------------------------------
; Helper macros
;-------------------------------------------------------------------------------

; Load round key number i from [i*16 + ctx] into xmm8 and apply instruction
; `inst` with it to each of xmm0..xmm7.
aes_op8 MACRO inst, i
        movdqu  xmm8, [i*16 + ctx]
        inst    xmm0, xmm8
        inst    xmm1, xmm8
        inst    xmm2, xmm8
        inst    xmm3, xmm8
        inst    xmm4, xmm8
        inst    xmm5, xmm8
        inst    xmm6, xmm8
        inst    xmm7, xmm8
ENDM

; One middle encryption round on eight blocks.
aes_rnd MACRO i
        aes_op8 aesenc, i
ENDM

; Final encryption round on eight blocks.
aes_last_rnd MACRO i
        aes_op8 aesenclast, i
ENDM

; One middle decryption round on eight blocks.
aes_dec_rnd MACRO i
        aes_op8 aesdec, i
ENDM

; Final decryption round on eight blocks.
aes_dec_last_rnd MACRO i
        aes_op8 aesdeclast, i
ENDM

; xmm0..xmm7 <- eight consecutive 16-byte blocks starting at [src].
load8 MACRO src
        movdqu  xmm0, [0*16 + src]
        movdqu  xmm1, [1*16 + src]
        movdqu  xmm2, [2*16 + src]
        movdqu  xmm3, [3*16 + src]
        movdqu  xmm4, [4*16 + src]
        movdqu  xmm5, [5*16 + src]
        movdqu  xmm6, [6*16 + src]
        movdqu  xmm7, [7*16 + src]
ENDM

; Eight consecutive 16-byte blocks starting at [dst] <- xmm0..xmm7.
store8 MACRO dst
        movdqu  [0*16 + dst], xmm0
        movdqu  [1*16 + dst], xmm1
        movdqu  [2*16 + dst], xmm2
        movdqu  [3*16 + dst], xmm3
        movdqu  [4*16 + dst], xmm4
        movdqu  [5*16 + dst], xmm5
        movdqu  [6*16 + dst], xmm6
        movdqu  [7*16 + dst], xmm7
ENDM

; Advance the input/output pointers by n blocks and reduce the remaining length.
next_blocks MACRO n
        lea     input,  [n*16 + input]
        lea     output, [n*16 + output]
        sub     inputLen, n*16
ENDM

; Fetch the 5th and 6th arguments. Must run before rsp is changed: at entry
; the return address is at [rsp] and 4*8 bytes of home space follow it.
load_stack_args MACRO
        mov     input,    [rsp + 1*8 + 8*4]
        mov     inputLen, [rsp + 1*8 + 8*5]
ENDM

; Spill xmm6..xmm8 (clobbered below) to 48 bytes of fresh stack.
push_xmm6to8 MACRO
        sub     rsp, 3*16
        movdqu  [rsp + 0*16], xmm6
        movdqu  [rsp + 1*16], xmm7
        movdqu  [rsp + 2*16], xmm8
ENDM

; Undo push_xmm6to8.
pop_xmm6to8 MACRO
        movdqu  xmm6, [rsp + 0*16]
        movdqu  xmm7, [rsp + 1*16]
        movdqu  xmm8, [rsp + 2*16]
        add     rsp, 3*16
ENDM


;-------------------------------------------------------------------------------
; ECB
;-------------------------------------------------------------------------------

; Body of an ECB routine. enc = 1 selects the encrypt instructions, anything
; else the decrypt instructions; rnds is the number of AES rounds (10/12/14).
; Round keys are read starting at ctx+48. Processes eight blocks at a time
; while at least 8*16 bytes remain, then one block at a time while at least
; 16 bytes remain. Returns 0 in rax.
gen_aes_ecb_func MACRO enc, rnds

LOCAL   loop8
LOCAL   loop1
LOCAL   bail

        ; The original zeroed inputLen before loading it; the load overwrites
        ; the register, so that instruction was dead and is omitted.
        load_stack_args
        push_xmm6to8

        lea     ctx, [48+ctx]           ; ctx now addresses round key 0

; NOTE(review): the eight operands below were missing in the recovered text;
; they are restored from the four round macros defined above and from how
; aesinst/aeslastinst are used in the single-block loop.
IF enc eq 1
        rnd         textequ <aes_rnd>
        lastrnd     textequ <aes_last_rnd>
        aesinst     textequ <aesenc>
        aeslastinst textequ <aesenclast>
ELSE
        rnd         textequ <aes_dec_rnd>
        lastrnd     textequ <aes_dec_last_rnd>
        aesinst     textequ <aesdec>
        aeslastinst textequ <aesdeclast>
ENDIF

loop8:
        cmp     inputLen, 8*16
        jb      loop1

        load8   input
        aes_op8 pxor, 0                 ; whitening with round key 0

        i = 1
        WHILE i LT rnds
            rnd i
            i = i+1
        ENDM
        lastrnd rnds

        store8  output
        next_blocks 8
        jmp     loop8

loop1:
        cmp     inputLen, 1*16
        jb      bail

        movdqu  xmm0, [input]
        movdqu  xmm7, [0*16 + ctx]
        pxor    xmm0, xmm7

        i = 1
        WHILE i LT rnds
            movdqu  xmm7, [i*16 + ctx]
            aesinst xmm0, xmm7
            i = i+1
        ENDM
        movdqu  xmm7, [rnds*16 + ctx]
        aeslastinst xmm0, xmm7

        movdqu  [output], xmm0

        next_blocks 1
        jmp     loop1

bail:
        xor     rax, rax
        pop_xmm6to8
        ret
ENDM

intel_aes_encrypt_ecb_128 PROC
gen_aes_ecb_func 1, 10
intel_aes_encrypt_ecb_128 ENDP

intel_aes_encrypt_ecb_192 PROC
gen_aes_ecb_func 1, 12
intel_aes_encrypt_ecb_192 ENDP

intel_aes_encrypt_ecb_256 PROC
gen_aes_ecb_func 1, 14
intel_aes_encrypt_ecb_256 ENDP

intel_aes_decrypt_ecb_128 PROC
gen_aes_ecb_func 0, 10
intel_aes_decrypt_ecb_128 ENDP

intel_aes_decrypt_ecb_192 PROC
gen_aes_ecb_func 0, 12
intel_aes_decrypt_ecb_192 ENDP

intel_aes_decrypt_ecb_256 PROC
gen_aes_ecb_func 0, 14
intel_aes_decrypt_ecb_256 ENDP


;-------------------------------------------------------------------------------
; Key schedules
;-------------------------------------------------------------------------------

; Register aliases for the key-schedule routines.
; NOTE(review): operands restored. KEY/KS are the 1st/2nd integer argument
; registers (source key, destination round-key array); ITR is a 64-bit scratch
; register used both as table address and as loop counter. Confirm upstream.
KEY         textequ <rcx>
KS          textequ <rdx>
ITR         textequ <r8>

; acc ^= (acc << 32) ^ (acc << 64) ^ (acc << 96), using tmp as scratch.
ks_fold MACRO acc, tmp
        movdqa  tmp, acc
        pslldq  tmp, 4
        pxor    acc, tmp
        pslldq  tmp, 4
        pxor    acc, tmp
        pslldq  tmp, 4
        pxor    acc, tmp
ENDM

; One AES-192 schedule step: advances xmm1 (four key words) and the low half
; of xmm3 (two key words) using mask xmm4 and round constant xmm0, then
; doubles the round constant. Clobbers xmm2, xmm6, xmm7.
ks192_step MACRO
        movdqa  xmm2, xmm3
        pshufb  xmm2, xmm4
        aesenclast xmm2, xmm0
        pslld   xmm0, 1

        movdqa  xmm6, xmm1
        movdqa  xmm7, xmm3
        pslldq  xmm6, 4
        pslldq  xmm7, 4
        pxor    xmm1, xmm6
        pxor    xmm3, xmm7
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pslldq  xmm6, 4
        pxor    xmm1, xmm6
        pxor    xmm1, xmm2
        pshufd  xmm2, xmm1, 0ffh
        pxor    xmm3, xmm2
ENDM

; Body of a decryption key-schedule routine: run the matching encryption
; schedule (encinit), then reverse the order of the rnds+1 round keys and
; apply aesimc to every key except the first and last.
gen_aes_decrypt_init MACRO encinit, rnds
        ; encinit may modify KS/KEY, so keep copies across the call.
        push    KS
        push    KEY

        call    encinit

        pop     KEY
        pop     KS

        ; Swap the outermost keys unchanged.
        movdqu  xmm0, [0*16 + KS]
        movdqu  xmm1, [rnds*16 + KS]
        movdqu  [rnds*16 + KS], xmm0
        movdqu  [0*16 + KS], xmm1

        ; Swap the remaining pairs, transforming both.
        i = 1
        WHILE i LT (rnds/2)
            movdqu  xmm0, [i*16 + KS]
            movdqu  xmm1, [(rnds-i)*16 + KS]

            aesimc  xmm0, xmm0
            aesimc  xmm1, xmm1

            movdqu  [(rnds-i)*16 + KS], xmm0
            movdqu  [i*16 + KS], xmm1

            i = i+1
        ENDM

        ; The middle key stays in place.
        movdqu  xmm0, [(rnds/2)*16 + KS]
        aesimc  xmm0, xmm0
        movdqu  [(rnds/2)*16 + KS], xmm0
        ret
ENDM


; Expand the 16-byte key at [KEY] into 11 round keys at [KS].
; Clobbers xmm0..xmm4, ITR and KS.
intel_aes_encrypt_init_128 PROC

        movdqu  xmm1, [KEY]
        movdqu  [KS], xmm1              ; round key 0 is the key itself
        movdqa  xmm2, xmm1

        lea     ITR, Lcon1
        movdqa  xmm0, [ITR]
        lea     ITR, Lmask
        movdqa  xmm4, [ITR]

        mov     ITR, 8

        ; Round keys 1..8.
Lenc_128_ks_loop:
        lea     KS, [16 + KS]

        pshufb  xmm2, xmm4
        aesenclast xmm2, xmm0
        pslld   xmm0, 1                 ; next round constant
        ks_fold xmm1, xmm3
        pxor    xmm1, xmm2
        movdqu  [KS], xmm1
        movdqa  xmm2, xmm1

        ; The counter update originally sat at the top of the loop and relied
        ; on no later instruction touching the flags; it is kept next to the
        ; branch that consumes it.
        dec     ITR
        jnz     Lenc_128_ks_loop

        ; Round keys 9 and 10 use a fresh round constant.
        lea     ITR, Lcon2
        movdqa  xmm0, [ITR]

        pshufb  xmm2, xmm4
        aesenclast xmm2, xmm0
        pslld   xmm0, 1
        ks_fold xmm1, xmm3
        pxor    xmm1, xmm2
        movdqu  [16 + KS], xmm1
        movdqa  xmm2, xmm1

        pshufb  xmm2, xmm4
        aesenclast xmm2, xmm0
        ks_fold xmm1, xmm3
        pxor    xmm1, xmm2
        movdqu  [32 + KS], xmm1
        ; (a trailing copy of xmm1 into xmm2 was dead and is omitted)

        ret
intel_aes_encrypt_init_128 ENDP


; Decryption round keys for AES-128, derived from the encryption schedule.
intel_aes_decrypt_init_128 PROC
gen_aes_decrypt_init intel_aes_encrypt_init_128, 10
intel_aes_decrypt_init_128 ENDP


; Expand the 24-byte key at [KEY] into 13 round keys at [KS].
; xmm6/xmm7 are saved and restored; xmm0..xmm5, ITR and KS are clobbered.
intel_aes_encrypt_init_192 PROC

        sub     rsp, 16*2
        movdqu  [16*0 + rsp], xmm6
        movdqu  [16*1 + rsp], xmm7

        movdqu  xmm1, [KEY]             ; key bytes 0..15
        mov     ITR, [16 + KEY]         ; key bytes 16..23
        movd    xmm3, ITR

        movdqu  [KS], xmm1
        movdqa  xmm5, xmm3

        lea     ITR, Lcon1
        movdqu  xmm0, [ITR]
        lea     ITR, Lmask192
        movdqu  xmm4, [ITR]

        mov     ITR, 4

        ; Each iteration emits three round keys (48 bytes) from two steps.
Lenc_192_ks_loop:
        ks192_step

        ; Stitch the pending half key (xmm5) and the new words into two keys.
        movdqa  xmm6, xmm1
        shufpd  xmm5, xmm1, 00h
        shufpd  xmm6, xmm3, 01h

        movdqu  [16 + KS], xmm5
        movdqu  [32 + KS], xmm6

        ks192_step

        movdqu  [48 + KS], xmm1
        movdqa  xmm5, xmm3              ; half key carried into the next pass

        lea     KS, [48 + KS]

        dec     ITR
        jnz     Lenc_192_ks_loop

        ; The original stored xmm5 to [16 + KS] here. After four passes KS is
        ; at round key 12, so that store landed at schedule offset 208..223,
        ; past the last key any AES-192 routine in this file reads. Omitted.

        movdqu  xmm7, [16*1 + rsp]
        movdqu  xmm6, [16*0 + rsp]
        add     rsp, 16*2
        ret
intel_aes_encrypt_init_192 ENDP


; Decryption round keys for AES-192, derived from the encryption schedule.
intel_aes_decrypt_init_192 PROC
gen_aes_decrypt_init intel_aes_encrypt_init_192, 12
intel_aes_decrypt_init_192 ENDP


; Expand the 32-byte key at [KEY] into 15 round keys at [KS].
; xmm6/xmm7 are saved and restored; xmm0..xmm5, ITR and KS are clobbered.
intel_aes_encrypt_init_256 PROC
        sub     rsp, 16*2
        movdqu  [16*0 + rsp], xmm6
        movdqu  [16*1 + rsp], xmm7

        movdqu  xmm1, [16*0 + KEY]
        movdqu  xmm3, [16*1 + KEY]

        movdqu  [16*0 + KS], xmm1       ; round keys 0 and 1 are the key itself
        movdqu  [16*1 + KS], xmm3

        lea     ITR, Lcon1
        movdqu  xmm0, [ITR]
        lea     ITR, Lmask256
        movdqu  xmm5, [ITR]

        pxor    xmm6, xmm6              ; zero operand for the odd-key aesenclast

        mov     ITR, 6

        ; Each iteration emits one even and one odd round key.
Lenc_256_ks_loop:
        movdqa  xmm2, xmm3
        pshufb  xmm2, xmm5
        aesenclast xmm2, xmm0
        pslld   xmm0, 1                 ; next round constant
        ks_fold xmm1, xmm4
        pxor    xmm1, xmm2
        movdqu  [16*2 + KS], xmm1

        pshufd  xmm2, xmm1, 0ffh
        aesenclast xmm2, xmm6
        ks_fold xmm3, xmm4
        pxor    xmm3, xmm2
        movdqu  [16*3 + KS], xmm3

        lea     KS, [32 + KS]
        dec     ITR
        jnz     Lenc_256_ks_loop

        ; Round key 14.
        movdqa  xmm2, xmm3
        pshufb  xmm2, xmm5
        aesenclast xmm2, xmm0
        ks_fold xmm1, xmm4
        pxor    xmm1, xmm2
        movdqu  [16*2 + KS], xmm1

        movdqu  xmm7, [16*1 + rsp]
        movdqu  xmm6, [16*0 + rsp]
        add     rsp, 16*2
        ret

intel_aes_encrypt_init_256 ENDP


; Decryption round keys for AES-256, derived from the encryption schedule.
intel_aes_decrypt_init_256 PROC
gen_aes_decrypt_init intel_aes_encrypt_init_256, 14
intel_aes_decrypt_init_256 ENDP


;-------------------------------------------------------------------------------
; CBC
;-------------------------------------------------------------------------------

; Body of a CBC encryption routine with rnds rounds. The chaining value is
; read from ctx+16 on entry and written back there on exit; round keys are
; read starting at ctx+48. One block per iteration while at least 16 bytes
; remain. Returns 0 in rax.
gen_aes_cbc_enc_func MACRO rnds

LOCAL   loop1
LOCAL   bail

        load_stack_args
        push_xmm6to8

        lea     ctx, [48+ctx]           ; ctx now addresses round key 0

        movdqu  xmm0, [-32+ctx]         ; chaining value (original ctx+16)

        ; Keep round keys 0..5 in registers for the whole loop.
        movdqu  xmm2, [0*16 + ctx]
        movdqu  xmm3, [1*16 + ctx]
        movdqu  xmm4, [2*16 + ctx]
        movdqu  xmm5, [3*16 + ctx]
        movdqu  xmm6, [4*16 + ctx]
        movdqu  xmm7, [5*16 + ctx]

loop1:
        cmp     inputLen, 1*16
        jb      bail

        movdqu  xmm1, [input]
        pxor    xmm1, xmm2              ; plaintext ^ round key 0
        pxor    xmm0, xmm1              ; ... ^ previous ciphertext

        aesenc  xmm0, xmm3
        aesenc  xmm0, xmm4
        aesenc  xmm0, xmm5
        aesenc  xmm0, xmm6
        aesenc  xmm0, xmm7

        i = 6
        WHILE i LT rnds
            movdqu  xmm8, [i*16 + ctx]
            aesenc  xmm0, xmm8
            i = i+1
        ENDM
        movdqu  xmm8, [rnds*16 + ctx]
        aesenclast xmm0, xmm8

        movdqu  [output], xmm0

        next_blocks 1
        jmp     loop1

bail:
        movdqu  [-32+ctx], xmm0         ; save chaining value for the next call

        xor     rax, rax
        pop_xmm6to8
        ret

ENDM

; Body of a CBC decryption routine with rnds rounds. Same context offsets as
; the encryption body. Eight blocks at a time while at least 8*16 bytes
; remain, then single blocks. Ciphertext needed for chaining is re-read from
; input before the corresponding output is stored. Returns 0 in rax.
gen_aes_cbc_dec_func MACRO rnds

LOCAL   loop8
LOCAL   loop1
LOCAL   dec1
LOCAL   bail

        load_stack_args
        push_xmm6to8

        lea     ctx, [48+ctx]           ; ctx now addresses round key 0

loop8:
        cmp     inputLen, 8*16
        jb      dec1

        load8   input
        aes_op8 pxor, 0                 ; whitening with round key 0

        i = 1
        WHILE i LT rnds
            aes_dec_rnd i
            i = i+1
        ENDM
        aes_dec_last_rnd rnds

        ; Block 0 chains from the stored value, blocks 1..7 from the
        ; preceding ciphertext block.
        movdqu  xmm8, [-32 + ctx]
        pxor    xmm0, xmm8
        movdqu  xmm8, [0*16 + input]
        pxor    xmm1, xmm8
        movdqu  xmm8, [1*16 + input]
        pxor    xmm2, xmm8
        movdqu  xmm8, [2*16 + input]
        pxor    xmm3, xmm8
        movdqu  xmm8, [3*16 + input]
        pxor    xmm4, xmm8
        movdqu  xmm8, [4*16 + input]
        pxor    xmm5, xmm8
        movdqu  xmm8, [5*16 + input]
        pxor    xmm6, xmm8
        movdqu  xmm8, [6*16 + input]
        pxor    xmm7, xmm8
        movdqu  xmm8, [7*16 + input]    ; last ciphertext block = next chaining value

        store8  output
        movdqu  [-32 + ctx], xmm8

        next_blocks 8
        jmp     loop8

dec1:
        movdqu  xmm3, [-32 + ctx]       ; chaining value for the single-block loop

loop1:
        cmp     inputLen, 1*16
        jb      bail

        movdqu  xmm0, [input]
        movdqa  xmm4, xmm0              ; keep the ciphertext for chaining
        movdqu  xmm7, [0*16 + ctx]
        pxor    xmm0, xmm7

        i = 1
        WHILE i LT rnds
            movdqu  xmm7, [i*16 + ctx]
            aesdec  xmm0, xmm7
            i = i+1
        ENDM
        movdqu  xmm7, [rnds*16 + ctx]
        aesdeclast xmm0, xmm7
        pxor    xmm3, xmm0

        movdqu  [output], xmm3
        movdqa  xmm3, xmm4

        next_blocks 1
        jmp     loop1

bail:
        movdqu  [-32 + ctx], xmm3       ; save chaining value for the next call
        xor     rax, rax

        pop_xmm6to8
        ret
ENDM

intel_aes_encrypt_cbc_128 PROC
gen_aes_cbc_enc_func 10
intel_aes_encrypt_cbc_128 ENDP

intel_aes_encrypt_cbc_192 PROC
gen_aes_cbc_enc_func 12
intel_aes_encrypt_cbc_192 ENDP

intel_aes_encrypt_cbc_256 PROC
gen_aes_cbc_enc_func 14
intel_aes_encrypt_cbc_256 ENDP

intel_aes_decrypt_cbc_128 PROC
gen_aes_cbc_dec_func 10
intel_aes_decrypt_cbc_128 ENDP

intel_aes_decrypt_cbc_192 PROC
gen_aes_cbc_dec_func 12
intel_aes_decrypt_cbc_192 ENDP

intel_aes_decrypt_cbc_256 PROC
gen_aes_cbc_dec_func 14
intel_aes_decrypt_cbc_256 ENDP


;-------------------------------------------------------------------------------
; CTR
;-------------------------------------------------------------------------------

; Register aliases for the CTR routines.
; NOTE(review): operands restored. ctrCtx is a 64-bit scratch register holding
; the first argument; CTR and CTRSave are 32-bit scratch registers (they are
; used with DWORD PTR operands and bswap). CTRSave aliases eax, which is only
; zeroed for the return value after its last use. Confirm against upstream.
ctrCtx      textequ <r10>
CTR         textequ <r11d>
CTRSave     textequ <eax>

; Increment the running counter and write it, byte-swapped and XORed with the
; last dword of round key 0, into the last dword of stack counter block `slot`.
ctr_next MACRO slot
        inc     CTRSave
        mov     CTR, CTRSave
        bswap   CTR
        xor     CTR, DWORD PTR [ctx + 3*4]
        mov     DWORD PTR [rsp + (slot)*16 + 3*4], CTR
ENDM

; Body of a CTR routine with rnds rounds. The first argument points at a CTR
; context: the pointer at offset 8 is loaded and used like `ctx` in the other
; modes (round keys at +48), and the 16-byte counter block lives at offset 16.
; Only the last dword of the counter block is incremented (as a byte-swapped
; 32-bit value); the other 12 bytes are never modified here.
; Eight counter blocks, already XORed with round key 0, are kept in a
; 16-byte-aligned stack buffer. Returns 0 in rax.
gen_aes_ctr_func MACRO rnds

LOCAL   loop8
LOCAL   loop1
LOCAL   bail

        load_stack_args

        mov     ctrCtx, ctx
        mov     ctx, [8+ctrCtx]
        lea     ctx, [48+ctx]           ; ctx now addresses round key 0

        push_xmm6to8

        ; Aligned 8*16-byte buffer for the counter blocks; rbp remembers rsp.
        push    rbp
        mov     rbp, rsp
        sub     rsp, 8*16
        and     rsp, -16

        movdqu  xmm0, [16+ctrCtx]
        mov     CTRSave, DWORD PTR [ctrCtx + 16 + 3*4]
        bswap   CTRSave
        movdqu  xmm1, [ctx + 0*16]

        pxor    xmm0, xmm1              ; counter ^ round key 0

        movdqa  [rsp + 0*16], xmm0
        movdqa  [rsp + 1*16], xmm0
        movdqa  [rsp + 2*16], xmm0
        movdqa  [rsp + 3*16], xmm0
        movdqa  [rsp + 4*16], xmm0
        movdqa  [rsp + 5*16], xmm0
        movdqa  [rsp + 6*16], xmm0
        movdqa  [rsp + 7*16], xmm0

        ; Slots 1..7 get counter+1 .. counter+7.
        i = 1
        WHILE i LT 8
            ctr_next i
            i = i+1
        ENDM

loop8:
        cmp     inputLen, 8*16
        jb      loop1

        load8   rsp

        ; Rounds 1..8 are interleaved with refilling slots 0..7 with the
        ; counter values for the next pass.
        i = 1
        WHILE i LE 8
            aes_rnd i
            ctr_next (i-1)
            i = i+1
        ENDM
        WHILE i LT rnds
            aes_rnd i
            i = i+1
        ENDM
        aes_last_rnd rnds

        movdqu  xmm8, [0*16 + input]
        pxor    xmm0, xmm8
        movdqu  xmm8, [1*16 + input]
        pxor    xmm1, xmm8
        movdqu  xmm8, [2*16 + input]
        pxor    xmm2, xmm8
        movdqu  xmm8, [3*16 + input]
        pxor    xmm3, xmm8
        movdqu  xmm8, [4*16 + input]
        pxor    xmm4, xmm8
        movdqu  xmm8, [5*16 + input]
        pxor    xmm5, xmm8
        movdqu  xmm8, [6*16 + input]
        pxor    xmm6, xmm8
        movdqu  xmm8, [7*16 + input]
        pxor    xmm7, xmm8

        store8  output
        next_blocks 8
        jmp     loop8


loop1:
        cmp     inputLen, 1*16
        jb      bail

        ; Consume the next prepared counter block by walking rsp through the
        ; buffer; rsp is restored from rbp at bail.
        movdqu  xmm0, [rsp]
        add     rsp, 16

        i = 1
        WHILE i LT rnds
            movdqu  xmm7, [i*16 + ctx]
            aesenc  xmm0, xmm7
            i = i+1
        ENDM
        movdqu  xmm7, [rnds*16 + ctx]
        aesenclast xmm0, xmm7

        movdqu  xmm7, [input]
        pxor    xmm0, xmm7
        movdqu  [output], xmm0

        next_blocks 1
        jmp     loop1

bail:
        ; Write the next unused counter back, undoing the round-key-0 XOR.
        movdqu  xmm0, [rsp]
        movdqu  xmm1, [ctx + 0*16]
        pxor    xmm0, xmm1
        movdqu  [16+ctrCtx], xmm0

        xor     rax, rax
        mov     rsp, rbp
        pop     rbp

        pop_xmm6to8

        ret
ENDM


intel_aes_encrypt_ctr_128 PROC
gen_aes_ctr_func 10
intel_aes_encrypt_ctr_128 ENDP

intel_aes_encrypt_ctr_192 PROC
gen_aes_ctr_func 12
intel_aes_encrypt_ctr_192 ENDP

intel_aes_encrypt_ctr_256 PROC
gen_aes_ctr_func 14
intel_aes_encrypt_ctr_256 ENDP


END