security/nss/lib/freebl/intel-gcm-x86-masm.asm

author      Michael Schloh von Bennewitz <michael@schloh.com>
date        Thu, 22 Jan 2015 13:21:57 +0100
branch      TOR_BUG_9701
changeset   15:b8a032363ba2
permissions -rw-r--r--

Incorporate changes requested in the Mozilla review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

; LICENSE:
; This submission to NSS is to be made available under the terms of the
; Mozilla Public License, v. 2.0. You can obtain one at
; http://mozilla.org/MPL/2.0/.
;###############################################################################
; Copyright(c) 2014, Intel Corp.
; Developers and authors:
; Shay Gueron and Vlad Krasnov
; Intel Corporation, Israel Development Centre, Haifa, Israel
; Please send feedback directly to crypto.feedback.alias@intel.com

.MODEL FLAT, C
.XMM

.DATA
ALIGN 16
Lone            dq 1,0
Ltwo            dq 2,0
Lbswap_mask     db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
Lshuff_mask     dq 0f0f0f0f0f0f0f0fh, 0f0f0f0f0f0f0f0fh
Lpoly           dq 01h, 0c200000000000000h
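; Lone and Ltwo step a little-endian counter block by 1 and 2,
; Lbswap_mask reverses the 16 bytes of an XMM register (GHASH operates
; on big-endian data), and Lpoly (low qword 01h, high qword
; 0c200000000000000h) is the GCM reduction constant encoding the field
; polynomial x^128 + x^7 + x^2 + x + 1 in bit-reflected form.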
.CODE

GFMUL MACRO DST, SRC1, SRC2, TMP1, TMP2, TMP3, TMP4
    vpclmulqdq  TMP1, SRC2, SRC1, 0h
    vpclmulqdq  TMP4, SRC2, SRC1, 011h

    vpshufd     TMP2, SRC2, 78
    vpshufd     TMP3, SRC1, 78
    vpxor       TMP2, TMP2, SRC2
    vpxor       TMP3, TMP3, SRC1

    vpclmulqdq  TMP2, TMP2, TMP3, 0h
    vpxor       TMP2, TMP2, TMP1
    vpxor       TMP2, TMP2, TMP4

    vpslldq     TMP3, TMP2, 8
    vpsrldq     TMP2, TMP2, 8

    vpxor       TMP1, TMP1, TMP3
    vpxor       TMP4, TMP4, TMP2

    vpclmulqdq  TMP2, TMP1, [Lpoly], 010h
    vpshufd     TMP3, TMP1, 78
    vpxor       TMP1, TMP2, TMP3

    vpclmulqdq  TMP2, TMP1, [Lpoly], 010h
    vpshufd     TMP3, TMP1, 78
    vpxor       TMP1, TMP2, TMP3

    vpxor       DST, TMP1, TMP4

    ENDM
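; GFMUL computes DST = SRC1 * SRC2 in GF(2^128) with one Karatsuba
; carry-less multiplication (three VPCLMULQDQ instead of four),
; followed by a two-step folding reduction against Lpoly. A roughly
; equivalent C sketch using SSE/PCLMUL intrinsics (illustrative only,
; not part of this build):
;
;   __m128i gfmul(__m128i a, __m128i b)
;   {
;       __m128i lo  = _mm_clmulepi64_si128(a, b, 0x00);
;       __m128i hi  = _mm_clmulepi64_si128(a, b, 0x11);
;       __m128i am  = _mm_xor_si128(a, _mm_shuffle_epi32(a, 78));
;       __m128i bm  = _mm_xor_si128(b, _mm_shuffle_epi32(b, 78));
;       __m128i mid = _mm_clmulepi64_si128(am, bm, 0x00);
;       mid = _mm_xor_si128(mid, _mm_xor_si128(lo, hi));
;       lo  = _mm_xor_si128(lo, _mm_slli_si128(mid, 8));
;       hi  = _mm_xor_si128(hi, _mm_srli_si128(mid, 8));
;       const __m128i poly = _mm_set_epi64x(0xc200000000000000, 1);
;       for (int i = 0; i < 2; i++) {   /* fold the low half twice */
;           __m128i t = _mm_clmulepi64_si128(lo, poly, 0x10);
;           lo = _mm_xor_si128(_mm_shuffle_epi32(lo, 78), t);
;       }
;       return _mm_xor_si128(lo, hi);   /* reduced 128-bit product */
;   }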
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Generates the final GCM tag
; void intel_aes_gcmTAG(unsigned char Htbl[16*16],
;                       unsigned char *Tp,
;                       unsigned int Mlen,
;                       unsigned int Alen,
;                       unsigned char* X0,
;                       unsigned char* TAG);
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
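; The running GHASH value at *Tp is folded with the 128-bit length
; block (the bit lengths of the ciphertext and the AAD), multiplied
; once more by H, byte-swapped back to big-endian, and XORed with X0,
; which the caller prepared (typically the encrypted initial counter
; block), to produce the final 16-byte tag.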
ALIGN 16
intel_aes_gcmTAG PROC

Htbl    textequ <eax>
Tp      textequ <ecx>
X0      textequ <edx>
TAG     textequ <ebx>

T       textequ <xmm0>
TMP0    textequ <xmm1>

    push    ebx

    mov     Htbl,   [esp + 2*4 + 0*4]
    mov     Tp,     [esp + 2*4 + 1*4]
    mov     X0,     [esp + 2*4 + 4*4]
    mov     TAG,    [esp + 2*4 + 5*4]

    vzeroupper
    vmovdqu T, XMMWORD PTR[Tp]

    vpxor   TMP0, TMP0, TMP0
    vpinsrd TMP0, TMP0, DWORD PTR[esp + 2*4 + 2*4], 0
    vpinsrd TMP0, TMP0, DWORD PTR[esp + 2*4 + 3*4], 2
    vpsllq  TMP0, TMP0, 3
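    ; Mlen and Alen arrive as byte counts; shifting left by 3 converts
    ; them to the bit counts required in the GCM length block.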
    vpxor   T, T, TMP0
    vmovdqu TMP0, XMMWORD PTR[Htbl]
    GFMUL   T, T, TMP0, xmm2, xmm3, xmm4, xmm5

    vpshufb T, T, [Lbswap_mask]
    vpxor   T, T, [X0]
    vmovdqu XMMWORD PTR[TAG], T
    vzeroupper

    pop ebx

    ret

intel_aes_gcmTAG ENDP
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Generates the H table
; void intel_aes_gcmINIT(unsigned char Htbl[16*16], unsigned char *KS, int NR);
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmINIT PROC

Htbl    textequ <eax>
KS      textequ <ecx>
NR      textequ <edx>

T       textequ <xmm0>
TMP0    textequ <xmm1>

    mov     Htbl,   [esp + 4*1 + 0*4]
    mov     KS,     [esp + 4*1 + 1*4]
    mov     NR,     [esp + 4*1 + 2*4]

    vzeroupper
    ; AES-ENC(0)
    vmovdqu T, XMMWORD PTR[KS]
    lea KS, [16 + KS]
    dec NR
Lenc_loop:
        vaesenc T, T, [KS]
        lea KS, [16 + KS]
        dec NR
        jnz Lenc_loop

    vaesenclast T, T, [KS]
    vpshufb T, T, [Lbswap_mask]

    ; Calculate H' = GFMUL(H, 2)
    vpsrad  xmm3, T, 31
    vpshufd xmm3, xmm3, 0ffh
    vpand   xmm5, xmm3, [Lpoly]
    vpsrld  xmm3, T, 31
    vpslld  xmm4, T, 1
    vpslldq xmm3, xmm3, 4
    vpxor   T, xmm4, xmm3
    vpxor   T, T, xmm5
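; The sequence above multiplies H by x in GF(2^128) (the "GFMUL(H, 2)"
; doubling): vpslld/vpsrld/vpslldq implement a 1-bit left shift across
; the four dwords, while vpsrad + vpshufd broadcast the shifted-out top
; bit into a mask that conditionally XORs in Lpoly. Roughly, in
; illustrative C:
;
;   carry = H >> 127;
;   H     = (H << 1) ^ (carry ? POLY : 0);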
    vmovdqu TMP0, T
    vmovdqu XMMWORD PTR[Htbl + 0*16], T

    vpshufd xmm2, T, 78
    vpxor   xmm2, xmm2, T
    vmovdqu XMMWORD PTR[Htbl + 8*16 + 0*16], xmm2

    i = 1
    WHILE i LT 8
        GFMUL   T, T, TMP0, xmm2, xmm3, xmm4, xmm5
        vmovdqu XMMWORD PTR[Htbl + i*16], T
        vpshufd xmm2, T, 78
        vpxor   xmm2, xmm2, T
        vmovdqu XMMWORD PTR[Htbl + 8*16 + i*16], xmm2
        i = i+1
        ENDM
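; The loop above fills the two halves of Htbl: entries 0..7 hold the
; powers H^1..H^8 (in bit-reflected GHASH representation) and entries
; 8..15 hold, for each power, the XOR of its high and low qwords
; (replicated in both qwords), which KARATSUBA/KARATSUBA_AAD use as the
; precomputed middle-term operand. Illustrative C, with hypothetical
; helpers hi64/lo64 and the gfmul sketched earlier:
;
;   Htbl[0]  = H;
;   Khalf[0] = hi64(H) ^ lo64(H);
;   for (i = 1; i < 8; i++) {
;       Htbl[i]  = gfmul(Htbl[i-1], H);
;       Khalf[i] = hi64(Htbl[i]) ^ lo64(Htbl[i]);
;   }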
    vzeroupper
    ret
intel_aes_gcmINIT ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Authenticate only
; void intel_aes_gcmAAD(unsigned char Htbl[16*16], unsigned char *AAD, unsigned int Alen, unsigned char *Tp);
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmAAD PROC

Htbl    textequ <eax>
inp     textequ <ecx>
len     textequ <edx>
Tp      textequ <ebx>
hlp0    textequ <esi>

DATA    textequ <xmm0>
T       textequ <xmm1>
TMP0    textequ <xmm2>
TMP1    textequ <xmm3>
TMP2    textequ <xmm4>
TMP3    textequ <xmm5>
TMP4    textequ <xmm6>
Xhi     textequ <xmm7>

KARATSUBA_AAD MACRO i
    vpclmulqdq  TMP3, DATA, [Htbl + i*16], 0h
    vpxor       TMP0, TMP0, TMP3
    vpclmulqdq  TMP3, DATA, [Htbl + i*16], 011h
    vpxor       TMP1, TMP1, TMP3
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP3, TMP3, [Htbl + 8*16 + i*16], 0h
    vpxor       TMP2, TMP2, TMP3
ENDM
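; KARATSUBA_AAD accumulates one block's three carry-less partial
; products into TMP0 (low), TMP1 (high) and TMP2 (middle) without
; reducing. The caller multiplies up to eight blocks by descending
; powers of H and performs a single deferred reduction, i.e. it
; computes
;     X' = ((X + C0)*H^8) + C1*H^7 + ... + C7*H
; with one reduction instead of eight.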
    cmp   DWORD PTR[esp + 1*4 + 2*4], 0
    jnz   LbeginAAD
    ret

LbeginAAD:
    push    ebx
    push    esi

    mov     Htbl,   [esp + 4*3 + 0*4]
    mov     inp,    [esp + 4*3 + 1*4]
    mov     len,    [esp + 4*3 + 2*4]
    mov     Tp,     [esp + 4*3 + 3*4]

    vzeroupper

    vpxor   Xhi, Xhi, Xhi

    vmovdqu T, XMMWORD PTR[Tp]
    ; We hash 8 blocks each iteration; if the total number of blocks is
    ; not a multiple of 8, we hash the first len mod 8 blocks first
    mov hlp0, len
    and hlp0, 128-1
    jz  Lmod_loop

    and len, -128
    sub hlp0, 16

    ; Prefix block
    vmovdqu DATA, XMMWORD PTR[inp]
    vpshufb DATA, DATA, [Lbswap_mask]
    vpxor   DATA, DATA, T

    vpclmulqdq  TMP0, DATA, XMMWORD PTR[Htbl + hlp0], 0h
    vpclmulqdq  TMP1, DATA, XMMWORD PTR[Htbl + hlp0], 011h
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP2, TMP3, XMMWORD PTR[Htbl + 8*16 + hlp0], 0h

    lea     inp, [inp+16]
    test    hlp0, hlp0
    jnz     Lpre_loop
    jmp     Lred1

    ; Hash remaining prefix blocks (up to 7 total prefix blocks)
Lpre_loop:
        sub hlp0, 16

        vmovdqu DATA, XMMWORD PTR[inp]
        vpshufb DATA, DATA, [Lbswap_mask]

        vpclmulqdq  TMP3, DATA, XMMWORD PTR[Htbl + hlp0], 0h
        vpxor       TMP0, TMP0, TMP3
        vpclmulqdq  TMP3, DATA, XMMWORD PTR[Htbl + hlp0], 011h
        vpxor       TMP1, TMP1, TMP3
        vpshufd     TMP3, DATA, 78
        vpxor       TMP3, TMP3, DATA
        vpclmulqdq  TMP3, TMP3, XMMWORD PTR[Htbl + 8*16 + hlp0], 0h
        vpxor       TMP2, TMP2, TMP3

        test    hlp0, hlp0
        lea     inp, [inp+16]
        jnz     Lpre_loop

Lred1:

    vpxor       TMP2, TMP2, TMP0
    vpxor       TMP2, TMP2, TMP1
    vpsrldq     TMP3, TMP2, 8
    vpslldq     TMP2, TMP2, 8

    vpxor       Xhi, TMP1, TMP3
    vpxor       T, TMP0, TMP2
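; The three Karatsuba accumulators are recombined here into a 256-bit
; product: the folded middle term is split and shifted into the low
; half (T) and the high half (Xhi); reducing Xhi modulo Lpoly is
; deferred to the next pass of Lmod_loop, or to Ldone.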
Lmod_loop:

        sub len, 16*8
        jb  Ldone
        ; Block #0
        vmovdqu DATA, XMMWORD PTR[inp + 16*7]
        vpshufb DATA, DATA, XMMWORD PTR[Lbswap_mask]

        vpclmulqdq  TMP0, DATA, XMMWORD PTR[Htbl + 0*16], 0h
        vpclmulqdq  TMP1, DATA, XMMWORD PTR[Htbl + 0*16], 011h
        vpshufd     TMP3, DATA, 78
        vpxor       TMP3, TMP3, DATA
        vpclmulqdq  TMP2, TMP3, XMMWORD PTR[Htbl + 8*16 + 0*16], 0h

        ; Block #1
        vmovdqu DATA, XMMWORD PTR[inp + 16*6]
        vpshufb DATA, DATA, [Lbswap_mask]
        KARATSUBA_AAD 1

        ; Block #2
        vmovdqu DATA, XMMWORD PTR[inp + 16*5]
        vpshufb DATA, DATA, [Lbswap_mask]

        vpclmulqdq  TMP4, T, [Lpoly], 010h          ; reduction stage 1a
        vpalignr    T, T, T, 8

        KARATSUBA_AAD 2

        vpxor       T, T, TMP4                      ; reduction stage 1b

        ; Block #3
        vmovdqu DATA, XMMWORD PTR[inp + 16*4]
        vpshufb DATA, DATA, [Lbswap_mask]
        KARATSUBA_AAD 3
        ; Block #4
        vmovdqu DATA, XMMWORD PTR[inp + 16*3]
        vpshufb DATA, DATA, [Lbswap_mask]

        vpclmulqdq  TMP4, T, [Lpoly], 010h          ; reduction stage 2a
        vpalignr    T, T, T, 8

        KARATSUBA_AAD 4

        vpxor       T, T, TMP4                      ; reduction stage 2b
        ; Block #5
        vmovdqu DATA, XMMWORD PTR[inp + 16*2]
        vpshufb DATA, DATA, [Lbswap_mask]
        KARATSUBA_AAD 5

        vpxor   T, T, Xhi                           ; reduction finalize
        ; Block #6
        vmovdqu DATA, XMMWORD PTR[inp + 16*1]
        vpshufb DATA, DATA, [Lbswap_mask]
        KARATSUBA_AAD 6
        ; Block #7
        vmovdqu DATA, XMMWORD PTR[inp + 16*0]
        vpshufb DATA, DATA, [Lbswap_mask]
        vpxor   DATA, DATA, T
        KARATSUBA_AAD 7
        ; Aggregated 8 blocks, now the Karatsuba fixup
        vpxor   TMP2, TMP2, TMP0
        vpxor   TMP2, TMP2, TMP1
        vpsrldq TMP3, TMP2, 8
        vpslldq TMP2, TMP2, 8

        vpxor   Xhi, TMP1, TMP3
        vpxor   T, TMP0, TMP2

        lea inp, [inp + 16*8]
        jmp Lmod_loop
Ldone:
    vpclmulqdq  TMP4, T, [Lpoly], 010h
    vpalignr    T, T, T, 8
    vpxor       T, T, TMP4

    vpclmulqdq  TMP4, T, [Lpoly], 010h
    vpalignr    T, T, T, 8
    vpxor       T, T, TMP4

    vpxor       T, T, Xhi
    vmovdqu     XMMWORD PTR[Tp], T
    vzeroupper

    pop esi
    pop ebx
    ret

intel_aes_gcmAAD ENDP
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Encrypt and Authenticate
; void intel_aes_gcmENC(unsigned char* PT, unsigned char* CT, void *Gctx, unsigned int len);
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmENC PROC

PT      textequ <eax>
CT      textequ <ecx>
Htbl    textequ <edx>
Gctx    textequ <edx>
len     textequ <DWORD PTR[ebp + 5*4 + 3*4]>
KS      textequ <esi>
NR      textequ <DWORD PTR[-40 + KS]>

aluCTR  textequ <ebx>
aluTMP  textequ <edi>

T       textequ <XMMWORD PTR[16*16 + 1*16 + Gctx]>
TMP0    textequ <xmm1>
TMP1    textequ <xmm2>
TMP2    textequ <xmm3>
TMP3    textequ <xmm4>
TMP4    textequ <xmm5>
TMP5    textequ <xmm6>

CTR0    textequ <xmm0>
CTR1    textequ <xmm1>
CTR2    textequ <xmm2>
CTR3    textequ <xmm3>
CTR4    textequ <xmm4>
CTR5    textequ <xmm5>
CTR6    textequ <xmm6>

ROUND MACRO i
    vmovdqu xmm7, XMMWORD PTR[i*16 + KS]
    vaesenc CTR0, CTR0, xmm7
    vaesenc CTR1, CTR1, xmm7
    vaesenc CTR2, CTR2, xmm7
    vaesenc CTR3, CTR3, xmm7
    vaesenc CTR4, CTR4, xmm7
    vaesenc CTR5, CTR5, xmm7
    vaesenc CTR6, CTR6, xmm7
ENDM

KARATSUBA MACRO i
    vpshufd TMP4, TMP5, 78
    vpxor   TMP4, TMP4, TMP5
    vpclmulqdq  TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
    vpxor       TMP0, TMP0, TMP3
    vmovdqu     TMP4, XMMWORD PTR[i*16 + Htbl]
    vpclmulqdq  TMP3, TMP5, TMP4, 011h
    vpxor       TMP1, TMP1, TMP3
    vpclmulqdq  TMP3, TMP5, TMP4, 000h
    vpxor       TMP2, TMP2, TMP3
ENDM

NEXTCTR MACRO i
    add     aluCTR, 1
    mov     aluTMP, aluCTR
    bswap   aluTMP
    xor     aluTMP, [3*4 + KS]
    mov     [3*4 + 8*16 + i*16 + esp], aluTMP
ENDM
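; ROUND i applies round key i to all seven in-flight counter blocks.
; KARATSUBA performs the same three-multiply accumulation as
; KARATSUBA_AAD above, reading the hashed block from TMP5.
; NEXTCTR maintains the 32-bit counter word in a general-purpose
; register: it increments in host byte order, byte-swaps to the
; big-endian form GCM uses, and XORs in the last dword of round key 0,
; so the counter blocks parked on the stack are already whitened with
; the first round key.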
    cmp DWORD PTR[1*4 + 3*4 + esp], 0
    jne LbeginENC
    ret

LbeginENC:

    vzeroupper
    push    ebp
    push    ebx
    push    esi
    push    edi

    mov ebp, esp
    sub esp, 16*16
    and esp, -16

    mov PT, [ebp + 5*4 + 0*4]
    mov CT, [ebp + 5*4 + 1*4]
    mov Gctx, [ebp + 5*4 + 2*4]

    mov     KS, [16*16 + 3*16 + Gctx]
    lea     KS, [44 + KS]

    mov     aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
    bswap   aluCTR

    vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
    vpxor   TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    vmovdqu XMMWORD PTR[8*16 + 0*16 + esp], TMP0

    cmp len, 16*7
    jb  LEncDataSingles
; Prepare the "top" counters
    vmovdqu XMMWORD PTR[8*16 + 1*16 + esp], TMP0
    vmovdqu XMMWORD PTR[8*16 + 2*16 + esp], TMP0
    vmovdqu XMMWORD PTR[8*16 + 3*16 + esp], TMP0
    vmovdqu XMMWORD PTR[8*16 + 4*16 + esp], TMP0
    vmovdqu XMMWORD PTR[8*16 + 5*16 + esp], TMP0
    vmovdqu XMMWORD PTR[8*16 + 6*16 + esp], TMP0

    vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
; Encrypt the initial 7 blocks
    sub len, 16*7
    vpaddd  CTR1, CTR0, XMMWORD PTR[Lone]
    vpaddd  CTR2, CTR0, XMMWORD PTR[Ltwo]
    vpaddd  CTR3, CTR2, XMMWORD PTR[Lone]
    vpaddd  CTR4, CTR2, XMMWORD PTR[Ltwo]
    vpaddd  CTR5, CTR4, XMMWORD PTR[Lone]
    vpaddd  CTR6, CTR4, XMMWORD PTR[Ltwo]

    vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR6, CTR6, XMMWORD PTR[Lbswap_mask]

    vmovdqu xmm7, XMMWORD PTR[0*16 + KS]
    vpxor   CTR0, CTR0, xmm7
    vpxor   CTR1, CTR1, xmm7
    vpxor   CTR2, CTR2, xmm7
    vpxor   CTR3, CTR3, xmm7
    vpxor   CTR4, CTR4, xmm7
    vpxor   CTR5, CTR5, xmm7
    vpxor   CTR6, CTR6, xmm7

    ROUND   1

    add aluCTR, 7
    mov aluTMP, aluCTR
    bswap   aluTMP
    xor aluTMP, [KS + 3*4]
    mov [8*16 + 0*16 + 3*4 + esp], aluTMP

    ROUND   2
    NEXTCTR 1
    ROUND   3
    NEXTCTR 2
    ROUND   4
    NEXTCTR 3
    ROUND   5
    NEXTCTR 4
    ROUND   6
    NEXTCTR 5
    ROUND   7
    NEXTCTR 6
    ROUND   8
    ROUND   9
    vmovdqu xmm7, XMMWORD PTR[10*16 + KS]
    cmp     NR, 10
    je      @f

    ROUND   10
    ROUND   11
    vmovdqu xmm7, XMMWORD PTR[12*16 + KS]
    cmp     NR, 12
    je      @f

    ROUND   12
    ROUND   13
    vmovdqu xmm7, XMMWORD PTR[14*16 + KS]
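; NR is 10, 12 or 14 for AES-128/-192/-256 key schedules, so the code
; above runs the two optional pairs of rounds only when needed and in
; every case leaves the final round key in xmm7 for the VAESENCLAST
; instructions that follow.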
@@:
    vaesenclast CTR0, CTR0, xmm7
    vaesenclast CTR1, CTR1, xmm7
    vaesenclast CTR2, CTR2, xmm7
    vaesenclast CTR3, CTR3, xmm7
    vaesenclast CTR4, CTR4, xmm7
    vaesenclast CTR5, CTR5, xmm7
    vaesenclast CTR6, CTR6, xmm7

    vpxor   CTR0, CTR0, XMMWORD PTR[0*16 + PT]
    vpxor   CTR1, CTR1, XMMWORD PTR[1*16 + PT]
    vpxor   CTR2, CTR2, XMMWORD PTR[2*16 + PT]
    vpxor   CTR3, CTR3, XMMWORD PTR[3*16 + PT]
    vpxor   CTR4, CTR4, XMMWORD PTR[4*16 + PT]
    vpxor   CTR5, CTR5, XMMWORD PTR[5*16 + PT]
    vpxor   CTR6, CTR6, XMMWORD PTR[6*16 + PT]

    vmovdqu XMMWORD PTR[0*16 + CT], CTR0
    vmovdqu XMMWORD PTR[1*16 + CT], CTR1
    vmovdqu XMMWORD PTR[2*16 + CT], CTR2
    vmovdqu XMMWORD PTR[3*16 + CT], CTR3
    vmovdqu XMMWORD PTR[4*16 + CT], CTR4
    vmovdqu XMMWORD PTR[5*16 + CT], CTR5
    vmovdqu XMMWORD PTR[6*16 + CT], CTR6

    vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask]
    vpshufb TMP5, CTR6, XMMWORD PTR[Lbswap_mask]

    vmovdqa XMMWORD PTR[1*16 + esp], CTR5
    vmovdqa XMMWORD PTR[2*16 + esp], CTR4
    vmovdqa XMMWORD PTR[3*16 + esp], CTR3
    vmovdqa XMMWORD PTR[4*16 + esp], CTR2
    vmovdqa XMMWORD PTR[5*16 + esp], CTR1
    vmovdqa XMMWORD PTR[6*16 + esp], CTR0

    lea CT, [7*16 + CT]
    lea PT, [7*16 + PT]
    jmp LEncData7
LEncData7:
        cmp len, 16*7
        jb  LEndEnc7
        sub len, 16*7

        vpshufd TMP4, TMP5, 78
        vpxor   TMP4, TMP4, TMP5
        vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
        vmovdqu     TMP4, XMMWORD PTR[0*16 + Htbl]
        vpclmulqdq  TMP1, TMP5, TMP4, 011h
        vpclmulqdq  TMP2, TMP5, TMP4, 000h

        vmovdqu TMP5, XMMWORD PTR[1*16 + esp]
        KARATSUBA 1
        vmovdqu TMP5, XMMWORD PTR[2*16 + esp]
        KARATSUBA 2
        vmovdqu TMP5, XMMWORD PTR[3*16 + esp]
        KARATSUBA 3
        vmovdqu TMP5, XMMWORD PTR[4*16 + esp]
        KARATSUBA 4
        vmovdqu TMP5, XMMWORD PTR[5*16 + esp]
        KARATSUBA 5
        vmovdqu TMP5, XMMWORD PTR[6*16 + esp]
        vpxor   TMP5, TMP5, T
        KARATSUBA 6

        vpxor   TMP0, TMP0, TMP1
        vpxor   TMP0, TMP0, TMP2
        vpsrldq TMP3, TMP0, 8
        vpxor   TMP4, TMP1, TMP3
        vpslldq TMP3, TMP0, 8
        vpxor   TMP5, TMP2, TMP3

        vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
        vpalignr    TMP5, TMP5, TMP5, 8
        vpxor       TMP5, TMP5, TMP1

        vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
        vpalignr    TMP5, TMP5, TMP5, 8
        vpxor       TMP5, TMP5, TMP1

        vpxor       TMP5, TMP5, TMP4
        vmovdqu     T, TMP5

        vmovdqa CTR0, XMMWORD PTR[8*16 + 0*16 + esp]
        vmovdqa CTR1, XMMWORD PTR[8*16 + 1*16 + esp]
        vmovdqa CTR2, XMMWORD PTR[8*16 + 2*16 + esp]
        vmovdqa CTR3, XMMWORD PTR[8*16 + 3*16 + esp]
        vmovdqa CTR4, XMMWORD PTR[8*16 + 4*16 + esp]
        vmovdqa CTR5, XMMWORD PTR[8*16 + 5*16 + esp]
        vmovdqa CTR6, XMMWORD PTR[8*16 + 6*16 + esp]

        ROUND 1
        NEXTCTR 0
        ROUND 2
        NEXTCTR 1
        ROUND 3
        NEXTCTR 2
        ROUND 4
        NEXTCTR 3
        ROUND 5
        NEXTCTR 4
        ROUND 6
        NEXTCTR 5
        ROUND 7
        NEXTCTR 6

        ROUND 8
        ROUND 9

        vmovdqu     xmm7, XMMWORD PTR[10*16 + KS]
        cmp         NR, 10
        je          @f

        ROUND 10
        ROUND 11
        vmovdqu     xmm7, XMMWORD PTR[12*16 + KS]
        cmp         NR, 12
        je          @f

        ROUND 12
        ROUND 13
        vmovdqu     xmm7, XMMWORD PTR[14*16 + KS]
@@:
        vaesenclast CTR0, CTR0, xmm7
        vaesenclast CTR1, CTR1, xmm7
        vaesenclast CTR2, CTR2, xmm7
        vaesenclast CTR3, CTR3, xmm7
        vaesenclast CTR4, CTR4, xmm7
        vaesenclast CTR5, CTR5, xmm7
        vaesenclast CTR6, CTR6, xmm7

        vpxor   CTR0, CTR0, XMMWORD PTR[0*16 + PT]
        vpxor   CTR1, CTR1, XMMWORD PTR[1*16 + PT]
        vpxor   CTR2, CTR2, XMMWORD PTR[2*16 + PT]
        vpxor   CTR3, CTR3, XMMWORD PTR[3*16 + PT]
        vpxor   CTR4, CTR4, XMMWORD PTR[4*16 + PT]
        vpxor   CTR5, CTR5, XMMWORD PTR[5*16 + PT]
        vpxor   CTR6, CTR6, XMMWORD PTR[6*16 + PT]

        vmovdqu XMMWORD PTR[0*16 + CT], CTR0
        vmovdqu XMMWORD PTR[1*16 + CT], CTR1
        vmovdqu XMMWORD PTR[2*16 + CT], CTR2
        vmovdqu XMMWORD PTR[3*16 + CT], CTR3
        vmovdqu XMMWORD PTR[4*16 + CT], CTR4
        vmovdqu XMMWORD PTR[5*16 + CT], CTR5
        vmovdqu XMMWORD PTR[6*16 + CT], CTR6

        vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
        vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask]
        vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask]
        vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask]
        vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask]
        vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask]
        vpshufb TMP5, CTR6, XMMWORD PTR[Lbswap_mask]

        vmovdqa XMMWORD PTR[1*16 + esp], CTR5
        vmovdqa XMMWORD PTR[2*16 + esp], CTR4
        vmovdqa XMMWORD PTR[3*16 + esp], CTR3
        vmovdqa XMMWORD PTR[4*16 + esp], CTR2
        vmovdqa XMMWORD PTR[5*16 + esp], CTR1
        vmovdqa XMMWORD PTR[6*16 + esp], CTR0

        lea CT, [7*16 + CT]
        lea PT, [7*16 + PT]
        jmp LEncData7

LEndEnc7:

    vpshufd TMP4, TMP5, 78
    vpxor   TMP4, TMP4, TMP5
    vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
    vmovdqu     TMP4, XMMWORD PTR[0*16 + Htbl]
    vpclmulqdq  TMP1, TMP5, TMP4, 011h
    vpclmulqdq  TMP2, TMP5, TMP4, 000h

    vmovdqu TMP5, XMMWORD PTR[1*16 + esp]
    KARATSUBA 1
    vmovdqu TMP5, XMMWORD PTR[2*16 + esp]
    KARATSUBA 2
    vmovdqu TMP5, XMMWORD PTR[3*16 + esp]
    KARATSUBA 3
    vmovdqu TMP5, XMMWORD PTR[4*16 + esp]
    KARATSUBA 4
    vmovdqu TMP5, XMMWORD PTR[5*16 + esp]
    KARATSUBA 5
    vmovdqu TMP5, XMMWORD PTR[6*16 + esp]
    vpxor   TMP5, TMP5, T
    KARATSUBA 6

    vpxor   TMP0, TMP0, TMP1
    vpxor   TMP0, TMP0, TMP2
    vpsrldq TMP3, TMP0, 8
    vpxor   TMP4, TMP1, TMP3
    vpslldq TMP3, TMP0, 8
    vpxor   TMP5, TMP2, TMP3

    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    vpalignr    TMP5, TMP5, TMP5, 8
    vpxor       TMP5, TMP5, TMP1

    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    vpalignr    TMP5, TMP5, TMP5, 8
    vpxor       TMP5, TMP5, TMP1

    vpxor       TMP5, TMP5, TMP4
    vmovdqu     T, TMP5

    sub aluCTR, 6
LEncDataSingles:

        cmp len, 16
        jb  LEncDataTail
        sub len, 16

        vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + esp]
        NEXTCTR 0

        vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
        vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
        cmp NR, 10
        je  @f
        vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
        vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
        cmp NR, 12
        je  @f
        vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
        vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
        vaesenclast TMP1, TMP1, TMP2
        vpxor   TMP1, TMP1, XMMWORD PTR[PT]
        vmovdqu XMMWORD PTR[CT], TMP1

        lea PT, [16+PT]
        lea CT, [16+CT]

        vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
        vpxor   TMP1, TMP1, T

        vmovdqu TMP0, XMMWORD PTR[Htbl]
        GFMUL   TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
        vmovdqu T, TMP1

        jmp LEncDataSingles

LEncDataTail:

    cmp len, 0
    je  LEncDataEnd

    vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + esp]

    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
    cmp NR, 10
    je  @f
    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
    cmp NR, 12
    je  @f
    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast TMP1, TMP1, TMP2
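; The partial final block goes through a zeroed 16-byte scratch slot at
; [esp]: the len mod 16 plaintext bytes are copied in, the slot is
; XORed with the keystream block, the valid ciphertext bytes are copied
; out, the bytes past the message end are re-zeroed, and the resulting
; zero-padded ciphertext block is folded into the GHASH state. dl is
; used for the byte copies, so edx (Gctx) is saved in aluTMP and
; restored afterwards.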
; zero a temp location
    vpxor   TMP2, TMP2, TMP2
    vmovdqa XMMWORD PTR[esp], TMP2
; copy as many bytes as needed
    xor KS, KS
    mov aluTMP, edx
@@:
        cmp len, KS
        je  @f
        mov dl, BYTE PTR[PT + KS]
        mov BYTE PTR[esp + KS], dl
        inc KS
        jmp @b
@@:
    vpxor   TMP1, TMP1, XMMWORD PTR[esp]
    vmovdqa XMMWORD PTR[esp], TMP1
    xor KS, KS
@@:
        cmp len, KS
        je  @f
        mov dl, BYTE PTR[esp + KS]
        mov BYTE PTR[CT + KS], dl
        inc KS
        jmp @b
@@:
        cmp KS, 16
        je  @f
        mov BYTE PTR[esp + KS], 0
        inc KS
        jmp @b
@@:
    mov edx, aluTMP
    vmovdqa TMP1, XMMWORD PTR[esp]
    vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
    vpxor   TMP1, TMP1, T

    vmovdqu TMP0, XMMWORD PTR[Htbl]
    GFMUL   TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
    vmovdqu T, TMP1

LEncDataEnd:
    inc     aluCTR
    bswap   aluCTR
    mov     [16*16 + 2*16 + 3*4 + Gctx], aluCTR

    mov esp, ebp
    pop edi
    pop esi
    pop ebx
    pop ebp

    vzeroupper

    ret
intel_aes_gcmENC ENDP
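; Overall structure of intel_aes_gcmENC, as illustrative pseudo-C (the
; real loop software-pipelines: each LEncData7 pass hashes the previous
; group of 7 ciphertext blocks while encrypting the next 7 counters):
;
;   CTR-encrypt the first 7 blocks;            /* fill the pipeline */
;   while (len >= 7*16)                        /* LEncData7         */
;       { GHASH previous 7 CT blocks; CTR-encrypt next 7 blocks; }
;   GHASH the last 7 CT blocks;                /* LEndEnc7          */
;   while (len >= 16)                          /* LEncDataSingles   */
;       { CTR-encrypt one block; GHASH it; }
;   if (len > 0) handle the padded tail;       /* LEncDataTail      */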
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Decrypt and Authenticate
; void intel_aes_gcmDEC(uint8_t* PT, uint8_t* CT, void *Gctx, unsigned int len);
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

NEXTCTR MACRO i
    add     aluCTR, 1
    mov     aluTMP, aluCTR
    bswap   aluTMP
    xor     aluTMP, [3*4 + KS]
    mov     [3*4 + i*16 + esp], aluTMP
ENDM
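; NEXTCTR is redefined for the decrypt path because its counter blocks
; are parked at the bottom of the stack frame (offset i*16 rather than
; 8*16 + i*16); the computation is otherwise identical to the macro
; used by intel_aes_gcmENC.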
intel_aes_gcmDEC PROC

    cmp DWORD PTR[1*4 + 3*4 + esp], 0
    jne LbeginDEC
    ret

LbeginDEC:

    vzeroupper
    push    ebp
    push    ebx
    push    esi
    push    edi

    mov ebp, esp
    sub esp, 8*16
    and esp, -16

    mov CT, [ebp + 5*4 + 0*4]
    mov PT, [ebp + 5*4 + 1*4]
    mov Gctx, [ebp + 5*4 + 2*4]

    mov     KS, [16*16 + 3*16 + Gctx]
    lea     KS, [44 + KS]

    mov     aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
    bswap   aluCTR

    vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
    vpxor   TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    vmovdqu XMMWORD PTR[0*16 + esp], TMP0

    cmp len, 16*7
    jb  LDecDataSingles
    vmovdqu XMMWORD PTR[1*16 + esp], TMP0
    vmovdqu XMMWORD PTR[2*16 + esp], TMP0
    vmovdqu XMMWORD PTR[3*16 + esp], TMP0
    vmovdqu XMMWORD PTR[4*16 + esp], TMP0
    vmovdqu XMMWORD PTR[5*16 + esp], TMP0
    vmovdqu XMMWORD PTR[6*16 + esp], TMP0
    dec aluCTR
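    ; NEXTCTR pre-increments before storing, so back the counter up by
    ; one: the first NEXTCTR 0 inside LDecData7 then regenerates the
    ; initial counter value for the slot it refreshes.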
LDecData7:
    cmp len, 16*7
    jb  LDecData7End
    sub len, 16*7

    vmovdqu TMP5, XMMWORD PTR[0*16 + CT]
    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    vpxor   TMP5, TMP5, T
    vpshufd TMP4, TMP5, 78
    vpxor   TMP4, TMP4, TMP5
    vpclmulqdq  TMP0, TMP4, XMMWORD PTR[6*16 + 8*16 + Htbl], 000h
    vmovdqu     TMP4, XMMWORD PTR[6*16 + Htbl]
    vpclmulqdq  TMP1, TMP5, TMP4, 011h
    vpclmulqdq  TMP2, TMP5, TMP4, 000h

    NEXTCTR 0
    vmovdqu TMP5, XMMWORD PTR[1*16 + CT]
    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    KARATSUBA 5
    NEXTCTR 1
    vmovdqu TMP5, XMMWORD PTR[2*16 + CT]
    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    KARATSUBA 4
    NEXTCTR 2
    vmovdqu TMP5, XMMWORD PTR[3*16 + CT]
    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    KARATSUBA 3
    NEXTCTR 3
    vmovdqu TMP5, XMMWORD PTR[4*16 + CT]
    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    KARATSUBA 2
    NEXTCTR 4
    vmovdqu TMP5, XMMWORD PTR[5*16 + CT]
    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    KARATSUBA 1
    NEXTCTR 5
    vmovdqu TMP5, XMMWORD PTR[6*16 + CT]
    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    KARATSUBA 0
    NEXTCTR 6

    vpxor   TMP0, TMP0, TMP1
    vpxor   TMP0, TMP0, TMP2
    vpsrldq TMP3, TMP0, 8
    vpxor   TMP4, TMP1, TMP3
    vpslldq TMP3, TMP0, 8
    vpxor   TMP5, TMP2, TMP3

    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    vpalignr    TMP5, TMP5, TMP5, 8
    vpxor       TMP5, TMP5, TMP1

    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    vpalignr    TMP5, TMP5, TMP5, 8
    vpxor       TMP5, TMP5, TMP1

    vpxor       TMP5, TMP5, TMP4
    vmovdqu     T, TMP5

    vmovdqa CTR0, XMMWORD PTR[0*16 + esp]
    vmovdqa CTR1, XMMWORD PTR[1*16 + esp]
    vmovdqa CTR2, XMMWORD PTR[2*16 + esp]
    vmovdqa CTR3, XMMWORD PTR[3*16 + esp]
    vmovdqa CTR4, XMMWORD PTR[4*16 + esp]
    vmovdqa CTR5, XMMWORD PTR[5*16 + esp]
    vmovdqa CTR6, XMMWORD PTR[6*16 + esp]

    ROUND   1
    ROUND   2
    ROUND   3
    ROUND   4
    ROUND   5
    ROUND   6
    ROUND   7
    ROUND   8
    ROUND   9
    vmovdqu xmm7, XMMWORD PTR[10*16 + KS]
    cmp     NR, 10
    je      @f

    ROUND   10
    ROUND   11
    vmovdqu xmm7, XMMWORD PTR[12*16 + KS]
    cmp     NR, 12
    je      @f

    ROUND   12
    ROUND   13
    vmovdqu xmm7, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast CTR0, CTR0, xmm7
    vaesenclast CTR1, CTR1, xmm7
    vaesenclast CTR2, CTR2, xmm7
    vaesenclast CTR3, CTR3, xmm7
    vaesenclast CTR4, CTR4, xmm7
    vaesenclast CTR5, CTR5, xmm7
    vaesenclast CTR6, CTR6, xmm7

    vpxor   CTR0, CTR0, XMMWORD PTR[0*16 + CT]
    vpxor   CTR1, CTR1, XMMWORD PTR[1*16 + CT]
    vpxor   CTR2, CTR2, XMMWORD PTR[2*16 + CT]
    vpxor   CTR3, CTR3, XMMWORD PTR[3*16 + CT]
    vpxor   CTR4, CTR4, XMMWORD PTR[4*16 + CT]
    vpxor   CTR5, CTR5, XMMWORD PTR[5*16 + CT]
    vpxor   CTR6, CTR6, XMMWORD PTR[6*16 + CT]

    vmovdqu XMMWORD PTR[0*16 + PT], CTR0
    vmovdqu XMMWORD PTR[1*16 + PT], CTR1
    vmovdqu XMMWORD PTR[2*16 + PT], CTR2
    vmovdqu XMMWORD PTR[3*16 + PT], CTR3
    vmovdqu XMMWORD PTR[4*16 + PT], CTR4
    vmovdqu XMMWORD PTR[5*16 + PT], CTR5
    vmovdqu XMMWORD PTR[6*16 + PT], CTR6

    lea CT, [7*16 + CT]
    lea PT, [7*16 + PT]
    jmp LDecData7
LDecData7End:

    NEXTCTR 0

LDecDataSingles:

        cmp len, 16
        jb  LDecDataTail
        sub len, 16

        vmovdqu TMP1, XMMWORD PTR[CT]
        vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
        vpxor   TMP1, TMP1, T

        vmovdqu TMP0, XMMWORD PTR[Htbl]
        GFMUL   TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
        vmovdqu T, TMP1

        vmovdqa TMP1, XMMWORD PTR[0*16 + esp]
        NEXTCTR 0

        vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
        vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
        cmp NR, 10
        je  @f
        vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
        vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
        cmp NR, 12
        je  @f
        vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
        vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
        vaesenclast TMP1, TMP1, TMP2
        vpxor   TMP1, TMP1, XMMWORD PTR[CT]
        vmovdqu XMMWORD PTR[PT], TMP1

        lea PT, [16+PT]
        lea CT, [16+CT]
        jmp LDecDataSingles

LDecDataTail:

    cmp len, 0
    je  LDecDataEnd

    vmovdqa TMP1, XMMWORD PTR[0*16 + esp]
    inc aluCTR
    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
    cmp NR, 10
    je  @f
    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
    cmp NR, 12
    je  @f
    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast xmm7, TMP1, TMP2

; copy as many bytes as needed
    xor KS, KS
    mov aluTMP, edx
@@:
        cmp len, KS
        je  @f
        mov dl, BYTE PTR[CT + KS]
        mov BYTE PTR[esp + KS], dl
        inc KS
        jmp @b
@@:
        cmp KS, 16
        je  @f
        mov BYTE PTR[esp + KS], 0
        inc KS
        jmp @b
@@:
    mov edx, aluTMP
    vmovdqa TMP1, XMMWORD PTR[esp]
    vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
    vpxor   TMP1, TMP1, T

    vmovdqu TMP0, XMMWORD PTR[Htbl]
    GFMUL   TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
    vmovdqu T, TMP1

    vpxor   xmm7, xmm7, XMMWORD PTR[esp]
    vmovdqa XMMWORD PTR[esp], xmm7
    xor     KS, KS
    mov aluTMP, edx
@@:
        cmp len, KS
        je  @f
        mov dl, BYTE PTR[esp + KS]
        mov BYTE PTR[PT + KS], dl
        inc KS
        jmp @b
@@:
    mov edx, aluTMP

LDecDataEnd:

    bswap   aluCTR
    mov     [16*16 + 2*16 + 3*4 + Gctx], aluCTR

    mov esp, ebp
    pop edi
    pop esi
    pop ebx
    pop ebp

    vzeroupper

    ret
intel_aes_gcmDEC ENDP

END
