security/nss/lib/freebl/intel-gcm-x64-masm.asm

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 ; LICENSE:
     2 ; This submission to NSS is to be made available under the terms of the
     3 ; Mozilla Public License, v. 2.0. You can obtain one at http:
     4 ; //mozilla.org/MPL/2.0/.
     5 ;###############################################################################
     6 ; Copyright(c) 2014, Intel Corp.
     7 ; Developers and authors:
     8 ; Shay Gueron and Vlad Krasnov
     9 ; Intel Corporation, Israel Development Centre, Haifa, Israel
    10 ; Please send feedback directly to crypto.feedback.alias@intel.com
; Read-only constants shared by the AES-GCM routines below.
; ALIGN 16 ensures each 16-byte constant can be loaded as a whole XMM value.
    13 .DATA
    14 ALIGN 16
; 128-bit integer constants 1 and 2 (low qword first); used with vpaddd to
; step the (byte-reversed) CTR counter blocks by one and two.
    15 Lone            dq 1,0
    16 Ltwo            dq 2,0
; vpshufb control mask that reverses all 16 bytes of an XMM register,
; converting between GCM's big-endian block layout and host byte order.
    17 Lbswap_mask     db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
; NOTE(review): Lshuff_mask is not referenced anywhere in this file's visible
; code — presumably retained for parity with sibling implementations; confirm
; before removing.
    18 Lshuff_mask     dq 0f0f0f0f0f0f0f0fh, 0f0f0f0f0f0f0f0fh
; Constant used by the vpclmulqdq-based GHASH reduction steps below
; (loaded as [Lpoly] with immediate 010h in every reduction stage).
    19 Lpoly           dq 01h, 0c200000000000000h
    21 .CODE
;-----------------------------------------------------------------------
; GFMUL DST, SRC1, SRC2, TMP1, TMP2, TMP3, TMP4
; Carry-less multiplication of SRC1 by SRC2 in GF(2^128), reduced with
; the [Lpoly] constant; the 128-bit result is written to DST.
; Uses one-level Karatsuba (3 vpclmulqdq instead of 4) followed by a
; two-stage polynomial reduction. TMP1-TMP4 are clobbered.
;-----------------------------------------------------------------------
    24 GFMUL MACRO DST, SRC1, SRC2, TMP1, TMP2, TMP3, TMP4
; TMP1 = a0*b0 (low halves), TMP4 = a1*b1 (high halves)
    25     vpclmulqdq  TMP1, SRC2, SRC1, 0h
    26     vpclmulqdq  TMP4, SRC2, SRC1, 011h
; Form (a0^a1) and (b0^b1) for the middle Karatsuba term
; (vpshufd ..., 78 swaps the two 64-bit halves)
    28     vpshufd     TMP2, SRC2, 78
    29     vpshufd     TMP3, SRC1, 78
    30     vpxor       TMP2, TMP2, SRC2
    31     vpxor       TMP3, TMP3, SRC1
; TMP2 = (a0^a1)*(b0^b1) ^ lo ^ hi  — the middle 128-bit term
    33     vpclmulqdq  TMP2, TMP2, TMP3, 0h
    34     vpxor       TMP2, TMP2, TMP1
    35     vpxor       TMP2, TMP2, TMP4
; Fold the middle term into the 256-bit product held as TMP1 (low) : TMP4 (high)
    37     vpslldq     TMP3, TMP2, 8
    38     vpsrldq     TMP2, TMP2, 8
    40     vpxor       TMP1, TMP1, TMP3
    41     vpxor       TMP4, TMP4, TMP2
; Reduction stage 1: multiply low half by the Lpoly constant and fold
    43     vpclmulqdq  TMP2, TMP1, [Lpoly], 010h
    44     vpshufd     TMP3, TMP1, 78
    45     vpxor       TMP1, TMP2, TMP3
; Reduction stage 2: repeat to finish reducing the low 128 bits
    47     vpclmulqdq  TMP2, TMP1, [Lpoly], 010h
    48     vpshufd     TMP3, TMP1, 78
    49     vpxor       TMP1, TMP2, TMP3
; Combine the reduced low part with the high half of the product
    51     vpxor       DST, TMP1, TMP4
    53     ENDM
    55 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    56 ;
    57 ; Generates the final GCM tag
    58 ; void intel_aes_gcmTAG(unsigned char Htbl[16*16],
    59 ;                       unsigned char *Tp,
    60 ;                       unsigned int Mlen,
    61 ;                       unsigned int Alen,
    62 ;                       unsigned char *X0,
    63 ;                       unsigned char *TAG);
    64 ;
    65 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;-----------------------------------------------------------------------
; void intel_aes_gcmTAG(unsigned char Htbl[16*16], unsigned char *Tp,
;                       unsigned int Mlen, unsigned int Alen,
;                       unsigned char *X0, unsigned char *TAG)
; ABI: Microsoft x64 — args in rcx, rdx, r8, r9; 5th and 6th args are on
; the stack above the 32-byte shadow space ([rsp+40] and [rsp+48]).
; Folds the bit lengths of the message (Mlen) and AAD (Alen) into the
; running GHASH value *Tp, multiplies by H ([Htbl]), XORs with the
; encrypted first counter block *X0, and stores the result to *TAG.
;-----------------------------------------------------------------------
    67 ALIGN 16
    68 intel_aes_gcmTAG PROC
    70 Htbl    textequ <rcx>
    71 Tp      textequ <rdx>
    72 Mlen    textequ <r8>
    73 Alen    textequ <r9>
    74 X0      textequ <r10>
    75 TAG     textequ <r11>
    77 T       textequ <xmm0>
    78 TMP0    textequ <xmm1>
; Fetch stack arguments 5 and 6: return address (8) + shadow space (4*8)
    80     mov     X0, [rsp + 1*8 + 4*8]
    81     mov     TAG, [rsp + 1*8 + 5*8]
    83     vzeroupper
    84     vmovdqu T, XMMWORD PTR[Tp]
    85     vpxor   TMP0, TMP0, TMP0
; Convert byte counts to bit counts for the GCM length block
    87     shl     Mlen, 3
    88     shl     Alen, 3
    90     ;vpinsrq    TMP0, TMP0, Mlen, 0
    91     ;vpinsrq    TMP0, TMP0, Alen, 1
    92     ; workaround the ml64.exe vpinsrq issue
; Build the length block from 32-bit halves: low dwords first, then the
; high dwords after shifting the 64-bit values right by 32.
    93     vpinsrd TMP0, TMP0, r8d, 0
    94     vpinsrd TMP0, TMP0, r9d, 2
    95     shr Mlen, 32
    96     shr Alen, 32
    97     vpinsrd TMP0, TMP0, r8d, 1
    98     vpinsrd TMP0, TMP0, r9d, 3
; T = GHASH(T ^ lenblock) = (T ^ lenblock) * H
   100     vpxor   T, T, TMP0
   101     vmovdqu TMP0, XMMWORD PTR[Htbl]
   102     GFMUL   T, T, TMP0, xmm2, xmm3, xmm4, xmm5
; Back to byte order, then XOR with E(K, Y0) and emit the tag
   104     vpshufb T, T, [Lbswap_mask]
   105     vpxor   T, T, [X0]
   106     vmovdqu XMMWORD PTR[TAG], T
   107     vzeroupper
   109     ret
   111 intel_aes_gcmTAG ENDP
   113 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   114 ;
   115 ; Generates the H table
   116 ; void intel_aes_gcmINIT(unsigned char Htbl[16*16], unsigned char *KS, int NR);
   117 ;
   118 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;-----------------------------------------------------------------------
; void intel_aes_gcmINIT(unsigned char Htbl[16*16], unsigned char *KS, int NR)
; ABI: Microsoft x64 (rcx = Htbl, rdx = key schedule, r8d = round count).
; Computes H = AES-ENC(0) with the supplied key schedule, then fills Htbl
; with H^1..H^8 (first 8 slots) plus precomputed Karatsuba "xor of halves"
; values (slots at offset 8*16) used by the AAD/ENC bulk loops.
;-----------------------------------------------------------------------
   120 ALIGN 16
   121 intel_aes_gcmINIT PROC
   123 Htbl    textequ <rcx>
   124 KS      textequ <rdx>
   125 NR      textequ <r8d>
   127 T       textequ <xmm0>
   128 TMP0    textequ <xmm1>
   130     vzeroupper
   131     ; AES-ENC(0)
; T starts as round key 0 (encrypting the all-zero block);
; loop runs NR-1 middle rounds, then one vaesenclast.
   132     vmovdqu T, XMMWORD PTR[KS]
   133     lea KS, [16 + KS]
   134     dec NR
   135 Lenc_loop:
   136         vaesenc T, T, [KS]
   137         lea KS, [16 + KS]
   138         dec NR
   139         jnz Lenc_loop
   141     vaesenclast T, T, [KS]
; Put H into the bit/byte order used by the carry-less-multiply code
   142     vpshufb T, T, [Lbswap_mask]
   144     ;Calculate H` = GFMUL(H, 2)
; Multiply H by x (shift left by one bit across the 128-bit value) and
; conditionally XOR the reduction constant if the top bit was set:
; xmm5 = Lpoly masked by the (broadcast) sign of the top dword.
   145     vpsrad  xmm3, T, 31
   146     vpshufd xmm3, xmm3, 0ffh
   147     vpand   xmm5, xmm3, [Lpoly]
; Shift the whole 128-bit value left by 1: per-dword shift plus carry
; bits moved up one dword via vpslldq.
   148     vpsrld  xmm3, T, 31
   149     vpslld  xmm4, T, 1
   150     vpslldq xmm3, xmm3, 4
   151     vpxor   T, xmm4, xmm3
   152     vpxor   T, T, xmm5
; Htbl[0] = H'; also keep a copy in TMP0 as the multiplier for the powers
   154     vmovdqu TMP0, T
   155     vmovdqu XMMWORD PTR[Htbl + 0*16], T
; Precompute (hi ^ lo) of H' for the Karatsuba middle term, stored in
; the second half of the table.
   157     vpshufd xmm2, T, 78
   158     vpxor   xmm2, xmm2, T
   159     vmovdqu XMMWORD PTR[Htbl + 8*16 + 0*16], xmm2
; Assembly-time loop: compute H'^2..H'^8 and their half-xors
   161     i = 1
   162     WHILE i LT 8
   163         GFMUL   T, T, TMP0, xmm2, xmm3, xmm4, xmm5
   164         vmovdqu XMMWORD PTR[Htbl + i*16], T
   165         vpshufd xmm2, T, 78
   166         vpxor   xmm2, xmm2, T
   167         vmovdqu XMMWORD PTR[Htbl + 8*16 + i*16], xmm2
   168         i = i+1
   169         ENDM
   170     vzeroupper
   171     ret
   172 intel_aes_gcmINIT ENDP
   175 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   176 ;
   177 ; Authenticate only
   178 ; void intel_aes_gcmAAD(unsigned char Htbl[16*16], unsigned char *AAD, unsigned int Alen, unsigned char *Tp);
   179 ;
   180 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;-----------------------------------------------------------------------
; void intel_aes_gcmAAD(unsigned char Htbl[16*16], unsigned char *AAD,
;                       unsigned int Alen, unsigned char *Tp)
; ABI: Microsoft x64 (rcx, rdx, r8, r9). Authenticates `len` bytes of
; AAD into the GHASH accumulator *Tp using 8-block aggregated Karatsuba
; multiplication against the precomputed powers of H in Htbl.
; len is assumed to be a multiple of 16 (whole blocks) by this code.
; Saves/restores xmm6-xmm7, which are callee-saved in the Win64 ABI.
;-----------------------------------------------------------------------
   182 ALIGN 16
   183 intel_aes_gcmAAD PROC
   185 Htbl    textequ <rcx>
   186 inp     textequ <rdx>
   187 len     textequ <r8>
   188 Tp      textequ <r9>
   189 hlp0    textequ <r10>
   191 DATA    textequ <xmm0>
   192 T       textequ <xmm1>
   193 TMP0    textequ <xmm2>
   194 TMP1    textequ <xmm3>
   195 TMP2    textequ <xmm4>
   196 TMP3    textequ <xmm5>
   197 TMP4    textequ <xmm6>
   198 Xhi     textequ <xmm7>
; Accumulate one block's partial products against Htbl[i]:
; TMP0 ^= lo, TMP1 ^= hi, TMP2 ^= middle (Karatsuba half-xor) term.
   200 KARATSUBA_AAD MACRO i
   201     vpclmulqdq  TMP3, DATA, [Htbl + i*16], 0h
   202     vpxor       TMP0, TMP0, TMP3
   203     vpclmulqdq  TMP3, DATA, [Htbl + i*16], 011h
   204     vpxor       TMP1, TMP1, TMP3
   205     vpshufd     TMP3, DATA, 78
   206     vpxor       TMP3, TMP3, DATA
   207     vpclmulqdq  TMP3, TMP3, [Htbl + 8*16 + i*16], 0h
   208     vpxor       TMP2, TMP2, TMP3
   209 ENDM
; Early-out for empty AAD
   211     test  len, len
   212     jnz   LbeginAAD
   213     ret
   215 LbeginAAD:
   216     vzeroupper
; Spill callee-saved xmm6/xmm7 (Win64 ABI)
   218     sub rsp, 2*16
   219     vmovdqu XMMWORD PTR[rsp + 0*16], xmm6
   220     vmovdqu XMMWORD PTR[rsp + 1*16], xmm7
   222     vpxor   Xhi, Xhi, Xhi
   224     vmovdqu T, XMMWORD PTR[Tp]
   225     ;we hash 8 block each iteration, if the total amount of blocks is not a multiple of 8, we hash the first n%8 blocks first
; hlp0 = len mod 128 = number of leading non-multiple-of-8 bytes
   226     mov hlp0, len
   227     and hlp0, 128-1
   228     jz  Lmod_loop
; len = remaining multiple-of-128 bytes; hlp0 becomes the Htbl byte
; offset of the power of H matching the prefix block count.
   230     and len, -128
   231     sub hlp0, 16
   233     ; Prefix block
; First prefix block is XORed with the running tag T before multiplying.
   234     vmovdqu DATA, XMMWORD PTR[inp]
   235     vpshufb DATA, DATA, [Lbswap_mask]
   236     vpxor   DATA, DATA, T
   238     vpclmulqdq  TMP0, DATA, [Htbl + hlp0], 0h
   239     vpclmulqdq  TMP1, DATA, [Htbl + hlp0], 011h
   240     vpshufd     TMP3, DATA, 78
   241     vpxor       TMP3, TMP3, DATA
   242     vpclmulqdq  TMP2, TMP3, [Htbl + 8*16 + hlp0], 0h
   244     lea     inp, [inp+16]
   245     test    hlp0, hlp0
   246     jnz     Lpre_loop
   247     jmp     Lred1
   249     ;hash remaining prefix bocks (up to 7 total prefix blocks)
   250 Lpre_loop:
   252         sub hlp0, 16
   254         vmovdqu DATA, XMMWORD PTR[inp]
   255         vpshufb DATA, DATA, [Lbswap_mask]
; Same accumulation as KARATSUBA_AAD but indexed by register hlp0
; (macro parameters must be assembly-time constants).
   257         vpclmulqdq  TMP3, DATA, [Htbl + hlp0], 0h
   258         vpxor       TMP0, TMP0, TMP3
   259         vpclmulqdq  TMP3, DATA, [Htbl + hlp0], 011h
   260         vpxor       TMP1, TMP1, TMP3
   261         vpshufd     TMP3, DATA, 78
   262         vpxor       TMP3, TMP3, DATA
   263         vpclmulqdq  TMP3, TMP3, [Htbl + 8*16 + hlp0], 0h
   264         vpxor       TMP2, TMP2, TMP3
   266         test    hlp0, hlp0
   267         lea     inp, [inp+16]
   268         jnz     Lpre_loop
; Karatsuba fixup for the prefix: fold the middle term into the
; 256-bit result, leaving T = low 128 bits, Xhi = high 128 bits.
   270 Lred1:
   272     vpxor       TMP2, TMP2, TMP0
   273     vpxor       TMP2, TMP2, TMP1
   274     vpsrldq     TMP3, TMP2, 8
   275     vpslldq     TMP2, TMP2, 8
   277     vpxor       Xhi, TMP1, TMP3
   278     vpxor       T, TMP0, TMP2
; Main loop: hash 8 blocks per iteration. Blocks are multiplied by
; descending powers of H (block processed first gets H^8's slot index 0
; ... last block index 7 is XORed with T). The reduction of the previous
; iteration's T is interleaved with the multiplications to hide latency.
   281 Lmod_loop:
   283         sub len, 16*8
   284         jb  Ldone
   285         ; Block #0
   286         vmovdqu DATA, XMMWORD PTR[inp + 16*7]
   287         vpshufb DATA, DATA, [Lbswap_mask]
; First block initializes (rather than accumulates) TMP0/TMP1/TMP2
   289         vpclmulqdq  TMP0, DATA, [Htbl + 0*16], 0h
   290         vpclmulqdq  TMP1, DATA, [Htbl + 0*16], 011h
   291         vpshufd     TMP3, DATA, 78
   292         vpxor       TMP3, TMP3, DATA
   293         vpclmulqdq  TMP2, TMP3, [Htbl + 8*16 + 0*16], 0h
   295         ; Block #1
   296         vmovdqu DATA, XMMWORD PTR[inp + 16*6]
   297         vpshufb DATA, DATA, [Lbswap_mask]
   298         KARATSUBA_AAD 1
   300         ; Block #2
   301         vmovdqu DATA, XMMWORD PTR[inp + 16*5]
   302         vpshufb DATA, DATA, [Lbswap_mask]
   304         vpclmulqdq  TMP4, T, [Lpoly], 010h         ;reduction stage 1a
   305         vpalignr    T, T, T, 8
   307         KARATSUBA_AAD 2
   309         vpxor       T, T, TMP4                          ;reduction stage 1b
   311         ; Block #3
   312         vmovdqu DATA, XMMWORD PTR[inp + 16*4]
   313         vpshufb DATA, DATA, [Lbswap_mask]
   314         KARATSUBA_AAD 3
   315         ; Block #4
   316         vmovdqu DATA, XMMWORD PTR[inp + 16*3]
   317         vpshufb DATA, DATA, [Lbswap_mask]
   319         vpclmulqdq  TMP4, T, [Lpoly], 010h        ;reduction stage 2a
   320         vpalignr    T, T, T, 8
   322         KARATSUBA_AAD 4
   324         vpxor       T, T, TMP4                          ;reduction stage 2b
   325         ; Block #5
   326         vmovdqu DATA, XMMWORD PTR[inp + 16*2]
   327         vpshufb DATA, DATA, [Lbswap_mask]
   328         KARATSUBA_AAD 5
   330         vpxor   T, T, Xhi                               ;reduction finalize
   331         ; Block #6
   332         vmovdqu DATA, XMMWORD PTR[inp + 16*1]
   333         vpshufb DATA, DATA, [Lbswap_mask]
   334         KARATSUBA_AAD 6
   335         ; Block #7
; Last block absorbs the (now fully reduced) running tag T
   336         vmovdqu DATA, XMMWORD PTR[inp + 16*0]
   337         vpshufb DATA, DATA, [Lbswap_mask]
   338         vpxor   DATA, DATA, T
   339         KARATSUBA_AAD 7
   340         ; Aggregated 8 blocks, now karatsuba fixup
   341         vpxor   TMP2, TMP2, TMP0
   342         vpxor   TMP2, TMP2, TMP1
   343         vpsrldq TMP3, TMP2, 8
   344         vpslldq TMP2, TMP2, 8
   346         vpxor   Xhi, TMP1, TMP3
   347         vpxor   T, TMP0, TMP2
   349         lea inp, [inp + 16*8]
   350         jmp Lmod_loop
; Final two-stage reduction of T, then fold in the high half Xhi
   352 Ldone:
   353     vpclmulqdq  TMP4, T, [Lpoly], 010h
   354     vpalignr    T, T, T, 8
   355     vpxor       T, T, TMP4
   357     vpclmulqdq  TMP4, T, [Lpoly], 010h
   358     vpalignr    T, T, T, 8
   359     vpxor       T, T, TMP4
   361     vpxor       T, T, Xhi
   362     vmovdqu     XMMWORD PTR[Tp], T
   363     vzeroupper
; Restore callee-saved xmm6/xmm7 and the stack
   365     vmovdqu xmm6, XMMWORD PTR[rsp + 0*16]
   366     vmovdqu xmm7, XMMWORD PTR[rsp + 1*16]
   367     add rsp, 16*2
   369     ret
   371 intel_aes_gcmAAD ENDP
   374 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   375 ;
   376 ; Encrypt and Authenticate
   377 ; void intel_aes_gcmENC(unsigned char* PT, unsigned char* CT, void *Gctx, unsigned int len);
   378 ;
   379 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;-----------------------------------------------------------------------
; void intel_aes_gcmENC(unsigned char* PT, unsigned char* CT,
;                       void *Gctx, unsigned int len)
; ABI: Microsoft x64 (rcx = PT, rdx = CT, r8 = Gctx, r9 = len).
; CTR-mode encrypts `len` bytes of PT into CT while folding the produced
; ciphertext into the GHASH accumulator stored in the Gctx structure.
; Layout assumed for Gctx (offsets visible below): Htbl at +0,
; tag T at +16*16+1*16, counter block at +16*16+2*16, key-schedule
; pointer at +16*16+3*16. Note Htbl and Gctx are both r8 — the Htbl
; area is at offset 0 of the context.
; Bulk path processes 8 blocks per iteration with aggregated GHASH;
; leftovers go through the single-block and sub-block tail paths.
; Saves/restores callee-saved xmm6-xmm15 (Win64 ABI); r11/r13 pushes
; are not ABI-required but also keep the spill area consistent.
;-----------------------------------------------------------------------
   381 ALIGN 16
   382 intel_aes_gcmENC PROC
   384 PT      textequ <rcx>
   385 CT      textequ <rdx>
   386 Htbl    textequ <r8>
   387 Gctx    textequ <r8>
   388 len     textequ <r9>
   389 KS      textequ <r10>
   390 NR      textequ <eax>
; ALU-side copy of the counter: aluCTR holds the byte-swapped low dword
; of the counter block; aluKSl caches the matching dword of round key 0
; so precomputed counters can be stored already XORed with it.
   392 aluCTR  textequ <r11d>
   393 aluKSl  textequ <r12d>
   394 aluTMP  textequ <r13d>
   396 T       textequ <xmm0>
   397 TMP0    textequ <xmm1>
   398 TMP1    textequ <xmm2>
   399 TMP2    textequ <xmm3>
   400 TMP3    textequ <xmm4>
   401 TMP4    textequ <xmm5>
   402 TMP5    textequ <xmm6>
   403 CTR0    textequ <xmm7>
   404 CTR1    textequ <xmm8>
   405 CTR2    textequ <xmm9>
   406 CTR3    textequ <xmm10>
   407 CTR4    textequ <xmm11>
   408 CTR5    textequ <xmm12>
   409 CTR6    textequ <xmm13>
   410 CTR7    textequ <xmm14>
   411 BSWAPMASK   textequ <xmm15>
; One AES round applied to all 8 counter blocks with round key i
   413 ROUND MACRO i
   414     vmovdqu TMP3, XMMWORD PTR[i*16 + KS]
   415     vaesenc CTR0, CTR0, TMP3
   416     vaesenc CTR1, CTR1, TMP3
   417     vaesenc CTR2, CTR2, TMP3
   418     vaesenc CTR3, CTR3, TMP3
   419     vaesenc CTR4, CTR4, TMP3
   420     vaesenc CTR5, CTR5, TMP3
   421     vaesenc CTR6, CTR6, TMP3
   422     vaesenc CTR7, CTR7, TMP3
   423 ENDM
; AES round i interleaved with one GHASH Karatsuba accumulation of TMP5
; against Htbl[i] (TMP0 = middle, TMP1 = hi, TMP2 = lo partial sums)
   424 ROUNDMUL MACRO i
   425     vmovdqu TMP3, XMMWORD PTR[i*16 + KS]
   427     vaesenc CTR0, CTR0, TMP3
   428     vaesenc CTR1, CTR1, TMP3
   429     vaesenc CTR2, CTR2, TMP3
   430     vaesenc CTR3, CTR3, TMP3
   432     vpshufd TMP4, TMP5, 78
   433     vpxor   TMP4, TMP4, TMP5
   435     vaesenc CTR4, CTR4, TMP3
   436     vaesenc CTR5, CTR5, TMP3
   437     vaesenc CTR6, CTR6, TMP3
   438     vaesenc CTR7, CTR7, TMP3
   440     vpclmulqdq  TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
   441     vpxor       TMP0, TMP0, TMP3
   442     vmovdqu     TMP4, XMMWORD PTR[i*16 + Htbl]
   443     vpclmulqdq  TMP3, TMP5, TMP4, 011h
   444     vpxor       TMP1, TMP1, TMP3
   445     vpclmulqdq  TMP3, TMP5, TMP4, 000h
   446     vpxor       TMP2, TMP2, TMP3
   447 ENDM
; GHASH-only accumulation of TMP5 against Htbl[i] (no AES rounds)
   448 KARATSUBA MACRO i
   449     vpshufd TMP4, TMP5, 78
   450     vpxor   TMP4, TMP4, TMP5
   451     vpclmulqdq  TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
   452     vpxor       TMP0, TMP0, TMP3
   453     vmovdqu     TMP4, XMMWORD PTR[i*16 + Htbl]
   454     vpclmulqdq  TMP3, TMP5, TMP4, 011h
   455     vpxor       TMP1, TMP1, TMP3
   456     vpclmulqdq  TMP3, TMP5, TMP4, 000h
   457     vpxor       TMP2, TMP2, TMP3
   458 ENDM
; Advance the ALU-side counter and patch the low dword of the i-th
; precomputed counter block on the stack (stored pre-XORed with round
; key 0, hence the xor with aluKSl before byte-swapping back).
   459 NEXTCTR MACRO i
   460     add aluCTR, 1
   461     mov aluTMP, aluCTR
   462     xor aluTMP, aluKSl
   463     bswap   aluTMP
   464     mov [3*4 + 8*16 + i*16 + rsp], aluTMP
   465 ENDM
; Nothing to do for zero-length input
   468     test  len, len
   469     jnz   LbeginENC
   470     ret
   472 LbeginENC:
   474     vzeroupper
; Prologue: save GPRs used for the ALU counter plus callee-saved
; xmm6-xmm15 (Win64 ABI), then carve an aligned 16*16-byte scratch area.
   475     push    r11
   476     push    r12
   477     push    r13
   478     push    rbp
   479     sub rsp, 10*16
   480     vmovdqu XMMWORD PTR[rsp + 0*16], xmm6
   481     vmovdqu XMMWORD PTR[rsp + 1*16], xmm7
   482     vmovdqu XMMWORD PTR[rsp + 2*16], xmm8
   483     vmovdqu XMMWORD PTR[rsp + 3*16], xmm9
   484     vmovdqu XMMWORD PTR[rsp + 4*16], xmm10
   485     vmovdqu XMMWORD PTR[rsp + 5*16], xmm11
   486     vmovdqu XMMWORD PTR[rsp + 6*16], xmm12
   487     vmovdqu XMMWORD PTR[rsp + 7*16], xmm13
   488     vmovdqu XMMWORD PTR[rsp + 8*16], xmm14
   489     vmovdqu XMMWORD PTR[rsp + 9*16], xmm15
; rbp = frame anchor for restoring rsp after the alignment below.
; Scratch layout: [rsp+1*16..7*16] = byte-swapped ciphertext blocks for
; the next GHASH pass; [rsp+8*16..15*16] = precomputed counter blocks.
   491     mov rbp, rsp
   492     sub rsp, 16*16
   493     and rsp, -16
; Load GHASH tag, counter block, key schedule pointer and round count
   495     vmovdqu T, XMMWORD PTR[16*16 + 1*16 + Gctx]
   496     vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx]
   497     vmovdqu BSWAPMASK, XMMWORD PTR[Lbswap_mask]
   498     mov     KS, [16*16 + 3*16 + Gctx]
   499     mov     NR, [4 + KS]
   500     lea     KS, [48 + KS]
   502     vpshufb CTR0, CTR0, BSWAPMASK
; Initialize the ALU-side counter from the context's counter low dword
   504     mov aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
   505     mov aluKSl, [3*4 + KS]
   506     bswap   aluCTR
   507     bswap   aluKSl
; Stash counter^roundkey0 as the template counter block on the stack
   509     vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
   510     vpxor   TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
   511     vmovdqu XMMWORD PTR[8*16 + 0*16 + rsp], TMP0
   513     cmp len, 128
   514     jb  LEncDataSingles
   515 ; Prepare the "top" counters
   516     vmovdqu XMMWORD PTR[8*16 + 1*16 + rsp], TMP0
   517     vmovdqu XMMWORD PTR[8*16 + 2*16 + rsp], TMP0
   518     vmovdqu XMMWORD PTR[8*16 + 3*16 + rsp], TMP0
   519     vmovdqu XMMWORD PTR[8*16 + 4*16 + rsp], TMP0
   520     vmovdqu XMMWORD PTR[8*16 + 5*16 + rsp], TMP0
   521     vmovdqu XMMWORD PTR[8*16 + 6*16 + rsp], TMP0
   522     vmovdqu XMMWORD PTR[8*16 + 7*16 + rsp], TMP0
   524 ; Encrypt the initial 8 blocks
; First 8 counters are produced in registers via vpaddd on the
; byte-swapped counter; later iterations use the stack-side NEXTCTR path.
   525     sub len, 128
   526     vpaddd  CTR1, CTR0, XMMWORD PTR[Lone]
   527     vpaddd  CTR2, CTR0, XMMWORD PTR[Ltwo]
   528     vpaddd  CTR3, CTR2, XMMWORD PTR[Lone]
   529     vpaddd  CTR4, CTR2, XMMWORD PTR[Ltwo]
   530     vpaddd  CTR5, CTR4, XMMWORD PTR[Lone]
   531     vpaddd  CTR6, CTR4, XMMWORD PTR[Ltwo]
   532     vpaddd  CTR7, CTR6, XMMWORD PTR[Lone]
; Back to big-endian block order for AES
   534     vpshufb CTR0, CTR0, BSWAPMASK
   535     vpshufb CTR1, CTR1, BSWAPMASK
   536     vpshufb CTR2, CTR2, BSWAPMASK
   537     vpshufb CTR3, CTR3, BSWAPMASK
   538     vpshufb CTR4, CTR4, BSWAPMASK
   539     vpshufb CTR5, CTR5, BSWAPMASK
   540     vpshufb CTR6, CTR6, BSWAPMASK
   541     vpshufb CTR7, CTR7, BSWAPMASK
; Whitening: XOR round key 0 into all 8 counter blocks
   543     vmovdqu TMP3, XMMWORD PTR[0*16 + KS]
   544     vpxor   CTR0, CTR0, TMP3
   545     vpxor   CTR1, CTR1, TMP3
   546     vpxor   CTR2, CTR2, TMP3
   547     vpxor   CTR3, CTR3, TMP3
   548     vpxor   CTR4, CTR4, TMP3
   549     vpxor   CTR5, CTR5, TMP3
   550     vpxor   CTR6, CTR6, TMP3
   551     vpxor   CTR7, CTR7, TMP3
   553     ROUND   1
; Jump the ALU counter past the 8 in-flight blocks and refresh the
; stack-side counter slot 0; slots 1..7 are refreshed between rounds.
   555     add aluCTR, 8
   556     mov aluTMP, aluCTR
   557     xor aluTMP, aluKSl
   558     bswap   aluTMP
   559     mov [8*16 + 0*16 + 3*4 + rsp], aluTMP
   561     ROUND   2
   562     NEXTCTR 1
   563     ROUND   3
   564     NEXTCTR 2
   565     ROUND   4
   566     NEXTCTR 3
   567     ROUND   5
   568     NEXTCTR 4
   569     ROUND   6
   570     NEXTCTR 5
   571     ROUND   7
   572     NEXTCTR 6
   573     ROUND   8
   574     NEXTCTR 7
   575     ROUND   9
; Select the last round key by key size: 10 (AES-128), 12 (AES-192),
; or 14 (AES-256) rounds.
   576     vmovdqu TMP5, XMMWORD PTR[10*16 + KS]
   577     cmp     NR, 10
   578     je      @f
   580     ROUND   10
   581     ROUND   11
   582     vmovdqu TMP5, XMMWORD PTR[12*16 + KS]
   583     cmp     NR, 12
   584     je      @f
   586     ROUND   12
   587     ROUND   13
   588     vmovdqu TMP5, XMMWORD PTR[14*16 + KS]
   589 @@:
; Merge last round key with plaintext, so vaesenclast yields ciphertext
   590     vpxor   TMP3, TMP5, XMMWORD PTR[0*16 + PT]
   591     vaesenclast CTR0, CTR0, TMP3
   592     vpxor   TMP3, TMP5, XMMWORD PTR[1*16 + PT]
   593     vaesenclast CTR1, CTR1, TMP3
   594     vpxor   TMP3, TMP5, XMMWORD PTR[2*16 + PT]
   595     vaesenclast CTR2, CTR2, TMP3
   596     vpxor   TMP3, TMP5, XMMWORD PTR[3*16 + PT]
   597     vaesenclast CTR3, CTR3, TMP3
   598     vpxor   TMP3, TMP5, XMMWORD PTR[4*16 + PT]
   599     vaesenclast CTR4, CTR4, TMP3
   600     vpxor   TMP3, TMP5, XMMWORD PTR[5*16 + PT]
   601     vaesenclast CTR5, CTR5, TMP3
   602     vpxor   TMP3, TMP5, XMMWORD PTR[6*16 + PT]
   603     vaesenclast CTR6, CTR6, TMP3
   604     vpxor   TMP3, TMP5, XMMWORD PTR[7*16 + PT]
   605     vaesenclast CTR7, CTR7, TMP3
; Store ciphertext and byte-swap each block for the upcoming GHASH pass
   607     vmovdqu XMMWORD PTR[0*16 + CT], CTR0
   608     vpshufb CTR0, CTR0, BSWAPMASK
   609     vmovdqu XMMWORD PTR[1*16 + CT], CTR1
   610     vpshufb CTR1, CTR1, BSWAPMASK
   611     vmovdqu XMMWORD PTR[2*16 + CT], CTR2
   612     vpshufb CTR2, CTR2, BSWAPMASK
   613     vmovdqu XMMWORD PTR[3*16 + CT], CTR3
   614     vpshufb CTR3, CTR3, BSWAPMASK
   615     vmovdqu XMMWORD PTR[4*16 + CT], CTR4
   616     vpshufb CTR4, CTR4, BSWAPMASK
   617     vmovdqu XMMWORD PTR[5*16 + CT], CTR5
   618     vpshufb CTR5, CTR5, BSWAPMASK
   619     vmovdqu XMMWORD PTR[6*16 + CT], CTR6
   620     vpshufb CTR6, CTR6, BSWAPMASK
   621     vmovdqu XMMWORD PTR[7*16 + CT], CTR7
; TMP5 = first ciphertext block of the batch (multiplied by H^8 next
; iteration); the remaining seven are parked on the stack in GHASH order.
   622     vpshufb TMP5, CTR7, BSWAPMASK
   624     vmovdqa XMMWORD PTR[1*16 + rsp], CTR6
   625     vmovdqa XMMWORD PTR[2*16 + rsp], CTR5
   626     vmovdqa XMMWORD PTR[3*16 + rsp], CTR4
   627     vmovdqa XMMWORD PTR[4*16 + rsp], CTR3
   628     vmovdqa XMMWORD PTR[5*16 + rsp], CTR2
   629     vmovdqa XMMWORD PTR[6*16 + rsp], CTR1
   630     vmovdqa XMMWORD PTR[7*16 + rsp], CTR0
   632     lea CT, [8*16 + CT]
   633     lea PT, [8*16 + PT]
   634     jmp LEncDataOctets
; Steady-state loop: encrypt the next 8 blocks while hashing the
; previous batch's ciphertext; GHASH reduction is interleaved with the
; AES rounds via the ROUNDMUL macro.
   636 LEncDataOctets:
   637         cmp len, 128
   638         jb  LEndEncOctets
   639         sub len, 128
; Reload the precomputed (already key0-whitened) counter blocks
   641         vmovdqa CTR0, XMMWORD PTR[8*16 + 0*16 + rsp]
   642         vmovdqa CTR1, XMMWORD PTR[8*16 + 1*16 + rsp]
   643         vmovdqa CTR2, XMMWORD PTR[8*16 + 2*16 + rsp]
   644         vmovdqa CTR3, XMMWORD PTR[8*16 + 3*16 + rsp]
   645         vmovdqa CTR4, XMMWORD PTR[8*16 + 4*16 + rsp]
   646         vmovdqa CTR5, XMMWORD PTR[8*16 + 5*16 + rsp]
   647         vmovdqa CTR6, XMMWORD PTR[8*16 + 6*16 + rsp]
   648         vmovdqa CTR7, XMMWORD PTR[8*16 + 7*16 + rsp]
; Start GHASH of the previous batch: first block vs Htbl[0]
   650         vpshufd TMP4, TMP5, 78
   651         vpxor   TMP4, TMP4, TMP5
   652         vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
   653         vmovdqu     TMP4, XMMWORD PTR[0*16 + Htbl]
   654         vpclmulqdq  TMP1, TMP5, TMP4, 011h
   655         vpclmulqdq  TMP2, TMP5, TMP4, 000h
; Rounds 1-7: each hashes one parked ciphertext block and refreshes
; one stack-side counter for the batch after this one.
   657         vmovdqu TMP5, XMMWORD PTR[1*16 + rsp]
   658         ROUNDMUL 1
   659         NEXTCTR 0
   660         vmovdqu TMP5, XMMWORD PTR[2*16 + rsp]
   661         ROUNDMUL 2
   662         NEXTCTR 1
   663         vmovdqu TMP5, XMMWORD PTR[3*16 + rsp]
   664         ROUNDMUL 3
   665         NEXTCTR 2
   666         vmovdqu TMP5, XMMWORD PTR[4*16 + rsp]
   667         ROUNDMUL 4
   668         NEXTCTR 3
   669         vmovdqu TMP5, XMMWORD PTR[5*16 + rsp]
   670         ROUNDMUL 5
   671         NEXTCTR 4
   672         vmovdqu TMP5, XMMWORD PTR[6*16 + rsp]
   673         ROUNDMUL 6
   674         NEXTCTR 5
; Last hashed block absorbs the running tag T (same trick as in AAD)
   675         vpxor   TMP5, T, XMMWORD PTR[7*16 + rsp]
   676         ROUNDMUL 7
   677         NEXTCTR 6
   679         ROUND 8
   680         NEXTCTR 7
; Karatsuba fixup: TMP4 = high 128 bits, T = low 128 bits of product
   682         vpxor   TMP0, TMP0, TMP1
   683         vpxor   TMP0, TMP0, TMP2
   684         vpsrldq TMP3, TMP0, 8
   685         vpxor   TMP4, TMP1, TMP3
   686         vpslldq TMP3, TMP0, 8
   687         vpxor   T, TMP2, TMP3
; Reduction stage 1
   689         vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h
   690         vpalignr    T,T,T,8
   691         vpxor       T, T, TMP1
   693         ROUND 9
; Reduction stage 2
   695         vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h
   696         vpalignr    T,T,T,8
   697         vpxor       T, T, TMP1
; Key-size dispatch for the final round, as in the initial batch
   699         vmovdqu     TMP5, XMMWORD PTR[10*16 + KS]
   700         cmp         NR, 10
   701         je          @f
   703         ROUND 10
   704         ROUND 11
   705         vmovdqu     TMP5, XMMWORD PTR[12*16 + KS]
   706         cmp         NR, 12
   707         je          @f
   709         ROUND 12
   710         ROUND 13
   711         vmovdqu     TMP5, XMMWORD PTR[14*16 + KS]
   712 @@:
   713         vpxor   TMP3, TMP5, XMMWORD PTR[0*16 + PT]
   714         vaesenclast CTR0, CTR0, TMP3
   715         vpxor   TMP3, TMP5, XMMWORD PTR[1*16 + PT]
   716         vaesenclast CTR1, CTR1, TMP3
   717         vpxor   TMP3, TMP5, XMMWORD PTR[2*16 + PT]
   718         vaesenclast CTR2, CTR2, TMP3
   719         vpxor   TMP3, TMP5, XMMWORD PTR[3*16 + PT]
   720         vaesenclast CTR3, CTR3, TMP3
   721         vpxor   TMP3, TMP5, XMMWORD PTR[4*16 + PT]
   722         vaesenclast CTR4, CTR4, TMP3
   723         vpxor   TMP3, TMP5, XMMWORD PTR[5*16 + PT]
   724         vaesenclast CTR5, CTR5, TMP3
   725         vpxor   TMP3, TMP5, XMMWORD PTR[6*16 + PT]
   726         vaesenclast CTR6, CTR6, TMP3
   727         vpxor   TMP3, TMP5, XMMWORD PTR[7*16 + PT]
   728         vaesenclast CTR7, CTR7, TMP3
; Emit ciphertext; byte-swap and park the blocks for the next GHASH pass
   730         vmovdqu XMMWORD PTR[0*16 + CT], CTR0
   731         vpshufb CTR0, CTR0, BSWAPMASK
   732         vmovdqu XMMWORD PTR[1*16 + CT], CTR1
   733         vpshufb CTR1, CTR1, BSWAPMASK
   734         vmovdqu XMMWORD PTR[2*16 + CT], CTR2
   735         vpshufb CTR2, CTR2, BSWAPMASK
   736         vmovdqu XMMWORD PTR[3*16 + CT], CTR3
   737         vpshufb CTR3, CTR3, BSWAPMASK
   738         vmovdqu XMMWORD PTR[4*16 + CT], CTR4
   739         vpshufb CTR4, CTR4, BSWAPMASK
   740         vmovdqu XMMWORD PTR[5*16 + CT], CTR5
   741         vpshufb CTR5, CTR5, BSWAPMASK
   742         vmovdqu XMMWORD PTR[6*16 + CT], CTR6
   743         vpshufb CTR6, CTR6, BSWAPMASK
   744         vmovdqu XMMWORD PTR[7*16 + CT], CTR7
   745         vpshufb TMP5, CTR7, BSWAPMASK
   747         vmovdqa XMMWORD PTR[1*16 + rsp], CTR6
   748         vmovdqa XMMWORD PTR[2*16 + rsp], CTR5
   749         vmovdqa XMMWORD PTR[3*16 + rsp], CTR4
   750         vmovdqa XMMWORD PTR[4*16 + rsp], CTR3
   751         vmovdqa XMMWORD PTR[5*16 + rsp], CTR2
   752         vmovdqa XMMWORD PTR[6*16 + rsp], CTR1
   753         vmovdqa XMMWORD PTR[7*16 + rsp], CTR0
; Finalize the reduction by folding in the product's high half
   755         vpxor   T, T, TMP4
   757         lea CT, [8*16 + CT]
   758         lea PT, [8*16 + PT]
   759         jmp LEncDataOctets
; Bulk path done: GHASH the last parked batch (no more AES interleave)
   761 LEndEncOctets:
   763     vpshufd TMP4, TMP5, 78
   764     vpxor   TMP4, TMP4, TMP5
   765     vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
   766     vmovdqu     TMP4, XMMWORD PTR[0*16 + Htbl]
   767     vpclmulqdq  TMP1, TMP5, TMP4, 011h
   768     vpclmulqdq  TMP2, TMP5, TMP4, 000h
   770     vmovdqu TMP5, XMMWORD PTR[1*16 + rsp]
   771     KARATSUBA 1
   772     vmovdqu TMP5, XMMWORD PTR[2*16 + rsp]
   773     KARATSUBA 2
   774     vmovdqu TMP5, XMMWORD PTR[3*16 + rsp]
   775     KARATSUBA 3
   776     vmovdqu TMP5, XMMWORD PTR[4*16 + rsp]
   777     KARATSUBA 4
   778     vmovdqu TMP5, XMMWORD PTR[5*16 + rsp]
   779     KARATSUBA 5
   780     vmovdqu TMP5, XMMWORD PTR[6*16 + rsp]
   781     KARATSUBA 6
   782     vpxor   TMP5, T, XMMWORD PTR[7*16 + rsp]
   783     KARATSUBA 7
; Karatsuba fixup and full two-stage reduction of the final batch
   785     vpxor   TMP0, TMP0, TMP1
   786     vpxor   TMP0, TMP0, TMP2
   787     vpsrldq TMP3, TMP0, 8
   788     vpxor   TMP4, TMP1, TMP3
   789     vpslldq TMP3, TMP0, 8
   790     vpxor   T, TMP2, TMP3
   792     vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h
   793     vpalignr    T,T,T,8
   794     vpxor       T, T, TMP1
   796     vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h
   797     vpalignr    T,T,T,8
   798     vpxor       T, T, TMP1
   800     vpxor       T, T, TMP4
; The singles path advances the counter one at a time via NEXTCTR 0,
; so undo the 7 extra increments made for the in-flight batch.
   802     sub aluCTR, 7
; Whole 16-byte blocks remaining after the 8-block batches
   804 LEncDataSingles:
   806         cmp len, 16
   807         jb  LEncDataTail
   808         sub len, 16
; Counter block at slot 0 is pre-whitened with round key 0, so the
; AES rounds start directly at round key 1.
   810         vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + rsp]
   811         NEXTCTR 0
   813         vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
   814         vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
   815         vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
   816         vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
   817         vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
   818         vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
   819         vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
   820         vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
   821         vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
   822         vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
   823         cmp NR, 10
   824         je  @f
   825         vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
   826         vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
   827         vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
   828         cmp NR, 12
   829         je  @f
   830         vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
   831         vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
   832         vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
   833 @@:
   834         vaesenclast TMP1, TMP1, TMP2
   835         vpxor   TMP1, TMP1, XMMWORD PTR[PT]
   836         vmovdqu XMMWORD PTR[CT], TMP1
   838         lea PT, [16+PT]
   839         lea CT, [16+CT]
; Fold this ciphertext block into the tag: T = (T ^ C) * H
   841         vpshufb TMP1, TMP1, BSWAPMASK
   842         vpxor   T, T, TMP1
   843         vmovdqu TMP0, XMMWORD PTR[Htbl]
   844         GFMUL   T, T, TMP0, TMP1, TMP2, TMP3, TMP4
   846         jmp LEncDataSingles
; Final partial block (1..15 bytes), encrypted via a stack bounce buffer
   848 LEncDataTail:
   850     test    len, len
   851     jz  LEncDataEnd
   853     vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + rsp]
   855     vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
   856     vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
   857     vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
   858     vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
   859     vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
   860     vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
   861     vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
   862     vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
   863     vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
   864     vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
   865     cmp NR, 10
   866     je  @f
   867     vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
   868     vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
   869     vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
   870     cmp NR, 12
   871     je  @f
   872     vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
   873     vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
   874     vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
   875 @@:
   876     vaesenclast TMP1, TMP1, TMP2
   877 ; zero a temp location
   878     vpxor   TMP2, TMP2, TMP2
   879     vmovdqa XMMWORD PTR[rsp], TMP2
   880 ; copy as many bytes as needed
; KS is reused as a plain byte counter from here on (key schedule
; pointer is no longer needed).
   881     xor KS, KS
; Copy the len tail bytes of plaintext into the zeroed bounce buffer
   883 @@:
   884         cmp len, KS
   885         je  @f
   886         mov al, [PT + KS]
   887         mov [rsp + KS], al
   888         inc KS
   889         jmp @b
; XOR with the keystream block; keep the full XORed block on the stack
   890 @@:
   891     vpxor   TMP1, TMP1, XMMWORD PTR[rsp]
   892     vmovdqa XMMWORD PTR[rsp], TMP1
   893     xor KS, KS
; Write only the len valid ciphertext bytes out
   894 @@:
   895         cmp len, KS
   896         je  @f
   897         mov al, [rsp + KS]
   898         mov [CT + KS], al
   899         inc KS
   900         jmp @b
; Zero-pad the remainder of the stack block so GHASH sees C || 0*
   901 @@:
   902         cmp KS, 16
   903         je  @f
   904         mov BYTE PTR[rsp + KS], 0
   905         inc KS
   906         jmp @b
   907 @@:
   908 BAIL:
; Hash the padded final block into the tag
   909     vmovdqa TMP1, XMMWORD PTR[rsp]
   910     vpshufb TMP1, TMP1, BSWAPMASK
   911     vpxor   T, T, TMP1
   912     vmovdqu TMP0, XMMWORD PTR[Htbl]
   913     GFMUL   T, T, TMP0, TMP1, TMP2, TMP3, TMP4
; Write the tag and the advanced counter low dword back into Gctx
   915 LEncDataEnd:
   917     vmovdqu XMMWORD PTR[16*16 + 1*16 + Gctx], T
   918     bswap   aluCTR
   919     mov     [16*16 + 2*16 + 3*4 + Gctx], aluCTR
; Epilogue: unwind the aligned scratch area, restore xmm6-xmm15 and GPRs
   921     mov rsp, rbp
   923     vmovdqu xmm6, XMMWORD PTR[rsp + 0*16]
   924     vmovdqu xmm7, XMMWORD PTR[rsp + 1*16]
   925     vmovdqu xmm8, XMMWORD PTR[rsp + 2*16]
   926     vmovdqu xmm9, XMMWORD PTR[rsp + 3*16]
   927     vmovdqu xmm10, XMMWORD PTR[rsp + 4*16]
   928     vmovdqu xmm11, XMMWORD PTR[rsp + 5*16]
   929     vmovdqu xmm12, XMMWORD PTR[rsp + 6*16]
   930     vmovdqu xmm13, XMMWORD PTR[rsp + 7*16]
   931     vmovdqu xmm14, XMMWORD PTR[rsp + 8*16]
   932     vmovdqu xmm15, XMMWORD PTR[rsp + 9*16]
   934     add rsp, 10*16
   935     pop rbp
   936     pop r13
   937     pop r12
   938     pop r11
   940     vzeroupper
   942     ret
   943 intel_aes_gcmENC ENDP
   945 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   946 ;
   947 ; Decrypt and Authenticate
   948 ; void intel_aes_gcmDEC(uint8_t* PT, uint8_t* CT, void *Gctx, unsigned int len);
   949 ;
   950 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   952 ALIGN 16
   953 intel_aes_gcmDEC PROC
   955 NEXTCTR MACRO i
   956     add aluCTR, 1
   957     mov aluTMP, aluCTR
   958     xor aluTMP, aluKSl
   959     bswap   aluTMP
   960     mov [3*4 + i*16 + rsp], aluTMP
   961 ENDM
   963 PT      textequ <rdx>
   964 CT      textequ <rcx>
   966     test  len, len
   967     jnz   LbeginDEC
   968     ret
   970 LbeginDEC:
   972     vzeroupper
   973     push    r11
   974     push    r12
   975     push    r13
   976     push    rbp
   977     sub rsp, 10*16
   978     vmovdqu XMMWORD PTR[rsp + 0*16], xmm6
   979     vmovdqu XMMWORD PTR[rsp + 1*16], xmm7
   980     vmovdqu XMMWORD PTR[rsp + 2*16], xmm8
   981     vmovdqu XMMWORD PTR[rsp + 3*16], xmm9
   982     vmovdqu XMMWORD PTR[rsp + 4*16], xmm10
   983     vmovdqu XMMWORD PTR[rsp + 5*16], xmm11
   984     vmovdqu XMMWORD PTR[rsp + 6*16], xmm12
   985     vmovdqu XMMWORD PTR[rsp + 7*16], xmm13
   986     vmovdqu XMMWORD PTR[rsp + 8*16], xmm14
   987     vmovdqu XMMWORD PTR[rsp + 9*16], xmm15
   989     mov rbp, rsp
   990     sub rsp, 8*16
   991     and rsp, -16
   993     vmovdqu T, XMMWORD PTR[16*16 + 1*16 + Gctx]
   994     vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx]
   995     vmovdqu BSWAPMASK, XMMWORD PTR[Lbswap_mask]
   996     mov     KS, [16*16 + 3*16 + Gctx]
   997     mov     NR, [4 + KS]
   998     lea     KS, [48 + KS]
  1000     vpshufb CTR0, CTR0, BSWAPMASK
  1002     mov aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
  1003     mov aluKSl, [3*4 + KS]
  1004     bswap   aluCTR
  1005     bswap   aluKSl
  1007     vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
  1008     vpxor   TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
  1009     vmovdqu XMMWORD PTR[0*16 + rsp], TMP0
  1011     cmp len, 128
  1012     jb  LDecDataSingles
  1013 ; Prepare the "top" counters
  1014     vmovdqu XMMWORD PTR[1*16 + rsp], TMP0
  1015     vmovdqu XMMWORD PTR[2*16 + rsp], TMP0
  1016     vmovdqu XMMWORD PTR[3*16 + rsp], TMP0
  1017     vmovdqu XMMWORD PTR[4*16 + rsp], TMP0
  1018     vmovdqu XMMWORD PTR[5*16 + rsp], TMP0
  1019     vmovdqu XMMWORD PTR[6*16 + rsp], TMP0
  1020     vmovdqu XMMWORD PTR[7*16 + rsp], TMP0
  1022     NEXTCTR 1
  1023     NEXTCTR 2
  1024     NEXTCTR 3
  1025     NEXTCTR 4
  1026     NEXTCTR 5
  1027     NEXTCTR 6
  1028     NEXTCTR 7
  1030 LDecDataOctets:
  1031         cmp len, 128
  1032         jb  LEndDecOctets
  1033         sub len, 128
  1035         vmovdqa CTR0, XMMWORD PTR[0*16 + rsp]
  1036         vmovdqa CTR1, XMMWORD PTR[1*16 + rsp]
  1037         vmovdqa CTR2, XMMWORD PTR[2*16 + rsp]
  1038         vmovdqa CTR3, XMMWORD PTR[3*16 + rsp]
  1039         vmovdqa CTR4, XMMWORD PTR[4*16 + rsp]
  1040         vmovdqa CTR5, XMMWORD PTR[5*16 + rsp]
  1041         vmovdqa CTR6, XMMWORD PTR[6*16 + rsp]
  1042         vmovdqa CTR7, XMMWORD PTR[7*16 + rsp]
  1044         vmovdqu TMP5, XMMWORD PTR[7*16 + CT]
  1045         vpshufb TMP5, TMP5, BSWAPMASK
  1046         vpshufd TMP4, TMP5, 78
  1047         vpxor   TMP4, TMP4, TMP5
  1048         vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
  1049         vmovdqu     TMP4, XMMWORD PTR[0*16 + Htbl]
  1050         vpclmulqdq  TMP1, TMP5, TMP4, 011h
  1051         vpclmulqdq  TMP2, TMP5, TMP4, 000h
  1053         vmovdqu TMP5, XMMWORD PTR[6*16 + CT]
  1054         vpshufb TMP5, TMP5, BSWAPMASK
  1055         ROUNDMUL 1
  1056         NEXTCTR 0
  1057         vmovdqu TMP5, XMMWORD PTR[5*16 + CT]
  1058         vpshufb TMP5, TMP5, BSWAPMASK
  1059         ROUNDMUL 2
  1060         NEXTCTR 1
  1061         vmovdqu TMP5, XMMWORD PTR[4*16 + CT]
  1062         vpshufb TMP5, TMP5, BSWAPMASK
  1063         ROUNDMUL 3
  1064         NEXTCTR 2
  1065         vmovdqu TMP5, XMMWORD PTR[3*16 + CT]
  1066         vpshufb TMP5, TMP5, BSWAPMASK
  1067         ROUNDMUL 4
  1068         NEXTCTR 3
  1069         vmovdqu TMP5, XMMWORD PTR[2*16 + CT]
  1070         vpshufb TMP5, TMP5, BSWAPMASK
  1071         ROUNDMUL 5
  1072         NEXTCTR 4
  1073         vmovdqu TMP5, XMMWORD PTR[1*16 + CT]
  1074         vpshufb TMP5, TMP5, BSWAPMASK
  1075         ROUNDMUL 6
  1076         NEXTCTR 5
  1077         vmovdqu TMP5, XMMWORD PTR[0*16 + CT]
  1078         vpshufb TMP5, TMP5, BSWAPMASK
  1079         vpxor   TMP5, TMP5, T
  1080         ROUNDMUL 7
  1081         NEXTCTR 6
  1083         ROUND 8
  1084         NEXTCTR 7
  1086         vpxor   TMP0, TMP0, TMP1
  1087         vpxor   TMP0, TMP0, TMP2
  1088         vpsrldq TMP3, TMP0, 8
  1089         vpxor   TMP4, TMP1, TMP3
  1090         vpslldq TMP3, TMP0, 8
  1091         vpxor   T, TMP2, TMP3
  1093         vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h
  1094         vpalignr    T,T,T,8
  1095         vpxor       T, T, TMP1
  1097         ROUND 9
  1099         vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h
  1100         vpalignr    T,T,T,8
  1101         vpxor       T, T, TMP1
  1103         vmovdqu     TMP5, XMMWORD PTR[10*16 + KS]
  1104         cmp         NR, 10
  1105         je          @f
  1107         ROUND 10
  1108         ROUND 11
  1109         vmovdqu     TMP5, XMMWORD PTR[12*16 + KS]
  1110         cmp         NR, 12
  1111         je          @f
  1113         ROUND 12
  1114         ROUND 13
  1115         vmovdqu     TMP5, XMMWORD PTR[14*16 + KS]
  1116 @@:
  1117         vpxor   TMP3, TMP5, XMMWORD PTR[0*16 + CT]
  1118         vaesenclast CTR0, CTR0, TMP3
  1119         vpxor   TMP3, TMP5, XMMWORD PTR[1*16 + CT]
  1120         vaesenclast CTR1, CTR1, TMP3
  1121         vpxor   TMP3, TMP5, XMMWORD PTR[2*16 + CT]
  1122         vaesenclast CTR2, CTR2, TMP3
  1123         vpxor   TMP3, TMP5, XMMWORD PTR[3*16 + CT]
  1124         vaesenclast CTR3, CTR3, TMP3
  1125         vpxor   TMP3, TMP5, XMMWORD PTR[4*16 + CT]
  1126         vaesenclast CTR4, CTR4, TMP3
  1127         vpxor   TMP3, TMP5, XMMWORD PTR[5*16 + CT]
  1128         vaesenclast CTR5, CTR5, TMP3
  1129         vpxor   TMP3, TMP5, XMMWORD PTR[6*16 + CT]
  1130         vaesenclast CTR6, CTR6, TMP3
  1131         vpxor   TMP3, TMP5, XMMWORD PTR[7*16 + CT]
  1132         vaesenclast CTR7, CTR7, TMP3
  1134         vmovdqu XMMWORD PTR[0*16 + PT], CTR0
  1135         vmovdqu XMMWORD PTR[1*16 + PT], CTR1
  1136         vmovdqu XMMWORD PTR[2*16 + PT], CTR2
  1137         vmovdqu XMMWORD PTR[3*16 + PT], CTR3
  1138         vmovdqu XMMWORD PTR[4*16 + PT], CTR4
  1139         vmovdqu XMMWORD PTR[5*16 + PT], CTR5
  1140         vmovdqu XMMWORD PTR[6*16 + PT], CTR6
  1141         vmovdqu XMMWORD PTR[7*16 + PT], CTR7
  1143         vpxor   T, T, TMP4
  1145         lea CT, [8*16 + CT]
  1146         lea PT, [8*16 + PT]
  1147         jmp LDecDataOctets
  1149 LEndDecOctets:
  1151     sub aluCTR, 7
  1153 LDecDataSingles:
  1155         cmp len, 16
  1156         jb  LDecDataTail
  1157         sub len, 16
  1159         vmovdqa TMP1, XMMWORD PTR[0*16 + rsp]
  1160         NEXTCTR 0
  1162         vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
  1163         vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
  1164         vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
  1165         vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
  1166         vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
  1167         vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
  1168         vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
  1169         vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
  1170         vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
  1171         vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
  1172         cmp NR, 10
  1173         je  @f
  1174         vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
  1175         vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
  1176         vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
  1177         cmp NR, 12
  1178         je  @f
  1179         vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
  1180         vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
  1181         vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
  1182 @@:
  1183         vaesenclast TMP1, TMP1, TMP2
  1185         vmovdqu TMP2, XMMWORD PTR[CT]
  1186         vpxor   TMP1, TMP1, TMP2
  1187         vmovdqu XMMWORD PTR[PT], TMP1
  1189         lea PT, [16+PT]
  1190         lea CT, [16+CT]
  1192         vpshufb TMP2, TMP2, BSWAPMASK
  1193         vpxor   T, T, TMP2
  1194         vmovdqu TMP0, XMMWORD PTR[Htbl]
  1195         GFMUL   T, T, TMP0, TMP1, TMP2, TMP3, TMP4
  1197         jmp LDecDataSingles
  1199 LDecDataTail:
  1201     test    len, len
  1202     jz      LDecDataEnd
  1204     vmovdqa TMP1, XMMWORD PTR[0*16 + rsp]
  1205     inc aluCTR
  1206     vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
  1207     vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
  1208     vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
  1209     vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
  1210     vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
  1211     vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
  1212     vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
  1213     vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
  1214     vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
  1215     vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
  1216     cmp NR, 10
  1217     je  @f
  1218     vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
  1219     vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
  1220     vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
  1221     cmp NR, 12
  1222     je  @f
  1223     vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
  1224     vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
  1225     vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
  1226 @@:
  1227     vaesenclast TMP1, TMP1, TMP2
  1228 ; copy as many bytes as needed
  1229     xor KS, KS
  1230 @@:
  1231         cmp len, KS
  1232         je  @f
  1233         mov al, [CT + KS]
  1234         mov [rsp + KS], al
  1235         inc KS
  1236         jmp @b
  1237 @@:
  1238         cmp KS, 16
  1239         je  @f
  1240         mov BYTE PTR[rsp + KS], 0
  1241         inc KS
  1242         jmp @b
  1243 @@:
  1244     vmovdqa TMP2, XMMWORD PTR[rsp]
  1245     vpshufb TMP2, TMP2, BSWAPMASK
  1246     vpxor   T, T, TMP2
  1247     vmovdqu TMP0, XMMWORD PTR[Htbl]
  1248     GFMUL   T, T, TMP0, TMP5, TMP2, TMP3, TMP4
  1251     vpxor   TMP1, TMP1, XMMWORD PTR[rsp]
  1252     vmovdqa XMMWORD PTR[rsp], TMP1
  1253     xor KS, KS
  1254 @@:
  1255         cmp len, KS
  1256         je  @f
  1257         mov al, [rsp + KS]
  1258         mov [PT + KS], al
  1259         inc KS
  1260         jmp @b
  1261 @@:
  1263 LDecDataEnd:
  1265     vmovdqu XMMWORD PTR[16*16 + 1*16 + Gctx], T
  1266     bswap   aluCTR
  1267     mov     [16*16 + 2*16 + 3*4 + Gctx], aluCTR
  1269     mov rsp, rbp
  1271     vmovdqu xmm6, XMMWORD PTR[rsp + 0*16]
  1272     vmovdqu xmm7, XMMWORD PTR[rsp + 1*16]
  1273     vmovdqu xmm8, XMMWORD PTR[rsp + 2*16]
  1274     vmovdqu xmm9, XMMWORD PTR[rsp + 3*16]
  1275     vmovdqu xmm10, XMMWORD PTR[rsp + 4*16]
  1276     vmovdqu xmm11, XMMWORD PTR[rsp + 5*16]
  1277     vmovdqu xmm12, XMMWORD PTR[rsp + 6*16]
  1278     vmovdqu xmm13, XMMWORD PTR[rsp + 7*16]
  1279     vmovdqu xmm14, XMMWORD PTR[rsp + 8*16]
  1280     vmovdqu xmm15, XMMWORD PTR[rsp + 9*16]
  1282     add rsp, 10*16
  1283     pop rbp
  1284     pop r13
  1285     pop r12
  1286     pop r11
  1288     vzeroupper
  1290     ret
  1291 ret
  1292 intel_aes_gcmDEC ENDP
  1295 END

mercurial