security/nss/lib/freebl/intel-gcm.s

author       Michael Schloh von Bennewitz <michael@schloh.com>
date         Wed, 31 Dec 2014 06:09:35 +0100
changeset    0:6474c204b198
permissions  -rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

     1 # LICENSE:                                                                  
     2 # This submission to NSS is to be made available under the terms of the
     3 # Mozilla Public License, v. 2.0. You can obtain one at
     4 # http://mozilla.org/MPL/2.0/.
     5 ################################################################################
     6 # Copyright(c) 2012, Intel Corp.
     8 .align  16
     9 .Lone:
    10 .quad 1,0
    11 .Ltwo:
    12 .quad 2,0
    13 .Lbswap_mask:
    14 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
    15 .Lshuff_mask:
    16 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
    17 .Lpoly:
    18 .quad 0x1, 0xc200000000000000 
    21 ################################################################################
    22 # Generates the final GCM tag
    23 # void intel_aes_gcmTAG(uint8_t Htbl[16*16], uint8_t *Tp, uint64_t Mlen, uint64_t Alen, uint8_t* X0, uint8_t* TAG);
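        # Roughly, in C-like pseudocode (an illustrative sketch only; GFMUL is the
        # GF(2^128) multiply defined at the end of this file, and Htbl[0] holds H*2):
        #     T  = byteswap(*Tp);
        #     T ^= ((__uint128_t)(Alen*8) << 64) | (Mlen*8);   // lengths in bits
        #     T  = GFMUL(T, Htbl[0]);
        #     *TAG = byteswap(T) ^ *X0;   // X0 is expected to hold the encrypted
        #                                 // initial counter block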
    24 .type intel_aes_gcmTAG,@function
    25 .globl intel_aes_gcmTAG
    26 .align 16
    27 intel_aes_gcmTAG:
    29 .set  Htbl, %rdi
    30 .set  Tp, %rsi
    31 .set  Mlen, %rdx
    32 .set  Alen, %rcx
    33 .set  X0, %r8
    34 .set  TAG, %r9
    36 .set T,%xmm0
    37 .set TMP0,%xmm1
    39    vmovdqu  (Tp), T
    40    vpshufb  .Lbswap_mask(%rip), T, T
    41    vpxor    TMP0, TMP0, TMP0
    42    shl      $3, Mlen
    43    shl      $3, Alen
    44    vpinsrq  $0, Mlen, TMP0, TMP0
    45    vpinsrq  $1, Alen, TMP0, TMP0
    46    vpxor    TMP0, T, T
    47    vmovdqu  (Htbl), TMP0
    48    call     GFMUL
    49    vpshufb  .Lbswap_mask(%rip), T, T
    50    vpxor    (X0), T, T
    51    vmovdqu  T, (TAG)
    53 ret
    54 .size intel_aes_gcmTAG, .-intel_aes_gcmTAG
    55 ################################################################################
    56 # Generates the H table
    57 # void intel_aes_gcmINIT(uint8_t Htbl[16*16], uint8_t *KS, int NR);
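        # Computes H = AES_encrypt(key, 0^128) (starting the state from round key 0
        # is equivalent to encrypting the all-zero block), doubles it in GF(2^128),
        # and stores H*2, H^2*2, ..., H^8*2 at Htbl[0..127].  Htbl[128..255] holds,
        # for each power, the xor of its two 64-bit halves, which the KARATSUBA
        # macros below use for the middle Karatsuba product.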
    58 .type intel_aes_gcmINIT,@function
    59 .globl intel_aes_gcmINIT
    60 .align 16
    61 intel_aes_gcmINIT:
    63 .set  Htbl, %rdi
    64 .set  KS, %rsi
    65 .set  NR, %edx
    67 .set T,%xmm0
    68 .set TMP0,%xmm1
    70 CALCULATE_POWERS_OF_H:
    71     vmovdqu      16*0(KS), T
    72     vaesenc      16*1(KS), T, T
    73     vaesenc      16*2(KS), T, T
    74     vaesenc      16*3(KS), T, T
    75     vaesenc      16*4(KS), T, T
    76     vaesenc      16*5(KS), T, T
    77     vaesenc      16*6(KS), T, T
    78     vaesenc      16*7(KS), T, T
    79     vaesenc      16*8(KS), T, T
    80     vaesenc      16*9(KS), T, T
    81     vmovdqu      16*10(KS), TMP0
    82     cmp          $10, NR
    83     je           .LH0done
    84     vaesenc      16*10(KS), T, T
    85     vaesenc      16*11(KS), T, T
    86     vmovdqu      16*12(KS), TMP0
    87     cmp          $12, NR
    88     je           .LH0done
    89     vaesenc      16*12(KS), T, T
    90     vaesenc      16*13(KS), T, T
    91     vmovdqu      16*14(KS), TMP0
    93 .LH0done:
    94     vaesenclast  TMP0, T, T
    96     vpshufb      .Lbswap_mask(%rip), T, T  
    98     vmovdqu	T, TMP0
    99     # Calculate H' = GFMUL(H, 2)
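        # (left-shift H by one bit as a 128-bit value and conditionally xor in
        #  .Lpoly when the bit that falls off the top was set; the vpsrld/vpshufb/
        #  vpand sequence below builds, from that top bit, either .Lpoly or zero
        #  in %xmm5)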
   100     vpsrld	$7 , T , %xmm3
   101     vmovdqu	.Lshuff_mask(%rip), %xmm4
   102     vpshufb	%xmm4, %xmm3 , %xmm3
   103     movq	$0xff00 , %rax
   104     vmovq	%rax, %xmm4
   105     vpshufb	%xmm3, %xmm4 , %xmm4
   106     vmovdqu	.Lpoly(%rip), %xmm5
   107     vpand	%xmm4, %xmm5, %xmm5
   108     vpsrld	$31, T, %xmm3
   109     vpslld	$1, T, %xmm4
   110     vpslldq	$4, %xmm3, %xmm3
    111     vpxor   %xmm3, %xmm4, T   # T now holds p(x)<<1
    113     # xor %xmm5 (the conditional polynomial term) into p(x)<<1
   114     vpxor     %xmm5, T , T
   115     vmovdqu   T, TMP0
   116     vmovdqu   T, (Htbl)     # H * 2
   117     call  GFMUL
   118     vmovdqu  T, 16(Htbl)    # H^2 * 2
   119     call  GFMUL
   120     vmovdqu  T, 32(Htbl)    # H^3 * 2
   121     call  GFMUL
   122     vmovdqu  T, 48(Htbl)    # H^4 * 2
   123     call  GFMUL
   124     vmovdqu  T, 64(Htbl)    # H^5 * 2
   125     call  GFMUL
   126     vmovdqu  T, 80(Htbl)    # H^6 * 2
   127     call  GFMUL
   128     vmovdqu  T, 96(Htbl)    # H^7 * 2
   129     call  GFMUL
   130     vmovdqu  T, 112(Htbl)   # H^8 * 2  
   132     # Precalculations for the reduce 4 step
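        # vpshufd $78 swaps the two 64-bit halves of each power of H; xoring that
        # with the original value gives (hi^lo, hi^lo), the operand needed for the
        # middle Karatsuba product.  Storing these at Htbl+128 lets the per-block
        # hashing code below avoid recomputing them.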
   133     vpshufd  $78, (Htbl), %xmm8
   134     vpshufd  $78, 16(Htbl), %xmm9
   135     vpshufd  $78, 32(Htbl), %xmm10
   136     vpshufd  $78, 48(Htbl), %xmm11
   137     vpshufd  $78, 64(Htbl), %xmm12
   138     vpshufd  $78, 80(Htbl), %xmm13
   139     vpshufd  $78, 96(Htbl), %xmm14
   140     vpshufd  $78, 112(Htbl), %xmm15
   142     vpxor  (Htbl), %xmm8, %xmm8
   143     vpxor  16(Htbl), %xmm9, %xmm9
   144     vpxor  32(Htbl), %xmm10, %xmm10
   145     vpxor  48(Htbl), %xmm11, %xmm11
   146     vpxor  64(Htbl), %xmm12, %xmm12
   147     vpxor  80(Htbl), %xmm13, %xmm13
   148     vpxor  96(Htbl), %xmm14, %xmm14
   149     vpxor  112(Htbl), %xmm15, %xmm15
   151     vmovdqu   %xmm8, 128(Htbl)
   152     vmovdqu   %xmm9, 144(Htbl)
   153     vmovdqu   %xmm10, 160(Htbl)
   154     vmovdqu   %xmm11, 176(Htbl)
   155     vmovdqu   %xmm12, 192(Htbl)
   156     vmovdqu   %xmm13, 208(Htbl)
   157     vmovdqu   %xmm14, 224(Htbl)
   158     vmovdqu   %xmm15, 240(Htbl)
   160     ret
   161 .size intel_aes_gcmINIT, .-intel_aes_gcmINIT
   162 ################################################################################
   163 # Authenticate only
   164 # void intel_aes_gcmAAD(uint8_t Htbl[16*16], uint8_t *AAD, uint64_t Alen, uint8_t *Tp);
   166 .globl  intel_aes_gcmAAD
   167 .type   intel_aes_gcmAAD,@function
   168 .align  16
   169 intel_aes_gcmAAD:
   171 .set DATA, %xmm0
   172 .set T, %xmm1
   173 .set BSWAP_MASK, %xmm2
   174 .set TMP0, %xmm3
   175 .set TMP1, %xmm4
   176 .set TMP2, %xmm5
   177 .set TMP3, %xmm6
   178 .set TMP4, %xmm7
   179 .set Xhi, %xmm9
   181 .set Htbl, %rdi
   182 .set inp, %rsi
   183 .set len, %rdx
   184 .set Tp, %rcx
   186 .set hlp0, %r11
   188 .macro KARATSUBA_AAD i
   189     vpclmulqdq  $0x00, 16*\i(Htbl), DATA, TMP3
   190     vpxor       TMP3, TMP0, TMP0
   191     vpclmulqdq  $0x11, 16*\i(Htbl), DATA, TMP3
   192     vpxor       TMP3, TMP1, TMP1
   193     vpshufd     $78,  DATA, TMP3
   194     vpxor       DATA, TMP3, TMP3
   195     vpclmulqdq  $0x00, 16*(\i+8)(Htbl), TMP3, TMP3
   196     vpxor       TMP3, TMP2, TMP2
   197 .endm
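        # KARATSUBA_AAD accumulates three partial products per block: TMP0 collects
        # the low*low halves, TMP1 the high*high halves, and TMP2 the
        # (hi^lo)*(hi^lo) middle products; the fixups at .Lred1 and after the main
        # loop recover the true middle term as TMP2 ^ TMP0 ^ TMP1.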
   199     test  len, len
   200     jnz   .LbeginAAD
   201     ret
   203 .LbeginAAD:
   205    push  hlp0
   206    vzeroupper
   208    vmovdqa  .Lbswap_mask(%rip), BSWAP_MASK
   210    vpxor    Xhi, Xhi, Xhi
   212    vmovdqu  (Tp),T
   213    vpshufb  BSWAP_MASK,T,T
    215    # we hash 8 blocks per iteration; if the total number of blocks is not a multiple of 8, we hash the first n%8 blocks first
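        # For a full batch of 8 blocks X1..X8 the update computed by the main loop
        # below is
        #     T = (X1 ^ T)*H^8 ^ X2*H^7 ^ ... ^ X8*H
        # with a single deferred reduction, using the powers of H (premultiplied
        # by 2) that intel_aes_gcmINIT stored in Htbl.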
   216     mov     len, hlp0
   217     and	    $~-128, hlp0
   219     jz      .Lmod_loop
   221     sub     hlp0, len
   222     sub     $16, hlp0
   224    #hash first prefix block
   225 	vmovdqu (inp), DATA
   226 	vpshufb  BSWAP_MASK, DATA, DATA
   227 	vpxor    T, DATA, DATA
   229 	vpclmulqdq  $0x00, (Htbl, hlp0), DATA, TMP0
   230 	vpclmulqdq  $0x11, (Htbl, hlp0), DATA, TMP1
   231 	vpshufd     $78, DATA, TMP2
   232 	vpxor       DATA, TMP2, TMP2
   233 	vpclmulqdq  $0x00, 16*8(Htbl, hlp0), TMP2, TMP2
   235 	lea	    16(inp), inp
   236 	test    hlp0, hlp0
   237 	jnz	    .Lpre_loop
   238 	jmp	    .Lred1
    240     # hash the remaining prefix blocks (up to 7 prefix blocks in total)
   241 .align 64
   242 .Lpre_loop:
   244     sub	$16, hlp0
   246     vmovdqu     (inp),DATA           # next data block
   247     vpshufb     BSWAP_MASK,DATA,DATA
   249     vpclmulqdq  $0x00, (Htbl,hlp0), DATA, TMP3
   250     vpxor       TMP3, TMP0, TMP0
   251     vpclmulqdq  $0x11, (Htbl,hlp0), DATA, TMP3
   252     vpxor       TMP3, TMP1, TMP1
   253     vpshufd	    $78, DATA, TMP3
   254     vpxor       DATA, TMP3, TMP3
   255     vpclmulqdq  $0x00, 16*8(Htbl,hlp0), TMP3, TMP3
   256     vpxor       TMP3, TMP2, TMP2
   258     test	hlp0, hlp0
   260     lea	16(inp), inp
   262     jnz	.Lpre_loop
   264 .Lred1:
   265     vpxor       TMP0, TMP2, TMP2
   266     vpxor       TMP1, TMP2, TMP2
   267     vpsrldq     $8, TMP2, TMP3
   268     vpslldq     $8, TMP2, TMP2
   270     vpxor       TMP3, TMP1, Xhi
   271     vpxor       TMP2, TMP0, T
   273 .align 64
   274 .Lmod_loop:
   275     sub	$0x80, len
   276     jb	.Ldone
   278     vmovdqu     16*7(inp),DATA		# Ii
   279     vpshufb     BSWAP_MASK,DATA,DATA
   281     vpclmulqdq  $0x00, (Htbl), DATA, TMP0
   282     vpclmulqdq  $0x11, (Htbl), DATA, TMP1
   283     vpshufd     $78, DATA, TMP2
   284     vpxor       DATA, TMP2, TMP2
   285     vpclmulqdq  $0x00, 16*8(Htbl), TMP2, TMP2
   286     #########################################################
   287     vmovdqu     16*6(inp),DATA
   288     vpshufb     BSWAP_MASK,DATA,DATA
   289     KARATSUBA_AAD 1
   290     #########################################################
   291     vmovdqu     16*5(inp),DATA
   292     vpshufb     BSWAP_MASK,DATA,DATA
   294     vpclmulqdq  $0x10, .Lpoly(%rip), T, TMP4         #reduction stage 1a
   295     vpalignr    $8, T, T, T
   297     KARATSUBA_AAD 2
   299     vpxor       TMP4, T, T                 #reduction stage 1b
   300     #########################################################
   301     vmovdqu		16*4(inp),DATA
   302     vpshufb	    BSWAP_MASK,DATA,DATA
   304     KARATSUBA_AAD 3
   305     #########################################################
   306     vmovdqu     16*3(inp),DATA
   307     vpshufb     BSWAP_MASK,DATA,DATA
   309     vpclmulqdq  $0x10, .Lpoly(%rip), T, TMP4         #reduction stage 2a
   310     vpalignr    $8, T, T, T
   312     KARATSUBA_AAD 4
   314     vpxor       TMP4, T, T                 #reduction stage 2b
   315     #########################################################
   316     vmovdqu     16*2(inp),DATA
   317     vpshufb     BSWAP_MASK,DATA,DATA
   319     KARATSUBA_AAD 5
   321     vpxor       Xhi, T, T                  #reduction finalize
   322     #########################################################
   323     vmovdqu     16*1(inp),DATA
   324     vpshufb     BSWAP_MASK,DATA,DATA
   326     KARATSUBA_AAD 6
   327     #########################################################
   328     vmovdqu     16*0(inp),DATA
   329     vpshufb     BSWAP_MASK,DATA,DATA
   330     vpxor       T,DATA,DATA
   332     KARATSUBA_AAD 7
   333     #########################################################
   334     vpxor       TMP0, TMP2, TMP2              # karatsuba fixup
   335     vpxor       TMP1, TMP2, TMP2
   336     vpsrldq     $8, TMP2, TMP3
   337     vpslldq     $8, TMP2, TMP2
   339     vpxor       TMP3, TMP1, Xhi
   340     vpxor       TMP2, TMP0, T
   342     lea	16*8(inp), inp
   343     jmp .Lmod_loop
   344     #########################################################
   346 .Ldone:
   347     vpclmulqdq  $0x10, .Lpoly(%rip), T, TMP3
   348     vpalignr    $8, T, T, T
   349     vpxor       TMP3, T, T
   351     vpclmulqdq  $0x10, .Lpoly(%rip), T, TMP3
   352     vpalignr    $8, T, T, T
   353     vpxor       TMP3, T, T
   355     vpxor       Xhi, T, T
   357 .Lsave:
   358     vpshufb     BSWAP_MASK,T, T
   359     vmovdqu     T,(Tp)
   360     vzeroupper
   362     pop hlp0
   363     ret
   364 .size   intel_aes_gcmAAD,.-intel_aes_gcmAAD
   366 ################################################################################
   367 # Encrypt and Authenticate
   368 # void intel_aes_gcmENC(uint8_t* PT, uint8_t* CT, void *Gctx,uint64_t len);
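        # Flow: encrypt the first 8 counter blocks, then loop 8 blocks at a time
        # (.LDataOctets) encrypting the next batch while GHASHing the previous one,
        # hash the final batch at .LEndOctets, handle remaining whole blocks one at
        # a time (.LDataSingles), and finally a zero-padded partial block
        # (.LDataTail).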
   369 .type intel_aes_gcmENC,@function
   370 .globl intel_aes_gcmENC
   371 .align 16
   372 intel_aes_gcmENC:
   374 .set PT,%rdi
   375 .set CT,%rsi
   376 .set Htbl, %rdx
   377 .set len, %rcx
   378 .set KS,%r9
   379 .set NR,%r10d
   381 .set Gctx, %rdx
   383 .set T,%xmm0
   384 .set TMP0,%xmm1
   385 .set TMP1,%xmm2
   386 .set TMP2,%xmm3
   387 .set TMP3,%xmm4
   388 .set TMP4,%xmm5
   389 .set TMP5,%xmm6
   390 .set CTR0,%xmm7
   391 .set CTR1,%xmm8
   392 .set CTR2,%xmm9
   393 .set CTR3,%xmm10
   394 .set CTR4,%xmm11
   395 .set CTR5,%xmm12
   396 .set CTR6,%xmm13
   397 .set CTR7,%xmm14
   398 .set CTR,%xmm15
   400 .macro ROUND i
   401     vmovdqu \i*16(KS), TMP3
   402     vaesenc TMP3, CTR0, CTR0
   403     vaesenc TMP3, CTR1, CTR1
   404     vaesenc TMP3, CTR2, CTR2
   405     vaesenc TMP3, CTR3, CTR3
   406     vaesenc TMP3, CTR4, CTR4
   407     vaesenc TMP3, CTR5, CTR5
   408     vaesenc TMP3, CTR6, CTR6
   409     vaesenc TMP3, CTR7, CTR7
   410 .endm
   412 .macro ROUNDMUL i
   414     vmovdqu \i*16(%rsp), TMP5
   415     vmovdqu \i*16(KS), TMP3
   417     vaesenc TMP3, CTR0, CTR0
   418     vaesenc TMP3, CTR1, CTR1
   419     vaesenc TMP3, CTR2, CTR2
   420     vaesenc TMP3, CTR3, CTR3
   422     vpshufd $78, TMP5, TMP4
   423     vpxor   TMP5, TMP4, TMP4
   425     vaesenc TMP3, CTR4, CTR4
   426     vaesenc TMP3, CTR5, CTR5
   427     vaesenc TMP3, CTR6, CTR6
   428     vaesenc TMP3, CTR7, CTR7
   430     vpclmulqdq  $0x00, 128+\i*16(Htbl), TMP4, TMP3
   431     vpxor       TMP3, TMP0, TMP0
   432     vmovdqa     \i*16(Htbl), TMP4
   433     vpclmulqdq  $0x11, TMP4, TMP5, TMP3
   434     vpxor       TMP3, TMP1, TMP1
   435     vpclmulqdq  $0x00, TMP4, TMP5, TMP3
   436     vpxor       TMP3, TMP2, TMP2
   438 .endm
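        # ROUNDMUL i runs AES round i on all eight counter blocks and, between the
        # vaesenc instructions, folds the i-th saved ciphertext block of the
        # previous batch (spilled to the stack) into the three Karatsuba
        # accumulators (TMP0 middle, TMP1 high, TMP2 low), interleaving the GHASH
        # multiplies with the AES rounds.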
   440 .macro KARATSUBA i
   441     vmovdqu \i*16(%rsp), TMP5
   443     vpclmulqdq  $0x11, 16*\i(Htbl), TMP5, TMP3
   444     vpxor       TMP3, TMP1, TMP1
   445     vpclmulqdq  $0x00, 16*\i(Htbl), TMP5, TMP3
   446     vpxor       TMP3, TMP2, TMP2
   447     vpshufd     $78, TMP5, TMP3
   448     vpxor       TMP5, TMP3, TMP5
   449     vpclmulqdq  $0x00, 128+\i*16(Htbl), TMP5, TMP3
   450     vpxor       TMP3, TMP0, TMP0
   451 .endm
   453     test len, len
   454     jnz  .Lbegin
   455     ret
   457 .Lbegin:
   459     vzeroupper
   460     push %rbp
   461     push %rbx
   463     movq %rsp, %rbp   
   464     sub  $128, %rsp
   465     andq $-16, %rsp
   467     vmovdqu  288(Gctx), CTR
   468     vmovdqu  272(Gctx), T
   469     mov  304(Gctx), KS
   470     mov  4(KS), NR
   471     lea  48(KS), KS
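        # Gctx layout used here: the GHASH state lives at offset 272, the counter
        # at 288, and a pointer to the expanded key at 304; within the key schedule
        # the round count is at offset 4 and the round keys start at offset 48.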
   473     vpshufb  .Lbswap_mask(%rip), CTR, CTR
   474     vpshufb  .Lbswap_mask(%rip), T, T
   476     cmp  $128, len
   477     jb   .LDataSingles
   479 # Encrypt the first eight blocks
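        # (this first batch is only encrypted here; its ciphertext is hashed one
        #  batch later, in .LDataOctets or .LEndOctets, so GHASH always lags the
        #  encryption by one batch)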
   480     sub     $128, len
   481     vmovdqa CTR, CTR0
   482     vpaddd  .Lone(%rip), CTR0, CTR1
   483     vpaddd  .Ltwo(%rip), CTR0, CTR2
   484     vpaddd  .Lone(%rip), CTR2, CTR3
   485     vpaddd  .Ltwo(%rip), CTR2, CTR4
   486     vpaddd  .Lone(%rip), CTR4, CTR5
   487     vpaddd  .Ltwo(%rip), CTR4, CTR6
   488     vpaddd  .Lone(%rip), CTR6, CTR7
   489     vpaddd  .Ltwo(%rip), CTR6, CTR
   491     vpshufb .Lbswap_mask(%rip), CTR0, CTR0
   492     vpshufb .Lbswap_mask(%rip), CTR1, CTR1
   493     vpshufb .Lbswap_mask(%rip), CTR2, CTR2
   494     vpshufb .Lbswap_mask(%rip), CTR3, CTR3
   495     vpshufb .Lbswap_mask(%rip), CTR4, CTR4
   496     vpshufb .Lbswap_mask(%rip), CTR5, CTR5
   497     vpshufb .Lbswap_mask(%rip), CTR6, CTR6
   498     vpshufb .Lbswap_mask(%rip), CTR7, CTR7
   500     vpxor   (KS), CTR0, CTR0
   501     vpxor   (KS), CTR1, CTR1
   502     vpxor   (KS), CTR2, CTR2
   503     vpxor   (KS), CTR3, CTR3
   504     vpxor   (KS), CTR4, CTR4
   505     vpxor   (KS), CTR5, CTR5
   506     vpxor   (KS), CTR6, CTR6
   507     vpxor   (KS), CTR7, CTR7
   509     ROUND 1
   510     ROUND 2
   511     ROUND 3
   512     ROUND 4
   513     ROUND 5
   514     ROUND 6
   515     ROUND 7
   516     ROUND 8
   517     ROUND 9
   519     vmovdqu 160(KS), TMP5
   520     cmp $12, NR
   521     jb  .LLast1
   523     ROUND 10
   524     ROUND 11
   526     vmovdqu 192(KS), TMP5
   527     cmp $14, NR
   528     jb  .LLast1
   530     ROUND 12
   531     ROUND 13
   533     vmovdqu 224(KS), TMP5
   535 .LLast1:
   537     vpxor       (PT), TMP5, TMP3
   538     vaesenclast TMP3, CTR0, CTR0
   539     vpxor       16(PT), TMP5, TMP3
   540     vaesenclast TMP3, CTR1, CTR1
   541     vpxor       32(PT), TMP5, TMP3
   542     vaesenclast TMP3, CTR2, CTR2
   543     vpxor       48(PT), TMP5, TMP3
   544     vaesenclast TMP3, CTR3, CTR3
   545     vpxor       64(PT), TMP5, TMP3
   546     vaesenclast TMP3, CTR4, CTR4
   547     vpxor       80(PT), TMP5, TMP3
   548     vaesenclast TMP3, CTR5, CTR5
   549     vpxor       96(PT), TMP5, TMP3
   550     vaesenclast TMP3, CTR6, CTR6
   551     vpxor       112(PT), TMP5, TMP3
   552     vaesenclast TMP3, CTR7, CTR7
   554     vmovdqu     .Lbswap_mask(%rip), TMP3
   556     vmovdqu CTR0, (CT)
   557     vpshufb TMP3, CTR0, CTR0
   558     vmovdqu CTR1, 16(CT)
   559     vpshufb TMP3, CTR1, CTR1
   560     vmovdqu CTR2, 32(CT)
   561     vpshufb TMP3, CTR2, CTR2
   562     vmovdqu CTR3, 48(CT)
   563     vpshufb TMP3, CTR3, CTR3
   564     vmovdqu CTR4, 64(CT)
   565     vpshufb TMP3, CTR4, CTR4
   566     vmovdqu CTR5, 80(CT)
   567     vpshufb TMP3, CTR5, CTR5
   568     vmovdqu CTR6, 96(CT)
   569     vpshufb TMP3, CTR6, CTR6
   570     vmovdqu CTR7, 112(CT)
   571     vpshufb TMP3, CTR7, CTR7
   573     lea 128(CT), CT
   574     lea 128(PT), PT
   575     jmp .LDataOctets
    577 # Encrypt 8 blocks each time while hashing the previous 8 blocks
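        # Each iteration saves the previous batch's byte-swapped ciphertext,
        # generates and encrypts the next 8 counter blocks, and folds the saved
        # blocks into the hash via ROUNDMUL; the two-step polynomial reduction is
        # interleaved with AES rounds 8 and 9.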
   578 .align 64
   579 .LDataOctets:
   580         cmp $128, len
   581         jb  .LEndOctets
   582         sub $128, len
   584         vmovdqa CTR7, TMP5
   585         vmovdqa CTR6, 1*16(%rsp)
   586         vmovdqa CTR5, 2*16(%rsp)
   587         vmovdqa CTR4, 3*16(%rsp)
   588         vmovdqa CTR3, 4*16(%rsp)
   589         vmovdqa CTR2, 5*16(%rsp)
   590         vmovdqa CTR1, 6*16(%rsp)
   591         vmovdqa CTR0, 7*16(%rsp)
   593         vmovdqa CTR, CTR0
   594         vpaddd  .Lone(%rip), CTR0, CTR1
   595         vpaddd  .Ltwo(%rip), CTR0, CTR2
   596         vpaddd  .Lone(%rip), CTR2, CTR3
   597         vpaddd  .Ltwo(%rip), CTR2, CTR4
   598         vpaddd  .Lone(%rip), CTR4, CTR5
   599         vpaddd  .Ltwo(%rip), CTR4, CTR6
   600         vpaddd  .Lone(%rip), CTR6, CTR7
   601         vpaddd  .Ltwo(%rip), CTR6, CTR
   603         vmovdqu (KS), TMP4
   604         vpshufb TMP3, CTR0, CTR0
   605         vpxor   TMP4, CTR0, CTR0
   606         vpshufb TMP3, CTR1, CTR1
   607         vpxor   TMP4, CTR1, CTR1
   608         vpshufb TMP3, CTR2, CTR2
   609         vpxor   TMP4, CTR2, CTR2
   610         vpshufb TMP3, CTR3, CTR3
   611         vpxor   TMP4, CTR3, CTR3
   612         vpshufb TMP3, CTR4, CTR4
   613         vpxor   TMP4, CTR4, CTR4
   614         vpshufb TMP3, CTR5, CTR5
   615         vpxor   TMP4, CTR5, CTR5
   616         vpshufb TMP3, CTR6, CTR6
   617         vpxor   TMP4, CTR6, CTR6
   618         vpshufb TMP3, CTR7, CTR7
   619         vpxor   TMP4, CTR7, CTR7
   621         vmovdqu     16*0(Htbl), TMP3
   622         vpclmulqdq  $0x11, TMP3, TMP5, TMP1
   623         vpclmulqdq  $0x00, TMP3, TMP5, TMP2      
   624         vpshufd     $78, TMP5, TMP3
   625         vpxor       TMP5, TMP3, TMP5
   626         vmovdqu     128+0*16(Htbl), TMP3      
   627         vpclmulqdq  $0x00, TMP3, TMP5, TMP0
   629         ROUNDMUL 1
   631         ROUNDMUL 2
   633         ROUNDMUL 3
   635         ROUNDMUL 4
   637         ROUNDMUL 5
   639         ROUNDMUL 6
   641         vpxor   7*16(%rsp), T, TMP5
   642         vmovdqu 7*16(KS), TMP3
   644         vaesenc TMP3, CTR0, CTR0
   645         vaesenc TMP3, CTR1, CTR1
   646         vaesenc TMP3, CTR2, CTR2
   647         vaesenc TMP3, CTR3, CTR3
   649         vpshufd $78, TMP5, TMP4
   650         vpxor   TMP5, TMP4, TMP4
   652         vaesenc TMP3, CTR4, CTR4
   653         vaesenc TMP3, CTR5, CTR5
   654         vaesenc TMP3, CTR6, CTR6
   655         vaesenc TMP3, CTR7, CTR7
   657         vpclmulqdq  $0x11, 7*16(Htbl), TMP5, TMP3
   658         vpxor       TMP3, TMP1, TMP1
   659         vpclmulqdq  $0x00, 7*16(Htbl), TMP5, TMP3
   660         vpxor       TMP3, TMP2, TMP2
   661         vpclmulqdq  $0x00, 128+7*16(Htbl), TMP4, TMP3
   662         vpxor       TMP3, TMP0, TMP0
   664         ROUND 8    
   665         vmovdqa .Lpoly(%rip), TMP5
   667         vpxor   TMP1, TMP0, TMP0
   668         vpxor   TMP2, TMP0, TMP0
   669         vpsrldq $8, TMP0, TMP3
   670         vpxor   TMP3, TMP1, TMP4
   671         vpslldq $8, TMP0, TMP3
   672         vpxor   TMP3, TMP2, T
   674         vpclmulqdq  $0x10, TMP5, T, TMP1
   675         vpalignr    $8, T, T, T
   676         vpxor       T, TMP1, T
   678         ROUND 9
   680         vpclmulqdq  $0x10, TMP5, T, TMP1
   681         vpalignr    $8, T, T, T
   682         vpxor       T, TMP1, T
   684         vmovdqu 160(KS), TMP5
   685         cmp     $10, NR
   686         jbe     .LLast2
   688         ROUND 10
   689         ROUND 11
   691         vmovdqu 192(KS), TMP5
   692         cmp     $12, NR
   693         jbe     .LLast2
   695         ROUND 12
   696         ROUND 13
   698         vmovdqu 224(KS), TMP5
   700 .LLast2:
   702         vpxor       (PT), TMP5, TMP3
   703         vaesenclast TMP3, CTR0, CTR0
   704         vpxor       16(PT), TMP5, TMP3
   705         vaesenclast TMP3, CTR1, CTR1
   706         vpxor       32(PT), TMP5, TMP3
   707         vaesenclast TMP3, CTR2, CTR2
   708         vpxor       48(PT), TMP5, TMP3
   709         vaesenclast TMP3, CTR3, CTR3
   710         vpxor       64(PT), TMP5, TMP3
   711         vaesenclast TMP3, CTR4, CTR4
   712         vpxor       80(PT), TMP5, TMP3
   713         vaesenclast TMP3, CTR5, CTR5
   714         vpxor       96(PT), TMP5, TMP3
   715         vaesenclast TMP3, CTR6, CTR6
   716         vpxor       112(PT), TMP5, TMP3
   717         vaesenclast TMP3, CTR7, CTR7
   719         vmovdqu .Lbswap_mask(%rip), TMP3
   721         vmovdqu CTR0, (CT)
   722         vpshufb TMP3, CTR0, CTR0
   723         vmovdqu CTR1, 16(CT)
   724         vpshufb TMP3, CTR1, CTR1
   725         vmovdqu CTR2, 32(CT)
   726         vpshufb TMP3, CTR2, CTR2
   727         vmovdqu CTR3, 48(CT)
   728         vpshufb TMP3, CTR3, CTR3
   729         vmovdqu CTR4, 64(CT)
   730         vpshufb TMP3, CTR4, CTR4
   731         vmovdqu CTR5, 80(CT)
   732         vpshufb TMP3, CTR5, CTR5
   733         vmovdqu CTR6, 96(CT)
   734         vpshufb TMP3, CTR6, CTR6
   735         vmovdqu CTR7,112(CT)
   736         vpshufb TMP3, CTR7, CTR7
   738         vpxor   TMP4, T, T
   740         lea 128(CT), CT
   741         lea 128(PT), PT
   742     jmp  .LDataOctets
   744 .LEndOctets:
   746     vmovdqa CTR7, TMP5
   747     vmovdqa CTR6, 1*16(%rsp)
   748     vmovdqa CTR5, 2*16(%rsp)
   749     vmovdqa CTR4, 3*16(%rsp)
   750     vmovdqa CTR3, 4*16(%rsp)
   751     vmovdqa CTR2, 5*16(%rsp)
   752     vmovdqa CTR1, 6*16(%rsp)
   753     vmovdqa CTR0, 7*16(%rsp)
   755     vmovdqu     16*0(Htbl), TMP3
   756     vpclmulqdq  $0x11, TMP3, TMP5, TMP1
   757     vpclmulqdq  $0x00, TMP3, TMP5, TMP2      
   758     vpshufd     $78, TMP5, TMP3
   759     vpxor       TMP5, TMP3, TMP5
   760     vmovdqu     128+0*16(Htbl), TMP3      
   761     vpclmulqdq  $0x00, TMP3, TMP5, TMP0
   763     KARATSUBA 1
   764     KARATSUBA 2
   765     KARATSUBA 3      
   766     KARATSUBA 4
   767     KARATSUBA 5
   768     KARATSUBA 6
   770     vmovdqu     7*16(%rsp), TMP5
   771     vpxor       T, TMP5, TMP5
   772     vmovdqu     16*7(Htbl), TMP4            
   773     vpclmulqdq  $0x11, TMP4, TMP5, TMP3
   774     vpxor       TMP3, TMP1, TMP1
   775     vpclmulqdq  $0x00, TMP4, TMP5, TMP3
   776     vpxor       TMP3, TMP2, TMP2      
   777     vpshufd     $78, TMP5, TMP3
   778     vpxor       TMP5, TMP3, TMP5
   779     vmovdqu     128+7*16(Htbl), TMP4      
   780     vpclmulqdq  $0x00, TMP4, TMP5, TMP3
   781     vpxor       TMP3, TMP0, TMP0
   783     vpxor       TMP1, TMP0, TMP0
   784     vpxor       TMP2, TMP0, TMP0
   786     vpsrldq     $8, TMP0, TMP3
   787     vpxor       TMP3, TMP1, TMP4
   788     vpslldq     $8, TMP0, TMP3
   789     vpxor       TMP3, TMP2, T
   791     vmovdqa     .Lpoly(%rip), TMP2
   793     vpalignr    $8, T, T, TMP1
   794     vpclmulqdq  $0x10, TMP2, T, T
   795     vpxor       T, TMP1, T
   797     vpalignr    $8, T, T, TMP1
   798     vpclmulqdq  $0x10, TMP2, T, T
   799     vpxor       T, TMP1, T
   801     vpxor       TMP4, T, T
    803 # Here we encrypt any remaining whole blocks, one at a time
   804 .LDataSingles:
   806     cmp $16, len
   807     jb  .LDataTail
   808     sub $16, len
   810     vpshufb .Lbswap_mask(%rip), CTR, TMP1
   811     vpaddd  .Lone(%rip), CTR, CTR
   813     vpxor   (KS), TMP1, TMP1
   814     vaesenc 16*1(KS), TMP1, TMP1
   815     vaesenc 16*2(KS), TMP1, TMP1
   816     vaesenc 16*3(KS), TMP1, TMP1
   817     vaesenc 16*4(KS), TMP1, TMP1
   818     vaesenc 16*5(KS), TMP1, TMP1
   819     vaesenc 16*6(KS), TMP1, TMP1
   820     vaesenc 16*7(KS), TMP1, TMP1
   821     vaesenc 16*8(KS), TMP1, TMP1
   822     vaesenc 16*9(KS), TMP1, TMP1
   823     vmovdqu 16*10(KS), TMP2
   824     cmp     $10, NR
   825     je      .LLast3
   826     vaesenc 16*10(KS), TMP1, TMP1
   827     vaesenc 16*11(KS), TMP1, TMP1
   828     vmovdqu 16*12(KS), TMP2
   829     cmp     $12, NR
   830     je      .LLast3
   831     vaesenc 16*12(KS), TMP1, TMP1
   832     vaesenc 16*13(KS), TMP1, TMP1
   833     vmovdqu 16*14(KS), TMP2
   835 .LLast3:
   836     vaesenclast TMP2, TMP1, TMP1
   838     vpxor   (PT), TMP1, TMP1
   839     vmovdqu TMP1, (CT)
   840     addq    $16, CT
   841     addq    $16, PT
   843     vpshufb .Lbswap_mask(%rip), TMP1, TMP1
   844     vpxor   TMP1, T, T
   845     vmovdqu (Htbl), TMP0
   846     call    GFMUL
   848     jmp .LDataSingles
    850 # Here we encrypt the final partial block, if there is one
   851 .LDataTail:
   853     test    len, len
   854     jz      DATA_END
   855 # First prepare the counter block
   856     vpshufb .Lbswap_mask(%rip), CTR, TMP1
   857     vpaddd  .Lone(%rip), CTR, CTR
   859     vpxor   (KS), TMP1, TMP1
   860     vaesenc 16*1(KS), TMP1, TMP1
   861     vaesenc 16*2(KS), TMP1, TMP1
   862     vaesenc 16*3(KS), TMP1, TMP1
   863     vaesenc 16*4(KS), TMP1, TMP1
   864     vaesenc 16*5(KS), TMP1, TMP1
   865     vaesenc 16*6(KS), TMP1, TMP1
   866     vaesenc 16*7(KS), TMP1, TMP1
   867     vaesenc 16*8(KS), TMP1, TMP1
   868     vaesenc 16*9(KS), TMP1, TMP1
   869     vmovdqu 16*10(KS), TMP2
   870     cmp     $10, NR
   871     je      .LLast4
   872     vaesenc 16*10(KS), TMP1, TMP1
   873     vaesenc 16*11(KS), TMP1, TMP1
   874     vmovdqu 16*12(KS), TMP2
   875     cmp     $12, NR
   876     je      .LLast4
   877     vaesenc 16*12(KS), TMP1, TMP1
   878     vaesenc 16*13(KS), TMP1, TMP1
   879     vmovdqu 16*14(KS), TMP2
   881 .LLast4:
   882     vaesenclast TMP2, TMP1, TMP1
   883 #Zero a temp location
   884     vpxor   TMP2, TMP2, TMP2
   885     vmovdqa TMP2, (%rsp)
   887 # Copy the required bytes only (could probably use rep movsb)
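        # (KS is reused as a plain byte index here; the key schedule pointer is no
        #  longer needed.  The block is zero-padded so the GHASH update below still
        #  operates on a full 16-byte block.)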
   888     xor KS, KS  
   889 .LEncCpy:
   890         cmp     KS, len
   891         je      .LEncCpyEnd
   892         movb    (PT, KS, 1), %r8b
   893         movb    %r8b, (%rsp, KS, 1)
   894         inc     KS
   895         jmp .LEncCpy
   896 .LEncCpyEnd:
   897 # Xor with the counter block
   898     vpxor   (%rsp), TMP1, TMP0
   899 # Again, store at temp location
   900     vmovdqa TMP0, (%rsp)
   901 # Copy only the required bytes to CT, and zero the rest for the hash
   902     xor KS, KS
   903 .LEncCpy2:
   904     cmp     KS, len
   905     je      .LEncCpy3
   906     movb    (%rsp, KS, 1), %r8b
   907     movb    %r8b, (CT, KS, 1)
   908     inc     KS
   909     jmp .LEncCpy2
   910 .LEncCpy3:
   911     cmp     $16, KS
   912     je      .LEndCpy3
   913     movb    $0, (%rsp, KS, 1)
   914     inc     KS
   915     jmp .LEncCpy3
   916 .LEndCpy3:
   917    vmovdqa  (%rsp), TMP0
   919    vpshufb  .Lbswap_mask(%rip), TMP0, TMP0
   920    vpxor    TMP0, T, T
   921    vmovdqu  (Htbl), TMP0
   922    call     GFMUL
   924 DATA_END:
   926    vpshufb  .Lbswap_mask(%rip), T, T
   927    vpshufb  .Lbswap_mask(%rip), CTR, CTR
   928    vmovdqu  T, 272(Gctx)
   929    vmovdqu  CTR, 288(Gctx)
   931    movq   %rbp, %rsp
   933    popq   %rbx
   934    popq   %rbp
   935    ret
   936    .size intel_aes_gcmENC, .-intel_aes_gcmENC
   938 #########################
   939 # Decrypt and Authenticate
   940 # void intel_aes_gcmDEC(uint8_t* PT, uint8_t* CT, void *Gctx,uint64_t len);
   941 .type intel_aes_gcmDEC,@function
   942 .globl intel_aes_gcmDEC
   943 .align 16
   944 intel_aes_gcmDEC:
   945 # parameter 1: CT    # input
   946 # parameter 2: PT    # output
   947 # parameter 3: %rdx  # Gctx
   948 # parameter 4: %rcx  # len
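        # Mirrors intel_aes_gcmENC, but GHASH runs over the incoming ciphertext, so
        # the 8-block loop can hash each batch directly from CT while decrypting it,
        # with no warm-up batch and no spill of the previous batch to the stack.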
   950 .macro DEC_KARATSUBA i
   951     vmovdqu     (7-\i)*16(CT), TMP5
   952     vpshufb     .Lbswap_mask(%rip), TMP5, TMP5
   954     vpclmulqdq  $0x11, 16*\i(Htbl), TMP5, TMP3
   955     vpxor       TMP3, TMP1, TMP1
   956     vpclmulqdq  $0x00, 16*\i(Htbl), TMP5, TMP3
   957     vpxor       TMP3, TMP2, TMP2
   958     vpshufd     $78, TMP5, TMP3
   959     vpxor       TMP5, TMP3, TMP5
   960     vpclmulqdq  $0x00, 128+\i*16(Htbl), TMP5, TMP3
   961     vpxor       TMP3, TMP0, TMP0
   962 .endm
   964 .set PT,%rsi
   965 .set CT,%rdi
   966 .set Htbl, %rdx
   967 .set len, %rcx
   968 .set KS,%r9
   969 .set NR,%r10d
   971 .set Gctx, %rdx
   973 .set T,%xmm0
   974 .set TMP0,%xmm1
   975 .set TMP1,%xmm2
   976 .set TMP2,%xmm3
   977 .set TMP3,%xmm4
   978 .set TMP4,%xmm5
   979 .set TMP5,%xmm6
   980 .set CTR0,%xmm7
   981 .set CTR1,%xmm8
   982 .set CTR2,%xmm9
   983 .set CTR3,%xmm10
   984 .set CTR4,%xmm11
   985 .set CTR5,%xmm12
   986 .set CTR6,%xmm13
   987 .set CTR7,%xmm14
   988 .set CTR,%xmm15
   990     test  len, len
   991     jnz   .LbeginDec
   992     ret
   994 .LbeginDec:
   996     pushq   %rbp
   997     pushq   %rbx
   998     movq    %rsp, %rbp   
   999     sub     $128, %rsp
  1000     andq    $-16, %rsp
  1001     vmovdqu 288(Gctx), CTR
  1002     vmovdqu 272(Gctx), T
  1003     mov     304(Gctx), KS
  1004     mov     4(KS), NR
  1005     lea     48(KS), KS
  1007     vpshufb .Lbswap_mask(%rip), CTR, CTR
  1008     vpshufb .Lbswap_mask(%rip), T, T
  1010     vmovdqu .Lbswap_mask(%rip), TMP3
  1011     jmp     .LDECOctets
  1013 # Decrypt 8 blocks each time while hashing them at the same time
  1014 .align 64
  1015 .LDECOctets:
  1017         cmp $128, len
  1018         jb  .LDECSingles
  1019         sub $128, len
  1021         vmovdqa CTR, CTR0
  1022         vpaddd  .Lone(%rip), CTR0, CTR1
  1023         vpaddd  .Ltwo(%rip), CTR0, CTR2
  1024         vpaddd  .Lone(%rip), CTR2, CTR3
  1025         vpaddd  .Ltwo(%rip), CTR2, CTR4
  1026         vpaddd  .Lone(%rip), CTR4, CTR5
  1027         vpaddd  .Ltwo(%rip), CTR4, CTR6
  1028         vpaddd  .Lone(%rip), CTR6, CTR7
  1029         vpaddd  .Ltwo(%rip), CTR6, CTR
  1031         vpshufb TMP3, CTR0, CTR0
  1032         vpshufb TMP3, CTR1, CTR1
  1033         vpshufb TMP3, CTR2, CTR2
  1034         vpshufb TMP3, CTR3, CTR3
  1035         vpshufb TMP3, CTR4, CTR4
  1036         vpshufb TMP3, CTR5, CTR5
  1037         vpshufb TMP3, CTR6, CTR6
  1038         vpshufb TMP3, CTR7, CTR7
  1040         vmovdqu (KS), TMP3
  1041         vpxor  TMP3, CTR0, CTR0
  1042         vpxor  TMP3, CTR1, CTR1
  1043         vpxor  TMP3, CTR2, CTR2
  1044         vpxor  TMP3, CTR3, CTR3
  1045         vpxor  TMP3, CTR4, CTR4
  1046         vpxor  TMP3, CTR5, CTR5
  1047         vpxor  TMP3, CTR6, CTR6
  1048         vpxor  TMP3, CTR7, CTR7
  1050         vmovdqu     7*16(CT), TMP5
  1051         vpshufb     .Lbswap_mask(%rip), TMP5, TMP5
  1052         vmovdqu     16*0(Htbl), TMP3
  1053         vpclmulqdq  $0x11, TMP3, TMP5, TMP1
  1054         vpclmulqdq  $0x00, TMP3, TMP5, TMP2      
  1055         vpshufd     $78, TMP5, TMP3
  1056         vpxor       TMP5, TMP3, TMP5
  1057         vmovdqu     128+0*16(Htbl), TMP3      
  1058         vpclmulqdq  $0x00, TMP3, TMP5, TMP0
  1060         ROUND 1
  1061         DEC_KARATSUBA 1
  1063         ROUND 2
  1064         DEC_KARATSUBA 2
  1066         ROUND 3
  1067         DEC_KARATSUBA 3
  1069         ROUND 4
  1070         DEC_KARATSUBA 4
  1072         ROUND 5
  1073         DEC_KARATSUBA 5
  1075         ROUND 6
  1076         DEC_KARATSUBA 6
  1078         ROUND 7
  1080         vmovdqu     0*16(CT), TMP5
  1081         vpshufb     .Lbswap_mask(%rip), TMP5, TMP5
  1082         vpxor       T, TMP5, TMP5
  1083         vmovdqu     16*7(Htbl), TMP4
  1085         vpclmulqdq  $0x11, TMP4, TMP5, TMP3
  1086         vpxor       TMP3, TMP1, TMP1
  1087         vpclmulqdq  $0x00, TMP4, TMP5, TMP3
  1088         vpxor       TMP3, TMP2, TMP2
  1090         vpshufd     $78, TMP5, TMP3
  1091         vpxor       TMP5, TMP3, TMP5
  1092         vmovdqu     128+7*16(Htbl), TMP4
  1094         vpclmulqdq  $0x00, TMP4, TMP5, TMP3
  1095         vpxor       TMP3, TMP0, TMP0
  1097         ROUND 8      
  1099         vpxor       TMP1, TMP0, TMP0
  1100         vpxor       TMP2, TMP0, TMP0
  1102         vpsrldq     $8, TMP0, TMP3
  1103         vpxor       TMP3, TMP1, TMP4
  1104         vpslldq     $8, TMP0, TMP3
  1105         vpxor       TMP3, TMP2, T
  1106         vmovdqa	  .Lpoly(%rip), TMP2
  1108         vpalignr    $8, T, T, TMP1
  1109         vpclmulqdq  $0x10, TMP2, T, T
  1110         vpxor       T, TMP1, T
  1112         ROUND 9
  1114         vpalignr    $8, T, T, TMP1
  1115         vpclmulqdq  $0x10, TMP2, T, T
  1116         vpxor       T, TMP1, T
  1118         vmovdqu     160(KS), TMP5
  1119         cmp         $10, NR
  1121         jbe  .LDECLast1
  1123         ROUND 10
  1124         ROUND 11
  1126         vmovdqu     192(KS), TMP5
  1127         cmp         $12, NR       
  1129         jbe  .LDECLast1
  1131         ROUND 12
  1132         ROUND 13
  1134         vmovdqu  224(KS), TMP5
  1136 .LDECLast1:      
  1138         vpxor   (CT), TMP5, TMP3
  1139         vaesenclast TMP3, CTR0, CTR0
  1140         vpxor   16(CT), TMP5, TMP3
  1141         vaesenclast TMP3, CTR1, CTR1
  1142         vpxor   32(CT), TMP5, TMP3
  1143         vaesenclast TMP3, CTR2, CTR2
  1144         vpxor   48(CT), TMP5, TMP3
  1145         vaesenclast TMP3, CTR3, CTR3
  1146         vpxor   64(CT), TMP5, TMP3
  1147         vaesenclast TMP3, CTR4, CTR4
  1148         vpxor   80(CT), TMP5, TMP3
  1149         vaesenclast TMP3, CTR5, CTR5
  1150         vpxor   96(CT), TMP5, TMP3
  1151         vaesenclast TMP3, CTR6, CTR6
  1152         vpxor   112(CT), TMP5, TMP3
  1153         vaesenclast TMP3, CTR7, CTR7
  1155         vmovdqu .Lbswap_mask(%rip), TMP3
  1157         vmovdqu CTR0, (PT)
  1158         vmovdqu CTR1, 16(PT)
  1159         vmovdqu CTR2, 32(PT)
  1160         vmovdqu CTR3, 48(PT)
  1161         vmovdqu CTR4, 64(PT)
  1162         vmovdqu CTR5, 80(PT)
  1163         vmovdqu CTR6, 96(PT)
  1164         vmovdqu CTR7,112(PT)
  1166         vpxor   TMP4, T, T
  1168         lea 128(CT), CT
  1169         lea 128(PT), PT
  1170    jmp  .LDECOctets
   1172 # Here we decrypt and hash any remaining whole blocks, one at a time
  1173 .LDECSingles:
  1175     cmp   $16, len
  1176     jb    .LDECTail
  1177     sub   $16, len
  1179     vmovdqu  (CT), TMP1
  1180     vpshufb  .Lbswap_mask(%rip), TMP1, TMP1
  1181     vpxor    TMP1, T, T
  1182     vmovdqu  (Htbl), TMP0
  1183     call     GFMUL
  1186     vpshufb  .Lbswap_mask(%rip), CTR, TMP1
  1187     vpaddd   .Lone(%rip), CTR, CTR
  1189     vpxor    (KS), TMP1, TMP1
  1190     vaesenc  16*1(KS), TMP1, TMP1
  1191     vaesenc  16*2(KS), TMP1, TMP1
  1192     vaesenc  16*3(KS), TMP1, TMP1
  1193     vaesenc  16*4(KS), TMP1, TMP1
  1194     vaesenc  16*5(KS), TMP1, TMP1
  1195     vaesenc  16*6(KS), TMP1, TMP1
  1196     vaesenc  16*7(KS), TMP1, TMP1
  1197     vaesenc  16*8(KS), TMP1, TMP1
  1198     vaesenc  16*9(KS), TMP1, TMP1
  1199     vmovdqu  16*10(KS), TMP2
  1200     cmp      $10, NR
  1201     je       .LDECLast2
  1202     vaesenc  16*10(KS), TMP1, TMP1
  1203     vaesenc  16*11(KS), TMP1, TMP1
  1204     vmovdqu  16*12(KS), TMP2
  1205     cmp      $12, NR
  1206     je       .LDECLast2
  1207     vaesenc  16*12(KS), TMP1, TMP1
  1208     vaesenc  16*13(KS), TMP1, TMP1
  1209     vmovdqu  16*14(KS), TMP2
  1210 .LDECLast2:
  1211     vaesenclast TMP2, TMP1, TMP1
  1213     vpxor    (CT), TMP1, TMP1
  1214     vmovdqu  TMP1, (PT)
  1215     addq     $16, CT
  1216     addq     $16, PT  
  1217     jmp   .LDECSingles
  1219 #Here we decrypt the final partial block, if there is one
  1220 .LDECTail:
  1221    test   len, len
  1222    jz     .LDEC_END
  1224    vpshufb  .Lbswap_mask(%rip), CTR, TMP1
  1225    vpaddd .Lone(%rip), CTR, CTR
  1227    vpxor  (KS), TMP1, TMP1
  1228    vaesenc  16*1(KS), TMP1, TMP1
  1229    vaesenc  16*2(KS), TMP1, TMP1
  1230    vaesenc  16*3(KS), TMP1, TMP1
  1231    vaesenc  16*4(KS), TMP1, TMP1
  1232    vaesenc  16*5(KS), TMP1, TMP1
  1233    vaesenc  16*6(KS), TMP1, TMP1
  1234    vaesenc  16*7(KS), TMP1, TMP1
  1235    vaesenc  16*8(KS), TMP1, TMP1
  1236    vaesenc  16*9(KS), TMP1, TMP1
  1237    vmovdqu  16*10(KS), TMP2
  1238    cmp      $10, NR
  1239    je       .LDECLast3
  1240    vaesenc  16*10(KS), TMP1, TMP1
  1241    vaesenc  16*11(KS), TMP1, TMP1
  1242    vmovdqu  16*12(KS), TMP2
  1243    cmp      $12, NR
  1244    je       .LDECLast3
  1245    vaesenc  16*12(KS), TMP1, TMP1
  1246    vaesenc  16*13(KS), TMP1, TMP1
  1247    vmovdqu  16*14(KS), TMP2
  1249 .LDECLast3:
  1250    vaesenclast TMP2, TMP1, TMP1
  1252    vpxor   TMP2, TMP2, TMP2
  1253    vmovdqa TMP2, (%rsp) 
  1254 # Copy the required bytes only (could probably use rep movsb)
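        # (as in the encrypt path, KS is reused as a byte index; note that the
        #  zero-padded ciphertext block, not the plaintext, is what gets hashed
        #  below)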
  1255     xor KS, KS  
  1256 .LDecCpy:
  1257         cmp     KS, len
  1258         je      .LDecCpy2
  1259         movb    (CT, KS, 1), %r8b
  1260         movb    %r8b, (%rsp, KS, 1)
  1261         inc     KS
  1262         jmp     .LDecCpy
  1263 .LDecCpy2:
  1264         cmp     $16, KS
  1265         je      .LDecCpyEnd
  1266         movb    $0, (%rsp, KS, 1)
  1267         inc     KS
  1268         jmp     .LDecCpy2
  1269 .LDecCpyEnd:
  1270 # Xor with the counter block
  1271     vmovdqa (%rsp), TMP0
  1272     vpxor   TMP0, TMP1, TMP1
  1273 # Again, store at temp location
  1274     vmovdqa TMP1, (%rsp)
  1275 # Copy only the required bytes to PT, and zero the rest for the hash
  1276     xor KS, KS
  1277 .LDecCpy3:
  1278     cmp     KS, len
  1279     je      .LDecCpyEnd3
  1280     movb    (%rsp, KS, 1), %r8b
  1281     movb    %r8b, (PT, KS, 1)
  1282     inc     KS
  1283     jmp     .LDecCpy3
  1284 .LDecCpyEnd3:
  1285    vpshufb  .Lbswap_mask(%rip), TMP0, TMP0
  1286    vpxor    TMP0, T, T
  1287    vmovdqu  (Htbl), TMP0
  1288    call     GFMUL
  1289 .LDEC_END:
  1291    vpshufb  .Lbswap_mask(%rip), T, T
  1292    vpshufb  .Lbswap_mask(%rip), CTR, CTR
  1293    vmovdqu  T, 272(Gctx)
  1294    vmovdqu  CTR, 288(Gctx)
  1296    movq   %rbp, %rsp
  1298    popq   %rbx
  1299    popq   %rbp
  1300    ret
  1301   .size intel_aes_gcmDEC, .-intel_aes_gcmDEC
  1302 #########################
  1303 # a = T
  1304 # b = TMP0 - remains unchanged
  1305 # res = T
  1306 # uses also TMP1,TMP2,TMP3,TMP4
  1307 # __m128i GFMUL(__m128i A, __m128i B);
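        # T = T * TMP0 in GF(2^128): three vpclmulqdq form the 256-bit carry-less
        # product via Karatsuba (low*low, high*high, and (hi^lo)*(hi^lo) for the
        # middle term, which is then fixed up by xoring in the other two), and the
        # result is folded back to 128 bits in two reduction steps against .Lpoly,
        # the bit-reflected form of the GHASH polynomial x^128 + x^7 + x^2 + x + 1.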
  1308 .type GFMUL,@function
  1309 .globl GFMUL
  1310 GFMUL:  
  1311     vpclmulqdq  $0x00, TMP0, T, TMP1
  1312     vpclmulqdq  $0x11, TMP0, T, TMP4
  1314     vpshufd     $78, T, TMP2
  1315     vpshufd     $78, TMP0, TMP3
  1316     vpxor       T, TMP2, TMP2
  1317     vpxor       TMP0, TMP3, TMP3
  1319     vpclmulqdq  $0x00, TMP3, TMP2, TMP2
  1320     vpxor       TMP1, TMP2, TMP2
  1321     vpxor       TMP4, TMP2, TMP2
  1323     vpslldq     $8, TMP2, TMP3
  1324     vpsrldq     $8, TMP2, TMP2
  1326     vpxor       TMP3, TMP1, TMP1
  1327     vpxor       TMP2, TMP4, TMP4
  1329     vpclmulqdq  $0x10, .Lpoly(%rip), TMP1, TMP2
  1330     vpshufd     $78, TMP1, TMP3
  1331     vpxor       TMP3, TMP2, TMP1
  1333     vpclmulqdq  $0x10, .Lpoly(%rip), TMP1, TMP2
  1334     vpshufd     $78, TMP1, TMP3
  1335     vpxor       TMP3, TMP2, TMP1
  1337     vpxor       TMP4, TMP1, T
  1338     ret
  1339 .size GFMUL, .-GFMUL
