security/nss/lib/freebl/intel-gcm.s

changeset 6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/security/nss/lib/freebl/intel-gcm.s	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1340 @@
     1.4 +# LICENSE:                                                                  
     1.5 +# This submission to NSS is to be made available under the terms of the
     1.6 +# Mozilla Public License, v. 2.0. You can obtain one at
     1.7 +# http://mozilla.org/MPL/2.0/.
     1.8 +################################################################################
     1.9 +# Copyright(c) 2012, Intel Corp.
    1.10 +
    1.11 +.align  16
    1.12 +.Lone:
    1.13 +.quad 1,0
    1.14 +.Ltwo:
    1.15 +.quad 2,0
    1.16 +.Lbswap_mask:
    1.17 +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
    1.18 +.Lshuff_mask:
    1.19 +.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
    1.20 +.Lpoly:
    1.21 +.quad 0x1, 0xc200000000000000 
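          +
          +# .Lone/.Ltwo hold 1 and 2 for stepping the counter blocks, .Lbswap_mask
          +# byte-reverses a block (GHASH is defined big-endian), .Lshuff_mask is a helper
          +# for the "multiply H by x" step in intel_aes_gcmINIT, and .Lpoly is the GHASH
          +# reduction constant derived from the GCM polynomial x^128 + x^7 + x^2 + x + 1.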
    1.22 +
    1.23 +
    1.24 +################################################################################
    1.25 +# Generates the final GCM tag
    1.26 +# void intel_aes_gcmTAG(uint8_t Htbl[16*16], uint8_t *Tp, uint64_t Mlen, uint64_t Alen, uint8_t* X0, uint8_t* TAG);
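          +#
          +# Roughly:
          +#   T    = byteswap(*Tp) ^ lengths_block   (Mlen*8 in the low qword, Alen*8 in the high)
          +#   T    = GFMUL(T, Htbl[0])               (one last GHASH multiply)
          +#   *TAG = byteswap(T) ^ *X0               (X0 is expected to hold E_K(CTR0))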
    1.27 +.type intel_aes_gcmTAG,@function
    1.28 +.globl intel_aes_gcmTAG
    1.29 +.align 16
    1.30 +intel_aes_gcmTAG:
    1.31 +
    1.32 +.set  Htbl, %rdi
    1.33 +.set  Tp, %rsi
    1.34 +.set  Mlen, %rdx
    1.35 +.set  Alen, %rcx
    1.36 +.set  X0, %r8
    1.37 +.set  TAG, %r9
    1.38 +
    1.39 +.set T,%xmm0
    1.40 +.set TMP0,%xmm1
    1.41 +
    1.42 +   vmovdqu  (Tp), T
    1.43 +   vpshufb  .Lbswap_mask(%rip), T, T
    1.44 +   vpxor    TMP0, TMP0, TMP0
    1.45 +   shl      $3, Mlen
    1.46 +   shl      $3, Alen
    1.47 +   vpinsrq  $0, Mlen, TMP0, TMP0
    1.48 +   vpinsrq  $1, Alen, TMP0, TMP0
    1.49 +   vpxor    TMP0, T, T
    1.50 +   vmovdqu  (Htbl), TMP0
    1.51 +   call     GFMUL
    1.52 +   vpshufb  .Lbswap_mask(%rip), T, T
    1.53 +   vpxor    (X0), T, T
    1.54 +   vmovdqu  T, (TAG)
     1.55 +
     1.56 +   ret
    1.57 +.size intel_aes_gcmTAG, .-intel_aes_gcmTAG
    1.58 +################################################################################
    1.59 +# Generates the H table
    1.60 +# void intel_aes_gcmINIT(uint8_t Htbl[16*16], uint8_t *KS, int NR);
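          +#
          +# Computes the hash key H = AES_K(0^128) (T starts as round key 0, i.e. the zero
          +# block already XORed with it), multiplies it by x ("H * 2") in GF(2^128), and
          +# fills the 256-byte Htbl with H*2, H^2*2, ..., H^8*2 plus the Karatsuba helper
          +# values used by the bulk GHASH code.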
    1.61 +.type intel_aes_gcmINIT,@function
    1.62 +.globl intel_aes_gcmINIT
    1.63 +.align 16
    1.64 +intel_aes_gcmINIT:
    1.65 +   
    1.66 +.set  Htbl, %rdi
    1.67 +.set  KS, %rsi
    1.68 +.set  NR, %edx
    1.69 +
    1.70 +.set T,%xmm0
    1.71 +.set TMP0,%xmm1
    1.72 +
    1.73 +CALCULATE_POWERS_OF_H:
    1.74 +    vmovdqu      16*0(KS), T
    1.75 +    vaesenc      16*1(KS), T, T
    1.76 +    vaesenc      16*2(KS), T, T
    1.77 +    vaesenc      16*3(KS), T, T
    1.78 +    vaesenc      16*4(KS), T, T
    1.79 +    vaesenc      16*5(KS), T, T
    1.80 +    vaesenc      16*6(KS), T, T
    1.81 +    vaesenc      16*7(KS), T, T
    1.82 +    vaesenc      16*8(KS), T, T
    1.83 +    vaesenc      16*9(KS), T, T
    1.84 +    vmovdqu      16*10(KS), TMP0
    1.85 +    cmp          $10, NR
    1.86 +    je           .LH0done
    1.87 +    vaesenc      16*10(KS), T, T
    1.88 +    vaesenc      16*11(KS), T, T
    1.89 +    vmovdqu      16*12(KS), TMP0
    1.90 +    cmp          $12, NR
    1.91 +    je           .LH0done
    1.92 +    vaesenc      16*12(KS), T, T
    1.93 +    vaesenc      16*13(KS), T, T
    1.94 +    vmovdqu      16*14(KS), TMP0
    1.95 +  
    1.96 +.LH0done:
    1.97 +    vaesenclast  TMP0, T, T
    1.98 +
    1.99 +    vpshufb      .Lbswap_mask(%rip), T, T  
   1.100 +
   1.101 +    vmovdqu	T, TMP0
    1.102 +    # Calculate H' = GFMUL(H, 2)
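          +    # (i.e. multiply H by x in GF(2^128): shift the 128-bit value left by one
          +    # bit and, if the bit shifted out of the top was set, XOR in the reduction
          +    # polynomial.  xmm5 is built below as either .Lpoly or zero accordingly.)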
   1.103 +    vpsrld	$7 , T , %xmm3
   1.104 +    vmovdqu	.Lshuff_mask(%rip), %xmm4
   1.105 +    vpshufb	%xmm4, %xmm3 , %xmm3
   1.106 +    movq	$0xff00 , %rax
   1.107 +    vmovq	%rax, %xmm4
   1.108 +    vpshufb	%xmm3, %xmm4 , %xmm4
   1.109 +    vmovdqu	.Lpoly(%rip), %xmm5
   1.110 +    vpand	%xmm4, %xmm5, %xmm5
   1.111 +    vpsrld	$31, T, %xmm3
   1.112 +    vpslld	$1, T, %xmm4
   1.113 +    vpslldq	$4, %xmm3, %xmm3
    1.114 +    vpxor	%xmm3, %xmm4, T  # T now holds p(x)<<1
    1.115 +
    1.116 +    # add the conditional reduction term in xmm5 to p(x)<<1
    1.117 +    vpxor     %xmm5, T , T
   1.118 +    vmovdqu   T, TMP0
   1.119 +    vmovdqu   T, (Htbl)     # H * 2
   1.120 +    call  GFMUL
   1.121 +    vmovdqu  T, 16(Htbl)    # H^2 * 2
   1.122 +    call  GFMUL
   1.123 +    vmovdqu  T, 32(Htbl)    # H^3 * 2
   1.124 +    call  GFMUL
   1.125 +    vmovdqu  T, 48(Htbl)    # H^4 * 2
   1.126 +    call  GFMUL
   1.127 +    vmovdqu  T, 64(Htbl)    # H^5 * 2
   1.128 +    call  GFMUL
   1.129 +    vmovdqu  T, 80(Htbl)    # H^6 * 2
   1.130 +    call  GFMUL
   1.131 +    vmovdqu  T, 96(Htbl)    # H^7 * 2
   1.132 +    call  GFMUL
   1.133 +    vmovdqu  T, 112(Htbl)   # H^8 * 2  
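          +
          +    # Htbl layout so far: offsets 0..112 hold H*2, H^2*2, ..., H^8*2,
          +    # one 16-byte entry per power of the hash key.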
   1.134 +
    1.135 +    # Precompute HighQword(H^i) ^ LowQword(H^i) for each power (stored at Htbl+128..240); these feed the Karatsuba middle multiplies below
   1.136 +    vpshufd  $78, (Htbl), %xmm8
   1.137 +    vpshufd  $78, 16(Htbl), %xmm9
   1.138 +    vpshufd  $78, 32(Htbl), %xmm10
   1.139 +    vpshufd  $78, 48(Htbl), %xmm11
   1.140 +    vpshufd  $78, 64(Htbl), %xmm12
   1.141 +    vpshufd  $78, 80(Htbl), %xmm13
   1.142 +    vpshufd  $78, 96(Htbl), %xmm14
   1.143 +    vpshufd  $78, 112(Htbl), %xmm15
   1.144 +
   1.145 +    vpxor  (Htbl), %xmm8, %xmm8
   1.146 +    vpxor  16(Htbl), %xmm9, %xmm9
   1.147 +    vpxor  32(Htbl), %xmm10, %xmm10
   1.148 +    vpxor  48(Htbl), %xmm11, %xmm11
   1.149 +    vpxor  64(Htbl), %xmm12, %xmm12
   1.150 +    vpxor  80(Htbl), %xmm13, %xmm13
   1.151 +    vpxor  96(Htbl), %xmm14, %xmm14
   1.152 +    vpxor  112(Htbl), %xmm15, %xmm15
   1.153 +
   1.154 +    vmovdqu   %xmm8, 128(Htbl)
   1.155 +    vmovdqu   %xmm9, 144(Htbl)
   1.156 +    vmovdqu   %xmm10, 160(Htbl)
   1.157 +    vmovdqu   %xmm11, 176(Htbl)
   1.158 +    vmovdqu   %xmm12, 192(Htbl)
   1.159 +    vmovdqu   %xmm13, 208(Htbl)
   1.160 +    vmovdqu   %xmm14, 224(Htbl)
   1.161 +    vmovdqu   %xmm15, 240(Htbl)
   1.162 +
   1.163 +    ret
   1.164 +.size intel_aes_gcmINIT, .-intel_aes_gcmINIT
   1.165 +################################################################################
   1.166 +# Authenticate only
   1.167 +# void intel_aes_gcmAAD(uint8_t Htbl[16*16], uint8_t *AAD, uint64_t Alen, uint8_t *Tp);
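          +#
          +# GHASHes Alen bytes of additional authenticated data from AAD into the running
          +# tag at *Tp (Alen appears to be assumed to be a multiple of 16 here).  Blocks
          +# are consumed 8 at a time with an aggregated Karatsuba multiply and a deferred
          +# reduction; a prefix of Alen mod 128 bytes is hashed first so the main loop
          +# always sees whole groups of 8 blocks.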
   1.168 +
   1.169 +.globl  intel_aes_gcmAAD
   1.170 +.type   intel_aes_gcmAAD,@function
   1.171 +.align  16
   1.172 +intel_aes_gcmAAD:
   1.173 +
   1.174 +.set DATA, %xmm0
   1.175 +.set T, %xmm1
   1.176 +.set BSWAP_MASK, %xmm2
   1.177 +.set TMP0, %xmm3
   1.178 +.set TMP1, %xmm4
   1.179 +.set TMP2, %xmm5
   1.180 +.set TMP3, %xmm6
   1.181 +.set TMP4, %xmm7
   1.182 +.set Xhi, %xmm9
   1.183 +
   1.184 +.set Htbl, %rdi
   1.185 +.set inp, %rsi
   1.186 +.set len, %rdx
   1.187 +.set Tp, %rcx
   1.188 +
   1.189 +.set hlp0, %r11
   1.190 +
   1.191 +.macro KARATSUBA_AAD i
   1.192 +    vpclmulqdq  $0x00, 16*\i(Htbl), DATA, TMP3
   1.193 +    vpxor       TMP3, TMP0, TMP0
   1.194 +    vpclmulqdq  $0x11, 16*\i(Htbl), DATA, TMP3
   1.195 +    vpxor       TMP3, TMP1, TMP1
   1.196 +    vpshufd     $78,  DATA, TMP3
   1.197 +    vpxor       DATA, TMP3, TMP3
   1.198 +    vpclmulqdq  $0x00, 16*(\i+8)(Htbl), TMP3, TMP3
   1.199 +    vpxor       TMP3, TMP2, TMP2
   1.200 +.endm
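          +
          +# KARATSUBA_AAD i folds one block into the aggregated product: DATA is multiplied
          +# by Htbl entry i (H^(i+1) * 2) with three carry-less multiplies, accumulating
          +# the low halves in TMP0, the high halves in TMP1, and the Karatsuba middle
          +# terms (via the precomputed values at Htbl+128) in TMP2.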
   1.201 +
   1.202 +    test  len, len
   1.203 +    jnz   .LbeginAAD
   1.204 +    ret
   1.205 +
   1.206 +.LbeginAAD:
   1.207 +
   1.208 +   push  hlp0
   1.209 +   vzeroupper
   1.210 +   
   1.211 +   vmovdqa  .Lbswap_mask(%rip), BSWAP_MASK
   1.212 +   
   1.213 +   vpxor    Xhi, Xhi, Xhi
   1.214 +   
   1.215 +   vmovdqu  (Tp),T
   1.216 +   vpshufb  BSWAP_MASK,T,T
   1.217 +
    1.218 +   # We hash 8 blocks per iteration. If the total number of blocks is not a multiple of 8, hash the first n%8 blocks first.
   1.219 +    mov     len, hlp0
   1.220 +    and	    $~-128, hlp0
   1.221 +
   1.222 +    jz      .Lmod_loop
   1.223 +
   1.224 +    sub     hlp0, len
   1.225 +    sub     $16, hlp0
   1.226 +
   1.227 +   #hash first prefix block
   1.228 +	vmovdqu (inp), DATA
   1.229 +	vpshufb  BSWAP_MASK, DATA, DATA
   1.230 +	vpxor    T, DATA, DATA
   1.231 +	
   1.232 +	vpclmulqdq  $0x00, (Htbl, hlp0), DATA, TMP0
   1.233 +	vpclmulqdq  $0x11, (Htbl, hlp0), DATA, TMP1
   1.234 +	vpshufd     $78, DATA, TMP2
   1.235 +	vpxor       DATA, TMP2, TMP2
   1.236 +	vpclmulqdq  $0x00, 16*8(Htbl, hlp0), TMP2, TMP2
   1.237 +	
   1.238 +	lea	    16(inp), inp
   1.239 +	test    hlp0, hlp0
   1.240 +	jnz	    .Lpre_loop
   1.241 +	jmp	    .Lred1
   1.242 +
    1.243 +    # hash the remaining prefix blocks (up to 7 prefix blocks in total)
   1.244 +.align 64
   1.245 +.Lpre_loop:
   1.246 +
   1.247 +    sub	$16, hlp0
   1.248 +
   1.249 +    vmovdqu     (inp),DATA           # next data block
   1.250 +    vpshufb     BSWAP_MASK,DATA,DATA
   1.251 +
   1.252 +    vpclmulqdq  $0x00, (Htbl,hlp0), DATA, TMP3
   1.253 +    vpxor       TMP3, TMP0, TMP0
   1.254 +    vpclmulqdq  $0x11, (Htbl,hlp0), DATA, TMP3
   1.255 +    vpxor       TMP3, TMP1, TMP1
   1.256 +    vpshufd	    $78, DATA, TMP3
   1.257 +    vpxor       DATA, TMP3, TMP3
   1.258 +    vpclmulqdq  $0x00, 16*8(Htbl,hlp0), TMP3, TMP3
   1.259 +    vpxor       TMP3, TMP2, TMP2
   1.260 +
   1.261 +    test	hlp0, hlp0
   1.262 +
   1.263 +    lea	16(inp), inp
   1.264 +
   1.265 +    jnz	.Lpre_loop
   1.266 +	
   1.267 +.Lred1:
   1.268 +    vpxor       TMP0, TMP2, TMP2
   1.269 +    vpxor       TMP1, TMP2, TMP2
   1.270 +    vpsrldq     $8, TMP2, TMP3
   1.271 +    vpslldq     $8, TMP2, TMP2
   1.272 +
   1.273 +    vpxor       TMP3, TMP1, Xhi
   1.274 +    vpxor       TMP2, TMP0, T
   1.275 +	
   1.276 +.align 64
   1.277 +.Lmod_loop:
   1.278 +    sub	$0x80, len
   1.279 +    jb	.Ldone
   1.280 +
   1.281 +    vmovdqu     16*7(inp),DATA		# Ii
   1.282 +    vpshufb     BSWAP_MASK,DATA,DATA
   1.283 +
   1.284 +    vpclmulqdq  $0x00, (Htbl), DATA, TMP0
   1.285 +    vpclmulqdq  $0x11, (Htbl), DATA, TMP1
   1.286 +    vpshufd     $78, DATA, TMP2
   1.287 +    vpxor       DATA, TMP2, TMP2
   1.288 +    vpclmulqdq  $0x00, 16*8(Htbl), TMP2, TMP2
   1.289 +    #########################################################
   1.290 +    vmovdqu     16*6(inp),DATA
   1.291 +    vpshufb     BSWAP_MASK,DATA,DATA
   1.292 +    KARATSUBA_AAD 1
   1.293 +    #########################################################
   1.294 +    vmovdqu     16*5(inp),DATA
   1.295 +    vpshufb     BSWAP_MASK,DATA,DATA
   1.296 +
   1.297 +    vpclmulqdq  $0x10, .Lpoly(%rip), T, TMP4         #reduction stage 1a
   1.298 +    vpalignr    $8, T, T, T
   1.299 +
   1.300 +    KARATSUBA_AAD 2
   1.301 +
   1.302 +    vpxor       TMP4, T, T                 #reduction stage 1b
   1.303 +    #########################################################
   1.304 +    vmovdqu		16*4(inp),DATA
   1.305 +    vpshufb	    BSWAP_MASK,DATA,DATA
   1.306 +
   1.307 +    KARATSUBA_AAD 3
   1.308 +    #########################################################
   1.309 +    vmovdqu     16*3(inp),DATA
   1.310 +    vpshufb     BSWAP_MASK,DATA,DATA
   1.311 +
   1.312 +    vpclmulqdq  $0x10, .Lpoly(%rip), T, TMP4         #reduction stage 2a
   1.313 +    vpalignr    $8, T, T, T
   1.314 +
   1.315 +    KARATSUBA_AAD 4
   1.316 +
   1.317 +    vpxor       TMP4, T, T                 #reduction stage 2b
   1.318 +    #########################################################
   1.319 +    vmovdqu     16*2(inp),DATA
   1.320 +    vpshufb     BSWAP_MASK,DATA,DATA
   1.321 +
   1.322 +    KARATSUBA_AAD 5
   1.323 +
   1.324 +    vpxor       Xhi, T, T                  #reduction finalize
   1.325 +    #########################################################
   1.326 +    vmovdqu     16*1(inp),DATA
   1.327 +    vpshufb     BSWAP_MASK,DATA,DATA
   1.328 +
   1.329 +    KARATSUBA_AAD 6
   1.330 +    #########################################################
   1.331 +    vmovdqu     16*0(inp),DATA
   1.332 +    vpshufb     BSWAP_MASK,DATA,DATA
   1.333 +    vpxor       T,DATA,DATA
   1.334 +
   1.335 +    KARATSUBA_AAD 7
   1.336 +    #########################################################
   1.337 +    vpxor       TMP0, TMP2, TMP2              # karatsuba fixup
   1.338 +    vpxor       TMP1, TMP2, TMP2
   1.339 +    vpsrldq     $8, TMP2, TMP3
   1.340 +    vpslldq     $8, TMP2, TMP2
   1.341 +
   1.342 +    vpxor       TMP3, TMP1, Xhi
   1.343 +    vpxor       TMP2, TMP0, T
   1.344 +
   1.345 +    lea	16*8(inp), inp
   1.346 +    jmp .Lmod_loop
   1.347 +    #########################################################
   1.348 +
   1.349 +.Ldone:
   1.350 +    vpclmulqdq  $0x10, .Lpoly(%rip), T, TMP3
   1.351 +    vpalignr    $8, T, T, T
   1.352 +    vpxor       TMP3, T, T
   1.353 +
   1.354 +    vpclmulqdq  $0x10, .Lpoly(%rip), T, TMP3
   1.355 +    vpalignr    $8, T, T, T
   1.356 +    vpxor       TMP3, T, T
   1.357 +
   1.358 +    vpxor       Xhi, T, T
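          +
          +    # The two vpclmulqdq/vpalignr pairs above fold the low 128 bits of the
          +    # 256-bit Karatsuba result modulo the GCM polynomial; the deferred high
          +    # half (Xhi) is XORed in last to give the reduced GHASH state.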
   1.359 +   
   1.360 +.Lsave:
   1.361 +    vpshufb     BSWAP_MASK,T, T
   1.362 +    vmovdqu     T,(Tp)
   1.363 +    vzeroupper
   1.364 +
   1.365 +    pop hlp0
   1.366 +    ret
   1.367 +.size   intel_aes_gcmAAD,.-intel_aes_gcmAAD
   1.368 +
   1.369 +################################################################################
   1.370 +# Encrypt and Authenticate
   1.371 +# void intel_aes_gcmENC(uint8_t* PT, uint8_t* CT, void *Gctx,uint64_t len);
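          +#
          +# Gctx layout as used here: the 256-byte Htbl sits at offset 0 (Htbl and Gctx are
          +# the same pointer), the GHASH state at offset 272, the counter block at 288, and
          +# a pointer to the AES key structure at 304 (round count at offset 4, round keys
          +# from offset 48 of that structure).  Plaintext is encrypted in CTR mode 8 blocks
          +# at a time while the previous group of 8 ciphertext blocks is hashed.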
   1.372 +.type intel_aes_gcmENC,@function
   1.373 +.globl intel_aes_gcmENC
   1.374 +.align 16
   1.375 +intel_aes_gcmENC:
   1.376 +
   1.377 +.set PT,%rdi
   1.378 +.set CT,%rsi
   1.379 +.set Htbl, %rdx
   1.380 +.set len, %rcx
   1.381 +.set KS,%r9
   1.382 +.set NR,%r10d
   1.383 +
   1.384 +.set Gctx, %rdx
   1.385 +
   1.386 +.set T,%xmm0
   1.387 +.set TMP0,%xmm1
   1.388 +.set TMP1,%xmm2
   1.389 +.set TMP2,%xmm3
   1.390 +.set TMP3,%xmm4
   1.391 +.set TMP4,%xmm5
   1.392 +.set TMP5,%xmm6
   1.393 +.set CTR0,%xmm7
   1.394 +.set CTR1,%xmm8
   1.395 +.set CTR2,%xmm9
   1.396 +.set CTR3,%xmm10
   1.397 +.set CTR4,%xmm11
   1.398 +.set CTR5,%xmm12
   1.399 +.set CTR6,%xmm13
   1.400 +.set CTR7,%xmm14
   1.401 +.set CTR,%xmm15
   1.402 +
   1.403 +.macro ROUND i
   1.404 +    vmovdqu \i*16(KS), TMP3
   1.405 +    vaesenc TMP3, CTR0, CTR0
   1.406 +    vaesenc TMP3, CTR1, CTR1
   1.407 +    vaesenc TMP3, CTR2, CTR2
   1.408 +    vaesenc TMP3, CTR3, CTR3
   1.409 +    vaesenc TMP3, CTR4, CTR4
   1.410 +    vaesenc TMP3, CTR5, CTR5
   1.411 +    vaesenc TMP3, CTR6, CTR6
   1.412 +    vaesenc TMP3, CTR7, CTR7
   1.413 +.endm
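          +
          +# ROUND i runs one AES round with round key i over all eight counter blocks
          +# CTR0..CTR7.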
   1.414 +
   1.415 +.macro ROUNDMUL i
   1.416 +
   1.417 +    vmovdqu \i*16(%rsp), TMP5
   1.418 +    vmovdqu \i*16(KS), TMP3
   1.419 +
   1.420 +    vaesenc TMP3, CTR0, CTR0
   1.421 +    vaesenc TMP3, CTR1, CTR1
   1.422 +    vaesenc TMP3, CTR2, CTR2
   1.423 +    vaesenc TMP3, CTR3, CTR3
   1.424 +
   1.425 +    vpshufd $78, TMP5, TMP4
   1.426 +    vpxor   TMP5, TMP4, TMP4
   1.427 +
   1.428 +    vaesenc TMP3, CTR4, CTR4
   1.429 +    vaesenc TMP3, CTR5, CTR5
   1.430 +    vaesenc TMP3, CTR6, CTR6
   1.431 +    vaesenc TMP3, CTR7, CTR7
   1.432 +
   1.433 +    vpclmulqdq  $0x00, 128+\i*16(Htbl), TMP4, TMP3
   1.434 +    vpxor       TMP3, TMP0, TMP0
   1.435 +    vmovdqa     \i*16(Htbl), TMP4
   1.436 +    vpclmulqdq  $0x11, TMP4, TMP5, TMP3
   1.437 +    vpxor       TMP3, TMP1, TMP1
   1.438 +    vpclmulqdq  $0x00, TMP4, TMP5, TMP3
   1.439 +    vpxor       TMP3, TMP2, TMP2
   1.440 +  
   1.441 +.endm
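          +
          +# ROUNDMUL i runs AES round i on the eight counter blocks and, interleaved with
          +# it, one Karatsuba GHASH step on the previous group's (byte-swapped) ciphertext
          +# block saved at i*16(%rsp), against Htbl entry i; the low/high/middle partial
          +# products accumulate in TMP2/TMP1/TMP0.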
   1.442 +
   1.443 +.macro KARATSUBA i
   1.444 +    vmovdqu \i*16(%rsp), TMP5
   1.445 +
   1.446 +    vpclmulqdq  $0x11, 16*\i(Htbl), TMP5, TMP3
   1.447 +    vpxor       TMP3, TMP1, TMP1
   1.448 +    vpclmulqdq  $0x00, 16*\i(Htbl), TMP5, TMP3
   1.449 +    vpxor       TMP3, TMP2, TMP2
   1.450 +    vpshufd     $78, TMP5, TMP3
   1.451 +    vpxor       TMP5, TMP3, TMP5
   1.452 +    vpclmulqdq  $0x00, 128+\i*16(Htbl), TMP5, TMP3
   1.453 +    vpxor       TMP3, TMP0, TMP0
   1.454 +.endm
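          +
          +# KARATSUBA i is the hash-only variant of ROUNDMUL: it folds the saved ciphertext
          +# block at i*16(%rsp) into the aggregated GHASH without issuing AES rounds (used
          +# for the final group of 8 blocks, which has no following group to encrypt).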
   1.455 +
   1.456 +    test len, len
   1.457 +    jnz  .Lbegin
   1.458 +    ret
   1.459 +   
   1.460 +.Lbegin:
   1.461 +
   1.462 +    vzeroupper
   1.463 +    push %rbp
   1.464 +    push %rbx
   1.465 +
   1.466 +    movq %rsp, %rbp   
   1.467 +    sub  $128, %rsp
   1.468 +    andq $-16, %rsp
   1.469 +
   1.470 +    vmovdqu  288(Gctx), CTR
   1.471 +    vmovdqu  272(Gctx), T
   1.472 +    mov  304(Gctx), KS
   1.473 +    mov  4(KS), NR
   1.474 +    lea  48(KS), KS
   1.475 +
   1.476 +    vpshufb  .Lbswap_mask(%rip), CTR, CTR
   1.477 +    vpshufb  .Lbswap_mask(%rip), T, T
   1.478 +
   1.479 +    cmp  $128, len
   1.480 +    jb   .LDataSingles
   1.481 +   
   1.482 +# Encrypt the first eight blocks
   1.483 +    sub     $128, len
   1.484 +    vmovdqa CTR, CTR0
   1.485 +    vpaddd  .Lone(%rip), CTR0, CTR1
   1.486 +    vpaddd  .Ltwo(%rip), CTR0, CTR2
   1.487 +    vpaddd  .Lone(%rip), CTR2, CTR3
   1.488 +    vpaddd  .Ltwo(%rip), CTR2, CTR4
   1.489 +    vpaddd  .Lone(%rip), CTR4, CTR5
   1.490 +    vpaddd  .Ltwo(%rip), CTR4, CTR6
   1.491 +    vpaddd  .Lone(%rip), CTR6, CTR7
   1.492 +    vpaddd  .Ltwo(%rip), CTR6, CTR
   1.493 +
   1.494 +    vpshufb .Lbswap_mask(%rip), CTR0, CTR0
   1.495 +    vpshufb .Lbswap_mask(%rip), CTR1, CTR1
   1.496 +    vpshufb .Lbswap_mask(%rip), CTR2, CTR2
   1.497 +    vpshufb .Lbswap_mask(%rip), CTR3, CTR3
   1.498 +    vpshufb .Lbswap_mask(%rip), CTR4, CTR4
   1.499 +    vpshufb .Lbswap_mask(%rip), CTR5, CTR5
   1.500 +    vpshufb .Lbswap_mask(%rip), CTR6, CTR6
   1.501 +    vpshufb .Lbswap_mask(%rip), CTR7, CTR7
   1.502 +
   1.503 +    vpxor   (KS), CTR0, CTR0
   1.504 +    vpxor   (KS), CTR1, CTR1
   1.505 +    vpxor   (KS), CTR2, CTR2
   1.506 +    vpxor   (KS), CTR3, CTR3
   1.507 +    vpxor   (KS), CTR4, CTR4
   1.508 +    vpxor   (KS), CTR5, CTR5
   1.509 +    vpxor   (KS), CTR6, CTR6
   1.510 +    vpxor   (KS), CTR7, CTR7
   1.511 +
   1.512 +    ROUND 1
   1.513 +    ROUND 2
   1.514 +    ROUND 3
   1.515 +    ROUND 4
   1.516 +    ROUND 5
   1.517 +    ROUND 6
   1.518 +    ROUND 7
   1.519 +    ROUND 8
   1.520 +    ROUND 9
   1.521 +
   1.522 +    vmovdqu 160(KS), TMP5
   1.523 +    cmp $12, NR
   1.524 +    jb  .LLast1
   1.525 +
   1.526 +    ROUND 10
   1.527 +    ROUND 11
   1.528 +
   1.529 +    vmovdqu 192(KS), TMP5
   1.530 +    cmp $14, NR
   1.531 +    jb  .LLast1
   1.532 +
   1.533 +    ROUND 12
   1.534 +    ROUND 13
   1.535 +
   1.536 +    vmovdqu 224(KS), TMP5
   1.537 +  
   1.538 +.LLast1:
   1.539 +
   1.540 +    vpxor       (PT), TMP5, TMP3
   1.541 +    vaesenclast TMP3, CTR0, CTR0
   1.542 +    vpxor       16(PT), TMP5, TMP3
   1.543 +    vaesenclast TMP3, CTR1, CTR1
   1.544 +    vpxor       32(PT), TMP5, TMP3
   1.545 +    vaesenclast TMP3, CTR2, CTR2
   1.546 +    vpxor       48(PT), TMP5, TMP3
   1.547 +    vaesenclast TMP3, CTR3, CTR3
   1.548 +    vpxor       64(PT), TMP5, TMP3
   1.549 +    vaesenclast TMP3, CTR4, CTR4
   1.550 +    vpxor       80(PT), TMP5, TMP3
   1.551 +    vaesenclast TMP3, CTR5, CTR5
   1.552 +    vpxor       96(PT), TMP5, TMP3
   1.553 +    vaesenclast TMP3, CTR6, CTR6
   1.554 +    vpxor       112(PT), TMP5, TMP3
   1.555 +    vaesenclast TMP3, CTR7, CTR7
   1.556 +    
   1.557 +    vmovdqu     .Lbswap_mask(%rip), TMP3
   1.558 +   
   1.559 +    vmovdqu CTR0, (CT)
   1.560 +    vpshufb TMP3, CTR0, CTR0
   1.561 +    vmovdqu CTR1, 16(CT)
   1.562 +    vpshufb TMP3, CTR1, CTR1
   1.563 +    vmovdqu CTR2, 32(CT)
   1.564 +    vpshufb TMP3, CTR2, CTR2
   1.565 +    vmovdqu CTR3, 48(CT)
   1.566 +    vpshufb TMP3, CTR3, CTR3
   1.567 +    vmovdqu CTR4, 64(CT)
   1.568 +    vpshufb TMP3, CTR4, CTR4
   1.569 +    vmovdqu CTR5, 80(CT)
   1.570 +    vpshufb TMP3, CTR5, CTR5
   1.571 +    vmovdqu CTR6, 96(CT)
   1.572 +    vpshufb TMP3, CTR6, CTR6
   1.573 +    vmovdqu CTR7, 112(CT)
   1.574 +    vpshufb TMP3, CTR7, CTR7
   1.575 +
   1.576 +    lea 128(CT), CT
   1.577 +    lea 128(PT), PT
   1.578 +    jmp .LDataOctets
   1.579 +
    1.580 +# Encrypt 8 blocks per iteration while hashing the previous 8 ciphertext blocks
   1.581 +.align 64
   1.582 +.LDataOctets:
   1.583 +        cmp $128, len
   1.584 +        jb  .LEndOctets
   1.585 +        sub $128, len
   1.586 +
   1.587 +        vmovdqa CTR7, TMP5
   1.588 +        vmovdqa CTR6, 1*16(%rsp)
   1.589 +        vmovdqa CTR5, 2*16(%rsp)
   1.590 +        vmovdqa CTR4, 3*16(%rsp)
   1.591 +        vmovdqa CTR3, 4*16(%rsp)
   1.592 +        vmovdqa CTR2, 5*16(%rsp)
   1.593 +        vmovdqa CTR1, 6*16(%rsp)
   1.594 +        vmovdqa CTR0, 7*16(%rsp)
   1.595 +
   1.596 +        vmovdqa CTR, CTR0
   1.597 +        vpaddd  .Lone(%rip), CTR0, CTR1
   1.598 +        vpaddd  .Ltwo(%rip), CTR0, CTR2
   1.599 +        vpaddd  .Lone(%rip), CTR2, CTR3
   1.600 +        vpaddd  .Ltwo(%rip), CTR2, CTR4
   1.601 +        vpaddd  .Lone(%rip), CTR4, CTR5
   1.602 +        vpaddd  .Ltwo(%rip), CTR4, CTR6
   1.603 +        vpaddd  .Lone(%rip), CTR6, CTR7
   1.604 +        vpaddd  .Ltwo(%rip), CTR6, CTR
   1.605 +
   1.606 +        vmovdqu (KS), TMP4
   1.607 +        vpshufb TMP3, CTR0, CTR0
   1.608 +        vpxor   TMP4, CTR0, CTR0
   1.609 +        vpshufb TMP3, CTR1, CTR1
   1.610 +        vpxor   TMP4, CTR1, CTR1
   1.611 +        vpshufb TMP3, CTR2, CTR2
   1.612 +        vpxor   TMP4, CTR2, CTR2
   1.613 +        vpshufb TMP3, CTR3, CTR3
   1.614 +        vpxor   TMP4, CTR3, CTR3
   1.615 +        vpshufb TMP3, CTR4, CTR4
   1.616 +        vpxor   TMP4, CTR4, CTR4
   1.617 +        vpshufb TMP3, CTR5, CTR5
   1.618 +        vpxor   TMP4, CTR5, CTR5
   1.619 +        vpshufb TMP3, CTR6, CTR6
   1.620 +        vpxor   TMP4, CTR6, CTR6
   1.621 +        vpshufb TMP3, CTR7, CTR7
   1.622 +        vpxor   TMP4, CTR7, CTR7
   1.623 +
   1.624 +        vmovdqu     16*0(Htbl), TMP3
   1.625 +        vpclmulqdq  $0x11, TMP3, TMP5, TMP1
   1.626 +        vpclmulqdq  $0x00, TMP3, TMP5, TMP2      
   1.627 +        vpshufd     $78, TMP5, TMP3
   1.628 +        vpxor       TMP5, TMP3, TMP5
   1.629 +        vmovdqu     128+0*16(Htbl), TMP3      
   1.630 +        vpclmulqdq  $0x00, TMP3, TMP5, TMP0
   1.631 +
   1.632 +        ROUNDMUL 1
   1.633 +
   1.634 +        ROUNDMUL 2
   1.635 +
   1.636 +        ROUNDMUL 3
   1.637 +
   1.638 +        ROUNDMUL 4
   1.639 +
   1.640 +        ROUNDMUL 5
   1.641 +
   1.642 +        ROUNDMUL 6
   1.643 +
   1.644 +        vpxor   7*16(%rsp), T, TMP5
   1.645 +        vmovdqu 7*16(KS), TMP3
   1.646 +
   1.647 +        vaesenc TMP3, CTR0, CTR0
   1.648 +        vaesenc TMP3, CTR1, CTR1
   1.649 +        vaesenc TMP3, CTR2, CTR2
   1.650 +        vaesenc TMP3, CTR3, CTR3
   1.651 +
   1.652 +        vpshufd $78, TMP5, TMP4
   1.653 +        vpxor   TMP5, TMP4, TMP4
   1.654 +
   1.655 +        vaesenc TMP3, CTR4, CTR4
   1.656 +        vaesenc TMP3, CTR5, CTR5
   1.657 +        vaesenc TMP3, CTR6, CTR6
   1.658 +        vaesenc TMP3, CTR7, CTR7
   1.659 +
   1.660 +        vpclmulqdq  $0x11, 7*16(Htbl), TMP5, TMP3
   1.661 +        vpxor       TMP3, TMP1, TMP1
   1.662 +        vpclmulqdq  $0x00, 7*16(Htbl), TMP5, TMP3
   1.663 +        vpxor       TMP3, TMP2, TMP2
   1.664 +        vpclmulqdq  $0x00, 128+7*16(Htbl), TMP4, TMP3
   1.665 +        vpxor       TMP3, TMP0, TMP0
   1.666 +
   1.667 +        ROUND 8    
   1.668 +        vmovdqa .Lpoly(%rip), TMP5
   1.669 +
   1.670 +        vpxor   TMP1, TMP0, TMP0
   1.671 +        vpxor   TMP2, TMP0, TMP0
   1.672 +        vpsrldq $8, TMP0, TMP3
   1.673 +        vpxor   TMP3, TMP1, TMP4
   1.674 +        vpslldq $8, TMP0, TMP3
   1.675 +        vpxor   TMP3, TMP2, T
   1.676 +
   1.677 +        vpclmulqdq  $0x10, TMP5, T, TMP1
   1.678 +        vpalignr    $8, T, T, T
   1.679 +        vpxor       T, TMP1, T
   1.680 +
   1.681 +        ROUND 9
   1.682 +
   1.683 +        vpclmulqdq  $0x10, TMP5, T, TMP1
   1.684 +        vpalignr    $8, T, T, T
   1.685 +        vpxor       T, TMP1, T
   1.686 +
   1.687 +        vmovdqu 160(KS), TMP5
   1.688 +        cmp     $10, NR
   1.689 +        jbe     .LLast2
   1.690 +
   1.691 +        ROUND 10
   1.692 +        ROUND 11
   1.693 +
   1.694 +        vmovdqu 192(KS), TMP5
   1.695 +        cmp     $12, NR
   1.696 +        jbe     .LLast2
   1.697 +
   1.698 +        ROUND 12
   1.699 +        ROUND 13
   1.700 +
   1.701 +        vmovdqu 224(KS), TMP5
   1.702 +
   1.703 +.LLast2:
   1.704 +      
   1.705 +        vpxor       (PT), TMP5, TMP3
   1.706 +        vaesenclast TMP3, CTR0, CTR0
   1.707 +        vpxor       16(PT), TMP5, TMP3
   1.708 +        vaesenclast TMP3, CTR1, CTR1
   1.709 +        vpxor       32(PT), TMP5, TMP3
   1.710 +        vaesenclast TMP3, CTR2, CTR2
   1.711 +        vpxor       48(PT), TMP5, TMP3
   1.712 +        vaesenclast TMP3, CTR3, CTR3
   1.713 +        vpxor       64(PT), TMP5, TMP3
   1.714 +        vaesenclast TMP3, CTR4, CTR4
   1.715 +        vpxor       80(PT), TMP5, TMP3
   1.716 +        vaesenclast TMP3, CTR5, CTR5
   1.717 +        vpxor       96(PT), TMP5, TMP3
   1.718 +        vaesenclast TMP3, CTR6, CTR6
   1.719 +        vpxor       112(PT), TMP5, TMP3
   1.720 +        vaesenclast TMP3, CTR7, CTR7
   1.721 +
   1.722 +        vmovdqu .Lbswap_mask(%rip), TMP3
   1.723 +
   1.724 +        vmovdqu CTR0, (CT)
   1.725 +        vpshufb TMP3, CTR0, CTR0
   1.726 +        vmovdqu CTR1, 16(CT)
   1.727 +        vpshufb TMP3, CTR1, CTR1
   1.728 +        vmovdqu CTR2, 32(CT)
   1.729 +        vpshufb TMP3, CTR2, CTR2
   1.730 +        vmovdqu CTR3, 48(CT)
   1.731 +        vpshufb TMP3, CTR3, CTR3
   1.732 +        vmovdqu CTR4, 64(CT)
   1.733 +        vpshufb TMP3, CTR4, CTR4
   1.734 +        vmovdqu CTR5, 80(CT)
   1.735 +        vpshufb TMP3, CTR5, CTR5
   1.736 +        vmovdqu CTR6, 96(CT)
   1.737 +        vpshufb TMP3, CTR6, CTR6
   1.738 +        vmovdqu CTR7,112(CT)
   1.739 +        vpshufb TMP3, CTR7, CTR7
   1.740 +
   1.741 +        vpxor   TMP4, T, T
   1.742 +
   1.743 +        lea 128(CT), CT
   1.744 +        lea 128(PT), PT
   1.745 +    jmp  .LDataOctets
   1.746 +
   1.747 +.LEndOctets:
   1.748 +    
   1.749 +    vmovdqa CTR7, TMP5
   1.750 +    vmovdqa CTR6, 1*16(%rsp)
   1.751 +    vmovdqa CTR5, 2*16(%rsp)
   1.752 +    vmovdqa CTR4, 3*16(%rsp)
   1.753 +    vmovdqa CTR3, 4*16(%rsp)
   1.754 +    vmovdqa CTR2, 5*16(%rsp)
   1.755 +    vmovdqa CTR1, 6*16(%rsp)
   1.756 +    vmovdqa CTR0, 7*16(%rsp)
   1.757 +
   1.758 +    vmovdqu     16*0(Htbl), TMP3
   1.759 +    vpclmulqdq  $0x11, TMP3, TMP5, TMP1
   1.760 +    vpclmulqdq  $0x00, TMP3, TMP5, TMP2      
   1.761 +    vpshufd     $78, TMP5, TMP3
   1.762 +    vpxor       TMP5, TMP3, TMP5
   1.763 +    vmovdqu     128+0*16(Htbl), TMP3      
   1.764 +    vpclmulqdq  $0x00, TMP3, TMP5, TMP0
   1.765 +
   1.766 +    KARATSUBA 1
   1.767 +    KARATSUBA 2
   1.768 +    KARATSUBA 3      
   1.769 +    KARATSUBA 4
   1.770 +    KARATSUBA 5
   1.771 +    KARATSUBA 6
   1.772 +
   1.773 +    vmovdqu     7*16(%rsp), TMP5
   1.774 +    vpxor       T, TMP5, TMP5
   1.775 +    vmovdqu     16*7(Htbl), TMP4            
   1.776 +    vpclmulqdq  $0x11, TMP4, TMP5, TMP3
   1.777 +    vpxor       TMP3, TMP1, TMP1
   1.778 +    vpclmulqdq  $0x00, TMP4, TMP5, TMP3
   1.779 +    vpxor       TMP3, TMP2, TMP2      
   1.780 +    vpshufd     $78, TMP5, TMP3
   1.781 +    vpxor       TMP5, TMP3, TMP5
   1.782 +    vmovdqu     128+7*16(Htbl), TMP4      
   1.783 +    vpclmulqdq  $0x00, TMP4, TMP5, TMP3
   1.784 +    vpxor       TMP3, TMP0, TMP0
   1.785 +
   1.786 +    vpxor       TMP1, TMP0, TMP0
   1.787 +    vpxor       TMP2, TMP0, TMP0
   1.788 +
   1.789 +    vpsrldq     $8, TMP0, TMP3
   1.790 +    vpxor       TMP3, TMP1, TMP4
   1.791 +    vpslldq     $8, TMP0, TMP3
   1.792 +    vpxor       TMP3, TMP2, T
   1.793 +
   1.794 +    vmovdqa     .Lpoly(%rip), TMP2
   1.795 +
   1.796 +    vpalignr    $8, T, T, TMP1
   1.797 +    vpclmulqdq  $0x10, TMP2, T, T
   1.798 +    vpxor       T, TMP1, T
   1.799 +
   1.800 +    vpalignr    $8, T, T, TMP1
   1.801 +    vpclmulqdq  $0x10, TMP2, T, T
   1.802 +    vpxor       T, TMP1, T
   1.803 +
   1.804 +    vpxor       TMP4, T, T
   1.805 +
    1.806 +# Here we encrypt any remaining whole blocks, one at a time
   1.807 +.LDataSingles:
   1.808 +
   1.809 +    cmp $16, len
   1.810 +    jb  .LDataTail
   1.811 +    sub $16, len
   1.812 +
   1.813 +    vpshufb .Lbswap_mask(%rip), CTR, TMP1
   1.814 +    vpaddd  .Lone(%rip), CTR, CTR
   1.815 +
   1.816 +    vpxor   (KS), TMP1, TMP1
   1.817 +    vaesenc 16*1(KS), TMP1, TMP1
   1.818 +    vaesenc 16*2(KS), TMP1, TMP1
   1.819 +    vaesenc 16*3(KS), TMP1, TMP1
   1.820 +    vaesenc 16*4(KS), TMP1, TMP1
   1.821 +    vaesenc 16*5(KS), TMP1, TMP1
   1.822 +    vaesenc 16*6(KS), TMP1, TMP1
   1.823 +    vaesenc 16*7(KS), TMP1, TMP1
   1.824 +    vaesenc 16*8(KS), TMP1, TMP1
   1.825 +    vaesenc 16*9(KS), TMP1, TMP1
   1.826 +    vmovdqu 16*10(KS), TMP2
   1.827 +    cmp     $10, NR
   1.828 +    je      .LLast3
   1.829 +    vaesenc 16*10(KS), TMP1, TMP1
   1.830 +    vaesenc 16*11(KS), TMP1, TMP1
   1.831 +    vmovdqu 16*12(KS), TMP2
   1.832 +    cmp     $12, NR
   1.833 +    je      .LLast3
   1.834 +    vaesenc 16*12(KS), TMP1, TMP1
   1.835 +    vaesenc 16*13(KS), TMP1, TMP1
   1.836 +    vmovdqu 16*14(KS), TMP2
   1.837 +
   1.838 +.LLast3:
   1.839 +    vaesenclast TMP2, TMP1, TMP1
   1.840 +
   1.841 +    vpxor   (PT), TMP1, TMP1
   1.842 +    vmovdqu TMP1, (CT)
   1.843 +    addq    $16, CT
   1.844 +    addq    $16, PT
   1.845 +
   1.846 +    vpshufb .Lbswap_mask(%rip), TMP1, TMP1
   1.847 +    vpxor   TMP1, T, T
   1.848 +    vmovdqu (Htbl), TMP0
   1.849 +    call    GFMUL
   1.850 +
   1.851 +    jmp .LDataSingles
   1.852 +
    1.853 +# Here we encrypt the final partial block, if there is one
   1.854 +.LDataTail:
   1.855 +
   1.856 +    test    len, len
   1.857 +    jz      DATA_END
   1.858 +# First prepare the counter block
   1.859 +    vpshufb .Lbswap_mask(%rip), CTR, TMP1
   1.860 +    vpaddd  .Lone(%rip), CTR, CTR
   1.861 +
   1.862 +    vpxor   (KS), TMP1, TMP1
   1.863 +    vaesenc 16*1(KS), TMP1, TMP1
   1.864 +    vaesenc 16*2(KS), TMP1, TMP1
   1.865 +    vaesenc 16*3(KS), TMP1, TMP1
   1.866 +    vaesenc 16*4(KS), TMP1, TMP1
   1.867 +    vaesenc 16*5(KS), TMP1, TMP1
   1.868 +    vaesenc 16*6(KS), TMP1, TMP1
   1.869 +    vaesenc 16*7(KS), TMP1, TMP1
   1.870 +    vaesenc 16*8(KS), TMP1, TMP1
   1.871 +    vaesenc 16*9(KS), TMP1, TMP1
   1.872 +    vmovdqu 16*10(KS), TMP2
   1.873 +    cmp     $10, NR
   1.874 +    je      .LLast4
   1.875 +    vaesenc 16*10(KS), TMP1, TMP1
   1.876 +    vaesenc 16*11(KS), TMP1, TMP1
   1.877 +    vmovdqu 16*12(KS), TMP2
   1.878 +    cmp     $12, NR
   1.879 +    je      .LLast4
   1.880 +    vaesenc 16*12(KS), TMP1, TMP1
   1.881 +    vaesenc 16*13(KS), TMP1, TMP1
   1.882 +    vmovdqu 16*14(KS), TMP2
   1.883 +  
   1.884 +.LLast4:
   1.885 +    vaesenclast TMP2, TMP1, TMP1
   1.886 +#Zero a temp location
   1.887 +    vpxor   TMP2, TMP2, TMP2
   1.888 +    vmovdqa TMP2, (%rsp)
   1.889 +    
   1.890 +# Copy the required bytes only (could probably use rep movsb)
   1.891 +    xor KS, KS  
   1.892 +.LEncCpy:
   1.893 +        cmp     KS, len
   1.894 +        je      .LEncCpyEnd
   1.895 +        movb    (PT, KS, 1), %r8b
   1.896 +        movb    %r8b, (%rsp, KS, 1)
   1.897 +        inc     KS
   1.898 +        jmp .LEncCpy
   1.899 +.LEncCpyEnd:
   1.900 +# Xor with the counter block
   1.901 +    vpxor   (%rsp), TMP1, TMP0
   1.902 +# Again, store at temp location
   1.903 +    vmovdqa TMP0, (%rsp)
   1.904 +# Copy only the required bytes to CT, and zero the rest for the hash
   1.905 +    xor KS, KS
   1.906 +.LEncCpy2:
   1.907 +    cmp     KS, len
   1.908 +    je      .LEncCpy3
   1.909 +    movb    (%rsp, KS, 1), %r8b
   1.910 +    movb    %r8b, (CT, KS, 1)
   1.911 +    inc     KS
   1.912 +    jmp .LEncCpy2
   1.913 +.LEncCpy3:
   1.914 +    cmp     $16, KS
   1.915 +    je      .LEndCpy3
   1.916 +    movb    $0, (%rsp, KS, 1)
   1.917 +    inc     KS
   1.918 +    jmp .LEncCpy3
   1.919 +.LEndCpy3:
   1.920 +   vmovdqa  (%rsp), TMP0
   1.921 +
   1.922 +   vpshufb  .Lbswap_mask(%rip), TMP0, TMP0
   1.923 +   vpxor    TMP0, T, T
   1.924 +   vmovdqu  (Htbl), TMP0
   1.925 +   call     GFMUL
   1.926 +
   1.927 +DATA_END:
   1.928 +
   1.929 +   vpshufb  .Lbswap_mask(%rip), T, T
   1.930 +   vpshufb  .Lbswap_mask(%rip), CTR, CTR
   1.931 +   vmovdqu  T, 272(Gctx)
   1.932 +   vmovdqu  CTR, 288(Gctx)
   1.933 +
   1.934 +   movq   %rbp, %rsp
   1.935 +
   1.936 +   popq   %rbx
   1.937 +   popq   %rbp
   1.938 +   ret
   1.939 +   .size intel_aes_gcmENC, .-intel_aes_gcmENC
   1.940 +  
   1.941 +#########################
   1.942 +# Decrypt and Authenticate
    1.943 +# void intel_aes_gcmDEC(uint8_t* CT, uint8_t* PT, void *Gctx, uint64_t len);
   1.944 +.type intel_aes_gcmDEC,@function
   1.945 +.globl intel_aes_gcmDEC
   1.946 +.align 16
   1.947 +intel_aes_gcmDEC:
   1.948 +# parameter 1: CT    # input
   1.949 +# parameter 2: PT    # output
   1.950 +# parameter 3: %rdx  # Gctx
   1.951 +# parameter 4: %rcx  # len
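          +#
          +# Context layout matches intel_aes_gcmENC (Htbl at offset 0, GHASH state at 272,
          +# counter at 288, key structure pointer at 304).  Since GHASH runs over the
          +# ciphertext, hashing a group of 8 blocks is interleaved with decrypting that
          +# same group, so the main loop needs no staging of blocks on the stack.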
   1.952 +
   1.953 +.macro DEC_KARATSUBA i
   1.954 +    vmovdqu     (7-\i)*16(CT), TMP5
   1.955 +    vpshufb     .Lbswap_mask(%rip), TMP5, TMP5
   1.956 +
   1.957 +    vpclmulqdq  $0x11, 16*\i(Htbl), TMP5, TMP3
   1.958 +    vpxor       TMP3, TMP1, TMP1
   1.959 +    vpclmulqdq  $0x00, 16*\i(Htbl), TMP5, TMP3
   1.960 +    vpxor       TMP3, TMP2, TMP2
   1.961 +    vpshufd     $78, TMP5, TMP3
   1.962 +    vpxor       TMP5, TMP3, TMP5
   1.963 +    vpclmulqdq  $0x00, 128+\i*16(Htbl), TMP5, TMP3
   1.964 +    vpxor       TMP3, TMP0, TMP0
   1.965 +.endm
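          +
          +# DEC_KARATSUBA i loads ciphertext block (7-i) directly from CT, byte-swaps it,
          +# and folds it into the aggregated GHASH against Htbl entry i, accumulating the
          +# low/high/middle Karatsuba products in TMP2/TMP1/TMP0.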
   1.966 +
   1.967 +.set PT,%rsi
   1.968 +.set CT,%rdi
   1.969 +.set Htbl, %rdx
   1.970 +.set len, %rcx
   1.971 +.set KS,%r9
   1.972 +.set NR,%r10d
   1.973 +
   1.974 +.set Gctx, %rdx
   1.975 +
   1.976 +.set T,%xmm0
   1.977 +.set TMP0,%xmm1
   1.978 +.set TMP1,%xmm2
   1.979 +.set TMP2,%xmm3
   1.980 +.set TMP3,%xmm4
   1.981 +.set TMP4,%xmm5
   1.982 +.set TMP5,%xmm6
   1.983 +.set CTR0,%xmm7
   1.984 +.set CTR1,%xmm8
   1.985 +.set CTR2,%xmm9
   1.986 +.set CTR3,%xmm10
   1.987 +.set CTR4,%xmm11
   1.988 +.set CTR5,%xmm12
   1.989 +.set CTR6,%xmm13
   1.990 +.set CTR7,%xmm14
   1.991 +.set CTR,%xmm15
   1.992 +
   1.993 +    test  len, len
   1.994 +    jnz   .LbeginDec
   1.995 +    ret
   1.996 +   
   1.997 +.LbeginDec:
   1.998 +
   1.999 +    pushq   %rbp
  1.1000 +    pushq   %rbx
  1.1001 +    movq    %rsp, %rbp   
  1.1002 +    sub     $128, %rsp
  1.1003 +    andq    $-16, %rsp
  1.1004 +    vmovdqu 288(Gctx), CTR
  1.1005 +    vmovdqu 272(Gctx), T
  1.1006 +    mov     304(Gctx), KS
  1.1007 +    mov     4(KS), NR
  1.1008 +    lea     48(KS), KS
  1.1009 +
  1.1010 +    vpshufb .Lbswap_mask(%rip), CTR, CTR
  1.1011 +    vpshufb .Lbswap_mask(%rip), T, T
  1.1012 +     
  1.1013 +    vmovdqu .Lbswap_mask(%rip), TMP3
  1.1014 +    jmp     .LDECOctets
  1.1015 +      
   1.1016 +# Decrypt 8 blocks per iteration while hashing the same 8 ciphertext blocks
  1.1017 +.align 64
  1.1018 +.LDECOctets:
  1.1019 +   
  1.1020 +        cmp $128, len
  1.1021 +        jb  .LDECSingles
  1.1022 +        sub $128, len
  1.1023 +
  1.1024 +        vmovdqa CTR, CTR0
  1.1025 +        vpaddd  .Lone(%rip), CTR0, CTR1
  1.1026 +        vpaddd  .Ltwo(%rip), CTR0, CTR2
  1.1027 +        vpaddd  .Lone(%rip), CTR2, CTR3
  1.1028 +        vpaddd  .Ltwo(%rip), CTR2, CTR4
  1.1029 +        vpaddd  .Lone(%rip), CTR4, CTR5
  1.1030 +        vpaddd  .Ltwo(%rip), CTR4, CTR6
  1.1031 +        vpaddd  .Lone(%rip), CTR6, CTR7
  1.1032 +        vpaddd  .Ltwo(%rip), CTR6, CTR
  1.1033 +
  1.1034 +        vpshufb TMP3, CTR0, CTR0
  1.1035 +        vpshufb TMP3, CTR1, CTR1
  1.1036 +        vpshufb TMP3, CTR2, CTR2
  1.1037 +        vpshufb TMP3, CTR3, CTR3
  1.1038 +        vpshufb TMP3, CTR4, CTR4
  1.1039 +        vpshufb TMP3, CTR5, CTR5
  1.1040 +        vpshufb TMP3, CTR6, CTR6
  1.1041 +        vpshufb TMP3, CTR7, CTR7
  1.1042 +
  1.1043 +        vmovdqu (KS), TMP3
  1.1044 +        vpxor  TMP3, CTR0, CTR0
  1.1045 +        vpxor  TMP3, CTR1, CTR1
  1.1046 +        vpxor  TMP3, CTR2, CTR2
  1.1047 +        vpxor  TMP3, CTR3, CTR3
  1.1048 +        vpxor  TMP3, CTR4, CTR4
  1.1049 +        vpxor  TMP3, CTR5, CTR5
  1.1050 +        vpxor  TMP3, CTR6, CTR6
  1.1051 +        vpxor  TMP3, CTR7, CTR7
  1.1052 +
  1.1053 +        vmovdqu     7*16(CT), TMP5
  1.1054 +        vpshufb     .Lbswap_mask(%rip), TMP5, TMP5
  1.1055 +        vmovdqu     16*0(Htbl), TMP3
  1.1056 +        vpclmulqdq  $0x11, TMP3, TMP5, TMP1
  1.1057 +        vpclmulqdq  $0x00, TMP3, TMP5, TMP2      
  1.1058 +        vpshufd     $78, TMP5, TMP3
  1.1059 +        vpxor       TMP5, TMP3, TMP5
  1.1060 +        vmovdqu     128+0*16(Htbl), TMP3      
  1.1061 +        vpclmulqdq  $0x00, TMP3, TMP5, TMP0
  1.1062 +
  1.1063 +        ROUND 1
  1.1064 +        DEC_KARATSUBA 1
  1.1065 +
  1.1066 +        ROUND 2
  1.1067 +        DEC_KARATSUBA 2
  1.1068 +
  1.1069 +        ROUND 3
  1.1070 +        DEC_KARATSUBA 3
  1.1071 +
  1.1072 +        ROUND 4
  1.1073 +        DEC_KARATSUBA 4
  1.1074 +
  1.1075 +        ROUND 5
  1.1076 +        DEC_KARATSUBA 5
  1.1077 +
  1.1078 +        ROUND 6
  1.1079 +        DEC_KARATSUBA 6
  1.1080 +
  1.1081 +        ROUND 7
  1.1082 +
  1.1083 +        vmovdqu     0*16(CT), TMP5
  1.1084 +        vpshufb     .Lbswap_mask(%rip), TMP5, TMP5
  1.1085 +        vpxor       T, TMP5, TMP5
  1.1086 +        vmovdqu     16*7(Htbl), TMP4
  1.1087 +            
  1.1088 +        vpclmulqdq  $0x11, TMP4, TMP5, TMP3
  1.1089 +        vpxor       TMP3, TMP1, TMP1
  1.1090 +        vpclmulqdq  $0x00, TMP4, TMP5, TMP3
  1.1091 +        vpxor       TMP3, TMP2, TMP2
  1.1092 +
  1.1093 +        vpshufd     $78, TMP5, TMP3
  1.1094 +        vpxor       TMP5, TMP3, TMP5
  1.1095 +        vmovdqu     128+7*16(Htbl), TMP4
  1.1096 +
  1.1097 +        vpclmulqdq  $0x00, TMP4, TMP5, TMP3
  1.1098 +        vpxor       TMP3, TMP0, TMP0
  1.1099 +
  1.1100 +        ROUND 8      
  1.1101 +
  1.1102 +        vpxor       TMP1, TMP0, TMP0
  1.1103 +        vpxor       TMP2, TMP0, TMP0
  1.1104 +
  1.1105 +        vpsrldq     $8, TMP0, TMP3
  1.1106 +        vpxor       TMP3, TMP1, TMP4
  1.1107 +        vpslldq     $8, TMP0, TMP3
  1.1108 +        vpxor       TMP3, TMP2, T
  1.1109 +        vmovdqa	  .Lpoly(%rip), TMP2
  1.1110 +
  1.1111 +        vpalignr    $8, T, T, TMP1
  1.1112 +        vpclmulqdq  $0x10, TMP2, T, T
  1.1113 +        vpxor       T, TMP1, T
  1.1114 +
  1.1115 +        ROUND 9
  1.1116 +
  1.1117 +        vpalignr    $8, T, T, TMP1
  1.1118 +        vpclmulqdq  $0x10, TMP2, T, T
  1.1119 +        vpxor       T, TMP1, T
  1.1120 +
  1.1121 +        vmovdqu     160(KS), TMP5
  1.1122 +        cmp         $10, NR
  1.1123 +
  1.1124 +        jbe  .LDECLast1
  1.1125 +
  1.1126 +        ROUND 10
  1.1127 +        ROUND 11
  1.1128 +
  1.1129 +        vmovdqu     192(KS), TMP5
  1.1130 +        cmp         $12, NR       
  1.1131 +
  1.1132 +        jbe  .LDECLast1
  1.1133 +
  1.1134 +        ROUND 12
  1.1135 +        ROUND 13
  1.1136 +
  1.1137 +        vmovdqu  224(KS), TMP5
  1.1138 +
  1.1139 +.LDECLast1:      
  1.1140 +      
  1.1141 +        vpxor   (CT), TMP5, TMP3
  1.1142 +        vaesenclast TMP3, CTR0, CTR0
  1.1143 +        vpxor   16(CT), TMP5, TMP3
  1.1144 +        vaesenclast TMP3, CTR1, CTR1
  1.1145 +        vpxor   32(CT), TMP5, TMP3
  1.1146 +        vaesenclast TMP3, CTR2, CTR2
  1.1147 +        vpxor   48(CT), TMP5, TMP3
  1.1148 +        vaesenclast TMP3, CTR3, CTR3
  1.1149 +        vpxor   64(CT), TMP5, TMP3
  1.1150 +        vaesenclast TMP3, CTR4, CTR4
  1.1151 +        vpxor   80(CT), TMP5, TMP3
  1.1152 +        vaesenclast TMP3, CTR5, CTR5
  1.1153 +        vpxor   96(CT), TMP5, TMP3
  1.1154 +        vaesenclast TMP3, CTR6, CTR6
  1.1155 +        vpxor   112(CT), TMP5, TMP3
  1.1156 +        vaesenclast TMP3, CTR7, CTR7
  1.1157 +
  1.1158 +        vmovdqu .Lbswap_mask(%rip), TMP3
  1.1159 +
  1.1160 +        vmovdqu CTR0, (PT)
  1.1161 +        vmovdqu CTR1, 16(PT)
  1.1162 +        vmovdqu CTR2, 32(PT)
  1.1163 +        vmovdqu CTR3, 48(PT)
  1.1164 +        vmovdqu CTR4, 64(PT)
  1.1165 +        vmovdqu CTR5, 80(PT)
  1.1166 +        vmovdqu CTR6, 96(PT)
  1.1167 +        vmovdqu CTR7,112(PT)
  1.1168 +
  1.1169 +        vpxor   TMP4, T, T
  1.1170 +
  1.1171 +        lea 128(CT), CT
  1.1172 +        lea 128(PT), PT
  1.1173 +   jmp  .LDECOctets
  1.1174 +   
   1.1175 +# Here we decrypt and hash any remaining whole blocks, one at a time
  1.1176 +.LDECSingles:
  1.1177 +
  1.1178 +    cmp   $16, len
  1.1179 +    jb    .LDECTail
  1.1180 +    sub   $16, len
  1.1181 +
  1.1182 +    vmovdqu  (CT), TMP1
  1.1183 +    vpshufb  .Lbswap_mask(%rip), TMP1, TMP1
  1.1184 +    vpxor    TMP1, T, T
  1.1185 +    vmovdqu  (Htbl), TMP0
  1.1186 +    call     GFMUL
  1.1187 +
  1.1188 +
  1.1189 +    vpshufb  .Lbswap_mask(%rip), CTR, TMP1
  1.1190 +    vpaddd   .Lone(%rip), CTR, CTR
  1.1191 +
  1.1192 +    vpxor    (KS), TMP1, TMP1
  1.1193 +    vaesenc  16*1(KS), TMP1, TMP1
  1.1194 +    vaesenc  16*2(KS), TMP1, TMP1
  1.1195 +    vaesenc  16*3(KS), TMP1, TMP1
  1.1196 +    vaesenc  16*4(KS), TMP1, TMP1
  1.1197 +    vaesenc  16*5(KS), TMP1, TMP1
  1.1198 +    vaesenc  16*6(KS), TMP1, TMP1
  1.1199 +    vaesenc  16*7(KS), TMP1, TMP1
  1.1200 +    vaesenc  16*8(KS), TMP1, TMP1
  1.1201 +    vaesenc  16*9(KS), TMP1, TMP1
  1.1202 +    vmovdqu  16*10(KS), TMP2
  1.1203 +    cmp      $10, NR
  1.1204 +    je       .LDECLast2
  1.1205 +    vaesenc  16*10(KS), TMP1, TMP1
  1.1206 +    vaesenc  16*11(KS), TMP1, TMP1
  1.1207 +    vmovdqu  16*12(KS), TMP2
  1.1208 +    cmp      $12, NR
  1.1209 +    je       .LDECLast2
  1.1210 +    vaesenc  16*12(KS), TMP1, TMP1
  1.1211 +    vaesenc  16*13(KS), TMP1, TMP1
  1.1212 +    vmovdqu  16*14(KS), TMP2
  1.1213 +.LDECLast2:
  1.1214 +    vaesenclast TMP2, TMP1, TMP1
  1.1215 +
  1.1216 +    vpxor    (CT), TMP1, TMP1
  1.1217 +    vmovdqu  TMP1, (PT)
  1.1218 +    addq     $16, CT
  1.1219 +    addq     $16, PT  
  1.1220 +    jmp   .LDECSingles
  1.1221 +
  1.1222 +#Here we decrypt the final partial block, if there is one
  1.1223 +.LDECTail:
  1.1224 +   test   len, len
  1.1225 +   jz     .LDEC_END
  1.1226 +
  1.1227 +   vpshufb  .Lbswap_mask(%rip), CTR, TMP1
  1.1228 +   vpaddd .Lone(%rip), CTR, CTR
  1.1229 +
  1.1230 +   vpxor  (KS), TMP1, TMP1
  1.1231 +   vaesenc  16*1(KS), TMP1, TMP1
  1.1232 +   vaesenc  16*2(KS), TMP1, TMP1
  1.1233 +   vaesenc  16*3(KS), TMP1, TMP1
  1.1234 +   vaesenc  16*4(KS), TMP1, TMP1
  1.1235 +   vaesenc  16*5(KS), TMP1, TMP1
  1.1236 +   vaesenc  16*6(KS), TMP1, TMP1
  1.1237 +   vaesenc  16*7(KS), TMP1, TMP1
  1.1238 +   vaesenc  16*8(KS), TMP1, TMP1
  1.1239 +   vaesenc  16*9(KS), TMP1, TMP1
  1.1240 +   vmovdqu  16*10(KS), TMP2
  1.1241 +   cmp      $10, NR
  1.1242 +   je       .LDECLast3
  1.1243 +   vaesenc  16*10(KS), TMP1, TMP1
  1.1244 +   vaesenc  16*11(KS), TMP1, TMP1
  1.1245 +   vmovdqu  16*12(KS), TMP2
  1.1246 +   cmp      $12, NR
  1.1247 +   je       .LDECLast3
  1.1248 +   vaesenc  16*12(KS), TMP1, TMP1
  1.1249 +   vaesenc  16*13(KS), TMP1, TMP1
  1.1250 +   vmovdqu  16*14(KS), TMP2
  1.1251 +
  1.1252 +.LDECLast3:
  1.1253 +   vaesenclast TMP2, TMP1, TMP1
  1.1254 +  
  1.1255 +   vpxor   TMP2, TMP2, TMP2
  1.1256 +   vmovdqa TMP2, (%rsp) 
  1.1257 +# Copy the required bytes only (could probably use rep movsb)
  1.1258 +    xor KS, KS  
  1.1259 +.LDecCpy:
  1.1260 +        cmp     KS, len
  1.1261 +        je      .LDecCpy2
  1.1262 +        movb    (CT, KS, 1), %r8b
  1.1263 +        movb    %r8b, (%rsp, KS, 1)
  1.1264 +        inc     KS
  1.1265 +        jmp     .LDecCpy
  1.1266 +.LDecCpy2:
  1.1267 +        cmp     $16, KS
  1.1268 +        je      .LDecCpyEnd
  1.1269 +        movb    $0, (%rsp, KS, 1)
  1.1270 +        inc     KS
  1.1271 +        jmp     .LDecCpy2
  1.1272 +.LDecCpyEnd:
  1.1273 +# Xor with the counter block
  1.1274 +    vmovdqa (%rsp), TMP0
  1.1275 +    vpxor   TMP0, TMP1, TMP1
  1.1276 +# Again, store at temp location
  1.1277 +    vmovdqa TMP1, (%rsp)
  1.1278 +# Copy only the required bytes to PT, and zero the rest for the hash
  1.1279 +    xor KS, KS
  1.1280 +.LDecCpy3:
  1.1281 +    cmp     KS, len
  1.1282 +    je      .LDecCpyEnd3
  1.1283 +    movb    (%rsp, KS, 1), %r8b
  1.1284 +    movb    %r8b, (PT, KS, 1)
  1.1285 +    inc     KS
  1.1286 +    jmp     .LDecCpy3
  1.1287 +.LDecCpyEnd3:
  1.1288 +   vpshufb  .Lbswap_mask(%rip), TMP0, TMP0
  1.1289 +   vpxor    TMP0, T, T
  1.1290 +   vmovdqu  (Htbl), TMP0
  1.1291 +   call     GFMUL
  1.1292 +.LDEC_END:
  1.1293 +
  1.1294 +   vpshufb  .Lbswap_mask(%rip), T, T
  1.1295 +   vpshufb  .Lbswap_mask(%rip), CTR, CTR
  1.1296 +   vmovdqu  T, 272(Gctx)
  1.1297 +   vmovdqu  CTR, 288(Gctx)
  1.1298 +
  1.1299 +   movq   %rbp, %rsp
  1.1300 +
  1.1301 +   popq   %rbx
  1.1302 +   popq   %rbp
  1.1303 +   ret
  1.1304 +  .size intel_aes_gcmDEC, .-intel_aes_gcmDEC
  1.1305 +#########################
  1.1306 +# a = T
  1.1307 +# b = TMP0 - remains unchanged
  1.1308 +# res = T
  1.1309 +# uses also TMP1,TMP2,TMP3,TMP4
  1.1310 +# __m128i GFMUL(__m128i A, __m128i B);
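          +#
          +# GFMUL multiplies T by TMP0 in GF(2^128) as used by GHASH: a Karatsuba
          +# carry-less multiply (three vpclmulqdq), then a two-step fold against .Lpoly
          +# reduces the 256-bit product back to 128 bits, leaving the result in T.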
  1.1311 +.type GFMUL,@function
  1.1312 +.globl GFMUL
  1.1313 +GFMUL:  
  1.1314 +    vpclmulqdq  $0x00, TMP0, T, TMP1
  1.1315 +    vpclmulqdq  $0x11, TMP0, T, TMP4
  1.1316 +
  1.1317 +    vpshufd     $78, T, TMP2
  1.1318 +    vpshufd     $78, TMP0, TMP3
  1.1319 +    vpxor       T, TMP2, TMP2
  1.1320 +    vpxor       TMP0, TMP3, TMP3
  1.1321 +
  1.1322 +    vpclmulqdq  $0x00, TMP3, TMP2, TMP2
  1.1323 +    vpxor       TMP1, TMP2, TMP2
  1.1324 +    vpxor       TMP4, TMP2, TMP2
  1.1325 +
  1.1326 +    vpslldq     $8, TMP2, TMP3
  1.1327 +    vpsrldq     $8, TMP2, TMP2
  1.1328 +
  1.1329 +    vpxor       TMP3, TMP1, TMP1
  1.1330 +    vpxor       TMP2, TMP4, TMP4
  1.1331 +
  1.1332 +    vpclmulqdq  $0x10, .Lpoly(%rip), TMP1, TMP2
  1.1333 +    vpshufd     $78, TMP1, TMP3
  1.1334 +    vpxor       TMP3, TMP2, TMP1
  1.1335 +
  1.1336 +    vpclmulqdq  $0x10, .Lpoly(%rip), TMP1, TMP2
  1.1337 +    vpshufd     $78, TMP1, TMP3
  1.1338 +    vpxor       TMP3, TMP2, TMP1
  1.1339 +
  1.1340 +    vpxor       TMP4, TMP1, T
  1.1341 +    ret
  1.1342 +.size GFMUL, .-GFMUL
  1.1343 +
