security/nss/lib/freebl/mpi/mpi_x86_asm.c

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/security/nss/lib/freebl/mpi/mpi_x86_asm.c	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,535 @@
     1.4 +/*
     1.5 + *  mpi_x86_asm.c - MSVC inline assembly implementation of s_mpv_ functions.
     1.6 + * 
     1.7 + * This Source Code Form is subject to the terms of the Mozilla Public
     1.8 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.9 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
    1.10 +
    1.11 +#include "mpi-priv.h"
    1.12 +
    1.13 +static int is_sse = -1;	/* < 0: not probed yet, 0: use x86 loops, > 0: use SSE2 loops */
    1.14 +extern unsigned long s_mpi_is_sse2();	/* its return value is cached in is_sse; defined elsewhere */
    1.15 +
    1.16 +/*
    1.17 + *  s_mpv_mul_d - multiply the a_len-digit vector a by the digit b and
    1.18 + *  store the product in c: a_len result digits followed by the final
    1.19 + *  carry digit, so c receives a_len + 1 digits.
    1.20 + *
    1.21 + *  is_sse selects the code path: 0 runs the plain x86 loop, a positive
    1.22 + *  value runs the SSE2 loop, and a negative value first calls
    1.23 + *  s_mpi_is_sse2() and stores what it returns in is_sse.
    1.24 + *
    1.25 + *  Stack frame of the x86 path once its prologue has run:
    1.26 + *   ebp - 40:	caller's ebx
    1.27 + *   ebp - 36:	caller's esi
    1.28 + *   ebp - 32:	caller's edi
    1.29 + *   ebp - 28 .. ebp - 4:  reserved by "sub esp,28", never referenced
    1.30 + *   ebp + 0:	caller's ebp
    1.31 + *   ebp + 4:	return address
    1.32 + *   ebp + 8:	a	argument
    1.33 + *   ebp + 12:	a_len	argument
    1.34 + *   ebp + 16:	b	argument
    1.35 + *   ebp + 20:	c	argument
    1.36 + *  (the SSE2 path saves edi at ebp - 4 and esi at ebp - 8 instead)
    1.37 + *  registers: ebx or mm2 = carry, ecx = a_len, esi = a ptr,
    1.38 + *  edi = c ptr, mm1 = b; "ax" in the stosd remarks means all of eax.
    1.39 + */
    1.40 +__declspec(naked) void
    1.41 +s_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
    1.42 +{
    1.43 +  __asm {
    1.44 +    mov    eax, is_sse
    1.45 +    cmp    eax, 0
    1.46 +    je     s_mpv_mul_d_x86
    1.47 +    jg     s_mpv_mul_d_sse2
    1.48 +    call   s_mpi_is_sse2
    1.49 +    mov    is_sse, eax
    1.50 +    cmp    eax, 0
    1.51 +    jg     s_mpv_mul_d_sse2
    1.52 +s_mpv_mul_d_x86:
    1.53 +    push   ebp
    1.54 +    mov    ebp,esp
    1.55 +    sub    esp,28
    1.56 +    push   edi
    1.57 +    push   esi
    1.58 +    push   ebx
    1.59 +    mov    ebx,0		; carry = 0
    1.60 +    mov    ecx,[ebp+12]		; ecx = a_len
    1.61 +    mov    edi,[ebp+20]
    1.62 +    cmp    ecx,0
    1.63 +    je     L_2			; jmp if a_len == 0
    1.64 +    mov    esi,[ebp+8]		; esi = a
    1.65 +    cld
    1.66 +L_1:
    1.67 +    lodsd			; eax = [ds:esi]; esi += 4
    1.68 +    mov    edx,[ebp+16]		; edx = b
    1.69 +    mul    edx			; edx:eax = Phi:Plo = a_i * b
    1.70 +
    1.71 +    add    eax,ebx		; add carry (ebx) to edx:eax
    1.72 +    adc    edx,0
    1.73 +    mov    ebx,edx		; high half of product becomes next carry
    1.74 +
    1.75 +    stosd			; [es:edi] = ax; edi += 4;
    1.76 +    dec    ecx			; --a_len
    1.77 +    jnz    L_1			; jmp if a_len != 0
    1.78 +L_2:
    1.79 +    mov    [edi],ebx		; *c = carry
    1.80 +    pop    ebx
    1.81 +    pop    esi
    1.82 +    pop    edi
    1.83 +    leave  
    1.84 +    ret    
    1.85 +    nop
    1.86 +s_mpv_mul_d_sse2:
    1.87 +    push   ebp
    1.88 +    mov    ebp, esp
    1.89 +    push   edi
    1.90 +    push   esi
    1.91 +    psubq  mm2, mm2		; carry = 0
    1.92 +    mov    ecx, [ebp+12]	; ecx = a_len
    1.93 +    movd   mm1, [ebp+16]	; mm1 = b
    1.94 +    mov    edi, [ebp+20]
    1.95 +    cmp    ecx, 0
    1.96 +    je     L_6			; jmp if a_len == 0
    1.97 +    mov    esi, [ebp+8]		; esi = a
    1.98 +    cld
    1.99 +L_5:
   1.100 +    movd   mm0, [esi]		; mm0 = *a++
   1.101 +    add    esi, 4
   1.102 +    pmuludq mm0, mm1		; mm0 = b * *a++
   1.103 +    paddq  mm2, mm0		; add the carry
   1.104 +    movd   [edi], mm2		; store the 32bit result
   1.105 +    add    edi, 4
   1.106 +    psrlq  mm2, 32		; save the carry
   1.107 +    dec    ecx			; --a_len
   1.108 +    jnz    L_5			; jmp if a_len != 0
   1.109 +L_6:
   1.110 +    movd   [edi], mm2		; *c = carry
   1.111 +    emms
   1.112 +    pop    esi
   1.113 +    pop    edi
   1.114 +    leave  
   1.115 +    ret    
   1.116 +    nop
   1.117 +  }
   1.118 +}
   1.119 +
   1.120 +/*
   1.121 + *  s_mpv_mul_d_add - multiply the a_len-digit vector a by the digit b
   1.122 + *  and add the product into the first a_len digits of c; the final
   1.123 + *  carry is then stored (not added) in the digit that follows them.
   1.124 + *
   1.125 + *  is_sse selects the code path: 0 runs the plain x86 loop, a positive
   1.126 + *  value runs the SSE2 loop, and a negative value first calls
   1.127 + *  s_mpi_is_sse2() and stores what it returns in is_sse.
   1.128 + *
   1.129 + *  Stack frame of the x86 path once its prologue has run:
   1.130 + *   ebp - 40:	caller's ebx
   1.131 + *   ebp - 36:	caller's esi
   1.132 + *   ebp - 32:	caller's edi
   1.133 + *   ebp - 28 .. ebp - 4:  reserved by "sub esp,28", never referenced
   1.134 + *   ebp + 0:	caller's ebp
   1.135 + *   ebp + 4:	return address
   1.136 + *   ebp + 8:	a	argument
   1.137 + *   ebp + 12:	a_len	argument
   1.138 + *   ebp + 16:	b	argument
   1.139 + *   ebp + 20:	c	argument
   1.140 + *  (the SSE2 path saves edi at ebp - 4 and esi at ebp - 8 instead)
   1.141 + *  registers: ebx or mm2 = carry, ecx = a_len, esi = a ptr, edi = c ptr,
   1.142 + *  mm1 = b; "ax" means eax; the 2nd SSE2 "add the carry" adds *c.
   1.143 + */
   1.144 +__declspec(naked) void
   1.145 +s_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
   1.146 +{
   1.147 +  __asm {
   1.148 +    mov    eax, is_sse
   1.149 +    cmp    eax, 0
   1.150 +    je     s_mpv_mul_d_add_x86
   1.151 +    jg     s_mpv_mul_d_add_sse2
   1.152 +    call   s_mpi_is_sse2
   1.153 +    mov    is_sse, eax
   1.154 +    cmp    eax, 0
   1.155 +    jg     s_mpv_mul_d_add_sse2
   1.156 +s_mpv_mul_d_add_x86:
   1.157 +    push   ebp
   1.158 +    mov    ebp,esp
   1.159 +    sub    esp,28
   1.160 +    push   edi
   1.161 +    push   esi
   1.162 +    push   ebx
   1.163 +    mov    ebx,0		; carry = 0
   1.164 +    mov    ecx,[ebp+12]		; ecx = a_len
   1.165 +    mov    edi,[ebp+20]
   1.166 +    cmp    ecx,0
   1.167 +    je     L_11			; jmp if a_len == 0
   1.168 +    mov    esi,[ebp+8]		; esi = a
   1.169 +    cld
   1.170 +L_10:
   1.171 +    lodsd			; eax = [ds:esi]; esi += 4
   1.172 +    mov    edx,[ebp+16]		; edx = b
   1.173 +    mul    edx			; edx:eax = Phi:Plo = a_i * b
   1.174 +
   1.175 +    add    eax,ebx		; add carry (ebx) to edx:eax
   1.176 +    adc    edx,0
   1.177 +    mov    ebx,[edi]		; add in current word from *c
   1.178 +    add    eax,ebx		
   1.179 +    adc    edx,0
   1.180 +    mov    ebx,edx		; high half of product becomes next carry
   1.181 +
   1.182 +    stosd			; [es:edi] = ax; edi += 4;
   1.183 +    dec    ecx			; --a_len
   1.184 +    jnz    L_10			; jmp if a_len != 0
   1.185 +L_11:
   1.186 +    mov    [edi],ebx		; *c = carry
   1.187 +    pop    ebx
   1.188 +    pop    esi
   1.189 +    pop    edi
   1.190 +    leave  
   1.191 +    ret    
   1.192 +    nop
   1.193 +s_mpv_mul_d_add_sse2:
   1.194 +    push   ebp
   1.195 +    mov    ebp, esp
   1.196 +    push   edi
   1.197 +    push   esi
   1.198 +    psubq  mm2, mm2		; carry = 0
   1.199 +    mov    ecx, [ebp+12]	; ecx = a_len
   1.200 +    movd   mm1, [ebp+16]	; mm1 = b
   1.201 +    mov    edi, [ebp+20]
   1.202 +    cmp    ecx, 0
   1.203 +    je     L_16			; jmp if a_len == 0
   1.204 +    mov    esi, [ebp+8]		; esi = a
   1.205 +    cld
   1.206 +L_15:
   1.207 +    movd   mm0, [esi]		; mm0 = *a++
   1.208 +    add    esi, 4
   1.209 +    pmuludq mm0, mm1		; mm0 = b * *a++
   1.210 +    paddq  mm2, mm0		; add the carry
   1.211 +    movd   mm0, [edi]
   1.212 +    paddq  mm2, mm0		; add the carry
   1.213 +    movd   [edi], mm2		; store the 32bit result
   1.214 +    add    edi, 4
   1.215 +    psrlq  mm2, 32		; save the carry
   1.216 +    dec    ecx			; --a_len
   1.217 +    jnz    L_15			; jmp if a_len != 0
   1.218 +L_16:
   1.219 +    movd   [edi], mm2		; *c = carry
   1.220 +    emms
   1.221 +    pop    esi
   1.222 +    pop    edi
   1.223 +    leave  
   1.224 +    ret    
   1.225 +    nop
   1.226 +  }
   1.227 +}
   1.228 +
   1.229 +/*
   1.230 + *  s_mpv_mul_d_add_prop - multiply the a_len-digit vector a by the
   1.231 + *  digit b and add the product into the first a_len digits of c; a
   1.232 + *  nonzero final carry is added to the following digits of c, moving
   1.233 + *  up one digit at a time for as long as the addition carries out.
   1.234 + *
   1.235 + *  is_sse selects the code path: 0 runs the plain x86 loop, a positive
   1.236 + *  value runs the SSE2 loop, and a negative value first calls
   1.237 + *  s_mpi_is_sse2() and stores what it returns in is_sse.
   1.238 + *
   1.239 + *  Stack frame of the x86 path once its prologue has run:
   1.240 + *   ebp - 40:	caller's ebx
   1.241 + *   ebp - 36:	caller's esi
   1.242 + *   ebp - 32:	caller's edi
   1.243 + *   ebp - 28 .. ebp - 4:  reserved by "sub esp,28", never referenced
   1.244 + *   ebp + 0:	caller's ebp
   1.245 + *   ebp + 4:	return address
   1.246 + *   ebp + 8:	a	argument
   1.247 + *   ebp + 12:	a_len	argument
   1.248 + *   ebp + 16:	b	argument
   1.249 + *   ebp + 20:	c	argument
   1.250 + *  (SSE2 path: edi at ebp - 4, esi at ebp - 8, ebx at ebp - 12)
   1.251 + *  registers: ebx or mm2 = carry, ecx = a_len, esi = a ptr, edi = c ptr
   1.252 + */
   1.253 +__declspec(naked) void
   1.254 +s_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
   1.255 +{
   1.256 +  __asm {
   1.257 +    mov    eax, is_sse
   1.258 +    cmp    eax, 0
   1.259 +    je     s_mpv_mul_d_add_prop_x86
   1.260 +    jg     s_mpv_mul_d_add_prop_sse2
   1.261 +    call   s_mpi_is_sse2
   1.262 +    mov    is_sse, eax
   1.263 +    cmp    eax, 0
   1.264 +    jg     s_mpv_mul_d_add_prop_sse2
   1.265 +s_mpv_mul_d_add_prop_x86:
   1.266 +    push   ebp
   1.267 +    mov    ebp,esp
   1.268 +    sub    esp,28
   1.269 +    push   edi
   1.270 +    push   esi
   1.271 +    push   ebx
   1.272 +    mov    ebx,0		; carry = 0
   1.273 +    mov    ecx,[ebp+12]		; ecx = a_len
   1.274 +    mov    edi,[ebp+20]
   1.275 +    cmp    ecx,0
   1.276 +    je     L_21			; jmp if a_len == 0
   1.277 +    cld
   1.278 +    mov    esi,[ebp+8]		; esi = a
   1.279 +L_20:
   1.280 +    lodsd			; eax = [ds:esi]; esi += 4
   1.281 +    mov    edx,[ebp+16]		; edx = b
   1.282 +    mul    edx			; edx:eax = Phi:Plo = a_i * b
   1.283 +
   1.284 +    add    eax,ebx		; add carry (ebx) to edx:eax
   1.285 +    adc    edx,0
   1.286 +    mov    ebx,[edi]		; add in current word from *c
   1.287 +    add    eax,ebx		
   1.288 +    adc    edx,0
   1.289 +    mov    ebx,edx		; high half of product becomes next carry
   1.290 +
   1.291 +    stosd			; [es:edi] = ax; edi += 4;
   1.292 +    dec    ecx			; --a_len
   1.293 +    jnz    L_20			; jmp if a_len != 0
   1.294 +L_21:
   1.295 +    cmp    ebx,0		; is carry zero?
   1.296 +    jz     L_23
   1.297 +    mov    eax,[edi]		; add in current word from *c
   1.298 +    add    eax,ebx
   1.299 +    stosd			; [es:edi] = ax; edi += 4;
   1.300 +    jnc    L_23
   1.301 +L_22:
   1.302 +    mov    eax,[edi]		; add in current word from *c
   1.303 +    adc    eax,0
   1.304 +    stosd			; [es:edi] = ax; edi += 4;
   1.305 +    jc     L_22
   1.306 +L_23:
   1.307 +    pop    ebx
   1.308 +    pop    esi
   1.309 +    pop    edi
   1.310 +    leave  
   1.311 +    ret    
   1.312 +    nop
   1.313 +s_mpv_mul_d_add_prop_sse2:
   1.314 +    push   ebp
   1.315 +    mov    ebp, esp
   1.316 +    push   edi
   1.317 +    push   esi
   1.318 +    push   ebx
   1.319 +    psubq  mm2, mm2		; carry = 0
   1.320 +    mov    ecx, [ebp+12]	; ecx = a_len
   1.321 +    movd   mm1, [ebp+16]	; mm1 = b
   1.322 +    mov    edi, [ebp+20]
   1.323 +    cmp    ecx, 0
   1.324 +    je     L_26			; jmp if a_len == 0
   1.325 +    mov    esi, [ebp+8]		; esi = a
   1.326 +    cld
   1.327 +L_25:
   1.328 +    movd   mm0, [esi]		; mm0 = *a++
   1.329 +    movd   mm3, [edi]		; fetch the sum
   1.330 +    add    esi, 4
   1.331 +    pmuludq mm0, mm1		; mm0 = b * *a++
   1.332 +    paddq  mm2, mm0		; add the carry
   1.333 +    paddq  mm2, mm3		; add *c++
   1.334 +    movd   [edi], mm2		; store the 32bit result
   1.335 +    add    edi, 4
   1.336 +    psrlq  mm2, 32		; save the carry
   1.337 +    dec    ecx			; --a_len
   1.338 +    jnz    L_25			; jmp if a_len != 0
   1.339 +L_26:
   1.340 +    movd   ebx, mm2
   1.341 +    cmp    ebx, 0		; is carry zero?
   1.342 +    jz     L_28
   1.343 +    mov    eax, [edi]
   1.344 +    add    eax, ebx
   1.345 +    stosd
   1.346 +    jnc    L_28
   1.347 +L_27:
   1.348 +    mov    eax, [edi]		; add in current word from *c
   1.349 +    adc	   eax, 0
   1.350 +    stosd			; [es:edi] = ax; edi += 4;
   1.351 +    jc     L_27
   1.352 +L_28:
   1.353 +    emms
   1.354 +    pop    ebx
   1.355 +    pop    esi
   1.356 +    pop    edi
   1.357 +    leave  
   1.358 +    ret    
   1.359 +    nop
   1.360 +  }
   1.361 +}
   1.362 +
   1.363 +/*
   1.364 + *  s_mpv_sqr_add_prop - for each of the a_len digits of a, add its
   1.365 + *  two-digit square into the next two digits of sqrs, carrying from one
   1.366 + *  pair into the next; a nonzero final carry is then added to the
   1.367 + *  following digits of sqrs for as long as the addition carries out.
   1.368 + *  is_sse selects the x86 loop (0) or the SSE2 loop (> 0); a negative
   1.369 + *  value is first replaced by the result of s_mpi_is_sse2().
   1.370 + *
   1.371 + *  Stack frame of the x86 path once its prologue has run:
   1.372 + *   ebp - 24:	caller's ebx
   1.373 + *   ebp - 20:	caller's esi
   1.374 + *   ebp - 16:	caller's edi
   1.375 + *   ebp - 12 .. ebp - 4:  reserved by "sub esp,12", never referenced
   1.376 + *   ebp + 0:	caller's ebp
   1.377 + *   ebp + 4:	return address
   1.378 + *   ebp + 8:	a (pa)	argument
   1.379 + *   ebp + 12:	a_len	argument
   1.380 + *   ebp + 16:	sqrs (ps)	argument
   1.381 + *  (SSE2 path: edi at ebp - 4, esi at ebp - 8, ebx at ebp - 12)
   1.382 + */
   1.383 +__declspec(naked) void
   1.384 +s_mpv_sqr_add_prop(const mp_digit *a, mp_size a_len, mp_digit *sqrs)
   1.385 +{
   1.386 +  __asm {
   1.387 +     mov    eax, is_sse
   1.388 +     cmp    eax, 0
   1.389 +     je     s_mpv_sqr_add_prop_x86
   1.390 +     jg     s_mpv_sqr_add_prop_sse2
   1.391 +     call   s_mpi_is_sse2
   1.392 +     mov    is_sse, eax
   1.393 +     cmp    eax, 0
   1.394 +     jg     s_mpv_sqr_add_prop_sse2
   1.395 +s_mpv_sqr_add_prop_x86:
   1.396 +     push   ebp
   1.397 +     mov    ebp,esp
   1.398 +     sub    esp,12
   1.399 +     push   edi
   1.400 +     push   esi
   1.401 +     push   ebx
   1.402 +     mov    ebx,0		; carry = 0
   1.403 +     mov    ecx,[ebp+12]	; a_len
   1.404 +     mov    edi,[ebp+16]	; edi = ps
   1.405 +     cmp    ecx,0
   1.406 +     je     L_31		; jump if a_len == 0
   1.407 +     cld
   1.408 +     mov    esi,[ebp+8]		; esi = pa
   1.409 +L_30:
   1.410 +     lodsd			; eax = [ds:si]; si += 4;
   1.411 +     mul    eax
   1.412 +
   1.413 +     add    eax,ebx		; add "carry"
   1.414 +     adc    edx,0
   1.415 +     mov    ebx,[edi]
   1.416 +     add    eax,ebx		; add low word from result
   1.417 +     mov    ebx,[edi+4]
   1.418 +     stosd			; [es:di] = eax; di += 4;
   1.419 +     adc    edx,ebx		; add high word from result
   1.420 +     mov    ebx,0
   1.421 +     mov    eax,edx
   1.422 +     adc    ebx,0
   1.423 +     stosd			; [es:di] = eax; di += 4;
   1.424 +     dec    ecx			; --a_len
   1.425 +     jnz    L_30		; jmp if a_len != 0
   1.426 +L_31:
   1.427 +    cmp    ebx,0		; is carry zero?
   1.428 +    jz     L_34
   1.429 +    mov    eax,[edi]		; add in current word from *c
   1.430 +    add    eax,ebx
   1.431 +    stosd			; [es:edi] = ax; edi += 4;
   1.432 +    jnc    L_34
   1.433 +L_32:
   1.434 +    mov    eax,[edi]		; add in current word from *c
   1.435 +    adc    eax,0
   1.436 +    stosd			; [es:edi] = ax; edi += 4;
   1.437 +    jc     L_32
   1.438 +L_34:
   1.439 +    pop    ebx
   1.440 +    pop    esi
   1.441 +    pop    edi
   1.442 +    leave  
   1.443 +    ret    
   1.444 +    nop
   1.445 +s_mpv_sqr_add_prop_sse2:
   1.446 +    push   ebp
   1.447 +    mov    ebp, esp
   1.448 +    push   edi
   1.449 +    push   esi
   1.450 +    push   ebx
   1.451 +    psubq  mm2, mm2		; carry = 0
   1.452 +    mov    ecx, [ebp+12]	; ecx = a_len
   1.453 +    mov    edi, [ebp+16]
   1.454 +    cmp    ecx, 0
   1.455 +    je     L_36		; jmp if a_len == 0
   1.456 +    mov    esi, [ebp+8]		; esi = a
   1.457 +    cld
   1.458 +L_35:
   1.459 +    movd   mm0, [esi]		; mm0 = *a
   1.460 +    movd   mm3, [edi]		; fetch the sum
   1.461 +    add	   esi, 4
   1.462 +    pmuludq mm0, mm0		; mm0 = sqr(a)
   1.463 +    paddq  mm2, mm0		; add the carry
   1.464 +    paddq  mm2, mm3		; add the low word
   1.465 +    movd   mm3, [edi+4]
   1.466 +    movd   [edi], mm2		; store the 32bit result
   1.467 +    psrlq  mm2, 32	
   1.468 +    paddq  mm2, mm3		; add the high word
   1.469 +    movd   [edi+4], mm2		; store the 32bit result
   1.470 +    psrlq  mm2, 32		; save the carry.
   1.471 +    add    edi, 8
   1.472 +    dec    ecx			; --a_len
   1.473 +    jnz    L_35			; jmp if a_len != 0
   1.474 +L_36:
   1.475 +    movd   ebx, mm2
   1.476 +    cmp    ebx, 0		; is carry zero?
   1.477 +    jz     L_38
   1.478 +    mov    eax, [edi]
   1.479 +    add    eax, ebx
   1.480 +    stosd
   1.481 +    jnc    L_38
   1.482 +L_37:
   1.483 +    mov    eax, [edi]		; add in current word from *c
   1.484 +    adc	   eax, 0
   1.485 +    stosd			; [es:edi] = ax; edi += 4;
   1.486 +    jc     L_37
   1.487 +L_38:
   1.488 +    emms
   1.489 +    pop    ebx
   1.490 +    pop    esi
   1.491 +    pop    edi
   1.492 +    leave  
   1.493 +    ret    
   1.494 +    nop
   1.495 +  }
   1.496 +}
   1.497 +
   1.498 +/* 
   1.499 + *  Divide 64-bit (Nhi,Nlo) by 32-bit divisor, which must be normalized
   1.500 + *  so its high bit is 1.   This code is from NSPR.
   1.501 + *
   1.502 + *  s_mpv_div_2dx1d stores the quotient in *qp and the remainder in *rp
   1.503 + *  and always returns zero in eax.
   1.504 + *  NOTE(review): a single "div" is used, so the quotient has to fit in
   1.505 + *  32 bits; presumably callers guarantee Nhi < divisor -- confirm.
   1.506 + *  Stack layout after "push ebx":
   1.507 + *   esp +  0:	caller's ebx
   1.508 + *   esp +  4:	return address
   1.509 + *   esp +  8:	Nhi	argument
   1.510 + *   esp + 12:	Nlo	argument
   1.511 + *   esp + 16:	divisor	argument
   1.512 + *   esp + 20:	qp	argument
   1.513 + *   esp + 24:	rp	argument
   1.514 + *  registers:
   1.515 + *  	edx:eax	dividend Nhi:Nlo, then remainder:quotient
   1.516 + *  	ebx:	divisor, then the qp and rp pointers
   1.517 + *  	(ecx, esi and edi are not used)
   1.518 + */  
   1.519 +__declspec(naked) mp_err
   1.520 +s_mpv_div_2dx1d(mp_digit Nhi, mp_digit Nlo, mp_digit divisor,
   1.521 +		mp_digit *qp, mp_digit *rp)
   1.522 +{
   1.523 +  __asm {
   1.524 +       push   ebx
   1.525 +       mov    edx,[esp+8]
   1.526 +       mov    eax,[esp+12]
   1.527 +       mov    ebx,[esp+16]
   1.528 +       div    ebx
   1.529 +       mov    ebx,[esp+20]
   1.530 +       mov    [ebx],eax
   1.531 +       mov    ebx,[esp+24]
   1.532 +       mov    [ebx],edx
   1.533 +       xor    eax,eax		; return zero
   1.534 +       pop    ebx
   1.535 +       ret    
   1.536 +       nop
   1.537 +  }
   1.538 +}

mercurial