1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/security/nss/lib/freebl/mpi/mpi_x86_asm.c Wed Dec 31 06:09:35 2014 +0100
1.3 @@ -0,0 +1,535 @@
1.4 +/*
1.5 + * mpi_x86_asm.c - MSVC inline assembly implementation of s_mpv_ functions.
1.6 + *
1.7 + * This Source Code Form is subject to the terms of the Mozilla Public
1.8 + * License, v. 2.0. If a copy of the MPL was not distributed with this
1.9 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
1.10 +
1.11 +#include "mpi-priv.h"
1.12 +/* is_sse: negative = not probed yet; otherwise the value returned by s_mpi_is_sse2(). */
1.13 +static int is_sse = -1;
1.14 +extern unsigned long s_mpi_is_sse2();
1.15 +
1.16 +/*
1.17 + * s_mpv_mul_d: c[i] = low word of (a[i] * b + carry) for i = 0 .. a_len-1;
1.18 + * the final carry is then stored to c[a_len].
1.19 + * Dispatches on is_sse: 0 -> x86 path, > 0 -> SSE2 path, < 0 -> call
1.20 + * s_mpi_is_sse2() first and store its result in is_sse.
1.21 + * Stack layout of the x86 path (the SSE2 path reserves no locals, so it
1.22 + * saves edi at ebp - 4 and esi at ebp - 8 and leaves ebx untouched):
1.23 + *   ebp - 40: caller's ebx
1.24 + *   ebp - 36: caller's esi
1.25 + *   ebp - 32: caller's edi
1.26 + *   ebp - 28 .. ebp - 4: reserved by "sub esp,28", never accessed
1.27 + *   ebp + 0: caller's ebp
1.28 + *   ebp + 4: return address
1.29 + *   ebp + 8: a argument
1.30 + *   ebp + 12: a_len argument
1.31 + *   ebp + 16: b argument
1.32 + *   ebp + 20: c argument
1.33 + * registers (x86 path):
1.34 + *   eax: current digit of a, then low half of the product plus carry
1.35 + *   ebx: carry
1.36 + *   ecx: a_len (remaining digit count)
1.37 + *   edx: b, then high half of the product
1.38 + *   esi: a ptr; edi: c ptr (SSE2 path: mm0 = digit/product, mm1 = b, mm2 = carry)
1.39 + */
1.40 +__declspec(naked) void
1.41 +s_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
1.42 +{
1.43 +    __asm {
1.44 +    mov eax, is_sse
1.45 +    cmp eax, 0
1.46 +    je s_mpv_mul_d_x86
1.47 +    jg s_mpv_mul_d_sse2
1.48 +    call s_mpi_is_sse2
1.49 +    mov is_sse, eax
1.50 +    cmp eax, 0
1.51 +    jg s_mpv_mul_d_sse2
1.52 +s_mpv_mul_d_x86:
1.53 +    push ebp
1.54 +    mov ebp,esp
1.55 +    sub esp,28
1.56 +    push edi
1.57 +    push esi
1.58 +    push ebx
1.59 +    mov ebx,0 ; carry = 0
1.60 +    mov ecx,[ebp+12] ; ecx = a_len
1.61 +    mov edi,[ebp+20]
1.62 +    cmp ecx,0
1.63 +    je L_2 ; jmp if a_len == 0
1.64 +    mov esi,[ebp+8] ; esi = a
1.65 +    cld
1.66 +L_1:
1.67 +    lodsd ; eax = [ds:esi]; esi += 4
1.68 +    mov edx,[ebp+16] ; edx = b
1.69 +    mul edx ; edx:eax = Phi:Plo = a_i * b
1.70 +
1.71 +    add eax,ebx ; add carry (ebx) to edx:eax
1.72 +    adc edx,0
1.73 +    mov ebx,edx ; high half of product becomes next carry
1.74 +
1.75 +    stosd ; [es:edi] = ax; edi += 4;
1.76 +    dec ecx ; --a_len
1.77 +    jnz L_1 ; jmp if a_len != 0
1.78 +L_2:
1.79 +    mov [edi],ebx ; *c = carry
1.80 +    pop ebx
1.81 +    pop esi
1.82 +    pop edi
1.83 +    leave
1.84 +    ret
1.85 +    nop
1.86 +s_mpv_mul_d_sse2:
1.87 +    push ebp
1.88 +    mov ebp, esp
1.89 +    push edi
1.90 +    push esi
1.91 +    psubq mm2, mm2 ; carry = 0
1.92 +    mov ecx, [ebp+12] ; ecx = a_len
1.93 +    movd mm1, [ebp+16] ; mm1 = b
1.94 +    mov edi, [ebp+20]
1.95 +    cmp ecx, 0
1.96 +    je L_6 ; jmp if a_len == 0
1.97 +    mov esi, [ebp+8] ; esi = a
1.98 +    cld
1.99 +L_5:
1.100 +    movd mm0, [esi] ; mm0 = *a++
1.101 +    add esi, 4
1.102 +    pmuludq mm0, mm1 ; mm0 = b * *a++
1.103 +    paddq mm2, mm0 ; add the carry
1.104 +    movd [edi], mm2 ; store the 32bit result
1.105 +    add edi, 4
1.106 +    psrlq mm2, 32 ; save the carry
1.107 +    dec ecx ; --a_len
1.108 +    jnz L_5 ; jmp if a_len != 0
1.109 +L_6:
1.110 +    movd [edi], mm2 ; *c = carry
1.111 +    emms
1.112 +    pop esi
1.113 +    pop edi
1.114 +    leave
1.115 +    ret
1.116 +    nop
1.117 +    }
1.118 +}
1.119 +
1.120 +/*
1.121 + * s_mpv_mul_d_add: c[i] = low word of (a[i] * b + c[i] + carry) for
1.122 + * i = 0 .. a_len-1; the final carry is then stored to (not added into) c[a_len].
1.123 + * Dispatches on is_sse: 0 -> x86 path, > 0 -> SSE2 path, < 0 -> call
1.124 + * s_mpi_is_sse2() first and store its result in is_sse.
1.125 + * Stack layout of the x86 path (the SSE2 path reserves no locals, so it
1.126 + * saves edi at ebp - 4 and esi at ebp - 8 and leaves ebx untouched):
1.127 + *   ebp - 40: caller's ebx
1.128 + *   ebp - 36: caller's esi
1.129 + *   ebp - 32: caller's edi
1.130 + *   ebp - 28 .. ebp - 4: reserved by "sub esp,28", never accessed
1.131 + *   ebp + 0: caller's ebp
1.132 + *   ebp + 4: return address
1.133 + *   ebp + 8: a argument
1.134 + *   ebp + 12: a_len argument
1.135 + *   ebp + 16: b argument
1.136 + *   ebp + 20: c argument
1.137 + * registers (x86 path):
1.138 + *   eax: current digit of a, then low half of the running sum
1.139 + *   ebx: carry; also scratch for the word loaded from c
1.140 + *   ecx: a_len (remaining digit count)
1.141 + *   edx: b, then high half of the running sum
1.142 + *   esi: a ptr; edi: c ptr (SSE2 path: mm0 = product, then c word; mm1 = b; mm2 = carry)
1.143 + */
1.144 +__declspec(naked) void
1.145 +s_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
1.146 +{
1.147 +    __asm {
1.148 +    mov eax, is_sse
1.149 +    cmp eax, 0
1.150 +    je s_mpv_mul_d_add_x86
1.151 +    jg s_mpv_mul_d_add_sse2
1.152 +    call s_mpi_is_sse2
1.153 +    mov is_sse, eax
1.154 +    cmp eax, 0
1.155 +    jg s_mpv_mul_d_add_sse2
1.156 +s_mpv_mul_d_add_x86:
1.157 +    push ebp
1.158 +    mov ebp,esp
1.159 +    sub esp,28
1.160 +    push edi
1.161 +    push esi
1.162 +    push ebx
1.163 +    mov ebx,0 ; carry = 0
1.164 +    mov ecx,[ebp+12] ; ecx = a_len
1.165 +    mov edi,[ebp+20]
1.166 +    cmp ecx,0
1.167 +    je L_11 ; jmp if a_len == 0
1.168 +    mov esi,[ebp+8] ; esi = a
1.169 +    cld
1.170 +L_10:
1.171 +    lodsd ; eax = [ds:esi]; esi += 4
1.172 +    mov edx,[ebp+16] ; edx = b
1.173 +    mul edx ; edx:eax = Phi:Plo = a_i * b
1.174 +
1.175 +    add eax,ebx ; add carry (ebx) to edx:eax
1.176 +    adc edx,0
1.177 +    mov ebx,[edi] ; add in current word from *c
1.178 +    add eax,ebx
1.179 +    adc edx,0
1.180 +    mov ebx,edx ; high half of product becomes next carry
1.181 +
1.182 +    stosd ; [es:edi] = ax; edi += 4;
1.183 +    dec ecx ; --a_len
1.184 +    jnz L_10 ; jmp if a_len != 0
1.185 +L_11:
1.186 +    mov [edi],ebx ; *c = carry
1.187 +    pop ebx
1.188 +    pop esi
1.189 +    pop edi
1.190 +    leave
1.191 +    ret
1.192 +    nop
1.193 +s_mpv_mul_d_add_sse2:
1.194 +    push ebp
1.195 +    mov ebp, esp
1.196 +    push edi
1.197 +    push esi
1.198 +    psubq mm2, mm2 ; carry = 0
1.199 +    mov ecx, [ebp+12] ; ecx = a_len
1.200 +    movd mm1, [ebp+16] ; mm1 = b
1.201 +    mov edi, [ebp+20]
1.202 +    cmp ecx, 0
1.203 +    je L_16 ; jmp if a_len == 0
1.204 +    mov esi, [ebp+8] ; esi = a
1.205 +    cld
1.206 +L_15:
1.207 +    movd mm0, [esi] ; mm0 = *a++
1.208 +    add esi, 4
1.209 +    pmuludq mm0, mm1 ; mm0 = b * *a++
1.210 +    paddq mm2, mm0 ; add the carry
1.211 +    movd mm0, [edi]
1.212 +    paddq mm2, mm0 ; add the carry
1.213 +    movd [edi], mm2 ; store the 32bit result
1.214 +    add edi, 4
1.215 +    psrlq mm2, 32 ; save the carry
1.216 +    dec ecx ; --a_len
1.217 +    jnz L_15 ; jmp if a_len != 0
1.218 +L_16:
1.219 +    movd [edi], mm2 ; *c = carry
1.220 +    emms
1.221 +    pop esi
1.222 +    pop edi
1.223 +    leave
1.224 +    ret
1.225 +    nop
1.226 +    }
1.227 +}
1.228 +
1.229 +/*
1.230 + * s_mpv_mul_d_add_prop: c[i] = low word of (a[i] * b + c[i] + carry) for
1.231 + * i = 0 .. a_len-1; a nonzero final carry is added into c[a_len] and
1.232 + * rippled into the following words of c until an add produces no carry out.
1.233 + * NOTE(review): the ripple loop has no length bound; presumably c always has
1.234 + * enough words to absorb the carry - confirm against callers.
1.235 + * is_sse picks the path: 0 -> x86, > 0 -> SSE2, < 0 -> ask s_mpi_is_sse2() first.
1.236 + * x86-path stack (SSE2 path: edi, esi, ebx saved at ebp - 4, - 8, - 12):
1.237 + *   ebp - 40: caller's ebx
1.238 + *   ebp - 36: caller's esi
1.239 + *   ebp - 32: caller's edi
1.240 + *   ebp - 28 .. ebp - 4: reserved by "sub esp,28", never accessed
1.241 + *   ebp + 0: caller's ebp
1.242 + *   ebp + 4: return address
1.243 + *   ebp + 8: a argument
1.244 + *   ebp + 12: a_len argument
1.245 + *   ebp + 16: b argument
1.246 + *   ebp + 20: c argument
1.247 + * registers (x86 path; SSE2 path uses mm0 = product, mm1 = b, mm2 = carry, mm3 = c word):
1.248 + *   eax: current digit of a, then low half of the running sum, then ripple word
1.249 + *   ebx: carry; also scratch for the word loaded from c
1.250 + *   ecx: a_len (remaining digit count); edx: b, then high half of the running sum
1.251 + *   esi: a ptr; edi: c ptr
1.252 + */
1.253 +__declspec(naked) void
1.254 +s_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
1.255 +{
1.256 +    __asm {
1.257 +    mov eax, is_sse
1.258 +    cmp eax, 0
1.259 +    je s_mpv_mul_d_add_prop_x86
1.260 +    jg s_mpv_mul_d_add_prop_sse2
1.261 +    call s_mpi_is_sse2
1.262 +    mov is_sse, eax
1.263 +    cmp eax, 0
1.264 +    jg s_mpv_mul_d_add_prop_sse2
1.265 +s_mpv_mul_d_add_prop_x86:
1.266 +    push ebp
1.267 +    mov ebp,esp
1.268 +    sub esp,28
1.269 +    push edi
1.270 +    push esi
1.271 +    push ebx
1.272 +    mov ebx,0 ; carry = 0
1.273 +    mov ecx,[ebp+12] ; ecx = a_len
1.274 +    mov edi,[ebp+20]
1.275 +    cmp ecx,0
1.276 +    je L_21 ; jmp if a_len == 0
1.277 +    cld
1.278 +    mov esi,[ebp+8] ; esi = a
1.279 +L_20:
1.280 +    lodsd ; eax = [ds:esi]; esi += 4
1.281 +    mov edx,[ebp+16] ; edx = b
1.282 +    mul edx ; edx:eax = Phi:Plo = a_i * b
1.283 +
1.284 +    add eax,ebx ; add carry (ebx) to edx:eax
1.285 +    adc edx,0
1.286 +    mov ebx,[edi] ; add in current word from *c
1.287 +    add eax,ebx
1.288 +    adc edx,0
1.289 +    mov ebx,edx ; high half of product becomes next carry
1.290 +
1.291 +    stosd ; [es:edi] = ax; edi += 4;
1.292 +    dec ecx ; --a_len
1.293 +    jnz L_20 ; jmp if a_len != 0
1.294 +L_21:
1.295 +    cmp ebx,0 ; is carry zero?
1.296 +    jz L_23
1.297 +    mov eax,[edi] ; add in current word from *c
1.298 +    add eax,ebx
1.299 +    stosd ; [es:edi] = ax; edi += 4;
1.300 +    jnc L_23
1.301 +L_22:
1.302 +    mov eax,[edi] ; add in current word from *c
1.303 +    adc eax,0
1.304 +    stosd ; [es:edi] = ax; edi += 4;
1.305 +    jc L_22
1.306 +L_23:
1.307 +    pop ebx
1.308 +    pop esi
1.309 +    pop edi
1.310 +    leave
1.311 +    ret
1.312 +    nop
1.313 +s_mpv_mul_d_add_prop_sse2:
1.314 +    push ebp
1.315 +    mov ebp, esp
1.316 +    push edi
1.317 +    push esi
1.318 +    push ebx
1.319 +    psubq mm2, mm2 ; carry = 0
1.320 +    mov ecx, [ebp+12] ; ecx = a_len
1.321 +    movd mm1, [ebp+16] ; mm1 = b
1.322 +    mov edi, [ebp+20]
1.323 +    cmp ecx, 0
1.324 +    je L_26 ; jmp if a_len == 0
1.325 +    mov esi, [ebp+8] ; esi = a
1.326 +    cld
1.327 +L_25:
1.328 +    movd mm0, [esi] ; mm0 = *a++
1.329 +    movd mm3, [edi] ; fetch the sum
1.330 +    add esi, 4
1.331 +    pmuludq mm0, mm1 ; mm0 = b * *a++
1.332 +    paddq mm2, mm0 ; add the carry
1.333 +    paddq mm2, mm3 ; add *c++
1.334 +    movd [edi], mm2 ; store the 32bit result
1.335 +    add edi, 4
1.336 +    psrlq mm2, 32 ; save the carry
1.337 +    dec ecx ; --a_len
1.338 +    jnz L_25 ; jmp if a_len != 0
1.339 +L_26:
1.340 +    movd ebx, mm2
1.341 +    cmp ebx, 0 ; is carry zero?
1.342 +    jz L_28
1.343 +    mov eax, [edi]
1.344 +    add eax, ebx
1.345 +    stosd
1.346 +    jnc L_28
1.347 +L_27:
1.348 +    mov eax, [edi] ; add in current word from *c
1.349 +    adc eax, 0
1.350 +    stosd ; [es:edi] = ax; edi += 4;
1.351 +    jc L_27
1.352 +L_28:
1.353 +    emms
1.354 +    pop ebx
1.355 +    pop esi
1.356 +    pop edi
1.357 +    leave
1.358 +    ret
1.359 +    nop
1.360 +    }
1.361 +}
1.362 +
1.363 +/*
1.364 + * s_mpv_sqr_add_prop: for each digit a[i], add a[i] * a[i] plus the running
1.365 + * carry into the word pair sqrs[2i], sqrs[2i+1]; a nonzero final carry is then
1.366 + * rippled into the following words. NOTE(review): ripple is unbounded - confirm sqrs size.
1.367 + * is_sse picks the path: 0 -> x86, > 0 -> SSE2, < 0 -> ask s_mpi_is_sse2() first.
1.368 + * x86-path stack (SSE2 path: edi, esi, ebx saved at ebp - 4, - 8, - 12):
1.369 + *   ebp - 24: caller's ebx
1.370 + *   ebp - 20: caller's esi
1.371 + *   ebp - 16: caller's edi
1.372 + *   ebp - 12 .. ebp - 4: reserved by "sub esp,12", never accessed
1.373 + *   ebp + 0: caller's ebp
1.374 + *   ebp + 4: return address
1.375 + *   ebp + 8: a argument
1.376 + *   ebp + 12: a_len argument
1.377 + *   ebp + 16: sqrs argument
1.378 + * registers (x86 path; SSE2 path uses mm0 = square, mm2 = carry, mm3 = sqrs word):
1.379 + *   eax: current digit of a, then the word about to be stored to sqrs
1.380 + *   ebx: carry between iterations; also scratch for words loaded from sqrs
1.381 + *   ecx: a_len (remaining digits); edx: high product word; esi: a ptr; edi: sqrs ptr
1.382 + */
1.383 +__declspec(naked) void
1.384 +s_mpv_sqr_add_prop(const mp_digit *a, mp_size a_len, mp_digit *sqrs)
1.385 +{
1.386 +    __asm {
1.387 +    mov eax, is_sse
1.388 +    cmp eax, 0
1.389 +    je s_mpv_sqr_add_prop_x86
1.390 +    jg s_mpv_sqr_add_prop_sse2
1.391 +    call s_mpi_is_sse2
1.392 +    mov is_sse, eax
1.393 +    cmp eax, 0
1.394 +    jg s_mpv_sqr_add_prop_sse2
1.395 +s_mpv_sqr_add_prop_x86:
1.396 +    push ebp
1.397 +    mov ebp,esp
1.398 +    sub esp,12
1.399 +    push edi
1.400 +    push esi
1.401 +    push ebx
1.402 +    mov ebx,0 ; carry = 0
1.403 +    mov ecx,[ebp+12] ; a_len
1.404 +    mov edi,[ebp+16] ; edi = ps
1.405 +    cmp ecx,0
1.406 +    je L_31 ; jump if a_len == 0
1.407 +    cld
1.408 +    mov esi,[ebp+8] ; esi = pa
1.409 +L_30:
1.410 +    lodsd ; eax = [ds:si]; si += 4;
1.411 +    mul eax
1.412 +
1.413 +    add eax,ebx ; add "carry"
1.414 +    adc edx,0
1.415 +    mov ebx,[edi]
1.416 +    add eax,ebx ; add low word from result
1.417 +    mov ebx,[edi+4]
1.418 +    stosd ; [es:di] = eax; di += 4;
1.419 +    adc edx,ebx ; add high word from result
1.420 +    mov ebx,0
1.421 +    mov eax,edx
1.422 +    adc ebx,0
1.423 +    stosd ; [es:di] = eax; di += 4;
1.424 +    dec ecx ; --a_len
1.425 +    jnz L_30 ; jmp if a_len != 0
1.426 +L_31:
1.427 +    cmp ebx,0 ; is carry zero?
1.428 +    jz L_34
1.429 +    mov eax,[edi] ; add in current word from *c
1.430 +    add eax,ebx
1.431 +    stosd ; [es:edi] = ax; edi += 4;
1.432 +    jnc L_34
1.433 +L_32:
1.434 +    mov eax,[edi] ; add in current word from *c
1.435 +    adc eax,0
1.436 +    stosd ; [es:edi] = ax; edi += 4;
1.437 +    jc L_32
1.438 +L_34:
1.439 +    pop ebx
1.440 +    pop esi
1.441 +    pop edi
1.442 +    leave
1.443 +    ret
1.444 +    nop
1.445 +s_mpv_sqr_add_prop_sse2:
1.446 +    push ebp
1.447 +    mov ebp, esp
1.448 +    push edi
1.449 +    push esi
1.450 +    push ebx
1.451 +    psubq mm2, mm2 ; carry = 0
1.452 +    mov ecx, [ebp+12] ; ecx = a_len
1.453 +    mov edi, [ebp+16]
1.454 +    cmp ecx, 0
1.455 +    je L_36 ; jmp if a_len == 0
1.456 +    mov esi, [ebp+8] ; esi = a
1.457 +    cld
1.458 +L_35:
1.459 +    movd mm0, [esi] ; mm0 = *a
1.460 +    movd mm3, [edi] ; fetch the sum
1.461 +    add esi, 4
1.462 +    pmuludq mm0, mm0 ; mm0 = sqr(a)
1.463 +    paddq mm2, mm0 ; add the carry
1.464 +    paddq mm2, mm3 ; add the low word
1.465 +    movd mm3, [edi+4]
1.466 +    movd [edi], mm2 ; store the 32bit result
1.467 +    psrlq mm2, 32
1.468 +    paddq mm2, mm3 ; add the high word
1.469 +    movd [edi+4], mm2 ; store the 32bit result
1.470 +    psrlq mm2, 32 ; save the carry.
1.471 +    add edi, 8
1.472 +    dec ecx ; --a_len
1.473 +    jnz L_35 ; jmp if a_len != 0
1.474 +L_36:
1.475 +    movd ebx, mm2
1.476 +    cmp ebx, 0 ; is carry zero?
1.477 +    jz L_38
1.478 +    mov eax, [edi]
1.479 +    add eax, ebx
1.480 +    stosd
1.481 +    jnc L_38
1.482 +L_37:
1.483 +    mov eax, [edi] ; add in current word from *c
1.484 +    adc eax, 0
1.485 +    stosd ; [es:edi] = ax; edi += 4;
1.486 +    jc L_37
1.487 +L_38:
1.488 +    emms
1.489 +    pop ebx
1.490 +    pop esi
1.491 +    pop edi
1.492 +    leave
1.493 +    ret
1.494 +    nop
1.495 +    }
1.496 +}
1.497 +
1.498 +/*
1.499 + * Divide 64-bit (Nhi,Nlo) by 32-bit divisor, which must be normalized
1.500 + * so its high bit is 1. This code is from NSPR.
1.501 + * Stores the quotient in *qp and the remainder in *rp; always returns 0.
1.502 + * NOTE(review): x86 "div" raises a divide error when the quotient does not fit
1.503 + * in 32 bits, so callers presumably ensure Nhi < divisor - confirm. After "push ebx":
1.504 + *   esp + 0: Caller's ebx
1.505 + *   esp + 4: return address
1.506 + *   esp + 8: Nhi argument
1.507 + *   esp + 12: Nlo argument
1.508 + *   esp + 16: divisor argument
1.509 + *   esp + 20: qp argument
1.510 + *   esp + 24: rp argument
1.511 + * registers:
1.512 + *   eax: Nlo going into div, quotient coming out; zeroed for the return value
1.513 + *   ebx: divisor, then qp, then rp
1.514 + *   ecx: not used
1.515 + *   edx: Nhi going into div, remainder coming out
1.516 + *   esi: not used
1.517 + *   edi: not used
1.518 + */
1.519 +__declspec(naked) mp_err
1.520 +s_mpv_div_2dx1d(mp_digit Nhi, mp_digit Nlo, mp_digit divisor,
1.521 +                mp_digit *qp, mp_digit *rp)
1.522 +{
1.523 +    __asm {
1.524 +    push ebx
1.525 +    mov edx,[esp+8]
1.526 +    mov eax,[esp+12]
1.527 +    mov ebx,[esp+16]
1.528 +    div ebx
1.529 +    mov ebx,[esp+20]
1.530 +    mov [ebx],eax
1.531 +    mov ebx,[esp+24]
1.532 +    mov [ebx],edx
1.533 +    xor eax,eax ; return zero
1.534 +    pop ebx
1.535 +    ret
1.536 +    nop
1.537 +    }
1.538 +}