# Extracted from the changeset view of
# b/security/nss/lib/freebl/mpi/mpi_x86_os2.s (new file, 538 lines);
# the per-line diff markers have been removed.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# Syntax: GNU as, AT&T operand order (source, destination), 32-bit x86.

.data
.align 4
#
# is_sse caches the result of the sse2 probe:
# -1 means to call _s_mpi_is_sse2 to determine if we support sse2
#    instructions.
#  0 means to use x86 instructions
#  1 means to use sse2 instructions
#
.type   is_sse,@object
.size   is_sse,4
is_sse: .long   -1

#
# Accessors for is_sse.
# Inherited note: the code this file derives from handled the difference
# between -fPIC and non-PIC builds and defaulted to PIC (its comment said
# the file was Linux-only; Solaris uses mpi_i86pc.s and Windows uses
# mpi_x86_asm.c).  In this file the GOT-relative (PIC) variants are kept
# only as commented-out reference; the active GET/PUT macros address the
# variable absolutely.
#
#.ifndef NO_PIC
#.macro GET var,reg
#    movl   \var@GOTOFF(%ebx),\reg
#.endm
#.macro PUT reg,var
#    movl   \reg,\var@GOTOFF(%ebx)
#.endm
#.else
.macro GET var,reg
    movl   \var,\reg
.endm
.macro PUT reg,var
    movl   \reg,\var
.endm
#.endif

.text


# _s_mpv_mul_d(a, a_len, b, c)
#
# Multiplies the a_len 32-bit words at a by the single word b.  The low
# word of each running result is stored to c[0..a_len-1] and the final
# carry word is stored to c[a_len].  With a_len == 0 only c[0] = 0 is
# written.
#
# Entry dispatches on is_sse; while the flag is still negative it calls
# _s_mpi_is_sse2 once and caches the answer.
#
# Frame of the x86 path (the sse2 path only pushes edi and esi, so they
# sit at ebp - 4 and ebp - 8 there):
# ebp - 40: caller's ebx
# ebp - 36: caller's esi
# ebp - 32: caller's edi
# ebp - 28 .. ebp - 4: reserved by "sub $28,%esp", never referenced
# ebp +  0: caller's ebp
# ebp +  4: return address
# ebp +  8: a argument
# ebp + 12: a_len argument
# ebp + 16: b argument
# ebp + 20: c argument
# registers (x86 path):
#  eax: current word / low half of product
#  ebx: carry
#  ecx: a_len (remaining count)
#  edx: high half of product
#  esi: a ptr
#  edi: c ptr
.globl  _s_mpv_mul_d
.type   _s_mpv_mul_d,@function
_s_mpv_mul_d:
    GET     is_sse,%eax
    test    %eax,%eax
    je      _s_mpv_mul_d_x86        # flag == 0: plain x86
    jg      _s_mpv_mul_d_sse2       # flag  > 0: sse2
    call    _s_mpi_is_sse2          # flag  < 0: probe, then cache
    PUT     %eax,is_sse
    test    %eax,%eax
    jg      _s_mpv_mul_d_sse2
_s_mpv_mul_d_x86:
    push    %ebp
    mov     %esp,%ebp
    sub     $28,%esp                # frame space, unused by this path
    push    %edi
    push    %esi
    push    %ebx
    xor     %ebx,%ebx               # carry = 0
    mov     12(%ebp),%ecx           # ecx = a_len
    mov     20(%ebp),%edi           # edi = c
    test    %ecx,%ecx
    je      2f                      # jmp if a_len == 0
    mov     8(%ebp),%esi            # esi = a
    cld                             # lodsl/stosl must walk upwards
1:
    lodsl                           # eax = [ds:esi]; esi += 4
    mull    16(%ebp)                # edx:eax = Phi:Plo = a_i * b

    add     %ebx,%eax               # add carry (%ebx) to edx:eax
    adc     $0,%edx
    mov     %edx,%ebx               # high half of product becomes next carry

    stosl                           # [es:edi] = eax; edi += 4
    dec     %ecx                    # --a_len
    jnz     1b                      # jmp if a_len != 0
2:
    mov     %ebx,0(%edi)            # *c = carry
    pop     %ebx
    pop     %esi
    pop     %edi
    leave
    ret
    nop
_s_mpv_mul_d_sse2:
    push    %ebp
    mov     %esp,%ebp
    push    %edi
    push    %esi
    pxor    %mm2,%mm2               # carry = 0
    mov     12(%ebp),%ecx           # ecx = a_len
    movd    16(%ebp),%mm1           # mm1 = b
    mov     20(%ebp),%edi           # edi = c
    test    %ecx,%ecx
    je      6f                      # jmp if a_len == 0
    mov     8(%ebp),%esi            # esi = a
    cld                             # kept as-is; no string op on this path
5:
    movd    0(%esi),%mm0            # mm0 = *a++
    add     $4,%esi
    pmuludq %mm1,%mm0               # mm0 = b * *a++ (64-bit product)
    paddq   %mm0,%mm2               # add the carry
    movd    %mm2,0(%edi)            # store the 32bit result
    add     $4,%edi
    psrlq   $32,%mm2                # save the carry
    dec     %ecx                    # --a_len
    jnz     5b                      # jmp if a_len != 0
6:
    movd    %mm2,0(%edi)            # *c = carry
    emms                            # leave MMX state before returning
    pop     %esi
    pop     %edi
    leave
    ret
    nop

# ebp - 36: caller's esi
# ebp - 32: caller's edi
# ebp - 28:
# ebp - 24:
# ebp - 20:
# ebp - 16:
# ebp - 12:
# ebp -  8:
# ebp -  4:
# ebp +  0: caller's ebp
# ebp +  4: return address
# ebp +  8: a argument
# ebp + 12: a_len argument
# ebp + 16: b argument
# ebp + 20: c argument
# registers:
#  eax:
#  ebx: carry
#  ecx: a_len
#  edx:
#  esi: a ptr
#  edi: c ptr
#
# _s_mpv_mul_d_add(a, a_len, b, c)
#
# For each of the a_len words: c[i] = low32(a[i] * b + c[i] + carry).
# The final carry word is stored (not added) to c[a_len].
# The negative-offset frame slots listed above describe the x86 path
# (which also saves ebx just below esi); the sse2 path only pushes edi
# and esi.
.globl  _s_mpv_mul_d_add
.type   _s_mpv_mul_d_add,@function
_s_mpv_mul_d_add:
    GET     is_sse,%eax
    test    %eax,%eax
    je      _s_mpv_mul_d_add_x86    # flag == 0: plain x86
    jg      _s_mpv_mul_d_add_sse2   # flag  > 0: sse2
    call    _s_mpi_is_sse2          # flag  < 0: probe, then cache
    PUT     %eax,is_sse
    test    %eax,%eax
    jg      _s_mpv_mul_d_add_sse2
_s_mpv_mul_d_add_x86:
    push    %ebp
    mov     %esp,%ebp
    sub     $28,%esp                # frame space, unused by this path
    push    %edi
    push    %esi
    push    %ebx
    xor     %ebx,%ebx               # carry = 0
    mov     12(%ebp),%ecx           # ecx = a_len
    mov     20(%ebp),%edi           # edi = c
    test    %ecx,%ecx
    je      11f                     # jmp if a_len == 0
    mov     8(%ebp),%esi            # esi = a
    cld
10:
    lodsl                           # eax = [ds:esi]; esi += 4
    mull    16(%ebp)                # edx:eax = Phi:Plo = a_i * b

    add     %ebx,%eax               # add carry (%ebx) to edx:eax
    adc     $0,%edx
    add     0(%edi),%eax            # add in current word from *c
    adc     $0,%edx
    mov     %edx,%ebx               # high half of product becomes next carry

    stosl                           # [es:edi] = eax; edi += 4
    dec     %ecx                    # --a_len
    jnz     10b                     # jmp if a_len != 0
11:
    mov     %ebx,0(%edi)            # *c = carry
    pop     %ebx
    pop     %esi
    pop     %edi
    leave
    ret
    nop
_s_mpv_mul_d_add_sse2:
    push    %ebp
    mov     %esp,%ebp
    push    %edi
    push    %esi
    pxor    %mm2,%mm2               # carry = 0
    mov     12(%ebp),%ecx           # ecx = a_len
    movd    16(%ebp),%mm1           # mm1 = b
    mov     20(%ebp),%edi           # edi = c
    test    %ecx,%ecx
    je      16f                     # jmp if a_len == 0
    mov     8(%ebp),%esi            # esi = a
    cld                             # kept as-is; no string op on this path
15:
    movd    0(%esi),%mm0            # mm0 = *a++
    add     $4,%esi
    pmuludq %mm1,%mm0               # mm0 = b * *a++
    paddq   %mm0,%mm2               # add the product to the carry
    movd    0(%edi),%mm0
    paddq   %mm0,%mm2               # add in current word from *c
    movd    %mm2,0(%edi)            # store the 32bit result
    add     $4,%edi
    psrlq   $32,%mm2                # save the carry
    dec     %ecx                    # --a_len
    jnz     15b                     # jmp if a_len != 0
16:
    movd    %mm2,0(%edi)            # *c = carry
    emms
    pop     %esi
    pop     %edi
    leave
    ret
    nop

# _s_mpv_mul_d_add_prop(a, a_len, b, c)
#
# Same accumulation as _s_mpv_mul_d_add, but the final carry is ADDED
# into c[a_len] and any carry out of that addition keeps rippling into
# the following words of c until it dies out.
#
# Frame of the sse2 path:
# ebp - 12: caller's ebx
# ebp -  8: caller's esi
# ebp -  4: caller's edi
# (the x86 path first reserves 28 unused bytes, so its saved edi/esi/ebx
#  sit at ebp - 32 / ebp - 36 / ebp - 40)
# ebp +  0: caller's ebp
# ebp +  4: return address
# ebp +  8: a argument
# ebp + 12: a_len argument
# ebp + 16: b argument
# ebp + 20: c argument
# registers:
#  eax:
#  ebx: carry
#  ecx: a_len
#  edx:
#  esi: a ptr
#  edi: c ptr
.globl  _s_mpv_mul_d_add_prop
.type   _s_mpv_mul_d_add_prop,@function
_s_mpv_mul_d_add_prop:
    GET     is_sse,%eax
    test    %eax,%eax
    je      _s_mpv_mul_d_add_prop_x86
    jg      _s_mpv_mul_d_add_prop_sse2
    call    _s_mpi_is_sse2
    PUT     %eax,is_sse
    test    %eax,%eax
    jg      _s_mpv_mul_d_add_prop_sse2
_s_mpv_mul_d_add_prop_x86:
    push    %ebp
    mov     %esp,%ebp
    sub     $28,%esp                # frame space, unused by this path
    push    %edi
    push    %esi
    push    %ebx
    xor     %ebx,%ebx               # carry = 0
    mov     12(%ebp),%ecx           # ecx = a_len
    mov     20(%ebp),%edi           # edi = c
    test    %ecx,%ecx
    je      21f                     # jmp if a_len == 0 (carry is 0, so no stosl runs)
    cld
    mov     8(%ebp),%esi            # esi = a
20:
    lodsl                           # eax = [ds:esi]; esi += 4
    mull    16(%ebp)                # edx:eax = Phi:Plo = a_i * b

    add     %ebx,%eax               # add carry (%ebx) to edx:eax
    adc     $0,%edx
    add     0(%edi),%eax            # add in current word from *c
    adc     $0,%edx
    mov     %edx,%ebx               # high half of product becomes next carry

    stosl                           # [es:edi] = eax; edi += 4
    dec     %ecx                    # --a_len
    jnz     20b                     # jmp if a_len != 0
21:
    test    %ebx,%ebx               # is carry zero?
    jz      23f
    mov     0(%edi),%eax            # add in current word from *c
    add     %ebx,%eax
    stosl                           # [es:edi] = eax; edi += 4 (flags untouched)
    jnc     23f
22:                                 # CF is set on every entry to this loop
    mov     0(%edi),%eax            # add in current word from *c
    adc     $0,%eax
    stosl                           # [es:edi] = eax; edi += 4 (flags untouched)
    jc      22b
23:
    pop     %ebx
    pop     %esi
    pop     %edi
    leave
    ret
    nop
_s_mpv_mul_d_add_prop_sse2:
    push    %ebp
    mov     %esp,%ebp
    push    %edi
    push    %esi
    push    %ebx
    pxor    %mm2,%mm2               # carry = 0
    mov     12(%ebp),%ecx           # ecx = a_len
    movd    16(%ebp),%mm1           # mm1 = b
    mov     20(%ebp),%edi           # edi = c
    test    %ecx,%ecx
    je      26f                     # jmp if a_len == 0
    mov     8(%ebp),%esi            # esi = a
    cld                             # needed by the stosl carry ripple below
25:
    movd    0(%esi),%mm0            # mm0 = *a++
    movd    0(%edi),%mm3            # fetch the sum
    add     $4,%esi
    pmuludq %mm1,%mm0               # mm0 = b * *a++
    paddq   %mm0,%mm2               # add the carry
    paddq   %mm3,%mm2               # add *c++
    movd    %mm2,0(%edi)            # store the 32bit result
    add     $4,%edi
    psrlq   $32,%mm2                # save the carry
    dec     %ecx                    # --a_len
    jnz     25b                     # jmp if a_len != 0
26:
    movd    %mm2,%ebx
    test    %ebx,%ebx               # is carry zero?
    jz      28f
    mov     0(%edi),%eax
    add     %ebx,%eax
    stosl
    jnc     28f
27:                                 # CF is set on every entry to this loop
    mov     0(%edi),%eax            # add in current word from *c
    adc     $0,%eax
    stosl                           # [es:edi] = eax; edi += 4
    jc      27b
28:
    emms
    pop     %ebx
    pop     %esi
    pop     %edi
    leave
    ret
    nop


# _s_mpv_sqr_add_prop(pa, a_len, ps)
#
# For each of the a_len words at pa, adds the 64-bit square of that word
# (plus the running carry) into the next two words of ps, advancing ps by
# two words per input word.  The final carry is then added into the
# following word of ps and rippled upward while it keeps carrying out.
#
# Frame of the x86 path (the sse2 path pushes edi, esi, ebx directly
# below ebp):
# ebp - 24: caller's ebx
# ebp - 20: caller's esi
# ebp - 16: caller's edi
# ebp - 12 .. ebp - 4: reserved by "sub $12,%esp", never referenced
#                      (carry and a_len live in ebx and ecx)
# ebp +  0: caller's ebp
# ebp +  4: return address
# ebp +  8: pa argument
# ebp + 12: a_len argument
# ebp + 16: ps argument
# registers:
#  eax:
#  ebx: carry
#  ecx: a_len
#  edx:
#  esi: pa ptr
#  edi: ps ptr

.globl  _s_mpv_sqr_add_prop
.type   _s_mpv_sqr_add_prop,@function
_s_mpv_sqr_add_prop:
    GET     is_sse,%eax
    test    %eax,%eax
    je      _s_mpv_sqr_add_prop_x86
    jg      _s_mpv_sqr_add_prop_sse2
    call    _s_mpi_is_sse2
    PUT     %eax,is_sse
    test    %eax,%eax
    jg      _s_mpv_sqr_add_prop_sse2
_s_mpv_sqr_add_prop_x86:
    push    %ebp
    mov     %esp,%ebp
    sub     $12,%esp                # frame space, unused by this path
    push    %edi
    push    %esi
    push    %ebx
    xor     %ebx,%ebx               # carry = 0
    mov     12(%ebp),%ecx           # a_len
    mov     16(%ebp),%edi           # edi = ps
    test    %ecx,%ecx
    je      31f                     # jump if a_len == 0
    cld
    mov     8(%ebp),%esi            # esi = pa
30:
    lodsl                           # eax = [ds:esi]; esi += 4
    mull    %eax                    # edx:eax = a_i * a_i

    add     %ebx,%eax               # add "carry"
    adc     $0,%edx
    mov     0(%edi),%ebx
    add     %ebx,%eax               # add low word from result
    mov     4(%edi),%ebx            # mov/stosl leave CF from the add intact
    stosl                           # [es:edi] = eax; edi += 4
    adc     %ebx,%edx               # add high word from result (+ CF)
    movl    $0,%ebx                 # must stay a mov: xor would clear CF
    mov     %edx,%eax
    adc     $0,%ebx                 # next carry = carry out of the high word
    stosl                           # [es:edi] = eax; edi += 4
    dec     %ecx                    # --a_len
    jnz     30b                     # jmp if a_len != 0
31:
    test    %ebx,%ebx               # is carry zero?
    jz      34f
    mov     0(%edi),%eax            # add in current word from *c
    add     %ebx,%eax
    stosl                           # [es:edi] = eax; edi += 4
    jnc     34f
32:                                 # CF is set on every entry to this loop
    mov     0(%edi),%eax            # add in current word from *c
    adc     $0,%eax
    stosl                           # [es:edi] = eax; edi += 4
    jc      32b
34:
    pop     %ebx
    pop     %esi
    pop     %edi
    leave
    ret
    nop
_s_mpv_sqr_add_prop_sse2:
    push    %ebp
    mov     %esp,%ebp
    push    %edi
    push    %esi
    push    %ebx
    pxor    %mm2,%mm2               # carry = 0
    mov     12(%ebp),%ecx           # ecx = a_len
    mov     16(%ebp),%edi           # edi = ps
    test    %ecx,%ecx
    je      36f                     # jmp if a_len == 0
    mov     8(%ebp),%esi            # esi = a
    cld                             # needed by the stosl carry ripple below
35:
    movd    0(%esi),%mm0            # mm0 = *a
    movd    0(%edi),%mm3            # fetch the sum
    add     $4,%esi
    pmuludq %mm0,%mm0               # mm0 = sqr(a)
    paddq   %mm0,%mm2               # add the carry
    paddq   %mm3,%mm2               # add the low word
    movd    4(%edi),%mm3
    movd    %mm2,0(%edi)            # store the 32bit result
    psrlq   $32,%mm2
    paddq   %mm3,%mm2               # add the high word
    movd    %mm2,4(%edi)            # store the 32bit result
    psrlq   $32,%mm2                # save the carry.
    add     $8,%edi
    dec     %ecx                    # --a_len
    jnz     35b                     # jmp if a_len != 0
36:
    movd    %mm2,%ebx
    test    %ebx,%ebx               # is carry zero?
    jz      38f
    mov     0(%edi),%eax
    add     %ebx,%eax
    stosl
    jnc     38f
37:                                 # CF is set on every entry to this loop
    mov     0(%edi),%eax            # add in current word from *c
    adc     $0,%eax
    stosl                           # [es:edi] = eax; edi += 4
    jc      37b
38:
    emms
    pop     %ebx
    pop     %esi
    pop     %edi
    leave
    ret
    nop

#
# Divide 64-bit (Nhi,Nlo) by 32-bit divisor, which must be normalized
# so its high bit is 1.  This code is from NSPR.
#
# mp_err _s_mpv_div_2dx1d(mp_digit Nhi, mp_digit Nlo, mp_digit divisor,
#                         mp_digit *qp, mp_digit *rp)
#
# Stores the quotient of (Nhi:Nlo) / divisor to *qp and the remainder to
# *rp, then returns zero.  No checking is done here: the x86 div
# instruction raises a divide error if divisor is zero or if the quotient
# does not fit in 32 bits (i.e. unless Nhi < divisor).
#
# No frame is set up and no callee-saved register is touched:
# esp +  0: return address
# esp +  4: Nhi argument
# esp +  8: Nlo argument
# esp + 12: divisor argument
# esp + 16: qp argument
# esp + 20: rp argument
# registers:
#  eax: Nlo on input to div, quotient afterwards; zero on return
#  ecx: scratch pointer (qp, then rp); caller-saved, as in the multiply
#       routines of this file which also use ecx without saving it
#  edx: Nhi on input to div, remainder afterwards
#

.globl  _s_mpv_div_2dx1d
.type   _s_mpv_div_2dx1d,@function
_s_mpv_div_2dx1d:
    mov     4(%esp),%edx            # edx = Nhi
    mov     8(%esp),%eax            # eax = Nlo
    divl    12(%esp)                # eax = edx:eax / divisor, edx = remainder
    mov     16(%esp),%ecx
    mov     %eax,0(%ecx)            # *qp = quotient
    mov     20(%esp),%ecx
    mov     %edx,0(%ecx)            # *rp = remainder
    xor     %eax,%eax               # return zero
    ret
    nop