# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# Multi-precision integer helpers for 32-bit x86.
#
# Syntax:  GNU as, AT&T operand order, run through the C preprocessor.
# Target:  32-bit x86 with SSE2 (pmuludq/paddq/psubq on MMX registers).
# Calling: all arguments are read from the stack (see each frame table);
#          ebx, esi and edi are saved and restored wherever they are used.
# Digits:  every routine works on 32-bit words, least significant word at
#          the lowest address.

#ifdef DARWIN
#define s_mpv_mul_d _s_mpv_mul_d
#define s_mpv_mul_d_add _s_mpv_mul_d_add
#define s_mpv_mul_d_add_prop _s_mpv_mul_d_add_prop
#define s_mpv_sqr_add_prop _s_mpv_sqr_add_prop
#define s_mpv_div_2dx1d _s_mpv_div_2dx1d
#define TYPE_FUNCTION(x)
#define HIDDEN_SYMBOL(x) .private_extern x
#else
#define TYPE_FUNCTION(x) .type x, @function
#define HIDDEN_SYMBOL(x) .hidden x
#endif

.text

    # s_mpv_mul_d(a, a_len, b, c):  c = a * b
    #
    # Multiplies the a_len words at a by the single word b and stores
    # a_len + 1 result words at c.  With a_len == 0 one zero word is
    # stored at c.
    #
    # ebp - 8: saved caller esi
    # ebp - 4: saved caller edi
    # ebp + 0: saved caller ebp
    # ebp + 4: return address
    # ebp + 8: a argument
    # ebp + 12: a_len argument
    # ebp + 16: b argument
    # ebp + 20: c argument
    # registers:
    # ecx: a_len (words remaining)
    # esi: a ptr
    # edi: c ptr
    # mm0: current product
    # mm1: b
    # mm2: running carry
.globl s_mpv_mul_d
HIDDEN_SYMBOL(s_mpv_mul_d)
TYPE_FUNCTION(s_mpv_mul_d)
s_mpv_mul_d:
    push    %ebp
    mov     %esp, %ebp
    push    %edi
    push    %esi
    psubq   %mm2, %mm2              # carry = 0
    mov     12(%ebp), %ecx          # ecx = a_len
    movd    16(%ebp), %mm1          # mm1 = b
    mov     20(%ebp), %edi          # edi = c
    test    %ecx, %ecx
    jz      2f                      # skip the loop when a_len == 0
    mov     8(%ebp), %esi           # esi = a
1:
    movd    0(%esi), %mm0           # mm0 = *a
    add     $4, %esi                # a++
    pmuludq %mm1, %mm0              # mm0 = b * *a (64-bit product)
    paddq   %mm0, %mm2              # mm2 = product + carry
    movd    %mm2, 0(%edi)           # *c = low 32 bits
    add     $4, %edi                # c++
    psrlq   $32, %mm2               # carry = high 32 bits
    dec     %ecx                    # --a_len
    jnz     1b                      # loop while a_len != 0
2:
    movd    %mm2, 0(%edi)           # *c = final carry
    emms                            # leave MMX state before returning
    pop     %esi
    pop     %edi
    leave
    ret
    nop

    # s_mpv_mul_d_add(a, a_len, b, c):  c += a * b
    #
    # Multiplies the a_len words at a by the single word b and adds the
    # product into the first a_len words at c.  The final carry is then
    # STORED (not added) at c[a_len], so that word is overwritten.
    #
    # ebp - 8: saved caller esi
    # ebp - 4: saved caller edi
    # ebp + 0: saved caller ebp
    # ebp + 4: return address
    # ebp + 8: a argument
    # ebp + 12: a_len argument
    # ebp + 16: b argument
    # ebp + 20: c argument
    # registers:
    # ecx: a_len (words remaining)
    # esi: a ptr
    # edi: c ptr
    # mm0: current product, then current word of c
    # mm1: b
    # mm2: running carry
.globl s_mpv_mul_d_add
HIDDEN_SYMBOL(s_mpv_mul_d_add)
TYPE_FUNCTION(s_mpv_mul_d_add)
s_mpv_mul_d_add:
    push    %ebp
    mov     %esp, %ebp
    push    %edi
    push    %esi
    psubq   %mm2, %mm2              # carry = 0
    mov     12(%ebp), %ecx          # ecx = a_len
    movd    16(%ebp), %mm1          # mm1 = b
    mov     20(%ebp), %edi          # edi = c
    test    %ecx, %ecx
    jz      2f                      # skip the loop when a_len == 0
    mov     8(%ebp), %esi           # esi = a
1:
    movd    0(%esi), %mm0           # mm0 = *a
    add     $4, %esi                # a++
    pmuludq %mm1, %mm0              # mm0 = b * *a (64-bit product)
    paddq   %mm0, %mm2              # mm2 = product + carry
    movd    0(%edi), %mm0           # mm0 = *c
    paddq   %mm0, %mm2              # mm2 += *c
    movd    %mm2, 0(%edi)           # *c = low 32 bits
    add     $4, %edi                # c++
    psrlq   $32, %mm2               # carry = high 32 bits
    dec     %ecx                    # --a_len
    jnz     1b                      # loop while a_len != 0
2:
    movd    %mm2, 0(%edi)           # *c = final carry
    emms                            # leave MMX state before returning
    pop     %esi
    pop     %edi
    leave
    ret
    nop

    # s_mpv_mul_d_add_prop(a, a_len, b, c):  c += a * b, carry propagated
    #
    # Same multiply-accumulate as s_mpv_mul_d_add over the first a_len
    # words of c, but a nonzero final carry is ADDED into c[a_len] and
    # rippled through the following words until an addition produces no
    # carry out.  The ripple has no length limit of its own, so c must
    # extend far enough to absorb it.
    #
    # ebp - 12: saved caller ebx
    # ebp - 8: saved caller esi
    # ebp - 4: saved caller edi
    # ebp + 0: saved caller ebp
    # ebp + 4: return address
    # ebp + 8: a argument
    # ebp + 12: a_len argument
    # ebp + 16: b argument
    # ebp + 20: c argument
    # registers:
    # eax: scratch word during carry propagation
    # ebx: carry out of the multiply loop
    # ecx: a_len (words remaining)
    # esi: a ptr
    # edi: c ptr
    # mm0: current product
    # mm1: b
    # mm2: running carry
    # mm3: current word of c
.globl s_mpv_mul_d_add_prop
HIDDEN_SYMBOL(s_mpv_mul_d_add_prop)
TYPE_FUNCTION(s_mpv_mul_d_add_prop)
s_mpv_mul_d_add_prop:
    push    %ebp
    mov     %esp, %ebp
    push    %edi
    push    %esi
    push    %ebx
    psubq   %mm2, %mm2              # carry = 0
    mov     12(%ebp), %ecx          # ecx = a_len
    movd    16(%ebp), %mm1          # mm1 = b
    mov     20(%ebp), %edi          # edi = c
    test    %ecx, %ecx
    jz      2f                      # skip the loop when a_len == 0
    mov     8(%ebp), %esi           # esi = a
    cld                             # stosl below must move edi upward
1:
    movd    0(%esi), %mm0           # mm0 = *a
    movd    0(%edi), %mm3           # mm3 = *c
    add     $4, %esi                # a++
    pmuludq %mm1, %mm0              # mm0 = b * *a (64-bit product)
    paddq   %mm0, %mm2              # mm2 = product + carry
    paddq   %mm3, %mm2              # mm2 += *c
    movd    %mm2, 0(%edi)           # *c = low 32 bits
    add     $4, %edi                # c++
    psrlq   $32, %mm2               # carry = high 32 bits
    dec     %ecx                    # --a_len
    jnz     1b                      # loop while a_len != 0
2:
    movd    %mm2, %ebx              # ebx = final carry
    test    %ebx, %ebx
    jz      4f                      # no carry, nothing to propagate
    mov     0(%edi), %eax
    add     %ebx, %eax              # *c + carry, CF = carry out
    stosl                           # *c++ = eax (flags untouched)
    jnc     4f
3:
    mov     0(%edi), %eax           # next word of c
    adc     $0, %eax                # add the carry bit
    stosl                           # *c++ = eax (flags untouched)
    jc      3b                      # keep going while it overflows
4:
    emms                            # leave MMX state before returning
    pop     %ebx
    pop     %esi
    pop     %edi
    leave
    ret
    nop

    # s_mpv_sqr_add_prop(pa, a_len, ps):  ps += square of each word of pa
    #
    # For each of the a_len words pa[i], adds the 64-bit value
    # pa[i] * pa[i] into the word pair ps[2i], ps[2i+1], carrying from
    # each pair into the next.  A nonzero final carry is added into
    # ps[2 * a_len] and rippled upward until an addition produces no
    # carry out, so ps must extend far enough to absorb it.
    #
    # ebp - 12: saved caller ebx
    # ebp - 8: saved caller esi
    # ebp - 4: saved caller edi
    # ebp + 0: saved caller ebp
    # ebp + 4: return address
    # ebp + 8: pa argument
    # ebp + 12: a_len argument
    # ebp + 16: ps argument
    # registers:
    # eax: scratch word during carry propagation
    # ebx: carry out of the squaring loop
    # ecx: a_len (words remaining)
    # esi: pa ptr
    # edi: ps ptr
    # mm0: current square
    # mm2: running carry
    # mm3: current word of ps
.globl s_mpv_sqr_add_prop
HIDDEN_SYMBOL(s_mpv_sqr_add_prop)
TYPE_FUNCTION(s_mpv_sqr_add_prop)
s_mpv_sqr_add_prop:
    push    %ebp
    mov     %esp, %ebp
    push    %edi
    push    %esi
    push    %ebx
    psubq   %mm2, %mm2              # carry = 0
    mov     12(%ebp), %ecx          # ecx = a_len
    mov     16(%ebp), %edi          # edi = ps
    test    %ecx, %ecx
    jz      2f                      # skip the loop when a_len == 0
    mov     8(%ebp), %esi           # esi = pa
    cld                             # stosl below must move edi upward
1:
    movd    0(%esi), %mm0           # mm0 = *pa
    movd    0(%edi), %mm3           # mm3 = ps[0] (low word of the pair)
    add     $4, %esi                # pa++
    pmuludq %mm0, %mm0              # mm0 = *pa * *pa (64-bit square)
    paddq   %mm0, %mm2              # mm2 = square + carry
    paddq   %mm3, %mm2              # mm2 += low word
    movd    4(%edi), %mm3           # mm3 = ps[1] (high word of the pair)
    movd    %mm2, 0(%edi)           # ps[0] = low 32 bits
    psrlq   $32, %mm2
    paddq   %mm3, %mm2              # mm2 = upper half + high word
    movd    %mm2, 4(%edi)           # ps[1] = low 32 bits
    psrlq   $32, %mm2               # carry for the next pair
    add     $8, %edi                # ps += 2
    dec     %ecx                    # --a_len
    jnz     1b                      # loop while a_len != 0
2:
    movd    %mm2, %ebx              # ebx = final carry
    test    %ebx, %ebx
    jz      4f                      # no carry, nothing to propagate
    mov     0(%edi), %eax
    add     %ebx, %eax              # *ps + carry, CF = carry out
    stosl                           # *ps++ = eax (flags untouched)
    jnc     4f
3:
    mov     0(%edi), %eax           # next word of ps
    adc     $0, %eax                # add the carry bit
    stosl                           # *ps++ = eax (flags untouched)
    jc      3b                      # keep going while it overflows
4:
    emms                            # leave MMX state before returning
    pop     %ebx
    pop     %esi
    pop     %edi
    leave
    ret
    nop

    #
    # Divide 64-bit (Nhi,Nlo) by 32-bit divisor, which must be normalized
    # so its high bit is 1. This code is from NSPR.
    #
    # mp_err s_mpv_div_2dx1d(mp_digit Nhi, mp_digit Nlo, mp_digit divisor,
    #                        mp_digit *qp, mp_digit *rp)
    #
    # Stores the quotient at *qp and the remainder at *rp and always
    # returns zero.  The quotient must fit in 32 bits (Nhi < divisor);
    # the div instruction raises a divide error otherwise, and this
    # routine does not check for it.
    #
    # esp + 0: saved caller ebx
    # esp + 4: return address
    # esp + 8: Nhi argument
    # esp + 12: Nlo argument
    # esp + 16: divisor argument
    # esp + 20: qp argument
    # esp + 24: rp argument
    # registers:
    # eax: Nlo, then quotient, then return value
    # ebx: divisor, then qp, then rp
    # edx: Nhi, then remainder
    #
.globl s_mpv_div_2dx1d
HIDDEN_SYMBOL(s_mpv_div_2dx1d)
TYPE_FUNCTION(s_mpv_div_2dx1d)
s_mpv_div_2dx1d:
    push    %ebx
    mov     8(%esp), %edx           # edx = Nhi
    mov     12(%esp), %eax          # eax = Nlo
    mov     16(%esp), %ebx          # ebx = divisor
    div     %ebx                    # eax = edx:eax / ebx, edx = remainder
    mov     20(%esp), %ebx          # ebx = qp
    mov     %eax, 0(%ebx)           # *qp = quotient
    mov     24(%esp), %ebx          # ebx = rp
    mov     %edx, 0(%ebx)           # *rp = remainder
    xor     %eax, %eax              # return zero
    pop     %ebx
    ret
    nop

#ifndef DARWIN
    # Magic indicating no need for an executable stack
.section .note.GNU-stack, "", @progbits
.previous
#endif