/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/*
 * MIPS assembly versions of four multi-precision vector helpers:
 *
 *   s_mpv_mul_d_add       c += a * b, final carry stored in c[a_len]
 *   s_mpv_mul_d_add_prop  c += a * b, final carry propagated through c
 *   s_mpv_mul_d           c  = a * b, final carry stored in c[a_len]
 *   s_mpv_sqr_add_prop    sqrs += squares of the digits of a, carry
 *                         propagated
 *
 * Digits are handled as 32-bit words (lwu/sw); each product is formed
 * with a 64-bit dmultu and read back with mflo, and the carry is the
 * upper 32 bits of the 64-bit sum (dsrl32).
 *
 * The whole file is assembled under ".set noreorder": the instruction
 * written directly after every branch or jump is its delay slot and is
 * executed whether or not the branch is taken.  Several of those slots
 * do real work (loads, stores, pointer bumps), so instruction order must
 * not be changed casually.
 *
 * Register names (a0-a7, t0-t3, ra, zero) are expected to come from
 * <regdef.h>.
 */
#include <regdef.h>
	.set	noreorder
	.set	noat

	.section	.text, 1, 0x00000006, 4, 4
.text:
	.section	.text

/*
 * s_mpv_mul_d_add
 *
 * In:  a0 = a (digit vector), a1 = a_len, a2 = b (single digit),
 *      a3 = c (digit vector, updated in place)
 * Out: c[0..a_len-1] += a * b, c[a_len] = final carry (overwritten, not
 *      added).  Returns immediately when a_len == 0.
 *
 * The loop is unrolled by two and pipelined: the multiply for the next
 * digit is issued before the previous product has been folded into c.
 */
	.ent	s_mpv_mul_d_add
	.globl	s_mpv_mul_d_add

s_mpv_mul_d_add:
	#/* c += a * b */
	#void s_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b,
	#			      mp_digit *c)
	#{
	#  mp_digit   a0, a1;	regs a4, a5
	#  mp_digit   c0, c1;	regs a6, a7
	#  mp_digit   cy = 0;	reg t2
	#  mp_word    w0, w1;	regs t0, t1
	#
	#  if (a_len) {
	beq	a1,zero,.L.1
	move	t2,zero		# cy = 0
	dsll32	a2,a2,0		# "b" is sometimes negative (?!?!)
	dsrl32	a2,a2,0		# This clears the upper 32 bits.
	#    a0 = a[0];
	lwu	a4,0(a0)
	#    w0 = ((mp_word)b * a0);
	dmultu	a2,a4
	#    if (--a_len) {
	addiu	a1,a1,-1
	beq	a1,zero,.L.2
	#      while (a_len >= 2) {
	sltiu	t3,a1,2		# (delay slot of the beq above; harmless if taken)
	bne	t3,zero,.L.3
	#        a1     = a[1];
	lwu	a5,4(a0)
.L.4:
	#        a_len -= 2;
	addiu	a1,a1,-2
	#        c0     = c[0];
	lwu	a6,0(a3)
	#        w0    += cy;
	mflo	t0
	daddu	t0,t0,t2
	#        w0    += c0;
	daddu	t0,t0,a6
	#        w1     = (mp_word)b * a1;
	dmultu	a2,a5		#
	#        cy     = CARRYOUT(w0);
	dsrl32	t2,t0,0
	#        c[0]   = ACCUM(w0);
	sw	t0,0(a3)
	#        a0     = a[2];
	lwu	a4,8(a0)
	#        a     += 2;
	addiu	a0,a0,8
	#        c1     = c[1];
	lwu	a7,4(a3)
	#        w1    += cy;
	mflo	t1
	daddu	t1,t1,t2
	#        w1    += c1;
	daddu	t1,t1,a7
	#        w0     = (mp_word)b * a0;
	dmultu	a2,a4		#
	#        cy     = CARRYOUT(w1);
	dsrl32	t2,t1,0
	#        c[1]   = ACCUM(w1);
	sw	t1,4(a3)
	#        c     += 2;
	addiu	a3,a3,8
	sltiu	t3,a1,2
	beq	t3,zero,.L.4
	#        a1     = a[1];
	/* NOTE(review): this delay-slot load runs on loop exit too.  When
	 * the remaining count has reached 0 it reads one digit past the
	 * last element of a; the loaded value is then unused. */
	lwu	a5,4(a0)
	#      }
.L.3:
	#      c0     = c[0];
	lwu	a6,0(a3)
	#      w0    += cy;
	#      if (a_len) {
	mflo	t0
	beq	a1,zero,.L.5
	daddu	t0,t0,t2
	#        w1     = (mp_word)b * a1;
	dmultu	a2,a5
	#        w0    += c0;
	daddu	t0,t0,a6	#
	#        cy     = CARRYOUT(w0);
	dsrl32	t2,t0,0
	#        c[0]   = ACCUM(w0);
	sw	t0,0(a3)
	#        c1     = c[1];
	lwu	a7,4(a3)
	#        w1    += cy;
	mflo	t1
	daddu	t1,t1,t2
	#        w1    += c1;
	daddu	t1,t1,a7
	#        c[1]   = ACCUM(w1);
	sw	t1,4(a3)
	#        cy     = CARRYOUT(w1);
	dsrl32	t2,t1,0
	#        c     += 1;
	b	.L.6
	addiu	a3,a3,4
	#      } else {
.L.5:
	#        w0    += c0;
	daddu	t0,t0,a6
	#        c[0]   = ACCUM(w0);
	sw	t0,0(a3)
	#        cy     = CARRYOUT(w0);
	b	.L.6
	dsrl32	t2,t0,0
	#      }
	#    } else {
.L.2:
	#      c0     = c[0];
	lwu	a6,0(a3)
	#      w0    += c0;
	mflo	t0
	daddu	t0,t0,a6
	#      c[0]   = ACCUM(w0);
	sw	t0,0(a3)
	#      cy     = CARRYOUT(w0);
	dsrl32	t2,t0,0
	#    }
.L.6:
	#    c[1] = cy;
	jr	ra
	sw	t2,4(a3)
	#  }
.L.1:
	jr	ra
	nop
	#}
	#
	.end	s_mpv_mul_d_add

/*
 * s_mpv_mul_d_add_prop
 *
 * In:  a0 = a, a1 = a_len, a2 = b, a3 = c (same layout as
 *      s_mpv_mul_d_add).
 * Out: c[0..a_len-1] += a * b; the carry left over is then added into
 *      c[a_len], c[a_len+1], ... until it becomes zero.  Returns
 *      immediately when a_len == 0.
 *
 * Unlike s_mpv_mul_d_add, every exit from the multiply phase leaves a3
 * pointing at the first digit that has NOT been written yet, because the
 * propagation loop at .M.7 works on 0(a3).
 */
	.ent	s_mpv_mul_d_add_prop
	.globl	s_mpv_mul_d_add_prop

s_mpv_mul_d_add_prop:
	#/* c += a * b */
	#void s_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b,
	#			      mp_digit *c)
	#{
	#  mp_digit   a0, a1;	regs a4, a5
	#  mp_digit   c0, c1;	regs a6, a7
	#  mp_digit   cy = 0;	reg t2
	#  mp_word    w0, w1;	regs t0, t1
	#
	#  if (a_len) {
	beq	a1,zero,.M.1
	move	t2,zero		# cy = 0
	dsll32	a2,a2,0		# "b" is sometimes negative (?!?!)
	dsrl32	a2,a2,0		# This clears the upper 32 bits.
	#    a0 = a[0];
	lwu	a4,0(a0)
	#    w0 = ((mp_word)b * a0);
	dmultu	a2,a4
	#    if (--a_len) {
	addiu	a1,a1,-1
	beq	a1,zero,.M.2
	#      while (a_len >= 2) {
	sltiu	t3,a1,2		# (delay slot of the beq above; harmless if taken)
	bne	t3,zero,.M.3
	#        a1     = a[1];
	lwu	a5,4(a0)
.M.4:
	#        a_len -= 2;
	addiu	a1,a1,-2
	#        c0     = c[0];
	lwu	a6,0(a3)
	#        w0    += cy;
	mflo	t0
	daddu	t0,t0,t2
	#        w0    += c0;
	daddu	t0,t0,a6
	#        w1     = (mp_word)b * a1;
	dmultu	a2,a5		#
	#        cy     = CARRYOUT(w0);
	dsrl32	t2,t0,0
	#        c[0]   = ACCUM(w0);
	sw	t0,0(a3)
	#        a0     = a[2];
	lwu	a4,8(a0)
	#        a     += 2;
	addiu	a0,a0,8
	#        c1     = c[1];
	lwu	a7,4(a3)
	#        w1    += cy;
	mflo	t1
	daddu	t1,t1,t2
	#        w1    += c1;
	daddu	t1,t1,a7
	#        w0     = (mp_word)b * a0;
	dmultu	a2,a4		#
	#        cy     = CARRYOUT(w1);
	dsrl32	t2,t1,0
	#        c[1]   = ACCUM(w1);
	sw	t1,4(a3)
	#        c     += 2;
	addiu	a3,a3,8
	sltiu	t3,a1,2
	beq	t3,zero,.M.4
	#        a1     = a[1];
	/* NOTE(review): this delay-slot load runs on loop exit too.  When
	 * the remaining count has reached 0 it reads one digit past the
	 * last element of a; the loaded value is then unused. */
	lwu	a5,4(a0)
	#      }
.M.3:
	#      c0     = c[0];
	lwu	a6,0(a3)
	#      w0    += cy;
	#      if (a_len) {
	mflo	t0
	beq	a1,zero,.M.5
	daddu	t0,t0,t2
	#        w1     = (mp_word)b * a1;
	dmultu	a2,a5
	#        w0    += c0;
	daddu	t0,t0,a6	#
	#        cy     = CARRYOUT(w0);
	dsrl32	t2,t0,0
	#        c[0]   = ACCUM(w0);
	sw	t0,0(a3)
	#        c1     = c[1];
	lwu	a7,4(a3)
	#        w1    += cy;
	mflo	t1
	daddu	t1,t1,t2
	#        w1    += c1;
	daddu	t1,t1,a7
	#        c[1]   = ACCUM(w1);
	sw	t1,4(a3)
	#        cy     = CARRYOUT(w1);
	dsrl32	t2,t1,0
	#        c     += 2;	(both c[0] and c[1] were just written)
	b	.M.6
	addiu	a3,a3,8
	#      } else {
.M.5:
	#        w0    += c0;
	daddu	t0,t0,a6
	#        c[0]   = ACCUM(w0);
	sw	t0,0(a3)
	#        cy     = CARRYOUT(w0);
	dsrl32	t2,t0,0
	#        c     += 1;
	b	.M.6
	addiu	a3,a3,4
	#      }
	#    } else {
.M.2:
	#      c0     = c[0];
	lwu	a6,0(a3)
	#      w0    += c0;
	mflo	t0
	daddu	t0,t0,a6
	#      c[0]   = ACCUM(w0);
	sw	t0,0(a3)
	#      cy     = CARRYOUT(w0);
	dsrl32	t2,t0,0
	#      c     += 1;
	addiu	a3,a3,4
	#    }
.M.6:

	#    while (cy) {
	beq	t2,zero,.M.1
	nop
.M.7:
	#      mp_word w = (mp_word)*c + cy;
	lwu	a6,0(a3)
	daddu	t2,t2,a6
	#      *c++ = ACCUM(w);
	sw	t2,0(a3)
	#      cy = CARRYOUT(w);
	dsrl32	t2,t2,0
	bne	t2,zero,.M.7
	addiu	a3,a3,4

	#    }
.M.1:
	jr	ra
	nop
	#}
	#
	.end	s_mpv_mul_d_add_prop

/*
 * s_mpv_mul_d
 *
 * In:  a0 = a, a1 = a_len, a2 = b, a3 = c.
 * Out: c[0..a_len-1] = a * b, c[a_len] = final carry.  The previous
 *      contents of c are not read.  Returns immediately when
 *      a_len == 0.
 *
 * Same two-way unrolled, pipelined structure as s_mpv_mul_d_add, minus
 * the loads of c.
 */
	.ent	s_mpv_mul_d
	.globl	s_mpv_mul_d

s_mpv_mul_d:
	#/* c = a * b */
	#void s_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b,
	#			      mp_digit *c)
	#{
	#  mp_digit   a0, a1;	regs a4, a5
	#  mp_digit   cy = 0;	reg t2
	#  mp_word    w0, w1;	regs t0, t1
	#
	#  if (a_len) {
	beq	a1,zero,.N.1
	move	t2,zero		# cy = 0
	dsll32	a2,a2,0		# "b" is sometimes negative (?!?!)
	dsrl32	a2,a2,0		# This clears the upper 32 bits.
	#    a0 = a[0];
	lwu	a4,0(a0)
	#    w0 = ((mp_word)b * a0);
	dmultu	a2,a4
	#    if (--a_len) {
	addiu	a1,a1,-1
	beq	a1,zero,.N.2
	#      while (a_len >= 2) {
	sltiu	t3,a1,2		# (delay slot of the beq above; harmless if taken)
	bne	t3,zero,.N.3
	#        a1     = a[1];
	lwu	a5,4(a0)
.N.4:
	#        a_len -= 2;
	addiu	a1,a1,-2
	#        w0    += cy;
	mflo	t0
	daddu	t0,t0,t2
	#        cy     = CARRYOUT(w0);
	dsrl32	t2,t0,0
	#        w1     = (mp_word)b * a1;
	dmultu	a2,a5
	#        c[0]   = ACCUM(w0);
	sw	t0,0(a3)
	#        a0     = a[2];
	lwu	a4,8(a0)
	#        a     += 2;
	addiu	a0,a0,8
	#        w1    += cy;
	mflo	t1
	daddu	t1,t1,t2
	#        cy     = CARRYOUT(w1);
	dsrl32	t2,t1,0
	#        w0     = (mp_word)b * a0;
	dmultu	a2,a4
	#        c[1]   = ACCUM(w1);
	sw	t1,4(a3)
	#        c     += 2;
	addiu	a3,a3,8
	sltiu	t3,a1,2
	beq	t3,zero,.N.4
	#        a1     = a[1];
	/* NOTE(review): this delay-slot load runs on loop exit too.  When
	 * the remaining count has reached 0 it reads one digit past the
	 * last element of a; the loaded value is then unused. */
	lwu	a5,4(a0)
	#      }
.N.3:
	#      w0    += cy;
	#      if (a_len) {
	mflo	t0
	beq	a1,zero,.N.5
	daddu	t0,t0,t2
	#        w1     = (mp_word)b * a1;
	dmultu	a2,a5		#
	#        cy     = CARRYOUT(w0);
	dsrl32	t2,t0,0
	#        c[0]   = ACCUM(w0);
	sw	t0,0(a3)
	#        w1    += cy;
	mflo	t1
	daddu	t1,t1,t2
	#        c[1]   = ACCUM(w1);
	sw	t1,4(a3)
	#        cy     = CARRYOUT(w1);
	dsrl32	t2,t1,0
	#        c     += 1;
	b	.N.6
	addiu	a3,a3,4
	#      } else {
.N.5:
	#        c[0]   = ACCUM(w0);
	sw	t0,0(a3)
	#        cy     = CARRYOUT(w0);
	b	.N.6
	dsrl32	t2,t0,0
	#      }
	#    } else {
.N.2:
	mflo	t0
	#      c[0]   = ACCUM(w0);
	sw	t0,0(a3)
	#      cy     = CARRYOUT(w0);
	dsrl32	t2,t0,0
	#    }
.N.6:
	#    c[1] = cy;
	jr	ra
	sw	t2,4(a3)
	#  }
.N.1:
	jr	ra
	nop
	#}
	#
	.end	s_mpv_mul_d


/*
 * s_mpv_sqr_add_prop
 *
 * In:  a0 = a, a1 = a_len, a2 = sqrs.
 * Out: for each digit a[i], the 64-bit square a[i]*a[i] plus the
 *      running carry is added to the digit pair sqrs[2i], sqrs[2i+1].
 *      Any carry left after the last pair is added into the following
 *      digits of sqrs until it becomes zero.
 *
 * Each pair of sqrs digits is combined into one 64-bit value, so the
 * carry out of a pair is a 64-bit unsigned overflow and is detected
 * with sltu.  The final propagation works on single 32-bit digits held
 * zero-extended in 64-bit registers; there the carry is the upper half
 * of the sum (dsrl32), because a 64-bit sltu can never fire.
 *
 * Changes relative to the previous revision of this routine:
 *   - a_len == 0 now returns at once (previously a[0] was read and the
 *     counter wrapped), matching the other routines in this file.
 *   - the final carry loop now accumulates into a4, the register that
 *     is actually stored, and derives the carry with dsrl32.  Before,
 *     the sum went to a6 and was dropped, and the stale high word in a4
 *     was stored over the next sqrs digit.
 */
	.ent	s_mpv_sqr_add_prop
	.globl	s_mpv_sqr_add_prop
	#void   s_mpv_sqr_add_prop(const mp_digit *a, mp_size a_len, mp_digit *sqrs);
	#	registers
	#	a0	*a
	#	a1	a_len
	#	a2	*sqr
	#	a3	digit from *a, a_i
	#	a4	square of digit from a
	#	a5,a6	next 2 digits in sqr
	#	a7,t0	carry
s_mpv_sqr_add_prop:
	beq	a1,zero,.P.10	# nothing to do for an empty vector
	move	a7,zero
	move	t0,zero
	lwu	a3,0(a0)
	addiu	a1,a1,-1	# --a_len
	dmultu	a3,a3
	beq	a1,zero,.P.3	# jump if we've already done the only sqr
	addiu	a0,a0,4		# ++a
.P.2:
	lwu	a5,0(a2)
	lwu	a6,4(a2)
	addiu	a2,a2,8		# sqrs   += 2;
	dsll32	a6,a6,0
	daddu	a5,a5,a6	# a5 = sqrs[1]:sqrs[0] as one 64-bit value
	lwu	a3,0(a0)
	addiu	a0,a0,4		# ++a
	mflo	a4
	daddu	a6,a5,a4
	sltu	a7,a6,a5	# a7 = a6 < a5	detect overflow
	dmultu	a3,a3
	daddu	a4,a6,t0	# add carry from the previous pair
	sltu	t0,a4,a6
	add	t0,t0,a7	# carry out of this pair
	sw	a4,-8(a2)
	addiu	a1,a1,-1	# --a_len
	dsrl32	a4,a4,0
	bne	a1,zero,.P.2	# loop if a_len > 0
	sw	a4,-4(a2)
.P.3:
	lwu	a5,0(a2)
	lwu	a6,4(a2)
	addiu	a2,a2,8		# sqrs   += 2;
	dsll32	a6,a6,0
	daddu	a5,a5,a6
	mflo	a4
	daddu	a6,a5,a4
	sltu	a7,a6,a5	# a7 = a6 < a5	detect overflow
	daddu	a4,a6,t0
	sltu	t0,a4,a6
	add	t0,t0,a7
	sw	a4,-8(a2)
	beq	t0,zero,.P.9	# jump if no carry
	dsrl32	a4,a4,0
.P.8:
	sw	a4,-4(a2)	# store the digit computed on the previous pass
	/* propagate final carry */
	lwu	a5,0(a2)
	daddu	a4,a5,t0	# a4 = next digit + carry (at most 33 bits)
	dsrl32	t0,a4,0		# carry = bit 32 and up of the sum
	bne	t0,zero,.P.8	# loop if carry persists
	addiu	a2,a2,4		# sqrs++
.P.9:
	jr	ra
	sw	a4,-4(a2)	# delay slot: store the last digit computed
.P.10:
	jr	ra
	nop

	.end	s_mpv_sqr_add_prop