|
1 /* This Source Code Form is subject to the terms of the Mozilla Public |
|
2 * License, v. 2.0. If a copy of the MPL was not distributed with this |
|
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
|
4 #include <regdef.h> |
|
5 .set noreorder |
|
6 .set noat |
|
7 |
|
8 .section .text, 1, 0x00000006, 4, 4 |
|
9 .text: |
|
10 .section .text |
|
11 |
|
/*
 * void s_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b,
 *                      mp_digit *c)
 *
 * Multiply-accumulate of a digit vector by a single digit:
 *     c[0 .. a_len-1] += a[0 .. a_len-1] * b
 * The carry out of the top digit is WRITTEN (not added) to c[a_len].
 * Returns at once, touching nothing, when a_len is zero.
 *
 * Digits are 32 bits wide (lwu/sw, 4-byte stride).  Each partial product
 * is formed with dmultu and accumulated in a 64-bit register; the low word
 * is the result digit and the high word is the carry.
 *
 * Register use:
 *   a0      a        (advanced two digits per loop pass)
 *   a1      a_len    (digits still to be processed after the current one)
 *   a2      b        (zero-extended to 64 bits on entry)
 *   a3      c        (advanced alongside a)
 *   a4, a5  current / next digit of a
 *   a6, a7  c[0], c[1]
 *   t0, t1  64-bit accumulators w0, w1
 *   t2      carry digit cy
 *   t3      scratch for the loop test
 *   LO      product of the most recent dmultu
 *
 * The code relies on the file-level ".set noreorder": the instruction that
 * follows every branch or jump is its delay slot and always executes.  The
 * multiply for the next digit is issued before the current digit has been
 * stored, so the instruction order below must not be changed.
 *
 * NOTE: the delay-slot load at the bottom of the pair loop fetches a[1]
 * relative to the already advanced pointer; when no digits remain this
 * reads (but never uses) one digit beyond the end of a.
 */
        .ent    s_mpv_mul_d_add
        .globl  s_mpv_mul_d_add

s_mpv_mul_d_add:
        beq     a1, zero, .L.ret        /* a_len == 0: nothing to do          */
        move    t2, zero                /* (delay slot) cy = 0                */

        /* The upper half of the register holding b is not guaranteed to be
         * zero (the original author saw "negative" values), so clear it.     */
        dsll32  a2, a2, 0
        dsrl32  a2, a2, 0

        lwu     a4, 0(a0)               /* a0 = a[0]                          */
        dmultu  a2, a4                  /* w0 = (mp_word)b * a0               */
        addiu   a1, a1, -1              /* --a_len                            */
        beq     a1, zero, .L.single     /* only one digit in total            */
        sltiu   t3, a1, 2               /* (delay slot) t3 = (a_len < 2)      */
        bne     t3, zero, .L.tail       /* fewer than two left: skip the loop */
        lwu     a5, 4(a0)               /* (delay slot) a1 = a[1]             */

        /* ---- main loop: two digits per pass, while (a_len >= 2) ---------- */
.L.pair:
        addiu   a1, a1, -2              /* a_len -= 2                         */
        lwu     a6, 0(a3)               /* c0 = c[0]                          */
        mflo    t0                      /* w0 = pending product               */
        daddu   t0, t0, t2              /* w0 += cy                           */
        daddu   t0, t0, a6              /* w0 += c0                           */
        dmultu  a2, a5                  /* w1 = (mp_word)b * a1               */
        dsrl32  t2, t0, 0               /* cy = CARRYOUT(w0)                  */
        sw      t0, 0(a3)               /* c[0] = ACCUM(w0)                   */
        lwu     a4, 8(a0)               /* a0 = a[2]                          */
        addiu   a0, a0, 8               /* a += 2                             */
        lwu     a7, 4(a3)               /* c1 = c[1]                          */
        mflo    t1                      /* w1 = pending product               */
        daddu   t1, t1, t2              /* w1 += cy                           */
        daddu   t1, t1, a7              /* w1 += c1                           */
        dmultu  a2, a4                  /* w0 = (mp_word)b * a0               */
        dsrl32  t2, t1, 0               /* cy = CARRYOUT(w1)                  */
        sw      t1, 4(a3)               /* c[1] = ACCUM(w1)                   */
        addiu   a3, a3, 8               /* c += 2                             */
        sltiu   t3, a1, 2               /* t3 = (a_len < 2)                   */
        beq     t3, zero, .L.pair       /* at least two more: go round again  */
        lwu     a5, 4(a0)               /* (delay slot) a1 = a[1]             */

        /* ---- one or two digits remain; a product for a0 is pending ------- */
.L.tail:
        lwu     a6, 0(a3)               /* c0 = c[0]                          */
        mflo    t0                      /* w0 = pending product               */
        beq     a1, zero, .L.last       /* no second digit                    */
        daddu   t0, t0, t2              /* (delay slot) w0 += cy              */

        dmultu  a2, a5                  /* w1 = (mp_word)b * a1               */
        daddu   t0, t0, a6              /* w0 += c0                           */
        dsrl32  t2, t0, 0               /* cy = CARRYOUT(w0)                  */
        sw      t0, 0(a3)               /* c[0] = ACCUM(w0)                   */
        lwu     a7, 4(a3)               /* c1 = c[1]                          */
        mflo    t1                      /* w1 = product                       */
        daddu   t1, t1, t2              /* w1 += cy                           */
        daddu   t1, t1, a7              /* w1 += c1                           */
        sw      t1, 4(a3)               /* c[1] = ACCUM(w1)                   */
        dsrl32  t2, t1, 0               /* cy = CARRYOUT(w1)                  */
        b       .L.carry
        addiu   a3, a3, 4               /* (delay slot) c += 1                */

.L.last:
        daddu   t0, t0, a6              /* w0 += c0                           */
        sw      t0, 0(a3)               /* c[0] = ACCUM(w0)                   */
        b       .L.carry
        dsrl32  t2, t0, 0               /* (delay slot) cy = CARRYOUT(w0)     */

        /* ---- a_len was exactly 1 ----------------------------------------- */
.L.single:
        lwu     a6, 0(a3)               /* c0 = c[0]                          */
        mflo    t0                      /* w0 = b * a[0]                      */
        daddu   t0, t0, a6              /* w0 += c0                           */
        sw      t0, 0(a3)               /* c[0] = ACCUM(w0)                   */
        dsrl32  t2, t0, 0               /* cy = CARRYOUT(w0)                  */

.L.carry:
        jr      ra
        sw      t2, 4(a3)               /* (delay slot) c[1] = cy             */

.L.ret:
        jr      ra
        nop

        .end    s_mpv_mul_d_add
|
145 |
|
/*
 * void s_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b,
 *                           mp_digit *c)
 *
 * Multiply-accumulate with carry propagation:
 *     c[0 .. a_len-1] += a[0 .. a_len-1] * b
 * The carry out of the top digit is then ADDED into c[a_len], and any
 * further carry keeps rippling into c[a_len+1], c[a_len+2], ... until it
 * becomes zero.  The ripple loop has no length check, so c must be long
 * enough to absorb the carry.  Returns at once when a_len is zero.
 *
 * Digits are 32 bits wide (lwu/sw, 4-byte stride); products are formed
 * with dmultu and accumulated in 64-bit registers.
 *
 * Register use:
 *   a0      a        (advanced two digits per loop pass)
 *   a1      a_len    (digits still to be processed after the current one)
 *   a2      b        (zero-extended to 64 bits on entry)
 *   a3      c        (on reaching .M.prop it points at c[a_len])
 *   a4, a5  current / next digit of a
 *   a6, a7  c[0], c[1]
 *   t0, t1  64-bit accumulators w0, w1
 *   t2      carry digit cy
 *   t3      scratch for the loop test
 *   LO      product of the most recent dmultu
 *
 * The code relies on the file-level ".set noreorder": the instruction that
 * follows every branch or jump is its delay slot and always executes.
 * Multiplies are overlapped with the stores of the previous digit, so the
 * instruction order below must not be changed.
 *
 * NOTE: the delay-slot load at the bottom of the pair loop fetches a[1]
 * relative to the already advanced pointer; when no digits remain this
 * reads (but never uses) one digit beyond the end of a.
 */
        .ent    s_mpv_mul_d_add_prop
        .globl  s_mpv_mul_d_add_prop

s_mpv_mul_d_add_prop:
        beq     a1, zero, .M.ret        /* a_len == 0: nothing to do          */
        move    t2, zero                /* (delay slot) cy = 0                */

        /* The upper half of the register holding b is not guaranteed to be
         * zero (the original author saw "negative" values), so clear it.     */
        dsll32  a2, a2, 0
        dsrl32  a2, a2, 0

        lwu     a4, 0(a0)               /* a0 = a[0]                          */
        dmultu  a2, a4                  /* w0 = (mp_word)b * a0               */
        addiu   a1, a1, -1              /* --a_len                            */
        beq     a1, zero, .M.single     /* only one digit in total            */
        sltiu   t3, a1, 2               /* (delay slot) t3 = (a_len < 2)      */
        bne     t3, zero, .M.tail       /* fewer than two left: skip the loop */
        lwu     a5, 4(a0)               /* (delay slot) a1 = a[1]             */

        /* ---- main loop: two digits per pass, while (a_len >= 2) ---------- */
.M.pair:
        addiu   a1, a1, -2              /* a_len -= 2                         */
        lwu     a6, 0(a3)               /* c0 = c[0]                          */
        mflo    t0                      /* w0 = pending product               */
        daddu   t0, t0, t2              /* w0 += cy                           */
        daddu   t0, t0, a6              /* w0 += c0                           */
        dmultu  a2, a5                  /* w1 = (mp_word)b * a1               */
        dsrl32  t2, t0, 0               /* cy = CARRYOUT(w0)                  */
        sw      t0, 0(a3)               /* c[0] = ACCUM(w0)                   */
        lwu     a4, 8(a0)               /* a0 = a[2]                          */
        addiu   a0, a0, 8               /* a += 2                             */
        lwu     a7, 4(a3)               /* c1 = c[1]                          */
        mflo    t1                      /* w1 = pending product               */
        daddu   t1, t1, t2              /* w1 += cy                           */
        daddu   t1, t1, a7              /* w1 += c1                           */
        dmultu  a2, a4                  /* w0 = (mp_word)b * a0               */
        dsrl32  t2, t1, 0               /* cy = CARRYOUT(w1)                  */
        sw      t1, 4(a3)               /* c[1] = ACCUM(w1)                   */
        addiu   a3, a3, 8               /* c += 2                             */
        sltiu   t3, a1, 2               /* t3 = (a_len < 2)                   */
        beq     t3, zero, .M.pair       /* at least two more: go round again  */
        lwu     a5, 4(a0)               /* (delay slot) a1 = a[1]             */

        /* ---- one or two digits remain; a product for a0 is pending ------- */
.M.tail:
        lwu     a6, 0(a3)               /* c0 = c[0]                          */
        mflo    t0                      /* w0 = pending product               */
        beq     a1, zero, .M.last       /* no second digit                    */
        daddu   t0, t0, t2              /* (delay slot) w0 += cy              */

        dmultu  a2, a5                  /* w1 = (mp_word)b * a1               */
        daddu   t0, t0, a6              /* w0 += c0                           */
        dsrl32  t2, t0, 0               /* cy = CARRYOUT(w0)                  */
        sw      t0, 0(a3)               /* c[0] = ACCUM(w0)                   */
        lwu     a7, 4(a3)               /* c1 = c[1]                          */
        mflo    t1                      /* w1 = product                       */
        daddu   t1, t1, t2              /* w1 += cy                           */
        daddu   t1, t1, a7              /* w1 += c1                           */
        sw      t1, 4(a3)               /* c[1] = ACCUM(w1)                   */
        dsrl32  t2, t1, 0               /* cy = CARRYOUT(w1)                  */
        b       .M.prop
        addiu   a3, a3, 8               /* (delay slot) c += 2: step past
                                           both digits just written           */

.M.last:
        daddu   t0, t0, a6              /* w0 += c0                           */
        sw      t0, 0(a3)               /* c[0] = ACCUM(w0)                   */
        dsrl32  t2, t0, 0               /* cy = CARRYOUT(w0)                  */
        b       .M.prop
        addiu   a3, a3, 4               /* (delay slot) c += 1                */

        /* ---- a_len was exactly 1 ----------------------------------------- */
.M.single:
        lwu     a6, 0(a3)               /* c0 = c[0]                          */
        mflo    t0                      /* w0 = b * a[0]                      */
        daddu   t0, t0, a6              /* w0 += c0                           */
        sw      t0, 0(a3)               /* c[0] = ACCUM(w0)                   */
        dsrl32  t2, t0, 0               /* cy = CARRYOUT(w0)                  */
        addiu   a3, a3, 4               /* c += 1                             */

        /* ---- while (cy) ripple the carry into the higher digits of c ----- */
.M.prop:
        beq     t2, zero, .M.ret        /* no carry: done                     */
        nop
.M.ripple:
        lwu     a6, 0(a3)               /* w = (mp_word)*c + cy               */
        daddu   t2, t2, a6
        sw      t2, 0(a3)               /* *c = ACCUM(w)                      */
        dsrl32  t2, t2, 0               /* cy = CARRYOUT(w)                   */
        bne     t2, zero, .M.ripple
        addiu   a3, a3, 4               /* (delay slot) c++                   */

.M.ret:
        jr      ra
        nop

        .end    s_mpv_mul_d_add_prop
|
293 |
|
/*
 * void s_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b,
 *                  mp_digit *c)
 *
 * Plain multiply of a digit vector by a single digit:
 *     c[0 .. a_len-1] = low digits of a[0 .. a_len-1] * b
 *     c[a_len]        = final carry digit
 * c is only written, never read.  Returns at once, writing nothing, when
 * a_len is zero.
 *
 * Digits are 32 bits wide (lwu/sw, 4-byte stride); products are formed
 * with dmultu and carried in 64-bit registers.
 *
 * Register use:
 *   a0      a        (advanced two digits per loop pass)
 *   a1      a_len    (digits still to be processed after the current one)
 *   a2      b        (zero-extended to 64 bits on entry)
 *   a3      c        (advanced alongside a)
 *   a4, a5  current / next digit of a
 *   t0, t1  64-bit accumulators w0, w1
 *   t2      carry digit cy
 *   t3      scratch for the loop test
 *   LO      product of the most recent dmultu
 *
 * The code relies on the file-level ".set noreorder": the instruction that
 * follows every branch or jump is its delay slot and always executes.
 * Multiplies are overlapped with the stores of the previous digit, so the
 * instruction order below must not be changed.
 *
 * NOTE: the delay-slot load at the bottom of the pair loop fetches a[1]
 * relative to the already advanced pointer; when no digits remain this
 * reads (but never uses) one digit beyond the end of a.
 */
        .ent    s_mpv_mul_d
        .globl  s_mpv_mul_d

s_mpv_mul_d:
        beq     a1, zero, .N.ret        /* a_len == 0: nothing to do          */
        move    t2, zero                /* (delay slot) cy = 0                */

        /* The upper half of the register holding b is not guaranteed to be
         * zero (the original author saw "negative" values), so clear it.     */
        dsll32  a2, a2, 0
        dsrl32  a2, a2, 0

        lwu     a4, 0(a0)               /* a0 = a[0]                          */
        dmultu  a2, a4                  /* w0 = (mp_word)b * a0               */
        addiu   a1, a1, -1              /* --a_len                            */
        beq     a1, zero, .N.single     /* only one digit in total            */
        sltiu   t3, a1, 2               /* (delay slot) t3 = (a_len < 2)      */
        bne     t3, zero, .N.tail       /* fewer than two left: skip the loop */
        lwu     a5, 4(a0)               /* (delay slot) a1 = a[1]             */

        /* ---- main loop: two digits per pass, while (a_len >= 2) ---------- */
.N.pair:
        addiu   a1, a1, -2              /* a_len -= 2                         */
        mflo    t0                      /* w0 = pending product               */
        daddu   t0, t0, t2              /* w0 += cy                           */
        dsrl32  t2, t0, 0               /* cy = CARRYOUT(w0)                  */
        dmultu  a2, a5                  /* w1 = (mp_word)b * a1               */
        sw      t0, 0(a3)               /* c[0] = ACCUM(w0)                   */
        lwu     a4, 8(a0)               /* a0 = a[2]                          */
        addiu   a0, a0, 8               /* a += 2                             */
        mflo    t1                      /* w1 = pending product               */
        daddu   t1, t1, t2              /* w1 += cy                           */
        dsrl32  t2, t1, 0               /* cy = CARRYOUT(w1)                  */
        dmultu  a2, a4                  /* w0 = (mp_word)b * a0               */
        sw      t1, 4(a3)               /* c[1] = ACCUM(w1)                   */
        addiu   a3, a3, 8               /* c += 2                             */
        sltiu   t3, a1, 2               /* t3 = (a_len < 2)                   */
        beq     t3, zero, .N.pair       /* at least two more: go round again  */
        lwu     a5, 4(a0)               /* (delay slot) a1 = a[1]             */

        /* ---- one or two digits remain; a product for a0 is pending ------- */
.N.tail:
        mflo    t0                      /* w0 = pending product               */
        beq     a1, zero, .N.last       /* no second digit                    */
        daddu   t0, t0, t2              /* (delay slot) w0 += cy              */

        dmultu  a2, a5                  /* w1 = (mp_word)b * a1               */
        dsrl32  t2, t0, 0               /* cy = CARRYOUT(w0)                  */
        sw      t0, 0(a3)               /* c[0] = ACCUM(w0)                   */
        mflo    t1                      /* w1 = product                       */
        daddu   t1, t1, t2              /* w1 += cy                           */
        sw      t1, 4(a3)               /* c[1] = ACCUM(w1)                   */
        dsrl32  t2, t1, 0               /* cy = CARRYOUT(w1)                  */
        b       .N.carry
        addiu   a3, a3, 4               /* (delay slot) c += 1                */

.N.last:
        sw      t0, 0(a3)               /* c[0] = ACCUM(w0)                   */
        b       .N.carry
        dsrl32  t2, t0, 0               /* (delay slot) cy = CARRYOUT(w0)     */

        /* ---- a_len was exactly 1 ----------------------------------------- */
.N.single:
        mflo    t0                      /* w0 = b * a[0]                      */
        sw      t0, 0(a3)               /* c[0] = ACCUM(w0)                   */
        dsrl32  t2, t0, 0               /* cy = CARRYOUT(w0)                  */

.N.carry:
        jr      ra
        sw      t2, 4(a3)               /* (delay slot) c[1] = cy             */

.N.ret:
        jr      ra
        nop

        .end    s_mpv_mul_d
|
404 |
|
405 |
|
/*
 * void s_mpv_sqr_add_prop(const mp_digit *a, mp_size a_len, mp_digit *sqrs)
 *
 * For every digit a[i], add its square (a 64-bit value) into the digit
 * pair sqrs[2i], sqrs[2i+1], carrying from one pair into the next.  A
 * carry left over after the last pair is added into sqrs[2*a_len] and
 * rippled upward until it becomes zero; that ripple has no length check,
 * so sqrs must be long enough to absorb it.  Returns at once, touching
 * nothing, when a_len is zero.
 *
 * Digits are 32 bits wide (lwu/sw, 4-byte stride).
 *
 * Register use:
 *   a0      a        (post-incremented one digit at a time)
 *   a1      a_len    (squares still to be started)
 *   a2      sqrs     (advanced two digits per square)
 *   a3      current digit a[i]
 *   a4      square of the digit, then the 64-bit sum for the pair
 *   a5, a6  the digit pair from sqrs, combined into one 64-bit value
 *   a7      carry out of (pair + square)
 *   t0      carry into / out of the pair (0 or 1)
 *   LO      square produced by the most recent dmultu
 *
 * The code relies on the file-level ".set noreorder": the instruction that
 * follows every branch or jump is its delay slot and always executes.  The
 * square of the next digit is started before the current pair is stored.
 *
 * Changes from the earlier version of this routine:
 *   - a_len == 0 now returns immediately instead of wrapping the counter
 *     and running the loop over memory that does not belong to a / sqrs.
 *   - the final carry ripple now stores the digit it computed and takes
 *     the carry from bit 32 of the sum.  It used to test for a 64-bit
 *     overflow (which a 32-bit digit plus one can never produce) and then
 *     overwrite the next digit with a stale copy of the previous high word.
 */
        .ent    s_mpv_sqr_add_prop
        .globl  s_mpv_sqr_add_prop

s_mpv_sqr_add_prop:
        beq     a1, zero, .P.ret        /* a_len == 0: nothing to do          */
        move    t0, zero                /* (delay slot) carry-in = 0          */
        move    a7, zero
        lwu     a3, 0(a0)               /* a_i = a[0]                         */
        addiu   a1, a1, -1              /* --a_len                            */
        dmultu  a3, a3                  /* start a_i * a_i                    */
        beq     a1, zero, .P.last       /* that was the only square           */
        addiu   a0, a0, 4               /* (delay slot) ++a                   */

        /* ---- one pair of sqrs digits per pass; next square overlapped ---- */
.P.loop:
        lwu     a5, 0(a2)               /* low digit of the pair              */
        lwu     a6, 4(a2)               /* high digit of the pair             */
        addiu   a2, a2, 8               /* sqrs += 2                          */
        dsll32  a6, a6, 0
        daddu   a5, a5, a6              /* a5 = 64-bit value of the pair      */
        lwu     a3, 0(a0)               /* next digit of a                    */
        addiu   a0, a0, 4               /* ++a                                */
        mflo    a4                      /* a4 = square of the current digit   */
        daddu   a6, a5, a4              /* pair + square                      */
        sltu    a7, a6, a5              /* a7 = 1 if that addition wrapped    */
        dmultu  a3, a3                  /* start the next square              */
        daddu   a4, a6, t0              /* + carry-in                         */
        sltu    t0, a4, a6              /* 1 if adding the carry wrapped      */
        add     t0, t0, a7              /* carry-out of this pair             */
        sw      a4, -8(a2)              /* store the low digit                */
        addiu   a1, a1, -1              /* --a_len                            */
        dsrl32  a4, a4, 0
        bne     a1, zero, .P.loop       /* more squares to start              */
        sw      a4, -4(a2)              /* (delay slot) store the high digit  */

        /* ---- last pair: its square is already in LO ---------------------- */
.P.last:
        lwu     a5, 0(a2)
        lwu     a6, 4(a2)
        addiu   a2, a2, 8               /* sqrs += 2                          */
        dsll32  a6, a6, 0
        daddu   a5, a5, a6              /* a5 = 64-bit value of the pair      */
        mflo    a4                      /* a4 = square of the last digit      */
        daddu   a6, a5, a4              /* pair + square                      */
        sltu    a7, a6, a5              /* a7 = 1 if that addition wrapped    */
        daddu   a4, a6, t0              /* + carry-in                         */
        sltu    t0, a4, a6              /* 1 if adding the carry wrapped      */
        add     t0, t0, a7              /* final carry                        */
        sw      a4, -8(a2)              /* store the low digit                */
        dsrl32  a4, a4, 0
        beq     t0, zero, .P.ret        /* no carry left: done                */
        sw      a4, -4(a2)              /* (delay slot) store the high digit  */

        /* ---- while (carry) ripple it into the higher digits of sqrs ------ */
.P.ripple:
        lwu     a5, 0(a2)               /* w = (mp_word)*sqrs + carry         */
        daddu   a5, a5, t0
        sw      a5, 0(a2)               /* *sqrs = ACCUM(w)                   */
        dsrl32  t0, a5, 0               /* carry = CARRYOUT(w)                */
        bne     t0, zero, .P.ripple
        addiu   a2, a2, 4               /* (delay slot) sqrs++                */

.P.ret:
        jr      ra
        nop

        .end    s_mpv_sqr_add_prop