|
1 # |
|
2 # This Source Code Form is subject to the terms of the Mozilla Public |
|
3 # License, v. 2.0. If a copy of the MPL was not distributed with this |
|
4 # file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
5 |
|
.data
        .p2align 2
#
# is_sse caches which implementation the arithmetic entry points in this
# file should run:
#   negative  not probed yet; the entry point calls _s_mpi_is_sse2 and
#             stores the value it returns here
#   zero      use the plain x86 integer code
#   positive  use the SSE2 code
#
        .type   is_sse,@object
is_sse:
        .long   -1
        .size   is_sse,.-is_sse
16 |
|
#
# GET/PUT wrap every access to static data so that the addressing mode
# can be switched in one place.
#
# Two flavours exist.  The position-independent one (addressing through
# @GOTOFF with the GOT base in %ebx) is currently disabled and kept only
# for reference; the active definitions use absolute addressing.
# (solaris uses mpi_i86pc.s and windows uses mpi_x86_asm.c, so this file
# seems to be used on linux only.)
#
#.ifndef NO_PIC
#.macro GET var,reg
#    movl   \var@GOTOFF(%ebx),\reg
#.endm
#.macro PUT reg,var
#    movl   \reg,\var@GOTOFF(%ebx)
#.endm
#.else

# GET var,reg : reg = var (32-bit load)
        .macro  GET var,reg
        movl    \var,\reg
        .endm

# PUT reg,var : var = reg (32-bit store)
        .macro  PUT reg,var
        movl    \reg,\var
        .endm
#.endif
38 |
|
.text

#-----------------------------------------------------------------------
# _s_mpv_mul_d: c = a * b  (multi-word a times single word b)
#
# Words are processed least significant first.  The a_len low result
# words are stored to c[0 .. a_len-1] and the final carry word to
# c[a_len]; the previous contents of c are never read.  With a_len == 0
# only c[0] = 0 is stored.
#
# Stack arguments, relative to %ebp once a frame has been set up:
#   ebp +  0: caller's ebp
#   ebp +  4: return address
#   ebp +  8: a       pointer to the source words
#   ebp + 12: a_len   number of words at a
#   ebp + 16: b       single-word multiplier
#   ebp + 20: c       pointer to the destination words
# Saved registers:
#   ebp -  4: caller's edi
#   ebp -  8: caller's esi
#   ebp - 12: caller's ebx (x86 path only)
#
# Register roles (x86 path):
#   ebx: carry     ecx: words remaining     esi: a ptr     edi: c ptr
# Register roles (SSE2 path):
#   mm1: b         mm2: carry / running sum
#   ecx: words remaining     esi: a ptr     edi: c ptr
#
# Dispatch: is_sse == 0 selects the x86 path, is_sse > 0 the SSE2 path;
# a negative value triggers one call to _s_mpi_is_sse2 whose result is
# stored back into is_sse.
#
# Clobbers eax, ecx, edx and flags; the SSE2 path also uses mm0-mm2 and
# ends with emms.  %eax carries no meaningful value on return.
#-----------------------------------------------------------------------
        .globl  _s_mpv_mul_d
        .type   _s_mpv_mul_d,@function
_s_mpv_mul_d:
        GET     is_sse,%eax
        test    %eax,%eax
        jz      _s_mpv_mul_d_x86
        jg      _s_mpv_mul_d_sse2
        # Not probed yet.  The i386 System V ABI expects %esp + 4 to be
        # 16-byte aligned at function entry; assuming our caller honoured
        # that, dropping 12 bytes gives the helper the same alignment.
        sub     $12,%esp
        call    _s_mpi_is_sse2
        add     $12,%esp
        PUT     %eax,is_sse
        test    %eax,%eax
        jg      _s_mpv_mul_d_sse2
        # fall through: SSE2 is not available

_s_mpv_mul_d_x86:
        push    %ebp
        mov     %esp,%ebp
        push    %edi
        push    %esi
        push    %ebx
        xor     %ebx,%ebx               # carry = 0
        mov     12(%ebp),%ecx           # ecx = a_len
        mov     20(%ebp),%edi           # edi = c
        test    %ecx,%ecx
        jz      .Lmul_d_x86_done        # a_len == 0: just store the zero carry
        mov     8(%ebp),%esi            # esi = a
        cld                             # lodsl/stosl must walk upward
.Lmul_d_x86_loop:
        lodsl                           # eax = *a++
        mull    16(%ebp)                # edx:eax = a_i * b
        add     %ebx,%eax               # low word += incoming carry
        adc     $0,%edx
        mov     %edx,%ebx               # high word becomes the next carry
        stosl                           # *c++ = low word
        dec     %ecx
        jnz     .Lmul_d_x86_loop
.Lmul_d_x86_done:
        mov     %ebx,(%edi)             # c[a_len] = final carry
        pop     %ebx
        pop     %esi
        pop     %edi
        leave
        ret
        nop

_s_mpv_mul_d_sse2:
        push    %ebp
        mov     %esp,%ebp
        push    %edi
        push    %esi
        pxor    %mm2,%mm2               # carry = 0
        mov     12(%ebp),%ecx           # ecx = a_len
        movd    16(%ebp),%mm1           # mm1 = b, zero-extended to 64 bits
        mov     20(%ebp),%edi           # edi = c
        test    %ecx,%ecx
        jz      .Lmul_d_sse2_done       # a_len == 0: just store the zero carry
        mov     8(%ebp),%esi            # esi = a
.Lmul_d_sse2_loop:
        movd    (%esi),%mm0             # mm0 = *a++
        add     $4,%esi
        pmuludq %mm1,%mm0               # mm0 = a_i * b as a 64-bit product
        paddq   %mm0,%mm2               # mm2 = product + incoming carry
        movd    %mm2,(%edi)             # *c++ = low 32 bits
        add     $4,%edi
        psrlq   $32,%mm2                # high 32 bits become the next carry
        dec     %ecx
        jnz     .Lmul_d_sse2_loop
.Lmul_d_sse2_done:
        movd    %mm2,(%edi)             # c[a_len] = final carry
        emms                            # leave the x87/MMX state clean
        pop     %esi
        pop     %edi
        leave
        ret
        nop
        .size   _s_mpv_mul_d,.-_s_mpv_mul_d
140 |
|
#-----------------------------------------------------------------------
# _s_mpv_mul_d_add: c += a * b over a_len words, carry written out
#
# For each i in 0 .. a_len-1 (least significant word first):
#       c[i] = low32(a[i] * b + carry + c[i])
# and afterwards c[a_len] is OVERWRITTEN with the final carry (it is not
# added to the previous contents of c[a_len]).
#
# Stack arguments, relative to %ebp once a frame has been set up:
#   ebp +  0: caller's ebp
#   ebp +  4: return address
#   ebp +  8: a       pointer to the source words
#   ebp + 12: a_len   number of words at a
#   ebp + 16: b       single-word multiplier
#   ebp + 20: c       pointer to the accumulator words
# Saved registers:
#   ebp -  4: caller's edi
#   ebp -  8: caller's esi
#   ebp - 12: caller's ebx (x86 path only)
#
# Register roles (x86 path):
#   ebx: carry     ecx: words remaining     esi: a ptr     edi: c ptr
# Register roles (SSE2 path):
#   mm1: b         mm2: carry / running sum
#   ecx: words remaining     esi: a ptr     edi: c ptr
#
# Dispatch: is_sse == 0 selects the x86 path, is_sse > 0 the SSE2 path;
# a negative value triggers one call to _s_mpi_is_sse2 whose result is
# stored back into is_sse.
#
# Clobbers eax, ecx, edx and flags; the SSE2 path also uses mm0-mm2 and
# ends with emms.  %eax carries no meaningful value on return.
#-----------------------------------------------------------------------
        .globl  _s_mpv_mul_d_add
        .type   _s_mpv_mul_d_add,@function
_s_mpv_mul_d_add:
        GET     is_sse,%eax
        test    %eax,%eax
        jz      _s_mpv_mul_d_add_x86
        jg      _s_mpv_mul_d_add_sse2
        # Not probed yet.  The i386 System V ABI expects %esp + 4 to be
        # 16-byte aligned at function entry; assuming our caller honoured
        # that, dropping 12 bytes gives the helper the same alignment.
        sub     $12,%esp
        call    _s_mpi_is_sse2
        add     $12,%esp
        PUT     %eax,is_sse
        test    %eax,%eax
        jg      _s_mpv_mul_d_add_sse2
        # fall through: SSE2 is not available

_s_mpv_mul_d_add_x86:
        push    %ebp
        mov     %esp,%ebp
        push    %edi
        push    %esi
        push    %ebx
        xor     %ebx,%ebx               # carry = 0
        mov     12(%ebp),%ecx           # ecx = a_len
        mov     20(%ebp),%edi           # edi = c
        test    %ecx,%ecx
        jz      .Lmul_d_add_x86_done    # a_len == 0: just store the zero carry
        mov     8(%ebp),%esi            # esi = a
        cld                             # lodsl/stosl must walk upward
.Lmul_d_add_x86_loop:
        lodsl                           # eax = *a++
        mull    16(%ebp)                # edx:eax = a_i * b
        add     %ebx,%eax               # low word += incoming carry
        adc     $0,%edx
        add     (%edi),%eax             # low word += current c_i
        adc     $0,%edx
        mov     %edx,%ebx               # high word becomes the next carry
        stosl                           # *c++ = low word
        dec     %ecx
        jnz     .Lmul_d_add_x86_loop
.Lmul_d_add_x86_done:
        mov     %ebx,(%edi)             # c[a_len] = final carry
        pop     %ebx
        pop     %esi
        pop     %edi
        leave
        ret
        nop

_s_mpv_mul_d_add_sse2:
        push    %ebp
        mov     %esp,%ebp
        push    %edi
        push    %esi
        pxor    %mm2,%mm2               # carry = 0
        mov     12(%ebp),%ecx           # ecx = a_len
        movd    16(%ebp),%mm1           # mm1 = b, zero-extended to 64 bits
        mov     20(%ebp),%edi           # edi = c
        test    %ecx,%ecx
        jz      .Lmul_d_add_sse2_done   # a_len == 0: just store the zero carry
        mov     8(%ebp),%esi            # esi = a
.Lmul_d_add_sse2_loop:
        movd    (%esi),%mm0             # mm0 = *a++
        add     $4,%esi
        pmuludq %mm1,%mm0               # mm0 = a_i * b as a 64-bit product
        paddq   %mm0,%mm2               # mm2 = product + incoming carry
        movd    (%edi),%mm0             # mm0 = current c_i
        paddq   %mm0,%mm2               # mm2 += c_i (still fits in 64 bits)
        movd    %mm2,(%edi)             # *c++ = low 32 bits
        add     $4,%edi
        psrlq   $32,%mm2                # high 32 bits become the next carry
        dec     %ecx
        jnz     .Lmul_d_add_sse2_loop
.Lmul_d_add_sse2_done:
        movd    %mm2,(%edi)             # c[a_len] = final carry
        emms                            # leave the x87/MMX state clean
        pop     %esi
        pop     %edi
        leave
        ret
        nop
        .size   _s_mpv_mul_d_add,.-_s_mpv_mul_d_add
244 |
|
#-----------------------------------------------------------------------
# _s_mpv_mul_d_add_prop: c += a * b, carry propagated into c
#
# For each i in 0 .. a_len-1 (least significant word first):
#       c[i] = low32(a[i] * b + carry + c[i])
# A non-zero final carry is then ADDED to c[a_len], and while that
# addition carries out, 1 is added to each following word.  The ripple
# loop has no length check: the caller must provide enough words at c
# for the carry to be absorbed.
#
# Stack arguments, relative to %ebp once a frame has been set up:
#   ebp +  0: caller's ebp
#   ebp +  4: return address
#   ebp +  8: a       pointer to the source words
#   ebp + 12: a_len   number of words at a
#   ebp + 16: b       single-word multiplier
#   ebp + 20: c       pointer to the accumulator words
# Saved registers (both paths):
#   ebp -  4: caller's edi
#   ebp -  8: caller's esi
#   ebp - 12: caller's ebx
#
# Register roles (x86 path):
#   ebx: carry     ecx: words remaining     esi: a ptr     edi: c ptr
# Register roles (SSE2 path):
#   mm1: b         mm2: carry / running sum     mm3: current c word
#   ebx: final carry during propagation
#   ecx: words remaining     esi: a ptr     edi: c ptr
#
# Dispatch: is_sse == 0 selects the x86 path, is_sse > 0 the SSE2 path;
# a negative value triggers one call to _s_mpi_is_sse2 whose result is
# stored back into is_sse.
#
# Clobbers eax, ecx, edx and flags; the SSE2 path also uses mm0-mm3 and
# ends with emms.  %eax carries no meaningful value on return.
#-----------------------------------------------------------------------
        .globl  _s_mpv_mul_d_add_prop
        .type   _s_mpv_mul_d_add_prop,@function
_s_mpv_mul_d_add_prop:
        GET     is_sse,%eax
        test    %eax,%eax
        jz      _s_mpv_mul_d_add_prop_x86
        jg      _s_mpv_mul_d_add_prop_sse2
        # Not probed yet.  The i386 System V ABI expects %esp + 4 to be
        # 16-byte aligned at function entry; assuming our caller honoured
        # that, dropping 12 bytes gives the helper the same alignment.
        sub     $12,%esp
        call    _s_mpi_is_sse2
        add     $12,%esp
        PUT     %eax,is_sse
        test    %eax,%eax
        jg      _s_mpv_mul_d_add_prop_sse2
        # fall through: SSE2 is not available

_s_mpv_mul_d_add_prop_x86:
        push    %ebp
        mov     %esp,%ebp
        push    %edi
        push    %esi
        push    %ebx
        xor     %ebx,%ebx               # carry = 0
        mov     12(%ebp),%ecx           # ecx = a_len
        mov     20(%ebp),%edi           # edi = c
        test    %ecx,%ecx
        jz      .Lmul_d_add_prop_x86_carry  # a_len == 0
        cld                             # lodsl/stosl must walk upward
        mov     8(%ebp),%esi            # esi = a
.Lmul_d_add_prop_x86_loop:
        lodsl                           # eax = *a++
        mull    16(%ebp)                # edx:eax = a_i * b
        add     %ebx,%eax               # low word += incoming carry
        adc     $0,%edx
        add     (%edi),%eax             # low word += current c_i
        adc     $0,%edx
        mov     %edx,%ebx               # high word becomes the next carry
        stosl                           # *c++ = low word
        dec     %ecx
        jnz     .Lmul_d_add_prop_x86_loop
.Lmul_d_add_prop_x86_carry:
        test    %ebx,%ebx               # nothing to propagate?
        jz      .Lmul_d_add_prop_x86_ret
        mov     (%edi),%eax
        add     %ebx,%eax               # c[a_len] += carry
        stosl                           # stosl leaves CF from the add intact
        jnc     .Lmul_d_add_prop_x86_ret
.Lmul_d_add_prop_x86_ripple:
        mov     (%edi),%eax             # CF is set on every entry here,
        adc     $0,%eax                 # so this adds exactly 1
        stosl
        jc      .Lmul_d_add_prop_x86_ripple
.Lmul_d_add_prop_x86_ret:
        pop     %ebx
        pop     %esi
        pop     %edi
        leave
        ret
        nop

_s_mpv_mul_d_add_prop_sse2:
        push    %ebp
        mov     %esp,%ebp
        push    %edi
        push    %esi
        push    %ebx
        pxor    %mm2,%mm2               # carry = 0
        mov     12(%ebp),%ecx           # ecx = a_len
        movd    16(%ebp),%mm1           # mm1 = b, zero-extended to 64 bits
        mov     20(%ebp),%edi           # edi = c
        test    %ecx,%ecx
        jz      .Lmul_d_add_prop_sse2_carry # a_len == 0
        mov     8(%ebp),%esi            # esi = a
        cld                             # stosl in the carry tail walks upward
.Lmul_d_add_prop_sse2_loop:
        movd    (%esi),%mm0             # mm0 = *a++
        movd    (%edi),%mm3             # mm3 = current c_i
        add     $4,%esi
        pmuludq %mm1,%mm0               # mm0 = a_i * b as a 64-bit product
        paddq   %mm0,%mm2               # mm2 = product + incoming carry
        paddq   %mm3,%mm2               # mm2 += c_i (still fits in 64 bits)
        movd    %mm2,(%edi)             # *c++ = low 32 bits
        add     $4,%edi
        psrlq   $32,%mm2                # high 32 bits become the next carry
        dec     %ecx
        jnz     .Lmul_d_add_prop_sse2_loop
.Lmul_d_add_prop_sse2_carry:
        movd    %mm2,%ebx               # ebx = final carry
        test    %ebx,%ebx               # nothing to propagate?
        jz      .Lmul_d_add_prop_sse2_ret
        mov     (%edi),%eax
        add     %ebx,%eax               # c[a_len] += carry
        stosl                           # stosl leaves CF from the add intact
        jnc     .Lmul_d_add_prop_sse2_ret
.Lmul_d_add_prop_sse2_ripple:
        mov     (%edi),%eax             # CF is set on every entry here,
        adc     $0,%eax                 # so this adds exactly 1
        stosl
        jc      .Lmul_d_add_prop_sse2_ripple
.Lmul_d_add_prop_sse2_ret:
        emms                            # leave the x87/MMX state clean
        pop     %ebx
        pop     %esi
        pop     %edi
        leave
        ret
        nop
        .size   _s_mpv_mul_d_add_prop,.-_s_mpv_mul_d_add_prop
366 |
|
367 |
|
#-----------------------------------------------------------------------
# _s_mpv_sqr_add_prop: add the square of every word of pa into ps
#
# For each i in 0 .. a_len-1 (least significant word first) the 64-bit
# value pa[i] * pa[i] plus the running carry is added into the word pair
# ps[2i], ps[2i+1].  A non-zero final carry is then added to the next
# word of ps, and while that addition carries out, 1 is added to each
# following word.  The ripple loop has no length check: the caller must
# provide enough words at ps for the carry to be absorbed.
#
# Stack arguments, relative to %ebp once a frame has been set up:
#   ebp +  0: caller's ebp
#   ebp +  4: return address
#   ebp +  8: pa      pointer to the source words
#   ebp + 12: a_len   number of words at pa
#   ebp + 16: ps      pointer to the accumulator words
# Saved registers (both paths):
#   ebp -  4: caller's edi
#   ebp -  8: caller's esi
#   ebp - 12: caller's ebx
#
# Register roles (x86 path):
#   ebx: carry / scratch     ecx: words remaining
#   esi: pa ptr              edi: ps ptr
# Register roles (SSE2 path):
#   mm0: square   mm2: carry / running sum   mm3: current ps word
#   ebx: final carry during propagation
#   ecx: words remaining     esi: pa ptr     edi: ps ptr
#
# Dispatch: is_sse == 0 selects the x86 path, is_sse > 0 the SSE2 path;
# a negative value triggers one call to _s_mpi_is_sse2 whose result is
# stored back into is_sse.
#
# Clobbers eax, ecx, edx and flags; the SSE2 path also uses mm0, mm2,
# mm3 and ends with emms.  %eax carries no meaningful value on return.
#-----------------------------------------------------------------------
        .globl  _s_mpv_sqr_add_prop
        .type   _s_mpv_sqr_add_prop,@function
_s_mpv_sqr_add_prop:
        GET     is_sse,%eax
        test    %eax,%eax
        jz      _s_mpv_sqr_add_prop_x86
        jg      _s_mpv_sqr_add_prop_sse2
        # Not probed yet.  The i386 System V ABI expects %esp + 4 to be
        # 16-byte aligned at function entry; assuming our caller honoured
        # that, dropping 12 bytes gives the helper the same alignment.
        sub     $12,%esp
        call    _s_mpi_is_sse2
        add     $12,%esp
        PUT     %eax,is_sse
        test    %eax,%eax
        jg      _s_mpv_sqr_add_prop_sse2
        # fall through: SSE2 is not available

_s_mpv_sqr_add_prop_x86:
        push    %ebp
        mov     %esp,%ebp
        push    %edi
        push    %esi
        push    %ebx
        xor     %ebx,%ebx               # carry = 0
        mov     12(%ebp),%ecx           # ecx = a_len
        mov     16(%ebp),%edi           # edi = ps
        test    %ecx,%ecx
        jz      .Lsqr_add_prop_x86_carry    # a_len == 0
        cld                             # lodsl/stosl must walk upward
        mov     8(%ebp),%esi            # esi = pa
.Lsqr_add_prop_x86_loop:
        lodsl                           # eax = *pa++
        mull    %eax                    # edx:eax = a_i * a_i
        add     %ebx,%eax               # low word += incoming carry
        adc     $0,%edx
        add     (%edi),%eax             # low word += ps[0]; CF feeds the high word
        mov     4(%edi),%ebx            # ebx = ps[1] (mov keeps CF)
        stosl                           # ps[0] = low word (stosl keeps CF)
        adc     %ebx,%edx               # high word += ps[1] + CF
        movl    $0,%ebx                 # must stay a mov: xor would destroy CF
        mov     %edx,%eax
        adc     $0,%ebx                 # ebx = carry out of the high word
        stosl                           # ps[1] = high word
        dec     %ecx
        jnz     .Lsqr_add_prop_x86_loop
.Lsqr_add_prop_x86_carry:
        test    %ebx,%ebx               # nothing to propagate?
        jz      .Lsqr_add_prop_x86_ret
        mov     (%edi),%eax
        add     %ebx,%eax               # next ps word += carry
        stosl                           # stosl leaves CF from the add intact
        jnc     .Lsqr_add_prop_x86_ret
.Lsqr_add_prop_x86_ripple:
        mov     (%edi),%eax             # CF is set on every entry here,
        adc     $0,%eax                 # so this adds exactly 1
        stosl
        jc      .Lsqr_add_prop_x86_ripple
.Lsqr_add_prop_x86_ret:
        pop     %ebx
        pop     %esi
        pop     %edi
        leave
        ret
        nop

_s_mpv_sqr_add_prop_sse2:
        push    %ebp
        mov     %esp,%ebp
        push    %edi
        push    %esi
        push    %ebx
        pxor    %mm2,%mm2               # carry = 0
        mov     12(%ebp),%ecx           # ecx = a_len
        mov     16(%ebp),%edi           # edi = ps
        test    %ecx,%ecx
        jz      .Lsqr_add_prop_sse2_carry   # a_len == 0
        mov     8(%ebp),%esi            # esi = pa
        cld                             # stosl in the carry tail walks upward
.Lsqr_add_prop_sse2_loop:
        movd    (%esi),%mm0             # mm0 = *pa++
        movd    (%edi),%mm3             # mm3 = ps[0]
        add     $4,%esi
        pmuludq %mm0,%mm0               # mm0 = a_i * a_i as a 64-bit product
        paddq   %mm0,%mm2               # mm2 = square + incoming carry
        paddq   %mm3,%mm2               # mm2 += ps[0]
        movd    4(%edi),%mm3            # mm3 = ps[1]
        movd    %mm2,(%edi)             # ps[0] = low 32 bits
        psrlq   $32,%mm2
        paddq   %mm3,%mm2               # high part += ps[1]
        movd    %mm2,4(%edi)            # ps[1] = low 32 bits of that sum
        psrlq   $32,%mm2                # what is left is the next carry
        add     $8,%edi
        dec     %ecx
        jnz     .Lsqr_add_prop_sse2_loop
.Lsqr_add_prop_sse2_carry:
        movd    %mm2,%ebx               # ebx = final carry
        test    %ebx,%ebx               # nothing to propagate?
        jz      .Lsqr_add_prop_sse2_ret
        mov     (%edi),%eax
        add     %ebx,%eax               # next ps word += carry
        stosl                           # stosl leaves CF from the add intact
        jnc     .Lsqr_add_prop_sse2_ret
.Lsqr_add_prop_sse2_ripple:
        mov     (%edi),%eax             # CF is set on every entry here,
        adc     $0,%eax                 # so this adds exactly 1
        stosl
        jc      .Lsqr_add_prop_sse2_ripple
.Lsqr_add_prop_sse2_ret:
        emms                            # leave the x87/MMX state clean
        pop     %ebx
        pop     %esi
        pop     %edi
        leave
        ret
        nop
        .size   _s_mpv_sqr_add_prop,.-_s_mpv_sqr_add_prop
498 |
|
#-----------------------------------------------------------------------
# Divide 64-bit (Nhi,Nlo) by 32-bit divisor, which must be normalized
# so its high bit is 1.  This code is from NSPR.
#
# mp_err _s_mpv_div_2dx1d(mp_digit Nhi, mp_digit Nlo, mp_digit divisor,
#                         mp_digit *qp, mp_digit *rp)
#
# Stores the quotient to *qp and the remainder to *rp, and always
# returns zero in %eax.  No check is made before the division: the
# hardware div raises a divide error if the divisor is zero or if the
# quotient does not fit in 32 bits (i.e. unless Nhi < divisor).
#
# Stack arguments (no frame is set up, offsets are from %esp):
#   esp +  0: return address
#   esp +  4: Nhi argument
#   esp +  8: Nlo argument
#   esp + 12: divisor argument
#   esp + 16: qp argument
#   esp + 20: rp argument
#
# Register roles:
#   edx:eax: dividend on input to div; eax = quotient, edx = remainder
#   ecx:     scratch pointer (qp, then rp)
#
# Clobbers eax, ecx, edx and flags; no callee-saved register is touched.
#-----------------------------------------------------------------------
        .globl  _s_mpv_div_2dx1d
        .type   _s_mpv_div_2dx1d,@function
_s_mpv_div_2dx1d:
        mov     4(%esp),%edx            # edx = Nhi
        mov     8(%esp),%eax            # eax = Nlo
        divl    12(%esp)                # eax = quotient, edx = remainder
        mov     16(%esp),%ecx           # ecx = qp
        mov     %eax,(%ecx)             # *qp = quotient
        mov     20(%esp),%ecx           # ecx = rp
        mov     %edx,(%ecx)             # *rp = remainder
        xor     %eax,%eax               # return zero
        ret
        nop
        .size   _s_mpv_div_2dx1d,.-_s_mpv_div_2dx1d
538 |