#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

#
# Multi-precision integer primitives for 32-bit x86.
#
# Syntax : GNU as, AT&T operand order (source first, destination last).
# Calling: every routine reads its arguments from the stack and returns
#          with a plain `ret`, so the caller removes the arguments.
#          %ebx, %esi, %edi and %ebp are preserved; %eax, %ecx, %edx and
#          the flags are clobbered.  The SSE2 paths also use %mm0-%mm3 and
#          execute `emms` before returning.
# Digits : all quantities are 32-bit words ("digits").
#

.data
.align 4
#
# Cached CPU capability, shared by every dispatcher in this file:
#   is_sse <  0 : not probed yet; the dispatcher calls _s_mpi_is_sse2 and
#                 stores the result here
#   is_sse == 0 : use the plain x86 code paths
#   is_sse >  0 : use the SSE2 code paths
#
.type is_sse,@object
.size is_sse,4
is_sse: .long -1

#
# Accessors for is_sse.  There are two flavours: a PIC one that addresses
# the variable relative to a GOT pointer held in %ebx, and a non-PIC one
# that uses the absolute address.  The PIC flavour (and the NO_PIC
# conditional that selected between them) is currently commented out, so
# the absolute-address flavour is always used.
#
#.ifndef NO_PIC
#.macro GET var,reg
#    movl   \var@GOTOFF(%ebx),\reg
#.endm
#.macro PUT reg,var
#    movl   \reg,\var@GOTOFF(%ebx)
#.endm
#.else
.macro GET var,reg
    movl   \var,\reg
.endm
.macro PUT reg,var
    movl   \reg,\var
.endm
#.endif

#
# DISPATCH plain,simd
#   Entry-point prologue shared by the multiply/square routines.  Jumps to
#   \simd when SSE2 is usable and to \plain otherwise, probing the CPU via
#   _s_mpi_is_sse2 the first time through and caching the answer in is_sse.
#   If the probe reports "no SSE2" the macro FALLS THROUGH, so the \plain
#   implementation must be placed immediately after the macro invocation.
#   Must be expanded as the very first thing at function entry (before any
#   push), because the padding below is computed for that stack state.
#   Clobbers: %eax, flags, plus whatever _s_mpi_is_sse2 clobbers.
#
.macro DISPATCH plain,simd
    GET     is_sse,%eax
    test    %eax,%eax
    jz      \plain                  # cached: no SSE2
    jg      \simd                   # cached: SSE2 available
    # Not probed yet.  Only the return address is on the stack here, so
    # 12 bytes of padding restore the 16-byte alignment at the call site
    # that current i386 System V toolchains assume for compiled callees.
    sub     $12,%esp
    call    _s_mpi_is_sse2
    add     $12,%esp
    PUT     %eax,is_sse             # remember the answer for next time
    test    %eax,%eax
    jg      \simd
    # result <= 0: fall through into \plain
.endm

#
# PROPAGATE_CARRY
#   Adds the carry word in %ebx into the digit at (%edi) and keeps rippling
#   a carry of 1 into the following digits for as long as each addition
#   overflows.  Does nothing when %ebx is zero.
#   In : %ebx = carry word, %edi = address of the first digit to update,
#        direction flag clear.
#   Out: %edi advanced past the last digit written.
#   Clobbers: %eax, flags.
#   The loop has no length bound: it relies on the caller's buffer being
#   large enough for the final result.
#
.macro PROPAGATE_CARRY
    test    %ebx,%ebx               # anything to add?
    jz      99f
    mov     (%edi),%eax
    add     %ebx,%eax               # first digit absorbs the whole carry word
    stosl                           # store and advance; leaves CF intact
    jnc     99f
98:
    mov     (%edi),%eax             # mov/stosl do not touch CF, so the
    adc     $0,%eax                 # carry from the previous digit is still
    stosl                           # live when adc consumes it
    jc      98b
99:
.endm

.text

#
# _s_mpv_mul_d(a, a_len, b, c)
#
#   c[0 .. a_len-1] = low words of a[i] * b + running carry
#   c[a_len]        = final carry   (written even when a_len == 0)
#
# Stack frame (both implementations):
#   ebp - 12: caller's ebx   (x86 path only)
#   ebp -  8: caller's esi
#   ebp -  4: caller's edi
#   ebp +  0: caller's ebp
#   ebp +  4: return address
#   ebp +  8: a      argument
#   ebp + 12: a_len  argument
#   ebp + 16: b      argument
#   ebp + 20: c      argument
# Registers:
#   ebx / mm2: carry
#   ecx      : digits remaining
#   esi      : a ptr
#   edi      : c ptr
#
.globl  _s_mpv_mul_d
.type   _s_mpv_mul_d,@function
_s_mpv_mul_d:
    DISPATCH _s_mpv_mul_d_x86,_s_mpv_mul_d_sse2
_s_mpv_mul_d_x86:
    push    %ebp
    mov     %esp,%ebp
    push    %edi
    push    %esi
    push    %ebx
    xor     %ebx,%ebx               # carry = 0
    mov     12(%ebp),%ecx           # ecx = a_len
    mov     20(%ebp),%edi           # edi = c
    test    %ecx,%ecx
    jz      .Lmul_d_x86_done        # a_len == 0: just store the zero carry
    mov     8(%ebp),%esi            # esi = a
    cld                             # lodsl/stosl must walk upwards
.Lmul_d_x86_loop:
    lodsl                           # eax = *a++
    mull    16(%ebp)                # edx:eax = a_i * b
    add     %ebx,%eax               # add the incoming carry to the product;
    adc     $0,%edx                 # cannot overflow 64 bits
    mov     %edx,%ebx               # high half becomes the next carry
    stosl                           # *c++ = low half
    dec     %ecx
    jnz     .Lmul_d_x86_loop
.Lmul_d_x86_done:
    mov     %ebx,(%edi)             # *c = carry
    pop     %ebx
    pop     %esi
    pop     %edi
    leave
    ret

_s_mpv_mul_d_sse2:
    push    %ebp
    mov     %esp,%ebp
    push    %edi
    push    %esi
    pxor    %mm2,%mm2               # carry = 0
    mov     12(%ebp),%ecx           # ecx = a_len
    movd    16(%ebp),%mm1           # mm1 = b (zero-extended to 64 bits)
    mov     20(%ebp),%edi           # edi = c
    test    %ecx,%ecx
    jz      .Lmul_d_sse2_done       # a_len == 0: just store the zero carry
    mov     8(%ebp),%esi            # esi = a
.Lmul_d_sse2_loop:
    movd    (%esi),%mm0             # mm0 = *a
    add     $4,%esi                 # a++
    pmuludq %mm1,%mm0               # mm0 = a_i * b (64-bit product)
    paddq   %mm0,%mm2               # mm2 = product + carry
    movd    %mm2,(%edi)             # *c = low 32 bits
    add     $4,%edi                 # c++
    psrlq   $32,%mm2                # high 32 bits become the next carry
    dec     %ecx
    jnz     .Lmul_d_sse2_loop
.Lmul_d_sse2_done:
    movd    %mm2,(%edi)             # *c = carry
    emms                            # release the MMX state before returning
    pop     %esi
    pop     %edi
    leave
    ret
.size   _s_mpv_mul_d, .-_s_mpv_mul_d

#
# _s_mpv_mul_d_add(a, a_len, b, c)
#
#   c[0 .. a_len-1] = low words of a[i] * b + c[i] + running carry
#   c[a_len]        = final carry   (OVERWRITES c[a_len]; written even
#                                    when a_len == 0)
#
# Stack frame and register roles are the same as for _s_mpv_mul_d.
#
.globl  _s_mpv_mul_d_add
.type   _s_mpv_mul_d_add,@function
_s_mpv_mul_d_add:
    DISPATCH _s_mpv_mul_d_add_x86,_s_mpv_mul_d_add_sse2
_s_mpv_mul_d_add_x86:
    push    %ebp
    mov     %esp,%ebp
    push    %edi
    push    %esi
    push    %ebx
    xor     %ebx,%ebx               # carry = 0
    mov     12(%ebp),%ecx           # ecx = a_len
    mov     20(%ebp),%edi           # edi = c
    test    %ecx,%ecx
    jz      .Lmul_d_add_x86_done
    mov     8(%ebp),%esi            # esi = a
    cld                             # lodsl/stosl must walk upwards
.Lmul_d_add_x86_loop:
    lodsl                           # eax = *a++
    mull    16(%ebp)                # edx:eax = a_i * b
    add     %ebx,%eax               # + incoming carry
    adc     $0,%edx
    add     (%edi),%eax             # + current word of c
    adc     $0,%edx                 # product + carry + c_i still fits 64 bits
    mov     %edx,%ebx               # high half becomes the next carry
    stosl                           # *c++ = low half
    dec     %ecx
    jnz     .Lmul_d_add_x86_loop
.Lmul_d_add_x86_done:
    mov     %ebx,(%edi)             # *c = carry
    pop     %ebx
    pop     %esi
    pop     %edi
    leave
    ret

_s_mpv_mul_d_add_sse2:
    push    %ebp
    mov     %esp,%ebp
    push    %edi
    push    %esi
    pxor    %mm2,%mm2               # carry = 0
    mov     12(%ebp),%ecx           # ecx = a_len
    movd    16(%ebp),%mm1           # mm1 = b
    mov     20(%ebp),%edi           # edi = c
    test    %ecx,%ecx
    jz      .Lmul_d_add_sse2_done
    mov     8(%ebp),%esi            # esi = a
.Lmul_d_add_sse2_loop:
    movd    (%esi),%mm0             # mm0 = *a
    add     $4,%esi                 # a++
    pmuludq %mm1,%mm0               # mm0 = a_i * b
    paddq   %mm0,%mm2               # mm2 = product + carry
    movd    (%edi),%mm0             # mm0 = current word of c
    paddq   %mm0,%mm2               # mm2 += c_i
    movd    %mm2,(%edi)             # *c = low 32 bits
    add     $4,%edi                 # c++
    psrlq   $32,%mm2                # high 32 bits become the next carry
    dec     %ecx
    jnz     .Lmul_d_add_sse2_loop
.Lmul_d_add_sse2_done:
    movd    %mm2,(%edi)             # *c = carry
    emms
    pop     %esi
    pop     %edi
    leave
    ret
.size   _s_mpv_mul_d_add, .-_s_mpv_mul_d_add

#
# _s_mpv_mul_d_add_prop(a, a_len, b, c)
#
#   Same accumulation as _s_mpv_mul_d_add over c[0 .. a_len-1], but the
#   final carry is ADDED into c[a_len] and any overflow keeps propagating
#   into c[a_len+1], c[a_len+2], ... until an addition does not carry.
#   Nothing beyond c[a_len-1] is touched when the final carry is zero.
#
# Stack frame (both implementations):
#   ebp - 12: caller's ebx
#   ebp -  8: caller's esi
#   ebp -  4: caller's edi
#   ebp +  0: caller's ebp
#   ebp +  4: return address
#   ebp +  8: a      argument
#   ebp + 12: a_len  argument
#   ebp + 16: b      argument
#   ebp + 20: c      argument
# Registers:
#   ebx / mm2: carry
#   ecx      : digits remaining
#   esi      : a ptr
#   edi      : c ptr
#
.globl  _s_mpv_mul_d_add_prop
.type   _s_mpv_mul_d_add_prop,@function
_s_mpv_mul_d_add_prop:
    DISPATCH _s_mpv_mul_d_add_prop_x86,_s_mpv_mul_d_add_prop_sse2
_s_mpv_mul_d_add_prop_x86:
    push    %ebp
    mov     %esp,%ebp
    push    %edi
    push    %esi
    push    %ebx
    xor     %ebx,%ebx               # carry = 0
    mov     12(%ebp),%ecx           # ecx = a_len
    mov     20(%ebp),%edi           # edi = c
    test    %ecx,%ecx
    jz      .Lmul_d_add_prop_x86_tail
    cld                             # lodsl/stosl must walk upwards
    mov     8(%ebp),%esi            # esi = a
.Lmul_d_add_prop_x86_loop:
    lodsl                           # eax = *a++
    mull    16(%ebp)                # edx:eax = a_i * b
    add     %ebx,%eax               # + incoming carry
    adc     $0,%edx
    add     (%edi),%eax             # + current word of c
    adc     $0,%edx
    mov     %edx,%ebx               # high half becomes the next carry
    stosl                           # *c++ = low half
    dec     %ecx
    jnz     .Lmul_d_add_prop_x86_loop
.Lmul_d_add_prop_x86_tail:
    PROPAGATE_CARRY                 # fold ebx into c[a_len] and beyond
    pop     %ebx
    pop     %esi
    pop     %edi
    leave
    ret

_s_mpv_mul_d_add_prop_sse2:
    push    %ebp
    mov     %esp,%ebp
    push    %edi
    push    %esi
    push    %ebx
    pxor    %mm2,%mm2               # carry = 0
    mov     12(%ebp),%ecx           # ecx = a_len
    movd    16(%ebp),%mm1           # mm1 = b
    mov     20(%ebp),%edi           # edi = c
    test    %ecx,%ecx
    jz      .Lmul_d_add_prop_sse2_tail
    mov     8(%ebp),%esi            # esi = a
    cld                             # the carry tail below uses stosl
.Lmul_d_add_prop_sse2_loop:
    movd    (%esi),%mm0             # mm0 = *a
    movd    (%edi),%mm3             # mm3 = current word of c
    add     $4,%esi                 # a++
    pmuludq %mm1,%mm0               # mm0 = a_i * b
    paddq   %mm0,%mm2               # mm2 = product + carry
    paddq   %mm3,%mm2               # mm2 += c_i
    movd    %mm2,(%edi)             # *c = low 32 bits
    add     $4,%edi                 # c++
    psrlq   $32,%mm2                # high 32 bits become the next carry
    dec     %ecx
    jnz     .Lmul_d_add_prop_sse2_loop
.Lmul_d_add_prop_sse2_tail:
    movd    %mm2,%ebx               # finish the carry in integer registers
    PROPAGATE_CARRY
    emms
    pop     %ebx
    pop     %esi
    pop     %edi
    leave
    ret
.size   _s_mpv_mul_d_add_prop, .-_s_mpv_mul_d_add_prop

#
# _s_mpv_sqr_add_prop(pa, a_len, ps)
#
#   For each i in 0 .. a_len-1 the 64-bit square pa[i]^2 is added, together
#   with the running carry, into the digit pair ps[2i], ps[2i+1].  The
#   carry left after the last pair is added into ps[2*a_len] and propagated
#   onward exactly as in _s_mpv_mul_d_add_prop.
#
# Stack frame (both implementations):
#   ebp - 12: caller's ebx
#   ebp -  8: caller's esi
#   ebp -  4: caller's edi
#   ebp +  0: caller's ebp
#   ebp +  4: return address
#   ebp +  8: pa     argument
#   ebp + 12: a_len  argument
#   ebp + 16: ps     argument
# Registers:
#   ebx / mm2: carry
#   ecx      : digits remaining
#   esi      : pa ptr
#   edi      : ps ptr
#
.globl  _s_mpv_sqr_add_prop
.type   _s_mpv_sqr_add_prop,@function
_s_mpv_sqr_add_prop:
    DISPATCH _s_mpv_sqr_add_prop_x86,_s_mpv_sqr_add_prop_sse2
_s_mpv_sqr_add_prop_x86:
    push    %ebp
    mov     %esp,%ebp
    push    %edi
    push    %esi
    push    %ebx
    xor     %ebx,%ebx               # carry = 0
    mov     12(%ebp),%ecx           # ecx = a_len
    mov     16(%ebp),%edi           # edi = ps
    test    %ecx,%ecx
    jz      .Lsqr_add_prop_x86_tail
    cld                             # lodsl/stosl must walk upwards
    mov     8(%ebp),%esi            # esi = pa
.Lsqr_add_prop_x86_loop:
    lodsl                           # eax = *pa++
    mull    %eax                    # edx:eax = a_i * a_i
    add     %ebx,%eax               # + incoming carry
    adc     $0,%edx
    add     (%edi),%eax             # + low word of the result pair
    stosl                           # store low word; CF from the add survives
    adc     (%edi),%edx             # + high word of the pair + that CF
    movl    $0,%ebx                 # must be mov, not xor: CF is still live
    adc     $0,%ebx                 # next carry = carry out of the high word
    mov     %edx,%eax
    stosl                           # store high word
    dec     %ecx
    jnz     .Lsqr_add_prop_x86_loop
.Lsqr_add_prop_x86_tail:
    PROPAGATE_CARRY                 # fold ebx into ps[2*a_len] and beyond
    pop     %ebx
    pop     %esi
    pop     %edi
    leave
    ret

_s_mpv_sqr_add_prop_sse2:
    push    %ebp
    mov     %esp,%ebp
    push    %edi
    push    %esi
    push    %ebx
    pxor    %mm2,%mm2               # carry = 0
    mov     12(%ebp),%ecx           # ecx = a_len
    mov     16(%ebp),%edi           # edi = ps
    test    %ecx,%ecx
    jz      .Lsqr_add_prop_sse2_tail
    mov     8(%ebp),%esi            # esi = pa
    cld                             # the carry tail below uses stosl
.Lsqr_add_prop_sse2_loop:
    movd    (%esi),%mm0             # mm0 = *pa
    movd    (%edi),%mm3             # mm3 = low word of the result pair
    add     $4,%esi                 # pa++
    pmuludq %mm0,%mm0               # mm0 = a_i * a_i
    paddq   %mm0,%mm2               # mm2 = square + carry
    paddq   %mm3,%mm2               # mm2 += low word
    movd    4(%edi),%mm3            # mm3 = high word of the result pair
    movd    %mm2,(%edi)             # store the new low word
    psrlq   $32,%mm2                # keep what spilled above 32 bits
    paddq   %mm3,%mm2               # mm2 += high word
    movd    %mm2,4(%edi)            # store the new high word
    psrlq   $32,%mm2                # what is left is the next carry
    add     $8,%edi                 # ps += 2 digits
    dec     %ecx
    jnz     .Lsqr_add_prop_sse2_loop
.Lsqr_add_prop_sse2_tail:
    movd    %mm2,%ebx               # finish the carry in integer registers
    PROPAGATE_CARRY
    emms
    pop     %ebx
    pop     %esi
    pop     %edi
    leave
    ret
.size   _s_mpv_sqr_add_prop, .-_s_mpv_sqr_add_prop

#
# Divide 64-bit (Nhi,Nlo) by 32-bit divisor, which must be normalized
# so its high bit is 1.  This code is from NSPR.
#
# mp_err _s_mpv_div_2dx1d(mp_digit Nhi, mp_digit Nlo, mp_digit divisor,
#                         mp_digit *qp, mp_digit *rp)
#
# Stores the quotient through qp and the remainder through rp, and always
# returns 0 in %eax.  There is no range check: the hardware `div` raises a
# divide error if the divisor is zero or if Nhi >= divisor (the quotient
# would not fit in 32 bits), so callers must guarantee Nhi < divisor.
#
# No frame is set up and no callee-saved register is used.  On entry:
#   esp +  0: return address
#   esp +  4: Nhi      argument
#   esp +  8: Nlo      argument
#   esp + 12: divisor  argument
#   esp + 16: qp       argument
#   esp + 20: rp       argument
# Registers:
#   edx:eax: dividend on input to div; remainder:quotient afterwards
#   ecx    : scratch pointer for the two stores
#
.globl  _s_mpv_div_2dx1d
.type   _s_mpv_div_2dx1d,@function
_s_mpv_div_2dx1d:
    mov     4(%esp),%edx            # edx = Nhi
    mov     8(%esp),%eax            # eax = Nlo
    divl    12(%esp)                # eax = quotient, edx = remainder
    mov     16(%esp),%ecx
    mov     %eax,(%ecx)             # *qp = quotient
    mov     20(%esp),%ecx
    mov     %edx,(%ecx)             # *rp = remainder
    xor     %eax,%eax               # return zero
    ret
.size   _s_mpv_div_2dx1d, .-_s_mpv_div_2dx1d