michael@0: /*
michael@0:  *  mpi_x86_asm.c - MSVC inline assembly implementation of s_mpv_ functions.
michael@0:  * 
michael@0:  * This Source Code Form is subject to the terms of the Mozilla Public
michael@0:  * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0:  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0: 
michael@0: #include "mpi-priv.h"
michael@0: 
michael@0: static int is_sse = -1;	/* cached CPU probe used by the dispatch code below:
michael@0: 			 * -1 = not probed yet (s_mpi_is_sse2() gets called),
michael@0: 			 *  0 = take the integer (_x86) path,
michael@0: 			 * >0 = take the _sse2 path */
michael@0: extern unsigned long s_mpi_is_sse2();	/* defined elsewhere; its result is stored
michael@0: 					 * in is_sse and tested as a signed value */
michael@0: 
michael@0: /*
michael@0:  *  s_mpv_mul_d:  c[0 .. a_len] = a[0 .. a_len-1] * b
michael@0:  *
michael@0:  *  Multiplies the a_len-digit vector a by the single digit b (digits are
michael@0:  *  handled as 32-bit values).  The a_len low digits of the product go to
michael@0:  *  c[0 .. a_len-1] and the final carry digit goes to c[a_len], so c must
michael@0:  *  have room for a_len + 1 digits.  With a_len == 0 the only effect is
michael@0:  *  c[0] = 0.
michael@0:  *
michael@0:  *  The function is naked: the code builds its own frame and returns with
michael@0:  *  a plain "ret", leaving the stack arguments for the caller to remove.
michael@0:  *  If is_sse is still negative, s_mpi_is_sse2() is called first and its
michael@0:  *  result cached in is_sse; a positive value selects the SSE2 path, zero
michael@0:  *  selects the integer path.
michael@0:  *
michael@0:  *  Frame (both paths, after "push ebp / mov ebp,esp"):
michael@0:  *   ebp - 12:	caller's ebx	(integer path only)
michael@0:  *   ebp - 8:	caller's esi
michael@0:  *   ebp - 4:	caller's edi
michael@0:  *   ebp + 0:	caller's ebp
michael@0:  *   ebp + 4:	return address
michael@0:  *   ebp + 8:	a	argument
michael@0:  *   ebp + 12:	a_len	argument
michael@0:  *   ebp + 16:	b	argument
michael@0:  *   ebp + 20:	c	argument
michael@0:  *  registers, integer path:
michael@0:  *  	eax:	digit loaded from a, then low half of the product
michael@0:  * 	ebx:	carry
michael@0:  * 	ecx:	digits left
michael@0:  * 	edx:	high half of the product
michael@0:  * 	esi:	a ptr
michael@0:  * 	edi:	c ptr
michael@0:  *  registers, SSE2 path:
michael@0:  * 	mm0:	current 64-bit product
michael@0:  * 	mm1:	b
michael@0:  * 	mm2:	64-bit accumulator; after the shift it holds the carry
michael@0:  * 	ecx, esi, edi as in the integer path
michael@0:  */
michael@0: __declspec(naked) void
michael@0: s_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
michael@0: {
michael@0:   __asm {
michael@0:     mov    eax, is_sse
michael@0:     test   eax, eax
michael@0:     jz     s_mpv_mul_d_x86	; 0: probed, no SSE2
michael@0:     jg     s_mpv_mul_d_sse2	; >0: probed, SSE2 available
michael@0:     call   s_mpi_is_sse2	; negative: not probed yet
michael@0:     mov    is_sse, eax		; cache the answer for later calls
michael@0:     test   eax, eax
michael@0:     jg     s_mpv_mul_d_sse2
michael@0: s_mpv_mul_d_x86:
michael@0:     push   ebp
michael@0:     mov    ebp,esp
michael@0:     push   edi
michael@0:     push   esi
michael@0:     push   ebx
michael@0:     xor    ebx,ebx		; carry = 0
michael@0:     mov    ecx,[ebp+12]		; ecx = a_len
michael@0:     mov    edi,[ebp+20]		; edi = c
michael@0:     test   ecx,ecx
michael@0:     jz     L_2			; jmp if a_len == 0
michael@0:     mov    esi,[ebp+8]		; esi = a
michael@0:     cld				; lodsd/stosd must walk upwards
michael@0: L_1:
michael@0:     lodsd			; eax = [esi]; esi += 4
michael@0:     mul    dword ptr [ebp+16]	; edx:eax = a_i * b
michael@0: 
michael@0:     add    eax,ebx		; add carry (ebx) to edx:eax
michael@0:     adc    edx,0		; cannot wrap: a_i * b + carry < 2^64
michael@0:     mov    ebx,edx		; high half becomes the next carry
michael@0: 
michael@0:     stosd			; [edi] = eax; edi += 4
michael@0:     dec    ecx			; --a_len
michael@0:     jnz    L_1			; jmp if a_len != 0
michael@0: L_2:
michael@0:     mov    [edi],ebx		; *c = carry
michael@0:     pop    ebx
michael@0:     pop    esi
michael@0:     pop    edi
michael@0:     pop    ebp			; esp == ebp here, no locals to drop
michael@0:     ret
michael@0: s_mpv_mul_d_sse2:
michael@0:     push   ebp
michael@0:     mov    ebp, esp
michael@0:     push   edi
michael@0:     push   esi
michael@0:     pxor   mm2, mm2		; carry = 0
michael@0:     mov    ecx, [ebp+12]	; ecx = a_len
michael@0:     movd   mm1, [ebp+16]	; mm1 = b
michael@0:     mov    edi, [ebp+20]	; edi = c
michael@0:     test   ecx, ecx
michael@0:     jz     L_6			; jmp if a_len == 0
michael@0:     mov    esi, [ebp+8]		; esi = a
michael@0: L_5:				; no string instructions on this path
michael@0:     movd   mm0, [esi]		; mm0 = *a++
michael@0:     add    esi, 4
michael@0:     pmuludq mm0, mm1		; mm0 = b * a_i (64-bit product)
michael@0:     paddq  mm2, mm0		; add the carry
michael@0:     movd   [edi], mm2		; store the low 32 bits of the sum
michael@0:     add    edi, 4
michael@0:     psrlq  mm2, 32		; high 32 bits become the next carry
michael@0:     dec    ecx			; --a_len
michael@0:     jnz    L_5			; jmp if a_len != 0
michael@0: L_6:
michael@0:     movd   [edi], mm2		; *c = carry
michael@0:     emms			; leave MMX state before returning
michael@0:     pop    esi
michael@0:     pop    edi
michael@0:     pop    ebp
michael@0:     ret
michael@0:   }
michael@0: }
michael@0: 
michael@0: /*
michael@0:  *  s_mpv_mul_d_add:  c[0 .. a_len-1] += a[0 .. a_len-1] * b,
michael@0:  *                    c[a_len] = final carry
michael@0:  *
michael@0:  *  Multiplies the a_len-digit vector a by the single digit b (digits are
michael@0:  *  handled as 32-bit values) and adds the product into c[0 .. a_len-1].
michael@0:  *  The carry out of the last digit is STORED to c[a_len], overwriting
michael@0:  *  whatever was there; it is not added.  With a_len == 0 the only effect
michael@0:  *  is c[0] = 0.
michael@0:  *
michael@0:  *  The function is naked: the code builds its own frame and returns with
michael@0:  *  a plain "ret", leaving the stack arguments for the caller to remove.
michael@0:  *  If is_sse is still negative, s_mpi_is_sse2() is called first and its
michael@0:  *  result cached in is_sse; a positive value selects the SSE2 path, zero
michael@0:  *  selects the integer path.
michael@0:  *
michael@0:  *  Frame (both paths, after "push ebp / mov ebp,esp"):
michael@0:  *   ebp - 12:	caller's ebx	(integer path only)
michael@0:  *   ebp - 8:	caller's esi
michael@0:  *   ebp - 4:	caller's edi
michael@0:  *   ebp + 0:	caller's ebp
michael@0:  *   ebp + 4:	return address
michael@0:  *   ebp + 8:	a	argument
michael@0:  *   ebp + 12:	a_len	argument
michael@0:  *   ebp + 16:	b	argument
michael@0:  *   ebp + 20:	c	argument
michael@0:  *  registers, integer path:
michael@0:  *  	eax:	digit loaded from a, then low half of the running sum
michael@0:  * 	ebx:	carry
michael@0:  * 	ecx:	digits left
michael@0:  * 	edx:	high half of the running sum
michael@0:  * 	esi:	a ptr
michael@0:  * 	edi:	c ptr
michael@0:  *  registers, SSE2 path:
michael@0:  * 	mm0:	current 64-bit product, then the digit fetched from c
michael@0:  * 	mm1:	b
michael@0:  * 	mm2:	64-bit accumulator; after the shift it holds the carry
michael@0:  * 	ecx, esi, edi as in the integer path
michael@0:  */
michael@0: __declspec(naked) void
michael@0: s_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
michael@0: {
michael@0:   __asm {
michael@0:     mov    eax, is_sse
michael@0:     test   eax, eax
michael@0:     jz     s_mpv_mul_d_add_x86	; 0: probed, no SSE2
michael@0:     jg     s_mpv_mul_d_add_sse2	; >0: probed, SSE2 available
michael@0:     call   s_mpi_is_sse2	; negative: not probed yet
michael@0:     mov    is_sse, eax		; cache the answer for later calls
michael@0:     test   eax, eax
michael@0:     jg     s_mpv_mul_d_add_sse2
michael@0: s_mpv_mul_d_add_x86:
michael@0:     push   ebp
michael@0:     mov    ebp,esp
michael@0:     push   edi
michael@0:     push   esi
michael@0:     push   ebx
michael@0:     xor    ebx,ebx		; carry = 0
michael@0:     mov    ecx,[ebp+12]		; ecx = a_len
michael@0:     mov    edi,[ebp+20]		; edi = c
michael@0:     test   ecx,ecx
michael@0:     jz     L_11			; jmp if a_len == 0
michael@0:     mov    esi,[ebp+8]		; esi = a
michael@0:     cld				; lodsd/stosd must walk upwards
michael@0: L_10:
michael@0:     lodsd			; eax = [esi]; esi += 4
michael@0:     mul    dword ptr [ebp+16]	; edx:eax = a_i * b
michael@0: 
michael@0:     add    eax,ebx		; add carry (ebx) to edx:eax
michael@0:     adc    edx,0
michael@0:     add    eax,[edi]		; add in current word from *c
michael@0:     adc    edx,0		; a_i * b + carry + c_i <= 2^64 - 1
michael@0:     mov    ebx,edx		; high half becomes the next carry
michael@0: 
michael@0:     stosd			; [edi] = eax; edi += 4
michael@0:     dec    ecx			; --a_len
michael@0:     jnz    L_10			; jmp if a_len != 0
michael@0: L_11:
michael@0:     mov    [edi],ebx		; *c = carry (stored, not added)
michael@0:     pop    ebx
michael@0:     pop    esi
michael@0:     pop    edi
michael@0:     pop    ebp			; esp == ebp here, no locals to drop
michael@0:     ret
michael@0: s_mpv_mul_d_add_sse2:
michael@0:     push   ebp
michael@0:     mov    ebp, esp
michael@0:     push   edi
michael@0:     push   esi
michael@0:     pxor   mm2, mm2		; carry = 0
michael@0:     mov    ecx, [ebp+12]	; ecx = a_len
michael@0:     movd   mm1, [ebp+16]	; mm1 = b
michael@0:     mov    edi, [ebp+20]	; edi = c
michael@0:     test   ecx, ecx
michael@0:     jz     L_16			; jmp if a_len == 0
michael@0:     mov    esi, [ebp+8]		; esi = a
michael@0: L_15:				; no string instructions on this path
michael@0:     movd   mm0, [esi]		; mm0 = *a++
michael@0:     add    esi, 4
michael@0:     pmuludq mm0, mm1		; mm0 = b * a_i (64-bit product)
michael@0:     paddq  mm2, mm0		; add the carry
michael@0:     movd   mm0, [edi]		; mm0 = current word from *c
michael@0:     paddq  mm2, mm0		; add it in
michael@0:     movd   [edi], mm2		; store the low 32 bits of the sum
michael@0:     add    edi, 4
michael@0:     psrlq  mm2, 32		; high 32 bits become the next carry
michael@0:     dec    ecx			; --a_len
michael@0:     jnz    L_15			; jmp if a_len != 0
michael@0: L_16:
michael@0:     movd   [edi], mm2		; *c = carry (stored, not added)
michael@0:     emms			; leave MMX state before returning
michael@0:     pop    esi
michael@0:     pop    edi
michael@0:     pop    ebp
michael@0:     ret
michael@0:   }
michael@0: }
michael@0: 
michael@0: /*
michael@0:  *   ebp - 36:	caller's esi
michael@0:  *   ebp - 32:	caller's edi
michael@0:  *   ebp - 28:	
michael@0:  *   ebp - 24:	
michael@0:  *   ebp - 20:	
michael@0:  *   ebp - 16:	
michael@0:  *   ebp - 12:	
michael@0:  *   ebp - 8:	
michael@0:  *   ebp - 4:	
michael@0:  *   ebp + 0:	caller's ebp
michael@0:  *   ebp + 4:	return address
michael@0:  *   ebp + 8:	a	argument
michael@0:  *   ebp + 12:	a_len	argument
michael@0:  *   ebp + 16:	b	argument
michael@0:  *   ebp + 20:	c	argument
michael@0:  *   registers:
michael@0:  *  	eax:
michael@0:  * 	ebx:	carry
michael@0:  * 	ecx:	a_len
michael@0:  * 	edx:
michael@0:  * 	esi:	a ptr
michael@0:  * 	edi:	c ptr
michael@0:  */
michael@0: __declspec(naked) void
michael@0: s_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
michael@0: {
michael@0:   __asm {
michael@0:     mov    eax, is_sse
michael@0:     cmp    eax, 0
michael@0:     je     s_mpv_mul_d_add_prop_x86
michael@0:     jg     s_mpv_mul_d_add_prop_sse2
michael@0:     call   s_mpi_is_sse2
michael@0:     mov    is_sse, eax
michael@0:     cmp    eax, 0
michael@0:     jg     s_mpv_mul_d_add_prop_sse2
michael@0: s_mpv_mul_d_add_prop_x86:
michael@0:     push   ebp
michael@0:     mov    ebp,esp
michael@0:     sub    esp,28
michael@0:     push   edi
michael@0:     push   esi
michael@0:     push   ebx
michael@0:     mov    ebx,0		; carry = 0
michael@0:     mov    ecx,[ebp+12]		; ecx = a_len
michael@0:     mov    edi,[ebp+20]
michael@0:     cmp    ecx,0
michael@0:     je     L_21			; jmp if a_len == 0
michael@0:     cld
michael@0:     mov    esi,[ebp+8]		; esi = a
michael@0: L_20:
michael@0:     lodsd			; eax = [ds:esi]; esi += 4
michael@0:     mov    edx,[ebp+16]		; edx = b
michael@0:     mul    edx			; edx:eax = Phi:Plo = a_i * b
michael@0: 
michael@0:     add    eax,ebx		; add carry (ebx) to edx:eax
michael@0:     adc    edx,0
michael@0:     mov    ebx,[edi]		; add in current word from *c
michael@0:     add    eax,ebx		
michael@0:     adc    edx,0
michael@0:     mov    ebx,edx		; high half of product becomes next carry
michael@0: 
michael@0:     stosd			; [es:edi] = ax; edi += 4;
michael@0:     dec    ecx			; --a_len
michael@0:     jnz    L_20			; jmp if a_len != 0
michael@0: L_21:
michael@0:     cmp    ebx,0		; is carry zero?
michael@0:     jz     L_23
michael@0:     mov    eax,[edi]		; add in current word from *c
michael@0:     add    eax,ebx
michael@0:     stosd			; [es:edi] = ax; edi += 4;
michael@0:     jnc    L_23
michael@0: L_22:
michael@0:     mov    eax,[edi]		; add in current word from *c
michael@0:     adc    eax,0
michael@0:     stosd			; [es:edi] = ax; edi += 4;
michael@0:     jc     L_22
michael@0: L_23:
michael@0:     pop    ebx
michael@0:     pop    esi
michael@0:     pop    edi
michael@0:     leave  
michael@0:     ret    
michael@0:     nop
michael@0: s_mpv_mul_d_add_prop_sse2:
michael@0:     push   ebp
michael@0:     mov    ebp, esp
michael@0:     push   edi
michael@0:     push   esi
michael@0:     push   ebx
michael@0:     psubq  mm2, mm2		; carry = 0
michael@0:     mov    ecx, [ebp+12]	; ecx = a_len
michael@0:     movd   mm1, [ebp+16]	; mm1 = b
michael@0:     mov    edi, [ebp+20]
michael@0:     cmp    ecx, 0
michael@0:     je     L_26			; jmp if a_len == 0
michael@0:     mov    esi, [ebp+8]		; esi = a
michael@0:     cld
michael@0: L_25:
michael@0:     movd   mm0, [esi]		; mm0 = *a++
michael@0:     movd   mm3, [edi]		; fetch the sum
michael@0:     add    esi, 4
michael@0:     pmuludq mm0, mm1		; mm0 = b * *a++
michael@0:     paddq  mm2, mm0		; add the carry
michael@0:     paddq  mm2, mm3		; add *c++
michael@0:     movd   [edi], mm2		; store the 32bit result
michael@0:     add    edi, 4
michael@0:     psrlq  mm2, 32		; save the carry
michael@0:     dec    ecx			; --a_len
michael@0:     jnz    L_25			; jmp if a_len != 0
michael@0: L_26:
michael@0:     movd   ebx, mm2
michael@0:     cmp    ebx, 0		; is carry zero?
michael@0:     jz     L_28
michael@0:     mov    eax, [edi]
michael@0:     add    eax, ebx
michael@0:     stosd
michael@0:     jnc    L_28
michael@0: L_27:
michael@0:     mov    eax, [edi]		; add in current word from *c
michael@0:     adc	   eax, 0
michael@0:     stosd			; [es:edi] = ax; edi += 4;
michael@0:     jc     L_27
michael@0: L_28:
michael@0:     emms
michael@0:     pop    ebx
michael@0:     pop    esi
michael@0:     pop    edi
michael@0:     leave  
michael@0:     ret    
michael@0:     nop
michael@0:   }
michael@0: }
michael@0: 
michael@0: /*
michael@0:  *  s_mpv_sqr_add_prop: add the squares of the digits of a into sqrs,
michael@0:  *  propagating the carry.
michael@0:  *
michael@0:  *  For each of the a_len digits a[i] (handled as 32-bit values) the 64-bit
michael@0:  *  square a[i] * a[i] is added into the digit pair sqrs[2i], sqrs[2i+1]
michael@0:  *  together with the carry left over from the previous pair.  After the
michael@0:  *  last pair a non-zero carry is added into the next digit of sqrs, and
michael@0:  *  while that keeps carrying, 1 is added into each following digit.  The
michael@0:  *  propagation loops (L_32, L_37) have no length check; they stop only
michael@0:  *  when an addition produces no carry-out.  With a_len == 0 nothing is
michael@0:  *  written.
michael@0:  *
michael@0:  *  The function is naked: the code builds its own frame and returns with
michael@0:  *  a plain "ret".  If is_sse is still negative, s_mpi_is_sse2() is called
michael@0:  *  first and its result cached in is_sse; a positive value selects the
michael@0:  *  _sse2 path, zero selects the _x86 path.
michael@0:  *
michael@0:  *  Frame, _x86 path:
michael@0:  *   ebp - 24:	caller's ebx
michael@0:  *   ebp - 20:	caller's esi
michael@0:  *   ebp - 16:	caller's edi
michael@0:  *   ebp - 12 .. ebp - 1:  reserved by "sub esp,12"; never referenced
michael@0:  *   ebp + 0:	caller's ebp
michael@0:  *   ebp + 4:	return address
michael@0:  *   ebp + 8:	a	argument (pa)
michael@0:  *   ebp + 12:	a_len	argument
michael@0:  *   ebp + 16:	sqrs	argument (ps)
michael@0:  *  Frame, _sse2 path: same arguments; caller's edi, esi and ebx are at
michael@0:  *  ebp - 4, ebp - 8 and ebp - 12.
michael@0:  *
michael@0:  *  registers, _x86 path:
michael@0:  *  	eax:	digit loaded from a, then the value being stored
michael@0:  * 	ebx:	carry between pairs; also scratch for the digits of sqrs
michael@0:  * 	ecx:	digits of a left
michael@0:  * 	edx:	high half of the square / running sum
michael@0:  * 	esi:	a ptr
michael@0:  * 	edi:	sqrs ptr
michael@0:  *  registers, _sse2 path:
michael@0:  * 	mm0:	a[i], then its 64-bit square
michael@0:  * 	mm2:	64-bit accumulator; after each shift it holds the carry
michael@0:  * 	mm3:	digit fetched from sqrs
michael@0:  * 	eax, ebx: used by the carry-propagation tail only
michael@0:  *
michael@0:  *  Flag dependencies in the L_30 loop -- keep the instruction order:
michael@0:  *   - the carry of the second "add eax,ebx" is consumed by
michael@0:  *     "adc edx,ebx"; the "mov ebx,[edi+4]" and "stosd" in between leave
michael@0:  *     the flags alone.  ebx is loaded from [edi+4] before the stosd
michael@0:  *     advances edi, so it is the high digit of the current pair.
michael@0:  *   - the carry of "adc edx,ebx" is consumed by "adc ebx,0", which turns
*     it into the next carry (0 or 1).  "mov ebx,0" sits in between and
michael@0:  *     must stay a mov: "xor ebx,ebx" would clear the carry flag.
michael@0:  *  The same holds for the tails: "jnc"/"jc" test the carry of the add/adc
michael@0:  *  that precedes the stosd.
michael@0:  *
michael@0:  *  The per-line comments that mention ax, si or di refer to eax, esi and
michael@0:  *  edi; lodsd and stosd move 32 bits.
michael@0:  */
michael@0: __declspec(naked) void
michael@0: s_mpv_sqr_add_prop(const mp_digit *a, mp_size a_len, mp_digit *sqrs)
michael@0: {
michael@0:   __asm {
michael@0:      mov    eax, is_sse
michael@0:      cmp    eax, 0
michael@0:      je     s_mpv_sqr_add_prop_x86
michael@0:      jg     s_mpv_sqr_add_prop_sse2
michael@0:      call   s_mpi_is_sse2
michael@0:      mov    is_sse, eax
michael@0:      cmp    eax, 0
michael@0:      jg     s_mpv_sqr_add_prop_sse2
michael@0: s_mpv_sqr_add_prop_x86:
michael@0:      push   ebp
michael@0:      mov    ebp,esp
michael@0:      sub    esp,12
michael@0:      push   edi
michael@0:      push   esi
michael@0:      push   ebx
michael@0:      mov    ebx,0		; carry = 0
michael@0:      mov    ecx,[ebp+12]	; a_len
michael@0:      mov    edi,[ebp+16]	; edi = ps
michael@0:      cmp    ecx,0
michael@0:      je     L_31		; jump if a_len == 0
michael@0:      cld
michael@0:      mov    esi,[ebp+8]		; esi = pa
michael@0: L_30:
michael@0:      lodsd			; eax = [ds:si]; si += 4;
michael@0:      mul    eax
michael@0: 
michael@0:      add    eax,ebx		; add "carry"
michael@0:      adc    edx,0
michael@0:      mov    ebx,[edi]
michael@0:      add    eax,ebx		; add low word from result
michael@0:      mov    ebx,[edi+4]
michael@0:      stosd			; [es:di] = eax; di += 4;
michael@0:      adc    edx,ebx		; add high word from result
michael@0:      mov    ebx,0
michael@0:      mov    eax,edx
michael@0:      adc    ebx,0
michael@0:      stosd			; [es:di] = eax; di += 4;
michael@0:      dec    ecx			; --a_len
michael@0:      jnz    L_30		; jmp if a_len != 0
michael@0: L_31:
michael@0:     cmp    ebx,0		; is carry zero?
michael@0:     jz     L_34
michael@0:     mov    eax,[edi]		; add in current word from *c
michael@0:     add    eax,ebx
michael@0:     stosd			; [es:edi] = ax; edi += 4;
michael@0:     jnc    L_34
michael@0: L_32:
michael@0:     mov    eax,[edi]		; add in current word from *c
michael@0:     adc    eax,0
michael@0:     stosd			; [es:edi] = ax; edi += 4;
michael@0:     jc     L_32
michael@0: L_34:
michael@0:     pop    ebx
michael@0:     pop    esi
michael@0:     pop    edi
michael@0:     leave  
michael@0:     ret    
michael@0:     nop
michael@0: s_mpv_sqr_add_prop_sse2:
michael@0:     push   ebp
michael@0:     mov    ebp, esp
michael@0:     push   edi
michael@0:     push   esi
michael@0:     push   ebx
michael@0:     psubq  mm2, mm2		; carry = 0
michael@0:     mov    ecx, [ebp+12]	; ecx = a_len
michael@0:     mov    edi, [ebp+16]
michael@0:     cmp    ecx, 0
michael@0:     je     L_36		; jmp if a_len == 0
michael@0:     mov    esi, [ebp+8]		; esi = a
michael@0:     cld
michael@0: L_35:
michael@0:     movd   mm0, [esi]		; mm0 = *a
michael@0:     movd   mm3, [edi]		; fetch the sum
michael@0:     add	   esi, 4
michael@0:     pmuludq mm0, mm0		; mm0 = sqr(a)
michael@0:     paddq  mm2, mm0		; add the carry
michael@0:     paddq  mm2, mm3		; add the low word
michael@0:     movd   mm3, [edi+4]
michael@0:     movd   [edi], mm2		; store the 32bit result
michael@0:     psrlq  mm2, 32	
michael@0:     paddq  mm2, mm3		; add the high word
michael@0:     movd   [edi+4], mm2		; store the 32bit result
michael@0:     psrlq  mm2, 32		; save the carry.
michael@0:     add    edi, 8
michael@0:     dec    ecx			; --a_len
michael@0:     jnz    L_35			; jmp if a_len != 0
michael@0: L_36:
michael@0:     movd   ebx, mm2
michael@0:     cmp    ebx, 0		; is carry zero?
michael@0:     jz     L_38
michael@0:     mov    eax, [edi]
michael@0:     add    eax, ebx
michael@0:     stosd
michael@0:     jnc    L_38
michael@0: L_37:
michael@0:     mov    eax, [edi]		; add in current word from *c
michael@0:     adc	   eax, 0
michael@0:     stosd			; [es:edi] = ax; edi += 4;
michael@0:     jc     L_37
michael@0: L_38:
michael@0:     emms
michael@0:     pop    ebx
michael@0:     pop    esi
michael@0:     pop    edi
michael@0:     leave  
michael@0:     ret    
michael@0:     nop
michael@0:   }
michael@0: }
michael@0: 
michael@0: /*
michael@0:  *  s_mpv_div_2dx1d: divide the two-digit value Nhi:Nlo by a single digit.
michael@0:  *
michael@0:  *  Divide 64-bit (Nhi,Nlo) by 32-bit divisor, which must be normalized
michael@0:  *  so its high bit is 1.   This code is from NSPR.
michael@0:  *
michael@0:  *  Stores the quotient to *qp and the remainder to *rp and always returns
michael@0:  *  zero.  The work is done by one "div" instruction, which raises a CPU
michael@0:  *  divide error instead of returning when the divisor is zero or when the
michael@0:  *  quotient does not fit in 32 bits (Nhi >= divisor); nothing here checks
michael@0:  *  for that.
michael@0:  *
michael@0:  *  The function is naked and sets up no frame; arguments are addressed
michael@0:  *  from esp as it is on entry:
michael@0:  *   esp +  0:	return address
michael@0:  *   esp +  4:	Nhi	argument
michael@0:  *   esp +  8:	Nlo	argument
michael@0:  *   esp + 12:	divisor	argument
michael@0:  *   esp + 16:	qp	argument
michael@0:  *   esp + 20:	rp	argument
michael@0:  *  registers:
michael@0:  *  	eax:	Nlo, then the quotient, finally the return value (0)
michael@0:  * 	ecx:	qp, then rp
michael@0:  * 	edx:	Nhi, then the remainder
michael@0:  */
michael@0: __declspec(naked) mp_err
michael@0: s_mpv_div_2dx1d(mp_digit Nhi, mp_digit Nlo, mp_digit divisor,
michael@0: 		mp_digit *qp, mp_digit *rp)
michael@0: {
michael@0:   __asm {
michael@0:        mov    eax,[esp+8]		; eax = Nlo
michael@0:        mov    edx,[esp+4]		; edx = Nhi, so edx:eax = Nhi:Nlo
michael@0:        div    dword ptr [esp+12]	; eax = quotient, edx = remainder
michael@0:        mov    ecx,[esp+16]		; ecx = qp
michael@0:        mov    [ecx],eax		; *qp = quotient
michael@0:        mov    ecx,[esp+20]		; ecx = rp
michael@0:        mov    [ecx],edx		; *rp = remainder
michael@0:        xor    eax,eax		; return zero
michael@0:        ret
michael@0:   }
michael@0: }