michael@0: /* michael@0: * mpi_x86_asm.c - MSVC inline assembly implementation of s_mpv_ functions. michael@0: * michael@0: * This Source Code Form is subject to the terms of the Mozilla Public michael@0: * License, v. 2.0. If a copy of the MPL was not distributed with this michael@0: * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ michael@0: michael@0: #include "mpi-priv.h" michael@0: michael@0: static int is_sse = -1; michael@0: extern unsigned long s_mpi_is_sse2(); michael@0: michael@0: /* michael@0: * ebp - 36: caller's esi michael@0: * ebp - 32: caller's edi michael@0: * ebp - 28: michael@0: * ebp - 24: michael@0: * ebp - 20: michael@0: * ebp - 16: michael@0: * ebp - 12: michael@0: * ebp - 8: michael@0: * ebp - 4: michael@0: * ebp + 0: caller's ebp michael@0: * ebp + 4: return address michael@0: * ebp + 8: a argument michael@0: * ebp + 12: a_len argument michael@0: * ebp + 16: b argument michael@0: * ebp + 20: c argument michael@0: * registers: michael@0: * eax: michael@0: * ebx: carry michael@0: * ecx: a_len michael@0: * edx: michael@0: * esi: a ptr michael@0: * edi: c ptr michael@0: */ michael@0: __declspec(naked) void michael@0: s_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) michael@0: { michael@0: __asm { michael@0: mov eax, is_sse michael@0: cmp eax, 0 michael@0: je s_mpv_mul_d_x86 michael@0: jg s_mpv_mul_d_sse2 michael@0: call s_mpi_is_sse2 michael@0: mov is_sse, eax michael@0: cmp eax, 0 michael@0: jg s_mpv_mul_d_sse2 michael@0: s_mpv_mul_d_x86: michael@0: push ebp michael@0: mov ebp,esp michael@0: sub esp,28 michael@0: push edi michael@0: push esi michael@0: push ebx michael@0: mov ebx,0 ; carry = 0 michael@0: mov ecx,[ebp+12] ; ecx = a_len michael@0: mov edi,[ebp+20] michael@0: cmp ecx,0 michael@0: je L_2 ; jmp if a_len == 0 michael@0: mov esi,[ebp+8] ; esi = a michael@0: cld michael@0: L_1: michael@0: lodsd ; eax = [ds:esi]; esi += 4 michael@0: mov edx,[ebp+16] ; edx = b michael@0: mul edx ; edx:eax = Phi:Plo = a_i * b michael@0: michael@0: add eax,ebx ; add carry (ebx) to edx:eax michael@0: adc edx,0 michael@0: mov ebx,edx ; high half of product becomes next carry michael@0: michael@0: stosd ; [es:edi] = ax; edi += 4; michael@0: dec ecx ; --a_len michael@0: jnz L_1 ; jmp if a_len != 0 michael@0: L_2: michael@0: mov [edi],ebx ; *c = carry michael@0: pop ebx michael@0: pop esi michael@0: pop edi michael@0: leave michael@0: ret michael@0: nop michael@0: s_mpv_mul_d_sse2: michael@0: push ebp michael@0: mov ebp, esp michael@0: push edi michael@0: push esi michael@0: psubq mm2, mm2 ; carry = 0 michael@0: mov ecx, [ebp+12] ; ecx = a_len michael@0: movd mm1, [ebp+16] ; mm1 = b michael@0: mov edi, [ebp+20] michael@0: cmp ecx, 0 michael@0: je L_6 ; jmp if a_len == 0 michael@0: mov esi, [ebp+8] ; esi = a michael@0: cld michael@0: L_5: michael@0: movd mm0, [esi] ; mm0 = *a++ michael@0: add esi, 4 michael@0: pmuludq mm0, mm1 ; mm0 = b * *a++ michael@0: paddq mm2, mm0 ; add the carry michael@0: movd [edi], mm2 ; store the 32bit result michael@0: add edi, 4 michael@0: psrlq mm2, 32 ; save the carry michael@0: dec ecx ; --a_len michael@0: jnz L_5 ; jmp if a_len != 0 michael@0: L_6: michael@0: movd [edi], mm2 ; *c = carry michael@0: emms michael@0: pop esi michael@0: pop edi michael@0: leave michael@0: ret michael@0: nop michael@0: } michael@0: } michael@0: michael@0: /* michael@0: * ebp - 36: caller's esi michael@0: * ebp - 32: caller's edi michael@0: * ebp - 28: michael@0: * ebp - 24: michael@0: * ebp - 20: michael@0: * ebp - 16: michael@0: * ebp - 12: michael@0: * ebp - 8: michael@0: * ebp - 4: michael@0: * ebp + 0: caller's ebp michael@0: * ebp + 4: return address michael@0: * ebp + 8: a argument michael@0: * ebp + 12: a_len argument michael@0: * ebp + 16: b argument michael@0: * ebp + 20: c argument michael@0: * registers: michael@0: * eax: michael@0: * ebx: carry michael@0: * ecx: a_len michael@0: * edx: michael@0: * esi: a ptr michael@0: * edi: c ptr michael@0: */ michael@0: __declspec(naked) void michael@0: s_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) michael@0: { michael@0: __asm { michael@0: mov eax, is_sse michael@0: cmp eax, 0 michael@0: je s_mpv_mul_d_add_x86 michael@0: jg s_mpv_mul_d_add_sse2 michael@0: call s_mpi_is_sse2 michael@0: mov is_sse, eax michael@0: cmp eax, 0 michael@0: jg s_mpv_mul_d_add_sse2 michael@0: s_mpv_mul_d_add_x86: michael@0: push ebp michael@0: mov ebp,esp michael@0: sub esp,28 michael@0: push edi michael@0: push esi michael@0: push ebx michael@0: mov ebx,0 ; carry = 0 michael@0: mov ecx,[ebp+12] ; ecx = a_len michael@0: mov edi,[ebp+20] michael@0: cmp ecx,0 michael@0: je L_11 ; jmp if a_len == 0 michael@0: mov esi,[ebp+8] ; esi = a michael@0: cld michael@0: L_10: michael@0: lodsd ; eax = [ds:esi]; esi += 4 michael@0: mov edx,[ebp+16] ; edx = b michael@0: mul edx ; edx:eax = Phi:Plo = a_i * b michael@0: michael@0: add eax,ebx ; add carry (ebx) to edx:eax michael@0: adc edx,0 michael@0: mov ebx,[edi] ; add in current word from *c michael@0: add eax,ebx michael@0: adc edx,0 michael@0: mov ebx,edx ; high half of product becomes next carry michael@0: michael@0: stosd ; [es:edi] = ax; edi += 4; michael@0: dec ecx ; --a_len michael@0: jnz L_10 ; jmp if a_len != 0 michael@0: L_11: michael@0: mov [edi],ebx ; *c = carry michael@0: pop ebx michael@0: pop esi michael@0: pop edi michael@0: leave michael@0: ret michael@0: nop michael@0: s_mpv_mul_d_add_sse2: michael@0: push ebp michael@0: mov ebp, esp michael@0: push edi michael@0: push esi michael@0: psubq mm2, mm2 ; carry = 0 michael@0: mov ecx, [ebp+12] ; ecx = a_len michael@0: movd mm1, [ebp+16] ; mm1 = b michael@0: mov edi, [ebp+20] michael@0: cmp ecx, 0 michael@0: je L_16 ; jmp if a_len == 0 michael@0: mov esi, [ebp+8] ; esi = a michael@0: cld michael@0: L_15: michael@0: movd mm0, [esi] ; mm0 = *a++ michael@0: add esi, 4 michael@0: pmuludq mm0, mm1 ; mm0 = b * *a++ michael@0: paddq mm2, mm0 ; add the carry michael@0: movd mm0, [edi] michael@0: paddq mm2, mm0 ; add the carry michael@0: movd [edi], mm2 ; store the 32bit result michael@0: add edi, 4 michael@0: psrlq mm2, 32 ; save the carry michael@0: dec ecx ; --a_len michael@0: jnz L_15 ; jmp if a_len != 0 michael@0: L_16: michael@0: movd [edi], mm2 ; *c = carry michael@0: emms michael@0: pop esi michael@0: pop edi michael@0: leave michael@0: ret michael@0: nop michael@0: } michael@0: } michael@0: michael@0: /* michael@0: * ebp - 36: caller's esi michael@0: * ebp - 32: caller's edi michael@0: * ebp - 28: michael@0: * ebp - 24: michael@0: * ebp - 20: michael@0: * ebp - 16: michael@0: * ebp - 12: michael@0: * ebp - 8: michael@0: * ebp - 4: michael@0: * ebp + 0: caller's ebp michael@0: * ebp + 4: return address michael@0: * ebp + 8: a argument michael@0: * ebp + 12: a_len argument michael@0: * ebp + 16: b argument michael@0: * ebp + 20: c argument michael@0: * registers: michael@0: * eax: michael@0: * ebx: carry michael@0: * ecx: a_len michael@0: * edx: michael@0: * esi: a ptr michael@0: * edi: c ptr michael@0: */ michael@0: __declspec(naked) void michael@0: s_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) michael@0: { michael@0: __asm { michael@0: mov eax, is_sse michael@0: cmp eax, 0 michael@0: je s_mpv_mul_d_add_prop_x86 michael@0: jg s_mpv_mul_d_add_prop_sse2 michael@0: call s_mpi_is_sse2 michael@0: mov is_sse, eax michael@0: cmp eax, 0 michael@0: jg s_mpv_mul_d_add_prop_sse2 michael@0: s_mpv_mul_d_add_prop_x86: michael@0: push ebp michael@0: mov ebp,esp michael@0: sub esp,28 michael@0: push edi michael@0: push esi michael@0: push ebx michael@0: mov ebx,0 ; carry = 0 michael@0: mov ecx,[ebp+12] ; ecx = a_len michael@0: mov edi,[ebp+20] michael@0: cmp ecx,0 michael@0: je L_21 ; jmp if a_len == 0 michael@0: cld michael@0: mov esi,[ebp+8] ; esi = a michael@0: L_20: michael@0: lodsd ; eax = [ds:esi]; esi += 4 michael@0: mov edx,[ebp+16] ; edx = b michael@0: mul edx ; edx:eax = Phi:Plo = a_i * b michael@0: michael@0: add eax,ebx ; add carry (ebx) to edx:eax michael@0: adc edx,0 michael@0: mov ebx,[edi] ; add in current word from *c michael@0: add eax,ebx michael@0: adc edx,0 michael@0: mov ebx,edx ; high half of product becomes next carry michael@0: michael@0: stosd ; [es:edi] = ax; edi += 4; michael@0: dec ecx ; --a_len michael@0: jnz L_20 ; jmp if a_len != 0 michael@0: L_21: michael@0: cmp ebx,0 ; is carry zero? michael@0: jz L_23 michael@0: mov eax,[edi] ; add in current word from *c michael@0: add eax,ebx michael@0: stosd ; [es:edi] = ax; edi += 4; michael@0: jnc L_23 michael@0: L_22: michael@0: mov eax,[edi] ; add in current word from *c michael@0: adc eax,0 michael@0: stosd ; [es:edi] = ax; edi += 4; michael@0: jc L_22 michael@0: L_23: michael@0: pop ebx michael@0: pop esi michael@0: pop edi michael@0: leave michael@0: ret michael@0: nop michael@0: s_mpv_mul_d_add_prop_sse2: michael@0: push ebp michael@0: mov ebp, esp michael@0: push edi michael@0: push esi michael@0: push ebx michael@0: psubq mm2, mm2 ; carry = 0 michael@0: mov ecx, [ebp+12] ; ecx = a_len michael@0: movd mm1, [ebp+16] ; mm1 = b michael@0: mov edi, [ebp+20] michael@0: cmp ecx, 0 michael@0: je L_26 ; jmp if a_len == 0 michael@0: mov esi, [ebp+8] ; esi = a michael@0: cld michael@0: L_25: michael@0: movd mm0, [esi] ; mm0 = *a++ michael@0: movd mm3, [edi] ; fetch the sum michael@0: add esi, 4 michael@0: pmuludq mm0, mm1 ; mm0 = b * *a++ michael@0: paddq mm2, mm0 ; add the carry michael@0: paddq mm2, mm3 ; add *c++ michael@0: movd [edi], mm2 ; store the 32bit result michael@0: add edi, 4 michael@0: psrlq mm2, 32 ; save the carry michael@0: dec ecx ; --a_len michael@0: jnz L_25 ; jmp if a_len != 0 michael@0: L_26: michael@0: movd ebx, mm2 michael@0: cmp ebx, 0 ; is carry zero? michael@0: jz L_28 michael@0: mov eax, [edi] michael@0: add eax, ebx michael@0: stosd michael@0: jnc L_28 michael@0: L_27: michael@0: mov eax, [edi] ; add in current word from *c michael@0: adc eax, 0 michael@0: stosd ; [es:edi] = ax; edi += 4; michael@0: jc L_27 michael@0: L_28: michael@0: emms michael@0: pop ebx michael@0: pop esi michael@0: pop edi michael@0: leave michael@0: ret michael@0: nop michael@0: } michael@0: } michael@0: michael@0: /* michael@0: * ebp - 20: caller's esi michael@0: * ebp - 16: caller's edi michael@0: * ebp - 12: michael@0: * ebp - 8: carry michael@0: * ebp - 4: a_len local michael@0: * ebp + 0: caller's ebp michael@0: * ebp + 4: return address michael@0: * ebp + 8: pa argument michael@0: * ebp + 12: a_len argument michael@0: * ebp + 16: ps argument michael@0: * ebp + 20: michael@0: * registers: michael@0: * eax: michael@0: * ebx: carry michael@0: * ecx: a_len michael@0: * edx: michael@0: * esi: a ptr michael@0: * edi: c ptr michael@0: */ michael@0: __declspec(naked) void michael@0: s_mpv_sqr_add_prop(const mp_digit *a, mp_size a_len, mp_digit *sqrs) michael@0: { michael@0: __asm { michael@0: mov eax, is_sse michael@0: cmp eax, 0 michael@0: je s_mpv_sqr_add_prop_x86 michael@0: jg s_mpv_sqr_add_prop_sse2 michael@0: call s_mpi_is_sse2 michael@0: mov is_sse, eax michael@0: cmp eax, 0 michael@0: jg s_mpv_sqr_add_prop_sse2 michael@0: s_mpv_sqr_add_prop_x86: michael@0: push ebp michael@0: mov ebp,esp michael@0: sub esp,12 michael@0: push edi michael@0: push esi michael@0: push ebx michael@0: mov ebx,0 ; carry = 0 michael@0: mov ecx,[ebp+12] ; a_len michael@0: mov edi,[ebp+16] ; edi = ps michael@0: cmp ecx,0 michael@0: je L_31 ; jump if a_len == 0 michael@0: cld michael@0: mov esi,[ebp+8] ; esi = pa michael@0: L_30: michael@0: lodsd ; eax = [ds:si]; si += 4; michael@0: mul eax michael@0: michael@0: add eax,ebx ; add "carry" michael@0: adc edx,0 michael@0: mov ebx,[edi] michael@0: add eax,ebx ; add low word from result michael@0: mov ebx,[edi+4] michael@0: stosd ; [es:di] = eax; di += 4; michael@0: adc edx,ebx ; add high word from result michael@0: mov ebx,0 michael@0: mov eax,edx michael@0: adc ebx,0 michael@0: stosd ; [es:di] = eax; di += 4; michael@0: dec ecx ; --a_len michael@0: jnz L_30 ; jmp if a_len != 0 michael@0: L_31: michael@0: cmp ebx,0 ; is carry zero? michael@0: jz L_34 michael@0: mov eax,[edi] ; add in current word from *c michael@0: add eax,ebx michael@0: stosd ; [es:edi] = ax; edi += 4; michael@0: jnc L_34 michael@0: L_32: michael@0: mov eax,[edi] ; add in current word from *c michael@0: adc eax,0 michael@0: stosd ; [es:edi] = ax; edi += 4; michael@0: jc L_32 michael@0: L_34: michael@0: pop ebx michael@0: pop esi michael@0: pop edi michael@0: leave michael@0: ret michael@0: nop michael@0: s_mpv_sqr_add_prop_sse2: michael@0: push ebp michael@0: mov ebp, esp michael@0: push edi michael@0: push esi michael@0: push ebx michael@0: psubq mm2, mm2 ; carry = 0 michael@0: mov ecx, [ebp+12] ; ecx = a_len michael@0: mov edi, [ebp+16] michael@0: cmp ecx, 0 michael@0: je L_36 ; jmp if a_len == 0 michael@0: mov esi, [ebp+8] ; esi = a michael@0: cld michael@0: L_35: michael@0: movd mm0, [esi] ; mm0 = *a michael@0: movd mm3, [edi] ; fetch the sum michael@0: add esi, 4 michael@0: pmuludq mm0, mm0 ; mm0 = sqr(a) michael@0: paddq mm2, mm0 ; add the carry michael@0: paddq mm2, mm3 ; add the low word michael@0: movd mm3, [edi+4] michael@0: movd [edi], mm2 ; store the 32bit result michael@0: psrlq mm2, 32 michael@0: paddq mm2, mm3 ; add the high word michael@0: movd [edi+4], mm2 ; store the 32bit result michael@0: psrlq mm2, 32 ; save the carry. michael@0: add edi, 8 michael@0: dec ecx ; --a_len michael@0: jnz L_35 ; jmp if a_len != 0 michael@0: L_36: michael@0: movd ebx, mm2 michael@0: cmp ebx, 0 ; is carry zero? michael@0: jz L_38 michael@0: mov eax, [edi] michael@0: add eax, ebx michael@0: stosd michael@0: jnc L_38 michael@0: L_37: michael@0: mov eax, [edi] ; add in current word from *c michael@0: adc eax, 0 michael@0: stosd ; [es:edi] = ax; edi += 4; michael@0: jc L_37 michael@0: L_38: michael@0: emms michael@0: pop ebx michael@0: pop esi michael@0: pop edi michael@0: leave michael@0: ret michael@0: nop michael@0: } michael@0: } michael@0: michael@0: /* michael@0: * Divide 64-bit (Nhi,Nlo) by 32-bit divisor, which must be normalized michael@0: * so its high bit is 1. This code is from NSPR. michael@0: * michael@0: * Dump of assembler code for function s_mpv_div_2dx1d: michael@0: * michael@0: * esp + 0: Caller's ebx michael@0: * esp + 4: return address michael@0: * esp + 8: Nhi argument michael@0: * esp + 12: Nlo argument michael@0: * esp + 16: divisor argument michael@0: * esp + 20: qp argument michael@0: * esp + 24: rp argument michael@0: * registers: michael@0: * eax: michael@0: * ebx: carry michael@0: * ecx: a_len michael@0: * edx: michael@0: * esi: a ptr michael@0: * edi: c ptr michael@0: */ michael@0: __declspec(naked) mp_err michael@0: s_mpv_div_2dx1d(mp_digit Nhi, mp_digit Nlo, mp_digit divisor, michael@0: mp_digit *qp, mp_digit *rp) michael@0: { michael@0: __asm { michael@0: push ebx michael@0: mov edx,[esp+8] michael@0: mov eax,[esp+12] michael@0: mov ebx,[esp+16] michael@0: div ebx michael@0: mov ebx,[esp+20] michael@0: mov [ebx],eax michael@0: mov ebx,[esp+24] michael@0: mov [ebx],edx michael@0: xor eax,eax ; return zero michael@0: pop ebx michael@0: ret michael@0: nop michael@0: } michael@0: }