michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public michael@0: * License, v. 2.0. If a copy of the MPL was not distributed with this michael@0: * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ michael@0: michael@0: #include "mpi-priv.h" michael@0: #include michael@0: michael@0: michael@0: #define MP_MUL_DxD(a, b, Phi, Plo) \ michael@0: { Plo = asm ("mulq %a0, %a1, %v0", a, b); \ michael@0: Phi = asm ("umulh %a0, %a1, %v0", a, b); } \ michael@0: michael@0: /* This is empty for the loop in s_mpv_mul_d */ michael@0: #define CARRY_ADD michael@0: michael@0: #define ONE_MUL \ michael@0: a_i = *a++; \ michael@0: MP_MUL_DxD(a_i, b, a1b1, a0b0); \ michael@0: a0b0 += carry; \ michael@0: if (a0b0 < carry) \ michael@0: ++a1b1; \ michael@0: CARRY_ADD \ michael@0: *c++ = a0b0; \ michael@0: carry = a1b1; \ michael@0: michael@0: #define FOUR_MUL \ michael@0: ONE_MUL \ michael@0: ONE_MUL \ michael@0: ONE_MUL \ michael@0: ONE_MUL \ michael@0: michael@0: #define SIXTEEN_MUL \ michael@0: FOUR_MUL \ michael@0: FOUR_MUL \ michael@0: FOUR_MUL \ michael@0: FOUR_MUL \ michael@0: michael@0: #define THIRTYTWO_MUL \ michael@0: SIXTEEN_MUL \ michael@0: SIXTEEN_MUL \ michael@0: michael@0: #define ONETWENTYEIGHT_MUL \ michael@0: THIRTYTWO_MUL \ michael@0: THIRTYTWO_MUL \ michael@0: THIRTYTWO_MUL \ michael@0: THIRTYTWO_MUL \ michael@0: michael@0: michael@0: #define EXPAND_256(CALL) \ michael@0: mp_digit carry = 0; \ michael@0: mp_digit a_i; \ michael@0: mp_digit a0b0, a1b1; \ michael@0: if (a_len &255) { \ michael@0: if (a_len &1) { \ michael@0: ONE_MUL \ michael@0: } \ michael@0: if (a_len &2) { \ michael@0: ONE_MUL \ michael@0: ONE_MUL \ michael@0: } \ michael@0: if (a_len &4) { \ michael@0: FOUR_MUL \ michael@0: } \ michael@0: if (a_len &8) { \ michael@0: FOUR_MUL \ michael@0: FOUR_MUL \ michael@0: } \ michael@0: if (a_len & 16 ) { \ michael@0: SIXTEEN_MUL \ michael@0: } \ michael@0: if (a_len & 32 ) { \ michael@0: THIRTYTWO_MUL \ michael@0: } \ michael@0: if (a_len & 64 ) { \ michael@0: THIRTYTWO_MUL \ michael@0: THIRTYTWO_MUL \ michael@0: } \ michael@0: if (a_len & 128) { \ michael@0: ONETWENTYEIGHT_MUL \ michael@0: } \ michael@0: a_len = a_len & (-256); \ michael@0: } \ michael@0: if (a_len>=256 ) { \ michael@0: carry = CALL(a, a_len, b, c, carry); \ michael@0: c += a_len; \ michael@0: } \ michael@0: michael@0: #define FUNC_NAME(NAME) \ michael@0: mp_digit NAME(const mp_digit *a, \ michael@0: mp_size a_len, \ michael@0: mp_digit b, mp_digit *c, \ michael@0: mp_digit carry) \ michael@0: michael@0: #define DECLARE_MUL_256(FNAME) \ michael@0: FUNC_NAME(FNAME) \ michael@0: { \ michael@0: mp_digit a_i; \ michael@0: mp_digit a0b0, a1b1; \ michael@0: while (a_len) { \ michael@0: ONETWENTYEIGHT_MUL \ michael@0: ONETWENTYEIGHT_MUL \ michael@0: a_len-= 256; \ michael@0: } \ michael@0: return carry; \ michael@0: } \ michael@0: michael@0: /* Expanding the loop in s_mpv_mul_d appeared to slow down the michael@0: (admittedly) small number of tests (i.e., timetest) used to michael@0: measure performance, so this define disables that optimization. */ michael@0: #define DO_NOT_EXPAND 1 michael@0: michael@0: /* Need forward declaration so it can be instantiated after michael@0: the routine that uses it; this helps locality somewhat */ michael@0: #if !defined(DO_NOT_EXPAND) michael@0: FUNC_NAME(s_mpv_mul_d_MUL256); michael@0: #endif michael@0: michael@0: /* c = a * b */ michael@0: void s_mpv_mul_d(const mp_digit *a, mp_size a_len, michael@0: mp_digit b, mp_digit *c) michael@0: { michael@0: #if defined(DO_NOT_EXPAND) michael@0: mp_digit carry = 0; michael@0: while (a_len--) { michael@0: mp_digit a_i = *a++; michael@0: mp_digit a0b0, a1b1; michael@0: michael@0: MP_MUL_DxD(a_i, b, a1b1, a0b0); michael@0: michael@0: a0b0 += carry; michael@0: if (a0b0 < carry) michael@0: ++a1b1; michael@0: *c++ = a0b0; michael@0: carry = a1b1; michael@0: } michael@0: #else michael@0: EXPAND_256(s_mpv_mul_d_MUL256) michael@0: #endif michael@0: *c = carry; michael@0: } michael@0: michael@0: #if !defined(DO_NOT_EXPAND) michael@0: DECLARE_MUL_256(s_mpv_mul_d_MUL256) michael@0: #endif michael@0: michael@0: #undef CARRY_ADD michael@0: /* This is redefined for the loop in s_mpv_mul_d_add */ michael@0: #define CARRY_ADD \ michael@0: a0b0 += a_i = *c; \ michael@0: if (a0b0 < a_i) \ michael@0: ++a1b1; \ michael@0: michael@0: /* Need forward declaration so it can be instantiated between the michael@0: two routines that use it; this helps locality somewhat */ michael@0: FUNC_NAME(s_mpv_mul_d_add_MUL256); michael@0: michael@0: /* c += a * b */ michael@0: void s_mpv_mul_d_add(const mp_digit *a, mp_size a_len, michael@0: mp_digit b, mp_digit *c) michael@0: { michael@0: EXPAND_256(s_mpv_mul_d_add_MUL256) michael@0: *c = carry; michael@0: } michael@0: michael@0: /* Instantiate multiply 256 routine here */ michael@0: DECLARE_MUL_256(s_mpv_mul_d_add_MUL256) michael@0: michael@0: /* Presently, this is only used by the Montgomery arithmetic code. */ michael@0: /* c += a * b */ michael@0: void s_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, michael@0: mp_digit b, mp_digit *c) michael@0: { michael@0: EXPAND_256(s_mpv_mul_d_add_MUL256) michael@0: while (carry) { michael@0: mp_digit c_i = *c; michael@0: carry += c_i; michael@0: *c++ = carry; michael@0: carry = carry < c_i; michael@0: } michael@0: } michael@0: