michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public michael@0: * License, v. 2.0. If a copy of the MPL was not distributed with this michael@0: * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ michael@0: michael@0: /* Multiplication performance enhancements for sparc v8+vis CPUs. */ michael@0: michael@0: #include "mpi-priv.h" michael@0: #include michael@0: #include michael@0: #include michael@0: michael@0: /* In the functions below, */ michael@0: /* vector y must be 8-byte aligned, and n must be even */ michael@0: /* returns carry out of high order word of result */ michael@0: /* maximum n is 256 */ michael@0: michael@0: /* vector x += vector y * scalar a; where y is of length n words. */ michael@0: extern mp_digit mul_add_inp(mp_digit *x, const mp_digit *y, int n, mp_digit a); michael@0: michael@0: /* vector z = vector x + vector y * scalar a; where y is of length n words. */ michael@0: extern mp_digit mul_add(mp_digit *z, const mp_digit *x, const mp_digit *y, michael@0: int n, mp_digit a); michael@0: michael@0: /* v8 versions of these functions run on any Sparc v8 CPU. */ michael@0: michael@0: /* This trick works on Sparc V8 CPUs with the Workshop compilers. 
*/ michael@0: #define MP_MUL_DxD(a, b, Phi, Plo) \ michael@0: { unsigned long long product = (unsigned long long)a * b; \ michael@0: Plo = (mp_digit)product; \ michael@0: Phi = (mp_digit)(product >> MP_DIGIT_BIT); } michael@0: michael@0: /* c = a * b */ michael@0: static void michael@0: v8_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) michael@0: { michael@0: #if !defined(MP_NO_MP_WORD) michael@0: mp_digit d = 0; michael@0: michael@0: /* Inner product: Digits of a */ michael@0: while (a_len--) { michael@0: mp_word w = ((mp_word)b * *a++) + d; michael@0: *c++ = ACCUM(w); michael@0: d = CARRYOUT(w); michael@0: } michael@0: *c = d; michael@0: #else michael@0: mp_digit carry = 0; michael@0: while (a_len--) { michael@0: mp_digit a_i = *a++; michael@0: mp_digit a0b0, a1b1; michael@0: michael@0: MP_MUL_DxD(a_i, b, a1b1, a0b0); michael@0: michael@0: a0b0 += carry; michael@0: if (a0b0 < carry) michael@0: ++a1b1; michael@0: *c++ = a0b0; michael@0: carry = a1b1; michael@0: } michael@0: *c = carry; michael@0: #endif michael@0: } michael@0: michael@0: /* c += a * b */ michael@0: static void michael@0: v8_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) michael@0: { michael@0: #if !defined(MP_NO_MP_WORD) michael@0: mp_digit d = 0; michael@0: michael@0: /* Inner product: Digits of a */ michael@0: while (a_len--) { michael@0: mp_word w = ((mp_word)b * *a++) + *c + d; michael@0: *c++ = ACCUM(w); michael@0: d = CARRYOUT(w); michael@0: } michael@0: *c = d; michael@0: #else michael@0: mp_digit carry = 0; michael@0: while (a_len--) { michael@0: mp_digit a_i = *a++; michael@0: mp_digit a0b0, a1b1; michael@0: michael@0: MP_MUL_DxD(a_i, b, a1b1, a0b0); michael@0: michael@0: a0b0 += carry; michael@0: if (a0b0 < carry) michael@0: ++a1b1; michael@0: a0b0 += a_i = *c; michael@0: if (a0b0 < a_i) michael@0: ++a1b1; michael@0: *c++ = a0b0; michael@0: carry = a1b1; michael@0: } michael@0: *c = carry; michael@0: #endif michael@0: } michael@0: 
michael@0: /* Presently, this is only used by the Montgomery arithmetic code. */ michael@0: /* c += a * b */ michael@0: static void michael@0: v8_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) michael@0: { michael@0: #if !defined(MP_NO_MP_WORD) michael@0: mp_digit d = 0; michael@0: michael@0: /* Inner product: Digits of a */ michael@0: while (a_len--) { michael@0: mp_word w = ((mp_word)b * *a++) + *c + d; michael@0: *c++ = ACCUM(w); michael@0: d = CARRYOUT(w); michael@0: } michael@0: michael@0: while (d) { michael@0: mp_word w = (mp_word)*c + d; michael@0: *c++ = ACCUM(w); michael@0: d = CARRYOUT(w); michael@0: } michael@0: #else michael@0: mp_digit carry = 0; michael@0: while (a_len--) { michael@0: mp_digit a_i = *a++; michael@0: mp_digit a0b0, a1b1; michael@0: michael@0: MP_MUL_DxD(a_i, b, a1b1, a0b0); michael@0: michael@0: a0b0 += carry; michael@0: if (a0b0 < carry) michael@0: ++a1b1; michael@0: michael@0: a0b0 += a_i = *c; michael@0: if (a0b0 < a_i) michael@0: ++a1b1; michael@0: michael@0: *c++ = a0b0; michael@0: carry = a1b1; michael@0: } michael@0: while (carry) { michael@0: mp_digit c_i = *c; michael@0: carry += c_i; michael@0: *c++ = carry; michael@0: carry = carry < c_i; michael@0: } michael@0: #endif michael@0: } michael@0: michael@0: /* These functions run only on v8plus+vis or v9+vis CPUs. */ michael@0: michael@0: /* c = a * b */ michael@0: void michael@0: s_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) michael@0: { michael@0: mp_digit d; michael@0: mp_digit x[258]; michael@0: if (a_len <= 256) { michael@0: if (a == c || ((ptrdiff_t)a & 0x7) != 0 || (a_len & 1) != 0) { michael@0: mp_digit * px; michael@0: px = (((ptrdiff_t)x & 0x7) != 0) ? 
x + 1 : x; michael@0: memcpy(px, a, a_len * sizeof(*a)); michael@0: a = px; michael@0: if (a_len & 1) { michael@0: px[a_len] = 0; michael@0: } michael@0: } michael@0: s_mp_setz(c, a_len + 1); michael@0: d = mul_add_inp(c, a, a_len, b); michael@0: c[a_len] = d; michael@0: } else { michael@0: v8_mpv_mul_d(a, a_len, b, c); michael@0: } michael@0: } michael@0: michael@0: /* c += a * b, where a is a_len words long. */ michael@0: void michael@0: s_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) michael@0: { michael@0: mp_digit d; michael@0: mp_digit x[258]; michael@0: if (a_len <= 256) { michael@0: if (((ptrdiff_t)a & 0x7) != 0 || (a_len & 1) != 0) { michael@0: mp_digit * px; michael@0: px = (((ptrdiff_t)x & 0x7) != 0) ? x + 1 : x; michael@0: memcpy(px, a, a_len * sizeof(*a)); michael@0: a = px; michael@0: if (a_len & 1) { michael@0: px[a_len] = 0; michael@0: } michael@0: } michael@0: d = mul_add_inp(c, a, a_len, b); michael@0: c[a_len] = d; michael@0: } else { michael@0: v8_mpv_mul_d_add(a, a_len, b, c); michael@0: } michael@0: } michael@0: michael@0: /* c += a * b, where a is y words long. */ michael@0: void michael@0: s_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) michael@0: { michael@0: mp_digit d; michael@0: mp_digit x[258]; michael@0: if (a_len <= 256) { michael@0: if (((ptrdiff_t)a & 0x7) != 0 || (a_len & 1) != 0) { michael@0: mp_digit * px; michael@0: px = (((ptrdiff_t)x & 0x7) != 0) ? x + 1 : x; michael@0: memcpy(px, a, a_len * sizeof(*a)); michael@0: a = px; michael@0: if (a_len & 1) { michael@0: px[a_len] = 0; michael@0: } michael@0: } michael@0: d = mul_add_inp(c, a, a_len, b); michael@0: if (d) { michael@0: c += a_len; michael@0: do { michael@0: mp_digit sum = d + *c; michael@0: *c++ = sum; michael@0: d = sum < d; michael@0: } while (d); michael@0: } michael@0: } else { michael@0: v8_mpv_mul_d_add_prop(a, a_len, b, c); michael@0: } michael@0: }