1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/security/nss/lib/freebl/mpi/mpi_sparc.c Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,224 @@ 1.4 +/* This Source Code Form is subject to the terms of the Mozilla Public 1.5 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.6 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.7 + 1.8 +/* Multiplication performance enhancements for sparc v8+vis CPUs. */ 1.9 + 1.10 +#include "mpi-priv.h" 1.11 +#include <stddef.h> 1.12 +#include <sys/systeminfo.h> 1.13 +#include <strings.h> 1.14 + 1.15 +/* In the functions below, */ 1.16 +/* vector y must be 8-byte aligned, and n must be even */ 1.17 +/* returns carry out of high order word of result */ 1.18 +/* maximum n is 256 */ 1.19 + 1.20 +/* vector x += vector y * scalar a; where y is of length n words. */ 1.21 +extern mp_digit mul_add_inp(mp_digit *x, const mp_digit *y, int n, mp_digit a); 1.22 + 1.23 +/* vector z = vector x + vector y * scalar a; where y is of length n words. */ 1.24 +extern mp_digit mul_add(mp_digit *z, const mp_digit *x, const mp_digit *y, 1.25 + int n, mp_digit a); 1.26 + 1.27 +/* v8 versions of these functions run on any Sparc v8 CPU. */ 1.28 + 1.29 +/* This trick works on Sparc V8 CPUs with the Workshop compilers. 
*/ 1.30 +#define MP_MUL_DxD(a, b, Phi, Plo) \ 1.31 + { unsigned long long product = (unsigned long long)a * b; \ 1.32 + Plo = (mp_digit)product; \ 1.33 + Phi = (mp_digit)(product >> MP_DIGIT_BIT); } 1.34 + 1.35 +/* c = a * b */ 1.36 +static void 1.37 +v8_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) 1.38 +{ 1.39 +#if !defined(MP_NO_MP_WORD) 1.40 + mp_digit d = 0; 1.41 + 1.42 + /* Inner product: Digits of a */ 1.43 + while (a_len--) { 1.44 + mp_word w = ((mp_word)b * *a++) + d; 1.45 + *c++ = ACCUM(w); 1.46 + d = CARRYOUT(w); 1.47 + } 1.48 + *c = d; 1.49 +#else 1.50 + mp_digit carry = 0; 1.51 + while (a_len--) { 1.52 + mp_digit a_i = *a++; 1.53 + mp_digit a0b0, a1b1; 1.54 + 1.55 + MP_MUL_DxD(a_i, b, a1b1, a0b0); 1.56 + 1.57 + a0b0 += carry; 1.58 + if (a0b0 < carry) 1.59 + ++a1b1; 1.60 + *c++ = a0b0; 1.61 + carry = a1b1; 1.62 + } 1.63 + *c = carry; 1.64 +#endif 1.65 +} 1.66 + 1.67 +/* c += a * b */ 1.68 +static void 1.69 +v8_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) 1.70 +{ 1.71 +#if !defined(MP_NO_MP_WORD) 1.72 + mp_digit d = 0; 1.73 + 1.74 + /* Inner product: Digits of a */ 1.75 + while (a_len--) { 1.76 + mp_word w = ((mp_word)b * *a++) + *c + d; 1.77 + *c++ = ACCUM(w); 1.78 + d = CARRYOUT(w); 1.79 + } 1.80 + *c = d; 1.81 +#else 1.82 + mp_digit carry = 0; 1.83 + while (a_len--) { 1.84 + mp_digit a_i = *a++; 1.85 + mp_digit a0b0, a1b1; 1.86 + 1.87 + MP_MUL_DxD(a_i, b, a1b1, a0b0); 1.88 + 1.89 + a0b0 += carry; 1.90 + if (a0b0 < carry) 1.91 + ++a1b1; 1.92 + a0b0 += a_i = *c; 1.93 + if (a0b0 < a_i) 1.94 + ++a1b1; 1.95 + *c++ = a0b0; 1.96 + carry = a1b1; 1.97 + } 1.98 + *c = carry; 1.99 +#endif 1.100 +} 1.101 + 1.102 +/* Presently, this is only used by the Montgomery arithmetic code. 
*/ 1.103 +/* c += a * b */ 1.104 +static void 1.105 +v8_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) 1.106 +{ 1.107 +#if !defined(MP_NO_MP_WORD) 1.108 + mp_digit d = 0; 1.109 + 1.110 + /* Inner product: Digits of a */ 1.111 + while (a_len--) { 1.112 + mp_word w = ((mp_word)b * *a++) + *c + d; 1.113 + *c++ = ACCUM(w); 1.114 + d = CARRYOUT(w); 1.115 + } 1.116 + 1.117 + while (d) { 1.118 + mp_word w = (mp_word)*c + d; 1.119 + *c++ = ACCUM(w); 1.120 + d = CARRYOUT(w); 1.121 + } 1.122 +#else 1.123 + mp_digit carry = 0; 1.124 + while (a_len--) { 1.125 + mp_digit a_i = *a++; 1.126 + mp_digit a0b0, a1b1; 1.127 + 1.128 + MP_MUL_DxD(a_i, b, a1b1, a0b0); 1.129 + 1.130 + a0b0 += carry; 1.131 + if (a0b0 < carry) 1.132 + ++a1b1; 1.133 + 1.134 + a0b0 += a_i = *c; 1.135 + if (a0b0 < a_i) 1.136 + ++a1b1; 1.137 + 1.138 + *c++ = a0b0; 1.139 + carry = a1b1; 1.140 + } 1.141 + while (carry) { 1.142 + mp_digit c_i = *c; 1.143 + carry += c_i; 1.144 + *c++ = carry; 1.145 + carry = carry < c_i; 1.146 + } 1.147 +#endif 1.148 +} 1.149 + 1.150 +/* These functions run only on v8plus+vis or v9+vis CPUs. */ 1.151 + 1.152 +/* c = a * b */ 1.153 +void 1.154 +s_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) 1.155 +{ 1.156 + mp_digit d; 1.157 + mp_digit x[258]; 1.158 + if (a_len <= 256) { 1.159 + if (a == c || ((ptrdiff_t)a & 0x7) != 0 || (a_len & 1) != 0) { 1.160 + mp_digit * px; 1.161 + px = (((ptrdiff_t)x & 0x7) != 0) ? x + 1 : x; 1.162 + memcpy(px, a, a_len * sizeof(*a)); 1.163 + a = px; 1.164 + if (a_len & 1) { 1.165 + px[a_len] = 0; 1.166 + } 1.167 + } 1.168 + s_mp_setz(c, a_len + 1); 1.169 + d = mul_add_inp(c, a, a_len, b); 1.170 + c[a_len] = d; 1.171 + } else { 1.172 + v8_mpv_mul_d(a, a_len, b, c); 1.173 + } 1.174 +} 1.175 + 1.176 +/* c += a * b, where a is a_len words long. 
*/ 1.177 +void 1.178 +s_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) 1.179 +{ 1.180 + mp_digit d; 1.181 + mp_digit x[258]; 1.182 + if (a_len <= 256) { 1.183 + if (((ptrdiff_t)a & 0x7) != 0 || (a_len & 1) != 0) { 1.184 + mp_digit * px; 1.185 + px = (((ptrdiff_t)x & 0x7) != 0) ? x + 1 : x; 1.186 + memcpy(px, a, a_len * sizeof(*a)); 1.187 + a = px; 1.188 + if (a_len & 1) { 1.189 + px[a_len] = 0; 1.190 + } 1.191 + } 1.192 + d = mul_add_inp(c, a, a_len, b); 1.193 + c[a_len] = d; 1.194 + } else { 1.195 + v8_mpv_mul_d_add(a, a_len, b, c); 1.196 + } 1.197 +} 1.198 + 1.199 +/* c += a * b, where a is y words long. */ 1.200 +void 1.201 +s_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) 1.202 +{ 1.203 + mp_digit d; 1.204 + mp_digit x[258]; 1.205 + if (a_len <= 256) { 1.206 + if (((ptrdiff_t)a & 0x7) != 0 || (a_len & 1) != 0) { 1.207 + mp_digit * px; 1.208 + px = (((ptrdiff_t)x & 0x7) != 0) ? x + 1 : x; 1.209 + memcpy(px, a, a_len * sizeof(*a)); 1.210 + a = px; 1.211 + if (a_len & 1) { 1.212 + px[a_len] = 0; 1.213 + } 1.214 + } 1.215 + d = mul_add_inp(c, a, a_len, b); 1.216 + if (d) { 1.217 + c += a_len; 1.218 + do { 1.219 + mp_digit sum = d + *c; 1.220 + *c++ = sum; 1.221 + d = sum < d; 1.222 + } while (d); 1.223 + } 1.224 + } else { 1.225 + v8_mpv_mul_d_add_prop(a, a_len, b, c); 1.226 + } 1.227 +}