security/nss/lib/freebl/mpi/mpi_sparc.c

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/security/nss/lib/freebl/mpi/mpi_sparc.c	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,224 @@
     1.4 +/* This Source Code Form is subject to the terms of the Mozilla Public
     1.5 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.6 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.7 +
     1.8 +/* Multiplication performance enhancements for sparc v8+vis CPUs. */
     1.9 +
    1.10 +#include "mpi-priv.h"
    1.11 +#include <stddef.h>
    1.12 +#include <sys/systeminfo.h>
    1.13 +#include <strings.h>
    1.14 +
    1.15 +/* In the functions below, */
    1.16 +/* vector y must be 8-byte aligned, and n must be even */
    1.17 +/* returns carry out of high order word of result */
    1.18 +/* maximum n is 256 */
    1.19 +
    1.20 +/* vector x += vector y * scalar a; where y is of length n words. */
    1.21 +extern mp_digit mul_add_inp(mp_digit *x, const mp_digit *y, int n, mp_digit a);
    1.22 +
    1.23 +/* vector z = vector x + vector y * scalar a; where y is of length n words. */
    1.24 +extern mp_digit mul_add(mp_digit *z, const mp_digit *x, const mp_digit *y, 
    1.25 +			int n, mp_digit a);
    1.26 +
    1.27 +/* v8 versions of these functions run on any Sparc v8 CPU. */
    1.28 +
    1.29 +/* This trick works on Sparc V8 CPUs with the Workshop compilers. */
    1.30 +#define MP_MUL_DxD(a, b, Phi, Plo) \
    1.31 +  { unsigned long long product = (unsigned long long)a * b; \
    1.32 +    Plo = (mp_digit)product; \
    1.33 +    Phi = (mp_digit)(product >> MP_DIGIT_BIT); }
    1.34 +
    1.35 +/* c = a * b */
    1.36 +static void 
    1.37 +v8_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
    1.38 +{
    1.39 +#if !defined(MP_NO_MP_WORD)
    1.40 +  mp_digit   d = 0;
    1.41 +
    1.42 +  /* Inner product:  Digits of a */
    1.43 +  while (a_len--) {
    1.44 +    mp_word w = ((mp_word)b * *a++) + d;
    1.45 +    *c++ = ACCUM(w);
    1.46 +    d = CARRYOUT(w);
    1.47 +  }
    1.48 +  *c = d;
    1.49 +#else
    1.50 +  mp_digit carry = 0;
    1.51 +  while (a_len--) {
    1.52 +    mp_digit a_i = *a++;
    1.53 +    mp_digit a0b0, a1b1;
    1.54 +
    1.55 +    MP_MUL_DxD(a_i, b, a1b1, a0b0);
    1.56 +
    1.57 +    a0b0 += carry;
    1.58 +    if (a0b0 < carry)
    1.59 +      ++a1b1;
    1.60 +    *c++ = a0b0;
    1.61 +    carry = a1b1;
    1.62 +  }
    1.63 +  *c = carry;
    1.64 +#endif
    1.65 +}
    1.66 +
    1.67 +/* c += a * b */
    1.68 +static void 
    1.69 +v8_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
    1.70 +{
    1.71 +#if !defined(MP_NO_MP_WORD)
    1.72 +  mp_digit   d = 0;
    1.73 +
    1.74 +  /* Inner product:  Digits of a */
    1.75 +  while (a_len--) {
    1.76 +    mp_word w = ((mp_word)b * *a++) + *c + d;
    1.77 +    *c++ = ACCUM(w);
    1.78 +    d = CARRYOUT(w);
    1.79 +  }
    1.80 +  *c = d;
    1.81 +#else
    1.82 +  mp_digit carry = 0;
    1.83 +  while (a_len--) {
    1.84 +    mp_digit a_i = *a++;
    1.85 +    mp_digit a0b0, a1b1;
    1.86 +
    1.87 +    MP_MUL_DxD(a_i, b, a1b1, a0b0);
    1.88 +
    1.89 +    a0b0 += carry;
    1.90 +    if (a0b0 < carry)
    1.91 +      ++a1b1;
    1.92 +    a0b0 += a_i = *c;
    1.93 +    if (a0b0 < a_i)
    1.94 +      ++a1b1;
    1.95 +    *c++ = a0b0;
    1.96 +    carry = a1b1;
    1.97 +  }
    1.98 +  *c = carry;
    1.99 +#endif
   1.100 +}
   1.101 +
   1.102 +/* Presently, this is only used by the Montgomery arithmetic code. */
   1.103 +/* c += a * b */
   1.104 +static void 
   1.105 +v8_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
   1.106 +{
   1.107 +#if !defined(MP_NO_MP_WORD)
   1.108 +  mp_digit   d = 0;
   1.109 +
   1.110 +  /* Inner product:  Digits of a */
   1.111 +  while (a_len--) {
   1.112 +    mp_word w = ((mp_word)b * *a++) + *c + d;
   1.113 +    *c++ = ACCUM(w);
   1.114 +    d = CARRYOUT(w);
   1.115 +  }
   1.116 +
   1.117 +  while (d) {
   1.118 +    mp_word w = (mp_word)*c + d;
   1.119 +    *c++ = ACCUM(w);
   1.120 +    d = CARRYOUT(w);
   1.121 +  }
   1.122 +#else
   1.123 +  mp_digit carry = 0;
   1.124 +  while (a_len--) {
   1.125 +    mp_digit a_i = *a++;
   1.126 +    mp_digit a0b0, a1b1;
   1.127 +
   1.128 +    MP_MUL_DxD(a_i, b, a1b1, a0b0);
   1.129 +
   1.130 +    a0b0 += carry;
   1.131 +    if (a0b0 < carry)
   1.132 +      ++a1b1;
   1.133 +
   1.134 +    a0b0 += a_i = *c;
   1.135 +    if (a0b0 < a_i)
   1.136 +      ++a1b1;
   1.137 +
   1.138 +    *c++ = a0b0;
   1.139 +    carry = a1b1;
   1.140 +  }
   1.141 +  while (carry) {
   1.142 +    mp_digit c_i = *c;
   1.143 +    carry += c_i;
   1.144 +    *c++ = carry;
   1.145 +    carry = carry < c_i;
   1.146 +  }
   1.147 +#endif
   1.148 +}
   1.149 +
   1.150 +/* These functions run only on v8plus+vis or v9+vis CPUs. */
   1.151 +
   1.152 +/* c = a * b */
   1.153 +void 
   1.154 +s_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
   1.155 +{
   1.156 +    mp_digit d;
   1.157 +    mp_digit x[258];
   1.158 +    if (a_len <= 256) {
   1.159 +	if (a == c || ((ptrdiff_t)a & 0x7) != 0 || (a_len & 1) != 0) {
   1.160 +	    mp_digit * px;
   1.161 +	    px = (((ptrdiff_t)x & 0x7) != 0) ? x + 1 : x;
   1.162 +	    memcpy(px, a, a_len * sizeof(*a));
   1.163 +	    a = px;
   1.164 +	    if (a_len & 1) {
   1.165 +		px[a_len] = 0;
   1.166 +	    }
   1.167 +	}
   1.168 +	s_mp_setz(c, a_len + 1);
   1.169 +	d = mul_add_inp(c, a, a_len, b);
   1.170 +	c[a_len] = d;
   1.171 +    } else {
   1.172 +	v8_mpv_mul_d(a, a_len, b, c);
   1.173 +    }
   1.174 +}
   1.175 +
   1.176 +/* c += a * b, where a is a_len words long. */
   1.177 +void     
   1.178 +s_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
   1.179 +{
   1.180 +    mp_digit d;
   1.181 +    mp_digit x[258];
   1.182 +    if (a_len <= 256) {
   1.183 +	if (((ptrdiff_t)a & 0x7) != 0 || (a_len & 1) != 0) {
   1.184 +	    mp_digit * px;
   1.185 +	    px = (((ptrdiff_t)x & 0x7) != 0) ? x + 1 : x;
   1.186 +	    memcpy(px, a, a_len * sizeof(*a));
   1.187 +	    a = px;
   1.188 +	    if (a_len & 1) {
   1.189 +		px[a_len] = 0;
   1.190 +	    }
   1.191 +	}
   1.192 +	d = mul_add_inp(c, a, a_len, b);
   1.193 +	c[a_len] = d;
   1.194 +    } else {
   1.195 +	v8_mpv_mul_d_add(a, a_len, b, c);
   1.196 +    }
   1.197 +}
   1.198 +
   1.199 +/* c += a * b, where a is y words long. */
   1.200 +void     
   1.201 +s_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
   1.202 +{
   1.203 +    mp_digit d;
   1.204 +    mp_digit x[258];
   1.205 +    if (a_len <= 256) {
   1.206 +	if (((ptrdiff_t)a & 0x7) != 0 || (a_len & 1) != 0) {
   1.207 +	    mp_digit * px;
   1.208 +	    px = (((ptrdiff_t)x & 0x7) != 0) ? x + 1 : x;
   1.209 +	    memcpy(px, a, a_len * sizeof(*a));
   1.210 +	    a = px;
   1.211 +	    if (a_len & 1) {
   1.212 +		px[a_len] = 0;
   1.213 +	    }
   1.214 +	}
   1.215 +	d = mul_add_inp(c, a, a_len, b);
   1.216 +	if (d) {
   1.217 +	    c += a_len;
   1.218 +	    do {
   1.219 +		mp_digit sum = d + *c;
   1.220 +		*c++ = sum;
   1.221 +		d = sum < d;
   1.222 +	    } while (d);
   1.223 +	}
   1.224 +    } else {
   1.225 +	v8_mpv_mul_d_add_prop(a, a_len, b, c);
   1.226 +    }
   1.227 +}

mercurial