1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/security/nss/lib/freebl/mpi/mpmontg.c Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1173 @@ 1.4 +/* This Source Code Form is subject to the terms of the Mozilla Public 1.5 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.6 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.7 + 1.8 +/* This file implements moduluar exponentiation using Montgomery's 1.9 + * method for modular reduction. This file implements the method 1.10 + * described as "Improvement 2" in the paper "A Cryptogrpahic Library for 1.11 + * the Motorola DSP56000" by Stephen R. Dusse' and Burton S. Kaliski Jr. 1.12 + * published in "Advances in Cryptology: Proceedings of EUROCRYPT '90" 1.13 + * "Lecture Notes in Computer Science" volume 473, 1991, pg 230-244, 1.14 + * published by Springer Verlag. 1.15 + */ 1.16 + 1.17 +#define MP_USING_CACHE_SAFE_MOD_EXP 1 1.18 +#include <string.h> 1.19 +#include "mpi-priv.h" 1.20 +#include "mplogic.h" 1.21 +#include "mpprime.h" 1.22 +#ifdef MP_USING_MONT_MULF 1.23 +#include "montmulf.h" 1.24 +#endif 1.25 +#include <stddef.h> /* ptrdiff_t */ 1.26 + 1.27 +/* if MP_CHAR_STORE_SLOW is defined, we */ 1.28 +/* need to know endianness of this platform. */ 1.29 +#ifdef MP_CHAR_STORE_SLOW 1.30 +#if !defined(MP_IS_BIG_ENDIAN) && !defined(MP_IS_LITTLE_ENDIAN) 1.31 +#error "You must define MP_IS_BIG_ENDIAN or MP_IS_LITTLE_ENDIAN\n" \ 1.32 + " if you define MP_CHAR_STORE_SLOW." 1.33 +#endif 1.34 +#endif 1.35 + 1.36 +#define STATIC 1.37 + 1.38 +#define MAX_ODD_INTS 32 /* 2 ** (WINDOW_BITS - 1) */ 1.39 + 1.40 +/*! computes T = REDC(T), 2^b == R 1.41 + \param T < RN 1.42 +*/ 1.43 +mp_err s_mp_redc(mp_int *T, mp_mont_modulus *mmm) 1.44 +{ 1.45 + mp_err res; 1.46 + mp_size i; 1.47 + 1.48 + i = (MP_USED(&mmm->N) << 1) + 1; 1.49 + MP_CHECKOK( s_mp_pad(T, i) ); 1.50 + for (i = 0; i < MP_USED(&mmm->N); ++i ) { 1.51 + mp_digit m_i = MP_DIGIT(T, i) * mmm->n0prime; 1.52 + /* T += N * m_i * (MP_RADIX ** i); */ 1.53 + MP_CHECKOK( s_mp_mul_d_add_offset(&mmm->N, m_i, T, i) ); 1.54 + } 1.55 + s_mp_clamp(T); 1.56 + 1.57 + /* T /= R */ 1.58 + s_mp_rshd( T, MP_USED(&mmm->N) ); 1.59 + 1.60 + if ((res = s_mp_cmp(T, &mmm->N)) >= 0) { 1.61 + /* T = T - N */ 1.62 + MP_CHECKOK( s_mp_sub(T, &mmm->N) ); 1.63 +#ifdef DEBUG 1.64 + if ((res = mp_cmp(T, &mmm->N)) >= 0) { 1.65 + res = MP_UNDEF; 1.66 + goto CLEANUP; 1.67 + } 1.68 +#endif 1.69 + } 1.70 + res = MP_OKAY; 1.71 +CLEANUP: 1.72 + return res; 1.73 +} 1.74 + 1.75 +#if !defined(MP_MONT_USE_MP_MUL) 1.76 + 1.77 +/*! c <- REDC( a * b ) mod N 1.78 + \param a < N i.e. "reduced" 1.79 + \param b < N i.e. "reduced" 1.80 + \param mmm modulus N and n0' of N 1.81 +*/ 1.82 +mp_err s_mp_mul_mont(const mp_int *a, const mp_int *b, mp_int *c, 1.83 + mp_mont_modulus *mmm) 1.84 +{ 1.85 + mp_digit *pb; 1.86 + mp_digit m_i; 1.87 + mp_err res; 1.88 + mp_size ib; /* "index b": index of current digit of B */ 1.89 + mp_size useda, usedb; 1.90 + 1.91 + ARGCHK(a != NULL && b != NULL && c != NULL, MP_BADARG); 1.92 + 1.93 + if (MP_USED(a) < MP_USED(b)) { 1.94 + const mp_int *xch = b; /* switch a and b, to do fewer outer loops */ 1.95 + b = a; 1.96 + a = xch; 1.97 + } 1.98 + 1.99 + MP_USED(c) = 1; MP_DIGIT(c, 0) = 0; 1.100 + ib = (MP_USED(&mmm->N) << 1) + 1; 1.101 + if((res = s_mp_pad(c, ib)) != MP_OKAY) 1.102 + goto CLEANUP; 1.103 + 1.104 + useda = MP_USED(a); 1.105 + pb = MP_DIGITS(b); 1.106 + s_mpv_mul_d(MP_DIGITS(a), useda, *pb++, MP_DIGITS(c)); 1.107 + s_mp_setz(MP_DIGITS(c) + useda + 1, ib - (useda + 1)); 1.108 + m_i = MP_DIGIT(c, 0) * mmm->n0prime; 1.109 + s_mp_mul_d_add_offset(&mmm->N, m_i, c, 0); 1.110 + 1.111 + /* Outer loop: Digits of b */ 1.112 + usedb = MP_USED(b); 1.113 + for (ib = 1; ib < usedb; ib++) { 1.114 + mp_digit b_i = *pb++; 1.115 + 1.116 + /* Inner product: Digits of a */ 1.117 + if (b_i) 1.118 + s_mpv_mul_d_add_prop(MP_DIGITS(a), useda, b_i, MP_DIGITS(c) + ib); 1.119 + m_i = MP_DIGIT(c, ib) * mmm->n0prime; 1.120 + s_mp_mul_d_add_offset(&mmm->N, m_i, c, ib); 1.121 + } 1.122 + if (usedb < MP_USED(&mmm->N)) { 1.123 + for (usedb = MP_USED(&mmm->N); ib < usedb; ++ib ) { 1.124 + m_i = MP_DIGIT(c, ib) * mmm->n0prime; 1.125 + s_mp_mul_d_add_offset(&mmm->N, m_i, c, ib); 1.126 + } 1.127 + } 1.128 + s_mp_clamp(c); 1.129 + s_mp_rshd( c, MP_USED(&mmm->N) ); /* c /= R */ 1.130 + if (s_mp_cmp(c, &mmm->N) >= 0) { 1.131 + MP_CHECKOK( s_mp_sub(c, &mmm->N) ); 1.132 + } 1.133 + res = MP_OKAY; 1.134 + 1.135 +CLEANUP: 1.136 + return res; 1.137 +} 1.138 +#endif 1.139 + 1.140 +STATIC 1.141 +mp_err s_mp_to_mont(const mp_int *x, mp_mont_modulus *mmm, mp_int *xMont) 1.142 +{ 1.143 + mp_err res; 1.144 + 1.145 + /* xMont = x * R mod N where N is modulus */ 1.146 + MP_CHECKOK( mp_copy( x, xMont ) ); 1.147 + MP_CHECKOK( s_mp_lshd( xMont, MP_USED(&mmm->N) ) ); /* xMont = x << b */ 1.148 + MP_CHECKOK( mp_div(xMont, &mmm->N, 0, xMont) ); /* mod N */ 1.149 +CLEANUP: 1.150 + return res; 1.151 +} 1.152 + 1.153 +#ifdef MP_USING_MONT_MULF 1.154 + 1.155 +/* the floating point multiply is already cache safe, 1.156 + * don't turn on cache safe unless we specifically 1.157 + * force it */ 1.158 +#ifndef MP_FORCE_CACHE_SAFE 1.159 +#undef MP_USING_CACHE_SAFE_MOD_EXP 1.160 +#endif 1.161 + 1.162 +unsigned int mp_using_mont_mulf = 1; 1.163 + 1.164 +/* computes montgomery square of the integer in mResult */ 1.165 +#define SQR \ 1.166 + conv_i32_to_d32_and_d16(dm1, d16Tmp, mResult, nLen); \ 1.167 + mont_mulf_noconv(mResult, dm1, d16Tmp, \ 1.168 + dTmp, dn, MP_DIGITS(modulus), nLen, dn0) 1.169 + 1.170 +/* computes montgomery product of x and the integer in mResult */ 1.171 +#define MUL(x) \ 1.172 + conv_i32_to_d32(dm1, mResult, nLen); \ 1.173 + mont_mulf_noconv(mResult, dm1, oddPowers[x], \ 1.174 + dTmp, dn, MP_DIGITS(modulus), nLen, dn0) 1.175 + 1.176 +/* Do modular exponentiation using floating point multiply code. */ 1.177 +mp_err mp_exptmod_f(const mp_int * montBase, 1.178 + const mp_int * exponent, 1.179 + const mp_int * modulus, 1.180 + mp_int * result, 1.181 + mp_mont_modulus *mmm, 1.182 + int nLen, 1.183 + mp_size bits_in_exponent, 1.184 + mp_size window_bits, 1.185 + mp_size odd_ints) 1.186 +{ 1.187 + mp_digit *mResult; 1.188 + double *dBuf = 0, *dm1, *dn, *dSqr, *d16Tmp, *dTmp; 1.189 + double dn0; 1.190 + mp_size i; 1.191 + mp_err res; 1.192 + int expOff; 1.193 + int dSize = 0, oddPowSize, dTmpSize; 1.194 + mp_int accum1; 1.195 + double *oddPowers[MAX_ODD_INTS]; 1.196 + 1.197 + /* function for computing n0prime only works if n0 is odd */ 1.198 + 1.199 + MP_DIGITS(&accum1) = 0; 1.200 + 1.201 + for (i = 0; i < MAX_ODD_INTS; ++i) 1.202 + oddPowers[i] = 0; 1.203 + 1.204 + MP_CHECKOK( mp_init_size(&accum1, 3 * nLen + 2) ); 1.205 + 1.206 + mp_set(&accum1, 1); 1.207 + MP_CHECKOK( s_mp_to_mont(&accum1, mmm, &accum1) ); 1.208 + MP_CHECKOK( s_mp_pad(&accum1, nLen) ); 1.209 + 1.210 + oddPowSize = 2 * nLen + 1; 1.211 + dTmpSize = 2 * oddPowSize; 1.212 + dSize = sizeof(double) * (nLen * 4 + 1 + 1.213 + ((odd_ints + 1) * oddPowSize) + dTmpSize); 1.214 + dBuf = (double *)malloc(dSize); 1.215 + dm1 = dBuf; /* array of d32 */ 1.216 + dn = dBuf + nLen; /* array of d32 */ 1.217 + dSqr = dn + nLen; /* array of d32 */ 1.218 + d16Tmp = dSqr + nLen; /* array of d16 */ 1.219 + dTmp = d16Tmp + oddPowSize; 1.220 + 1.221 + for (i = 0; i < odd_ints; ++i) { 1.222 + oddPowers[i] = dTmp; 1.223 + dTmp += oddPowSize; 1.224 + } 1.225 + mResult = (mp_digit *)(dTmp + dTmpSize); /* size is nLen + 1 */ 1.226 + 1.227 + /* Make dn and dn0 */ 1.228 + conv_i32_to_d32(dn, MP_DIGITS(modulus), nLen); 1.229 + dn0 = (double)(mmm->n0prime & 0xffff); 1.230 + 1.231 + /* Make dSqr */ 1.232 + conv_i32_to_d32_and_d16(dm1, oddPowers[0], MP_DIGITS(montBase), nLen); 1.233 + mont_mulf_noconv(mResult, dm1, oddPowers[0], 1.234 + dTmp, dn, MP_DIGITS(modulus), nLen, dn0); 1.235 + conv_i32_to_d32(dSqr, mResult, nLen); 1.236 + 1.237 + for (i = 1; i < odd_ints; ++i) { 1.238 + mont_mulf_noconv(mResult, dSqr, oddPowers[i - 1], 1.239 + dTmp, dn, MP_DIGITS(modulus), nLen, dn0); 1.240 + conv_i32_to_d16(oddPowers[i], mResult, nLen); 1.241 + } 1.242 + 1.243 + s_mp_copy(MP_DIGITS(&accum1), mResult, nLen); /* from, to, len */ 1.244 + 1.245 + for (expOff = bits_in_exponent - window_bits; expOff >= 0; expOff -= window_bits) { 1.246 + mp_size smallExp; 1.247 + MP_CHECKOK( mpl_get_bits(exponent, expOff, window_bits) ); 1.248 + smallExp = (mp_size)res; 1.249 + 1.250 + if (window_bits == 1) { 1.251 + if (!smallExp) { 1.252 + SQR; 1.253 + } else if (smallExp & 1) { 1.254 + SQR; MUL(0); 1.255 + } else { 1.256 + abort(); 1.257 + } 1.258 + } else if (window_bits == 4) { 1.259 + if (!smallExp) { 1.260 + SQR; SQR; SQR; SQR; 1.261 + } else if (smallExp & 1) { 1.262 + SQR; SQR; SQR; SQR; MUL(smallExp/2); 1.263 + } else if (smallExp & 2) { 1.264 + SQR; SQR; SQR; MUL(smallExp/4); SQR; 1.265 + } else if (smallExp & 4) { 1.266 + SQR; SQR; MUL(smallExp/8); SQR; SQR; 1.267 + } else if (smallExp & 8) { 1.268 + SQR; MUL(smallExp/16); SQR; SQR; SQR; 1.269 + } else { 1.270 + abort(); 1.271 + } 1.272 + } else if (window_bits == 5) { 1.273 + if (!smallExp) { 1.274 + SQR; SQR; SQR; SQR; SQR; 1.275 + } else if (smallExp & 1) { 1.276 + SQR; SQR; SQR; SQR; SQR; MUL(smallExp/2); 1.277 + } else if (smallExp & 2) { 1.278 + SQR; SQR; SQR; SQR; MUL(smallExp/4); SQR; 1.279 + } else if (smallExp & 4) { 1.280 + SQR; SQR; SQR; MUL(smallExp/8); SQR; SQR; 1.281 + } else if (smallExp & 8) { 1.282 + SQR; SQR; MUL(smallExp/16); SQR; SQR; SQR; 1.283 + } else if (smallExp & 0x10) { 1.284 + SQR; MUL(smallExp/32); SQR; SQR; SQR; SQR; 1.285 + } else { 1.286 + abort(); 1.287 + } 1.288 + } else if (window_bits == 6) { 1.289 + if (!smallExp) { 1.290 + SQR; SQR; SQR; SQR; SQR; SQR; 1.291 + } else if (smallExp & 1) { 1.292 + SQR; SQR; SQR; SQR; SQR; SQR; MUL(smallExp/2); 1.293 + } else if (smallExp & 2) { 1.294 + SQR; SQR; SQR; SQR; SQR; MUL(smallExp/4); SQR; 1.295 + } else if (smallExp & 4) { 1.296 + SQR; SQR; SQR; SQR; MUL(smallExp/8); SQR; SQR; 1.297 + } else if (smallExp & 8) { 1.298 + SQR; SQR; SQR; MUL(smallExp/16); SQR; SQR; SQR; 1.299 + } else if (smallExp & 0x10) { 1.300 + SQR; SQR; MUL(smallExp/32); SQR; SQR; SQR; SQR; 1.301 + } else if (smallExp & 0x20) { 1.302 + SQR; MUL(smallExp/64); SQR; SQR; SQR; SQR; SQR; 1.303 + } else { 1.304 + abort(); 1.305 + } 1.306 + } else { 1.307 + abort(); 1.308 + } 1.309 + } 1.310 + 1.311 + s_mp_copy(mResult, MP_DIGITS(&accum1), nLen); /* from, to, len */ 1.312 + 1.313 + res = s_mp_redc(&accum1, mmm); 1.314 + mp_exch(&accum1, result); 1.315 + 1.316 +CLEANUP: 1.317 + mp_clear(&accum1); 1.318 + if (dBuf) { 1.319 + if (dSize) 1.320 + memset(dBuf, 0, dSize); 1.321 + free(dBuf); 1.322 + } 1.323 + 1.324 + return res; 1.325 +} 1.326 +#undef SQR 1.327 +#undef MUL 1.328 +#endif 1.329 + 1.330 +#define SQR(a,b) \ 1.331 + MP_CHECKOK( mp_sqr(a, b) );\ 1.332 + MP_CHECKOK( s_mp_redc(b, mmm) ) 1.333 + 1.334 +#if defined(MP_MONT_USE_MP_MUL) 1.335 +#define MUL(x,a,b) \ 1.336 + MP_CHECKOK( mp_mul(a, oddPowers + (x), b) ); \ 1.337 + MP_CHECKOK( s_mp_redc(b, mmm) ) 1.338 +#else 1.339 +#define MUL(x,a,b) \ 1.340 + MP_CHECKOK( s_mp_mul_mont(a, oddPowers + (x), b, mmm) ) 1.341 +#endif 1.342 + 1.343 +#define SWAPPA ptmp = pa1; pa1 = pa2; pa2 = ptmp 1.344 + 1.345 +/* Do modular exponentiation using integer multiply code. */ 1.346 +mp_err mp_exptmod_i(const mp_int * montBase, 1.347 + const mp_int * exponent, 1.348 + const mp_int * modulus, 1.349 + mp_int * result, 1.350 + mp_mont_modulus *mmm, 1.351 + int nLen, 1.352 + mp_size bits_in_exponent, 1.353 + mp_size window_bits, 1.354 + mp_size odd_ints) 1.355 +{ 1.356 + mp_int *pa1, *pa2, *ptmp; 1.357 + mp_size i; 1.358 + mp_err res; 1.359 + int expOff; 1.360 + mp_int accum1, accum2, power2, oddPowers[MAX_ODD_INTS]; 1.361 + 1.362 + /* power2 = base ** 2; oddPowers[i] = base ** (2*i + 1); */ 1.363 + /* oddPowers[i] = base ** (2*i + 1); */ 1.364 + 1.365 + MP_DIGITS(&accum1) = 0; 1.366 + MP_DIGITS(&accum2) = 0; 1.367 + MP_DIGITS(&power2) = 0; 1.368 + for (i = 0; i < MAX_ODD_INTS; ++i) { 1.369 + MP_DIGITS(oddPowers + i) = 0; 1.370 + } 1.371 + 1.372 + MP_CHECKOK( mp_init_size(&accum1, 3 * nLen + 2) ); 1.373 + MP_CHECKOK( mp_init_size(&accum2, 3 * nLen + 2) ); 1.374 + 1.375 + MP_CHECKOK( mp_init_copy(&oddPowers[0], montBase) ); 1.376 + 1.377 + mp_init_size(&power2, nLen + 2 * MP_USED(montBase) + 2); 1.378 + MP_CHECKOK( mp_sqr(montBase, &power2) ); /* power2 = montBase ** 2 */ 1.379 + MP_CHECKOK( s_mp_redc(&power2, mmm) ); 1.380 + 1.381 + for (i = 1; i < odd_ints; ++i) { 1.382 + mp_init_size(oddPowers + i, nLen + 2 * MP_USED(&power2) + 2); 1.383 + MP_CHECKOK( mp_mul(oddPowers + (i - 1), &power2, oddPowers + i) ); 1.384 + MP_CHECKOK( s_mp_redc(oddPowers + i, mmm) ); 1.385 + } 1.386 + 1.387 + /* set accumulator to montgomery residue of 1 */ 1.388 + mp_set(&accum1, 1); 1.389 + MP_CHECKOK( s_mp_to_mont(&accum1, mmm, &accum1) ); 1.390 + pa1 = &accum1; 1.391 + pa2 = &accum2; 1.392 + 1.393 + for (expOff = bits_in_exponent - window_bits; expOff >= 0; expOff -= window_bits) { 1.394 + mp_size smallExp; 1.395 + MP_CHECKOK( mpl_get_bits(exponent, expOff, window_bits) ); 1.396 + smallExp = (mp_size)res; 1.397 + 1.398 + if (window_bits == 1) { 1.399 + if (!smallExp) { 1.400 + SQR(pa1,pa2); SWAPPA; 1.401 + } else if (smallExp & 1) { 1.402 + SQR(pa1,pa2); MUL(0,pa2,pa1); 1.403 + } else { 1.404 + abort(); 1.405 + } 1.406 + } else if (window_bits == 4) { 1.407 + if (!smallExp) { 1.408 + SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); 1.409 + } else if (smallExp & 1) { 1.410 + SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); 1.411 + MUL(smallExp/2, pa1,pa2); SWAPPA; 1.412 + } else if (smallExp & 2) { 1.413 + SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); 1.414 + MUL(smallExp/4,pa2,pa1); SQR(pa1,pa2); SWAPPA; 1.415 + } else if (smallExp & 4) { 1.416 + SQR(pa1,pa2); SQR(pa2,pa1); MUL(smallExp/8,pa1,pa2); 1.417 + SQR(pa2,pa1); SQR(pa1,pa2); SWAPPA; 1.418 + } else if (smallExp & 8) { 1.419 + SQR(pa1,pa2); MUL(smallExp/16,pa2,pa1); SQR(pa1,pa2); 1.420 + SQR(pa2,pa1); SQR(pa1,pa2); SWAPPA; 1.421 + } else { 1.422 + abort(); 1.423 + } 1.424 + } else if (window_bits == 5) { 1.425 + if (!smallExp) { 1.426 + SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); 1.427 + SQR(pa1,pa2); SWAPPA; 1.428 + } else if (smallExp & 1) { 1.429 + SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); 1.430 + SQR(pa1,pa2); MUL(smallExp/2,pa2,pa1); 1.431 + } else if (smallExp & 2) { 1.432 + SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); 1.433 + MUL(smallExp/4,pa1,pa2); SQR(pa2,pa1); 1.434 + } else if (smallExp & 4) { 1.435 + SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); 1.436 + MUL(smallExp/8,pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); 1.437 + } else if (smallExp & 8) { 1.438 + SQR(pa1,pa2); SQR(pa2,pa1); MUL(smallExp/16,pa1,pa2); 1.439 + SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); 1.440 + } else if (smallExp & 0x10) { 1.441 + SQR(pa1,pa2); MUL(smallExp/32,pa2,pa1); SQR(pa1,pa2); 1.442 + SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); 1.443 + } else { 1.444 + abort(); 1.445 + } 1.446 + } else if (window_bits == 6) { 1.447 + if (!smallExp) { 1.448 + SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); 1.449 + SQR(pa1,pa2); SQR(pa2,pa1); 1.450 + } else if (smallExp & 1) { 1.451 + SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); 1.452 + SQR(pa1,pa2); SQR(pa2,pa1); MUL(smallExp/2,pa1,pa2); SWAPPA; 1.453 + } else if (smallExp & 2) { 1.454 + SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); 1.455 + SQR(pa1,pa2); MUL(smallExp/4,pa2,pa1); SQR(pa1,pa2); SWAPPA; 1.456 + } else if (smallExp & 4) { 1.457 + SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); 1.458 + MUL(smallExp/8,pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SWAPPA; 1.459 + } else if (smallExp & 8) { 1.460 + SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); 1.461 + MUL(smallExp/16,pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); 1.462 + SQR(pa1,pa2); SWAPPA; 1.463 + } else if (smallExp & 0x10) { 1.464 + SQR(pa1,pa2); SQR(pa2,pa1); MUL(smallExp/32,pa1,pa2); 1.465 + SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SWAPPA; 1.466 + } else if (smallExp & 0x20) { 1.467 + SQR(pa1,pa2); MUL(smallExp/64,pa2,pa1); SQR(pa1,pa2); 1.468 + SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SWAPPA; 1.469 + } else { 1.470 + abort(); 1.471 + } 1.472 + } else { 1.473 + abort(); 1.474 + } 1.475 + } 1.476 + 1.477 + res = s_mp_redc(pa1, mmm); 1.478 + mp_exch(pa1, result); 1.479 + 1.480 +CLEANUP: 1.481 + mp_clear(&accum1); 1.482 + mp_clear(&accum2); 1.483 + mp_clear(&power2); 1.484 + for (i = 0; i < odd_ints; ++i) { 1.485 + mp_clear(oddPowers + i); 1.486 + } 1.487 + return res; 1.488 +} 1.489 +#undef SQR 1.490 +#undef MUL 1.491 + 1.492 +#ifdef MP_USING_CACHE_SAFE_MOD_EXP 1.493 +unsigned int mp_using_cache_safe_exp = 1; 1.494 +#endif 1.495 + 1.496 +mp_err mp_set_safe_modexp(int value) 1.497 +{ 1.498 +#ifdef MP_USING_CACHE_SAFE_MOD_EXP 1.499 + mp_using_cache_safe_exp = value; 1.500 + return MP_OKAY; 1.501 +#else 1.502 + if (value == 0) { 1.503 + return MP_OKAY; 1.504 + } 1.505 + return MP_BADARG; 1.506 +#endif 1.507 +} 1.508 + 1.509 +#ifdef MP_USING_CACHE_SAFE_MOD_EXP 1.510 +#define WEAVE_WORD_SIZE 4 1.511 + 1.512 +#ifndef MP_CHAR_STORE_SLOW 1.513 +/* 1.514 + * mpi_to_weave takes an array of bignums, a matrix in which each bignum 1.515 + * occupies all the columns of a row, and transposes it into a matrix in 1.516 + * which each bignum occupies a column of every row. The first row of the 1.517 + * input matrix becomes the first column of the output matrix. The n'th 1.518 + * row of input becomes the n'th column of output. The input data is said 1.519 + * to be "interleaved" or "woven" into the output matrix. 1.520 + * 1.521 + * The array of bignums is left in this woven form. Each time a single 1.522 + * bignum value is needed, it is recreated by fetching the n'th column, 1.523 + * forming a single row which is the new bignum. 1.524 + * 1.525 + * The purpose of this interleaving is make it impossible to determine which 1.526 + * of the bignums is being used in any one operation by examining the pattern 1.527 + * of cache misses. 1.528 + * 1.529 + * The weaving function does not transpose the entire input matrix in one call. 1.530 + * It transposes 4 rows of mp_ints into their respective columns of output. 1.531 + * 1.532 + * There are two different implementations of the weaving and unweaving code 1.533 + * in this file. One uses byte loads and stores. The second uses loads and 1.534 + * stores of mp_weave_word size values. The weaved forms of these two 1.535 + * implementations differ. Consequently, each one has its own explanation. 1.536 + * 1.537 + * Here is the explanation for the byte-at-a-time implementation. 1.538 + * 1.539 + * This implementation treats each mp_int bignum as an array of bytes, 1.540 + * rather than as an array of mp_digits. It stores those bytes as a 1.541 + * column of bytes in the output matrix. It doesn't care if the machine 1.542 + * uses big-endian or little-endian byte ordering within mp_digits. 1.543 + * The first byte of the mp_digit array becomes the first byte in the output 1.544 + * column, regardless of whether that byte is the MSB or LSB of the mp_digit. 1.545 + * 1.546 + * "bignums" is an array of mp_ints. 1.547 + * It points to four rows, four mp_ints, a subset of a larger array of mp_ints. 1.548 + * 1.549 + * "weaved" is the weaved output matrix. 1.550 + * The first byte of bignums[0] is stored in weaved[0]. 1.551 + * 1.552 + * "nBignums" is the total number of bignums in the array of which "bignums" 1.553 + * is a part. 1.554 + * 1.555 + * "nDigits" is the size in mp_digits of each mp_int in the "bignums" array. 1.556 + * mp_ints that use less than nDigits digits are logically padded with zeros 1.557 + * while being stored in the weaved array. 1.558 + */ 1.559 +mp_err mpi_to_weave(const mp_int *bignums, 1.560 + unsigned char *weaved, 1.561 + mp_size nDigits, /* in each mp_int of input */ 1.562 + mp_size nBignums) /* in the entire source array */ 1.563 +{ 1.564 + mp_size i; 1.565 + unsigned char * endDest = weaved + (nDigits * nBignums * sizeof(mp_digit)); 1.566 + 1.567 + for (i=0; i < WEAVE_WORD_SIZE; i++) { 1.568 + mp_size used = MP_USED(&bignums[i]); 1.569 + unsigned char *pSrc = (unsigned char *)MP_DIGITS(&bignums[i]); 1.570 + unsigned char *endSrc = pSrc + (used * sizeof(mp_digit)); 1.571 + unsigned char *pDest = weaved + i; 1.572 + 1.573 + ARGCHK(MP_SIGN(&bignums[i]) == MP_ZPOS, MP_BADARG); 1.574 + ARGCHK(used <= nDigits, MP_BADARG); 1.575 + 1.576 + for (; pSrc < endSrc; pSrc++) { 1.577 + *pDest = *pSrc; 1.578 + pDest += nBignums; 1.579 + } 1.580 + while (pDest < endDest) { 1.581 + *pDest = 0; 1.582 + pDest += nBignums; 1.583 + } 1.584 + } 1.585 + 1.586 + return MP_OKAY; 1.587 +} 1.588 + 1.589 +/* Reverse the operation above for one mp_int. 1.590 + * Reconstruct one mp_int from its column in the weaved array. 1.591 + * "pSrc" points to the offset into the weave array of the bignum we 1.592 + * are going to reconstruct. 1.593 + */ 1.594 +mp_err weave_to_mpi(mp_int *a, /* output, result */ 1.595 + const unsigned char *pSrc, /* input, byte matrix */ 1.596 + mp_size nDigits, /* per mp_int output */ 1.597 + mp_size nBignums) /* bignums in weaved matrix */ 1.598 +{ 1.599 + unsigned char *pDest = (unsigned char *)MP_DIGITS(a); 1.600 + unsigned char *endDest = pDest + (nDigits * sizeof(mp_digit)); 1.601 + 1.602 + MP_SIGN(a) = MP_ZPOS; 1.603 + MP_USED(a) = nDigits; 1.604 + 1.605 + for (; pDest < endDest; pSrc += nBignums, pDest++) { 1.606 + *pDest = *pSrc; 1.607 + } 1.608 + s_mp_clamp(a); 1.609 + return MP_OKAY; 1.610 +} 1.611 + 1.612 +#else 1.613 + 1.614 +/* Need a primitive that we know is 32 bits long... */ 1.615 +/* this is true on all modern processors we know of today*/ 1.616 +typedef unsigned int mp_weave_word; 1.617 + 1.618 +/* 1.619 + * on some platforms character stores into memory is very expensive since they 1.620 + * generate a read/modify/write operation on the bus. On those platforms 1.621 + * we need to do integer writes to the bus. Because of some unrolled code, 1.622 + * in this current code the size of mp_weave_word must be four. The code that 1.623 + * makes this assumption explicity is called out. (on some platforms a write 1.624 + * of 4 bytes still requires a single read-modify-write operation. 1.625 + * 1.626 + * This function is takes the identical parameters as the function above, 1.627 + * however it lays out the final array differently. Where the previous function 1.628 + * treats the mpi_int as an byte array, this function treats it as an array of 1.629 + * mp_digits where each digit is stored in big endian order. 1.630 + * 1.631 + * since we need to interleave on a byte by byte basis, we need to collect 1.632 + * several mpi structures together into a single PRUint32 before we write. We 1.633 + * also need to make sure the PRUint32 is arranged so that the first value of 1.634 + * the first array winds up in b[0]. This means construction of that PRUint32 1.635 + * is endian specific (even though the layout of the mp_digits in the array 1.636 + * is always big endian). 1.637 + * 1.638 + * The final data is stored as follows : 1.639 + * 1.640 + * Our same logical array p array, m is sizeof(mp_digit), 1.641 + * N is still count and n is now b_size. If we define p[i].digit[j]0 as the 1.642 + * most significant byte of the word p[i].digit[j], p[i].digit[j]1 as 1.643 + * the next most significant byte of p[i].digit[j], ... and p[i].digit[j]m-1 1.644 + * is the least significant byte. 1.645 + * Our array would look like: 1.646 + * p[0].digit[0]0 p[1].digit[0]0 ... p[N-2].digit[0]0 p[N-1].digit[0]0 1.647 + * p[0].digit[0]1 p[1].digit[0]1 ... p[N-2].digit[0]1 p[N-1].digit[0]1 1.648 + * . . 1.649 + * p[0].digit[0]m-1 p[1].digit[0]m-1 ... p[N-2].digit[0]m-1 p[N-1].digit[0]m-1 1.650 + * p[0].digit[1]0 p[1].digit[1]0 ... p[N-2].digit[1]0 p[N-1].digit[1]0 1.651 + * . . 1.652 + * . . 1.653 + * p[0].digit[n-1]m-2 p[1].digit[n-1]m-2 ... p[N-2].digit[n-1]m-2 p[N-1].digit[n-1]m-2 1.654 + * p[0].digit[n-1]m-1 p[1].digit[n-1]m-1 ... p[N-2].digit[n-1]m-1 p[N-1].digit[n-1]m-1 1.655 + * 1.656 + */ 1.657 +mp_err mpi_to_weave(const mp_int *a, unsigned char *b, 1.658 + mp_size b_size, mp_size count) 1.659 +{ 1.660 + mp_size i; 1.661 + mp_digit *digitsa0; 1.662 + mp_digit *digitsa1; 1.663 + mp_digit *digitsa2; 1.664 + mp_digit *digitsa3; 1.665 + mp_size useda0; 1.666 + mp_size useda1; 1.667 + mp_size useda2; 1.668 + mp_size useda3; 1.669 + mp_weave_word *weaved = (mp_weave_word *)b; 1.670 + 1.671 + count = count/sizeof(mp_weave_word); 1.672 + 1.673 + /* this code pretty much depends on this ! */ 1.674 +#if MP_ARGCHK == 2 1.675 + assert(WEAVE_WORD_SIZE == 4); 1.676 + assert(sizeof(mp_weave_word) == 4); 1.677 +#endif 1.678 + 1.679 + digitsa0 = MP_DIGITS(&a[0]); 1.680 + digitsa1 = MP_DIGITS(&a[1]); 1.681 + digitsa2 = MP_DIGITS(&a[2]); 1.682 + digitsa3 = MP_DIGITS(&a[3]); 1.683 + useda0 = MP_USED(&a[0]); 1.684 + useda1 = MP_USED(&a[1]); 1.685 + useda2 = MP_USED(&a[2]); 1.686 + useda3 = MP_USED(&a[3]); 1.687 + 1.688 + ARGCHK(MP_SIGN(&a[0]) == MP_ZPOS, MP_BADARG); 1.689 + ARGCHK(MP_SIGN(&a[1]) == MP_ZPOS, MP_BADARG); 1.690 + ARGCHK(MP_SIGN(&a[2]) == MP_ZPOS, MP_BADARG); 1.691 + ARGCHK(MP_SIGN(&a[3]) == MP_ZPOS, MP_BADARG); 1.692 + ARGCHK(useda0 <= b_size, MP_BADARG); 1.693 + ARGCHK(useda1 <= b_size, MP_BADARG); 1.694 + ARGCHK(useda2 <= b_size, MP_BADARG); 1.695 + ARGCHK(useda3 <= b_size, MP_BADARG); 1.696 + 1.697 +#define SAFE_FETCH(digit, used, word) ((word) < (used) ? (digit[word]) : 0) 1.698 + 1.699 + for (i=0; i < b_size; i++) { 1.700 + mp_digit d0 = SAFE_FETCH(digitsa0,useda0,i); 1.701 + mp_digit d1 = SAFE_FETCH(digitsa1,useda1,i); 1.702 + mp_digit d2 = SAFE_FETCH(digitsa2,useda2,i); 1.703 + mp_digit d3 = SAFE_FETCH(digitsa3,useda3,i); 1.704 + register mp_weave_word acc; 1.705 + 1.706 +/* 1.707 + * ONE_STEP takes the MSB of each of our current digits and places that 1.708 + * byte in the appropriate position for writing to the weaved array. 1.709 + * On little endian: 1.710 + * b3 b2 b1 b0 1.711 + * On big endian: 1.712 + * b0 b1 b2 b3 1.713 + * When the data is written it would always wind up: 1.714 + * b[0] = b0 1.715 + * b[1] = b1 1.716 + * b[2] = b2 1.717 + * b[3] = b3 1.718 + * 1.719 + * Once we've written the MSB, we shift the whole digit up left one 1.720 + * byte, putting the Next Most Significant Byte in the MSB position, 1.721 + * so we we repeat the next one step that byte will be written. 1.722 + * NOTE: This code assumes sizeof(mp_weave_word) and MP_WEAVE_WORD_SIZE 1.723 + * is 4. 1.724 + */ 1.725 +#ifdef MP_IS_LITTLE_ENDIAN 1.726 +#define MPI_WEAVE_ONE_STEP \ 1.727 + acc = (d0 >> (MP_DIGIT_BIT-8)) & 0x000000ff; d0 <<= 8; /*b0*/ \ 1.728 + acc |= (d1 >> (MP_DIGIT_BIT-16)) & 0x0000ff00; d1 <<= 8; /*b1*/ \ 1.729 + acc |= (d2 >> (MP_DIGIT_BIT-24)) & 0x00ff0000; d2 <<= 8; /*b2*/ \ 1.730 + acc |= (d3 >> (MP_DIGIT_BIT-32)) & 0xff000000; d3 <<= 8; /*b3*/ \ 1.731 + *weaved = acc; weaved += count; 1.732 +#else 1.733 +#define MPI_WEAVE_ONE_STEP \ 1.734 + acc = (d0 >> (MP_DIGIT_BIT-32)) & 0xff000000; d0 <<= 8; /*b0*/ \ 1.735 + acc |= (d1 >> (MP_DIGIT_BIT-24)) & 0x00ff0000; d1 <<= 8; /*b1*/ \ 1.736 + acc |= (d2 >> (MP_DIGIT_BIT-16)) & 0x0000ff00; d2 <<= 8; /*b2*/ \ 1.737 + acc |= (d3 >> (MP_DIGIT_BIT-8)) & 0x000000ff; d3 <<= 8; /*b3*/ \ 1.738 + *weaved = acc; weaved += count; 1.739 +#endif 1.740 + switch (sizeof(mp_digit)) { 1.741 + case 32: 1.742 + MPI_WEAVE_ONE_STEP 1.743 + MPI_WEAVE_ONE_STEP 1.744 + MPI_WEAVE_ONE_STEP 1.745 + MPI_WEAVE_ONE_STEP 1.746 + MPI_WEAVE_ONE_STEP 1.747 + MPI_WEAVE_ONE_STEP 1.748 + MPI_WEAVE_ONE_STEP 1.749 + MPI_WEAVE_ONE_STEP 1.750 + MPI_WEAVE_ONE_STEP 1.751 + MPI_WEAVE_ONE_STEP 1.752 + MPI_WEAVE_ONE_STEP 1.753 + MPI_WEAVE_ONE_STEP 1.754 + MPI_WEAVE_ONE_STEP 1.755 + MPI_WEAVE_ONE_STEP 1.756 + MPI_WEAVE_ONE_STEP 1.757 + MPI_WEAVE_ONE_STEP 1.758 + case 16: 1.759 + MPI_WEAVE_ONE_STEP 1.760 + MPI_WEAVE_ONE_STEP 1.761 + MPI_WEAVE_ONE_STEP 1.762 + MPI_WEAVE_ONE_STEP 1.763 + MPI_WEAVE_ONE_STEP 1.764 + MPI_WEAVE_ONE_STEP 1.765 + MPI_WEAVE_ONE_STEP 1.766 + MPI_WEAVE_ONE_STEP 1.767 + case 8: 1.768 + MPI_WEAVE_ONE_STEP 1.769 + MPI_WEAVE_ONE_STEP 1.770 + MPI_WEAVE_ONE_STEP 1.771 + MPI_WEAVE_ONE_STEP 1.772 + case 4: 1.773 + MPI_WEAVE_ONE_STEP 1.774 + MPI_WEAVE_ONE_STEP 1.775 + case 2: 1.776 + MPI_WEAVE_ONE_STEP 1.777 + case 1: 1.778 + MPI_WEAVE_ONE_STEP 1.779 + break; 1.780 + } 1.781 + } 1.782 + 1.783 + return MP_OKAY; 1.784 +} 1.785 + 1.786 +/* reverse the operation above for one entry. 1.787 + * b points to the offset into the weave array of the power we are 1.788 + * calculating */ 1.789 +mp_err weave_to_mpi(mp_int *a, const unsigned char *b, 1.790 + mp_size b_size, mp_size count) 1.791 +{ 1.792 + mp_digit *pb = MP_DIGITS(a); 1.793 + mp_digit *end = &pb[b_size]; 1.794 + 1.795 + MP_SIGN(a) = MP_ZPOS; 1.796 + MP_USED(a) = b_size; 1.797 + 1.798 + for (; pb < end; pb++) { 1.799 + register mp_digit digit; 1.800 + 1.801 + digit = *b << 8; b += count; 1.802 +#define MPI_UNWEAVE_ONE_STEP digit |= *b; b += count; digit = digit << 8; 1.803 + switch (sizeof(mp_digit)) { 1.804 + case 32: 1.805 + MPI_UNWEAVE_ONE_STEP 1.806 + MPI_UNWEAVE_ONE_STEP 1.807 + MPI_UNWEAVE_ONE_STEP 1.808 + MPI_UNWEAVE_ONE_STEP 1.809 + MPI_UNWEAVE_ONE_STEP 1.810 + MPI_UNWEAVE_ONE_STEP 1.811 + MPI_UNWEAVE_ONE_STEP 1.812 + MPI_UNWEAVE_ONE_STEP 1.813 + MPI_UNWEAVE_ONE_STEP 1.814 + MPI_UNWEAVE_ONE_STEP 1.815 + MPI_UNWEAVE_ONE_STEP 1.816 + MPI_UNWEAVE_ONE_STEP 1.817 + MPI_UNWEAVE_ONE_STEP 1.818 + MPI_UNWEAVE_ONE_STEP 1.819 + MPI_UNWEAVE_ONE_STEP 1.820 + MPI_UNWEAVE_ONE_STEP 1.821 + case 16: 1.822 + MPI_UNWEAVE_ONE_STEP 1.823 + MPI_UNWEAVE_ONE_STEP 1.824 + MPI_UNWEAVE_ONE_STEP 1.825 + MPI_UNWEAVE_ONE_STEP 1.826 + MPI_UNWEAVE_ONE_STEP 1.827 + MPI_UNWEAVE_ONE_STEP 1.828 + MPI_UNWEAVE_ONE_STEP 1.829 + MPI_UNWEAVE_ONE_STEP 1.830 + case 8: 1.831 + MPI_UNWEAVE_ONE_STEP 1.832 + MPI_UNWEAVE_ONE_STEP 1.833 + MPI_UNWEAVE_ONE_STEP 1.834 + MPI_UNWEAVE_ONE_STEP 1.835 + case 4: 1.836 + MPI_UNWEAVE_ONE_STEP 1.837 + MPI_UNWEAVE_ONE_STEP 1.838 + case 2: 1.839 + break; 1.840 + } 1.841 + digit |= *b; b += count; 1.842 + 1.843 + *pb = digit; 1.844 + } 1.845 + s_mp_clamp(a); 1.846 + return MP_OKAY; 1.847 +} 1.848 +#endif 1.849 + 1.850 + 1.851 +#define SQR(a,b) \ 1.852 + MP_CHECKOK( mp_sqr(a, b) );\ 1.853 + MP_CHECKOK( s_mp_redc(b, mmm) ) 1.854 + 1.855 +#if defined(MP_MONT_USE_MP_MUL) 1.856 +#define MUL_NOWEAVE(x,a,b) \ 1.857 + MP_CHECKOK( mp_mul(a, x, b) ); \ 1.858 + MP_CHECKOK( s_mp_redc(b, mmm) ) 1.859 +#else 1.860 +#define MUL_NOWEAVE(x,a,b) \ 1.861 + MP_CHECKOK( s_mp_mul_mont(a, x, b, mmm) ) 1.862 +#endif 1.863 + 1.864 +#define MUL(x,a,b) \ 1.865 + MP_CHECKOK( weave_to_mpi(&tmp, powers + (x), nLen, num_powers) ); \ 1.866 + MUL_NOWEAVE(&tmp,a,b) 1.867 + 1.868 +#define SWAPPA ptmp = pa1; pa1 = pa2; pa2 = ptmp 1.869 +#define MP_ALIGN(x,y) ((((ptrdiff_t)(x))+((y)-1))&(((ptrdiff_t)0)-(y))) 1.870 + 1.871 +/* Do modular exponentiation using integer multiply code. */ 1.872 +mp_err mp_exptmod_safe_i(const mp_int * montBase, 1.873 + const mp_int * exponent, 1.874 + const mp_int * modulus, 1.875 + mp_int * result, 1.876 + mp_mont_modulus *mmm, 1.877 + int nLen, 1.878 + mp_size bits_in_exponent, 1.879 + mp_size window_bits, 1.880 + mp_size num_powers) 1.881 +{ 1.882 + mp_int *pa1, *pa2, *ptmp; 1.883 + mp_size i; 1.884 + mp_size first_window; 1.885 + mp_err res; 1.886 + int expOff; 1.887 + mp_int accum1, accum2, accum[WEAVE_WORD_SIZE]; 1.888 + mp_int tmp; 1.889 + unsigned char *powersArray; 1.890 + unsigned char *powers; 1.891 + 1.892 + MP_DIGITS(&accum1) = 0; 1.893 + MP_DIGITS(&accum2) = 0; 1.894 + MP_DIGITS(&accum[0]) = 0; 1.895 + MP_DIGITS(&accum[1]) = 0; 1.896 + MP_DIGITS(&accum[2]) = 0; 1.897 + MP_DIGITS(&accum[3]) = 0; 1.898 + MP_DIGITS(&tmp) = 0; 1.899 + 1.900 + powersArray = (unsigned char *)malloc(num_powers*(nLen*sizeof(mp_digit)+1)); 1.901 + if (powersArray == NULL) { 1.902 + res = MP_MEM; 1.903 + goto CLEANUP; 1.904 + } 1.905 + 1.906 + /* powers[i] = base ** (i); */ 1.907 + powers = (unsigned char *)MP_ALIGN(powersArray,num_powers); 1.908 + 1.909 + /* grab the first window value. This allows us to preload accumulator1 1.910 + * and save a conversion, some squares and a multiple*/ 1.911 + MP_CHECKOK( mpl_get_bits(exponent, 1.912 + bits_in_exponent-window_bits, window_bits) ); 1.913 + first_window = (mp_size)res; 1.914 + 1.915 + MP_CHECKOK( mp_init_size(&accum1, 3 * nLen + 2) ); 1.916 + MP_CHECKOK( mp_init_size(&accum2, 3 * nLen + 2) ); 1.917 + MP_CHECKOK( mp_init_size(&tmp, 3 * nLen + 2) ); 1.918 + 1.919 + /* build the first WEAVE_WORD powers inline */ 1.920 + /* if WEAVE_WORD_SIZE is not 4, this code will have to change */ 1.921 + if (num_powers > 2) { 1.922 + MP_CHECKOK( mp_init_size(&accum[0], 3 * nLen + 2) ); 1.923 + MP_CHECKOK( mp_init_size(&accum[1], 3 * nLen + 2) ); 1.924 + MP_CHECKOK( mp_init_size(&accum[2], 3 * nLen + 2) ); 1.925 + MP_CHECKOK( mp_init_size(&accum[3], 3 * nLen + 2) ); 1.926 + mp_set(&accum[0], 1); 1.927 + MP_CHECKOK( s_mp_to_mont(&accum[0], mmm, &accum[0]) ); 1.928 + MP_CHECKOK( mp_copy(montBase, &accum[1]) ); 1.929 + SQR(montBase, &accum[2]); 1.930 + MUL_NOWEAVE(montBase, &accum[2], &accum[3]); 1.931 + MP_CHECKOK( mpi_to_weave(accum, powers, nLen, num_powers) ); 1.932 + if (first_window < 4) { 1.933 + MP_CHECKOK( mp_copy(&accum[first_window], &accum1) ); 1.934 + first_window = num_powers; 1.935 + } 1.936 + } else { 1.937 + if (first_window == 0) { 1.938 + mp_set(&accum1, 1); 1.939 + MP_CHECKOK( s_mp_to_mont(&accum1, mmm, &accum1) ); 1.940 + } else { 1.941 + /* assert first_window == 1? */ 1.942 + MP_CHECKOK( mp_copy(montBase, &accum1) ); 1.943 + } 1.944 + } 1.945 + 1.946 + /* 1.947 + * calculate all the powers in the powers array. 1.948 + * this adds 2**(k-1)-2 square operations over just calculating the 1.949 + * odd powers where k is the window size in the two other mp_modexpt 1.950 + * implementations in this file. We will get some of that 1.951 + * back by not needing the first 'k' squares and one multiply for the 1.952 + * first window */ 1.953 + for (i = WEAVE_WORD_SIZE; i < num_powers; i++) { 1.954 + int acc_index = i & (WEAVE_WORD_SIZE-1); /* i % WEAVE_WORD_SIZE */ 1.955 + if ( i & 1 ) { 1.956 + MUL_NOWEAVE(montBase, &accum[acc_index-1] , &accum[acc_index]); 1.957 + /* we've filled the array do our 'per array' processing */ 1.958 + if (acc_index == (WEAVE_WORD_SIZE-1)) { 1.959 + MP_CHECKOK( mpi_to_weave(accum, powers + i - (WEAVE_WORD_SIZE-1), 1.960 + nLen, num_powers) ); 1.961 + 1.962 + if (first_window <= i) { 1.963 + MP_CHECKOK( mp_copy(&accum[first_window & (WEAVE_WORD_SIZE-1)], 1.964 + &accum1) ); 1.965 + first_window = num_powers; 1.966 + } 1.967 + } 1.968 + } else { 1.969 + /* up to 8 we can find 2^i-1 in the accum array, but at 8 we our source 1.970 + * and target are the same so we need to copy.. After that, the 1.971 + * value is overwritten, so we need to fetch it from the stored 1.972 + * weave array */ 1.973 + if (i > 2* WEAVE_WORD_SIZE) { 1.974 + MP_CHECKOK(weave_to_mpi(&accum2, powers+i/2, nLen, num_powers)); 1.975 + SQR(&accum2, &accum[acc_index]); 1.976 + } else { 1.977 + int half_power_index = (i/2) & (WEAVE_WORD_SIZE-1); 1.978 + if (half_power_index == acc_index) { 1.979 + /* copy is cheaper than weave_to_mpi */ 1.980 + MP_CHECKOK(mp_copy(&accum[half_power_index], &accum2)); 1.981 + SQR(&accum2,&accum[acc_index]); 1.982 + } else { 1.983 + SQR(&accum[half_power_index],&accum[acc_index]); 1.984 + } 1.985 + } 1.986 + } 1.987 + } 1.988 + /* if the accum1 isn't set, Then there is something wrong with our logic 1.989 + * above and is an internal programming error. 1.990 + */ 1.991 +#if MP_ARGCHK == 2 1.992 + assert(MP_USED(&accum1) != 0); 1.993 +#endif 1.994 + 1.995 + /* set accumulator to montgomery residue of 1 */ 1.996 + pa1 = &accum1; 1.997 + pa2 = &accum2; 1.998 + 1.999 + for (expOff = bits_in_exponent - window_bits*2; expOff >= 0; expOff -= window_bits) { 1.1000 + mp_size smallExp; 1.1001 + MP_CHECKOK( mpl_get_bits(exponent, expOff, window_bits) ); 1.1002 + smallExp = (mp_size)res; 1.1003 + 1.1004 + /* handle unroll the loops */ 1.1005 + switch (window_bits) { 1.1006 + case 1: 1.1007 + if (!smallExp) { 1.1008 + SQR(pa1,pa2); SWAPPA; 1.1009 + } else if (smallExp & 1) { 1.1010 + SQR(pa1,pa2); MUL_NOWEAVE(montBase,pa2,pa1); 1.1011 + } else { 1.1012 + abort(); 1.1013 + } 1.1014 + break; 1.1015 + case 6: 1.1016 + SQR(pa1,pa2); SQR(pa2,pa1); 1.1017 + /* fall through */ 1.1018 + case 4: 1.1019 + SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); 1.1020 + MUL(smallExp, pa1,pa2); SWAPPA; 1.1021 + break; 1.1022 + case 5: 1.1023 + SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); 1.1024 + SQR(pa1,pa2); MUL(smallExp,pa2,pa1); 1.1025 + break; 1.1026 + default: 1.1027 + abort(); /* could do a loop? */ 1.1028 + } 1.1029 + } 1.1030 + 1.1031 + res = s_mp_redc(pa1, mmm); 1.1032 + mp_exch(pa1, result); 1.1033 + 1.1034 +CLEANUP: 1.1035 + mp_clear(&accum1); 1.1036 + mp_clear(&accum2); 1.1037 + mp_clear(&accum[0]); 1.1038 + mp_clear(&accum[1]); 1.1039 + mp_clear(&accum[2]); 1.1040 + mp_clear(&accum[3]); 1.1041 + mp_clear(&tmp); 1.1042 + /* PORT_Memset(powers,0,num_powers*nLen*sizeof(mp_digit)); */ 1.1043 + free(powersArray); 1.1044 + return res; 1.1045 +} 1.1046 +#undef SQR 1.1047 +#undef MUL 1.1048 +#endif 1.1049 + 1.1050 +mp_err mp_exptmod(const mp_int *inBase, const mp_int *exponent, 1.1051 + const mp_int *modulus, mp_int *result) 1.1052 +{ 1.1053 + const mp_int *base; 1.1054 + mp_size bits_in_exponent, i, window_bits, odd_ints; 1.1055 + mp_err res; 1.1056 + int nLen; 1.1057 + mp_int montBase, goodBase; 1.1058 + mp_mont_modulus mmm; 1.1059 +#ifdef MP_USING_CACHE_SAFE_MOD_EXP 1.1060 + static unsigned int max_window_bits; 1.1061 +#endif 1.1062 + 1.1063 + /* function for computing n0prime only works if n0 is odd */ 1.1064 + if (!mp_isodd(modulus)) 1.1065 + return s_mp_exptmod(inBase, exponent, modulus, result); 1.1066 + 1.1067 + MP_DIGITS(&montBase) = 0; 1.1068 + MP_DIGITS(&goodBase) = 0; 1.1069 + 1.1070 + if (mp_cmp(inBase, modulus) < 0) { 1.1071 + base = inBase; 1.1072 + } else { 1.1073 + MP_CHECKOK( mp_init(&goodBase) ); 1.1074 + base = &goodBase; 1.1075 + MP_CHECKOK( mp_mod(inBase, modulus, &goodBase) ); 1.1076 + } 1.1077 + 1.1078 + nLen = MP_USED(modulus); 1.1079 + MP_CHECKOK( mp_init_size(&montBase, 2 * nLen + 2) ); 1.1080 + 1.1081 + mmm.N = *modulus; /* a copy of the mp_int struct */ 1.1082 + 1.1083 + /* compute n0', given n0, n0' = -(n0 ** -1) mod MP_RADIX 1.1084 + ** where n0 = least significant mp_digit of N, the modulus. 1.1085 + */ 1.1086 + mmm.n0prime = 0 - s_mp_invmod_radix( MP_DIGIT(modulus, 0) ); 1.1087 + 1.1088 + MP_CHECKOK( s_mp_to_mont(base, &mmm, &montBase) ); 1.1089 + 1.1090 + bits_in_exponent = mpl_significant_bits(exponent); 1.1091 +#ifdef MP_USING_CACHE_SAFE_MOD_EXP 1.1092 + if (mp_using_cache_safe_exp) { 1.1093 + if (bits_in_exponent > 780) 1.1094 + window_bits = 6; 1.1095 + else if (bits_in_exponent > 256) 1.1096 + window_bits = 5; 1.1097 + else if (bits_in_exponent > 20) 1.1098 + window_bits = 4; 1.1099 + /* RSA public key exponents are typically under 20 bits (common values 1.1100 + * are: 3, 17, 65537) and a 4-bit window is inefficient 1.1101 + */ 1.1102 + else 1.1103 + window_bits = 1; 1.1104 + } else 1.1105 +#endif 1.1106 + if (bits_in_exponent > 480) 1.1107 + window_bits = 6; 1.1108 + else if (bits_in_exponent > 160) 1.1109 + window_bits = 5; 1.1110 + else if (bits_in_exponent > 20) 1.1111 + window_bits = 4; 1.1112 + /* RSA public key exponents are typically under 20 bits (common values 1.1113 + * are: 3, 17, 65537) and a 4-bit window is inefficient 1.1114 + */ 1.1115 + else 1.1116 + window_bits = 1; 1.1117 + 1.1118 +#ifdef MP_USING_CACHE_SAFE_MOD_EXP 1.1119 + /* 1.1120 + * clamp the window size based on 1.1121 + * the cache line size. 1.1122 + */ 1.1123 + if (!max_window_bits) { 1.1124 + unsigned long cache_size = s_mpi_getProcessorLineSize(); 1.1125 + /* processor has no cache, use 'fast' code always */ 1.1126 + if (cache_size == 0) { 1.1127 + mp_using_cache_safe_exp = 0; 1.1128 + } 1.1129 + if ((cache_size == 0) || (cache_size >= 64)) { 1.1130 + max_window_bits = 6; 1.1131 + } else if (cache_size >= 32) { 1.1132 + max_window_bits = 5; 1.1133 + } else if (cache_size >= 16) { 1.1134 + max_window_bits = 4; 1.1135 + } else max_window_bits = 1; /* should this be an assert? */ 1.1136 + } 1.1137 + 1.1138 + /* clamp the window size down before we caclulate bits_in_exponent */ 1.1139 + if (mp_using_cache_safe_exp) { 1.1140 + if (window_bits > max_window_bits) { 1.1141 + window_bits = max_window_bits; 1.1142 + } 1.1143 + } 1.1144 +#endif 1.1145 + 1.1146 + odd_ints = 1 << (window_bits - 1); 1.1147 + i = bits_in_exponent % window_bits; 1.1148 + if (i != 0) { 1.1149 + bits_in_exponent += window_bits - i; 1.1150 + } 1.1151 + 1.1152 +#ifdef MP_USING_MONT_MULF 1.1153 + if (mp_using_mont_mulf) { 1.1154 + MP_CHECKOK( s_mp_pad(&montBase, nLen) ); 1.1155 + res = mp_exptmod_f(&montBase, exponent, modulus, result, &mmm, nLen, 1.1156 + bits_in_exponent, window_bits, odd_ints); 1.1157 + } else 1.1158 +#endif 1.1159 +#ifdef MP_USING_CACHE_SAFE_MOD_EXP 1.1160 + if (mp_using_cache_safe_exp) { 1.1161 + res = mp_exptmod_safe_i(&montBase, exponent, modulus, result, &mmm, nLen, 1.1162 + bits_in_exponent, window_bits, 1 << window_bits); 1.1163 + } else 1.1164 +#endif 1.1165 + res = mp_exptmod_i(&montBase, exponent, modulus, result, &mmm, nLen, 1.1166 + bits_in_exponent, window_bits, odd_ints); 1.1167 + 1.1168 +CLEANUP: 1.1169 + mp_clear(&montBase); 1.1170 + mp_clear(&goodBase); 1.1171 + /* Don't mp_clear mmm.N because it is merely a copy of modulus. 1.1172 + ** Just zap it. 1.1173 + */ 1.1174 + memset(&mmm, 0, sizeof mmm); 1.1175 + return res; 1.1176 +}