security/nss/lib/freebl/mpi/mpmontg.c

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 /* This Source Code Form is subject to the terms of the Mozilla Public
michael@0 2 * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0 4
michael@0 5 /* This file implements modular exponentiation using Montgomery's
michael@0 6 * method for modular reduction. This file implements the method
michael@0 7 * described as "Improvement 2" in the paper "A Cryptographic Library for
michael@0 8 * the Motorola DSP56000" by Stephen R. Dusse' and Burton S. Kaliski Jr.
michael@0 9 * published in "Advances in Cryptology: Proceedings of EUROCRYPT '90"
michael@0 10 * "Lecture Notes in Computer Science" volume 473, 1991, pg 230-244,
michael@0 11 * published by Springer Verlag.
michael@0 12 */
michael@0 13
michael@0 14 #define MP_USING_CACHE_SAFE_MOD_EXP 1
michael@0 15 #include <string.h>
michael@0 16 #include "mpi-priv.h"
michael@0 17 #include "mplogic.h"
michael@0 18 #include "mpprime.h"
michael@0 19 #ifdef MP_USING_MONT_MULF
michael@0 20 #include "montmulf.h"
michael@0 21 #endif
michael@0 22 #include <stddef.h> /* ptrdiff_t */
michael@0 23
michael@0 24 /* if MP_CHAR_STORE_SLOW is defined, we */
michael@0 25 /* need to know endianness of this platform. */
michael@0 26 #ifdef MP_CHAR_STORE_SLOW
michael@0 27 #if !defined(MP_IS_BIG_ENDIAN) && !defined(MP_IS_LITTLE_ENDIAN)
michael@0 28 #error "You must define MP_IS_BIG_ENDIAN or MP_IS_LITTLE_ENDIAN\n" \
michael@0 29 " if you define MP_CHAR_STORE_SLOW."
michael@0 30 #endif
michael@0 31 #endif
michael@0 32
michael@0 33 #define STATIC
michael@0 34
michael@0 35 #define MAX_ODD_INTS 32 /* 2 ** (WINDOW_BITS - 1) */
michael@0 36
/*! Montgomery reduction: computes T = REDC(T) = T * R^-1 mod N,
    where R = RADIX ** used(N) (2^b == R in bit terms).
    \param T   value to reduce; must satisfy T < R*N on entry
    \param mmm Montgomery context holding the modulus N and n0prime
               (n0prime is the standard Montgomery n0' = -N^-1 mod RADIX,
               per the Dusse/Kaliski method cited in the file header)
*/
mp_err s_mp_redc(mp_int *T, mp_mont_modulus *mmm)
{
    mp_err res;
    mp_size i;

    /* Grow T to the worst-case intermediate width: 2*used(N)+1 digits. */
    i = (MP_USED(&mmm->N) << 1) + 1;
    MP_CHECKOK( s_mp_pad(T, i) );
    /* Each iteration adds a multiple of N chosen so that digit i of T
     * becomes zero; after the loop the low used(N) digits are all zero. */
    for (i = 0; i < MP_USED(&mmm->N); ++i ) {
        mp_digit m_i = MP_DIGIT(T, i) * mmm->n0prime;
        /* T += N * m_i * (MP_RADIX ** i); */
        MP_CHECKOK( s_mp_mul_d_add_offset(&mmm->N, m_i, T, i) );
    }
    s_mp_clamp(T);

    /* T /= R: the low digits are zero, so a digit shift divides exactly. */
    s_mp_rshd( T, MP_USED(&mmm->N) );

    /* The shifted value is < 2N; at most one subtraction normalizes it. */
    if ((res = s_mp_cmp(T, &mmm->N)) >= 0) {
        /* T = T - N */
        MP_CHECKOK( s_mp_sub(T, &mmm->N) );
#ifdef DEBUG
        /* Sanity check: one subtraction must have been sufficient. */
        if ((res = mp_cmp(T, &mmm->N)) >= 0) {
            res = MP_UNDEF;
            goto CLEANUP;
        }
#endif
    }
    res = MP_OKAY;
CLEANUP:
    return res;
}
michael@0 71
michael@0 72 #if !defined(MP_MONT_USE_MP_MUL)
michael@0 73
/*! c <- REDC( a * b ) mod N
    Montgomery multiplication with the reduction interleaved into the
    schoolbook multiply: after each partial product is accumulated, one
    REDC step zeroes the corresponding low digit of c, so no separate
    full-width reduction pass is needed.
    \param a < N i.e. "reduced"
    \param b < N i.e. "reduced"
    \param c receives the result; may not alias a or b (callers in this
             file always pass a distinct accumulator)
    \param mmm modulus N and n0' of N
*/
mp_err s_mp_mul_mont(const mp_int *a, const mp_int *b, mp_int *c,
                     mp_mont_modulus *mmm)
{
    mp_digit *pb;
    mp_digit m_i;
    mp_err res;
    mp_size ib; /* "index b": index of current digit of B */
    mp_size useda, usedb;

    ARGCHK(a != NULL && b != NULL && c != NULL, MP_BADARG);

    if (MP_USED(a) < MP_USED(b)) {
        const mp_int *xch = b; /* switch a and b, to do fewer outer loops */
        b = a;
        a = xch;
    }

    /* Zero c, then pad it to the worst-case width 2*used(N)+1 digits. */
    MP_USED(c) = 1; MP_DIGIT(c, 0) = 0;
    ib = (MP_USED(&mmm->N) << 1) + 1;
    if((res = s_mp_pad(c, ib)) != MP_OKAY)
        goto CLEANUP;

    /* First partial product: c = a * b[0]; clear the tail above it. */
    useda = MP_USED(a);
    pb = MP_DIGITS(b);
    s_mpv_mul_d(MP_DIGITS(a), useda, *pb++, MP_DIGITS(c));
    s_mp_setz(MP_DIGITS(c) + useda + 1, ib - (useda + 1));
    /* First REDC step: add m_i*N so that digit 0 of c becomes zero. */
    m_i = MP_DIGIT(c, 0) * mmm->n0prime;
    s_mp_mul_d_add_offset(&mmm->N, m_i, c, 0);

    /* Outer loop: Digits of b */
    usedb = MP_USED(b);
    for (ib = 1; ib < usedb; ib++) {
        mp_digit b_i = *pb++;

        /* Inner product: Digits of a */
        if (b_i)
            s_mpv_mul_d_add_prop(MP_DIGITS(a), useda, b_i, MP_DIGITS(c) + ib);
        /* REDC step for digit ib. */
        m_i = MP_DIGIT(c, ib) * mmm->n0prime;
        s_mp_mul_d_add_offset(&mmm->N, m_i, c, ib);
    }
    /* If b has fewer digits than N, finish the remaining REDC steps;
     * each still zeroes one low digit of c even with no product added. */
    if (usedb < MP_USED(&mmm->N)) {
        for (usedb = MP_USED(&mmm->N); ib < usedb; ++ib ) {
            m_i = MP_DIGIT(c, ib) * mmm->n0prime;
            s_mp_mul_d_add_offset(&mmm->N, m_i, c, ib);
        }
    }
    s_mp_clamp(c);
    s_mp_rshd( c, MP_USED(&mmm->N) ); /* c /= R */
    /* Result is < 2N; one conditional subtraction brings it below N. */
    if (s_mp_cmp(c, &mmm->N) >= 0) {
        MP_CHECKOK( s_mp_sub(c, &mmm->N) );
    }
    res = MP_OKAY;

CLEANUP:
    return res;
}
michael@0 135 #endif
michael@0 136
/* Convert x into Montgomery form: xMont = x * R mod N, where
 * R = RADIX ** used(N) and N = mmm->N.  Implemented as a digit-shift
 * left by used(N) followed by a division by N.  Callers in this file
 * pass x == xMont for in-place conversion (mp_copy to self is then a
 * copy onto itself — presumably tolerated by the mpi layer; confirm). */
STATIC
mp_err s_mp_to_mont(const mp_int *x, mp_mont_modulus *mmm, mp_int *xMont)
{
    mp_err res;

    /* xMont = x * R mod N   where  N is modulus */
    MP_CHECKOK( mp_copy( x, xMont ) );
    MP_CHECKOK( s_mp_lshd( xMont, MP_USED(&mmm->N) ) ); /* xMont = x << b */
    MP_CHECKOK( mp_div(xMont, &mmm->N, 0, xMont) );     /* mod N, quotient discarded */
CLEANUP:
    return res;
}
michael@0 149
michael@0 150 #ifdef MP_USING_MONT_MULF
michael@0 151
michael@0 152 /* the floating point multiply is already cache safe,
michael@0 153 * don't turn on cache safe unless we specifically
michael@0 154 * force it */
michael@0 155 #ifndef MP_FORCE_CACHE_SAFE
michael@0 156 #undef MP_USING_CACHE_SAFE_MOD_EXP
michael@0 157 #endif
michael@0 158
michael@0 159 unsigned int mp_using_mont_mulf = 1;
michael@0 160
michael@0 161 /* computes montgomery square of the integer in mResult */
michael@0 162 #define SQR \
michael@0 163 conv_i32_to_d32_and_d16(dm1, d16Tmp, mResult, nLen); \
michael@0 164 mont_mulf_noconv(mResult, dm1, d16Tmp, \
michael@0 165 dTmp, dn, MP_DIGITS(modulus), nLen, dn0)
michael@0 166
michael@0 167 /* computes montgomery product of x and the integer in mResult */
michael@0 168 #define MUL(x) \
michael@0 169 conv_i32_to_d32(dm1, mResult, nLen); \
michael@0 170 mont_mulf_noconv(mResult, dm1, oddPowers[x], \
michael@0 171 dTmp, dn, MP_DIGITS(modulus), nLen, dn0)
michael@0 172
michael@0 173 /* Do modular exponentiation using floating point multiply code. */
michael@0 174 mp_err mp_exptmod_f(const mp_int * montBase,
michael@0 175 const mp_int * exponent,
michael@0 176 const mp_int * modulus,
michael@0 177 mp_int * result,
michael@0 178 mp_mont_modulus *mmm,
michael@0 179 int nLen,
michael@0 180 mp_size bits_in_exponent,
michael@0 181 mp_size window_bits,
michael@0 182 mp_size odd_ints)
michael@0 183 {
michael@0 184 mp_digit *mResult;
michael@0 185 double *dBuf = 0, *dm1, *dn, *dSqr, *d16Tmp, *dTmp;
michael@0 186 double dn0;
michael@0 187 mp_size i;
michael@0 188 mp_err res;
michael@0 189 int expOff;
michael@0 190 int dSize = 0, oddPowSize, dTmpSize;
michael@0 191 mp_int accum1;
michael@0 192 double *oddPowers[MAX_ODD_INTS];
michael@0 193
michael@0 194 /* function for computing n0prime only works if n0 is odd */
michael@0 195
michael@0 196 MP_DIGITS(&accum1) = 0;
michael@0 197
michael@0 198 for (i = 0; i < MAX_ODD_INTS; ++i)
michael@0 199 oddPowers[i] = 0;
michael@0 200
michael@0 201 MP_CHECKOK( mp_init_size(&accum1, 3 * nLen + 2) );
michael@0 202
michael@0 203 mp_set(&accum1, 1);
michael@0 204 MP_CHECKOK( s_mp_to_mont(&accum1, mmm, &accum1) );
michael@0 205 MP_CHECKOK( s_mp_pad(&accum1, nLen) );
michael@0 206
michael@0 207 oddPowSize = 2 * nLen + 1;
michael@0 208 dTmpSize = 2 * oddPowSize;
michael@0 209 dSize = sizeof(double) * (nLen * 4 + 1 +
michael@0 210 ((odd_ints + 1) * oddPowSize) + dTmpSize);
michael@0 211 dBuf = (double *)malloc(dSize);
michael@0 212 dm1 = dBuf; /* array of d32 */
michael@0 213 dn = dBuf + nLen; /* array of d32 */
michael@0 214 dSqr = dn + nLen; /* array of d32 */
michael@0 215 d16Tmp = dSqr + nLen; /* array of d16 */
michael@0 216 dTmp = d16Tmp + oddPowSize;
michael@0 217
michael@0 218 for (i = 0; i < odd_ints; ++i) {
michael@0 219 oddPowers[i] = dTmp;
michael@0 220 dTmp += oddPowSize;
michael@0 221 }
michael@0 222 mResult = (mp_digit *)(dTmp + dTmpSize); /* size is nLen + 1 */
michael@0 223
michael@0 224 /* Make dn and dn0 */
michael@0 225 conv_i32_to_d32(dn, MP_DIGITS(modulus), nLen);
michael@0 226 dn0 = (double)(mmm->n0prime & 0xffff);
michael@0 227
michael@0 228 /* Make dSqr */
michael@0 229 conv_i32_to_d32_and_d16(dm1, oddPowers[0], MP_DIGITS(montBase), nLen);
michael@0 230 mont_mulf_noconv(mResult, dm1, oddPowers[0],
michael@0 231 dTmp, dn, MP_DIGITS(modulus), nLen, dn0);
michael@0 232 conv_i32_to_d32(dSqr, mResult, nLen);
michael@0 233
michael@0 234 for (i = 1; i < odd_ints; ++i) {
michael@0 235 mont_mulf_noconv(mResult, dSqr, oddPowers[i - 1],
michael@0 236 dTmp, dn, MP_DIGITS(modulus), nLen, dn0);
michael@0 237 conv_i32_to_d16(oddPowers[i], mResult, nLen);
michael@0 238 }
michael@0 239
michael@0 240 s_mp_copy(MP_DIGITS(&accum1), mResult, nLen); /* from, to, len */
michael@0 241
michael@0 242 for (expOff = bits_in_exponent - window_bits; expOff >= 0; expOff -= window_bits) {
michael@0 243 mp_size smallExp;
michael@0 244 MP_CHECKOK( mpl_get_bits(exponent, expOff, window_bits) );
michael@0 245 smallExp = (mp_size)res;
michael@0 246
michael@0 247 if (window_bits == 1) {
michael@0 248 if (!smallExp) {
michael@0 249 SQR;
michael@0 250 } else if (smallExp & 1) {
michael@0 251 SQR; MUL(0);
michael@0 252 } else {
michael@0 253 abort();
michael@0 254 }
michael@0 255 } else if (window_bits == 4) {
michael@0 256 if (!smallExp) {
michael@0 257 SQR; SQR; SQR; SQR;
michael@0 258 } else if (smallExp & 1) {
michael@0 259 SQR; SQR; SQR; SQR; MUL(smallExp/2);
michael@0 260 } else if (smallExp & 2) {
michael@0 261 SQR; SQR; SQR; MUL(smallExp/4); SQR;
michael@0 262 } else if (smallExp & 4) {
michael@0 263 SQR; SQR; MUL(smallExp/8); SQR; SQR;
michael@0 264 } else if (smallExp & 8) {
michael@0 265 SQR; MUL(smallExp/16); SQR; SQR; SQR;
michael@0 266 } else {
michael@0 267 abort();
michael@0 268 }
michael@0 269 } else if (window_bits == 5) {
michael@0 270 if (!smallExp) {
michael@0 271 SQR; SQR; SQR; SQR; SQR;
michael@0 272 } else if (smallExp & 1) {
michael@0 273 SQR; SQR; SQR; SQR; SQR; MUL(smallExp/2);
michael@0 274 } else if (smallExp & 2) {
michael@0 275 SQR; SQR; SQR; SQR; MUL(smallExp/4); SQR;
michael@0 276 } else if (smallExp & 4) {
michael@0 277 SQR; SQR; SQR; MUL(smallExp/8); SQR; SQR;
michael@0 278 } else if (smallExp & 8) {
michael@0 279 SQR; SQR; MUL(smallExp/16); SQR; SQR; SQR;
michael@0 280 } else if (smallExp & 0x10) {
michael@0 281 SQR; MUL(smallExp/32); SQR; SQR; SQR; SQR;
michael@0 282 } else {
michael@0 283 abort();
michael@0 284 }
michael@0 285 } else if (window_bits == 6) {
michael@0 286 if (!smallExp) {
michael@0 287 SQR; SQR; SQR; SQR; SQR; SQR;
michael@0 288 } else if (smallExp & 1) {
michael@0 289 SQR; SQR; SQR; SQR; SQR; SQR; MUL(smallExp/2);
michael@0 290 } else if (smallExp & 2) {
michael@0 291 SQR; SQR; SQR; SQR; SQR; MUL(smallExp/4); SQR;
michael@0 292 } else if (smallExp & 4) {
michael@0 293 SQR; SQR; SQR; SQR; MUL(smallExp/8); SQR; SQR;
michael@0 294 } else if (smallExp & 8) {
michael@0 295 SQR; SQR; SQR; MUL(smallExp/16); SQR; SQR; SQR;
michael@0 296 } else if (smallExp & 0x10) {
michael@0 297 SQR; SQR; MUL(smallExp/32); SQR; SQR; SQR; SQR;
michael@0 298 } else if (smallExp & 0x20) {
michael@0 299 SQR; MUL(smallExp/64); SQR; SQR; SQR; SQR; SQR;
michael@0 300 } else {
michael@0 301 abort();
michael@0 302 }
michael@0 303 } else {
michael@0 304 abort();
michael@0 305 }
michael@0 306 }
michael@0 307
michael@0 308 s_mp_copy(mResult, MP_DIGITS(&accum1), nLen); /* from, to, len */
michael@0 309
michael@0 310 res = s_mp_redc(&accum1, mmm);
michael@0 311 mp_exch(&accum1, result);
michael@0 312
michael@0 313 CLEANUP:
michael@0 314 mp_clear(&accum1);
michael@0 315 if (dBuf) {
michael@0 316 if (dSize)
michael@0 317 memset(dBuf, 0, dSize);
michael@0 318 free(dBuf);
michael@0 319 }
michael@0 320
michael@0 321 return res;
michael@0 322 }
michael@0 323 #undef SQR
michael@0 324 #undef MUL
michael@0 325 #endif
michael@0 326
michael@0 327 #define SQR(a,b) \
michael@0 328 MP_CHECKOK( mp_sqr(a, b) );\
michael@0 329 MP_CHECKOK( s_mp_redc(b, mmm) )
michael@0 330
michael@0 331 #if defined(MP_MONT_USE_MP_MUL)
michael@0 332 #define MUL(x,a,b) \
michael@0 333 MP_CHECKOK( mp_mul(a, oddPowers + (x), b) ); \
michael@0 334 MP_CHECKOK( s_mp_redc(b, mmm) )
michael@0 335 #else
michael@0 336 #define MUL(x,a,b) \
michael@0 337 MP_CHECKOK( s_mp_mul_mont(a, oddPowers + (x), b, mmm) )
michael@0 338 #endif
michael@0 339
michael@0 340 #define SWAPPA ptmp = pa1; pa1 = pa2; pa2 = ptmp
michael@0 341
michael@0 342 /* Do modular exponentiation using integer multiply code. */
michael@0 343 mp_err mp_exptmod_i(const mp_int * montBase,
michael@0 344 const mp_int * exponent,
michael@0 345 const mp_int * modulus,
michael@0 346 mp_int * result,
michael@0 347 mp_mont_modulus *mmm,
michael@0 348 int nLen,
michael@0 349 mp_size bits_in_exponent,
michael@0 350 mp_size window_bits,
michael@0 351 mp_size odd_ints)
michael@0 352 {
michael@0 353 mp_int *pa1, *pa2, *ptmp;
michael@0 354 mp_size i;
michael@0 355 mp_err res;
michael@0 356 int expOff;
michael@0 357 mp_int accum1, accum2, power2, oddPowers[MAX_ODD_INTS];
michael@0 358
michael@0 359 /* power2 = base ** 2; oddPowers[i] = base ** (2*i + 1); */
michael@0 360 /* oddPowers[i] = base ** (2*i + 1); */
michael@0 361
michael@0 362 MP_DIGITS(&accum1) = 0;
michael@0 363 MP_DIGITS(&accum2) = 0;
michael@0 364 MP_DIGITS(&power2) = 0;
michael@0 365 for (i = 0; i < MAX_ODD_INTS; ++i) {
michael@0 366 MP_DIGITS(oddPowers + i) = 0;
michael@0 367 }
michael@0 368
michael@0 369 MP_CHECKOK( mp_init_size(&accum1, 3 * nLen + 2) );
michael@0 370 MP_CHECKOK( mp_init_size(&accum2, 3 * nLen + 2) );
michael@0 371
michael@0 372 MP_CHECKOK( mp_init_copy(&oddPowers[0], montBase) );
michael@0 373
michael@0 374 mp_init_size(&power2, nLen + 2 * MP_USED(montBase) + 2);
michael@0 375 MP_CHECKOK( mp_sqr(montBase, &power2) ); /* power2 = montBase ** 2 */
michael@0 376 MP_CHECKOK( s_mp_redc(&power2, mmm) );
michael@0 377
michael@0 378 for (i = 1; i < odd_ints; ++i) {
michael@0 379 mp_init_size(oddPowers + i, nLen + 2 * MP_USED(&power2) + 2);
michael@0 380 MP_CHECKOK( mp_mul(oddPowers + (i - 1), &power2, oddPowers + i) );
michael@0 381 MP_CHECKOK( s_mp_redc(oddPowers + i, mmm) );
michael@0 382 }
michael@0 383
michael@0 384 /* set accumulator to montgomery residue of 1 */
michael@0 385 mp_set(&accum1, 1);
michael@0 386 MP_CHECKOK( s_mp_to_mont(&accum1, mmm, &accum1) );
michael@0 387 pa1 = &accum1;
michael@0 388 pa2 = &accum2;
michael@0 389
michael@0 390 for (expOff = bits_in_exponent - window_bits; expOff >= 0; expOff -= window_bits) {
michael@0 391 mp_size smallExp;
michael@0 392 MP_CHECKOK( mpl_get_bits(exponent, expOff, window_bits) );
michael@0 393 smallExp = (mp_size)res;
michael@0 394
michael@0 395 if (window_bits == 1) {
michael@0 396 if (!smallExp) {
michael@0 397 SQR(pa1,pa2); SWAPPA;
michael@0 398 } else if (smallExp & 1) {
michael@0 399 SQR(pa1,pa2); MUL(0,pa2,pa1);
michael@0 400 } else {
michael@0 401 abort();
michael@0 402 }
michael@0 403 } else if (window_bits == 4) {
michael@0 404 if (!smallExp) {
michael@0 405 SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);
michael@0 406 } else if (smallExp & 1) {
michael@0 407 SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);
michael@0 408 MUL(smallExp/2, pa1,pa2); SWAPPA;
michael@0 409 } else if (smallExp & 2) {
michael@0 410 SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2);
michael@0 411 MUL(smallExp/4,pa2,pa1); SQR(pa1,pa2); SWAPPA;
michael@0 412 } else if (smallExp & 4) {
michael@0 413 SQR(pa1,pa2); SQR(pa2,pa1); MUL(smallExp/8,pa1,pa2);
michael@0 414 SQR(pa2,pa1); SQR(pa1,pa2); SWAPPA;
michael@0 415 } else if (smallExp & 8) {
michael@0 416 SQR(pa1,pa2); MUL(smallExp/16,pa2,pa1); SQR(pa1,pa2);
michael@0 417 SQR(pa2,pa1); SQR(pa1,pa2); SWAPPA;
michael@0 418 } else {
michael@0 419 abort();
michael@0 420 }
michael@0 421 } else if (window_bits == 5) {
michael@0 422 if (!smallExp) {
michael@0 423 SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);
michael@0 424 SQR(pa1,pa2); SWAPPA;
michael@0 425 } else if (smallExp & 1) {
michael@0 426 SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);
michael@0 427 SQR(pa1,pa2); MUL(smallExp/2,pa2,pa1);
michael@0 428 } else if (smallExp & 2) {
michael@0 429 SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);
michael@0 430 MUL(smallExp/4,pa1,pa2); SQR(pa2,pa1);
michael@0 431 } else if (smallExp & 4) {
michael@0 432 SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2);
michael@0 433 MUL(smallExp/8,pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);
michael@0 434 } else if (smallExp & 8) {
michael@0 435 SQR(pa1,pa2); SQR(pa2,pa1); MUL(smallExp/16,pa1,pa2);
michael@0 436 SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);
michael@0 437 } else if (smallExp & 0x10) {
michael@0 438 SQR(pa1,pa2); MUL(smallExp/32,pa2,pa1); SQR(pa1,pa2);
michael@0 439 SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);
michael@0 440 } else {
michael@0 441 abort();
michael@0 442 }
michael@0 443 } else if (window_bits == 6) {
michael@0 444 if (!smallExp) {
michael@0 445 SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);
michael@0 446 SQR(pa1,pa2); SQR(pa2,pa1);
michael@0 447 } else if (smallExp & 1) {
michael@0 448 SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);
michael@0 449 SQR(pa1,pa2); SQR(pa2,pa1); MUL(smallExp/2,pa1,pa2); SWAPPA;
michael@0 450 } else if (smallExp & 2) {
michael@0 451 SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);
michael@0 452 SQR(pa1,pa2); MUL(smallExp/4,pa2,pa1); SQR(pa1,pa2); SWAPPA;
michael@0 453 } else if (smallExp & 4) {
michael@0 454 SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);
michael@0 455 MUL(smallExp/8,pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SWAPPA;
michael@0 456 } else if (smallExp & 8) {
michael@0 457 SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2);
michael@0 458 MUL(smallExp/16,pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);
michael@0 459 SQR(pa1,pa2); SWAPPA;
michael@0 460 } else if (smallExp & 0x10) {
michael@0 461 SQR(pa1,pa2); SQR(pa2,pa1); MUL(smallExp/32,pa1,pa2);
michael@0 462 SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SWAPPA;
michael@0 463 } else if (smallExp & 0x20) {
michael@0 464 SQR(pa1,pa2); MUL(smallExp/64,pa2,pa1); SQR(pa1,pa2);
michael@0 465 SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SWAPPA;
michael@0 466 } else {
michael@0 467 abort();
michael@0 468 }
michael@0 469 } else {
michael@0 470 abort();
michael@0 471 }
michael@0 472 }
michael@0 473
michael@0 474 res = s_mp_redc(pa1, mmm);
michael@0 475 mp_exch(pa1, result);
michael@0 476
michael@0 477 CLEANUP:
michael@0 478 mp_clear(&accum1);
michael@0 479 mp_clear(&accum2);
michael@0 480 mp_clear(&power2);
michael@0 481 for (i = 0; i < odd_ints; ++i) {
michael@0 482 mp_clear(oddPowers + i);
michael@0 483 }
michael@0 484 return res;
michael@0 485 }
michael@0 486 #undef SQR
michael@0 487 #undef MUL
michael@0 488
michael@0 489 #ifdef MP_USING_CACHE_SAFE_MOD_EXP
michael@0 490 unsigned int mp_using_cache_safe_exp = 1;
michael@0 491 #endif
michael@0 492
michael@0 493 mp_err mp_set_safe_modexp(int value)
michael@0 494 {
michael@0 495 #ifdef MP_USING_CACHE_SAFE_MOD_EXP
michael@0 496 mp_using_cache_safe_exp = value;
michael@0 497 return MP_OKAY;
michael@0 498 #else
michael@0 499 if (value == 0) {
michael@0 500 return MP_OKAY;
michael@0 501 }
michael@0 502 return MP_BADARG;
michael@0 503 #endif
michael@0 504 }
michael@0 505
michael@0 506 #ifdef MP_USING_CACHE_SAFE_MOD_EXP
michael@0 507 #define WEAVE_WORD_SIZE 4
michael@0 508
michael@0 509 #ifndef MP_CHAR_STORE_SLOW
michael@0 510 /*
michael@0 511 * mpi_to_weave takes an array of bignums, a matrix in which each bignum
michael@0 512 * occupies all the columns of a row, and transposes it into a matrix in
michael@0 513 * which each bignum occupies a column of every row. The first row of the
michael@0 514 * input matrix becomes the first column of the output matrix. The n'th
michael@0 515 * row of input becomes the n'th column of output. The input data is said
michael@0 516 * to be "interleaved" or "woven" into the output matrix.
michael@0 517 *
michael@0 518 * The array of bignums is left in this woven form. Each time a single
michael@0 519 * bignum value is needed, it is recreated by fetching the n'th column,
michael@0 520 * forming a single row which is the new bignum.
michael@0 521 *
michael@0 522 * The purpose of this interleaving is make it impossible to determine which
michael@0 523 * of the bignums is being used in any one operation by examining the pattern
michael@0 524 * of cache misses.
michael@0 525 *
michael@0 526 * The weaving function does not transpose the entire input matrix in one call.
michael@0 527 * It transposes 4 rows of mp_ints into their respective columns of output.
michael@0 528 *
michael@0 529 * There are two different implementations of the weaving and unweaving code
michael@0 530 * in this file. One uses byte loads and stores. The second uses loads and
michael@0 531 * stores of mp_weave_word size values. The weaved forms of these two
michael@0 532 * implementations differ. Consequently, each one has its own explanation.
michael@0 533 *
michael@0 534 * Here is the explanation for the byte-at-a-time implementation.
michael@0 535 *
michael@0 536 * This implementation treats each mp_int bignum as an array of bytes,
michael@0 537 * rather than as an array of mp_digits. It stores those bytes as a
michael@0 538 * column of bytes in the output matrix. It doesn't care if the machine
michael@0 539 * uses big-endian or little-endian byte ordering within mp_digits.
michael@0 540 * The first byte of the mp_digit array becomes the first byte in the output
michael@0 541 * column, regardless of whether that byte is the MSB or LSB of the mp_digit.
michael@0 542 *
michael@0 543 * "bignums" is an array of mp_ints.
michael@0 544 * It points to four rows, four mp_ints, a subset of a larger array of mp_ints.
michael@0 545 *
michael@0 546 * "weaved" is the weaved output matrix.
michael@0 547 * The first byte of bignums[0] is stored in weaved[0].
michael@0 548 *
michael@0 549 * "nBignums" is the total number of bignums in the array of which "bignums"
michael@0 550 * is a part.
michael@0 551 *
michael@0 552 * "nDigits" is the size in mp_digits of each mp_int in the "bignums" array.
michael@0 553 * mp_ints that use less than nDigits digits are logically padded with zeros
michael@0 554 * while being stored in the weaved array.
michael@0 555 */
michael@0 556 mp_err mpi_to_weave(const mp_int *bignums,
michael@0 557 unsigned char *weaved,
michael@0 558 mp_size nDigits, /* in each mp_int of input */
michael@0 559 mp_size nBignums) /* in the entire source array */
michael@0 560 {
michael@0 561 mp_size i;
michael@0 562 unsigned char * endDest = weaved + (nDigits * nBignums * sizeof(mp_digit));
michael@0 563
michael@0 564 for (i=0; i < WEAVE_WORD_SIZE; i++) {
michael@0 565 mp_size used = MP_USED(&bignums[i]);
michael@0 566 unsigned char *pSrc = (unsigned char *)MP_DIGITS(&bignums[i]);
michael@0 567 unsigned char *endSrc = pSrc + (used * sizeof(mp_digit));
michael@0 568 unsigned char *pDest = weaved + i;
michael@0 569
michael@0 570 ARGCHK(MP_SIGN(&bignums[i]) == MP_ZPOS, MP_BADARG);
michael@0 571 ARGCHK(used <= nDigits, MP_BADARG);
michael@0 572
michael@0 573 for (; pSrc < endSrc; pSrc++) {
michael@0 574 *pDest = *pSrc;
michael@0 575 pDest += nBignums;
michael@0 576 }
michael@0 577 while (pDest < endDest) {
michael@0 578 *pDest = 0;
michael@0 579 pDest += nBignums;
michael@0 580 }
michael@0 581 }
michael@0 582
michael@0 583 return MP_OKAY;
michael@0 584 }
michael@0 585
michael@0 586 /* Reverse the operation above for one mp_int.
michael@0 587 * Reconstruct one mp_int from its column in the weaved array.
michael@0 588 * "pSrc" points to the offset into the weave array of the bignum we
michael@0 589 * are going to reconstruct.
michael@0 590 */
michael@0 591 mp_err weave_to_mpi(mp_int *a, /* output, result */
michael@0 592 const unsigned char *pSrc, /* input, byte matrix */
michael@0 593 mp_size nDigits, /* per mp_int output */
michael@0 594 mp_size nBignums) /* bignums in weaved matrix */
michael@0 595 {
michael@0 596 unsigned char *pDest = (unsigned char *)MP_DIGITS(a);
michael@0 597 unsigned char *endDest = pDest + (nDigits * sizeof(mp_digit));
michael@0 598
michael@0 599 MP_SIGN(a) = MP_ZPOS;
michael@0 600 MP_USED(a) = nDigits;
michael@0 601
michael@0 602 for (; pDest < endDest; pSrc += nBignums, pDest++) {
michael@0 603 *pDest = *pSrc;
michael@0 604 }
michael@0 605 s_mp_clamp(a);
michael@0 606 return MP_OKAY;
michael@0 607 }
michael@0 608
michael@0 609 #else
michael@0 610
michael@0 611 /* Need a primitive that we know is 32 bits long... */
michael@0 612 /* this is true on all modern processors we know of today*/
michael@0 613 typedef unsigned int mp_weave_word;
michael@0 614
michael@0 615 /*
michael@0 616 * on some platforms character stores into memory is very expensive since they
michael@0 617 * generate a read/modify/write operation on the bus. On those platforms
michael@0 618 * we need to do integer writes to the bus. Because of some unrolled code,
michael@0 619 * in this current code the size of mp_weave_word must be four. The code that
michael@0 620 * makes this assumption explicity is called out. (on some platforms a write
michael@0 621 * of 4 bytes still requires a single read-modify-write operation.
michael@0 622 *
michael@0 623 * This function is takes the identical parameters as the function above,
michael@0 624 * however it lays out the final array differently. Where the previous function
michael@0 625 * treats the mpi_int as an byte array, this function treats it as an array of
michael@0 626 * mp_digits where each digit is stored in big endian order.
michael@0 627 *
michael@0 628 * since we need to interleave on a byte by byte basis, we need to collect
michael@0 629 * several mpi structures together into a single PRUint32 before we write. We
michael@0 630 * also need to make sure the PRUint32 is arranged so that the first value of
michael@0 631 * the first array winds up in b[0]. This means construction of that PRUint32
michael@0 632 * is endian specific (even though the layout of the mp_digits in the array
michael@0 633 * is always big endian).
michael@0 634 *
michael@0 635 * The final data is stored as follows :
michael@0 636 *
michael@0 637 * Our same logical array p array, m is sizeof(mp_digit),
michael@0 638 * N is still count and n is now b_size. If we define p[i].digit[j]0 as the
michael@0 639 * most significant byte of the word p[i].digit[j], p[i].digit[j]1 as
michael@0 640 * the next most significant byte of p[i].digit[j], ... and p[i].digit[j]m-1
michael@0 641 * is the least significant byte.
michael@0 642 * Our array would look like:
michael@0 643 * p[0].digit[0]0 p[1].digit[0]0 ... p[N-2].digit[0]0 p[N-1].digit[0]0
michael@0 644 * p[0].digit[0]1 p[1].digit[0]1 ... p[N-2].digit[0]1 p[N-1].digit[0]1
michael@0 645 * . .
michael@0 646 * p[0].digit[0]m-1 p[1].digit[0]m-1 ... p[N-2].digit[0]m-1 p[N-1].digit[0]m-1
michael@0 647 * p[0].digit[1]0 p[1].digit[1]0 ... p[N-2].digit[1]0 p[N-1].digit[1]0
michael@0 648 * . .
michael@0 649 * . .
michael@0 650 * p[0].digit[n-1]m-2 p[1].digit[n-1]m-2 ... p[N-2].digit[n-1]m-2 p[N-1].digit[n-1]m-2
michael@0 651 * p[0].digit[n-1]m-1 p[1].digit[n-1]m-1 ... p[N-2].digit[n-1]m-1 p[N-1].digit[n-1]m-1
michael@0 652 *
michael@0 653 */
/* Word-at-a-time weave (see the block comment above).
 * a       : array of (at least) WEAVE_WORD_SIZE mp_ints to interleave
 * b       : output weave matrix, written as 32-bit words
 * b_size  : digits per mp_int column in the output
 * count   : total number of bignums in the full weaved array, in bytes
 *           of stride (converted to words below)
 */
mp_err mpi_to_weave(const mp_int *a, unsigned char *b,
                    mp_size b_size, mp_size count)
{
    mp_size i;
    mp_digit *digitsa0;
    mp_digit *digitsa1;
    mp_digit *digitsa2;
    mp_digit *digitsa3;
    mp_size useda0;
    mp_size useda1;
    mp_size useda2;
    mp_size useda3;
    mp_weave_word *weaved = (mp_weave_word *)b;

    /* Each store below writes one mp_weave_word (4 bytes), so the
     * stride between rows is count bytes == count/4 words. */
    count = count/sizeof(mp_weave_word);

    /* this code pretty much depends on this ! */
#if MP_ARGCHK == 2
    assert(WEAVE_WORD_SIZE == 4);
    assert(sizeof(mp_weave_word) == 4);
#endif

    /* Cache digit pointers and used-counts of the four inputs. */
    digitsa0 = MP_DIGITS(&a[0]);
    digitsa1 = MP_DIGITS(&a[1]);
    digitsa2 = MP_DIGITS(&a[2]);
    digitsa3 = MP_DIGITS(&a[3]);
    useda0 = MP_USED(&a[0]);
    useda1 = MP_USED(&a[1]);
    useda2 = MP_USED(&a[2]);
    useda3 = MP_USED(&a[3]);

    ARGCHK(MP_SIGN(&a[0]) == MP_ZPOS, MP_BADARG);
    ARGCHK(MP_SIGN(&a[1]) == MP_ZPOS, MP_BADARG);
    ARGCHK(MP_SIGN(&a[2]) == MP_ZPOS, MP_BADARG);
    ARGCHK(MP_SIGN(&a[3]) == MP_ZPOS, MP_BADARG);
    ARGCHK(useda0 <= b_size, MP_BADARG);
    ARGCHK(useda1 <= b_size, MP_BADARG);
    ARGCHK(useda2 <= b_size, MP_BADARG);
    ARGCHK(useda3 <= b_size, MP_BADARG);

/* Read digit `word` of a bignum, treating digits past `used` as zero
 * (the logical zero padding out to b_size digits). */
#define SAFE_FETCH(digit, used, word) ((word) < (used) ? (digit[word]) : 0)

    for (i=0; i < b_size; i++) {
        mp_digit d0 = SAFE_FETCH(digitsa0,useda0,i);
        mp_digit d1 = SAFE_FETCH(digitsa1,useda1,i);
        mp_digit d2 = SAFE_FETCH(digitsa2,useda2,i);
        mp_digit d3 = SAFE_FETCH(digitsa3,useda3,i);
        register mp_weave_word acc;

/*
 * ONE_STEP takes the MSB of each of our current digits and places that
 * byte in the appropriate position for writing to the weaved array.
 *  On little endian:
 *   b3 b2 b1 b0
 *  On big endian:
 *   b0 b1 b2 b3
 * When the data is written it would always wind up:
 *   b[0] = b0
 *   b[1] = b1
 *   b[2] = b2
 *   b[3] = b3
 *
 * Once we've written the MSB, we shift the whole digit up left one
 * byte, putting the Next Most Significant Byte in the MSB position,
 * so we we repeat the next one step that byte will be written.
 * NOTE: This code assumes sizeof(mp_weave_word) and MP_WEAVE_WORD_SIZE
 * is 4.
 */
#ifdef MP_IS_LITTLE_ENDIAN
#define MPI_WEAVE_ONE_STEP \
    acc  = (d0 >> (MP_DIGIT_BIT-8))  & 0x000000ff; d0 <<= 8; /*b0*/ \
    acc |= (d1 >> (MP_DIGIT_BIT-16)) & 0x0000ff00; d1 <<= 8; /*b1*/ \
    acc |= (d2 >> (MP_DIGIT_BIT-24)) & 0x00ff0000; d2 <<= 8; /*b2*/ \
    acc |= (d3 >> (MP_DIGIT_BIT-32)) & 0xff000000; d3 <<= 8; /*b3*/ \
    *weaved = acc; weaved += count;
#else
#define MPI_WEAVE_ONE_STEP \
    acc  = (d0 >> (MP_DIGIT_BIT-32)) & 0xff000000; d0 <<= 8; /*b0*/ \
    acc |= (d1 >> (MP_DIGIT_BIT-24)) & 0x00ff0000; d1 <<= 8; /*b1*/ \
    acc |= (d2 >> (MP_DIGIT_BIT-16)) & 0x0000ff00; d2 <<= 8; /*b2*/ \
    acc |= (d3 >> (MP_DIGIT_BIT-8))  & 0x000000ff; d3 <<= 8; /*b3*/ \
    *weaved = acc; weaved += count;
#endif
        /* Duff's-device style unroll: sizeof(mp_digit) bytes per digit
         * need exactly that many ONE_STEPs; each case supplies the
         * difference from the next smaller case and FALLS THROUGH. */
        switch (sizeof(mp_digit)) {
        case 32:                /* 16 steps, then fall through for 16 more */
            MPI_WEAVE_ONE_STEP
            MPI_WEAVE_ONE_STEP
            MPI_WEAVE_ONE_STEP
            MPI_WEAVE_ONE_STEP
            MPI_WEAVE_ONE_STEP
            MPI_WEAVE_ONE_STEP
            MPI_WEAVE_ONE_STEP
            MPI_WEAVE_ONE_STEP
            MPI_WEAVE_ONE_STEP
            MPI_WEAVE_ONE_STEP
            MPI_WEAVE_ONE_STEP
            MPI_WEAVE_ONE_STEP
            MPI_WEAVE_ONE_STEP
            MPI_WEAVE_ONE_STEP
            MPI_WEAVE_ONE_STEP
            MPI_WEAVE_ONE_STEP
            /* FALLTHROUGH */
        case 16:                /* 8 steps, then fall through for 8 more */
            MPI_WEAVE_ONE_STEP
            MPI_WEAVE_ONE_STEP
            MPI_WEAVE_ONE_STEP
            MPI_WEAVE_ONE_STEP
            MPI_WEAVE_ONE_STEP
            MPI_WEAVE_ONE_STEP
            MPI_WEAVE_ONE_STEP
            MPI_WEAVE_ONE_STEP
            /* FALLTHROUGH */
        case 8:                 /* 4 steps, then fall through for 4 more */
            MPI_WEAVE_ONE_STEP
            MPI_WEAVE_ONE_STEP
            MPI_WEAVE_ONE_STEP
            MPI_WEAVE_ONE_STEP
            /* FALLTHROUGH */
        case 4:
            MPI_WEAVE_ONE_STEP
            MPI_WEAVE_ONE_STEP
            /* FALLTHROUGH */
        case 2:
            MPI_WEAVE_ONE_STEP
            /* FALLTHROUGH */
        case 1:
            MPI_WEAVE_ONE_STEP
            break;
        }
    }

    return MP_OKAY;
}
michael@0 782
michael@0 783 /* reverse the operation above for one entry.
michael@0 784 * b points to the offset into the weave array of the power we are
michael@0 785 * calculating */
michael@0 786 mp_err weave_to_mpi(mp_int *a, const unsigned char *b,
michael@0 787 mp_size b_size, mp_size count)
michael@0 788 {
michael@0 789 mp_digit *pb = MP_DIGITS(a);
michael@0 790 mp_digit *end = &pb[b_size];
michael@0 791
michael@0 792 MP_SIGN(a) = MP_ZPOS;
michael@0 793 MP_USED(a) = b_size;
michael@0 794
michael@0 795 for (; pb < end; pb++) {
michael@0 796 register mp_digit digit;
michael@0 797
michael@0 798 digit = *b << 8; b += count;
michael@0 799 #define MPI_UNWEAVE_ONE_STEP digit |= *b; b += count; digit = digit << 8;
michael@0 800 switch (sizeof(mp_digit)) {
michael@0 801 case 32:
michael@0 802 MPI_UNWEAVE_ONE_STEP
michael@0 803 MPI_UNWEAVE_ONE_STEP
michael@0 804 MPI_UNWEAVE_ONE_STEP
michael@0 805 MPI_UNWEAVE_ONE_STEP
michael@0 806 MPI_UNWEAVE_ONE_STEP
michael@0 807 MPI_UNWEAVE_ONE_STEP
michael@0 808 MPI_UNWEAVE_ONE_STEP
michael@0 809 MPI_UNWEAVE_ONE_STEP
michael@0 810 MPI_UNWEAVE_ONE_STEP
michael@0 811 MPI_UNWEAVE_ONE_STEP
michael@0 812 MPI_UNWEAVE_ONE_STEP
michael@0 813 MPI_UNWEAVE_ONE_STEP
michael@0 814 MPI_UNWEAVE_ONE_STEP
michael@0 815 MPI_UNWEAVE_ONE_STEP
michael@0 816 MPI_UNWEAVE_ONE_STEP
michael@0 817 MPI_UNWEAVE_ONE_STEP
michael@0 818 case 16:
michael@0 819 MPI_UNWEAVE_ONE_STEP
michael@0 820 MPI_UNWEAVE_ONE_STEP
michael@0 821 MPI_UNWEAVE_ONE_STEP
michael@0 822 MPI_UNWEAVE_ONE_STEP
michael@0 823 MPI_UNWEAVE_ONE_STEP
michael@0 824 MPI_UNWEAVE_ONE_STEP
michael@0 825 MPI_UNWEAVE_ONE_STEP
michael@0 826 MPI_UNWEAVE_ONE_STEP
michael@0 827 case 8:
michael@0 828 MPI_UNWEAVE_ONE_STEP
michael@0 829 MPI_UNWEAVE_ONE_STEP
michael@0 830 MPI_UNWEAVE_ONE_STEP
michael@0 831 MPI_UNWEAVE_ONE_STEP
michael@0 832 case 4:
michael@0 833 MPI_UNWEAVE_ONE_STEP
michael@0 834 MPI_UNWEAVE_ONE_STEP
michael@0 835 case 2:
michael@0 836 break;
michael@0 837 }
michael@0 838 digit |= *b; b += count;
michael@0 839
michael@0 840 *pb = digit;
michael@0 841 }
michael@0 842 s_mp_clamp(a);
michael@0 843 return MP_OKAY;
michael@0 844 }
michael@0 845 #endif
michael@0 846
michael@0 847
/* Helper macros for mp_exptmod_safe_i below. They expand inside that
 * function's body and rely on its locals (mmm, tmp, powers, nLen,
 * num_powers, pa1, pa2, ptmp) and on the 'res'/'CLEANUP' error
 * convention used by MP_CHECKOK. */

/* SQR(a,b): b = a*a mod N, operands in Montgomery form. */
#define SQR(a,b) \
  MP_CHECKOK( mp_sqr(a, b) );\
  MP_CHECKOK( s_mp_redc(b, mmm) )

/* MUL_NOWEAVE(x,a,b): b = a*x mod N in Montgomery form, where x is an
 * ordinary mp_int rather than an entry of the weaved power table. */
#if defined(MP_MONT_USE_MP_MUL)
/* plain multiply followed by an explicit Montgomery reduction */
#define MUL_NOWEAVE(x,a,b) \
  MP_CHECKOK( mp_mul(a, x, b) ); \
  MP_CHECKOK( s_mp_redc(b, mmm) )
#else
/* fused Montgomery multiply-reduce */
#define MUL_NOWEAVE(x,a,b) \
  MP_CHECKOK( s_mp_mul_mont(a, x, b, mmm) )
#endif

/* MUL(x,a,b): b = a * powers[x] mod N. First un-weaves power x into
 * tmp (the strided read touches the same lines for every x), then
 * multiplies. */
#define MUL(x,a,b) \
  MP_CHECKOK( weave_to_mpi(&tmp, powers + (x), nLen, num_powers) ); \
  MUL_NOWEAVE(&tmp,a,b)

/* swap the ping-pong accumulator pointers pa1/pa2 */
#define SWAPPA ptmp = pa1; pa1 = pa2; pa2 = ptmp
/* round x up to the next multiple of y; y must be a power of two */
#define MP_ALIGN(x,y) ((((ptrdiff_t)(x))+((y)-1))&(((ptrdiff_t)0)-(y)))
michael@0 867
/* Do modular exponentiation using integer multiply code.
 *
 * This is the cache-attack resistant variant: the table of window
 * powers is kept byte-interleaved ("weaved", see mpi_to_weave), so
 * fetching any power reads the same memory pattern as fetching any
 * other, hiding the exponent-dependent table index from a cache-timing
 * observer.
 *
 *  montBase         - the base, already converted to Montgomery form
 *  exponent         - the exponent (plain integer)
 *  modulus          - the (odd) modulus
 *  result           - receives base**exponent mod modulus, normal form
 *  mmm              - Montgomery context for modulus
 *  nLen             - digit length of the modulus
 *  bits_in_exponent - exponent bit count, already rounded up by the
 *                     caller to a whole multiple of window_bits
 *  window_bits      - exponent window size in bits (1, 4, 5 or 6)
 *  num_powers       - 2**window_bits, the number of table entries
 *
 * Returns MP_OKAY or an MP_* error code.
 */
mp_err mp_exptmod_safe_i(const mp_int * montBase,
                    const mp_int * exponent,
                    const mp_int * modulus,
                    mp_int * result,
                    mp_mont_modulus *mmm,
                    int nLen,
                    mp_size bits_in_exponent,
                    mp_size window_bits,
                    mp_size num_powers)
{
  mp_int *pa1, *pa2, *ptmp;       /* ping-pong accumulators + swap temp */
  mp_size i;
  mp_size first_window;           /* value of the top exponent window */
  mp_err res;
  int expOff;                     /* bit offset of the current window */
  mp_int accum1, accum2, accum[WEAVE_WORD_SIZE];
  mp_int tmp;                     /* scratch for un-weaved powers */
  unsigned char *powersArray;     /* raw allocation */
  unsigned char *powers;          /* aligned weaved power table */

  /* null the digit pointers so mp_clear is safe on every CLEANUP path */
  MP_DIGITS(&accum1) = 0;
  MP_DIGITS(&accum2) = 0;
  MP_DIGITS(&accum[0]) = 0;
  MP_DIGITS(&accum[1]) = 0;
  MP_DIGITS(&accum[2]) = 0;
  MP_DIGITS(&accum[3]) = 0;
  MP_DIGITS(&tmp) = 0;

  /* one extra byte per power leaves slack for the alignment below */
  powersArray = (unsigned char *)malloc(num_powers*(nLen*sizeof(mp_digit)+1));
  if (powersArray == NULL) {
    res = MP_MEM;
    goto CLEANUP;
  }

  /* powers[i] = base ** (i); align the table to a num_powers boundary
   * (num_powers is a power of two) */
  powers = (unsigned char *)MP_ALIGN(powersArray,num_powers);

  /* grab the first window value. This allows us to preload accumulator1
   * and save a conversion, some squares and a multiply */
  MP_CHECKOK( mpl_get_bits(exponent,
                           bits_in_exponent-window_bits, window_bits) );
  first_window = (mp_size)res;  /* MP_CHECKOK left the bits in res */

  MP_CHECKOK( mp_init_size(&accum1, 3 * nLen + 2) );
  MP_CHECKOK( mp_init_size(&accum2, 3 * nLen + 2) );
  MP_CHECKOK( mp_init_size(&tmp, 3 * nLen + 2) );

  /* build the first WEAVE_WORD powers inline */
  /* if WEAVE_WORD_SIZE is not 4, this code will have to change */
  if (num_powers > 2) {
    MP_CHECKOK( mp_init_size(&accum[0], 3 * nLen + 2) );
    MP_CHECKOK( mp_init_size(&accum[1], 3 * nLen + 2) );
    MP_CHECKOK( mp_init_size(&accum[2], 3 * nLen + 2) );
    MP_CHECKOK( mp_init_size(&accum[3], 3 * nLen + 2) );
    mp_set(&accum[0], 1);                           /* base**0 == 1 */
    MP_CHECKOK( s_mp_to_mont(&accum[0], mmm, &accum[0]) );
    MP_CHECKOK( mp_copy(montBase, &accum[1]) );     /* base**1 */
    SQR(montBase, &accum[2]);                       /* base**2 */
    MUL_NOWEAVE(montBase, &accum[2], &accum[3]);    /* base**3 */
    MP_CHECKOK( mpi_to_weave(accum, powers, nLen, num_powers) );
    if (first_window < 4) {
      /* top window's power is already on hand: preload accum1 and mark
       * it done by setting first_window out of range */
      MP_CHECKOK( mp_copy(&accum[first_window], &accum1) );
      first_window = num_powers;
    }
  } else {
    if (first_window == 0) {
      mp_set(&accum1, 1);
      MP_CHECKOK( s_mp_to_mont(&accum1, mmm, &accum1) );
    } else {
      /* assert first_window == 1? */
      MP_CHECKOK( mp_copy(montBase, &accum1) );
    }
  }

  /*
   * calculate all the powers in the powers array.
   * this adds 2**(k-1)-2 square operations over just calculating the
   * odd powers where k is the window size in the two other mp_modexpt
   * implementations in this file. We will get some of that
   * back by not needing the first 'k' squares and one multiply for the
   * first window */
  for (i = WEAVE_WORD_SIZE; i < num_powers; i++) {
    int acc_index = i & (WEAVE_WORD_SIZE-1); /* i % WEAVE_WORD_SIZE */
    if ( i & 1 ) {
      /* odd power: multiply the preceding even power by the base */
      MUL_NOWEAVE(montBase, &accum[acc_index-1] , &accum[acc_index]);
      /* we've filled the array do our 'per array' processing */
      if (acc_index == (WEAVE_WORD_SIZE-1)) {
        MP_CHECKOK( mpi_to_weave(accum, powers + i - (WEAVE_WORD_SIZE-1),
                                 nLen, num_powers) );

        if (first_window <= i) {
          MP_CHECKOK( mp_copy(&accum[first_window & (WEAVE_WORD_SIZE-1)],
                              &accum1) );
          first_window = num_powers;
        }
      }
    } else {
      /* even power: square power i/2. While i <= 2*WEAVE_WORD_SIZE the
       * value of power i/2 is still live in the accum array; at exactly
       * i == 2*WEAVE_WORD_SIZE the source and target slots coincide, so
       * square via a copy. Beyond that the accum slots have been
       * overwritten, so refetch power i/2 from the stored weave array. */
      if (i > 2* WEAVE_WORD_SIZE) {
        MP_CHECKOK(weave_to_mpi(&accum2, powers+i/2, nLen, num_powers));
        SQR(&accum2, &accum[acc_index]);
      } else {
        int half_power_index = (i/2) & (WEAVE_WORD_SIZE-1);
        if (half_power_index == acc_index) {
          /* copy is cheaper than weave_to_mpi */
          MP_CHECKOK(mp_copy(&accum[half_power_index], &accum2));
          SQR(&accum2,&accum[acc_index]);
        } else {
          SQR(&accum[half_power_index],&accum[acc_index]);
        }
      }
    }
  }
  /* if accum1 still isn't set, the logic above is broken — that would
   * be an internal programming error.
   */
#if MP_ARGCHK == 2
  assert(MP_USED(&accum1) != 0);
#endif

  /* accum1 now holds the power for the first (most significant) window;
   * process the remaining windows MSB-first: window_bits squarings,
   * then one multiply by the table power the window selects. */
  pa1 = &accum1;
  pa2 = &accum2;

  for (expOff = bits_in_exponent - window_bits*2; expOff >= 0; expOff -= window_bits) {
    mp_size smallExp;
    MP_CHECKOK( mpl_get_bits(exponent, expOff, window_bits) );
    smallExp = (mp_size)res;  /* MP_CHECKOK left the window bits in res */

    /* hand-unrolled loops for each supported window size */
    switch (window_bits) {
    case 1:
      if (!smallExp) {
        SQR(pa1,pa2); SWAPPA;
      } else if (smallExp & 1) {
        SQR(pa1,pa2); MUL_NOWEAVE(montBase,pa2,pa1);
      } else {
        abort();  /* a 1-bit window can only be 0 or 1 */
      }
      break;
    case 6:
      SQR(pa1,pa2); SQR(pa2,pa1);
      /* fall through */
    case 4:
      SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);
      MUL(smallExp, pa1,pa2); SWAPPA;
      break;
    case 5:
      SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);
      SQR(pa1,pa2); MUL(smallExp,pa2,pa1);
      break;
    default:
      abort(); /* could do a loop? */
    }
  }

  /* convert the accumulator back out of Montgomery form */
  res = s_mp_redc(pa1, mmm);
  mp_exch(pa1, result);

CLEANUP:
  mp_clear(&accum1);
  mp_clear(&accum2);
  mp_clear(&accum[0]);
  mp_clear(&accum[1]);
  mp_clear(&accum[2]);
  mp_clear(&accum[3]);
  mp_clear(&tmp);
  /* NOTE(review): the weaved power table (derived from the base) is
   * freed without being zeroized — the wipe below is disabled. Confirm
   * whether the callers' threat model requires scrubbing it. */
  /* PORT_Memset(powers,0,num_powers*nLen*sizeof(mp_digit)); */
  free(powersArray);
  return res;
}
michael@0 1043 #undef SQR
michael@0 1044 #undef MUL
michael@0 1045 #endif
michael@0 1046
michael@0 1047 mp_err mp_exptmod(const mp_int *inBase, const mp_int *exponent,
michael@0 1048 const mp_int *modulus, mp_int *result)
michael@0 1049 {
michael@0 1050 const mp_int *base;
michael@0 1051 mp_size bits_in_exponent, i, window_bits, odd_ints;
michael@0 1052 mp_err res;
michael@0 1053 int nLen;
michael@0 1054 mp_int montBase, goodBase;
michael@0 1055 mp_mont_modulus mmm;
michael@0 1056 #ifdef MP_USING_CACHE_SAFE_MOD_EXP
michael@0 1057 static unsigned int max_window_bits;
michael@0 1058 #endif
michael@0 1059
michael@0 1060 /* function for computing n0prime only works if n0 is odd */
michael@0 1061 if (!mp_isodd(modulus))
michael@0 1062 return s_mp_exptmod(inBase, exponent, modulus, result);
michael@0 1063
michael@0 1064 MP_DIGITS(&montBase) = 0;
michael@0 1065 MP_DIGITS(&goodBase) = 0;
michael@0 1066
michael@0 1067 if (mp_cmp(inBase, modulus) < 0) {
michael@0 1068 base = inBase;
michael@0 1069 } else {
michael@0 1070 MP_CHECKOK( mp_init(&goodBase) );
michael@0 1071 base = &goodBase;
michael@0 1072 MP_CHECKOK( mp_mod(inBase, modulus, &goodBase) );
michael@0 1073 }
michael@0 1074
michael@0 1075 nLen = MP_USED(modulus);
michael@0 1076 MP_CHECKOK( mp_init_size(&montBase, 2 * nLen + 2) );
michael@0 1077
michael@0 1078 mmm.N = *modulus; /* a copy of the mp_int struct */
michael@0 1079
michael@0 1080 /* compute n0', given n0, n0' = -(n0 ** -1) mod MP_RADIX
michael@0 1081 ** where n0 = least significant mp_digit of N, the modulus.
michael@0 1082 */
michael@0 1083 mmm.n0prime = 0 - s_mp_invmod_radix( MP_DIGIT(modulus, 0) );
michael@0 1084
michael@0 1085 MP_CHECKOK( s_mp_to_mont(base, &mmm, &montBase) );
michael@0 1086
michael@0 1087 bits_in_exponent = mpl_significant_bits(exponent);
michael@0 1088 #ifdef MP_USING_CACHE_SAFE_MOD_EXP
michael@0 1089 if (mp_using_cache_safe_exp) {
michael@0 1090 if (bits_in_exponent > 780)
michael@0 1091 window_bits = 6;
michael@0 1092 else if (bits_in_exponent > 256)
michael@0 1093 window_bits = 5;
michael@0 1094 else if (bits_in_exponent > 20)
michael@0 1095 window_bits = 4;
michael@0 1096 /* RSA public key exponents are typically under 20 bits (common values
michael@0 1097 * are: 3, 17, 65537) and a 4-bit window is inefficient
michael@0 1098 */
michael@0 1099 else
michael@0 1100 window_bits = 1;
michael@0 1101 } else
michael@0 1102 #endif
michael@0 1103 if (bits_in_exponent > 480)
michael@0 1104 window_bits = 6;
michael@0 1105 else if (bits_in_exponent > 160)
michael@0 1106 window_bits = 5;
michael@0 1107 else if (bits_in_exponent > 20)
michael@0 1108 window_bits = 4;
michael@0 1109 /* RSA public key exponents are typically under 20 bits (common values
michael@0 1110 * are: 3, 17, 65537) and a 4-bit window is inefficient
michael@0 1111 */
michael@0 1112 else
michael@0 1113 window_bits = 1;
michael@0 1114
michael@0 1115 #ifdef MP_USING_CACHE_SAFE_MOD_EXP
michael@0 1116 /*
michael@0 1117 * clamp the window size based on
michael@0 1118 * the cache line size.
michael@0 1119 */
michael@0 1120 if (!max_window_bits) {
michael@0 1121 unsigned long cache_size = s_mpi_getProcessorLineSize();
michael@0 1122 /* processor has no cache, use 'fast' code always */
michael@0 1123 if (cache_size == 0) {
michael@0 1124 mp_using_cache_safe_exp = 0;
michael@0 1125 }
michael@0 1126 if ((cache_size == 0) || (cache_size >= 64)) {
michael@0 1127 max_window_bits = 6;
michael@0 1128 } else if (cache_size >= 32) {
michael@0 1129 max_window_bits = 5;
michael@0 1130 } else if (cache_size >= 16) {
michael@0 1131 max_window_bits = 4;
michael@0 1132 } else max_window_bits = 1; /* should this be an assert? */
michael@0 1133 }
michael@0 1134
michael@0 1135 /* clamp the window size down before we caclulate bits_in_exponent */
michael@0 1136 if (mp_using_cache_safe_exp) {
michael@0 1137 if (window_bits > max_window_bits) {
michael@0 1138 window_bits = max_window_bits;
michael@0 1139 }
michael@0 1140 }
michael@0 1141 #endif
michael@0 1142
michael@0 1143 odd_ints = 1 << (window_bits - 1);
michael@0 1144 i = bits_in_exponent % window_bits;
michael@0 1145 if (i != 0) {
michael@0 1146 bits_in_exponent += window_bits - i;
michael@0 1147 }
michael@0 1148
michael@0 1149 #ifdef MP_USING_MONT_MULF
michael@0 1150 if (mp_using_mont_mulf) {
michael@0 1151 MP_CHECKOK( s_mp_pad(&montBase, nLen) );
michael@0 1152 res = mp_exptmod_f(&montBase, exponent, modulus, result, &mmm, nLen,
michael@0 1153 bits_in_exponent, window_bits, odd_ints);
michael@0 1154 } else
michael@0 1155 #endif
michael@0 1156 #ifdef MP_USING_CACHE_SAFE_MOD_EXP
michael@0 1157 if (mp_using_cache_safe_exp) {
michael@0 1158 res = mp_exptmod_safe_i(&montBase, exponent, modulus, result, &mmm, nLen,
michael@0 1159 bits_in_exponent, window_bits, 1 << window_bits);
michael@0 1160 } else
michael@0 1161 #endif
michael@0 1162 res = mp_exptmod_i(&montBase, exponent, modulus, result, &mmm, nLen,
michael@0 1163 bits_in_exponent, window_bits, odd_ints);
michael@0 1164
michael@0 1165 CLEANUP:
michael@0 1166 mp_clear(&montBase);
michael@0 1167 mp_clear(&goodBase);
michael@0 1168 /* Don't mp_clear mmm.N because it is merely a copy of modulus.
michael@0 1169 ** Just zap it.
michael@0 1170 */
michael@0 1171 memset(&mmm, 0, sizeof mmm);
michael@0 1172 return res;
michael@0 1173 }

mercurial