The Tor Browser: security/nss/lib/freebl/mpi/mpmontg.c@b8a032363ba2

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /* This Source Code Form is subject to the terms of the Mozilla Public

     2  * License, v. 2.0. If a copy of the MPL was not distributed with this

     3  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

     5 /* This file implements moduluar exponentiation using Montgomery's

     6  * method for modular reduction.  This file implements the method

     7  * described as "Improvement 2" in the paper "A Cryptogrpahic Library for

     8  * the Motorola DSP56000" by Stephen R. Dusse' and Burton S. Kaliski Jr.

     9  * published in "Advances in Cryptology: Proceedings of EUROCRYPT '90"

    10  * "Lecture Notes in Computer Science" volume 473, 1991, pg 230-244,

    11  * published by Springer Verlag.

    12  */

    14 #define MP_USING_CACHE_SAFE_MOD_EXP 1

    15 #include <string.h>

    16 #include "mpi-priv.h"

    17 #include "mplogic.h"

    18 #include "mpprime.h"

    19 #ifdef MP_USING_MONT_MULF

    20 #include "montmulf.h"

    21 #endif

    22 #include <stddef.h> /* ptrdiff_t */

    24 /* if MP_CHAR_STORE_SLOW is defined, we  */

    25 /* need to know endianness of this platform. */

    26 #ifdef MP_CHAR_STORE_SLOW

    27 #if !defined(MP_IS_BIG_ENDIAN) && !defined(MP_IS_LITTLE_ENDIAN)

    28 #error "You must define MP_IS_BIG_ENDIAN or MP_IS_LITTLE_ENDIAN\n" \

    29        "  if you define MP_CHAR_STORE_SLOW."

    30 #endif

    31 #endif

    33 #define STATIC

    35 #define MAX_ODD_INTS    32   /* 2 ** (WINDOW_BITS - 1) */

    37 /*! computes T = REDC(T), 2^b == R

    38     \param T < RN

    39 */

    40 mp_err s_mp_redc(mp_int *T, mp_mont_modulus *mmm)

    41 {

    42   mp_err res;

    43   mp_size i;

    45   i = (MP_USED(&mmm->N) << 1) + 1;

    46   MP_CHECKOK( s_mp_pad(T, i) );

    47   for (i = 0; i < MP_USED(&mmm->N); ++i ) {

    48     mp_digit m_i = MP_DIGIT(T, i) * mmm->n0prime;

    49     /* T += N * m_i * (MP_RADIX ** i); */

    50     MP_CHECKOK( s_mp_mul_d_add_offset(&mmm->N, m_i, T, i) );

    51   }

    52   s_mp_clamp(T);

    54   /* T /= R */

    55   s_mp_rshd( T, MP_USED(&mmm->N) );

    57   if ((res = s_mp_cmp(T, &mmm->N)) >= 0) {

    58     /* T = T - N */

    59     MP_CHECKOK( s_mp_sub(T, &mmm->N) );

    60 #ifdef DEBUG

    61     if ((res = mp_cmp(T, &mmm->N)) >= 0) {

    62       res = MP_UNDEF;

    63       goto CLEANUP;

    64     }

    65 #endif

    66   }

    67   res = MP_OKAY;

    68 CLEANUP:

    69   return res;

    70 }

    72 #if !defined(MP_MONT_USE_MP_MUL)

    74 /*! c <- REDC( a * b ) mod N

    75     \param a < N  i.e. "reduced"

    76     \param b < N  i.e. "reduced"

    77     \param mmm modulus N and n0' of N

    78 */

    79 mp_err s_mp_mul_mont(const mp_int *a, const mp_int *b, mp_int *c,

    80 	           mp_mont_modulus *mmm)

    81 {

    82   mp_digit *pb;

    83   mp_digit m_i;

    84   mp_err   res;

    85   mp_size  ib; /* "index b": index of current digit of B */

    86   mp_size  useda, usedb;

    88   ARGCHK(a != NULL && b != NULL && c != NULL, MP_BADARG);

    90   if (MP_USED(a) < MP_USED(b)) {

    91     const mp_int *xch = b;	/* switch a and b, to do fewer outer loops */

    92     b = a;

    93     a = xch;

    94   }

    96   MP_USED(c) = 1; MP_DIGIT(c, 0) = 0;

    97   ib = (MP_USED(&mmm->N) << 1) + 1;

    98   if((res = s_mp_pad(c, ib)) != MP_OKAY)

    99     goto CLEANUP;

   101   useda = MP_USED(a);

   102   pb = MP_DIGITS(b);

   103   s_mpv_mul_d(MP_DIGITS(a), useda, *pb++, MP_DIGITS(c));

   104   s_mp_setz(MP_DIGITS(c) + useda + 1, ib - (useda + 1));

   105   m_i = MP_DIGIT(c, 0) * mmm->n0prime;

   106   s_mp_mul_d_add_offset(&mmm->N, m_i, c, 0);

   108   /* Outer loop:  Digits of b */

   109   usedb = MP_USED(b);

   110   for (ib = 1; ib < usedb; ib++) {

   111     mp_digit b_i    = *pb++;

   113     /* Inner product:  Digits of a */

   114     if (b_i)

   115       s_mpv_mul_d_add_prop(MP_DIGITS(a), useda, b_i, MP_DIGITS(c) + ib);

   116     m_i = MP_DIGIT(c, ib) * mmm->n0prime;

   117     s_mp_mul_d_add_offset(&mmm->N, m_i, c, ib);

   118   }

   119   if (usedb < MP_USED(&mmm->N)) {

   120     for (usedb = MP_USED(&mmm->N); ib < usedb; ++ib ) {

   121       m_i = MP_DIGIT(c, ib) * mmm->n0prime;

   122       s_mp_mul_d_add_offset(&mmm->N, m_i, c, ib);

   123     }

   124   }

   125   s_mp_clamp(c);

   126   s_mp_rshd( c, MP_USED(&mmm->N) ); /* c /= R */

   127   if (s_mp_cmp(c, &mmm->N) >= 0) {

   128     MP_CHECKOK( s_mp_sub(c, &mmm->N) );

   129   }

   130   res = MP_OKAY;

   132 CLEANUP:

   133   return res;

   134 }

   135 #endif

   137 STATIC

   138 mp_err s_mp_to_mont(const mp_int *x, mp_mont_modulus *mmm, mp_int *xMont)

   139 {

   140   mp_err res;

   142   /* xMont = x * R mod N   where  N is modulus */

   143   MP_CHECKOK( mp_copy( x, xMont ) );

   144   MP_CHECKOK( s_mp_lshd( xMont, MP_USED(&mmm->N) ) );	/* xMont = x << b */

   145   MP_CHECKOK( mp_div(xMont, &mmm->N, 0, xMont) );	/*         mod N */

   146 CLEANUP:

   147   return res;

   148 }

   150 #ifdef MP_USING_MONT_MULF

   152 /* the floating point multiply is already cache safe,

   153  * don't turn on cache safe unless we specifically

   154  * force it */

   155 #ifndef MP_FORCE_CACHE_SAFE

   156 #undef MP_USING_CACHE_SAFE_MOD_EXP

   157 #endif

   159 unsigned int mp_using_mont_mulf = 1;

   161 /* computes montgomery square of the integer in mResult */

   162 #define SQR \

   163   conv_i32_to_d32_and_d16(dm1, d16Tmp, mResult, nLen); \

   164   mont_mulf_noconv(mResult, dm1, d16Tmp, \

   165 		   dTmp, dn, MP_DIGITS(modulus), nLen, dn0)

   167 /* computes montgomery product of x and the integer in mResult */

   168 #define MUL(x) \

   169   conv_i32_to_d32(dm1, mResult, nLen); \

   170   mont_mulf_noconv(mResult, dm1, oddPowers[x], \

   171 		   dTmp, dn, MP_DIGITS(modulus), nLen, dn0)

   173 /* Do modular exponentiation using floating point multiply code. */

   174 mp_err mp_exptmod_f(const mp_int *   montBase,

   175                     const mp_int *   exponent,

   176 		    const mp_int *   modulus,

   177 		    mp_int *         result,

   178 		    mp_mont_modulus *mmm,

   179 		    int              nLen,

   180 		    mp_size          bits_in_exponent,

   181 		    mp_size          window_bits,

   182 		    mp_size          odd_ints)

   183 {

   184   mp_digit *mResult;

   185   double   *dBuf = 0, *dm1, *dn, *dSqr, *d16Tmp, *dTmp;

   186   double    dn0;

   187   mp_size   i;

   188   mp_err    res;

   189   int       expOff;

   190   int       dSize = 0, oddPowSize, dTmpSize;

   191   mp_int    accum1;

   192   double   *oddPowers[MAX_ODD_INTS];

   194   /* function for computing n0prime only works if n0 is odd */

   196   MP_DIGITS(&accum1) = 0;

   198   for (i = 0; i < MAX_ODD_INTS; ++i)

   199     oddPowers[i] = 0;

   201   MP_CHECKOK( mp_init_size(&accum1, 3 * nLen + 2) );

   203   mp_set(&accum1, 1);

   204   MP_CHECKOK( s_mp_to_mont(&accum1, mmm, &accum1) );

   205   MP_CHECKOK( s_mp_pad(&accum1, nLen) );

   207   oddPowSize = 2 * nLen + 1;

   208   dTmpSize   = 2 * oddPowSize;

   209   dSize = sizeof(double) * (nLen * 4 + 1 +

   210 			    ((odd_ints + 1) * oddPowSize) + dTmpSize);

   211   dBuf   = (double *)malloc(dSize);

   212   dm1    = dBuf;		/* array of d32 */

   213   dn     = dBuf   + nLen;	/* array of d32 */

   214   dSqr   = dn     + nLen;    	/* array of d32 */

   215   d16Tmp = dSqr   + nLen;	/* array of d16 */

   216   dTmp   = d16Tmp + oddPowSize;

   218   for (i = 0; i < odd_ints; ++i) {

   219       oddPowers[i] = dTmp;

   220       dTmp += oddPowSize;

   221   }

   222   mResult = (mp_digit *)(dTmp + dTmpSize);	/* size is nLen + 1 */

   224   /* Make dn and dn0 */

   225   conv_i32_to_d32(dn, MP_DIGITS(modulus), nLen);

   226   dn0 = (double)(mmm->n0prime & 0xffff);

   228   /* Make dSqr */

   229   conv_i32_to_d32_and_d16(dm1, oddPowers[0], MP_DIGITS(montBase), nLen);

   230   mont_mulf_noconv(mResult, dm1, oddPowers[0],

   231 		   dTmp, dn, MP_DIGITS(modulus), nLen, dn0);

   232   conv_i32_to_d32(dSqr, mResult, nLen);

   234   for (i = 1; i < odd_ints; ++i) {

   235     mont_mulf_noconv(mResult, dSqr, oddPowers[i - 1],

   236 		     dTmp, dn, MP_DIGITS(modulus), nLen, dn0);

   237     conv_i32_to_d16(oddPowers[i], mResult, nLen);

   238   }

   240   s_mp_copy(MP_DIGITS(&accum1), mResult, nLen); /* from, to, len */

   242   for (expOff = bits_in_exponent - window_bits; expOff >= 0; expOff -= window_bits) {

   243     mp_size smallExp;

   244     MP_CHECKOK( mpl_get_bits(exponent, expOff, window_bits) );

   245     smallExp = (mp_size)res;

   247     if (window_bits == 1) {

   248       if (!smallExp) {

   249 	SQR;

   250       } else if (smallExp & 1) {

   251 	SQR; MUL(0);

   252       } else {

   253 	abort();

   254       }

   255     } else if (window_bits == 4) {

   256       if (!smallExp) {

   257 	SQR; SQR; SQR; SQR;

   258       } else if (smallExp & 1) {

   259 	SQR; SQR; SQR; SQR; MUL(smallExp/2);

   260       } else if (smallExp & 2) {

   261 	SQR; SQR; SQR; MUL(smallExp/4); SQR;

   262       } else if (smallExp & 4) {

   263 	SQR; SQR; MUL(smallExp/8); SQR; SQR;

   264       } else if (smallExp & 8) {

   265 	SQR; MUL(smallExp/16); SQR; SQR; SQR;

   266       } else {

   267 	abort();

   268       }

   269     } else if (window_bits == 5) {

   270       if (!smallExp) {

   271 	SQR; SQR; SQR; SQR; SQR;

   272       } else if (smallExp & 1) {

   273 	SQR; SQR; SQR; SQR; SQR; MUL(smallExp/2);

   274       } else if (smallExp & 2) {

   275 	SQR; SQR; SQR; SQR; MUL(smallExp/4); SQR;

   276       } else if (smallExp & 4) {

   277 	SQR; SQR; SQR; MUL(smallExp/8); SQR; SQR;

   278       } else if (smallExp & 8) {

   279 	SQR; SQR; MUL(smallExp/16); SQR; SQR; SQR;

   280       } else if (smallExp & 0x10) {

   281 	SQR; MUL(smallExp/32); SQR; SQR; SQR; SQR;

   282       } else {

   283 	abort();

   284       }

   285     } else if (window_bits == 6) {

   286       if (!smallExp) {

   287 	SQR; SQR; SQR; SQR; SQR; SQR;

   288       } else if (smallExp & 1) {

   289 	SQR; SQR; SQR; SQR; SQR; SQR; MUL(smallExp/2);

   290       } else if (smallExp & 2) {

   291 	SQR; SQR; SQR; SQR; SQR; MUL(smallExp/4); SQR;

   292       } else if (smallExp & 4) {

   293 	SQR; SQR; SQR; SQR; MUL(smallExp/8); SQR; SQR;

   294       } else if (smallExp & 8) {

   295 	SQR; SQR; SQR; MUL(smallExp/16); SQR; SQR; SQR;

   296       } else if (smallExp & 0x10) {

   297 	SQR; SQR; MUL(smallExp/32); SQR; SQR; SQR; SQR;

   298       } else if (smallExp & 0x20) {

   299 	SQR; MUL(smallExp/64); SQR; SQR; SQR; SQR; SQR;

   300       } else {

   301 	abort();

   302       }

   303     } else {

   304       abort();

   305     }

   306   }

   308   s_mp_copy(mResult, MP_DIGITS(&accum1), nLen); /* from, to, len */

   310   res = s_mp_redc(&accum1, mmm);

   311   mp_exch(&accum1, result);

   313 CLEANUP:

   314   mp_clear(&accum1);

   315   if (dBuf) {

   316     if (dSize)

   317       memset(dBuf, 0, dSize);

   318     free(dBuf);

   319   }

   321   return res;

   322 }

   323 #undef SQR

   324 #undef MUL

   325 #endif

   327 #define SQR(a,b) \

   328   MP_CHECKOK( mp_sqr(a, b) );\

   329   MP_CHECKOK( s_mp_redc(b, mmm) )

   331 #if defined(MP_MONT_USE_MP_MUL)

   332 #define MUL(x,a,b) \

   333   MP_CHECKOK( mp_mul(a, oddPowers + (x), b) ); \

   334   MP_CHECKOK( s_mp_redc(b, mmm) )

   335 #else

   336 #define MUL(x,a,b) \

   337   MP_CHECKOK( s_mp_mul_mont(a, oddPowers + (x), b, mmm) )

   338 #endif

   340 #define SWAPPA ptmp = pa1; pa1 = pa2; pa2 = ptmp

   342 /* Do modular exponentiation using integer multiply code. */

   343 mp_err mp_exptmod_i(const mp_int *   montBase,

   344                     const mp_int *   exponent,

   345 		    const mp_int *   modulus,

   346 		    mp_int *         result,

   347 		    mp_mont_modulus *mmm,

   348 		    int              nLen,

   349 		    mp_size          bits_in_exponent,

   350 		    mp_size          window_bits,

   351 		    mp_size          odd_ints)

   352 {

   353   mp_int *pa1, *pa2, *ptmp;

   354   mp_size i;

   355   mp_err  res;

   356   int     expOff;

   357   mp_int  accum1, accum2, power2, oddPowers[MAX_ODD_INTS];

   359   /* power2 = base ** 2; oddPowers[i] = base ** (2*i + 1); */

   360   /* oddPowers[i] = base ** (2*i + 1); */

   362   MP_DIGITS(&accum1) = 0;

   363   MP_DIGITS(&accum2) = 0;

   364   MP_DIGITS(&power2) = 0;

   365   for (i = 0; i < MAX_ODD_INTS; ++i) {

   366     MP_DIGITS(oddPowers + i) = 0;

   367   }

   369   MP_CHECKOK( mp_init_size(&accum1, 3 * nLen + 2) );

   370   MP_CHECKOK( mp_init_size(&accum2, 3 * nLen + 2) );

   372   MP_CHECKOK( mp_init_copy(&oddPowers[0], montBase) );

   374   mp_init_size(&power2, nLen + 2 * MP_USED(montBase) + 2);

   375   MP_CHECKOK( mp_sqr(montBase, &power2) );	/* power2 = montBase ** 2 */

   376   MP_CHECKOK( s_mp_redc(&power2, mmm) );

   378   for (i = 1; i < odd_ints; ++i) {

   379     mp_init_size(oddPowers + i, nLen + 2 * MP_USED(&power2) + 2);

   380     MP_CHECKOK( mp_mul(oddPowers + (i - 1), &power2, oddPowers + i) );

   381     MP_CHECKOK( s_mp_redc(oddPowers + i, mmm) );

   382   }

   384   /* set accumulator to montgomery residue of 1 */

   385   mp_set(&accum1, 1);

   386   MP_CHECKOK( s_mp_to_mont(&accum1, mmm, &accum1) );

   387   pa1 = &accum1;

   388   pa2 = &accum2;

   390   for (expOff = bits_in_exponent - window_bits; expOff >= 0; expOff -= window_bits) {

   391     mp_size smallExp;

   392     MP_CHECKOK( mpl_get_bits(exponent, expOff, window_bits) );

   393     smallExp = (mp_size)res;

   395     if (window_bits == 1) {

   396       if (!smallExp) {

   397 	SQR(pa1,pa2); SWAPPA;

   398       } else if (smallExp & 1) {

   399 	SQR(pa1,pa2); MUL(0,pa2,pa1);

   400       } else {

   401 	abort();

   402       }

   403     } else if (window_bits == 4) {

   404       if (!smallExp) {

   405 	SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);

   406       } else if (smallExp & 1) {

   407 	SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);

   408 	MUL(smallExp/2, pa1,pa2); SWAPPA;

   409       } else if (smallExp & 2) {

   410 	SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2);

   411 	MUL(smallExp/4,pa2,pa1); SQR(pa1,pa2); SWAPPA;

   412       } else if (smallExp & 4) {

   413 	SQR(pa1,pa2); SQR(pa2,pa1); MUL(smallExp/8,pa1,pa2);

   414 	SQR(pa2,pa1); SQR(pa1,pa2); SWAPPA;

   415       } else if (smallExp & 8) {

   416 	SQR(pa1,pa2); MUL(smallExp/16,pa2,pa1); SQR(pa1,pa2);

   417 	SQR(pa2,pa1); SQR(pa1,pa2); SWAPPA;

   418       } else {

   419 	abort();

   420       }

   421     } else if (window_bits == 5) {

   422       if (!smallExp) {

   423 	SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);

   424 	SQR(pa1,pa2); SWAPPA;

   425       } else if (smallExp & 1) {

   426 	SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);

   427 	SQR(pa1,pa2); MUL(smallExp/2,pa2,pa1);

   428       } else if (smallExp & 2) {

   429 	SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);

   430 	MUL(smallExp/4,pa1,pa2); SQR(pa2,pa1);

   431       } else if (smallExp & 4) {

   432 	SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2);

   433 	MUL(smallExp/8,pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);

   434       } else if (smallExp & 8) {

   435 	SQR(pa1,pa2); SQR(pa2,pa1); MUL(smallExp/16,pa1,pa2);

   436 	SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);

   437       } else if (smallExp & 0x10) {

   438 	SQR(pa1,pa2); MUL(smallExp/32,pa2,pa1); SQR(pa1,pa2);

   439 	SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);

   440       } else {

   441 	abort();

   442       }

   443     } else if (window_bits == 6) {

   444       if (!smallExp) {

   445 	SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);

   446 	SQR(pa1,pa2); SQR(pa2,pa1);

   447       } else if (smallExp & 1) {

   448 	SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);

   449 	SQR(pa1,pa2); SQR(pa2,pa1); MUL(smallExp/2,pa1,pa2); SWAPPA;

   450       } else if (smallExp & 2) {

   451 	SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);

   452 	SQR(pa1,pa2); MUL(smallExp/4,pa2,pa1); SQR(pa1,pa2); SWAPPA;

   453       } else if (smallExp & 4) {

   454 	SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);

   455 	MUL(smallExp/8,pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SWAPPA;

   456       } else if (smallExp & 8) {

   457 	SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2);

   458 	MUL(smallExp/16,pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);

   459 	SQR(pa1,pa2); SWAPPA;

   460       } else if (smallExp & 0x10) {

   461 	SQR(pa1,pa2); SQR(pa2,pa1); MUL(smallExp/32,pa1,pa2);

   462 	SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SWAPPA;

   463       } else if (smallExp & 0x20) {

   464 	SQR(pa1,pa2); MUL(smallExp/64,pa2,pa1); SQR(pa1,pa2);

   465 	SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SWAPPA;

   466       } else {

   467 	abort();

   468       }

   469     } else {

   470       abort();

   471     }

   472   }

   474   res = s_mp_redc(pa1, mmm);

   475   mp_exch(pa1, result);

   477 CLEANUP:

   478   mp_clear(&accum1);

   479   mp_clear(&accum2);

   480   mp_clear(&power2);

   481   for (i = 0; i < odd_ints; ++i) {

   482     mp_clear(oddPowers + i);

   483   }

   484   return res;

   485 }

   486 #undef SQR

   487 #undef MUL

   489 #ifdef MP_USING_CACHE_SAFE_MOD_EXP

   490 unsigned int mp_using_cache_safe_exp = 1;

   491 #endif

   493 mp_err mp_set_safe_modexp(int value)

   494 {

   495 #ifdef MP_USING_CACHE_SAFE_MOD_EXP

   496  mp_using_cache_safe_exp = value;

   497  return MP_OKAY;

   498 #else

   499  if (value == 0) {

   500    return MP_OKAY;

   501  }

   502  return MP_BADARG;

   503 #endif

   504 }

   506 #ifdef MP_USING_CACHE_SAFE_MOD_EXP

   507 #define WEAVE_WORD_SIZE 4

   509 #ifndef MP_CHAR_STORE_SLOW

   510 /*

   511  * mpi_to_weave takes an array of bignums, a matrix in which each bignum

   512  * occupies all the columns of a row, and transposes it into a matrix in

   513  * which each bignum occupies a column of every row.  The first row of the

   514  * input matrix becomes the first column of the output matrix.  The n'th

   515  * row of input becomes the n'th column of output.  The input data is said

   516  * to be "interleaved" or "woven" into the output matrix.

   517  *

   518  * The array of bignums is left in this woven form.  Each time a single

   519  * bignum value is needed, it is recreated by fetching the n'th column,

   520  * forming a single row which is the new bignum.

   521  *

   522  * The purpose of this interleaving is make it impossible to determine which

   523  * of the bignums is being used in any one operation by examining the pattern

   524  * of cache misses.

   525  *

   526  * The weaving function does not transpose the entire input matrix in one call.

   527  * It transposes 4 rows of mp_ints into their respective columns of output.

   528  *

   529  * There are two different implementations of the weaving and unweaving code

   530  * in this file.  One uses byte loads and stores.  The second uses loads and

   531  * stores of mp_weave_word size values.  The weaved forms of these two

   532  * implementations differ.  Consequently, each one has its own explanation.

   533  *

   534  * Here is the explanation for the byte-at-a-time implementation.

   535  *

   536  * This implementation treats each mp_int bignum as an array of bytes,

   537  * rather than as an array of mp_digits.  It stores those bytes as a

   538  * column of bytes in the output matrix.  It doesn't care if the machine

   539  * uses big-endian or little-endian byte ordering within mp_digits.

   540  * The first byte of the mp_digit array becomes the first byte in the output

   541  * column, regardless of whether that byte is the MSB or LSB of the mp_digit.

   542  *

   543  * "bignums" is an array of mp_ints.

   544  * It points to four rows, four mp_ints, a subset of a larger array of mp_ints.

   545  *

   546  * "weaved" is the weaved output matrix.

   547  * The first byte of bignums[0] is stored in weaved[0].

   548  *

   549  * "nBignums" is the total number of bignums in the array of which "bignums"

   550  * is a part.

   551  *

   552  * "nDigits" is the size in mp_digits of each mp_int in the "bignums" array.

   553  * mp_ints that use less than nDigits digits are logically padded with zeros

   554  * while being stored in the weaved array.

   555  */

   556 mp_err mpi_to_weave(const mp_int  *bignums,

   557                     unsigned char *weaved,

   558 		    mp_size nDigits,  /* in each mp_int of input */

   559 		    mp_size nBignums) /* in the entire source array */

   560 {

   561   mp_size i;

   562   unsigned char * endDest = weaved + (nDigits * nBignums * sizeof(mp_digit));

   564   for (i=0; i < WEAVE_WORD_SIZE; i++) {

   565     mp_size used = MP_USED(&bignums[i]);

   566     unsigned char *pSrc   = (unsigned char *)MP_DIGITS(&bignums[i]);

   567     unsigned char *endSrc = pSrc + (used * sizeof(mp_digit));

   568     unsigned char *pDest  = weaved + i;

   570     ARGCHK(MP_SIGN(&bignums[i]) == MP_ZPOS, MP_BADARG);

   571     ARGCHK(used <= nDigits, MP_BADARG);

   573     for (; pSrc < endSrc; pSrc++) {

   574       *pDest = *pSrc;

   575       pDest += nBignums;

   576     }

   577     while (pDest < endDest) {

   578       *pDest = 0;

   579       pDest += nBignums;

   580     }

   581   }

   583   return MP_OKAY;

   584 }

   586 /* Reverse the operation above for one mp_int.

   587  * Reconstruct one mp_int from its column in the weaved array.

   588  * "pSrc" points to the offset into the weave array of the bignum we

   589  * are going to reconstruct.

   590  */

   591 mp_err weave_to_mpi(mp_int *a,                /* output, result */

   592                     const unsigned char *pSrc, /* input, byte matrix */

   593 		    mp_size nDigits,          /* per mp_int output */

   594 		    mp_size nBignums)         /* bignums in weaved matrix */

   595 {

   596   unsigned char *pDest   = (unsigned char *)MP_DIGITS(a);

   597   unsigned char *endDest = pDest + (nDigits * sizeof(mp_digit));

   599   MP_SIGN(a) = MP_ZPOS;

   600   MP_USED(a) = nDigits;

   602   for (; pDest < endDest; pSrc += nBignums, pDest++) {

   603     *pDest = *pSrc;

   604   }

   605   s_mp_clamp(a);

   606   return MP_OKAY;

   607 }

   609 #else

   611 /* Need a primitive that we know is 32 bits long... */

   612 /* this is true on all modern processors we know of today*/

   613 typedef unsigned int mp_weave_word;

   615 /*

   616  * on some platforms character stores into memory is very expensive since they

   617  * generate a read/modify/write operation on the bus. On those platforms

   618  * we need to do integer writes to the bus. Because of some unrolled code,

   619  * in this current code the size of mp_weave_word must be four. The code that

   620  * makes this assumption explicity is called out. (on some platforms a write

   621  * of 4 bytes still requires a single read-modify-write operation.

   622  *

   623  * This function is takes the identical parameters as the function above,

   624  * however it lays out the final array differently. Where the previous function

   625  * treats the mpi_int as an byte array, this function treats it as an array of

   626  * mp_digits where each digit is stored in big endian order.

   627  *

   628  * since we need to interleave on a byte by byte basis, we need to collect

   629  * several mpi structures together into a single PRUint32 before we write. We

   630  * also need to make sure the PRUint32 is arranged so that the first value of

   631  * the first array winds up in b[0]. This means construction of that PRUint32

   632  * is endian specific (even though the layout of the mp_digits in the array

   633  * is always big endian).

   634  *

   635  * The final data is stored as follows :

   636  *

   637  * Our same logical array p array, m is sizeof(mp_digit),

   638  * N is still count and n is now b_size. If we define p[i].digit[j]0 as the

   639  * most significant byte of the word p[i].digit[j], p[i].digit[j]1 as

   640  * the next most significant byte of p[i].digit[j], ...  and p[i].digit[j]m-1

   641  * is the least significant byte.

   642  * Our array would look like:

   643  * p[0].digit[0]0     p[1].digit[0]0    ...  p[N-2].digit[0]0    p[N-1].digit[0]0

   644  * p[0].digit[0]1     p[1].digit[0]1    ...  p[N-2].digit[0]1    p[N-1].digit[0]1

   645  *                .                                         .

   646  * p[0].digit[0]m-1   p[1].digit[0]m-1  ...  p[N-2].digit[0]m-1  p[N-1].digit[0]m-1

   647  * p[0].digit[1]0     p[1].digit[1]0    ...  p[N-2].digit[1]0    p[N-1].digit[1]0

   648  *                .                                         .

   649  *                .                                         .

   650  * p[0].digit[n-1]m-2 p[1].digit[n-1]m-2 ... p[N-2].digit[n-1]m-2 p[N-1].digit[n-1]m-2

   651  * p[0].digit[n-1]m-1 p[1].digit[n-1]m-1 ... p[N-2].digit[n-1]m-1 p[N-1].digit[n-1]m-1

   652  *

   653  */

   654 mp_err mpi_to_weave(const mp_int *a, unsigned char *b,

   655 					mp_size b_size, mp_size count)

   656 {

   657   mp_size i;

   658   mp_digit *digitsa0;

   659   mp_digit *digitsa1;

   660   mp_digit *digitsa2;

   661   mp_digit *digitsa3;

   662   mp_size   useda0;

   663   mp_size   useda1;

   664   mp_size   useda2;

   665   mp_size   useda3;

   666   mp_weave_word *weaved = (mp_weave_word *)b;

   668   count = count/sizeof(mp_weave_word);

   670   /* this code pretty much depends on this ! */

   671 #if MP_ARGCHK == 2

   672   assert(WEAVE_WORD_SIZE == 4);

   673   assert(sizeof(mp_weave_word) == 4);

   674 #endif

   676   digitsa0 = MP_DIGITS(&a[0]);

   677   digitsa1 = MP_DIGITS(&a[1]);

   678   digitsa2 = MP_DIGITS(&a[2]);

   679   digitsa3 = MP_DIGITS(&a[3]);

   680   useda0 = MP_USED(&a[0]);

   681   useda1 = MP_USED(&a[1]);

   682   useda2 = MP_USED(&a[2]);

   683   useda3 = MP_USED(&a[3]);

   685   ARGCHK(MP_SIGN(&a[0]) == MP_ZPOS, MP_BADARG);

   686   ARGCHK(MP_SIGN(&a[1]) == MP_ZPOS, MP_BADARG);

   687   ARGCHK(MP_SIGN(&a[2]) == MP_ZPOS, MP_BADARG);

   688   ARGCHK(MP_SIGN(&a[3]) == MP_ZPOS, MP_BADARG);

   689   ARGCHK(useda0 <= b_size, MP_BADARG);

   690   ARGCHK(useda1 <= b_size, MP_BADARG);

   691   ARGCHK(useda2 <= b_size, MP_BADARG);

   692   ARGCHK(useda3 <= b_size, MP_BADARG);

   694 #define SAFE_FETCH(digit, used, word) ((word) < (used) ? (digit[word]) : 0)

   696   for (i=0; i < b_size; i++) {

   697     mp_digit d0 = SAFE_FETCH(digitsa0,useda0,i);

   698     mp_digit d1 = SAFE_FETCH(digitsa1,useda1,i);

   699     mp_digit d2 = SAFE_FETCH(digitsa2,useda2,i);

   700     mp_digit d3 = SAFE_FETCH(digitsa3,useda3,i);

   701     register mp_weave_word acc;

   703 /*

   704  * ONE_STEP takes the MSB of each of our current digits and places that

   705  * byte in the appropriate position for writing to the weaved array.

   706  *  On little endian:

   707  *   b3 b2 b1 b0

   708  *  On big endian:

   709  *   b0 b1 b2 b3

   710  *  When the data is written it would always wind up:

   711  *   b[0] = b0

   712  *   b[1] = b1

   713  *   b[2] = b2

   714  *   b[3] = b3

   715  *

   716  * Once we've written the MSB, we shift the whole digit up left one

   717  * byte, putting the Next Most Significant Byte in the MSB position,

   718  * so we we repeat the next one step that byte will be written.

   719  * NOTE: This code assumes sizeof(mp_weave_word) and MP_WEAVE_WORD_SIZE

   720  * is 4.

   721  */

   722 #ifdef MP_IS_LITTLE_ENDIAN

   723 #define MPI_WEAVE_ONE_STEP \

   724     acc  = (d0 >> (MP_DIGIT_BIT-8))  & 0x000000ff; d0 <<= 8; /*b0*/ \

   725     acc |= (d1 >> (MP_DIGIT_BIT-16)) & 0x0000ff00; d1 <<= 8; /*b1*/ \

   726     acc |= (d2 >> (MP_DIGIT_BIT-24)) & 0x00ff0000; d2 <<= 8; /*b2*/ \

   727     acc |= (d3 >> (MP_DIGIT_BIT-32)) & 0xff000000; d3 <<= 8; /*b3*/ \

   728     *weaved = acc; weaved += count;

   729 #else

   730 #define MPI_WEAVE_ONE_STEP \

   731     acc  = (d0 >> (MP_DIGIT_BIT-32)) & 0xff000000; d0 <<= 8; /*b0*/ \

   732     acc |= (d1 >> (MP_DIGIT_BIT-24)) & 0x00ff0000; d1 <<= 8; /*b1*/ \

   733     acc |= (d2 >> (MP_DIGIT_BIT-16)) & 0x0000ff00; d2 <<= 8; /*b2*/ \

   734     acc |= (d3 >> (MP_DIGIT_BIT-8))  & 0x000000ff; d3 <<= 8; /*b3*/ \

   735     *weaved = acc; weaved += count;

   736 #endif

   737    switch (sizeof(mp_digit)) {

   738    case 32:

   739     MPI_WEAVE_ONE_STEP

   740     MPI_WEAVE_ONE_STEP

   741     MPI_WEAVE_ONE_STEP

   742     MPI_WEAVE_ONE_STEP

   743     MPI_WEAVE_ONE_STEP

   744     MPI_WEAVE_ONE_STEP

   745     MPI_WEAVE_ONE_STEP

   746     MPI_WEAVE_ONE_STEP

   747     MPI_WEAVE_ONE_STEP

   748     MPI_WEAVE_ONE_STEP

   749     MPI_WEAVE_ONE_STEP

   750     MPI_WEAVE_ONE_STEP

   751     MPI_WEAVE_ONE_STEP

   752     MPI_WEAVE_ONE_STEP

   753     MPI_WEAVE_ONE_STEP

   754     MPI_WEAVE_ONE_STEP

   755    case 16:

   756     MPI_WEAVE_ONE_STEP

   757     MPI_WEAVE_ONE_STEP

   758     MPI_WEAVE_ONE_STEP

   759     MPI_WEAVE_ONE_STEP

   760     MPI_WEAVE_ONE_STEP

   761     MPI_WEAVE_ONE_STEP

   762     MPI_WEAVE_ONE_STEP

   763     MPI_WEAVE_ONE_STEP

   764    case 8:

   765     MPI_WEAVE_ONE_STEP

   766     MPI_WEAVE_ONE_STEP

   767     MPI_WEAVE_ONE_STEP

   768     MPI_WEAVE_ONE_STEP

   769    case 4:

   770     MPI_WEAVE_ONE_STEP

   771     MPI_WEAVE_ONE_STEP

   772    case 2:

   773     MPI_WEAVE_ONE_STEP

   774    case 1:

   775     MPI_WEAVE_ONE_STEP

   776     break;

   777    }

   778   }

   780   return MP_OKAY;

   781 }

   783 /* reverse the operation above for one entry.

   784  * b points to the offset into the weave array of the power we are

   785  * calculating */

   786 mp_err weave_to_mpi(mp_int *a, const unsigned char *b,

   787 					mp_size b_size, mp_size count)

   788 {

   789   mp_digit *pb = MP_DIGITS(a);

   790   mp_digit *end = &pb[b_size];

   792   MP_SIGN(a) = MP_ZPOS;

   793   MP_USED(a) = b_size;

   795   for (; pb < end; pb++) {

   796     register mp_digit digit;

   798     digit = *b << 8; b += count;

   799 #define MPI_UNWEAVE_ONE_STEP  digit |= *b; b += count; digit = digit << 8;

   800     switch (sizeof(mp_digit)) {

   801     case 32:

   802 	MPI_UNWEAVE_ONE_STEP

   803 	MPI_UNWEAVE_ONE_STEP

   804 	MPI_UNWEAVE_ONE_STEP

   805 	MPI_UNWEAVE_ONE_STEP

   806 	MPI_UNWEAVE_ONE_STEP

   807 	MPI_UNWEAVE_ONE_STEP

   808 	MPI_UNWEAVE_ONE_STEP

   809 	MPI_UNWEAVE_ONE_STEP

   810 	MPI_UNWEAVE_ONE_STEP

   811 	MPI_UNWEAVE_ONE_STEP

   812 	MPI_UNWEAVE_ONE_STEP

   813 	MPI_UNWEAVE_ONE_STEP

   814 	MPI_UNWEAVE_ONE_STEP

   815 	MPI_UNWEAVE_ONE_STEP

   816 	MPI_UNWEAVE_ONE_STEP

   817 	MPI_UNWEAVE_ONE_STEP

   818     case 16:

   819 	MPI_UNWEAVE_ONE_STEP

   820 	MPI_UNWEAVE_ONE_STEP

   821 	MPI_UNWEAVE_ONE_STEP

   822 	MPI_UNWEAVE_ONE_STEP

   823 	MPI_UNWEAVE_ONE_STEP

   824 	MPI_UNWEAVE_ONE_STEP

   825 	MPI_UNWEAVE_ONE_STEP

   826 	MPI_UNWEAVE_ONE_STEP

   827     case 8:

   828 	MPI_UNWEAVE_ONE_STEP

   829 	MPI_UNWEAVE_ONE_STEP

   830 	MPI_UNWEAVE_ONE_STEP

   831 	MPI_UNWEAVE_ONE_STEP

   832     case 4:

   833 	MPI_UNWEAVE_ONE_STEP

   834 	MPI_UNWEAVE_ONE_STEP

   835     case 2:

   836 	break;

   837     }

   838     digit |= *b; b += count;

   840     *pb = digit;

   841   }

   842   s_mp_clamp(a);

   843   return MP_OKAY;

   844 }

   845 #endif

   848 #define SQR(a,b) \

   849   MP_CHECKOK( mp_sqr(a, b) );\

   850   MP_CHECKOK( s_mp_redc(b, mmm) )

   852 #if defined(MP_MONT_USE_MP_MUL)

   853 #define MUL_NOWEAVE(x,a,b) \

   854   MP_CHECKOK( mp_mul(a, x, b) ); \

   855   MP_CHECKOK( s_mp_redc(b, mmm) )

   856 #else

   857 #define MUL_NOWEAVE(x,a,b) \

   858   MP_CHECKOK( s_mp_mul_mont(a, x, b, mmm) )

   859 #endif

   861 #define MUL(x,a,b) \

   862   MP_CHECKOK( weave_to_mpi(&tmp, powers + (x), nLen, num_powers) ); \

   863   MUL_NOWEAVE(&tmp,a,b)

   865 #define SWAPPA ptmp = pa1; pa1 = pa2; pa2 = ptmp

   866 #define MP_ALIGN(x,y) ((((ptrdiff_t)(x))+((y)-1))&(((ptrdiff_t)0)-(y)))

   868 /* Do modular exponentiation using integer multiply code. */

   869 mp_err mp_exptmod_safe_i(const mp_int *   montBase,

   870                     const mp_int *   exponent,

   871 		    const mp_int *   modulus,

   872 		    mp_int *         result,

   873 		    mp_mont_modulus *mmm,

   874 		    int              nLen,

   875 		    mp_size          bits_in_exponent,

   876 		    mp_size          window_bits,

   877 		    mp_size          num_powers)

   878 {

   879   mp_int *pa1, *pa2, *ptmp;

   880   mp_size i;

   881   mp_size first_window;

   882   mp_err  res;

   883   int     expOff;

   884   mp_int  accum1, accum2, accum[WEAVE_WORD_SIZE];

   885   mp_int  tmp;

   886   unsigned char *powersArray;

   887   unsigned char *powers;

   889   MP_DIGITS(&accum1) = 0;

   890   MP_DIGITS(&accum2) = 0;

   891   MP_DIGITS(&accum[0]) = 0;

   892   MP_DIGITS(&accum[1]) = 0;

   893   MP_DIGITS(&accum[2]) = 0;

   894   MP_DIGITS(&accum[3]) = 0;

   895   MP_DIGITS(&tmp) = 0;

   897   powersArray = (unsigned char *)malloc(num_powers*(nLen*sizeof(mp_digit)+1));

   898   if (powersArray == NULL) {

   899     res = MP_MEM;

   900     goto CLEANUP;

   901   }

   903   /* powers[i] = base ** (i); */

   904   powers = (unsigned char *)MP_ALIGN(powersArray,num_powers);

   906   /* grab the first window value. This allows us to preload accumulator1

   907    * and save a conversion, some squares and a multiple*/

   908   MP_CHECKOK( mpl_get_bits(exponent,

   909 				bits_in_exponent-window_bits, window_bits) );

   910   first_window = (mp_size)res;

   912   MP_CHECKOK( mp_init_size(&accum1, 3 * nLen + 2) );

   913   MP_CHECKOK( mp_init_size(&accum2, 3 * nLen + 2) );

   914   MP_CHECKOK( mp_init_size(&tmp, 3 * nLen + 2) );

   916   /* build the first WEAVE_WORD powers inline */

   917   /* if WEAVE_WORD_SIZE is not 4, this code will have to change */

   918   if (num_powers > 2) {

   919     MP_CHECKOK( mp_init_size(&accum[0], 3 * nLen + 2) );

   920     MP_CHECKOK( mp_init_size(&accum[1], 3 * nLen + 2) );

   921     MP_CHECKOK( mp_init_size(&accum[2], 3 * nLen + 2) );

   922     MP_CHECKOK( mp_init_size(&accum[3], 3 * nLen + 2) );

   923     mp_set(&accum[0], 1);

   924     MP_CHECKOK( s_mp_to_mont(&accum[0], mmm, &accum[0]) );

   925     MP_CHECKOK( mp_copy(montBase, &accum[1]) );

   926     SQR(montBase, &accum[2]);

   927     MUL_NOWEAVE(montBase, &accum[2], &accum[3]);

   928     MP_CHECKOK( mpi_to_weave(accum, powers, nLen, num_powers) );

   929     if (first_window < 4) {

   930       MP_CHECKOK( mp_copy(&accum[first_window], &accum1) );

   931       first_window = num_powers;

   932     }

   933   } else {

   934       if (first_window == 0) {

   935         mp_set(&accum1, 1);

   936         MP_CHECKOK( s_mp_to_mont(&accum1, mmm, &accum1) );

   937       } else {

   938         /* assert first_window == 1? */

   939         MP_CHECKOK( mp_copy(montBase, &accum1) );

   940       }

   941   }

   943   /*

   944    * calculate all the powers in the powers array.

   945    * this adds 2**(k-1)-2 square operations over just calculating the

   946    * odd powers where k is the window size in the two other mp_modexpt

   947    * implementations in this file. We will get some of that

   948    * back by not needing the first 'k' squares and one multiply for the

   949    * first window */

   950   for (i = WEAVE_WORD_SIZE; i < num_powers; i++) {

   951     int acc_index = i & (WEAVE_WORD_SIZE-1); /* i % WEAVE_WORD_SIZE */

   952     if ( i & 1 ) {

   953       MUL_NOWEAVE(montBase, &accum[acc_index-1] , &accum[acc_index]);

   954       /* we've filled the array do our 'per array' processing */

   955       if (acc_index == (WEAVE_WORD_SIZE-1)) {

   956         MP_CHECKOK( mpi_to_weave(accum, powers + i - (WEAVE_WORD_SIZE-1),

   957 							 nLen, num_powers) );

   959         if (first_window <= i) {

   960           MP_CHECKOK( mp_copy(&accum[first_window & (WEAVE_WORD_SIZE-1)],

   961 								&accum1) );

   962           first_window = num_powers;

   963         }

   964       }

   965     } else {

   966       /* up to 8 we can find 2^i-1 in the accum array, but at 8 we our source

   967        * and target are the same so we need to copy.. After that, the

   968        * value is overwritten, so we need to fetch it from the stored

   969        * weave array */

   970       if (i > 2* WEAVE_WORD_SIZE) {

   971         MP_CHECKOK(weave_to_mpi(&accum2, powers+i/2, nLen, num_powers));

   972         SQR(&accum2, &accum[acc_index]);

   973       } else {

   974 	int half_power_index = (i/2) & (WEAVE_WORD_SIZE-1);

   975 	if (half_power_index == acc_index) {

   976 	   /* copy is cheaper than weave_to_mpi */

   977 	   MP_CHECKOK(mp_copy(&accum[half_power_index], &accum2));

   978 	   SQR(&accum2,&accum[acc_index]);

   979 	} else {

   980 	   SQR(&accum[half_power_index],&accum[acc_index]);

   981 	}

   982       }

   983     }

   984   }

   985   /* if the accum1 isn't set, Then there is something wrong with our logic

   986    * above and is an internal programming error.

   987    */

   988 #if MP_ARGCHK == 2

   989   assert(MP_USED(&accum1) != 0);

   990 #endif

   992   /* set accumulator to montgomery residue of 1 */

   993   pa1 = &accum1;

   994   pa2 = &accum2;

   996   for (expOff = bits_in_exponent - window_bits*2; expOff >= 0; expOff -= window_bits) {

   997     mp_size smallExp;

   998     MP_CHECKOK( mpl_get_bits(exponent, expOff, window_bits) );

   999     smallExp = (mp_size)res;

  1001     /* handle unroll the loops */

  1002     switch (window_bits) {

  1003     case 1:

  1004 	if (!smallExp) {

  1005 	    SQR(pa1,pa2); SWAPPA;

  1006 	} else if (smallExp & 1) {

  1007 	    SQR(pa1,pa2); MUL_NOWEAVE(montBase,pa2,pa1);

  1008 	} else {

  1009 	    abort();

  1010 	}

  1011 	break;

  1012     case 6:

  1013 	SQR(pa1,pa2); SQR(pa2,pa1);

  1014 	/* fall through */

  1015     case 4:

  1016 	SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);

  1017 	MUL(smallExp, pa1,pa2); SWAPPA;

  1018 	break;

  1019     case 5:

  1020 	SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);

  1021 	SQR(pa1,pa2); MUL(smallExp,pa2,pa1);

  1022 	break;

  1023     default:

  1024 	abort(); /* could do a loop? */

  1025     }

  1026   }

  1028   res = s_mp_redc(pa1, mmm);

  1029   mp_exch(pa1, result);

  1031 CLEANUP:

  1032   mp_clear(&accum1);

  1033   mp_clear(&accum2);

  1034   mp_clear(&accum[0]);

  1035   mp_clear(&accum[1]);

  1036   mp_clear(&accum[2]);

  1037   mp_clear(&accum[3]);

  1038   mp_clear(&tmp);

  1039   /* PORT_Memset(powers,0,num_powers*nLen*sizeof(mp_digit)); */

  1040   free(powersArray);

  1041   return res;

  1042 }

  1043 #undef SQR

  1044 #undef MUL

  1045 #endif

  1047 mp_err mp_exptmod(const mp_int *inBase, const mp_int *exponent,

  1048 		  const mp_int *modulus, mp_int *result)

  1049 {

  1050   const mp_int *base;

  1051   mp_size bits_in_exponent, i, window_bits, odd_ints;

  1052   mp_err  res;

  1053   int     nLen;

  1054   mp_int  montBase, goodBase;

  1055   mp_mont_modulus mmm;

  1056 #ifdef MP_USING_CACHE_SAFE_MOD_EXP

  1057   static unsigned int max_window_bits;

  1058 #endif

  1060   /* function for computing n0prime only works if n0 is odd */

  1061   if (!mp_isodd(modulus))

  1062     return s_mp_exptmod(inBase, exponent, modulus, result);

  1064   MP_DIGITS(&montBase) = 0;

  1065   MP_DIGITS(&goodBase) = 0;

  1067   if (mp_cmp(inBase, modulus) < 0) {

  1068     base = inBase;

  1069   } else {

  1070     MP_CHECKOK( mp_init(&goodBase) );

  1071     base = &goodBase;

  1072     MP_CHECKOK( mp_mod(inBase, modulus, &goodBase) );

  1073   }

  1075   nLen  = MP_USED(modulus);

  1076   MP_CHECKOK( mp_init_size(&montBase, 2 * nLen + 2) );

  1078   mmm.N = *modulus;			/* a copy of the mp_int struct */

  1080   /* compute n0', given n0, n0' = -(n0 ** -1) mod MP_RADIX

  1081   **		where n0 = least significant mp_digit of N, the modulus.

  1082   */

  1083   mmm.n0prime = 0 - s_mp_invmod_radix( MP_DIGIT(modulus, 0) );

  1085   MP_CHECKOK( s_mp_to_mont(base, &mmm, &montBase) );

  1087   bits_in_exponent = mpl_significant_bits(exponent);

  1088 #ifdef MP_USING_CACHE_SAFE_MOD_EXP

  1089   if (mp_using_cache_safe_exp) {

  1090     if (bits_in_exponent > 780)

  1091 	window_bits = 6;

  1092     else if (bits_in_exponent > 256)

  1093 	window_bits = 5;

  1094     else if (bits_in_exponent > 20)

  1095 	window_bits = 4;

  1096        /* RSA public key exponents are typically under 20 bits (common values

  1097         * are: 3, 17, 65537) and a 4-bit window is inefficient

  1098         */

  1099     else

  1100 	window_bits = 1;

  1101   } else

  1102 #endif

  1103   if (bits_in_exponent > 480)

  1104     window_bits = 6;

  1105   else if (bits_in_exponent > 160)

  1106     window_bits = 5;

  1107   else if (bits_in_exponent > 20)

  1108     window_bits = 4;

  1109   /* RSA public key exponents are typically under 20 bits (common values

  1110    * are: 3, 17, 65537) and a 4-bit window is inefficient

  1111    */

  1112   else

  1113     window_bits = 1;

  1115 #ifdef MP_USING_CACHE_SAFE_MOD_EXP

  1116   /*

  1117    * clamp the window size based on

  1118    * the cache line size.

  1119    */

  1120   if (!max_window_bits) {

  1121     unsigned long cache_size = s_mpi_getProcessorLineSize();

  1122     /* processor has no cache, use 'fast' code always */

  1123     if (cache_size == 0) {

  1124       mp_using_cache_safe_exp = 0;

  1125     }

  1126     if ((cache_size == 0) || (cache_size >= 64)) {

  1127       max_window_bits = 6;

  1128     } else if (cache_size >= 32) {

  1129       max_window_bits = 5;

  1130     } else if (cache_size >= 16) {

  1131       max_window_bits = 4;

  1132     } else max_window_bits = 1; /* should this be an assert? */

  1133   }

  1135   /* clamp the window size down before we caclulate bits_in_exponent */

  1136   if (mp_using_cache_safe_exp) {

  1137     if (window_bits > max_window_bits) {

  1138       window_bits = max_window_bits;

  1139     }

  1140   }

  1141 #endif

  1143   odd_ints = 1 << (window_bits - 1);

  1144   i = bits_in_exponent % window_bits;

  1145   if (i != 0) {

  1146     bits_in_exponent += window_bits - i;

  1147   }

  1149 #ifdef MP_USING_MONT_MULF

  1150   if (mp_using_mont_mulf) {

  1151     MP_CHECKOK( s_mp_pad(&montBase, nLen) );

  1152     res = mp_exptmod_f(&montBase, exponent, modulus, result, &mmm, nLen,

  1153 		     bits_in_exponent, window_bits, odd_ints);

  1154   } else

  1155 #endif

  1156 #ifdef MP_USING_CACHE_SAFE_MOD_EXP

  1157   if (mp_using_cache_safe_exp) {

  1158     res = mp_exptmod_safe_i(&montBase, exponent, modulus, result, &mmm, nLen,

  1159 		     bits_in_exponent, window_bits, 1 << window_bits);

  1160   } else

  1161 #endif

  1162   res = mp_exptmod_i(&montBase, exponent, modulus, result, &mmm, nLen,

  1163 		     bits_in_exponent, window_bits, odd_ints);

  1165 CLEANUP:

  1166   mp_clear(&montBase);

  1167   mp_clear(&goodBase);

  1168   /* Don't mp_clear mmm.N because it is merely a copy of modulus.

  1169   ** Just zap it.

  1170   */

  1171   memset(&mmm, 0, sizeof mmm);

  1172   return res;

  1173 }

The Tor Browser / file revision

security/nss/lib/freebl/mpi/mpmontg.c@b8a032363ba2

security/nss/lib/freebl/mpi/mpmontg.c