security/nss/lib/freebl/mpi/mpmontg.c

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /* This Source Code Form is subject to the terms of the Mozilla Public
     2  * License, v. 2.0. If a copy of the MPL was not distributed with this
     3  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     5 /* This file implements moduluar exponentiation using Montgomery's
     6  * method for modular reduction.  This file implements the method
     7  * described as "Improvement 2" in the paper "A Cryptogrpahic Library for
     8  * the Motorola DSP56000" by Stephen R. Dusse' and Burton S. Kaliski Jr.
     9  * published in "Advances in Cryptology: Proceedings of EUROCRYPT '90"
    10  * "Lecture Notes in Computer Science" volume 473, 1991, pg 230-244,
    11  * published by Springer Verlag.
    12  */
    14 #define MP_USING_CACHE_SAFE_MOD_EXP 1 
    15 #include <string.h>
    16 #include "mpi-priv.h"
    17 #include "mplogic.h"
    18 #include "mpprime.h"
    19 #ifdef MP_USING_MONT_MULF
    20 #include "montmulf.h"
    21 #endif
    22 #include <stddef.h> /* ptrdiff_t */
    24 /* if MP_CHAR_STORE_SLOW is defined, we  */
    25 /* need to know endianness of this platform. */
    26 #ifdef MP_CHAR_STORE_SLOW
    27 #if !defined(MP_IS_BIG_ENDIAN) && !defined(MP_IS_LITTLE_ENDIAN)
    28 #error "You must define MP_IS_BIG_ENDIAN or MP_IS_LITTLE_ENDIAN\n" \
    29        "  if you define MP_CHAR_STORE_SLOW."
    30 #endif
    31 #endif
    33 #define STATIC
    35 #define MAX_ODD_INTS    32   /* 2 ** (WINDOW_BITS - 1) */
    37 /*! computes T = REDC(T), 2^b == R 
    38     \param T < RN
    39 */
    40 mp_err s_mp_redc(mp_int *T, mp_mont_modulus *mmm)
    41 {
    42   mp_err res;
    43   mp_size i;
    45   i = (MP_USED(&mmm->N) << 1) + 1;
    46   MP_CHECKOK( s_mp_pad(T, i) );
    47   for (i = 0; i < MP_USED(&mmm->N); ++i ) {
    48     mp_digit m_i = MP_DIGIT(T, i) * mmm->n0prime;
    49     /* T += N * m_i * (MP_RADIX ** i); */
    50     MP_CHECKOK( s_mp_mul_d_add_offset(&mmm->N, m_i, T, i) );
    51   }
    52   s_mp_clamp(T);
    54   /* T /= R */
    55   s_mp_rshd( T, MP_USED(&mmm->N) );
    57   if ((res = s_mp_cmp(T, &mmm->N)) >= 0) {
    58     /* T = T - N */
    59     MP_CHECKOK( s_mp_sub(T, &mmm->N) );
    60 #ifdef DEBUG
    61     if ((res = mp_cmp(T, &mmm->N)) >= 0) {
    62       res = MP_UNDEF;
    63       goto CLEANUP;
    64     }
    65 #endif
    66   }
    67   res = MP_OKAY;
    68 CLEANUP:
    69   return res;
    70 }
    72 #if !defined(MP_MONT_USE_MP_MUL)
    74 /*! c <- REDC( a * b ) mod N
    75     \param a < N  i.e. "reduced"
    76     \param b < N  i.e. "reduced"
    77     \param mmm modulus N and n0' of N
    78 */
    79 mp_err s_mp_mul_mont(const mp_int *a, const mp_int *b, mp_int *c, 
    80 	           mp_mont_modulus *mmm)
    81 {
    82   mp_digit *pb;
    83   mp_digit m_i;
    84   mp_err   res;
    85   mp_size  ib; /* "index b": index of current digit of B */
    86   mp_size  useda, usedb;
    88   ARGCHK(a != NULL && b != NULL && c != NULL, MP_BADARG);
    90   if (MP_USED(a) < MP_USED(b)) {
    91     const mp_int *xch = b;	/* switch a and b, to do fewer outer loops */
    92     b = a;
    93     a = xch;
    94   }
    96   MP_USED(c) = 1; MP_DIGIT(c, 0) = 0;
    97   ib = (MP_USED(&mmm->N) << 1) + 1;
    98   if((res = s_mp_pad(c, ib)) != MP_OKAY)
    99     goto CLEANUP;
   101   useda = MP_USED(a);
   102   pb = MP_DIGITS(b);
   103   s_mpv_mul_d(MP_DIGITS(a), useda, *pb++, MP_DIGITS(c));
   104   s_mp_setz(MP_DIGITS(c) + useda + 1, ib - (useda + 1));
   105   m_i = MP_DIGIT(c, 0) * mmm->n0prime;
   106   s_mp_mul_d_add_offset(&mmm->N, m_i, c, 0);
   108   /* Outer loop:  Digits of b */
   109   usedb = MP_USED(b);
   110   for (ib = 1; ib < usedb; ib++) {
   111     mp_digit b_i    = *pb++;
   113     /* Inner product:  Digits of a */
   114     if (b_i)
   115       s_mpv_mul_d_add_prop(MP_DIGITS(a), useda, b_i, MP_DIGITS(c) + ib);
   116     m_i = MP_DIGIT(c, ib) * mmm->n0prime;
   117     s_mp_mul_d_add_offset(&mmm->N, m_i, c, ib);
   118   }
   119   if (usedb < MP_USED(&mmm->N)) {
   120     for (usedb = MP_USED(&mmm->N); ib < usedb; ++ib ) {
   121       m_i = MP_DIGIT(c, ib) * mmm->n0prime;
   122       s_mp_mul_d_add_offset(&mmm->N, m_i, c, ib);
   123     }
   124   }
   125   s_mp_clamp(c);
   126   s_mp_rshd( c, MP_USED(&mmm->N) ); /* c /= R */
   127   if (s_mp_cmp(c, &mmm->N) >= 0) {
   128     MP_CHECKOK( s_mp_sub(c, &mmm->N) );
   129   }
   130   res = MP_OKAY;
   132 CLEANUP:
   133   return res;
   134 }
   135 #endif
   137 STATIC
   138 mp_err s_mp_to_mont(const mp_int *x, mp_mont_modulus *mmm, mp_int *xMont)
   139 {
   140   mp_err res;
   142   /* xMont = x * R mod N   where  N is modulus */
   143   MP_CHECKOK( mp_copy( x, xMont ) );
   144   MP_CHECKOK( s_mp_lshd( xMont, MP_USED(&mmm->N) ) );	/* xMont = x << b */
   145   MP_CHECKOK( mp_div(xMont, &mmm->N, 0, xMont) );	/*         mod N */
   146 CLEANUP:
   147   return res;
   148 }
   150 #ifdef MP_USING_MONT_MULF
   152 /* the floating point multiply is already cache safe,
   153  * don't turn on cache safe unless we specifically
   154  * force it */
   155 #ifndef MP_FORCE_CACHE_SAFE
   156 #undef MP_USING_CACHE_SAFE_MOD_EXP
   157 #endif
   159 unsigned int mp_using_mont_mulf = 1;
   161 /* computes montgomery square of the integer in mResult */
   162 #define SQR \
   163   conv_i32_to_d32_and_d16(dm1, d16Tmp, mResult, nLen); \
   164   mont_mulf_noconv(mResult, dm1, d16Tmp, \
   165 		   dTmp, dn, MP_DIGITS(modulus), nLen, dn0)
   167 /* computes montgomery product of x and the integer in mResult */
   168 #define MUL(x) \
   169   conv_i32_to_d32(dm1, mResult, nLen); \
   170   mont_mulf_noconv(mResult, dm1, oddPowers[x], \
   171 		   dTmp, dn, MP_DIGITS(modulus), nLen, dn0)
   173 /* Do modular exponentiation using floating point multiply code. */
   174 mp_err mp_exptmod_f(const mp_int *   montBase, 
   175                     const mp_int *   exponent, 
   176 		    const mp_int *   modulus, 
   177 		    mp_int *         result, 
   178 		    mp_mont_modulus *mmm, 
   179 		    int              nLen, 
   180 		    mp_size          bits_in_exponent, 
   181 		    mp_size          window_bits,
   182 		    mp_size          odd_ints)
   183 {
   184   mp_digit *mResult;
   185   double   *dBuf = 0, *dm1, *dn, *dSqr, *d16Tmp, *dTmp;
   186   double    dn0;
   187   mp_size   i;
   188   mp_err    res;
   189   int       expOff;
   190   int       dSize = 0, oddPowSize, dTmpSize;
   191   mp_int    accum1;
   192   double   *oddPowers[MAX_ODD_INTS];
   194   /* function for computing n0prime only works if n0 is odd */
   196   MP_DIGITS(&accum1) = 0;
   198   for (i = 0; i < MAX_ODD_INTS; ++i)
   199     oddPowers[i] = 0;
   201   MP_CHECKOK( mp_init_size(&accum1, 3 * nLen + 2) );
   203   mp_set(&accum1, 1);
   204   MP_CHECKOK( s_mp_to_mont(&accum1, mmm, &accum1) );
   205   MP_CHECKOK( s_mp_pad(&accum1, nLen) );
   207   oddPowSize = 2 * nLen + 1;
   208   dTmpSize   = 2 * oddPowSize;
   209   dSize = sizeof(double) * (nLen * 4 + 1 + 
   210 			    ((odd_ints + 1) * oddPowSize) + dTmpSize);
   211   dBuf   = (double *)malloc(dSize);
   212   dm1    = dBuf;		/* array of d32 */
   213   dn     = dBuf   + nLen;	/* array of d32 */
   214   dSqr   = dn     + nLen;    	/* array of d32 */
   215   d16Tmp = dSqr   + nLen;	/* array of d16 */
   216   dTmp   = d16Tmp + oddPowSize;
   218   for (i = 0; i < odd_ints; ++i) {
   219       oddPowers[i] = dTmp;
   220       dTmp += oddPowSize;
   221   }
   222   mResult = (mp_digit *)(dTmp + dTmpSize);	/* size is nLen + 1 */
   224   /* Make dn and dn0 */
   225   conv_i32_to_d32(dn, MP_DIGITS(modulus), nLen);
   226   dn0 = (double)(mmm->n0prime & 0xffff);
   228   /* Make dSqr */
   229   conv_i32_to_d32_and_d16(dm1, oddPowers[0], MP_DIGITS(montBase), nLen);
   230   mont_mulf_noconv(mResult, dm1, oddPowers[0], 
   231 		   dTmp, dn, MP_DIGITS(modulus), nLen, dn0);
   232   conv_i32_to_d32(dSqr, mResult, nLen);
   234   for (i = 1; i < odd_ints; ++i) {
   235     mont_mulf_noconv(mResult, dSqr, oddPowers[i - 1], 
   236 		     dTmp, dn, MP_DIGITS(modulus), nLen, dn0);
   237     conv_i32_to_d16(oddPowers[i], mResult, nLen);
   238   }
   240   s_mp_copy(MP_DIGITS(&accum1), mResult, nLen); /* from, to, len */
   242   for (expOff = bits_in_exponent - window_bits; expOff >= 0; expOff -= window_bits) {
   243     mp_size smallExp;
   244     MP_CHECKOK( mpl_get_bits(exponent, expOff, window_bits) );
   245     smallExp = (mp_size)res;
   247     if (window_bits == 1) {
   248       if (!smallExp) {
   249 	SQR;
   250       } else if (smallExp & 1) {
   251 	SQR; MUL(0); 
   252       } else {
   253 	abort();
   254       }
   255     } else if (window_bits == 4) {
   256       if (!smallExp) {
   257 	SQR; SQR; SQR; SQR;
   258       } else if (smallExp & 1) {
   259 	SQR; SQR; SQR; SQR; MUL(smallExp/2); 
   260       } else if (smallExp & 2) {
   261 	SQR; SQR; SQR; MUL(smallExp/4); SQR; 
   262       } else if (smallExp & 4) {
   263 	SQR; SQR; MUL(smallExp/8); SQR; SQR; 
   264       } else if (smallExp & 8) {
   265 	SQR; MUL(smallExp/16); SQR; SQR; SQR; 
   266       } else {
   267 	abort();
   268       }
   269     } else if (window_bits == 5) {
   270       if (!smallExp) {
   271 	SQR; SQR; SQR; SQR; SQR; 
   272       } else if (smallExp & 1) {
   273 	SQR; SQR; SQR; SQR; SQR; MUL(smallExp/2);
   274       } else if (smallExp & 2) {
   275 	SQR; SQR; SQR; SQR; MUL(smallExp/4); SQR;
   276       } else if (smallExp & 4) {
   277 	SQR; SQR; SQR; MUL(smallExp/8); SQR; SQR;
   278       } else if (smallExp & 8) {
   279 	SQR; SQR; MUL(smallExp/16); SQR; SQR; SQR;
   280       } else if (smallExp & 0x10) {
   281 	SQR; MUL(smallExp/32); SQR; SQR; SQR; SQR;
   282       } else {
   283 	abort();
   284       }
   285     } else if (window_bits == 6) {
   286       if (!smallExp) {
   287 	SQR; SQR; SQR; SQR; SQR; SQR;
   288       } else if (smallExp & 1) {
   289 	SQR; SQR; SQR; SQR; SQR; SQR; MUL(smallExp/2); 
   290       } else if (smallExp & 2) {
   291 	SQR; SQR; SQR; SQR; SQR; MUL(smallExp/4); SQR; 
   292       } else if (smallExp & 4) {
   293 	SQR; SQR; SQR; SQR; MUL(smallExp/8); SQR; SQR; 
   294       } else if (smallExp & 8) {
   295 	SQR; SQR; SQR; MUL(smallExp/16); SQR; SQR; SQR; 
   296       } else if (smallExp & 0x10) {
   297 	SQR; SQR; MUL(smallExp/32); SQR; SQR; SQR; SQR; 
   298       } else if (smallExp & 0x20) {
   299 	SQR; MUL(smallExp/64); SQR; SQR; SQR; SQR; SQR; 
   300       } else {
   301 	abort();
   302       }
   303     } else {
   304       abort();
   305     }
   306   }
   308   s_mp_copy(mResult, MP_DIGITS(&accum1), nLen); /* from, to, len */
   310   res = s_mp_redc(&accum1, mmm);
   311   mp_exch(&accum1, result);
   313 CLEANUP:
   314   mp_clear(&accum1);
   315   if (dBuf) {
   316     if (dSize)
   317       memset(dBuf, 0, dSize);
   318     free(dBuf);
   319   }
   321   return res;
   322 }
   323 #undef SQR
   324 #undef MUL
   325 #endif
   327 #define SQR(a,b) \
   328   MP_CHECKOK( mp_sqr(a, b) );\
   329   MP_CHECKOK( s_mp_redc(b, mmm) )
   331 #if defined(MP_MONT_USE_MP_MUL)
   332 #define MUL(x,a,b) \
   333   MP_CHECKOK( mp_mul(a, oddPowers + (x), b) ); \
   334   MP_CHECKOK( s_mp_redc(b, mmm) ) 
   335 #else
   336 #define MUL(x,a,b) \
   337   MP_CHECKOK( s_mp_mul_mont(a, oddPowers + (x), b, mmm) )
   338 #endif
   340 #define SWAPPA ptmp = pa1; pa1 = pa2; pa2 = ptmp
   342 /* Do modular exponentiation using integer multiply code. */
   343 mp_err mp_exptmod_i(const mp_int *   montBase, 
   344                     const mp_int *   exponent, 
   345 		    const mp_int *   modulus, 
   346 		    mp_int *         result, 
   347 		    mp_mont_modulus *mmm, 
   348 		    int              nLen, 
   349 		    mp_size          bits_in_exponent, 
   350 		    mp_size          window_bits,
   351 		    mp_size          odd_ints)
   352 {
   353   mp_int *pa1, *pa2, *ptmp;
   354   mp_size i;
   355   mp_err  res;
   356   int     expOff;
   357   mp_int  accum1, accum2, power2, oddPowers[MAX_ODD_INTS];
   359   /* power2 = base ** 2; oddPowers[i] = base ** (2*i + 1); */
   360   /* oddPowers[i] = base ** (2*i + 1); */
   362   MP_DIGITS(&accum1) = 0;
   363   MP_DIGITS(&accum2) = 0;
   364   MP_DIGITS(&power2) = 0;
   365   for (i = 0; i < MAX_ODD_INTS; ++i) {
   366     MP_DIGITS(oddPowers + i) = 0;
   367   }
   369   MP_CHECKOK( mp_init_size(&accum1, 3 * nLen + 2) );
   370   MP_CHECKOK( mp_init_size(&accum2, 3 * nLen + 2) );
   372   MP_CHECKOK( mp_init_copy(&oddPowers[0], montBase) );
   374   mp_init_size(&power2, nLen + 2 * MP_USED(montBase) + 2);
   375   MP_CHECKOK( mp_sqr(montBase, &power2) );	/* power2 = montBase ** 2 */
   376   MP_CHECKOK( s_mp_redc(&power2, mmm) );
   378   for (i = 1; i < odd_ints; ++i) {
   379     mp_init_size(oddPowers + i, nLen + 2 * MP_USED(&power2) + 2);
   380     MP_CHECKOK( mp_mul(oddPowers + (i - 1), &power2, oddPowers + i) );
   381     MP_CHECKOK( s_mp_redc(oddPowers + i, mmm) );
   382   }
   384   /* set accumulator to montgomery residue of 1 */
   385   mp_set(&accum1, 1);
   386   MP_CHECKOK( s_mp_to_mont(&accum1, mmm, &accum1) );
   387   pa1 = &accum1;
   388   pa2 = &accum2;
   390   for (expOff = bits_in_exponent - window_bits; expOff >= 0; expOff -= window_bits) {
   391     mp_size smallExp;
   392     MP_CHECKOK( mpl_get_bits(exponent, expOff, window_bits) );
   393     smallExp = (mp_size)res;
   395     if (window_bits == 1) {
   396       if (!smallExp) {
   397 	SQR(pa1,pa2); SWAPPA;
   398       } else if (smallExp & 1) {
   399 	SQR(pa1,pa2); MUL(0,pa2,pa1);
   400       } else {
   401 	abort();
   402       }
   403     } else if (window_bits == 4) {
   404       if (!smallExp) {
   405 	SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);
   406       } else if (smallExp & 1) {
   407 	SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); 
   408 	MUL(smallExp/2, pa1,pa2); SWAPPA;
   409       } else if (smallExp & 2) {
   410 	SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); 
   411 	MUL(smallExp/4,pa2,pa1); SQR(pa1,pa2); SWAPPA;
   412       } else if (smallExp & 4) {
   413 	SQR(pa1,pa2); SQR(pa2,pa1); MUL(smallExp/8,pa1,pa2); 
   414 	SQR(pa2,pa1); SQR(pa1,pa2); SWAPPA;
   415       } else if (smallExp & 8) {
   416 	SQR(pa1,pa2); MUL(smallExp/16,pa2,pa1); SQR(pa1,pa2); 
   417 	SQR(pa2,pa1); SQR(pa1,pa2); SWAPPA;
   418       } else {
   419 	abort();
   420       }
   421     } else if (window_bits == 5) {
   422       if (!smallExp) {
   423 	SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); 
   424 	SQR(pa1,pa2); SWAPPA;
   425       } else if (smallExp & 1) {
   426 	SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); 
   427 	SQR(pa1,pa2); MUL(smallExp/2,pa2,pa1);
   428       } else if (smallExp & 2) {
   429 	SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); 
   430 	MUL(smallExp/4,pa1,pa2); SQR(pa2,pa1);
   431       } else if (smallExp & 4) {
   432 	SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); 
   433 	MUL(smallExp/8,pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);
   434       } else if (smallExp & 8) {
   435 	SQR(pa1,pa2); SQR(pa2,pa1); MUL(smallExp/16,pa1,pa2); 
   436 	SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);
   437       } else if (smallExp & 0x10) {
   438 	SQR(pa1,pa2); MUL(smallExp/32,pa2,pa1); SQR(pa1,pa2); 
   439 	SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);
   440       } else {
   441 	abort();
   442       }
   443     } else if (window_bits == 6) {
   444       if (!smallExp) {
   445 	SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); 
   446 	SQR(pa1,pa2); SQR(pa2,pa1);
   447       } else if (smallExp & 1) {
   448 	SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); 
   449 	SQR(pa1,pa2); SQR(pa2,pa1); MUL(smallExp/2,pa1,pa2); SWAPPA;
   450       } else if (smallExp & 2) {
   451 	SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); 
   452 	SQR(pa1,pa2); MUL(smallExp/4,pa2,pa1); SQR(pa1,pa2); SWAPPA;
   453       } else if (smallExp & 4) {
   454 	SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); 
   455 	MUL(smallExp/8,pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SWAPPA;
   456       } else if (smallExp & 8) {
   457 	SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); 
   458 	MUL(smallExp/16,pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); 
   459 	SQR(pa1,pa2); SWAPPA;
   460       } else if (smallExp & 0x10) {
   461 	SQR(pa1,pa2); SQR(pa2,pa1); MUL(smallExp/32,pa1,pa2); 
   462 	SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SWAPPA;
   463       } else if (smallExp & 0x20) {
   464 	SQR(pa1,pa2); MUL(smallExp/64,pa2,pa1); SQR(pa1,pa2); 
   465 	SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SWAPPA;
   466       } else {
   467 	abort();
   468       }
   469     } else {
   470       abort();
   471     }
   472   }
   474   res = s_mp_redc(pa1, mmm);
   475   mp_exch(pa1, result);
   477 CLEANUP:
   478   mp_clear(&accum1);
   479   mp_clear(&accum2);
   480   mp_clear(&power2);
   481   for (i = 0; i < odd_ints; ++i) {
   482     mp_clear(oddPowers + i);
   483   }
   484   return res;
   485 }
   486 #undef SQR
   487 #undef MUL
   489 #ifdef MP_USING_CACHE_SAFE_MOD_EXP
   490 unsigned int mp_using_cache_safe_exp = 1;
   491 #endif
   493 mp_err mp_set_safe_modexp(int value) 
   494 {
   495 #ifdef MP_USING_CACHE_SAFE_MOD_EXP
   496  mp_using_cache_safe_exp = value;
   497  return MP_OKAY;
   498 #else
   499  if (value == 0) {
   500    return MP_OKAY;
   501  }
   502  return MP_BADARG;
   503 #endif
   504 }
   506 #ifdef MP_USING_CACHE_SAFE_MOD_EXP
   507 #define WEAVE_WORD_SIZE 4
   509 #ifndef MP_CHAR_STORE_SLOW
   510 /*
   511  * mpi_to_weave takes an array of bignums, a matrix in which each bignum 
   512  * occupies all the columns of a row, and transposes it into a matrix in 
   513  * which each bignum occupies a column of every row.  The first row of the
   514  * input matrix becomes the first column of the output matrix.  The n'th
   515  * row of input becomes the n'th column of output.  The input data is said
   516  * to be "interleaved" or "woven" into the output matrix.
   517  *
   518  * The array of bignums is left in this woven form.  Each time a single
   519  * bignum value is needed, it is recreated by fetching the n'th column, 
   520  * forming a single row which is the new bignum.
   521  *
   522  * The purpose of this interleaving is make it impossible to determine which
   523  * of the bignums is being used in any one operation by examining the pattern
   524  * of cache misses.
   525  *
   526  * The weaving function does not transpose the entire input matrix in one call.
   527  * It transposes 4 rows of mp_ints into their respective columns of output.
   528  *
   529  * There are two different implementations of the weaving and unweaving code
   530  * in this file.  One uses byte loads and stores.  The second uses loads and
   531  * stores of mp_weave_word size values.  The weaved forms of these two 
   532  * implementations differ.  Consequently, each one has its own explanation.
   533  *
   534  * Here is the explanation for the byte-at-a-time implementation.
   535  *
   536  * This implementation treats each mp_int bignum as an array of bytes, 
   537  * rather than as an array of mp_digits.  It stores those bytes as a 
   538  * column of bytes in the output matrix.  It doesn't care if the machine
   539  * uses big-endian or little-endian byte ordering within mp_digits.
   540  * The first byte of the mp_digit array becomes the first byte in the output
   541  * column, regardless of whether that byte is the MSB or LSB of the mp_digit.
   542  *
   543  * "bignums" is an array of mp_ints.
   544  * It points to four rows, four mp_ints, a subset of a larger array of mp_ints.
   545  *
   546  * "weaved" is the weaved output matrix. 
   547  * The first byte of bignums[0] is stored in weaved[0].
   548  * 
   549  * "nBignums" is the total number of bignums in the array of which "bignums" 
   550  * is a part.  
   551  *
   552  * "nDigits" is the size in mp_digits of each mp_int in the "bignums" array. 
   553  * mp_ints that use less than nDigits digits are logically padded with zeros 
   554  * while being stored in the weaved array.
   555  */
   556 mp_err mpi_to_weave(const mp_int  *bignums, 
   557                     unsigned char *weaved, 
   558 		    mp_size nDigits,  /* in each mp_int of input */
   559 		    mp_size nBignums) /* in the entire source array */
   560 {
   561   mp_size i;
   562   unsigned char * endDest = weaved + (nDigits * nBignums * sizeof(mp_digit));
   564   for (i=0; i < WEAVE_WORD_SIZE; i++) {
   565     mp_size used = MP_USED(&bignums[i]);
   566     unsigned char *pSrc   = (unsigned char *)MP_DIGITS(&bignums[i]);
   567     unsigned char *endSrc = pSrc + (used * sizeof(mp_digit));
   568     unsigned char *pDest  = weaved + i;
   570     ARGCHK(MP_SIGN(&bignums[i]) == MP_ZPOS, MP_BADARG);
   571     ARGCHK(used <= nDigits, MP_BADARG);
   573     for (; pSrc < endSrc; pSrc++) {
   574       *pDest = *pSrc;
   575       pDest += nBignums;
   576     }
   577     while (pDest < endDest) {
   578       *pDest = 0;
   579       pDest += nBignums;
   580     }
   581   }
   583   return MP_OKAY;
   584 }
   586 /* Reverse the operation above for one mp_int.
   587  * Reconstruct one mp_int from its column in the weaved array.
   588  * "pSrc" points to the offset into the weave array of the bignum we 
   589  * are going to reconstruct.
   590  */
   591 mp_err weave_to_mpi(mp_int *a,                /* output, result */
   592                     const unsigned char *pSrc, /* input, byte matrix */
   593 		    mp_size nDigits,          /* per mp_int output */
   594 		    mp_size nBignums)         /* bignums in weaved matrix */
   595 {
   596   unsigned char *pDest   = (unsigned char *)MP_DIGITS(a);
   597   unsigned char *endDest = pDest + (nDigits * sizeof(mp_digit));
   599   MP_SIGN(a) = MP_ZPOS;
   600   MP_USED(a) = nDigits;
   602   for (; pDest < endDest; pSrc += nBignums, pDest++) {
   603     *pDest = *pSrc;
   604   }
   605   s_mp_clamp(a);
   606   return MP_OKAY;
   607 }
   609 #else
   611 /* Need a primitive that we know is 32 bits long... */
   612 /* this is true on all modern processors we know of today*/
   613 typedef unsigned int mp_weave_word;
   615 /*
   616  * on some platforms character stores into memory is very expensive since they
   617  * generate a read/modify/write operation on the bus. On those platforms
   618  * we need to do integer writes to the bus. Because of some unrolled code,
   619  * in this current code the size of mp_weave_word must be four. The code that
   620  * makes this assumption explicity is called out. (on some platforms a write
   621  * of 4 bytes still requires a single read-modify-write operation.
   622  *
   623  * This function is takes the identical parameters as the function above, 
   624  * however it lays out the final array differently. Where the previous function
   625  * treats the mpi_int as an byte array, this function treats it as an array of
   626  * mp_digits where each digit is stored in big endian order.
   627  * 
   628  * since we need to interleave on a byte by byte basis, we need to collect 
   629  * several mpi structures together into a single PRUint32 before we write. We
   630  * also need to make sure the PRUint32 is arranged so that the first value of
   631  * the first array winds up in b[0]. This means construction of that PRUint32
   632  * is endian specific (even though the layout of the mp_digits in the array 
   633  * is always big endian).
   634  *
   635  * The final data is stored as follows :
   636  *
   637  * Our same logical array p array, m is sizeof(mp_digit),
   638  * N is still count and n is now b_size. If we define p[i].digit[j]0 as the 
   639  * most significant byte of the word p[i].digit[j], p[i].digit[j]1 as 
   640  * the next most significant byte of p[i].digit[j], ...  and p[i].digit[j]m-1
   641  * is the least significant byte. 
   642  * Our array would look like:
   643  * p[0].digit[0]0     p[1].digit[0]0    ...  p[N-2].digit[0]0    p[N-1].digit[0]0
   644  * p[0].digit[0]1     p[1].digit[0]1    ...  p[N-2].digit[0]1    p[N-1].digit[0]1
   645  *                .                                         .
   646  * p[0].digit[0]m-1   p[1].digit[0]m-1  ...  p[N-2].digit[0]m-1  p[N-1].digit[0]m-1
   647  * p[0].digit[1]0     p[1].digit[1]0    ...  p[N-2].digit[1]0    p[N-1].digit[1]0
   648  *                .                                         .
   649  *                .                                         .
   650  * p[0].digit[n-1]m-2 p[1].digit[n-1]m-2 ... p[N-2].digit[n-1]m-2 p[N-1].digit[n-1]m-2
   651  * p[0].digit[n-1]m-1 p[1].digit[n-1]m-1 ... p[N-2].digit[n-1]m-1 p[N-1].digit[n-1]m-1 
   652  *
   653  */
   654 mp_err mpi_to_weave(const mp_int *a, unsigned char *b, 
   655 					mp_size b_size, mp_size count)
   656 {
   657   mp_size i;
   658   mp_digit *digitsa0;
   659   mp_digit *digitsa1;
   660   mp_digit *digitsa2;
   661   mp_digit *digitsa3;
   662   mp_size   useda0;
   663   mp_size   useda1;
   664   mp_size   useda2;
   665   mp_size   useda3;
   666   mp_weave_word *weaved = (mp_weave_word *)b;
   668   count = count/sizeof(mp_weave_word);
   670   /* this code pretty much depends on this ! */
   671 #if MP_ARGCHK == 2
   672   assert(WEAVE_WORD_SIZE == 4); 
   673   assert(sizeof(mp_weave_word) == 4);
   674 #endif
   676   digitsa0 = MP_DIGITS(&a[0]);
   677   digitsa1 = MP_DIGITS(&a[1]);
   678   digitsa2 = MP_DIGITS(&a[2]);
   679   digitsa3 = MP_DIGITS(&a[3]);
   680   useda0 = MP_USED(&a[0]);
   681   useda1 = MP_USED(&a[1]);
   682   useda2 = MP_USED(&a[2]);
   683   useda3 = MP_USED(&a[3]);
   685   ARGCHK(MP_SIGN(&a[0]) == MP_ZPOS, MP_BADARG);
   686   ARGCHK(MP_SIGN(&a[1]) == MP_ZPOS, MP_BADARG);
   687   ARGCHK(MP_SIGN(&a[2]) == MP_ZPOS, MP_BADARG);
   688   ARGCHK(MP_SIGN(&a[3]) == MP_ZPOS, MP_BADARG);
   689   ARGCHK(useda0 <= b_size, MP_BADARG);
   690   ARGCHK(useda1 <= b_size, MP_BADARG);
   691   ARGCHK(useda2 <= b_size, MP_BADARG);
   692   ARGCHK(useda3 <= b_size, MP_BADARG);
   694 #define SAFE_FETCH(digit, used, word) ((word) < (used) ? (digit[word]) : 0)
   696   for (i=0; i < b_size; i++) {
   697     mp_digit d0 = SAFE_FETCH(digitsa0,useda0,i);
   698     mp_digit d1 = SAFE_FETCH(digitsa1,useda1,i);
   699     mp_digit d2 = SAFE_FETCH(digitsa2,useda2,i);
   700     mp_digit d3 = SAFE_FETCH(digitsa3,useda3,i);
   701     register mp_weave_word acc;
   703 /*
   704  * ONE_STEP takes the MSB of each of our current digits and places that
   705  * byte in the appropriate position for writing to the weaved array.
   706  *  On little endian:
   707  *   b3 b2 b1 b0
   708  *  On big endian:
   709  *   b0 b1 b2 b3
   710  *  When the data is written it would always wind up:
   711  *   b[0] = b0
   712  *   b[1] = b1
   713  *   b[2] = b2
   714  *   b[3] = b3
   715  *
   716  * Once we've written the MSB, we shift the whole digit up left one
   717  * byte, putting the Next Most Significant Byte in the MSB position,
   718  * so we we repeat the next one step that byte will be written.
   719  * NOTE: This code assumes sizeof(mp_weave_word) and MP_WEAVE_WORD_SIZE
   720  * is 4.
   721  */
   722 #ifdef MP_IS_LITTLE_ENDIAN 
   723 #define MPI_WEAVE_ONE_STEP \
   724     acc  = (d0 >> (MP_DIGIT_BIT-8))  & 0x000000ff; d0 <<= 8; /*b0*/ \
   725     acc |= (d1 >> (MP_DIGIT_BIT-16)) & 0x0000ff00; d1 <<= 8; /*b1*/ \
   726     acc |= (d2 >> (MP_DIGIT_BIT-24)) & 0x00ff0000; d2 <<= 8; /*b2*/ \
   727     acc |= (d3 >> (MP_DIGIT_BIT-32)) & 0xff000000; d3 <<= 8; /*b3*/ \
   728     *weaved = acc; weaved += count;
   729 #else 
   730 #define MPI_WEAVE_ONE_STEP \
   731     acc  = (d0 >> (MP_DIGIT_BIT-32)) & 0xff000000; d0 <<= 8; /*b0*/ \
   732     acc |= (d1 >> (MP_DIGIT_BIT-24)) & 0x00ff0000; d1 <<= 8; /*b1*/ \
   733     acc |= (d2 >> (MP_DIGIT_BIT-16)) & 0x0000ff00; d2 <<= 8; /*b2*/ \
   734     acc |= (d3 >> (MP_DIGIT_BIT-8))  & 0x000000ff; d3 <<= 8; /*b3*/ \
   735     *weaved = acc; weaved += count;
   736 #endif 
   737    switch (sizeof(mp_digit)) {
   738    case 32:
   739     MPI_WEAVE_ONE_STEP
   740     MPI_WEAVE_ONE_STEP
   741     MPI_WEAVE_ONE_STEP
   742     MPI_WEAVE_ONE_STEP
   743     MPI_WEAVE_ONE_STEP
   744     MPI_WEAVE_ONE_STEP
   745     MPI_WEAVE_ONE_STEP
   746     MPI_WEAVE_ONE_STEP
   747     MPI_WEAVE_ONE_STEP
   748     MPI_WEAVE_ONE_STEP
   749     MPI_WEAVE_ONE_STEP
   750     MPI_WEAVE_ONE_STEP
   751     MPI_WEAVE_ONE_STEP
   752     MPI_WEAVE_ONE_STEP
   753     MPI_WEAVE_ONE_STEP
   754     MPI_WEAVE_ONE_STEP
   755    case 16:
   756     MPI_WEAVE_ONE_STEP
   757     MPI_WEAVE_ONE_STEP
   758     MPI_WEAVE_ONE_STEP
   759     MPI_WEAVE_ONE_STEP
   760     MPI_WEAVE_ONE_STEP
   761     MPI_WEAVE_ONE_STEP
   762     MPI_WEAVE_ONE_STEP
   763     MPI_WEAVE_ONE_STEP
   764    case 8:
   765     MPI_WEAVE_ONE_STEP
   766     MPI_WEAVE_ONE_STEP
   767     MPI_WEAVE_ONE_STEP
   768     MPI_WEAVE_ONE_STEP
   769    case 4:
   770     MPI_WEAVE_ONE_STEP
   771     MPI_WEAVE_ONE_STEP
   772    case 2:
   773     MPI_WEAVE_ONE_STEP
   774    case 1:
   775     MPI_WEAVE_ONE_STEP
   776     break;
   777    }
   778   }
   780   return MP_OKAY;
   781 }
   783 /* reverse the operation above for one entry.
   784  * b points to the offset into the weave array of the power we are
   785  * calculating */
   786 mp_err weave_to_mpi(mp_int *a, const unsigned char *b, 
   787 					mp_size b_size, mp_size count)
   788 {
   789   mp_digit *pb = MP_DIGITS(a);
   790   mp_digit *end = &pb[b_size];
   792   MP_SIGN(a) = MP_ZPOS;
   793   MP_USED(a) = b_size;
   795   for (; pb < end; pb++) {
   796     register mp_digit digit;
   798     digit = *b << 8; b += count;
   799 #define MPI_UNWEAVE_ONE_STEP  digit |= *b; b += count; digit = digit << 8;
   800     switch (sizeof(mp_digit)) {
   801     case 32:
   802 	MPI_UNWEAVE_ONE_STEP 
   803 	MPI_UNWEAVE_ONE_STEP 
   804 	MPI_UNWEAVE_ONE_STEP 
   805 	MPI_UNWEAVE_ONE_STEP 
   806 	MPI_UNWEAVE_ONE_STEP 
   807 	MPI_UNWEAVE_ONE_STEP 
   808 	MPI_UNWEAVE_ONE_STEP 
   809 	MPI_UNWEAVE_ONE_STEP 
   810 	MPI_UNWEAVE_ONE_STEP 
   811 	MPI_UNWEAVE_ONE_STEP 
   812 	MPI_UNWEAVE_ONE_STEP 
   813 	MPI_UNWEAVE_ONE_STEP 
   814 	MPI_UNWEAVE_ONE_STEP 
   815 	MPI_UNWEAVE_ONE_STEP 
   816 	MPI_UNWEAVE_ONE_STEP 
   817 	MPI_UNWEAVE_ONE_STEP 
   818     case 16:
   819 	MPI_UNWEAVE_ONE_STEP 
   820 	MPI_UNWEAVE_ONE_STEP 
   821 	MPI_UNWEAVE_ONE_STEP 
   822 	MPI_UNWEAVE_ONE_STEP 
   823 	MPI_UNWEAVE_ONE_STEP 
   824 	MPI_UNWEAVE_ONE_STEP 
   825 	MPI_UNWEAVE_ONE_STEP 
   826 	MPI_UNWEAVE_ONE_STEP 
   827     case 8:
   828 	MPI_UNWEAVE_ONE_STEP 
   829 	MPI_UNWEAVE_ONE_STEP 
   830 	MPI_UNWEAVE_ONE_STEP 
   831 	MPI_UNWEAVE_ONE_STEP 
   832     case 4:
   833 	MPI_UNWEAVE_ONE_STEP 
   834 	MPI_UNWEAVE_ONE_STEP 
   835     case 2:
   836 	break;
   837     }
   838     digit |= *b; b += count; 
   840     *pb = digit;
   841   }
   842   s_mp_clamp(a);
   843   return MP_OKAY;
   844 }
   845 #endif
   848 #define SQR(a,b) \
   849   MP_CHECKOK( mp_sqr(a, b) );\
   850   MP_CHECKOK( s_mp_redc(b, mmm) )
   852 #if defined(MP_MONT_USE_MP_MUL)
   853 #define MUL_NOWEAVE(x,a,b) \
   854   MP_CHECKOK( mp_mul(a, x, b) ); \
   855   MP_CHECKOK( s_mp_redc(b, mmm) ) 
   856 #else
   857 #define MUL_NOWEAVE(x,a,b) \
   858   MP_CHECKOK( s_mp_mul_mont(a, x, b, mmm) )
   859 #endif
   861 #define MUL(x,a,b) \
   862   MP_CHECKOK( weave_to_mpi(&tmp, powers + (x), nLen, num_powers) ); \
   863   MUL_NOWEAVE(&tmp,a,b)
   865 #define SWAPPA ptmp = pa1; pa1 = pa2; pa2 = ptmp
   866 #define MP_ALIGN(x,y) ((((ptrdiff_t)(x))+((y)-1))&(((ptrdiff_t)0)-(y)))
   868 /* Do modular exponentiation using integer multiply code. */
   869 mp_err mp_exptmod_safe_i(const mp_int *   montBase, 
   870                     const mp_int *   exponent, 
   871 		    const mp_int *   modulus, 
   872 		    mp_int *         result, 
   873 		    mp_mont_modulus *mmm, 
   874 		    int              nLen, 
   875 		    mp_size          bits_in_exponent, 
   876 		    mp_size          window_bits,
   877 		    mp_size          num_powers)
   878 {
   879   mp_int *pa1, *pa2, *ptmp;
   880   mp_size i;
   881   mp_size first_window;
   882   mp_err  res;
   883   int     expOff;
   884   mp_int  accum1, accum2, accum[WEAVE_WORD_SIZE];
   885   mp_int  tmp;
   886   unsigned char *powersArray;
   887   unsigned char *powers;
   889   MP_DIGITS(&accum1) = 0;
   890   MP_DIGITS(&accum2) = 0;
   891   MP_DIGITS(&accum[0]) = 0;
   892   MP_DIGITS(&accum[1]) = 0;
   893   MP_DIGITS(&accum[2]) = 0;
   894   MP_DIGITS(&accum[3]) = 0;
   895   MP_DIGITS(&tmp) = 0;
   897   powersArray = (unsigned char *)malloc(num_powers*(nLen*sizeof(mp_digit)+1));
   898   if (powersArray == NULL) {
   899     res = MP_MEM;
   900     goto CLEANUP;
   901   }
   903   /* powers[i] = base ** (i); */
   904   powers = (unsigned char *)MP_ALIGN(powersArray,num_powers);
   906   /* grab the first window value. This allows us to preload accumulator1
   907    * and save a conversion, some squares and a multiple*/
   908   MP_CHECKOK( mpl_get_bits(exponent, 
   909 				bits_in_exponent-window_bits, window_bits) );
   910   first_window = (mp_size)res;
   912   MP_CHECKOK( mp_init_size(&accum1, 3 * nLen + 2) );
   913   MP_CHECKOK( mp_init_size(&accum2, 3 * nLen + 2) );
   914   MP_CHECKOK( mp_init_size(&tmp, 3 * nLen + 2) );
   916   /* build the first WEAVE_WORD powers inline */
   917   /* if WEAVE_WORD_SIZE is not 4, this code will have to change */
   918   if (num_powers > 2) {
   919     MP_CHECKOK( mp_init_size(&accum[0], 3 * nLen + 2) );
   920     MP_CHECKOK( mp_init_size(&accum[1], 3 * nLen + 2) );
   921     MP_CHECKOK( mp_init_size(&accum[2], 3 * nLen + 2) );
   922     MP_CHECKOK( mp_init_size(&accum[3], 3 * nLen + 2) );
   923     mp_set(&accum[0], 1);
   924     MP_CHECKOK( s_mp_to_mont(&accum[0], mmm, &accum[0]) );
   925     MP_CHECKOK( mp_copy(montBase, &accum[1]) );
   926     SQR(montBase, &accum[2]);
   927     MUL_NOWEAVE(montBase, &accum[2], &accum[3]);
   928     MP_CHECKOK( mpi_to_weave(accum, powers, nLen, num_powers) );
   929     if (first_window < 4) {
   930       MP_CHECKOK( mp_copy(&accum[first_window], &accum1) );
   931       first_window = num_powers;
   932     }
   933   } else {
   934       if (first_window == 0) {
   935         mp_set(&accum1, 1);
   936         MP_CHECKOK( s_mp_to_mont(&accum1, mmm, &accum1) );
   937       } else {
   938         /* assert first_window == 1? */
   939         MP_CHECKOK( mp_copy(montBase, &accum1) );
   940       }
   941   }
   943   /*
   944    * calculate all the powers in the powers array.
   945    * this adds 2**(k-1)-2 square operations over just calculating the
   946    * odd powers where k is the window size in the two other mp_modexpt
   947    * implementations in this file. We will get some of that
   948    * back by not needing the first 'k' squares and one multiply for the 
   949    * first window */ 
   950   for (i = WEAVE_WORD_SIZE; i < num_powers; i++) {
   951     int acc_index = i & (WEAVE_WORD_SIZE-1); /* i % WEAVE_WORD_SIZE */
   952     if ( i & 1 ) {
   953       MUL_NOWEAVE(montBase, &accum[acc_index-1] , &accum[acc_index]);
   954       /* we've filled the array do our 'per array' processing */
   955       if (acc_index == (WEAVE_WORD_SIZE-1)) {
   956         MP_CHECKOK( mpi_to_weave(accum, powers + i - (WEAVE_WORD_SIZE-1),
   957 							 nLen, num_powers) );
   959         if (first_window <= i) {
   960           MP_CHECKOK( mp_copy(&accum[first_window & (WEAVE_WORD_SIZE-1)], 
   961 								&accum1) );
   962           first_window = num_powers;
   963         }
   964       }
   965     } else {
   966       /* up to 8 we can find 2^i-1 in the accum array, but at 8 we our source
   967        * and target are the same so we need to copy.. After that, the
   968        * value is overwritten, so we need to fetch it from the stored
   969        * weave array */
   970       if (i > 2* WEAVE_WORD_SIZE) {
   971         MP_CHECKOK(weave_to_mpi(&accum2, powers+i/2, nLen, num_powers));
   972         SQR(&accum2, &accum[acc_index]);
   973       } else {
   974 	int half_power_index = (i/2) & (WEAVE_WORD_SIZE-1);
   975 	if (half_power_index == acc_index) {
   976 	   /* copy is cheaper than weave_to_mpi */
   977 	   MP_CHECKOK(mp_copy(&accum[half_power_index], &accum2));
   978 	   SQR(&accum2,&accum[acc_index]);
   979 	} else {
   980 	   SQR(&accum[half_power_index],&accum[acc_index]);
   981 	}
   982       }
   983     }
   984   }
   985   /* if the accum1 isn't set, Then there is something wrong with our logic 
   986    * above and is an internal programming error. 
   987    */
   988 #if MP_ARGCHK == 2
   989   assert(MP_USED(&accum1) != 0);
   990 #endif
   992   /* set accumulator to montgomery residue of 1 */
   993   pa1 = &accum1;
   994   pa2 = &accum2;
   996   for (expOff = bits_in_exponent - window_bits*2; expOff >= 0; expOff -= window_bits) {
   997     mp_size smallExp;
   998     MP_CHECKOK( mpl_get_bits(exponent, expOff, window_bits) );
   999     smallExp = (mp_size)res;
  1001     /* handle unroll the loops */
  1002     switch (window_bits) {
  1003     case 1:
  1004 	if (!smallExp) {
  1005 	    SQR(pa1,pa2); SWAPPA;
  1006 	} else if (smallExp & 1) {
  1007 	    SQR(pa1,pa2); MUL_NOWEAVE(montBase,pa2,pa1);
  1008 	} else {
  1009 	    abort();
  1011 	break;
  1012     case 6:
  1013 	SQR(pa1,pa2); SQR(pa2,pa1); 
  1014 	/* fall through */
  1015     case 4:
  1016 	SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);
  1017 	MUL(smallExp, pa1,pa2); SWAPPA;
  1018 	break;
  1019     case 5:
  1020 	SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); 
  1021 	SQR(pa1,pa2); MUL(smallExp,pa2,pa1);
  1022 	break;
  1023     default:
  1024 	abort(); /* could do a loop? */
  1028   res = s_mp_redc(pa1, mmm);
  1029   mp_exch(pa1, result);
  1031 CLEANUP:
  1032   mp_clear(&accum1);
  1033   mp_clear(&accum2);
  1034   mp_clear(&accum[0]);
  1035   mp_clear(&accum[1]);
  1036   mp_clear(&accum[2]);
  1037   mp_clear(&accum[3]);
  1038   mp_clear(&tmp);
  1039   /* PORT_Memset(powers,0,num_powers*nLen*sizeof(mp_digit)); */
  1040   free(powersArray);
  1041   return res;
  1043 #undef SQR
  1044 #undef MUL
  1045 #endif
  1047 mp_err mp_exptmod(const mp_int *inBase, const mp_int *exponent, 
  1048 		  const mp_int *modulus, mp_int *result)
  1050   const mp_int *base;
  1051   mp_size bits_in_exponent, i, window_bits, odd_ints;
  1052   mp_err  res;
  1053   int     nLen;
  1054   mp_int  montBase, goodBase;
  1055   mp_mont_modulus mmm;
  1056 #ifdef MP_USING_CACHE_SAFE_MOD_EXP
  1057   static unsigned int max_window_bits;
  1058 #endif
  1060   /* function for computing n0prime only works if n0 is odd */
  1061   if (!mp_isodd(modulus))
  1062     return s_mp_exptmod(inBase, exponent, modulus, result);
  1064   MP_DIGITS(&montBase) = 0;
  1065   MP_DIGITS(&goodBase) = 0;
  1067   if (mp_cmp(inBase, modulus) < 0) {
  1068     base = inBase;
  1069   } else {
  1070     MP_CHECKOK( mp_init(&goodBase) );
  1071     base = &goodBase;
  1072     MP_CHECKOK( mp_mod(inBase, modulus, &goodBase) );
  1075   nLen  = MP_USED(modulus);
  1076   MP_CHECKOK( mp_init_size(&montBase, 2 * nLen + 2) );
  1078   mmm.N = *modulus;			/* a copy of the mp_int struct */
  1080   /* compute n0', given n0, n0' = -(n0 ** -1) mod MP_RADIX
  1081   **		where n0 = least significant mp_digit of N, the modulus.
  1082   */
  1083   mmm.n0prime = 0 - s_mp_invmod_radix( MP_DIGIT(modulus, 0) );
  1085   MP_CHECKOK( s_mp_to_mont(base, &mmm, &montBase) );
  1087   bits_in_exponent = mpl_significant_bits(exponent);
  1088 #ifdef MP_USING_CACHE_SAFE_MOD_EXP
  1089   if (mp_using_cache_safe_exp) {
  1090     if (bits_in_exponent > 780)
  1091 	window_bits = 6;
  1092     else if (bits_in_exponent > 256)
  1093 	window_bits = 5;
  1094     else if (bits_in_exponent > 20)
  1095 	window_bits = 4;
  1096        /* RSA public key exponents are typically under 20 bits (common values 
  1097         * are: 3, 17, 65537) and a 4-bit window is inefficient
  1098         */
  1099     else 
  1100 	window_bits = 1;
  1101   } else
  1102 #endif
  1103   if (bits_in_exponent > 480)
  1104     window_bits = 6;
  1105   else if (bits_in_exponent > 160)
  1106     window_bits = 5;
  1107   else if (bits_in_exponent > 20)
  1108     window_bits = 4;
  1109   /* RSA public key exponents are typically under 20 bits (common values 
  1110    * are: 3, 17, 65537) and a 4-bit window is inefficient
  1111    */
  1112   else 
  1113     window_bits = 1;
  1115 #ifdef MP_USING_CACHE_SAFE_MOD_EXP
  1116   /*
  1117    * clamp the window size based on
  1118    * the cache line size.
  1119    */
  1120   if (!max_window_bits) {
  1121     unsigned long cache_size = s_mpi_getProcessorLineSize();
  1122     /* processor has no cache, use 'fast' code always */
  1123     if (cache_size == 0) {
  1124       mp_using_cache_safe_exp = 0;
  1126     if ((cache_size == 0) || (cache_size >= 64)) {
  1127       max_window_bits = 6;
  1128     } else if (cache_size >= 32) {
  1129       max_window_bits = 5;
  1130     } else if (cache_size >= 16) {
  1131       max_window_bits = 4;
  1132     } else max_window_bits = 1; /* should this be an assert? */
  1135   /* clamp the window size down before we caclulate bits_in_exponent */
  1136   if (mp_using_cache_safe_exp) {
  1137     if (window_bits > max_window_bits) {
  1138       window_bits = max_window_bits;
  1141 #endif
  1143   odd_ints = 1 << (window_bits - 1);
  1144   i = bits_in_exponent % window_bits;
  1145   if (i != 0) {
  1146     bits_in_exponent += window_bits - i;
  1149 #ifdef MP_USING_MONT_MULF
  1150   if (mp_using_mont_mulf) {
  1151     MP_CHECKOK( s_mp_pad(&montBase, nLen) );
  1152     res = mp_exptmod_f(&montBase, exponent, modulus, result, &mmm, nLen, 
  1153 		     bits_in_exponent, window_bits, odd_ints);
  1154   } else
  1155 #endif
  1156 #ifdef MP_USING_CACHE_SAFE_MOD_EXP
  1157   if (mp_using_cache_safe_exp) {
  1158     res = mp_exptmod_safe_i(&montBase, exponent, modulus, result, &mmm, nLen, 
  1159 		     bits_in_exponent, window_bits, 1 << window_bits);
  1160   } else
  1161 #endif
  1162   res = mp_exptmod_i(&montBase, exponent, modulus, result, &mmm, nLen, 
  1163 		     bits_in_exponent, window_bits, odd_ints);
  1165 CLEANUP:
  1166   mp_clear(&montBase);
  1167   mp_clear(&goodBase);
  1168   /* Don't mp_clear mmm.N because it is merely a copy of modulus.
  1169   ** Just zap it.
  1170   */
  1171   memset(&mmm, 0, sizeof mmm);
  1172   return res;

mercurial