security/nss/lib/freebl/ecl/ecp_256_32.c

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned from upstream origin tor-browser at tag tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

     1 /* This Source Code Form is subject to the terms of the Mozilla Public
     2  * License, v. 2.0. If a copy of the MPL was not distributed with this
     3  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     5 /* A 32-bit implementation of the NIST P-256 elliptic curve. */
     7 #include <string.h>
     9 #include "prtypes.h"
    10 #include "mpi.h"
    11 #include "mpi-priv.h"
    12 #include "ecp.h"
    14 typedef PRUint8 u8;
    15 typedef PRUint32 u32;
    16 typedef PRUint64 u64;
    18 /* Our field elements are represented as nine, unsigned 32-bit words. Freebl's
    19  * MPI library calls them digits, but here they are called limbs, which is
    20  * GMP's terminology.
    21  *
    22  * The value of an felem (field element) is:
    23  *   x[0] + (x[1] * 2**29) + (x[2] * 2**57) + ... + (x[8] * 2**228)
    24  *
    25  * That is, each limb is alternately 29 or 28-bits wide in little-endian
    26  * order.
    27  *
    28  * This means that an felem hits 2**257, rather than 2**256 as we would like. A
    29  * 28, 29, ... pattern would cause us to hit 2**256, but that causes problems
    30  * when multiplying as terms end up one bit short of a limb which would require
    31  * much bit-shifting to correct.
    32  *
    33  * Finally, the values stored in an felem are in Montgomery form. So the value
    34  * |y| is stored as (y*R) mod p, where p is the P-256 prime and R is 2**257.
    35  */
typedef u32 limb;
#define NLIMBS 9
typedef limb felem[NLIMBS];

/* Masks selecting the low 28 or 29 bits of a limb, matching the alternating
 * 29/28-bit limb widths described at the top of this file. */
static const limb kBottom28Bits = 0xfffffff;
static const limb kBottom29Bits = 0x1fffffff;

/* kOne is the number 1 as an felem. It's 2**257 mod p split up into 29 and
 * 28-bit words.
 */
static const felem kOne = {
    2, 0, 0, 0xffff800,
    0x1fffffff, 0xfffffff, 0x1fbfffff, 0x1ffffff,
    0
};
/* kZero is the number 0 as an felem. */
static const felem kZero = {0};
/* kP is the P-256 prime, p, in 29/28-bit limb form. */
static const felem kP = {
    0x1fffffff, 0xfffffff, 0x1fffffff, 0x3ff,
    0, 0, 0x200000, 0xf000000,
    0xfffffff
};
/* k2P is 2*p in the same limb form (each limb of kP doubled, with carries
 * propagated). */
static const felem k2P = {
    0x1ffffffe, 0xfffffff, 0x1fffffff, 0x7ff,
    0, 0, 0x400000, 0xe000000,
    0x1fffffff
};
    63 /* kPrecomputed contains precomputed values to aid the calculation of scalar
    64  * multiples of the base point, G. It's actually two, equal length, tables
    65  * concatenated.
    66  *
    67  * The first table contains (x,y) felem pairs for 16 multiples of the base
    68  * point, G.
    69  *
    70  *   Index  |  Index (binary) | Value
    71  *       0  |           0000  | 0G (all zeros, omitted)
    72  *       1  |           0001  | G
    73  *       2  |           0010  | 2**64G
    74  *       3  |           0011  | 2**64G + G
    75  *       4  |           0100  | 2**128G
    76  *       5  |           0101  | 2**128G + G
    77  *       6  |           0110  | 2**128G + 2**64G
    78  *       7  |           0111  | 2**128G + 2**64G + G
    79  *       8  |           1000  | 2**192G
    80  *       9  |           1001  | 2**192G + G
    81  *      10  |           1010  | 2**192G + 2**64G
    82  *      11  |           1011  | 2**192G + 2**64G + G
    83  *      12  |           1100  | 2**192G + 2**128G
    84  *      13  |           1101  | 2**192G + 2**128G + G
    85  *      14  |           1110  | 2**192G + 2**128G + 2**64G
    86  *      15  |           1111  | 2**192G + 2**128G + 2**64G + G
    87  *
    88  * The second table follows the same style, but the terms are 2**32G,
    89  * 2**96G, 2**160G, 2**224G.
    90  *
    91  * This is ~2KB of data.
    92  */
    93 static const limb kPrecomputed[NLIMBS * 2 * 15 * 2] = {
    94     0x11522878, 0xe730d41, 0xdb60179, 0x4afe2ff, 0x12883add, 0xcaddd88, 0x119e7edc, 0xd4a6eab, 0x3120bee,
    95     0x1d2aac15, 0xf25357c, 0x19e45cdd, 0x5c721d0, 0x1992c5a5, 0xa237487, 0x154ba21, 0x14b10bb, 0xae3fe3,
    96     0xd41a576, 0x922fc51, 0x234994f, 0x60b60d3, 0x164586ae, 0xce95f18, 0x1fe49073, 0x3fa36cc, 0x5ebcd2c,
    97     0xb402f2f, 0x15c70bf, 0x1561925c, 0x5a26704, 0xda91e90, 0xcdc1c7f, 0x1ea12446, 0xe1ade1e, 0xec91f22,
    98     0x26f7778, 0x566847e, 0xa0bec9e, 0x234f453, 0x1a31f21a, 0xd85e75c, 0x56c7109, 0xa267a00, 0xb57c050,
    99     0x98fb57, 0xaa837cc, 0x60c0792, 0xcfa5e19, 0x61bab9e, 0x589e39b, 0xa324c5, 0x7d6dee7, 0x2976e4b,
   100     0x1fc4124a, 0xa8c244b, 0x1ce86762, 0xcd61c7e, 0x1831c8e0, 0x75774e1, 0x1d96a5a9, 0x843a649, 0xc3ab0fa,
   101     0x6e2e7d5, 0x7673a2a, 0x178b65e8, 0x4003e9b, 0x1a1f11c2, 0x7816ea, 0xf643e11, 0x58c43df, 0xf423fc2,
   102     0x19633ffa, 0x891f2b2, 0x123c231c, 0x46add8c, 0x54700dd, 0x59e2b17, 0x172db40f, 0x83e277d, 0xb0dd609,
   103     0xfd1da12, 0x35c6e52, 0x19ede20c, 0xd19e0c0, 0x97d0f40, 0xb015b19, 0x449e3f5, 0xe10c9e, 0x33ab581,
   104     0x56a67ab, 0x577734d, 0x1dddc062, 0xc57b10d, 0x149b39d, 0x26a9e7b, 0xc35df9f, 0x48764cd, 0x76dbcca,
   105     0xca4b366, 0xe9303ab, 0x1a7480e7, 0x57e9e81, 0x1e13eb50, 0xf466cf3, 0x6f16b20, 0x4ba3173, 0xc168c33,
   106     0x15cb5439, 0x6a38e11, 0x73658bd, 0xb29564f, 0x3f6dc5b, 0x53b97e, 0x1322c4c0, 0x65dd7ff, 0x3a1e4f6,
   107     0x14e614aa, 0x9246317, 0x1bc83aca, 0xad97eed, 0xd38ce4a, 0xf82b006, 0x341f077, 0xa6add89, 0x4894acd,
   108     0x9f162d5, 0xf8410ef, 0x1b266a56, 0xd7f223, 0x3e0cb92, 0xe39b672, 0x6a2901a, 0x69a8556, 0x7e7c0,
   109     0x9b7d8d3, 0x309a80, 0x1ad05f7f, 0xc2fb5dd, 0xcbfd41d, 0x9ceb638, 0x1051825c, 0xda0cf5b, 0x812e881,
   110     0x6f35669, 0x6a56f2c, 0x1df8d184, 0x345820, 0x1477d477, 0x1645db1, 0xbe80c51, 0xc22be3e, 0xe35e65a,
   111     0x1aeb7aa0, 0xc375315, 0xf67bc99, 0x7fdd7b9, 0x191fc1be, 0x61235d, 0x2c184e9, 0x1c5a839, 0x47a1e26,
   112     0xb7cb456, 0x93e225d, 0x14f3c6ed, 0xccc1ac9, 0x17fe37f3, 0x4988989, 0x1a90c502, 0x2f32042, 0xa17769b,
   113     0xafd8c7c, 0x8191c6e, 0x1dcdb237, 0x16200c0, 0x107b32a1, 0x66c08db, 0x10d06a02, 0x3fc93, 0x5620023,
   114     0x16722b27, 0x68b5c59, 0x270fcfc, 0xfad0ecc, 0xe5de1c2, 0xeab466b, 0x2fc513c, 0x407f75c, 0xbaab133,
   115     0x9705fe9, 0xb88b8e7, 0x734c993, 0x1e1ff8f, 0x19156970, 0xabd0f00, 0x10469ea7, 0x3293ac0, 0xcdc98aa,
   116     0x1d843fd, 0xe14bfe8, 0x15be825f, 0x8b5212, 0xeb3fb67, 0x81cbd29, 0xbc62f16, 0x2b6fcc7, 0xf5a4e29,
   117     0x13560b66, 0xc0b6ac2, 0x51ae690, 0xd41e271, 0xf3e9bd4, 0x1d70aab, 0x1029f72, 0x73e1c35, 0xee70fbc,
   118     0xad81baf, 0x9ecc49a, 0x86c741e, 0xfe6be30, 0x176752e7, 0x23d416, 0x1f83de85, 0x27de188, 0x66f70b8,
   119     0x181cd51f, 0x96b6e4c, 0x188f2335, 0xa5df759, 0x17a77eb6, 0xfeb0e73, 0x154ae914, 0x2f3ec51, 0x3826b59,
   120     0xb91f17d, 0x1c72949, 0x1362bf0a, 0xe23fddf, 0xa5614b0, 0xf7d8f, 0x79061, 0x823d9d2, 0x8213f39,
   121     0x1128ae0b, 0xd095d05, 0xb85c0c2, 0x1ecb2ef, 0x24ddc84, 0xe35e901, 0x18411a4a, 0xf5ddc3d, 0x3786689,
   122     0x52260e8, 0x5ae3564, 0x542b10d, 0x8d93a45, 0x19952aa4, 0x996cc41, 0x1051a729, 0x4be3499, 0x52b23aa,
   123     0x109f307e, 0x6f5b6bb, 0x1f84e1e7, 0x77a0cfa, 0x10c4df3f, 0x25a02ea, 0xb048035, 0xe31de66, 0xc6ecaa3,
   124     0x28ea335, 0x2886024, 0x1372f020, 0xf55d35, 0x15e4684c, 0xf2a9e17, 0x1a4a7529, 0xcb7beb1, 0xb2a78a1,
   125     0x1ab21f1f, 0x6361ccf, 0x6c9179d, 0xb135627, 0x1267b974, 0x4408bad, 0x1cbff658, 0xe3d6511, 0xc7d76f,
   126     0x1cc7a69, 0xe7ee31b, 0x54fab4f, 0x2b914f, 0x1ad27a30, 0xcd3579e, 0xc50124c, 0x50daa90, 0xb13f72,
   127     0xb06aa75, 0x70f5cc6, 0x1649e5aa, 0x84a5312, 0x329043c, 0x41c4011, 0x13d32411, 0xb04a838, 0xd760d2d,
   128     0x1713b532, 0xbaa0c03, 0x84022ab, 0x6bcf5c1, 0x2f45379, 0x18ae070, 0x18c9e11e, 0x20bca9a, 0x66f496b,
   129     0x3eef294, 0x67500d2, 0xd7f613c, 0x2dbbeb, 0xb741038, 0xe04133f, 0x1582968d, 0xbe985f7, 0x1acbc1a,
   130     0x1a6a939f, 0x33e50f6, 0xd665ed4, 0xb4b7bd6, 0x1e5a3799, 0x6b33847, 0x17fa56ff, 0x65ef930, 0x21dc4a,
   131     0x2b37659, 0x450fe17, 0xb357b65, 0xdf5efac, 0x15397bef, 0x9d35a7f, 0x112ac15f, 0x624e62e, 0xa90ae2f,
   132     0x107eecd2, 0x1f69bbe, 0x77d6bce, 0x5741394, 0x13c684fc, 0x950c910, 0x725522b, 0xdc78583, 0x40eeabb,
   133     0x1fde328a, 0xbd61d96, 0xd28c387, 0x9e77d89, 0x12550c40, 0x759cb7d, 0x367ef34, 0xae2a960, 0x91b8bdc,
   134     0x93462a9, 0xf469ef, 0xb2e9aef, 0xd2ca771, 0x54e1f42, 0x7aaa49, 0x6316abb, 0x2413c8e, 0x5425bf9,
   135     0x1bed3e3a, 0xf272274, 0x1f5e7326, 0x6416517, 0xea27072, 0x9cedea7, 0x6e7633, 0x7c91952, 0xd806dce,
   136     0x8e2a7e1, 0xe421e1a, 0x418c9e1, 0x1dbc890, 0x1b395c36, 0xa1dc175, 0x1dc4ef73, 0x8956f34, 0xe4b5cf2,
   137     0x1b0d3a18, 0x3194a36, 0x6c2641f, 0xe44124c, 0xa2f4eaa, 0xa8c25ba, 0xf927ed7, 0x627b614, 0x7371cca,
   138     0xba16694, 0x417bc03, 0x7c0a7e3, 0x9c35c19, 0x1168a205, 0x8b6b00d, 0x10e3edc9, 0x9c19bf2, 0x5882229,
   139     0x1b2b4162, 0xa5cef1a, 0x1543622b, 0x9bd433e, 0x364e04d, 0x7480792, 0x5c9b5b3, 0xe85ff25, 0x408ef57,
   140     0x1814cfa4, 0x121b41b, 0xd248a0f, 0x3b05222, 0x39bb16a, 0xc75966d, 0xa038113, 0xa4a1769, 0x11fbc6c,
   141     0x917e50e, 0xeec3da8, 0x169d6eac, 0x10c1699, 0xa416153, 0xf724912, 0x15cd60b7, 0x4acbad9, 0x5efc5fa,
   142     0xf150ed7, 0x122b51, 0x1104b40a, 0xcb7f442, 0xfbb28ff, 0x6ac53ca, 0x196142cc, 0x7bf0fa9, 0x957651,
   143     0x4e0f215, 0xed439f8, 0x3f46bd5, 0x5ace82f, 0x110916b6, 0x6db078, 0xffd7d57, 0xf2ecaac, 0xca86dec,
   144     0x15d6b2da, 0x965ecc9, 0x1c92b4c2, 0x1f3811, 0x1cb080f5, 0x2d8b804, 0x19d1c12d, 0xf20bd46, 0x1951fa7,
   145     0xa3656c3, 0x523a425, 0xfcd0692, 0xd44ddc8, 0x131f0f5b, 0xaf80e4a, 0xcd9fc74, 0x99bb618, 0x2db944c,
   146     0xa673090, 0x1c210e1, 0x178c8d23, 0x1474383, 0x10b8743d, 0x985a55b, 0x2e74779, 0x576138, 0x9587927,
   147     0x133130fa, 0xbe05516, 0x9f4d619, 0xbb62570, 0x99ec591, 0xd9468fe, 0x1d07782d, 0xfc72e0b, 0x701b298,
   148     0x1863863b, 0x85954b8, 0x121a0c36, 0x9e7fedf, 0xf64b429, 0x9b9d71e, 0x14e2f5d8, 0xf858d3a, 0x942eea8,
   149     0xda5b765, 0x6edafff, 0xa9d18cc, 0xc65e4ba, 0x1c747e86, 0xe4ea915, 0x1981d7a1, 0x8395659, 0x52ed4e2,
   150     0x87d43b7, 0x37ab11b, 0x19d292ce, 0xf8d4692, 0x18c3053f, 0x8863e13, 0x4c146c0, 0x6bdf55a, 0x4e4457d,
   151     0x16152289, 0xac78ec2, 0x1a59c5a2, 0x2028b97, 0x71c2d01, 0x295851f, 0x404747b, 0x878558d, 0x7d29aa4,
   152     0x13d8341f, 0x8daefd7, 0x139c972d, 0x6b7ea75, 0xd4a9dde, 0xff163d8, 0x81d55d7, 0xa5bef68, 0xb7b30d8,
   153     0xbe73d6f, 0xaa88141, 0xd976c81, 0x7e7a9cc, 0x18beb771, 0xd773cbd, 0x13f51951, 0x9d0c177, 0x1c49a78,
   154 };
   156 /* Field element operations:
   157  */
/* NON_ZERO_TO_ALL_ONES returns:
 *   0xffffffff for 0 < x <= 2**31
 *   0 for x == 0 or x > 2**31.
 *
 * x must be a u32 or an equivalent type such as limb.
 *
 * It relies on unsigned wraparound: (x - 1) only wraps to 0xffffffff when
 * x == 0, so the top bit after the shift distinguishes the cases without a
 * data-dependent branch.
 */
#define NON_ZERO_TO_ALL_ONES(x) ((((u32)(x) - 1) >> 31) - 1)
/* felem_reduce_carry adds a multiple of p in order to cancel |carry|,
 * which is a term at 2**257.
 *
 * On entry: carry < 2**3, inout[0,2,...] < 2**29, inout[1,3,...] < 2**28.
 * On exit: inout[0,2,..] < 2**30, inout[1,3,...] < 2**29.
 */
static void felem_reduce_carry(felem inout, limb carry)
{
    /* carry_mask is all ones when carry != 0 and zero otherwise, so the
     * conditional adjustments below apply without a data-dependent branch. */
    const u32 carry_mask = NON_ZERO_TO_ALL_ONES(carry);

    /* The adds and subtracts below substitute carry * 2**257 with an equal
     * value mod p, spread across the limbs at the bit positions of the
     * 29/28-bit layout described at the top of this file. */
    inout[0] += carry << 1;
    inout[3] += 0x10000000 & carry_mask;
    /* carry < 2**3 thus (carry << 11) < 2**14 and we added 2**28 in the
     * previous line therefore this doesn't underflow.
     */
    inout[3] -= carry << 11;
    inout[4] += (0x20000000 - 1) & carry_mask;
    inout[5] += (0x10000000 - 1) & carry_mask;
    inout[6] += (0x20000000 - 1) & carry_mask;
    inout[6] -= carry << 22;
    /* This may underflow if carry is non-zero but, if so, we'll fix it in the
     * next line.
     */
    inout[7] -= 1 & carry_mask;
    inout[7] += carry << 25;
}
/* felem_sum sets out = in+in2.
 *
 * On entry, in[i]+in2[i] must not overflow a 32-bit word.
 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29
 */
static void felem_sum(felem out, const felem in, const felem in2)
{
    limb carry = 0;
    unsigned int i;
    for (i = 0;; i++) {
	/* Even limbs are 29 bits wide: anything above bit 28 becomes the
	 * carry into the next limb. */
	out[i] = in[i] + in2[i];
	out[i] += carry;
	carry = out[i] >> 29;
	out[i] &= kBottom29Bits;

	i++;
	if (i == NLIMBS)
	    break;

	/* Odd limbs are 28 bits wide. */
	out[i] = in[i] + in2[i];
	out[i] += carry;
	carry = out[i] >> 28;
	out[i] &= kBottom28Bits;
    }

    /* The final carry is a term at 2**257; fold it back into the result. */
    felem_reduce_carry(out, carry);
}
   222 #define two31m3 (((limb)1) << 31) - (((limb)1) << 3)
   223 #define two30m2 (((limb)1) << 30) - (((limb)1) << 2)
   224 #define two30p13m2 (((limb)1) << 30) + (((limb)1) << 13) - (((limb)1) << 2)
   225 #define two31m2 (((limb)1) << 31) - (((limb)1) << 2)
   226 #define two31p24m2 (((limb)1) << 31) + (((limb)1) << 24) - (((limb)1) << 2)
   227 #define two30m27m2 (((limb)1) << 30) - (((limb)1) << 27) - (((limb)1) << 2)
   229 /* zero31 is 0 mod p.
   230  */
   231 static const felem zero31 = {
   232     two31m3, two30m2, two31m2, two30p13m2,
   233     two31m2, two30m2, two31p24m2, two30m27m2,
   234     two31m2
   235 };
/* felem_diff sets out = in-in2.
 *
 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
 *           in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
 */
static void felem_diff(felem out, const felem in, const felem in2)
{
    limb carry = 0;
    unsigned int i;

    for (i = 0;; i++) {
	/* Even limbs (29 bits). zero31 is 0 mod p and each of its limbs is
	 * at least 2**30 - 2**27, so in[i] - in2[i] + zero31[i] cannot
	 * underflow given the entry bounds above. */
	out[i] = in[i] - in2[i];
	out[i] += zero31[i];
	out[i] += carry;
	carry = out[i] >> 29;
	out[i] &= kBottom29Bits;

	i++;
	if (i == NLIMBS)
	    break;

	/* Odd limbs (28 bits). */
	out[i] = in[i] - in2[i];
	out[i] += zero31[i];
	out[i] += carry;
	carry = out[i] >> 28;
	out[i] &= kBottom28Bits;
    }

    felem_reduce_carry(out, carry);
}
/* felem_reduce_degree sets out = tmp/R mod p where tmp contains 64-bit words
 * with the same 29,28,... bit positions as an felem.
 *
 * The values in felems are in Montgomery form: x*R mod p where R = 2**257.
 * Since we just multiplied two Montgomery values together, the result is
 * x*y*R*R mod p. We wish to divide by R in order for the result also to be
 * in Montgomery form.
 *
 * On entry: tmp[i] < 2**64
 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29
 */
static void felem_reduce_degree(felem out, u64 tmp[17])
{
    /* The following table may be helpful when reading this code:
     *
     * Limb number:   0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10...
     * Width (bits):  29| 28| 29| 28| 29| 28| 29| 28| 29| 28| 29
     * Start bit:     0 | 29| 57| 86|114|143|171|200|228|257|285
     *   (odd phase): 0 | 28| 57| 85|114|142|171|199|228|256|285
     */
    limb tmp2[18], carry, x, xMask;
    unsigned int i;

    /* tmp contains 64-bit words with the same 29,28,29-bit positions as an
     * felem. So the top of an element of tmp might overlap with another
     * element two positions down. The following loop eliminates this
     * overlap.
     */
    tmp2[0] = tmp[0] & kBottom29Bits;

    /* In the following we use "(limb) tmp[x]" and "(limb) (tmp[x]>>32)" to try
     * and hint to the compiler that it can do a single-word shift by selecting
     * the right register rather than doing a double-word shift and truncating
     * afterwards.
     */
    tmp2[1] = ((limb) tmp[0]) >> 29;
    tmp2[1] |= (((limb) (tmp[0] >> 32)) << 3) & kBottom28Bits;
    tmp2[1] += ((limb) tmp[1]) & kBottom28Bits;
    carry = tmp2[1] >> 28;
    tmp2[1] &= kBottom28Bits;

    for (i = 2; i < 17; i++) {
	/* Even limb (29 bits): gather the pieces of tmp[i-2..i] that land at
	 * this limb's bit position. */
	tmp2[i] = ((limb) (tmp[i - 2] >> 32)) >> 25;
	tmp2[i] += ((limb) (tmp[i - 1])) >> 28;
	tmp2[i] += (((limb) (tmp[i - 1] >> 32)) << 4) & kBottom29Bits;
	tmp2[i] += ((limb) tmp[i]) & kBottom29Bits;
	tmp2[i] += carry;
	carry = tmp2[i] >> 29;
	tmp2[i] &= kBottom29Bits;

	i++;
	if (i == 17)
	    break;
	/* Odd limb (28 bits): same gathering, shifted for the odd-phase
	 * alignment shown in the table above. */
	tmp2[i] = ((limb) (tmp[i - 2] >> 32)) >> 25;
	tmp2[i] += ((limb) (tmp[i - 1])) >> 29;
	tmp2[i] += (((limb) (tmp[i - 1] >> 32)) << 3) & kBottom28Bits;
	tmp2[i] += ((limb) tmp[i]) & kBottom28Bits;
	tmp2[i] += carry;
	carry = tmp2[i] >> 28;
	tmp2[i] &= kBottom28Bits;
    }

    /* tmp2[17] collects everything at and above bit 484; no masking needed
     * since nothing further can overlap it. */
    tmp2[17] = ((limb) (tmp[15] >> 32)) >> 25;
    tmp2[17] += ((limb) (tmp[16])) >> 29;
    tmp2[17] += (((limb) (tmp[16] >> 32)) << 3);
    tmp2[17] += carry;

    /* Montgomery elimination of terms:
     *
     * Since R is 2**257, we can divide by R with a bitwise shift if we can
     * ensure that the right-most 257 bits are all zero. We can make that true
     * by adding multiplies of p without affecting the value.
     *
     * So we eliminate limbs from right to left. Since the bottom 29 bits of p
     * are all ones, then by adding tmp2[0]*p to tmp2 we'll make tmp2[0] == 0.
     * We can do that for 8 further limbs and then right shift to eliminate the
     * extra factor of R.
     */
    for (i = 0;; i += 2) {
	tmp2[i + 1] += tmp2[i] >> 29;
	x = tmp2[i] & kBottom29Bits;
	xMask = NON_ZERO_TO_ALL_ONES(x);
	tmp2[i] = 0;

	/* The bounds calculations for this loop are tricky. Each iteration of
	 * the loop eliminates two words by adding values to words to their
	 * right.
	 *
	 * The following table contains the amounts added to each word (as an
	 * offset from the value of i at the top of the loop). The amounts are
	 * accounted for from the first and second half of the loop separately
	 * and are written as, for example, 28 to mean a value <2**28.
	 *
	 * Word:                   3   4   5   6   7   8   9   10
	 * Added in top half:     28  11      29  21  29  28
	 *                                        28  29
	 *                                            29
	 * Added in bottom half:      29  10      28  21  28   28
	 *                                            29
	 *
	 * The value that is currently offset 7 will be offset 5 for the next
	 * iteration and then offset 3 for the iteration after that. Therefore
	 * the total value added will be the values added at 7, 5 and 3.
	 *
	 * The following table accumulates these values. The sums at the bottom
	 * are written as, for example, 29+28, to mean a value < 2**29+2**28.
	 *
	 * Word:                   3   4   5   6   7   8   9  10  11  12  13
	 *                        28  11  10  29  21  29  28  28  28  28  28
	 *                            29  28  11  28  29  28  29  28  29  28
	 *                                    29  28  21  21  29  21  29  21
	 *                                        10  29  28  21  28  21  28
	 *                                        28  29  28  29  28  29  28
	 *                                            11  10  29  10  29  10
	 *                                            29  28  11  28  11
	 *                                                    29      29
	 *                        --------------------------------------------
	 *                                                30+ 31+ 30+ 31+ 30+
	 *                                                28+ 29+ 28+ 29+ 21+
	 *                                                21+ 28+ 21+ 28+ 10
	 *                                                10  21+ 10  21+
	 *                                                    11      11
	 *
	 * So the greatest amount is added to tmp2[10] and tmp2[12]. If
	 * tmp2[10/12] has an initial value of <2**29, then the maximum value
	 * will be < 2**31 + 2**30 + 2**28 + 2**21 + 2**11, which is < 2**32,
	 * as required.
         */
	tmp2[i + 3] += (x << 10) & kBottom28Bits;
	tmp2[i + 4] += (x >> 18);

	tmp2[i + 6] += (x << 21) & kBottom29Bits;
	tmp2[i + 7] += x >> 8;

	/* At position 200, which is the starting bit position for word 7, we
	 * have a factor of 0xf000000 = 2**28 - 2**24.
	 */
	tmp2[i + 7] += 0x10000000 & xMask;
	/* Word 7 is 28 bits wide, so the 2**28 term exactly hits word 8. */
	tmp2[i + 8] += (x - 1) & xMask;
	tmp2[i + 7] -= (x << 24) & kBottom28Bits;
	tmp2[i + 8] -= x >> 4;

	tmp2[i + 8] += 0x20000000 & xMask;
	tmp2[i + 8] -= x;
	tmp2[i + 8] += (x << 28) & kBottom29Bits;
	tmp2[i + 9] += ((x >> 1) - 1) & xMask;

	if (i+1 == NLIMBS)
	    break;
	/* Second half of the loop: eliminate the odd-indexed (28-bit) limb
	 * i+1 the same way, with shifts adjusted for the odd phase. */
	tmp2[i + 2] += tmp2[i + 1] >> 28;
	x = tmp2[i + 1] & kBottom28Bits;
	xMask = NON_ZERO_TO_ALL_ONES(x);
	tmp2[i + 1] = 0;

	tmp2[i + 4] += (x << 11) & kBottom29Bits;
	tmp2[i + 5] += (x >> 18);

	tmp2[i + 7] += (x << 21) & kBottom28Bits;
	tmp2[i + 8] += x >> 7;

	/* At position 199, which is the starting bit of the 8th word when
	 * dealing with a context starting on an odd word, we have a factor of
	 * 0x1e000000 = 2**29 - 2**25. Since we have not updated i, the 8th
	 * word from i+1 is i+8.
	 */
	tmp2[i + 8] += 0x20000000 & xMask;
	tmp2[i + 9] += (x - 1) & xMask;
	tmp2[i + 8] -= (x << 25) & kBottom29Bits;
	tmp2[i + 9] -= x >> 4;

	tmp2[i + 9] += 0x10000000 & xMask;
	tmp2[i + 9] -= x;
	tmp2[i + 10] += (x - 1) & xMask;
    }

    /* We merge the right shift with a carry chain. The words above 2**257 have
     * widths of 28,29,... which we need to correct when copying them down.
     */
    carry = 0;
    for (i = 0; i < 8; i++) {
	/* The maximum value of tmp2[i + 9] occurs on the first iteration and
	 * is < 2**30+2**29+2**28. Adding 2**29 (from tmp2[i + 10]) is
	 * therefore safe.
	 */
	out[i] = tmp2[i + 9];
	out[i] += carry;
	out[i] += (tmp2[i + 10] << 28) & kBottom29Bits;
	carry = out[i] >> 29;
	out[i] &= kBottom29Bits;

	i++;
	out[i] = tmp2[i + 9] >> 1;
	out[i] += carry;
	carry = out[i] >> 28;
	out[i] &= kBottom28Bits;
    }

    out[8] = tmp2[17];
    out[8] += carry;
    carry = out[8] >> 29;
    out[8] &= kBottom29Bits;

    felem_reduce_carry(out, carry);
}
/* felem_square sets out=in*in.
 *
 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29.
 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
 */
static void felem_square(felem out, const felem in)
{
    u64 tmp[17];

    /* tmp[k] accumulates all products in[j]*in[k-j]. The explicit shifts
     * fold in two factors: symmetric cross terms (j != k-j) are counted
     * once and doubled, and products of two odd-indexed (28-bit) limbs sit
     * one bit low in the 29/28-bit layout described at the top of this
     * file, so they carry an extra <<1 (hence <<2 where both apply). */
    tmp[0] = ((u64) in[0]) * in[0];
    tmp[1] = ((u64) in[0]) * (in[1] << 1);
    tmp[2] = ((u64) in[0]) * (in[2] << 1) +
	     ((u64) in[1]) * (in[1] << 1);
    tmp[3] = ((u64) in[0]) * (in[3] << 1) +
	     ((u64) in[1]) * (in[2] << 1);
    tmp[4] = ((u64) in[0]) * (in[4] << 1) +
	     ((u64) in[1]) * (in[3] << 2) +
	     ((u64) in[2]) * in[2];
    tmp[5] = ((u64) in[0]) * (in[5] << 1) +
	     ((u64) in[1]) * (in[4] << 1) +
	     ((u64) in[2]) * (in[3] << 1);
    tmp[6] = ((u64) in[0]) * (in[6] << 1) +
	     ((u64) in[1]) * (in[5] << 2) +
	     ((u64) in[2]) * (in[4] << 1) +
	     ((u64) in[3]) * (in[3] << 1);
    tmp[7] = ((u64) in[0]) * (in[7] << 1) +
	     ((u64) in[1]) * (in[6] << 1) +
	     ((u64) in[2]) * (in[5] << 1) +
	     ((u64) in[3]) * (in[4] << 1);
    /* tmp[8] has the greatest value of 2**61 + 2**60 + 2**61 + 2**60 + 2**60,
     * which is < 2**64 as required.
     */
    tmp[8] = ((u64) in[0]) * (in[8] << 1) +
	     ((u64) in[1]) * (in[7] << 2) +
	     ((u64) in[2]) * (in[6] << 1) +
	     ((u64) in[3]) * (in[5] << 2) +
	     ((u64) in[4]) * in[4];
    tmp[9] = ((u64) in[1]) * (in[8] << 1) +
	     ((u64) in[2]) * (in[7] << 1) +
	     ((u64) in[3]) * (in[6] << 1) +
	     ((u64) in[4]) * (in[5] << 1);
    tmp[10] = ((u64) in[2]) * (in[8] << 1) +
	      ((u64) in[3]) * (in[7] << 2) +
	      ((u64) in[4]) * (in[6] << 1) +
	      ((u64) in[5]) * (in[5] << 1);
    tmp[11] = ((u64) in[3]) * (in[8] << 1) +
	      ((u64) in[4]) * (in[7] << 1) +
	      ((u64) in[5]) * (in[6] << 1);
    tmp[12] = ((u64) in[4]) * (in[8] << 1) +
	      ((u64) in[5]) * (in[7] << 2) +
	      ((u64) in[6]) * in[6];
    tmp[13] = ((u64) in[5]) * (in[8] << 1) +
	      ((u64) in[6]) * (in[7] << 1);
    tmp[14] = ((u64) in[6]) * (in[8] << 1) +
	      ((u64) in[7]) * (in[7] << 1);
    tmp[15] = ((u64) in[7]) * (in[8] << 1);
    tmp[16] = ((u64) in[8]) * in[8];

    /* Divide by R (2**257) to return the product to Montgomery form. */
    felem_reduce_degree(out, tmp);
}
/* felem_mul sets out=in*in2.
 *
 * On entry: in[0,2,...] < 2**30, in[1,3,...] < 2**29 and
 *           in2[0,2,...] < 2**30, in2[1,3,...] < 2**29.
 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
 */
static void felem_mul(felem out, const felem in, const felem in2)
{
    u64 tmp[17];

    /* tmp[k] accumulates in[j]*in2[k-j] for all j. A product of two
     * odd-indexed (28-bit) limbs sits one bit low in the 29/28-bit layout,
     * hence the <<1 whenever both indices are odd. The "<< 0" shifts are
     * no-ops, presumably kept so the alignment pattern is visible and
     * matches felem_square. */
    tmp[0] = ((u64) in[0]) * in2[0];
    tmp[1] = ((u64) in[0]) * (in2[1] << 0) +
	     ((u64) in[1]) * (in2[0] << 0);
    tmp[2] = ((u64) in[0]) * (in2[2] << 0) +
	     ((u64) in[1]) * (in2[1] << 1) +
	     ((u64) in[2]) * (in2[0] << 0);
    tmp[3] = ((u64) in[0]) * (in2[3] << 0) +
	     ((u64) in[1]) * (in2[2] << 0) +
	     ((u64) in[2]) * (in2[1] << 0) +
	     ((u64) in[3]) * (in2[0] << 0);
    tmp[4] = ((u64) in[0]) * (in2[4] << 0) +
	     ((u64) in[1]) * (in2[3] << 1) +
	     ((u64) in[2]) * (in2[2] << 0) +
	     ((u64) in[3]) * (in2[1] << 1) +
	     ((u64) in[4]) * (in2[0] << 0);
    tmp[5] = ((u64) in[0]) * (in2[5] << 0) +
	     ((u64) in[1]) * (in2[4] << 0) +
	     ((u64) in[2]) * (in2[3] << 0) +
	     ((u64) in[3]) * (in2[2] << 0) +
	     ((u64) in[4]) * (in2[1] << 0) +
	     ((u64) in[5]) * (in2[0] << 0);
    tmp[6] = ((u64) in[0]) * (in2[6] << 0) +
	     ((u64) in[1]) * (in2[5] << 1) +
	     ((u64) in[2]) * (in2[4] << 0) +
	     ((u64) in[3]) * (in2[3] << 1) +
	     ((u64) in[4]) * (in2[2] << 0) +
	     ((u64) in[5]) * (in2[1] << 1) +
	     ((u64) in[6]) * (in2[0] << 0);
    tmp[7] = ((u64) in[0]) * (in2[7] << 0) +
	     ((u64) in[1]) * (in2[6] << 0) +
	     ((u64) in[2]) * (in2[5] << 0) +
	     ((u64) in[3]) * (in2[4] << 0) +
	     ((u64) in[4]) * (in2[3] << 0) +
	     ((u64) in[5]) * (in2[2] << 0) +
	     ((u64) in[6]) * (in2[1] << 0) +
	     ((u64) in[7]) * (in2[0] << 0);
    /* tmp[8] has the greatest value but doesn't overflow. See logic in
     * felem_square.
     */
    tmp[8] = ((u64) in[0]) * (in2[8] << 0) +
	     ((u64) in[1]) * (in2[7] << 1) +
	     ((u64) in[2]) * (in2[6] << 0) +
	     ((u64) in[3]) * (in2[5] << 1) +
	     ((u64) in[4]) * (in2[4] << 0) +
	     ((u64) in[5]) * (in2[3] << 1) +
	     ((u64) in[6]) * (in2[2] << 0) +
	     ((u64) in[7]) * (in2[1] << 1) +
	     ((u64) in[8]) * (in2[0] << 0);
    tmp[9] = ((u64) in[1]) * (in2[8] << 0) +
	     ((u64) in[2]) * (in2[7] << 0) +
	     ((u64) in[3]) * (in2[6] << 0) +
	     ((u64) in[4]) * (in2[5] << 0) +
	     ((u64) in[5]) * (in2[4] << 0) +
	     ((u64) in[6]) * (in2[3] << 0) +
	     ((u64) in[7]) * (in2[2] << 0) +
	     ((u64) in[8]) * (in2[1] << 0);
    tmp[10] = ((u64) in[2]) * (in2[8] << 0) +
	      ((u64) in[3]) * (in2[7] << 1) +
	      ((u64) in[4]) * (in2[6] << 0) +
	      ((u64) in[5]) * (in2[5] << 1) +
	      ((u64) in[6]) * (in2[4] << 0) +
	      ((u64) in[7]) * (in2[3] << 1) +
	      ((u64) in[8]) * (in2[2] << 0);
    tmp[11] = ((u64) in[3]) * (in2[8] << 0) +
	      ((u64) in[4]) * (in2[7] << 0) +
	      ((u64) in[5]) * (in2[6] << 0) +
	      ((u64) in[6]) * (in2[5] << 0) +
	      ((u64) in[7]) * (in2[4] << 0) +
	      ((u64) in[8]) * (in2[3] << 0);
    tmp[12] = ((u64) in[4]) * (in2[8] << 0) +
	      ((u64) in[5]) * (in2[7] << 1) +
	      ((u64) in[6]) * (in2[6] << 0) +
	      ((u64) in[7]) * (in2[5] << 1) +
	      ((u64) in[8]) * (in2[4] << 0);
    tmp[13] = ((u64) in[5]) * (in2[8] << 0) +
	      ((u64) in[6]) * (in2[7] << 0) +
	      ((u64) in[7]) * (in2[6] << 0) +
	      ((u64) in[8]) * (in2[5] << 0);
    tmp[14] = ((u64) in[6]) * (in2[8] << 0) +
	      ((u64) in[7]) * (in2[7] << 1) +
	      ((u64) in[8]) * (in2[6] << 0);
    tmp[15] = ((u64) in[7]) * (in2[8] << 0) +
	      ((u64) in[8]) * (in2[7] << 0);
    tmp[16] = ((u64) in[8]) * (in2[8] << 0);

    /* Divide by R (2**257) to return the product to Montgomery form. */
    felem_reduce_degree(out, tmp);
}
   634 static void felem_assign(felem out, const felem in)
   635 {
   636     memcpy(out, in, sizeof(felem));
   637 }
/* felem_inv calculates |out| = |in|^{-1}
 *
 * Based on Fermat's Little Theorem:
 *   a^p = a (mod p)
 *   a^{p-1} = 1 (mod p)
 *   a^{p-2} = a^{-1} (mod p)
 *
 * The exponent p-2 = 2^256 - 2^224 + 2^192 + 2^96 - 3 is computed with a
 * fixed square-and-multiply addition chain, so the same operation sequence
 * runs for every input (no data-dependent branches).
 */
static void felem_inv(felem out, const felem in)
{
    felem ftmp, ftmp2;
    /* each e_I will hold |in|^{2^I - 1} */
    felem e2, e4, e8, e16, e32, e64;
    unsigned int i;

    /* The comments track the exponent accumulated in ftmp/ftmp2 as a sum of
     * powers of two; e.g. "2^4 - 2^0" means in^(2^4 - 1). */
    felem_square(ftmp, in);		/* 2^1 */
    felem_mul(ftmp, in, ftmp);		/* 2^2 - 2^0 */
    felem_assign(e2, ftmp);
    felem_square(ftmp, ftmp);		/* 2^3 - 2^1 */
    felem_square(ftmp, ftmp);		/* 2^4 - 2^2 */
    felem_mul(ftmp, ftmp, e2);		/* 2^4 - 2^0 */
    felem_assign(e4, ftmp);
    felem_square(ftmp, ftmp);		/* 2^5 - 2^1 */
    felem_square(ftmp, ftmp);		/* 2^6 - 2^2 */
    felem_square(ftmp, ftmp);		/* 2^7 - 2^3 */
    felem_square(ftmp, ftmp);		/* 2^8 - 2^4 */
    felem_mul(ftmp, ftmp, e4);		/* 2^8 - 2^0 */
    felem_assign(e8, ftmp);
    for (i = 0; i < 8; i++) {
	felem_square(ftmp, ftmp);
    }					/* 2^16 - 2^8 */
    felem_mul(ftmp, ftmp, e8);		/* 2^16 - 2^0 */
    felem_assign(e16, ftmp);
    for (i = 0; i < 16; i++) {
	felem_square(ftmp, ftmp);
    }					/* 2^32 - 2^16 */
    felem_mul(ftmp, ftmp, e16);		/* 2^32 - 2^0 */
    felem_assign(e32, ftmp);
    for (i = 0; i < 32; i++) {
	felem_square(ftmp, ftmp);
    }					/* 2^64 - 2^32 */
    felem_assign(e64, ftmp);
    felem_mul(ftmp, ftmp, in);		/* 2^64 - 2^32 + 2^0 */
    for (i = 0; i < 192; i++) {
	felem_square(ftmp, ftmp);
    }					/* 2^256 - 2^224 + 2^192 */

    /* Build the low part of the exponent, 2^96 - 3, in ftmp2. */
    felem_mul(ftmp2, e64, e32);		/* 2^64 - 2^0 */
    for (i = 0; i < 16; i++) {
	felem_square(ftmp2, ftmp2);
    }					/* 2^80 - 2^16 */
    felem_mul(ftmp2, ftmp2, e16);	/* 2^80 - 2^0 */
    for (i = 0; i < 8; i++) {
	felem_square(ftmp2, ftmp2);
    }					/* 2^88 - 2^8 */
    felem_mul(ftmp2, ftmp2, e8);	/* 2^88 - 2^0 */
    for (i = 0; i < 4; i++) {
	felem_square(ftmp2, ftmp2);
    }					/* 2^92 - 2^4 */
    felem_mul(ftmp2, ftmp2, e4);	/* 2^92 - 2^0 */
    felem_square(ftmp2, ftmp2);		/* 2^93 - 2^1 */
    felem_square(ftmp2, ftmp2);		/* 2^94 - 2^2 */
    felem_mul(ftmp2, ftmp2, e2);	/* 2^94 - 2^0 */
    felem_square(ftmp2, ftmp2);		/* 2^95 - 2^1 */
    felem_square(ftmp2, ftmp2);		/* 2^96 - 2^2 */
    felem_mul(ftmp2, ftmp2, in);	/* 2^96 - 3 */

    felem_mul(out, ftmp2, ftmp);	/* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
}
/* felem_scalar_3 sets out=3*out.
 *
 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
 */
static void felem_scalar_3(felem out)
{
    limb carry = 0;
    unsigned int i;

    /* The loop body handles two limbs per iteration because even-indexed
     * limbs are 29 bits wide and odd-indexed limbs are 28 bits wide, so the
     * shift amounts and masks alternate. */
    for (i = 0;; i++) {
	out[i] *= 3;
	out[i] += carry;
	carry = out[i] >> 29;
	out[i] &= kBottom29Bits;

	i++;
	if (i == NLIMBS)
	    break;

	out[i] *= 3;
	out[i] += carry;
	carry = out[i] >> 28;
	out[i] &= kBottom28Bits;
    }

    /* Fold the carry out of the top limb back into the element. */
    felem_reduce_carry(out, carry);
}
/* felem_scalar_4 sets out=4*out.
 *
 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
 */
static void felem_scalar_4(felem out)
{
    limb carry = 0, next_carry;
    unsigned int i;

    /* Multiplication by 4 is a left shift by two. |next_carry| captures the
     * bits the shift pushes past the limb width before the limb is masked;
     * adding the incoming |carry| can overflow the limb again, so the
     * outgoing carry is the sum of both contributions. Two limbs are handled
     * per iteration because the limb widths alternate between 29 and 28
     * bits. */
    for (i = 0;; i++) {
	next_carry = out[i] >> 27;
	out[i] <<= 2;
	out[i] &= kBottom29Bits;
	out[i] += carry;
	carry = next_carry + (out[i] >> 29);
	out[i] &= kBottom29Bits;

	i++;
	if (i == NLIMBS)
	    break;
	next_carry = out[i] >> 26;
	out[i] <<= 2;
	out[i] &= kBottom28Bits;
	out[i] += carry;
	carry = next_carry + (out[i] >> 28);
	out[i] &= kBottom28Bits;
    }

    /* Fold the carry out of the top limb back into the element. */
    felem_reduce_carry(out, carry);
}
/* felem_scalar_8 sets out=8*out.
 *
 * On entry: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
 * On exit: out[0,2,...] < 2**30, out[1,3,...] < 2**29.
 */
static void felem_scalar_8(felem out)
{
    limb carry = 0, next_carry;
    unsigned int i;

    /* Same structure as felem_scalar_4, but the shift is by three, so
     * |next_carry| takes the top three bits of each limb (>> 26 for 29-bit
     * limbs, >> 25 for 28-bit limbs) before the shift. */
    for (i = 0;; i++) {
	next_carry = out[i] >> 26;
	out[i] <<= 3;
	out[i] &= kBottom29Bits;
	out[i] += carry;
	carry = next_carry + (out[i] >> 29);
	out[i] &= kBottom29Bits;

	i++;
	if (i == NLIMBS)
	    break;
	next_carry = out[i] >> 25;
	out[i] <<= 3;
	out[i] &= kBottom28Bits;
	out[i] += carry;
	carry = next_carry + (out[i] >> 28);
	out[i] &= kBottom28Bits;
    }

    /* Fold the carry out of the top limb back into the element. */
    felem_reduce_carry(out, carry);
}
/* felem_is_zero_vartime returns 1 iff |in| == 0. It takes a variable amount of
 * time depending on the value of |in|, so it must only be used on values that
 * are not secret.
 */
static char felem_is_zero_vartime(const felem in)
{
    limb carry;
    int i;
    limb tmp[NLIMBS];

    felem_assign(tmp, in);

    /* First, reduce tmp to a minimal form: propagate carries between limbs
     * and repeat until no carry escapes the top limb.
     */
    do {
	carry = 0;
	for (i = 0;; i++) {
	    tmp[i] += carry;
	    carry = tmp[i] >> 29;
	    tmp[i] &= kBottom29Bits;

	    i++;
	    if (i == NLIMBS)
		break;

	    tmp[i] += carry;
	    carry = tmp[i] >> 28;
	    tmp[i] &= kBottom28Bits;
	}

	felem_reduce_carry(tmp, carry);
    } while (carry);

    /* tmp < 2**257, so the only possible zero values are 0, p and 2p.
     * Compare against each representation directly.
     */
    return memcmp(tmp, kZero, sizeof(tmp)) == 0 ||
	   memcmp(tmp, kP, sizeof(tmp)) == 0 ||
	   memcmp(tmp, k2P, sizeof(tmp)) == 0;
}
   839 /* Group operations:
   840  *
   841  * Elements of the elliptic curve group are represented in Jacobian
   842  * coordinates: (x, y, z). An affine point (x', y') is x'=x/z**2, y'=y/z**3 in
   843  * Jacobian form.
   844  */
/* point_double sets {x_out,y_out,z_out} = 2*{x,y,z}.
 *
 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l
 *
 * Variable names follow the EFD description:
 *   delta = z^2, gamma = y^2, beta = x*gamma,
 *   alpha = 3*(x + delta)*(x - delta).
 * The outputs are only written after the corresponding inputs have been
 * consumed, so callers (e.g. scalar_base_mult) pass the same felems as both
 * input and output.
 */
static void point_double(felem x_out, felem y_out, felem z_out,
			 const felem x, const felem y, const felem z)
{
    felem delta, gamma, alpha, beta, tmp, tmp2;

    felem_square(delta, z);
    felem_square(gamma, y);
    felem_mul(beta, x, gamma);

    felem_sum(tmp, x, delta);
    felem_diff(tmp2, x, delta);
    felem_mul(alpha, tmp, tmp2);
    felem_scalar_3(alpha);

    /* z_out = (y + z)^2 - gamma - delta  (= 2*y*z) */
    felem_sum(tmp, y, z);
    felem_square(tmp, tmp);
    felem_diff(tmp, tmp, gamma);
    felem_diff(z_out, tmp, delta);

    /* x_out = alpha^2 - 8*beta */
    felem_scalar_4(beta);
    felem_square(x_out, alpha);
    felem_diff(x_out, x_out, beta);
    felem_diff(x_out, x_out, beta);

    /* y_out = alpha*(4*beta - x_out) - 8*gamma^2 */
    felem_diff(tmp, beta, x_out);
    felem_mul(tmp, alpha, tmp);
    felem_square(tmp2, gamma);
    felem_scalar_8(tmp2);
    felem_diff(y_out, tmp, tmp2);
}
/* point_add_mixed sets {x_out,y_out,z_out} = {x1,y1,z1} + {x2,y2,1}.
 * (i.e. the second point is affine.)
 *
 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
 *
 * Note that this function does not handle P+P, infinity+P nor P+infinity
 * correctly.
 */
static void point_add_mixed(felem x_out, felem y_out, felem z_out,
			    const felem x1, const felem y1, const felem z1,
			    const felem x2, const felem y2)
{
    felem z1z1, z1z1z1, s2, u2, h, i, j, r, rr, v, tmp;

    /* With z2 == 1, u1 = x1 and s1 = y1, so those products are skipped. */
    felem_square(z1z1, z1);
    felem_sum(tmp, z1, z1);		/* tmp = 2*z1 */

    felem_mul(u2, x2, z1z1);		/* u2 = x2*z1^2 */
    felem_mul(z1z1z1, z1, z1z1);
    felem_mul(s2, y2, z1z1z1);		/* s2 = y2*z1^3 */
    felem_diff(h, u2, x1);		/* h = u2 - u1 */
    felem_sum(i, h, h);
    felem_square(i, i);			/* i = (2*h)^2 */
    felem_mul(j, h, i);
    felem_diff(r, s2, y1);
    felem_sum(r, r, r);			/* r = 2*(s2 - s1) */
    felem_mul(v, x1, i);

    felem_mul(z_out, tmp, h);		/* z_out = 2*z1*h */
    felem_square(rr, r);
    felem_diff(x_out, rr, j);		/* x_out = r^2 - j - 2*v */
    felem_diff(x_out, x_out, v);
    felem_diff(x_out, x_out, v);

    /* y_out = r*(v - x_out) - 2*y1*j */
    felem_diff(tmp, v, x_out);
    felem_mul(y_out, tmp, r);
    felem_mul(tmp, y1, j);
    felem_diff(y_out, y_out, tmp);
    felem_diff(y_out, y_out, tmp);
}
/* point_add sets {x_out,y_out,z_out} = {x1,y1,z1} + {x2,y2,z2}.
 *
 * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
 *
 * Note that this function does not handle P+P, infinity+P nor P+infinity
 * correctly.
 */
static void point_add(felem x_out, felem y_out, felem z_out,
		      const felem x1, const felem y1, const felem z1,
		      const felem x2, const felem y2, const felem z2)
{
    felem z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp;

    felem_square(z1z1, z1);
    felem_square(z2z2, z2);
    felem_mul(u1, x1, z2z2);		/* u1 = x1*z2^2 */

    /* tmp = (z1 + z2)^2 - z1^2 - z2^2  (= 2*z1*z2) */
    felem_sum(tmp, z1, z2);
    felem_square(tmp, tmp);
    felem_diff(tmp, tmp, z1z1);
    felem_diff(tmp, tmp, z2z2);

    felem_mul(z2z2z2, z2, z2z2);
    felem_mul(s1, y1, z2z2z2);		/* s1 = y1*z2^3 */

    felem_mul(u2, x2, z1z1);		/* u2 = x2*z1^2 */
    felem_mul(z1z1z1, z1, z1z1);
    felem_mul(s2, y2, z1z1z1);		/* s2 = y2*z1^3 */
    felem_diff(h, u2, u1);
    felem_sum(i, h, h);
    felem_square(i, i);			/* i = (2*h)^2 */
    felem_mul(j, h, i);
    felem_diff(r, s2, s1);
    felem_sum(r, r, r);			/* r = 2*(s2 - s1) */
    felem_mul(v, u1, i);

    felem_mul(z_out, tmp, h);		/* z_out = 2*z1*z2*h */
    felem_square(rr, r);
    felem_diff(x_out, rr, j);		/* x_out = r^2 - j - 2*v */
    felem_diff(x_out, x_out, v);
    felem_diff(x_out, x_out, v);

    /* y_out = r*(v - x_out) - 2*s1*j */
    felem_diff(tmp, v, x_out);
    felem_mul(y_out, tmp, r);
    felem_mul(tmp, s1, j);
    felem_diff(y_out, y_out, tmp);
    felem_diff(y_out, y_out, tmp);
}
   971 /* point_add_or_double_vartime sets {x_out,y_out,z_out} = {x1,y1,z1} +
   972  *                                                        {x2,y2,z2}.
   973  *
   974  * See http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
   975  *
   976  * This function handles the case where {x1,y1,z1}={x2,y2,z2}.
   977  */
   978 static void point_add_or_double_vartime(
   979     felem x_out, felem y_out, felem z_out,
   980     const felem x1, const felem y1, const felem z1,
   981     const felem x2, const felem y2, const felem z2)
   982 {
   983     felem z1z1, z1z1z1, z2z2, z2z2z2, s1, s2, u1, u2, h, i, j, r, rr, v, tmp;
   984     char x_equal, y_equal;
   986     felem_square(z1z1, z1);
   987     felem_square(z2z2, z2);
   988     felem_mul(u1, x1, z2z2);
   990     felem_sum(tmp, z1, z2);
   991     felem_square(tmp, tmp);
   992     felem_diff(tmp, tmp, z1z1);
   993     felem_diff(tmp, tmp, z2z2);
   995     felem_mul(z2z2z2, z2, z2z2);
   996     felem_mul(s1, y1, z2z2z2);
   998     felem_mul(u2, x2, z1z1);
   999     felem_mul(z1z1z1, z1, z1z1);
  1000     felem_mul(s2, y2, z1z1z1);
  1001     felem_diff(h, u2, u1);
  1002     x_equal = felem_is_zero_vartime(h);
  1003     felem_sum(i, h, h);
  1004     felem_square(i, i);
  1005     felem_mul(j, h, i);
  1006     felem_diff(r, s2, s1);
  1007     y_equal = felem_is_zero_vartime(r);
  1008     if (x_equal && y_equal) {
  1009 	point_double(x_out, y_out, z_out, x1, y1, z1);
  1010 	return;
  1012     felem_sum(r, r, r);
  1013     felem_mul(v, u1, i);
  1015     felem_mul(z_out, tmp, h);
  1016     felem_square(rr, r);
  1017     felem_diff(x_out, rr, j);
  1018     felem_diff(x_out, x_out, v);
  1019     felem_diff(x_out, x_out, v);
  1021     felem_diff(tmp, v, x_out);
  1022     felem_mul(y_out, tmp, r);
  1023     felem_mul(tmp, s1, j);
  1024     felem_diff(y_out, y_out, tmp);
  1025     felem_diff(y_out, y_out, tmp);
  1028 /* copy_conditional sets out=in if mask = 0xffffffff in constant time.
  1030  * On entry: mask is either 0 or 0xffffffff.
  1031  */
  1032 static void copy_conditional(felem out, const felem in, limb mask)
  1034     int i;
  1036     for (i = 0; i < NLIMBS; i++) {
  1037 	const limb tmp = mask & (in[i] ^ out[i]);
  1038 	out[i] ^= tmp;
  1042 /* select_affine_point sets {out_x,out_y} to the index'th entry of table.
  1043  * On entry: index < 16, table[0] must be zero.
  1044  */
  1045 static void select_affine_point(felem out_x, felem out_y,
  1046 				const limb *table, limb index)
  1048     limb i, j;
  1050     memset(out_x, 0, sizeof(felem));
  1051     memset(out_y, 0, sizeof(felem));
  1053     for (i = 1; i < 16; i++) {
  1054 	limb mask = i ^ index;
  1055 	mask |= mask >> 2;
  1056 	mask |= mask >> 1;
  1057 	mask &= 1;
  1058 	mask--;
  1059 	for (j = 0; j < NLIMBS; j++, table++) {
  1060 	    out_x[j] |= *table & mask;
  1062 	for (j = 0; j < NLIMBS; j++, table++) {
  1063 	    out_y[j] |= *table & mask;
  1068 /* select_jacobian_point sets {out_x,out_y,out_z} to the index'th entry of
  1069  * table.  On entry: index < 16, table[0] must be zero.
  1070  */
  1071 static void select_jacobian_point(felem out_x, felem out_y, felem out_z,
  1072 				  const limb *table, limb index)
  1074     limb i, j;
  1076     memset(out_x, 0, sizeof(felem));
  1077     memset(out_y, 0, sizeof(felem));
  1078     memset(out_z, 0, sizeof(felem));
  1080     /* The implicit value at index 0 is all zero. We don't need to perform that
  1081      * iteration of the loop because we already set out_* to zero.
  1082      */
  1083     table += 3*NLIMBS;
  1085     for (i = 1; i < 16; i++) {
  1086 	limb mask = i ^ index;
  1087 	mask |= mask >> 2;
  1088 	mask |= mask >> 1;
  1089 	mask &= 1;
  1090 	mask--;
  1091 	for (j = 0; j < NLIMBS; j++, table++) {
  1092 	    out_x[j] |= *table & mask;
  1094 	for (j = 0; j < NLIMBS; j++, table++) {
  1095 	    out_y[j] |= *table & mask;
  1097 	for (j = 0; j < NLIMBS; j++, table++) {
  1098 	    out_z[j] |= *table & mask;
  1103 /* get_bit returns the bit'th bit of scalar. */
  1104 static char get_bit(const u8 scalar[32], int bit)
  1106     return ((scalar[bit >> 3]) >> (bit & 7)) & 1;
  1109 /* scalar_base_mult sets {nx,ny,nz} = scalar*G where scalar is a little-endian
  1110  * number. Note that the value of scalar must be less than the order of the
  1111  * group.
  1112  */
  1113 static void scalar_base_mult(felem nx, felem ny, felem nz, const u8 scalar[32])
  1115     int i, j;
  1116     limb n_is_infinity_mask = -1, p_is_noninfinite_mask, mask;
  1117     u32 table_offset;
  1119     felem px, py;
  1120     felem tx, ty, tz;
  1122     memset(nx, 0, sizeof(felem));
  1123     memset(ny, 0, sizeof(felem));
  1124     memset(nz, 0, sizeof(felem));
  1126     /* The loop adds bits at positions 0, 64, 128 and 192, followed by
  1127      * positions 32,96,160 and 224 and does this 32 times.
  1128      */
  1129     for (i = 0; i < 32; i++) {
  1130 	if (i) {
  1131 	    point_double(nx, ny, nz, nx, ny, nz);
  1133 	table_offset = 0;
  1134 	for (j = 0; j <= 32; j += 32) {
  1135 	    char bit0 = get_bit(scalar, 31 - i + j);
  1136 	    char bit1 = get_bit(scalar, 95 - i + j);
  1137 	    char bit2 = get_bit(scalar, 159 - i + j);
  1138 	    char bit3 = get_bit(scalar, 223 - i + j);
  1139 	    limb index = bit0 | (bit1 << 1) | (bit2 << 2) | (bit3 << 3);
  1141 	    select_affine_point(px, py, kPrecomputed + table_offset, index);
  1142 	    table_offset += 30 * NLIMBS;
  1144 	    /* Since scalar is less than the order of the group, we know that
  1145 	     * {nx,ny,nz} != {px,py,1}, unless both are zero, which we handle
  1146 	     * below.
  1147 	     */
  1148 	    point_add_mixed(tx, ty, tz, nx, ny, nz, px, py);
  1149 	    /* The result of point_add_mixed is incorrect if {nx,ny,nz} is zero
  1150 	     * (a.k.a.  the point at infinity). We handle that situation by
  1151 	     * copying the point from the table.
  1152 	     */
  1153 	    copy_conditional(nx, px, n_is_infinity_mask);
  1154 	    copy_conditional(ny, py, n_is_infinity_mask);
  1155 	    copy_conditional(nz, kOne, n_is_infinity_mask);
  1157 	    /* Equally, the result is also wrong if the point from the table is
  1158 	     * zero, which happens when the index is zero. We handle that by
  1159 	     * only copying from {tx,ty,tz} to {nx,ny,nz} if index != 0.
  1160 	     */
  1161 	    p_is_noninfinite_mask = NON_ZERO_TO_ALL_ONES(index);
  1162 	    mask = p_is_noninfinite_mask & ~n_is_infinity_mask;
  1163 	    copy_conditional(nx, tx, mask);
  1164 	    copy_conditional(ny, ty, mask);
  1165 	    copy_conditional(nz, tz, mask);
  1166 	    /* If p was not zero, then n is now non-zero. */
  1167 	    n_is_infinity_mask &= ~p_is_noninfinite_mask;
  1172 /* point_to_affine converts a Jacobian point to an affine point. If the input
  1173  * is the point at infinity then it returns (0, 0) in constant time.
  1174  */
  1175 static void point_to_affine(felem x_out, felem y_out,
  1176 			    const felem nx, const felem ny, const felem nz) {
  1177     felem z_inv, z_inv_sq;
  1178     felem_inv(z_inv, nz);
  1179     felem_square(z_inv_sq, z_inv);
  1180     felem_mul(x_out, nx, z_inv_sq);
  1181     felem_mul(z_inv, z_inv, z_inv_sq);
  1182     felem_mul(y_out, ny, z_inv);
  1185 /* scalar_mult sets {nx,ny,nz} = scalar*{x,y}. */
  1186 static void scalar_mult(felem nx, felem ny, felem nz,
  1187 			const felem x, const felem y, const u8 scalar[32])
  1189     int i;
  1190     felem px, py, pz, tx, ty, tz;
  1191     felem precomp[16][3];
  1192     limb n_is_infinity_mask, index, p_is_noninfinite_mask, mask;
  1194     /* We precompute 0,1,2,... times {x,y}. */
  1195     memset(precomp, 0, sizeof(felem) * 3);
  1196     memcpy(&precomp[1][0], x, sizeof(felem));
  1197     memcpy(&precomp[1][1], y, sizeof(felem));
  1198     memcpy(&precomp[1][2], kOne, sizeof(felem));
  1200     for (i = 2; i < 16; i += 2) {
  1201 	point_double(precomp[i][0], precomp[i][1], precomp[i][2],
  1202 		     precomp[i / 2][0], precomp[i / 2][1], precomp[i / 2][2]);
  1204 	point_add_mixed(precomp[i + 1][0], precomp[i + 1][1], precomp[i + 1][2],
  1205 			precomp[i][0], precomp[i][1], precomp[i][2], x, y);
  1208     memset(nx, 0, sizeof(felem));
  1209     memset(ny, 0, sizeof(felem));
  1210     memset(nz, 0, sizeof(felem));
  1211     n_is_infinity_mask = -1;
  1213     /* We add in a window of four bits each iteration and do this 64 times. */
  1214     for (i = 0; i < 64; i++) {
  1215 	if (i) {
  1216 	    point_double(nx, ny, nz, nx, ny, nz);
  1217 	    point_double(nx, ny, nz, nx, ny, nz);
  1218 	    point_double(nx, ny, nz, nx, ny, nz);
  1219 	    point_double(nx, ny, nz, nx, ny, nz);
  1222 	index = scalar[31 - i / 2];
  1223 	if ((i & 1) == 1) {
  1224 	    index &= 15;
  1225 	} else {
  1226 	    index >>= 4;
  1229 	/* See the comments in scalar_base_mult about handling infinities. */
  1230 	select_jacobian_point(px, py, pz, precomp[0][0], index);
  1231 	point_add(tx, ty, tz, nx, ny, nz, px, py, pz);
  1232 	copy_conditional(nx, px, n_is_infinity_mask);
  1233 	copy_conditional(ny, py, n_is_infinity_mask);
  1234 	copy_conditional(nz, pz, n_is_infinity_mask);
  1236 	p_is_noninfinite_mask = NON_ZERO_TO_ALL_ONES(index);
  1237 	mask = p_is_noninfinite_mask & ~n_is_infinity_mask;
  1238 	copy_conditional(nx, tx, mask);
  1239 	copy_conditional(ny, ty, mask);
  1240 	copy_conditional(nz, tz, mask);
  1241 	n_is_infinity_mask &= ~p_is_noninfinite_mask;
/* Interface with Freebl: */

/* BYTESWAP_MP_DIGIT_TO_LE swaps the bytes of a mp_digit to
 * little-endian order.
 */
#ifdef IS_BIG_ENDIAN
#ifdef __APPLE__
#include <libkern/OSByteOrder.h>
#define BYTESWAP32(x) OSSwapInt32(x)
#define BYTESWAP64(x) OSSwapInt64(x)
#else
#define BYTESWAP32(x) \
    ((x) >> 24 | (x) >> 8 & 0xff00 | ((x) & 0xff00) << 8 | (x) << 24)
#define BYTESWAP64(x) \
    ((x) >> 56 | (x) >> 40 & 0xff00 | \
     (x) >> 24 & 0xff0000 | (x) >> 8 & 0xff000000 | \
     ((x) & 0xff000000) << 8 | ((x) & 0xff0000) << 24 | \
     ((x) & 0xff00) << 40 | (x) << 56)
#endif

/* An mp_digit is either 32 or 64 bits, so pick the matching swap. */
#ifdef MP_USE_UINT_DIGIT
#define BYTESWAP_MP_DIGIT_TO_LE(x) BYTESWAP32(x)
#else
#define BYTESWAP_MP_DIGIT_TO_LE(x) BYTESWAP64(x)
#endif
#endif /* IS_BIG_ENDIAN */

/* kRInv, as an mp_int, is consumed by from_montgomery, which multiplies by
 * it and reduces mod p to leave Montgomery form. The two digit layouts hold
 * the same 256-bit value for 32- and 64-bit mp_digit builds. */
#ifdef MP_USE_UINT_DIGIT
static const mp_digit kRInvDigits[8] = {
    0x80000000, 1, 0xffffffff, 0,
    0x80000001, 0xfffffffe, 1, 0x7fffffff
};
#else
static const mp_digit kRInvDigits[4] = {
    PR_UINT64(0x180000000),  0xffffffff,
    PR_UINT64(0xfffffffe80000001), PR_UINT64(0x7fffffff00000001)
};
#endif
#define MP_DIGITS_IN_256_BITS (32/sizeof(mp_digit))
static const mp_int kRInv = {
    MP_ZPOS,
    MP_DIGITS_IN_256_BITS,
    MP_DIGITS_IN_256_BITS,
    (mp_digit*) kRInvDigits
};

/* Limb radices used when converting between mp_int digits and felem limbs. */
static const limb kTwo28 = 0x10000000;
static const limb kTwo29 = 0x20000000;
  1294 /* to_montgomery sets out = R*in. */
  1295 static mp_err to_montgomery(felem out, const mp_int *in, const ECGroup *group)
  1297     /* There are no MPI functions for bitshift operations and we wish to shift
  1298      * in 257 bits left so we move the digits 256-bits left and then multiply
  1299      * by two.
  1300      */
  1301     mp_int in_shifted;
  1302     int i;
  1303     mp_err res;
  1305     mp_init(&in_shifted);
  1306     s_mp_pad(&in_shifted, MP_USED(in) + MP_DIGITS_IN_256_BITS);
  1307     memcpy(&MP_DIGIT(&in_shifted, MP_DIGITS_IN_256_BITS),
  1308 	   MP_DIGITS(in),
  1309 	   MP_USED(in)*sizeof(mp_digit));
  1310     mp_mul_2(&in_shifted, &in_shifted);
  1311     MP_CHECKOK(group->meth->field_mod(&in_shifted, &in_shifted, group->meth));
  1313     for (i = 0;; i++) {
  1314 	out[i] = MP_DIGIT(&in_shifted, 0) & kBottom29Bits;
  1315 	mp_div_d(&in_shifted, kTwo29, &in_shifted, NULL);
  1317 	i++;
  1318 	if (i == NLIMBS)
  1319 	    break;
  1320 	out[i] = MP_DIGIT(&in_shifted, 0) & kBottom28Bits;
  1321 	mp_div_d(&in_shifted, kTwo28, &in_shifted, NULL);
  1324 CLEANUP:
  1325     mp_clear(&in_shifted);
  1326     return res;
  1329 /* from_montgomery sets out=in/R. */
  1330 static mp_err from_montgomery(mp_int *out, const felem in,
  1331 			      const ECGroup *group)
  1333     mp_int result, tmp;
  1334     mp_err res;
  1335     int i;
  1337     mp_init(&result);
  1338     mp_init(&tmp);
  1340     MP_CHECKOK(mp_add_d(&tmp, in[NLIMBS-1], &result));
  1341     for (i = NLIMBS-2; i >= 0; i--) {
  1342 	if ((i & 1) == 0) {
  1343 	    MP_CHECKOK(mp_mul_d(&result, kTwo29, &tmp));
  1344 	} else {
  1345 	    MP_CHECKOK(mp_mul_d(&result, kTwo28, &tmp));
  1347 	MP_CHECKOK(mp_add_d(&tmp, in[i], &result));
  1350     MP_CHECKOK(mp_mul(&result, &kRInv, out));
  1351     MP_CHECKOK(group->meth->field_mod(out, out, group->meth));
  1353 CLEANUP:
  1354     mp_clear(&result);
  1355     mp_clear(&tmp);
  1356     return res;
  1359 /* scalar_from_mp_int sets out_scalar=n, where n < the group order. */
  1360 static void scalar_from_mp_int(u8 out_scalar[32], const mp_int *n)
  1362     /* We require that |n| is less than the order of the group and therefore it
  1363      * will fit into |out_scalar|. However, these is a timing side-channel here
  1364      * that we cannot avoid: if |n| is sufficiently small it may be one or more
  1365      * words too short and we'll copy less data.
  1366      */
  1367     memset(out_scalar, 0, 32);
  1368 #ifdef IS_LITTLE_ENDIAN
  1369     memcpy(out_scalar, MP_DIGITS(n), MP_USED(n) * sizeof(mp_digit));
  1370 #else
  1372 	mp_size i;
  1373 	mp_digit swapped[MP_DIGITS_IN_256_BITS];
  1374 	for (i = 0; i < MP_USED(n); i++) {
  1375 	    swapped[i] = BYTESWAP_MP_DIGIT_TO_LE(MP_DIGIT(n, i));
  1377 	memcpy(out_scalar, swapped, MP_USED(n) * sizeof(mp_digit));
  1379 #endif
  1382 /* ec_GFp_nistp256_base_point_mul sets {out_x,out_y} = nG, where n is < the
  1383  * order of the group.
  1384  */
  1385 static mp_err ec_GFp_nistp256_base_point_mul(const mp_int *n,
  1386 				             mp_int *out_x, mp_int *out_y,
  1387 				             const ECGroup *group)
  1389     u8 scalar[32];
  1390     felem x, y, z, x_affine, y_affine;
  1391     mp_err res;
  1393     /* FIXME(agl): test that n < order. */
  1395     scalar_from_mp_int(scalar, n);
  1396     scalar_base_mult(x, y, z, scalar);
  1397     point_to_affine(x_affine, y_affine, x, y, z);
  1398     MP_CHECKOK(from_montgomery(out_x, x_affine, group));
  1399     MP_CHECKOK(from_montgomery(out_y, y_affine, group));
  1401 CLEANUP:
  1402     return res;
  1405 /* ec_GFp_nistp256_point_mul sets {out_x,out_y} = n*{in_x,in_y}, where n is <
  1406  * the order of the group.
  1407  */
  1408 static mp_err ec_GFp_nistp256_point_mul(const mp_int *n,
  1409 				        const mp_int *in_x, const mp_int *in_y,
  1410 				        mp_int *out_x, mp_int *out_y,
  1411 				        const ECGroup *group)
  1413     u8 scalar[32];
  1414     felem x, y, z, x_affine, y_affine, px, py;
  1415     mp_err res;
  1417     scalar_from_mp_int(scalar, n);
  1419     MP_CHECKOK(to_montgomery(px, in_x, group));
  1420     MP_CHECKOK(to_montgomery(py, in_y, group));
  1422     scalar_mult(x, y, z, px, py, scalar);
  1423     point_to_affine(x_affine, y_affine, x, y, z);
  1424     MP_CHECKOK(from_montgomery(out_x, x_affine, group));
  1425     MP_CHECKOK(from_montgomery(out_y, y_affine, group));
  1427 CLEANUP:
  1428     return res;
  1431 /* ec_GFp_nistp256_point_mul_vartime sets {out_x,out_y} = n1*G +
  1432  * n2*{in_x,in_y}, where n1 and n2 are < the order of the group.
  1434  * As indicated by the name, this function operates in variable time. This
  1435  * is safe because it's used for signature validation which doesn't deal
  1436  * with secrets.
  1437  */
  1438 static mp_err ec_GFp_nistp256_points_mul_vartime(
  1439 	const mp_int *n1, const mp_int *n2,
  1440 	const mp_int *in_x, const mp_int *in_y,
  1441 	mp_int *out_x, mp_int *out_y,
  1442 	const ECGroup *group)
  1444     u8 scalar1[32], scalar2[32];
  1445     felem x1, y1, z1, x2, y2, z2, x_affine, y_affine, px, py;
  1446     mp_err res = MP_OKAY;
  1448     /* If n2 == NULL, this is just a base-point multiplication. */
  1449     if (n2 == NULL) {
  1450 	return ec_GFp_nistp256_base_point_mul(n1, out_x, out_y, group);
  1453     /* If n1 == nULL, this is just an arbitary-point multiplication. */
  1454     if (n1 == NULL) {
  1455 	return ec_GFp_nistp256_point_mul(n2, in_x, in_y, out_x, out_y, group);
  1458     /* If both scalars are zero, then the result is the point at infinity. */
  1459     if (mp_cmp_z(n1) == 0 && mp_cmp_z(n2) == 0) {
  1460 	mp_zero(out_x);
  1461 	mp_zero(out_y);
  1462 	return res;
  1465     scalar_from_mp_int(scalar1, n1);
  1466     scalar_from_mp_int(scalar2, n2);
  1468     MP_CHECKOK(to_montgomery(px, in_x, group));
  1469     MP_CHECKOK(to_montgomery(py, in_y, group));
  1470     scalar_base_mult(x1, y1, z1, scalar1);
  1471     scalar_mult(x2, y2, z2, px, py, scalar2);
  1473     if (mp_cmp_z(n2) == 0) {
  1474 	/* If n2 == 0, then {x2,y2,z2} is zero and the result is just
  1475 	 * {x1,y1,z1}. */
  1476     } else if (mp_cmp_z(n1) == 0) {
  1477 	/* If n1 == 0, then {x1,y1,z1} is zero and the result is just
  1478 	 * {x2,y2,z2}. */
  1479 	memcpy(x1, x2, sizeof(x2));
  1480 	memcpy(y1, y2, sizeof(y2));
  1481 	memcpy(z1, z2, sizeof(z2));
  1482     } else {
  1483 	/* This function handles the case where {x1,y1,z1} == {x2,y2,z2}. */
  1484 	point_add_or_double_vartime(x1, y1, z1, x1, y1, z1, x2, y2, z2);
  1487     point_to_affine(x_affine, y_affine, x1, y1, z1);
  1488     MP_CHECKOK(from_montgomery(out_x, x_affine, group));
  1489     MP_CHECKOK(from_montgomery(out_y, y_affine, group));
  1491 CLEANUP:
  1492     return res;
  1495 /* Wire in fast point multiplication for named curves. */
  1496 mp_err ec_group_set_gfp256_32(ECGroup *group, ECCurveName name)
  1498     if (name == ECCurve_NIST_P256) {
  1499         group->base_point_mul = &ec_GFp_nistp256_base_point_mul;
  1500         group->point_mul = &ec_GFp_nistp256_point_mul;
  1501         group->points_mul = &ec_GFp_nistp256_points_mul_vartime;
  1503     return MP_OKAY;
