security/nss/lib/freebl/rijndael.c

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/security/nss/lib/freebl/rijndael.c	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1323 @@
     1.4 +/* This Source Code Form is subject to the terms of the Mozilla Public
     1.5 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.6 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.7 +
     1.8 +#ifdef FREEBL_NO_DEPEND
     1.9 +#include "stubs.h"
    1.10 +#endif
    1.11 +
    1.12 +#include "prinit.h"
    1.13 +#include "prerr.h"
    1.14 +#include "secerr.h"
    1.15 +
    1.16 +#include "prtypes.h"
    1.17 +#include "blapi.h"
    1.18 +#include "rijndael.h"
    1.19 +
    1.20 +#include "cts.h"
    1.21 +#include "ctr.h"
    1.22 +#include "gcm.h"
    1.23 +
    1.24 +#ifdef USE_HW_AES
    1.25 +#include "intel-aes.h"
    1.26 +#include "mpi.h"
    1.27 +
    1.28 +static int has_intel_aes = 0;
    1.29 +static PRBool use_hw_aes = PR_FALSE;
    1.30 +
    1.31 +#ifdef INTEL_GCM
    1.32 +#include "intel-gcm.h"
    1.33 +static int has_intel_avx = 0;
    1.34 +static int has_intel_clmul = 0;
    1.35 +static PRBool use_hw_gcm = PR_FALSE;
    1.36 +#endif
    1.37 +#endif  /* USE_HW_AES */
    1.38 +
    1.39 +/*
    1.40 + * There are currently five ways to build this code, varying in performance
    1.41 + * and code size.
    1.42 + *
    1.43 + * RIJNDAEL_INCLUDE_TABLES         Include all tables from rijndael32.tab
    1.44 + * RIJNDAEL_GENERATE_TABLES        Generate tables on first 
    1.45 + *                                 encryption/decryption, then store them;
    1.46 + *                                 use the function gfm
    1.47 + * RIJNDAEL_GENERATE_TABLES_MACRO  Same as above, but use macros to do
    1.48 + *                                 the generation
    1.49 + * RIJNDAEL_GENERATE_VALUES        Do not store tables, generate the table
    1.50 + *                                 values "on-the-fly", using gfm
    1.51 + * RIJNDAEL_GENERATE_VALUES_MACRO  Same as above, but use macros
    1.52 + *
    1.53 + * The default is RIJNDAEL_INCLUDE_TABLES.
    1.54 + */
    1.55 +
    1.56 +/*
    1.57 + * When building RIJNDAEL_INCLUDE_TABLES, includes S**-1, Rcon, T[0..4], 
    1.58 + *                                                 T**-1[0..4], IMXC[0..4]
    1.59 + * When building anything else, includes S, S**-1, Rcon
    1.60 + */
    1.61 +#include "rijndael32.tab"
    1.62 +
    1.63 +#if defined(RIJNDAEL_INCLUDE_TABLES)
    1.64 +/*
    1.65 + * RIJNDAEL_INCLUDE_TABLES
    1.66 + */
    1.67 +#define T0(i)    _T0[i]
    1.68 +#define T1(i)    _T1[i]
    1.69 +#define T2(i)    _T2[i]
    1.70 +#define T3(i)    _T3[i]
    1.71 +#define TInv0(i) _TInv0[i]
    1.72 +#define TInv1(i) _TInv1[i]
    1.73 +#define TInv2(i) _TInv2[i]
    1.74 +#define TInv3(i) _TInv3[i]
    1.75 +#define IMXC0(b) _IMXC0[b]
    1.76 +#define IMXC1(b) _IMXC1[b]
    1.77 +#define IMXC2(b) _IMXC2[b]
    1.78 +#define IMXC3(b) _IMXC3[b]
    1.79 +/* The S-box can be recovered from the T-tables */
    1.80 +#ifdef IS_LITTLE_ENDIAN
    1.81 +#define SBOX(b)    ((PRUint8)_T3[b])
    1.82 +#else
    1.83 +#define SBOX(b)    ((PRUint8)_T1[b])
    1.84 +#endif
    1.85 +#define SINV(b) (_SInv[b])
    1.86 +
    1.87 +#else /* not RIJNDAEL_INCLUDE_TABLES */
    1.88 +
    1.89 +/*
    1.90 + * Code for generating T-table values.
    1.91 + */
    1.92 +
    1.93 +#ifdef IS_LITTLE_ENDIAN
    1.94 +#define WORD4(b0, b1, b2, b3) \
    1.95 +    (((b3) << 24) | ((b2) << 16) | ((b1) << 8) | (b0))
    1.96 +#else
    1.97 +#define WORD4(b0, b1, b2, b3) \
    1.98 +    (((b0) << 24) | ((b1) << 16) | ((b2) << 8) | (b3))
    1.99 +#endif
   1.100 +
   1.101 +/*
   1.102 + * Define the S and S**-1 tables (both have been stored)
   1.103 + */
   1.104 +#define SBOX(b)    (_S[b])
   1.105 +#define SINV(b) (_SInv[b])
   1.106 +
   1.107 +/*
   1.108 + * The function xtime, used for Galois field multiplication
   1.109 + */
   1.110 +#define XTIME(a) \
   1.111 +    ((a & 0x80) ? ((a << 1) ^ 0x1b) : (a << 1))
   1.112 +
   1.113 +/* Choose GFM method (macros or function) */
   1.114 +#if defined(RIJNDAEL_GENERATE_TABLES_MACRO) ||  \
   1.115 +    defined(RIJNDAEL_GENERATE_VALUES_MACRO)
   1.116 +
   1.117 +/*
   1.118 + * Galois field GF(2**8) multipliers, in macro form
   1.119 + */
   1.120 +#define GFM01(a) \
   1.121 +    (a)                                 /* a * 01 = a, the identity */
   1.122 +#define GFM02(a) \
   1.123 +    (XTIME(a) & 0xff)                   /* a * 02 = xtime(a) */
   1.124 +#define GFM04(a) \
   1.125 +    (GFM02(GFM02(a)))                   /* a * 04 = xtime**2(a) */
   1.126 +#define GFM08(a) \
   1.127 +    (GFM02(GFM04(a)))                   /* a * 08 = xtime**3(a) */
   1.128 +#define GFM03(a) \
   1.129 +    (GFM01(a) ^ GFM02(a))               /* a * 03 = a * (01 + 02) */
   1.130 +#define GFM09(a) \
   1.131 +    (GFM01(a) ^ GFM08(a))               /* a * 09 = a * (01 + 08) */
   1.132 +#define GFM0B(a) \
   1.133 +    (GFM01(a) ^ GFM02(a) ^ GFM08(a))    /* a * 0B = a * (01 + 02 + 08) */
   1.134 +#define GFM0D(a) \
   1.135 +    (GFM01(a) ^ GFM04(a) ^ GFM08(a))    /* a * 0D = a * (01 + 04 + 08) */
   1.136 +#define GFM0E(a) \
   1.137 +    (GFM02(a) ^ GFM04(a) ^ GFM08(a))    /* a * 0E = a * (02 + 04 + 08) */
   1.138 +
   1.139 +#else  /* RIJNDAEL_GENERATE_TABLES or RIJNDAEL_GENERATE_VALUES */
   1.140 +
   1.141 +/* GF_MULTIPLY
   1.142 + *
   1.143 + * multiply two bytes represented in GF(2**8), mod (x**4 + 1)
   1.144 + */
   1.145 +PRUint8 gfm(PRUint8 a, PRUint8 b)
   1.146 +{
   1.147 +    PRUint8 res = 0;
   1.148 +    while (b > 0) {
   1.149 +	res = (b & 0x01) ? res ^ a : res;
   1.150 +	a = XTIME(a);
   1.151 +	b >>= 1;
   1.152 +    }
   1.153 +    return res;
   1.154 +}
   1.155 +
   1.156 +#define GFM01(a) \
   1.157 +    (a)                                 /* a * 01 = a, the identity */
   1.158 +#define GFM02(a) \
   1.159 +    (XTIME(a) & 0xff)                   /* a * 02 = xtime(a) */
   1.160 +#define GFM03(a) \
   1.161 +    (gfm(a, 0x03))                      /* a * 03 */
   1.162 +#define GFM09(a) \
   1.163 +    (gfm(a, 0x09))                      /* a * 09 */
   1.164 +#define GFM0B(a) \
   1.165 +    (gfm(a, 0x0B))                      /* a * 0B */
   1.166 +#define GFM0D(a) \
   1.167 +    (gfm(a, 0x0D))                      /* a * 0D */
   1.168 +#define GFM0E(a) \
   1.169 +    (gfm(a, 0x0E))                      /* a * 0E */
   1.170 +
   1.171 +#endif /* choosing GFM function */
   1.172 +
   1.173 +/*
   1.174 + * The T-tables
   1.175 + */
   1.176 +#define G_T0(i) \
   1.177 +    ( WORD4( GFM02(SBOX(i)), GFM01(SBOX(i)), GFM01(SBOX(i)), GFM03(SBOX(i)) ) )
   1.178 +#define G_T1(i) \
   1.179 +    ( WORD4( GFM03(SBOX(i)), GFM02(SBOX(i)), GFM01(SBOX(i)), GFM01(SBOX(i)) ) )
   1.180 +#define G_T2(i) \
   1.181 +    ( WORD4( GFM01(SBOX(i)), GFM03(SBOX(i)), GFM02(SBOX(i)), GFM01(SBOX(i)) ) )
   1.182 +#define G_T3(i) \
   1.183 +    ( WORD4( GFM01(SBOX(i)), GFM01(SBOX(i)), GFM03(SBOX(i)), GFM02(SBOX(i)) ) )
   1.184 +
   1.185 +/*
   1.186 + * The inverse T-tables
   1.187 + */
   1.188 +#define G_TInv0(i) \
   1.189 +    ( WORD4( GFM0E(SINV(i)), GFM09(SINV(i)), GFM0D(SINV(i)), GFM0B(SINV(i)) ) )
   1.190 +#define G_TInv1(i) \
   1.191 +    ( WORD4( GFM0B(SINV(i)), GFM0E(SINV(i)), GFM09(SINV(i)), GFM0D(SINV(i)) ) )
   1.192 +#define G_TInv2(i) \
   1.193 +    ( WORD4( GFM0D(SINV(i)), GFM0B(SINV(i)), GFM0E(SINV(i)), GFM09(SINV(i)) ) )
   1.194 +#define G_TInv3(i) \
   1.195 +    ( WORD4( GFM09(SINV(i)), GFM0D(SINV(i)), GFM0B(SINV(i)), GFM0E(SINV(i)) ) )
   1.196 +
   1.197 +/*
   1.198 + * The inverse mix column tables
   1.199 + */
   1.200 +#define G_IMXC0(i) \
   1.201 +    ( WORD4( GFM0E(i), GFM09(i), GFM0D(i), GFM0B(i) ) )
   1.202 +#define G_IMXC1(i) \
   1.203 +    ( WORD4( GFM0B(i), GFM0E(i), GFM09(i), GFM0D(i) ) )
   1.204 +#define G_IMXC2(i) \
   1.205 +    ( WORD4( GFM0D(i), GFM0B(i), GFM0E(i), GFM09(i) ) )
   1.206 +#define G_IMXC3(i) \
   1.207 +    ( WORD4( GFM09(i), GFM0D(i), GFM0B(i), GFM0E(i) ) )
   1.208 +
   1.209 +/* Now choose the T-table indexing method */
   1.210 +#if defined(RIJNDAEL_GENERATE_VALUES)
   1.211 +/* generate values for the tables with a function*/
   1.212 +static PRUint32 gen_TInvXi(PRUint8 tx, PRUint8 i)
   1.213 +{
   1.214 +    PRUint8 si01, si02, si03, si04, si08, si09, si0B, si0D, si0E;
   1.215 +    si01 = SINV(i);
   1.216 +    si02 = XTIME(si01);
   1.217 +    si04 = XTIME(si02);
   1.218 +    si08 = XTIME(si04);
   1.219 +    si03 = si02 ^ si01;
   1.220 +    si09 = si08 ^ si01;
   1.221 +    si0B = si08 ^ si03;
   1.222 +    si0D = si09 ^ si04;
   1.223 +    si0E = si08 ^ si04 ^ si02;
   1.224 +    switch (tx) {
   1.225 +    case 0:
   1.226 +	return WORD4(si0E, si09, si0D, si0B);
   1.227 +    case 1:
   1.228 +	return WORD4(si0B, si0E, si09, si0D);
   1.229 +    case 2:
   1.230 +	return WORD4(si0D, si0B, si0E, si09);
   1.231 +    case 3:
   1.232 +	return WORD4(si09, si0D, si0B, si0E);
   1.233 +    }
   1.234 +    return -1;
   1.235 +}
   1.236 +#define T0(i)    G_T0(i)
   1.237 +#define T1(i)    G_T1(i)
   1.238 +#define T2(i)    G_T2(i)
   1.239 +#define T3(i)    G_T3(i)
   1.240 +#define TInv0(i) gen_TInvXi(0, i)
   1.241 +#define TInv1(i) gen_TInvXi(1, i)
   1.242 +#define TInv2(i) gen_TInvXi(2, i)
   1.243 +#define TInv3(i) gen_TInvXi(3, i)
   1.244 +#define IMXC0(b) G_IMXC0(b)
   1.245 +#define IMXC1(b) G_IMXC1(b)
   1.246 +#define IMXC2(b) G_IMXC2(b)
   1.247 +#define IMXC3(b) G_IMXC3(b)
   1.248 +#elif defined(RIJNDAEL_GENERATE_VALUES_MACRO)
   1.249 +/* generate values for the tables with macros */
   1.250 +#define T0(i)    G_T0(i)
   1.251 +#define T1(i)    G_T1(i)
   1.252 +#define T2(i)    G_T2(i)
   1.253 +#define T3(i)    G_T3(i)
   1.254 +#define TInv0(i) G_TInv0(i)
   1.255 +#define TInv1(i) G_TInv1(i)
   1.256 +#define TInv2(i) G_TInv2(i)
   1.257 +#define TInv3(i) G_TInv3(i)
   1.258 +#define IMXC0(b) G_IMXC0(b)
   1.259 +#define IMXC1(b) G_IMXC1(b)
   1.260 +#define IMXC2(b) G_IMXC2(b)
   1.261 +#define IMXC3(b) G_IMXC3(b)
   1.262 +#else  /* RIJNDAEL_GENERATE_TABLES or RIJNDAEL_GENERATE_TABLES_MACRO */
   1.263 +/* Generate T and T**-1 table values and store, then index */
   1.264 +/* The inverse mix column tables are still generated */
   1.265 +#define T0(i)    rijndaelTables->T0[i]
   1.266 +#define T1(i)    rijndaelTables->T1[i]
   1.267 +#define T2(i)    rijndaelTables->T2[i]
   1.268 +#define T3(i)    rijndaelTables->T3[i]
   1.269 +#define TInv0(i) rijndaelTables->TInv0[i]
   1.270 +#define TInv1(i) rijndaelTables->TInv1[i]
   1.271 +#define TInv2(i) rijndaelTables->TInv2[i]
   1.272 +#define TInv3(i) rijndaelTables->TInv3[i]
   1.273 +#define IMXC0(b) G_IMXC0(b)
   1.274 +#define IMXC1(b) G_IMXC1(b)
   1.275 +#define IMXC2(b) G_IMXC2(b)
   1.276 +#define IMXC3(b) G_IMXC3(b)
   1.277 +#endif /* choose T-table indexing method */
   1.278 +
   1.279 +#endif /* not RIJNDAEL_INCLUDE_TABLES */
   1.280 +
   1.281 +#if defined(RIJNDAEL_GENERATE_TABLES) ||  \
   1.282 +    defined(RIJNDAEL_GENERATE_TABLES_MACRO)
   1.283 +
   1.284 +/* Code to generate and store the tables */
   1.285 +
   1.286 +struct rijndael_tables_str {
   1.287 +    PRUint32 T0[256];
   1.288 +    PRUint32 T1[256];
   1.289 +    PRUint32 T2[256];
   1.290 +    PRUint32 T3[256];
   1.291 +    PRUint32 TInv0[256];
   1.292 +    PRUint32 TInv1[256];
   1.293 +    PRUint32 TInv2[256];
   1.294 +    PRUint32 TInv3[256];
   1.295 +};
   1.296 +
   1.297 +static struct rijndael_tables_str *rijndaelTables = NULL;
   1.298 +static PRCallOnceType coRTInit = { 0, 0, 0 };
   1.299 +static PRStatus 
   1.300 +init_rijndael_tables(void)
   1.301 +{
   1.302 +    PRUint32 i;
   1.303 +    PRUint8 si01, si02, si03, si04, si08, si09, si0B, si0D, si0E;
   1.304 +    struct rijndael_tables_str *rts;
   1.305 +    rts = (struct rijndael_tables_str *)
   1.306 +                   PORT_Alloc(sizeof(struct rijndael_tables_str));
   1.307 +    if (!rts) return PR_FAILURE;
   1.308 +    for (i=0; i<256; i++) {
   1.309 +	/* The forward values */
   1.310 +	si01 = SBOX(i);
   1.311 +	si02 = XTIME(si01);
   1.312 +	si03 = si02 ^ si01;
   1.313 +	rts->T0[i] = WORD4(si02, si01, si01, si03);
   1.314 +	rts->T1[i] = WORD4(si03, si02, si01, si01);
   1.315 +	rts->T2[i] = WORD4(si01, si03, si02, si01);
   1.316 +	rts->T3[i] = WORD4(si01, si01, si03, si02);
   1.317 +	/* The inverse values */
   1.318 +	si01 = SINV(i);
   1.319 +	si02 = XTIME(si01);
   1.320 +	si04 = XTIME(si02);
   1.321 +	si08 = XTIME(si04);
   1.322 +	si03 = si02 ^ si01;
   1.323 +	si09 = si08 ^ si01;
   1.324 +	si0B = si08 ^ si03;
   1.325 +	si0D = si09 ^ si04;
   1.326 +	si0E = si08 ^ si04 ^ si02;
   1.327 +	rts->TInv0[i] = WORD4(si0E, si09, si0D, si0B);
   1.328 +	rts->TInv1[i] = WORD4(si0B, si0E, si09, si0D);
   1.329 +	rts->TInv2[i] = WORD4(si0D, si0B, si0E, si09);
   1.330 +	rts->TInv3[i] = WORD4(si09, si0D, si0B, si0E);
   1.331 +    }
   1.332 +    /* wait until all the values are in to set */
   1.333 +    rijndaelTables = rts;
   1.334 +    return PR_SUCCESS;
   1.335 +}
   1.336 +
   1.337 +#endif /* code to generate tables */
   1.338 +
   1.339 +/**************************************************************************
   1.340 + *
   1.341 + * Stuff related to the Rijndael key schedule
   1.342 + *
   1.343 + *************************************************************************/
   1.344 +
   1.345 +#define SUBBYTE(w) \
   1.346 +    ((SBOX((w >> 24) & 0xff) << 24) | \
   1.347 +     (SBOX((w >> 16) & 0xff) << 16) | \
   1.348 +     (SBOX((w >>  8) & 0xff) <<  8) | \
   1.349 +     (SBOX((w      ) & 0xff)         ))
   1.350 +
   1.351 +#ifdef IS_LITTLE_ENDIAN
   1.352 +#define ROTBYTE(b) \
   1.353 +    ((b >> 8) | (b << 24))
   1.354 +#else
   1.355 +#define ROTBYTE(b) \
   1.356 +    ((b << 8) | (b >> 24))
   1.357 +#endif
   1.358 +
   1.359 +/* rijndael_key_expansion7
   1.360 + *
   1.361 + * Generate the expanded key from the key input by the user.
   1.362 + * XXX
   1.363 + * Nk == 7 (224 key bits) is a weird case.  Since Nk > 6, an added SubByte
   1.364 + * transformation is done periodically.  The period is every 4 bytes, and
   1.365 + * since 7%4 != 0 this happens at different times for each key word (unlike
   1.366 + * Nk == 8 where it happens twice in every key word, in the same positions).
   1.367 + * For now, I'm implementing this case "dumbly", w/o any unrolling.
   1.368 + */
   1.369 +static SECStatus
   1.370 +rijndael_key_expansion7(AESContext *cx, const unsigned char *key, unsigned int Nk)
   1.371 +{
   1.372 +    unsigned int i;
   1.373 +    PRUint32 *W;
   1.374 +    PRUint32 *pW;
   1.375 +    PRUint32 tmp;
   1.376 +    W = cx->expandedKey;
   1.377 +    /* 1.  the first Nk words contain the cipher key */
   1.378 +    memcpy(W, key, Nk * 4);
   1.379 +    i = Nk;
   1.380 +    /* 2.  loop until full expanded key is obtained */
   1.381 +    pW = W + i - 1;
   1.382 +    for (; i < cx->Nb * (cx->Nr + 1); ++i) {
   1.383 +	tmp = *pW++;
   1.384 +	if (i % Nk == 0)
   1.385 +	    tmp = SUBBYTE(ROTBYTE(tmp)) ^ Rcon[i / Nk - 1];
   1.386 +	else if (i % Nk == 4)
   1.387 +	    tmp = SUBBYTE(tmp);
   1.388 +	*pW = W[i - Nk] ^ tmp;
   1.389 +    }
   1.390 +    return SECSuccess;
   1.391 +}
   1.392 +
   1.393 +/* rijndael_key_expansion
   1.394 + *
   1.395 + * Generate the expanded key from the key input by the user.
   1.396 + */
   1.397 +static SECStatus
   1.398 +rijndael_key_expansion(AESContext *cx, const unsigned char *key, unsigned int Nk)
   1.399 +{
   1.400 +    unsigned int i;
   1.401 +    PRUint32 *W;
   1.402 +    PRUint32 *pW;
   1.403 +    PRUint32 tmp;
   1.404 +    unsigned int round_key_words = cx->Nb * (cx->Nr + 1);
   1.405 +    if (Nk == 7)
   1.406 +	return rijndael_key_expansion7(cx, key, Nk);
   1.407 +    W = cx->expandedKey;
   1.408 +    /* The first Nk words contain the input cipher key */
   1.409 +    memcpy(W, key, Nk * 4);
   1.410 +    i = Nk;
   1.411 +    pW = W + i - 1;
   1.412 +    /* Loop over all sets of Nk words, except the last */
   1.413 +    while (i < round_key_words - Nk) {
   1.414 +	tmp = *pW++;
   1.415 +	tmp = SUBBYTE(ROTBYTE(tmp)) ^ Rcon[i / Nk - 1];
   1.416 +	*pW = W[i++ - Nk] ^ tmp;
   1.417 +	tmp = *pW++; *pW = W[i++ - Nk] ^ tmp;
   1.418 +	tmp = *pW++; *pW = W[i++ - Nk] ^ tmp;
   1.419 +	tmp = *pW++; *pW = W[i++ - Nk] ^ tmp;
   1.420 +	if (Nk == 4)
   1.421 +	    continue;
   1.422 +	switch (Nk) {
   1.423 +	case 8: tmp = *pW++; tmp = SUBBYTE(tmp); *pW = W[i++ - Nk] ^ tmp;
   1.424 +	case 7: tmp = *pW++; *pW = W[i++ - Nk] ^ tmp;
   1.425 +	case 6: tmp = *pW++; *pW = W[i++ - Nk] ^ tmp;
   1.426 +	case 5: tmp = *pW++; *pW = W[i++ - Nk] ^ tmp;
   1.427 +	}
   1.428 +    }
   1.429 +    /* Generate the last word */
   1.430 +    tmp = *pW++;
   1.431 +    tmp = SUBBYTE(ROTBYTE(tmp)) ^ Rcon[i / Nk - 1];
   1.432 +    *pW = W[i++ - Nk] ^ tmp;
   1.433 +    /* There may be overflow here, if Nk % (Nb * (Nr + 1)) > 0.  However,
   1.434 +     * since the above loop generated all but the last Nk key words, there
   1.435 +     * is no more need for the SubByte transformation.
   1.436 +     */
   1.437 +    if (Nk < 8) {
   1.438 +	for (; i < round_key_words; ++i) {
   1.439 +	    tmp = *pW++; 
   1.440 +	    *pW = W[i - Nk] ^ tmp;
   1.441 +	}
   1.442 +    } else {
   1.443 +	/* except in the case when Nk == 8.  Then one more SubByte may have
   1.444 +	 * to be performed, at i % Nk == 4.
   1.445 +	 */
   1.446 +	for (; i < round_key_words; ++i) {
   1.447 +	    tmp = *pW++;
   1.448 +	    if (i % Nk == 4)
   1.449 +		tmp = SUBBYTE(tmp);
   1.450 +	    *pW = W[i - Nk] ^ tmp;
   1.451 +	}
   1.452 +    }
   1.453 +    return SECSuccess;
   1.454 +}
   1.455 +
   1.456 +/* rijndael_invkey_expansion
   1.457 + *
   1.458 + * Generate the expanded key for the inverse cipher from the key input by 
   1.459 + * the user.
   1.460 + */
   1.461 +static SECStatus
   1.462 +rijndael_invkey_expansion(AESContext *cx, const unsigned char *key, unsigned int Nk)
   1.463 +{
   1.464 +    unsigned int r;
   1.465 +    PRUint32 *roundkeyw;
   1.466 +    PRUint8 *b;
   1.467 +    int Nb = cx->Nb;
   1.468 +    /* begins like usual key expansion ... */
   1.469 +    if (rijndael_key_expansion(cx, key, Nk) != SECSuccess)
   1.470 +	return SECFailure;
   1.471 +    /* ... but has the additional step of InvMixColumn,
   1.472 +     * excepting the first and last round keys.
   1.473 +     */
   1.474 +    roundkeyw = cx->expandedKey + cx->Nb;
   1.475 +    for (r=1; r<cx->Nr; ++r) {
   1.476 +	/* each key word, roundkeyw, represents a column in the key
   1.477 +	 * matrix.  Each column is multiplied by the InvMixColumn matrix.
   1.478 +	 *   [ 0E 0B 0D 09 ]   [ b0 ]
   1.479 +	 *   [ 09 0E 0B 0D ] * [ b1 ]
   1.480 +	 *   [ 0D 09 0E 0B ]   [ b2 ]
   1.481 +	 *   [ 0B 0D 09 0E ]   [ b3 ]
   1.482 +	 */
   1.483 +	b = (PRUint8 *)roundkeyw;
   1.484 +	*roundkeyw++ = IMXC0(b[0]) ^ IMXC1(b[1]) ^ IMXC2(b[2]) ^ IMXC3(b[3]);
   1.485 +	b = (PRUint8 *)roundkeyw;
   1.486 +	*roundkeyw++ = IMXC0(b[0]) ^ IMXC1(b[1]) ^ IMXC2(b[2]) ^ IMXC3(b[3]);
   1.487 +	b = (PRUint8 *)roundkeyw;
   1.488 +	*roundkeyw++ = IMXC0(b[0]) ^ IMXC1(b[1]) ^ IMXC2(b[2]) ^ IMXC3(b[3]);
   1.489 +	b = (PRUint8 *)roundkeyw;
   1.490 +	*roundkeyw++ = IMXC0(b[0]) ^ IMXC1(b[1]) ^ IMXC2(b[2]) ^ IMXC3(b[3]);
   1.491 +	if (Nb <= 4)
   1.492 +	    continue;
   1.493 +	switch (Nb) {
   1.494 +	case 8: b = (PRUint8 *)roundkeyw;
   1.495 +	        *roundkeyw++ = IMXC0(b[0]) ^ IMXC1(b[1]) ^ 
   1.496 +	                       IMXC2(b[2]) ^ IMXC3(b[3]);
   1.497 +	case 7: b = (PRUint8 *)roundkeyw;
   1.498 +	        *roundkeyw++ = IMXC0(b[0]) ^ IMXC1(b[1]) ^ 
   1.499 +	                       IMXC2(b[2]) ^ IMXC3(b[3]);
   1.500 +	case 6: b = (PRUint8 *)roundkeyw;
   1.501 +	        *roundkeyw++ = IMXC0(b[0]) ^ IMXC1(b[1]) ^ 
   1.502 +	                       IMXC2(b[2]) ^ IMXC3(b[3]);
   1.503 +	case 5: b = (PRUint8 *)roundkeyw;
   1.504 +	        *roundkeyw++ = IMXC0(b[0]) ^ IMXC1(b[1]) ^ 
   1.505 +	                       IMXC2(b[2]) ^ IMXC3(b[3]);
   1.506 +	}
   1.507 +    }
   1.508 +    return SECSuccess;
   1.509 +}
   1.510 +/**************************************************************************
   1.511 + *
   1.512 + * Stuff related to Rijndael encryption/decryption, optimized for
   1.513 + * a 128-bit blocksize.
   1.514 + *
   1.515 + *************************************************************************/
   1.516 +
   1.517 +#ifdef IS_LITTLE_ENDIAN
   1.518 +#define BYTE0WORD(w) ((w) & 0x000000ff)
   1.519 +#define BYTE1WORD(w) ((w) & 0x0000ff00)
   1.520 +#define BYTE2WORD(w) ((w) & 0x00ff0000)
   1.521 +#define BYTE3WORD(w) ((w) & 0xff000000)
   1.522 +#else
   1.523 +#define BYTE0WORD(w) ((w) & 0xff000000)
   1.524 +#define BYTE1WORD(w) ((w) & 0x00ff0000)
   1.525 +#define BYTE2WORD(w) ((w) & 0x0000ff00)
   1.526 +#define BYTE3WORD(w) ((w) & 0x000000ff)
   1.527 +#endif
   1.528 +
   1.529 +typedef union {
   1.530 +    PRUint32 w[4];
   1.531 +    PRUint8  b[16];
   1.532 +} rijndael_state;
   1.533 +
   1.534 +#define COLUMN_0(state) state.w[0]
   1.535 +#define COLUMN_1(state) state.w[1]
   1.536 +#define COLUMN_2(state) state.w[2]
   1.537 +#define COLUMN_3(state) state.w[3]
   1.538 +
   1.539 +#define STATE_BYTE(i) state.b[i]
   1.540 +
   1.541 +static SECStatus 
   1.542 +rijndael_encryptBlock128(AESContext *cx, 
   1.543 +                         unsigned char *output,
   1.544 +                         const unsigned char *input)
   1.545 +{
   1.546 +    unsigned int r;
   1.547 +    PRUint32 *roundkeyw;
   1.548 +    rijndael_state state;
   1.549 +    PRUint32 C0, C1, C2, C3;
   1.550 +#if defined(NSS_X86_OR_X64)
   1.551 +#define pIn input
   1.552 +#define pOut output
   1.553 +#else
   1.554 +    unsigned char *pIn, *pOut;
   1.555 +    PRUint32 inBuf[4], outBuf[4];
   1.556 +
   1.557 +    if ((ptrdiff_t)input & 0x3) {
   1.558 +	memcpy(inBuf, input, sizeof inBuf);
   1.559 +	pIn = (unsigned char *)inBuf;
   1.560 +    } else {
   1.561 +	pIn = (unsigned char *)input;
   1.562 +    }
   1.563 +    if ((ptrdiff_t)output & 0x3) {
   1.564 +	pOut = (unsigned char *)outBuf;
   1.565 +    } else {
   1.566 +	pOut = (unsigned char *)output;
   1.567 +    }
   1.568 +#endif
   1.569 +    roundkeyw = cx->expandedKey;
   1.570 +    /* Step 1: Add Round Key 0 to initial state */
   1.571 +    COLUMN_0(state) = *((PRUint32 *)(pIn     )) ^ *roundkeyw++;
   1.572 +    COLUMN_1(state) = *((PRUint32 *)(pIn + 4 )) ^ *roundkeyw++;
   1.573 +    COLUMN_2(state) = *((PRUint32 *)(pIn + 8 )) ^ *roundkeyw++;
   1.574 +    COLUMN_3(state) = *((PRUint32 *)(pIn + 12)) ^ *roundkeyw++;
   1.575 +    /* Step 2: Loop over rounds [1..NR-1] */
   1.576 +    for (r=1; r<cx->Nr; ++r) {
   1.577 +        /* Do ShiftRow, ByteSub, and MixColumn all at once */
   1.578 +	C0 = T0(STATE_BYTE(0))  ^
   1.579 +	     T1(STATE_BYTE(5))  ^
   1.580 +	     T2(STATE_BYTE(10)) ^
   1.581 +	     T3(STATE_BYTE(15));
   1.582 +	C1 = T0(STATE_BYTE(4))  ^
   1.583 +	     T1(STATE_BYTE(9))  ^
   1.584 +	     T2(STATE_BYTE(14)) ^
   1.585 +	     T3(STATE_BYTE(3));
   1.586 +	C2 = T0(STATE_BYTE(8))  ^
   1.587 +	     T1(STATE_BYTE(13)) ^
   1.588 +	     T2(STATE_BYTE(2))  ^
   1.589 +	     T3(STATE_BYTE(7));
   1.590 +	C3 = T0(STATE_BYTE(12)) ^
   1.591 +	     T1(STATE_BYTE(1))  ^
   1.592 +	     T2(STATE_BYTE(6))  ^
   1.593 +	     T3(STATE_BYTE(11));
   1.594 +	/* Round key addition */
   1.595 +	COLUMN_0(state) = C0 ^ *roundkeyw++;
   1.596 +	COLUMN_1(state) = C1 ^ *roundkeyw++;
   1.597 +	COLUMN_2(state) = C2 ^ *roundkeyw++;
   1.598 +	COLUMN_3(state) = C3 ^ *roundkeyw++;
   1.599 +    }
   1.600 +    /* Step 3: Do the last round */
   1.601 +    /* Final round does not employ MixColumn */
   1.602 +    C0 = ((BYTE0WORD(T2(STATE_BYTE(0))))   |
   1.603 +          (BYTE1WORD(T3(STATE_BYTE(5))))   |
   1.604 +          (BYTE2WORD(T0(STATE_BYTE(10))))  |
   1.605 +          (BYTE3WORD(T1(STATE_BYTE(15)))))  ^
   1.606 +          *roundkeyw++;
   1.607 +    C1 = ((BYTE0WORD(T2(STATE_BYTE(4))))   |
   1.608 +          (BYTE1WORD(T3(STATE_BYTE(9))))   |
   1.609 +          (BYTE2WORD(T0(STATE_BYTE(14))))  |
   1.610 +          (BYTE3WORD(T1(STATE_BYTE(3)))))   ^
   1.611 +          *roundkeyw++;
   1.612 +    C2 = ((BYTE0WORD(T2(STATE_BYTE(8))))   |
   1.613 +          (BYTE1WORD(T3(STATE_BYTE(13))))  |
   1.614 +          (BYTE2WORD(T0(STATE_BYTE(2))))   |
   1.615 +          (BYTE3WORD(T1(STATE_BYTE(7)))))   ^
   1.616 +          *roundkeyw++;
   1.617 +    C3 = ((BYTE0WORD(T2(STATE_BYTE(12))))  |
   1.618 +          (BYTE1WORD(T3(STATE_BYTE(1))))   |
   1.619 +          (BYTE2WORD(T0(STATE_BYTE(6))))   |
   1.620 +          (BYTE3WORD(T1(STATE_BYTE(11)))))  ^
   1.621 +          *roundkeyw++;
   1.622 +    *((PRUint32 *) pOut     )  = C0;
   1.623 +    *((PRUint32 *)(pOut + 4))  = C1;
   1.624 +    *((PRUint32 *)(pOut + 8))  = C2;
   1.625 +    *((PRUint32 *)(pOut + 12)) = C3;
   1.626 +#if defined(NSS_X86_OR_X64)
   1.627 +#undef pIn
   1.628 +#undef pOut
   1.629 +#else
   1.630 +    if ((ptrdiff_t)output & 0x3) {
   1.631 +	memcpy(output, outBuf, sizeof outBuf);
   1.632 +    }
   1.633 +#endif
   1.634 +    return SECSuccess;
   1.635 +}
   1.636 +
   1.637 +static SECStatus 
   1.638 +rijndael_decryptBlock128(AESContext *cx, 
   1.639 +                         unsigned char *output,
   1.640 +                         const unsigned char *input)
   1.641 +{
   1.642 +    int r;
   1.643 +    PRUint32 *roundkeyw;
   1.644 +    rijndael_state state;
   1.645 +    PRUint32 C0, C1, C2, C3;
   1.646 +#if defined(NSS_X86_OR_X64)
   1.647 +#define pIn input
   1.648 +#define pOut output
   1.649 +#else
   1.650 +    unsigned char *pIn, *pOut;
   1.651 +    PRUint32 inBuf[4], outBuf[4];
   1.652 +
   1.653 +    if ((ptrdiff_t)input & 0x3) {
   1.654 +	memcpy(inBuf, input, sizeof inBuf);
   1.655 +	pIn = (unsigned char *)inBuf;
   1.656 +    } else {
   1.657 +	pIn = (unsigned char *)input;
   1.658 +    }
   1.659 +    if ((ptrdiff_t)output & 0x3) {
   1.660 +	pOut = (unsigned char *)outBuf;
   1.661 +    } else {
   1.662 +	pOut = (unsigned char *)output;
   1.663 +    }
   1.664 +#endif
   1.665 +    roundkeyw = cx->expandedKey + cx->Nb * cx->Nr + 3;
   1.666 +    /* reverse the final key addition */
   1.667 +    COLUMN_3(state) = *((PRUint32 *)(pIn + 12)) ^ *roundkeyw--;
   1.668 +    COLUMN_2(state) = *((PRUint32 *)(pIn +  8)) ^ *roundkeyw--;
   1.669 +    COLUMN_1(state) = *((PRUint32 *)(pIn +  4)) ^ *roundkeyw--;
   1.670 +    COLUMN_0(state) = *((PRUint32 *)(pIn     )) ^ *roundkeyw--;
   1.671 +    /* Loop over rounds in reverse [NR..1] */
   1.672 +    for (r=cx->Nr; r>1; --r) {
   1.673 +	/* Invert the (InvByteSub*InvMixColumn)(InvShiftRow(state)) */
   1.674 +	C0 = TInv0(STATE_BYTE(0))  ^
   1.675 +	     TInv1(STATE_BYTE(13)) ^
   1.676 +	     TInv2(STATE_BYTE(10)) ^
   1.677 +	     TInv3(STATE_BYTE(7));
   1.678 +	C1 = TInv0(STATE_BYTE(4))  ^
   1.679 +	     TInv1(STATE_BYTE(1))  ^
   1.680 +	     TInv2(STATE_BYTE(14)) ^
   1.681 +	     TInv3(STATE_BYTE(11));
   1.682 +	C2 = TInv0(STATE_BYTE(8))  ^
   1.683 +	     TInv1(STATE_BYTE(5))  ^
   1.684 +	     TInv2(STATE_BYTE(2))  ^
   1.685 +	     TInv3(STATE_BYTE(15));
   1.686 +	C3 = TInv0(STATE_BYTE(12)) ^
   1.687 +	     TInv1(STATE_BYTE(9))  ^
   1.688 +	     TInv2(STATE_BYTE(6))  ^
   1.689 +	     TInv3(STATE_BYTE(3));
   1.690 +	/* Invert the key addition step */
   1.691 +	COLUMN_3(state) = C3 ^ *roundkeyw--;
   1.692 +	COLUMN_2(state) = C2 ^ *roundkeyw--;
   1.693 +	COLUMN_1(state) = C1 ^ *roundkeyw--;
   1.694 +	COLUMN_0(state) = C0 ^ *roundkeyw--;
   1.695 +    }
   1.696 +    /* inverse sub */
   1.697 +    pOut[ 0] = SINV(STATE_BYTE( 0));
   1.698 +    pOut[ 1] = SINV(STATE_BYTE(13));
   1.699 +    pOut[ 2] = SINV(STATE_BYTE(10));
   1.700 +    pOut[ 3] = SINV(STATE_BYTE( 7));
   1.701 +    pOut[ 4] = SINV(STATE_BYTE( 4));
   1.702 +    pOut[ 5] = SINV(STATE_BYTE( 1));
   1.703 +    pOut[ 6] = SINV(STATE_BYTE(14));
   1.704 +    pOut[ 7] = SINV(STATE_BYTE(11));
   1.705 +    pOut[ 8] = SINV(STATE_BYTE( 8));
   1.706 +    pOut[ 9] = SINV(STATE_BYTE( 5));
   1.707 +    pOut[10] = SINV(STATE_BYTE( 2));
   1.708 +    pOut[11] = SINV(STATE_BYTE(15));
   1.709 +    pOut[12] = SINV(STATE_BYTE(12));
   1.710 +    pOut[13] = SINV(STATE_BYTE( 9));
   1.711 +    pOut[14] = SINV(STATE_BYTE( 6));
   1.712 +    pOut[15] = SINV(STATE_BYTE( 3));
   1.713 +    /* final key addition */
   1.714 +    *((PRUint32 *)(pOut + 12)) ^= *roundkeyw--;
   1.715 +    *((PRUint32 *)(pOut +  8)) ^= *roundkeyw--;
   1.716 +    *((PRUint32 *)(pOut +  4)) ^= *roundkeyw--;
   1.717 +    *((PRUint32 *) pOut      ) ^= *roundkeyw--;
   1.718 +#if defined(NSS_X86_OR_X64)
   1.719 +#undef pIn
   1.720 +#undef pOut
   1.721 +#else
   1.722 +    if ((ptrdiff_t)output & 0x3) {
   1.723 +	memcpy(output, outBuf, sizeof outBuf);
   1.724 +    }
   1.725 +#endif
   1.726 +    return SECSuccess;
   1.727 +}
   1.728 +
   1.729 +/**************************************************************************
   1.730 + *
   1.731 + * Stuff related to general Rijndael encryption/decryption, for blocksizes
   1.732 + * greater than 128 bits.
   1.733 + *
   1.734 + * XXX This code is currently untested!  So far, AES specs have only been
   1.735 + *     released for 128 bit blocksizes.  This will be tested, but for now
   1.736 + *     only the code above has been tested using known values.
   1.737 + *
   1.738 + *************************************************************************/
   1.739 +
   1.740 +#define COLUMN(array, j) *((PRUint32 *)(array + j))
   1.741 +
   1.742 +SECStatus 
   1.743 +rijndael_encryptBlock(AESContext *cx, 
   1.744 +                      unsigned char *output,
   1.745 +                      const unsigned char *input)
   1.746 +{
   1.747 +    return SECFailure;
   1.748 +#ifdef rijndael_large_blocks_fixed
   1.749 +    unsigned int j, r, Nb;
   1.750 +    unsigned int c2=0, c3=0;
   1.751 +    PRUint32 *roundkeyw;
   1.752 +    PRUint8 clone[RIJNDAEL_MAX_STATE_SIZE];
   1.753 +    Nb = cx->Nb;
   1.754 +    roundkeyw = cx->expandedKey;
   1.755 +    /* Step 1: Add Round Key 0 to initial state */
   1.756 +    for (j=0; j<4*Nb; j+=4) {
   1.757 +	COLUMN(clone, j) = COLUMN(input, j) ^ *roundkeyw++;
   1.758 +    }
   1.759 +    /* Step 2: Loop over rounds [1..NR-1] */
   1.760 +    for (r=1; r<cx->Nr; ++r) {
   1.761 +	for (j=0; j<Nb; ++j) {
   1.762 +	    COLUMN(output, j) = T0(STATE_BYTE(4*  j          )) ^
   1.763 +	                        T1(STATE_BYTE(4*((j+ 1)%Nb)+1)) ^
   1.764 +	                        T2(STATE_BYTE(4*((j+c2)%Nb)+2)) ^
   1.765 +	                        T3(STATE_BYTE(4*((j+c3)%Nb)+3));
   1.766 +	}
   1.767 +	for (j=0; j<4*Nb; j+=4) {
   1.768 +	    COLUMN(clone, j) = COLUMN(output, j) ^ *roundkeyw++;
   1.769 +	}
   1.770 +    }
   1.771 +    /* Step 3: Do the last round */
   1.772 +    /* Final round does not employ MixColumn */
   1.773 +    for (j=0; j<Nb; ++j) {
   1.774 +	COLUMN(output, j) = ((BYTE0WORD(T2(STATE_BYTE(4* j         ))))  |
   1.775 +                             (BYTE1WORD(T3(STATE_BYTE(4*(j+ 1)%Nb)+1)))  |
   1.776 +                             (BYTE2WORD(T0(STATE_BYTE(4*(j+c2)%Nb)+2)))  |
   1.777 +                             (BYTE3WORD(T1(STATE_BYTE(4*(j+c3)%Nb)+3)))) ^
   1.778 +	                     *roundkeyw++;
   1.779 +    }
   1.780 +    return SECSuccess;
   1.781 +#endif
   1.782 +}
   1.783 +
   1.784 +SECStatus  /* Decrypt one block of cx->Nb columns (any block size); currently disabled, see first statement. */
   1.785 +rijndael_decryptBlock(AESContext *cx, 
   1.786 +                      unsigned char *output,
   1.787 +                      const unsigned char *input)
   1.788 +{
   1.789 +    return SECFailure;  /* unconditional: every call fails and nothing below is reachable */
   1.790 +#ifdef rijndael_large_blocks_fixed  /* the body below is only compiled when this macro is defined */
   1.791 +    int j, r, Nb;
   1.792 +    int c2=0, c3=0;  /* presumably the row-2/row-3 shift offsets; never set to anything but 0 here */
   1.793 +    PRUint32 *roundkeyw;
   1.794 +    PRUint8 clone[RIJNDAEL_MAX_STATE_SIZE];
   1.795 +    Nb = cx->Nb;
   1.796 +    roundkeyw = cx->expandedKey + cx->Nb * cx->Nr + 3;  /* start inside the key schedule and walk backwards with roundkeyw-- */
   1.797 +    /* reverse key addition */
   1.798 +    for (j=4*Nb; j>=0; j-=4) {  /* NOTE(review): starts at j == 4*Nb, which looks one column past the block -- confirm before enabling */
   1.799 +	COLUMN(clone, j) = COLUMN(input, j) ^ *roundkeyw--;
   1.800 +    }
   1.801 +    /* Loop over rounds in reverse [NR..1] */
   1.802 +    for (r=cx->Nr; r>1; --r) {
   1.803 +	/* Invert the (InvByteSub*InvMixColumn)(InvShiftRow(state)) */
   1.804 +	for (j=0; j<Nb; ++j) {
   1.805 +	    COLUMN(output, 4*j) = TInv0(STATE_BYTE(4* j            )) ^
   1.806 +	                          TInv1(STATE_BYTE(4*(j+Nb- 1)%Nb)+1) ^
   1.807 +	                          TInv2(STATE_BYTE(4*(j+Nb-c2)%Nb)+2) ^
   1.808 +	                          TInv3(STATE_BYTE(4*(j+Nb-c3)%Nb)+3);
   1.809 +	}
   1.810 +	/* Invert the key addition step */
   1.811 +	for (j=4*Nb; j>=0; j-=4) {
   1.812 +	    COLUMN(clone, j) = COLUMN(output, j) ^ *roundkeyw--;
   1.813 +	}
   1.814 +    }
   1.815 +    /* inverse sub */
   1.816 +    for (j=0; j<4*Nb; ++j) {  /* byte-wise: j counts bytes here, not columns */
   1.817 +	output[j] = SINV(clone[j]);
   1.818 +    }
   1.819 +    /* final key addition */
   1.820 +    for (j=4*Nb; j>=0; j-=4) {
   1.821 +	COLUMN(output, j) ^= *roundkeyw--;
   1.822 +    }
   1.823 +    return SECSuccess;
   1.824 +#endif
   1.825 +}
   1.826 +
   1.827 +/**************************************************************************
   1.828 + *
   1.829 + *  Rijndael modes of operation (ECB and CBC)
   1.830 + *
   1.831 + *************************************************************************/
   1.832 +
   1.833 +static SECStatus  /* ECB worker: encrypts inputLen bytes from input into output, one block per iteration. */
   1.834 +rijndael_encryptECB(AESContext *cx, unsigned char *output,
   1.835 +                    unsigned int *outputLen, unsigned int maxOutputLen,  /* neither is used in this function */
   1.836 +                    const unsigned char *input, unsigned int inputLen, 
   1.837 +                    unsigned int blocksize)
   1.838 +{
   1.839 +    SECStatus rv;
   1.840 +    AESBlockFunc *encryptor;
   1.841 +
   1.842 +    encryptor = (blocksize == RIJNDAEL_MIN_BLOCKSIZE)  /* minimum block size gets the 128-bit routine, anything else the generic one */
   1.843 +				  ? &rijndael_encryptBlock128 
   1.844 +				  : &rijndael_encryptBlock;
   1.845 +    while (inputLen > 0) {  /* relies on inputLen being a multiple of blocksize; otherwise the unsigned subtraction below wraps */
   1.846 +        rv = (*encryptor)(cx, output, input);
   1.847 +	if (rv != SECSuccess)
   1.848 +	    return rv;  /* stop at the first block that fails */
   1.849 +	output += blocksize;
   1.850 +	input += blocksize;
   1.851 +	inputLen -= blocksize;
   1.852 +    }
   1.853 +    return SECSuccess;
   1.854 +}
   1.855 +
   1.856 +static SECStatus  /* CBC worker: XORs each input block with the previous ciphertext block (cx->iv first), then encrypts it. */
   1.857 +rijndael_encryptCBC(AESContext *cx, unsigned char *output,
   1.858 +                    unsigned int *outputLen, unsigned int maxOutputLen,  /* neither is used in this function */
   1.859 +                    const unsigned char *input, unsigned int inputLen, 
   1.860 +                    unsigned int blocksize)
   1.861 +{
   1.862 +    unsigned int j;
   1.863 +    SECStatus rv;
   1.864 +    AESBlockFunc *encryptor;
   1.865 +    unsigned char *lastblock;  /* block to chain with: cx->iv at first, then the most recent output block */
   1.866 +    unsigned char inblock[RIJNDAEL_MAX_STATE_SIZE * 8];  /* scratch for the XORed plaintext block; only the first blocksize bytes are used */
   1.867 +
   1.868 +    if (!inputLen)
   1.869 +	return SECSuccess;  /* nothing to do; cx->iv is left untouched */
   1.870 +    lastblock = cx->iv;
   1.871 +    encryptor = (blocksize == RIJNDAEL_MIN_BLOCKSIZE)  /* minimum block size gets the 128-bit routine, anything else the generic one */
   1.872 +				  ? &rijndael_encryptBlock128 
   1.873 +				  : &rijndael_encryptBlock;
   1.874 +    while (inputLen > 0) {  /* relies on inputLen being a multiple of blocksize */
   1.875 +	/* XOR with the last block (IV if first block) */
   1.876 +	for (j=0; j<blocksize; ++j)
   1.877 +	    inblock[j] = input[j] ^ lastblock[j];
   1.878 +	/* encrypt */
   1.879 +        rv = (*encryptor)(cx, output, inblock);
   1.880 +	if (rv != SECSuccess)
   1.881 +	    return rv;  /* early exit: cx->iv is not updated on failure */
   1.882 +	/* move to the next block */
   1.883 +	lastblock = output;
   1.884 +	output += blocksize;
   1.885 +	input += blocksize;
   1.886 +	inputLen -= blocksize;
   1.887 +    }
   1.888 +    memcpy(cx->iv, lastblock, blocksize);  /* keep the final ciphertext block as the IV for the next call */
   1.889 +    return SECSuccess;
   1.890 +}
   1.891 +
   1.892 +static SECStatus  /* ECB worker: decrypts inputLen bytes from input into output, one block per iteration. */
   1.893 +rijndael_decryptECB(AESContext *cx, unsigned char *output,
   1.894 +                    unsigned int *outputLen, unsigned int maxOutputLen,  /* neither is used in this function */
   1.895 +                    const unsigned char *input, unsigned int inputLen, 
   1.896 +                    unsigned int blocksize)
   1.897 +{
   1.898 +    SECStatus rv;
   1.899 +    AESBlockFunc *decryptor;
   1.900 +
   1.901 +    decryptor = (blocksize == RIJNDAEL_MIN_BLOCKSIZE)  /* minimum block size gets the 128-bit routine, anything else the generic one */
   1.902 +				  ? &rijndael_decryptBlock128 
   1.903 +				  : &rijndael_decryptBlock;
   1.904 +    while (inputLen > 0) {  /* relies on inputLen being a multiple of blocksize; otherwise the unsigned subtraction below wraps */
   1.905 +        rv = (*decryptor)(cx, output, input);
   1.906 +	if (rv != SECSuccess)
   1.907 +	    return rv;  /* stop at the first block that fails */
   1.908 +	output += blocksize;
   1.909 +	input += blocksize;
   1.910 +	inputLen -= blocksize;
   1.911 +    }
   1.912 +    return SECSuccess;
   1.913 +}
   1.914 +
   1.915 +static SECStatus  /* CBC worker: decrypts the blocks from last to first, XORing each with the preceding ciphertext block (cx->iv for the first). */
   1.916 +rijndael_decryptCBC(AESContext *cx, unsigned char *output,
   1.917 +                    unsigned int *outputLen, unsigned int maxOutputLen,  /* neither is used in this function */
   1.918 +                    const unsigned char *input, unsigned int inputLen, 
   1.919 +                    unsigned int blocksize)
   1.920 +{
   1.921 +    SECStatus rv;
   1.922 +    AESBlockFunc *decryptor;
   1.923 +    const unsigned char *in;  /* ciphertext block currently being decrypted */
   1.924 +    unsigned char *out;  /* where the matching plaintext block is written */
   1.925 +    unsigned int j;
   1.926 +    unsigned char newIV[RIJNDAEL_MAX_BLOCKSIZE];  /* copy of the last ciphertext block, taken before any output is written */
   1.927 +
   1.928 +
   1.929 +    if (!inputLen) 
   1.930 +	return SECSuccess;  /* nothing to do; cx->iv is left untouched */
   1.931 +    PORT_Assert(output - input >= 0 || input - output >= (int)inputLen );  /* output must not start below input unless the buffers are disjoint */
   1.932 +    decryptor = (blocksize == RIJNDAEL_MIN_BLOCKSIZE)  /* minimum block size gets the 128-bit routine, anything else the generic one */
   1.933 +                                  ? &rijndael_decryptBlock128 
   1.934 +				  : &rijndael_decryptBlock;
   1.935 +    in  = input  + (inputLen - blocksize);  /* start at the last block */
   1.936 +    memcpy(newIV, in, blocksize);
   1.937 +    out = output + (inputLen - blocksize);
   1.938 +    while (inputLen > blocksize) {  /* every block except the first one in the buffer */
   1.939 +        rv = (*decryptor)(cx, out, in);
   1.940 +	if (rv != SECSuccess)
   1.941 +	    return rv;  /* early exit: cx->iv is not updated on failure */
   1.942 +	for (j=0; j<blocksize; ++j)
   1.943 +	    out[j] ^= in[(int)(j - blocksize)];  /* negative index: byte j of the previous ciphertext block */
   1.944 +	out -= blocksize;
   1.945 +	in -= blocksize;
   1.946 +	inputLen -= blocksize;
   1.947 +    }
   1.948 +    if (in == input) {  /* first block of the buffer: chain with the stored IV instead of a previous block */
   1.949 +        rv = (*decryptor)(cx, out, in);
   1.950 +	if (rv != SECSuccess)
   1.951 +	    return rv;
   1.952 +	for (j=0; j<blocksize; ++j)
   1.953 +	    out[j] ^= cx->iv[j];
   1.954 +    }
   1.955 +    memcpy(cx->iv, newIV, blocksize);  /* the last ciphertext block becomes the IV for the next call */
   1.956 +    return SECSuccess;
   1.957 +}
   1.958 +
   1.959 +/************************************************************************
   1.960 + *
   1.961 + * BLAPI Interface functions
   1.962 + *
   1.963 + * The following functions implement the encryption routines defined in
   1.964 + * BLAPI for the AES cipher, Rijndael.
   1.965 + *
   1.966 + ***********************************************************************/
   1.967 +
   1.968 +AESContext * AES_AllocateContext(void)  /* Allocate an AESContext; the caller still has to initialize it (see AES_InitContext). */
   1.969 +{
   1.970 +    return PORT_ZNew(AESContext);  /* presumably zero-filled, and NULL on allocation failure -- confirm against PORT_ZNew */
   1.971 +}
   1.972 +
   1.973 +
   1.974 +#ifdef INTEL_GCM
   1.975 +/*
   1.976 + * Adapted from the example code in "How to detect New Instruction support in
   1.977 + * the 4th generation Intel Core processor family" by Max Locktyukhin.
   1.978 + *
   1.979 + * XGETBV:
   1.980 + *   Reads an extended control register (XCR) specified by ECX into EDX:EAX.
   1.981 + */
   1.982 +static PRBool  /* Returns true when both bits 1 and 2 of XCR0 are set (xmm and ymm state enabled). */
   1.983 +check_xcr0_ymm(void)
   1.984 +{
   1.985 +    PRUint32 xcr0;  /* low 32 bits of XCR0 (the EAX half of the XGETBV result) */
   1.986 +#if defined(_MSC_VER)
   1.987 +#if defined(_M_IX86)
   1.988 +    __asm {
   1.989 +        mov ecx, 0
   1.990 +        xgetbv
   1.991 +        mov xcr0, eax
   1.992 +    }
   1.993 +#else
   1.994 +    xcr0 = (PRUint32)_xgetbv(0);  /* Requires VS2010 SP1 or later. */
   1.995 +#endif
   1.996 +#else
   1.997 +    __asm__ ("xgetbv" : "=a" (xcr0) : "c" (0) : "%edx");  /* ECX = 0 selects XCR0; EDX is declared clobbered and its value ignored */
   1.998 +#endif
   1.999 +    /* Check if xmm and ymm state are enabled in XCR0. */
  1.1000 +    return (xcr0 & 6) == 6;
  1.1001 +}
  1.1002 +#endif
  1.1003 +
  1.1004 +/*
  1.1005 +** Initialize a new AES context suitable for AES encryption/decryption in
  1.1006 +** the ECB or CBC mode: validate the arguments, pick cx->worker, prepare the key.
  1.1007 +** 	"mode" must be NSS_AES or NSS_AES_CBC (CBC needs a non-NULL iv); returns SECSuccess or SECFailure.
  1.1008 +*/
  1.1009 +static SECStatus   
  1.1010 +aes_InitContext(AESContext *cx, const unsigned char *key, unsigned int keysize, 
  1.1011 +	        const unsigned char *iv, int mode, unsigned int encrypt,
  1.1012 +	        unsigned int blocksize)
  1.1013 +{
  1.1014 +    unsigned int Nk;
  1.1015 +    /* According to Rijndael AES Proposal, section 12.1, block and key
  1.1016 +     * lengths between 128 and 256 bits are supported, as long as the
  1.1017 +     * length in bytes is divisible by 4.
  1.1018 +     */
  1.1019 +    if (key == NULL || 
  1.1020 +        keysize < RIJNDAEL_MIN_BLOCKSIZE   || 
  1.1021 +	keysize > RIJNDAEL_MAX_BLOCKSIZE   || 
  1.1022 +	keysize % 4 != 0 ||
  1.1023 +        blocksize < RIJNDAEL_MIN_BLOCKSIZE || 
  1.1024 +	blocksize > RIJNDAEL_MAX_BLOCKSIZE || 
  1.1025 +	blocksize % 4 != 0) {
  1.1026 +	PORT_SetError(SEC_ERROR_INVALID_ARGS);
  1.1027 +	return SECFailure;
  1.1028 +    }
  1.1029 +    if (mode != NSS_AES && mode != NSS_AES_CBC) {
  1.1030 +	PORT_SetError(SEC_ERROR_INVALID_ARGS);
  1.1031 +	return SECFailure;
  1.1032 +    }
  1.1033 +    if (mode == NSS_AES_CBC && iv == NULL) {
  1.1034 +	PORT_SetError(SEC_ERROR_INVALID_ARGS);
  1.1035 +	return SECFailure;
  1.1036 +    }
  1.1037 +    if (!cx) {
  1.1038 +	PORT_SetError(SEC_ERROR_INVALID_ARGS);
  1.1039 +    	return SECFailure;
  1.1040 +    }
  1.1041 +#ifdef USE_HW_AES
  1.1042 +    if (has_intel_aes == 0) {  /* 0 = not probed yet; afterwards 1 = usable, -1 = absent or disabled */
  1.1043 +	unsigned long eax, ebx, ecx, edx;
  1.1044 +	char *disable_hw_aes = getenv("NSS_DISABLE_HW_AES");  /* merely being set (any value) disables the hardware paths */
  1.1045 +
  1.1046 +	if (disable_hw_aes == NULL) {
  1.1047 +	    freebl_cpuid(1, &eax, &ebx, &ecx, &edx);
  1.1048 +	    has_intel_aes = (ecx & (1 << 25)) != 0 ? 1 : -1;
  1.1049 +#ifdef INTEL_GCM
  1.1050 +	    has_intel_clmul = (ecx & (1 << 1)) != 0 ? 1 : -1;
  1.1051 +	    if ((ecx & (1 << 27)) != 0 && (ecx & (1 << 28)) != 0 &&
  1.1052 +		check_xcr0_ymm()) {
  1.1053 +		has_intel_avx = 1;
  1.1054 +	    } else {
  1.1055 +		has_intel_avx = -1;
  1.1056 +	    }
  1.1057 +#endif
  1.1058 +	} else {
  1.1059 +	    has_intel_aes = -1;
  1.1060 +#ifdef INTEL_GCM
  1.1061 +	    has_intel_avx = -1;
  1.1062 +	    has_intel_clmul = -1;
  1.1063 +#endif
  1.1064 +	}
  1.1065 +    }
  1.1066 +    use_hw_aes = (PRBool)  /* NOTE(review): file-level static rewritten on every call without locking -- confirm callers are serialized */
  1.1067 +		(has_intel_aes > 0 && (keysize % 8) == 0 && blocksize == 16);
  1.1068 +#ifdef INTEL_GCM
  1.1069 +    use_hw_gcm = (PRBool)
  1.1070 +		(use_hw_aes && has_intel_avx>0 && has_intel_clmul>0);
  1.1071 +#endif
  1.1072 +#endif  /* USE_HW_AES */
  1.1073 +    /* Nb = (block size in bits) / 32 */
  1.1074 +    cx->Nb = blocksize / 4;
  1.1075 +    /* Nk = (key size in bits) / 32 */
  1.1076 +    Nk = keysize / 4;
  1.1077 +    /* Obtain number of rounds from "table" */
  1.1078 +    cx->Nr = RIJNDAEL_NUM_ROUNDS(Nk, cx->Nb);
  1.1079 +    /* copy in the iv, if necessary */
  1.1080 +    if (mode == NSS_AES_CBC) {
  1.1081 +	memcpy(cx->iv, iv, blocksize);
  1.1082 +#ifdef USE_HW_AES
  1.1083 +	if (use_hw_aes) {
  1.1084 +	    cx->worker = (freeblCipherFunc)
  1.1085 +				intel_aes_cbc_worker(encrypt, keysize);
  1.1086 +	} else
  1.1087 +#endif
  1.1088 +	{
  1.1089 +	    cx->worker = (freeblCipherFunc) (encrypt
  1.1090 +			  ? &rijndael_encryptCBC : &rijndael_decryptCBC);
  1.1091 +	}
  1.1092 +    } else {
  1.1093 +#ifdef  USE_HW_AES
  1.1094 +	if (use_hw_aes) {
  1.1095 +	    cx->worker = (freeblCipherFunc) 
  1.1096 +				intel_aes_ecb_worker(encrypt, keysize);
  1.1097 +	} else
  1.1098 +#endif
  1.1099 +	{
  1.1100 +	    cx->worker = (freeblCipherFunc) (encrypt
  1.1101 +			  ? &rijndael_encryptECB : &rijndael_decryptECB);
  1.1102 +	}
  1.1103 +    }
  1.1104 +    PORT_Assert((cx->Nb * (cx->Nr + 1)) <= RIJNDAEL_MAX_EXP_KEY_SIZE);
  1.1105 +    if ((cx->Nb * (cx->Nr + 1)) > RIJNDAEL_MAX_EXP_KEY_SIZE) {  /* same condition as the assert, enforced when asserts are compiled out */
  1.1106 +	PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
  1.1107 +	goto cleanup;
  1.1108 +    }
  1.1109 +#ifdef USE_HW_AES
  1.1110 +    if (use_hw_aes) {
  1.1111 +	intel_aes_init(encrypt, keysize);
  1.1112 +    } else
  1.1113 +#endif
  1.1114 +    {
  1.1115 +
  1.1116 +#if defined(RIJNDAEL_GENERATE_TABLES) ||  \
  1.1117 +	defined(RIJNDAEL_GENERATE_TABLES_MACRO)
  1.1118 +	if (rijndaelTables == NULL) {
  1.1119 +	    if (PR_CallOnce(&coRTInit, init_rijndael_tables)
  1.1120 +	      != PR_SUCCESS) {
  1.1121 +		return SECFailure;
  1.1122 +	    }
  1.1123 +	}
  1.1124 +#endif
  1.1125 +	/* Generate expanded key */
  1.1126 +	if (encrypt) {
  1.1127 +	    if (rijndael_key_expansion(cx, key, Nk) != SECSuccess)
  1.1128 +		goto cleanup;
  1.1129 +	} else {
  1.1130 +	    if (rijndael_invkey_expansion(cx, key, Nk) != SECSuccess)
  1.1131 +		goto cleanup;
  1.1132 +	}
  1.1133 +    }
  1.1134 +    cx->worker_cx = cx;  /* plain ECB/CBC workers receive the AES context itself */
  1.1135 +    cx->destroy = NULL;
  1.1136 +    cx->isBlock = PR_TRUE;  /* AES_Encrypt/AES_Decrypt then require whole blocks */
  1.1137 +    return SECSuccess;
  1.1138 +cleanup:  /* nothing is released here; the label only funnels the failure return */
  1.1139 +    return SECFailure;
  1.1140 +}
  1.1141 +
  1.1142 +SECStatus   /* Initialize cx for "mode": set up the base ECB/CBC cipher, then wrap it for CTS, GCM or CTR. Returns SECSuccess or SECFailure. */
  1.1143 +AES_InitContext(AESContext *cx, const unsigned char *key, unsigned int keysize, 
  1.1144 +	        const unsigned char *iv, int mode, unsigned int encrypt,
  1.1145 +	        unsigned int blocksize)
  1.1146 +{
  1.1147 +    int basemode = mode;  /* mode handed to aes_InitContext, which only accepts NSS_AES and NSS_AES_CBC */
  1.1148 +    PRBool baseencrypt = encrypt;
  1.1149 +    SECStatus rv;
  1.1150 +    if (!cx) { PORT_SetError(SEC_ERROR_INVALID_ARGS); return SECFailure; }  /* reject NULL before cx is dereferenced below */
  1.1151 +    switch (mode) {
  1.1152 +    case NSS_AES_CTS:
  1.1153 +	basemode = NSS_AES_CBC;
  1.1154 +	break;
  1.1155 +    case NSS_AES_GCM:
  1.1156 +    case NSS_AES_CTR:
  1.1157 +	basemode = NSS_AES;
  1.1158 +	baseencrypt = PR_TRUE;  /* these modes drive the base cipher in the encrypt direction regardless of "encrypt" */
  1.1159 +	break;
  1.1160 +    }
  1.1161 +    /* make sure enough is initialized so we can safely call Destroy */
  1.1162 +    cx->worker_cx = NULL;
  1.1163 +    cx->destroy = NULL;
  1.1164 +    rv = aes_InitContext(cx, key, keysize, iv, basemode, 
  1.1165 +					baseencrypt, blocksize);
  1.1166 +    if (rv != SECSuccess) {
  1.1167 +	AES_DestroyContext(cx, PR_FALSE);  /* PR_FALSE: the context memory stays with the caller */
  1.1168 +	return rv;
  1.1169 +    }
  1.1170 +
  1.1171 +    /* finally, set up any mode specific contexts */
  1.1172 +    switch (mode) {
  1.1173 +    case NSS_AES_CTS:
  1.1174 +	cx->worker_cx = CTS_CreateContext(cx, cx->worker, iv, blocksize);
  1.1175 +	cx->worker = (freeblCipherFunc) 
  1.1176 +			(encrypt ?  CTS_EncryptUpdate : CTS_DecryptUpdate);
  1.1177 +	cx->destroy = (freeblDestroyFunc) CTS_DestroyContext;
  1.1178 +	cx->isBlock = PR_FALSE;
  1.1179 +	break;
  1.1180 +    case NSS_AES_GCM:
  1.1181 +#ifdef INTEL_GCM
  1.1182 +	if(use_hw_gcm) {
  1.1183 +        	cx->worker_cx = intel_AES_GCM_CreateContext(cx, cx->worker, iv, blocksize);
  1.1184 +		cx->worker = (freeblCipherFunc)
  1.1185 +			(encrypt ? intel_AES_GCM_EncryptUpdate : intel_AES_GCM_DecryptUpdate);
  1.1186 +		cx->destroy = (freeblDestroyFunc) intel_AES_GCM_DestroyContext;
  1.1187 +		cx->isBlock = PR_FALSE;
  1.1188 +    	} else
  1.1189 +#endif
  1.1190 +	{
  1.1191 +	cx->worker_cx = GCM_CreateContext(cx, cx->worker, iv, blocksize);
  1.1192 +	cx->worker = (freeblCipherFunc)
  1.1193 +			(encrypt ? GCM_EncryptUpdate : GCM_DecryptUpdate);
  1.1194 +	cx->destroy = (freeblDestroyFunc) GCM_DestroyContext;
  1.1195 +	cx->isBlock = PR_FALSE;
  1.1196 +	}
  1.1197 +	break;
  1.1198 +    case NSS_AES_CTR:
  1.1199 +	cx->worker_cx = CTR_CreateContext(cx, cx->worker, iv, blocksize);
  1.1200 +#if defined(USE_HW_AES) && defined(_MSC_VER)
  1.1201 +	if (use_hw_aes) {
  1.1202 +	    cx->worker = (freeblCipherFunc) CTR_Update_HW_AES;
  1.1203 +	} else
  1.1204 +#endif
  1.1205 +	{
  1.1206 +	    cx->worker = (freeblCipherFunc) CTR_Update;  /* the same update routine serves both directions */
  1.1207 +	}
  1.1208 +	cx->destroy = (freeblDestroyFunc) CTR_DestroyContext;
  1.1209 +	cx->isBlock = PR_FALSE;
  1.1210 +	break;
  1.1211 +    default:
  1.1212 +	/* everything has already been set up by aes_InitContext, just
  1.1213 +	 * return */
  1.1214 +	return SECSuccess;
  1.1215 +    }
  1.1216 +    /* check to see if we succeeded in getting the worker context */
  1.1217 +    if (cx->worker_cx == NULL) {
  1.1218 +	/* no, just destroy the existing context */
  1.1219 +	cx->destroy = NULL; /* paranoia, though you can see a dozen lines */
  1.1220 +			    /* below that this isn't necessary */
  1.1221 +	AES_DestroyContext(cx, PR_FALSE);
  1.1222 +	return SECFailure;
  1.1223 +    }
  1.1224 +    return SECSuccess;
  1.1225 +}
  1.1226 +
  1.1227 +/* AES_CreateContext
  1.1228 + *
  1.1229 + * Allocate a context for Rijndael operations and initialize it with AES_InitContext; returns NULL if either step fails.
  1.1230 + */
  1.1231 +AESContext *
  1.1232 +AES_CreateContext(const unsigned char *key, const unsigned char *iv, 
  1.1233 +                  int mode, int encrypt,
  1.1234 +                  unsigned int keysize, unsigned int blocksize)
  1.1235 +{
  1.1236 +    AESContext *cx = AES_AllocateContext();
  1.1237 +    if (cx) {
  1.1238 +	SECStatus rv = AES_InitContext(cx, key, keysize, iv, mode, encrypt,  /* note the argument order differs from this function's own */
  1.1239 +				       blocksize);
  1.1240 +	if (rv != SECSuccess) {
  1.1241 +	    AES_DestroyContext(cx, PR_TRUE);  /* PR_TRUE: also frees the context allocated above */
  1.1242 +	    cx = NULL;
  1.1243 +	}
  1.1244 +    }
  1.1245 +    return cx;
  1.1246 +}
  1.1247 +
  1.1248 +/*
  1.1249 + * AES_DestroyContext
  1.1250 + * 
  1.1251 + * Destroy any mode-specific worker context, then zero the AES cipher context
  1.1252 + * (expanded key and IV included).  If freeit is true, also free the context.
  1.1253 + */
  1.1254 +void 
  1.1255 +AES_DestroyContext(AESContext *cx, PRBool freeit)
  1.1256 +{
  1.1257 +    if (cx->worker_cx && cx->destroy) {  /* destroy is only set when a CTS/GCM/CTR worker context was created */
  1.1258 +	(*cx->destroy)(cx->worker_cx, PR_TRUE);
  1.1259 +    }
  1.1260 +    /* Wipe expandedKey and iv; this also resets worker_cx and destroy to NULL. */
  1.1261 +    memset(cx, 0, sizeof(*cx));
  1.1262 +    if (freeit)
  1.1263 +	PORT_Free(cx);
  1.1264 +}
  1.1265 +
  1.1266 +/*
  1.1267 + * AES_Encrypt
  1.1268 + *
  1.1269 + * Encrypt an arbitrary-length buffer (a multiple of the block size when cx->isBlock is set).
  1.1270 + * The output buffer must already be allocated to at least inputLen; the work itself is done by cx->worker.
  1.1271 + */
  1.1272 +SECStatus 
  1.1273 +AES_Encrypt(AESContext *cx, unsigned char *output,
  1.1274 +            unsigned int *outputLen, unsigned int maxOutputLen,
  1.1275 +            const unsigned char *input, unsigned int inputLen)
  1.1276 +{
  1.1277 +    int blocksize;
  1.1278 +    /* Check args */
  1.1279 +    if (cx == NULL || output == NULL || (input == NULL && inputLen != 0)) {  /* NOTE(review): outputLen is not checked but is written below */
  1.1280 +	PORT_SetError(SEC_ERROR_INVALID_ARGS);
  1.1281 +	return SECFailure;
  1.1282 +    }
  1.1283 +    blocksize = 4 * cx->Nb;  /* Nb counts 4-byte columns, so this is the block size in bytes */
  1.1284 +    if (cx->isBlock && (inputLen % blocksize != 0)) {
  1.1285 +	PORT_SetError(SEC_ERROR_INPUT_LEN);
  1.1286 +	return SECFailure;
  1.1287 +    }
  1.1288 +    if (maxOutputLen < inputLen) {
  1.1289 +	PORT_SetError(SEC_ERROR_OUTPUT_LEN);
  1.1290 +	return SECFailure;
  1.1291 +    }
  1.1292 +    *outputLen = inputLen;  /* provisional value; the worker also receives outputLen */
  1.1293 +    return (*cx->worker)(cx->worker_cx, output, outputLen, maxOutputLen,	
  1.1294 +                             input, inputLen, blocksize);
  1.1295 +}
  1.1296 +
  1.1297 +/*
  1.1298 + * AES_Decrypt
  1.1299 + *
  1.1300 + * Decrypt an arbitrary-length buffer (a multiple of the block size when cx->isBlock is set).
  1.1301 + * The output buffer must already be allocated to at least inputLen; the work itself is done by cx->worker.
  1.1302 + */
  1.1303 +SECStatus 
  1.1304 +AES_Decrypt(AESContext *cx, unsigned char *output,
  1.1305 +            unsigned int *outputLen, unsigned int maxOutputLen,
  1.1306 +            const unsigned char *input, unsigned int inputLen)
  1.1307 +{
  1.1308 +    int blocksize;
  1.1309 +    /* Check args */
  1.1310 +    if (cx == NULL || output == NULL || (input == NULL && inputLen != 0)) {  /* NOTE(review): outputLen is not checked but is written below */
  1.1311 +	PORT_SetError(SEC_ERROR_INVALID_ARGS);
  1.1312 +	return SECFailure;
  1.1313 +    }
  1.1314 +    blocksize = 4 * cx->Nb;  /* Nb counts 4-byte columns, so this is the block size in bytes */
  1.1315 +    if (cx->isBlock && (inputLen % blocksize != 0)) {
  1.1316 +	PORT_SetError(SEC_ERROR_INPUT_LEN);
  1.1317 +	return SECFailure;
  1.1318 +    }
  1.1319 +    if (maxOutputLen < inputLen) {
  1.1320 +	PORT_SetError(SEC_ERROR_OUTPUT_LEN);
  1.1321 +	return SECFailure;
  1.1322 +    }
  1.1323 +    *outputLen = inputLen;  /* provisional value; the worker also receives outputLen */
  1.1324 +    return (*cx->worker)(cx->worker_cx, output, outputLen, maxOutputLen,	
  1.1325 +                             input, inputLen, blocksize);  /* same call as AES_Encrypt: the direction is fixed by the worker chosen at init */
  1.1326 +}

mercurial