security/nss/lib/freebl/mpi/mpcpucache.c

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/security/nss/lib/freebl/mpi/mpcpucache.c	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,813 @@
     1.4 +/* This Source Code Form is subject to the terms of the Mozilla Public
     1.5 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.6 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.7 +
     1.8 +#include "mpi.h"
     1.9 +
    1.10 +/*
    1.11 + * This file implements a single function: s_mpi_getProcessorLineSize();
    1.12 + * s_mpi_getProcessorLineSize() returns the size in bytes of the cache line
    1.13 + * if a cache exists, or zero if there is no cache. If more than one
    1.14 + * cache level exists, it should return the smallest line size (which is
    1.15 + * usually that of the L1 cache).
    1.16 + *
    1.17 + * mp_modexp uses this information to make sure that private key information
    1.18 + * isn't being leaked through the cache.
    1.19 + *
    1.20 + * Currently the file returns good data for most modern x86 processors, and
    1.21 + * reasonable data on 64-bit ppc processors. All other processors are assumed
    1.22 + * to have a cache line size of 32 bytes unless modified by target.mk.
    1.23 + * 
    1.24 + */
    1.25 +
    1.26 +#if defined(i386) || defined(__i386) || defined(__X86__) || defined (_M_IX86) || defined(__x86_64__) || defined(__x86_64) || defined(_M_AMD64)
    1.27 +/* X86 processors have special instructions that tell us about the cache */
    1.28 +#include "string.h"
    1.29 +
    1.30 +#if defined(__x86_64__) || defined(__x86_64) || defined(_M_AMD64)
    1.31 +#define AMD_64 1
    1.32 +#endif
    1.33 +
    1.34 +/* Generic CPUID function */
    1.35 +#if defined(AMD_64)
    1.36 +
    1.37 +#if defined(__GNUC__)
    1.38 +
    1.39 +void freebl_cpuid(unsigned long op, unsigned long *eax, 
    1.40 +	                 unsigned long *ebx, unsigned long *ecx, 
    1.41 +                         unsigned long *edx)
    1.42 +{
    1.43 +	__asm__("cpuid\n\t"
    1.44 +		: "=a" (*eax),
    1.45 +		  "=b" (*ebx),
    1.46 +		  "=c" (*ecx),
    1.47 +		  "=d" (*edx)
    1.48 +		: "0" (op));
    1.49 +}
    1.50 +
    1.51 +#elif defined(_MSC_VER)
    1.52 +
    1.53 +#include <intrin.h>
    1.54 +
    1.55 +void freebl_cpuid(unsigned long op, unsigned long *eax, 
    1.56 +           unsigned long *ebx, unsigned long *ecx, 
    1.57 +           unsigned long *edx)
    1.58 +{
    1.59 +    int intrinsic_out[4];
    1.60 +
    1.61 +    __cpuid(intrinsic_out, op);
    1.62 +    *eax = intrinsic_out[0];
    1.63 +    *ebx = intrinsic_out[1];
    1.64 +    *ecx = intrinsic_out[2];
    1.65 +    *edx = intrinsic_out[3];
    1.66 +}
    1.67 +
    1.68 +#endif
    1.69 +
    1.70 +#else /* !defined(AMD_64) */
    1.71 +
    1.72 +/* x86 */
    1.73 +
    1.74 +#if defined(__GNUC__)
    1.75 +void freebl_cpuid(unsigned long op, unsigned long *eax, 
    1.76 +	                 unsigned long *ebx, unsigned long *ecx, 
    1.77 +                         unsigned long *edx)
    1.78 +{
    1.79 +/* sigh GCC isn't smart enough to save the ebx PIC register on it's own
    1.80 + * in this case, so do it by hand. Use edi to store ebx and pass the
    1.81 + * value returned in ebx from cpuid through edi. */
    1.82 +	__asm__("mov %%ebx,%%edi\n\t"
    1.83 +		  "cpuid\n\t"
    1.84 +		  "xchgl %%ebx,%%edi\n\t"
    1.85 +		: "=a" (*eax),
    1.86 +		  "=D" (*ebx),
    1.87 +		  "=c" (*ecx),
    1.88 +		  "=d" (*edx)
    1.89 +		: "0" (op));
    1.90 +}
    1.91 +
    1.92 +/*
    1.93 + * try flipping a processor flag to determine CPU type
    1.94 + */
    1.95 +static unsigned long changeFlag(unsigned long flag)
    1.96 +{
    1.97 +	unsigned long changedFlags, originalFlags;
    1.98 +	__asm__("pushfl\n\t"            /* get the flags */
    1.99 +	        "popl %0\n\t"
   1.100 +	        "movl %0,%1\n\t"	/* save the original flags */
   1.101 +	        "xorl %2,%0\n\t" 	/* flip the bit */
   1.102 +		"pushl %0\n\t"  	/* set the flags */
   1.103 +	        "popfl\n\t"
   1.104 +		"pushfl\n\t"		/* get the flags again (for return) */
   1.105 +		"popl %0\n\t"
   1.106 +		"pushl %1\n\t"		/* restore the original flags */
   1.107 +		 "popfl\n\t"
   1.108 +		: "=r" (changedFlags),
   1.109 +		  "=r" (originalFlags),
   1.110 +		  "=r" (flag)
   1.111 +		: "2" (flag));
   1.112 +	return changedFlags ^ originalFlags;
   1.113 +}
   1.114 +
   1.115 +#elif defined(_MSC_VER)
   1.116 +
   1.117 +/*
   1.118 + * windows versions of the above assembler
   1.119 + */
   1.120 +#define wcpuid __asm __emit 0fh __asm __emit 0a2h
   1.121 +void freebl_cpuid(unsigned long op,    unsigned long *Reax, 
   1.122 +    unsigned long *Rebx, unsigned long *Recx, unsigned long *Redx)
   1.123 +{
   1.124 +        unsigned long  Leax, Lebx, Lecx, Ledx;
   1.125 +        __asm {
   1.126 +        pushad
   1.127 +        mov     eax,op
   1.128 +        wcpuid
   1.129 +        mov     Leax,eax
   1.130 +        mov     Lebx,ebx
   1.131 +        mov     Lecx,ecx
   1.132 +        mov     Ledx,edx
   1.133 +        popad
   1.134 +        }
   1.135 +        *Reax = Leax;
   1.136 +        *Rebx = Lebx;
   1.137 +        *Recx = Lecx;
   1.138 +        *Redx = Ledx;
   1.139 +}
   1.140 +
   1.141 +static unsigned long changeFlag(unsigned long flag)
   1.142 +{
   1.143 +	unsigned long changedFlags, originalFlags;
   1.144 +	__asm {
   1.145 +		push eax
   1.146 +		push ebx
   1.147 +		pushfd 	                /* get the flags */
   1.148 +	        pop  eax
   1.149 +		push eax		/* save the flags on the stack */
   1.150 +	        mov  originalFlags,eax  /* save the original flags */
   1.151 +		mov  ebx,flag
   1.152 +	        xor  eax,ebx            /* flip the bit */
   1.153 +		push eax                /* set the flags */
   1.154 +	        popfd
   1.155 +		pushfd                  /* get the flags again (for return) */
   1.156 +		pop  eax	
   1.157 +		popfd                   /* restore the original flags */
   1.158 +		mov changedFlags,eax
   1.159 +		pop ebx
   1.160 +		pop eax
   1.161 +	}
   1.162 +	return changedFlags ^ originalFlags;
   1.163 +}
   1.164 +#endif
   1.165 +
   1.166 +#endif
   1.167 +
   1.168 +#if !defined(AMD_64)
   1.169 +#define AC_FLAG 0x40000
   1.170 +#define ID_FLAG 0x200000
   1.171 +
   1.172 +/* 386 processors can't flip the AC_FLAG, intel AP Note AP-485 */
   1.173 +static int is386()
   1.174 +{
   1.175 +    return changeFlag(AC_FLAG) == 0;
   1.176 +}
   1.177 +
   1.178 +/* 486 processors can't flip the ID_FLAG, intel AP Note AP-485 */
   1.179 +static int is486()
   1.180 +{
   1.181 +    return changeFlag(ID_FLAG) == 0;
   1.182 +}
   1.183 +#endif
   1.184 +
   1.185 +
   1.186 +/*
   1.187 + * table for Intel Cache.
   1.188 + * See Intel Application Note AP-485 for more information 
   1.189 + */
   1.190 +
   1.191 +typedef unsigned char CacheTypeEntry;
   1.192 +
   1.193 +typedef enum {
   1.194 +    Cache_NONE    = 0,
   1.195 +    Cache_UNKNOWN = 1,
   1.196 +    Cache_TLB     = 2,
   1.197 +    Cache_TLBi    = 3,
   1.198 +    Cache_TLBd    = 4,
   1.199 +    Cache_Trace   = 5,
   1.200 +    Cache_L1      = 6,
   1.201 +    Cache_L1i     = 7,
   1.202 +    Cache_L1d     = 8,
   1.203 +    Cache_L2      = 9 ,
   1.204 +    Cache_L2i     = 10 ,
   1.205 +    Cache_L2d     = 11 ,
   1.206 +    Cache_L3      = 12 ,
   1.207 +    Cache_L3i     = 13,
   1.208 +    Cache_L3d     = 14
   1.209 +} CacheType;
   1.210 +
   1.211 +struct _cache {
   1.212 +    CacheTypeEntry type;
   1.213 +    unsigned char lineSize;
   1.214 +};
   1.215 +static const struct _cache CacheMap[256] = {
   1.216 +/* 00 */ {Cache_NONE,    0   },
   1.217 +/* 01 */ {Cache_TLBi,    0   },
   1.218 +/* 02 */ {Cache_TLBi,    0   },
   1.219 +/* 03 */ {Cache_TLBd,    0   },
   1.220 +/* 04 */ {Cache_TLBd,        },
   1.221 +/* 05 */ {Cache_UNKNOWN, 0   },
   1.222 +/* 06 */ {Cache_L1i,     32  },
   1.223 +/* 07 */ {Cache_UNKNOWN, 0   },
   1.224 +/* 08 */ {Cache_L1i,     32  },
   1.225 +/* 09 */ {Cache_UNKNOWN, 0   },
   1.226 +/* 0a */ {Cache_L1d,     32  },
   1.227 +/* 0b */ {Cache_UNKNOWN, 0   },
   1.228 +/* 0c */ {Cache_L1d,     32  },
   1.229 +/* 0d */ {Cache_UNKNOWN, 0   },
   1.230 +/* 0e */ {Cache_UNKNOWN, 0   },
   1.231 +/* 0f */ {Cache_UNKNOWN, 0   },
   1.232 +/* 10 */ {Cache_UNKNOWN, 0   },
   1.233 +/* 11 */ {Cache_UNKNOWN, 0   },
   1.234 +/* 12 */ {Cache_UNKNOWN, 0   },
   1.235 +/* 13 */ {Cache_UNKNOWN, 0   },
   1.236 +/* 14 */ {Cache_UNKNOWN, 0   },
   1.237 +/* 15 */ {Cache_UNKNOWN, 0   },
   1.238 +/* 16 */ {Cache_UNKNOWN, 0   },
   1.239 +/* 17 */ {Cache_UNKNOWN, 0   },
   1.240 +/* 18 */ {Cache_UNKNOWN, 0   },
   1.241 +/* 19 */ {Cache_UNKNOWN, 0   },
   1.242 +/* 1a */ {Cache_UNKNOWN, 0   },
   1.243 +/* 1b */ {Cache_UNKNOWN, 0   },
   1.244 +/* 1c */ {Cache_UNKNOWN, 0   },
   1.245 +/* 1d */ {Cache_UNKNOWN, 0   },
   1.246 +/* 1e */ {Cache_UNKNOWN, 0   },
   1.247 +/* 1f */ {Cache_UNKNOWN, 0   },
   1.248 +/* 20 */ {Cache_UNKNOWN, 0   },
   1.249 +/* 21 */ {Cache_UNKNOWN, 0   },
   1.250 +/* 22 */ {Cache_L3,      64  },
   1.251 +/* 23 */ {Cache_L3,      64  },
   1.252 +/* 24 */ {Cache_UNKNOWN, 0   },
   1.253 +/* 25 */ {Cache_L3,      64  },
   1.254 +/* 26 */ {Cache_UNKNOWN, 0   },
   1.255 +/* 27 */ {Cache_UNKNOWN, 0   },
   1.256 +/* 28 */ {Cache_UNKNOWN, 0   },
   1.257 +/* 29 */ {Cache_L3,      64  },
   1.258 +/* 2a */ {Cache_UNKNOWN, 0   },
   1.259 +/* 2b */ {Cache_UNKNOWN, 0   },
   1.260 +/* 2c */ {Cache_L1d,     64  },
   1.261 +/* 2d */ {Cache_UNKNOWN, 0   },
   1.262 +/* 2e */ {Cache_UNKNOWN, 0   },
   1.263 +/* 2f */ {Cache_UNKNOWN, 0   },
   1.264 +/* 30 */ {Cache_L1i,     64  },
   1.265 +/* 31 */ {Cache_UNKNOWN, 0   },
   1.266 +/* 32 */ {Cache_UNKNOWN, 0   },
   1.267 +/* 33 */ {Cache_UNKNOWN, 0   },
   1.268 +/* 34 */ {Cache_UNKNOWN, 0   },
   1.269 +/* 35 */ {Cache_UNKNOWN, 0   },
   1.270 +/* 36 */ {Cache_UNKNOWN, 0   },
   1.271 +/* 37 */ {Cache_UNKNOWN, 0   },
   1.272 +/* 38 */ {Cache_UNKNOWN, 0   },
   1.273 +/* 39 */ {Cache_L2,      64  },
   1.274 +/* 3a */ {Cache_UNKNOWN, 0   },
   1.275 +/* 3b */ {Cache_L2,      64  },
   1.276 +/* 3c */ {Cache_L2,      64  },
   1.277 +/* 3d */ {Cache_UNKNOWN, 0   },
   1.278 +/* 3e */ {Cache_UNKNOWN, 0   },
   1.279 +/* 3f */ {Cache_UNKNOWN, 0   },
   1.280 +/* 40 */ {Cache_L2,      0   },
   1.281 +/* 41 */ {Cache_L2,      32  },
   1.282 +/* 42 */ {Cache_L2,      32  },
   1.283 +/* 43 */ {Cache_L2,      32  },
   1.284 +/* 44 */ {Cache_L2,      32  },
   1.285 +/* 45 */ {Cache_L2,      32  },
   1.286 +/* 46 */ {Cache_UNKNOWN, 0   },
   1.287 +/* 47 */ {Cache_UNKNOWN, 0   },
   1.288 +/* 48 */ {Cache_UNKNOWN, 0   },
   1.289 +/* 49 */ {Cache_UNKNOWN, 0   },
   1.290 +/* 4a */ {Cache_UNKNOWN, 0   },
   1.291 +/* 4b */ {Cache_UNKNOWN, 0   },
   1.292 +/* 4c */ {Cache_UNKNOWN, 0   },
   1.293 +/* 4d */ {Cache_UNKNOWN, 0   },
   1.294 +/* 4e */ {Cache_UNKNOWN, 0   },
   1.295 +/* 4f */ {Cache_UNKNOWN, 0   },
   1.296 +/* 50 */ {Cache_TLBi,    0   },
   1.297 +/* 51 */ {Cache_TLBi,    0   },
   1.298 +/* 52 */ {Cache_TLBi,    0   },
   1.299 +/* 53 */ {Cache_UNKNOWN, 0   },
   1.300 +/* 54 */ {Cache_UNKNOWN, 0   },
   1.301 +/* 55 */ {Cache_UNKNOWN, 0   },
   1.302 +/* 56 */ {Cache_UNKNOWN, 0   },
   1.303 +/* 57 */ {Cache_UNKNOWN, 0   },
   1.304 +/* 58 */ {Cache_UNKNOWN, 0   },
   1.305 +/* 59 */ {Cache_UNKNOWN, 0   },
   1.306 +/* 5a */ {Cache_UNKNOWN, 0   },
   1.307 +/* 5b */ {Cache_TLBd,    0   },
   1.308 +/* 5c */ {Cache_TLBd,    0   },
   1.309 +/* 5d */ {Cache_TLBd,    0   },
   1.310 +/* 5e */ {Cache_UNKNOWN, 0   },
   1.311 +/* 5f */ {Cache_UNKNOWN, 0   },
   1.312 +/* 60 */ {Cache_UNKNOWN, 0   },
   1.313 +/* 61 */ {Cache_UNKNOWN, 0   },
   1.314 +/* 62 */ {Cache_UNKNOWN, 0   },
   1.315 +/* 63 */ {Cache_UNKNOWN, 0   },
   1.316 +/* 64 */ {Cache_UNKNOWN, 0   },
   1.317 +/* 65 */ {Cache_UNKNOWN, 0   },
   1.318 +/* 66 */ {Cache_L1d,     64  },
   1.319 +/* 67 */ {Cache_L1d,     64  },
   1.320 +/* 68 */ {Cache_L1d,     64  },
   1.321 +/* 69 */ {Cache_UNKNOWN, 0   },
   1.322 +/* 6a */ {Cache_UNKNOWN, 0   },
   1.323 +/* 6b */ {Cache_UNKNOWN, 0   },
   1.324 +/* 6c */ {Cache_UNKNOWN, 0   },
   1.325 +/* 6d */ {Cache_UNKNOWN, 0   },
   1.326 +/* 6e */ {Cache_UNKNOWN, 0   },
   1.327 +/* 6f */ {Cache_UNKNOWN, 0   },
   1.328 +/* 70 */ {Cache_Trace,   1   },
   1.329 +/* 71 */ {Cache_Trace,   1   },
   1.330 +/* 72 */ {Cache_Trace,   1   },
   1.331 +/* 73 */ {Cache_UNKNOWN, 0   },
   1.332 +/* 74 */ {Cache_UNKNOWN, 0   },
   1.333 +/* 75 */ {Cache_UNKNOWN, 0   },
   1.334 +/* 76 */ {Cache_UNKNOWN, 0   },
   1.335 +/* 77 */ {Cache_UNKNOWN, 0   },
   1.336 +/* 78 */ {Cache_UNKNOWN, 0   },
   1.337 +/* 79 */ {Cache_L2,      64  },
   1.338 +/* 7a */ {Cache_L2,      64  },
   1.339 +/* 7b */ {Cache_L2,      64  },
   1.340 +/* 7c */ {Cache_L2,      64  },
   1.341 +/* 7d */ {Cache_UNKNOWN, 0   },
   1.342 +/* 7e */ {Cache_UNKNOWN, 0   },
   1.343 +/* 7f */ {Cache_UNKNOWN, 0   },
   1.344 +/* 80 */ {Cache_UNKNOWN, 0   },
   1.345 +/* 81 */ {Cache_UNKNOWN, 0   },
   1.346 +/* 82 */ {Cache_L2,      32  },
   1.347 +/* 83 */ {Cache_L2,      32  },
   1.348 +/* 84 */ {Cache_L2,      32  },
   1.349 +/* 85 */ {Cache_L2,      32  },
   1.350 +/* 86 */ {Cache_L2,      64  },
   1.351 +/* 87 */ {Cache_L2,      64  },
   1.352 +/* 88 */ {Cache_UNKNOWN, 0   },
   1.353 +/* 89 */ {Cache_UNKNOWN, 0   },
   1.354 +/* 8a */ {Cache_UNKNOWN, 0   },
   1.355 +/* 8b */ {Cache_UNKNOWN, 0   },
   1.356 +/* 8c */ {Cache_UNKNOWN, 0   },
   1.357 +/* 8d */ {Cache_UNKNOWN, 0   },
   1.358 +/* 8e */ {Cache_UNKNOWN, 0   },
   1.359 +/* 8f */ {Cache_UNKNOWN, 0   },
   1.360 +/* 90 */ {Cache_UNKNOWN, 0   },
   1.361 +/* 91 */ {Cache_UNKNOWN, 0   },
   1.362 +/* 92 */ {Cache_UNKNOWN, 0   },
   1.363 +/* 93 */ {Cache_UNKNOWN, 0   },
   1.364 +/* 94 */ {Cache_UNKNOWN, 0   },
   1.365 +/* 95 */ {Cache_UNKNOWN, 0   },
   1.366 +/* 96 */ {Cache_UNKNOWN, 0   },
   1.367 +/* 97 */ {Cache_UNKNOWN, 0   },
   1.368 +/* 98 */ {Cache_UNKNOWN, 0   },
   1.369 +/* 99 */ {Cache_UNKNOWN, 0   },
   1.370 +/* 9a */ {Cache_UNKNOWN, 0   },
   1.371 +/* 9b */ {Cache_UNKNOWN, 0   },
   1.372 +/* 9c */ {Cache_UNKNOWN, 0   },
   1.373 +/* 9d */ {Cache_UNKNOWN, 0   },
   1.374 +/* 9e */ {Cache_UNKNOWN, 0   },
   1.375 +/* 9f */ {Cache_UNKNOWN, 0   },
   1.376 +/* a0 */ {Cache_UNKNOWN, 0   },
   1.377 +/* a1 */ {Cache_UNKNOWN, 0   },
   1.378 +/* a2 */ {Cache_UNKNOWN, 0   },
   1.379 +/* a3 */ {Cache_UNKNOWN, 0   },
   1.380 +/* a4 */ {Cache_UNKNOWN, 0   },
   1.381 +/* a5 */ {Cache_UNKNOWN, 0   },
   1.382 +/* a6 */ {Cache_UNKNOWN, 0   },
   1.383 +/* a7 */ {Cache_UNKNOWN, 0   },
   1.384 +/* a8 */ {Cache_UNKNOWN, 0   },
   1.385 +/* a9 */ {Cache_UNKNOWN, 0   },
   1.386 +/* aa */ {Cache_UNKNOWN, 0   },
   1.387 +/* ab */ {Cache_UNKNOWN, 0   },
   1.388 +/* ac */ {Cache_UNKNOWN, 0   },
   1.389 +/* ad */ {Cache_UNKNOWN, 0   },
   1.390 +/* ae */ {Cache_UNKNOWN, 0   },
   1.391 +/* af */ {Cache_UNKNOWN, 0   },
   1.392 +/* b0 */ {Cache_TLBi,    0   },
   1.393 +/* b1 */ {Cache_UNKNOWN, 0   },
   1.394 +/* b2 */ {Cache_UNKNOWN, 0   },
   1.395 +/* b3 */ {Cache_TLBd,    0   },
   1.396 +/* b4 */ {Cache_UNKNOWN, 0   },
   1.397 +/* b5 */ {Cache_UNKNOWN, 0   },
   1.398 +/* b6 */ {Cache_UNKNOWN, 0   },
   1.399 +/* b7 */ {Cache_UNKNOWN, 0   },
   1.400 +/* b8 */ {Cache_UNKNOWN, 0   },
   1.401 +/* b9 */ {Cache_UNKNOWN, 0   },
   1.402 +/* ba */ {Cache_UNKNOWN, 0   },
   1.403 +/* bb */ {Cache_UNKNOWN, 0   },
   1.404 +/* bc */ {Cache_UNKNOWN, 0   },
   1.405 +/* bd */ {Cache_UNKNOWN, 0   },
   1.406 +/* be */ {Cache_UNKNOWN, 0   },
   1.407 +/* bf */ {Cache_UNKNOWN, 0   },
   1.408 +/* c0 */ {Cache_UNKNOWN, 0   },
   1.409 +/* c1 */ {Cache_UNKNOWN, 0   },
   1.410 +/* c2 */ {Cache_UNKNOWN, 0   },
   1.411 +/* c3 */ {Cache_UNKNOWN, 0   },
   1.412 +/* c4 */ {Cache_UNKNOWN, 0   },
   1.413 +/* c5 */ {Cache_UNKNOWN, 0   },
   1.414 +/* c6 */ {Cache_UNKNOWN, 0   },
   1.415 +/* c7 */ {Cache_UNKNOWN, 0   },
   1.416 +/* c8 */ {Cache_UNKNOWN, 0   },
   1.417 +/* c9 */ {Cache_UNKNOWN, 0   },
   1.418 +/* ca */ {Cache_UNKNOWN, 0   },
   1.419 +/* cb */ {Cache_UNKNOWN, 0   },
   1.420 +/* cc */ {Cache_UNKNOWN, 0   },
   1.421 +/* cd */ {Cache_UNKNOWN, 0   },
   1.422 +/* ce */ {Cache_UNKNOWN, 0   },
   1.423 +/* cf */ {Cache_UNKNOWN, 0   },
   1.424 +/* d0 */ {Cache_UNKNOWN, 0   },
   1.425 +/* d1 */ {Cache_UNKNOWN, 0   },
   1.426 +/* d2 */ {Cache_UNKNOWN, 0   },
   1.427 +/* d3 */ {Cache_UNKNOWN, 0   },
   1.428 +/* d4 */ {Cache_UNKNOWN, 0   },
   1.429 +/* d5 */ {Cache_UNKNOWN, 0   },
   1.430 +/* d6 */ {Cache_UNKNOWN, 0   },
   1.431 +/* d7 */ {Cache_UNKNOWN, 0   },
   1.432 +/* d8 */ {Cache_UNKNOWN, 0   },
   1.433 +/* d9 */ {Cache_UNKNOWN, 0   },
   1.434 +/* da */ {Cache_UNKNOWN, 0   },
   1.435 +/* db */ {Cache_UNKNOWN, 0   },
   1.436 +/* dc */ {Cache_UNKNOWN, 0   },
   1.437 +/* dd */ {Cache_UNKNOWN, 0   },
   1.438 +/* de */ {Cache_UNKNOWN, 0   },
   1.439 +/* df */ {Cache_UNKNOWN, 0   },
   1.440 +/* e0 */ {Cache_UNKNOWN, 0   },
   1.441 +/* e1 */ {Cache_UNKNOWN, 0   },
   1.442 +/* e2 */ {Cache_UNKNOWN, 0   },
   1.443 +/* e3 */ {Cache_UNKNOWN, 0   },
   1.444 +/* e4 */ {Cache_UNKNOWN, 0   },
   1.445 +/* e5 */ {Cache_UNKNOWN, 0   },
   1.446 +/* e6 */ {Cache_UNKNOWN, 0   },
   1.447 +/* e7 */ {Cache_UNKNOWN, 0   },
   1.448 +/* e8 */ {Cache_UNKNOWN, 0   },
   1.449 +/* e9 */ {Cache_UNKNOWN, 0   },
   1.450 +/* ea */ {Cache_UNKNOWN, 0   },
   1.451 +/* eb */ {Cache_UNKNOWN, 0   },
   1.452 +/* ec */ {Cache_UNKNOWN, 0   },
   1.453 +/* ed */ {Cache_UNKNOWN, 0   },
   1.454 +/* ee */ {Cache_UNKNOWN, 0   },
   1.455 +/* ef */ {Cache_UNKNOWN, 0   },
   1.456 +/* f0 */ {Cache_UNKNOWN, 0   },
   1.457 +/* f1 */ {Cache_UNKNOWN, 0   },
   1.458 +/* f2 */ {Cache_UNKNOWN, 0   },
   1.459 +/* f3 */ {Cache_UNKNOWN, 0   },
   1.460 +/* f4 */ {Cache_UNKNOWN, 0   },
   1.461 +/* f5 */ {Cache_UNKNOWN, 0   },
   1.462 +/* f6 */ {Cache_UNKNOWN, 0   },
   1.463 +/* f7 */ {Cache_UNKNOWN, 0   },
   1.464 +/* f8 */ {Cache_UNKNOWN, 0   },
   1.465 +/* f9 */ {Cache_UNKNOWN, 0   },
   1.466 +/* fa */ {Cache_UNKNOWN, 0   },
   1.467 +/* fb */ {Cache_UNKNOWN, 0   },
   1.468 +/* fc */ {Cache_UNKNOWN, 0   },
   1.469 +/* fd */ {Cache_UNKNOWN, 0   },
   1.470 +/* fe */ {Cache_UNKNOWN, 0   },
   1.471 +/* ff */ {Cache_UNKNOWN, 0   }
   1.472 +};
   1.473 +
   1.474 +
   1.475 +/*
   1.476 + * use the above table to determine the CacheEntryLineSize.
   1.477 + */
   1.478 +static void
   1.479 +getIntelCacheEntryLineSize(unsigned long val, int *level, 
   1.480 +						unsigned long *lineSize)
   1.481 +{
   1.482 +    CacheType type;
   1.483 +
   1.484 +    type = CacheMap[val].type;
   1.485 +    /* only interested in data caches */
   1.486 +    /* NOTE val = 0x40 is a special value that means no L2 or L3 cache.
   1.487 +     * this data check has the side effect of rejecting that entry. If
   1.488 +     * that wasn't the case, we could have to reject it explicitly */
   1.489 +    if (CacheMap[val].lineSize == 0) {
   1.490 +	return;
   1.491 +    }
   1.492 +    /* look at the caches, skip types we aren't interested in.
   1.493 +     * if we already have a value for a lower level cache, skip the
   1.494 +     * current entry */
   1.495 +    if ((type == Cache_L1)|| (type == Cache_L1d)) {
   1.496 +	*level = 1;
   1.497 +	*lineSize = CacheMap[val].lineSize;
   1.498 +    } else if ((*level >= 2) && ((type == Cache_L2) || (type == Cache_L2d))) {
   1.499 +	*level = 2;
   1.500 +	*lineSize = CacheMap[val].lineSize;
   1.501 +    } else if ((*level >= 3) && ((type == Cache_L3) || (type == Cache_L3d))) {
   1.502 +	*level = 3;
   1.503 +	*lineSize = CacheMap[val].lineSize;
   1.504 +    }
   1.505 +    return;
   1.506 +}
   1.507 +
   1.508 +
   1.509 +static void
   1.510 +getIntelRegisterCacheLineSize(unsigned long val, 
   1.511 +			int *level, unsigned long *lineSize)
   1.512 +{
   1.513 +    getIntelCacheEntryLineSize(val >> 24 & 0xff, level, lineSize);
   1.514 +    getIntelCacheEntryLineSize(val >> 16 & 0xff, level, lineSize);
   1.515 +    getIntelCacheEntryLineSize(val >> 8 & 0xff, level, lineSize);
   1.516 +    getIntelCacheEntryLineSize(val & 0xff, level, lineSize);
   1.517 +}
   1.518 +
   1.519 +/*
   1.520 + * returns '0' if no recognized cache is found, or if the cache
   1.521 + * information is supported by this processor 
   1.522 + */
   1.523 +static unsigned long
   1.524 +getIntelCacheLineSize(int cpuidLevel)
   1.525 +{
   1.526 +    int level = 4;
   1.527 +    unsigned long lineSize = 0;
   1.528 +    unsigned long eax, ebx, ecx, edx;
   1.529 +    int repeat, count;
   1.530 +
   1.531 +    if (cpuidLevel < 2) {
   1.532 +	return 0;
   1.533 +    }
   1.534 +
   1.535 +    /* command '2' of the cpuid is intel's cache info call. Each byte of the
   1.536 +     * 4 registers contain a potential descriptor for the cache. The CacheMap	
   1.537 +     * table maps the cache entry with the processor cache. Register 'al'
   1.538 +     * contains a count value that cpuid '2' needs to be called in order to 
   1.539 +     * find all the cache descriptors. Only registers with the high bit set
   1.540 +     * to 'zero' have valid descriptors. This code loops through all the
   1.541 +     * required calls to cpuid '2' and passes any valid descriptors it finds
   1.542 +     * to the getIntelRegisterCacheLineSize code, which breaks the registers
   1.543 +     * down into their component descriptors. In the end the lineSize of the
   1.544 +     * lowest level cache data cache is returned. */
   1.545 +    freebl_cpuid(2, &eax, &ebx, &ecx, &edx);
   1.546 +    repeat = eax & 0xf;
   1.547 +    for (count = 0; count < repeat; count++) {
   1.548 +	if ((eax & 0x80000000) == 0) {
   1.549 +	    getIntelRegisterCacheLineSize(eax & 0xffffff00, &level, &lineSize);
   1.550 +	}
   1.551 +	if ((ebx & 0x80000000) == 0) {
   1.552 +	    getIntelRegisterCacheLineSize(ebx, &level, &lineSize);
   1.553 +	}
   1.554 +	if ((ecx & 0x80000000) == 0) {
   1.555 +	    getIntelRegisterCacheLineSize(ecx, &level, &lineSize);
   1.556 +	}
   1.557 +	if ((edx & 0x80000000) == 0) {
   1.558 +	    getIntelRegisterCacheLineSize(edx, &level, &lineSize);
   1.559 +	}
   1.560 +	if (count+1 != repeat) {
   1.561 +	    freebl_cpuid(2, &eax, &ebx, &ecx, &edx);
   1.562 +	}
   1.563 +    }
   1.564 +    return lineSize;
   1.565 +}
   1.566 +
   1.567 +/*
   1.568 + * returns '0' if the cache info is not supported by this processor.
   1.569 + * This is based on the AMD extended cache commands for cpuid. 
   1.570 + * (see "AMD Processor Recognition Application Note" Publication 20734).
   1.571 + * Some other processors use the identical scheme.
   1.572 + * (see "Processor Recognition, Transmeta Corporation").
   1.573 + */
   1.574 +static unsigned long
   1.575 +getOtherCacheLineSize(unsigned long cpuidLevel)
   1.576 +{
   1.577 +    unsigned long lineSize = 0;
   1.578 +    unsigned long eax, ebx, ecx, edx;
   1.579 +
   1.580 +    /* get the Extended CPUID level */
   1.581 +    freebl_cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
   1.582 +    cpuidLevel = eax;
   1.583 +
   1.584 +    if (cpuidLevel >= 0x80000005) {
   1.585 +	freebl_cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
   1.586 +	lineSize = ecx & 0xff; /* line Size, L1 Data Cache */
   1.587 +    }
   1.588 +    return lineSize;
   1.589 +}
   1.590 +
   1.591 +static const char * const manMap[] = {
   1.592 +#define INTEL     0
   1.593 +    "GenuineIntel",
   1.594 +#define AMD       1
   1.595 +    "AuthenticAMD",
   1.596 +#define CYRIX     2
   1.597 +    "CyrixInstead",
   1.598 +#define CENTAUR   2
   1.599 +    "CentaurHauls",
   1.600 +#define NEXGEN    3
   1.601 +    "NexGenDriven",
   1.602 +#define TRANSMETA 4
   1.603 +    "GenuineTMx86",
   1.604 +#define RISE      5
   1.605 +    "RiseRiseRise",
   1.606 +#define UMC       6
   1.607 +    "UMC UMC UMC ",
   1.608 +#define SIS       7
   1.609 +    "Sis Sis Sis ",
   1.610 +#define NATIONAL  8
   1.611 +    "Geode by NSC",
   1.612 +};
   1.613 +
   1.614 +static const int n_manufacturers = sizeof(manMap)/sizeof(manMap[0]);
   1.615 +
   1.616 +
   1.617 +#define MAN_UNKNOWN 9
   1.618 +
   1.619 +#if !defined(AMD_64)
   1.620 +#define SSE2_FLAG (1<<26)
   1.621 +unsigned long
   1.622 +s_mpi_is_sse2()
   1.623 +{
   1.624 +    unsigned long eax, ebx, ecx, edx;
   1.625 +    int manufacturer = MAN_UNKNOWN;
   1.626 +    int i;
   1.627 +    char string[13];
   1.628 +
   1.629 +    if (is386() || is486()) {
   1.630 +	return 0;
   1.631 +    }
   1.632 +    freebl_cpuid(0, &eax, &ebx, &ecx, &edx);
   1.633 +    /* string holds the CPU's manufacturer ID string - a twelve
   1.634 +     * character ASCII string stored in ebx, edx, ecx, and
   1.635 +     * the 32-bit extended feature flags are in edx, ecx.
   1.636 +     */
   1.637 +    *(int *)string = ebx;
   1.638 +    *(int *)&string[4] = (int)edx;
   1.639 +    *(int *)&string[8] = (int)ecx;
   1.640 +    string[12] = 0;
   1.641 +
   1.642 +    /* has no SSE2 extensions */
   1.643 +    if (eax == 0) {
   1.644 +	return 0;
   1.645 +    }
   1.646 +
   1.647 +    for (i=0; i < n_manufacturers; i++) {
   1.648 +	if ( strcmp(manMap[i],string) == 0) {
   1.649 +	    manufacturer = i;
   1.650 +	    break;
   1.651 +	}
   1.652 +    }
   1.653 +
   1.654 +    freebl_cpuid(1,&eax,&ebx,&ecx,&edx);
   1.655 +    return (edx & SSE2_FLAG) == SSE2_FLAG;
   1.656 +}
   1.657 +#endif
   1.658 +
   1.659 +unsigned long
   1.660 +s_mpi_getProcessorLineSize()
   1.661 +{
   1.662 +    unsigned long eax, ebx, ecx, edx;
   1.663 +    unsigned long cpuidLevel;
   1.664 +    unsigned long cacheLineSize = 0;
   1.665 +    int manufacturer = MAN_UNKNOWN;
   1.666 +    int i;
   1.667 +    char string[65];
   1.668 +
   1.669 +#if !defined(AMD_64)
   1.670 +    if (is386()) {
   1.671 +	return 0; /* 386 had no cache */
   1.672 +    } if (is486()) {
   1.673 +	return 32; /* really? need more info */
   1.674 +    }
   1.675 +#endif
   1.676 +
   1.677 +    /* Pentium, cpuid command is available */
   1.678 +    freebl_cpuid(0, &eax, &ebx, &ecx, &edx);
   1.679 +    cpuidLevel = eax;
   1.680 +    /* string holds the CPU's manufacturer ID string - a twelve
   1.681 +     * character ASCII string stored in ebx, edx, ecx, and
   1.682 +     * the 32-bit extended feature flags are in edx, ecx.
   1.683 +     */
   1.684 +    *(int *)string = ebx;
   1.685 +    *(int *)&string[4] = (int)edx;
   1.686 +    *(int *)&string[8] = (int)ecx;
   1.687 +    string[12] = 0;
   1.688 +
   1.689 +    manufacturer = MAN_UNKNOWN;
   1.690 +    for (i=0; i < n_manufacturers; i++) {
   1.691 +	if ( strcmp(manMap[i],string) == 0) {
   1.692 +	    manufacturer = i;
   1.693 +	}
   1.694 +    }
   1.695 +
   1.696 +    if (manufacturer == INTEL) {
   1.697 +	cacheLineSize = getIntelCacheLineSize(cpuidLevel);
   1.698 +    } else {
   1.699 +	cacheLineSize = getOtherCacheLineSize(cpuidLevel);
   1.700 +    }
   1.701 +    /* doesn't support cache info based on cpuid. This means
   1.702 +     * an old pentium class processor, which have cache lines of
   1.703 +     * 32. If we learn differently, we can use a switch based on
   1.704 +     * the Manufacturer id  */
   1.705 +    if (cacheLineSize == 0) {
   1.706 +	cacheLineSize = 32;
   1.707 +    }
   1.708 +    return cacheLineSize;
   1.709 +}
   1.710 +#define MPI_GET_PROCESSOR_LINE_SIZE_DEFINED 1
   1.711 +#endif
   1.712 +
   1.713 +#if defined(__ppc64__) 
   1.714 +/*
   1.715 + *  Sigh, The PPC has some really nice features to help us determine cache
   1.716 + *  size, since it had lots of direct control functions to do so. The POWER
   1.717 + *  processor even has an instruction to do this, but it was dropped in
   1.718 + *  PowerPC. Unfortunately most of them are not available in user mode.
   1.719 + *
   1.720 + *  The dcbz function would be a great way to determine cache line size except
   1.721 + *  1) it only works on write-back memory (it throws an exception otherwise), 
   1.722 + *  and 2) because so many mac programs 'knew' the processor cache size was
   1.723 + *  32 bytes, they used this instruction as a fast 'zero 32 bytes'. Now the new
   1.724 + *  G5 processor has 128 byte cache, but dcbz only clears 32 bytes to keep
   1.725 + *  these programs happy. dcbzl work if 64 bit instructions are supported.
   1.726 + *  If you know 64 bit instructions are supported, and that stack is 
   1.727 + *  write-back, you can use this code.
   1.728 + */
   1.729 +#include "memory.h"
   1.730 +
   1.731 +/* clear the cache line that contains 'array' */
   1.732 +static inline void dcbzl(char *array)
   1.733 +{
   1.734 +	register char *a asm("r2") = array;
   1.735 +	__asm__ __volatile__( "dcbzl %0,r0" : "=r" (a): "0"(a) );
   1.736 +}
   1.737 +
   1.738 +
   1.739 +#define PPC_DO_ALIGN(x,y) ((char *)\
   1.740 +			((((long long) (x))+((y)-1))&~((y)-1)))
   1.741 +
   1.742 +#define PPC_MAX_LINE_SIZE 256
   1.743 +unsigned long
   1.744 +s_mpi_getProcessorLineSize()
   1.745 +{
   1.746 +    char testArray[2*PPC_MAX_LINE_SIZE+1];
   1.747 +    char *test;
   1.748 +    int i;
   1.749 +
   1.750 +    /* align the array on a maximum line size boundary, so we
   1.751 +     * know we are starting to clear from the first address */
   1.752 +    test = PPC_DO_ALIGN(testArray, PPC_MAX_LINE_SIZE); 
   1.753 +    /* set all the values to 1's */
   1.754 +    memset(test, 0xff, PPC_MAX_LINE_SIZE);
   1.755 +    /* clear one cache block starting at 'test' */
   1.756 +    dcbzl(test);
   1.757 +
   1.758 +    /* find the size of the cleared area, that's our block size */
   1.759 +    for (i=PPC_MAX_LINE_SIZE; i != 0; i = i/2) {
   1.760 +	if (test[i-1] == 0) {
   1.761 +	    return i;
   1.762 +	}
   1.763 +    }
   1.764 +    return 0;
   1.765 +}
   1.766 +
   1.767 +#define MPI_GET_PROCESSOR_LINE_SIZE_DEFINED 1
   1.768 +#endif
   1.769 +
   1.770 +
   1.771 +/*
   1.772 + * put other processor and platform specific cache code here
   1.773 + * return the smallest cache line size in bytes on the processor 
   1.774 + * (usually the L1 cache). If the OS has a call, this would be
   1.775 + * a great place to put it.
   1.776 + *
   1.777 + * If there is no cache, return 0;
   1.778 + * 
   1.779 + * define MPI_GET_PROCESSOR_LINE_SIZE_DEFINED so the generic functions
   1.780 + * below aren't compiled.
   1.781 + *
   1.782 + */
   1.783 +
   1.784 +
   1.785 +/* target.mk can define MPI_CACHE_LINE_SIZE if it's common for the family or 
   1.786 + * OS */
   1.787 +#if defined(MPI_CACHE_LINE_SIZE) && !defined(MPI_GET_PROCESSOR_LINE_SIZE_DEFINED)
   1.788 +
   1.789 +unsigned long
   1.790 +s_mpi_getProcessorLineSize()
   1.791 +{
   1.792 +   return MPI_CACHE_LINE_SIZE;
   1.793 +}
   1.794 +#define MPI_GET_PROCESSOR_LINE_SIZE_DEFINED 1
   1.795 +#endif
   1.796 +
   1.797 +
   1.798 +/* If no way to get the processor cache line size has been defined, assume
   1.799 + * it's 32 bytes (most common value, does not significantly impact performance)
   1.800 + */ 
   1.801 +#ifndef MPI_GET_PROCESSOR_LINE_SIZE_DEFINED
   1.802 +unsigned long
   1.803 +s_mpi_getProcessorLineSize()
   1.804 +{
   1.805 +   return 32;
   1.806 +}
   1.807 +#endif
   1.808 +
   1.809 +#ifdef TEST_IT
   1.810 +#include <stdio.h>
   1.811 +
   1.812 +main()
   1.813 +{
   1.814 +    printf("line size = %d\n", s_mpi_getProcessorLineSize());
   1.815 +} 
   1.816 +#endif

mercurial