1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/security/nss/lib/freebl/mpi/mpcpucache.c Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,813 @@ 1.4 +/* This Source Code Form is subject to the terms of the Mozilla Public 1.5 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.6 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.7 + 1.8 +#include "mpi.h" 1.9 + 1.10 +/* 1.11 + * This file implements a single function: s_mpi_getProcessorLineSize(); 1.12 + * s_mpi_getProcessorLineSize() returns the size in bytes of the cache line 1.13 + * if a cache exists, or zero if there is no cache. If more than one 1.14 + * cache line exists, it should return the smallest line size (which is 1.15 + * usually the L1 cache). 1.16 + * 1.17 + * mp_modexp uses this information to make sure that private key information 1.18 + * isn't being leaked through the cache. 1.19 + * 1.20 + * Currently the file returns good data for most modern x86 processors, and 1.21 + * reasonable data on 64-bit ppc processors. All other processors are assumed 1.22 + * to have a cache line size of 32 bytes unless modified by target.mk. 1.23 + * 1.24 + */ 1.25 + 1.26 +#if defined(i386) || defined(__i386) || defined(__X86__) || defined (_M_IX86) || defined(__x86_64__) || defined(__x86_64) || defined(_M_AMD64) 1.27 +/* X86 processors have special instructions that tell us about the cache */ 1.28 +#include "string.h" 1.29 + 1.30 +#if defined(__x86_64__) || defined(__x86_64) || defined(_M_AMD64) 1.31 +#define AMD_64 1 1.32 +#endif 1.33 + 1.34 +/* Generic CPUID function */ 1.35 +#if defined(AMD_64) 1.36 + 1.37 +#if defined(__GNUC__) 1.38 + 1.39 +void freebl_cpuid(unsigned long op, unsigned long *eax, 1.40 + unsigned long *ebx, unsigned long *ecx, 1.41 + unsigned long *edx) 1.42 +{ 1.43 + __asm__("cpuid\n\t" 1.44 + : "=a" (*eax), 1.45 + "=b" (*ebx), 1.46 + "=c" (*ecx), 1.47 + "=d" (*edx) 1.48 + : "0" (op)); 1.49 +} 1.50 + 1.51 +#elif defined(_MSC_VER) 1.52 + 1.53 +#include <intrin.h> 1.54 + 1.55 +void freebl_cpuid(unsigned long op, unsigned long *eax, 1.56 + unsigned long *ebx, unsigned long *ecx, 1.57 + unsigned long *edx) 1.58 +{ 1.59 + int intrinsic_out[4]; 1.60 + 1.61 + __cpuid(intrinsic_out, op); 1.62 + *eax = intrinsic_out[0]; 1.63 + *ebx = intrinsic_out[1]; 1.64 + *ecx = intrinsic_out[2]; 1.65 + *edx = intrinsic_out[3]; 1.66 +} 1.67 + 1.68 +#endif 1.69 + 1.70 +#else /* !defined(AMD_64) */ 1.71 + 1.72 +/* x86 */ 1.73 + 1.74 +#if defined(__GNUC__) 1.75 +void freebl_cpuid(unsigned long op, unsigned long *eax, 1.76 + unsigned long *ebx, unsigned long *ecx, 1.77 + unsigned long *edx) 1.78 +{ 1.79 +/* sigh GCC isn't smart enough to save the ebx PIC register on it's own 1.80 + * in this case, so do it by hand. Use edi to store ebx and pass the 1.81 + * value returned in ebx from cpuid through edi. */ 1.82 + __asm__("mov %%ebx,%%edi\n\t" 1.83 + "cpuid\n\t" 1.84 + "xchgl %%ebx,%%edi\n\t" 1.85 + : "=a" (*eax), 1.86 + "=D" (*ebx), 1.87 + "=c" (*ecx), 1.88 + "=d" (*edx) 1.89 + : "0" (op)); 1.90 +} 1.91 + 1.92 +/* 1.93 + * try flipping a processor flag to determine CPU type 1.94 + */ 1.95 +static unsigned long changeFlag(unsigned long flag) 1.96 +{ 1.97 + unsigned long changedFlags, originalFlags; 1.98 + __asm__("pushfl\n\t" /* get the flags */ 1.99 + "popl %0\n\t" 1.100 + "movl %0,%1\n\t" /* save the original flags */ 1.101 + "xorl %2,%0\n\t" /* flip the bit */ 1.102 + "pushl %0\n\t" /* set the flags */ 1.103 + "popfl\n\t" 1.104 + "pushfl\n\t" /* get the flags again (for return) */ 1.105 + "popl %0\n\t" 1.106 + "pushl %1\n\t" /* restore the original flags */ 1.107 + "popfl\n\t" 1.108 + : "=r" (changedFlags), 1.109 + "=r" (originalFlags), 1.110 + "=r" (flag) 1.111 + : "2" (flag)); 1.112 + return changedFlags ^ originalFlags; 1.113 +} 1.114 + 1.115 +#elif defined(_MSC_VER) 1.116 + 1.117 +/* 1.118 + * windows versions of the above assembler 1.119 + */ 1.120 +#define wcpuid __asm __emit 0fh __asm __emit 0a2h 1.121 +void freebl_cpuid(unsigned long op, unsigned long *Reax, 1.122 + unsigned long *Rebx, unsigned long *Recx, unsigned long *Redx) 1.123 +{ 1.124 + unsigned long Leax, Lebx, Lecx, Ledx; 1.125 + __asm { 1.126 + pushad 1.127 + mov eax,op 1.128 + wcpuid 1.129 + mov Leax,eax 1.130 + mov Lebx,ebx 1.131 + mov Lecx,ecx 1.132 + mov Ledx,edx 1.133 + popad 1.134 + } 1.135 + *Reax = Leax; 1.136 + *Rebx = Lebx; 1.137 + *Recx = Lecx; 1.138 + *Redx = Ledx; 1.139 +} 1.140 + 1.141 +static unsigned long changeFlag(unsigned long flag) 1.142 +{ 1.143 + unsigned long changedFlags, originalFlags; 1.144 + __asm { 1.145 + push eax 1.146 + push ebx 1.147 + pushfd /* get the flags */ 1.148 + pop eax 1.149 + push eax /* save the flags on the stack */ 1.150 + mov originalFlags,eax /* save the original flags */ 1.151 + mov ebx,flag 1.152 + xor eax,ebx /* flip the bit */ 1.153 + push eax /* set the flags */ 1.154 + popfd 1.155 + pushfd /* get the flags again (for return) */ 1.156 + pop eax 1.157 + popfd /* restore the original flags */ 1.158 + mov changedFlags,eax 1.159 + pop ebx 1.160 + pop eax 1.161 + } 1.162 + return changedFlags ^ originalFlags; 1.163 +} 1.164 +#endif 1.165 + 1.166 +#endif 1.167 + 1.168 +#if !defined(AMD_64) 1.169 +#define AC_FLAG 0x40000 1.170 +#define ID_FLAG 0x200000 1.171 + 1.172 +/* 386 processors can't flip the AC_FLAG, intel AP Note AP-485 */ 1.173 +static int is386() 1.174 +{ 1.175 + return changeFlag(AC_FLAG) == 0; 1.176 +} 1.177 + 1.178 +/* 486 processors can't flip the ID_FLAG, intel AP Note AP-485 */ 1.179 +static int is486() 1.180 +{ 1.181 + return changeFlag(ID_FLAG) == 0; 1.182 +} 1.183 +#endif 1.184 + 1.185 + 1.186 +/* 1.187 + * table for Intel Cache. 1.188 + * See Intel Application Note AP-485 for more information 1.189 + */ 1.190 + 1.191 +typedef unsigned char CacheTypeEntry; 1.192 + 1.193 +typedef enum { 1.194 + Cache_NONE = 0, 1.195 + Cache_UNKNOWN = 1, 1.196 + Cache_TLB = 2, 1.197 + Cache_TLBi = 3, 1.198 + Cache_TLBd = 4, 1.199 + Cache_Trace = 5, 1.200 + Cache_L1 = 6, 1.201 + Cache_L1i = 7, 1.202 + Cache_L1d = 8, 1.203 + Cache_L2 = 9 , 1.204 + Cache_L2i = 10 , 1.205 + Cache_L2d = 11 , 1.206 + Cache_L3 = 12 , 1.207 + Cache_L3i = 13, 1.208 + Cache_L3d = 14 1.209 +} CacheType; 1.210 + 1.211 +struct _cache { 1.212 + CacheTypeEntry type; 1.213 + unsigned char lineSize; 1.214 +}; 1.215 +static const struct _cache CacheMap[256] = { 1.216 +/* 00 */ {Cache_NONE, 0 }, 1.217 +/* 01 */ {Cache_TLBi, 0 }, 1.218 +/* 02 */ {Cache_TLBi, 0 }, 1.219 +/* 03 */ {Cache_TLBd, 0 }, 1.220 +/* 04 */ {Cache_TLBd, }, 1.221 +/* 05 */ {Cache_UNKNOWN, 0 }, 1.222 +/* 06 */ {Cache_L1i, 32 }, 1.223 +/* 07 */ {Cache_UNKNOWN, 0 }, 1.224 +/* 08 */ {Cache_L1i, 32 }, 1.225 +/* 09 */ {Cache_UNKNOWN, 0 }, 1.226 +/* 0a */ {Cache_L1d, 32 }, 1.227 +/* 0b */ {Cache_UNKNOWN, 0 }, 1.228 +/* 0c */ {Cache_L1d, 32 }, 1.229 +/* 0d */ {Cache_UNKNOWN, 0 }, 1.230 +/* 0e */ {Cache_UNKNOWN, 0 }, 1.231 +/* 0f */ {Cache_UNKNOWN, 0 }, 1.232 +/* 10 */ {Cache_UNKNOWN, 0 }, 1.233 +/* 11 */ {Cache_UNKNOWN, 0 }, 1.234 +/* 12 */ {Cache_UNKNOWN, 0 }, 1.235 +/* 13 */ {Cache_UNKNOWN, 0 }, 1.236 +/* 14 */ {Cache_UNKNOWN, 0 }, 1.237 +/* 15 */ {Cache_UNKNOWN, 0 }, 1.238 +/* 16 */ {Cache_UNKNOWN, 0 }, 1.239 +/* 17 */ {Cache_UNKNOWN, 0 }, 1.240 +/* 18 */ {Cache_UNKNOWN, 0 }, 1.241 +/* 19 */ {Cache_UNKNOWN, 0 }, 1.242 +/* 1a */ {Cache_UNKNOWN, 0 }, 1.243 +/* 1b */ {Cache_UNKNOWN, 0 }, 1.244 +/* 1c */ {Cache_UNKNOWN, 0 }, 1.245 +/* 1d */ {Cache_UNKNOWN, 0 }, 1.246 +/* 1e */ {Cache_UNKNOWN, 0 }, 1.247 +/* 1f */ {Cache_UNKNOWN, 0 }, 1.248 +/* 20 */ {Cache_UNKNOWN, 0 }, 1.249 +/* 21 */ {Cache_UNKNOWN, 0 }, 1.250 +/* 22 */ {Cache_L3, 64 }, 1.251 +/* 23 */ {Cache_L3, 64 }, 1.252 +/* 24 */ {Cache_UNKNOWN, 0 }, 1.253 +/* 25 */ {Cache_L3, 64 }, 1.254 +/* 26 */ {Cache_UNKNOWN, 0 }, 1.255 +/* 27 */ {Cache_UNKNOWN, 0 }, 1.256 +/* 28 */ {Cache_UNKNOWN, 0 }, 1.257 +/* 29 */ {Cache_L3, 64 }, 1.258 +/* 2a */ {Cache_UNKNOWN, 0 }, 1.259 +/* 2b */ {Cache_UNKNOWN, 0 }, 1.260 +/* 2c */ {Cache_L1d, 64 }, 1.261 +/* 2d */ {Cache_UNKNOWN, 0 }, 1.262 +/* 2e */ {Cache_UNKNOWN, 0 }, 1.263 +/* 2f */ {Cache_UNKNOWN, 0 }, 1.264 +/* 30 */ {Cache_L1i, 64 }, 1.265 +/* 31 */ {Cache_UNKNOWN, 0 }, 1.266 +/* 32 */ {Cache_UNKNOWN, 0 }, 1.267 +/* 33 */ {Cache_UNKNOWN, 0 }, 1.268 +/* 34 */ {Cache_UNKNOWN, 0 }, 1.269 +/* 35 */ {Cache_UNKNOWN, 0 }, 1.270 +/* 36 */ {Cache_UNKNOWN, 0 }, 1.271 +/* 37 */ {Cache_UNKNOWN, 0 }, 1.272 +/* 38 */ {Cache_UNKNOWN, 0 }, 1.273 +/* 39 */ {Cache_L2, 64 }, 1.274 +/* 3a */ {Cache_UNKNOWN, 0 }, 1.275 +/* 3b */ {Cache_L2, 64 }, 1.276 +/* 3c */ {Cache_L2, 64 }, 1.277 +/* 3d */ {Cache_UNKNOWN, 0 }, 1.278 +/* 3e */ {Cache_UNKNOWN, 0 }, 1.279 +/* 3f */ {Cache_UNKNOWN, 0 }, 1.280 +/* 40 */ {Cache_L2, 0 }, 1.281 +/* 41 */ {Cache_L2, 32 }, 1.282 +/* 42 */ {Cache_L2, 32 }, 1.283 +/* 43 */ {Cache_L2, 32 }, 1.284 +/* 44 */ {Cache_L2, 32 }, 1.285 +/* 45 */ {Cache_L2, 32 }, 1.286 +/* 46 */ {Cache_UNKNOWN, 0 }, 1.287 +/* 47 */ {Cache_UNKNOWN, 0 }, 1.288 +/* 48 */ {Cache_UNKNOWN, 0 }, 1.289 +/* 49 */ {Cache_UNKNOWN, 0 }, 1.290 +/* 4a */ {Cache_UNKNOWN, 0 }, 1.291 +/* 4b */ {Cache_UNKNOWN, 0 }, 1.292 +/* 4c */ {Cache_UNKNOWN, 0 }, 1.293 +/* 4d */ {Cache_UNKNOWN, 0 }, 1.294 +/* 4e */ {Cache_UNKNOWN, 0 }, 1.295 +/* 4f */ {Cache_UNKNOWN, 0 }, 1.296 +/* 50 */ {Cache_TLBi, 0 }, 1.297 +/* 51 */ {Cache_TLBi, 0 }, 1.298 +/* 52 */ {Cache_TLBi, 0 }, 1.299 +/* 53 */ {Cache_UNKNOWN, 0 }, 1.300 +/* 54 */ {Cache_UNKNOWN, 0 }, 1.301 +/* 55 */ {Cache_UNKNOWN, 0 }, 1.302 +/* 56 */ {Cache_UNKNOWN, 0 }, 1.303 +/* 57 */ {Cache_UNKNOWN, 0 }, 1.304 +/* 58 */ {Cache_UNKNOWN, 0 }, 1.305 +/* 59 */ {Cache_UNKNOWN, 0 }, 1.306 +/* 5a */ {Cache_UNKNOWN, 0 }, 1.307 +/* 5b */ {Cache_TLBd, 0 }, 1.308 +/* 5c */ {Cache_TLBd, 0 }, 1.309 +/* 5d */ {Cache_TLBd, 0 }, 1.310 +/* 5e */ {Cache_UNKNOWN, 0 }, 1.311 +/* 5f */ {Cache_UNKNOWN, 0 }, 1.312 +/* 60 */ {Cache_UNKNOWN, 0 }, 1.313 +/* 61 */ {Cache_UNKNOWN, 0 }, 1.314 +/* 62 */ {Cache_UNKNOWN, 0 }, 1.315 +/* 63 */ {Cache_UNKNOWN, 0 }, 1.316 +/* 64 */ {Cache_UNKNOWN, 0 }, 1.317 +/* 65 */ {Cache_UNKNOWN, 0 }, 1.318 +/* 66 */ {Cache_L1d, 64 }, 1.319 +/* 67 */ {Cache_L1d, 64 }, 1.320 +/* 68 */ {Cache_L1d, 64 }, 1.321 +/* 69 */ {Cache_UNKNOWN, 0 }, 1.322 +/* 6a */ {Cache_UNKNOWN, 0 }, 1.323 +/* 6b */ {Cache_UNKNOWN, 0 }, 1.324 +/* 6c */ {Cache_UNKNOWN, 0 }, 1.325 +/* 6d */ {Cache_UNKNOWN, 0 }, 1.326 +/* 6e */ {Cache_UNKNOWN, 0 }, 1.327 +/* 6f */ {Cache_UNKNOWN, 0 }, 1.328 +/* 70 */ {Cache_Trace, 1 }, 1.329 +/* 71 */ {Cache_Trace, 1 }, 1.330 +/* 72 */ {Cache_Trace, 1 }, 1.331 +/* 73 */ {Cache_UNKNOWN, 0 }, 1.332 +/* 74 */ {Cache_UNKNOWN, 0 }, 1.333 +/* 75 */ {Cache_UNKNOWN, 0 }, 1.334 +/* 76 */ {Cache_UNKNOWN, 0 }, 1.335 +/* 77 */ {Cache_UNKNOWN, 0 }, 1.336 +/* 78 */ {Cache_UNKNOWN, 0 }, 1.337 +/* 79 */ {Cache_L2, 64 }, 1.338 +/* 7a */ {Cache_L2, 64 }, 1.339 +/* 7b */ {Cache_L2, 64 }, 1.340 +/* 7c */ {Cache_L2, 64 }, 1.341 +/* 7d */ {Cache_UNKNOWN, 0 }, 1.342 +/* 7e */ {Cache_UNKNOWN, 0 }, 1.343 +/* 7f */ {Cache_UNKNOWN, 0 }, 1.344 +/* 80 */ {Cache_UNKNOWN, 0 }, 1.345 +/* 81 */ {Cache_UNKNOWN, 0 }, 1.346 +/* 82 */ {Cache_L2, 32 }, 1.347 +/* 83 */ {Cache_L2, 32 }, 1.348 +/* 84 */ {Cache_L2, 32 }, 1.349 +/* 85 */ {Cache_L2, 32 }, 1.350 +/* 86 */ {Cache_L2, 64 }, 1.351 +/* 87 */ {Cache_L2, 64 }, 1.352 +/* 88 */ {Cache_UNKNOWN, 0 }, 1.353 +/* 89 */ {Cache_UNKNOWN, 0 }, 1.354 +/* 8a */ {Cache_UNKNOWN, 0 }, 1.355 +/* 8b */ {Cache_UNKNOWN, 0 }, 1.356 +/* 8c */ {Cache_UNKNOWN, 0 }, 1.357 +/* 8d */ {Cache_UNKNOWN, 0 }, 1.358 +/* 8e */ {Cache_UNKNOWN, 0 }, 1.359 +/* 8f */ {Cache_UNKNOWN, 0 }, 1.360 +/* 90 */ {Cache_UNKNOWN, 0 }, 1.361 +/* 91 */ {Cache_UNKNOWN, 0 }, 1.362 +/* 92 */ {Cache_UNKNOWN, 0 }, 1.363 +/* 93 */ {Cache_UNKNOWN, 0 }, 1.364 +/* 94 */ {Cache_UNKNOWN, 0 }, 1.365 +/* 95 */ {Cache_UNKNOWN, 0 }, 1.366 +/* 96 */ {Cache_UNKNOWN, 0 }, 1.367 +/* 97 */ {Cache_UNKNOWN, 0 }, 1.368 +/* 98 */ {Cache_UNKNOWN, 0 }, 1.369 +/* 99 */ {Cache_UNKNOWN, 0 }, 1.370 +/* 9a */ {Cache_UNKNOWN, 0 }, 1.371 +/* 9b */ {Cache_UNKNOWN, 0 }, 1.372 +/* 9c */ {Cache_UNKNOWN, 0 }, 1.373 +/* 9d */ {Cache_UNKNOWN, 0 }, 1.374 +/* 9e */ {Cache_UNKNOWN, 0 }, 1.375 +/* 9f */ {Cache_UNKNOWN, 0 }, 1.376 +/* a0 */ {Cache_UNKNOWN, 0 }, 1.377 +/* a1 */ {Cache_UNKNOWN, 0 }, 1.378 +/* a2 */ {Cache_UNKNOWN, 0 }, 1.379 +/* a3 */ {Cache_UNKNOWN, 0 }, 1.380 +/* a4 */ {Cache_UNKNOWN, 0 }, 1.381 +/* a5 */ {Cache_UNKNOWN, 0 }, 1.382 +/* a6 */ {Cache_UNKNOWN, 0 }, 1.383 +/* a7 */ {Cache_UNKNOWN, 0 }, 1.384 +/* a8 */ {Cache_UNKNOWN, 0 }, 1.385 +/* a9 */ {Cache_UNKNOWN, 0 }, 1.386 +/* aa */ {Cache_UNKNOWN, 0 }, 1.387 +/* ab */ {Cache_UNKNOWN, 0 }, 1.388 +/* ac */ {Cache_UNKNOWN, 0 }, 1.389 +/* ad */ {Cache_UNKNOWN, 0 }, 1.390 +/* ae */ {Cache_UNKNOWN, 0 }, 1.391 +/* af */ {Cache_UNKNOWN, 0 }, 1.392 +/* b0 */ {Cache_TLBi, 0 }, 1.393 +/* b1 */ {Cache_UNKNOWN, 0 }, 1.394 +/* b2 */ {Cache_UNKNOWN, 0 }, 1.395 +/* b3 */ {Cache_TLBd, 0 }, 1.396 +/* b4 */ {Cache_UNKNOWN, 0 }, 1.397 +/* b5 */ {Cache_UNKNOWN, 0 }, 1.398 +/* b6 */ {Cache_UNKNOWN, 0 }, 1.399 +/* b7 */ {Cache_UNKNOWN, 0 }, 1.400 +/* b8 */ {Cache_UNKNOWN, 0 }, 1.401 +/* b9 */ {Cache_UNKNOWN, 0 }, 1.402 +/* ba */ {Cache_UNKNOWN, 0 }, 1.403 +/* bb */ {Cache_UNKNOWN, 0 }, 1.404 +/* bc */ {Cache_UNKNOWN, 0 }, 1.405 +/* bd */ {Cache_UNKNOWN, 0 }, 1.406 +/* be */ {Cache_UNKNOWN, 0 }, 1.407 +/* bf */ {Cache_UNKNOWN, 0 }, 1.408 +/* c0 */ {Cache_UNKNOWN, 0 }, 1.409 +/* c1 */ {Cache_UNKNOWN, 0 }, 1.410 +/* c2 */ {Cache_UNKNOWN, 0 }, 1.411 +/* c3 */ {Cache_UNKNOWN, 0 }, 1.412 +/* c4 */ {Cache_UNKNOWN, 0 }, 1.413 +/* c5 */ {Cache_UNKNOWN, 0 }, 1.414 +/* c6 */ {Cache_UNKNOWN, 0 }, 1.415 +/* c7 */ {Cache_UNKNOWN, 0 }, 1.416 +/* c8 */ {Cache_UNKNOWN, 0 }, 1.417 +/* c9 */ {Cache_UNKNOWN, 0 }, 1.418 +/* ca */ {Cache_UNKNOWN, 0 }, 1.419 +/* cb */ {Cache_UNKNOWN, 0 }, 1.420 +/* cc */ {Cache_UNKNOWN, 0 }, 1.421 +/* cd */ {Cache_UNKNOWN, 0 }, 1.422 +/* ce */ {Cache_UNKNOWN, 0 }, 1.423 +/* cf */ {Cache_UNKNOWN, 0 }, 1.424 +/* d0 */ {Cache_UNKNOWN, 0 }, 1.425 +/* d1 */ {Cache_UNKNOWN, 0 }, 1.426 +/* d2 */ {Cache_UNKNOWN, 0 }, 1.427 +/* d3 */ {Cache_UNKNOWN, 0 }, 1.428 +/* d4 */ {Cache_UNKNOWN, 0 }, 1.429 +/* d5 */ {Cache_UNKNOWN, 0 }, 1.430 +/* d6 */ {Cache_UNKNOWN, 0 }, 1.431 +/* d7 */ {Cache_UNKNOWN, 0 }, 1.432 +/* d8 */ {Cache_UNKNOWN, 0 }, 1.433 +/* d9 */ {Cache_UNKNOWN, 0 }, 1.434 +/* da */ {Cache_UNKNOWN, 0 }, 1.435 +/* db */ {Cache_UNKNOWN, 0 }, 1.436 +/* dc */ {Cache_UNKNOWN, 0 }, 1.437 +/* dd */ {Cache_UNKNOWN, 0 }, 1.438 +/* de */ {Cache_UNKNOWN, 0 }, 1.439 +/* df */ {Cache_UNKNOWN, 0 }, 1.440 +/* e0 */ {Cache_UNKNOWN, 0 }, 1.441 +/* e1 */ {Cache_UNKNOWN, 0 }, 1.442 +/* e2 */ {Cache_UNKNOWN, 0 }, 1.443 +/* e3 */ {Cache_UNKNOWN, 0 }, 1.444 +/* e4 */ {Cache_UNKNOWN, 0 }, 1.445 +/* e5 */ {Cache_UNKNOWN, 0 }, 1.446 +/* e6 */ {Cache_UNKNOWN, 0 }, 1.447 +/* e7 */ {Cache_UNKNOWN, 0 }, 1.448 +/* e8 */ {Cache_UNKNOWN, 0 }, 1.449 +/* e9 */ {Cache_UNKNOWN, 0 }, 1.450 +/* ea */ {Cache_UNKNOWN, 0 }, 1.451 +/* eb */ {Cache_UNKNOWN, 0 }, 1.452 +/* ec */ {Cache_UNKNOWN, 0 }, 1.453 +/* ed */ {Cache_UNKNOWN, 0 }, 1.454 +/* ee */ {Cache_UNKNOWN, 0 }, 1.455 +/* ef */ {Cache_UNKNOWN, 0 }, 1.456 +/* f0 */ {Cache_UNKNOWN, 0 }, 1.457 +/* f1 */ {Cache_UNKNOWN, 0 }, 1.458 +/* f2 */ {Cache_UNKNOWN, 0 }, 1.459 +/* f3 */ {Cache_UNKNOWN, 0 }, 1.460 +/* f4 */ {Cache_UNKNOWN, 0 }, 1.461 +/* f5 */ {Cache_UNKNOWN, 0 }, 1.462 +/* f6 */ {Cache_UNKNOWN, 0 }, 1.463 +/* f7 */ {Cache_UNKNOWN, 0 }, 1.464 +/* f8 */ {Cache_UNKNOWN, 0 }, 1.465 +/* f9 */ {Cache_UNKNOWN, 0 }, 1.466 +/* fa */ {Cache_UNKNOWN, 0 }, 1.467 +/* fb */ {Cache_UNKNOWN, 0 }, 1.468 +/* fc */ {Cache_UNKNOWN, 0 }, 1.469 +/* fd */ {Cache_UNKNOWN, 0 }, 1.470 +/* fe */ {Cache_UNKNOWN, 0 }, 1.471 +/* ff */ {Cache_UNKNOWN, 0 } 1.472 +}; 1.473 + 1.474 + 1.475 +/* 1.476 + * use the above table to determine the CacheEntryLineSize. 1.477 + */ 1.478 +static void 1.479 +getIntelCacheEntryLineSize(unsigned long val, int *level, 1.480 + unsigned long *lineSize) 1.481 +{ 1.482 + CacheType type; 1.483 + 1.484 + type = CacheMap[val].type; 1.485 + /* only interested in data caches */ 1.486 + /* NOTE val = 0x40 is a special value that means no L2 or L3 cache. 1.487 + * this data check has the side effect of rejecting that entry. If 1.488 + * that wasn't the case, we could have to reject it explicitly */ 1.489 + if (CacheMap[val].lineSize == 0) { 1.490 + return; 1.491 + } 1.492 + /* look at the caches, skip types we aren't interested in. 1.493 + * if we already have a value for a lower level cache, skip the 1.494 + * current entry */ 1.495 + if ((type == Cache_L1)|| (type == Cache_L1d)) { 1.496 + *level = 1; 1.497 + *lineSize = CacheMap[val].lineSize; 1.498 + } else if ((*level >= 2) && ((type == Cache_L2) || (type == Cache_L2d))) { 1.499 + *level = 2; 1.500 + *lineSize = CacheMap[val].lineSize; 1.501 + } else if ((*level >= 3) && ((type == Cache_L3) || (type == Cache_L3d))) { 1.502 + *level = 3; 1.503 + *lineSize = CacheMap[val].lineSize; 1.504 + } 1.505 + return; 1.506 +} 1.507 + 1.508 + 1.509 +static void 1.510 +getIntelRegisterCacheLineSize(unsigned long val, 1.511 + int *level, unsigned long *lineSize) 1.512 +{ 1.513 + getIntelCacheEntryLineSize(val >> 24 & 0xff, level, lineSize); 1.514 + getIntelCacheEntryLineSize(val >> 16 & 0xff, level, lineSize); 1.515 + getIntelCacheEntryLineSize(val >> 8 & 0xff, level, lineSize); 1.516 + getIntelCacheEntryLineSize(val & 0xff, level, lineSize); 1.517 +} 1.518 + 1.519 +/* 1.520 + * returns '0' if no recognized cache is found, or if the cache 1.521 + * information is supported by this processor 1.522 + */ 1.523 +static unsigned long 1.524 +getIntelCacheLineSize(int cpuidLevel) 1.525 +{ 1.526 + int level = 4; 1.527 + unsigned long lineSize = 0; 1.528 + unsigned long eax, ebx, ecx, edx; 1.529 + int repeat, count; 1.530 + 1.531 + if (cpuidLevel < 2) { 1.532 + return 0; 1.533 + } 1.534 + 1.535 + /* command '2' of the cpuid is intel's cache info call. Each byte of the 1.536 + * 4 registers contain a potential descriptor for the cache. The CacheMap 1.537 + * table maps the cache entry with the processor cache. Register 'al' 1.538 + * contains a count value that cpuid '2' needs to be called in order to 1.539 + * find all the cache descriptors. Only registers with the high bit set 1.540 + * to 'zero' have valid descriptors. This code loops through all the 1.541 + * required calls to cpuid '2' and passes any valid descriptors it finds 1.542 + * to the getIntelRegisterCacheLineSize code, which breaks the registers 1.543 + * down into their component descriptors. In the end the lineSize of the 1.544 + * lowest level cache data cache is returned. */ 1.545 + freebl_cpuid(2, &eax, &ebx, &ecx, &edx); 1.546 + repeat = eax & 0xf; 1.547 + for (count = 0; count < repeat; count++) { 1.548 + if ((eax & 0x80000000) == 0) { 1.549 + getIntelRegisterCacheLineSize(eax & 0xffffff00, &level, &lineSize); 1.550 + } 1.551 + if ((ebx & 0x80000000) == 0) { 1.552 + getIntelRegisterCacheLineSize(ebx, &level, &lineSize); 1.553 + } 1.554 + if ((ecx & 0x80000000) == 0) { 1.555 + getIntelRegisterCacheLineSize(ecx, &level, &lineSize); 1.556 + } 1.557 + if ((edx & 0x80000000) == 0) { 1.558 + getIntelRegisterCacheLineSize(edx, &level, &lineSize); 1.559 + } 1.560 + if (count+1 != repeat) { 1.561 + freebl_cpuid(2, &eax, &ebx, &ecx, &edx); 1.562 + } 1.563 + } 1.564 + return lineSize; 1.565 +} 1.566 + 1.567 +/* 1.568 + * returns '0' if the cache info is not supported by this processor. 1.569 + * This is based on the AMD extended cache commands for cpuid. 1.570 + * (see "AMD Processor Recognition Application Note" Publication 20734). 1.571 + * Some other processors use the identical scheme. 1.572 + * (see "Processor Recognition, Transmeta Corporation"). 1.573 + */ 1.574 +static unsigned long 1.575 +getOtherCacheLineSize(unsigned long cpuidLevel) 1.576 +{ 1.577 + unsigned long lineSize = 0; 1.578 + unsigned long eax, ebx, ecx, edx; 1.579 + 1.580 + /* get the Extended CPUID level */ 1.581 + freebl_cpuid(0x80000000, &eax, &ebx, &ecx, &edx); 1.582 + cpuidLevel = eax; 1.583 + 1.584 + if (cpuidLevel >= 0x80000005) { 1.585 + freebl_cpuid(0x80000005, &eax, &ebx, &ecx, &edx); 1.586 + lineSize = ecx & 0xff; /* line Size, L1 Data Cache */ 1.587 + } 1.588 + return lineSize; 1.589 +} 1.590 + 1.591 +static const char * const manMap[] = { 1.592 +#define INTEL 0 1.593 + "GenuineIntel", 1.594 +#define AMD 1 1.595 + "AuthenticAMD", 1.596 +#define CYRIX 2 1.597 + "CyrixInstead", 1.598 +#define CENTAUR 2 1.599 + "CentaurHauls", 1.600 +#define NEXGEN 3 1.601 + "NexGenDriven", 1.602 +#define TRANSMETA 4 1.603 + "GenuineTMx86", 1.604 +#define RISE 5 1.605 + "RiseRiseRise", 1.606 +#define UMC 6 1.607 + "UMC UMC UMC ", 1.608 +#define SIS 7 1.609 + "Sis Sis Sis ", 1.610 +#define NATIONAL 8 1.611 + "Geode by NSC", 1.612 +}; 1.613 + 1.614 +static const int n_manufacturers = sizeof(manMap)/sizeof(manMap[0]); 1.615 + 1.616 + 1.617 +#define MAN_UNKNOWN 9 1.618 + 1.619 +#if !defined(AMD_64) 1.620 +#define SSE2_FLAG (1<<26) 1.621 +unsigned long 1.622 +s_mpi_is_sse2() 1.623 +{ 1.624 + unsigned long eax, ebx, ecx, edx; 1.625 + int manufacturer = MAN_UNKNOWN; 1.626 + int i; 1.627 + char string[13]; 1.628 + 1.629 + if (is386() || is486()) { 1.630 + return 0; 1.631 + } 1.632 + freebl_cpuid(0, &eax, &ebx, &ecx, &edx); 1.633 + /* string holds the CPU's manufacturer ID string - a twelve 1.634 + * character ASCII string stored in ebx, edx, ecx, and 1.635 + * the 32-bit extended feature flags are in edx, ecx. 1.636 + */ 1.637 + *(int *)string = ebx; 1.638 + *(int *)&string[4] = (int)edx; 1.639 + *(int *)&string[8] = (int)ecx; 1.640 + string[12] = 0; 1.641 + 1.642 + /* has no SSE2 extensions */ 1.643 + if (eax == 0) { 1.644 + return 0; 1.645 + } 1.646 + 1.647 + for (i=0; i < n_manufacturers; i++) { 1.648 + if ( strcmp(manMap[i],string) == 0) { 1.649 + manufacturer = i; 1.650 + break; 1.651 + } 1.652 + } 1.653 + 1.654 + freebl_cpuid(1,&eax,&ebx,&ecx,&edx); 1.655 + return (edx & SSE2_FLAG) == SSE2_FLAG; 1.656 +} 1.657 +#endif 1.658 + 1.659 +unsigned long 1.660 +s_mpi_getProcessorLineSize() 1.661 +{ 1.662 + unsigned long eax, ebx, ecx, edx; 1.663 + unsigned long cpuidLevel; 1.664 + unsigned long cacheLineSize = 0; 1.665 + int manufacturer = MAN_UNKNOWN; 1.666 + int i; 1.667 + char string[65]; 1.668 + 1.669 +#if !defined(AMD_64) 1.670 + if (is386()) { 1.671 + return 0; /* 386 had no cache */ 1.672 + } if (is486()) { 1.673 + return 32; /* really? need more info */ 1.674 + } 1.675 +#endif 1.676 + 1.677 + /* Pentium, cpuid command is available */ 1.678 + freebl_cpuid(0, &eax, &ebx, &ecx, &edx); 1.679 + cpuidLevel = eax; 1.680 + /* string holds the CPU's manufacturer ID string - a twelve 1.681 + * character ASCII string stored in ebx, edx, ecx, and 1.682 + * the 32-bit extended feature flags are in edx, ecx. 1.683 + */ 1.684 + *(int *)string = ebx; 1.685 + *(int *)&string[4] = (int)edx; 1.686 + *(int *)&string[8] = (int)ecx; 1.687 + string[12] = 0; 1.688 + 1.689 + manufacturer = MAN_UNKNOWN; 1.690 + for (i=0; i < n_manufacturers; i++) { 1.691 + if ( strcmp(manMap[i],string) == 0) { 1.692 + manufacturer = i; 1.693 + } 1.694 + } 1.695 + 1.696 + if (manufacturer == INTEL) { 1.697 + cacheLineSize = getIntelCacheLineSize(cpuidLevel); 1.698 + } else { 1.699 + cacheLineSize = getOtherCacheLineSize(cpuidLevel); 1.700 + } 1.701 + /* doesn't support cache info based on cpuid. This means 1.702 + * an old pentium class processor, which have cache lines of 1.703 + * 32. If we learn differently, we can use a switch based on 1.704 + * the Manufacturer id */ 1.705 + if (cacheLineSize == 0) { 1.706 + cacheLineSize = 32; 1.707 + } 1.708 + return cacheLineSize; 1.709 +} 1.710 +#define MPI_GET_PROCESSOR_LINE_SIZE_DEFINED 1 1.711 +#endif 1.712 + 1.713 +#if defined(__ppc64__) 1.714 +/* 1.715 + * Sigh, The PPC has some really nice features to help us determine cache 1.716 + * size, since it had lots of direct control functions to do so. The POWER 1.717 + * processor even has an instruction to do this, but it was dropped in 1.718 + * PowerPC. Unfortunately most of them are not available in user mode. 1.719 + * 1.720 + * The dcbz function would be a great way to determine cache line size except 1.721 + * 1) it only works on write-back memory (it throws an exception otherwise), 1.722 + * and 2) because so many mac programs 'knew' the processor cache size was 1.723 + * 32 bytes, they used this instruction as a fast 'zero 32 bytes'. Now the new 1.724 + * G5 processor has 128 byte cache, but dcbz only clears 32 bytes to keep 1.725 + * these programs happy. dcbzl work if 64 bit instructions are supported. 1.726 + * If you know 64 bit instructions are supported, and that stack is 1.727 + * write-back, you can use this code. 1.728 + */ 1.729 +#include "memory.h" 1.730 + 1.731 +/* clear the cache line that contains 'array' */ 1.732 +static inline void dcbzl(char *array) 1.733 +{ 1.734 + register char *a asm("r2") = array; 1.735 + __asm__ __volatile__( "dcbzl %0,r0" : "=r" (a): "0"(a) ); 1.736 +} 1.737 + 1.738 + 1.739 +#define PPC_DO_ALIGN(x,y) ((char *)\ 1.740 + ((((long long) (x))+((y)-1))&~((y)-1))) 1.741 + 1.742 +#define PPC_MAX_LINE_SIZE 256 1.743 +unsigned long 1.744 +s_mpi_getProcessorLineSize() 1.745 +{ 1.746 + char testArray[2*PPC_MAX_LINE_SIZE+1]; 1.747 + char *test; 1.748 + int i; 1.749 + 1.750 + /* align the array on a maximum line size boundary, so we 1.751 + * know we are starting to clear from the first address */ 1.752 + test = PPC_DO_ALIGN(testArray, PPC_MAX_LINE_SIZE); 1.753 + /* set all the values to 1's */ 1.754 + memset(test, 0xff, PPC_MAX_LINE_SIZE); 1.755 + /* clear one cache block starting at 'test' */ 1.756 + dcbzl(test); 1.757 + 1.758 + /* find the size of the cleared area, that's our block size */ 1.759 + for (i=PPC_MAX_LINE_SIZE; i != 0; i = i/2) { 1.760 + if (test[i-1] == 0) { 1.761 + return i; 1.762 + } 1.763 + } 1.764 + return 0; 1.765 +} 1.766 + 1.767 +#define MPI_GET_PROCESSOR_LINE_SIZE_DEFINED 1 1.768 +#endif 1.769 + 1.770 + 1.771 +/* 1.772 + * put other processor and platform specific cache code here 1.773 + * return the smallest cache line size in bytes on the processor 1.774 + * (usually the L1 cache). If the OS has a call, this would be 1.775 + * a greate place to put it. 1.776 + * 1.777 + * If there is no cache, return 0; 1.778 + * 1.779 + * define MPI_GET_PROCESSOR_LINE_SIZE_DEFINED so the generic functions 1.780 + * below aren't compiled. 1.781 + * 1.782 + */ 1.783 + 1.784 + 1.785 +/* target.mk can define MPI_CACHE_LINE_SIZE if it's common for the family or 1.786 + * OS */ 1.787 +#if defined(MPI_CACHE_LINE_SIZE) && !defined(MPI_GET_PROCESSOR_LINE_SIZE_DEFINED) 1.788 + 1.789 +unsigned long 1.790 +s_mpi_getProcessorLineSize() 1.791 +{ 1.792 + return MPI_CACHE_LINE_SIZE; 1.793 +} 1.794 +#define MPI_GET_PROCESSOR_LINE_SIZE_DEFINED 1 1.795 +#endif 1.796 + 1.797 + 1.798 +/* If no way to get the processor cache line size has been defined, assume 1.799 + * it's 32 bytes (most common value, does not significantly impact performance) 1.800 + */ 1.801 +#ifndef MPI_GET_PROCESSOR_LINE_SIZE_DEFINED 1.802 +unsigned long 1.803 +s_mpi_getProcessorLineSize() 1.804 +{ 1.805 + return 32; 1.806 +} 1.807 +#endif 1.808 + 1.809 +#ifdef TEST_IT 1.810 +#include <stdio.h> 1.811 + 1.812 +main() 1.813 +{ 1.814 + printf("line size = %d\n", s_mpi_getProcessorLineSize()); 1.815 +} 1.816 +#endif