1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/modules/libbz2/src/blocksort.c Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1094 @@ 1.4 + 1.5 +/*-------------------------------------------------------------*/ 1.6 +/*--- Block sorting machinery ---*/ 1.7 +/*--- blocksort.c ---*/ 1.8 +/*-------------------------------------------------------------*/ 1.9 + 1.10 +/* ------------------------------------------------------------------ 1.11 + This file is part of bzip2/libbzip2, a program and library for 1.12 + lossless, block-sorting data compression. 1.13 + 1.14 + bzip2/libbzip2 version 1.0.4 of 20 December 2006 1.15 + Copyright (C) 1996-2006 Julian Seward <jseward@bzip.org> 1.16 + 1.17 + Please read the WARNING, DISCLAIMER and PATENTS sections in the 1.18 + README file. 1.19 + 1.20 + This program is released under the terms of the license contained 1.21 + in the file LICENSE. 1.22 + ------------------------------------------------------------------ */ 1.23 + 1.24 + 1.25 +#include "bzlib_private.h" 1.26 + 1.27 +/*---------------------------------------------*/ 1.28 +/*--- Fallback O(N log(N)^2) sorting ---*/ 1.29 +/*--- algorithm, for repetitive blocks ---*/ 1.30 +/*---------------------------------------------*/ 1.31 + 1.32 +/*---------------------------------------------*/ 1.33 +static 1.34 +__inline__ 1.35 +void fallbackSimpleSort ( UInt32* fmap, 1.36 + UInt32* eclass, 1.37 + Int32 lo, 1.38 + Int32 hi ) 1.39 +{ 1.40 + Int32 i, j, tmp; 1.41 + UInt32 ec_tmp; 1.42 + 1.43 + if (lo == hi) return; 1.44 + 1.45 + if (hi - lo > 3) { 1.46 + for ( i = hi-4; i >= lo; i-- ) { 1.47 + tmp = fmap[i]; 1.48 + ec_tmp = eclass[tmp]; 1.49 + for ( j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4 ) 1.50 + fmap[j-4] = fmap[j]; 1.51 + fmap[j-4] = tmp; 1.52 + } 1.53 + } 1.54 + 1.55 + for ( i = hi-1; i >= lo; i-- ) { 1.56 + tmp = fmap[i]; 1.57 + ec_tmp = eclass[tmp]; 1.58 + for ( j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++ ) 1.59 + fmap[j-1] = fmap[j]; 1.60 + fmap[j-1] = tmp; 1.61 + } 1.62 +} 1.63 + 1.64 + 1.65 +/*---------------------------------------------*/ 1.66 +#define fswap(zz1, zz2) \ 1.67 + { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; } 1.68 + 1.69 +#define fvswap(zzp1, zzp2, zzn) \ 1.70 +{ \ 1.71 + Int32 yyp1 = (zzp1); \ 1.72 + Int32 yyp2 = (zzp2); \ 1.73 + Int32 yyn = (zzn); \ 1.74 + while (yyn > 0) { \ 1.75 + fswap(fmap[yyp1], fmap[yyp2]); \ 1.76 + yyp1++; yyp2++; yyn--; \ 1.77 + } \ 1.78 +} 1.79 + 1.80 + 1.81 +#define fmin(a,b) ((a) < (b)) ? (a) : (b) 1.82 + 1.83 +#define fpush(lz,hz) { stackLo[sp] = lz; \ 1.84 + stackHi[sp] = hz; \ 1.85 + sp++; } 1.86 + 1.87 +#define fpop(lz,hz) { sp--; \ 1.88 + lz = stackLo[sp]; \ 1.89 + hz = stackHi[sp]; } 1.90 + 1.91 +#define FALLBACK_QSORT_SMALL_THRESH 10 1.92 +#define FALLBACK_QSORT_STACK_SIZE 100 1.93 + 1.94 + 1.95 +static 1.96 +void fallbackQSort3 ( UInt32* fmap, 1.97 + UInt32* eclass, 1.98 + Int32 loSt, 1.99 + Int32 hiSt ) 1.100 +{ 1.101 + Int32 unLo, unHi, ltLo, gtHi, n, m; 1.102 + Int32 sp, lo, hi; 1.103 + UInt32 med, r, r3; 1.104 + Int32 stackLo[FALLBACK_QSORT_STACK_SIZE]; 1.105 + Int32 stackHi[FALLBACK_QSORT_STACK_SIZE]; 1.106 + 1.107 + r = 0; 1.108 + 1.109 + sp = 0; 1.110 + fpush ( loSt, hiSt ); 1.111 + 1.112 + while (sp > 0) { 1.113 + 1.114 + AssertH ( sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004 ); 1.115 + 1.116 + fpop ( lo, hi ); 1.117 + if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) { 1.118 + fallbackSimpleSort ( fmap, eclass, lo, hi ); 1.119 + continue; 1.120 + } 1.121 + 1.122 + /* Random partitioning. Median of 3 sometimes fails to 1.123 + avoid bad cases. Median of 9 seems to help but 1.124 + looks rather expensive. This too seems to work but 1.125 + is cheaper. Guidance for the magic constants 1.126 + 7621 and 32768 is taken from Sedgewick's algorithms 1.127 + book, chapter 35. 1.128 + */ 1.129 + r = ((r * 7621) + 1) % 32768; 1.130 + r3 = r % 3; 1.131 + if (r3 == 0) med = eclass[fmap[lo]]; else 1.132 + if (r3 == 1) med = eclass[fmap[(lo+hi)>>1]]; else 1.133 + med = eclass[fmap[hi]]; 1.134 + 1.135 + unLo = ltLo = lo; 1.136 + unHi = gtHi = hi; 1.137 + 1.138 + while (1) { 1.139 + while (1) { 1.140 + if (unLo > unHi) break; 1.141 + n = (Int32)eclass[fmap[unLo]] - (Int32)med; 1.142 + if (n == 0) { 1.143 + fswap(fmap[unLo], fmap[ltLo]); 1.144 + ltLo++; unLo++; 1.145 + continue; 1.146 + }; 1.147 + if (n > 0) break; 1.148 + unLo++; 1.149 + } 1.150 + while (1) { 1.151 + if (unLo > unHi) break; 1.152 + n = (Int32)eclass[fmap[unHi]] - (Int32)med; 1.153 + if (n == 0) { 1.154 + fswap(fmap[unHi], fmap[gtHi]); 1.155 + gtHi--; unHi--; 1.156 + continue; 1.157 + }; 1.158 + if (n < 0) break; 1.159 + unHi--; 1.160 + } 1.161 + if (unLo > unHi) break; 1.162 + fswap(fmap[unLo], fmap[unHi]); unLo++; unHi--; 1.163 + } 1.164 + 1.165 + AssertD ( unHi == unLo-1, "fallbackQSort3(2)" ); 1.166 + 1.167 + if (gtHi < ltLo) continue; 1.168 + 1.169 + n = fmin(ltLo-lo, unLo-ltLo); fvswap(lo, unLo-n, n); 1.170 + m = fmin(hi-gtHi, gtHi-unHi); fvswap(unLo, hi-m+1, m); 1.171 + 1.172 + n = lo + unLo - ltLo - 1; 1.173 + m = hi - (gtHi - unHi) + 1; 1.174 + 1.175 + if (n - lo > hi - m) { 1.176 + fpush ( lo, n ); 1.177 + fpush ( m, hi ); 1.178 + } else { 1.179 + fpush ( m, hi ); 1.180 + fpush ( lo, n ); 1.181 + } 1.182 + } 1.183 +} 1.184 + 1.185 +#undef fmin 1.186 +#undef fpush 1.187 +#undef fpop 1.188 +#undef fswap 1.189 +#undef fvswap 1.190 +#undef FALLBACK_QSORT_SMALL_THRESH 1.191 +#undef FALLBACK_QSORT_STACK_SIZE 1.192 + 1.193 + 1.194 +/*---------------------------------------------*/ 1.195 +/* Pre: 1.196 + nblock > 0 1.197 + eclass exists for [0 .. nblock-1] 1.198 + ((UChar*)eclass) [0 .. nblock-1] holds block 1.199 + ptr exists for [0 .. nblock-1] 1.200 + 1.201 + Post: 1.202 + ((UChar*)eclass) [0 .. nblock-1] holds block 1.203 + All other areas of eclass destroyed 1.204 + fmap [0 .. nblock-1] holds sorted order 1.205 + bhtab [ 0 .. 2+(nblock/32) ] destroyed 1.206 +*/ 1.207 + 1.208 +#define SET_BH(zz) bhtab[(zz) >> 5] |= (1 << ((zz) & 31)) 1.209 +#define CLEAR_BH(zz) bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31)) 1.210 +#define ISSET_BH(zz) (bhtab[(zz) >> 5] & (1 << ((zz) & 31))) 1.211 +#define WORD_BH(zz) bhtab[(zz) >> 5] 1.212 +#define UNALIGNED_BH(zz) ((zz) & 0x01f) 1.213 + 1.214 +static 1.215 +void fallbackSort ( UInt32* fmap, 1.216 + UInt32* eclass, 1.217 + UInt32* bhtab, 1.218 + Int32 nblock, 1.219 + Int32 verb ) 1.220 +{ 1.221 + Int32 ftab[257]; 1.222 + Int32 ftabCopy[256]; 1.223 + Int32 H, i, j, k, l, r, cc, cc1; 1.224 + Int32 nNotDone; 1.225 + Int32 nBhtab; 1.226 + UChar* eclass8 = (UChar*)eclass; 1.227 + 1.228 + /*-- 1.229 + Initial 1-char radix sort to generate 1.230 + initial fmap and initial BH bits. 1.231 + --*/ 1.232 + if (verb >= 4) 1.233 + VPrintf0 ( " bucket sorting ...\n" ); 1.234 + for (i = 0; i < 257; i++) ftab[i] = 0; 1.235 + for (i = 0; i < nblock; i++) ftab[eclass8[i]]++; 1.236 + for (i = 0; i < 256; i++) ftabCopy[i] = ftab[i]; 1.237 + for (i = 1; i < 257; i++) ftab[i] += ftab[i-1]; 1.238 + 1.239 + for (i = 0; i < nblock; i++) { 1.240 + j = eclass8[i]; 1.241 + k = ftab[j] - 1; 1.242 + ftab[j] = k; 1.243 + fmap[k] = i; 1.244 + } 1.245 + 1.246 + nBhtab = 2 + (nblock / 32); 1.247 + for (i = 0; i < nBhtab; i++) bhtab[i] = 0; 1.248 + for (i = 0; i < 256; i++) SET_BH(ftab[i]); 1.249 + 1.250 + /*-- 1.251 + Inductively refine the buckets. Kind-of an 1.252 + "exponential radix sort" (!), inspired by the 1.253 + Manber-Myers suffix array construction algorithm. 1.254 + --*/ 1.255 + 1.256 + /*-- set sentinel bits for block-end detection --*/ 1.257 + for (i = 0; i < 32; i++) { 1.258 + SET_BH(nblock + 2*i); 1.259 + CLEAR_BH(nblock + 2*i + 1); 1.260 + } 1.261 + 1.262 + /*-- the log(N) loop --*/ 1.263 + H = 1; 1.264 + while (1) { 1.265 + 1.266 + if (verb >= 4) 1.267 + VPrintf1 ( " depth %6d has ", H ); 1.268 + 1.269 + j = 0; 1.270 + for (i = 0; i < nblock; i++) { 1.271 + if (ISSET_BH(i)) j = i; 1.272 + k = fmap[i] - H; if (k < 0) k += nblock; 1.273 + eclass[k] = j; 1.274 + } 1.275 + 1.276 + nNotDone = 0; 1.277 + r = -1; 1.278 + while (1) { 1.279 + 1.280 + /*-- find the next non-singleton bucket --*/ 1.281 + k = r + 1; 1.282 + while (ISSET_BH(k) && UNALIGNED_BH(k)) k++; 1.283 + if (ISSET_BH(k)) { 1.284 + while (WORD_BH(k) == 0xffffffff) k += 32; 1.285 + while (ISSET_BH(k)) k++; 1.286 + } 1.287 + l = k - 1; 1.288 + if (l >= nblock) break; 1.289 + while (!ISSET_BH(k) && UNALIGNED_BH(k)) k++; 1.290 + if (!ISSET_BH(k)) { 1.291 + while (WORD_BH(k) == 0x00000000) k += 32; 1.292 + while (!ISSET_BH(k)) k++; 1.293 + } 1.294 + r = k - 1; 1.295 + if (r >= nblock) break; 1.296 + 1.297 + /*-- now [l, r] bracket current bucket --*/ 1.298 + if (r > l) { 1.299 + nNotDone += (r - l + 1); 1.300 + fallbackQSort3 ( fmap, eclass, l, r ); 1.301 + 1.302 + /*-- scan bucket and generate header bits-- */ 1.303 + cc = -1; 1.304 + for (i = l; i <= r; i++) { 1.305 + cc1 = eclass[fmap[i]]; 1.306 + if (cc != cc1) { SET_BH(i); cc = cc1; }; 1.307 + } 1.308 + } 1.309 + } 1.310 + 1.311 + if (verb >= 4) 1.312 + VPrintf1 ( "%6d unresolved strings\n", nNotDone ); 1.313 + 1.314 + H *= 2; 1.315 + if (H > nblock || nNotDone == 0) break; 1.316 + } 1.317 + 1.318 + /*-- 1.319 + Reconstruct the original block in 1.320 + eclass8 [0 .. nblock-1], since the 1.321 + previous phase destroyed it. 1.322 + --*/ 1.323 + if (verb >= 4) 1.324 + VPrintf0 ( " reconstructing block ...\n" ); 1.325 + j = 0; 1.326 + for (i = 0; i < nblock; i++) { 1.327 + while (ftabCopy[j] == 0) j++; 1.328 + ftabCopy[j]--; 1.329 + eclass8[fmap[i]] = (UChar)j; 1.330 + } 1.331 + AssertH ( j < 256, 1005 ); 1.332 +} 1.333 + 1.334 +#undef SET_BH 1.335 +#undef CLEAR_BH 1.336 +#undef ISSET_BH 1.337 +#undef WORD_BH 1.338 +#undef UNALIGNED_BH 1.339 + 1.340 + 1.341 +/*---------------------------------------------*/ 1.342 +/*--- The main, O(N^2 log(N)) sorting ---*/ 1.343 +/*--- algorithm. Faster for "normal" ---*/ 1.344 +/*--- non-repetitive blocks. ---*/ 1.345 +/*---------------------------------------------*/ 1.346 + 1.347 +/*---------------------------------------------*/ 1.348 +static 1.349 +__inline__ 1.350 +Bool mainGtU ( UInt32 i1, 1.351 + UInt32 i2, 1.352 + UChar* block, 1.353 + UInt16* quadrant, 1.354 + UInt32 nblock, 1.355 + Int32* budget ) 1.356 +{ 1.357 + Int32 k; 1.358 + UChar c1, c2; 1.359 + UInt16 s1, s2; 1.360 + 1.361 + AssertD ( i1 != i2, "mainGtU" ); 1.362 + /* 1 */ 1.363 + c1 = block[i1]; c2 = block[i2]; 1.364 + if (c1 != c2) return (c1 > c2); 1.365 + i1++; i2++; 1.366 + /* 2 */ 1.367 + c1 = block[i1]; c2 = block[i2]; 1.368 + if (c1 != c2) return (c1 > c2); 1.369 + i1++; i2++; 1.370 + /* 3 */ 1.371 + c1 = block[i1]; c2 = block[i2]; 1.372 + if (c1 != c2) return (c1 > c2); 1.373 + i1++; i2++; 1.374 + /* 4 */ 1.375 + c1 = block[i1]; c2 = block[i2]; 1.376 + if (c1 != c2) return (c1 > c2); 1.377 + i1++; i2++; 1.378 + /* 5 */ 1.379 + c1 = block[i1]; c2 = block[i2]; 1.380 + if (c1 != c2) return (c1 > c2); 1.381 + i1++; i2++; 1.382 + /* 6 */ 1.383 + c1 = block[i1]; c2 = block[i2]; 1.384 + if (c1 != c2) return (c1 > c2); 1.385 + i1++; i2++; 1.386 + /* 7 */ 1.387 + c1 = block[i1]; c2 = block[i2]; 1.388 + if (c1 != c2) return (c1 > c2); 1.389 + i1++; i2++; 1.390 + /* 8 */ 1.391 + c1 = block[i1]; c2 = block[i2]; 1.392 + if (c1 != c2) return (c1 > c2); 1.393 + i1++; i2++; 1.394 + /* 9 */ 1.395 + c1 = block[i1]; c2 = block[i2]; 1.396 + if (c1 != c2) return (c1 > c2); 1.397 + i1++; i2++; 1.398 + /* 10 */ 1.399 + c1 = block[i1]; c2 = block[i2]; 1.400 + if (c1 != c2) return (c1 > c2); 1.401 + i1++; i2++; 1.402 + /* 11 */ 1.403 + c1 = block[i1]; c2 = block[i2]; 1.404 + if (c1 != c2) return (c1 > c2); 1.405 + i1++; i2++; 1.406 + /* 12 */ 1.407 + c1 = block[i1]; c2 = block[i2]; 1.408 + if (c1 != c2) return (c1 > c2); 1.409 + i1++; i2++; 1.410 + 1.411 + k = nblock + 8; 1.412 + 1.413 + do { 1.414 + /* 1 */ 1.415 + c1 = block[i1]; c2 = block[i2]; 1.416 + if (c1 != c2) return (c1 > c2); 1.417 + s1 = quadrant[i1]; s2 = quadrant[i2]; 1.418 + if (s1 != s2) return (s1 > s2); 1.419 + i1++; i2++; 1.420 + /* 2 */ 1.421 + c1 = block[i1]; c2 = block[i2]; 1.422 + if (c1 != c2) return (c1 > c2); 1.423 + s1 = quadrant[i1]; s2 = quadrant[i2]; 1.424 + if (s1 != s2) return (s1 > s2); 1.425 + i1++; i2++; 1.426 + /* 3 */ 1.427 + c1 = block[i1]; c2 = block[i2]; 1.428 + if (c1 != c2) return (c1 > c2); 1.429 + s1 = quadrant[i1]; s2 = quadrant[i2]; 1.430 + if (s1 != s2) return (s1 > s2); 1.431 + i1++; i2++; 1.432 + /* 4 */ 1.433 + c1 = block[i1]; c2 = block[i2]; 1.434 + if (c1 != c2) return (c1 > c2); 1.435 + s1 = quadrant[i1]; s2 = quadrant[i2]; 1.436 + if (s1 != s2) return (s1 > s2); 1.437 + i1++; i2++; 1.438 + /* 5 */ 1.439 + c1 = block[i1]; c2 = block[i2]; 1.440 + if (c1 != c2) return (c1 > c2); 1.441 + s1 = quadrant[i1]; s2 = quadrant[i2]; 1.442 + if (s1 != s2) return (s1 > s2); 1.443 + i1++; i2++; 1.444 + /* 6 */ 1.445 + c1 = block[i1]; c2 = block[i2]; 1.446 + if (c1 != c2) return (c1 > c2); 1.447 + s1 = quadrant[i1]; s2 = quadrant[i2]; 1.448 + if (s1 != s2) return (s1 > s2); 1.449 + i1++; i2++; 1.450 + /* 7 */ 1.451 + c1 = block[i1]; c2 = block[i2]; 1.452 + if (c1 != c2) return (c1 > c2); 1.453 + s1 = quadrant[i1]; s2 = quadrant[i2]; 1.454 + if (s1 != s2) return (s1 > s2); 1.455 + i1++; i2++; 1.456 + /* 8 */ 1.457 + c1 = block[i1]; c2 = block[i2]; 1.458 + if (c1 != c2) return (c1 > c2); 1.459 + s1 = quadrant[i1]; s2 = quadrant[i2]; 1.460 + if (s1 != s2) return (s1 > s2); 1.461 + i1++; i2++; 1.462 + 1.463 + if (i1 >= nblock) i1 -= nblock; 1.464 + if (i2 >= nblock) i2 -= nblock; 1.465 + 1.466 + k -= 8; 1.467 + (*budget)--; 1.468 + } 1.469 + while (k >= 0); 1.470 + 1.471 + return False; 1.472 +} 1.473 + 1.474 + 1.475 +/*---------------------------------------------*/ 1.476 +/*-- 1.477 + Knuth's increments seem to work better 1.478 + than Incerpi-Sedgewick here. Possibly 1.479 + because the number of elems to sort is 1.480 + usually small, typically <= 20. 1.481 +--*/ 1.482 +static 1.483 +Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280, 1.484 + 9841, 29524, 88573, 265720, 1.485 + 797161, 2391484 }; 1.486 + 1.487 +static 1.488 +void mainSimpleSort ( UInt32* ptr, 1.489 + UChar* block, 1.490 + UInt16* quadrant, 1.491 + Int32 nblock, 1.492 + Int32 lo, 1.493 + Int32 hi, 1.494 + Int32 d, 1.495 + Int32* budget ) 1.496 +{ 1.497 + Int32 i, j, h, bigN, hp; 1.498 + UInt32 v; 1.499 + 1.500 + bigN = hi - lo + 1; 1.501 + if (bigN < 2) return; 1.502 + 1.503 + hp = 0; 1.504 + while (incs[hp] < bigN) hp++; 1.505 + hp--; 1.506 + 1.507 + for (; hp >= 0; hp--) { 1.508 + h = incs[hp]; 1.509 + 1.510 + i = lo + h; 1.511 + while (True) { 1.512 + 1.513 + /*-- copy 1 --*/ 1.514 + if (i > hi) break; 1.515 + v = ptr[i]; 1.516 + j = i; 1.517 + while ( mainGtU ( 1.518 + ptr[j-h]+d, v+d, block, quadrant, nblock, budget 1.519 + ) ) { 1.520 + ptr[j] = ptr[j-h]; 1.521 + j = j - h; 1.522 + if (j <= (lo + h - 1)) break; 1.523 + } 1.524 + ptr[j] = v; 1.525 + i++; 1.526 + 1.527 + /*-- copy 2 --*/ 1.528 + if (i > hi) break; 1.529 + v = ptr[i]; 1.530 + j = i; 1.531 + while ( mainGtU ( 1.532 + ptr[j-h]+d, v+d, block, quadrant, nblock, budget 1.533 + ) ) { 1.534 + ptr[j] = ptr[j-h]; 1.535 + j = j - h; 1.536 + if (j <= (lo + h - 1)) break; 1.537 + } 1.538 + ptr[j] = v; 1.539 + i++; 1.540 + 1.541 + /*-- copy 3 --*/ 1.542 + if (i > hi) break; 1.543 + v = ptr[i]; 1.544 + j = i; 1.545 + while ( mainGtU ( 1.546 + ptr[j-h]+d, v+d, block, quadrant, nblock, budget 1.547 + ) ) { 1.548 + ptr[j] = ptr[j-h]; 1.549 + j = j - h; 1.550 + if (j <= (lo + h - 1)) break; 1.551 + } 1.552 + ptr[j] = v; 1.553 + i++; 1.554 + 1.555 + if (*budget < 0) return; 1.556 + } 1.557 + } 1.558 +} 1.559 + 1.560 + 1.561 +/*---------------------------------------------*/ 1.562 +/*-- 1.563 + The following is an implementation of 1.564 + an elegant 3-way quicksort for strings, 1.565 + described in a paper "Fast Algorithms for 1.566 + Sorting and Searching Strings", by Robert 1.567 + Sedgewick and Jon L. Bentley. 1.568 +--*/ 1.569 + 1.570 +#define mswap(zz1, zz2) \ 1.571 + { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; } 1.572 + 1.573 +#define mvswap(zzp1, zzp2, zzn) \ 1.574 +{ \ 1.575 + Int32 yyp1 = (zzp1); \ 1.576 + Int32 yyp2 = (zzp2); \ 1.577 + Int32 yyn = (zzn); \ 1.578 + while (yyn > 0) { \ 1.579 + mswap(ptr[yyp1], ptr[yyp2]); \ 1.580 + yyp1++; yyp2++; yyn--; \ 1.581 + } \ 1.582 +} 1.583 + 1.584 +static 1.585 +__inline__ 1.586 +UChar mmed3 ( UChar a, UChar b, UChar c ) 1.587 +{ 1.588 + UChar t; 1.589 + if (a > b) { t = a; a = b; b = t; }; 1.590 + if (b > c) { 1.591 + b = c; 1.592 + if (a > b) b = a; 1.593 + } 1.594 + return b; 1.595 +} 1.596 + 1.597 +#define mmin(a,b) ((a) < (b)) ? (a) : (b) 1.598 + 1.599 +#define mpush(lz,hz,dz) { stackLo[sp] = lz; \ 1.600 + stackHi[sp] = hz; \ 1.601 + stackD [sp] = dz; \ 1.602 + sp++; } 1.603 + 1.604 +#define mpop(lz,hz,dz) { sp--; \ 1.605 + lz = stackLo[sp]; \ 1.606 + hz = stackHi[sp]; \ 1.607 + dz = stackD [sp]; } 1.608 + 1.609 + 1.610 +#define mnextsize(az) (nextHi[az]-nextLo[az]) 1.611 + 1.612 +#define mnextswap(az,bz) \ 1.613 + { Int32 tz; \ 1.614 + tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \ 1.615 + tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \ 1.616 + tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; } 1.617 + 1.618 + 1.619 +#define MAIN_QSORT_SMALL_THRESH 20 1.620 +#define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT) 1.621 +#define MAIN_QSORT_STACK_SIZE 100 1.622 + 1.623 +static 1.624 +void mainQSort3 ( UInt32* ptr, 1.625 + UChar* block, 1.626 + UInt16* quadrant, 1.627 + Int32 nblock, 1.628 + Int32 loSt, 1.629 + Int32 hiSt, 1.630 + Int32 dSt, 1.631 + Int32* budget ) 1.632 +{ 1.633 + Int32 unLo, unHi, ltLo, gtHi, n, m, med; 1.634 + Int32 sp, lo, hi, d; 1.635 + 1.636 + Int32 stackLo[MAIN_QSORT_STACK_SIZE]; 1.637 + Int32 stackHi[MAIN_QSORT_STACK_SIZE]; 1.638 + Int32 stackD [MAIN_QSORT_STACK_SIZE]; 1.639 + 1.640 + Int32 nextLo[3]; 1.641 + Int32 nextHi[3]; 1.642 + Int32 nextD [3]; 1.643 + 1.644 + sp = 0; 1.645 + mpush ( loSt, hiSt, dSt ); 1.646 + 1.647 + while (sp > 0) { 1.648 + 1.649 + AssertH ( sp < MAIN_QSORT_STACK_SIZE - 2, 1001 ); 1.650 + 1.651 + mpop ( lo, hi, d ); 1.652 + if (hi - lo < MAIN_QSORT_SMALL_THRESH || 1.653 + d > MAIN_QSORT_DEPTH_THRESH) { 1.654 + mainSimpleSort ( ptr, block, quadrant, nblock, lo, hi, d, budget ); 1.655 + if (*budget < 0) return; 1.656 + continue; 1.657 + } 1.658 + 1.659 + med = (Int32) 1.660 + mmed3 ( block[ptr[ lo ]+d], 1.661 + block[ptr[ hi ]+d], 1.662 + block[ptr[ (lo+hi)>>1 ]+d] ); 1.663 + 1.664 + unLo = ltLo = lo; 1.665 + unHi = gtHi = hi; 1.666 + 1.667 + while (True) { 1.668 + while (True) { 1.669 + if (unLo > unHi) break; 1.670 + n = ((Int32)block[ptr[unLo]+d]) - med; 1.671 + if (n == 0) { 1.672 + mswap(ptr[unLo], ptr[ltLo]); 1.673 + ltLo++; unLo++; continue; 1.674 + }; 1.675 + if (n > 0) break; 1.676 + unLo++; 1.677 + } 1.678 + while (True) { 1.679 + if (unLo > unHi) break; 1.680 + n = ((Int32)block[ptr[unHi]+d]) - med; 1.681 + if (n == 0) { 1.682 + mswap(ptr[unHi], ptr[gtHi]); 1.683 + gtHi--; unHi--; continue; 1.684 + }; 1.685 + if (n < 0) break; 1.686 + unHi--; 1.687 + } 1.688 + if (unLo > unHi) break; 1.689 + mswap(ptr[unLo], ptr[unHi]); unLo++; unHi--; 1.690 + } 1.691 + 1.692 + AssertD ( unHi == unLo-1, "mainQSort3(2)" ); 1.693 + 1.694 + if (gtHi < ltLo) { 1.695 + mpush(lo, hi, d+1 ); 1.696 + continue; 1.697 + } 1.698 + 1.699 + n = mmin(ltLo-lo, unLo-ltLo); mvswap(lo, unLo-n, n); 1.700 + m = mmin(hi-gtHi, gtHi-unHi); mvswap(unLo, hi-m+1, m); 1.701 + 1.702 + n = lo + unLo - ltLo - 1; 1.703 + m = hi - (gtHi - unHi) + 1; 1.704 + 1.705 + nextLo[0] = lo; nextHi[0] = n; nextD[0] = d; 1.706 + nextLo[1] = m; nextHi[1] = hi; nextD[1] = d; 1.707 + nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1; 1.708 + 1.709 + if (mnextsize(0) < mnextsize(1)) mnextswap(0,1); 1.710 + if (mnextsize(1) < mnextsize(2)) mnextswap(1,2); 1.711 + if (mnextsize(0) < mnextsize(1)) mnextswap(0,1); 1.712 + 1.713 + AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)" ); 1.714 + AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)" ); 1.715 + 1.716 + mpush (nextLo[0], nextHi[0], nextD[0]); 1.717 + mpush (nextLo[1], nextHi[1], nextD[1]); 1.718 + mpush (nextLo[2], nextHi[2], nextD[2]); 1.719 + } 1.720 +} 1.721 + 1.722 +#undef mswap 1.723 +#undef mvswap 1.724 +#undef mpush 1.725 +#undef mpop 1.726 +#undef mmin 1.727 +#undef mnextsize 1.728 +#undef mnextswap 1.729 +#undef MAIN_QSORT_SMALL_THRESH 1.730 +#undef MAIN_QSORT_DEPTH_THRESH 1.731 +#undef MAIN_QSORT_STACK_SIZE 1.732 + 1.733 + 1.734 +/*---------------------------------------------*/ 1.735 +/* Pre: 1.736 + nblock > N_OVERSHOOT 1.737 + block32 exists for [0 .. nblock-1 +N_OVERSHOOT] 1.738 + ((UChar*)block32) [0 .. nblock-1] holds block 1.739 + ptr exists for [0 .. nblock-1] 1.740 + 1.741 + Post: 1.742 + ((UChar*)block32) [0 .. nblock-1] holds block 1.743 + All other areas of block32 destroyed 1.744 + ftab [0 .. 65536 ] destroyed 1.745 + ptr [0 .. nblock-1] holds sorted order 1.746 + if (*budget < 0), sorting was abandoned 1.747 +*/ 1.748 + 1.749 +#define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8]) 1.750 +#define SETMASK (1 << 21) 1.751 +#define CLEARMASK (~(SETMASK)) 1.752 + 1.753 +static 1.754 +void mainSort ( UInt32* ptr, 1.755 + UChar* block, 1.756 + UInt16* quadrant, 1.757 + UInt32* ftab, 1.758 + Int32 nblock, 1.759 + Int32 verb, 1.760 + Int32* budget ) 1.761 +{ 1.762 + Int32 i, j, k, ss, sb; 1.763 + Int32 runningOrder[256]; 1.764 + Bool bigDone[256]; 1.765 + Int32 copyStart[256]; 1.766 + Int32 copyEnd [256]; 1.767 + UChar c1; 1.768 + Int32 numQSorted; 1.769 + UInt16 s; 1.770 + if (verb >= 4) VPrintf0 ( " main sort initialise ...\n" ); 1.771 + 1.772 + /*-- set up the 2-byte frequency table --*/ 1.773 + for (i = 65536; i >= 0; i--) ftab[i] = 0; 1.774 + 1.775 + j = block[0] << 8; 1.776 + i = nblock-1; 1.777 + for (; i >= 3; i -= 4) { 1.778 + quadrant[i] = 0; 1.779 + j = (j >> 8) | ( ((UInt16)block[i]) << 8); 1.780 + ftab[j]++; 1.781 + quadrant[i-1] = 0; 1.782 + j = (j >> 8) | ( ((UInt16)block[i-1]) << 8); 1.783 + ftab[j]++; 1.784 + quadrant[i-2] = 0; 1.785 + j = (j >> 8) | ( ((UInt16)block[i-2]) << 8); 1.786 + ftab[j]++; 1.787 + quadrant[i-3] = 0; 1.788 + j = (j >> 8) | ( ((UInt16)block[i-3]) << 8); 1.789 + ftab[j]++; 1.790 + } 1.791 + for (; i >= 0; i--) { 1.792 + quadrant[i] = 0; 1.793 + j = (j >> 8) | ( ((UInt16)block[i]) << 8); 1.794 + ftab[j]++; 1.795 + } 1.796 + 1.797 + /*-- (emphasises close relationship of block & quadrant) --*/ 1.798 + for (i = 0; i < BZ_N_OVERSHOOT; i++) { 1.799 + block [nblock+i] = block[i]; 1.800 + quadrant[nblock+i] = 0; 1.801 + } 1.802 + 1.803 + if (verb >= 4) VPrintf0 ( " bucket sorting ...\n" ); 1.804 + 1.805 + /*-- Complete the initial radix sort --*/ 1.806 + for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1]; 1.807 + 1.808 + s = block[0] << 8; 1.809 + i = nblock-1; 1.810 + for (; i >= 3; i -= 4) { 1.811 + s = (s >> 8) | (block[i] << 8); 1.812 + j = ftab[s] -1; 1.813 + ftab[s] = j; 1.814 + ptr[j] = i; 1.815 + s = (s >> 8) | (block[i-1] << 8); 1.816 + j = ftab[s] -1; 1.817 + ftab[s] = j; 1.818 + ptr[j] = i-1; 1.819 + s = (s >> 8) | (block[i-2] << 8); 1.820 + j = ftab[s] -1; 1.821 + ftab[s] = j; 1.822 + ptr[j] = i-2; 1.823 + s = (s >> 8) | (block[i-3] << 8); 1.824 + j = ftab[s] -1; 1.825 + ftab[s] = j; 1.826 + ptr[j] = i-3; 1.827 + } 1.828 + for (; i >= 0; i--) { 1.829 + s = (s >> 8) | (block[i] << 8); 1.830 + j = ftab[s] -1; 1.831 + ftab[s] = j; 1.832 + ptr[j] = i; 1.833 + } 1.834 + 1.835 + /*-- 1.836 + Now ftab contains the first loc of every small bucket. 1.837 + Calculate the running order, from smallest to largest 1.838 + big bucket. 1.839 + --*/ 1.840 + for (i = 0; i <= 255; i++) { 1.841 + bigDone [i] = False; 1.842 + runningOrder[i] = i; 1.843 + } 1.844 + 1.845 + { 1.846 + Int32 vv; 1.847 + Int32 h = 1; 1.848 + do h = 3 * h + 1; while (h <= 256); 1.849 + do { 1.850 + h = h / 3; 1.851 + for (i = h; i <= 255; i++) { 1.852 + vv = runningOrder[i]; 1.853 + j = i; 1.854 + while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) { 1.855 + runningOrder[j] = runningOrder[j-h]; 1.856 + j = j - h; 1.857 + if (j <= (h - 1)) goto zero; 1.858 + } 1.859 + zero: 1.860 + runningOrder[j] = vv; 1.861 + } 1.862 + } while (h != 1); 1.863 + } 1.864 + 1.865 + /*-- 1.866 + The main sorting loop. 1.867 + --*/ 1.868 + 1.869 + numQSorted = 0; 1.870 + 1.871 + for (i = 0; i <= 255; i++) { 1.872 + 1.873 + /*-- 1.874 + Process big buckets, starting with the least full. 1.875 + Basically this is a 3-step process in which we call 1.876 + mainQSort3 to sort the small buckets [ss, j], but 1.877 + also make a big effort to avoid the calls if we can. 1.878 + --*/ 1.879 + ss = runningOrder[i]; 1.880 + 1.881 + /*-- 1.882 + Step 1: 1.883 + Complete the big bucket [ss] by quicksorting 1.884 + any unsorted small buckets [ss, j], for j != ss. 1.885 + Hopefully previous pointer-scanning phases have already 1.886 + completed many of the small buckets [ss, j], so 1.887 + we don't have to sort them at all. 1.888 + --*/ 1.889 + for (j = 0; j <= 255; j++) { 1.890 + if (j != ss) { 1.891 + sb = (ss << 8) + j; 1.892 + if ( ! (ftab[sb] & SETMASK) ) { 1.893 + Int32 lo = ftab[sb] & CLEARMASK; 1.894 + Int32 hi = (ftab[sb+1] & CLEARMASK) - 1; 1.895 + if (hi > lo) { 1.896 + if (verb >= 4) 1.897 + VPrintf4 ( " qsort [0x%x, 0x%x] " 1.898 + "done %d this %d\n", 1.899 + ss, j, numQSorted, hi - lo + 1 ); 1.900 + mainQSort3 ( 1.901 + ptr, block, quadrant, nblock, 1.902 + lo, hi, BZ_N_RADIX, budget 1.903 + ); 1.904 + numQSorted += (hi - lo + 1); 1.905 + if (*budget < 0) return; 1.906 + } 1.907 + } 1.908 + ftab[sb] |= SETMASK; 1.909 + } 1.910 + } 1.911 + 1.912 + AssertH ( !bigDone[ss], 1006 ); 1.913 + 1.914 + /*-- 1.915 + Step 2: 1.916 + Now scan this big bucket [ss] so as to synthesise the 1.917 + sorted order for small buckets [t, ss] for all t, 1.918 + including, magically, the bucket [ss,ss] too. 1.919 + This will avoid doing Real Work in subsequent Step 1's. 1.920 + --*/ 1.921 + { 1.922 + for (j = 0; j <= 255; j++) { 1.923 + copyStart[j] = ftab[(j << 8) + ss] & CLEARMASK; 1.924 + copyEnd [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1; 1.925 + } 1.926 + for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) { 1.927 + k = ptr[j]-1; if (k < 0) k += nblock; 1.928 + c1 = block[k]; 1.929 + if (!bigDone[c1]) 1.930 + ptr[ copyStart[c1]++ ] = k; 1.931 + } 1.932 + for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) { 1.933 + k = ptr[j]-1; if (k < 0) k += nblock; 1.934 + c1 = block[k]; 1.935 + if (!bigDone[c1]) 1.936 + ptr[ copyEnd[c1]-- ] = k; 1.937 + } 1.938 + } 1.939 + 1.940 + AssertH ( (copyStart[ss]-1 == copyEnd[ss]) 1.941 + || 1.942 + /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1. 1.943 + Necessity for this case is demonstrated by compressing 1.944 + a sequence of approximately 48.5 million of character 1.945 + 251; 1.0.0/1.0.1 will then die here. */ 1.946 + (copyStart[ss] == 0 && copyEnd[ss] == nblock-1), 1.947 + 1007 ) 1.948 + 1.949 + for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK; 1.950 + 1.951 + /*-- 1.952 + Step 3: 1.953 + The [ss] big bucket is now done. Record this fact, 1.954 + and update the quadrant descriptors. Remember to 1.955 + update quadrants in the overshoot area too, if 1.956 + necessary. The "if (i < 255)" test merely skips 1.957 + this updating for the last bucket processed, since 1.958 + updating for the last bucket is pointless. 1.959 + 1.960 + The quadrant array provides a way to incrementally 1.961 + cache sort orderings, as they appear, so as to 1.962 + make subsequent comparisons in fullGtU() complete 1.963 + faster. For repetitive blocks this makes a big 1.964 + difference (but not big enough to be able to avoid 1.965 + the fallback sorting mechanism, exponential radix sort). 1.966 + 1.967 + The precise meaning is: at all times: 1.968 + 1.969 + for 0 <= i < nblock and 0 <= j <= nblock 1.970 + 1.971 + if block[i] != block[j], 1.972 + 1.973 + then the relative values of quadrant[i] and 1.974 + quadrant[j] are meaningless. 1.975 + 1.976 + else { 1.977 + if quadrant[i] < quadrant[j] 1.978 + then the string starting at i lexicographically 1.979 + precedes the string starting at j 1.980 + 1.981 + else if quadrant[i] > quadrant[j] 1.982 + then the string starting at j lexicographically 1.983 + precedes the string starting at i 1.984 + 1.985 + else 1.986 + the relative ordering of the strings starting 1.987 + at i and j has not yet been determined. 1.988 + } 1.989 + --*/ 1.990 + bigDone[ss] = True; 1.991 + 1.992 + if (i < 255) { 1.993 + Int32 bbStart = ftab[ss << 8] & CLEARMASK; 1.994 + Int32 bbSize = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart; 1.995 + Int32 shifts = 0; 1.996 + 1.997 + while ((bbSize >> shifts) > 65534) shifts++; 1.998 + 1.999 + for (j = bbSize-1; j >= 0; j--) { 1.1000 + Int32 a2update = ptr[bbStart + j]; 1.1001 + UInt16 qVal = (UInt16)(j >> shifts); 1.1002 + quadrant[a2update] = qVal; 1.1003 + if (a2update < BZ_N_OVERSHOOT) 1.1004 + quadrant[a2update + nblock] = qVal; 1.1005 + } 1.1006 + AssertH ( ((bbSize-1) >> shifts) <= 65535, 1002 ); 1.1007 + } 1.1008 + 1.1009 + } 1.1010 + 1.1011 + if (verb >= 4) 1.1012 + VPrintf3 ( " %d pointers, %d sorted, %d scanned\n", 1.1013 + nblock, numQSorted, nblock - numQSorted ); 1.1014 +} 1.1015 + 1.1016 +#undef BIGFREQ 1.1017 +#undef SETMASK 1.1018 +#undef CLEARMASK 1.1019 + 1.1020 + 1.1021 +/*---------------------------------------------*/ 1.1022 +/* Pre: 1.1023 + nblock > 0 1.1024 + arr2 exists for [0 .. nblock-1 +N_OVERSHOOT] 1.1025 + ((UChar*)arr2) [0 .. nblock-1] holds block 1.1026 + arr1 exists for [0 .. nblock-1] 1.1027 + 1.1028 + Post: 1.1029 + ((UChar*)arr2) [0 .. nblock-1] holds block 1.1030 + All other areas of block destroyed 1.1031 + ftab [ 0 .. 65536 ] destroyed 1.1032 + arr1 [0 .. nblock-1] holds sorted order 1.1033 +*/ 1.1034 +void BZ2_blockSort ( EState* s ) 1.1035 +{ 1.1036 + UInt32* ptr = s->ptr; 1.1037 + UChar* block = s->block; 1.1038 + UInt32* ftab = s->ftab; 1.1039 + Int32 nblock = s->nblock; 1.1040 + Int32 verb = s->verbosity; 1.1041 + Int32 wfact = s->workFactor; 1.1042 + UInt16* quadrant; 1.1043 + Int32 budget; 1.1044 + Int32 budgetInit; 1.1045 + Int32 i; 1.1046 + 1.1047 + if (nblock < 10000) { 1.1048 + fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb ); 1.1049 + } else { 1.1050 + /* Calculate the location for quadrant, remembering to get 1.1051 + the alignment right. Assumes that &(block[0]) is at least 1.1052 + 2-byte aligned -- this should be ok since block is really 1.1053 + the first section of arr2. 1.1054 + */ 1.1055 + i = nblock+BZ_N_OVERSHOOT; 1.1056 + if (i & 1) i++; 1.1057 + quadrant = (UInt16*)(&(block[i])); 1.1058 + 1.1059 + /* (wfact-1) / 3 puts the default-factor-30 1.1060 + transition point at very roughly the same place as 1.1061 + with v0.1 and v0.9.0. 1.1062 + Not that it particularly matters any more, since the 1.1063 + resulting compressed stream is now the same regardless 1.1064 + of whether or not we use the main sort or fallback sort. 1.1065 + */ 1.1066 + if (wfact < 1 ) wfact = 1; 1.1067 + if (wfact > 100) wfact = 100; 1.1068 + budgetInit = nblock * ((wfact-1) / 3); 1.1069 + budget = budgetInit; 1.1070 + 1.1071 + mainSort ( ptr, block, quadrant, ftab, nblock, verb, &budget ); 1.1072 + if (verb >= 3) 1.1073 + VPrintf3 ( " %d work, %d block, ratio %5.2f\n", 1.1074 + budgetInit - budget, 1.1075 + nblock, 1.1076 + (float)(budgetInit - budget) / 1.1077 + (float)(nblock==0 ? 1 : nblock) ); 1.1078 + if (budget < 0) { 1.1079 + if (verb >= 2) 1.1080 + VPrintf0 ( " too repetitive; using fallback" 1.1081 + " sorting algorithm\n" ); 1.1082 + fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb ); 1.1083 + } 1.1084 + } 1.1085 + 1.1086 + s->origPtr = -1; 1.1087 + for (i = 0; i < s->nblock; i++) 1.1088 + if (ptr[i] == 0) 1.1089 + { s->origPtr = i; break; }; 1.1090 + 1.1091 + AssertH( s->origPtr != -1, 1003 ); 1.1092 +} 1.1093 + 1.1094 + 1.1095 +/*-------------------------------------------------------------*/ 1.1096 +/*--- end blocksort.c ---*/ 1.1097 +/*-------------------------------------------------------------*/