michael@0: michael@0: /*-------------------------------------------------------------*/ michael@0: /*--- Block sorting machinery ---*/ michael@0: /*--- blocksort.c ---*/ michael@0: /*-------------------------------------------------------------*/ michael@0: michael@0: /* ------------------------------------------------------------------ michael@0: This file is part of bzip2/libbzip2, a program and library for michael@0: lossless, block-sorting data compression. michael@0: michael@0: bzip2/libbzip2 version 1.0.4 of 20 December 2006 michael@0: Copyright (C) 1996-2006 Julian Seward michael@0: michael@0: Please read the WARNING, DISCLAIMER and PATENTS sections in the michael@0: README file. michael@0: michael@0: This program is released under the terms of the license contained michael@0: in the file LICENSE. michael@0: ------------------------------------------------------------------ */ michael@0: michael@0: michael@0: #include "bzlib_private.h" michael@0: michael@0: /*---------------------------------------------*/ michael@0: /*--- Fallback O(N log(N)^2) sorting ---*/ michael@0: /*--- algorithm, for repetitive blocks ---*/ michael@0: /*---------------------------------------------*/ michael@0: michael@0: /*---------------------------------------------*/ michael@0: static michael@0: __inline__ michael@0: void fallbackSimpleSort ( UInt32* fmap, michael@0: UInt32* eclass, michael@0: Int32 lo, michael@0: Int32 hi ) michael@0: { michael@0: Int32 i, j, tmp; michael@0: UInt32 ec_tmp; michael@0: michael@0: if (lo == hi) return; michael@0: michael@0: if (hi - lo > 3) { michael@0: for ( i = hi-4; i >= lo; i-- ) { michael@0: tmp = fmap[i]; michael@0: ec_tmp = eclass[tmp]; michael@0: for ( j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4 ) michael@0: fmap[j-4] = fmap[j]; michael@0: fmap[j-4] = tmp; michael@0: } michael@0: } michael@0: michael@0: for ( i = hi-1; i >= lo; i-- ) { michael@0: tmp = fmap[i]; michael@0: ec_tmp = eclass[tmp]; michael@0: for ( j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++ ) michael@0: fmap[j-1] = fmap[j]; michael@0: fmap[j-1] = tmp; michael@0: } michael@0: } michael@0: michael@0: michael@0: /*---------------------------------------------*/ michael@0: #define fswap(zz1, zz2) \ michael@0: { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; } michael@0: michael@0: #define fvswap(zzp1, zzp2, zzn) \ michael@0: { \ michael@0: Int32 yyp1 = (zzp1); \ michael@0: Int32 yyp2 = (zzp2); \ michael@0: Int32 yyn = (zzn); \ michael@0: while (yyn > 0) { \ michael@0: fswap(fmap[yyp1], fmap[yyp2]); \ michael@0: yyp1++; yyp2++; yyn--; \ michael@0: } \ michael@0: } michael@0: michael@0: michael@0: #define fmin(a,b) ((a) < (b)) ? (a) : (b) michael@0: michael@0: #define fpush(lz,hz) { stackLo[sp] = lz; \ michael@0: stackHi[sp] = hz; \ michael@0: sp++; } michael@0: michael@0: #define fpop(lz,hz) { sp--; \ michael@0: lz = stackLo[sp]; \ michael@0: hz = stackHi[sp]; } michael@0: michael@0: #define FALLBACK_QSORT_SMALL_THRESH 10 michael@0: #define FALLBACK_QSORT_STACK_SIZE 100 michael@0: michael@0: michael@0: static michael@0: void fallbackQSort3 ( UInt32* fmap, michael@0: UInt32* eclass, michael@0: Int32 loSt, michael@0: Int32 hiSt ) michael@0: { michael@0: Int32 unLo, unHi, ltLo, gtHi, n, m; michael@0: Int32 sp, lo, hi; michael@0: UInt32 med, r, r3; michael@0: Int32 stackLo[FALLBACK_QSORT_STACK_SIZE]; michael@0: Int32 stackHi[FALLBACK_QSORT_STACK_SIZE]; michael@0: michael@0: r = 0; michael@0: michael@0: sp = 0; michael@0: fpush ( loSt, hiSt ); michael@0: michael@0: while (sp > 0) { michael@0: michael@0: AssertH ( sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004 ); michael@0: michael@0: fpop ( lo, hi ); michael@0: if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) { michael@0: fallbackSimpleSort ( fmap, eclass, lo, hi ); michael@0: continue; michael@0: } michael@0: michael@0: /* Random partitioning. Median of 3 sometimes fails to michael@0: avoid bad cases. Median of 9 seems to help but michael@0: looks rather expensive. This too seems to work but michael@0: is cheaper. Guidance for the magic constants michael@0: 7621 and 32768 is taken from Sedgewick's algorithms michael@0: book, chapter 35. michael@0: */ michael@0: r = ((r * 7621) + 1) % 32768; michael@0: r3 = r % 3; michael@0: if (r3 == 0) med = eclass[fmap[lo]]; else michael@0: if (r3 == 1) med = eclass[fmap[(lo+hi)>>1]]; else michael@0: med = eclass[fmap[hi]]; michael@0: michael@0: unLo = ltLo = lo; michael@0: unHi = gtHi = hi; michael@0: michael@0: while (1) { michael@0: while (1) { michael@0: if (unLo > unHi) break; michael@0: n = (Int32)eclass[fmap[unLo]] - (Int32)med; michael@0: if (n == 0) { michael@0: fswap(fmap[unLo], fmap[ltLo]); michael@0: ltLo++; unLo++; michael@0: continue; michael@0: }; michael@0: if (n > 0) break; michael@0: unLo++; michael@0: } michael@0: while (1) { michael@0: if (unLo > unHi) break; michael@0: n = (Int32)eclass[fmap[unHi]] - (Int32)med; michael@0: if (n == 0) { michael@0: fswap(fmap[unHi], fmap[gtHi]); michael@0: gtHi--; unHi--; michael@0: continue; michael@0: }; michael@0: if (n < 0) break; michael@0: unHi--; michael@0: } michael@0: if (unLo > unHi) break; michael@0: fswap(fmap[unLo], fmap[unHi]); unLo++; unHi--; michael@0: } michael@0: michael@0: AssertD ( unHi == unLo-1, "fallbackQSort3(2)" ); michael@0: michael@0: if (gtHi < ltLo) continue; michael@0: michael@0: n = fmin(ltLo-lo, unLo-ltLo); fvswap(lo, unLo-n, n); michael@0: m = fmin(hi-gtHi, gtHi-unHi); fvswap(unLo, hi-m+1, m); michael@0: michael@0: n = lo + unLo - ltLo - 1; michael@0: m = hi - (gtHi - unHi) + 1; michael@0: michael@0: if (n - lo > hi - m) { michael@0: fpush ( lo, n ); michael@0: fpush ( m, hi ); michael@0: } else { michael@0: fpush ( m, hi ); michael@0: fpush ( lo, n ); michael@0: } michael@0: } michael@0: } michael@0: michael@0: #undef fmin michael@0: #undef fpush michael@0: #undef fpop michael@0: #undef fswap michael@0: #undef fvswap michael@0: #undef FALLBACK_QSORT_SMALL_THRESH michael@0: #undef FALLBACK_QSORT_STACK_SIZE michael@0: michael@0: michael@0: /*---------------------------------------------*/ michael@0: /* Pre: michael@0: nblock > 0 michael@0: eclass exists for [0 .. nblock-1] michael@0: ((UChar*)eclass) [0 .. nblock-1] holds block michael@0: ptr exists for [0 .. nblock-1] michael@0: michael@0: Post: michael@0: ((UChar*)eclass) [0 .. nblock-1] holds block michael@0: All other areas of eclass destroyed michael@0: fmap [0 .. nblock-1] holds sorted order michael@0: bhtab [ 0 .. 2+(nblock/32) ] destroyed michael@0: */ michael@0: michael@0: #define SET_BH(zz) bhtab[(zz) >> 5] |= (1 << ((zz) & 31)) michael@0: #define CLEAR_BH(zz) bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31)) michael@0: #define ISSET_BH(zz) (bhtab[(zz) >> 5] & (1 << ((zz) & 31))) michael@0: #define WORD_BH(zz) bhtab[(zz) >> 5] michael@0: #define UNALIGNED_BH(zz) ((zz) & 0x01f) michael@0: michael@0: static michael@0: void fallbackSort ( UInt32* fmap, michael@0: UInt32* eclass, michael@0: UInt32* bhtab, michael@0: Int32 nblock, michael@0: Int32 verb ) michael@0: { michael@0: Int32 ftab[257]; michael@0: Int32 ftabCopy[256]; michael@0: Int32 H, i, j, k, l, r, cc, cc1; michael@0: Int32 nNotDone; michael@0: Int32 nBhtab; michael@0: UChar* eclass8 = (UChar*)eclass; michael@0: michael@0: /*-- michael@0: Initial 1-char radix sort to generate michael@0: initial fmap and initial BH bits. michael@0: --*/ michael@0: if (verb >= 4) michael@0: VPrintf0 ( " bucket sorting ...\n" ); michael@0: for (i = 0; i < 257; i++) ftab[i] = 0; michael@0: for (i = 0; i < nblock; i++) ftab[eclass8[i]]++; michael@0: for (i = 0; i < 256; i++) ftabCopy[i] = ftab[i]; michael@0: for (i = 1; i < 257; i++) ftab[i] += ftab[i-1]; michael@0: michael@0: for (i = 0; i < nblock; i++) { michael@0: j = eclass8[i]; michael@0: k = ftab[j] - 1; michael@0: ftab[j] = k; michael@0: fmap[k] = i; michael@0: } michael@0: michael@0: nBhtab = 2 + (nblock / 32); michael@0: for (i = 0; i < nBhtab; i++) bhtab[i] = 0; michael@0: for (i = 0; i < 256; i++) SET_BH(ftab[i]); michael@0: michael@0: /*-- michael@0: Inductively refine the buckets. Kind-of an michael@0: "exponential radix sort" (!), inspired by the michael@0: Manber-Myers suffix array construction algorithm. michael@0: --*/ michael@0: michael@0: /*-- set sentinel bits for block-end detection --*/ michael@0: for (i = 0; i < 32; i++) { michael@0: SET_BH(nblock + 2*i); michael@0: CLEAR_BH(nblock + 2*i + 1); michael@0: } michael@0: michael@0: /*-- the log(N) loop --*/ michael@0: H = 1; michael@0: while (1) { michael@0: michael@0: if (verb >= 4) michael@0: VPrintf1 ( " depth %6d has ", H ); michael@0: michael@0: j = 0; michael@0: for (i = 0; i < nblock; i++) { michael@0: if (ISSET_BH(i)) j = i; michael@0: k = fmap[i] - H; if (k < 0) k += nblock; michael@0: eclass[k] = j; michael@0: } michael@0: michael@0: nNotDone = 0; michael@0: r = -1; michael@0: while (1) { michael@0: michael@0: /*-- find the next non-singleton bucket --*/ michael@0: k = r + 1; michael@0: while (ISSET_BH(k) && UNALIGNED_BH(k)) k++; michael@0: if (ISSET_BH(k)) { michael@0: while (WORD_BH(k) == 0xffffffff) k += 32; michael@0: while (ISSET_BH(k)) k++; michael@0: } michael@0: l = k - 1; michael@0: if (l >= nblock) break; michael@0: while (!ISSET_BH(k) && UNALIGNED_BH(k)) k++; michael@0: if (!ISSET_BH(k)) { michael@0: while (WORD_BH(k) == 0x00000000) k += 32; michael@0: while (!ISSET_BH(k)) k++; michael@0: } michael@0: r = k - 1; michael@0: if (r >= nblock) break; michael@0: michael@0: /*-- now [l, r] bracket current bucket --*/ michael@0: if (r > l) { michael@0: nNotDone += (r - l + 1); michael@0: fallbackQSort3 ( fmap, eclass, l, r ); michael@0: michael@0: /*-- scan bucket and generate header bits-- */ michael@0: cc = -1; michael@0: for (i = l; i <= r; i++) { michael@0: cc1 = eclass[fmap[i]]; michael@0: if (cc != cc1) { SET_BH(i); cc = cc1; }; michael@0: } michael@0: } michael@0: } michael@0: michael@0: if (verb >= 4) michael@0: VPrintf1 ( "%6d unresolved strings\n", nNotDone ); michael@0: michael@0: H *= 2; michael@0: if (H > nblock || nNotDone == 0) break; michael@0: } michael@0: michael@0: /*-- michael@0: Reconstruct the original block in michael@0: eclass8 [0 .. nblock-1], since the michael@0: previous phase destroyed it. michael@0: --*/ michael@0: if (verb >= 4) michael@0: VPrintf0 ( " reconstructing block ...\n" ); michael@0: j = 0; michael@0: for (i = 0; i < nblock; i++) { michael@0: while (ftabCopy[j] == 0) j++; michael@0: ftabCopy[j]--; michael@0: eclass8[fmap[i]] = (UChar)j; michael@0: } michael@0: AssertH ( j < 256, 1005 ); michael@0: } michael@0: michael@0: #undef SET_BH michael@0: #undef CLEAR_BH michael@0: #undef ISSET_BH michael@0: #undef WORD_BH michael@0: #undef UNALIGNED_BH michael@0: michael@0: michael@0: /*---------------------------------------------*/ michael@0: /*--- The main, O(N^2 log(N)) sorting ---*/ michael@0: /*--- algorithm. Faster for "normal" ---*/ michael@0: /*--- non-repetitive blocks. ---*/ michael@0: /*---------------------------------------------*/ michael@0: michael@0: /*---------------------------------------------*/ michael@0: static michael@0: __inline__ michael@0: Bool mainGtU ( UInt32 i1, michael@0: UInt32 i2, michael@0: UChar* block, michael@0: UInt16* quadrant, michael@0: UInt32 nblock, michael@0: Int32* budget ) michael@0: { michael@0: Int32 k; michael@0: UChar c1, c2; michael@0: UInt16 s1, s2; michael@0: michael@0: AssertD ( i1 != i2, "mainGtU" ); michael@0: /* 1 */ michael@0: c1 = block[i1]; c2 = block[i2]; michael@0: if (c1 != c2) return (c1 > c2); michael@0: i1++; i2++; michael@0: /* 2 */ michael@0: c1 = block[i1]; c2 = block[i2]; michael@0: if (c1 != c2) return (c1 > c2); michael@0: i1++; i2++; michael@0: /* 3 */ michael@0: c1 = block[i1]; c2 = block[i2]; michael@0: if (c1 != c2) return (c1 > c2); michael@0: i1++; i2++; michael@0: /* 4 */ michael@0: c1 = block[i1]; c2 = block[i2]; michael@0: if (c1 != c2) return (c1 > c2); michael@0: i1++; i2++; michael@0: /* 5 */ michael@0: c1 = block[i1]; c2 = block[i2]; michael@0: if (c1 != c2) return (c1 > c2); michael@0: i1++; i2++; michael@0: /* 6 */ michael@0: c1 = block[i1]; c2 = block[i2]; michael@0: if (c1 != c2) return (c1 > c2); michael@0: i1++; i2++; michael@0: /* 7 */ michael@0: c1 = block[i1]; c2 = block[i2]; michael@0: if (c1 != c2) return (c1 > c2); michael@0: i1++; i2++; michael@0: /* 8 */ michael@0: c1 = block[i1]; c2 = block[i2]; michael@0: if (c1 != c2) return (c1 > c2); michael@0: i1++; i2++; michael@0: /* 9 */ michael@0: c1 = block[i1]; c2 = block[i2]; michael@0: if (c1 != c2) return (c1 > c2); michael@0: i1++; i2++; michael@0: /* 10 */ michael@0: c1 = block[i1]; c2 = block[i2]; michael@0: if (c1 != c2) return (c1 > c2); michael@0: i1++; i2++; michael@0: /* 11 */ michael@0: c1 = block[i1]; c2 = block[i2]; michael@0: if (c1 != c2) return (c1 > c2); michael@0: i1++; i2++; michael@0: /* 12 */ michael@0: c1 = block[i1]; c2 = block[i2]; michael@0: if (c1 != c2) return (c1 > c2); michael@0: i1++; i2++; michael@0: michael@0: k = nblock + 8; michael@0: michael@0: do { michael@0: /* 1 */ michael@0: c1 = block[i1]; c2 = block[i2]; michael@0: if (c1 != c2) return (c1 > c2); michael@0: s1 = quadrant[i1]; s2 = quadrant[i2]; michael@0: if (s1 != s2) return (s1 > s2); michael@0: i1++; i2++; michael@0: /* 2 */ michael@0: c1 = block[i1]; c2 = block[i2]; michael@0: if (c1 != c2) return (c1 > c2); michael@0: s1 = quadrant[i1]; s2 = quadrant[i2]; michael@0: if (s1 != s2) return (s1 > s2); michael@0: i1++; i2++; michael@0: /* 3 */ michael@0: c1 = block[i1]; c2 = block[i2]; michael@0: if (c1 != c2) return (c1 > c2); michael@0: s1 = quadrant[i1]; s2 = quadrant[i2]; michael@0: if (s1 != s2) return (s1 > s2); michael@0: i1++; i2++; michael@0: /* 4 */ michael@0: c1 = block[i1]; c2 = block[i2]; michael@0: if (c1 != c2) return (c1 > c2); michael@0: s1 = quadrant[i1]; s2 = quadrant[i2]; michael@0: if (s1 != s2) return (s1 > s2); michael@0: i1++; i2++; michael@0: /* 5 */ michael@0: c1 = block[i1]; c2 = block[i2]; michael@0: if (c1 != c2) return (c1 > c2); michael@0: s1 = quadrant[i1]; s2 = quadrant[i2]; michael@0: if (s1 != s2) return (s1 > s2); michael@0: i1++; i2++; michael@0: /* 6 */ michael@0: c1 = block[i1]; c2 = block[i2]; michael@0: if (c1 != c2) return (c1 > c2); michael@0: s1 = quadrant[i1]; s2 = quadrant[i2]; michael@0: if (s1 != s2) return (s1 > s2); michael@0: i1++; i2++; michael@0: /* 7 */ michael@0: c1 = block[i1]; c2 = block[i2]; michael@0: if (c1 != c2) return (c1 > c2); michael@0: s1 = quadrant[i1]; s2 = quadrant[i2]; michael@0: if (s1 != s2) return (s1 > s2); michael@0: i1++; i2++; michael@0: /* 8 */ michael@0: c1 = block[i1]; c2 = block[i2]; michael@0: if (c1 != c2) return (c1 > c2); michael@0: s1 = quadrant[i1]; s2 = quadrant[i2]; michael@0: if (s1 != s2) return (s1 > s2); michael@0: i1++; i2++; michael@0: michael@0: if (i1 >= nblock) i1 -= nblock; michael@0: if (i2 >= nblock) i2 -= nblock; michael@0: michael@0: k -= 8; michael@0: (*budget)--; michael@0: } michael@0: while (k >= 0); michael@0: michael@0: return False; michael@0: } michael@0: michael@0: michael@0: /*---------------------------------------------*/ michael@0: /*-- michael@0: Knuth's increments seem to work better michael@0: than Incerpi-Sedgewick here. Possibly michael@0: because the number of elems to sort is michael@0: usually small, typically <= 20. michael@0: --*/ michael@0: static michael@0: Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280, michael@0: 9841, 29524, 88573, 265720, michael@0: 797161, 2391484 }; michael@0: michael@0: static michael@0: void mainSimpleSort ( UInt32* ptr, michael@0: UChar* block, michael@0: UInt16* quadrant, michael@0: Int32 nblock, michael@0: Int32 lo, michael@0: Int32 hi, michael@0: Int32 d, michael@0: Int32* budget ) michael@0: { michael@0: Int32 i, j, h, bigN, hp; michael@0: UInt32 v; michael@0: michael@0: bigN = hi - lo + 1; michael@0: if (bigN < 2) return; michael@0: michael@0: hp = 0; michael@0: while (incs[hp] < bigN) hp++; michael@0: hp--; michael@0: michael@0: for (; hp >= 0; hp--) { michael@0: h = incs[hp]; michael@0: michael@0: i = lo + h; michael@0: while (True) { michael@0: michael@0: /*-- copy 1 --*/ michael@0: if (i > hi) break; michael@0: v = ptr[i]; michael@0: j = i; michael@0: while ( mainGtU ( michael@0: ptr[j-h]+d, v+d, block, quadrant, nblock, budget michael@0: ) ) { michael@0: ptr[j] = ptr[j-h]; michael@0: j = j - h; michael@0: if (j <= (lo + h - 1)) break; michael@0: } michael@0: ptr[j] = v; michael@0: i++; michael@0: michael@0: /*-- copy 2 --*/ michael@0: if (i > hi) break; michael@0: v = ptr[i]; michael@0: j = i; michael@0: while ( mainGtU ( michael@0: ptr[j-h]+d, v+d, block, quadrant, nblock, budget michael@0: ) ) { michael@0: ptr[j] = ptr[j-h]; michael@0: j = j - h; michael@0: if (j <= (lo + h - 1)) break; michael@0: } michael@0: ptr[j] = v; michael@0: i++; michael@0: michael@0: /*-- copy 3 --*/ michael@0: if (i > hi) break; michael@0: v = ptr[i]; michael@0: j = i; michael@0: while ( mainGtU ( michael@0: ptr[j-h]+d, v+d, block, quadrant, nblock, budget michael@0: ) ) { michael@0: ptr[j] = ptr[j-h]; michael@0: j = j - h; michael@0: if (j <= (lo + h - 1)) break; michael@0: } michael@0: ptr[j] = v; michael@0: i++; michael@0: michael@0: if (*budget < 0) return; michael@0: } michael@0: } michael@0: } michael@0: michael@0: michael@0: /*---------------------------------------------*/ michael@0: /*-- michael@0: The following is an implementation of michael@0: an elegant 3-way quicksort for strings, michael@0: described in a paper "Fast Algorithms for michael@0: Sorting and Searching Strings", by Robert michael@0: Sedgewick and Jon L. Bentley. michael@0: --*/ michael@0: michael@0: #define mswap(zz1, zz2) \ michael@0: { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; } michael@0: michael@0: #define mvswap(zzp1, zzp2, zzn) \ michael@0: { \ michael@0: Int32 yyp1 = (zzp1); \ michael@0: Int32 yyp2 = (zzp2); \ michael@0: Int32 yyn = (zzn); \ michael@0: while (yyn > 0) { \ michael@0: mswap(ptr[yyp1], ptr[yyp2]); \ michael@0: yyp1++; yyp2++; yyn--; \ michael@0: } \ michael@0: } michael@0: michael@0: static michael@0: __inline__ michael@0: UChar mmed3 ( UChar a, UChar b, UChar c ) michael@0: { michael@0: UChar t; michael@0: if (a > b) { t = a; a = b; b = t; }; michael@0: if (b > c) { michael@0: b = c; michael@0: if (a > b) b = a; michael@0: } michael@0: return b; michael@0: } michael@0: michael@0: #define mmin(a,b) ((a) < (b)) ? (a) : (b) michael@0: michael@0: #define mpush(lz,hz,dz) { stackLo[sp] = lz; \ michael@0: stackHi[sp] = hz; \ michael@0: stackD [sp] = dz; \ michael@0: sp++; } michael@0: michael@0: #define mpop(lz,hz,dz) { sp--; \ michael@0: lz = stackLo[sp]; \ michael@0: hz = stackHi[sp]; \ michael@0: dz = stackD [sp]; } michael@0: michael@0: michael@0: #define mnextsize(az) (nextHi[az]-nextLo[az]) michael@0: michael@0: #define mnextswap(az,bz) \ michael@0: { Int32 tz; \ michael@0: tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \ michael@0: tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \ michael@0: tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; } michael@0: michael@0: michael@0: #define MAIN_QSORT_SMALL_THRESH 20 michael@0: #define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT) michael@0: #define MAIN_QSORT_STACK_SIZE 100 michael@0: michael@0: static michael@0: void mainQSort3 ( UInt32* ptr, michael@0: UChar* block, michael@0: UInt16* quadrant, michael@0: Int32 nblock, michael@0: Int32 loSt, michael@0: Int32 hiSt, michael@0: Int32 dSt, michael@0: Int32* budget ) michael@0: { michael@0: Int32 unLo, unHi, ltLo, gtHi, n, m, med; michael@0: Int32 sp, lo, hi, d; michael@0: michael@0: Int32 stackLo[MAIN_QSORT_STACK_SIZE]; michael@0: Int32 stackHi[MAIN_QSORT_STACK_SIZE]; michael@0: Int32 stackD [MAIN_QSORT_STACK_SIZE]; michael@0: michael@0: Int32 nextLo[3]; michael@0: Int32 nextHi[3]; michael@0: Int32 nextD [3]; michael@0: michael@0: sp = 0; michael@0: mpush ( loSt, hiSt, dSt ); michael@0: michael@0: while (sp > 0) { michael@0: michael@0: AssertH ( sp < MAIN_QSORT_STACK_SIZE - 2, 1001 ); michael@0: michael@0: mpop ( lo, hi, d ); michael@0: if (hi - lo < MAIN_QSORT_SMALL_THRESH || michael@0: d > MAIN_QSORT_DEPTH_THRESH) { michael@0: mainSimpleSort ( ptr, block, quadrant, nblock, lo, hi, d, budget ); michael@0: if (*budget < 0) return; michael@0: continue; michael@0: } michael@0: michael@0: med = (Int32) michael@0: mmed3 ( block[ptr[ lo ]+d], michael@0: block[ptr[ hi ]+d], michael@0: block[ptr[ (lo+hi)>>1 ]+d] ); michael@0: michael@0: unLo = ltLo = lo; michael@0: unHi = gtHi = hi; michael@0: michael@0: while (True) { michael@0: while (True) { michael@0: if (unLo > unHi) break; michael@0: n = ((Int32)block[ptr[unLo]+d]) - med; michael@0: if (n == 0) { michael@0: mswap(ptr[unLo], ptr[ltLo]); michael@0: ltLo++; unLo++; continue; michael@0: }; michael@0: if (n > 0) break; michael@0: unLo++; michael@0: } michael@0: while (True) { michael@0: if (unLo > unHi) break; michael@0: n = ((Int32)block[ptr[unHi]+d]) - med; michael@0: if (n == 0) { michael@0: mswap(ptr[unHi], ptr[gtHi]); michael@0: gtHi--; unHi--; continue; michael@0: }; michael@0: if (n < 0) break; michael@0: unHi--; michael@0: } michael@0: if (unLo > unHi) break; michael@0: mswap(ptr[unLo], ptr[unHi]); unLo++; unHi--; michael@0: } michael@0: michael@0: AssertD ( unHi == unLo-1, "mainQSort3(2)" ); michael@0: michael@0: if (gtHi < ltLo) { michael@0: mpush(lo, hi, d+1 ); michael@0: continue; michael@0: } michael@0: michael@0: n = mmin(ltLo-lo, unLo-ltLo); mvswap(lo, unLo-n, n); michael@0: m = mmin(hi-gtHi, gtHi-unHi); mvswap(unLo, hi-m+1, m); michael@0: michael@0: n = lo + unLo - ltLo - 1; michael@0: m = hi - (gtHi - unHi) + 1; michael@0: michael@0: nextLo[0] = lo; nextHi[0] = n; nextD[0] = d; michael@0: nextLo[1] = m; nextHi[1] = hi; nextD[1] = d; michael@0: nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1; michael@0: michael@0: if (mnextsize(0) < mnextsize(1)) mnextswap(0,1); michael@0: if (mnextsize(1) < mnextsize(2)) mnextswap(1,2); michael@0: if (mnextsize(0) < mnextsize(1)) mnextswap(0,1); michael@0: michael@0: AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)" ); michael@0: AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)" ); michael@0: michael@0: mpush (nextLo[0], nextHi[0], nextD[0]); michael@0: mpush (nextLo[1], nextHi[1], nextD[1]); michael@0: mpush (nextLo[2], nextHi[2], nextD[2]); michael@0: } michael@0: } michael@0: michael@0: #undef mswap michael@0: #undef mvswap michael@0: #undef mpush michael@0: #undef mpop michael@0: #undef mmin michael@0: #undef mnextsize michael@0: #undef mnextswap michael@0: #undef MAIN_QSORT_SMALL_THRESH michael@0: #undef MAIN_QSORT_DEPTH_THRESH michael@0: #undef MAIN_QSORT_STACK_SIZE michael@0: michael@0: michael@0: /*---------------------------------------------*/ michael@0: /* Pre: michael@0: nblock > N_OVERSHOOT michael@0: block32 exists for [0 .. nblock-1 +N_OVERSHOOT] michael@0: ((UChar*)block32) [0 .. nblock-1] holds block michael@0: ptr exists for [0 .. nblock-1] michael@0: michael@0: Post: michael@0: ((UChar*)block32) [0 .. nblock-1] holds block michael@0: All other areas of block32 destroyed michael@0: ftab [0 .. 65536 ] destroyed michael@0: ptr [0 .. nblock-1] holds sorted order michael@0: if (*budget < 0), sorting was abandoned michael@0: */ michael@0: michael@0: #define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8]) michael@0: #define SETMASK (1 << 21) michael@0: #define CLEARMASK (~(SETMASK)) michael@0: michael@0: static michael@0: void mainSort ( UInt32* ptr, michael@0: UChar* block, michael@0: UInt16* quadrant, michael@0: UInt32* ftab, michael@0: Int32 nblock, michael@0: Int32 verb, michael@0: Int32* budget ) michael@0: { michael@0: Int32 i, j, k, ss, sb; michael@0: Int32 runningOrder[256]; michael@0: Bool bigDone[256]; michael@0: Int32 copyStart[256]; michael@0: Int32 copyEnd [256]; michael@0: UChar c1; michael@0: Int32 numQSorted; michael@0: UInt16 s; michael@0: if (verb >= 4) VPrintf0 ( " main sort initialise ...\n" ); michael@0: michael@0: /*-- set up the 2-byte frequency table --*/ michael@0: for (i = 65536; i >= 0; i--) ftab[i] = 0; michael@0: michael@0: j = block[0] << 8; michael@0: i = nblock-1; michael@0: for (; i >= 3; i -= 4) { michael@0: quadrant[i] = 0; michael@0: j = (j >> 8) | ( ((UInt16)block[i]) << 8); michael@0: ftab[j]++; michael@0: quadrant[i-1] = 0; michael@0: j = (j >> 8) | ( ((UInt16)block[i-1]) << 8); michael@0: ftab[j]++; michael@0: quadrant[i-2] = 0; michael@0: j = (j >> 8) | ( ((UInt16)block[i-2]) << 8); michael@0: ftab[j]++; michael@0: quadrant[i-3] = 0; michael@0: j = (j >> 8) | ( ((UInt16)block[i-3]) << 8); michael@0: ftab[j]++; michael@0: } michael@0: for (; i >= 0; i--) { michael@0: quadrant[i] = 0; michael@0: j = (j >> 8) | ( ((UInt16)block[i]) << 8); michael@0: ftab[j]++; michael@0: } michael@0: michael@0: /*-- (emphasises close relationship of block & quadrant) --*/ michael@0: for (i = 0; i < BZ_N_OVERSHOOT; i++) { michael@0: block [nblock+i] = block[i]; michael@0: quadrant[nblock+i] = 0; michael@0: } michael@0: michael@0: if (verb >= 4) VPrintf0 ( " bucket sorting ...\n" ); michael@0: michael@0: /*-- Complete the initial radix sort --*/ michael@0: for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1]; michael@0: michael@0: s = block[0] << 8; michael@0: i = nblock-1; michael@0: for (; i >= 3; i -= 4) { michael@0: s = (s >> 8) | (block[i] << 8); michael@0: j = ftab[s] -1; michael@0: ftab[s] = j; michael@0: ptr[j] = i; michael@0: s = (s >> 8) | (block[i-1] << 8); michael@0: j = ftab[s] -1; michael@0: ftab[s] = j; michael@0: ptr[j] = i-1; michael@0: s = (s >> 8) | (block[i-2] << 8); michael@0: j = ftab[s] -1; michael@0: ftab[s] = j; michael@0: ptr[j] = i-2; michael@0: s = (s >> 8) | (block[i-3] << 8); michael@0: j = ftab[s] -1; michael@0: ftab[s] = j; michael@0: ptr[j] = i-3; michael@0: } michael@0: for (; i >= 0; i--) { michael@0: s = (s >> 8) | (block[i] << 8); michael@0: j = ftab[s] -1; michael@0: ftab[s] = j; michael@0: ptr[j] = i; michael@0: } michael@0: michael@0: /*-- michael@0: Now ftab contains the first loc of every small bucket. michael@0: Calculate the running order, from smallest to largest michael@0: big bucket. michael@0: --*/ michael@0: for (i = 0; i <= 255; i++) { michael@0: bigDone [i] = False; michael@0: runningOrder[i] = i; michael@0: } michael@0: michael@0: { michael@0: Int32 vv; michael@0: Int32 h = 1; michael@0: do h = 3 * h + 1; while (h <= 256); michael@0: do { michael@0: h = h / 3; michael@0: for (i = h; i <= 255; i++) { michael@0: vv = runningOrder[i]; michael@0: j = i; michael@0: while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) { michael@0: runningOrder[j] = runningOrder[j-h]; michael@0: j = j - h; michael@0: if (j <= (h - 1)) goto zero; michael@0: } michael@0: zero: michael@0: runningOrder[j] = vv; michael@0: } michael@0: } while (h != 1); michael@0: } michael@0: michael@0: /*-- michael@0: The main sorting loop. michael@0: --*/ michael@0: michael@0: numQSorted = 0; michael@0: michael@0: for (i = 0; i <= 255; i++) { michael@0: michael@0: /*-- michael@0: Process big buckets, starting with the least full. michael@0: Basically this is a 3-step process in which we call michael@0: mainQSort3 to sort the small buckets [ss, j], but michael@0: also make a big effort to avoid the calls if we can. michael@0: --*/ michael@0: ss = runningOrder[i]; michael@0: michael@0: /*-- michael@0: Step 1: michael@0: Complete the big bucket [ss] by quicksorting michael@0: any unsorted small buckets [ss, j], for j != ss. michael@0: Hopefully previous pointer-scanning phases have already michael@0: completed many of the small buckets [ss, j], so michael@0: we don't have to sort them at all. michael@0: --*/ michael@0: for (j = 0; j <= 255; j++) { michael@0: if (j != ss) { michael@0: sb = (ss << 8) + j; michael@0: if ( ! (ftab[sb] & SETMASK) ) { michael@0: Int32 lo = ftab[sb] & CLEARMASK; michael@0: Int32 hi = (ftab[sb+1] & CLEARMASK) - 1; michael@0: if (hi > lo) { michael@0: if (verb >= 4) michael@0: VPrintf4 ( " qsort [0x%x, 0x%x] " michael@0: "done %d this %d\n", michael@0: ss, j, numQSorted, hi - lo + 1 ); michael@0: mainQSort3 ( michael@0: ptr, block, quadrant, nblock, michael@0: lo, hi, BZ_N_RADIX, budget michael@0: ); michael@0: numQSorted += (hi - lo + 1); michael@0: if (*budget < 0) return; michael@0: } michael@0: } michael@0: ftab[sb] |= SETMASK; michael@0: } michael@0: } michael@0: michael@0: AssertH ( !bigDone[ss], 1006 ); michael@0: michael@0: /*-- michael@0: Step 2: michael@0: Now scan this big bucket [ss] so as to synthesise the michael@0: sorted order for small buckets [t, ss] for all t, michael@0: including, magically, the bucket [ss,ss] too. michael@0: This will avoid doing Real Work in subsequent Step 1's. michael@0: --*/ michael@0: { michael@0: for (j = 0; j <= 255; j++) { michael@0: copyStart[j] = ftab[(j << 8) + ss] & CLEARMASK; michael@0: copyEnd [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1; michael@0: } michael@0: for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) { michael@0: k = ptr[j]-1; if (k < 0) k += nblock; michael@0: c1 = block[k]; michael@0: if (!bigDone[c1]) michael@0: ptr[ copyStart[c1]++ ] = k; michael@0: } michael@0: for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) { michael@0: k = ptr[j]-1; if (k < 0) k += nblock; michael@0: c1 = block[k]; michael@0: if (!bigDone[c1]) michael@0: ptr[ copyEnd[c1]-- ] = k; michael@0: } michael@0: } michael@0: michael@0: AssertH ( (copyStart[ss]-1 == copyEnd[ss]) michael@0: || michael@0: /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1. michael@0: Necessity for this case is demonstrated by compressing michael@0: a sequence of approximately 48.5 million of character michael@0: 251; 1.0.0/1.0.1 will then die here. */ michael@0: (copyStart[ss] == 0 && copyEnd[ss] == nblock-1), michael@0: 1007 ) michael@0: michael@0: for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK; michael@0: michael@0: /*-- michael@0: Step 3: michael@0: The [ss] big bucket is now done. Record this fact, michael@0: and update the quadrant descriptors. Remember to michael@0: update quadrants in the overshoot area too, if michael@0: necessary. The "if (i < 255)" test merely skips michael@0: this updating for the last bucket processed, since michael@0: updating for the last bucket is pointless. michael@0: michael@0: The quadrant array provides a way to incrementally michael@0: cache sort orderings, as they appear, so as to michael@0: make subsequent comparisons in fullGtU() complete michael@0: faster. For repetitive blocks this makes a big michael@0: difference (but not big enough to be able to avoid michael@0: the fallback sorting mechanism, exponential radix sort). michael@0: michael@0: The precise meaning is: at all times: michael@0: michael@0: for 0 <= i < nblock and 0 <= j <= nblock michael@0: michael@0: if block[i] != block[j], michael@0: michael@0: then the relative values of quadrant[i] and michael@0: quadrant[j] are meaningless. michael@0: michael@0: else { michael@0: if quadrant[i] < quadrant[j] michael@0: then the string starting at i lexicographically michael@0: precedes the string starting at j michael@0: michael@0: else if quadrant[i] > quadrant[j] michael@0: then the string starting at j lexicographically michael@0: precedes the string starting at i michael@0: michael@0: else michael@0: the relative ordering of the strings starting michael@0: at i and j has not yet been determined. michael@0: } michael@0: --*/ michael@0: bigDone[ss] = True; michael@0: michael@0: if (i < 255) { michael@0: Int32 bbStart = ftab[ss << 8] & CLEARMASK; michael@0: Int32 bbSize = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart; michael@0: Int32 shifts = 0; michael@0: michael@0: while ((bbSize >> shifts) > 65534) shifts++; michael@0: michael@0: for (j = bbSize-1; j >= 0; j--) { michael@0: Int32 a2update = ptr[bbStart + j]; michael@0: UInt16 qVal = (UInt16)(j >> shifts); michael@0: quadrant[a2update] = qVal; michael@0: if (a2update < BZ_N_OVERSHOOT) michael@0: quadrant[a2update + nblock] = qVal; michael@0: } michael@0: AssertH ( ((bbSize-1) >> shifts) <= 65535, 1002 ); michael@0: } michael@0: michael@0: } michael@0: michael@0: if (verb >= 4) michael@0: VPrintf3 ( " %d pointers, %d sorted, %d scanned\n", michael@0: nblock, numQSorted, nblock - numQSorted ); michael@0: } michael@0: michael@0: #undef BIGFREQ michael@0: #undef SETMASK michael@0: #undef CLEARMASK michael@0: michael@0: michael@0: /*---------------------------------------------*/ michael@0: /* Pre: michael@0: nblock > 0 michael@0: arr2 exists for [0 .. nblock-1 +N_OVERSHOOT] michael@0: ((UChar*)arr2) [0 .. nblock-1] holds block michael@0: arr1 exists for [0 .. nblock-1] michael@0: michael@0: Post: michael@0: ((UChar*)arr2) [0 .. nblock-1] holds block michael@0: All other areas of block destroyed michael@0: ftab [ 0 .. 65536 ] destroyed michael@0: arr1 [0 .. nblock-1] holds sorted order michael@0: */ michael@0: void BZ2_blockSort ( EState* s ) michael@0: { michael@0: UInt32* ptr = s->ptr; michael@0: UChar* block = s->block; michael@0: UInt32* ftab = s->ftab; michael@0: Int32 nblock = s->nblock; michael@0: Int32 verb = s->verbosity; michael@0: Int32 wfact = s->workFactor; michael@0: UInt16* quadrant; michael@0: Int32 budget; michael@0: Int32 budgetInit; michael@0: Int32 i; michael@0: michael@0: if (nblock < 10000) { michael@0: fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb ); michael@0: } else { michael@0: /* Calculate the location for quadrant, remembering to get michael@0: the alignment right. Assumes that &(block[0]) is at least michael@0: 2-byte aligned -- this should be ok since block is really michael@0: the first section of arr2. michael@0: */ michael@0: i = nblock+BZ_N_OVERSHOOT; michael@0: if (i & 1) i++; michael@0: quadrant = (UInt16*)(&(block[i])); michael@0: michael@0: /* (wfact-1) / 3 puts the default-factor-30 michael@0: transition point at very roughly the same place as michael@0: with v0.1 and v0.9.0. michael@0: Not that it particularly matters any more, since the michael@0: resulting compressed stream is now the same regardless michael@0: of whether or not we use the main sort or fallback sort. michael@0: */ michael@0: if (wfact < 1 ) wfact = 1; michael@0: if (wfact > 100) wfact = 100; michael@0: budgetInit = nblock * ((wfact-1) / 3); michael@0: budget = budgetInit; michael@0: michael@0: mainSort ( ptr, block, quadrant, ftab, nblock, verb, &budget ); michael@0: if (verb >= 3) michael@0: VPrintf3 ( " %d work, %d block, ratio %5.2f\n", michael@0: budgetInit - budget, michael@0: nblock, michael@0: (float)(budgetInit - budget) / michael@0: (float)(nblock==0 ? 1 : nblock) ); michael@0: if (budget < 0) { michael@0: if (verb >= 2) michael@0: VPrintf0 ( " too repetitive; using fallback" michael@0: " sorting algorithm\n" ); michael@0: fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb ); michael@0: } michael@0: } michael@0: michael@0: s->origPtr = -1; michael@0: for (i = 0; i < s->nblock; i++) michael@0: if (ptr[i] == 0) michael@0: { s->origPtr = i; break; }; michael@0: michael@0: AssertH( s->origPtr != -1, 1003 ); michael@0: } michael@0: michael@0: michael@0: /*-------------------------------------------------------------*/ michael@0: /*--- end blocksort.c ---*/ michael@0: /*-------------------------------------------------------------*/