blocksort.c revision 215041
11541Srgrimes 21541Srgrimes/*-------------------------------------------------------------*/ 31541Srgrimes/*--- Block sorting machinery ---*/ 41541Srgrimes/*--- blocksort.c ---*/ 51541Srgrimes/*-------------------------------------------------------------*/ 61541Srgrimes 71541Srgrimes/* ------------------------------------------------------------------ 81541Srgrimes This file is part of bzip2/libbzip2, a program and library for 91541Srgrimes lossless, block-sorting data compression. 101541Srgrimes 111541Srgrimes bzip2/libbzip2 version 1.0.6 of 6 September 2010 121541Srgrimes Copyright (C) 1996-2010 Julian Seward <jseward@bzip.org> 131541Srgrimes 141541Srgrimes Please read the WARNING, DISCLAIMER and PATENTS sections in the 151541Srgrimes README file. 161541Srgrimes 171541Srgrimes This program is released under the terms of the license contained 181541Srgrimes in the file LICENSE. 191541Srgrimes ------------------------------------------------------------------ */ 201541Srgrimes 211541Srgrimes 221541Srgrimes#include "bzlib_private.h" 231541Srgrimes 241541Srgrimes/*---------------------------------------------*/ 251541Srgrimes/*--- Fallback O(N log(N)^2) sorting ---*/ 261541Srgrimes/*--- algorithm, for repetitive blocks ---*/ 271541Srgrimes/*---------------------------------------------*/ 281541Srgrimes 2985052Sru/*---------------------------------------------*/ 3050477Speterstatic 311541Srgrimes__inline__ 321541Srgrimesvoid fallbackSimpleSort ( UInt32* fmap, 332168Spaul UInt32* eclass, 342168Spaul Int32 lo, 352168Spaul Int32 hi ) 361541Srgrimes{ 371541Srgrimes Int32 i, j, tmp; 388876Srgrimes UInt32 ec_tmp; 391541Srgrimes 401541Srgrimes if (lo == hi) return; 411541Srgrimes 421541Srgrimes if (hi - lo > 3) { 431541Srgrimes for ( i = hi-4; i >= lo; i-- ) { 441541Srgrimes tmp = fmap[i]; 451541Srgrimes ec_tmp = eclass[tmp]; 461541Srgrimes for ( j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4 ) 471541Srgrimes fmap[j-4] = fmap[j]; 481541Srgrimes fmap[j-4] = tmp; 491541Srgrimes } 501541Srgrimes } 511541Srgrimes 521541Srgrimes for ( i = hi-1; i >= lo; i-- ) { 531541Srgrimes tmp = fmap[i]; 541541Srgrimes ec_tmp = eclass[tmp]; 551541Srgrimes for ( j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++ ) 561541Srgrimes fmap[j-1] = fmap[j]; 57122922Sandre fmap[j-1] = tmp; 58122922Sandre } 59122922Sandre} 60122922Sandre 61122922Sandre 62122922Sandre/*---------------------------------------------*/ 631541Srgrimes#define fswap(zz1, zz2) \ 641541Srgrimes { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; } 651541Srgrimes 661541Srgrimes#define fvswap(zzp1, zzp2, zzn) \ 671541Srgrimes{ \ 6813765Smpp Int32 yyp1 = (zzp1); \ 6913765Smpp Int32 yyp2 = (zzp2); \ 701541Srgrimes Int32 yyn = (zzn); \ 711541Srgrimes while (yyn > 0) { \ 721541Srgrimes fswap(fmap[yyp1], fmap[yyp2]); \ 731541Srgrimes yyp1++; yyp2++; yyn--; \ 745791Swollman } \ 751541Srgrimes} 761541Srgrimes 771541Srgrimes 781541Srgrimes#define fmin(a,b) ((a) < (b)) ? (a) : (b) 791541Srgrimes 801541Srgrimes#define fpush(lz,hz) { stackLo[sp] = lz; \ 811541Srgrimes stackHi[sp] = hz; \ 821541Srgrimes sp++; } 831541Srgrimes 841541Srgrimes#define fpop(lz,hz) { sp--; \ 851541Srgrimes lz = stackLo[sp]; \ 865833Sbde hz = stackHi[sp]; } 875833Sbde 885833Sbde#define FALLBACK_QSORT_SMALL_THRESH 10 895833Sbde#define FALLBACK_QSORT_STACK_SIZE 100 905833Sbde 911541Srgrimes 921541Srgrimesstatic 931541Srgrimesvoid fallbackQSort3 ( UInt32* fmap, 941541Srgrimes UInt32* eclass, 951541Srgrimes Int32 loSt, 961541Srgrimes Int32 hiSt ) 971541Srgrimes{ 981541Srgrimes Int32 unLo, unHi, ltLo, gtHi, n, m; 991541Srgrimes Int32 sp, lo, hi; 1001541Srgrimes UInt32 med, r, r3; 1011541Srgrimes Int32 stackLo[FALLBACK_QSORT_STACK_SIZE]; 1021541Srgrimes Int32 stackHi[FALLBACK_QSORT_STACK_SIZE]; 103128454Sluigi 104128454Sluigi r = 0; 105128454Sluigi 106128454Sluigi sp = 0; 107128454Sluigi fpush ( loSt, hiSt ); 108132780Skan 109132780Skan while (sp > 0) { 1101541Srgrimes 1117197Swollman AssertH ( sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004 ); 1121541Srgrimes 113122922Sandre fpop ( lo, hi ); 114127828Sluigi if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) { 115127828Sluigi fallbackSimpleSort ( fmap, eclass, lo, hi ); 1161541Srgrimes continue; 1171541Srgrimes } 1181541Srgrimes 1195791Swollman /* Random partitioning. Median of 3 sometimes fails to 120120727Ssam avoid bad cases. Median of 9 seems to help but 121120727Ssam looks rather expensive. This too seems to work but 122120727Ssam is cheaper. Guidance for the magic constants 123120727Ssam 7621 and 32768 is taken from Sedgewick's algorithms 1241541Srgrimes book, chapter 35. 1251541Srgrimes */ 1261541Srgrimes r = ((r * 7621) + 1) % 32768; 1271541Srgrimes r3 = r % 3; 1281541Srgrimes if (r3 == 0) med = eclass[fmap[lo]]; else 1291541Srgrimes if (r3 == 1) med = eclass[fmap[(lo+hi)>>1]]; else 1301541Srgrimes med = eclass[fmap[hi]]; 1311541Srgrimes 1321541Srgrimes unLo = ltLo = lo; 1331541Srgrimes unHi = gtHi = hi; 1341541Srgrimes 1351541Srgrimes while (1) { 1361541Srgrimes while (1) { 1371541Srgrimes if (unLo > unHi) break; 1381541Srgrimes n = (Int32)eclass[fmap[unLo]] - (Int32)med; 1391541Srgrimes if (n == 0) { 1404104Swollman fswap(fmap[unLo], fmap[ltLo]); 1414104Swollman ltLo++; unLo++; 1421541Srgrimes continue; 1431541Srgrimes }; 1441541Srgrimes if (n > 0) break; 1451541Srgrimes unLo++; 1461541Srgrimes } 1471541Srgrimes while (1) { 1481541Srgrimes if (unLo > unHi) break; 14986764Sjlemon n = (Int32)eclass[fmap[unHi]] - (Int32)med; 1501541Srgrimes if (n == 0) { 1511541Srgrimes fswap(fmap[unHi], fmap[gtHi]); 15217835Sjulian gtHi--; unHi--; 1531541Srgrimes continue; 1541541Srgrimes }; 1551541Srgrimes if (n < 0) break; 1561541Srgrimes unHi--; 1571541Srgrimes } 158122921Sandre if (unLo > unHi) break; 159122921Sandre fswap(fmap[unLo], fmap[unHi]); unLo++; unHi--; 160122921Sandre } 161122921Sandre 162122921Sandre AssertD ( unHi == unLo-1, "fallbackQSort3(2)" ); 1635099Swollman 1645099Swollman if (gtHi < ltLo) continue; 16518839Swollman 1666245Swollman n = fmin(ltLo-lo, unLo-ltLo); fvswap(lo, unLo-n, n); 16715652Swollman m = fmin(hi-gtHi, gtHi-unHi); fvswap(unLo, hi-m+1, m); 16815652Swollman 16915652Swollman n = lo + unLo - ltLo - 1; 17015652Swollman m = hi - (gtHi - unHi) + 1; 1711541Srgrimes 1721541Srgrimes if (n - lo > hi - m) { 1731541Srgrimes fpush ( lo, n ); 1741541Srgrimes fpush ( m, hi ); 1751541Srgrimes } else { 1761541Srgrimes fpush ( m, hi ); 1771541Srgrimes fpush ( lo, n ); 1781541Srgrimes } 1791541Srgrimes } 1801541Srgrimes} 1811541Srgrimes 1821541Srgrimes#undef fmin 1831541Srgrimes#undef fpush 1841541Srgrimes#undef fpop 1851541Srgrimes#undef fswap 1861541Srgrimes#undef fvswap 1871541Srgrimes#undef FALLBACK_QSORT_SMALL_THRESH 1881541Srgrimes#undef FALLBACK_QSORT_STACK_SIZE 1891541Srgrimes 1901541Srgrimes 1911541Srgrimes/*---------------------------------------------*/ 1921541Srgrimes/* Pre: 1931541Srgrimes nblock > 0 1941541Srgrimes eclass exists for [0 .. nblock-1] 1951541Srgrimes ((UChar*)eclass) [0 .. nblock-1] holds block 1961541Srgrimes ptr exists for [0 .. nblock-1] 1971541Srgrimes 1981541Srgrimes Post: 1991541Srgrimes ((UChar*)eclass) [0 .. nblock-1] holds block 2005791Swollman All other areas of eclass destroyed 2011541Srgrimes fmap [0 .. nblock-1] holds sorted order 20251252Sru bhtab [ 0 .. 2+(nblock/32) ] destroyed 20351252Sru*/ 20451252Sru 2051541Srgrimes#define SET_BH(zz) bhtab[(zz) >> 5] |= (1 << ((zz) & 31)) 2061541Srgrimes#define CLEAR_BH(zz) bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31)) 2071541Srgrimes#define ISSET_BH(zz) (bhtab[(zz) >> 5] & (1 << ((zz) & 31))) 2081541Srgrimes#define WORD_BH(zz) bhtab[(zz) >> 5] 2091541Srgrimes#define UNALIGNED_BH(zz) ((zz) & 0x01f) 2101541Srgrimes 2111541Srgrimesstatic 2121541Srgrimesvoid fallbackSort ( UInt32* fmap, 2131541Srgrimes UInt32* eclass, 2141541Srgrimes UInt32* bhtab, 2151541Srgrimes Int32 nblock, 2161541Srgrimes Int32 verb ) 2171541Srgrimes{ 2181541Srgrimes Int32 ftab[257]; 21921666Swollman Int32 ftabCopy[256]; 22021666Swollman Int32 H, i, j, k, l, r, cc, cc1; 22189498Sru Int32 nNotDone; 2221541Srgrimes Int32 nBhtab; 22351252Sru UChar* eclass8 = (UChar*)eclass; 22451252Sru 22551252Sru /*-- 2261541Srgrimes Initial 1-char radix sort to generate 2271541Srgrimes initial fmap and initial BH bits. 22851252Sru --*/ 2291541Srgrimes if (verb >= 4) 2301541Srgrimes VPrintf0 ( " bucket sorting ...\n" ); 2311541Srgrimes for (i = 0; i < 257; i++) ftab[i] = 0; 2321541Srgrimes for (i = 0; i < nblock; i++) ftab[eclass8[i]]++; 2331541Srgrimes for (i = 0; i < 256; i++) ftabCopy[i] = ftab[i]; 2341541Srgrimes for (i = 1; i < 257; i++) ftab[i] += ftab[i-1]; 2351541Srgrimes 23651252Sru for (i = 0; i < nblock; i++) { 2371541Srgrimes j = eclass8[i]; 2381541Srgrimes k = ftab[j] - 1; 2391541Srgrimes ftab[j] = k; 2401541Srgrimes fmap[k] = i; 2411541Srgrimes } 2421541Srgrimes 2431541Srgrimes nBhtab = 2 + (nblock / 32); 2441541Srgrimes for (i = 0; i < nBhtab; i++) bhtab[i] = 0; 2451541Srgrimes for (i = 0; i < 256; i++) SET_BH(ftab[i]); 2461541Srgrimes 2471541Srgrimes /*-- 2481541Srgrimes Inductively refine the buckets. Kind-of an 2491541Srgrimes "exponential radix sort" (!), inspired by the 2501541Srgrimes Manber-Myers suffix array construction algorithm. 2511541Srgrimes --*/ 2521541Srgrimes 2531541Srgrimes /*-- set sentinel bits for block-end detection --*/ 2541541Srgrimes for (i = 0; i < 32; i++) { 2551541Srgrimes SET_BH(nblock + 2*i); 2561541Srgrimes CLEAR_BH(nblock + 2*i + 1); 2571541Srgrimes } 2581541Srgrimes 2591541Srgrimes /*-- the log(N) loop --*/ 2601541Srgrimes H = 1; 2611541Srgrimes while (1) { 2621541Srgrimes 26385074Sru if (verb >= 4) 26485074Sru VPrintf1 ( " depth %6d has ", H ); 26585074Sru 2661541Srgrimes j = 0; 2671541Srgrimes for (i = 0; i < nblock; i++) { 268128185Sluigi if (ISSET_BH(i)) j = i; 269128185Sluigi k = fmap[i] - H; if (k < 0) k += nblock; 270128185Sluigi eclass[k] = j; 271128185Sluigi } 272128185Sluigi 273128185Sluigi nNotDone = 0; 274128185Sluigi r = -1; 275128185Sluigi while (1) { 276128185Sluigi 277128185Sluigi /*-- find the next non-singleton bucket --*/ 278128185Sluigi k = r + 1; 279128185Sluigi while (ISSET_BH(k) && UNALIGNED_BH(k)) k++; 28055205Speter if (ISSET_BH(k)) { 281117752Shsu while (WORD_BH(k) == 0xffffffff) k += 32; 282120727Ssam while (ISSET_BH(k)) k++; 283120727Ssam } 284120727Ssam l = k - 1; 285120727Ssam if (l >= nblock) break; 286120727Ssam while (!ISSET_BH(k) && UNALIGNED_BH(k)) k++; 287120727Ssam if (!ISSET_BH(k)) { 288117752Shsu while (WORD_BH(k) == 0x00000000) k += 32; 289122334Ssam while (!ISSET_BH(k)) k++; 290122334Ssam } 291122334Ssam r = k - 1; 292122334Ssam if (r >= nblock) break; 293122334Ssam 294122334Ssam /*-- now [l, r] bracket current bucket --*/ 295122334Ssam if (r > l) { 296122334Ssam nNotDone += (r - l + 1); 297122334Ssam fallbackQSort3 ( fmap, eclass, l, r ); 298122334Ssam 299122334Ssam /*-- scan bucket and generate header bits-- */ 300122334Ssam cc = -1; 301122334Ssam for (i = l; i <= r; i++) { 302122334Ssam cc1 = eclass[fmap[i]]; 303122334Ssam if (cc != cc1) { SET_BH(i); cc = cc1; }; 304122334Ssam } 305122334Ssam } 306122334Ssam } 307122334Ssam 308122334Ssam if (verb >= 4) 309122334Ssam VPrintf1 ( "%6d unresolved strings\n", nNotDone ); 310122334Ssam 31146568Speter H *= 2; 312122334Ssam if (H > nblock || nNotDone == 0) break; 313122334Ssam } 314122334Ssam 315120727Ssam /*-- 3161541Srgrimes Reconstruct the original block in 3179759Sbde eclass8 [0 .. nblock-1], since the 3181541Srgrimes previous phase destroyed it. 31921666Swollman --*/ 32021666Swollman if (verb >= 4) 32192725Salfred VPrintf0 ( " reconstructing block ...\n" ); 32292725Salfred j = 0; 32392725Salfred for (i = 0; i < nblock; i++) { 32492725Salfred while (ftabCopy[j] == 0) j++; 32592725Salfred ftabCopy[j]--; 32692725Salfred eclass8[fmap[i]] = (UChar)j; 32792725Salfred } 328128621Sluigi AssertH ( j < 256, 1005 ); 329128621Sluigi} 330128621Sluigi 331128621Sluigi#undef SET_BH 332128621Sluigi#undef CLEAR_BH 333128621Sluigi#undef ISSET_BH 334128621Sluigi#undef WORD_BH 335128621Sluigi#undef UNALIGNED_BH 336128621Sluigi 337128621Sluigi 338128621Sluigi/*---------------------------------------------*/ 339128621Sluigi/*--- The main, O(N^2 log(N)) sorting ---*/ 340128621Sluigi/*--- algorithm. Faster for "normal" ---*/ 341128621Sluigi/*--- non-repetitive blocks. ---*/ 342128621Sluigi/*---------------------------------------------*/ 343120727Ssam 344121770Ssam/*---------------------------------------------*/ 34592725Salfredstatic 34692725Salfred__inline__ 34792725SalfredBool mainGtU ( UInt32 i1, 34892725Salfred UInt32 i2, 349120727Ssam UChar* block, 35092725Salfred UInt16* quadrant, 35192725Salfred UInt32 nblock, 35292725Salfred Int32* budget ) 353111767Smdodd{ 3541541Srgrimes Int32 k; 3552168Spaul UChar c1, c2; 3562168Spaul UInt16 s1, s2; 357 358 AssertD ( i1 != i2, "mainGtU" ); 359 /* 1 */ 360 c1 = block[i1]; c2 = block[i2]; 361 if (c1 != c2) return (c1 > c2); 362 i1++; i2++; 363 /* 2 */ 364 c1 = block[i1]; c2 = block[i2]; 365 if (c1 != c2) return (c1 > c2); 366 i1++; i2++; 367 /* 3 */ 368 c1 = block[i1]; c2 = block[i2]; 369 if (c1 != c2) return (c1 > c2); 370 i1++; i2++; 371 /* 4 */ 372 c1 = block[i1]; c2 = block[i2]; 373 if (c1 != c2) return (c1 > c2); 374 i1++; i2++; 375 /* 5 */ 376 c1 = block[i1]; c2 = block[i2]; 377 if (c1 != c2) return (c1 > c2); 378 i1++; i2++; 379 /* 6 */ 380 c1 = block[i1]; c2 = block[i2]; 381 if (c1 != c2) return (c1 > c2); 382 i1++; i2++; 383 /* 7 */ 384 c1 = block[i1]; c2 = block[i2]; 385 if (c1 != c2) return (c1 > c2); 386 i1++; i2++; 387 /* 8 */ 388 c1 = block[i1]; c2 = block[i2]; 389 if (c1 != c2) return (c1 > c2); 390 i1++; i2++; 391 /* 9 */ 392 c1 = block[i1]; c2 = block[i2]; 393 if (c1 != c2) return (c1 > c2); 394 i1++; i2++; 395 /* 10 */ 396 c1 = block[i1]; c2 = block[i2]; 397 if (c1 != c2) return (c1 > c2); 398 i1++; i2++; 399 /* 11 */ 400 c1 = block[i1]; c2 = block[i2]; 401 if (c1 != c2) return (c1 > c2); 402 i1++; i2++; 403 /* 12 */ 404 c1 = block[i1]; c2 = block[i2]; 405 if (c1 != c2) return (c1 > c2); 406 i1++; i2++; 407 408 k = nblock + 8; 409 410 do { 411 /* 1 */ 412 c1 = block[i1]; c2 = block[i2]; 413 if (c1 != c2) return (c1 > c2); 414 s1 = quadrant[i1]; s2 = quadrant[i2]; 415 if (s1 != s2) return (s1 > s2); 416 i1++; i2++; 417 /* 2 */ 418 c1 = block[i1]; c2 = block[i2]; 419 if (c1 != c2) return (c1 > c2); 420 s1 = quadrant[i1]; s2 = quadrant[i2]; 421 if (s1 != s2) return (s1 > s2); 422 i1++; i2++; 423 /* 3 */ 424 c1 = block[i1]; c2 = block[i2]; 425 if (c1 != c2) return (c1 > c2); 426 s1 = quadrant[i1]; s2 = quadrant[i2]; 427 if (s1 != s2) return (s1 > s2); 428 i1++; i2++; 429 /* 4 */ 430 c1 = block[i1]; c2 = block[i2]; 431 if (c1 != c2) return (c1 > c2); 432 s1 = quadrant[i1]; s2 = quadrant[i2]; 433 if (s1 != s2) return (s1 > s2); 434 i1++; i2++; 435 /* 5 */ 436 c1 = block[i1]; c2 = block[i2]; 437 if (c1 != c2) return (c1 > c2); 438 s1 = quadrant[i1]; s2 = quadrant[i2]; 439 if (s1 != s2) return (s1 > s2); 440 i1++; i2++; 441 /* 6 */ 442 c1 = block[i1]; c2 = block[i2]; 443 if (c1 != c2) return (c1 > c2); 444 s1 = quadrant[i1]; s2 = quadrant[i2]; 445 if (s1 != s2) return (s1 > s2); 446 i1++; i2++; 447 /* 7 */ 448 c1 = block[i1]; c2 = block[i2]; 449 if (c1 != c2) return (c1 > c2); 450 s1 = quadrant[i1]; s2 = quadrant[i2]; 451 if (s1 != s2) return (s1 > s2); 452 i1++; i2++; 453 /* 8 */ 454 c1 = block[i1]; c2 = block[i2]; 455 if (c1 != c2) return (c1 > c2); 456 s1 = quadrant[i1]; s2 = quadrant[i2]; 457 if (s1 != s2) return (s1 > s2); 458 i1++; i2++; 459 460 if (i1 >= nblock) i1 -= nblock; 461 if (i2 >= nblock) i2 -= nblock; 462 463 k -= 8; 464 (*budget)--; 465 } 466 while (k >= 0); 467 468 return False; 469} 470 471 472/*---------------------------------------------*/ 473/*-- 474 Knuth's increments seem to work better 475 than Incerpi-Sedgewick here. Possibly 476 because the number of elems to sort is 477 usually small, typically <= 20. 478--*/ 479static 480Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280, 481 9841, 29524, 88573, 265720, 482 797161, 2391484 }; 483 484static 485void mainSimpleSort ( UInt32* ptr, 486 UChar* block, 487 UInt16* quadrant, 488 Int32 nblock, 489 Int32 lo, 490 Int32 hi, 491 Int32 d, 492 Int32* budget ) 493{ 494 Int32 i, j, h, bigN, hp; 495 UInt32 v; 496 497 bigN = hi - lo + 1; 498 if (bigN < 2) return; 499 500 hp = 0; 501 while (incs[hp] < bigN) hp++; 502 hp--; 503 504 for (; hp >= 0; hp--) { 505 h = incs[hp]; 506 507 i = lo + h; 508 while (True) { 509 510 /*-- copy 1 --*/ 511 if (i > hi) break; 512 v = ptr[i]; 513 j = i; 514 while ( mainGtU ( 515 ptr[j-h]+d, v+d, block, quadrant, nblock, budget 516 ) ) { 517 ptr[j] = ptr[j-h]; 518 j = j - h; 519 if (j <= (lo + h - 1)) break; 520 } 521 ptr[j] = v; 522 i++; 523 524 /*-- copy 2 --*/ 525 if (i > hi) break; 526 v = ptr[i]; 527 j = i; 528 while ( mainGtU ( 529 ptr[j-h]+d, v+d, block, quadrant, nblock, budget 530 ) ) { 531 ptr[j] = ptr[j-h]; 532 j = j - h; 533 if (j <= (lo + h - 1)) break; 534 } 535 ptr[j] = v; 536 i++; 537 538 /*-- copy 3 --*/ 539 if (i > hi) break; 540 v = ptr[i]; 541 j = i; 542 while ( mainGtU ( 543 ptr[j-h]+d, v+d, block, quadrant, nblock, budget 544 ) ) { 545 ptr[j] = ptr[j-h]; 546 j = j - h; 547 if (j <= (lo + h - 1)) break; 548 } 549 ptr[j] = v; 550 i++; 551 552 if (*budget < 0) return; 553 } 554 } 555} 556 557 558/*---------------------------------------------*/ 559/*-- 560 The following is an implementation of 561 an elegant 3-way quicksort for strings, 562 described in a paper "Fast Algorithms for 563 Sorting and Searching Strings", by Robert 564 Sedgewick and Jon L. Bentley. 565--*/ 566 567#define mswap(zz1, zz2) \ 568 { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; } 569 570#define mvswap(zzp1, zzp2, zzn) \ 571{ \ 572 Int32 yyp1 = (zzp1); \ 573 Int32 yyp2 = (zzp2); \ 574 Int32 yyn = (zzn); \ 575 while (yyn > 0) { \ 576 mswap(ptr[yyp1], ptr[yyp2]); \ 577 yyp1++; yyp2++; yyn--; \ 578 } \ 579} 580 581static 582__inline__ 583UChar mmed3 ( UChar a, UChar b, UChar c ) 584{ 585 UChar t; 586 if (a > b) { t = a; a = b; b = t; }; 587 if (b > c) { 588 b = c; 589 if (a > b) b = a; 590 } 591 return b; 592} 593 594#define mmin(a,b) ((a) < (b)) ? (a) : (b) 595 596#define mpush(lz,hz,dz) { stackLo[sp] = lz; \ 597 stackHi[sp] = hz; \ 598 stackD [sp] = dz; \ 599 sp++; } 600 601#define mpop(lz,hz,dz) { sp--; \ 602 lz = stackLo[sp]; \ 603 hz = stackHi[sp]; \ 604 dz = stackD [sp]; } 605 606 607#define mnextsize(az) (nextHi[az]-nextLo[az]) 608 609#define mnextswap(az,bz) \ 610 { Int32 tz; \ 611 tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \ 612 tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \ 613 tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; } 614 615 616#define MAIN_QSORT_SMALL_THRESH 20 617#define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT) 618#define MAIN_QSORT_STACK_SIZE 100 619 620static 621void mainQSort3 ( UInt32* ptr, 622 UChar* block, 623 UInt16* quadrant, 624 Int32 nblock, 625 Int32 loSt, 626 Int32 hiSt, 627 Int32 dSt, 628 Int32* budget ) 629{ 630 Int32 unLo, unHi, ltLo, gtHi, n, m, med; 631 Int32 sp, lo, hi, d; 632 633 Int32 stackLo[MAIN_QSORT_STACK_SIZE]; 634 Int32 stackHi[MAIN_QSORT_STACK_SIZE]; 635 Int32 stackD [MAIN_QSORT_STACK_SIZE]; 636 637 Int32 nextLo[3]; 638 Int32 nextHi[3]; 639 Int32 nextD [3]; 640 641 sp = 0; 642 mpush ( loSt, hiSt, dSt ); 643 644 while (sp > 0) { 645 646 AssertH ( sp < MAIN_QSORT_STACK_SIZE - 2, 1001 ); 647 648 mpop ( lo, hi, d ); 649 if (hi - lo < MAIN_QSORT_SMALL_THRESH || 650 d > MAIN_QSORT_DEPTH_THRESH) { 651 mainSimpleSort ( ptr, block, quadrant, nblock, lo, hi, d, budget ); 652 if (*budget < 0) return; 653 continue; 654 } 655 656 med = (Int32) 657 mmed3 ( block[ptr[ lo ]+d], 658 block[ptr[ hi ]+d], 659 block[ptr[ (lo+hi)>>1 ]+d] ); 660 661 unLo = ltLo = lo; 662 unHi = gtHi = hi; 663 664 while (True) { 665 while (True) { 666 if (unLo > unHi) break; 667 n = ((Int32)block[ptr[unLo]+d]) - med; 668 if (n == 0) { 669 mswap(ptr[unLo], ptr[ltLo]); 670 ltLo++; unLo++; continue; 671 }; 672 if (n > 0) break; 673 unLo++; 674 } 675 while (True) { 676 if (unLo > unHi) break; 677 n = ((Int32)block[ptr[unHi]+d]) - med; 678 if (n == 0) { 679 mswap(ptr[unHi], ptr[gtHi]); 680 gtHi--; unHi--; continue; 681 }; 682 if (n < 0) break; 683 unHi--; 684 } 685 if (unLo > unHi) break; 686 mswap(ptr[unLo], ptr[unHi]); unLo++; unHi--; 687 } 688 689 AssertD ( unHi == unLo-1, "mainQSort3(2)" ); 690 691 if (gtHi < ltLo) { 692 mpush(lo, hi, d+1 ); 693 continue; 694 } 695 696 n = mmin(ltLo-lo, unLo-ltLo); mvswap(lo, unLo-n, n); 697 m = mmin(hi-gtHi, gtHi-unHi); mvswap(unLo, hi-m+1, m); 698 699 n = lo + unLo - ltLo - 1; 700 m = hi - (gtHi - unHi) + 1; 701 702 nextLo[0] = lo; nextHi[0] = n; nextD[0] = d; 703 nextLo[1] = m; nextHi[1] = hi; nextD[1] = d; 704 nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1; 705 706 if (mnextsize(0) < mnextsize(1)) mnextswap(0,1); 707 if (mnextsize(1) < mnextsize(2)) mnextswap(1,2); 708 if (mnextsize(0) < mnextsize(1)) mnextswap(0,1); 709 710 AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)" ); 711 AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)" ); 712 713 mpush (nextLo[0], nextHi[0], nextD[0]); 714 mpush (nextLo[1], nextHi[1], nextD[1]); 715 mpush (nextLo[2], nextHi[2], nextD[2]); 716 } 717} 718 719#undef mswap 720#undef mvswap 721#undef mpush 722#undef mpop 723#undef mmin 724#undef mnextsize 725#undef mnextswap 726#undef MAIN_QSORT_SMALL_THRESH 727#undef MAIN_QSORT_DEPTH_THRESH 728#undef MAIN_QSORT_STACK_SIZE 729 730 731/*---------------------------------------------*/ 732/* Pre: 733 nblock > N_OVERSHOOT 734 block32 exists for [0 .. nblock-1 +N_OVERSHOOT] 735 ((UChar*)block32) [0 .. nblock-1] holds block 736 ptr exists for [0 .. nblock-1] 737 738 Post: 739 ((UChar*)block32) [0 .. nblock-1] holds block 740 All other areas of block32 destroyed 741 ftab [0 .. 65536 ] destroyed 742 ptr [0 .. nblock-1] holds sorted order 743 if (*budget < 0), sorting was abandoned 744*/ 745 746#define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8]) 747#define SETMASK (1 << 21) 748#define CLEARMASK (~(SETMASK)) 749 750static 751void mainSort ( UInt32* ptr, 752 UChar* block, 753 UInt16* quadrant, 754 UInt32* ftab, 755 Int32 nblock, 756 Int32 verb, 757 Int32* budget ) 758{ 759 Int32 i, j, k, ss, sb; 760 Int32 runningOrder[256]; 761 Bool bigDone[256]; 762 Int32 copyStart[256]; 763 Int32 copyEnd [256]; 764 UChar c1; 765 Int32 numQSorted; 766 UInt16 s; 767 if (verb >= 4) VPrintf0 ( " main sort initialise ...\n" ); 768 769 /*-- set up the 2-byte frequency table --*/ 770 for (i = 65536; i >= 0; i--) ftab[i] = 0; 771 772 j = block[0] << 8; 773 i = nblock-1; 774 for (; i >= 3; i -= 4) { 775 quadrant[i] = 0; 776 j = (j >> 8) | ( ((UInt16)block[i]) << 8); 777 ftab[j]++; 778 quadrant[i-1] = 0; 779 j = (j >> 8) | ( ((UInt16)block[i-1]) << 8); 780 ftab[j]++; 781 quadrant[i-2] = 0; 782 j = (j >> 8) | ( ((UInt16)block[i-2]) << 8); 783 ftab[j]++; 784 quadrant[i-3] = 0; 785 j = (j >> 8) | ( ((UInt16)block[i-3]) << 8); 786 ftab[j]++; 787 } 788 for (; i >= 0; i--) { 789 quadrant[i] = 0; 790 j = (j >> 8) | ( ((UInt16)block[i]) << 8); 791 ftab[j]++; 792 } 793 794 /*-- (emphasises close relationship of block & quadrant) --*/ 795 for (i = 0; i < BZ_N_OVERSHOOT; i++) { 796 block [nblock+i] = block[i]; 797 quadrant[nblock+i] = 0; 798 } 799 800 if (verb >= 4) VPrintf0 ( " bucket sorting ...\n" ); 801 802 /*-- Complete the initial radix sort --*/ 803 for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1]; 804 805 s = block[0] << 8; 806 i = nblock-1; 807 for (; i >= 3; i -= 4) { 808 s = (s >> 8) | (block[i] << 8); 809 j = ftab[s] -1; 810 ftab[s] = j; 811 ptr[j] = i; 812 s = (s >> 8) | (block[i-1] << 8); 813 j = ftab[s] -1; 814 ftab[s] = j; 815 ptr[j] = i-1; 816 s = (s >> 8) | (block[i-2] << 8); 817 j = ftab[s] -1; 818 ftab[s] = j; 819 ptr[j] = i-2; 820 s = (s >> 8) | (block[i-3] << 8); 821 j = ftab[s] -1; 822 ftab[s] = j; 823 ptr[j] = i-3; 824 } 825 for (; i >= 0; i--) { 826 s = (s >> 8) | (block[i] << 8); 827 j = ftab[s] -1; 828 ftab[s] = j; 829 ptr[j] = i; 830 } 831 832 /*-- 833 Now ftab contains the first loc of every small bucket. 834 Calculate the running order, from smallest to largest 835 big bucket. 836 --*/ 837 for (i = 0; i <= 255; i++) { 838 bigDone [i] = False; 839 runningOrder[i] = i; 840 } 841 842 { 843 Int32 vv; 844 Int32 h = 1; 845 do h = 3 * h + 1; while (h <= 256); 846 do { 847 h = h / 3; 848 for (i = h; i <= 255; i++) { 849 vv = runningOrder[i]; 850 j = i; 851 while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) { 852 runningOrder[j] = runningOrder[j-h]; 853 j = j - h; 854 if (j <= (h - 1)) goto zero; 855 } 856 zero: 857 runningOrder[j] = vv; 858 } 859 } while (h != 1); 860 } 861 862 /*-- 863 The main sorting loop. 864 --*/ 865 866 numQSorted = 0; 867 868 for (i = 0; i <= 255; i++) { 869 870 /*-- 871 Process big buckets, starting with the least full. 872 Basically this is a 3-step process in which we call 873 mainQSort3 to sort the small buckets [ss, j], but 874 also make a big effort to avoid the calls if we can. 875 --*/ 876 ss = runningOrder[i]; 877 878 /*-- 879 Step 1: 880 Complete the big bucket [ss] by quicksorting 881 any unsorted small buckets [ss, j], for j != ss. 882 Hopefully previous pointer-scanning phases have already 883 completed many of the small buckets [ss, j], so 884 we don't have to sort them at all. 885 --*/ 886 for (j = 0; j <= 255; j++) { 887 if (j != ss) { 888 sb = (ss << 8) + j; 889 if ( ! (ftab[sb] & SETMASK) ) { 890 Int32 lo = ftab[sb] & CLEARMASK; 891 Int32 hi = (ftab[sb+1] & CLEARMASK) - 1; 892 if (hi > lo) { 893 if (verb >= 4) 894 VPrintf4 ( " qsort [0x%x, 0x%x] " 895 "done %d this %d\n", 896 ss, j, numQSorted, hi - lo + 1 ); 897 mainQSort3 ( 898 ptr, block, quadrant, nblock, 899 lo, hi, BZ_N_RADIX, budget 900 ); 901 numQSorted += (hi - lo + 1); 902 if (*budget < 0) return; 903 } 904 } 905 ftab[sb] |= SETMASK; 906 } 907 } 908 909 AssertH ( !bigDone[ss], 1006 ); 910 911 /*-- 912 Step 2: 913 Now scan this big bucket [ss] so as to synthesise the 914 sorted order for small buckets [t, ss] for all t, 915 including, magically, the bucket [ss,ss] too. 916 This will avoid doing Real Work in subsequent Step 1's. 917 --*/ 918 { 919 for (j = 0; j <= 255; j++) { 920 copyStart[j] = ftab[(j << 8) + ss] & CLEARMASK; 921 copyEnd [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1; 922 } 923 for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) { 924 k = ptr[j]-1; if (k < 0) k += nblock; 925 c1 = block[k]; 926 if (!bigDone[c1]) 927 ptr[ copyStart[c1]++ ] = k; 928 } 929 for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) { 930 k = ptr[j]-1; if (k < 0) k += nblock; 931 c1 = block[k]; 932 if (!bigDone[c1]) 933 ptr[ copyEnd[c1]-- ] = k; 934 } 935 } 936 937 AssertH ( (copyStart[ss]-1 == copyEnd[ss]) 938 || 939 /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1. 940 Necessity for this case is demonstrated by compressing 941 a sequence of approximately 48.5 million of character 942 251; 1.0.0/1.0.1 will then die here. */ 943 (copyStart[ss] == 0 && copyEnd[ss] == nblock-1), 944 1007 ) 945 946 for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK; 947 948 /*-- 949 Step 3: 950 The [ss] big bucket is now done. Record this fact, 951 and update the quadrant descriptors. Remember to 952 update quadrants in the overshoot area too, if 953 necessary. The "if (i < 255)" test merely skips 954 this updating for the last bucket processed, since 955 updating for the last bucket is pointless. 956 957 The quadrant array provides a way to incrementally 958 cache sort orderings, as they appear, so as to 959 make subsequent comparisons in fullGtU() complete 960 faster. For repetitive blocks this makes a big 961 difference (but not big enough to be able to avoid 962 the fallback sorting mechanism, exponential radix sort). 963 964 The precise meaning is: at all times: 965 966 for 0 <= i < nblock and 0 <= j <= nblock 967 968 if block[i] != block[j], 969 970 then the relative values of quadrant[i] and 971 quadrant[j] are meaningless. 972 973 else { 974 if quadrant[i] < quadrant[j] 975 then the string starting at i lexicographically 976 precedes the string starting at j 977 978 else if quadrant[i] > quadrant[j] 979 then the string starting at j lexicographically 980 precedes the string starting at i 981 982 else 983 the relative ordering of the strings starting 984 at i and j has not yet been determined. 985 } 986 --*/ 987 bigDone[ss] = True; 988 989 if (i < 255) { 990 Int32 bbStart = ftab[ss << 8] & CLEARMASK; 991 Int32 bbSize = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart; 992 Int32 shifts = 0; 993 994 while ((bbSize >> shifts) > 65534) shifts++; 995 996 for (j = bbSize-1; j >= 0; j--) { 997 Int32 a2update = ptr[bbStart + j]; 998 UInt16 qVal = (UInt16)(j >> shifts); 999 quadrant[a2update] = qVal; 1000 if (a2update < BZ_N_OVERSHOOT) 1001 quadrant[a2update + nblock] = qVal; 1002 } 1003 AssertH ( ((bbSize-1) >> shifts) <= 65535, 1002 ); 1004 } 1005 1006 } 1007 1008 if (verb >= 4) 1009 VPrintf3 ( " %d pointers, %d sorted, %d scanned\n", 1010 nblock, numQSorted, nblock - numQSorted ); 1011} 1012 1013#undef BIGFREQ 1014#undef SETMASK 1015#undef CLEARMASK 1016 1017 1018/*---------------------------------------------*/ 1019/* Pre: 1020 nblock > 0 1021 arr2 exists for [0 .. nblock-1 +N_OVERSHOOT] 1022 ((UChar*)arr2) [0 .. nblock-1] holds block 1023 arr1 exists for [0 .. nblock-1] 1024 1025 Post: 1026 ((UChar*)arr2) [0 .. nblock-1] holds block 1027 All other areas of block destroyed 1028 ftab [ 0 .. 65536 ] destroyed 1029 arr1 [0 .. nblock-1] holds sorted order 1030*/ 1031void BZ2_blockSort ( EState* s ) 1032{ 1033 UInt32* ptr = s->ptr; 1034 UChar* block = s->block; 1035 UInt32* ftab = s->ftab; 1036 Int32 nblock = s->nblock; 1037 Int32 verb = s->verbosity; 1038 Int32 wfact = s->workFactor; 1039 UInt16* quadrant; 1040 Int32 budget; 1041 Int32 budgetInit; 1042 Int32 i; 1043 1044 if (nblock < 10000) { 1045 fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb ); 1046 } else { 1047 /* Calculate the location for quadrant, remembering to get 1048 the alignment right. Assumes that &(block[0]) is at least 1049 2-byte aligned -- this should be ok since block is really 1050 the first section of arr2. 1051 */ 1052 i = nblock+BZ_N_OVERSHOOT; 1053 if (i & 1) i++; 1054 quadrant = (UInt16*)(&(block[i])); 1055 1056 /* (wfact-1) / 3 puts the default-factor-30 1057 transition point at very roughly the same place as 1058 with v0.1 and v0.9.0. 1059 Not that it particularly matters any more, since the 1060 resulting compressed stream is now the same regardless 1061 of whether or not we use the main sort or fallback sort. 1062 */ 1063 if (wfact < 1 ) wfact = 1; 1064 if (wfact > 100) wfact = 100; 1065 budgetInit = nblock * ((wfact-1) / 3); 1066 budget = budgetInit; 1067 1068 mainSort ( ptr, block, quadrant, ftab, nblock, verb, &budget ); 1069 if (verb >= 3) 1070 VPrintf3 ( " %d work, %d block, ratio %5.2f\n", 1071 budgetInit - budget, 1072 nblock, 1073 (float)(budgetInit - budget) / 1074 (float)(nblock==0 ? 1 : nblock) ); 1075 if (budget < 0) { 1076 if (verb >= 2) 1077 VPrintf0 ( " too repetitive; using fallback" 1078 " sorting algorithm\n" ); 1079 fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb ); 1080 } 1081 } 1082 1083 s->origPtr = -1; 1084 for (i = 0; i < s->nblock; i++) 1085 if (ptr[i] == 0) 1086 { s->origPtr = i; break; }; 1087 1088 AssertH( s->origPtr != -1, 1003 ); 1089} 1090 1091 1092/*-------------------------------------------------------------*/ 1093/*--- end blocksort.c ---*/ 1094/*-------------------------------------------------------------*/ 1095