1/* 2 * bzip2 is written by Julian Seward <jseward@bzip.org>. 3 * Adapted for busybox by Denys Vlasenko <vda.linux@googlemail.com>. 4 * See README and LICENSE files in this directory for more information. 5 */ 6 7/*-------------------------------------------------------------*/ 8/*--- Block sorting machinery ---*/ 9/*--- blocksort.c ---*/ 10/*-------------------------------------------------------------*/ 11 12/* ------------------------------------------------------------------ 13This file is part of bzip2/libbzip2, a program and library for 14lossless, block-sorting data compression. 15 16bzip2/libbzip2 version 1.0.4 of 20 December 2006 17Copyright (C) 1996-2006 Julian Seward <jseward@bzip.org> 18 19Please read the WARNING, DISCLAIMER and PATENTS sections in the 20README file. 21 22This program is released under the terms of the license contained 23in the file LICENSE. 24------------------------------------------------------------------ */ 25 26/* #include "bzlib_private.h" */ 27 28#define mswap(zz1, zz2) \ 29{ \ 30 int32_t zztmp = zz1; \ 31 zz1 = zz2; \ 32 zz2 = zztmp; \ 33} 34 35static 36/* No measurable speed gain with inlining */ 37/* ALWAYS_INLINE */ 38void mvswap(uint32_t* ptr, int32_t zzp1, int32_t zzp2, int32_t zzn) 39{ 40 while (zzn > 0) { 41 mswap(ptr[zzp1], ptr[zzp2]); 42 zzp1++; 43 zzp2++; 44 zzn--; 45 } 46} 47 48static 49ALWAYS_INLINE 50int32_t mmin(int32_t a, int32_t b) 51{ 52 return (a < b) ? a : b; 53} 54 55 56/*---------------------------------------------*/ 57/*--- Fallback O(N log(N)^2) sorting ---*/ 58/*--- algorithm, for repetitive blocks ---*/ 59/*---------------------------------------------*/ 60 61/*---------------------------------------------*/ 62static 63inline 64void fallbackSimpleSort(uint32_t* fmap, 65 uint32_t* eclass, 66 int32_t lo, 67 int32_t hi) 68{ 69 int32_t i, j, tmp; 70 uint32_t ec_tmp; 71 72 if (lo == hi) return; 73 74 if (hi - lo > 3) { 75 for (i = hi-4; i >= lo; i--) { 76 tmp = fmap[i]; 77 ec_tmp = eclass[tmp]; 78 for (j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4) 79 fmap[j-4] = fmap[j]; 80 fmap[j-4] = tmp; 81 } 82 } 83 84 for (i = hi-1; i >= lo; i--) { 85 tmp = fmap[i]; 86 ec_tmp = eclass[tmp]; 87 for (j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++) 88 fmap[j-1] = fmap[j]; 89 fmap[j-1] = tmp; 90 } 91} 92 93 94/*---------------------------------------------*/ 95#define fpush(lz,hz) { \ 96 stackLo[sp] = lz; \ 97 stackHi[sp] = hz; \ 98 sp++; \ 99} 100 101#define fpop(lz,hz) { \ 102 sp--; \ 103 lz = stackLo[sp]; \ 104 hz = stackHi[sp]; \ 105} 106 107#define FALLBACK_QSORT_SMALL_THRESH 10 108#define FALLBACK_QSORT_STACK_SIZE 100 109 110static 111void fallbackQSort3(uint32_t* fmap, 112 uint32_t* eclass, 113 int32_t loSt, 114 int32_t hiSt) 115{ 116 int32_t unLo, unHi, ltLo, gtHi, n, m; 117 int32_t sp, lo, hi; 118 uint32_t med, r, r3; 119 int32_t stackLo[FALLBACK_QSORT_STACK_SIZE]; 120 int32_t stackHi[FALLBACK_QSORT_STACK_SIZE]; 121 122 r = 0; 123 124 sp = 0; 125 fpush(loSt, hiSt); 126 127 while (sp > 0) { 128 AssertH(sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004); 129 130 fpop(lo, hi); 131 if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) { 132 fallbackSimpleSort(fmap, eclass, lo, hi); 133 continue; 134 } 135 136 /* Random partitioning. Median of 3 sometimes fails to 137 * avoid bad cases. Median of 9 seems to help but 138 * looks rather expensive. This too seems to work but 139 * is cheaper. Guidance for the magic constants 140 * 7621 and 32768 is taken from Sedgewick's algorithms 141 * book, chapter 35. 142 */ 143 r = ((r * 7621) + 1) % 32768; 144 r3 = r % 3; 145 if (r3 == 0) 146 med = eclass[fmap[lo]]; 147 else if (r3 == 1) 148 med = eclass[fmap[(lo+hi)>>1]]; 149 else 150 med = eclass[fmap[hi]]; 151 152 unLo = ltLo = lo; 153 unHi = gtHi = hi; 154 155 while (1) { 156 while (1) { 157 if (unLo > unHi) break; 158 n = (int32_t)eclass[fmap[unLo]] - (int32_t)med; 159 if (n == 0) { 160 mswap(fmap[unLo], fmap[ltLo]); 161 ltLo++; 162 unLo++; 163 continue; 164 }; 165 if (n > 0) break; 166 unLo++; 167 } 168 while (1) { 169 if (unLo > unHi) break; 170 n = (int32_t)eclass[fmap[unHi]] - (int32_t)med; 171 if (n == 0) { 172 mswap(fmap[unHi], fmap[gtHi]); 173 gtHi--; unHi--; 174 continue; 175 }; 176 if (n < 0) break; 177 unHi--; 178 } 179 if (unLo > unHi) break; 180 mswap(fmap[unLo], fmap[unHi]); unLo++; unHi--; 181 } 182 183 AssertD(unHi == unLo-1, "fallbackQSort3(2)"); 184 185 if (gtHi < ltLo) continue; 186 187 n = mmin(ltLo-lo, unLo-ltLo); mvswap(fmap, lo, unLo-n, n); 188 m = mmin(hi-gtHi, gtHi-unHi); mvswap(fmap, unLo, hi-m+1, m); 189 190 n = lo + unLo - ltLo - 1; 191 m = hi - (gtHi - unHi) + 1; 192 193 if (n - lo > hi - m) { 194 fpush(lo, n); 195 fpush(m, hi); 196 } else { 197 fpush(m, hi); 198 fpush(lo, n); 199 } 200 } 201} 202 203#undef fpush 204#undef fpop 205#undef FALLBACK_QSORT_SMALL_THRESH 206#undef FALLBACK_QSORT_STACK_SIZE 207 208 209/*---------------------------------------------*/ 210/* Pre: 211 * nblock > 0 212 * eclass exists for [0 .. nblock-1] 213 * ((uint8_t*)eclass) [0 .. nblock-1] holds block 214 * ptr exists for [0 .. nblock-1] 215 * 216 * Post: 217 * ((uint8_t*)eclass) [0 .. nblock-1] holds block 218 * All other areas of eclass destroyed 219 * fmap [0 .. nblock-1] holds sorted order 220 * bhtab[0 .. 2+(nblock/32)] destroyed 221*/ 222 223#define SET_BH(zz) bhtab[(zz) >> 5] |= (1 << ((zz) & 31)) 224#define CLEAR_BH(zz) bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31)) 225#define ISSET_BH(zz) (bhtab[(zz) >> 5] & (1 << ((zz) & 31))) 226#define WORD_BH(zz) bhtab[(zz) >> 5] 227#define UNALIGNED_BH(zz) ((zz) & 0x01f) 228 229static 230void fallbackSort(uint32_t* fmap, 231 uint32_t* eclass, 232 uint32_t* bhtab, 233 int32_t nblock) 234{ 235 int32_t ftab[257]; 236 int32_t ftabCopy[256]; 237 int32_t H, i, j, k, l, r, cc, cc1; 238 int32_t nNotDone; 239 int32_t nBhtab; 240 uint8_t* eclass8 = (uint8_t*)eclass; 241 242 /* 243 * Initial 1-char radix sort to generate 244 * initial fmap and initial BH bits. 245 */ 246 for (i = 0; i < 257; i++) ftab[i] = 0; 247 for (i = 0; i < nblock; i++) ftab[eclass8[i]]++; 248 for (i = 0; i < 256; i++) ftabCopy[i] = ftab[i]; 249 250 j = ftab[0]; /* bbox: optimized */ 251 for (i = 1; i < 257; i++) { 252 j += ftab[i]; 253 ftab[i] = j; 254 } 255 256 for (i = 0; i < nblock; i++) { 257 j = eclass8[i]; 258 k = ftab[j] - 1; 259 ftab[j] = k; 260 fmap[k] = i; 261 } 262 263 nBhtab = 2 + ((uint32_t)nblock / 32); /* bbox: unsigned div is easier */ 264 for (i = 0; i < nBhtab; i++) bhtab[i] = 0; 265 for (i = 0; i < 256; i++) SET_BH(ftab[i]); 266 267 /* 268 * Inductively refine the buckets. Kind-of an 269 * "exponential radix sort" (!), inspired by the 270 * Manber-Myers suffix array construction algorithm. 271 */ 272 273 /*-- set sentinel bits for block-end detection --*/ 274 for (i = 0; i < 32; i++) { 275 SET_BH(nblock + 2*i); 276 CLEAR_BH(nblock + 2*i + 1); 277 } 278 279 /*-- the log(N) loop --*/ 280 H = 1; 281 while (1) { 282 j = 0; 283 for (i = 0; i < nblock; i++) { 284 if (ISSET_BH(i)) 285 j = i; 286 k = fmap[i] - H; 287 if (k < 0) 288 k += nblock; 289 eclass[k] = j; 290 } 291 292 nNotDone = 0; 293 r = -1; 294 while (1) { 295 296 /*-- find the next non-singleton bucket --*/ 297 k = r + 1; 298 while (ISSET_BH(k) && UNALIGNED_BH(k)) 299 k++; 300 if (ISSET_BH(k)) { 301 while (WORD_BH(k) == 0xffffffff) k += 32; 302 while (ISSET_BH(k)) k++; 303 } 304 l = k - 1; 305 if (l >= nblock) 306 break; 307 while (!ISSET_BH(k) && UNALIGNED_BH(k)) 308 k++; 309 if (!ISSET_BH(k)) { 310 while (WORD_BH(k) == 0x00000000) k += 32; 311 while (!ISSET_BH(k)) k++; 312 } 313 r = k - 1; 314 if (r >= nblock) 315 break; 316 317 /*-- now [l, r] bracket current bucket --*/ 318 if (r > l) { 319 nNotDone += (r - l + 1); 320 fallbackQSort3(fmap, eclass, l, r); 321 322 /*-- scan bucket and generate header bits-- */ 323 cc = -1; 324 for (i = l; i <= r; i++) { 325 cc1 = eclass[fmap[i]]; 326 if (cc != cc1) { 327 SET_BH(i); 328 cc = cc1; 329 }; 330 } 331 } 332 } 333 334 H *= 2; 335 if (H > nblock || nNotDone == 0) 336 break; 337 } 338 339 /* 340 * Reconstruct the original block in 341 * eclass8 [0 .. nblock-1], since the 342 * previous phase destroyed it. 343 */ 344 j = 0; 345 for (i = 0; i < nblock; i++) { 346 while (ftabCopy[j] == 0) 347 j++; 348 ftabCopy[j]--; 349 eclass8[fmap[i]] = (uint8_t)j; 350 } 351 AssertH(j < 256, 1005); 352} 353 354#undef SET_BH 355#undef CLEAR_BH 356#undef ISSET_BH 357#undef WORD_BH 358#undef UNALIGNED_BH 359 360 361/*---------------------------------------------*/ 362/*--- The main, O(N^2 log(N)) sorting ---*/ 363/*--- algorithm. Faster for "normal" ---*/ 364/*--- non-repetitive blocks. ---*/ 365/*---------------------------------------------*/ 366 367/*---------------------------------------------*/ 368static 369NOINLINE 370int mainGtU( 371 uint32_t i1, 372 uint32_t i2, 373 uint8_t* block, 374 uint16_t* quadrant, 375 uint32_t nblock, 376 int32_t* budget) 377{ 378 int32_t k; 379 uint8_t c1, c2; 380 uint16_t s1, s2; 381 382/* Loop unrolling here is actually very useful 383 * (generated code is much simpler), 384 * code size increase is only 270 bytes (i386) 385 * but speeds up compression 10% overall 386 */ 387 388#if CONFIG_BZIP2_FEATURE_SPEED >= 1 389 390#define TIMES_8(code) \ 391 code; code; code; code; \ 392 code; code; code; code; 393#define TIMES_12(code) \ 394 code; code; code; code; \ 395 code; code; code; code; \ 396 code; code; code; code; 397 398#else 399 400#define TIMES_8(code) \ 401{ \ 402 int nn = 8; \ 403 do { \ 404 code; \ 405 } while (--nn); \ 406} 407#define TIMES_12(code) \ 408{ \ 409 int nn = 12; \ 410 do { \ 411 code; \ 412 } while (--nn); \ 413} 414 415#endif 416 417 AssertD(i1 != i2, "mainGtU"); 418 TIMES_12( 419 c1 = block[i1]; c2 = block[i2]; 420 if (c1 != c2) return (c1 > c2); 421 i1++; i2++; 422 ) 423 424 k = nblock + 8; 425 426 do { 427 TIMES_8( 428 c1 = block[i1]; c2 = block[i2]; 429 if (c1 != c2) return (c1 > c2); 430 s1 = quadrant[i1]; s2 = quadrant[i2]; 431 if (s1 != s2) return (s1 > s2); 432 i1++; i2++; 433 ) 434 435 if (i1 >= nblock) i1 -= nblock; 436 if (i2 >= nblock) i2 -= nblock; 437 438 (*budget)--; 439 k -= 8; 440 } while (k >= 0); 441 442 return False; 443} 444#undef TIMES_8 445#undef TIMES_12 446 447/*---------------------------------------------*/ 448/* 449 * Knuth's increments seem to work better 450 * than Incerpi-Sedgewick here. Possibly 451 * because the number of elems to sort is 452 * usually small, typically <= 20. 453 */ 454static 455const int32_t incs[14] = { 456 1, 4, 13, 40, 121, 364, 1093, 3280, 457 9841, 29524, 88573, 265720, 458 797161, 2391484 459}; 460 461static 462void mainSimpleSort(uint32_t* ptr, 463 uint8_t* block, 464 uint16_t* quadrant, 465 int32_t nblock, 466 int32_t lo, 467 int32_t hi, 468 int32_t d, 469 int32_t* budget) 470{ 471 int32_t i, j, h, bigN, hp; 472 uint32_t v; 473 474 bigN = hi - lo + 1; 475 if (bigN < 2) return; 476 477 hp = 0; 478 while (incs[hp] < bigN) hp++; 479 hp--; 480 481 for (; hp >= 0; hp--) { 482 h = incs[hp]; 483 484 i = lo + h; 485 while (1) { 486 /*-- copy 1 --*/ 487 if (i > hi) break; 488 v = ptr[i]; 489 j = i; 490 while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) { 491 ptr[j] = ptr[j-h]; 492 j = j - h; 493 if (j <= (lo + h - 1)) break; 494 } 495 ptr[j] = v; 496 i++; 497 498/* 1.5% overall speedup, +290 bytes */ 499#if CONFIG_BZIP2_FEATURE_SPEED >= 3 500 /*-- copy 2 --*/ 501 if (i > hi) break; 502 v = ptr[i]; 503 j = i; 504 while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) { 505 ptr[j] = ptr[j-h]; 506 j = j - h; 507 if (j <= (lo + h - 1)) break; 508 } 509 ptr[j] = v; 510 i++; 511 512 /*-- copy 3 --*/ 513 if (i > hi) break; 514 v = ptr[i]; 515 j = i; 516 while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) { 517 ptr[j] = ptr[j-h]; 518 j = j - h; 519 if (j <= (lo + h - 1)) break; 520 } 521 ptr[j] = v; 522 i++; 523#endif 524 if (*budget < 0) return; 525 } 526 } 527} 528 529 530/*---------------------------------------------*/ 531/* 532 * The following is an implementation of 533 * an elegant 3-way quicksort for strings, 534 * described in a paper "Fast Algorithms for 535 * Sorting and Searching Strings", by Robert 536 * Sedgewick and Jon L. Bentley. 537 */ 538 539static 540ALWAYS_INLINE 541uint8_t mmed3(uint8_t a, uint8_t b, uint8_t c) 542{ 543 uint8_t t; 544 if (a > b) { 545 t = a; 546 a = b; 547 b = t; 548 }; 549 /* here b >= a */ 550 if (b > c) { 551 b = c; 552 if (a > b) 553 b = a; 554 } 555 return b; 556} 557 558#define mpush(lz,hz,dz) \ 559{ \ 560 stackLo[sp] = lz; \ 561 stackHi[sp] = hz; \ 562 stackD [sp] = dz; \ 563 sp++; \ 564} 565 566#define mpop(lz,hz,dz) \ 567{ \ 568 sp--; \ 569 lz = stackLo[sp]; \ 570 hz = stackHi[sp]; \ 571 dz = stackD [sp]; \ 572} 573 574#define mnextsize(az) (nextHi[az] - nextLo[az]) 575 576#define mnextswap(az,bz) \ 577{ \ 578 int32_t tz; \ 579 tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \ 580 tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \ 581 tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; \ 582} 583 584#define MAIN_QSORT_SMALL_THRESH 20 585#define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT) 586#define MAIN_QSORT_STACK_SIZE 100 587 588static NOINLINE 589void mainQSort3(uint32_t* ptr, 590 uint8_t* block, 591 uint16_t* quadrant, 592 int32_t nblock, 593 int32_t loSt, 594 int32_t hiSt, 595 int32_t dSt, 596 int32_t* budget) 597{ 598 int32_t unLo, unHi, ltLo, gtHi, n, m, med; 599 int32_t sp, lo, hi, d; 600 601 int32_t stackLo[MAIN_QSORT_STACK_SIZE]; 602 int32_t stackHi[MAIN_QSORT_STACK_SIZE]; 603 int32_t stackD [MAIN_QSORT_STACK_SIZE]; 604 605 int32_t nextLo[3]; 606 int32_t nextHi[3]; 607 int32_t nextD [3]; 608 609 sp = 0; 610 mpush(loSt, hiSt, dSt); 611 612 while (sp > 0) { 613 AssertH(sp < MAIN_QSORT_STACK_SIZE - 2, 1001); 614 615 mpop(lo, hi, d); 616 if (hi - lo < MAIN_QSORT_SMALL_THRESH 617 || d > MAIN_QSORT_DEPTH_THRESH 618 ) { 619 mainSimpleSort(ptr, block, quadrant, nblock, lo, hi, d, budget); 620 if (*budget < 0) 621 return; 622 continue; 623 } 624 med = (int32_t) mmed3(block[ptr[lo ] + d], 625 block[ptr[hi ] + d], 626 block[ptr[(lo+hi) >> 1] + d]); 627 628 unLo = ltLo = lo; 629 unHi = gtHi = hi; 630 631 while (1) { 632 while (1) { 633 if (unLo > unHi) 634 break; 635 n = ((int32_t)block[ptr[unLo]+d]) - med; 636 if (n == 0) { 637 mswap(ptr[unLo], ptr[ltLo]); 638 ltLo++; 639 unLo++; 640 continue; 641 }; 642 if (n > 0) break; 643 unLo++; 644 } 645 while (1) { 646 if (unLo > unHi) 647 break; 648 n = ((int32_t)block[ptr[unHi]+d]) - med; 649 if (n == 0) { 650 mswap(ptr[unHi], ptr[gtHi]); 651 gtHi--; 652 unHi--; 653 continue; 654 }; 655 if (n < 0) break; 656 unHi--; 657 } 658 if (unLo > unHi) 659 break; 660 mswap(ptr[unLo], ptr[unHi]); 661 unLo++; 662 unHi--; 663 } 664 665 AssertD(unHi == unLo-1, "mainQSort3(2)"); 666 667 if (gtHi < ltLo) { 668 mpush(lo, hi, d + 1); 669 continue; 670 } 671 672 n = mmin(ltLo-lo, unLo-ltLo); mvswap(ptr, lo, unLo-n, n); 673 m = mmin(hi-gtHi, gtHi-unHi); mvswap(ptr, unLo, hi-m+1, m); 674 675 n = lo + unLo - ltLo - 1; 676 m = hi - (gtHi - unHi) + 1; 677 678 nextLo[0] = lo; nextHi[0] = n; nextD[0] = d; 679 nextLo[1] = m; nextHi[1] = hi; nextD[1] = d; 680 nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1; 681 682 if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1); 683 if (mnextsize(1) < mnextsize(2)) mnextswap(1, 2); 684 if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1); 685 686 AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)"); 687 AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)"); 688 689 mpush(nextLo[0], nextHi[0], nextD[0]); 690 mpush(nextLo[1], nextHi[1], nextD[1]); 691 mpush(nextLo[2], nextHi[2], nextD[2]); 692 } 693} 694 695#undef mpush 696#undef mpop 697#undef mnextsize 698#undef mnextswap 699#undef MAIN_QSORT_SMALL_THRESH 700#undef MAIN_QSORT_DEPTH_THRESH 701#undef MAIN_QSORT_STACK_SIZE 702 703 704/*---------------------------------------------*/ 705/* Pre: 706 * nblock > N_OVERSHOOT 707 * block32 exists for [0 .. nblock-1 +N_OVERSHOOT] 708 * ((uint8_t*)block32) [0 .. nblock-1] holds block 709 * ptr exists for [0 .. nblock-1] 710 * 711 * Post: 712 * ((uint8_t*)block32) [0 .. nblock-1] holds block 713 * All other areas of block32 destroyed 714 * ftab[0 .. 65536] destroyed 715 * ptr [0 .. nblock-1] holds sorted order 716 * if (*budget < 0), sorting was abandoned 717 */ 718 719#define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8]) 720#define SETMASK (1 << 21) 721#define CLEARMASK (~(SETMASK)) 722 723static NOINLINE 724void mainSort(EState* state, 725 uint32_t* ptr, 726 uint8_t* block, 727 uint16_t* quadrant, 728 uint32_t* ftab, 729 int32_t nblock, 730 int32_t* budget) 731{ 732 int32_t i, j, k, ss, sb; 733 uint8_t c1; 734 int32_t numQSorted; 735 uint16_t s; 736 Bool bigDone[256]; 737 /* bbox: moved to EState to save stack 738 int32_t runningOrder[256]; 739 int32_t copyStart[256]; 740 int32_t copyEnd [256]; 741 */ 742#define runningOrder (state->mainSort__runningOrder) 743#define copyStart (state->mainSort__copyStart) 744#define copyEnd (state->mainSort__copyEnd) 745 746 /*-- set up the 2-byte frequency table --*/ 747 /* was: for (i = 65536; i >= 0; i--) ftab[i] = 0; */ 748 memset(ftab, 0, 65537 * sizeof(ftab[0])); 749 750 j = block[0] << 8; 751 i = nblock - 1; 752/* 3%, +300 bytes */ 753#if CONFIG_BZIP2_FEATURE_SPEED >= 2 754 for (; i >= 3; i -= 4) { 755 quadrant[i] = 0; 756 j = (j >> 8) | (((uint16_t)block[i]) << 8); 757 ftab[j]++; 758 quadrant[i-1] = 0; 759 j = (j >> 8) | (((uint16_t)block[i-1]) << 8); 760 ftab[j]++; 761 quadrant[i-2] = 0; 762 j = (j >> 8) | (((uint16_t)block[i-2]) << 8); 763 ftab[j]++; 764 quadrant[i-3] = 0; 765 j = (j >> 8) | (((uint16_t)block[i-3]) << 8); 766 ftab[j]++; 767 } 768#endif 769 for (; i >= 0; i--) { 770 quadrant[i] = 0; 771 j = (j >> 8) | (((uint16_t)block[i]) << 8); 772 ftab[j]++; 773 } 774 775 /*-- (emphasises close relationship of block & quadrant) --*/ 776 for (i = 0; i < BZ_N_OVERSHOOT; i++) { 777 block [nblock+i] = block[i]; 778 quadrant[nblock+i] = 0; 779 } 780 781 /*-- Complete the initial radix sort --*/ 782 j = ftab[0]; /* bbox: optimized */ 783 for (i = 1; i <= 65536; i++) { 784 j += ftab[i]; 785 ftab[i] = j; 786 } 787 788 s = block[0] << 8; 789 i = nblock - 1; 790#if CONFIG_BZIP2_FEATURE_SPEED >= 2 791 for (; i >= 3; i -= 4) { 792 s = (s >> 8) | (block[i] << 8); 793 j = ftab[s] - 1; 794 ftab[s] = j; 795 ptr[j] = i; 796 s = (s >> 8) | (block[i-1] << 8); 797 j = ftab[s] - 1; 798 ftab[s] = j; 799 ptr[j] = i-1; 800 s = (s >> 8) | (block[i-2] << 8); 801 j = ftab[s] - 1; 802 ftab[s] = j; 803 ptr[j] = i-2; 804 s = (s >> 8) | (block[i-3] << 8); 805 j = ftab[s] - 1; 806 ftab[s] = j; 807 ptr[j] = i-3; 808 } 809#endif 810 for (; i >= 0; i--) { 811 s = (s >> 8) | (block[i] << 8); 812 j = ftab[s] - 1; 813 ftab[s] = j; 814 ptr[j] = i; 815 } 816 817 /* 818 * Now ftab contains the first loc of every small bucket. 819 * Calculate the running order, from smallest to largest 820 * big bucket. 821 */ 822 for (i = 0; i <= 255; i++) { 823 bigDone [i] = False; 824 runningOrder[i] = i; 825 } 826 827 { 828 int32_t vv; 829 /* bbox: was: int32_t h = 1; */ 830 /* do h = 3 * h + 1; while (h <= 256); */ 831 uint32_t h = 364; 832 833 do { 834 /*h = h / 3;*/ 835 h = (h * 171) >> 9; /* bbox: fast h/3 */ 836 for (i = h; i <= 255; i++) { 837 vv = runningOrder[i]; 838 j = i; 839 while (BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv)) { 840 runningOrder[j] = runningOrder[j-h]; 841 j = j - h; 842 if (j <= (h - 1)) 843 goto zero; 844 } 845 zero: 846 runningOrder[j] = vv; 847 } 848 } while (h != 1); 849 } 850 851 /* 852 * The main sorting loop. 853 */ 854 855 numQSorted = 0; 856 857 for (i = 0; i <= 255; i++) { 858 859 /* 860 * Process big buckets, starting with the least full. 861 * Basically this is a 3-step process in which we call 862 * mainQSort3 to sort the small buckets [ss, j], but 863 * also make a big effort to avoid the calls if we can. 864 */ 865 ss = runningOrder[i]; 866 867 /* 868 * Step 1: 869 * Complete the big bucket [ss] by quicksorting 870 * any unsorted small buckets [ss, j], for j != ss. 871 * Hopefully previous pointer-scanning phases have already 872 * completed many of the small buckets [ss, j], so 873 * we don't have to sort them at all. 874 */ 875 for (j = 0; j <= 255; j++) { 876 if (j != ss) { 877 sb = (ss << 8) + j; 878 if (!(ftab[sb] & SETMASK)) { 879 int32_t lo = ftab[sb] & CLEARMASK; 880 int32_t hi = (ftab[sb+1] & CLEARMASK) - 1; 881 if (hi > lo) { 882 mainQSort3( 883 ptr, block, quadrant, nblock, 884 lo, hi, BZ_N_RADIX, budget 885 ); 886 if (*budget < 0) return; 887 numQSorted += (hi - lo + 1); 888 } 889 } 890 ftab[sb] |= SETMASK; 891 } 892 } 893 894 AssertH(!bigDone[ss], 1006); 895 896 /* 897 * Step 2: 898 * Now scan this big bucket [ss] so as to synthesise the 899 * sorted order for small buckets [t, ss] for all t, 900 * including, magically, the bucket [ss,ss] too. 901 * This will avoid doing Real Work in subsequent Step 1's. 902 */ 903 { 904 for (j = 0; j <= 255; j++) { 905 copyStart[j] = ftab[(j << 8) + ss] & CLEARMASK; 906 copyEnd [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1; 907 } 908 for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) { 909 k = ptr[j] - 1; 910 if (k < 0) 911 k += nblock; 912 c1 = block[k]; 913 if (!bigDone[c1]) 914 ptr[copyStart[c1]++] = k; 915 } 916 for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) { 917 k = ptr[j]-1; 918 if (k < 0) 919 k += nblock; 920 c1 = block[k]; 921 if (!bigDone[c1]) 922 ptr[copyEnd[c1]--] = k; 923 } 924 } 925 926 /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1. 927 * Necessity for this case is demonstrated by compressing 928 * a sequence of approximately 48.5 million of character 929 * 251; 1.0.0/1.0.1 will then die here. */ 930 AssertH((copyStart[ss]-1 == copyEnd[ss]) \ 931 || (copyStart[ss] == 0 && copyEnd[ss] == nblock-1), 1007); 932 933 for (j = 0; j <= 255; j++) 934 ftab[(j << 8) + ss] |= SETMASK; 935 936 /* 937 * Step 3: 938 * The [ss] big bucket is now done. Record this fact, 939 * and update the quadrant descriptors. Remember to 940 * update quadrants in the overshoot area too, if 941 * necessary. The "if (i < 255)" test merely skips 942 * this updating for the last bucket processed, since 943 * updating for the last bucket is pointless. 944 * 945 * The quadrant array provides a way to incrementally 946 * cache sort orderings, as they appear, so as to 947 * make subsequent comparisons in fullGtU() complete 948 * faster. For repetitive blocks this makes a big 949 * difference (but not big enough to be able to avoid 950 * the fallback sorting mechanism, exponential radix sort). 951 * 952 * The precise meaning is: at all times: 953 * 954 * for 0 <= i < nblock and 0 <= j <= nblock 955 * 956 * if block[i] != block[j], 957 * 958 * then the relative values of quadrant[i] and 959 * quadrant[j] are meaningless. 960 * 961 * else { 962 * if quadrant[i] < quadrant[j] 963 * then the string starting at i lexicographically 964 * precedes the string starting at j 965 * 966 * else if quadrant[i] > quadrant[j] 967 * then the string starting at j lexicographically 968 * precedes the string starting at i 969 * 970 * else 971 * the relative ordering of the strings starting 972 * at i and j has not yet been determined. 973 * } 974 */ 975 bigDone[ss] = True; 976 977 if (i < 255) { 978 int32_t bbStart = ftab[ss << 8] & CLEARMASK; 979 int32_t bbSize = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart; 980 int32_t shifts = 0; 981 982 while ((bbSize >> shifts) > 65534) shifts++; 983 984 for (j = bbSize-1; j >= 0; j--) { 985 int32_t a2update = ptr[bbStart + j]; 986 uint16_t qVal = (uint16_t)(j >> shifts); 987 quadrant[a2update] = qVal; 988 if (a2update < BZ_N_OVERSHOOT) 989 quadrant[a2update + nblock] = qVal; 990 } 991 AssertH(((bbSize-1) >> shifts) <= 65535, 1002); 992 } 993 } 994#undef runningOrder 995#undef copyStart 996#undef copyEnd 997} 998 999#undef BIGFREQ 1000#undef SETMASK 1001#undef CLEARMASK 1002 1003 1004/*---------------------------------------------*/ 1005/* Pre: 1006 * nblock > 0 1007 * arr2 exists for [0 .. nblock-1 +N_OVERSHOOT] 1008 * ((uint8_t*)arr2)[0 .. nblock-1] holds block 1009 * arr1 exists for [0 .. nblock-1] 1010 * 1011 * Post: 1012 * ((uint8_t*)arr2) [0 .. nblock-1] holds block 1013 * All other areas of block destroyed 1014 * ftab[0 .. 65536] destroyed 1015 * arr1[0 .. nblock-1] holds sorted order 1016 */ 1017static NOINLINE 1018void BZ2_blockSort(EState* s) 1019{ 1020 /* In original bzip2 1.0.4, it's a parameter, but 30 1021 * (which was the default) should work ok. */ 1022 enum { wfact = 30 }; 1023 1024 uint32_t* ptr = s->ptr; 1025 uint8_t* block = s->block; 1026 uint32_t* ftab = s->ftab; 1027 int32_t nblock = s->nblock; 1028 uint16_t* quadrant; 1029 int32_t budget; 1030 int32_t i; 1031 1032 if (nblock < 10000) { 1033 fallbackSort(s->arr1, s->arr2, ftab, nblock); 1034 } else { 1035 /* Calculate the location for quadrant, remembering to get 1036 * the alignment right. Assumes that &(block[0]) is at least 1037 * 2-byte aligned -- this should be ok since block is really 1038 * the first section of arr2. 1039 */ 1040 i = nblock + BZ_N_OVERSHOOT; 1041 if (i & 1) i++; 1042 quadrant = (uint16_t*)(&(block[i])); 1043 1044 /* (wfact-1) / 3 puts the default-factor-30 1045 * transition point at very roughly the same place as 1046 * with v0.1 and v0.9.0. 1047 * Not that it particularly matters any more, since the 1048 * resulting compressed stream is now the same regardless 1049 * of whether or not we use the main sort or fallback sort. 1050 */ 1051 budget = nblock * ((wfact-1) / 3); 1052 1053 mainSort(s, ptr, block, quadrant, ftab, nblock, &budget); 1054 if (budget < 0) { 1055 fallbackSort(s->arr1, s->arr2, ftab, nblock); 1056 } 1057 } 1058 1059 s->origPtr = -1; 1060 for (i = 0; i < s->nblock; i++) 1061 if (ptr[i] == 0) { 1062 s->origPtr = i; 1063 break; 1064 }; 1065 1066 AssertH(s->origPtr != -1, 1003); 1067} 1068 1069 1070/*-------------------------------------------------------------*/ 1071/*--- end blocksort.c ---*/ 1072/*-------------------------------------------------------------*/ 1073