blocksort.c revision 215041
11541Srgrimes
21541Srgrimes/*-------------------------------------------------------------*/
31541Srgrimes/*--- Block sorting machinery                               ---*/
41541Srgrimes/*---                                           blocksort.c ---*/
51541Srgrimes/*-------------------------------------------------------------*/
61541Srgrimes
71541Srgrimes/* ------------------------------------------------------------------
81541Srgrimes   This file is part of bzip2/libbzip2, a program and library for
91541Srgrimes   lossless, block-sorting data compression.
101541Srgrimes
111541Srgrimes   bzip2/libbzip2 version 1.0.6 of 6 September 2010
121541Srgrimes   Copyright (C) 1996-2010 Julian Seward <jseward@bzip.org>
131541Srgrimes
141541Srgrimes   Please read the WARNING, DISCLAIMER and PATENTS sections in the
151541Srgrimes   README file.
161541Srgrimes
171541Srgrimes   This program is released under the terms of the license contained
181541Srgrimes   in the file LICENSE.
191541Srgrimes   ------------------------------------------------------------------ */
201541Srgrimes
211541Srgrimes
221541Srgrimes#include "bzlib_private.h"
231541Srgrimes
241541Srgrimes/*---------------------------------------------*/
251541Srgrimes/*--- Fallback O(N log(N)^2) sorting        ---*/
261541Srgrimes/*--- algorithm, for repetitive blocks      ---*/
271541Srgrimes/*---------------------------------------------*/
281541Srgrimes
2985052Sru/*---------------------------------------------*/
3050477Speterstatic
311541Srgrimes__inline__
321541Srgrimesvoid fallbackSimpleSort ( UInt32* fmap,
332168Spaul                          UInt32* eclass,
342168Spaul                          Int32   lo,
352168Spaul                          Int32   hi )
361541Srgrimes{
371541Srgrimes   Int32 i, j, tmp;
388876Srgrimes   UInt32 ec_tmp;
391541Srgrimes
401541Srgrimes   if (lo == hi) return;
411541Srgrimes
421541Srgrimes   if (hi - lo > 3) {
431541Srgrimes      for ( i = hi-4; i >= lo; i-- ) {
441541Srgrimes         tmp = fmap[i];
451541Srgrimes         ec_tmp = eclass[tmp];
461541Srgrimes         for ( j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4 )
471541Srgrimes            fmap[j-4] = fmap[j];
481541Srgrimes         fmap[j-4] = tmp;
491541Srgrimes      }
501541Srgrimes   }
511541Srgrimes
521541Srgrimes   for ( i = hi-1; i >= lo; i-- ) {
531541Srgrimes      tmp = fmap[i];
541541Srgrimes      ec_tmp = eclass[tmp];
551541Srgrimes      for ( j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++ )
561541Srgrimes         fmap[j-1] = fmap[j];
57122922Sandre      fmap[j-1] = tmp;
58122922Sandre   }
59122922Sandre}
60122922Sandre
61122922Sandre
62122922Sandre/*---------------------------------------------*/
631541Srgrimes#define fswap(zz1, zz2) \
641541Srgrimes   { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
651541Srgrimes
661541Srgrimes#define fvswap(zzp1, zzp2, zzn)       \
671541Srgrimes{                                     \
6813765Smpp   Int32 yyp1 = (zzp1);               \
6913765Smpp   Int32 yyp2 = (zzp2);               \
701541Srgrimes   Int32 yyn  = (zzn);                \
711541Srgrimes   while (yyn > 0) {                  \
721541Srgrimes      fswap(fmap[yyp1], fmap[yyp2]);  \
731541Srgrimes      yyp1++; yyp2++; yyn--;          \
745791Swollman   }                                  \
751541Srgrimes}
761541Srgrimes
771541Srgrimes
781541Srgrimes#define fmin(a,b) ((a) < (b)) ? (a) : (b)
791541Srgrimes
801541Srgrimes#define fpush(lz,hz) { stackLo[sp] = lz; \
811541Srgrimes                       stackHi[sp] = hz; \
821541Srgrimes                       sp++; }
831541Srgrimes
841541Srgrimes#define fpop(lz,hz) { sp--;              \
851541Srgrimes                      lz = stackLo[sp];  \
865833Sbde                      hz = stackHi[sp]; }
875833Sbde
885833Sbde#define FALLBACK_QSORT_SMALL_THRESH 10
895833Sbde#define FALLBACK_QSORT_STACK_SIZE   100
905833Sbde
911541Srgrimes
921541Srgrimesstatic
931541Srgrimesvoid fallbackQSort3 ( UInt32* fmap,
941541Srgrimes                      UInt32* eclass,
951541Srgrimes                      Int32   loSt,
961541Srgrimes                      Int32   hiSt )
971541Srgrimes{
981541Srgrimes   Int32 unLo, unHi, ltLo, gtHi, n, m;
991541Srgrimes   Int32 sp, lo, hi;
1001541Srgrimes   UInt32 med, r, r3;
1011541Srgrimes   Int32 stackLo[FALLBACK_QSORT_STACK_SIZE];
1021541Srgrimes   Int32 stackHi[FALLBACK_QSORT_STACK_SIZE];
103128454Sluigi
104128454Sluigi   r = 0;
105128454Sluigi
106128454Sluigi   sp = 0;
107128454Sluigi   fpush ( loSt, hiSt );
108132780Skan
109132780Skan   while (sp > 0) {
1101541Srgrimes
1117197Swollman      AssertH ( sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004 );
1121541Srgrimes
113122922Sandre      fpop ( lo, hi );
114127828Sluigi      if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
115127828Sluigi         fallbackSimpleSort ( fmap, eclass, lo, hi );
1161541Srgrimes         continue;
1171541Srgrimes      }
1181541Srgrimes
1195791Swollman      /* Random partitioning.  Median of 3 sometimes fails to
120120727Ssam         avoid bad cases.  Median of 9 seems to help but
121120727Ssam         looks rather expensive.  This too seems to work but
122120727Ssam         is cheaper.  Guidance for the magic constants
123120727Ssam         7621 and 32768 is taken from Sedgewick's algorithms
1241541Srgrimes         book, chapter 35.
1251541Srgrimes      */
1261541Srgrimes      r = ((r * 7621) + 1) % 32768;
1271541Srgrimes      r3 = r % 3;
1281541Srgrimes      if (r3 == 0) med = eclass[fmap[lo]]; else
1291541Srgrimes      if (r3 == 1) med = eclass[fmap[(lo+hi)>>1]]; else
1301541Srgrimes                   med = eclass[fmap[hi]];
1311541Srgrimes
1321541Srgrimes      unLo = ltLo = lo;
1331541Srgrimes      unHi = gtHi = hi;
1341541Srgrimes
1351541Srgrimes      while (1) {
1361541Srgrimes         while (1) {
1371541Srgrimes            if (unLo > unHi) break;
1381541Srgrimes            n = (Int32)eclass[fmap[unLo]] - (Int32)med;
1391541Srgrimes            if (n == 0) {
1404104Swollman               fswap(fmap[unLo], fmap[ltLo]);
1414104Swollman               ltLo++; unLo++;
1421541Srgrimes               continue;
1431541Srgrimes            };
1441541Srgrimes            if (n > 0) break;
1451541Srgrimes            unLo++;
1461541Srgrimes         }
1471541Srgrimes         while (1) {
1481541Srgrimes            if (unLo > unHi) break;
14986764Sjlemon            n = (Int32)eclass[fmap[unHi]] - (Int32)med;
1501541Srgrimes            if (n == 0) {
1511541Srgrimes               fswap(fmap[unHi], fmap[gtHi]);
15217835Sjulian               gtHi--; unHi--;
1531541Srgrimes               continue;
1541541Srgrimes            };
1551541Srgrimes            if (n < 0) break;
1561541Srgrimes            unHi--;
1571541Srgrimes         }
158122921Sandre         if (unLo > unHi) break;
159122921Sandre         fswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
160122921Sandre      }
161122921Sandre
162122921Sandre      AssertD ( unHi == unLo-1, "fallbackQSort3(2)" );
1635099Swollman
1645099Swollman      if (gtHi < ltLo) continue;
16518839Swollman
1666245Swollman      n = fmin(ltLo-lo, unLo-ltLo); fvswap(lo, unLo-n, n);
16715652Swollman      m = fmin(hi-gtHi, gtHi-unHi); fvswap(unLo, hi-m+1, m);
16815652Swollman
16915652Swollman      n = lo + unLo - ltLo - 1;
17015652Swollman      m = hi - (gtHi - unHi) + 1;
1711541Srgrimes
1721541Srgrimes      if (n - lo > hi - m) {
1731541Srgrimes         fpush ( lo, n );
1741541Srgrimes         fpush ( m, hi );
1751541Srgrimes      } else {
1761541Srgrimes         fpush ( m, hi );
1771541Srgrimes         fpush ( lo, n );
1781541Srgrimes      }
1791541Srgrimes   }
1801541Srgrimes}
1811541Srgrimes
1821541Srgrimes#undef fmin
1831541Srgrimes#undef fpush
1841541Srgrimes#undef fpop
1851541Srgrimes#undef fswap
1861541Srgrimes#undef fvswap
1871541Srgrimes#undef FALLBACK_QSORT_SMALL_THRESH
1881541Srgrimes#undef FALLBACK_QSORT_STACK_SIZE
1891541Srgrimes
1901541Srgrimes
1911541Srgrimes/*---------------------------------------------*/
1921541Srgrimes/* Pre:
1931541Srgrimes      nblock > 0
1941541Srgrimes      eclass exists for [0 .. nblock-1]
1951541Srgrimes      ((UChar*)eclass) [0 .. nblock-1] holds block
1961541Srgrimes      ptr exists for [0 .. nblock-1]
1971541Srgrimes
1981541Srgrimes   Post:
1991541Srgrimes      ((UChar*)eclass) [0 .. nblock-1] holds block
2005791Swollman      All other areas of eclass destroyed
2011541Srgrimes      fmap [0 .. nblock-1] holds sorted order
20251252Sru      bhtab [ 0 .. 2+(nblock/32) ] destroyed
20351252Sru*/
20451252Sru
2051541Srgrimes#define       SET_BH(zz)  bhtab[(zz) >> 5] |= (1 << ((zz) & 31))
2061541Srgrimes#define     CLEAR_BH(zz)  bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31))
2071541Srgrimes#define     ISSET_BH(zz)  (bhtab[(zz) >> 5] & (1 << ((zz) & 31)))
2081541Srgrimes#define      WORD_BH(zz)  bhtab[(zz) >> 5]
2091541Srgrimes#define UNALIGNED_BH(zz)  ((zz) & 0x01f)
2101541Srgrimes
2111541Srgrimesstatic
2121541Srgrimesvoid fallbackSort ( UInt32* fmap,
2131541Srgrimes                    UInt32* eclass,
2141541Srgrimes                    UInt32* bhtab,
2151541Srgrimes                    Int32   nblock,
2161541Srgrimes                    Int32   verb )
2171541Srgrimes{
2181541Srgrimes   Int32 ftab[257];
21921666Swollman   Int32 ftabCopy[256];
22021666Swollman   Int32 H, i, j, k, l, r, cc, cc1;
22189498Sru   Int32 nNotDone;
2221541Srgrimes   Int32 nBhtab;
22351252Sru   UChar* eclass8 = (UChar*)eclass;
22451252Sru
22551252Sru   /*--
2261541Srgrimes      Initial 1-char radix sort to generate
2271541Srgrimes      initial fmap and initial BH bits.
22851252Sru   --*/
2291541Srgrimes   if (verb >= 4)
2301541Srgrimes      VPrintf0 ( "        bucket sorting ...\n" );
2311541Srgrimes   for (i = 0; i < 257;    i++) ftab[i] = 0;
2321541Srgrimes   for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
2331541Srgrimes   for (i = 0; i < 256;    i++) ftabCopy[i] = ftab[i];
2341541Srgrimes   for (i = 1; i < 257;    i++) ftab[i] += ftab[i-1];
2351541Srgrimes
23651252Sru   for (i = 0; i < nblock; i++) {
2371541Srgrimes      j = eclass8[i];
2381541Srgrimes      k = ftab[j] - 1;
2391541Srgrimes      ftab[j] = k;
2401541Srgrimes      fmap[k] = i;
2411541Srgrimes   }
2421541Srgrimes
2431541Srgrimes   nBhtab = 2 + (nblock / 32);
2441541Srgrimes   for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
2451541Srgrimes   for (i = 0; i < 256; i++) SET_BH(ftab[i]);
2461541Srgrimes
2471541Srgrimes   /*--
2481541Srgrimes      Inductively refine the buckets.  Kind-of an
2491541Srgrimes      "exponential radix sort" (!), inspired by the
2501541Srgrimes      Manber-Myers suffix array construction algorithm.
2511541Srgrimes   --*/
2521541Srgrimes
2531541Srgrimes   /*-- set sentinel bits for block-end detection --*/
2541541Srgrimes   for (i = 0; i < 32; i++) {
2551541Srgrimes      SET_BH(nblock + 2*i);
2561541Srgrimes      CLEAR_BH(nblock + 2*i + 1);
2571541Srgrimes   }
2581541Srgrimes
2591541Srgrimes   /*-- the log(N) loop --*/
2601541Srgrimes   H = 1;
2611541Srgrimes   while (1) {
2621541Srgrimes
26385074Sru      if (verb >= 4)
26485074Sru         VPrintf1 ( "        depth %6d has ", H );
26585074Sru
2661541Srgrimes      j = 0;
2671541Srgrimes      for (i = 0; i < nblock; i++) {
268128185Sluigi         if (ISSET_BH(i)) j = i;
269128185Sluigi         k = fmap[i] - H; if (k < 0) k += nblock;
270128185Sluigi         eclass[k] = j;
271128185Sluigi      }
272128185Sluigi
273128185Sluigi      nNotDone = 0;
274128185Sluigi      r = -1;
275128185Sluigi      while (1) {
276128185Sluigi
277128185Sluigi	 /*-- find the next non-singleton bucket --*/
278128185Sluigi         k = r + 1;
279128185Sluigi         while (ISSET_BH(k) && UNALIGNED_BH(k)) k++;
28055205Speter         if (ISSET_BH(k)) {
281117752Shsu            while (WORD_BH(k) == 0xffffffff) k += 32;
282120727Ssam            while (ISSET_BH(k)) k++;
283120727Ssam         }
284120727Ssam         l = k - 1;
285120727Ssam         if (l >= nblock) break;
286120727Ssam         while (!ISSET_BH(k) && UNALIGNED_BH(k)) k++;
287120727Ssam         if (!ISSET_BH(k)) {
288117752Shsu            while (WORD_BH(k) == 0x00000000) k += 32;
289122334Ssam            while (!ISSET_BH(k)) k++;
290122334Ssam         }
291122334Ssam         r = k - 1;
292122334Ssam         if (r >= nblock) break;
293122334Ssam
294122334Ssam         /*-- now [l, r] bracket current bucket --*/
295122334Ssam         if (r > l) {
296122334Ssam            nNotDone += (r - l + 1);
297122334Ssam            fallbackQSort3 ( fmap, eclass, l, r );
298122334Ssam
299122334Ssam            /*-- scan bucket and generate header bits-- */
300122334Ssam            cc = -1;
301122334Ssam            for (i = l; i <= r; i++) {
302122334Ssam               cc1 = eclass[fmap[i]];
303122334Ssam               if (cc != cc1) { SET_BH(i); cc = cc1; };
304122334Ssam            }
305122334Ssam         }
306122334Ssam      }
307122334Ssam
308122334Ssam      if (verb >= 4)
309122334Ssam         VPrintf1 ( "%6d unresolved strings\n", nNotDone );
310122334Ssam
31146568Speter      H *= 2;
312122334Ssam      if (H > nblock || nNotDone == 0) break;
313122334Ssam   }
314122334Ssam
315120727Ssam   /*--
3161541Srgrimes      Reconstruct the original block in
3179759Sbde      eclass8 [0 .. nblock-1], since the
3181541Srgrimes      previous phase destroyed it.
31921666Swollman   --*/
32021666Swollman   if (verb >= 4)
32192725Salfred      VPrintf0 ( "        reconstructing block ...\n" );
32292725Salfred   j = 0;
32392725Salfred   for (i = 0; i < nblock; i++) {
32492725Salfred      while (ftabCopy[j] == 0) j++;
32592725Salfred      ftabCopy[j]--;
32692725Salfred      eclass8[fmap[i]] = (UChar)j;
32792725Salfred   }
328128621Sluigi   AssertH ( j < 256, 1005 );
329128621Sluigi}
330128621Sluigi
331128621Sluigi#undef       SET_BH
332128621Sluigi#undef     CLEAR_BH
333128621Sluigi#undef     ISSET_BH
334128621Sluigi#undef      WORD_BH
335128621Sluigi#undef UNALIGNED_BH
336128621Sluigi
337128621Sluigi
338128621Sluigi/*---------------------------------------------*/
339128621Sluigi/*--- The main, O(N^2 log(N)) sorting       ---*/
340128621Sluigi/*--- algorithm.  Faster for "normal"       ---*/
341128621Sluigi/*--- non-repetitive blocks.                ---*/
342128621Sluigi/*---------------------------------------------*/
343120727Ssam
344121770Ssam/*---------------------------------------------*/
34592725Salfredstatic
34692725Salfred__inline__
34792725SalfredBool mainGtU ( UInt32  i1,
34892725Salfred               UInt32  i2,
349120727Ssam               UChar*  block,
35092725Salfred               UInt16* quadrant,
35192725Salfred               UInt32  nblock,
35292725Salfred               Int32*  budget )
353111767Smdodd{
3541541Srgrimes   Int32  k;
3552168Spaul   UChar  c1, c2;
3562168Spaul   UInt16 s1, s2;
357
358   AssertD ( i1 != i2, "mainGtU" );
359   /* 1 */
360   c1 = block[i1]; c2 = block[i2];
361   if (c1 != c2) return (c1 > c2);
362   i1++; i2++;
363   /* 2 */
364   c1 = block[i1]; c2 = block[i2];
365   if (c1 != c2) return (c1 > c2);
366   i1++; i2++;
367   /* 3 */
368   c1 = block[i1]; c2 = block[i2];
369   if (c1 != c2) return (c1 > c2);
370   i1++; i2++;
371   /* 4 */
372   c1 = block[i1]; c2 = block[i2];
373   if (c1 != c2) return (c1 > c2);
374   i1++; i2++;
375   /* 5 */
376   c1 = block[i1]; c2 = block[i2];
377   if (c1 != c2) return (c1 > c2);
378   i1++; i2++;
379   /* 6 */
380   c1 = block[i1]; c2 = block[i2];
381   if (c1 != c2) return (c1 > c2);
382   i1++; i2++;
383   /* 7 */
384   c1 = block[i1]; c2 = block[i2];
385   if (c1 != c2) return (c1 > c2);
386   i1++; i2++;
387   /* 8 */
388   c1 = block[i1]; c2 = block[i2];
389   if (c1 != c2) return (c1 > c2);
390   i1++; i2++;
391   /* 9 */
392   c1 = block[i1]; c2 = block[i2];
393   if (c1 != c2) return (c1 > c2);
394   i1++; i2++;
395   /* 10 */
396   c1 = block[i1]; c2 = block[i2];
397   if (c1 != c2) return (c1 > c2);
398   i1++; i2++;
399   /* 11 */
400   c1 = block[i1]; c2 = block[i2];
401   if (c1 != c2) return (c1 > c2);
402   i1++; i2++;
403   /* 12 */
404   c1 = block[i1]; c2 = block[i2];
405   if (c1 != c2) return (c1 > c2);
406   i1++; i2++;
407
408   k = nblock + 8;
409
410   do {
411      /* 1 */
412      c1 = block[i1]; c2 = block[i2];
413      if (c1 != c2) return (c1 > c2);
414      s1 = quadrant[i1]; s2 = quadrant[i2];
415      if (s1 != s2) return (s1 > s2);
416      i1++; i2++;
417      /* 2 */
418      c1 = block[i1]; c2 = block[i2];
419      if (c1 != c2) return (c1 > c2);
420      s1 = quadrant[i1]; s2 = quadrant[i2];
421      if (s1 != s2) return (s1 > s2);
422      i1++; i2++;
423      /* 3 */
424      c1 = block[i1]; c2 = block[i2];
425      if (c1 != c2) return (c1 > c2);
426      s1 = quadrant[i1]; s2 = quadrant[i2];
427      if (s1 != s2) return (s1 > s2);
428      i1++; i2++;
429      /* 4 */
430      c1 = block[i1]; c2 = block[i2];
431      if (c1 != c2) return (c1 > c2);
432      s1 = quadrant[i1]; s2 = quadrant[i2];
433      if (s1 != s2) return (s1 > s2);
434      i1++; i2++;
435      /* 5 */
436      c1 = block[i1]; c2 = block[i2];
437      if (c1 != c2) return (c1 > c2);
438      s1 = quadrant[i1]; s2 = quadrant[i2];
439      if (s1 != s2) return (s1 > s2);
440      i1++; i2++;
441      /* 6 */
442      c1 = block[i1]; c2 = block[i2];
443      if (c1 != c2) return (c1 > c2);
444      s1 = quadrant[i1]; s2 = quadrant[i2];
445      if (s1 != s2) return (s1 > s2);
446      i1++; i2++;
447      /* 7 */
448      c1 = block[i1]; c2 = block[i2];
449      if (c1 != c2) return (c1 > c2);
450      s1 = quadrant[i1]; s2 = quadrant[i2];
451      if (s1 != s2) return (s1 > s2);
452      i1++; i2++;
453      /* 8 */
454      c1 = block[i1]; c2 = block[i2];
455      if (c1 != c2) return (c1 > c2);
456      s1 = quadrant[i1]; s2 = quadrant[i2];
457      if (s1 != s2) return (s1 > s2);
458      i1++; i2++;
459
460      if (i1 >= nblock) i1 -= nblock;
461      if (i2 >= nblock) i2 -= nblock;
462
463      k -= 8;
464      (*budget)--;
465   }
466      while (k >= 0);
467
468   return False;
469}
470
471
472/*---------------------------------------------*/
473/*--
474   Knuth's increments seem to work better
475   than Incerpi-Sedgewick here.  Possibly
476   because the number of elems to sort is
477   usually small, typically <= 20.
478--*/
479static
480Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280,
481                   9841, 29524, 88573, 265720,
482                   797161, 2391484 };
483
484static
485void mainSimpleSort ( UInt32* ptr,
486                      UChar*  block,
487                      UInt16* quadrant,
488                      Int32   nblock,
489                      Int32   lo,
490                      Int32   hi,
491                      Int32   d,
492                      Int32*  budget )
493{
494   Int32 i, j, h, bigN, hp;
495   UInt32 v;
496
497   bigN = hi - lo + 1;
498   if (bigN < 2) return;
499
500   hp = 0;
501   while (incs[hp] < bigN) hp++;
502   hp--;
503
504   for (; hp >= 0; hp--) {
505      h = incs[hp];
506
507      i = lo + h;
508      while (True) {
509
510         /*-- copy 1 --*/
511         if (i > hi) break;
512         v = ptr[i];
513         j = i;
514         while ( mainGtU (
515                    ptr[j-h]+d, v+d, block, quadrant, nblock, budget
516                 ) ) {
517            ptr[j] = ptr[j-h];
518            j = j - h;
519            if (j <= (lo + h - 1)) break;
520         }
521         ptr[j] = v;
522         i++;
523
524         /*-- copy 2 --*/
525         if (i > hi) break;
526         v = ptr[i];
527         j = i;
528         while ( mainGtU (
529                    ptr[j-h]+d, v+d, block, quadrant, nblock, budget
530                 ) ) {
531            ptr[j] = ptr[j-h];
532            j = j - h;
533            if (j <= (lo + h - 1)) break;
534         }
535         ptr[j] = v;
536         i++;
537
538         /*-- copy 3 --*/
539         if (i > hi) break;
540         v = ptr[i];
541         j = i;
542         while ( mainGtU (
543                    ptr[j-h]+d, v+d, block, quadrant, nblock, budget
544                 ) ) {
545            ptr[j] = ptr[j-h];
546            j = j - h;
547            if (j <= (lo + h - 1)) break;
548         }
549         ptr[j] = v;
550         i++;
551
552         if (*budget < 0) return;
553      }
554   }
555}
556
557
558/*---------------------------------------------*/
559/*--
560   The following is an implementation of
561   an elegant 3-way quicksort for strings,
562   described in a paper "Fast Algorithms for
563   Sorting and Searching Strings", by Robert
564   Sedgewick and Jon L. Bentley.
565--*/
566
567#define mswap(zz1, zz2) \
568   { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
569
570#define mvswap(zzp1, zzp2, zzn)       \
571{                                     \
572   Int32 yyp1 = (zzp1);               \
573   Int32 yyp2 = (zzp2);               \
574   Int32 yyn  = (zzn);                \
575   while (yyn > 0) {                  \
576      mswap(ptr[yyp1], ptr[yyp2]);    \
577      yyp1++; yyp2++; yyn--;          \
578   }                                  \
579}
580
581static
582__inline__
583UChar mmed3 ( UChar a, UChar b, UChar c )
584{
585   UChar t;
586   if (a > b) { t = a; a = b; b = t; };
587   if (b > c) {
588      b = c;
589      if (a > b) b = a;
590   }
591   return b;
592}
593
594#define mmin(a,b) ((a) < (b)) ? (a) : (b)
595
596#define mpush(lz,hz,dz) { stackLo[sp] = lz; \
597                          stackHi[sp] = hz; \
598                          stackD [sp] = dz; \
599                          sp++; }
600
601#define mpop(lz,hz,dz) { sp--;             \
602                         lz = stackLo[sp]; \
603                         hz = stackHi[sp]; \
604                         dz = stackD [sp]; }
605
606
607#define mnextsize(az) (nextHi[az]-nextLo[az])
608
609#define mnextswap(az,bz)                                        \
610   { Int32 tz;                                                  \
611     tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
612     tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
613     tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; }
614
615
616#define MAIN_QSORT_SMALL_THRESH 20
617#define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
618#define MAIN_QSORT_STACK_SIZE 100
619
620static
621void mainQSort3 ( UInt32* ptr,
622                  UChar*  block,
623                  UInt16* quadrant,
624                  Int32   nblock,
625                  Int32   loSt,
626                  Int32   hiSt,
627                  Int32   dSt,
628                  Int32*  budget )
629{
630   Int32 unLo, unHi, ltLo, gtHi, n, m, med;
631   Int32 sp, lo, hi, d;
632
633   Int32 stackLo[MAIN_QSORT_STACK_SIZE];
634   Int32 stackHi[MAIN_QSORT_STACK_SIZE];
635   Int32 stackD [MAIN_QSORT_STACK_SIZE];
636
637   Int32 nextLo[3];
638   Int32 nextHi[3];
639   Int32 nextD [3];
640
641   sp = 0;
642   mpush ( loSt, hiSt, dSt );
643
644   while (sp > 0) {
645
646      AssertH ( sp < MAIN_QSORT_STACK_SIZE - 2, 1001 );
647
648      mpop ( lo, hi, d );
649      if (hi - lo < MAIN_QSORT_SMALL_THRESH ||
650          d > MAIN_QSORT_DEPTH_THRESH) {
651         mainSimpleSort ( ptr, block, quadrant, nblock, lo, hi, d, budget );
652         if (*budget < 0) return;
653         continue;
654      }
655
656      med = (Int32)
657            mmed3 ( block[ptr[ lo         ]+d],
658                    block[ptr[ hi         ]+d],
659                    block[ptr[ (lo+hi)>>1 ]+d] );
660
661      unLo = ltLo = lo;
662      unHi = gtHi = hi;
663
664      while (True) {
665         while (True) {
666            if (unLo > unHi) break;
667            n = ((Int32)block[ptr[unLo]+d]) - med;
668            if (n == 0) {
669               mswap(ptr[unLo], ptr[ltLo]);
670               ltLo++; unLo++; continue;
671            };
672            if (n >  0) break;
673            unLo++;
674         }
675         while (True) {
676            if (unLo > unHi) break;
677            n = ((Int32)block[ptr[unHi]+d]) - med;
678            if (n == 0) {
679               mswap(ptr[unHi], ptr[gtHi]);
680               gtHi--; unHi--; continue;
681            };
682            if (n <  0) break;
683            unHi--;
684         }
685         if (unLo > unHi) break;
686         mswap(ptr[unLo], ptr[unHi]); unLo++; unHi--;
687      }
688
689      AssertD ( unHi == unLo-1, "mainQSort3(2)" );
690
691      if (gtHi < ltLo) {
692         mpush(lo, hi, d+1 );
693         continue;
694      }
695
696      n = mmin(ltLo-lo, unLo-ltLo); mvswap(lo, unLo-n, n);
697      m = mmin(hi-gtHi, gtHi-unHi); mvswap(unLo, hi-m+1, m);
698
699      n = lo + unLo - ltLo - 1;
700      m = hi - (gtHi - unHi) + 1;
701
702      nextLo[0] = lo;  nextHi[0] = n;   nextD[0] = d;
703      nextLo[1] = m;   nextHi[1] = hi;  nextD[1] = d;
704      nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
705
706      if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
707      if (mnextsize(1) < mnextsize(2)) mnextswap(1,2);
708      if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
709
710      AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)" );
711      AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)" );
712
713      mpush (nextLo[0], nextHi[0], nextD[0]);
714      mpush (nextLo[1], nextHi[1], nextD[1]);
715      mpush (nextLo[2], nextHi[2], nextD[2]);
716   }
717}
718
719#undef mswap
720#undef mvswap
721#undef mpush
722#undef mpop
723#undef mmin
724#undef mnextsize
725#undef mnextswap
726#undef MAIN_QSORT_SMALL_THRESH
727#undef MAIN_QSORT_DEPTH_THRESH
728#undef MAIN_QSORT_STACK_SIZE
729
730
731/*---------------------------------------------*/
732/* Pre:
733      nblock > N_OVERSHOOT
734      block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
735      ((UChar*)block32) [0 .. nblock-1] holds block
736      ptr exists for [0 .. nblock-1]
737
738   Post:
739      ((UChar*)block32) [0 .. nblock-1] holds block
740      All other areas of block32 destroyed
741      ftab [0 .. 65536 ] destroyed
742      ptr [0 .. nblock-1] holds sorted order
743      if (*budget < 0), sorting was abandoned
744*/
745
746#define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
747#define SETMASK (1 << 21)
748#define CLEARMASK (~(SETMASK))
749
750static
751void mainSort ( UInt32* ptr,
752                UChar*  block,
753                UInt16* quadrant,
754                UInt32* ftab,
755                Int32   nblock,
756                Int32   verb,
757                Int32*  budget )
758{
759   Int32  i, j, k, ss, sb;
760   Int32  runningOrder[256];
761   Bool   bigDone[256];
762   Int32  copyStart[256];
763   Int32  copyEnd  [256];
764   UChar  c1;
765   Int32  numQSorted;
766   UInt16 s;
767   if (verb >= 4) VPrintf0 ( "        main sort initialise ...\n" );
768
769   /*-- set up the 2-byte frequency table --*/
770   for (i = 65536; i >= 0; i--) ftab[i] = 0;
771
772   j = block[0] << 8;
773   i = nblock-1;
774   for (; i >= 3; i -= 4) {
775      quadrant[i] = 0;
776      j = (j >> 8) | ( ((UInt16)block[i]) << 8);
777      ftab[j]++;
778      quadrant[i-1] = 0;
779      j = (j >> 8) | ( ((UInt16)block[i-1]) << 8);
780      ftab[j]++;
781      quadrant[i-2] = 0;
782      j = (j >> 8) | ( ((UInt16)block[i-2]) << 8);
783      ftab[j]++;
784      quadrant[i-3] = 0;
785      j = (j >> 8) | ( ((UInt16)block[i-3]) << 8);
786      ftab[j]++;
787   }
788   for (; i >= 0; i--) {
789      quadrant[i] = 0;
790      j = (j >> 8) | ( ((UInt16)block[i]) << 8);
791      ftab[j]++;
792   }
793
794   /*-- (emphasises close relationship of block & quadrant) --*/
795   for (i = 0; i < BZ_N_OVERSHOOT; i++) {
796      block   [nblock+i] = block[i];
797      quadrant[nblock+i] = 0;
798   }
799
800   if (verb >= 4) VPrintf0 ( "        bucket sorting ...\n" );
801
802   /*-- Complete the initial radix sort --*/
803   for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1];
804
805   s = block[0] << 8;
806   i = nblock-1;
807   for (; i >= 3; i -= 4) {
808      s = (s >> 8) | (block[i] << 8);
809      j = ftab[s] -1;
810      ftab[s] = j;
811      ptr[j] = i;
812      s = (s >> 8) | (block[i-1] << 8);
813      j = ftab[s] -1;
814      ftab[s] = j;
815      ptr[j] = i-1;
816      s = (s >> 8) | (block[i-2] << 8);
817      j = ftab[s] -1;
818      ftab[s] = j;
819      ptr[j] = i-2;
820      s = (s >> 8) | (block[i-3] << 8);
821      j = ftab[s] -1;
822      ftab[s] = j;
823      ptr[j] = i-3;
824   }
825   for (; i >= 0; i--) {
826      s = (s >> 8) | (block[i] << 8);
827      j = ftab[s] -1;
828      ftab[s] = j;
829      ptr[j] = i;
830   }
831
832   /*--
833      Now ftab contains the first loc of every small bucket.
834      Calculate the running order, from smallest to largest
835      big bucket.
836   --*/
837   for (i = 0; i <= 255; i++) {
838      bigDone     [i] = False;
839      runningOrder[i] = i;
840   }
841
842   {
843      Int32 vv;
844      Int32 h = 1;
845      do h = 3 * h + 1; while (h <= 256);
846      do {
847         h = h / 3;
848         for (i = h; i <= 255; i++) {
849            vv = runningOrder[i];
850            j = i;
851            while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) {
852               runningOrder[j] = runningOrder[j-h];
853               j = j - h;
854               if (j <= (h - 1)) goto zero;
855            }
856            zero:
857            runningOrder[j] = vv;
858         }
859      } while (h != 1);
860   }
861
862   /*--
863      The main sorting loop.
864   --*/
865
866   numQSorted = 0;
867
868   for (i = 0; i <= 255; i++) {
869
870      /*--
871         Process big buckets, starting with the least full.
872         Basically this is a 3-step process in which we call
873         mainQSort3 to sort the small buckets [ss, j], but
874         also make a big effort to avoid the calls if we can.
875      --*/
876      ss = runningOrder[i];
877
878      /*--
879         Step 1:
880         Complete the big bucket [ss] by quicksorting
881         any unsorted small buckets [ss, j], for j != ss.
882         Hopefully previous pointer-scanning phases have already
883         completed many of the small buckets [ss, j], so
884         we don't have to sort them at all.
885      --*/
886      for (j = 0; j <= 255; j++) {
887         if (j != ss) {
888            sb = (ss << 8) + j;
889            if ( ! (ftab[sb] & SETMASK) ) {
890               Int32 lo = ftab[sb]   & CLEARMASK;
891               Int32 hi = (ftab[sb+1] & CLEARMASK) - 1;
892               if (hi > lo) {
893                  if (verb >= 4)
894                     VPrintf4 ( "        qsort [0x%x, 0x%x]   "
895                                "done %d   this %d\n",
896                                ss, j, numQSorted, hi - lo + 1 );
897                  mainQSort3 (
898                     ptr, block, quadrant, nblock,
899                     lo, hi, BZ_N_RADIX, budget
900                  );
901                  numQSorted += (hi - lo + 1);
902                  if (*budget < 0) return;
903               }
904            }
905            ftab[sb] |= SETMASK;
906         }
907      }
908
909      AssertH ( !bigDone[ss], 1006 );
910
911      /*--
912         Step 2:
913         Now scan this big bucket [ss] so as to synthesise the
914         sorted order for small buckets [t, ss] for all t,
915         including, magically, the bucket [ss,ss] too.
916         This will avoid doing Real Work in subsequent Step 1's.
917      --*/
918      {
919         for (j = 0; j <= 255; j++) {
920            copyStart[j] =  ftab[(j << 8) + ss]     & CLEARMASK;
921            copyEnd  [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
922         }
923         for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
924            k = ptr[j]-1; if (k < 0) k += nblock;
925            c1 = block[k];
926            if (!bigDone[c1])
927               ptr[ copyStart[c1]++ ] = k;
928         }
929         for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
930            k = ptr[j]-1; if (k < 0) k += nblock;
931            c1 = block[k];
932            if (!bigDone[c1])
933               ptr[ copyEnd[c1]-- ] = k;
934         }
935      }
936
937      AssertH ( (copyStart[ss]-1 == copyEnd[ss])
938                ||
939                /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
940                   Necessity for this case is demonstrated by compressing
941                   a sequence of approximately 48.5 million of character
942                   251; 1.0.0/1.0.1 will then die here. */
943                (copyStart[ss] == 0 && copyEnd[ss] == nblock-1),
944                1007 )
945
946      for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK;
947
948      /*--
949         Step 3:
950         The [ss] big bucket is now done.  Record this fact,
951         and update the quadrant descriptors.  Remember to
952         update quadrants in the overshoot area too, if
953         necessary.  The "if (i < 255)" test merely skips
954         this updating for the last bucket processed, since
955         updating for the last bucket is pointless.
956
957         The quadrant array provides a way to incrementally
958         cache sort orderings, as they appear, so as to
959         make subsequent comparisons in fullGtU() complete
960         faster.  For repetitive blocks this makes a big
961         difference (but not big enough to be able to avoid
962         the fallback sorting mechanism, exponential radix sort).
963
964         The precise meaning is: at all times:
965
966            for 0 <= i < nblock and 0 <= j <= nblock
967
968            if block[i] != block[j],
969
970               then the relative values of quadrant[i] and
971                    quadrant[j] are meaningless.
972
973               else {
974                  if quadrant[i] < quadrant[j]
975                     then the string starting at i lexicographically
976                     precedes the string starting at j
977
978                  else if quadrant[i] > quadrant[j]
979                     then the string starting at j lexicographically
980                     precedes the string starting at i
981
982                  else
983                     the relative ordering of the strings starting
984                     at i and j has not yet been determined.
985               }
986      --*/
987      bigDone[ss] = True;
988
989      if (i < 255) {
990         Int32 bbStart  = ftab[ss << 8] & CLEARMASK;
991         Int32 bbSize   = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
992         Int32 shifts   = 0;
993
994         while ((bbSize >> shifts) > 65534) shifts++;
995
996         for (j = bbSize-1; j >= 0; j--) {
997            Int32 a2update     = ptr[bbStart + j];
998            UInt16 qVal        = (UInt16)(j >> shifts);
999            quadrant[a2update] = qVal;
1000            if (a2update < BZ_N_OVERSHOOT)
1001               quadrant[a2update + nblock] = qVal;
1002         }
1003         AssertH ( ((bbSize-1) >> shifts) <= 65535, 1002 );
1004      }
1005
1006   }
1007
1008   if (verb >= 4)
1009      VPrintf3 ( "        %d pointers, %d sorted, %d scanned\n",
1010                 nblock, numQSorted, nblock - numQSorted );
1011}
1012
1013#undef BIGFREQ
1014#undef SETMASK
1015#undef CLEARMASK
1016
1017
1018/*---------------------------------------------*/
1019/* Pre:
1020      nblock > 0
1021      arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
1022      ((UChar*)arr2)  [0 .. nblock-1] holds block
1023      arr1 exists for [0 .. nblock-1]
1024
1025   Post:
1026      ((UChar*)arr2) [0 .. nblock-1] holds block
1027      All other areas of block destroyed
1028      ftab [ 0 .. 65536 ] destroyed
1029      arr1 [0 .. nblock-1] holds sorted order
1030*/
1031void BZ2_blockSort ( EState* s )
1032{
1033   UInt32* ptr    = s->ptr;
1034   UChar*  block  = s->block;
1035   UInt32* ftab   = s->ftab;
1036   Int32   nblock = s->nblock;
1037   Int32   verb   = s->verbosity;
1038   Int32   wfact  = s->workFactor;
1039   UInt16* quadrant;
1040   Int32   budget;
1041   Int32   budgetInit;
1042   Int32   i;
1043
1044   if (nblock < 10000) {
1045      fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1046   } else {
1047      /* Calculate the location for quadrant, remembering to get
1048         the alignment right.  Assumes that &(block[0]) is at least
1049         2-byte aligned -- this should be ok since block is really
1050         the first section of arr2.
1051      */
1052      i = nblock+BZ_N_OVERSHOOT;
1053      if (i & 1) i++;
1054      quadrant = (UInt16*)(&(block[i]));
1055
1056      /* (wfact-1) / 3 puts the default-factor-30
1057         transition point at very roughly the same place as
1058         with v0.1 and v0.9.0.
1059         Not that it particularly matters any more, since the
1060         resulting compressed stream is now the same regardless
1061         of whether or not we use the main sort or fallback sort.
1062      */
1063      if (wfact < 1  ) wfact = 1;
1064      if (wfact > 100) wfact = 100;
1065      budgetInit = nblock * ((wfact-1) / 3);
1066      budget = budgetInit;
1067
1068      mainSort ( ptr, block, quadrant, ftab, nblock, verb, &budget );
1069      if (verb >= 3)
1070         VPrintf3 ( "      %d work, %d block, ratio %5.2f\n",
1071                    budgetInit - budget,
1072                    nblock,
1073                    (float)(budgetInit - budget) /
1074                    (float)(nblock==0 ? 1 : nblock) );
1075      if (budget < 0) {
1076         if (verb >= 2)
1077            VPrintf0 ( "    too repetitive; using fallback"
1078                       " sorting algorithm\n" );
1079         fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1080      }
1081   }
1082
1083   s->origPtr = -1;
1084   for (i = 0; i < s->nblock; i++)
1085      if (ptr[i] == 0)
1086         { s->origPtr = i; break; };
1087
1088   AssertH( s->origPtr != -1, 1003 );
1089}
1090
1091
1092/*-------------------------------------------------------------*/
1093/*--- end                                       blocksort.c ---*/
1094/*-------------------------------------------------------------*/
1095