blocksort.c revision 90067
1
2/*-------------------------------------------------------------*/
3/*--- Block sorting machinery                               ---*/
4/*---                                           blocksort.c ---*/
5/*-------------------------------------------------------------*/
6
7/*--
8  This file is a part of bzip2 and/or libbzip2, a program and
9  library for lossless, block-sorting data compression.
10
11  Copyright (C) 1996-2002 Julian R Seward.  All rights reserved.
12
13  Redistribution and use in source and binary forms, with or without
14  modification, are permitted provided that the following conditions
15  are met:
16
17  1. Redistributions of source code must retain the above copyright
18     notice, this list of conditions and the following disclaimer.
19
20  2. The origin of this software must not be misrepresented; you must
21     not claim that you wrote the original software.  If you use this
22     software in a product, an acknowledgment in the product
23     documentation would be appreciated but is not required.
24
25  3. Altered source versions must be plainly marked as such, and must
26     not be misrepresented as being the original software.
27
28  4. The name of the author may not be used to endorse or promote
29     products derived from this software without specific prior written
30     permission.
31
32  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
33  OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
34  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
35  ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
36  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
37  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
38  GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
39  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
40  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
42  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43
44  Julian Seward, Cambridge, UK.
45  jseward@acm.org
46  bzip2/libbzip2 version 1.0 of 21 March 2000
47
48  This program is based on (at least) the work of:
49     Mike Burrows
50     David Wheeler
51     Peter Fenwick
52     Alistair Moffat
53     Radford Neal
54     Ian H. Witten
55     Robert Sedgewick
56     Jon L. Bentley
57
58  For more information on these sources, see the manual.
59
60  To get some idea how the block sorting algorithms in this file
61  work, read my paper
62     On the Performance of BWT Sorting Algorithms
63  in Proceedings of the IEEE Data Compression Conference 2000,
64  Snowbird, Utah, USA, 27-30 March 2000.  The main sort in this
65  file implements the algorithm called  cache  in the paper.
66--*/
67
68
69#include "bzlib_private.h"
70
71/*---------------------------------------------*/
72/*--- Fallback O(N log(N)^2) sorting        ---*/
73/*--- algorithm, for repetitive blocks      ---*/
74/*---------------------------------------------*/
75
76/*---------------------------------------------*/
77static
78__inline__
79void fallbackSimpleSort ( UInt32* fmap,
80                          UInt32* eclass,
81                          Int32   lo,
82                          Int32   hi )
83{
84   Int32 i, j, tmp;
85   UInt32 ec_tmp;
86
87   if (lo == hi) return;
88
89   if (hi - lo > 3) {
90      for ( i = hi-4; i >= lo; i-- ) {
91         tmp = fmap[i];
92         ec_tmp = eclass[tmp];
93         for ( j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4 )
94            fmap[j-4] = fmap[j];
95         fmap[j-4] = tmp;
96      }
97   }
98
99   for ( i = hi-1; i >= lo; i-- ) {
100      tmp = fmap[i];
101      ec_tmp = eclass[tmp];
102      for ( j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++ )
103         fmap[j-1] = fmap[j];
104      fmap[j-1] = tmp;
105   }
106}
107
108
109/*---------------------------------------------*/
110#define fswap(zz1, zz2) \
111   { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
112
113#define fvswap(zzp1, zzp2, zzn)       \
114{                                     \
115   Int32 yyp1 = (zzp1);               \
116   Int32 yyp2 = (zzp2);               \
117   Int32 yyn  = (zzn);                \
118   while (yyn > 0) {                  \
119      fswap(fmap[yyp1], fmap[yyp2]);  \
120      yyp1++; yyp2++; yyn--;          \
121   }                                  \
122}
123
124
125#define fmin(a,b) ((a) < (b)) ? (a) : (b)
126
127#define fpush(lz,hz) { stackLo[sp] = lz; \
128                       stackHi[sp] = hz; \
129                       sp++; }
130
131#define fpop(lz,hz) { sp--;              \
132                      lz = stackLo[sp];  \
133                      hz = stackHi[sp]; }
134
135#define FALLBACK_QSORT_SMALL_THRESH 10
136#define FALLBACK_QSORT_STACK_SIZE   100
137
138
139static
140void fallbackQSort3 ( UInt32* fmap,
141                      UInt32* eclass,
142                      Int32   loSt,
143                      Int32   hiSt )
144{
145   Int32 unLo, unHi, ltLo, gtHi, n, m;
146   Int32 sp, lo, hi;
147   UInt32 med, r, r3;
148   Int32 stackLo[FALLBACK_QSORT_STACK_SIZE];
149   Int32 stackHi[FALLBACK_QSORT_STACK_SIZE];
150
151   r = 0;
152
153   sp = 0;
154   fpush ( loSt, hiSt );
155
156   while (sp > 0) {
157
158      AssertH ( sp < FALLBACK_QSORT_STACK_SIZE, 1004 );
159
160      fpop ( lo, hi );
161      if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
162         fallbackSimpleSort ( fmap, eclass, lo, hi );
163         continue;
164      }
165
166      /* Random partitioning.  Median of 3 sometimes fails to
167         avoid bad cases.  Median of 9 seems to help but
168         looks rather expensive.  This too seems to work but
169         is cheaper.  Guidance for the magic constants
170         7621 and 32768 is taken from Sedgewick's algorithms
171         book, chapter 35.
172      */
173      r = ((r * 7621) + 1) % 32768;
174      r3 = r % 3;
175      if (r3 == 0) med = eclass[fmap[lo]]; else
176      if (r3 == 1) med = eclass[fmap[(lo+hi)>>1]]; else
177                   med = eclass[fmap[hi]];
178
179      unLo = ltLo = lo;
180      unHi = gtHi = hi;
181
182      while (1) {
183         while (1) {
184            if (unLo > unHi) break;
185            n = (Int32)eclass[fmap[unLo]] - (Int32)med;
186            if (n == 0) {
187               fswap(fmap[unLo], fmap[ltLo]);
188               ltLo++; unLo++;
189               continue;
190            };
191            if (n > 0) break;
192            unLo++;
193         }
194         while (1) {
195            if (unLo > unHi) break;
196            n = (Int32)eclass[fmap[unHi]] - (Int32)med;
197            if (n == 0) {
198               fswap(fmap[unHi], fmap[gtHi]);
199               gtHi--; unHi--;
200               continue;
201            };
202            if (n < 0) break;
203            unHi--;
204         }
205         if (unLo > unHi) break;
206         fswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
207      }
208
209      AssertD ( unHi == unLo-1, "fallbackQSort3(2)" );
210
211      if (gtHi < ltLo) continue;
212
213      n = fmin(ltLo-lo, unLo-ltLo); fvswap(lo, unLo-n, n);
214      m = fmin(hi-gtHi, gtHi-unHi); fvswap(unLo, hi-m+1, m);
215
216      n = lo + unLo - ltLo - 1;
217      m = hi - (gtHi - unHi) + 1;
218
219      if (n - lo > hi - m) {
220         fpush ( lo, n );
221         fpush ( m, hi );
222      } else {
223         fpush ( m, hi );
224         fpush ( lo, n );
225      }
226   }
227}
228
229#undef fmin
230#undef fpush
231#undef fpop
232#undef fswap
233#undef fvswap
234#undef FALLBACK_QSORT_SMALL_THRESH
235#undef FALLBACK_QSORT_STACK_SIZE
236
237
238/*---------------------------------------------*/
239/* Pre:
240      nblock > 0
241      eclass exists for [0 .. nblock-1]
242      ((UChar*)eclass) [0 .. nblock-1] holds block
243      ptr exists for [0 .. nblock-1]
244
245   Post:
246      ((UChar*)eclass) [0 .. nblock-1] holds block
247      All other areas of eclass destroyed
248      fmap [0 .. nblock-1] holds sorted order
249      bhtab [ 0 .. 2+(nblock/32) ] destroyed
250*/
251
252#define       SET_BH(zz)  bhtab[(zz) >> 5] |= (1 << ((zz) & 31))
253#define     CLEAR_BH(zz)  bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31))
254#define     ISSET_BH(zz)  (bhtab[(zz) >> 5] & (1 << ((zz) & 31)))
255#define      WORD_BH(zz)  bhtab[(zz) >> 5]
256#define UNALIGNED_BH(zz)  ((zz) & 0x01f)
257
258static
259void fallbackSort ( UInt32* fmap,
260                    UInt32* eclass,
261                    UInt32* bhtab,
262                    Int32   nblock,
263                    Int32   verb )
264{
265   Int32 ftab[257];
266   Int32 ftabCopy[256];
267   Int32 H, i, j, k, l, r, cc, cc1;
268   Int32 nNotDone;
269   Int32 nBhtab;
270   UChar* eclass8 = (UChar*)eclass;
271
272   /*--
273      Initial 1-char radix sort to generate
274      initial fmap and initial BH bits.
275   --*/
276   if (verb >= 4)
277      VPrintf0 ( "        bucket sorting ...\n" );
278   for (i = 0; i < 257;    i++) ftab[i] = 0;
279   for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
280   for (i = 0; i < 256;    i++) ftabCopy[i] = ftab[i];
281   for (i = 1; i < 257;    i++) ftab[i] += ftab[i-1];
282
283   for (i = 0; i < nblock; i++) {
284      j = eclass8[i];
285      k = ftab[j] - 1;
286      ftab[j] = k;
287      fmap[k] = i;
288   }
289
290   nBhtab = 2 + (nblock / 32);
291   for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
292   for (i = 0; i < 256; i++) SET_BH(ftab[i]);
293
294   /*--
295      Inductively refine the buckets.  Kind-of an
296      "exponential radix sort" (!), inspired by the
297      Manber-Myers suffix array construction algorithm.
298   --*/
299
300   /*-- set sentinel bits for block-end detection --*/
301   for (i = 0; i < 32; i++) {
302      SET_BH(nblock + 2*i);
303      CLEAR_BH(nblock + 2*i + 1);
304   }
305
306   /*-- the log(N) loop --*/
307   H = 1;
308   while (1) {
309
310      if (verb >= 4)
311         VPrintf1 ( "        depth %6d has ", H );
312
313      j = 0;
314      for (i = 0; i < nblock; i++) {
315         if (ISSET_BH(i)) j = i;
316         k = fmap[i] - H; if (k < 0) k += nblock;
317         eclass[k] = j;
318      }
319
320      nNotDone = 0;
321      r = -1;
322      while (1) {
323
324	 /*-- find the next non-singleton bucket --*/
325         k = r + 1;
326         while (ISSET_BH(k) && UNALIGNED_BH(k)) k++;
327         if (ISSET_BH(k)) {
328            while (WORD_BH(k) == 0xffffffff) k += 32;
329            while (ISSET_BH(k)) k++;
330         }
331         l = k - 1;
332         if (l >= nblock) break;
333         while (!ISSET_BH(k) && UNALIGNED_BH(k)) k++;
334         if (!ISSET_BH(k)) {
335            while (WORD_BH(k) == 0x00000000) k += 32;
336            while (!ISSET_BH(k)) k++;
337         }
338         r = k - 1;
339         if (r >= nblock) break;
340
341         /*-- now [l, r] bracket current bucket --*/
342         if (r > l) {
343            nNotDone += (r - l + 1);
344            fallbackQSort3 ( fmap, eclass, l, r );
345
346            /*-- scan bucket and generate header bits-- */
347            cc = -1;
348            for (i = l; i <= r; i++) {
349               cc1 = eclass[fmap[i]];
350               if (cc != cc1) { SET_BH(i); cc = cc1; };
351            }
352         }
353      }
354
355      if (verb >= 4)
356         VPrintf1 ( "%6d unresolved strings\n", nNotDone );
357
358      H *= 2;
359      if (H > nblock || nNotDone == 0) break;
360   }
361
362   /*--
363      Reconstruct the original block in
364      eclass8 [0 .. nblock-1], since the
365      previous phase destroyed it.
366   --*/
367   if (verb >= 4)
368      VPrintf0 ( "        reconstructing block ...\n" );
369   j = 0;
370   for (i = 0; i < nblock; i++) {
371      while (ftabCopy[j] == 0) j++;
372      ftabCopy[j]--;
373      eclass8[fmap[i]] = (UChar)j;
374   }
375   AssertH ( j < 256, 1005 );
376}
377
378#undef       SET_BH
379#undef     CLEAR_BH
380#undef     ISSET_BH
381#undef      WORD_BH
382#undef UNALIGNED_BH
383
384
385/*---------------------------------------------*/
386/*--- The main, O(N^2 log(N)) sorting       ---*/
387/*--- algorithm.  Faster for "normal"       ---*/
388/*--- non-repetitive blocks.                ---*/
389/*---------------------------------------------*/
390
391/*---------------------------------------------*/
392static
393__inline__
394Bool mainGtU ( UInt32  i1,
395               UInt32  i2,
396               UChar*  block,
397               UInt16* quadrant,
398               UInt32  nblock,
399               Int32*  budget )
400{
401   Int32  k;
402   UChar  c1, c2;
403   UInt16 s1, s2;
404
405   AssertD ( i1 != i2, "mainGtU" );
406   /* 1 */
407   c1 = block[i1]; c2 = block[i2];
408   if (c1 != c2) return (c1 > c2);
409   i1++; i2++;
410   /* 2 */
411   c1 = block[i1]; c2 = block[i2];
412   if (c1 != c2) return (c1 > c2);
413   i1++; i2++;
414   /* 3 */
415   c1 = block[i1]; c2 = block[i2];
416   if (c1 != c2) return (c1 > c2);
417   i1++; i2++;
418   /* 4 */
419   c1 = block[i1]; c2 = block[i2];
420   if (c1 != c2) return (c1 > c2);
421   i1++; i2++;
422   /* 5 */
423   c1 = block[i1]; c2 = block[i2];
424   if (c1 != c2) return (c1 > c2);
425   i1++; i2++;
426   /* 6 */
427   c1 = block[i1]; c2 = block[i2];
428   if (c1 != c2) return (c1 > c2);
429   i1++; i2++;
430   /* 7 */
431   c1 = block[i1]; c2 = block[i2];
432   if (c1 != c2) return (c1 > c2);
433   i1++; i2++;
434   /* 8 */
435   c1 = block[i1]; c2 = block[i2];
436   if (c1 != c2) return (c1 > c2);
437   i1++; i2++;
438   /* 9 */
439   c1 = block[i1]; c2 = block[i2];
440   if (c1 != c2) return (c1 > c2);
441   i1++; i2++;
442   /* 10 */
443   c1 = block[i1]; c2 = block[i2];
444   if (c1 != c2) return (c1 > c2);
445   i1++; i2++;
446   /* 11 */
447   c1 = block[i1]; c2 = block[i2];
448   if (c1 != c2) return (c1 > c2);
449   i1++; i2++;
450   /* 12 */
451   c1 = block[i1]; c2 = block[i2];
452   if (c1 != c2) return (c1 > c2);
453   i1++; i2++;
454
455   k = nblock + 8;
456
457   do {
458      /* 1 */
459      c1 = block[i1]; c2 = block[i2];
460      if (c1 != c2) return (c1 > c2);
461      s1 = quadrant[i1]; s2 = quadrant[i2];
462      if (s1 != s2) return (s1 > s2);
463      i1++; i2++;
464      /* 2 */
465      c1 = block[i1]; c2 = block[i2];
466      if (c1 != c2) return (c1 > c2);
467      s1 = quadrant[i1]; s2 = quadrant[i2];
468      if (s1 != s2) return (s1 > s2);
469      i1++; i2++;
470      /* 3 */
471      c1 = block[i1]; c2 = block[i2];
472      if (c1 != c2) return (c1 > c2);
473      s1 = quadrant[i1]; s2 = quadrant[i2];
474      if (s1 != s2) return (s1 > s2);
475      i1++; i2++;
476      /* 4 */
477      c1 = block[i1]; c2 = block[i2];
478      if (c1 != c2) return (c1 > c2);
479      s1 = quadrant[i1]; s2 = quadrant[i2];
480      if (s1 != s2) return (s1 > s2);
481      i1++; i2++;
482      /* 5 */
483      c1 = block[i1]; c2 = block[i2];
484      if (c1 != c2) return (c1 > c2);
485      s1 = quadrant[i1]; s2 = quadrant[i2];
486      if (s1 != s2) return (s1 > s2);
487      i1++; i2++;
488      /* 6 */
489      c1 = block[i1]; c2 = block[i2];
490      if (c1 != c2) return (c1 > c2);
491      s1 = quadrant[i1]; s2 = quadrant[i2];
492      if (s1 != s2) return (s1 > s2);
493      i1++; i2++;
494      /* 7 */
495      c1 = block[i1]; c2 = block[i2];
496      if (c1 != c2) return (c1 > c2);
497      s1 = quadrant[i1]; s2 = quadrant[i2];
498      if (s1 != s2) return (s1 > s2);
499      i1++; i2++;
500      /* 8 */
501      c1 = block[i1]; c2 = block[i2];
502      if (c1 != c2) return (c1 > c2);
503      s1 = quadrant[i1]; s2 = quadrant[i2];
504      if (s1 != s2) return (s1 > s2);
505      i1++; i2++;
506
507      if (i1 >= nblock) i1 -= nblock;
508      if (i2 >= nblock) i2 -= nblock;
509
510      k -= 8;
511      (*budget)--;
512   }
513      while (k >= 0);
514
515   return False;
516}
517
518
519/*---------------------------------------------*/
520/*--
521   Knuth's increments seem to work better
522   than Incerpi-Sedgewick here.  Possibly
523   because the number of elems to sort is
524   usually small, typically <= 20.
525--*/
526static
527Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280,
528                   9841, 29524, 88573, 265720,
529                   797161, 2391484 };
530
531static
532void mainSimpleSort ( UInt32* ptr,
533                      UChar*  block,
534                      UInt16* quadrant,
535                      Int32   nblock,
536                      Int32   lo,
537                      Int32   hi,
538                      Int32   d,
539                      Int32*  budget )
540{
541   Int32 i, j, h, bigN, hp;
542   UInt32 v;
543
544   bigN = hi - lo + 1;
545   if (bigN < 2) return;
546
547   hp = 0;
548   while (incs[hp] < bigN) hp++;
549   hp--;
550
551   for (; hp >= 0; hp--) {
552      h = incs[hp];
553
554      i = lo + h;
555      while (True) {
556
557         /*-- copy 1 --*/
558         if (i > hi) break;
559         v = ptr[i];
560         j = i;
561         while ( mainGtU (
562                    ptr[j-h]+d, v+d, block, quadrant, nblock, budget
563                 ) ) {
564            ptr[j] = ptr[j-h];
565            j = j - h;
566            if (j <= (lo + h - 1)) break;
567         }
568         ptr[j] = v;
569         i++;
570
571         /*-- copy 2 --*/
572         if (i > hi) break;
573         v = ptr[i];
574         j = i;
575         while ( mainGtU (
576                    ptr[j-h]+d, v+d, block, quadrant, nblock, budget
577                 ) ) {
578            ptr[j] = ptr[j-h];
579            j = j - h;
580            if (j <= (lo + h - 1)) break;
581         }
582         ptr[j] = v;
583         i++;
584
585         /*-- copy 3 --*/
586         if (i > hi) break;
587         v = ptr[i];
588         j = i;
589         while ( mainGtU (
590                    ptr[j-h]+d, v+d, block, quadrant, nblock, budget
591                 ) ) {
592            ptr[j] = ptr[j-h];
593            j = j - h;
594            if (j <= (lo + h - 1)) break;
595         }
596         ptr[j] = v;
597         i++;
598
599         if (*budget < 0) return;
600      }
601   }
602}
603
604
605/*---------------------------------------------*/
606/*--
607   The following is an implementation of
608   an elegant 3-way quicksort for strings,
609   described in a paper "Fast Algorithms for
610   Sorting and Searching Strings", by Robert
611   Sedgewick and Jon L. Bentley.
612--*/
613
614#define mswap(zz1, zz2) \
615   { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
616
617#define mvswap(zzp1, zzp2, zzn)       \
618{                                     \
619   Int32 yyp1 = (zzp1);               \
620   Int32 yyp2 = (zzp2);               \
621   Int32 yyn  = (zzn);                \
622   while (yyn > 0) {                  \
623      mswap(ptr[yyp1], ptr[yyp2]);    \
624      yyp1++; yyp2++; yyn--;          \
625   }                                  \
626}
627
628static
629__inline__
630UChar mmed3 ( UChar a, UChar b, UChar c )
631{
632   UChar t;
633   if (a > b) { t = a; a = b; b = t; };
634   if (b > c) {
635      b = c;
636      if (a > b) b = a;
637   }
638   return b;
639}
640
641#define mmin(a,b) ((a) < (b)) ? (a) : (b)
642
643#define mpush(lz,hz,dz) { stackLo[sp] = lz; \
644                          stackHi[sp] = hz; \
645                          stackD [sp] = dz; \
646                          sp++; }
647
648#define mpop(lz,hz,dz) { sp--;             \
649                         lz = stackLo[sp]; \
650                         hz = stackHi[sp]; \
651                         dz = stackD [sp]; }
652
653
654#define mnextsize(az) (nextHi[az]-nextLo[az])
655
656#define mnextswap(az,bz)                                        \
657   { Int32 tz;                                                  \
658     tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
659     tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
660     tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; }
661
662
663#define MAIN_QSORT_SMALL_THRESH 20
664#define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
665#define MAIN_QSORT_STACK_SIZE 100
666
667static
668void mainQSort3 ( UInt32* ptr,
669                  UChar*  block,
670                  UInt16* quadrant,
671                  Int32   nblock,
672                  Int32   loSt,
673                  Int32   hiSt,
674                  Int32   dSt,
675                  Int32*  budget )
676{
677   Int32 unLo, unHi, ltLo, gtHi, n, m, med;
678   Int32 sp, lo, hi, d;
679
680   Int32 stackLo[MAIN_QSORT_STACK_SIZE];
681   Int32 stackHi[MAIN_QSORT_STACK_SIZE];
682   Int32 stackD [MAIN_QSORT_STACK_SIZE];
683
684   Int32 nextLo[3];
685   Int32 nextHi[3];
686   Int32 nextD [3];
687
688   sp = 0;
689   mpush ( loSt, hiSt, dSt );
690
691   while (sp > 0) {
692
693      AssertH ( sp < MAIN_QSORT_STACK_SIZE, 1001 );
694
695      mpop ( lo, hi, d );
696      if (hi - lo < MAIN_QSORT_SMALL_THRESH ||
697          d > MAIN_QSORT_DEPTH_THRESH) {
698         mainSimpleSort ( ptr, block, quadrant, nblock, lo, hi, d, budget );
699         if (*budget < 0) return;
700         continue;
701      }
702
703      med = (Int32)
704            mmed3 ( block[ptr[ lo         ]+d],
705                    block[ptr[ hi         ]+d],
706                    block[ptr[ (lo+hi)>>1 ]+d] );
707
708      unLo = ltLo = lo;
709      unHi = gtHi = hi;
710
711      while (True) {
712         while (True) {
713            if (unLo > unHi) break;
714            n = ((Int32)block[ptr[unLo]+d]) - med;
715            if (n == 0) {
716               mswap(ptr[unLo], ptr[ltLo]);
717               ltLo++; unLo++; continue;
718            };
719            if (n >  0) break;
720            unLo++;
721         }
722         while (True) {
723            if (unLo > unHi) break;
724            n = ((Int32)block[ptr[unHi]+d]) - med;
725            if (n == 0) {
726               mswap(ptr[unHi], ptr[gtHi]);
727               gtHi--; unHi--; continue;
728            };
729            if (n <  0) break;
730            unHi--;
731         }
732         if (unLo > unHi) break;
733         mswap(ptr[unLo], ptr[unHi]); unLo++; unHi--;
734      }
735
736      AssertD ( unHi == unLo-1, "mainQSort3(2)" );
737
738      if (gtHi < ltLo) {
739         mpush(lo, hi, d+1 );
740         continue;
741      }
742
743      n = mmin(ltLo-lo, unLo-ltLo); mvswap(lo, unLo-n, n);
744      m = mmin(hi-gtHi, gtHi-unHi); mvswap(unLo, hi-m+1, m);
745
746      n = lo + unLo - ltLo - 1;
747      m = hi - (gtHi - unHi) + 1;
748
749      nextLo[0] = lo;  nextHi[0] = n;   nextD[0] = d;
750      nextLo[1] = m;   nextHi[1] = hi;  nextD[1] = d;
751      nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
752
753      if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
754      if (mnextsize(1) < mnextsize(2)) mnextswap(1,2);
755      if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
756
757      AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)" );
758      AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)" );
759
760      mpush (nextLo[0], nextHi[0], nextD[0]);
761      mpush (nextLo[1], nextHi[1], nextD[1]);
762      mpush (nextLo[2], nextHi[2], nextD[2]);
763   }
764}
765
766#undef mswap
767#undef mvswap
768#undef mpush
769#undef mpop
770#undef mmin
771#undef mnextsize
772#undef mnextswap
773#undef MAIN_QSORT_SMALL_THRESH
774#undef MAIN_QSORT_DEPTH_THRESH
775#undef MAIN_QSORT_STACK_SIZE
776
777
778/*---------------------------------------------*/
779/* Pre:
780      nblock > N_OVERSHOOT
781      block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
782      ((UChar*)block32) [0 .. nblock-1] holds block
783      ptr exists for [0 .. nblock-1]
784
785   Post:
786      ((UChar*)block32) [0 .. nblock-1] holds block
787      All other areas of block32 destroyed
788      ftab [0 .. 65536 ] destroyed
789      ptr [0 .. nblock-1] holds sorted order
790      if (*budget < 0), sorting was abandoned
791*/
792
793#define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
794#define SETMASK (1 << 21)
795#define CLEARMASK (~(SETMASK))
796
797static
798void mainSort ( UInt32* ptr,
799                UChar*  block,
800                UInt16* quadrant,
801                UInt32* ftab,
802                Int32   nblock,
803                Int32   verb,
804                Int32*  budget )
805{
806   Int32  i, j, k, ss, sb;
807   Int32  runningOrder[256];
808   Bool   bigDone[256];
809   Int32  copyStart[256];
810   Int32  copyEnd  [256];
811   UChar  c1;
812   Int32  numQSorted;
813   UInt16 s;
814   if (verb >= 4) VPrintf0 ( "        main sort initialise ...\n" );
815
816   /*-- set up the 2-byte frequency table --*/
817   for (i = 65536; i >= 0; i--) ftab[i] = 0;
818
819   j = block[0] << 8;
820   i = nblock-1;
821   for (; i >= 3; i -= 4) {
822      quadrant[i] = 0;
823      j = (j >> 8) | ( ((UInt16)block[i]) << 8);
824      ftab[j]++;
825      quadrant[i-1] = 0;
826      j = (j >> 8) | ( ((UInt16)block[i-1]) << 8);
827      ftab[j]++;
828      quadrant[i-2] = 0;
829      j = (j >> 8) | ( ((UInt16)block[i-2]) << 8);
830      ftab[j]++;
831      quadrant[i-3] = 0;
832      j = (j >> 8) | ( ((UInt16)block[i-3]) << 8);
833      ftab[j]++;
834   }
835   for (; i >= 0; i--) {
836      quadrant[i] = 0;
837      j = (j >> 8) | ( ((UInt16)block[i]) << 8);
838      ftab[j]++;
839   }
840
841   /*-- (emphasises close relationship of block & quadrant) --*/
842   for (i = 0; i < BZ_N_OVERSHOOT; i++) {
843      block   [nblock+i] = block[i];
844      quadrant[nblock+i] = 0;
845   }
846
847   if (verb >= 4) VPrintf0 ( "        bucket sorting ...\n" );
848
849   /*-- Complete the initial radix sort --*/
850   for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1];
851
852   s = block[0] << 8;
853   i = nblock-1;
854   for (; i >= 3; i -= 4) {
855      s = (s >> 8) | (block[i] << 8);
856      j = ftab[s] -1;
857      ftab[s] = j;
858      ptr[j] = i;
859      s = (s >> 8) | (block[i-1] << 8);
860      j = ftab[s] -1;
861      ftab[s] = j;
862      ptr[j] = i-1;
863      s = (s >> 8) | (block[i-2] << 8);
864      j = ftab[s] -1;
865      ftab[s] = j;
866      ptr[j] = i-2;
867      s = (s >> 8) | (block[i-3] << 8);
868      j = ftab[s] -1;
869      ftab[s] = j;
870      ptr[j] = i-3;
871   }
872   for (; i >= 0; i--) {
873      s = (s >> 8) | (block[i] << 8);
874      j = ftab[s] -1;
875      ftab[s] = j;
876      ptr[j] = i;
877   }
878
879   /*--
880      Now ftab contains the first loc of every small bucket.
881      Calculate the running order, from smallest to largest
882      big bucket.
883   --*/
884   for (i = 0; i <= 255; i++) {
885      bigDone     [i] = False;
886      runningOrder[i] = i;
887   }
888
889   {
890      Int32 vv;
891      Int32 h = 1;
892      do h = 3 * h + 1; while (h <= 256);
893      do {
894         h = h / 3;
895         for (i = h; i <= 255; i++) {
896            vv = runningOrder[i];
897            j = i;
898            while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) {
899               runningOrder[j] = runningOrder[j-h];
900               j = j - h;
901               if (j <= (h - 1)) goto zero;
902            }
903            zero:
904            runningOrder[j] = vv;
905         }
906      } while (h != 1);
907   }
908
909   /*--
910      The main sorting loop.
911   --*/
912
913   numQSorted = 0;
914
915   for (i = 0; i <= 255; i++) {
916
917      /*--
918         Process big buckets, starting with the least full.
919         Basically this is a 3-step process in which we call
920         mainQSort3 to sort the small buckets [ss, j], but
921         also make a big effort to avoid the calls if we can.
922      --*/
923      ss = runningOrder[i];
924
925      /*--
926         Step 1:
927         Complete the big bucket [ss] by quicksorting
928         any unsorted small buckets [ss, j], for j != ss.
929         Hopefully previous pointer-scanning phases have already
930         completed many of the small buckets [ss, j], so
931         we don't have to sort them at all.
932      --*/
933      for (j = 0; j <= 255; j++) {
934         if (j != ss) {
935            sb = (ss << 8) + j;
936            if ( ! (ftab[sb] & SETMASK) ) {
937               Int32 lo = ftab[sb]   & CLEARMASK;
938               Int32 hi = (ftab[sb+1] & CLEARMASK) - 1;
939               if (hi > lo) {
940                  if (verb >= 4)
941                     VPrintf4 ( "        qsort [0x%x, 0x%x]   "
942                                "done %d   this %d\n",
943                                ss, j, numQSorted, hi - lo + 1 );
944                  mainQSort3 (
945                     ptr, block, quadrant, nblock,
946                     lo, hi, BZ_N_RADIX, budget
947                  );
948                  numQSorted += (hi - lo + 1);
949                  if (*budget < 0) return;
950               }
951            }
952            ftab[sb] |= SETMASK;
953         }
954      }
955
956      AssertH ( !bigDone[ss], 1006 );
957
958      /*--
959         Step 2:
960         Now scan this big bucket [ss] so as to synthesise the
961         sorted order for small buckets [t, ss] for all t,
962         including, magically, the bucket [ss,ss] too.
963         This will avoid doing Real Work in subsequent Step 1's.
964      --*/
965      {
966         for (j = 0; j <= 255; j++) {
967            copyStart[j] =  ftab[(j << 8) + ss]     & CLEARMASK;
968            copyEnd  [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
969         }
970         for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
971            k = ptr[j]-1; if (k < 0) k += nblock;
972            c1 = block[k];
973            if (!bigDone[c1])
974               ptr[ copyStart[c1]++ ] = k;
975         }
976         for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
977            k = ptr[j]-1; if (k < 0) k += nblock;
978            c1 = block[k];
979            if (!bigDone[c1])
980               ptr[ copyEnd[c1]-- ] = k;
981         }
982      }
983
984      AssertH ( (copyStart[ss]-1 == copyEnd[ss])
985                ||
986                /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
987                   Necessity for this case is demonstrated by compressing
988                   a sequence of approximately 48.5 million of character
989                   251; 1.0.0/1.0.1 will then die here. */
990                (copyStart[ss] == 0 && copyEnd[ss] == nblock-1),
991                1007 )
992
993      for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK;
994
995      /*--
996         Step 3:
997         The [ss] big bucket is now done.  Record this fact,
998         and update the quadrant descriptors.  Remember to
999         update quadrants in the overshoot area too, if
1000         necessary.  The "if (i < 255)" test merely skips
1001         this updating for the last bucket processed, since
1002         updating for the last bucket is pointless.
1003
1004         The quadrant array provides a way to incrementally
1005         cache sort orderings, as they appear, so as to
1006         make subsequent comparisons in fullGtU() complete
1007         faster.  For repetitive blocks this makes a big
1008         difference (but not big enough to be able to avoid
1009         the fallback sorting mechanism, exponential radix sort).
1010
1011         The precise meaning is: at all times:
1012
1013            for 0 <= i < nblock and 0 <= j <= nblock
1014
1015            if block[i] != block[j],
1016
1017               then the relative values of quadrant[i] and
1018                    quadrant[j] are meaningless.
1019
1020               else {
1021                  if quadrant[i] < quadrant[j]
1022                     then the string starting at i lexicographically
1023                     precedes the string starting at j
1024
1025                  else if quadrant[i] > quadrant[j]
1026                     then the string starting at j lexicographically
1027                     precedes the string starting at i
1028
1029                  else
1030                     the relative ordering of the strings starting
1031                     at i and j has not yet been determined.
1032               }
1033      --*/
1034      bigDone[ss] = True;
1035
1036      if (i < 255) {
1037         Int32 bbStart  = ftab[ss << 8] & CLEARMASK;
1038         Int32 bbSize   = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
1039         Int32 shifts   = 0;
1040
1041         while ((bbSize >> shifts) > 65534) shifts++;
1042
1043         for (j = bbSize-1; j >= 0; j--) {
1044            Int32 a2update     = ptr[bbStart + j];
1045            UInt16 qVal        = (UInt16)(j >> shifts);
1046            quadrant[a2update] = qVal;
1047            if (a2update < BZ_N_OVERSHOOT)
1048               quadrant[a2update + nblock] = qVal;
1049         }
1050         AssertH ( ((bbSize-1) >> shifts) <= 65535, 1002 );
1051      }
1052
1053   }
1054
1055   if (verb >= 4)
1056      VPrintf3 ( "        %d pointers, %d sorted, %d scanned\n",
1057                 nblock, numQSorted, nblock - numQSorted );
1058}
1059
1060#undef BIGFREQ
1061#undef SETMASK
1062#undef CLEARMASK
1063
1064
1065/*---------------------------------------------*/
1066/* Pre:
1067      nblock > 0
1068      arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
1069      ((UChar*)arr2)  [0 .. nblock-1] holds block
1070      arr1 exists for [0 .. nblock-1]
1071
1072   Post:
1073      ((UChar*)arr2) [0 .. nblock-1] holds block
1074      All other areas of block destroyed
1075      ftab [ 0 .. 65536 ] destroyed
1076      arr1 [0 .. nblock-1] holds sorted order
1077*/
1078void BZ2_blockSort ( EState* s )
1079{
1080   UInt32* ptr    = s->ptr;
1081   UChar*  block  = s->block;
1082   UInt32* ftab   = s->ftab;
1083   Int32   nblock = s->nblock;
1084   Int32   verb   = s->verbosity;
1085   Int32   wfact  = s->workFactor;
1086   UInt16* quadrant;
1087   Int32   budget;
1088   Int32   budgetInit;
1089   Int32   i;
1090
1091   if (nblock < 10000) {
1092      fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1093   } else {
1094      /* Calculate the location for quadrant, remembering to get
1095         the alignment right.  Assumes that &(block[0]) is at least
1096         2-byte aligned -- this should be ok since block is really
1097         the first section of arr2.
1098      */
1099      i = nblock+BZ_N_OVERSHOOT;
1100      if (i & 1) i++;
1101      quadrant = (UInt16*)(&(block[i]));
1102
1103      /* (wfact-1) / 3 puts the default-factor-30
1104         transition point at very roughly the same place as
1105         with v0.1 and v0.9.0.
1106         Not that it particularly matters any more, since the
1107         resulting compressed stream is now the same regardless
1108         of whether or not we use the main sort or fallback sort.
1109      */
1110      if (wfact < 1  ) wfact = 1;
1111      if (wfact > 100) wfact = 100;
1112      budgetInit = nblock * ((wfact-1) / 3);
1113      budget = budgetInit;
1114
1115      mainSort ( ptr, block, quadrant, ftab, nblock, verb, &budget );
1116      if (verb >= 3)
1117         VPrintf3 ( "      %d work, %d block, ratio %5.2f\n",
1118                    budgetInit - budget,
1119                    nblock,
1120                    (float)(budgetInit - budget) /
1121                    (float)(nblock==0 ? 1 : nblock) );
1122      if (budget < 0) {
1123         if (verb >= 2)
1124            VPrintf0 ( "    too repetitive; using fallback"
1125                       " sorting algorithm\n" );
1126         fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1127      }
1128   }
1129
1130   s->origPtr = -1;
1131   for (i = 0; i < s->nblock; i++)
1132      if (ptr[i] == 0)
1133         { s->origPtr = i; break; };
1134
1135   AssertH( s->origPtr != -1, 1003 );
1136}
1137
1138
1139/*-------------------------------------------------------------*/
1140/*--- end                                       blocksort.c ---*/
1141/*-------------------------------------------------------------*/
1142