• Home
  • History
  • Annotate
  • Line#
  • Navigate
  • Raw
  • Download
  • only in /asuswrt-rt-n18u-9.0.0.4.380.2695/release/src-rt-6.x.4708/router/busybox/archival/bz/
1/*
2 * bzip2 is written by Julian Seward <jseward@bzip.org>.
3 * Adapted for busybox by Denys Vlasenko <vda.linux@googlemail.com>.
4 * See README and LICENSE files in this directory for more information.
5 */
6
7/*-------------------------------------------------------------*/
8/*--- Block sorting machinery                               ---*/
9/*---                                           blocksort.c ---*/
10/*-------------------------------------------------------------*/
11
12/* ------------------------------------------------------------------
13This file is part of bzip2/libbzip2, a program and library for
14lossless, block-sorting data compression.
15
16bzip2/libbzip2 version 1.0.4 of 20 December 2006
17Copyright (C) 1996-2006 Julian Seward <jseward@bzip.org>
18
19Please read the WARNING, DISCLAIMER and PATENTS sections in the
20README file.
21
22This program is released under the terms of the license contained
23in the file LICENSE.
24------------------------------------------------------------------ */
25
26/* #include "bzlib_private.h" */
27
28#define mswap(zz1, zz2) \
29{ \
30	int32_t zztmp = zz1; \
31	zz1 = zz2; \
32	zz2 = zztmp; \
33}
34
35static
36/* No measurable speed gain with inlining */
37/* ALWAYS_INLINE */
38void mvswap(uint32_t* ptr, int32_t zzp1, int32_t zzp2, int32_t zzn)
39{
40	while (zzn > 0) {
41		mswap(ptr[zzp1], ptr[zzp2]);
42		zzp1++;
43		zzp2++;
44		zzn--;
45	}
46}
47
48static
49ALWAYS_INLINE
50int32_t mmin(int32_t a, int32_t b)
51{
52	return (a < b) ? a : b;
53}
54
55
56/*---------------------------------------------*/
57/*--- Fallback O(N log(N)^2) sorting        ---*/
58/*--- algorithm, for repetitive blocks      ---*/
59/*---------------------------------------------*/
60
61/*---------------------------------------------*/
62static
63inline
64void fallbackSimpleSort(uint32_t* fmap,
65		uint32_t* eclass,
66		int32_t   lo,
67		int32_t   hi)
68{
69	int32_t i, j, tmp;
70	uint32_t ec_tmp;
71
72	if (lo == hi) return;
73
74	if (hi - lo > 3) {
75		for (i = hi-4; i >= lo; i--) {
76			tmp = fmap[i];
77			ec_tmp = eclass[tmp];
78			for (j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4)
79				fmap[j-4] = fmap[j];
80			fmap[j-4] = tmp;
81		}
82	}
83
84	for (i = hi-1; i >= lo; i--) {
85		tmp = fmap[i];
86		ec_tmp = eclass[tmp];
87		for (j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++)
88			fmap[j-1] = fmap[j];
89		fmap[j-1] = tmp;
90	}
91}
92
93
94/*---------------------------------------------*/
95#define fpush(lz,hz) { \
96	stackLo[sp] = lz; \
97	stackHi[sp] = hz; \
98	sp++; \
99}
100
101#define fpop(lz,hz) { \
102	sp--; \
103	lz = stackLo[sp]; \
104	hz = stackHi[sp]; \
105}
106
107#define FALLBACK_QSORT_SMALL_THRESH 10
108#define FALLBACK_QSORT_STACK_SIZE   100
109
110static
111void fallbackQSort3(uint32_t* fmap,
112		uint32_t* eclass,
113		int32_t   loSt,
114		int32_t   hiSt)
115{
116	int32_t unLo, unHi, ltLo, gtHi, n, m;
117	int32_t sp, lo, hi;
118	uint32_t med, r, r3;
119	int32_t stackLo[FALLBACK_QSORT_STACK_SIZE];
120	int32_t stackHi[FALLBACK_QSORT_STACK_SIZE];
121
122	r = 0;
123
124	sp = 0;
125	fpush(loSt, hiSt);
126
127	while (sp > 0) {
128		AssertH(sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004);
129
130		fpop(lo, hi);
131		if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
132			fallbackSimpleSort(fmap, eclass, lo, hi);
133			continue;
134		}
135
136		/* Random partitioning.  Median of 3 sometimes fails to
137		 * avoid bad cases.  Median of 9 seems to help but
138		 * looks rather expensive.  This too seems to work but
139		 * is cheaper.  Guidance for the magic constants
140		 * 7621 and 32768 is taken from Sedgewick's algorithms
141		 * book, chapter 35.
142		 */
143		r = ((r * 7621) + 1) % 32768;
144		r3 = r % 3;
145		if (r3 == 0)
146			med = eclass[fmap[lo]];
147		else if (r3 == 1)
148			med = eclass[fmap[(lo+hi)>>1]];
149		else
150			med = eclass[fmap[hi]];
151
152		unLo = ltLo = lo;
153		unHi = gtHi = hi;
154
155		while (1) {
156			while (1) {
157				if (unLo > unHi) break;
158				n = (int32_t)eclass[fmap[unLo]] - (int32_t)med;
159				if (n == 0) {
160					mswap(fmap[unLo], fmap[ltLo]);
161					ltLo++;
162					unLo++;
163					continue;
164				};
165				if (n > 0) break;
166				unLo++;
167			}
168			while (1) {
169				if (unLo > unHi) break;
170				n = (int32_t)eclass[fmap[unHi]] - (int32_t)med;
171				if (n == 0) {
172					mswap(fmap[unHi], fmap[gtHi]);
173					gtHi--; unHi--;
174					continue;
175				};
176				if (n < 0) break;
177				unHi--;
178			}
179			if (unLo > unHi) break;
180			mswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
181		}
182
183		AssertD(unHi == unLo-1, "fallbackQSort3(2)");
184
185		if (gtHi < ltLo) continue;
186
187		n = mmin(ltLo-lo, unLo-ltLo); mvswap(fmap, lo, unLo-n, n);
188		m = mmin(hi-gtHi, gtHi-unHi); mvswap(fmap, unLo, hi-m+1, m);
189
190		n = lo + unLo - ltLo - 1;
191		m = hi - (gtHi - unHi) + 1;
192
193		if (n - lo > hi - m) {
194			fpush(lo, n);
195			fpush(m, hi);
196		} else {
197			fpush(m, hi);
198			fpush(lo, n);
199		}
200	}
201}
202
203#undef fpush
204#undef fpop
205#undef FALLBACK_QSORT_SMALL_THRESH
206#undef FALLBACK_QSORT_STACK_SIZE
207
208
209/*---------------------------------------------*/
210/* Pre:
211 *	nblock > 0
212 *	eclass exists for [0 .. nblock-1]
213 *	((uint8_t*)eclass) [0 .. nblock-1] holds block
214 *	ptr exists for [0 .. nblock-1]
215 *
216 * Post:
217 *	((uint8_t*)eclass) [0 .. nblock-1] holds block
218 *	All other areas of eclass destroyed
219 *	fmap [0 .. nblock-1] holds sorted order
220 *	bhtab[0 .. 2+(nblock/32)] destroyed
221*/
222
223#define       SET_BH(zz)  bhtab[(zz) >> 5] |= (1 << ((zz) & 31))
224#define     CLEAR_BH(zz)  bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31))
225#define     ISSET_BH(zz)  (bhtab[(zz) >> 5] & (1 << ((zz) & 31)))
226#define      WORD_BH(zz)  bhtab[(zz) >> 5]
227#define UNALIGNED_BH(zz)  ((zz) & 0x01f)
228
229static
230void fallbackSort(uint32_t* fmap,
231		uint32_t* eclass,
232		uint32_t* bhtab,
233		int32_t   nblock)
234{
235	int32_t ftab[257];
236	int32_t ftabCopy[256];
237	int32_t H, i, j, k, l, r, cc, cc1;
238	int32_t nNotDone;
239	int32_t nBhtab;
240	uint8_t* eclass8 = (uint8_t*)eclass;
241
242	/*
243	 * Initial 1-char radix sort to generate
244	 * initial fmap and initial BH bits.
245	 */
246	for (i = 0; i < 257;    i++) ftab[i] = 0;
247	for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
248	for (i = 0; i < 256;    i++) ftabCopy[i] = ftab[i];
249
250	j = ftab[0];  /* bbox: optimized */
251	for (i = 1; i < 257;    i++) {
252		j += ftab[i];
253		ftab[i] = j;
254	}
255
256	for (i = 0; i < nblock; i++) {
257		j = eclass8[i];
258		k = ftab[j] - 1;
259		ftab[j] = k;
260		fmap[k] = i;
261	}
262
263	nBhtab = 2 + ((uint32_t)nblock / 32); /* bbox: unsigned div is easier */
264	for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
265	for (i = 0; i < 256; i++) SET_BH(ftab[i]);
266
267	/*
268	 * Inductively refine the buckets.  Kind-of an
269	 * "exponential radix sort" (!), inspired by the
270	 * Manber-Myers suffix array construction algorithm.
271	 */
272
273	/*-- set sentinel bits for block-end detection --*/
274	for (i = 0; i < 32; i++) {
275		SET_BH(nblock + 2*i);
276		CLEAR_BH(nblock + 2*i + 1);
277	}
278
279	/*-- the log(N) loop --*/
280	H = 1;
281	while (1) {
282		j = 0;
283		for (i = 0; i < nblock; i++) {
284			if (ISSET_BH(i))
285				j = i;
286			k = fmap[i] - H;
287			if (k < 0)
288				k += nblock;
289			eclass[k] = j;
290		}
291
292		nNotDone = 0;
293		r = -1;
294		while (1) {
295
296		/*-- find the next non-singleton bucket --*/
297			k = r + 1;
298			while (ISSET_BH(k) && UNALIGNED_BH(k))
299				k++;
300			if (ISSET_BH(k)) {
301				while (WORD_BH(k) == 0xffffffff) k += 32;
302				while (ISSET_BH(k)) k++;
303			}
304			l = k - 1;
305			if (l >= nblock)
306				break;
307			while (!ISSET_BH(k) && UNALIGNED_BH(k))
308				k++;
309			if (!ISSET_BH(k)) {
310				while (WORD_BH(k) == 0x00000000) k += 32;
311				while (!ISSET_BH(k)) k++;
312			}
313			r = k - 1;
314			if (r >= nblock)
315				break;
316
317			/*-- now [l, r] bracket current bucket --*/
318			if (r > l) {
319				nNotDone += (r - l + 1);
320				fallbackQSort3(fmap, eclass, l, r);
321
322				/*-- scan bucket and generate header bits-- */
323				cc = -1;
324				for (i = l; i <= r; i++) {
325					cc1 = eclass[fmap[i]];
326					if (cc != cc1) {
327						SET_BH(i);
328						cc = cc1;
329					};
330				}
331			}
332		}
333
334		H *= 2;
335		if (H > nblock || nNotDone == 0)
336			break;
337	}
338
339	/*
340	 * Reconstruct the original block in
341	 * eclass8 [0 .. nblock-1], since the
342	 * previous phase destroyed it.
343	 */
344	j = 0;
345	for (i = 0; i < nblock; i++) {
346		while (ftabCopy[j] == 0)
347			j++;
348		ftabCopy[j]--;
349		eclass8[fmap[i]] = (uint8_t)j;
350	}
351	AssertH(j < 256, 1005);
352}
353
354#undef       SET_BH
355#undef     CLEAR_BH
356#undef     ISSET_BH
357#undef      WORD_BH
358#undef UNALIGNED_BH
359
360
361/*---------------------------------------------*/
362/*--- The main, O(N^2 log(N)) sorting       ---*/
363/*--- algorithm.  Faster for "normal"       ---*/
364/*--- non-repetitive blocks.                ---*/
365/*---------------------------------------------*/
366
367/*---------------------------------------------*/
368static
369NOINLINE
370int mainGtU(
371		uint32_t  i1,
372		uint32_t  i2,
373		uint8_t*  block,
374		uint16_t* quadrant,
375		uint32_t  nblock,
376		int32_t*  budget)
377{
378	int32_t  k;
379	uint8_t  c1, c2;
380	uint16_t s1, s2;
381
382/* Loop unrolling here is actually very useful
383 * (generated code is much simpler),
384 * code size increase is only 270 bytes (i386)
385 * but speeds up compression 10% overall
386 */
387
388#if CONFIG_BZIP2_FEATURE_SPEED >= 1
389
390#define TIMES_8(code) \
391	code; code; code; code; \
392	code; code; code; code;
393#define TIMES_12(code) \
394	code; code; code; code; \
395	code; code; code; code; \
396	code; code; code; code;
397
398#else
399
400#define TIMES_8(code) \
401{ \
402	int nn = 8; \
403	do { \
404		code; \
405	} while (--nn); \
406}
407#define TIMES_12(code) \
408{ \
409	int nn = 12; \
410	do { \
411		code; \
412	} while (--nn); \
413}
414
415#endif
416
417	AssertD(i1 != i2, "mainGtU");
418	TIMES_12(
419		c1 = block[i1]; c2 = block[i2];
420		if (c1 != c2) return (c1 > c2);
421		i1++; i2++;
422	)
423
424	k = nblock + 8;
425
426	do {
427		TIMES_8(
428			c1 = block[i1]; c2 = block[i2];
429			if (c1 != c2) return (c1 > c2);
430			s1 = quadrant[i1]; s2 = quadrant[i2];
431			if (s1 != s2) return (s1 > s2);
432			i1++; i2++;
433		)
434
435		if (i1 >= nblock) i1 -= nblock;
436		if (i2 >= nblock) i2 -= nblock;
437
438		(*budget)--;
439		k -= 8;
440	} while (k >= 0);
441
442	return False;
443}
444#undef TIMES_8
445#undef TIMES_12
446
447/*---------------------------------------------*/
448/*
449 * Knuth's increments seem to work better
450 * than Incerpi-Sedgewick here.  Possibly
451 * because the number of elems to sort is
452 * usually small, typically <= 20.
453 */
454static
455const int32_t incs[14] = {
456	1, 4, 13, 40, 121, 364, 1093, 3280,
457	9841, 29524, 88573, 265720,
458	797161, 2391484
459};
460
461static
462void mainSimpleSort(uint32_t* ptr,
463		uint8_t*  block,
464		uint16_t* quadrant,
465		int32_t   nblock,
466		int32_t   lo,
467		int32_t   hi,
468		int32_t   d,
469		int32_t*  budget)
470{
471	int32_t i, j, h, bigN, hp;
472	uint32_t v;
473
474	bigN = hi - lo + 1;
475	if (bigN < 2) return;
476
477	hp = 0;
478	while (incs[hp] < bigN) hp++;
479	hp--;
480
481	for (; hp >= 0; hp--) {
482		h = incs[hp];
483
484		i = lo + h;
485		while (1) {
486			/*-- copy 1 --*/
487			if (i > hi) break;
488			v = ptr[i];
489			j = i;
490			while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) {
491				ptr[j] = ptr[j-h];
492				j = j - h;
493				if (j <= (lo + h - 1)) break;
494			}
495			ptr[j] = v;
496			i++;
497
498/* 1.5% overall speedup, +290 bytes */
499#if CONFIG_BZIP2_FEATURE_SPEED >= 3
500			/*-- copy 2 --*/
501			if (i > hi) break;
502			v = ptr[i];
503			j = i;
504			while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) {
505				ptr[j] = ptr[j-h];
506				j = j - h;
507				if (j <= (lo + h - 1)) break;
508			}
509			ptr[j] = v;
510			i++;
511
512			/*-- copy 3 --*/
513			if (i > hi) break;
514			v = ptr[i];
515			j = i;
516			while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) {
517				ptr[j] = ptr[j-h];
518				j = j - h;
519				if (j <= (lo + h - 1)) break;
520			}
521			ptr[j] = v;
522			i++;
523#endif
524			if (*budget < 0) return;
525		}
526	}
527}
528
529
530/*---------------------------------------------*/
531/*
532 * The following is an implementation of
533 * an elegant 3-way quicksort for strings,
534 * described in a paper "Fast Algorithms for
535 * Sorting and Searching Strings", by Robert
536 * Sedgewick and Jon L. Bentley.
537 */
538
539static
540ALWAYS_INLINE
541uint8_t mmed3(uint8_t a, uint8_t b, uint8_t c)
542{
543	uint8_t t;
544	if (a > b) {
545		t = a;
546		a = b;
547		b = t;
548	};
549	/* here b >= a */
550	if (b > c) {
551		b = c;
552		if (a > b)
553			b = a;
554	}
555	return b;
556}
557
558#define mpush(lz,hz,dz) \
559{ \
560	stackLo[sp] = lz; \
561	stackHi[sp] = hz; \
562	stackD [sp] = dz; \
563	sp++; \
564}
565
566#define mpop(lz,hz,dz) \
567{ \
568	sp--; \
569	lz = stackLo[sp]; \
570	hz = stackHi[sp]; \
571	dz = stackD [sp]; \
572}
573
574#define mnextsize(az) (nextHi[az] - nextLo[az])
575
576#define mnextswap(az,bz) \
577{ \
578	int32_t tz; \
579	tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
580	tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
581	tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; \
582}
583
584#define MAIN_QSORT_SMALL_THRESH 20
585#define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
586#define MAIN_QSORT_STACK_SIZE   100
587
588static NOINLINE
589void mainQSort3(uint32_t* ptr,
590		uint8_t*  block,
591		uint16_t* quadrant,
592		int32_t   nblock,
593		int32_t   loSt,
594		int32_t   hiSt,
595		int32_t   dSt,
596		int32_t*  budget)
597{
598	int32_t unLo, unHi, ltLo, gtHi, n, m, med;
599	int32_t sp, lo, hi, d;
600
601	int32_t stackLo[MAIN_QSORT_STACK_SIZE];
602	int32_t stackHi[MAIN_QSORT_STACK_SIZE];
603	int32_t stackD [MAIN_QSORT_STACK_SIZE];
604
605	int32_t nextLo[3];
606	int32_t nextHi[3];
607	int32_t nextD [3];
608
609	sp = 0;
610	mpush(loSt, hiSt, dSt);
611
612	while (sp > 0) {
613		AssertH(sp < MAIN_QSORT_STACK_SIZE - 2, 1001);
614
615		mpop(lo, hi, d);
616		if (hi - lo < MAIN_QSORT_SMALL_THRESH
617		 || d > MAIN_QSORT_DEPTH_THRESH
618		) {
619			mainSimpleSort(ptr, block, quadrant, nblock, lo, hi, d, budget);
620			if (*budget < 0)
621				return;
622			continue;
623		}
624		med = (int32_t)	mmed3(block[ptr[lo          ] + d],
625		                      block[ptr[hi          ] + d],
626		                      block[ptr[(lo+hi) >> 1] + d]);
627
628		unLo = ltLo = lo;
629		unHi = gtHi = hi;
630
631		while (1) {
632			while (1) {
633				if (unLo > unHi)
634					break;
635				n = ((int32_t)block[ptr[unLo]+d]) - med;
636				if (n == 0) {
637					mswap(ptr[unLo], ptr[ltLo]);
638					ltLo++;
639					unLo++;
640					continue;
641				};
642				if (n >  0) break;
643				unLo++;
644			}
645			while (1) {
646				if (unLo > unHi)
647					break;
648				n = ((int32_t)block[ptr[unHi]+d]) - med;
649				if (n == 0) {
650					mswap(ptr[unHi], ptr[gtHi]);
651					gtHi--;
652					unHi--;
653					continue;
654				};
655				if (n <  0) break;
656				unHi--;
657			}
658			if (unLo > unHi)
659				break;
660			mswap(ptr[unLo], ptr[unHi]);
661			unLo++;
662			unHi--;
663		}
664
665		AssertD(unHi == unLo-1, "mainQSort3(2)");
666
667		if (gtHi < ltLo) {
668			mpush(lo, hi, d + 1);
669			continue;
670		}
671
672		n = mmin(ltLo-lo, unLo-ltLo); mvswap(ptr, lo, unLo-n, n);
673		m = mmin(hi-gtHi, gtHi-unHi); mvswap(ptr, unLo, hi-m+1, m);
674
675		n = lo + unLo - ltLo - 1;
676		m = hi - (gtHi - unHi) + 1;
677
678		nextLo[0] = lo;  nextHi[0] = n;   nextD[0] = d;
679		nextLo[1] = m;   nextHi[1] = hi;  nextD[1] = d;
680		nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
681
682		if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1);
683		if (mnextsize(1) < mnextsize(2)) mnextswap(1, 2);
684		if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1);
685
686		AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)");
687		AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)");
688
689		mpush(nextLo[0], nextHi[0], nextD[0]);
690		mpush(nextLo[1], nextHi[1], nextD[1]);
691		mpush(nextLo[2], nextHi[2], nextD[2]);
692	}
693}
694
695#undef mpush
696#undef mpop
697#undef mnextsize
698#undef mnextswap
699#undef MAIN_QSORT_SMALL_THRESH
700#undef MAIN_QSORT_DEPTH_THRESH
701#undef MAIN_QSORT_STACK_SIZE
702
703
704/*---------------------------------------------*/
705/* Pre:
706 *	nblock > N_OVERSHOOT
707 *	block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
708 *	((uint8_t*)block32) [0 .. nblock-1] holds block
709 *	ptr exists for [0 .. nblock-1]
710 *
711 * Post:
712 *	((uint8_t*)block32) [0 .. nblock-1] holds block
713 *	All other areas of block32 destroyed
714 *	ftab[0 .. 65536] destroyed
715 *	ptr [0 .. nblock-1] holds sorted order
716 *	if (*budget < 0), sorting was abandoned
717 */
718
719#define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
720#define SETMASK (1 << 21)
721#define CLEARMASK (~(SETMASK))
722
723static NOINLINE
724void mainSort(EState* state,
725		uint32_t* ptr,
726		uint8_t*  block,
727		uint16_t* quadrant,
728		uint32_t* ftab,
729		int32_t   nblock,
730		int32_t*  budget)
731{
732	int32_t  i, j, k, ss, sb;
733	uint8_t  c1;
734	int32_t  numQSorted;
735	uint16_t s;
736	Bool     bigDone[256];
737	/* bbox: moved to EState to save stack
738	int32_t  runningOrder[256];
739	int32_t  copyStart[256];
740	int32_t  copyEnd  [256];
741	*/
742#define runningOrder (state->mainSort__runningOrder)
743#define copyStart    (state->mainSort__copyStart)
744#define copyEnd      (state->mainSort__copyEnd)
745
746	/*-- set up the 2-byte frequency table --*/
747	/* was: for (i = 65536; i >= 0; i--) ftab[i] = 0; */
748	memset(ftab, 0, 65537 * sizeof(ftab[0]));
749
750	j = block[0] << 8;
751	i = nblock - 1;
752/* 3%, +300 bytes */
753#if CONFIG_BZIP2_FEATURE_SPEED >= 2
754	for (; i >= 3; i -= 4) {
755		quadrant[i] = 0;
756		j = (j >> 8) | (((uint16_t)block[i]) << 8);
757		ftab[j]++;
758		quadrant[i-1] = 0;
759		j = (j >> 8) | (((uint16_t)block[i-1]) << 8);
760		ftab[j]++;
761		quadrant[i-2] = 0;
762		j = (j >> 8) | (((uint16_t)block[i-2]) << 8);
763		ftab[j]++;
764		quadrant[i-3] = 0;
765		j = (j >> 8) | (((uint16_t)block[i-3]) << 8);
766		ftab[j]++;
767	}
768#endif
769	for (; i >= 0; i--) {
770		quadrant[i] = 0;
771		j = (j >> 8) | (((uint16_t)block[i]) << 8);
772		ftab[j]++;
773	}
774
775	/*-- (emphasises close relationship of block & quadrant) --*/
776	for (i = 0; i < BZ_N_OVERSHOOT; i++) {
777		block   [nblock+i] = block[i];
778		quadrant[nblock+i] = 0;
779	}
780
781	/*-- Complete the initial radix sort --*/
782	j = ftab[0]; /* bbox: optimized */
783	for (i = 1; i <= 65536; i++) {
784		j += ftab[i];
785		ftab[i] = j;
786	}
787
788	s = block[0] << 8;
789	i = nblock - 1;
790#if CONFIG_BZIP2_FEATURE_SPEED >= 2
791	for (; i >= 3; i -= 4) {
792		s = (s >> 8) | (block[i] << 8);
793		j = ftab[s] - 1;
794		ftab[s] = j;
795		ptr[j] = i;
796		s = (s >> 8) | (block[i-1] << 8);
797		j = ftab[s] - 1;
798		ftab[s] = j;
799		ptr[j] = i-1;
800		s = (s >> 8) | (block[i-2] << 8);
801		j = ftab[s] - 1;
802		ftab[s] = j;
803		ptr[j] = i-2;
804		s = (s >> 8) | (block[i-3] << 8);
805		j = ftab[s] - 1;
806		ftab[s] = j;
807		ptr[j] = i-3;
808	}
809#endif
810	for (; i >= 0; i--) {
811		s = (s >> 8) | (block[i] << 8);
812		j = ftab[s] - 1;
813		ftab[s] = j;
814		ptr[j] = i;
815	}
816
817	/*
818	 * Now ftab contains the first loc of every small bucket.
819	 * Calculate the running order, from smallest to largest
820	 * big bucket.
821	 */
822	for (i = 0; i <= 255; i++) {
823		bigDone     [i] = False;
824		runningOrder[i] = i;
825	}
826
827	{
828		int32_t vv;
829		/* bbox: was: int32_t h = 1; */
830		/* do h = 3 * h + 1; while (h <= 256); */
831		uint32_t h = 364;
832
833		do {
834			/*h = h / 3;*/
835			h = (h * 171) >> 9; /* bbox: fast h/3 */
836			for (i = h; i <= 255; i++) {
837				vv = runningOrder[i];
838				j = i;
839				while (BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv)) {
840					runningOrder[j] = runningOrder[j-h];
841					j = j - h;
842					if (j <= (h - 1))
843						goto zero;
844				}
845 zero:
846				runningOrder[j] = vv;
847			}
848		} while (h != 1);
849	}
850
851	/*
852	 * The main sorting loop.
853	 */
854
855	numQSorted = 0;
856
857	for (i = 0; i <= 255; i++) {
858
859		/*
860		 * Process big buckets, starting with the least full.
861		 * Basically this is a 3-step process in which we call
862		 * mainQSort3 to sort the small buckets [ss, j], but
863		 * also make a big effort to avoid the calls if we can.
864		 */
865		ss = runningOrder[i];
866
867		/*
868		 * Step 1:
869		 * Complete the big bucket [ss] by quicksorting
870		 * any unsorted small buckets [ss, j], for j != ss.
871		 * Hopefully previous pointer-scanning phases have already
872		 * completed many of the small buckets [ss, j], so
873		 * we don't have to sort them at all.
874		 */
875		for (j = 0; j <= 255; j++) {
876			if (j != ss) {
877				sb = (ss << 8) + j;
878				if (!(ftab[sb] & SETMASK)) {
879					int32_t lo =  ftab[sb]   & CLEARMASK;
880					int32_t hi = (ftab[sb+1] & CLEARMASK) - 1;
881					if (hi > lo) {
882						mainQSort3(
883							ptr, block, quadrant, nblock,
884							lo, hi, BZ_N_RADIX, budget
885						);
886						if (*budget < 0) return;
887						numQSorted += (hi - lo + 1);
888					}
889				}
890				ftab[sb] |= SETMASK;
891			}
892		}
893
894		AssertH(!bigDone[ss], 1006);
895
896		/*
897		 * Step 2:
898		 * Now scan this big bucket [ss] so as to synthesise the
899		 * sorted order for small buckets [t, ss] for all t,
900		 * including, magically, the bucket [ss,ss] too.
901		 * This will avoid doing Real Work in subsequent Step 1's.
902		 */
903		{
904			for (j = 0; j <= 255; j++) {
905				copyStart[j] =  ftab[(j << 8) + ss]     & CLEARMASK;
906				copyEnd  [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
907			}
908			for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
909				k = ptr[j] - 1;
910				if (k < 0)
911					k += nblock;
912				c1 = block[k];
913				if (!bigDone[c1])
914					ptr[copyStart[c1]++] = k;
915			}
916			for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
917				k = ptr[j]-1;
918				if (k < 0)
919					k += nblock;
920				c1 = block[k];
921				if (!bigDone[c1])
922					ptr[copyEnd[c1]--] = k;
923			}
924		}
925
926		/* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
927		 * Necessity for this case is demonstrated by compressing
928		 * a sequence of approximately 48.5 million of character
929		 * 251; 1.0.0/1.0.1 will then die here. */
930		AssertH((copyStart[ss]-1 == copyEnd[ss]) \
931		     || (copyStart[ss] == 0 && copyEnd[ss] == nblock-1), 1007);
932
933		for (j = 0; j <= 255; j++)
934			ftab[(j << 8) + ss] |= SETMASK;
935
936		/*
937		 * Step 3:
938		 * The [ss] big bucket is now done.  Record this fact,
939		 * and update the quadrant descriptors.  Remember to
940		 * update quadrants in the overshoot area too, if
941		 * necessary.  The "if (i < 255)" test merely skips
942		 * this updating for the last bucket processed, since
943		 * updating for the last bucket is pointless.
944		 *
945		 * The quadrant array provides a way to incrementally
946		 * cache sort orderings, as they appear, so as to
947		 * make subsequent comparisons in fullGtU() complete
948		 * faster.  For repetitive blocks this makes a big
949		 * difference (but not big enough to be able to avoid
950		 * the fallback sorting mechanism, exponential radix sort).
951		 *
952		 * The precise meaning is: at all times:
953		 *
954		 *	for 0 <= i < nblock and 0 <= j <= nblock
955		 *
956		 *	if block[i] != block[j],
957		 *
958		 *		then the relative values of quadrant[i] and
959		 *			  quadrant[j] are meaningless.
960		 *
961		 *		else {
962		 *			if quadrant[i] < quadrant[j]
963		 *				then the string starting at i lexicographically
964		 *				precedes the string starting at j
965		 *
966		 *			else if quadrant[i] > quadrant[j]
967		 *				then the string starting at j lexicographically
968		 *				precedes the string starting at i
969		 *
970		 *			else
971		 *				the relative ordering of the strings starting
972		 *				at i and j has not yet been determined.
973		 *		}
974		 */
975		bigDone[ss] = True;
976
977		if (i < 255) {
978			int32_t bbStart = ftab[ss << 8] & CLEARMASK;
979			int32_t bbSize  = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
980			int32_t shifts  = 0;
981
982			while ((bbSize >> shifts) > 65534) shifts++;
983
984			for (j = bbSize-1; j >= 0; j--) {
985				int32_t a2update   = ptr[bbStart + j];
986				uint16_t qVal      = (uint16_t)(j >> shifts);
987				quadrant[a2update] = qVal;
988				if (a2update < BZ_N_OVERSHOOT)
989					quadrant[a2update + nblock] = qVal;
990			}
991			AssertH(((bbSize-1) >> shifts) <= 65535, 1002);
992		}
993	}
994#undef runningOrder
995#undef copyStart
996#undef copyEnd
997}
998
999#undef BIGFREQ
1000#undef SETMASK
1001#undef CLEARMASK
1002
1003
1004/*---------------------------------------------*/
1005/* Pre:
1006 *	nblock > 0
1007 *	arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
1008 *	  ((uint8_t*)arr2)[0 .. nblock-1] holds block
1009 *	arr1 exists for [0 .. nblock-1]
1010 *
1011 * Post:
1012 *	((uint8_t*)arr2) [0 .. nblock-1] holds block
1013 *	All other areas of block destroyed
1014 *	ftab[0 .. 65536] destroyed
1015 *	arr1[0 .. nblock-1] holds sorted order
1016 */
1017static NOINLINE
1018void BZ2_blockSort(EState* s)
1019{
1020	/* In original bzip2 1.0.4, it's a parameter, but 30
1021	 * (which was the default) should work ok. */
1022	enum { wfact = 30 };
1023
1024	uint32_t* ptr    = s->ptr;
1025	uint8_t*  block  = s->block;
1026	uint32_t* ftab   = s->ftab;
1027	int32_t   nblock = s->nblock;
1028	uint16_t* quadrant;
1029	int32_t   budget;
1030	int32_t   i;
1031
1032	if (nblock < 10000) {
1033		fallbackSort(s->arr1, s->arr2, ftab, nblock);
1034	} else {
1035		/* Calculate the location for quadrant, remembering to get
1036		 * the alignment right.  Assumes that &(block[0]) is at least
1037		 * 2-byte aligned -- this should be ok since block is really
1038		 * the first section of arr2.
1039		 */
1040		i = nblock + BZ_N_OVERSHOOT;
1041		if (i & 1) i++;
1042		quadrant = (uint16_t*)(&(block[i]));
1043
1044		/* (wfact-1) / 3 puts the default-factor-30
1045		 * transition point at very roughly the same place as
1046		 * with v0.1 and v0.9.0.
1047		 * Not that it particularly matters any more, since the
1048		 * resulting compressed stream is now the same regardless
1049		 * of whether or not we use the main sort or fallback sort.
1050		 */
1051		budget = nblock * ((wfact-1) / 3);
1052
1053		mainSort(s, ptr, block, quadrant, ftab, nblock, &budget);
1054		if (budget < 0) {
1055			fallbackSort(s->arr1, s->arr2, ftab, nblock);
1056		}
1057	}
1058
1059	s->origPtr = -1;
1060	for (i = 0; i < s->nblock; i++)
1061		if (ptr[i] == 0) {
1062			s->origPtr = i;
1063			break;
1064		};
1065
1066	AssertH(s->origPtr != -1, 1003);
1067}
1068
1069
1070/*-------------------------------------------------------------*/
1071/*--- end                                       blocksort.c ---*/
1072/*-------------------------------------------------------------*/
1073