1///////////////////////////////////////////////////////////////////////////////
2//
3/// \file       fastpos.h
4/// \brief      Kind of two-bit version of bit scan reverse
5///
6//  Authors:    Igor Pavlov
7//              Lasse Collin
8//
9//  This file has been put into the public domain.
10//  You can do whatever you want with this file.
11//
12///////////////////////////////////////////////////////////////////////////////
13
14#ifndef LZMA_FASTPOS_H
15#define LZMA_FASTPOS_H
16
17// LZMA encodes match distances (positions) by storing the highest two
18// bits using a six-bit value [0, 63], and then the missing lower bits.
19// Dictionary size is also stored using this encoding in the new .lzma
20// file format header.
21//
22// fastpos.h provides a way to quickly find out the correct six-bit
23// values. The following table gives some examples of this encoding:
24//
25//      pos   return
26//       0       0
27//       1       1
28//       2       2
29//       3       3
30//       4       4
31//       5       4
32//       6       5
33//       7       5
34//       8       6
35//      11       6
36//      12       7
37//     ...      ...
38//      15       7
39//      16       8
40//      17       8
41//     ...      ...
42//      23       8
43//      24       9
44//      25       9
45//     ...      ...
46//
47//
48// Provided functions or macros
49// ----------------------------
50//
51// get_pos_slot(pos) is the basic version. get_pos_slot_2(pos)
52// assumes that pos >= FULL_DISTANCES, thus the result is at least
53// FULL_DISTANCES_BITS * 2. Using get_pos_slot(pos) instead of
54// get_pos_slot_2(pos) would give the same result, but get_pos_slot_2(pos)
55// should be tiny bit faster due to the assumption being made.
56//
57//
58// Size vs. speed
59// --------------
60//
61// With some CPUs that have fast BSR (bit scan reverse) instruction, the
62// size optimized version is slightly faster than the bigger table based
63// approach. Such CPUs include Intel Pentium Pro, Pentium II, Pentium III
64// and Core 2 (possibly others). AMD K7 seems to have slower BSR, but that
65// would still have speed roughly comparable to the table version. Older
66// x86 CPUs like the original Pentium have very slow BSR; on those systems
67// the table version is a lot faster.
68//
69// On some CPUs, the table version is a lot faster when using position
70// dependent code, but with position independent code the size optimized
71// version is slightly faster. This occurs at least on 32-bit SPARC (no
72// ASM optimizations).
73//
74// I'm making the table version the default, because that has good speed
75// on all systems I have tried. The size optimized version is sometimes
76// slightly faster, but sometimes it is a lot slower.
77
78#ifdef HAVE_SMALL
79#	define get_pos_slot(pos) ((pos) <= 4 ? (pos) : get_pos_slot_2(pos))
80
81static inline uint32_t
82get_pos_slot_2(uint32_t pos)
83{
84	const uint32_t i = bsr32(pos);
85	return (i + i) + ((pos >> (i - 1)) & 1);
86}
87
88
89#else
90
91#define FASTPOS_BITS 13
92
93extern const uint8_t lzma_fastpos[1 << FASTPOS_BITS];
94
95
96#define fastpos_shift(extra, n) \
97	((extra) + (n) * (FASTPOS_BITS - 1))
98
99#define fastpos_limit(extra, n) \
100	(UINT32_C(1) << (FASTPOS_BITS + fastpos_shift(extra, n)))
101
102#define fastpos_result(pos, extra, n) \
103	lzma_fastpos[(pos) >> fastpos_shift(extra, n)] \
104			+ 2 * fastpos_shift(extra, n)
105
106
107static inline uint32_t
108get_pos_slot(uint32_t pos)
109{
110	// If it is small enough, we can pick the result directly from
111	// the precalculated table.
112	if (pos < fastpos_limit(0, 0))
113		return lzma_fastpos[pos];
114
115	if (pos < fastpos_limit(0, 1))
116		return fastpos_result(pos, 0, 1);
117
118	return fastpos_result(pos, 0, 2);
119}
120
121
122#ifdef FULL_DISTANCES_BITS
123static inline uint32_t
124get_pos_slot_2(uint32_t pos)
125{
126	assert(pos >= FULL_DISTANCES);
127
128	if (pos < fastpos_limit(FULL_DISTANCES_BITS - 1, 0))
129		return fastpos_result(pos, FULL_DISTANCES_BITS - 1, 0);
130
131	if (pos < fastpos_limit(FULL_DISTANCES_BITS - 1, 1))
132		return fastpos_result(pos, FULL_DISTANCES_BITS - 1, 1);
133
134	return fastpos_result(pos, FULL_DISTANCES_BITS - 1, 2);
135}
136#endif
137
138#endif
139
140#endif
141