///////////////////////////////////////////////////////////////////////////////
//
/// \file       fastpos.h
/// \brief      Kind of two-bit version of bit scan reverse
///
//  Authors:    Igor Pavlov
//              Lasse Collin
//
//  This file has been put into the public domain.
//  You can do whatever you want with this file.
//
///////////////////////////////////////////////////////////////////////////////

#ifndef LZMA_FASTPOS_H
#define LZMA_FASTPOS_H

// LZMA encodes match distances by storing the highest two bits using
// a six-bit value [0, 63], and then the missing lower bits.
// Dictionary size is also stored using this encoding in the .xz
// file format header.
//
// fastpos.h provides a way to quickly find out the correct six-bit
// values. The following table gives some examples of this encoding:
//
//      dist   return
//         0        0
//         1        1
//         2        2
//         3        3
//         4        4
//         5        4
//         6        5
//         7        5
//         8        6
//        11        6
//        12        7
//       ...      ...
//        15        7
//        16        8
//        17        8
//       ...      ...
//        23        8
//        24        9
//        25        9
//       ...      ...
//
//
// Provided functions or macros
// ----------------------------
//
// get_dist_slot(dist) is the basic version. get_dist_slot_2(dist)
// assumes that dist >= FULL_DISTANCES, thus the result is at least
// FULL_DISTANCES_BITS * 2. Using get_dist_slot(dist) instead of
// get_dist_slot_2(dist) would give the same result, but get_dist_slot_2(dist)
// should be tiny bit faster due to the assumption being made.
//
//
// Size vs. speed
// --------------
//
// With some CPUs that have fast BSR (bit scan reverse) instruction, the
// size optimized version is slightly faster than the bigger table based
// approach. Such CPUs include Intel Pentium Pro, Pentium II, Pentium III
// and Core 2 (possibly others). AMD K7 seems to have slower BSR, but that
// would still have speed roughly comparable to the table version. Older
// x86 CPUs like the original Pentium have very slow BSR; on those systems
// the table version is a lot faster.
//
// On some CPUs, the table version is a lot faster when using position
// dependent code, but with position independent code the size optimized
// version is slightly faster. This occurs at least on 32-bit SPARC (no
// ASM optimizations).
//
// I'm making the table version the default, because that has good speed
// on all systems I have tried. The size optimized version is sometimes
// slightly faster, but sometimes it is a lot slower.
//
// NOTE(review): this header uses uint8_t/uint32_t and UINT32_C without
// including <stdint.h> itself — it appears to rely on the including file
// (common.h in liblzma) to provide them, along with bsr32() for the
// HAVE_SMALL path. Confirm against the build setup.

#ifdef HAVE_SMALL

// Size-optimized version: distances 0..4 map to themselves; everything
// larger is computed from the bit scan reverse of the distance.
#	define get_dist_slot(dist) \
		((dist) <= 4 ? (dist) : get_dist_slot_2(dist))

// Compute the dist slot for dist >= 5: twice the index of the highest
// set bit, plus the bit just below it (the "second" of the two encoded
// high bits). Requires dist >= 2 so that bsr32(dist) >= 1 and the
// shift (i - 1) is well defined.
static inline uint32_t
get_dist_slot_2(uint32_t dist)
{
	const uint32_t i = bsr32(dist);
	return (i + i) + ((dist >> (i - 1)) & 1);
}


#else

// Table-based version (the default). The table directly covers all
// distances below 1 << FASTPOS_BITS; larger distances are reduced by
// shifting before the lookup and the shifted-out bit count is added
// back into the slot (each halving of dist lowers the slot by two).
#define FASTPOS_BITS 13

// Precalculated slot table, defined in fastpos_table.c.
extern const uint8_t lzma_fastpos[1 << FASTPOS_BITS];


// Right-shift amount for the n'th reduction step with `extra`
// already-known low bits.
#define fastpos_shift(extra, n) \
	((extra) + (n) * (FASTPOS_BITS - 1))

// Largest distance (exclusive) that the n'th reduction step can handle.
#define fastpos_limit(extra, n) \
	(UINT32_C(1) << (FASTPOS_BITS + fastpos_shift(extra, n)))

// Slot for a distance handled at the n'th reduction step. The whole
// expansion is parenthesized so the macro composes safely in any
// surrounding expression (the original omitted the outer parentheses).
#define fastpos_result(dist, extra, n) \
	(lzma_fastpos[(dist) >> fastpos_shift(extra, n)] \
			+ 2 * fastpos_shift(extra, n))


/// Return the six-bit dist slot [0, 63] for any 32-bit match distance.
static inline uint32_t
get_dist_slot(uint32_t dist)
{
	// If it is small enough, we can pick the result directly from
	// the precalculated table.
	if (dist < fastpos_limit(0, 0))
		return lzma_fastpos[dist];

	if (dist < fastpos_limit(0, 1))
		return fastpos_result(dist, 0, 1);

	return fastpos_result(dist, 0, 2);
}


#ifdef FULL_DISTANCES_BITS
/// Like get_dist_slot() but assumes dist >= FULL_DISTANCES, which lets
/// the first table lookup start FULL_DISTANCES_BITS - 1 bits in, saving
/// one comparison on average.
static inline uint32_t
get_dist_slot_2(uint32_t dist)
{
	assert(dist >= FULL_DISTANCES);

	if (dist < fastpos_limit(FULL_DISTANCES_BITS - 1, 0))
		return fastpos_result(dist, FULL_DISTANCES_BITS - 1, 0);

	if (dist < fastpos_limit(FULL_DISTANCES_BITS - 1, 1))
		return fastpos_result(dist, FULL_DISTANCES_BITS - 1, 1);

	return fastpos_result(dist, FULL_DISTANCES_BITS - 1, 2);
}
#endif

#endif

#endif