1/////////////////////////////////////////////////////////////////////////////// 2// 3/// \file fastpos.h 4/// \brief Kind of two-bit version of bit scan reverse 5/// 6// Authors: Igor Pavlov 7// Lasse Collin 8// 9// This file has been put into the public domain. 10// You can do whatever you want with this file. 11// 12/////////////////////////////////////////////////////////////////////////////// 13 14#ifndef LZMA_FASTPOS_H 15#define LZMA_FASTPOS_H 16 17// LZMA encodes match distances (positions) by storing the highest two 18// bits using a six-bit value [0, 63], and then the missing lower bits. 19// Dictionary size is also stored using this encoding in the new .lzma 20// file format header. 21// 22// fastpos.h provides a way to quickly find out the correct six-bit 23// values. The following table gives some examples of this encoding: 24// 25// pos return 26// 0 0 27// 1 1 28// 2 2 29// 3 3 30// 4 4 31// 5 4 32// 6 5 33// 7 5 34// 8 6 35// 11 6 36// 12 7 37// ... ... 38// 15 7 39// 16 8 40// 17 8 41// ... ... 42// 23 8 43// 24 9 44// 25 9 45// ... ... 46// 47// 48// Provided functions or macros 49// ---------------------------- 50// 51// get_pos_slot(pos) is the basic version. get_pos_slot_2(pos) 52// assumes that pos >= FULL_DISTANCES, thus the result is at least 53// FULL_DISTANCES_BITS * 2. Using get_pos_slot(pos) instead of 54// get_pos_slot_2(pos) would give the same result, but get_pos_slot_2(pos) 55// should be tiny bit faster due to the assumption being made. 56// 57// 58// Size vs. speed 59// -------------- 60// 61// With some CPUs that have fast BSR (bit scan reverse) instruction, the 62// size optimized version is slightly faster than the bigger table based 63// approach. Such CPUs include Intel Pentium Pro, Pentium II, Pentium III 64// and Core 2 (possibly others). AMD K7 seems to have slower BSR, but that 65// would still have speed roughly comparable to the table version. Older 66// x86 CPUs like the original Pentium have very slow BSR; on those systems 67// the table version is a lot faster. 68// 69// On some CPUs, the table version is a lot faster when using position 70// dependent code, but with position independent code the size optimized 71// version is slightly faster. This occurs at least on 32-bit SPARC (no 72// ASM optimizations). 73// 74// I'm making the table version the default, because that has good speed 75// on all systems I have tried. The size optimized version is sometimes 76// slightly faster, but sometimes it is a lot slower. 77 78#ifdef HAVE_SMALL 79# define get_pos_slot(pos) ((pos) <= 4 ? (pos) : get_pos_slot_2(pos)) 80 81static inline uint32_t 82get_pos_slot_2(uint32_t pos) 83{ 84 const uint32_t i = bsr32(pos); 85 return (i + i) + ((pos >> (i - 1)) & 1); 86} 87 88 89#else 90 91#define FASTPOS_BITS 13 92 93extern const uint8_t lzma_fastpos[1 << FASTPOS_BITS]; 94 95 96#define fastpos_shift(extra, n) \ 97 ((extra) + (n) * (FASTPOS_BITS - 1)) 98 99#define fastpos_limit(extra, n) \ 100 (UINT32_C(1) << (FASTPOS_BITS + fastpos_shift(extra, n))) 101 102#define fastpos_result(pos, extra, n) \ 103 lzma_fastpos[(pos) >> fastpos_shift(extra, n)] \ 104 + 2 * fastpos_shift(extra, n) 105 106 107static inline uint32_t 108get_pos_slot(uint32_t pos) 109{ 110 // If it is small enough, we can pick the result directly from 111 // the precalculated table. 112 if (pos < fastpos_limit(0, 0)) 113 return lzma_fastpos[pos]; 114 115 if (pos < fastpos_limit(0, 1)) 116 return fastpos_result(pos, 0, 1); 117 118 return fastpos_result(pos, 0, 2); 119} 120 121 122#ifdef FULL_DISTANCES_BITS 123static inline uint32_t 124get_pos_slot_2(uint32_t pos) 125{ 126 assert(pos >= FULL_DISTANCES); 127 128 if (pos < fastpos_limit(FULL_DISTANCES_BITS - 1, 0)) 129 return fastpos_result(pos, FULL_DISTANCES_BITS - 1, 0); 130 131 if (pos < fastpos_limit(FULL_DISTANCES_BITS - 1, 1)) 132 return fastpos_result(pos, FULL_DISTANCES_BITS - 1, 1); 133 134 return fastpos_result(pos, FULL_DISTANCES_BITS - 1, 2); 135} 136#endif 137 138#endif 139 140#endif 141