xz_lzma2.h revision 256281
1185377Ssam/* 2187831Ssam * LZMA2 definitions 3185377Ssam * 4185377Ssam * Authors: Lasse Collin <lasse.collin@tukaani.org> 5185377Ssam * Igor Pavlov <http://7-zip.org/> 6185377Ssam * 7185377Ssam * This file has been put into the public domain. 8185377Ssam * You can do whatever you want with this file. 9185377Ssam */ 10185377Ssam 11185377Ssam#ifndef XZ_LZMA2_H 12185377Ssam#define XZ_LZMA2_H 13185377Ssam 14185377Ssam/* Range coder constants */ 15185377Ssam#define RC_SHIFT_BITS 8 16185377Ssam#define RC_TOP_BITS 24 17185377Ssam#define RC_TOP_VALUE (1 << RC_TOP_BITS) 18187345Ssam#define RC_BIT_MODEL_TOTAL_BITS 11 19185377Ssam#define RC_BIT_MODEL_TOTAL (1 << RC_BIT_MODEL_TOTAL_BITS) 20185377Ssam#define RC_MOVE_BITS 5 21185377Ssam 22185377Ssam/* 23187831Ssam * Maximum number of position states. A position state is the lowest pb 24187831Ssam * number of bits of the current uncompressed offset. In some places there 25187831Ssam * are different sets of probabilities for different position states. 26187831Ssam */ 27185377Ssam#define POS_STATES_MAX (1 << 4) 28185377Ssam 29185377Ssam/* 30185377Ssam * This enum is used to track which LZMA symbols have occurred most recently 31219394Sadrian * and in which order. This information is used to predict the next symbol. 32219394Sadrian * 33185377Ssam * Symbols: 34185377Ssam * - Literal: One 8-bit byte 35185377Ssam * - Match: Repeat a chunk of data at some distance 36185377Ssam * - Long repeat: Multi-byte match at a recently seen distance 37185377Ssam * - Short repeat: One-byte repeat at a recently seen distance 38185377Ssam * 39185377Ssam * The symbol names are in from STATE_oldest_older_previous. REP means 40185377Ssam * either short or long repeated match, and NONLIT means any non-literal. 41185377Ssam */ 42185377Ssamenum lzma_state { 43185377Ssam STATE_LIT_LIT, 44185380Ssam STATE_MATCH_LIT_LIT, 45185380Ssam STATE_REP_LIT_LIT, 46185377Ssam STATE_SHORTREP_LIT_LIT, 47185377Ssam STATE_MATCH_LIT, 48185380Ssam STATE_REP_LIT, 49185380Ssam STATE_SHORTREP_LIT, 50185377Ssam STATE_LIT_MATCH, 51185377Ssam STATE_LIT_LONGREP, 52185377Ssam STATE_LIT_SHORTREP, 53185377Ssam STATE_NONLIT_MATCH, 54219442Sadrian STATE_NONLIT_REP 55185377Ssam}; 56185377Ssam 57185377Ssam/* Total number of states */ 58185377Ssam#define STATES 12 59185377Ssam 60185377Ssam/* The lowest 7 states indicate that the previous state was a literal. */ 61185377Ssam#define LIT_STATES 7 62185377Ssam 63219442Sadrian/* Indicate that the latest symbol was a literal. */ 64185377Ssamstatic inline void lzma_state_literal(enum lzma_state *state) 65185377Ssam{ 66185380Ssam if (*state <= STATE_SHORTREP_LIT_LIT) 67185377Ssam *state = STATE_LIT_LIT; 68185377Ssam else if (*state <= STATE_LIT_SHORTREP) 69185377Ssam *state -= 3; 70185377Ssam else 71185377Ssam *state -= 6; 72185377Ssam} 73185377Ssam 74185380Ssam/* Indicate that the latest symbol was a match. */ 75219442Sadrianstatic inline void lzma_state_match(enum lzma_state *state) 76219442Sadrian{ 77185380Ssam *state = *state < LIT_STATES ? STATE_LIT_MATCH : STATE_NONLIT_MATCH; 78219442Sadrian} 79185380Ssam 80185377Ssam/* Indicate that the latest state was a long repeated match. */ 81219442Sadrianstatic inline void lzma_state_long_rep(enum lzma_state *state) 82219442Sadrian{ 83219442Sadrian *state = *state < LIT_STATES ? STATE_LIT_LONGREP : STATE_NONLIT_REP; 84219442Sadrian} 85219442Sadrian 86219442Sadrian/* Indicate that the latest symbol was a short match. */ 87219442Sadrianstatic inline void lzma_state_short_rep(enum lzma_state *state) 88219442Sadrian{ 89219442Sadrian *state = *state < LIT_STATES ? STATE_LIT_SHORTREP : STATE_NONLIT_REP; 90185377Ssam} 91219442Sadrian 92185377Ssam/* Test if the previous symbol was a literal. */ 93185377Ssamstatic inline bool lzma_state_is_literal(enum lzma_state state) 94219442Sadrian{ 95219442Sadrian return state < LIT_STATES; 96219442Sadrian} 97219442Sadrian 98185377Ssam/* Each literal coder is divided in three sections: 99219442Sadrian * - 0x001-0x0FF: Without match byte 100185377Ssam * - 0x101-0x1FF: With match byte; match bit is 0 101185377Ssam * - 0x201-0x2FF: With match byte; match bit is 1 102187831Ssam * 103187831Ssam * Match byte is used when the previous LZMA symbol was something else than 104187831Ssam * a literal (that is, it was some kind of match). 105187831Ssam */ 106187831Ssam#define LITERAL_CODER_SIZE 0x300 107187831Ssam 108187831Ssam/* Maximum number of literal coders */ 109187831Ssam#define LITERAL_CODERS_MAX (1 << 4) 110187831Ssam 111187831Ssam/* Minimum length of a match is two bytes. */ 112187831Ssam#define MATCH_LEN_MIN 2 113187831Ssam 114187831Ssam/* Match length is encoded with 4, 5, or 10 bits. 115187831Ssam * 116188264Ssam * Length Bits 117187831Ssam * 2-9 4 = Choice=0 + 3 bits 118188264Ssam * 10-17 5 = Choice=1 + Choice2=0 + 3 bits 119187831Ssam * 18-273 10 = Choice=1 + Choice2=1 + 8 bits 120188264Ssam */ 121188264Ssam#define LEN_LOW_BITS 3 122187831Ssam#define LEN_LOW_SYMBOLS (1 << LEN_LOW_BITS) 123188264Ssam#define LEN_MID_BITS 3 124187831Ssam#define LEN_MID_SYMBOLS (1 << LEN_MID_BITS) 125188264Ssam#define LEN_HIGH_BITS 8 126185377Ssam#define LEN_HIGH_SYMBOLS (1 << LEN_HIGH_BITS) 127185377Ssam#define LEN_SYMBOLS (LEN_LOW_SYMBOLS + LEN_MID_SYMBOLS + LEN_HIGH_SYMBOLS) 128224718Sadrian 129224718Sadrian/* 130187831Ssam * Maximum length of a match is 273 which is a result of the encoding 131185377Ssam * described above. 132185377Ssam */ 133185377Ssam#define MATCH_LEN_MAX (MATCH_LEN_MIN + LEN_SYMBOLS - 1) 134185377Ssam 135185377Ssam/* 136185377Ssam * Different sets of probabilities are used for match distances that have 137185377Ssam * very short match length: Lengths of 2, 3, and 4 bytes have a separate 138185377Ssam * set of probabilities for each length. The matches with longer length 139185377Ssam * use a shared set of probabilities. 140185380Ssam */ 141185377Ssam#define DIST_STATES 4 142185377Ssam 143185377Ssam/* 144185377Ssam * Get the index of the appropriate probability array for decoding 145185377Ssam * the distance slot. 146185377Ssam */ 147185377Ssamstatic inline uint32_t lzma_get_dist_state(uint32_t len) 148185377Ssam{ 149185377Ssam return len < DIST_STATES + MATCH_LEN_MIN 150185377Ssam ? len - MATCH_LEN_MIN : DIST_STATES - 1; 151185377Ssam} 152185377Ssam 153185377Ssam/* 154185377Ssam * The highest two bits of a 32-bit match distance are encoded using six bits. 155185377Ssam * This six-bit value is called a distance slot. This way encoding a 32-bit 156185377Ssam * value takes 6-36 bits, larger values taking more bits. 157185377Ssam */ 158185377Ssam#define DIST_SLOT_BITS 6 159185377Ssam#define DIST_SLOTS (1 << DIST_SLOT_BITS) 160185377Ssam 161185377Ssam/* Match distances up to 127 are fully encoded using probabilities. Since 162185377Ssam * the highest two bits (distance slot) are always encoded using six bits, 163185377Ssam * the distances 0-3 don't need any additional bits to encode, since the 164185377Ssam * distance slot itself is the same as the actual distance. DIST_MODEL_START 165185377Ssam * indicates the first distance slot where at least one additional bit is 166185377Ssam * needed. 167185377Ssam */ 168185377Ssam#define DIST_MODEL_START 4 169185377Ssam 170185377Ssam/* 171185377Ssam * Match distances greater than 127 are encoded in three pieces: 172223466Sadrian * - distance slot: the highest two bits 173185377Ssam * - direct bits: 2-26 bits below the highest two bits 174185377Ssam * - alignment bits: four lowest bits 175185377Ssam * 176185377Ssam * Direct bits don't use any probabilities. 177185377Ssam * 178187831Ssam * The distance slot value of 14 is for distances 128-191. 179187831Ssam */ 180185377Ssam#define DIST_MODEL_END 14 181187831Ssam 182187831Ssam/* Distance slots that indicate a distance <= 127. */ 183185377Ssam#define FULL_DISTANCES_BITS (DIST_MODEL_END / 2) 184187831Ssam#define FULL_DISTANCES (1 << FULL_DISTANCES_BITS) 185185377Ssam 186187831Ssam/* 187187831Ssam * For match distances greater than 127, only the highest two bits and the 188187831Ssam * lowest four bits (alignment) is encoded using probabilities. 189185377Ssam */ 190187831Ssam#define ALIGN_BITS 4 191185377Ssam#define ALIGN_SIZE (1 << ALIGN_BITS) 192185377Ssam#define ALIGN_MASK (ALIGN_SIZE - 1) 193187831Ssam 194187831Ssam/* Total number of all probability variables */ 195185377Ssam#define PROBS_TOTAL (1846 + LITERAL_CODERS_MAX * LITERAL_CODER_SIZE) 196187831Ssam 197185377Ssam/* 198187831Ssam * LZMA remembers the four most recent match distances. Reusing these 199187831Ssam * distances tends to take less space than re-encoding the actual 200187831Ssam * distance value. 201185380Ssam */ 202187831Ssam#define REPS 4 203185377Ssam 204185377Ssam#endif 205187831Ssam