1278307Srpaulo/** 2278307Srpaulo * \file lzma/lzma12.h 3278307Srpaulo * \brief LZMA1 and LZMA2 filters 4278307Srpaulo */ 5278307Srpaulo 6278307Srpaulo/* 7278307Srpaulo * Author: Lasse Collin 8278307Srpaulo * 9278307Srpaulo * This file has been put into the public domain. 10278307Srpaulo * You can do whatever you want with this file. 11278307Srpaulo * 12278307Srpaulo * See ../lzma.h for information about liblzma as a whole. 13278307Srpaulo */ 14278307Srpaulo 15278307Srpaulo#ifndef LZMA_H_INTERNAL 16278307Srpaulo# error Never include this file directly. Use <lzma.h> instead. 17278307Srpaulo#endif 18278307Srpaulo 19278307Srpaulo 20278307Srpaulo/** 21278307Srpaulo * \brief LZMA1 Filter ID 22278307Srpaulo * 23278307Srpaulo * LZMA1 is the very same thing as what was called just LZMA in LZMA Utils, 24278307Srpaulo * 7-Zip, and LZMA SDK. It's called LZMA1 here to prevent developers from 25278307Srpaulo * accidentally using LZMA when they actually want LZMA2. 26278307Srpaulo * 27278307Srpaulo * LZMA1 shouldn't be used for new applications unless you _really_ know 28278307Srpaulo * what you are doing. LZMA2 is almost always a better choice. 29278307Srpaulo */ 30278307Srpaulo#define LZMA_FILTER_LZMA1 LZMA_VLI_C(0x4000000000000001) 31278307Srpaulo 32278307Srpaulo/** 33278307Srpaulo * \brief LZMA2 Filter ID 34278307Srpaulo * 35278307Srpaulo * Usually you want this instead of LZMA1. Compared to LZMA1, LZMA2 adds 36278307Srpaulo * support for LZMA_SYNC_FLUSH, uncompressed chunks (smaller expansion 37278307Srpaulo * when trying to compress uncompressible data), possibility to change 38278307Srpaulo * lc/lp/pb in the middle of encoding, and some other internal improvements. 39278307Srpaulo */ 40278307Srpaulo#define LZMA_FILTER_LZMA2 LZMA_VLI_C(0x21) 41278307Srpaulo 42278307Srpaulo 43278307Srpaulo/** 44278307Srpaulo * \brief Match finders 45278307Srpaulo * 46278307Srpaulo * Match finder has major effect on both speed and compression ratio. 47278307Srpaulo * Usually hash chains are faster than binary trees. 48278307Srpaulo * 49278307Srpaulo * If you will use LZMA_SYNC_FLUSH often, the hash chains may be a better 50278307Srpaulo * choice, because binary trees get much higher compression ratio penalty 51278307Srpaulo * with LZMA_SYNC_FLUSH. 52278307Srpaulo * 53278307Srpaulo * The memory usage formulas are only rough estimates, which are closest to 54278307Srpaulo * reality when dict_size is a power of two. The formulas are more complex 55278307Srpaulo * in reality, and can also change a little between liblzma versions. Use 56278307Srpaulo * lzma_raw_encoder_memusage() to get more accurate estimate of memory usage. 57278307Srpaulo */ 58278307Srpaulotypedef enum { 59278307Srpaulo LZMA_MF_HC3 = 0x03, 60278307Srpaulo /**< 61278307Srpaulo * \brief Hash Chain with 2- and 3-byte hashing 62278307Srpaulo * 63278307Srpaulo * Minimum nice_len: 3 64278307Srpaulo * 65278307Srpaulo * Memory usage: 66278307Srpaulo * - dict_size <= 16 MiB: dict_size * 7.5 67278307Srpaulo * - dict_size > 16 MiB: dict_size * 5.5 + 64 MiB 68278307Srpaulo */ 69278307Srpaulo 70278307Srpaulo LZMA_MF_HC4 = 0x04, 71278307Srpaulo /**< 72278307Srpaulo * \brief Hash Chain with 2-, 3-, and 4-byte hashing 73278307Srpaulo * 74278307Srpaulo * Minimum nice_len: 4 75278307Srpaulo * 76278307Srpaulo * Memory usage: 77278307Srpaulo * - dict_size <= 32 MiB: dict_size * 7.5 78278307Srpaulo * - dict_size > 32 MiB: dict_size * 6.5 79278307Srpaulo */ 80278307Srpaulo 81278307Srpaulo LZMA_MF_BT2 = 0x12, 82278307Srpaulo /**< 83278307Srpaulo * \brief Binary Tree with 2-byte hashing 84278307Srpaulo * 85278307Srpaulo * Minimum nice_len: 2 86278307Srpaulo * 87278307Srpaulo * Memory usage: dict_size * 9.5 88278307Srpaulo */ 89278307Srpaulo 90278307Srpaulo LZMA_MF_BT3 = 0x13, 91278307Srpaulo /**< 92278307Srpaulo * \brief Binary Tree with 2- and 3-byte hashing 93278307Srpaulo * 94278307Srpaulo * Minimum nice_len: 3 95278307Srpaulo * 96278307Srpaulo * Memory usage: 97278307Srpaulo * - dict_size <= 16 MiB: dict_size * 11.5 98278307Srpaulo * - dict_size > 16 MiB: dict_size * 9.5 + 64 MiB 99278307Srpaulo */ 100278307Srpaulo 101278307Srpaulo LZMA_MF_BT4 = 0x14 102278307Srpaulo /**< 103278307Srpaulo * \brief Binary Tree with 2-, 3-, and 4-byte hashing 104278307Srpaulo * 105278307Srpaulo * Minimum nice_len: 4 106278307Srpaulo * 107278307Srpaulo * Memory usage: 108278307Srpaulo * - dict_size <= 32 MiB: dict_size * 11.5 109278307Srpaulo * - dict_size > 32 MiB: dict_size * 10.5 110278307Srpaulo */ 111278307Srpaulo} lzma_match_finder; 112278307Srpaulo 113278307Srpaulo 114278307Srpaulo/** 115278307Srpaulo * \brief Test if given match finder is supported 116278307Srpaulo * 117278307Srpaulo * Return true if the given match finder is supported by this liblzma build. 118278307Srpaulo * Otherwise false is returned. It is safe to call this with a value that 119278307Srpaulo * isn't listed in lzma_match_finder enumeration; the return value will be 120278307Srpaulo * false. 121278307Srpaulo * 122278307Srpaulo * There is no way to list which match finders are available in this 123278307Srpaulo * particular liblzma version and build. It would be useless, because 124278307Srpaulo * a new match finder, which the application developer wasn't aware, 125278307Srpaulo * could require giving additional options to the encoder that the older 126278307Srpaulo * match finders don't need. 127278307Srpaulo */ 128278307Srpauloextern LZMA_API(lzma_bool) lzma_mf_is_supported(lzma_match_finder match_finder) 129278307Srpaulo lzma_nothrow lzma_attr_const; 130278307Srpaulo 131278307Srpaulo 132278307Srpaulo/** 133278307Srpaulo * \brief Compression modes 134278307Srpaulo * 135278307Srpaulo * This selects the function used to analyze the data produced by the match 136278307Srpaulo * finder. 137278307Srpaulo */ 138278307Srpaulotypedef enum { 139278307Srpaulo LZMA_MODE_FAST = 1, 140278307Srpaulo /**< 141278307Srpaulo * \brief Fast compression 142278307Srpaulo * 143278307Srpaulo * Fast mode is usually at its best when combined with 144278307Srpaulo * a hash chain match finder. 145278307Srpaulo */ 146278307Srpaulo 147278307Srpaulo LZMA_MODE_NORMAL = 2 148278307Srpaulo /**< 149278307Srpaulo * \brief Normal compression 150278307Srpaulo * 151278307Srpaulo * This is usually notably slower than fast mode. Use this 152278307Srpaulo * together with binary tree match finders to expose the 153278307Srpaulo * full potential of the LZMA1 or LZMA2 encoder. 154278307Srpaulo */ 155278307Srpaulo} lzma_mode; 156278307Srpaulo 157278307Srpaulo 158278307Srpaulo/** 159278307Srpaulo * \brief Test if given compression mode is supported 160278307Srpaulo * 161278307Srpaulo * Return true if the given compression mode is supported by this liblzma 162278307Srpaulo * build. Otherwise false is returned. It is safe to call this with a value 163278307Srpaulo * that isn't listed in lzma_mode enumeration; the return value will be false. 164278307Srpaulo * 165278307Srpaulo * There is no way to list which modes are available in this particular 166278307Srpaulo * liblzma version and build. It would be useless, because a new compression 167278307Srpaulo * mode, which the application developer wasn't aware, could require giving 168278307Srpaulo * additional options to the encoder that the older modes don't need. 169278307Srpaulo */ 170278307Srpauloextern LZMA_API(lzma_bool) lzma_mode_is_supported(lzma_mode mode) 171278307Srpaulo lzma_nothrow lzma_attr_const; 172278307Srpaulo 173278307Srpaulo 174278307Srpaulo/** 175278307Srpaulo * \brief Options specific to the LZMA1 and LZMA2 filters 176278307Srpaulo * 177278307Srpaulo * Since LZMA1 and LZMA2 share most of the code, it's simplest to share 178278307Srpaulo * the options structure too. For encoding, all but the reserved variables 179278307Srpaulo * need to be initialized unless specifically mentioned otherwise. 180278307Srpaulo * lzma_lzma_preset() can be used to get a good starting point. 181278307Srpaulo * 182278307Srpaulo * For raw decoding, both LZMA1 and LZMA2 need dict_size, preset_dict, and 183278307Srpaulo * preset_dict_size (if preset_dict != NULL). LZMA1 needs also lc, lp, and pb. 184278307Srpaulo */ 185278307Srpaulotypedef struct { 186278307Srpaulo /** 187278307Srpaulo * \brief Dictionary size in bytes 188278307Srpaulo * 189278307Srpaulo * Dictionary size indicates how many bytes of the recently processed 190278307Srpaulo * uncompressed data is kept in memory. One method to reduce size of 191278307Srpaulo * the uncompressed data is to store distance-length pairs, which 192278307Srpaulo * indicate what data to repeat from the dictionary buffer. Thus, 193278307Srpaulo * the bigger the dictionary, the better the compression ratio 194278307Srpaulo * usually is. 195278307Srpaulo * 196278307Srpaulo * Maximum size of the dictionary depends on multiple things: 197278307Srpaulo * - Memory usage limit 198278307Srpaulo * - Available address space (not a problem on 64-bit systems) 199278307Srpaulo * - Selected match finder (encoder only) 200278307Srpaulo * 201278307Srpaulo * Currently the maximum dictionary size for encoding is 1.5 GiB 202278307Srpaulo * (i.e. (UINT32_C(1) << 30) + (UINT32_C(1) << 29)) even on 64-bit 203278307Srpaulo * systems for certain match finder implementation reasons. In the 204278307Srpaulo * future, there may be match finders that support bigger 205278307Srpaulo * dictionaries. 206278307Srpaulo * 207278307Srpaulo * Decoder already supports dictionaries up to 4 GiB - 1 B (i.e. 208278307Srpaulo * UINT32_MAX), so increasing the maximum dictionary size of the 209278307Srpaulo * encoder won't cause problems for old decoders. 210278307Srpaulo * 211278307Srpaulo * Because extremely small dictionaries sizes would have unneeded 212278307Srpaulo * overhead in the decoder, the minimum dictionary size is 4096 bytes. 213278307Srpaulo * 214278307Srpaulo * \note When decoding, too big dictionary does no other harm 215278307Srpaulo * than wasting memory. 216278307Srpaulo */ 217278307Srpaulo uint32_t dict_size; 218278307Srpaulo# define LZMA_DICT_SIZE_MIN UINT32_C(4096) 219278307Srpaulo# define LZMA_DICT_SIZE_DEFAULT (UINT32_C(1) << 23) 220278307Srpaulo 221278307Srpaulo /** 222278307Srpaulo * \brief Pointer to an initial dictionary 223278307Srpaulo * 224278307Srpaulo * It is possible to initialize the LZ77 history window using 225278307Srpaulo * a preset dictionary. It is useful when compressing many 226278307Srpaulo * similar, relatively small chunks of data independently from 227278307Srpaulo * each other. The preset dictionary should contain typical 228278307Srpaulo * strings that occur in the files being compressed. The most 229278307Srpaulo * probable strings should be near the end of the preset dictionary. 230278307Srpaulo * 231278307Srpaulo * This feature should be used only in special situations. For 232278307Srpaulo * now, it works correctly only with raw encoding and decoding. 233278307Srpaulo * Currently none of the container formats supported by 234278307Srpaulo * liblzma allow preset dictionary when decoding, thus if 235278307Srpaulo * you create a .xz or .lzma file with preset dictionary, it 236278307Srpaulo * cannot be decoded with the regular decoder functions. In the 237278307Srpaulo * future, the .xz format will likely get support for preset 238278307Srpaulo * dictionary though. 239278307Srpaulo */ 240278307Srpaulo const uint8_t *preset_dict; 241278307Srpaulo 242278307Srpaulo /** 243278307Srpaulo * \brief Size of the preset dictionary 244278307Srpaulo * 245278307Srpaulo * Specifies the size of the preset dictionary. If the size is 246278307Srpaulo * bigger than dict_size, only the last dict_size bytes are 247278307Srpaulo * processed. 248278307Srpaulo * 249278307Srpaulo * This variable is read only when preset_dict is not NULL. 250278307Srpaulo * If preset_dict is not NULL but preset_dict_size is zero, 251278307Srpaulo * no preset dictionary is used (identical to only setting 252278307Srpaulo * preset_dict to NULL). 253278307Srpaulo */ 254278307Srpaulo uint32_t preset_dict_size; 255278307Srpaulo 256278307Srpaulo /** 257278307Srpaulo * \brief Number of literal context bits 258278307Srpaulo * 259278307Srpaulo * How many of the highest bits of the previous uncompressed 260278307Srpaulo * eight-bit byte (also known as `literal') are taken into 261278307Srpaulo * account when predicting the bits of the next literal. 262278307Srpaulo * 263278307Srpaulo * E.g. in typical English text, an upper-case letter is 264278307Srpaulo * often followed by a lower-case letter, and a lower-case 265278307Srpaulo * letter is usually followed by another lower-case letter. 266278307Srpaulo * In the US-ASCII character set, the highest three bits are 010 267278307Srpaulo * for upper-case letters and 011 for lower-case letters. 268278307Srpaulo * When lc is at least 3, the literal coding can take advantage of 269278307Srpaulo * this property in the uncompressed data. 270278307Srpaulo * 271278307Srpaulo * There is a limit that applies to literal context bits and literal 272278307Srpaulo * position bits together: lc + lp <= 4. Without this limit the 273278307Srpaulo * decoding could become very slow, which could have security related 274278307Srpaulo * results in some cases like email servers doing virus scanning. 275278307Srpaulo * This limit also simplifies the internal implementation in liblzma. 276278307Srpaulo * 277278307Srpaulo * There may be LZMA1 streams that have lc + lp > 4 (maximum possible 278278307Srpaulo * lc would be 8). It is not possible to decode such streams with 279278307Srpaulo * liblzma. 280278307Srpaulo */ 281278307Srpaulo uint32_t lc; 282278307Srpaulo# define LZMA_LCLP_MIN 0 283278307Srpaulo# define LZMA_LCLP_MAX 4 284278307Srpaulo# define LZMA_LC_DEFAULT 3 285278307Srpaulo 286278307Srpaulo /** 287278307Srpaulo * \brief Number of literal position bits 288278307Srpaulo * 289278307Srpaulo * lp affects what kind of alignment in the uncompressed data is 290278307Srpaulo * assumed when encoding literals. A literal is a single 8-bit byte. 291278307Srpaulo * See pb below for more information about alignment. 292278307Srpaulo */ 293278307Srpaulo uint32_t lp; 294278307Srpaulo# define LZMA_LP_DEFAULT 0 295278307Srpaulo 296278307Srpaulo /** 297278307Srpaulo * \brief Number of position bits 298278307Srpaulo * 299278307Srpaulo * pb affects what kind of alignment in the uncompressed data is 300278307Srpaulo * assumed in general. The default means four-byte alignment 301278307Srpaulo * (2^ pb =2^2=4), which is often a good choice when there's 302278307Srpaulo * no better guess. 303278307Srpaulo * 304360523Sdelphij * When the alignment is known, setting pb accordingly may reduce 305278307Srpaulo * the file size a little. E.g. with text files having one-byte 306278307Srpaulo * alignment (US-ASCII, ISO-8859-*, UTF-8), setting pb=0 can 307278307Srpaulo * improve compression slightly. For UTF-16 text, pb=1 is a good 308278307Srpaulo * choice. If the alignment is an odd number like 3 bytes, pb=0 309278307Srpaulo * might be the best choice. 310278307Srpaulo * 311278307Srpaulo * Even though the assumed alignment can be adjusted with pb and 312278307Srpaulo * lp, LZMA1 and LZMA2 still slightly favor 16-byte alignment. 313278307Srpaulo * It might be worth taking into account when designing file formats 314278307Srpaulo * that are likely to be often compressed with LZMA1 or LZMA2. 315278307Srpaulo */ 316278307Srpaulo uint32_t pb; 317278307Srpaulo# define LZMA_PB_MIN 0 318278307Srpaulo# define LZMA_PB_MAX 4 319278307Srpaulo# define LZMA_PB_DEFAULT 2 320278307Srpaulo 321278307Srpaulo /** Compression mode */ 322278307Srpaulo lzma_mode mode; 323278307Srpaulo 324278307Srpaulo /** 325278307Srpaulo * \brief Nice length of a match 326278307Srpaulo * 327278307Srpaulo * This determines how many bytes the encoder compares from the match 328278307Srpaulo * candidates when looking for the best match. Once a match of at 329278307Srpaulo * least nice_len bytes long is found, the encoder stops looking for 330278307Srpaulo * better candidates and encodes the match. (Naturally, if the found 331278307Srpaulo * match is actually longer than nice_len, the actual length is 332278307Srpaulo * encoded; it's not truncated to nice_len.) 333278307Srpaulo * 334278307Srpaulo * Bigger values usually increase the compression ratio and 335278307Srpaulo * compression time. For most files, 32 to 128 is a good value, 336278307Srpaulo * which gives very good compression ratio at good speed. 337278307Srpaulo * 338278307Srpaulo * The exact minimum value depends on the match finder. The maximum 339278307Srpaulo * is 273, which is the maximum length of a match that LZMA1 and 340278307Srpaulo * LZMA2 can encode. 341278307Srpaulo */ 342278307Srpaulo uint32_t nice_len; 343278307Srpaulo 344278307Srpaulo /** Match finder ID */ 345278307Srpaulo lzma_match_finder mf; 346278307Srpaulo 347278307Srpaulo /** 348278307Srpaulo * \brief Maximum search depth in the match finder 349278307Srpaulo * 350278307Srpaulo * For every input byte, match finder searches through the hash chain 351278307Srpaulo * or binary tree in a loop, each iteration going one step deeper in 352278307Srpaulo * the chain or tree. The searching stops if 353278307Srpaulo * - a match of at least nice_len bytes long is found; 354278307Srpaulo * - all match candidates from the hash chain or binary tree have 355278307Srpaulo * been checked; or 356278307Srpaulo * - maximum search depth is reached. 357278307Srpaulo * 358278307Srpaulo * Maximum search depth is needed to prevent the match finder from 359278307Srpaulo * wasting too much time in case there are lots of short match 360278307Srpaulo * candidates. On the other hand, stopping the search before all 361278307Srpaulo * candidates have been checked can reduce compression ratio. 362278307Srpaulo * 363278307Srpaulo * Setting depth to zero tells liblzma to use an automatic default 364278307Srpaulo * value, that depends on the selected match finder and nice_len. 365278307Srpaulo * The default is in the range [4, 200] or so (it may vary between 366278307Srpaulo * liblzma versions). 367278307Srpaulo * 368278307Srpaulo * Using a bigger depth value than the default can increase 369278307Srpaulo * compression ratio in some cases. There is no strict maximum value, 370278307Srpaulo * but high values (thousands or millions) should be used with care: 371278307Srpaulo * the encoder could remain fast enough with typical input, but 372278307Srpaulo * malicious input could cause the match finder to slow down 373278307Srpaulo * dramatically, possibly creating a denial of service attack. 374278307Srpaulo */ 375278307Srpaulo uint32_t depth; 376278307Srpaulo 377278307Srpaulo /* 378278307Srpaulo * Reserved space to allow possible future extensions without 379278307Srpaulo * breaking the ABI. You should not touch these, because the names 380278307Srpaulo * of these variables may change. These are and will never be used 381278307Srpaulo * with the currently supported options, so it is safe to leave these 382278307Srpaulo * uninitialized. 383278307Srpaulo */ 384278307Srpaulo uint32_t reserved_int1; 385278307Srpaulo uint32_t reserved_int2; 386278307Srpaulo uint32_t reserved_int3; 387278307Srpaulo uint32_t reserved_int4; 388278307Srpaulo uint32_t reserved_int5; 389278307Srpaulo uint32_t reserved_int6; 390278307Srpaulo uint32_t reserved_int7; 391278307Srpaulo uint32_t reserved_int8; 392278307Srpaulo lzma_reserved_enum reserved_enum1; 393278307Srpaulo lzma_reserved_enum reserved_enum2; 394278307Srpaulo lzma_reserved_enum reserved_enum3; 395278307Srpaulo lzma_reserved_enum reserved_enum4; 396278307Srpaulo void *reserved_ptr1; 397278307Srpaulo void *reserved_ptr2; 398278307Srpaulo 399278307Srpaulo} lzma_options_lzma; 400278307Srpaulo 401278307Srpaulo 402278307Srpaulo/** 403278307Srpaulo * \brief Set a compression preset to lzma_options_lzma structure 404278307Srpaulo * 405278307Srpaulo * 0 is the fastest and 9 is the slowest. These match the switches -0 .. -9 406278307Srpaulo * of the xz command line tool. In addition, it is possible to bitwise-or 407278307Srpaulo * flags to the preset. Currently only LZMA_PRESET_EXTREME is supported. 408278307Srpaulo * The flags are defined in container.h, because the flags are used also 409278307Srpaulo * with lzma_easy_encoder(). 410278307Srpaulo * 411278307Srpaulo * The preset values are subject to changes between liblzma versions. 412278307Srpaulo * 413278307Srpaulo * This function is available only if LZMA1 or LZMA2 encoder has been enabled 414278307Srpaulo * when building liblzma. 415278307Srpaulo * 416278307Srpaulo * \return On success, false is returned. If the preset is not 417278307Srpaulo * supported, true is returned. 418278307Srpaulo */ 419278307Srpauloextern LZMA_API(lzma_bool) lzma_lzma_preset( 420278307Srpaulo lzma_options_lzma *options, uint32_t preset) lzma_nothrow; 421