1207753Smm/** 2207753Smm * \file lzma/lzma.h 3207753Smm * \brief LZMA1 and LZMA2 filters 4207753Smm */ 5207753Smm 6207753Smm/* 7207753Smm * Author: Lasse Collin 8207753Smm * 9207753Smm * This file has been put into the public domain. 10207753Smm * You can do whatever you want with this file. 11207753Smm * 12207753Smm * See ../lzma.h for information about liblzma as a whole. 13207753Smm */ 14207753Smm 15207753Smm#ifndef LZMA_H_INTERNAL 16207753Smm# error Never include this file directly. Use <lzma.h> instead. 17207753Smm#endif 18207753Smm 19207753Smm 20207753Smm/** 21207753Smm * \brief LZMA1 Filter ID 22207753Smm * 23207753Smm * LZMA1 is the very same thing as what was called just LZMA in LZMA Utils, 24207753Smm * 7-Zip, and LZMA SDK. It's called LZMA1 here to prevent developers from 25207753Smm * accidentally using LZMA when they actually want LZMA2. 26207753Smm * 27207753Smm * LZMA1 shouldn't be used for new applications unless you _really_ know 28207753Smm * what you are doing. LZMA2 is almost always a better choice. 29207753Smm */ 30207753Smm#define LZMA_FILTER_LZMA1 LZMA_VLI_C(0x4000000000000001) 31207753Smm 32207753Smm/** 33207753Smm * \brief LZMA2 Filter ID 34207753Smm * 35207753Smm * Usually you want this instead of LZMA1. Compared to LZMA1, LZMA2 adds 36207753Smm * support for LZMA_SYNC_FLUSH, uncompressed chunks (smaller expansion 37207753Smm * when trying to compress uncompressible data), possibility to change 38207753Smm * lc/lp/pb in the middle of encoding, and some other internal improvements. 39207753Smm */ 40207753Smm#define LZMA_FILTER_LZMA2 LZMA_VLI_C(0x21) 41207753Smm 42207753Smm 43207753Smm/** 44207753Smm * \brief Match finders 45207753Smm * 46207753Smm * Match finder has major effect on both speed and compression ratio. 47207753Smm * Usually hash chains are faster than binary trees. 48207753Smm * 49215187Smm * If you will use LZMA_SYNC_FLUSH often, the hash chains may be a better 50215187Smm * choice, because binary trees get much higher compression ratio penalty 51215187Smm * with LZMA_SYNC_FLUSH. 52215187Smm * 53207753Smm * The memory usage formulas are only rough estimates, which are closest to 54207753Smm * reality when dict_size is a power of two. The formulas are more complex 55207753Smm * in reality, and can also change a little between liblzma versions. Use 56213700Smm * lzma_raw_encoder_memusage() to get more accurate estimate of memory usage. 57207753Smm */ 58207753Smmtypedef enum { 59207753Smm LZMA_MF_HC3 = 0x03, 60207753Smm /**< 61207753Smm * \brief Hash Chain with 2- and 3-byte hashing 62207753Smm * 63207753Smm * Minimum nice_len: 3 64207753Smm * 65207753Smm * Memory usage: 66207753Smm * - dict_size <= 16 MiB: dict_size * 7.5 67207753Smm * - dict_size > 16 MiB: dict_size * 5.5 + 64 MiB 68207753Smm */ 69207753Smm 70207753Smm LZMA_MF_HC4 = 0x04, 71207753Smm /**< 72207753Smm * \brief Hash Chain with 2-, 3-, and 4-byte hashing 73207753Smm * 74207753Smm * Minimum nice_len: 4 75207753Smm * 76213700Smm * Memory usage: 77213700Smm * - dict_size <= 32 MiB: dict_size * 7.5 78213700Smm * - dict_size > 32 MiB: dict_size * 6.5 79207753Smm */ 80207753Smm 81207753Smm LZMA_MF_BT2 = 0x12, 82207753Smm /**< 83207753Smm * \brief Binary Tree with 2-byte hashing 84207753Smm * 85207753Smm * Minimum nice_len: 2 86207753Smm * 87207753Smm * Memory usage: dict_size * 9.5 88207753Smm */ 89207753Smm 90207753Smm LZMA_MF_BT3 = 0x13, 91207753Smm /**< 92207753Smm * \brief Binary Tree with 2- and 3-byte hashing 93207753Smm * 94207753Smm * Minimum nice_len: 3 95207753Smm * 96207753Smm * Memory usage: 97207753Smm * - dict_size <= 16 MiB: dict_size * 11.5 98207753Smm * - dict_size > 16 MiB: dict_size * 9.5 + 64 MiB 99207753Smm */ 100207753Smm 101207753Smm LZMA_MF_BT4 = 0x14 102207753Smm /**< 103207753Smm * \brief Binary Tree with 2-, 3-, and 4-byte hashing 104207753Smm * 105207753Smm * Minimum nice_len: 4 106207753Smm * 107213700Smm * Memory usage: 108213700Smm * - dict_size <= 32 MiB: dict_size * 11.5 109213700Smm * - dict_size > 32 MiB: dict_size * 10.5 110207753Smm */ 111207753Smm} lzma_match_finder; 112207753Smm 113207753Smm 114207753Smm/** 115207753Smm * \brief Test if given match finder is supported 116207753Smm * 117207753Smm * Return true if the given match finder is supported by this liblzma build. 118207753Smm * Otherwise false is returned. It is safe to call this with a value that 119207753Smm * isn't listed in lzma_match_finder enumeration; the return value will be 120207753Smm * false. 121207753Smm * 122207753Smm * There is no way to list which match finders are available in this 123207753Smm * particular liblzma version and build. It would be useless, because 124207753Smm * a new match finder, which the application developer wasn't aware, 125207753Smm * could require giving additional options to the encoder that the older 126207753Smm * match finders don't need. 127207753Smm */ 128207753Smmextern LZMA_API(lzma_bool) lzma_mf_is_supported(lzma_match_finder match_finder) 129207753Smm lzma_nothrow lzma_attr_const; 130207753Smm 131207753Smm 132207753Smm/** 133207753Smm * \brief Compression modes 134207753Smm * 135207753Smm * This selects the function used to analyze the data produced by the match 136207753Smm * finder. 137207753Smm */ 138207753Smmtypedef enum { 139207753Smm LZMA_MODE_FAST = 1, 140207753Smm /**< 141207753Smm * \brief Fast compression 142207753Smm * 143207753Smm * Fast mode is usually at its best when combined with 144207753Smm * a hash chain match finder. 145207753Smm */ 146207753Smm 147207753Smm LZMA_MODE_NORMAL = 2 148207753Smm /**< 149207753Smm * \brief Normal compression 150207753Smm * 151207753Smm * This is usually notably slower than fast mode. Use this 152207753Smm * together with binary tree match finders to expose the 153207753Smm * full potential of the LZMA1 or LZMA2 encoder. 154207753Smm */ 155207753Smm} lzma_mode; 156207753Smm 157207753Smm 158207753Smm/** 159207753Smm * \brief Test if given compression mode is supported 160207753Smm * 161207753Smm * Return true if the given compression mode is supported by this liblzma 162207753Smm * build. Otherwise false is returned. It is safe to call this with a value 163207753Smm * that isn't listed in lzma_mode enumeration; the return value will be false. 164207753Smm * 165207753Smm * There is no way to list which modes are available in this particular 166207753Smm * liblzma version and build. It would be useless, because a new compression 167207753Smm * mode, which the application developer wasn't aware, could require giving 168207753Smm * additional options to the encoder that the older modes don't need. 169207753Smm */ 170207753Smmextern LZMA_API(lzma_bool) lzma_mode_is_supported(lzma_mode mode) 171207753Smm lzma_nothrow lzma_attr_const; 172207753Smm 173207753Smm 174207753Smm/** 175207753Smm * \brief Options specific to the LZMA1 and LZMA2 filters 176207753Smm * 177207753Smm * Since LZMA1 and LZMA2 share most of the code, it's simplest to share 178207753Smm * the options structure too. For encoding, all but the reserved variables 179207753Smm * need to be initialized unless specifically mentioned otherwise. 180215187Smm * lzma_lzma_preset() can be used to get a good starting point. 181207753Smm * 182207753Smm * For raw decoding, both LZMA1 and LZMA2 need dict_size, preset_dict, and 183207753Smm * preset_dict_size (if preset_dict != NULL). LZMA1 needs also lc, lp, and pb. 184207753Smm */ 185207753Smmtypedef struct { 186207753Smm /** 187207753Smm * \brief Dictionary size in bytes 188207753Smm * 189207753Smm * Dictionary size indicates how many bytes of the recently processed 190207753Smm * uncompressed data is kept in memory. One method to reduce size of 191207753Smm * the uncompressed data is to store distance-length pairs, which 192207753Smm * indicate what data to repeat from the dictionary buffer. Thus, 193207753Smm * the bigger the dictionary, the better the compression ratio 194207753Smm * usually is. 195207753Smm * 196207753Smm * Maximum size of the dictionary depends on multiple things: 197207753Smm * - Memory usage limit 198207753Smm * - Available address space (not a problem on 64-bit systems) 199207753Smm * - Selected match finder (encoder only) 200207753Smm * 201207753Smm * Currently the maximum dictionary size for encoding is 1.5 GiB 202207753Smm * (i.e. (UINT32_C(1) << 30) + (UINT32_C(1) << 29)) even on 64-bit 203207753Smm * systems for certain match finder implementation reasons. In the 204207753Smm * future, there may be match finders that support bigger 205207753Smm * dictionaries. 206207753Smm * 207207753Smm * Decoder already supports dictionaries up to 4 GiB - 1 B (i.e. 208207753Smm * UINT32_MAX), so increasing the maximum dictionary size of the 209207753Smm * encoder won't cause problems for old decoders. 210207753Smm * 211207753Smm * Because extremely small dictionaries sizes would have unneeded 212207753Smm * overhead in the decoder, the minimum dictionary size is 4096 bytes. 213207753Smm * 214207753Smm * \note When decoding, too big dictionary does no other harm 215207753Smm * than wasting memory. 216207753Smm */ 217207753Smm uint32_t dict_size; 218207753Smm# define LZMA_DICT_SIZE_MIN UINT32_C(4096) 219207753Smm# define LZMA_DICT_SIZE_DEFAULT (UINT32_C(1) << 23) 220207753Smm 221207753Smm /** 222207753Smm * \brief Pointer to an initial dictionary 223207753Smm * 224207753Smm * It is possible to initialize the LZ77 history window using 225207753Smm * a preset dictionary. It is useful when compressing many 226207753Smm * similar, relatively small chunks of data independently from 227207753Smm * each other. The preset dictionary should contain typical 228207753Smm * strings that occur in the files being compressed. The most 229207753Smm * probable strings should be near the end of the preset dictionary. 230207753Smm * 231207753Smm * This feature should be used only in special situations. For 232207753Smm * now, it works correctly only with raw encoding and decoding. 233207753Smm * Currently none of the container formats supported by 234207753Smm * liblzma allow preset dictionary when decoding, thus if 235207753Smm * you create a .xz or .lzma file with preset dictionary, it 236207753Smm * cannot be decoded with the regular decoder functions. In the 237207753Smm * future, the .xz format will likely get support for preset 238207753Smm * dictionary though. 239207753Smm */ 240207753Smm const uint8_t *preset_dict; 241207753Smm 242207753Smm /** 243207753Smm * \brief Size of the preset dictionary 244207753Smm * 245207753Smm * Specifies the size of the preset dictionary. If the size is 246207753Smm * bigger than dict_size, only the last dict_size bytes are 247207753Smm * processed. 248207753Smm * 249207753Smm * This variable is read only when preset_dict is not NULL. 250207753Smm * If preset_dict is not NULL but preset_dict_size is zero, 251207753Smm * no preset dictionary is used (identical to only setting 252207753Smm * preset_dict to NULL). 253207753Smm */ 254207753Smm uint32_t preset_dict_size; 255207753Smm 256207753Smm /** 257207753Smm * \brief Number of literal context bits 258207753Smm * 259207753Smm * How many of the highest bits of the previous uncompressed 260207753Smm * eight-bit byte (also known as `literal') are taken into 261207753Smm * account when predicting the bits of the next literal. 262207753Smm * 263215187Smm * E.g. in typical English text, an upper-case letter is 264215187Smm * often followed by a lower-case letter, and a lower-case 265215187Smm * letter is usually followed by another lower-case letter. 266215187Smm * In the US-ASCII character set, the highest three bits are 010 267215187Smm * for upper-case letters and 011 for lower-case letters. 268215187Smm * When lc is at least 3, the literal coding can take advantage of 269215187Smm * this property in the uncompressed data. 270207753Smm * 271207753Smm * There is a limit that applies to literal context bits and literal 272207753Smm * position bits together: lc + lp <= 4. Without this limit the 273207753Smm * decoding could become very slow, which could have security related 274207753Smm * results in some cases like email servers doing virus scanning. 275207753Smm * This limit also simplifies the internal implementation in liblzma. 276207753Smm * 277207753Smm * There may be LZMA1 streams that have lc + lp > 4 (maximum possible 278207753Smm * lc would be 8). It is not possible to decode such streams with 279207753Smm * liblzma. 280207753Smm */ 281207753Smm uint32_t lc; 282207753Smm# define LZMA_LCLP_MIN 0 283207753Smm# define LZMA_LCLP_MAX 4 284207753Smm# define LZMA_LC_DEFAULT 3 285207753Smm 286207753Smm /** 287207753Smm * \brief Number of literal position bits 288207753Smm * 289215187Smm * lp affects what kind of alignment in the uncompressed data is 290215187Smm * assumed when encoding literals. A literal is a single 8-bit byte. 291215187Smm * See pb below for more information about alignment. 292207753Smm */ 293207753Smm uint32_t lp; 294207753Smm# define LZMA_LP_DEFAULT 0 295207753Smm 296207753Smm /** 297207753Smm * \brief Number of position bits 298207753Smm * 299215187Smm * pb affects what kind of alignment in the uncompressed data is 300215187Smm * assumed in general. The default means four-byte alignment 301215187Smm * (2^ pb =2^2=4), which is often a good choice when there's 302215187Smm * no better guess. 303207753Smm * 304215187Smm * When the aligment is known, setting pb accordingly may reduce 305215187Smm * the file size a little. E.g. with text files having one-byte 306215187Smm * alignment (US-ASCII, ISO-8859-*, UTF-8), setting pb=0 can 307215187Smm * improve compression slightly. For UTF-16 text, pb=1 is a good 308215187Smm * choice. If the alignment is an odd number like 3 bytes, pb=0 309215187Smm * might be the best choice. 310215187Smm * 311215187Smm * Even though the assumed alignment can be adjusted with pb and 312215187Smm * lp, LZMA1 and LZMA2 still slightly favor 16-byte alignment. 313215187Smm * It might be worth taking into account when designing file formats 314215187Smm * that are likely to be often compressed with LZMA1 or LZMA2. 315207753Smm */ 316207753Smm uint32_t pb; 317207753Smm# define LZMA_PB_MIN 0 318207753Smm# define LZMA_PB_MAX 4 319207753Smm# define LZMA_PB_DEFAULT 2 320207753Smm 321207753Smm /** Compression mode */ 322207753Smm lzma_mode mode; 323207753Smm 324207753Smm /** 325207753Smm * \brief Nice length of a match 326207753Smm * 327207753Smm * This determines how many bytes the encoder compares from the match 328207753Smm * candidates when looking for the best match. Once a match of at 329207753Smm * least nice_len bytes long is found, the encoder stops looking for 330207753Smm * better candidates and encodes the match. (Naturally, if the found 331207753Smm * match is actually longer than nice_len, the actual length is 332207753Smm * encoded; it's not truncated to nice_len.) 333207753Smm * 334207753Smm * Bigger values usually increase the compression ratio and 335207753Smm * compression time. For most files, 32 to 128 is a good value, 336207753Smm * which gives very good compression ratio at good speed. 337207753Smm * 338207753Smm * The exact minimum value depends on the match finder. The maximum 339207753Smm * is 273, which is the maximum length of a match that LZMA1 and 340207753Smm * LZMA2 can encode. 341207753Smm */ 342207753Smm uint32_t nice_len; 343207753Smm 344207753Smm /** Match finder ID */ 345207753Smm lzma_match_finder mf; 346207753Smm 347207753Smm /** 348207753Smm * \brief Maximum search depth in the match finder 349207753Smm * 350207753Smm * For every input byte, match finder searches through the hash chain 351207753Smm * or binary tree in a loop, each iteration going one step deeper in 352207753Smm * the chain or tree. The searching stops if 353207753Smm * - a match of at least nice_len bytes long is found; 354207753Smm * - all match candidates from the hash chain or binary tree have 355207753Smm * been checked; or 356207753Smm * - maximum search depth is reached. 357207753Smm * 358207753Smm * Maximum search depth is needed to prevent the match finder from 359207753Smm * wasting too much time in case there are lots of short match 360207753Smm * candidates. On the other hand, stopping the search before all 361207753Smm * candidates have been checked can reduce compression ratio. 362207753Smm * 363207753Smm * Setting depth to zero tells liblzma to use an automatic default 364207753Smm * value, that depends on the selected match finder and nice_len. 365215187Smm * The default is in the range [4, 200] or so (it may vary between 366207753Smm * liblzma versions). 367207753Smm * 368207753Smm * Using a bigger depth value than the default can increase 369207753Smm * compression ratio in some cases. There is no strict maximum value, 370207753Smm * but high values (thousands or millions) should be used with care: 371207753Smm * the encoder could remain fast enough with typical input, but 372207753Smm * malicious input could cause the match finder to slow down 373207753Smm * dramatically, possibly creating a denial of service attack. 374207753Smm */ 375207753Smm uint32_t depth; 376207753Smm 377207753Smm /* 378207753Smm * Reserved space to allow possible future extensions without 379207753Smm * breaking the ABI. You should not touch these, because the names 380207753Smm * of these variables may change. These are and will never be used 381207753Smm * with the currently supported options, so it is safe to leave these 382207753Smm * uninitialized. 383207753Smm */ 384207753Smm uint32_t reserved_int1; 385207753Smm uint32_t reserved_int2; 386207753Smm uint32_t reserved_int3; 387207753Smm uint32_t reserved_int4; 388207753Smm uint32_t reserved_int5; 389207753Smm uint32_t reserved_int6; 390207753Smm uint32_t reserved_int7; 391207753Smm uint32_t reserved_int8; 392207753Smm lzma_reserved_enum reserved_enum1; 393207753Smm lzma_reserved_enum reserved_enum2; 394207753Smm lzma_reserved_enum reserved_enum3; 395207753Smm lzma_reserved_enum reserved_enum4; 396215187Smm void *reserved_ptr1; 397215187Smm void *reserved_ptr2; 398207753Smm 399207753Smm} lzma_options_lzma; 400207753Smm 401207753Smm 402207753Smm/** 403207753Smm * \brief Set a compression preset to lzma_options_lzma structure 404207753Smm * 405207753Smm * 0 is the fastest and 9 is the slowest. These match the switches -0 .. -9 406207753Smm * of the xz command line tool. In addition, it is possible to bitwise-or 407207753Smm * flags to the preset. Currently only LZMA_PRESET_EXTREME is supported. 408207753Smm * The flags are defined in container.h, because the flags are used also 409207753Smm * with lzma_easy_encoder(). 410207753Smm * 411207753Smm * The preset values are subject to changes between liblzma versions. 412207753Smm * 413207753Smm * This function is available only if LZMA1 or LZMA2 encoder has been enabled 414207753Smm * when building liblzma. 415219001Smm * 416219001Smm * \return On success, false is returned. If the preset is not 417219001Smm * supported, true is returned. 418207753Smm */ 419207753Smmextern LZMA_API(lzma_bool) lzma_lzma_preset( 420207753Smm lzma_options_lzma *options, uint32_t preset) lzma_nothrow; 421