1/* $NetBSD: zopen.c,v 1.16 2022/03/23 11:08:28 andvar Exp $ */ 2 3/*- 4 * Copyright (c) 1985, 1986, 1992, 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Diomidis Spinellis and James A. Woods, derived from original 9 * work by Spencer Thomas and Joseph Orost. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 36#if defined(LIBC_SCCS) && !defined(lint) 37#if 0 38static char sccsid[] = "@(#)zopen.c 8.1 (Berkeley) 6/27/93"; 39#else 40static char rcsid[] = "$NetBSD: zopen.c,v 1.16 2022/03/23 11:08:28 andvar Exp $"; 41#endif 42#endif /* LIBC_SCCS and not lint */ 43 44/*- 45 * fcompress.c - File compression ala IEEE Computer, June 1984. 46 * 47 * Compress authors: 48 * Spencer W. Thomas (decvax!utah-cs!thomas) 49 * Jim McKie (decvax!mcvax!jim) 50 * Steve Davies (decvax!vax135!petsd!peora!srd) 51 * Ken Turkowski (decvax!decwrl!turtlevax!ken) 52 * James A. Woods (decvax!ihnp4!ames!jaw) 53 * Joe Orost (decvax!vax135!petsd!joe) 54 * 55 * Cleaned up and converted to library returning I/O streams by 56 * Diomidis Spinellis <dds@doc.ic.ac.uk>. 57 * 58 * zopen(filename, mode, bits) 59 * Returns a FILE * that can be used for read or write. The modes 60 * supported are only "r" and "w". Seeking is not allowed. On 61 * reading the file is decompressed, on writing it is compressed. 62 * The output is compatible with compress(1) with 16 bit tables. 63 * Any file produced by compress(1) can be read. 64 */ 65 66#include <sys/param.h> 67#include <sys/stat.h> 68 69#include <errno.h> 70#include <signal.h> 71#include <stdio.h> 72#include <stdlib.h> 73#include <string.h> 74#include <unistd.h> 75 76#define BITS 16 /* Default bits. */ 77#define HSIZE 69001 /* 95% occupancy */ 78 79/* A code_int must be able to hold 2**BITS values of type int, and also -1. */ 80typedef long code_int; 81typedef long count_int; 82 83typedef u_char char_type; 84static char_type magic_header[] = 85 {'\037', '\235'}; /* 1F 9D */ 86 87#define BIT_MASK 0x1f /* Defines for third byte of header. */ 88#define BLOCK_MASK 0x80 89 90/* 91 * Masks 0x40 and 0x20 are free. I think 0x20 should mean that there is 92 * a fourth header byte (for expansion). 93 */ 94#define INIT_BITS 9 /* Initial number of bits/code. */ 95 96#define MAXCODE(n_bits) ((1 << (n_bits)) - 1) 97 98struct s_zstate { 99 FILE *zs_fp; /* File stream for I/O */ 100 char zs_mode; /* r or w */ 101 enum { 102 S_START, S_MIDDLE, S_EOF 103 } zs_state; /* State of computation */ 104 int zs_n_bits; /* Number of bits/code. */ 105 int zs_maxbits; /* User settable max # bits/code. */ 106 code_int zs_maxcode; /* Maximum code, given n_bits. */ 107 code_int zs_maxmaxcode; /* Should NEVER generate this code. */ 108 count_int zs_htab [HSIZE]; 109 u_short zs_codetab [HSIZE]; 110 code_int zs_hsize; /* For dynamic table sizing. */ 111 code_int zs_free_ent; /* First unused entry. */ 112 /* 113 * Block compression parameters -- after all codes are used up, 114 * and compression rate changes, start over. 115 */ 116 int zs_block_compress; 117 int zs_clear_flg; 118 long zs_ratio; 119 count_int zs_checkpoint; 120 int zs_offset; 121 long zs_in_count; /* Length of input. */ 122 long zs_bytes_out; /* Length of compressed output. */ 123 long zs_out_count; /* # of codes output (for debugging). */ 124 char_type zs_buf[BITS]; 125 union { 126 struct { 127 long zs_fcode; 128 code_int zs_ent; 129 code_int zs_hsize_reg; 130 int zs_hshift; 131 } w; /* Write parameters */ 132 struct { 133 char_type *zs_stackp; 134 int zs_finchar; 135 code_int zs_code, zs_oldcode, zs_incode; 136 int zs_roffset, zs_size; 137 char_type zs_gbuf[BITS]; 138 } r; /* Read parameters */ 139 } u; 140}; 141 142/* Definitions to retain old variable names */ 143#define fp zs->zs_fp 144#define zmode zs->zs_mode 145#define state zs->zs_state 146#define n_bits zs->zs_n_bits 147#define maxbits zs->zs_maxbits 148#define maxcode zs->zs_maxcode 149#define maxmaxcode zs->zs_maxmaxcode 150#define htab zs->zs_htab 151#define codetab zs->zs_codetab 152#define hsize zs->zs_hsize 153#define free_ent zs->zs_free_ent 154#define block_compress zs->zs_block_compress 155#define clear_flg zs->zs_clear_flg 156#define ratio zs->zs_ratio 157#define checkpoint zs->zs_checkpoint 158#define offset zs->zs_offset 159#define in_count zs->zs_in_count 160#define bytes_out zs->zs_bytes_out 161#define out_count zs->zs_out_count 162#define buf zs->zs_buf 163#define fcode zs->u.w.zs_fcode 164#define hsize_reg zs->u.w.zs_hsize_reg 165#define ent zs->u.w.zs_ent 166#define hshift zs->u.w.zs_hshift 167#define stackp zs->u.r.zs_stackp 168#define finchar zs->u.r.zs_finchar 169#define code zs->u.r.zs_code 170#define oldcode zs->u.r.zs_oldcode 171#define incode zs->u.r.zs_incode 172#define roffset zs->u.r.zs_roffset 173#define size zs->u.r.zs_size 174#define gbuf zs->u.r.zs_gbuf 175 176/* 177 * To save much memory, we overlay the table used by compress() with those 178 * used by decompress(). The tab_prefix table is the same size and type as 179 * the codetab. The tab_suffix table needs 2**BITS characters. We get this 180 * from the beginning of htab. The output stack uses the rest of htab, and 181 * contains characters. There is plenty of room for any possible stack 182 * (stack used to be 8000 characters). 183 */ 184 185#define htabof(i) htab[i] 186#define codetabof(i) codetab[i] 187 188#define tab_prefixof(i) codetabof(i) 189#define tab_suffixof(i) ((char_type *)(htab))[i] 190#define de_stack ((char_type *)&tab_suffixof(1 << BITS)) 191 192#define CHECK_GAP 10000 /* Ratio check interval. */ 193 194/* 195 * the next two codes should not be changed lightly, as they must not 196 * lie within the contiguous general code space. 197 */ 198#define FIRST 257 /* First free entry. */ 199#define CLEAR 256 /* Table clear output code. */ 200 201static int cl_block(struct s_zstate *); 202static code_int getcode(struct s_zstate *); 203static int output(struct s_zstate *, code_int); 204static int zclose(void *); 205FILE *zopen(const char *, const char *, int); 206static int zread(void *, char *, int); 207static int zwrite(void *, const char *, int); 208 209/*- 210 * Algorithm from "A Technique for High Performance Data Compression", 211 * Terry A. Welch, IEEE Computer Vol 17, No 6 (June 1984), pp 8-19. 212 * 213 * Algorithm: 214 * Modified Lempel-Ziv method (LZW). Basically finds common 215 * substrings and replaces them with a variable size code. This is 216 * deterministic, and can be done on the fly. Thus, the decompression 217 * procedure needs no input table, but tracks the way the table was built. 218 */ 219 220/*- 221 * compress write 222 * 223 * Algorithm: use open addressing double hashing (no chaining) on the 224 * prefix code / next character combination. We do a variant of Knuth's 225 * algorithm D (vol. 3, sec. 6.4) along with G. Knott's relatively-prime 226 * secondary probe. Here, the modular division first probe is gives way 227 * to a faster exclusive-or manipulation. Also do block compression with 228 * an adaptive reset, whereby the code table is cleared when the compression 229 * ratio decreases, but after the table fills. The variable-length output 230 * codes are re-sized at this point, and a special CLEAR code is generated 231 * for the decompressor. Late addition: construct the table according to 232 * file size for noticeable speed improvement on small files. Please direct 233 * questions about this implementation to ames!jaw. 234 */ 235static int 236zwrite(void *cookie, const char *wbp, int num) 237{ 238 code_int i; 239 int c, disp; 240 struct s_zstate *zs; 241 const u_char *bp; 242 u_char tmp; 243 int count; 244 245 if (num == 0) 246 return (0); 247 248 zs = cookie; 249 count = num; 250 bp = (const u_char *)wbp; 251 if (state == S_MIDDLE) 252 goto middle; 253 state = S_MIDDLE; 254 255 maxmaxcode = 1L << maxbits; 256 if (fwrite(magic_header, 257 sizeof(char), sizeof(magic_header), fp) != sizeof(magic_header)) 258 return (-1); 259 tmp = (u_char)(maxbits | block_compress); 260 if (fwrite(&tmp, sizeof(char), sizeof(tmp), fp) != sizeof(tmp)) 261 return (-1); 262 263 offset = 0; 264 bytes_out = 3; /* Includes 3-byte header mojo. */ 265 out_count = 0; 266 clear_flg = 0; 267 ratio = 0; 268 in_count = 1; 269 checkpoint = CHECK_GAP; 270 maxcode = MAXCODE(n_bits = INIT_BITS); 271 free_ent = ((block_compress) ? FIRST : 256); 272 273 ent = *bp++; 274 --count; 275 276 hshift = 0; 277 for (fcode = (long)hsize; fcode < 65536L; fcode *= 2L) 278 hshift++; 279 hshift = 8 - hshift; /* Set hash code range bound. */ 280 281 hsize_reg = hsize; 282 memset(htab, 0xff, hsize_reg * sizeof(count_int)); 283 284middle: while (count--) { 285 c = *bp++; 286 in_count++; 287 fcode = (long)(((long)c << maxbits) + ent); 288 i = ((c << hshift) ^ ent); /* Xor hashing. */ 289 290 if (htabof(i) == fcode) { 291 ent = codetabof(i); 292 continue; 293 } else if ((long)htabof(i) < 0) /* Empty slot. */ 294 goto nomatch; 295 disp = hsize_reg - i; /* Secondary hash (after G. Knott). */ 296 if (i == 0) 297 disp = 1; 298probe: if ((i -= disp) < 0) 299 i += hsize_reg; 300 301 if (htabof(i) == fcode) { 302 ent = codetabof(i); 303 continue; 304 } 305 if ((long)htabof(i) >= 0) 306 goto probe; 307nomatch: if (output(zs, (code_int) ent) == -1) 308 return (-1); 309 out_count++; 310 ent = c; 311 if (free_ent < maxmaxcode) { 312 codetabof(i) = free_ent++; /* code -> hashtable */ 313 htabof(i) = fcode; 314 } else if ((count_int)in_count >= 315 checkpoint && block_compress) { 316 if (cl_block(zs) == -1) 317 return (-1); 318 } 319 } 320 return (num); 321} 322 323static int 324zclose(void *cookie) 325{ 326 struct s_zstate *zs; 327 int rval; 328 329 zs = cookie; 330 if (zmode == 'w') { /* Put out the final code. */ 331 if (output(zs, (code_int) ent) == -1) { 332 (void)fclose(fp); 333 free(zs); 334 return (-1); 335 } 336 out_count++; 337 if (output(zs, (code_int) - 1) == -1) { 338 (void)fclose(fp); 339 free(zs); 340 return (-1); 341 } 342 } 343 rval = fclose(fp) == EOF ? -1 : 0; 344 free(zs); 345 return (rval); 346} 347 348/*- 349 * Output the given code. 350 * Inputs: 351 * code: A n_bits-bit integer. If == -1, then EOF. This assumes 352 * that n_bits =< (long)wordsize - 1. 353 * Outputs: 354 * Outputs code to the file. 355 * Assumptions: 356 * Chars are 8 bits long. 357 * Algorithm: 358 * Maintain a BITS character long buffer (so that 8 codes will 359 * fit in it exactly). Use the VAX insv instruction to insert each 360 * code in turn. When the buffer fills up empty it and start over. 361 */ 362 363static char_type lmask[9] = 364 {0xff, 0xfe, 0xfc, 0xf8, 0xf0, 0xe0, 0xc0, 0x80, 0x00}; 365static char_type rmask[9] = 366 {0x00, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, 0xff}; 367 368static int 369output(struct s_zstate *zs, code_int ocode) 370{ 371 int bits, r_off; 372 char_type *bp; 373 374 r_off = offset; 375 bits = n_bits; 376 bp = buf; 377 if (ocode >= 0) { 378 /* Get to the first byte. */ 379 bp += (r_off >> 3); 380 r_off &= 7; 381 /* 382 * Since ocode is always >= 8 bits, only need to mask the first 383 * hunk on the left. 384 */ 385 *bp = (*bp & rmask[r_off]) | ((ocode << r_off) & lmask[r_off]); 386 bp++; 387 bits -= (8 - r_off); 388 ocode >>= 8 - r_off; 389 /* Get any 8 bit parts in the middle (<=1 for up to 16 bits). */ 390 if (bits >= 8) { 391 *bp++ = ocode; 392 ocode >>= 8; 393 bits -= 8; 394 } 395 /* Last bits. */ 396 if (bits) 397 *bp = ocode; 398 offset += n_bits; 399 if (offset == (n_bits << 3)) { 400 bp = buf; 401 bits = n_bits; 402 bytes_out += bits; 403 if (fwrite(bp, sizeof(char), bits, fp) != (size_t)bits) 404 return (-1); 405 bp += bits; 406 bits = 0; 407 offset = 0; 408 } 409 /* 410 * If the next entry is going to be too big for the ocode size, 411 * then increase it, if possible. 412 */ 413 if (free_ent > maxcode || (clear_flg > 0)) { 414 /* 415 * Write the whole buffer, because the input side won't 416 * discover the size increase until after it has read it. 417 */ 418 if (offset > 0) { 419 if (fwrite(buf, 1, n_bits, fp) != (size_t)n_bits) 420 return (-1); 421 bytes_out += n_bits; 422 } 423 offset = 0; 424 425 if (clear_flg) { 426 maxcode = MAXCODE(n_bits = INIT_BITS); 427 clear_flg = 0; 428 } else { 429 n_bits++; 430 if (n_bits == maxbits) 431 maxcode = maxmaxcode; 432 else 433 maxcode = MAXCODE(n_bits); 434 } 435 } 436 } else { 437 /* At EOF, write the rest of the buffer. */ 438 if (offset > 0) { 439 offset = (offset + 7) / 8; 440 if (fwrite(buf, 1, offset, fp) != (size_t)offset) 441 return (-1); 442 bytes_out += offset; 443 } 444 offset = 0; 445 } 446 return (0); 447} 448 449/* 450 * Decompress read. This routine adapts to the codes in the file building 451 * the "string" table on-the-fly; requiring no table to be stored in the 452 * compressed file. The tables used herein are shared with those of the 453 * compress() routine. See the definitions above. 454 */ 455static int 456zread(void *cookie, char *rbp, int num) 457{ 458 u_int count; 459 struct s_zstate *zs; 460 u_char *bp, header[3]; 461 462 if (num == 0) 463 return (0); 464 465 zs = cookie; 466 count = num; 467 bp = (u_char *)rbp; 468 switch (state) { 469 case S_START: 470 state = S_MIDDLE; 471 break; 472 case S_MIDDLE: 473 goto middle; 474 case S_EOF: 475 goto eof; 476 } 477 478 /* Check the magic number */ 479 if (fread(header, 480 sizeof(char), sizeof(header), fp) != sizeof(header) || 481 memcmp(header, magic_header, sizeof(magic_header)) != 0) { 482 errno = EFTYPE; 483 return (-1); 484 } 485 maxbits = header[2]; /* Set -b from file. */ 486 block_compress = maxbits & BLOCK_MASK; 487 maxbits &= BIT_MASK; 488 maxmaxcode = 1L << maxbits; 489 if (maxbits > BITS || maxbits < 12) { 490 errno = EFTYPE; 491 return (-1); 492 } 493 /* As above, initialize the first 256 entries in the table. */ 494 maxcode = MAXCODE(n_bits = INIT_BITS); 495 for (code = 255; code >= 0; code--) { 496 tab_prefixof(code) = 0; 497 tab_suffixof(code) = (char_type) code; 498 } 499 free_ent = block_compress ? FIRST : 256; 500 oldcode = -1; 501 stackp = de_stack; 502 503 while ((code = getcode(zs)) > -1) { 504 505 if ((code == CLEAR) && block_compress) { 506 for (code = 255; code >= 0; code--) 507 tab_prefixof(code) = 0; 508 clear_flg = 1; 509 free_ent = FIRST; 510 oldcode = -1; 511 continue; 512 } 513 incode = code; 514 515 /* Special case for kWkWk string. */ 516 if (code >= free_ent) { 517 if (code > free_ent || oldcode == -1) { 518 /* Bad stream. */ 519 errno = EINVAL; 520 return (-1); 521 } 522 *stackp++ = finchar; 523 code = oldcode; 524 } 525 /* 526 * The above condition ensures that code < free_ent. 527 * The construction of tab_prefixof in turn guarantees that 528 * each iteration decreases code and therefore stack usage is 529 * bound by 1 << BITS - 256. 530 */ 531 532 /* Generate output characters in reverse order. */ 533 while (code >= 256) { 534 *stackp++ = tab_suffixof(code); 535 code = tab_prefixof(code); 536 } 537 *stackp++ = finchar = tab_suffixof(code); 538 539 /* And put them out in forward order. */ 540middle: do { 541 if (count-- == 0) 542 return (num); 543 *bp++ = *--stackp; 544 } while (stackp > de_stack); 545 546 /* Generate the new entry. */ 547 if ((code = free_ent) < maxmaxcode && oldcode != -1) { 548 tab_prefixof(code) = (u_short) oldcode; 549 tab_suffixof(code) = finchar; 550 free_ent = code + 1; 551 } 552 553 /* Remember previous code. */ 554 oldcode = incode; 555 } 556 state = S_EOF; 557eof: return (num - count); 558} 559 560/*- 561 * Read one code from the standard input. If EOF, return -1. 562 * Inputs: 563 * stdin 564 * Outputs: 565 * code or -1 is returned. 566 */ 567static code_int 568getcode(struct s_zstate *zs) 569{ 570 code_int gcode; 571 int r_off, bits; 572 char_type *bp; 573 574 bp = gbuf; 575 if (clear_flg > 0 || roffset >= size || free_ent > maxcode) { 576 /* 577 * If the next entry will be too big for the current gcode 578 * size, then we must increase the size. This implies reading 579 * a new buffer full, too. 580 */ 581 if (free_ent > maxcode) { 582 n_bits++; 583 if (n_bits == maxbits) /* Won't get any bigger now. */ 584 maxcode = maxmaxcode; 585 else 586 maxcode = MAXCODE(n_bits); 587 } 588 if (clear_flg > 0) { 589 maxcode = MAXCODE(n_bits = INIT_BITS); 590 clear_flg = 0; 591 } 592 size = fread(gbuf, 1, n_bits, fp); 593 if (size <= 0) /* End of file. */ 594 return (-1); 595 roffset = 0; 596 /* Round size down to integral number of codes. */ 597 size = (size << 3) - (n_bits - 1); 598 } 599 r_off = roffset; 600 bits = n_bits; 601 602 /* Get to the first byte. */ 603 bp += (r_off >> 3); 604 r_off &= 7; 605 606 /* Get first part (low order bits). */ 607 gcode = (*bp++ >> r_off); 608 bits -= (8 - r_off); 609 r_off = 8 - r_off; /* Now, roffset into gcode word. */ 610 611 /* Get any 8 bit parts in the middle (<=1 for up to 16 bits). */ 612 if (bits >= 8) { 613 gcode |= *bp++ << r_off; 614 r_off += 8; 615 bits -= 8; 616 } 617 618 /* High order bits. */ 619 gcode |= (*bp & rmask[bits]) << r_off; 620 roffset += n_bits; 621 622 return (gcode); 623} 624 625static int 626cl_block(struct s_zstate *zs) /* Table clear for block compress. */ 627{ 628 long rat; 629 630 checkpoint = in_count + CHECK_GAP; 631 632 if (in_count > 0x007fffff) { /* Shift will overflow. */ 633 rat = bytes_out >> 8; 634 if (rat == 0) /* Don't divide by zero. */ 635 rat = 0x7fffffff; 636 else 637 rat = in_count / rat; 638 } else 639 rat = (in_count << 8) / bytes_out; /* 8 fractional bits. */ 640 if (rat > ratio) 641 ratio = rat; 642 else { 643 ratio = 0; 644 memset(htab, 0xff, hsize * sizeof(count_int)); 645 free_ent = FIRST; 646 clear_flg = 1; 647 if (output(zs, (code_int) CLEAR) == -1) 648 return (-1); 649 } 650 return (0); 651} 652 653FILE * 654zopen(const char *fname, const char *mode, int bits) 655{ 656 struct s_zstate *zs; 657 658 if ((mode[0] != 'r' && mode[0] != 'w') || mode[1] != '\0' || 659 bits < 0 || bits > BITS) { 660 errno = EINVAL; 661 return (NULL); 662 } 663 664 if ((zs = calloc(1, sizeof(struct s_zstate))) == NULL) 665 return (NULL); 666 667 maxbits = bits ? bits : BITS; /* User settable max # bits/code. */ 668 maxmaxcode = 1 << maxbits; /* Should NEVER generate this code. */ 669 hsize = HSIZE; /* For dynamic table sizing. */ 670 free_ent = 0; /* First unused entry. */ 671 block_compress = BLOCK_MASK; 672 clear_flg = 0; 673 ratio = 0; 674 checkpoint = CHECK_GAP; 675 in_count = 1; /* Length of input. */ 676 out_count = 0; /* # of codes output (for debugging). */ 677 state = S_START; 678 roffset = 0; 679 size = 0; 680 681 /* 682 * Layering compress on top of stdio in order to provide buffering, 683 * and ensure that reads and write work with the data specified. 684 */ 685 if ((fp = fopen(fname, mode)) == NULL) { 686 free(zs); 687 return (NULL); 688 } 689 switch (*mode) { 690 case 'r': 691 zmode = 'r'; 692 return (funopen(zs, zread, NULL, NULL, zclose)); 693 case 'w': 694 zmode = 'w'; 695 return (funopen(zs, NULL, zwrite, NULL, zclose)); 696 } 697 /* NOTREACHED */ 698 return (NULL); 699} 700