11590Srgrimes/* 218905Swosch * Copyright (c) 1995 Wolfram Schneider <wosch@FreeBSD.org>. Berlin. 31590Srgrimes * Copyright (c) 1989, 1993 41590Srgrimes * The Regents of the University of California. All rights reserved. 51590Srgrimes * 61590Srgrimes * This code is derived from software contributed to Berkeley by 71590Srgrimes * James A. Woods. 81590Srgrimes * 91590Srgrimes * Redistribution and use in source and binary forms, with or without 101590Srgrimes * modification, are permitted provided that the following conditions 111590Srgrimes * are met: 121590Srgrimes * 1. Redistributions of source code must retain the above copyright 131590Srgrimes * notice, this list of conditions and the following disclaimer. 141590Srgrimes * 2. Redistributions in binary form must reproduce the above copyright 151590Srgrimes * notice, this list of conditions and the following disclaimer in the 161590Srgrimes * documentation and/or other materials provided with the distribution. 171590Srgrimes * 3. All advertising materials mentioning features or use of this software 181590Srgrimes * must display the following acknowledgement: 191590Srgrimes * This product includes software developed by the University of 201590Srgrimes * California, Berkeley and its contributors. 211590Srgrimes * 4. Neither the name of the University nor the names of its contributors 221590Srgrimes * may be used to endorse or promote products derived from this software 231590Srgrimes * without specific prior written permission. 241590Srgrimes * 251590Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 261590Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 271590Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 281590Srgrimes * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 291590Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 301590Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 311590Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 321590Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 331590Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 341590Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 351590Srgrimes * SUCH DAMAGE. 3617776Swosch * 3750477Speter * $FreeBSD$ 381590Srgrimes */ 391590Srgrimes 40209571Sgavin#if 0 411590Srgrimes#ifndef lint 421590Srgrimesstatic char copyright[] = 431590Srgrimes"@(#) Copyright (c) 1989, 1993\n\ 441590Srgrimes The Regents of the University of California. All rights reserved.\n"; 451590Srgrimes#endif /* not lint */ 461590Srgrimes 471590Srgrimes#ifndef lint 481590Srgrimesstatic char sccsid[] = "@(#)locate.code.c 8.1 (Berkeley) 6/6/93"; 491590Srgrimes#endif /* not lint */ 50209571Sgavin#endif 511590Srgrimes 521590Srgrimes/* 531590Srgrimes * PURPOSE: sorted list compressor (works with a modified 'find' 541590Srgrimes * to encode/decode a filename database) 551590Srgrimes * 561590Srgrimes * USAGE: bigram < list > bigrams 571590Srgrimes * process bigrams (see updatedb) > common_bigrams 581590Srgrimes * code common_bigrams < list > squozen_list 591590Srgrimes * 601590Srgrimes * METHOD: Uses 'front compression' (see ";login:", Volume 8, Number 1 611590Srgrimes * February/March 1983, p. 8). Output format is, per line, an 621590Srgrimes * offset differential count byte followed by a partially bigram- 631590Srgrimes * encoded ascii residue. A bigram is a two-character sequence, 641590Srgrimes * the first 128 most common of which are encoded in one byte. 651590Srgrimes * 661590Srgrimes * EXAMPLE: For simple front compression with no bigram encoding, 671590Srgrimes * if the input is... then the output is... 681590Srgrimes * 691590Srgrimes * /usr/src 0 /usr/src 701590Srgrimes * /usr/src/cmd/aardvark.c 8 /cmd/aardvark.c 711590Srgrimes * /usr/src/cmd/armadillo.c 14 armadillo.c 721590Srgrimes * /usr/tmp/zoo 5 tmp/zoo 731590Srgrimes * 741590Srgrimes * The codes are: 751590Srgrimes * 761590Srgrimes * 0-28 likeliest differential counts + offset to make nonnegative 771590Srgrimes * 30 switch code for out-of-range count to follow in next word 7818905Swosch * 31 an 8 bit char followed 791590Srgrimes * 128-255 bigram codes (128 most common, as determined by 'updatedb') 801590Srgrimes * 32-127 single character (printable) ascii residue (ie, literal) 811590Srgrimes * 8218905Swosch * The locate database store any character except newline ('\n') 8318905Swosch * and NUL ('\0'). The 8-bit character support don't wast extra 8418905Swosch * space until you have characters in file names less than 32 8518905Swosch * or greather than 127. 8618905Swosch * 871590Srgrimes * 8818905Swosch * SEE ALSO: updatedb.sh, ../bigram/locate.bigram.c 8918905Swosch * 901590Srgrimes * AUTHOR: James A. Woods, Informatics General Corp., 911590Srgrimes * NASA Ames Research Center, 10/82 9218905Swosch * 8-bit file names characters: 9318905Swosch * Wolfram Schneider, Berlin September 1996 941590Srgrimes */ 951590Srgrimes 961590Srgrimes#include <sys/param.h> 971590Srgrimes#include <err.h> 981590Srgrimes#include <errno.h> 991590Srgrimes#include <stdlib.h> 1001590Srgrimes#include <string.h> 1011590Srgrimes#include <stdio.h> 10265428Simp#include <unistd.h> 1031590Srgrimes#include "locate.h" 1041590Srgrimes 1051590Srgrimes#define BGBUFSIZE (NBG * 2) /* size of bigram buffer */ 1061590Srgrimes 10717592Swoschu_char buf1[MAXPATHLEN] = " "; 10817592Swoschu_char buf2[MAXPATHLEN]; 10918905Swoschu_char bigrams[BGBUFSIZE + 1] = { 0 }; 1101590Srgrimes 11117776Swosch#define LOOKUP 1 /* use a lookup array instead a function, 3x faster */ 11217776Swosch 11317592Swosch#ifdef LOOKUP 11418905Swosch#define BGINDEX(x) (big[(u_char)*x][(u_char)*(x + 1)]) 11518905Swoschtypedef short bg_t; 11618905Swoschbg_t big[UCHAR_MAX + 1][UCHAR_MAX + 1]; 11717592Swosch#else 11817592Swosch#define BGINDEX(x) bgindex(x) 11917592Swoschtypedef int bg_t; 12092920Simpint bgindex(char *); 12117776Swosch#endif /* LOOKUP */ 12217592Swosch 12317776Swosch 12492920Simpvoid usage(void); 1251590Srgrimes 1261590Srgrimesint 127209571Sgavinmain(int argc, char *argv[]) 1281590Srgrimes{ 129209571Sgavin u_char *cp, *oldpath, *path; 1301590Srgrimes int ch, code, count, diffcount, oldcount; 131209571Sgavin u_int i, j; 1321590Srgrimes FILE *fp; 1331590Srgrimes 13424360Simp while ((ch = getopt(argc, argv, "")) != -1) 1351590Srgrimes switch(ch) { 1361590Srgrimes default: 1371590Srgrimes usage(); 1381590Srgrimes } 1391590Srgrimes argc -= optind; 1401590Srgrimes argv += optind; 1411590Srgrimes 1421590Srgrimes if (argc != 1) 1431590Srgrimes usage(); 1441590Srgrimes 1451590Srgrimes if ((fp = fopen(argv[0], "r")) == NULL) 1461590Srgrimes err(1, "%s", argv[0]); 1471590Srgrimes 1481590Srgrimes /* First copy bigram array to stdout. */ 1491590Srgrimes (void)fgets(bigrams, BGBUFSIZE + 1, fp); 15017776Swosch 1511590Srgrimes if (fwrite(bigrams, 1, BGBUFSIZE, stdout) != BGBUFSIZE) 1521590Srgrimes err(1, "stdout"); 1531590Srgrimes (void)fclose(fp); 1541590Srgrimes 15517592Swosch#ifdef LOOKUP 15617592Swosch /* init lookup table */ 15718905Swosch for (i = 0; i < UCHAR_MAX + 1; i++) 15818905Swosch for (j = 0; j < UCHAR_MAX + 1; j++) 15917592Swosch big[i][j] = (bg_t)-1; 16017592Swosch 16117972Swosch for (cp = bigrams, i = 0; *cp != '\0'; i += 2, cp += 2) 16218905Swosch big[(u_char)*cp][(u_char)*(cp + 1)] = (bg_t)i; 16318905Swosch 16417776Swosch#endif /* LOOKUP */ 16517592Swosch 1661590Srgrimes oldpath = buf1; 1671590Srgrimes path = buf2; 1681590Srgrimes oldcount = 0; 16917776Swosch 17017592Swosch while (fgets(path, sizeof(buf2), stdin) != NULL) { 1711590Srgrimes 17218905Swosch /* skip empty lines */ 17317592Swosch if (*path == '\n') 17417592Swosch continue; 17517592Swosch 17618905Swosch /* remove newline */ 17717972Swosch for (cp = path; *cp != '\0'; cp++) { 17819213Swosch#ifndef LOCATE_CHAR30 17919213Swosch /* old locate implementations core'd for char 30 */ 18019213Swosch if (*cp == SWITCH) 18119213Swosch *cp = '?'; 18219213Swosch else 18319213Swosch#endif /* !LOCATE_CHAR30 */ 18419213Swosch 18517592Swosch /* chop newline */ 18617592Swosch if (*cp == '\n') 18717972Swosch *cp = '\0'; 1881590Srgrimes } 1891590Srgrimes 1901590Srgrimes /* Skip longest common prefix. */ 19118905Swosch for (cp = path; *cp == *oldpath; cp++, oldpath++) 19218905Swosch if (*cp == '\0') 19318905Swosch break; 19417592Swosch 1951590Srgrimes count = cp - path; 1961590Srgrimes diffcount = count - oldcount + OFFSET; 1971590Srgrimes oldcount = count; 1981590Srgrimes if (diffcount < 0 || diffcount > 2 * OFFSET) { 1991590Srgrimes if (putchar(SWITCH) == EOF || 2001590Srgrimes putw(diffcount, stdout) == EOF) 2011590Srgrimes err(1, "stdout"); 2021590Srgrimes } else 2031590Srgrimes if (putchar(diffcount) == EOF) 2041590Srgrimes err(1, "stdout"); 2051590Srgrimes 20617972Swosch while (*cp != '\0') { 20718905Swosch /* print *two* characters */ 20818905Swosch 20918905Swosch if ((code = BGINDEX(cp)) != (bg_t)-1) { 21018905Swosch /* 21118905Swosch * print *one* as bigram 21218905Swosch * Found, so mark byte with 21318905Swosch * parity bit. 21418905Swosch */ 2151590Srgrimes if (putchar((code / 2) | PARITY) == EOF) 2161590Srgrimes err(1, "stdout"); 2171590Srgrimes cp += 2; 2181590Srgrimes } 21918905Swosch 22018905Swosch else { 22118905Swosch for (i = 0; i < 2; i++) { 22218905Swosch if (*cp == '\0') 22318905Swosch break; 22418905Swosch 22518905Swosch /* print umlauts in file names */ 22618905Swosch if (*cp < ASCII_MIN || 22718905Swosch *cp > ASCII_MAX) { 22818905Swosch if (putchar(UMLAUT) == EOF || 22918905Swosch putchar(*cp++) == EOF) 23018905Swosch err(1, "stdout"); 23118905Swosch } 23218905Swosch 23318905Swosch else { 23418905Swosch /* normal character */ 23518905Swosch if(putchar(*cp++) == EOF) 23618905Swosch err(1, "stdout"); 23718905Swosch } 23818905Swosch } 23918905Swosch 24018905Swosch } 2411590Srgrimes } 24218905Swosch 2431590Srgrimes if (path == buf1) { /* swap pointers */ 2441590Srgrimes path = buf2; 2451590Srgrimes oldpath = buf1; 2461590Srgrimes } else { 2471590Srgrimes path = buf1; 2481590Srgrimes oldpath = buf2; 2491590Srgrimes } 2501590Srgrimes } 2511590Srgrimes /* Non-zero status if there were errors */ 2521590Srgrimes if (fflush(stdout) != 0 || ferror(stdout)) 2531590Srgrimes exit(1); 2541590Srgrimes exit(0); 2551590Srgrimes} 2561590Srgrimes 25717592Swosch#ifndef LOOKUP 2581590Srgrimesint 259209571Sgavinbgindex(char *bg) /* Return location of bg in bigrams or -1. */ 2601590Srgrimes{ 261209571Sgavin char bg0, bg1, *p; 2621590Srgrimes 2631590Srgrimes bg0 = bg[0]; 2641590Srgrimes bg1 = bg[1]; 2651590Srgrimes for (p = bigrams; *p != NULL; p++) 2661590Srgrimes if (*p++ == bg0 && *p == bg1) 2671590Srgrimes break; 26817776Swosch return (*p == NULL ? -1 : (--p - bigrams)); 2691590Srgrimes} 27017592Swosch#endif /* !LOOKUP */ 2711590Srgrimes 2721590Srgrimesvoid 273209571Sgavinusage(void) 2741590Srgrimes{ 2751590Srgrimes (void)fprintf(stderr, 2761590Srgrimes "usage: locate.code common_bigrams < list > squozen_list\n"); 2771590Srgrimes exit(1); 2781590Srgrimes} 279