1/* code -- bigram- and front-encode filenames for locate 2 Copyright (C) 1994, 2000, 2003, 2004, 2005, 2007 Free Software 3 Foundation, Inc. 4 5 This program is free software: you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation, either version 3 of the License, or 8 (at your option) any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program. If not, see <http://www.gnu.org/licenses/>. 17*/ 18 19/* Compress a sorted list. 20 Works with `find' to encode a filename database to save space 21 and search time. 22 23 Usage: 24 25 bigram < file_list > bigrams 26 process-bigrams > most_common_bigrams 27 code most_common_bigrams < file_list > squeezed_list 28 29 Uses `front compression' (see ";login:", March 1983, p. 8). 30 The output begins with the 128 most common bigrams. 31 After that, the output format is, for each line, 32 an offset (from the previous line) differential count byte 33 followed by a (partially bigram-encoded) ASCII remainder. 34 The output lines have no terminating byte; the start of the next line 35 is indicated by its first byte having a value <= 30. 36 37 The encoding of the output bytes is: 38 39 0-28 likeliest differential counts + offset (14) to make nonnegative 40 30 escape code for out-of-range count to follow in next halfword 41 128-255 bigram codes (the 128 most common, as determined by `updatedb') 42 32-127 single character (printable) ASCII remainder 43 44 Written by James A. Woods <jwoods@adobe.com>. 45 Modified by David MacKenzie <djm@gnu.org>. */ 46 47#include <config.h> 48#include <stdio.h> 49#include <sys/types.h> 50 51#if defined(HAVE_STRING_H) || defined(STDC_HEADERS) 52#include <string.h> 53#else 54#include <strings.h> 55#endif 56 57#ifdef STDC_HEADERS 58#include <stdlib.h> 59#endif 60 61#if ENABLE_NLS 62# include <libintl.h> 63# define _(Text) gettext (Text) 64#else 65# define _(Text) Text 66#define textdomain(Domain) 67#define bindtextdomain(Package, Directory) 68#endif 69#ifdef gettext_noop 70# define N_(String) gettext_noop (String) 71#else 72/* See locate.c for explanation as to why not use (String) */ 73# define N_(String) String 74#endif 75 76#include "locatedb.h" 77#include "closeout.h" 78#include "gnulib-version.h" 79 80char *xmalloc PARAMS((size_t)); 81 82/* The name this program was run with. */ 83char *program_name; 84 85/* The 128 most common bigrams in the file list, padded with NULs 86 if there are fewer. */ 87static char bigrams[257] = {0}; 88 89/* Return the offset of PATTERN in STRING, or -1 if not found. */ 90 91static int 92strindex (char *string, char *pattern) 93{ 94 register char *s; 95 96 for (s = string; *s != '\0'; s++) 97 /* Fast first char check. */ 98 if (*s == *pattern) 99 { 100 register char *p2 = pattern + 1, *s2 = s + 1; 101 while (*p2 != '\0' && *p2 == *s2) 102 p2++, s2++; 103 if (*p2 == '\0') 104 return s2 - strlen (pattern) - string; 105 } 106 return -1; 107} 108 109/* Return the length of the longest common prefix of strings S1 and S2. */ 110 111static int 112prefix_length (char *s1, char *s2) 113{ 114 register char *start; 115 116 for (start = s1; *s1 == *s2 && *s1 != '\0'; s1++, s2++) 117 ; 118 return s1 - start; 119} 120 121extern char *version_string; 122 123static void 124usage (FILE *stream) 125{ 126 fprintf (stream, _("\ 127Usage: %s [--version | --help]\n\ 128or %s most_common_bigrams < file-list > locate-database\n"), 129 program_name, program_name); 130 fputs (_("\nReport bugs to <bug-findutils@gnu.org>.\n"), stream); 131} 132 133 134int 135main (int argc, char **argv) 136{ 137 char *path; /* The current input entry. */ 138 char *oldpath; /* The previous input entry. */ 139 size_t pathsize, oldpathsize; /* Amounts allocated for them. */ 140 int count, oldcount, diffcount; /* Their prefix lengths & the difference. */ 141 char bigram[3]; /* Bigram to search for in table. */ 142 int code; /* Index of `bigram' in bigrams table. */ 143 FILE *fp; /* Most common bigrams file. */ 144 int line_len; /* Length of input line. */ 145 146 program_name = argv[0]; 147 atexit (close_stdout); 148 149 bigram[2] = '\0'; 150 151 if (argc != 2) 152 { 153 usage(stderr); 154 return 2; 155 } 156 157 if (0 == strcmp(argv[1], "--help")) 158 { 159 usage(stdout); 160 return 0; 161 } 162 else if (0 == strcmp(argv[1], "--version")) 163 { 164 printf (_("GNU findutils version %s\n"), version_string); 165 printf (_("Built using GNU gnulib version %s\n"), gnulib_version); 166 return 0; 167 } 168 169 fp = fopen (argv[1], "r"); 170 if (fp == NULL) 171 { 172 fprintf (stderr, "%s: ", argv[0]); 173 perror (argv[1]); 174 return 1; 175 } 176 177 pathsize = oldpathsize = 1026; /* Increased as necessary by getline. */ 178 path = xmalloc (pathsize); 179 oldpath = xmalloc (oldpathsize); 180 181 /* Set to empty string, to force the first prefix count to 0. */ 182 oldpath[0] = '\0'; 183 oldcount = 0; 184 185 /* Copy the list of most common bigrams to the output, 186 padding with NULs if there are <128 of them. */ 187 fgets (bigrams, 257, fp); 188 fwrite (bigrams, 1, 256, stdout); 189 fclose (fp); 190 191 while ((line_len = getline (&path, &pathsize, stdin)) > 0) 192 { 193 char *pp; 194 195 path[line_len - 1] = '\0'; /* Remove newline. */ 196 197 /* Squelch unprintable chars in path so as not to botch decoding. */ 198 for (pp = path; *pp != '\0'; pp++) 199 { 200 if (!(*pp >= 040 && *pp < 0177)) 201 *pp = '?'; 202 } 203 204 count = prefix_length (oldpath, path); 205 diffcount = count - oldcount; 206 oldcount = count; 207 /* If the difference is small, it fits in one byte; 208 otherwise, two bytes plus a marker noting that fact. */ 209 if (diffcount < -LOCATEDB_OLD_OFFSET || diffcount > LOCATEDB_OLD_OFFSET) 210 { 211 putc (LOCATEDB_OLD_ESCAPE, stdout); 212 putw (diffcount + LOCATEDB_OLD_OFFSET, stdout); 213 } 214 else 215 putc (diffcount + LOCATEDB_OLD_OFFSET, stdout); 216 217 /* Look for bigrams in the remainder of the path. */ 218 for (pp = path + count; *pp != '\0'; pp += 2) 219 { 220 if (pp[1] == '\0') 221 { 222 /* No bigram is possible; only one char is left. */ 223 putchar (*pp); 224 break; 225 } 226 bigram[0] = *pp; 227 bigram[1] = pp[1]; 228 /* Linear search for specific bigram in string table. */ 229 code = strindex (bigrams, bigram); 230 if (code % 2 == 0) 231 putchar ((code / 2) | 0200); /* It's a common bigram. */ 232 else 233 fputs (bigram, stdout); /* Write the text as printable ASCII. */ 234 } 235 236 { 237 /* Swap path and oldpath and their sizes. */ 238 char *tmppath = oldpath; 239 size_t tmppathsize = oldpathsize; 240 oldpath = path; 241 oldpathsize = pathsize; 242 path = tmppath; 243 pathsize = tmppathsize; 244 } 245 } 246 247 free (path); 248 free (oldpath); 249 250 return 0; 251} 252