1/* code -- bigram- and front-encode filenames for locate
2   Copyright (C) 1994, 2000, 2003, 2004, 2005, 2007 Free Software
3   Foundation, Inc.
4
5   This program is free software: you can redistribute it and/or modify
6   it under the terms of the GNU General Public License as published by
7   the Free Software Foundation, either version 3 of the License, or
8   (at your option) any later version.
9
10   This program is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   GNU General Public License for more details.
14
15   You should have received a copy of the GNU General Public License
16   along with this program.  If not, see <http://www.gnu.org/licenses/>.
17*/
18
19/* Compress a sorted list.
20   Works with `find' to encode a filename database to save space
21   and search time.
22
23   Usage:
24
25   bigram < file_list > bigrams
26   process-bigrams > most_common_bigrams
27   code most_common_bigrams < file_list > squeezed_list
28
29   Uses `front compression' (see ";login:", March 1983, p. 8).
30   The output begins with the 128 most common bigrams.
31   After that, the output format is, for each line,
32   an offset (from the previous line) differential count byte
33   followed by a (partially bigram-encoded) ASCII remainder.
34   The output lines have no terminating byte; the start of the next line
35   is indicated by its first byte having a value <= 30.
36
37   The encoding of the output bytes is:
38
39   0-28		likeliest differential counts + offset (14) to make nonnegative
40   30		escape code for out-of-range count to follow in next halfword
41   128-255	bigram codes (the 128 most common, as determined by `updatedb')
42   32-127	single character (printable) ASCII remainder
43
44   Written by James A. Woods <jwoods@adobe.com>.
45   Modified by David MacKenzie <djm@gnu.org>.  */
46
47#include <config.h>
48#include <stdio.h>
49#include <sys/types.h>
50
51#if defined(HAVE_STRING_H) || defined(STDC_HEADERS)
52#include <string.h>
53#else
54#include <strings.h>
55#endif
56
57#ifdef STDC_HEADERS
58#include <stdlib.h>
59#endif
60
61#if ENABLE_NLS
62# include <libintl.h>
63# define _(Text) gettext (Text)
64#else
65# define _(Text) Text
66#define textdomain(Domain)
67#define bindtextdomain(Package, Directory)
68#endif
69#ifdef gettext_noop
70# define N_(String) gettext_noop (String)
71#else
72/* See locate.c for explanation as to why not use (String) */
73# define N_(String) String
74#endif
75
76#include "locatedb.h"
77#include "closeout.h"
78#include "gnulib-version.h"
79
80char *xmalloc PARAMS((size_t));
81
82/* The name this program was run with.  */
83char *program_name;
84
85/* The 128 most common bigrams in the file list, padded with NULs
86   if there are fewer.  */
87static char bigrams[257] = {0};
88
89/* Return the offset of PATTERN in STRING, or -1 if not found. */
90
91static int
92strindex (char *string, char *pattern)
93{
94  register char *s;
95
96  for (s = string; *s != '\0'; s++)
97    /* Fast first char check. */
98    if (*s == *pattern)
99      {
100	register char *p2 = pattern + 1, *s2 = s + 1;
101	while (*p2 != '\0' && *p2 == *s2)
102	  p2++, s2++;
103	if (*p2 == '\0')
104	  return s2 - strlen (pattern) - string;
105      }
106  return -1;
107}
108
109/* Return the length of the longest common prefix of strings S1 and S2. */
110
111static int
112prefix_length (char *s1, char *s2)
113{
114  register char *start;
115
116  for (start = s1; *s1 == *s2 && *s1 != '\0'; s1++, s2++)
117    ;
118  return s1 - start;
119}
120
121extern char *version_string;
122
123static void
124usage (FILE *stream)
125{
126  fprintf (stream, _("\
127Usage: %s [--version | --help]\n\
128or     %s most_common_bigrams < file-list > locate-database\n"),
129	   program_name, program_name);
130  fputs (_("\nReport bugs to <bug-findutils@gnu.org>.\n"), stream);
131}
132
133
134int
135main (int argc, char **argv)
136{
137  char *path;			/* The current input entry.  */
138  char *oldpath;		/* The previous input entry.  */
139  size_t pathsize, oldpathsize;	/* Amounts allocated for them.  */
140  int count, oldcount, diffcount; /* Their prefix lengths & the difference. */
141  char bigram[3];		/* Bigram to search for in table.  */
142  int code;			/* Index of `bigram' in bigrams table.  */
143  FILE *fp;			/* Most common bigrams file.  */
144  int line_len;			/* Length of input line.  */
145
146  program_name = argv[0];
147  atexit (close_stdout);
148
149  bigram[2] = '\0';
150
151  if (argc != 2)
152    {
153      usage(stderr);
154      return 2;
155    }
156
157  if (0 == strcmp(argv[1], "--help"))
158    {
159      usage(stdout);
160      return 0;
161    }
162  else if (0 == strcmp(argv[1], "--version"))
163    {
164      printf (_("GNU findutils version %s\n"), version_string);
165      printf (_("Built using GNU gnulib version %s\n"), gnulib_version);
166      return 0;
167    }
168
169  fp = fopen (argv[1], "r");
170  if (fp == NULL)
171    {
172      fprintf (stderr, "%s: ", argv[0]);
173      perror (argv[1]);
174      return 1;
175    }
176
177  pathsize = oldpathsize = 1026; /* Increased as necessary by getline.  */
178  path = xmalloc (pathsize);
179  oldpath = xmalloc (oldpathsize);
180
181  /* Set to empty string, to force the first prefix count to 0.  */
182  oldpath[0] = '\0';
183  oldcount = 0;
184
185  /* Copy the list of most common bigrams to the output,
186     padding with NULs if there are <128 of them.  */
187  fgets (bigrams, 257, fp);
188  fwrite (bigrams, 1, 256, stdout);
189  fclose (fp);
190
191  while ((line_len = getline (&path, &pathsize, stdin)) > 0)
192    {
193      char *pp;
194
195      path[line_len - 1] = '\0'; /* Remove newline. */
196
197      /* Squelch unprintable chars in path so as not to botch decoding.  */
198      for (pp = path; *pp != '\0'; pp++)
199	{
200	  if (!(*pp >= 040 && *pp < 0177))
201	    *pp = '?';
202	}
203
204      count = prefix_length (oldpath, path);
205      diffcount = count - oldcount;
206      oldcount = count;
207      /* If the difference is small, it fits in one byte;
208	 otherwise, two bytes plus a marker noting that fact.  */
209      if (diffcount < -LOCATEDB_OLD_OFFSET || diffcount > LOCATEDB_OLD_OFFSET)
210	{
211	  putc (LOCATEDB_OLD_ESCAPE, stdout);
212	  putw (diffcount + LOCATEDB_OLD_OFFSET, stdout);
213	}
214      else
215	putc (diffcount + LOCATEDB_OLD_OFFSET, stdout);
216
217      /* Look for bigrams in the remainder of the path.  */
218      for (pp = path + count; *pp != '\0'; pp += 2)
219	{
220	  if (pp[1] == '\0')
221	    {
222	      /* No bigram is possible; only one char is left.  */
223	      putchar (*pp);
224	      break;
225	    }
226	  bigram[0] = *pp;
227	  bigram[1] = pp[1];
228	  /* Linear search for specific bigram in string table. */
229	  code = strindex (bigrams, bigram);
230	  if (code % 2 == 0)
231	    putchar ((code / 2) | 0200); /* It's a common bigram.  */
232	  else
233	    fputs (bigram, stdout); /* Write the text as printable ASCII.  */
234	}
235
236      {
237	/* Swap path and oldpath and their sizes.  */
238	char *tmppath = oldpath;
239	size_t tmppathsize = oldpathsize;
240	oldpath = path;
241	oldpathsize = pathsize;
242	path = tmppath;
243	pathsize = tmppathsize;
244      }
245    }
246
247  free (path);
248  free (oldpath);
249
250  return 0;
251}
252