1/* tr -- a filter to translate characters
2   Copyright (C) 1991, 1995-2010 Free Software Foundation, Inc.
3
4   This program is free software: you can redistribute it and/or modify
5   it under the terms of the GNU General Public License as published by
6   the Free Software Foundation, either version 3 of the License, or
7   (at your option) any later version.
8
9   This program is distributed in the hope that it will be useful,
10   but WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12   GNU General Public License for more details.
13
14   You should have received a copy of the GNU General Public License
15   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
16
17/* Written by Jim Meyering */
18
19#include <config.h>
20
21#include <stdio.h>
22#include <assert.h>
23#include <sys/types.h>
24#include <getopt.h>
25
26#include "system.h"
27#include "error.h"
28#include "quote.h"
29#include "safe-read.h"
30#include "xfreopen.h"
31#include "xstrtol.h"
32
33/* The official name of this program (e.g., no `g' prefix).  */
34#define PROGRAM_NAME "tr"
35
36#define AUTHORS proper_name ("Jim Meyering")
37
38enum { N_CHARS = UCHAR_MAX + 1 };
39
40/* An unsigned integer type big enough to hold a repeat count or an
41   unsigned character.  POSIX requires support for repeat counts as
42   high as 2**31 - 1.  Since repeat counts might need to expand to
43   match the length of an argument string, we need at least size_t to
44   avoid arbitrary internal limits.  It doesn't cost much to use
45   uintmax_t, though.  */
46typedef uintmax_t count;
47
48/* The value for Spec_list->state that indicates to
49   get_next that it should initialize the tail pointer.
50   Its value should be as large as possible to avoid conflict
51   a valid value for the state field -- and that may be as
52   large as any valid repeat_count.  */
53#define BEGIN_STATE (UINTMAX_MAX - 1)
54
55/* The value for Spec_list->state that indicates to
56   get_next that the element pointed to by Spec_list->tail is
57   being considered for the first time on this pass through the
58   list -- it indicates that get_next should make any necessary
59   initializations.  */
60#define NEW_ELEMENT (BEGIN_STATE + 1)
61
62/* The maximum possible repeat count.  Due to how the states are
63   implemented, it can be as much as BEGIN_STATE.  */
64#define REPEAT_COUNT_MAXIMUM BEGIN_STATE
65
66/* The following (but not CC_NO_CLASS) are indices into the array of
67   valid character class strings.  */
68enum Char_class
69  {
70    CC_ALNUM = 0, CC_ALPHA = 1, CC_BLANK = 2, CC_CNTRL = 3,
71    CC_DIGIT = 4, CC_GRAPH = 5, CC_LOWER = 6, CC_PRINT = 7,
72    CC_PUNCT = 8, CC_SPACE = 9, CC_UPPER = 10, CC_XDIGIT = 11,
73    CC_NO_CLASS = 9999
74  };
75
76/* Character class to which a character (returned by get_next) belonged;
77   but it is set only if the construct from which the character was obtained
78   was one of the character classes [:upper:] or [:lower:].  The value
79   is used only when translating and then, only to make sure that upper
80   and lower class constructs have the same relative positions in string1
81   and string2.  */
82enum Upper_Lower_class
83  {
84    UL_LOWER,
85    UL_UPPER,
86    UL_NONE
87  };
88
89/* The type of a List_element.  See build_spec_list for more details.  */
90enum Range_element_type
91  {
92    RE_NORMAL_CHAR,
93    RE_RANGE,
94    RE_CHAR_CLASS,
95    RE_EQUIV_CLASS,
96    RE_REPEATED_CHAR
97  };
98
99/* One construct in one of tr's argument strings.
100   For example, consider the POSIX version of the classic tr command:
101       tr -cs 'a-zA-Z_' '[\n*]'
102   String1 has 3 constructs, two of which are ranges (a-z and A-Z),
103   and a single normal character, `_'.  String2 has one construct.  */
104struct List_element
105  {
106    enum Range_element_type type;
107    struct List_element *next;
108    union
109      {
110        unsigned char normal_char;
111        struct			/* unnamed */
112          {
113            unsigned char first_char;
114            unsigned char last_char;
115          }
116        range;
117        enum Char_class char_class;
118        unsigned char equiv_code;
119        struct			/* unnamed */
120          {
121            unsigned char the_repeated_char;
122            count repeat_count;
123          }
124        repeated_char;
125      }
126    u;
127  };
128
129/* Each of tr's argument strings is parsed into a form that is easier
130   to work with: a linked list of constructs (struct List_element).
131   Each Spec_list structure also encapsulates various attributes of
132   the corresponding argument string.  The attributes are used mainly
133   to verify that the strings are valid in the context of any options
134   specified (like -s, -d, or -c).  The main exception is the member
135   `tail', which is first used to construct the list.  After construction,
136   it is used by get_next to save its state when traversing the list.
137   The member `state' serves a similar function.  */
138struct Spec_list
139  {
140    /* Points to the head of the list of range elements.
141       The first struct is a dummy; its members are never used.  */
142    struct List_element *head;
143
144    /* When appending, points to the last element.  When traversing via
145       get_next(), points to the element to process next.  Setting
146       Spec_list.state to the value BEGIN_STATE before calling get_next
147       signals get_next to initialize tail to point to head->next.  */
148    struct List_element *tail;
149
150    /* Used to save state between calls to get_next.  */
151    count state;
152
153    /* Length, in the sense that length ('a-z[:digit:]123abc')
154       is 42 ( = 26 + 10 + 6).  */
155    count length;
156
157    /* The number of [c*] and [c*0] constructs that appear in this spec.  */
158    size_t n_indefinite_repeats;
159
160    /* If n_indefinite_repeats is nonzero, this points to the List_element
161       corresponding to the last [c*] or [c*0] construct encountered in
162       this spec.  Otherwise it is undefined.  */
163    struct List_element *indefinite_repeat_element;
164
165    /* True if this spec contains at least one equivalence
166       class construct e.g. [=c=].  */
167    bool has_equiv_class;
168
169    /* True if this spec contains at least one character class
170       construct.  E.g. [:digit:].  */
171    bool has_char_class;
172
173    /* True if this spec contains at least one of the character class
174       constructs (all but upper and lower) that aren't allowed in s2.  */
175    bool has_restricted_char_class;
176  };
177
178/* A representation for escaped string1 or string2.  As a string is parsed,
179   any backslash-escaped characters (other than octal or \a, \b, \f, \n,
180   etc.) are marked as such in this structure by setting the corresponding
181   entry in the ESCAPED vector.  */
182struct E_string
183{
184  char *s;
185  bool *escaped;
186  size_t len;
187};
188
189/* Return nonzero if the Ith character of escaped string ES matches C
190   and is not escaped itself.  */
191static inline bool
192es_match (struct E_string const *es, size_t i, char c)
193{
194  return es->s[i] == c && !es->escaped[i];
195}
196
197/* When true, each sequence in the input of a repeated character
198   (call it c) is replaced (in the output) by a single occurrence of c
199   for every c in the squeeze set.  */
200static bool squeeze_repeats = false;
201
202/* When true, removes characters in the delete set from input.  */
203static bool delete = false;
204
205/* Use the complement of set1 in place of set1.  */
206static bool complement = false;
207
208/* When tr is performing translation and string1 is longer than string2,
209   POSIX says that the result is unspecified.  That gives the implementor
210   of a POSIX conforming version of tr two reasonable choices for the
211   semantics of this case.
212
213   * The BSD tr pads string2 to the length of string1 by
214   repeating the last character in string2.
215
216   * System V tr ignores characters in string1 that have no
217   corresponding character in string2.  That is, string1 is effectively
218   truncated to the length of string2.
219
220   When nonzero, this flag causes GNU tr to imitate the behavior
221   of System V tr when translating with string1 longer than string2.
222   The default is to emulate BSD tr.  This flag is ignored in modes where
223   no translation is performed.  Emulating the System V tr
224   in this exceptional case causes the relatively common BSD idiom:
225
226       tr -cs A-Za-z0-9 '\012'
227
228   to break (it would convert only zero bytes, rather than all
229   non-alphanumerics, to newlines).
230
231   WARNING: This switch does not provide general BSD or System V
232   compatibility.  For example, it doesn't disable the interpretation
233   of the POSIX constructs [:alpha:], [=c=], and [c*10], so if by
234   some unfortunate coincidence you use such constructs in scripts
235   expecting to use some other version of tr, the scripts will break.  */
236static bool truncate_set1 = false;
237
238/* An alias for (!delete && non_option_args == 2).
239   It is set in main and used there and in validate().  */
240static bool translating;
241
242static char io_buf[BUFSIZ];
243
244static char const *const char_class_name[] =
245{
246  "alnum", "alpha", "blank", "cntrl", "digit", "graph",
247  "lower", "print", "punct", "space", "upper", "xdigit"
248};
249
250/* Array of boolean values.  A character `c' is a member of the
251   squeeze set if and only if in_squeeze_set[c] is true.  The squeeze
252   set is defined by the last (possibly, the only) string argument
253   on the command line when the squeeze option is given.  */
254static bool in_squeeze_set[N_CHARS];
255
256/* Array of boolean values.  A character `c' is a member of the
257   delete set if and only if in_delete_set[c] is true.  The delete
258   set is defined by the first (or only) string argument on the
259   command line when the delete option is given.  */
260static bool in_delete_set[N_CHARS];
261
262/* Array of character values defining the translation (if any) that
263   tr is to perform.  Translation is performed only when there are
264   two specification strings and the delete switch is not given.  */
265static char xlate[N_CHARS];
266
267static struct option const long_options[] =
268{
269  {"complement", no_argument, NULL, 'c'},
270  {"delete", no_argument, NULL, 'd'},
271  {"squeeze-repeats", no_argument, NULL, 's'},
272  {"truncate-set1", no_argument, NULL, 't'},
273  {GETOPT_HELP_OPTION_DECL},
274  {GETOPT_VERSION_OPTION_DECL},
275  {NULL, 0, NULL, 0}
276};
277
278void
279usage (int status)
280{
281  if (status != EXIT_SUCCESS)
282    fprintf (stderr, _("Try `%s --help' for more information.\n"),
283             program_name);
284  else
285    {
286      printf (_("\
287Usage: %s [OPTION]... SET1 [SET2]\n\
288"),
289              program_name);
290      fputs (_("\
291Translate, squeeze, and/or delete characters from standard input,\n\
292writing to standard output.\n\
293\n\
294  -c, -C, --complement    use the complement of SET1\n\
295  -d, --delete            delete characters in SET1, do not translate\n\
296  -s, --squeeze-repeats   replace each input sequence of a repeated character\n\
297                            that is listed in SET1 with a single occurrence\n\
298                            of that character\n\
299  -t, --truncate-set1     first truncate SET1 to length of SET2\n\
300"), stdout);
301      fputs (HELP_OPTION_DESCRIPTION, stdout);
302      fputs (VERSION_OPTION_DESCRIPTION, stdout);
303      fputs (_("\
304\n\
305SETs are specified as strings of characters.  Most represent themselves.\n\
306Interpreted sequences are:\n\
307\n\
308  \\NNN            character with octal value NNN (1 to 3 octal digits)\n\
309  \\\\              backslash\n\
310  \\a              audible BEL\n\
311  \\b              backspace\n\
312  \\f              form feed\n\
313  \\n              new line\n\
314  \\r              return\n\
315  \\t              horizontal tab\n\
316"), stdout);
317     fputs (_("\
318  \\v              vertical tab\n\
319  CHAR1-CHAR2     all characters from CHAR1 to CHAR2 in ascending order\n\
320  [CHAR*]         in SET2, copies of CHAR until length of SET1\n\
321  [CHAR*REPEAT]   REPEAT copies of CHAR, REPEAT octal if starting with 0\n\
322  [:alnum:]       all letters and digits\n\
323  [:alpha:]       all letters\n\
324  [:blank:]       all horizontal whitespace\n\
325  [:cntrl:]       all control characters\n\
326  [:digit:]       all digits\n\
327"), stdout);
328     fputs (_("\
329  [:graph:]       all printable characters, not including space\n\
330  [:lower:]       all lower case letters\n\
331  [:print:]       all printable characters, including space\n\
332  [:punct:]       all punctuation characters\n\
333  [:space:]       all horizontal or vertical whitespace\n\
334  [:upper:]       all upper case letters\n\
335  [:xdigit:]      all hexadecimal digits\n\
336  [=CHAR=]        all characters which are equivalent to CHAR\n\
337"), stdout);
338     fputs (_("\
339\n\
340Translation occurs if -d is not given and both SET1 and SET2 appear.\n\
341-t may be used only when translating.  SET2 is extended to length of\n\
342SET1 by repeating its last character as necessary.  Excess characters\n\
343of SET2 are ignored.  Only [:lower:] and [:upper:] are guaranteed to\n\
344expand in ascending order; used in SET2 while translating, they may\n\
345only be used in pairs to specify case conversion.  -s uses SET1 if not\n\
346translating nor deleting; else squeezing uses SET2 and occurs after\n\
347translation or deletion.\n\
348"), stdout);
349      emit_ancillary_info ();
350    }
351  exit (status);
352}
353
354/* Return nonzero if the character C is a member of the
355   equivalence class containing the character EQUIV_CLASS.  */
356
357static inline bool
358is_equiv_class_member (unsigned char equiv_class, unsigned char c)
359{
360  return (equiv_class == c);
361}
362
363/* Return true if the character C is a member of the
364   character class CHAR_CLASS.  */
365
366static bool
367is_char_class_member (enum Char_class char_class, unsigned char c)
368{
369  int result;
370
371  switch (char_class)
372    {
373    case CC_ALNUM:
374      result = isalnum (c);
375      break;
376    case CC_ALPHA:
377      result = isalpha (c);
378      break;
379    case CC_BLANK:
380      result = isblank (c);
381      break;
382    case CC_CNTRL:
383      result = iscntrl (c);
384      break;
385    case CC_DIGIT:
386      result = isdigit (c);
387      break;
388    case CC_GRAPH:
389      result = isgraph (c);
390      break;
391    case CC_LOWER:
392      result = islower (c);
393      break;
394    case CC_PRINT:
395      result = isprint (c);
396      break;
397    case CC_PUNCT:
398      result = ispunct (c);
399      break;
400    case CC_SPACE:
401      result = isspace (c);
402      break;
403    case CC_UPPER:
404      result = isupper (c);
405      break;
406    case CC_XDIGIT:
407      result = isxdigit (c);
408      break;
409    default:
410      abort ();
411      break;
412    }
413
414  return !! result;
415}
416
417static void
418es_free (struct E_string *es)
419{
420  free (es->s);
421  free (es->escaped);
422}
423
424/* Perform the first pass over each range-spec argument S, converting all
425   \c and \ddd escapes to their one-byte representations.  If an invalid
426   quote sequence is found print an error message and return false;
427   Otherwise set *ES to the resulting string and return true.
428   The resulting array of characters may contain zero-bytes;
429   however, on input, S is assumed to be null-terminated, and hence
430   cannot contain actual (non-escaped) zero bytes.  */
431
432static bool
433unquote (char const *s, struct E_string *es)
434{
435  size_t i, j;
436  size_t len = strlen (s);
437
438  es->s = xmalloc (len);
439  es->escaped = xcalloc (len, sizeof es->escaped[0]);
440
441  j = 0;
442  for (i = 0; s[i]; i++)
443    {
444      unsigned char c;
445      int oct_digit;
446
447      switch (s[i])
448        {
449        case '\\':
450          es->escaped[j] = true;
451          switch (s[i + 1])
452            {
453            case '\\':
454              c = '\\';
455              break;
456            case 'a':
457              c = '\a';
458              break;
459            case 'b':
460              c = '\b';
461              break;
462            case 'f':
463              c = '\f';
464              break;
465            case 'n':
466              c = '\n';
467              break;
468            case 'r':
469              c = '\r';
470              break;
471            case 't':
472              c = '\t';
473              break;
474            case 'v':
475              c = '\v';
476              break;
477            case '0':
478            case '1':
479            case '2':
480            case '3':
481            case '4':
482            case '5':
483            case '6':
484            case '7':
485              c = s[i + 1] - '0';
486              oct_digit = s[i + 2] - '0';
487              if (0 <= oct_digit && oct_digit <= 7)
488                {
489                  c = 8 * c + oct_digit;
490                  ++i;
491                  oct_digit = s[i + 2] - '0';
492                  if (0 <= oct_digit && oct_digit <= 7)
493                    {
494                      if (8 * c + oct_digit < N_CHARS)
495                        {
496                          c = 8 * c + oct_digit;
497                          ++i;
498                        }
499                      else
500                        {
501                          /* A 3-digit octal number larger than \377 won't
502                             fit in 8 bits.  So we stop when adding the
503                             next digit would put us over the limit and
504                             give a warning about the ambiguity.  POSIX
505                             isn't clear on this, and we interpret this
506                             lack of clarity as meaning the resulting behavior
507                             is undefined, which means we're allowed to issue
508                             a warning.  */
509                          error (0, 0, _("warning: the ambiguous octal escape \
510\\%c%c%c is being\n\tinterpreted as the 2-byte sequence \\0%c%c, %c"),
511                                 s[i], s[i + 1], s[i + 2],
512                                 s[i], s[i + 1], s[i + 2]);
513                        }
514                    }
515                }
516              break;
517            case '\0':
518              error (0, 0, _("warning: an unescaped backslash "
519                             "at end of string is not portable"));
520              /* POSIX is not clear about this.  */
521              es->escaped[j] = false;
522              i--;
523              c = '\\';
524              break;
525            default:
526              c = s[i + 1];
527              break;
528            }
529          ++i;
530          es->s[j++] = c;
531          break;
532        default:
533          es->s[j++] = s[i];
534          break;
535        }
536    }
537  es->len = j;
538  return true;
539}
540
541/* If CLASS_STR is a valid character class string, return its index
542   in the global char_class_name array.  Otherwise, return CC_NO_CLASS.  */
543
544static enum Char_class
545look_up_char_class (char const *class_str, size_t len)
546{
547  enum Char_class i;
548
549  for (i = 0; i < ARRAY_CARDINALITY (char_class_name); i++)
550    if (strncmp (class_str, char_class_name[i], len) == 0
551        && strlen (char_class_name[i]) == len)
552      return i;
553  return CC_NO_CLASS;
554}
555
556/* Return a newly allocated string with a printable version of C.
557   This function is used solely for formatting error messages.  */
558
559static char *
560make_printable_char (unsigned char c)
561{
562  char *buf = xmalloc (5);
563
564  if (isprint (c))
565    {
566      buf[0] = c;
567      buf[1] = '\0';
568    }
569  else
570    {
571      sprintf (buf, "\\%03o", c);
572    }
573  return buf;
574}
575
576/* Return a newly allocated copy of S which is suitable for printing.
577   LEN is the number of characters in S.  Most non-printing
578   (isprint) characters are represented by a backslash followed by
579   3 octal digits.  However, the characters represented by \c escapes
580   where c is one of [abfnrtv] are represented by their 2-character \c
581   sequences.  This function is used solely for printing error messages.  */
582
583static char *
584make_printable_str (char const *s, size_t len)
585{
586  /* Worst case is that every character expands to a backslash
587     followed by a 3-character octal escape sequence.  */
588  char *printable_buf = xnmalloc (len + 1, 4);
589  char *p = printable_buf;
590  size_t i;
591
592  for (i = 0; i < len; i++)
593    {
594      char buf[5];
595      char const *tmp = NULL;
596      unsigned char c = s[i];
597
598      switch (c)
599        {
600        case '\\':
601          tmp = "\\";
602          break;
603        case '\a':
604          tmp = "\\a";
605          break;
606        case '\b':
607          tmp = "\\b";
608          break;
609        case '\f':
610          tmp = "\\f";
611          break;
612        case '\n':
613          tmp = "\\n";
614          break;
615        case '\r':
616          tmp = "\\r";
617          break;
618        case '\t':
619          tmp = "\\t";
620          break;
621        case '\v':
622          tmp = "\\v";
623          break;
624        default:
625          if (isprint (c))
626            {
627              buf[0] = c;
628              buf[1] = '\0';
629            }
630          else
631            sprintf (buf, "\\%03o", c);
632          tmp = buf;
633          break;
634        }
635      p = stpcpy (p, tmp);
636    }
637  return printable_buf;
638}
639
640/* Append a newly allocated structure representing a
641   character C to the specification list LIST.  */
642
643static void
644append_normal_char (struct Spec_list *list, unsigned char c)
645{
646  struct List_element *new;
647
648  new = xmalloc (sizeof *new);
649  new->next = NULL;
650  new->type = RE_NORMAL_CHAR;
651  new->u.normal_char = c;
652  assert (list->tail);
653  list->tail->next = new;
654  list->tail = new;
655}
656
657/* Append a newly allocated structure representing the range
658   of characters from FIRST to LAST to the specification list LIST.
659   Return false if LAST precedes FIRST in the collating sequence,
660   true otherwise.  This means that '[c-c]' is acceptable.  */
661
662static bool
663append_range (struct Spec_list *list, unsigned char first, unsigned char last)
664{
665  struct List_element *new;
666
667  if (last < first)
668    {
669      char *tmp1 = make_printable_char (first);
670      char *tmp2 = make_printable_char (last);
671
672      error (0, 0,
673       _("range-endpoints of `%s-%s' are in reverse collating sequence order"),
674             tmp1, tmp2);
675      free (tmp1);
676      free (tmp2);
677      return false;
678    }
679  new = xmalloc (sizeof *new);
680  new->next = NULL;
681  new->type = RE_RANGE;
682  new->u.range.first_char = first;
683  new->u.range.last_char = last;
684  assert (list->tail);
685  list->tail->next = new;
686  list->tail = new;
687  return true;
688}
689
690/* If CHAR_CLASS_STR is a valid character class string, append a
691   newly allocated structure representing that character class to the end
692   of the specification list LIST and return true.  If CHAR_CLASS_STR is not
693   a valid string return false.  */
694
695static bool
696append_char_class (struct Spec_list *list,
697                   char const *char_class_str, size_t len)
698{
699  enum Char_class char_class;
700  struct List_element *new;
701
702  char_class = look_up_char_class (char_class_str, len);
703  if (char_class == CC_NO_CLASS)
704    return false;
705  new = xmalloc (sizeof *new);
706  new->next = NULL;
707  new->type = RE_CHAR_CLASS;
708  new->u.char_class = char_class;
709  assert (list->tail);
710  list->tail->next = new;
711  list->tail = new;
712  return true;
713}
714
715/* Append a newly allocated structure representing a [c*n]
716   repeated character construct to the specification list LIST.
717   THE_CHAR is the single character to be repeated, and REPEAT_COUNT
718   is a non-negative repeat count.  */
719
720static void
721append_repeated_char (struct Spec_list *list, unsigned char the_char,
722                      count repeat_count)
723{
724  struct List_element *new;
725
726  new = xmalloc (sizeof *new);
727  new->next = NULL;
728  new->type = RE_REPEATED_CHAR;
729  new->u.repeated_char.the_repeated_char = the_char;
730  new->u.repeated_char.repeat_count = repeat_count;
731  assert (list->tail);
732  list->tail->next = new;
733  list->tail = new;
734}
735
736/* Given a string, EQUIV_CLASS_STR, from a [=str=] context and
737   the length of that string, LEN, if LEN is exactly one, append
738   a newly allocated structure representing the specified
739   equivalence class to the specification list, LIST and return true.
740   If LEN is not 1, return false.  */
741
742static bool
743append_equiv_class (struct Spec_list *list,
744                    char const *equiv_class_str, size_t len)
745{
746  struct List_element *new;
747
748  if (len != 1)
749    return false;
750  new = xmalloc (sizeof *new);
751  new->next = NULL;
752  new->type = RE_EQUIV_CLASS;
753  new->u.equiv_code = *equiv_class_str;
754  assert (list->tail);
755  list->tail->next = new;
756  list->tail = new;
757  return true;
758}
759
760/* Search forward starting at START_IDX for the 2-char sequence
761   (PRE_BRACKET_CHAR,']') in the string P of length P_LEN.  If such
762   a sequence is found, set *RESULT_IDX to the index of the first
763   character and return true.  Otherwise return false.  P may contain
764   zero bytes.  */
765
766static bool
767find_closing_delim (const struct E_string *es, size_t start_idx,
768                    char pre_bracket_char, size_t *result_idx)
769{
770  size_t i;
771
772  for (i = start_idx; i < es->len - 1; i++)
773    if (es->s[i] == pre_bracket_char && es->s[i + 1] == ']'
774        && !es->escaped[i] && !es->escaped[i + 1])
775      {
776        *result_idx = i;
777        return true;
778      }
779  return false;
780}
781
782/* Parse the bracketed repeat-char syntax.  If the P_LEN characters
783   beginning with P[ START_IDX ] comprise a valid [c*n] construct,
784   then set *CHAR_TO_REPEAT, *REPEAT_COUNT, and *CLOSING_BRACKET_IDX
785   and return zero. If the second character following
786   the opening bracket is not `*' or if no closing bracket can be
787   found, return -1.  If a closing bracket is found and the
788   second char is `*', but the string between the `*' and `]' isn't
789   empty, an octal number, or a decimal number, print an error message
790   and return -2.  */
791
792static int
793find_bracketed_repeat (const struct E_string *es, size_t start_idx,
794                       unsigned char *char_to_repeat, count *repeat_count,
795                       size_t *closing_bracket_idx)
796{
797  size_t i;
798
799  assert (start_idx + 1 < es->len);
800  if (!es_match (es, start_idx + 1, '*'))
801    return -1;
802
803  for (i = start_idx + 2; i < es->len && !es->escaped[i]; i++)
804    {
805      if (es->s[i] == ']')
806        {
807          size_t digit_str_len = i - start_idx - 2;
808
809          *char_to_repeat = es->s[start_idx];
810          if (digit_str_len == 0)
811            {
812              /* We've matched [c*] -- no explicit repeat count.  */
813              *repeat_count = 0;
814            }
815          else
816            {
817              /* Here, we have found [c*s] where s should be a string
818                 of octal (if it starts with `0') or decimal digits.  */
819              char const *digit_str = &es->s[start_idx + 2];
820              char *d_end;
821              if ((xstrtoumax (digit_str, &d_end, *digit_str == '0' ? 8 : 10,
822                               repeat_count, NULL)
823                   != LONGINT_OK)
824                  || REPEAT_COUNT_MAXIMUM < *repeat_count
825                  || digit_str + digit_str_len != d_end)
826                {
827                  char *tmp = make_printable_str (digit_str, digit_str_len);
828                  error (0, 0,
829                         _("invalid repeat count %s in [c*n] construct"),
830                         quote (tmp));
831                  free (tmp);
832                  return -2;
833                }
834            }
835          *closing_bracket_idx = i;
836          return 0;
837        }
838    }
839  return -1;			/* No bracket found.  */
840}
841
842/* Return true if the string at ES->s[IDX] matches the regular
843   expression `\*[0-9]*\]', false otherwise.  The string does not
844   match if any of its characters are escaped.  */
845
846static bool
847star_digits_closebracket (const struct E_string *es, size_t idx)
848{
849  size_t i;
850
851  if (!es_match (es, idx, '*'))
852    return false;
853
854  for (i = idx + 1; i < es->len; i++)
855    if (!ISDIGIT (to_uchar (es->s[i])) || es->escaped[i])
856      return es_match (es, i, ']');
857  return false;
858}
859
860/* Convert string UNESCAPED_STRING (which has been preprocessed to
861   convert backslash-escape sequences) of length LEN characters into
862   a linked list of the following 5 types of constructs:
863      - [:str:] Character class where `str' is one of the 12 valid strings.
864      - [=c=] Equivalence class where `c' is any single character.
865      - [c*n] Repeat the single character `c' `n' times. n may be omitted.
866          However, if `n' is present, it must be a non-negative octal or
867          decimal integer.
868      - r-s Range of characters from `r' to `s'.  The second endpoint must
869          not precede the first in the current collating sequence.
870      - c Any other character is interpreted as itself.  */
871
872static bool
873build_spec_list (const struct E_string *es, struct Spec_list *result)
874{
875  char const *p;
876  size_t i;
877
878  p = es->s;
879
880  /* The main for-loop below recognizes the 4 multi-character constructs.
881     A character that matches (in its context) none of the multi-character
882     constructs is classified as `normal'.  Since all multi-character
883     constructs have at least 3 characters, any strings of length 2 or
884     less are composed solely of normal characters.  Hence, the index of
885     the outer for-loop runs only as far as LEN-2.  */
886
887  for (i = 0; i + 2 < es->len; /* empty */)
888    {
889      if (es_match (es, i, '['))
890        {
891          bool matched_multi_char_construct;
892          size_t closing_bracket_idx;
893          unsigned char char_to_repeat;
894          count repeat_count;
895          int err;
896
897          matched_multi_char_construct = true;
898          if (es_match (es, i + 1, ':') || es_match (es, i + 1, '='))
899            {
900              size_t closing_delim_idx;
901
902              if (find_closing_delim (es, i + 2, p[i + 1], &closing_delim_idx))
903                {
904                  size_t opnd_str_len = closing_delim_idx - 1 - (i + 2) + 1;
905                  char const *opnd_str = p + i + 2;
906
907                  if (opnd_str_len == 0)
908                    {
909                      if (p[i + 1] == ':')
910                        error (0, 0, _("missing character class name `[::]'"));
911                      else
912                        error (0, 0,
913                               _("missing equivalence class character `[==]'"));
914                      return false;
915                    }
916
917                  if (p[i + 1] == ':')
918                    {
919                      /* FIXME: big comment.  */
920                      if (!append_char_class (result, opnd_str, opnd_str_len))
921                        {
922                          if (star_digits_closebracket (es, i + 2))
923                            goto try_bracketed_repeat;
924                          else
925                            {
926                              char *tmp = make_printable_str (opnd_str,
927                                                              opnd_str_len);
928                              error (0, 0, _("invalid character class %s"),
929                                     quote (tmp));
930                              free (tmp);
931                              return false;
932                            }
933                        }
934                    }
935                  else
936                    {
937                      /* FIXME: big comment.  */
938                      if (!append_equiv_class (result, opnd_str, opnd_str_len))
939                        {
940                          if (star_digits_closebracket (es, i + 2))
941                            goto try_bracketed_repeat;
942                          else
943                            {
944                              char *tmp = make_printable_str (opnd_str,
945                                                              opnd_str_len);
946                              error (0, 0,
947               _("%s: equivalence class operand must be a single character"),
948                                     tmp);
949                              free (tmp);
950                              return false;
951                            }
952                        }
953                    }
954
955                  i = closing_delim_idx + 2;
956                  continue;
957                }
958              /* Else fall through.  This could be [:*] or [=*].  */
959            }
960
961        try_bracketed_repeat:
962
963          /* Determine whether this is a bracketed repeat range
964             matching the RE \[.\*(dec_or_oct_number)?\].  */
965          err = find_bracketed_repeat (es, i + 1, &char_to_repeat,
966                                       &repeat_count,
967                                       &closing_bracket_idx);
968          if (err == 0)
969            {
970              append_repeated_char (result, char_to_repeat, repeat_count);
971              i = closing_bracket_idx + 1;
972            }
973          else if (err == -1)
974            {
975              matched_multi_char_construct = false;
976            }
977          else
978            {
979              /* Found a string that looked like [c*n] but the
980                 numeric part was invalid.  */
981              return false;
982            }
983
984          if (matched_multi_char_construct)
985            continue;
986
987          /* We reach this point if P does not match [:str:], [=c=],
988             [c*n], or [c*].  Now, see if P looks like a range `[-c'
989             (from `[' to `c').  */
990        }
991
992      /* Look ahead one char for ranges like a-z.  */
993      if (es_match (es, i + 1, '-'))
994        {
995          if (!append_range (result, p[i], p[i + 2]))
996            return false;
997          i += 3;
998        }
999      else
1000        {
1001          append_normal_char (result, p[i]);
1002          ++i;
1003        }
1004    }
1005
1006  /* Now handle the (2 or fewer) remaining characters p[i]..p[es->len - 1].  */
1007  for (; i < es->len; i++)
1008    append_normal_char (result, p[i]);
1009
1010  return true;
1011}
1012
1013/* Advance past the current construct.
1014   S->tail must be non-NULL.  */
1015static void
1016skip_construct (struct Spec_list *s)
1017{
1018  s->tail = s->tail->next;
1019  s->state = NEW_ELEMENT;
1020}
1021
1022/* Given a Spec_list S (with its saved state implicit in the values
1023   of its members `tail' and `state'), return the next single character
1024   in the expansion of S's constructs.  If the last character of S was
1025   returned on the previous call or if S was empty, this function
1026   returns -1.  For example, successive calls to get_next where S
1027   represents the spec-string 'a-d[y*3]' will return the sequence
1028   of values a, b, c, d, y, y, y, -1.  Finally, if the construct from
1029   which the returned character comes is [:upper:] or [:lower:], the
1030   parameter CLASS is given a value to indicate which it was.  Otherwise
1031   CLASS is set to UL_NONE.  This value is used only when constructing
1032   the translation table to verify that any occurrences of upper and
1033   lower class constructs in the spec-strings appear in the same relative
1034   positions.  */
1035
1036static int
1037get_next (struct Spec_list *s, enum Upper_Lower_class *class)
1038{
1039  struct List_element *p;
1040  int return_val;
1041  int i;
1042
1043  if (class)
1044    *class = UL_NONE;
1045
1046  if (s->state == BEGIN_STATE)
1047    {
1048      s->tail = s->head->next;
1049      s->state = NEW_ELEMENT;
1050    }
1051
1052  p = s->tail;
1053  if (p == NULL)
1054    return -1;
1055
1056  switch (p->type)
1057    {
1058    case RE_NORMAL_CHAR:
1059      return_val = p->u.normal_char;
1060      s->state = NEW_ELEMENT;
1061      s->tail = p->next;
1062      break;
1063
1064    case RE_RANGE:
1065      if (s->state == NEW_ELEMENT)
1066        s->state = p->u.range.first_char;
1067      else
1068        ++(s->state);
1069      return_val = s->state;
1070      if (s->state == p->u.range.last_char)
1071        {
1072          s->tail = p->next;
1073          s->state = NEW_ELEMENT;
1074        }
1075      break;
1076
1077    case RE_CHAR_CLASS:
1078      if (class)
1079        {
1080          switch (p->u.char_class)
1081            {
1082            case CC_LOWER:
1083              *class = UL_LOWER;
1084              break;
1085            case CC_UPPER:
1086              *class = UL_UPPER;
1087              break;
1088            default:
1089              break;
1090            }
1091        }
1092
1093      if (s->state == NEW_ELEMENT)
1094        {
1095          for (i = 0; i < N_CHARS; i++)
1096            if (is_char_class_member (p->u.char_class, i))
1097              break;
1098          assert (i < N_CHARS);
1099          s->state = i;
1100        }
1101      assert (is_char_class_member (p->u.char_class, s->state));
1102      return_val = s->state;
1103      for (i = s->state + 1; i < N_CHARS; i++)
1104        if (is_char_class_member (p->u.char_class, i))
1105          break;
1106      if (i < N_CHARS)
1107        s->state = i;
1108      else
1109        {
1110          s->tail = p->next;
1111          s->state = NEW_ELEMENT;
1112        }
1113      break;
1114
1115    case RE_EQUIV_CLASS:
1116      /* FIXME: this assumes that each character is alone in its own
1117         equivalence class (which appears to be correct for my
1118         LC_COLLATE.  But I don't know of any function that allows
1119         one to determine a character's equivalence class.  */
1120
1121      return_val = p->u.equiv_code;
1122      s->state = NEW_ELEMENT;
1123      s->tail = p->next;
1124      break;
1125
1126    case RE_REPEATED_CHAR:
1127      /* Here, a repeat count of n == 0 means don't repeat at all.  */
1128      if (p->u.repeated_char.repeat_count == 0)
1129        {
1130          s->tail = p->next;
1131          s->state = NEW_ELEMENT;
1132          return_val = get_next (s, class);
1133        }
1134      else
1135        {
1136          if (s->state == NEW_ELEMENT)
1137            {
1138              s->state = 0;
1139            }
1140          ++(s->state);
1141          return_val = p->u.repeated_char.the_repeated_char;
1142          if (s->state == p->u.repeated_char.repeat_count)
1143            {
1144              s->tail = p->next;
1145              s->state = NEW_ELEMENT;
1146            }
1147        }
1148      break;
1149
1150    default:
1151      abort ();
1152      break;
1153    }
1154
1155  return return_val;
1156}
1157
1158/* This is a minor kludge.  This function is called from
1159   get_spec_stats to determine the cardinality of a set derived
1160   from a complemented string.  It's a kludge in that some of the
1161   same operations are (duplicated) performed in set_initialize.  */
1162
1163static int
1164card_of_complement (struct Spec_list *s)
1165{
1166  int c;
1167  int cardinality = N_CHARS;
1168  bool in_set[N_CHARS] = { 0, };
1169
1170  s->state = BEGIN_STATE;
1171  while ((c = get_next (s, NULL)) != -1)
1172    {
1173      cardinality -= (!in_set[c]);
1174      in_set[c] = true;
1175    }
1176  return cardinality;
1177}
1178
1179/* Gather statistics about the spec-list S in preparation for the tests
1180   in validate that determine the consistency of the specs.  This function
1181   is called at most twice; once for string1, and again for any string2.
1182   LEN_S1 < 0 indicates that this is the first call and that S represents
1183   string1.  When LEN_S1 >= 0, it is the length of the expansion of the
1184   constructs in string1, and we can use its value to resolve any
1185   indefinite repeat construct in S (which represents string2).  Hence,
1186   this function has the side-effect that it converts a valid [c*]
1187   construct in string2 to [c*n] where n is large enough (or 0) to give
1188   string2 the same length as string1.  For example, with the command
1189   tr a-z 'A[\n*]Z' on the second call to get_spec_stats, LEN_S1 would
1190   be 26 and S (representing string2) would be converted to 'A[\n*24]Z'.  */
1191
1192static void
1193get_spec_stats (struct Spec_list *s)
1194{
1195  struct List_element *p;
1196  count length = 0;
1197
1198  s->n_indefinite_repeats = 0;
1199  s->has_equiv_class = false;
1200  s->has_restricted_char_class = false;
1201  s->has_char_class = false;
1202  for (p = s->head->next; p; p = p->next)
1203    {
1204      int i;
1205      count len = 0;
1206      count new_length;
1207
1208      switch (p->type)
1209        {
1210        case RE_NORMAL_CHAR:
1211          len = 1;
1212          break;
1213
1214        case RE_RANGE:
1215          assert (p->u.range.last_char >= p->u.range.first_char);
1216          len = p->u.range.last_char - p->u.range.first_char + 1;
1217          break;
1218
1219        case RE_CHAR_CLASS:
1220          s->has_char_class = true;
1221          for (i = 0; i < N_CHARS; i++)
1222            if (is_char_class_member (p->u.char_class, i))
1223              ++len;
1224          switch (p->u.char_class)
1225            {
1226            case CC_UPPER:
1227            case CC_LOWER:
1228              break;
1229            default:
1230              s->has_restricted_char_class = true;
1231              break;
1232            }
1233          break;
1234
1235        case RE_EQUIV_CLASS:
1236          for (i = 0; i < N_CHARS; i++)
1237            if (is_equiv_class_member (p->u.equiv_code, i))
1238              ++len;
1239          s->has_equiv_class = true;
1240          break;
1241
1242        case RE_REPEATED_CHAR:
1243          if (p->u.repeated_char.repeat_count > 0)
1244            len = p->u.repeated_char.repeat_count;
1245          else
1246            {
1247              s->indefinite_repeat_element = p;
1248              ++(s->n_indefinite_repeats);
1249            }
1250          break;
1251
1252        default:
1253          abort ();
1254          break;
1255        }
1256
1257      /* Check for arithmetic overflow in computing length.  Also, reject
1258         any length greater than the maximum repeat count, in case the
1259         length is later used to compute the repeat count for an
1260         indefinite element.  */
1261      new_length = length + len;
1262      if (! (length <= new_length && new_length <= REPEAT_COUNT_MAXIMUM))
1263        error (EXIT_FAILURE, 0, _("too many characters in set"));
1264      length = new_length;
1265    }
1266
1267  s->length = length;
1268}
1269
1270static void
1271get_s1_spec_stats (struct Spec_list *s1)
1272{
1273  get_spec_stats (s1);
1274  if (complement)
1275    s1->length = card_of_complement (s1);
1276}
1277
1278static void
1279get_s2_spec_stats (struct Spec_list *s2, count len_s1)
1280{
1281  get_spec_stats (s2);
1282  if (len_s1 >= s2->length && s2->n_indefinite_repeats == 1)
1283    {
1284      s2->indefinite_repeat_element->u.repeated_char.repeat_count =
1285        len_s1 - s2->length;
1286      s2->length = len_s1;
1287    }
1288}
1289
1290static void
1291spec_init (struct Spec_list *spec_list)
1292{
1293  struct List_element *new = xmalloc (sizeof *new);
1294  spec_list->head = spec_list->tail = new;
1295  spec_list->head->next = NULL;
1296}
1297
1298/* This function makes two passes over the argument string S.  The first
1299   one converts all \c and \ddd escapes to their one-byte representations.
1300   The second constructs a linked specification list, SPEC_LIST, of the
1301   characters and constructs that comprise the argument string.  If either
1302   of these passes detects an error, this function returns false.  */
1303
1304static bool
1305parse_str (char const *s, struct Spec_list *spec_list)
1306{
1307  struct E_string es;
1308  bool ok = unquote (s, &es) && build_spec_list (&es, spec_list);
1309  es_free (&es);
1310  return ok;
1311}
1312
1313/* Given two specification lists, S1 and S2, and assuming that
1314   S1->length > S2->length, append a single [c*n] element to S2 where c
1315   is the last character in the expansion of S2 and n is the difference
1316   between the two lengths.
1317   Upon successful completion, S2->length is set to S1->length.  The only
1318   way this function can fail to make S2 as long as S1 is when S2 has
1319   zero-length, since in that case, there is no last character to repeat.
1320   So S2->length is required to be at least 1.
1321
1322   Providing this functionality allows the user to do some pretty
1323   non-BSD (and non-portable) things:  For example, the command
1324       tr -cs '[:upper:]0-9' '[:lower:]'
1325   is almost guaranteed to give results that depend on your collating
1326   sequence.  */
1327
1328static void
1329string2_extend (const struct Spec_list *s1, struct Spec_list *s2)
1330{
1331  struct List_element *p;
1332  unsigned char char_to_repeat;
1333  int i;
1334
1335  assert (translating);
1336  assert (s1->length > s2->length);
1337  assert (s2->length > 0);
1338
1339  p = s2->tail;
1340  switch (p->type)
1341    {
1342    case RE_NORMAL_CHAR:
1343      char_to_repeat = p->u.normal_char;
1344      break;
1345    case RE_RANGE:
1346      char_to_repeat = p->u.range.last_char;
1347      break;
1348    case RE_CHAR_CLASS:
1349      for (i = N_CHARS - 1; i >= 0; i--)
1350        if (is_char_class_member (p->u.char_class, i))
1351          break;
1352      assert (i >= 0);
1353      char_to_repeat = i;
1354      break;
1355
1356    case RE_REPEATED_CHAR:
1357      char_to_repeat = p->u.repeated_char.the_repeated_char;
1358      break;
1359
1360    case RE_EQUIV_CLASS:
1361      /* This shouldn't happen, because validate exits with an error
1362         if it finds an equiv class in string2 when translating.  */
1363      abort ();
1364      break;
1365
1366    default:
1367      abort ();
1368      break;
1369    }
1370
1371  append_repeated_char (s2, char_to_repeat, s1->length - s2->length);
1372  s2->length = s1->length;
1373}
1374
1375/* Return true if S is a non-empty list in which exactly one
1376   character (but potentially, many instances of it) appears.
1377   E.g., [X*] or xxxxxxxx.  */
1378
1379static bool
1380homogeneous_spec_list (struct Spec_list *s)
1381{
1382  int b, c;
1383
1384  s->state = BEGIN_STATE;
1385
1386  if ((b = get_next (s, NULL)) == -1)
1387    return false;
1388
1389  while ((c = get_next (s, NULL)) != -1)
1390    if (c != b)
1391      return false;
1392
1393  return true;
1394}
1395
1396/* Die with an error message if S1 and S2 describe strings that
1397   are not valid with the given command line switches.
1398   A side effect of this function is that if a valid [c*] or
1399   [c*0] construct appears in string2, it is converted to [c*n]
1400   with a value for n that makes s2->length == s1->length.  By
1401   the same token, if the --truncate-set1 option is not
1402   given, S2 may be extended.  */
1403
1404static void
1405validate (struct Spec_list *s1, struct Spec_list *s2)
1406{
1407  get_s1_spec_stats (s1);
1408  if (s1->n_indefinite_repeats > 0)
1409    {
1410      error (EXIT_FAILURE, 0,
1411             _("the [c*] repeat construct may not appear in string1"));
1412    }
1413
1414  if (s2)
1415    {
1416      get_s2_spec_stats (s2, s1->length);
1417
1418      if (s2->n_indefinite_repeats > 1)
1419        {
1420          error (EXIT_FAILURE, 0,
1421                 _("only one [c*] repeat construct may appear in string2"));
1422        }
1423
1424      if (translating)
1425        {
1426          if (s2->has_equiv_class)
1427            {
1428              error (EXIT_FAILURE, 0,
1429                     _("[=c=] expressions may not appear in string2 \
1430when translating"));
1431            }
1432
1433          if (s1->length > s2->length)
1434            {
1435              if (!truncate_set1)
1436                {
1437                  /* string2 must be non-empty unless --truncate-set1 is
1438                     given or string1 is empty.  */
1439
1440                  if (s2->length == 0)
1441                    error (EXIT_FAILURE, 0,
1442                     _("when not truncating set1, string2 must be non-empty"));
1443                  string2_extend (s1, s2);
1444                }
1445            }
1446
1447          if (complement && s1->has_char_class
1448              && ! (s2->length == s1->length && homogeneous_spec_list (s2)))
1449            {
1450              error (EXIT_FAILURE, 0,
1451                     _("when translating with complemented character classes,\
1452\nstring2 must map all characters in the domain to one"));
1453            }
1454
1455          if (s2->has_restricted_char_class)
1456            {
1457              error (EXIT_FAILURE, 0,
1458                     _("when translating, the only character classes that may \
1459appear in\nstring2 are `upper' and `lower'"));
1460            }
1461        }
1462      else
1463        /* Not translating.  */
1464        {
1465          if (s2->n_indefinite_repeats > 0)
1466            error (EXIT_FAILURE, 0,
1467                   _("the [c*] construct may appear in string2 only \
1468when translating"));
1469        }
1470    }
1471}
1472
1473/* Read buffers of SIZE bytes via the function READER (if READER is
1474   NULL, read from stdin) until EOF.  When non-NULL, READER is either
1475   read_and_delete or read_and_xlate.  After each buffer is read, it is
1476   processed and written to stdout.  The buffers are processed so that
1477   multiple consecutive occurrences of the same character in the input
1478   stream are replaced by a single occurrence of that character if the
1479   character is in the squeeze set.  */
1480
1481static void
1482squeeze_filter (char *buf, size_t size, size_t (*reader) (char *, size_t))
1483{
1484  /* A value distinct from any character that may have been stored in a
1485     buffer as the result of a block-read in the function squeeze_filter.  */
1486  enum { NOT_A_CHAR = CHAR_MAX + 1 };
1487
1488  int char_to_squeeze = NOT_A_CHAR;
1489  size_t i = 0;
1490  size_t nr = 0;
1491
1492  for (;;)
1493    {
1494      size_t begin;
1495
1496      if (i >= nr)
1497        {
1498          nr = reader (buf, size);
1499          if (nr == 0)
1500            break;
1501          i = 0;
1502        }
1503
1504      begin = i;
1505
1506      if (char_to_squeeze == NOT_A_CHAR)
1507        {
1508          size_t out_len;
1509          /* Here, by being a little tricky, we can get a significant
1510             performance increase in most cases when the input is
1511             reasonably large.  Since tr will modify the input only
1512             if two consecutive (and identical) input characters are
1513             in the squeeze set, we can step by two through the data
1514             when searching for a character in the squeeze set.  This
1515             means there may be a little more work in a few cases and
1516             perhaps twice as much work in the worst cases where most
1517             of the input is removed by squeezing repeats.  But most
1518             uses of this functionality seem to remove less than 20-30%
1519             of the input.  */
1520          for (; i < nr && !in_squeeze_set[to_uchar (buf[i])]; i += 2)
1521            continue;
1522
1523          /* There is a special case when i == nr and we've just
1524             skipped a character (the last one in buf) that is in
1525             the squeeze set.  */
1526          if (i == nr && in_squeeze_set[to_uchar (buf[i - 1])])
1527            --i;
1528
1529          if (i >= nr)
1530            out_len = nr - begin;
1531          else
1532            {
1533              char_to_squeeze = buf[i];
1534              /* We're about to output buf[begin..i].  */
1535              out_len = i - begin + 1;
1536
1537              /* But since we stepped by 2 in the loop above,
1538                 out_len may be one too large.  */
1539              if (i > 0 && buf[i - 1] == char_to_squeeze)
1540                --out_len;
1541
1542              /* Advance i to the index of first character to be
1543                 considered when looking for a char different from
1544                 char_to_squeeze.  */
1545              ++i;
1546            }
1547          if (out_len > 0
1548              && fwrite (&buf[begin], 1, out_len, stdout) != out_len)
1549            error (EXIT_FAILURE, errno, _("write error"));
1550        }
1551
1552      if (char_to_squeeze != NOT_A_CHAR)
1553        {
1554          /* Advance i to index of first char != char_to_squeeze
1555             (or to nr if all the rest of the characters in this
1556             buffer are the same as char_to_squeeze).  */
1557          for (; i < nr && buf[i] == char_to_squeeze; i++)
1558            continue;
1559          if (i < nr)
1560            char_to_squeeze = NOT_A_CHAR;
1561          /* If (i >= nr) we've squeezed the last character in this buffer.
1562             So now we have to read a new buffer and continue comparing
1563             characters against char_to_squeeze.  */
1564        }
1565    }
1566}
1567
1568static size_t
1569plain_read (char *buf, size_t size)
1570{
1571  size_t nr = safe_read (STDIN_FILENO, buf, size);
1572  if (nr == SAFE_READ_ERROR)
1573    error (EXIT_FAILURE, errno, _("read error"));
1574  return nr;
1575}
1576
1577/* Read buffers of SIZE bytes from stdin until one is found that
1578   contains at least one character not in the delete set.  Store
1579   in the array BUF, all characters from that buffer that are not
1580   in the delete set, and return the number of characters saved
1581   or 0 upon EOF.  */
1582
1583static size_t
1584read_and_delete (char *buf, size_t size)
1585{
1586  size_t n_saved;
1587
1588  /* This enclosing do-while loop is to make sure that
1589     we don't return zero (indicating EOF) when we've
1590     just deleted all the characters in a buffer.  */
1591  do
1592    {
1593      size_t i;
1594      size_t nr = plain_read (buf, size);
1595
1596      if (nr == 0)
1597        return 0;
1598
1599      /* This first loop may be a waste of code, but gives much
1600         better performance when no characters are deleted in
1601         the beginning of a buffer.  It just avoids the copying
1602         of buf[i] into buf[n_saved] when it would be a NOP.  */
1603
1604      for (i = 0; i < nr && !in_delete_set[to_uchar (buf[i])]; i++)
1605        continue;
1606      n_saved = i;
1607
1608      for (++i; i < nr; i++)
1609        if (!in_delete_set[to_uchar (buf[i])])
1610          buf[n_saved++] = buf[i];
1611    }
1612  while (n_saved == 0);
1613
1614  return n_saved;
1615}
1616
1617/* Read at most SIZE bytes from stdin into the array BUF.  Then
1618   perform the in-place and one-to-one mapping specified by the global
1619   array `xlate'.  Return the number of characters read, or 0 upon EOF.  */
1620
1621static size_t
1622read_and_xlate (char *buf, size_t size)
1623{
1624  size_t bytes_read = plain_read (buf, size);
1625  size_t i;
1626
1627  for (i = 0; i < bytes_read; i++)
1628    buf[i] = xlate[to_uchar (buf[i])];
1629
1630  return bytes_read;
1631}
1632
1633/* Initialize a boolean membership set, IN_SET, with the character
1634   values obtained by traversing the linked list of constructs S
1635   using the function `get_next'.  IN_SET is expected to have been
1636   initialized to all zeros by the caller.  If COMPLEMENT_THIS_SET
1637   is true the resulting set is complemented.  */
1638
1639static void
1640set_initialize (struct Spec_list *s, bool complement_this_set, bool *in_set)
1641{
1642  int c;
1643  size_t i;
1644
1645  s->state = BEGIN_STATE;
1646  while ((c = get_next (s, NULL)) != -1)
1647    in_set[c] = true;
1648  if (complement_this_set)
1649    for (i = 0; i < N_CHARS; i++)
1650      in_set[i] = (!in_set[i]);
1651}
1652
1653int
1654main (int argc, char **argv)
1655{
1656  int c;
1657  int non_option_args;
1658  int min_operands;
1659  int max_operands;
1660  struct Spec_list buf1, buf2;
1661  struct Spec_list *s1 = &buf1;
1662  struct Spec_list *s2 = &buf2;
1663
1664  initialize_main (&argc, &argv);
1665  set_program_name (argv[0]);
1666  setlocale (LC_ALL, "");
1667  bindtextdomain (PACKAGE, LOCALEDIR);
1668  textdomain (PACKAGE);
1669
1670  atexit (close_stdout);
1671
1672  while ((c = getopt_long (argc, argv, "+cCdst", long_options, NULL)) != -1)
1673    {
1674      switch (c)
1675        {
1676        case 'c':
1677        case 'C':
1678          complement = true;
1679          break;
1680
1681        case 'd':
1682          delete = true;
1683          break;
1684
1685        case 's':
1686          squeeze_repeats = true;
1687          break;
1688
1689        case 't':
1690          truncate_set1 = true;
1691          break;
1692
1693        case_GETOPT_HELP_CHAR;
1694
1695        case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
1696
1697        default:
1698          usage (EXIT_FAILURE);
1699          break;
1700        }
1701    }
1702
1703  non_option_args = argc - optind;
1704  translating = (non_option_args == 2 && !delete);
1705  min_operands = 1 + (delete == squeeze_repeats);
1706  max_operands = 1 + (delete <= squeeze_repeats);
1707
1708  if (non_option_args < min_operands)
1709    {
1710      if (non_option_args == 0)
1711        error (0, 0, _("missing operand"));
1712      else
1713        {
1714          error (0, 0, _("missing operand after %s"), quote (argv[argc - 1]));
1715          fprintf (stderr, "%s\n",
1716                   _(squeeze_repeats
1717                     ? N_("Two strings must be given when "
1718                          "both deleting and squeezing repeats.")
1719                     : N_("Two strings must be given when translating.")));
1720        }
1721      usage (EXIT_FAILURE);
1722    }
1723
1724  if (max_operands < non_option_args)
1725    {
1726      error (0, 0, _("extra operand %s"), quote (argv[optind + max_operands]));
1727      if (non_option_args == 2)
1728        fprintf (stderr, "%s\n",
1729                 _("Only one string may be given when "
1730                   "deleting without squeezing repeats."));
1731      usage (EXIT_FAILURE);
1732    }
1733
1734  spec_init (s1);
1735  if (!parse_str (argv[optind], s1))
1736    exit (EXIT_FAILURE);
1737
1738  if (non_option_args == 2)
1739    {
1740      spec_init (s2);
1741      if (!parse_str (argv[optind + 1], s2))
1742        exit (EXIT_FAILURE);
1743    }
1744  else
1745    s2 = NULL;
1746
1747  validate (s1, s2);
1748
1749  /* Use binary I/O, since `tr' is sometimes used to transliterate
1750     non-printable characters, or characters which are stripped away
1751     by text-mode reads (like CR and ^Z).  */
1752  if (O_BINARY && ! isatty (STDIN_FILENO))
1753    xfreopen (NULL, "rb", stdin);
1754  if (O_BINARY && ! isatty (STDOUT_FILENO))
1755    xfreopen (NULL, "wb", stdout);
1756
1757  if (squeeze_repeats && non_option_args == 1)
1758    {
1759      set_initialize (s1, complement, in_squeeze_set);
1760      squeeze_filter (io_buf, sizeof io_buf, plain_read);
1761    }
1762  else if (delete && non_option_args == 1)
1763    {
1764      set_initialize (s1, complement, in_delete_set);
1765
1766      for (;;)
1767        {
1768          size_t nr = read_and_delete (io_buf, sizeof io_buf);
1769          if (nr == 0)
1770            break;
1771          if (fwrite (io_buf, 1, nr, stdout) != nr)
1772            error (EXIT_FAILURE, errno, _("write error"));
1773        }
1774    }
1775  else if (squeeze_repeats && delete && non_option_args == 2)
1776    {
1777      set_initialize (s1, complement, in_delete_set);
1778      set_initialize (s2, false, in_squeeze_set);
1779      squeeze_filter (io_buf, sizeof io_buf, read_and_delete);
1780    }
1781  else if (translating)
1782    {
1783      if (complement)
1784        {
1785          int i;
1786          bool *in_s1 = in_delete_set;
1787
1788          set_initialize (s1, false, in_s1);
1789          s2->state = BEGIN_STATE;
1790          for (i = 0; i < N_CHARS; i++)
1791            xlate[i] = i;
1792          for (i = 0; i < N_CHARS; i++)
1793            {
1794              if (!in_s1[i])
1795                {
1796                  int ch = get_next (s2, NULL);
1797                  assert (ch != -1 || truncate_set1);
1798                  if (ch == -1)
1799                    {
1800                      /* This will happen when tr is invoked like e.g.
1801                         tr -cs A-Za-z0-9 '\012'.  */
1802                      break;
1803                    }
1804                  xlate[i] = ch;
1805                }
1806            }
1807        }
1808      else
1809        {
1810          int c1, c2;
1811          int i;
1812          bool case_convert = false;
1813          enum Upper_Lower_class class_s1;
1814          enum Upper_Lower_class class_s2;
1815
1816          for (i = 0; i < N_CHARS; i++)
1817            xlate[i] = i;
1818          s1->state = BEGIN_STATE;
1819          s2->state = BEGIN_STATE;
1820          for (;;)
1821            {
1822              /* When the previous pair identified case-converting classes,
1823                 advance S1 and S2 so that each points to the following
1824                 construct.  */
1825              if (case_convert)
1826                {
1827                  skip_construct (s1);
1828                  skip_construct (s2);
1829                  case_convert = false;
1830                }
1831
1832              c1 = get_next (s1, &class_s1);
1833              c2 = get_next (s2, &class_s2);
1834
1835              /* When translating and there is an [:upper:] or [:lower:]
1836                 class in SET2, then there must be a corresponding [:lower:]
1837                 or [:upper:] class in SET1.  */
1838              if (class_s1 == UL_NONE
1839                  && (class_s2 == UL_LOWER || class_s2 == UL_UPPER))
1840                error (EXIT_FAILURE, 0,
1841                       _("misaligned [:upper:] and/or [:lower:] construct"));
1842
1843              if (class_s1 == UL_LOWER && class_s2 == UL_UPPER)
1844                {
1845                  case_convert = true;
1846                  for (i = 0; i < N_CHARS; i++)
1847                    if (islower (i))
1848                      xlate[i] = toupper (i);
1849                }
1850              else if (class_s1 == UL_UPPER && class_s2 == UL_LOWER)
1851                {
1852                  case_convert = true;
1853                  for (i = 0; i < N_CHARS; i++)
1854                    if (isupper (i))
1855                      xlate[i] = tolower (i);
1856                }
1857              else if ((class_s1 == UL_LOWER && class_s2 == UL_LOWER)
1858                       || (class_s1 == UL_UPPER && class_s2 == UL_UPPER))
1859                {
1860                  /* POSIX says the behavior of `tr "[:upper:]" "[:upper:]"'
1861                     is undefined.  Treat it as a no-op.  */
1862                }
1863              else
1864                {
1865                  /* The following should have been checked by validate...  */
1866                  if (c1 == -1 || c2 == -1)
1867                    break;
1868                  xlate[c1] = c2;
1869                }
1870            }
1871          assert (c1 == -1 || truncate_set1);
1872        }
1873      if (squeeze_repeats)
1874        {
1875          set_initialize (s2, false, in_squeeze_set);
1876          squeeze_filter (io_buf, sizeof io_buf, read_and_xlate);
1877        }
1878      else
1879        {
1880          for (;;)
1881            {
1882              size_t bytes_read = read_and_xlate (io_buf, sizeof io_buf);
1883              if (bytes_read == 0)
1884                break;
1885              if (fwrite (io_buf, 1, bytes_read, stdout) != bytes_read)
1886                error (EXIT_FAILURE, errno, _("write error"));
1887            }
1888        }
1889    }
1890
1891  if (close (STDIN_FILENO) != 0)
1892    error (EXIT_FAILURE, errno, _("standard input"));
1893
1894  exit (EXIT_SUCCESS);
1895}
1896