1/* tr -- a filter to translate characters 2 Copyright (C) 1991, 1995-2010 Free Software Foundation, Inc. 3 4 This program is free software: you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation, either version 3 of the License, or 7 (at your option) any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 16 17/* Written by Jim Meyering */ 18 19#include <config.h> 20 21#include <stdio.h> 22#include <assert.h> 23#include <sys/types.h> 24#include <getopt.h> 25 26#include "system.h" 27#include "error.h" 28#include "quote.h" 29#include "safe-read.h" 30#include "xfreopen.h" 31#include "xstrtol.h" 32 33/* The official name of this program (e.g., no `g' prefix). */ 34#define PROGRAM_NAME "tr" 35 36#define AUTHORS proper_name ("Jim Meyering") 37 38enum { N_CHARS = UCHAR_MAX + 1 }; 39 40/* An unsigned integer type big enough to hold a repeat count or an 41 unsigned character. POSIX requires support for repeat counts as 42 high as 2**31 - 1. Since repeat counts might need to expand to 43 match the length of an argument string, we need at least size_t to 44 avoid arbitrary internal limits. It doesn't cost much to use 45 uintmax_t, though. */ 46typedef uintmax_t count; 47 48/* The value for Spec_list->state that indicates to 49 get_next that it should initialize the tail pointer. 50 Its value should be as large as possible to avoid conflict 51 a valid value for the state field -- and that may be as 52 large as any valid repeat_count. */ 53#define BEGIN_STATE (UINTMAX_MAX - 1) 54 55/* The value for Spec_list->state that indicates to 56 get_next that the element pointed to by Spec_list->tail is 57 being considered for the first time on this pass through the 58 list -- it indicates that get_next should make any necessary 59 initializations. */ 60#define NEW_ELEMENT (BEGIN_STATE + 1) 61 62/* The maximum possible repeat count. Due to how the states are 63 implemented, it can be as much as BEGIN_STATE. */ 64#define REPEAT_COUNT_MAXIMUM BEGIN_STATE 65 66/* The following (but not CC_NO_CLASS) are indices into the array of 67 valid character class strings. */ 68enum Char_class 69 { 70 CC_ALNUM = 0, CC_ALPHA = 1, CC_BLANK = 2, CC_CNTRL = 3, 71 CC_DIGIT = 4, CC_GRAPH = 5, CC_LOWER = 6, CC_PRINT = 7, 72 CC_PUNCT = 8, CC_SPACE = 9, CC_UPPER = 10, CC_XDIGIT = 11, 73 CC_NO_CLASS = 9999 74 }; 75 76/* Character class to which a character (returned by get_next) belonged; 77 but it is set only if the construct from which the character was obtained 78 was one of the character classes [:upper:] or [:lower:]. The value 79 is used only when translating and then, only to make sure that upper 80 and lower class constructs have the same relative positions in string1 81 and string2. */ 82enum Upper_Lower_class 83 { 84 UL_LOWER, 85 UL_UPPER, 86 UL_NONE 87 }; 88 89/* The type of a List_element. See build_spec_list for more details. */ 90enum Range_element_type 91 { 92 RE_NORMAL_CHAR, 93 RE_RANGE, 94 RE_CHAR_CLASS, 95 RE_EQUIV_CLASS, 96 RE_REPEATED_CHAR 97 }; 98 99/* One construct in one of tr's argument strings. 100 For example, consider the POSIX version of the classic tr command: 101 tr -cs 'a-zA-Z_' '[\n*]' 102 String1 has 3 constructs, two of which are ranges (a-z and A-Z), 103 and a single normal character, `_'. String2 has one construct. */ 104struct List_element 105 { 106 enum Range_element_type type; 107 struct List_element *next; 108 union 109 { 110 unsigned char normal_char; 111 struct /* unnamed */ 112 { 113 unsigned char first_char; 114 unsigned char last_char; 115 } 116 range; 117 enum Char_class char_class; 118 unsigned char equiv_code; 119 struct /* unnamed */ 120 { 121 unsigned char the_repeated_char; 122 count repeat_count; 123 } 124 repeated_char; 125 } 126 u; 127 }; 128 129/* Each of tr's argument strings is parsed into a form that is easier 130 to work with: a linked list of constructs (struct List_element). 131 Each Spec_list structure also encapsulates various attributes of 132 the corresponding argument string. The attributes are used mainly 133 to verify that the strings are valid in the context of any options 134 specified (like -s, -d, or -c). The main exception is the member 135 `tail', which is first used to construct the list. After construction, 136 it is used by get_next to save its state when traversing the list. 137 The member `state' serves a similar function. */ 138struct Spec_list 139 { 140 /* Points to the head of the list of range elements. 141 The first struct is a dummy; its members are never used. */ 142 struct List_element *head; 143 144 /* When appending, points to the last element. When traversing via 145 get_next(), points to the element to process next. Setting 146 Spec_list.state to the value BEGIN_STATE before calling get_next 147 signals get_next to initialize tail to point to head->next. */ 148 struct List_element *tail; 149 150 /* Used to save state between calls to get_next. */ 151 count state; 152 153 /* Length, in the sense that length ('a-z[:digit:]123abc') 154 is 42 ( = 26 + 10 + 6). */ 155 count length; 156 157 /* The number of [c*] and [c*0] constructs that appear in this spec. */ 158 size_t n_indefinite_repeats; 159 160 /* If n_indefinite_repeats is nonzero, this points to the List_element 161 corresponding to the last [c*] or [c*0] construct encountered in 162 this spec. Otherwise it is undefined. */ 163 struct List_element *indefinite_repeat_element; 164 165 /* True if this spec contains at least one equivalence 166 class construct e.g. [=c=]. */ 167 bool has_equiv_class; 168 169 /* True if this spec contains at least one character class 170 construct. E.g. [:digit:]. */ 171 bool has_char_class; 172 173 /* True if this spec contains at least one of the character class 174 constructs (all but upper and lower) that aren't allowed in s2. */ 175 bool has_restricted_char_class; 176 }; 177 178/* A representation for escaped string1 or string2. As a string is parsed, 179 any backslash-escaped characters (other than octal or \a, \b, \f, \n, 180 etc.) are marked as such in this structure by setting the corresponding 181 entry in the ESCAPED vector. */ 182struct E_string 183{ 184 char *s; 185 bool *escaped; 186 size_t len; 187}; 188 189/* Return nonzero if the Ith character of escaped string ES matches C 190 and is not escaped itself. */ 191static inline bool 192es_match (struct E_string const *es, size_t i, char c) 193{ 194 return es->s[i] == c && !es->escaped[i]; 195} 196 197/* When true, each sequence in the input of a repeated character 198 (call it c) is replaced (in the output) by a single occurrence of c 199 for every c in the squeeze set. */ 200static bool squeeze_repeats = false; 201 202/* When true, removes characters in the delete set from input. */ 203static bool delete = false; 204 205/* Use the complement of set1 in place of set1. */ 206static bool complement = false; 207 208/* When tr is performing translation and string1 is longer than string2, 209 POSIX says that the result is unspecified. That gives the implementor 210 of a POSIX conforming version of tr two reasonable choices for the 211 semantics of this case. 212 213 * The BSD tr pads string2 to the length of string1 by 214 repeating the last character in string2. 215 216 * System V tr ignores characters in string1 that have no 217 corresponding character in string2. That is, string1 is effectively 218 truncated to the length of string2. 219 220 When nonzero, this flag causes GNU tr to imitate the behavior 221 of System V tr when translating with string1 longer than string2. 222 The default is to emulate BSD tr. This flag is ignored in modes where 223 no translation is performed. Emulating the System V tr 224 in this exceptional case causes the relatively common BSD idiom: 225 226 tr -cs A-Za-z0-9 '\012' 227 228 to break (it would convert only zero bytes, rather than all 229 non-alphanumerics, to newlines). 230 231 WARNING: This switch does not provide general BSD or System V 232 compatibility. For example, it doesn't disable the interpretation 233 of the POSIX constructs [:alpha:], [=c=], and [c*10], so if by 234 some unfortunate coincidence you use such constructs in scripts 235 expecting to use some other version of tr, the scripts will break. */ 236static bool truncate_set1 = false; 237 238/* An alias for (!delete && non_option_args == 2). 239 It is set in main and used there and in validate(). */ 240static bool translating; 241 242static char io_buf[BUFSIZ]; 243 244static char const *const char_class_name[] = 245{ 246 "alnum", "alpha", "blank", "cntrl", "digit", "graph", 247 "lower", "print", "punct", "space", "upper", "xdigit" 248}; 249 250/* Array of boolean values. A character `c' is a member of the 251 squeeze set if and only if in_squeeze_set[c] is true. The squeeze 252 set is defined by the last (possibly, the only) string argument 253 on the command line when the squeeze option is given. */ 254static bool in_squeeze_set[N_CHARS]; 255 256/* Array of boolean values. A character `c' is a member of the 257 delete set if and only if in_delete_set[c] is true. The delete 258 set is defined by the first (or only) string argument on the 259 command line when the delete option is given. */ 260static bool in_delete_set[N_CHARS]; 261 262/* Array of character values defining the translation (if any) that 263 tr is to perform. Translation is performed only when there are 264 two specification strings and the delete switch is not given. */ 265static char xlate[N_CHARS]; 266 267static struct option const long_options[] = 268{ 269 {"complement", no_argument, NULL, 'c'}, 270 {"delete", no_argument, NULL, 'd'}, 271 {"squeeze-repeats", no_argument, NULL, 's'}, 272 {"truncate-set1", no_argument, NULL, 't'}, 273 {GETOPT_HELP_OPTION_DECL}, 274 {GETOPT_VERSION_OPTION_DECL}, 275 {NULL, 0, NULL, 0} 276}; 277 278void 279usage (int status) 280{ 281 if (status != EXIT_SUCCESS) 282 fprintf (stderr, _("Try `%s --help' for more information.\n"), 283 program_name); 284 else 285 { 286 printf (_("\ 287Usage: %s [OPTION]... SET1 [SET2]\n\ 288"), 289 program_name); 290 fputs (_("\ 291Translate, squeeze, and/or delete characters from standard input,\n\ 292writing to standard output.\n\ 293\n\ 294 -c, -C, --complement use the complement of SET1\n\ 295 -d, --delete delete characters in SET1, do not translate\n\ 296 -s, --squeeze-repeats replace each input sequence of a repeated character\n\ 297 that is listed in SET1 with a single occurrence\n\ 298 of that character\n\ 299 -t, --truncate-set1 first truncate SET1 to length of SET2\n\ 300"), stdout); 301 fputs (HELP_OPTION_DESCRIPTION, stdout); 302 fputs (VERSION_OPTION_DESCRIPTION, stdout); 303 fputs (_("\ 304\n\ 305SETs are specified as strings of characters. Most represent themselves.\n\ 306Interpreted sequences are:\n\ 307\n\ 308 \\NNN character with octal value NNN (1 to 3 octal digits)\n\ 309 \\\\ backslash\n\ 310 \\a audible BEL\n\ 311 \\b backspace\n\ 312 \\f form feed\n\ 313 \\n new line\n\ 314 \\r return\n\ 315 \\t horizontal tab\n\ 316"), stdout); 317 fputs (_("\ 318 \\v vertical tab\n\ 319 CHAR1-CHAR2 all characters from CHAR1 to CHAR2 in ascending order\n\ 320 [CHAR*] in SET2, copies of CHAR until length of SET1\n\ 321 [CHAR*REPEAT] REPEAT copies of CHAR, REPEAT octal if starting with 0\n\ 322 [:alnum:] all letters and digits\n\ 323 [:alpha:] all letters\n\ 324 [:blank:] all horizontal whitespace\n\ 325 [:cntrl:] all control characters\n\ 326 [:digit:] all digits\n\ 327"), stdout); 328 fputs (_("\ 329 [:graph:] all printable characters, not including space\n\ 330 [:lower:] all lower case letters\n\ 331 [:print:] all printable characters, including space\n\ 332 [:punct:] all punctuation characters\n\ 333 [:space:] all horizontal or vertical whitespace\n\ 334 [:upper:] all upper case letters\n\ 335 [:xdigit:] all hexadecimal digits\n\ 336 [=CHAR=] all characters which are equivalent to CHAR\n\ 337"), stdout); 338 fputs (_("\ 339\n\ 340Translation occurs if -d is not given and both SET1 and SET2 appear.\n\ 341-t may be used only when translating. SET2 is extended to length of\n\ 342SET1 by repeating its last character as necessary. Excess characters\n\ 343of SET2 are ignored. Only [:lower:] and [:upper:] are guaranteed to\n\ 344expand in ascending order; used in SET2 while translating, they may\n\ 345only be used in pairs to specify case conversion. -s uses SET1 if not\n\ 346translating nor deleting; else squeezing uses SET2 and occurs after\n\ 347translation or deletion.\n\ 348"), stdout); 349 emit_ancillary_info (); 350 } 351 exit (status); 352} 353 354/* Return nonzero if the character C is a member of the 355 equivalence class containing the character EQUIV_CLASS. */ 356 357static inline bool 358is_equiv_class_member (unsigned char equiv_class, unsigned char c) 359{ 360 return (equiv_class == c); 361} 362 363/* Return true if the character C is a member of the 364 character class CHAR_CLASS. */ 365 366static bool 367is_char_class_member (enum Char_class char_class, unsigned char c) 368{ 369 int result; 370 371 switch (char_class) 372 { 373 case CC_ALNUM: 374 result = isalnum (c); 375 break; 376 case CC_ALPHA: 377 result = isalpha (c); 378 break; 379 case CC_BLANK: 380 result = isblank (c); 381 break; 382 case CC_CNTRL: 383 result = iscntrl (c); 384 break; 385 case CC_DIGIT: 386 result = isdigit (c); 387 break; 388 case CC_GRAPH: 389 result = isgraph (c); 390 break; 391 case CC_LOWER: 392 result = islower (c); 393 break; 394 case CC_PRINT: 395 result = isprint (c); 396 break; 397 case CC_PUNCT: 398 result = ispunct (c); 399 break; 400 case CC_SPACE: 401 result = isspace (c); 402 break; 403 case CC_UPPER: 404 result = isupper (c); 405 break; 406 case CC_XDIGIT: 407 result = isxdigit (c); 408 break; 409 default: 410 abort (); 411 break; 412 } 413 414 return !! result; 415} 416 417static void 418es_free (struct E_string *es) 419{ 420 free (es->s); 421 free (es->escaped); 422} 423 424/* Perform the first pass over each range-spec argument S, converting all 425 \c and \ddd escapes to their one-byte representations. If an invalid 426 quote sequence is found print an error message and return false; 427 Otherwise set *ES to the resulting string and return true. 428 The resulting array of characters may contain zero-bytes; 429 however, on input, S is assumed to be null-terminated, and hence 430 cannot contain actual (non-escaped) zero bytes. */ 431 432static bool 433unquote (char const *s, struct E_string *es) 434{ 435 size_t i, j; 436 size_t len = strlen (s); 437 438 es->s = xmalloc (len); 439 es->escaped = xcalloc (len, sizeof es->escaped[0]); 440 441 j = 0; 442 for (i = 0; s[i]; i++) 443 { 444 unsigned char c; 445 int oct_digit; 446 447 switch (s[i]) 448 { 449 case '\\': 450 es->escaped[j] = true; 451 switch (s[i + 1]) 452 { 453 case '\\': 454 c = '\\'; 455 break; 456 case 'a': 457 c = '\a'; 458 break; 459 case 'b': 460 c = '\b'; 461 break; 462 case 'f': 463 c = '\f'; 464 break; 465 case 'n': 466 c = '\n'; 467 break; 468 case 'r': 469 c = '\r'; 470 break; 471 case 't': 472 c = '\t'; 473 break; 474 case 'v': 475 c = '\v'; 476 break; 477 case '0': 478 case '1': 479 case '2': 480 case '3': 481 case '4': 482 case '5': 483 case '6': 484 case '7': 485 c = s[i + 1] - '0'; 486 oct_digit = s[i + 2] - '0'; 487 if (0 <= oct_digit && oct_digit <= 7) 488 { 489 c = 8 * c + oct_digit; 490 ++i; 491 oct_digit = s[i + 2] - '0'; 492 if (0 <= oct_digit && oct_digit <= 7) 493 { 494 if (8 * c + oct_digit < N_CHARS) 495 { 496 c = 8 * c + oct_digit; 497 ++i; 498 } 499 else 500 { 501 /* A 3-digit octal number larger than \377 won't 502 fit in 8 bits. So we stop when adding the 503 next digit would put us over the limit and 504 give a warning about the ambiguity. POSIX 505 isn't clear on this, and we interpret this 506 lack of clarity as meaning the resulting behavior 507 is undefined, which means we're allowed to issue 508 a warning. */ 509 error (0, 0, _("warning: the ambiguous octal escape \ 510\\%c%c%c is being\n\tinterpreted as the 2-byte sequence \\0%c%c, %c"), 511 s[i], s[i + 1], s[i + 2], 512 s[i], s[i + 1], s[i + 2]); 513 } 514 } 515 } 516 break; 517 case '\0': 518 error (0, 0, _("warning: an unescaped backslash " 519 "at end of string is not portable")); 520 /* POSIX is not clear about this. */ 521 es->escaped[j] = false; 522 i--; 523 c = '\\'; 524 break; 525 default: 526 c = s[i + 1]; 527 break; 528 } 529 ++i; 530 es->s[j++] = c; 531 break; 532 default: 533 es->s[j++] = s[i]; 534 break; 535 } 536 } 537 es->len = j; 538 return true; 539} 540 541/* If CLASS_STR is a valid character class string, return its index 542 in the global char_class_name array. Otherwise, return CC_NO_CLASS. */ 543 544static enum Char_class 545look_up_char_class (char const *class_str, size_t len) 546{ 547 enum Char_class i; 548 549 for (i = 0; i < ARRAY_CARDINALITY (char_class_name); i++) 550 if (strncmp (class_str, char_class_name[i], len) == 0 551 && strlen (char_class_name[i]) == len) 552 return i; 553 return CC_NO_CLASS; 554} 555 556/* Return a newly allocated string with a printable version of C. 557 This function is used solely for formatting error messages. */ 558 559static char * 560make_printable_char (unsigned char c) 561{ 562 char *buf = xmalloc (5); 563 564 if (isprint (c)) 565 { 566 buf[0] = c; 567 buf[1] = '\0'; 568 } 569 else 570 { 571 sprintf (buf, "\\%03o", c); 572 } 573 return buf; 574} 575 576/* Return a newly allocated copy of S which is suitable for printing. 577 LEN is the number of characters in S. Most non-printing 578 (isprint) characters are represented by a backslash followed by 579 3 octal digits. However, the characters represented by \c escapes 580 where c is one of [abfnrtv] are represented by their 2-character \c 581 sequences. This function is used solely for printing error messages. */ 582 583static char * 584make_printable_str (char const *s, size_t len) 585{ 586 /* Worst case is that every character expands to a backslash 587 followed by a 3-character octal escape sequence. */ 588 char *printable_buf = xnmalloc (len + 1, 4); 589 char *p = printable_buf; 590 size_t i; 591 592 for (i = 0; i < len; i++) 593 { 594 char buf[5]; 595 char const *tmp = NULL; 596 unsigned char c = s[i]; 597 598 switch (c) 599 { 600 case '\\': 601 tmp = "\\"; 602 break; 603 case '\a': 604 tmp = "\\a"; 605 break; 606 case '\b': 607 tmp = "\\b"; 608 break; 609 case '\f': 610 tmp = "\\f"; 611 break; 612 case '\n': 613 tmp = "\\n"; 614 break; 615 case '\r': 616 tmp = "\\r"; 617 break; 618 case '\t': 619 tmp = "\\t"; 620 break; 621 case '\v': 622 tmp = "\\v"; 623 break; 624 default: 625 if (isprint (c)) 626 { 627 buf[0] = c; 628 buf[1] = '\0'; 629 } 630 else 631 sprintf (buf, "\\%03o", c); 632 tmp = buf; 633 break; 634 } 635 p = stpcpy (p, tmp); 636 } 637 return printable_buf; 638} 639 640/* Append a newly allocated structure representing a 641 character C to the specification list LIST. */ 642 643static void 644append_normal_char (struct Spec_list *list, unsigned char c) 645{ 646 struct List_element *new; 647 648 new = xmalloc (sizeof *new); 649 new->next = NULL; 650 new->type = RE_NORMAL_CHAR; 651 new->u.normal_char = c; 652 assert (list->tail); 653 list->tail->next = new; 654 list->tail = new; 655} 656 657/* Append a newly allocated structure representing the range 658 of characters from FIRST to LAST to the specification list LIST. 659 Return false if LAST precedes FIRST in the collating sequence, 660 true otherwise. This means that '[c-c]' is acceptable. */ 661 662static bool 663append_range (struct Spec_list *list, unsigned char first, unsigned char last) 664{ 665 struct List_element *new; 666 667 if (last < first) 668 { 669 char *tmp1 = make_printable_char (first); 670 char *tmp2 = make_printable_char (last); 671 672 error (0, 0, 673 _("range-endpoints of `%s-%s' are in reverse collating sequence order"), 674 tmp1, tmp2); 675 free (tmp1); 676 free (tmp2); 677 return false; 678 } 679 new = xmalloc (sizeof *new); 680 new->next = NULL; 681 new->type = RE_RANGE; 682 new->u.range.first_char = first; 683 new->u.range.last_char = last; 684 assert (list->tail); 685 list->tail->next = new; 686 list->tail = new; 687 return true; 688} 689 690/* If CHAR_CLASS_STR is a valid character class string, append a 691 newly allocated structure representing that character class to the end 692 of the specification list LIST and return true. If CHAR_CLASS_STR is not 693 a valid string return false. */ 694 695static bool 696append_char_class (struct Spec_list *list, 697 char const *char_class_str, size_t len) 698{ 699 enum Char_class char_class; 700 struct List_element *new; 701 702 char_class = look_up_char_class (char_class_str, len); 703 if (char_class == CC_NO_CLASS) 704 return false; 705 new = xmalloc (sizeof *new); 706 new->next = NULL; 707 new->type = RE_CHAR_CLASS; 708 new->u.char_class = char_class; 709 assert (list->tail); 710 list->tail->next = new; 711 list->tail = new; 712 return true; 713} 714 715/* Append a newly allocated structure representing a [c*n] 716 repeated character construct to the specification list LIST. 717 THE_CHAR is the single character to be repeated, and REPEAT_COUNT 718 is a non-negative repeat count. */ 719 720static void 721append_repeated_char (struct Spec_list *list, unsigned char the_char, 722 count repeat_count) 723{ 724 struct List_element *new; 725 726 new = xmalloc (sizeof *new); 727 new->next = NULL; 728 new->type = RE_REPEATED_CHAR; 729 new->u.repeated_char.the_repeated_char = the_char; 730 new->u.repeated_char.repeat_count = repeat_count; 731 assert (list->tail); 732 list->tail->next = new; 733 list->tail = new; 734} 735 736/* Given a string, EQUIV_CLASS_STR, from a [=str=] context and 737 the length of that string, LEN, if LEN is exactly one, append 738 a newly allocated structure representing the specified 739 equivalence class to the specification list, LIST and return true. 740 If LEN is not 1, return false. */ 741 742static bool 743append_equiv_class (struct Spec_list *list, 744 char const *equiv_class_str, size_t len) 745{ 746 struct List_element *new; 747 748 if (len != 1) 749 return false; 750 new = xmalloc (sizeof *new); 751 new->next = NULL; 752 new->type = RE_EQUIV_CLASS; 753 new->u.equiv_code = *equiv_class_str; 754 assert (list->tail); 755 list->tail->next = new; 756 list->tail = new; 757 return true; 758} 759 760/* Search forward starting at START_IDX for the 2-char sequence 761 (PRE_BRACKET_CHAR,']') in the string P of length P_LEN. If such 762 a sequence is found, set *RESULT_IDX to the index of the first 763 character and return true. Otherwise return false. P may contain 764 zero bytes. */ 765 766static bool 767find_closing_delim (const struct E_string *es, size_t start_idx, 768 char pre_bracket_char, size_t *result_idx) 769{ 770 size_t i; 771 772 for (i = start_idx; i < es->len - 1; i++) 773 if (es->s[i] == pre_bracket_char && es->s[i + 1] == ']' 774 && !es->escaped[i] && !es->escaped[i + 1]) 775 { 776 *result_idx = i; 777 return true; 778 } 779 return false; 780} 781 782/* Parse the bracketed repeat-char syntax. If the P_LEN characters 783 beginning with P[ START_IDX ] comprise a valid [c*n] construct, 784 then set *CHAR_TO_REPEAT, *REPEAT_COUNT, and *CLOSING_BRACKET_IDX 785 and return zero. If the second character following 786 the opening bracket is not `*' or if no closing bracket can be 787 found, return -1. If a closing bracket is found and the 788 second char is `*', but the string between the `*' and `]' isn't 789 empty, an octal number, or a decimal number, print an error message 790 and return -2. */ 791 792static int 793find_bracketed_repeat (const struct E_string *es, size_t start_idx, 794 unsigned char *char_to_repeat, count *repeat_count, 795 size_t *closing_bracket_idx) 796{ 797 size_t i; 798 799 assert (start_idx + 1 < es->len); 800 if (!es_match (es, start_idx + 1, '*')) 801 return -1; 802 803 for (i = start_idx + 2; i < es->len && !es->escaped[i]; i++) 804 { 805 if (es->s[i] == ']') 806 { 807 size_t digit_str_len = i - start_idx - 2; 808 809 *char_to_repeat = es->s[start_idx]; 810 if (digit_str_len == 0) 811 { 812 /* We've matched [c*] -- no explicit repeat count. */ 813 *repeat_count = 0; 814 } 815 else 816 { 817 /* Here, we have found [c*s] where s should be a string 818 of octal (if it starts with `0') or decimal digits. */ 819 char const *digit_str = &es->s[start_idx + 2]; 820 char *d_end; 821 if ((xstrtoumax (digit_str, &d_end, *digit_str == '0' ? 8 : 10, 822 repeat_count, NULL) 823 != LONGINT_OK) 824 || REPEAT_COUNT_MAXIMUM < *repeat_count 825 || digit_str + digit_str_len != d_end) 826 { 827 char *tmp = make_printable_str (digit_str, digit_str_len); 828 error (0, 0, 829 _("invalid repeat count %s in [c*n] construct"), 830 quote (tmp)); 831 free (tmp); 832 return -2; 833 } 834 } 835 *closing_bracket_idx = i; 836 return 0; 837 } 838 } 839 return -1; /* No bracket found. */ 840} 841 842/* Return true if the string at ES->s[IDX] matches the regular 843 expression `\*[0-9]*\]', false otherwise. The string does not 844 match if any of its characters are escaped. */ 845 846static bool 847star_digits_closebracket (const struct E_string *es, size_t idx) 848{ 849 size_t i; 850 851 if (!es_match (es, idx, '*')) 852 return false; 853 854 for (i = idx + 1; i < es->len; i++) 855 if (!ISDIGIT (to_uchar (es->s[i])) || es->escaped[i]) 856 return es_match (es, i, ']'); 857 return false; 858} 859 860/* Convert string UNESCAPED_STRING (which has been preprocessed to 861 convert backslash-escape sequences) of length LEN characters into 862 a linked list of the following 5 types of constructs: 863 - [:str:] Character class where `str' is one of the 12 valid strings. 864 - [=c=] Equivalence class where `c' is any single character. 865 - [c*n] Repeat the single character `c' `n' times. n may be omitted. 866 However, if `n' is present, it must be a non-negative octal or 867 decimal integer. 868 - r-s Range of characters from `r' to `s'. The second endpoint must 869 not precede the first in the current collating sequence. 870 - c Any other character is interpreted as itself. */ 871 872static bool 873build_spec_list (const struct E_string *es, struct Spec_list *result) 874{ 875 char const *p; 876 size_t i; 877 878 p = es->s; 879 880 /* The main for-loop below recognizes the 4 multi-character constructs. 881 A character that matches (in its context) none of the multi-character 882 constructs is classified as `normal'. Since all multi-character 883 constructs have at least 3 characters, any strings of length 2 or 884 less are composed solely of normal characters. Hence, the index of 885 the outer for-loop runs only as far as LEN-2. */ 886 887 for (i = 0; i + 2 < es->len; /* empty */) 888 { 889 if (es_match (es, i, '[')) 890 { 891 bool matched_multi_char_construct; 892 size_t closing_bracket_idx; 893 unsigned char char_to_repeat; 894 count repeat_count; 895 int err; 896 897 matched_multi_char_construct = true; 898 if (es_match (es, i + 1, ':') || es_match (es, i + 1, '=')) 899 { 900 size_t closing_delim_idx; 901 902 if (find_closing_delim (es, i + 2, p[i + 1], &closing_delim_idx)) 903 { 904 size_t opnd_str_len = closing_delim_idx - 1 - (i + 2) + 1; 905 char const *opnd_str = p + i + 2; 906 907 if (opnd_str_len == 0) 908 { 909 if (p[i + 1] == ':') 910 error (0, 0, _("missing character class name `[::]'")); 911 else 912 error (0, 0, 913 _("missing equivalence class character `[==]'")); 914 return false; 915 } 916 917 if (p[i + 1] == ':') 918 { 919 /* FIXME: big comment. */ 920 if (!append_char_class (result, opnd_str, opnd_str_len)) 921 { 922 if (star_digits_closebracket (es, i + 2)) 923 goto try_bracketed_repeat; 924 else 925 { 926 char *tmp = make_printable_str (opnd_str, 927 opnd_str_len); 928 error (0, 0, _("invalid character class %s"), 929 quote (tmp)); 930 free (tmp); 931 return false; 932 } 933 } 934 } 935 else 936 { 937 /* FIXME: big comment. */ 938 if (!append_equiv_class (result, opnd_str, opnd_str_len)) 939 { 940 if (star_digits_closebracket (es, i + 2)) 941 goto try_bracketed_repeat; 942 else 943 { 944 char *tmp = make_printable_str (opnd_str, 945 opnd_str_len); 946 error (0, 0, 947 _("%s: equivalence class operand must be a single character"), 948 tmp); 949 free (tmp); 950 return false; 951 } 952 } 953 } 954 955 i = closing_delim_idx + 2; 956 continue; 957 } 958 /* Else fall through. This could be [:*] or [=*]. */ 959 } 960 961 try_bracketed_repeat: 962 963 /* Determine whether this is a bracketed repeat range 964 matching the RE \[.\*(dec_or_oct_number)?\]. */ 965 err = find_bracketed_repeat (es, i + 1, &char_to_repeat, 966 &repeat_count, 967 &closing_bracket_idx); 968 if (err == 0) 969 { 970 append_repeated_char (result, char_to_repeat, repeat_count); 971 i = closing_bracket_idx + 1; 972 } 973 else if (err == -1) 974 { 975 matched_multi_char_construct = false; 976 } 977 else 978 { 979 /* Found a string that looked like [c*n] but the 980 numeric part was invalid. */ 981 return false; 982 } 983 984 if (matched_multi_char_construct) 985 continue; 986 987 /* We reach this point if P does not match [:str:], [=c=], 988 [c*n], or [c*]. Now, see if P looks like a range `[-c' 989 (from `[' to `c'). */ 990 } 991 992 /* Look ahead one char for ranges like a-z. */ 993 if (es_match (es, i + 1, '-')) 994 { 995 if (!append_range (result, p[i], p[i + 2])) 996 return false; 997 i += 3; 998 } 999 else 1000 { 1001 append_normal_char (result, p[i]); 1002 ++i; 1003 } 1004 } 1005 1006 /* Now handle the (2 or fewer) remaining characters p[i]..p[es->len - 1]. */ 1007 for (; i < es->len; i++) 1008 append_normal_char (result, p[i]); 1009 1010 return true; 1011} 1012 1013/* Advance past the current construct. 1014 S->tail must be non-NULL. */ 1015static void 1016skip_construct (struct Spec_list *s) 1017{ 1018 s->tail = s->tail->next; 1019 s->state = NEW_ELEMENT; 1020} 1021 1022/* Given a Spec_list S (with its saved state implicit in the values 1023 of its members `tail' and `state'), return the next single character 1024 in the expansion of S's constructs. If the last character of S was 1025 returned on the previous call or if S was empty, this function 1026 returns -1. For example, successive calls to get_next where S 1027 represents the spec-string 'a-d[y*3]' will return the sequence 1028 of values a, b, c, d, y, y, y, -1. Finally, if the construct from 1029 which the returned character comes is [:upper:] or [:lower:], the 1030 parameter CLASS is given a value to indicate which it was. Otherwise 1031 CLASS is set to UL_NONE. This value is used only when constructing 1032 the translation table to verify that any occurrences of upper and 1033 lower class constructs in the spec-strings appear in the same relative 1034 positions. */ 1035 1036static int 1037get_next (struct Spec_list *s, enum Upper_Lower_class *class) 1038{ 1039 struct List_element *p; 1040 int return_val; 1041 int i; 1042 1043 if (class) 1044 *class = UL_NONE; 1045 1046 if (s->state == BEGIN_STATE) 1047 { 1048 s->tail = s->head->next; 1049 s->state = NEW_ELEMENT; 1050 } 1051 1052 p = s->tail; 1053 if (p == NULL) 1054 return -1; 1055 1056 switch (p->type) 1057 { 1058 case RE_NORMAL_CHAR: 1059 return_val = p->u.normal_char; 1060 s->state = NEW_ELEMENT; 1061 s->tail = p->next; 1062 break; 1063 1064 case RE_RANGE: 1065 if (s->state == NEW_ELEMENT) 1066 s->state = p->u.range.first_char; 1067 else 1068 ++(s->state); 1069 return_val = s->state; 1070 if (s->state == p->u.range.last_char) 1071 { 1072 s->tail = p->next; 1073 s->state = NEW_ELEMENT; 1074 } 1075 break; 1076 1077 case RE_CHAR_CLASS: 1078 if (class) 1079 { 1080 switch (p->u.char_class) 1081 { 1082 case CC_LOWER: 1083 *class = UL_LOWER; 1084 break; 1085 case CC_UPPER: 1086 *class = UL_UPPER; 1087 break; 1088 default: 1089 break; 1090 } 1091 } 1092 1093 if (s->state == NEW_ELEMENT) 1094 { 1095 for (i = 0; i < N_CHARS; i++) 1096 if (is_char_class_member (p->u.char_class, i)) 1097 break; 1098 assert (i < N_CHARS); 1099 s->state = i; 1100 } 1101 assert (is_char_class_member (p->u.char_class, s->state)); 1102 return_val = s->state; 1103 for (i = s->state + 1; i < N_CHARS; i++) 1104 if (is_char_class_member (p->u.char_class, i)) 1105 break; 1106 if (i < N_CHARS) 1107 s->state = i; 1108 else 1109 { 1110 s->tail = p->next; 1111 s->state = NEW_ELEMENT; 1112 } 1113 break; 1114 1115 case RE_EQUIV_CLASS: 1116 /* FIXME: this assumes that each character is alone in its own 1117 equivalence class (which appears to be correct for my 1118 LC_COLLATE. But I don't know of any function that allows 1119 one to determine a character's equivalence class. */ 1120 1121 return_val = p->u.equiv_code; 1122 s->state = NEW_ELEMENT; 1123 s->tail = p->next; 1124 break; 1125 1126 case RE_REPEATED_CHAR: 1127 /* Here, a repeat count of n == 0 means don't repeat at all. */ 1128 if (p->u.repeated_char.repeat_count == 0) 1129 { 1130 s->tail = p->next; 1131 s->state = NEW_ELEMENT; 1132 return_val = get_next (s, class); 1133 } 1134 else 1135 { 1136 if (s->state == NEW_ELEMENT) 1137 { 1138 s->state = 0; 1139 } 1140 ++(s->state); 1141 return_val = p->u.repeated_char.the_repeated_char; 1142 if (s->state == p->u.repeated_char.repeat_count) 1143 { 1144 s->tail = p->next; 1145 s->state = NEW_ELEMENT; 1146 } 1147 } 1148 break; 1149 1150 default: 1151 abort (); 1152 break; 1153 } 1154 1155 return return_val; 1156} 1157 1158/* This is a minor kludge. This function is called from 1159 get_spec_stats to determine the cardinality of a set derived 1160 from a complemented string. It's a kludge in that some of the 1161 same operations are (duplicated) performed in set_initialize. */ 1162 1163static int 1164card_of_complement (struct Spec_list *s) 1165{ 1166 int c; 1167 int cardinality = N_CHARS; 1168 bool in_set[N_CHARS] = { 0, }; 1169 1170 s->state = BEGIN_STATE; 1171 while ((c = get_next (s, NULL)) != -1) 1172 { 1173 cardinality -= (!in_set[c]); 1174 in_set[c] = true; 1175 } 1176 return cardinality; 1177} 1178 1179/* Gather statistics about the spec-list S in preparation for the tests 1180 in validate that determine the consistency of the specs. This function 1181 is called at most twice; once for string1, and again for any string2. 1182 LEN_S1 < 0 indicates that this is the first call and that S represents 1183 string1. When LEN_S1 >= 0, it is the length of the expansion of the 1184 constructs in string1, and we can use its value to resolve any 1185 indefinite repeat construct in S (which represents string2). Hence, 1186 this function has the side-effect that it converts a valid [c*] 1187 construct in string2 to [c*n] where n is large enough (or 0) to give 1188 string2 the same length as string1. For example, with the command 1189 tr a-z 'A[\n*]Z' on the second call to get_spec_stats, LEN_S1 would 1190 be 26 and S (representing string2) would be converted to 'A[\n*24]Z'. */ 1191 1192static void 1193get_spec_stats (struct Spec_list *s) 1194{ 1195 struct List_element *p; 1196 count length = 0; 1197 1198 s->n_indefinite_repeats = 0; 1199 s->has_equiv_class = false; 1200 s->has_restricted_char_class = false; 1201 s->has_char_class = false; 1202 for (p = s->head->next; p; p = p->next) 1203 { 1204 int i; 1205 count len = 0; 1206 count new_length; 1207 1208 switch (p->type) 1209 { 1210 case RE_NORMAL_CHAR: 1211 len = 1; 1212 break; 1213 1214 case RE_RANGE: 1215 assert (p->u.range.last_char >= p->u.range.first_char); 1216 len = p->u.range.last_char - p->u.range.first_char + 1; 1217 break; 1218 1219 case RE_CHAR_CLASS: 1220 s->has_char_class = true; 1221 for (i = 0; i < N_CHARS; i++) 1222 if (is_char_class_member (p->u.char_class, i)) 1223 ++len; 1224 switch (p->u.char_class) 1225 { 1226 case CC_UPPER: 1227 case CC_LOWER: 1228 break; 1229 default: 1230 s->has_restricted_char_class = true; 1231 break; 1232 } 1233 break; 1234 1235 case RE_EQUIV_CLASS: 1236 for (i = 0; i < N_CHARS; i++) 1237 if (is_equiv_class_member (p->u.equiv_code, i)) 1238 ++len; 1239 s->has_equiv_class = true; 1240 break; 1241 1242 case RE_REPEATED_CHAR: 1243 if (p->u.repeated_char.repeat_count > 0) 1244 len = p->u.repeated_char.repeat_count; 1245 else 1246 { 1247 s->indefinite_repeat_element = p; 1248 ++(s->n_indefinite_repeats); 1249 } 1250 break; 1251 1252 default: 1253 abort (); 1254 break; 1255 } 1256 1257 /* Check for arithmetic overflow in computing length. Also, reject 1258 any length greater than the maximum repeat count, in case the 1259 length is later used to compute the repeat count for an 1260 indefinite element. */ 1261 new_length = length + len; 1262 if (! (length <= new_length && new_length <= REPEAT_COUNT_MAXIMUM)) 1263 error (EXIT_FAILURE, 0, _("too many characters in set")); 1264 length = new_length; 1265 } 1266 1267 s->length = length; 1268} 1269 1270static void 1271get_s1_spec_stats (struct Spec_list *s1) 1272{ 1273 get_spec_stats (s1); 1274 if (complement) 1275 s1->length = card_of_complement (s1); 1276} 1277 1278static void 1279get_s2_spec_stats (struct Spec_list *s2, count len_s1) 1280{ 1281 get_spec_stats (s2); 1282 if (len_s1 >= s2->length && s2->n_indefinite_repeats == 1) 1283 { 1284 s2->indefinite_repeat_element->u.repeated_char.repeat_count = 1285 len_s1 - s2->length; 1286 s2->length = len_s1; 1287 } 1288} 1289 1290static void 1291spec_init (struct Spec_list *spec_list) 1292{ 1293 struct List_element *new = xmalloc (sizeof *new); 1294 spec_list->head = spec_list->tail = new; 1295 spec_list->head->next = NULL; 1296} 1297 1298/* This function makes two passes over the argument string S. The first 1299 one converts all \c and \ddd escapes to their one-byte representations. 1300 The second constructs a linked specification list, SPEC_LIST, of the 1301 characters and constructs that comprise the argument string. If either 1302 of these passes detects an error, this function returns false. */ 1303 1304static bool 1305parse_str (char const *s, struct Spec_list *spec_list) 1306{ 1307 struct E_string es; 1308 bool ok = unquote (s, &es) && build_spec_list (&es, spec_list); 1309 es_free (&es); 1310 return ok; 1311} 1312 1313/* Given two specification lists, S1 and S2, and assuming that 1314 S1->length > S2->length, append a single [c*n] element to S2 where c 1315 is the last character in the expansion of S2 and n is the difference 1316 between the two lengths. 1317 Upon successful completion, S2->length is set to S1->length. The only 1318 way this function can fail to make S2 as long as S1 is when S2 has 1319 zero-length, since in that case, there is no last character to repeat. 1320 So S2->length is required to be at least 1. 1321 1322 Providing this functionality allows the user to do some pretty 1323 non-BSD (and non-portable) things: For example, the command 1324 tr -cs '[:upper:]0-9' '[:lower:]' 1325 is almost guaranteed to give results that depend on your collating 1326 sequence. */ 1327 1328static void 1329string2_extend (const struct Spec_list *s1, struct Spec_list *s2) 1330{ 1331 struct List_element *p; 1332 unsigned char char_to_repeat; 1333 int i; 1334 1335 assert (translating); 1336 assert (s1->length > s2->length); 1337 assert (s2->length > 0); 1338 1339 p = s2->tail; 1340 switch (p->type) 1341 { 1342 case RE_NORMAL_CHAR: 1343 char_to_repeat = p->u.normal_char; 1344 break; 1345 case RE_RANGE: 1346 char_to_repeat = p->u.range.last_char; 1347 break; 1348 case RE_CHAR_CLASS: 1349 for (i = N_CHARS - 1; i >= 0; i--) 1350 if (is_char_class_member (p->u.char_class, i)) 1351 break; 1352 assert (i >= 0); 1353 char_to_repeat = i; 1354 break; 1355 1356 case RE_REPEATED_CHAR: 1357 char_to_repeat = p->u.repeated_char.the_repeated_char; 1358 break; 1359 1360 case RE_EQUIV_CLASS: 1361 /* This shouldn't happen, because validate exits with an error 1362 if it finds an equiv class in string2 when translating. */ 1363 abort (); 1364 break; 1365 1366 default: 1367 abort (); 1368 break; 1369 } 1370 1371 append_repeated_char (s2, char_to_repeat, s1->length - s2->length); 1372 s2->length = s1->length; 1373} 1374 1375/* Return true if S is a non-empty list in which exactly one 1376 character (but potentially, many instances of it) appears. 1377 E.g., [X*] or xxxxxxxx. */ 1378 1379static bool 1380homogeneous_spec_list (struct Spec_list *s) 1381{ 1382 int b, c; 1383 1384 s->state = BEGIN_STATE; 1385 1386 if ((b = get_next (s, NULL)) == -1) 1387 return false; 1388 1389 while ((c = get_next (s, NULL)) != -1) 1390 if (c != b) 1391 return false; 1392 1393 return true; 1394} 1395 1396/* Die with an error message if S1 and S2 describe strings that 1397 are not valid with the given command line switches. 1398 A side effect of this function is that if a valid [c*] or 1399 [c*0] construct appears in string2, it is converted to [c*n] 1400 with a value for n that makes s2->length == s1->length. By 1401 the same token, if the --truncate-set1 option is not 1402 given, S2 may be extended. */ 1403 1404static void 1405validate (struct Spec_list *s1, struct Spec_list *s2) 1406{ 1407 get_s1_spec_stats (s1); 1408 if (s1->n_indefinite_repeats > 0) 1409 { 1410 error (EXIT_FAILURE, 0, 1411 _("the [c*] repeat construct may not appear in string1")); 1412 } 1413 1414 if (s2) 1415 { 1416 get_s2_spec_stats (s2, s1->length); 1417 1418 if (s2->n_indefinite_repeats > 1) 1419 { 1420 error (EXIT_FAILURE, 0, 1421 _("only one [c*] repeat construct may appear in string2")); 1422 } 1423 1424 if (translating) 1425 { 1426 if (s2->has_equiv_class) 1427 { 1428 error (EXIT_FAILURE, 0, 1429 _("[=c=] expressions may not appear in string2 \ 1430when translating")); 1431 } 1432 1433 if (s1->length > s2->length) 1434 { 1435 if (!truncate_set1) 1436 { 1437 /* string2 must be non-empty unless --truncate-set1 is 1438 given or string1 is empty. */ 1439 1440 if (s2->length == 0) 1441 error (EXIT_FAILURE, 0, 1442 _("when not truncating set1, string2 must be non-empty")); 1443 string2_extend (s1, s2); 1444 } 1445 } 1446 1447 if (complement && s1->has_char_class 1448 && ! (s2->length == s1->length && homogeneous_spec_list (s2))) 1449 { 1450 error (EXIT_FAILURE, 0, 1451 _("when translating with complemented character classes,\ 1452\nstring2 must map all characters in the domain to one")); 1453 } 1454 1455 if (s2->has_restricted_char_class) 1456 { 1457 error (EXIT_FAILURE, 0, 1458 _("when translating, the only character classes that may \ 1459appear in\nstring2 are `upper' and `lower'")); 1460 } 1461 } 1462 else 1463 /* Not translating. */ 1464 { 1465 if (s2->n_indefinite_repeats > 0) 1466 error (EXIT_FAILURE, 0, 1467 _("the [c*] construct may appear in string2 only \ 1468when translating")); 1469 } 1470 } 1471} 1472 1473/* Read buffers of SIZE bytes via the function READER (if READER is 1474 NULL, read from stdin) until EOF. When non-NULL, READER is either 1475 read_and_delete or read_and_xlate. After each buffer is read, it is 1476 processed and written to stdout. The buffers are processed so that 1477 multiple consecutive occurrences of the same character in the input 1478 stream are replaced by a single occurrence of that character if the 1479 character is in the squeeze set. */ 1480 1481static void 1482squeeze_filter (char *buf, size_t size, size_t (*reader) (char *, size_t)) 1483{ 1484 /* A value distinct from any character that may have been stored in a 1485 buffer as the result of a block-read in the function squeeze_filter. */ 1486 enum { NOT_A_CHAR = CHAR_MAX + 1 }; 1487 1488 int char_to_squeeze = NOT_A_CHAR; 1489 size_t i = 0; 1490 size_t nr = 0; 1491 1492 for (;;) 1493 { 1494 size_t begin; 1495 1496 if (i >= nr) 1497 { 1498 nr = reader (buf, size); 1499 if (nr == 0) 1500 break; 1501 i = 0; 1502 } 1503 1504 begin = i; 1505 1506 if (char_to_squeeze == NOT_A_CHAR) 1507 { 1508 size_t out_len; 1509 /* Here, by being a little tricky, we can get a significant 1510 performance increase in most cases when the input is 1511 reasonably large. Since tr will modify the input only 1512 if two consecutive (and identical) input characters are 1513 in the squeeze set, we can step by two through the data 1514 when searching for a character in the squeeze set. This 1515 means there may be a little more work in a few cases and 1516 perhaps twice as much work in the worst cases where most 1517 of the input is removed by squeezing repeats. But most 1518 uses of this functionality seem to remove less than 20-30% 1519 of the input. */ 1520 for (; i < nr && !in_squeeze_set[to_uchar (buf[i])]; i += 2) 1521 continue; 1522 1523 /* There is a special case when i == nr and we've just 1524 skipped a character (the last one in buf) that is in 1525 the squeeze set. */ 1526 if (i == nr && in_squeeze_set[to_uchar (buf[i - 1])]) 1527 --i; 1528 1529 if (i >= nr) 1530 out_len = nr - begin; 1531 else 1532 { 1533 char_to_squeeze = buf[i]; 1534 /* We're about to output buf[begin..i]. */ 1535 out_len = i - begin + 1; 1536 1537 /* But since we stepped by 2 in the loop above, 1538 out_len may be one too large. */ 1539 if (i > 0 && buf[i - 1] == char_to_squeeze) 1540 --out_len; 1541 1542 /* Advance i to the index of first character to be 1543 considered when looking for a char different from 1544 char_to_squeeze. */ 1545 ++i; 1546 } 1547 if (out_len > 0 1548 && fwrite (&buf[begin], 1, out_len, stdout) != out_len) 1549 error (EXIT_FAILURE, errno, _("write error")); 1550 } 1551 1552 if (char_to_squeeze != NOT_A_CHAR) 1553 { 1554 /* Advance i to index of first char != char_to_squeeze 1555 (or to nr if all the rest of the characters in this 1556 buffer are the same as char_to_squeeze). */ 1557 for (; i < nr && buf[i] == char_to_squeeze; i++) 1558 continue; 1559 if (i < nr) 1560 char_to_squeeze = NOT_A_CHAR; 1561 /* If (i >= nr) we've squeezed the last character in this buffer. 1562 So now we have to read a new buffer and continue comparing 1563 characters against char_to_squeeze. */ 1564 } 1565 } 1566} 1567 1568static size_t 1569plain_read (char *buf, size_t size) 1570{ 1571 size_t nr = safe_read (STDIN_FILENO, buf, size); 1572 if (nr == SAFE_READ_ERROR) 1573 error (EXIT_FAILURE, errno, _("read error")); 1574 return nr; 1575} 1576 1577/* Read buffers of SIZE bytes from stdin until one is found that 1578 contains at least one character not in the delete set. Store 1579 in the array BUF, all characters from that buffer that are not 1580 in the delete set, and return the number of characters saved 1581 or 0 upon EOF. */ 1582 1583static size_t 1584read_and_delete (char *buf, size_t size) 1585{ 1586 size_t n_saved; 1587 1588 /* This enclosing do-while loop is to make sure that 1589 we don't return zero (indicating EOF) when we've 1590 just deleted all the characters in a buffer. */ 1591 do 1592 { 1593 size_t i; 1594 size_t nr = plain_read (buf, size); 1595 1596 if (nr == 0) 1597 return 0; 1598 1599 /* This first loop may be a waste of code, but gives much 1600 better performance when no characters are deleted in 1601 the beginning of a buffer. It just avoids the copying 1602 of buf[i] into buf[n_saved] when it would be a NOP. */ 1603 1604 for (i = 0; i < nr && !in_delete_set[to_uchar (buf[i])]; i++) 1605 continue; 1606 n_saved = i; 1607 1608 for (++i; i < nr; i++) 1609 if (!in_delete_set[to_uchar (buf[i])]) 1610 buf[n_saved++] = buf[i]; 1611 } 1612 while (n_saved == 0); 1613 1614 return n_saved; 1615} 1616 1617/* Read at most SIZE bytes from stdin into the array BUF. Then 1618 perform the in-place and one-to-one mapping specified by the global 1619 array `xlate'. Return the number of characters read, or 0 upon EOF. */ 1620 1621static size_t 1622read_and_xlate (char *buf, size_t size) 1623{ 1624 size_t bytes_read = plain_read (buf, size); 1625 size_t i; 1626 1627 for (i = 0; i < bytes_read; i++) 1628 buf[i] = xlate[to_uchar (buf[i])]; 1629 1630 return bytes_read; 1631} 1632 1633/* Initialize a boolean membership set, IN_SET, with the character 1634 values obtained by traversing the linked list of constructs S 1635 using the function `get_next'. IN_SET is expected to have been 1636 initialized to all zeros by the caller. If COMPLEMENT_THIS_SET 1637 is true the resulting set is complemented. */ 1638 1639static void 1640set_initialize (struct Spec_list *s, bool complement_this_set, bool *in_set) 1641{ 1642 int c; 1643 size_t i; 1644 1645 s->state = BEGIN_STATE; 1646 while ((c = get_next (s, NULL)) != -1) 1647 in_set[c] = true; 1648 if (complement_this_set) 1649 for (i = 0; i < N_CHARS; i++) 1650 in_set[i] = (!in_set[i]); 1651} 1652 1653int 1654main (int argc, char **argv) 1655{ 1656 int c; 1657 int non_option_args; 1658 int min_operands; 1659 int max_operands; 1660 struct Spec_list buf1, buf2; 1661 struct Spec_list *s1 = &buf1; 1662 struct Spec_list *s2 = &buf2; 1663 1664 initialize_main (&argc, &argv); 1665 set_program_name (argv[0]); 1666 setlocale (LC_ALL, ""); 1667 bindtextdomain (PACKAGE, LOCALEDIR); 1668 textdomain (PACKAGE); 1669 1670 atexit (close_stdout); 1671 1672 while ((c = getopt_long (argc, argv, "+cCdst", long_options, NULL)) != -1) 1673 { 1674 switch (c) 1675 { 1676 case 'c': 1677 case 'C': 1678 complement = true; 1679 break; 1680 1681 case 'd': 1682 delete = true; 1683 break; 1684 1685 case 's': 1686 squeeze_repeats = true; 1687 break; 1688 1689 case 't': 1690 truncate_set1 = true; 1691 break; 1692 1693 case_GETOPT_HELP_CHAR; 1694 1695 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS); 1696 1697 default: 1698 usage (EXIT_FAILURE); 1699 break; 1700 } 1701 } 1702 1703 non_option_args = argc - optind; 1704 translating = (non_option_args == 2 && !delete); 1705 min_operands = 1 + (delete == squeeze_repeats); 1706 max_operands = 1 + (delete <= squeeze_repeats); 1707 1708 if (non_option_args < min_operands) 1709 { 1710 if (non_option_args == 0) 1711 error (0, 0, _("missing operand")); 1712 else 1713 { 1714 error (0, 0, _("missing operand after %s"), quote (argv[argc - 1])); 1715 fprintf (stderr, "%s\n", 1716 _(squeeze_repeats 1717 ? N_("Two strings must be given when " 1718 "both deleting and squeezing repeats.") 1719 : N_("Two strings must be given when translating."))); 1720 } 1721 usage (EXIT_FAILURE); 1722 } 1723 1724 if (max_operands < non_option_args) 1725 { 1726 error (0, 0, _("extra operand %s"), quote (argv[optind + max_operands])); 1727 if (non_option_args == 2) 1728 fprintf (stderr, "%s\n", 1729 _("Only one string may be given when " 1730 "deleting without squeezing repeats.")); 1731 usage (EXIT_FAILURE); 1732 } 1733 1734 spec_init (s1); 1735 if (!parse_str (argv[optind], s1)) 1736 exit (EXIT_FAILURE); 1737 1738 if (non_option_args == 2) 1739 { 1740 spec_init (s2); 1741 if (!parse_str (argv[optind + 1], s2)) 1742 exit (EXIT_FAILURE); 1743 } 1744 else 1745 s2 = NULL; 1746 1747 validate (s1, s2); 1748 1749 /* Use binary I/O, since `tr' is sometimes used to transliterate 1750 non-printable characters, or characters which are stripped away 1751 by text-mode reads (like CR and ^Z). */ 1752 if (O_BINARY && ! isatty (STDIN_FILENO)) 1753 xfreopen (NULL, "rb", stdin); 1754 if (O_BINARY && ! isatty (STDOUT_FILENO)) 1755 xfreopen (NULL, "wb", stdout); 1756 1757 if (squeeze_repeats && non_option_args == 1) 1758 { 1759 set_initialize (s1, complement, in_squeeze_set); 1760 squeeze_filter (io_buf, sizeof io_buf, plain_read); 1761 } 1762 else if (delete && non_option_args == 1) 1763 { 1764 set_initialize (s1, complement, in_delete_set); 1765 1766 for (;;) 1767 { 1768 size_t nr = read_and_delete (io_buf, sizeof io_buf); 1769 if (nr == 0) 1770 break; 1771 if (fwrite (io_buf, 1, nr, stdout) != nr) 1772 error (EXIT_FAILURE, errno, _("write error")); 1773 } 1774 } 1775 else if (squeeze_repeats && delete && non_option_args == 2) 1776 { 1777 set_initialize (s1, complement, in_delete_set); 1778 set_initialize (s2, false, in_squeeze_set); 1779 squeeze_filter (io_buf, sizeof io_buf, read_and_delete); 1780 } 1781 else if (translating) 1782 { 1783 if (complement) 1784 { 1785 int i; 1786 bool *in_s1 = in_delete_set; 1787 1788 set_initialize (s1, false, in_s1); 1789 s2->state = BEGIN_STATE; 1790 for (i = 0; i < N_CHARS; i++) 1791 xlate[i] = i; 1792 for (i = 0; i < N_CHARS; i++) 1793 { 1794 if (!in_s1[i]) 1795 { 1796 int ch = get_next (s2, NULL); 1797 assert (ch != -1 || truncate_set1); 1798 if (ch == -1) 1799 { 1800 /* This will happen when tr is invoked like e.g. 1801 tr -cs A-Za-z0-9 '\012'. */ 1802 break; 1803 } 1804 xlate[i] = ch; 1805 } 1806 } 1807 } 1808 else 1809 { 1810 int c1, c2; 1811 int i; 1812 bool case_convert = false; 1813 enum Upper_Lower_class class_s1; 1814 enum Upper_Lower_class class_s2; 1815 1816 for (i = 0; i < N_CHARS; i++) 1817 xlate[i] = i; 1818 s1->state = BEGIN_STATE; 1819 s2->state = BEGIN_STATE; 1820 for (;;) 1821 { 1822 /* When the previous pair identified case-converting classes, 1823 advance S1 and S2 so that each points to the following 1824 construct. */ 1825 if (case_convert) 1826 { 1827 skip_construct (s1); 1828 skip_construct (s2); 1829 case_convert = false; 1830 } 1831 1832 c1 = get_next (s1, &class_s1); 1833 c2 = get_next (s2, &class_s2); 1834 1835 /* When translating and there is an [:upper:] or [:lower:] 1836 class in SET2, then there must be a corresponding [:lower:] 1837 or [:upper:] class in SET1. */ 1838 if (class_s1 == UL_NONE 1839 && (class_s2 == UL_LOWER || class_s2 == UL_UPPER)) 1840 error (EXIT_FAILURE, 0, 1841 _("misaligned [:upper:] and/or [:lower:] construct")); 1842 1843 if (class_s1 == UL_LOWER && class_s2 == UL_UPPER) 1844 { 1845 case_convert = true; 1846 for (i = 0; i < N_CHARS; i++) 1847 if (islower (i)) 1848 xlate[i] = toupper (i); 1849 } 1850 else if (class_s1 == UL_UPPER && class_s2 == UL_LOWER) 1851 { 1852 case_convert = true; 1853 for (i = 0; i < N_CHARS; i++) 1854 if (isupper (i)) 1855 xlate[i] = tolower (i); 1856 } 1857 else if ((class_s1 == UL_LOWER && class_s2 == UL_LOWER) 1858 || (class_s1 == UL_UPPER && class_s2 == UL_UPPER)) 1859 { 1860 /* POSIX says the behavior of `tr "[:upper:]" "[:upper:]"' 1861 is undefined. Treat it as a no-op. */ 1862 } 1863 else 1864 { 1865 /* The following should have been checked by validate... */ 1866 if (c1 == -1 || c2 == -1) 1867 break; 1868 xlate[c1] = c2; 1869 } 1870 } 1871 assert (c1 == -1 || truncate_set1); 1872 } 1873 if (squeeze_repeats) 1874 { 1875 set_initialize (s2, false, in_squeeze_set); 1876 squeeze_filter (io_buf, sizeof io_buf, read_and_xlate); 1877 } 1878 else 1879 { 1880 for (;;) 1881 { 1882 size_t bytes_read = read_and_xlate (io_buf, sizeof io_buf); 1883 if (bytes_read == 0) 1884 break; 1885 if (fwrite (io_buf, 1, bytes_read, stdout) != bytes_read) 1886 error (EXIT_FAILURE, errno, _("write error")); 1887 } 1888 } 1889 } 1890 1891 if (close (STDIN_FILENO) != 0) 1892 error (EXIT_FAILURE, errno, _("standard input")); 1893 1894 exit (EXIT_SUCCESS); 1895} 1896