/*- * Copyright 2010 Nexenta Systems, Inc. All rights reserved. * Copyright 2015 John Marino * * This source code is derived from the illumos localedef command, and * provided under BSD-style license terms by Nexenta Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * This file contains the "scanner", which tokenizes the input files * for localedef for processing by the higher level grammar processor. */ #include #include #include #include #include #include #include #include #include #include "localedef.h" #include "parser.h" int com_char = '#'; int esc_char = '\\'; int mb_cur_min = 1; int mb_cur_max = 1; int lineno = 1; int warnings = 0; int is_stdin = 1; FILE *input; static int nextline; //static FILE *input = stdin; static const char *filename = ""; static int instring = 0; static int escaped = 0; /* * Token space ... grows on demand. */ static char *token = NULL; static int tokidx; static int toksz = 0; static int hadtok = 0; /* * Wide string space ... grows on demand. */ static wchar_t *widestr = NULL; static int wideidx = 0; static int widesz = 0; /* * The last keyword seen. This is useful to trigger the special lexer rules * for "copy" and also collating symbols and elements. */ int last_kw = 0; static int category = T_END; static struct token { int id; const char *name; } keywords[] = { { T_COM_CHAR, "comment_char" }, { T_ESC_CHAR, "escape_char" }, { T_END, "END" }, { T_COPY, "copy" }, { T_MESSAGES, "LC_MESSAGES" }, { T_YESSTR, "yesstr" }, { T_YESEXPR, "yesexpr" }, { T_NOSTR, "nostr" }, { T_NOEXPR, "noexpr" }, { T_MONETARY, "LC_MONETARY" }, { T_INT_CURR_SYMBOL, "int_curr_symbol" }, { T_CURRENCY_SYMBOL, "currency_symbol" }, { T_MON_DECIMAL_POINT, "mon_decimal_point" }, { T_MON_THOUSANDS_SEP, "mon_thousands_sep" }, { T_POSITIVE_SIGN, "positive_sign" }, { T_NEGATIVE_SIGN, "negative_sign" }, { T_MON_GROUPING, "mon_grouping" }, { T_INT_FRAC_DIGITS, "int_frac_digits" }, { T_FRAC_DIGITS, "frac_digits" }, { T_P_CS_PRECEDES, "p_cs_precedes" }, { T_P_SEP_BY_SPACE, "p_sep_by_space" }, { T_N_CS_PRECEDES, "n_cs_precedes" }, { T_N_SEP_BY_SPACE, "n_sep_by_space" }, { T_P_SIGN_POSN, "p_sign_posn" }, { T_N_SIGN_POSN, "n_sign_posn" }, { T_INT_P_CS_PRECEDES, "int_p_cs_precedes" }, { T_INT_N_CS_PRECEDES, "int_n_cs_precedes" }, { T_INT_P_SEP_BY_SPACE, "int_p_sep_by_space" }, { T_INT_N_SEP_BY_SPACE, "int_n_sep_by_space" }, { T_INT_P_SIGN_POSN, "int_p_sign_posn" }, { T_INT_N_SIGN_POSN, "int_n_sign_posn" }, { T_COLLATE, "LC_COLLATE" }, { T_COLLATING_SYMBOL, "collating-symbol" }, { T_COLLATING_ELEMENT, "collating-element" }, { T_FROM, "from" }, { T_ORDER_START, "order_start" }, { T_ORDER_END, "order_end" }, { T_FORWARD, "forward" }, { T_BACKWARD, "backward" }, { T_POSITION, "position" }, { T_IGNORE, "IGNORE" }, { T_UNDEFINED, "UNDEFINED" }, { T_NUMERIC, "LC_NUMERIC" }, { T_DECIMAL_POINT, "decimal_point" }, { T_THOUSANDS_SEP, "thousands_sep" }, { T_GROUPING, "grouping" }, { T_TIME, "LC_TIME" }, { T_ABDAY, "abday" }, { T_DAY, "day" }, { T_ABMON, "abmon" }, { T_MON, "mon" }, { T_D_T_FMT, "d_t_fmt" }, { T_D_FMT, "d_fmt" }, { T_T_FMT, "t_fmt" }, { T_AM_PM, "am_pm" }, { T_T_FMT_AMPM, "t_fmt_ampm" }, { T_ERA, "era" }, { T_ERA_D_FMT, "era_d_fmt" }, { T_ERA_T_FMT, "era_t_fmt" }, { T_ERA_D_T_FMT, "era_d_t_fmt" }, { T_ALT_DIGITS, "alt_digits" }, { T_CTYPE, "LC_CTYPE" }, { T_ISUPPER, "upper" }, { T_ISLOWER, "lower" }, { T_ISALPHA, "alpha" }, { T_ISDIGIT, "digit" }, { T_ISPUNCT, "punct" }, { T_ISXDIGIT, "xdigit" }, { T_ISSPACE, "space" }, { T_ISPRINT, "print" }, { T_ISGRAPH, "graph" }, { T_ISBLANK, "blank" }, { T_ISCNTRL, "cntrl" }, /* * These entries are local additions, and not specified by * TOG. Note that they are not guaranteed to be accurate for * all locales, and so applications should not depend on them. */ { T_ISSPECIAL, "special" }, { T_ISENGLISH, "english" }, { T_ISPHONOGRAM, "phonogram" }, { T_ISIDEOGRAM, "ideogram" }, { T_ISNUMBER, "number" }, /* * We have to support this in the grammar, but it would be a * syntax error to define a character as one of these without * also defining it as an alpha or digit. We ignore it in our * parsing. */ { T_ISALNUM, "alnum" }, { T_TOUPPER, "toupper" }, { T_TOLOWER, "tolower" }, /* * These are keywords used in the charmap file. Note that * Solaris originally used angle brackets to wrap some of them, * but we removed that to simplify our parser. The first of these * items are "global items." */ { T_CHARMAP, "CHARMAP" }, { T_WIDTH, "WIDTH" }, { -1, NULL }, }; /* * These special words are only used in a charmap file, enclosed in <>. */ static struct token symwords[] = { { T_COM_CHAR, "comment_char" }, { T_ESC_CHAR, "escape_char" }, { T_CODE_SET, "code_set_name" }, { T_MB_CUR_MAX, "mb_cur_max" }, { T_MB_CUR_MIN, "mb_cur_min" }, { -1, NULL }, }; static int categories[] = { T_CHARMAP, T_CTYPE, T_COLLATE, T_MESSAGES, T_MONETARY, T_NUMERIC, T_TIME, T_WIDTH, 0 }; void reset_scanner(const char *fname) { if (fname == NULL) { filename = ""; is_stdin = 1; } else { if (!is_stdin) (void) fclose(input); if ((input = fopen(fname, "r")) == NULL) { perror("fopen"); exit(4); } else { is_stdin = 0; } filename = fname; } com_char = '#'; esc_char = '\\'; instring = 0; escaped = 0; lineno = 1; nextline = 1; tokidx = 0; wideidx = 0; } #define hex(x) \ (isdigit(x) ? (x - '0') : ((islower(x) ? (x - 'a') : (x - 'A')) + 10)) #define isodigit(x) ((x >= '0') && (x <= '7')) static int scanc(void) { int c; if (is_stdin) c = getc(stdin); else c = getc(input); lineno = nextline; if (c == '\n') { nextline++; } return (c); } static void unscanc(int c) { if (c == '\n') { nextline--; } if (ungetc(c, is_stdin ? stdin : input) < 0) { yyerror("ungetc failed"); } } static int scan_hex_byte(void) { int c1, c2; int v; c1 = scanc(); if (!isxdigit(c1)) { yyerror("malformed hex digit"); return (0); } c2 = scanc(); if (!isxdigit(c2)) { yyerror("malformed hex digit"); return (0); } v = ((hex(c1) << 4) | hex(c2)); return (v); } static int scan_dec_byte(void) { int c1, c2, c3; int b; c1 = scanc(); if (!isdigit(c1)) { yyerror("malformed decimal digit"); return (0); } b = c1 - '0'; c2 = scanc(); if (!isdigit(c2)) { yyerror("malformed decimal digit"); return (0); } b *= 10; b += (c2 - '0'); c3 = scanc(); if (!isdigit(c3)) { unscanc(c3); } else { b *= 10; b += (c3 - '0'); } return (b); } static int scan_oct_byte(void) { int c1, c2, c3; int b; b = 0; c1 = scanc(); if (!isodigit(c1)) { yyerror("malformed octal digit"); return (0); } b = c1 - '0'; c2 = scanc(); if (!isodigit(c2)) { yyerror("malformed octal digit"); return (0); } b *= 8; b += (c2 - '0'); c3 = scanc(); if (!isodigit(c3)) { unscanc(c3); } else { b *= 8; b += (c3 - '0'); } return (b); } void add_tok(int c) { if ((tokidx + 1) >= toksz) { toksz += 64; if ((token = realloc(token, toksz)) == NULL) { yyerror("out of memory"); tokidx = 0; toksz = 0; return; } } token[tokidx++] = (char)c; token[tokidx] = 0; } void add_wcs(wchar_t c) { if ((wideidx + 1) >= widesz) { widesz += 64; widestr = realloc(widestr, (widesz * sizeof (wchar_t))); if (widestr == NULL) { yyerror("out of memory"); wideidx = 0; widesz = 0; return; } } widestr[wideidx++] = c; widestr[wideidx] = 0; } wchar_t * get_wcs(void) { wchar_t *ws = widestr; wideidx = 0; widestr = NULL; widesz = 0; if (ws == NULL) { if ((ws = wcsdup(L"")) == NULL) { yyerror("out of memory"); } } return (ws); } static int get_byte(void) { int c; if ((c = scanc()) != esc_char) { unscanc(c); return (EOF); } c = scanc(); switch (c) { case 'd': case 'D': return (scan_dec_byte()); case 'x': case 'X': return (scan_hex_byte()); case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': /* put the character back so we can get it */ unscanc(c); return (scan_oct_byte()); default: unscanc(c); unscanc(esc_char); return (EOF); } } int get_escaped(int c) { switch (c) { case 'n': return ('\n'); case 'r': return ('\r'); case 't': return ('\t'); case 'f': return ('\f'); case 'v': return ('\v'); case 'b': return ('\b'); case 'a': return ('\a'); default: return (c); } } int get_wide(void) { static char mbs[MB_LEN_MAX + 1] = ""; static int mbi = 0; int c; wchar_t wc; if (mb_cur_max >= (int)sizeof (mbs)) { yyerror("max multibyte character size too big"); mbi = 0; return (T_NULL); } for (;;) { if ((mbi == mb_cur_max) || ((c = get_byte()) == EOF)) { /* * end of the byte sequence reached, but no * valid wide decoding. fatal error. */ mbi = 0; yyerror("not a valid character encoding"); return (T_NULL); } mbs[mbi++] = c; mbs[mbi] = 0; /* does it decode? */ if (to_wide(&wc, mbs) >= 0) { break; } } mbi = 0; if ((category != T_CHARMAP) && (category != T_WIDTH)) { if (check_charmap(wc) < 0) { yyerror("no symbolic name for character"); return (T_NULL); } } yylval.wc = wc; return (T_CHAR); } int get_symbol(void) { int c; while ((c = scanc()) != EOF) { if (escaped) { escaped = 0; if (c == '\n') continue; add_tok(get_escaped(c)); continue; } if (c == esc_char) { escaped = 1; continue; } if (c == '\n') { /* well that's strange! */ yyerror("unterminated symbolic name"); continue; } if (c == '>') { /* end of symbol */ /* * This restarts the token from the beginning * the next time we scan a character. (This * token is complete.) */ if (token == NULL) { yyerror("missing symbolic name"); return (T_NULL); } tokidx = 0; /* * A few symbols are handled as keywords outside * of the normal categories. */ if (category == T_END) { int i; for (i = 0; symwords[i].name != 0; i++) { if (strcmp(token, symwords[i].name) == 0) { last_kw = symwords[i].id; return (last_kw); } } } /* * Contextual rule: Only literal characters are * permitted in CHARMAP. Anywhere else the symbolic * forms are fine. */ if ((category != T_CHARMAP) && (lookup_charmap(token, &yylval.wc)) != -1) { return (T_CHAR); } if ((yylval.collsym = lookup_collsym(token)) != NULL) { return (T_COLLSYM); } if ((yylval.collelem = lookup_collelem(token)) != NULL) { return (T_COLLELEM); } /* its an undefined symbol */ yylval.token = strdup(token); token = NULL; toksz = 0; tokidx = 0; return (T_SYMBOL); } add_tok(c); } yyerror("unterminated symbolic name"); return (EOF); } int get_category(void) { return (category); } static int consume_token(void) { int len = tokidx; int i; tokidx = 0; if (token == NULL) return (T_NULL); /* * this one is special, because we don't want it to alter the * last_kw field. */ if (strcmp(token, "...") == 0) { return (T_ELLIPSIS); } /* search for reserved words first */ for (i = 0; keywords[i].name; i++) { int j; if (strcmp(keywords[i].name, token) != 0) { continue; } last_kw = keywords[i].id; /* clear the top level category if we're done with it */ if (last_kw == T_END) { category = T_END; } /* set the top level category if we're changing */ for (j = 0; categories[j]; j++) { if (categories[j] != last_kw) continue; category = last_kw; } return (keywords[i].id); } /* maybe its a numeric constant? */ if (isdigit(*token) || (*token == '-' && isdigit(token[1]))) { char *eptr; yylval.num = strtol(token, &eptr, 10); if (*eptr != 0) yyerror("malformed number"); return (T_NUMBER); } /* * A single lone character is treated as a character literal. * To avoid duplication of effort, we stick in the charmap. */ if (len == 1) { yylval.wc = token[0]; return (T_CHAR); } /* anything else is treated as a symbolic name */ yylval.token = strdup(token); token = NULL; toksz = 0; tokidx = 0; return (T_NAME); } void scan_to_eol(void) { int c; while ((c = scanc()) != '\n') { if (c == EOF) { /* end of file without newline! */ errf("missing newline"); return; } } assert(c == '\n'); } int yylex(void) { int c; while ((c = scanc()) != EOF) { /* special handling for quoted string */ if (instring) { if (escaped) { escaped = 0; /* if newline, just eat and forget it */ if (c == '\n') continue; if (strchr("xXd01234567", c)) { unscanc(c); unscanc(esc_char); return (get_wide()); } yylval.wc = get_escaped(c); return (T_CHAR); } if (c == esc_char) { escaped = 1; continue; } switch (c) { case '<': return (get_symbol()); case '>': /* oops! should generate syntax error */ return (T_GT); case '"': instring = 0; return (T_QUOTE); default: yylval.wc = c; return (T_CHAR); } } /* escaped characters first */ if (escaped) { escaped = 0; if (c == '\n') { /* eat the newline */ continue; } hadtok = 1; if (tokidx) { /* an escape mid-token is nonsense */ return (T_NULL); } /* numeric escapes are treated as wide characters */ if (strchr("xXd01234567", c)) { unscanc(c); unscanc(esc_char); return (get_wide()); } add_tok(get_escaped(c)); continue; } /* if it is the escape charter itself note it */ if (c == esc_char) { escaped = 1; continue; } /* remove from the comment char to end of line */ if (c == com_char) { while (c != '\n') { if ((c = scanc()) == EOF) { /* end of file without newline! */ return (EOF); } } assert(c == '\n'); if (!hadtok) { /* * If there were no tokens on this line, * then just pretend it didn't exist at all. */ continue; } hadtok = 0; return (T_NL); } if (strchr(" \t\n;()<>,\"", c) && (tokidx != 0)) { /* * These are all token delimiters. If there * is a token already in progress, we need to * process it. */ unscanc(c); return (consume_token()); } switch (c) { case '\n': if (!hadtok) { /* * If the line was completely devoid of tokens, * then just ignore it. */ continue; } /* we're starting a new line, reset the token state */ hadtok = 0; return (T_NL); case ',': hadtok = 1; return (T_COMMA); case ';': hadtok = 1; return (T_SEMI); case '(': hadtok = 1; return (T_LPAREN); case ')': hadtok = 1; return (T_RPAREN); case '>': hadtok = 1; return (T_GT); case '<': /* symbol start! */ hadtok = 1; return (get_symbol()); case ' ': case '\t': /* whitespace, just ignore it */ continue; case '"': hadtok = 1; instring = 1; return (T_QUOTE); default: hadtok = 1; add_tok(c); continue; } } return (EOF); } void yyerror(const char *msg) { (void) fprintf(stderr, "%s: %d: error: %s\n", filename, lineno, msg); exit(4); } void errf(const char *fmt, ...) { char *msg; va_list va; va_start(va, fmt); (void) vasprintf(&msg, fmt, va); va_end(va); (void) fprintf(stderr, "%s: %d: error: %s\n", filename, lineno, msg); free(msg); exit(4); } void warn(const char *fmt, ...) { char *msg; va_list va; va_start(va, fmt); (void) vasprintf(&msg, fmt, va); va_end(va); (void) fprintf(stderr, "%s: %d: warning: %s\n", filename, lineno, msg); free(msg); warnings++; if (!warnok) exit(4); }