gnulib/lib/regcomp.c

55 static Idx fetch_number (re_string_t *input, re_token_t *token,
57 static int peek_token (re_token_t *token, re_string_t *input,
62 				  re_token_t *token, reg_syntax_t syntax,
65 				 re_token_t *token, reg_syntax_t syntax,
68 				     re_token_t *token, reg_syntax_t syntax,
71 				  re_token_t *token, reg_syntax_t syntax,
74 				 re_dfa_t *dfa, re_token_t *token,
77 				      re_token_t *token, reg_syntax_t syntax,
81 					    re_token_t *token, int token_len,
87 					  re_token_t *token);
117 				      const re_token_t *token);
1254   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1256       int idx = node->token.opr.idx;
1257       node->token.opr.idx = dfa->subexp_map[idx];
1258       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1261   else if (node->token.type == SUBEXP
1262            && node->left && node->left->token.type == SUBEXP)
1264       Idx other_idx = node->left->token.opr.idx;
1270       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1286   if (node->left && node->left->token.type == SUBEXP)
1292   if (node->right && node->right->token.type == SUBEXP)
1315       && (node->token.opr.idx >= BITSET_WORD_BITS
1317 	       & ((bitset_word_t) 1 << node->token.opr.idx))))
1332   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1333   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1343   if (node->token.type == CONCAT)
1351       node->node_idx = re_dfa_add_node (dfa, node->token);
1362   switch (node->token.type)
1389   switch (node->token.type)
1425       if (node->token.type == OP_BACK_REF)
1430       assert (!IS_EPSILON_NODE (node->token.type));
1736 /* Functions for token which are used in the parser.  */
1738 /* Fetch a token from INPUT.
1748 /* Peek a token from INPUT, and return the length of the token.
1753 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1759       token->type = END_OF_RE;
1764   token->opr.c = c;
1766   token->word_char = 0;
1768   token->mb_partial = 0;
1772       token->type = CHARACTER;
1773       token->mb_partial = 1;
1782 	  token->type = BACK_SLASH;
1787       token->opr.c = c2;
1788       token->type = CHARACTER;
1794 	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1798 	token->word_char = IS_WORD_CHAR (c2) != 0;
1804 	    token->type = OP_ALT;
1810 	      token->type = OP_BACK_REF;
1811 	      token->opr.idx = c2 - '1';
1817 	      token->type = ANCHOR;
1818 	      token->opr.ctx_type = WORD_FIRST;
1824 	      token->type = ANCHOR;
1825 	      token->opr.ctx_type = WORD_LAST;
1831 	      token->type = ANCHOR;
1832 	      token->opr.ctx_type = WORD_DELIM;
1838 	      token->type = ANCHOR;
1839 	      token->opr.ctx_type = NOT_WORD_DELIM;
1844 	    token->type = OP_WORD;
1848 	    token->type = OP_NOTWORD;
1852 	    token->type = OP_SPACE;
1856 	    token->type = OP_NOTSPACE;
1861 	      token->type = ANCHOR;
1862 	      token->opr.ctx_type = BUF_FIRST;
1868 	      token->type = ANCHOR;
1869 	      token->opr.ctx_type = BUF_LAST;
1874 	    token->type = OP_OPEN_SUBEXP;
1878 	    token->type = OP_CLOSE_SUBEXP;
1882 	    token->type = OP_DUP_PLUS;
1886 	    token->type = OP_DUP_QUESTION;
1890 	    token->type = OP_OPEN_DUP_NUM;
1894 	    token->type = OP_CLOSE_DUP_NUM;
1902   token->type = CHARACTER;
1907       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1911     token->word_char = IS_WORD_CHAR (token->opr.c);
1917 	token->type = OP_ALT;
1921 	token->type = OP_ALT;
1924       token->type = OP_DUP_ASTERISK;
1928 	token->type = OP_DUP_PLUS;
1932 	token->type = OP_DUP_QUESTION;
1936 	token->type = OP_OPEN_DUP_NUM;
1940 	token->type = OP_CLOSE_DUP_NUM;
1944 	token->type = OP_OPEN_SUBEXP;
1948 	token->type = OP_CLOSE_SUBEXP;
1951       token->type = OP_OPEN_BRACKET;
1954       token->type = OP_PERIOD;
1964       token->type = ANCHOR;
1965       token->opr.ctx_type = LINE_FIRST;
1978       token->type = ANCHOR;
1979       token->opr.ctx_type = LINE_LAST;
1987 /* Peek a token from INPUT, and return the length of the token.
1992 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1997       token->type = END_OF_RE;
2001   token->opr.c = c;
2007       token->type = CHARACTER;
2019       token->opr.c = c2;
2020       token->type = CHARACTER;
2031       token->opr.c = c2;
2036 	  token->type = OP_OPEN_COLL_ELEM;
2039 	  token->type = OP_OPEN_EQUIV_CLASS;
2044 	      token->type = OP_OPEN_CHAR_CLASS;
2049 	  token->type = CHARACTER;
2050 	  token->opr.c = c;
2059       token->type = OP_CHARSET_RANGE;
2062       token->type = OP_CLOSE_BRACKET;
2065       token->type = OP_NON_MATCH_LIST;
2068       token->type = CHARACTER;
2123 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2128   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2132   while (token->type == OP_ALT)
2134       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2135       if (token->type != OP_ALT && token->type != END_OF_RE
2136 	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2138 	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
2164 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2169   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2173   while (token->type != OP_ALT && token->type != END_OF_RE
2174 	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2176       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2204 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2209   switch (token->type)
2212       tree = create_token_tree (dfa, NULL, NULL, token);
2225 	      fetch_token (token, regexp, syntax);
2226 	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2238       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2243       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2248       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2253       dfa->used_bkref_map |= 1 << token->opr.idx;
2254       tree = create_token_tree (dfa, NULL, NULL, token);
2280 	  fetch_token (token, regexp, syntax);
2281 	  return parse_expression (regexp, preg, token, syntax, nest, err);
2285       if ((token->type == OP_CLOSE_SUBEXP) &&
2296       token->type = CHARACTER;
2299       tree = create_token_tree (dfa, NULL, NULL, token);
2307       if ((token->opr.ctx_type
2311       if (token->opr.ctx_type == WORD_DELIM
2312           || token->opr.ctx_type == NOT_WORD_DELIM)
2315 	  if (token->opr.ctx_type == WORD_DELIM)
2317 	      token->opr.ctx_type = WORD_FIRST;
2318 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2319 	      token->opr.ctx_type = WORD_LAST;
2323 	      token->opr.ctx_type = INSIDE_WORD;
2324 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2325 	      token->opr.ctx_type = INSIDE_NOTWORD;
2327 	  tree_last = create_token_tree (dfa, NULL, NULL, token);
2337 	  tree = create_token_tree (dfa, NULL, NULL, token);
2348       fetch_token (token, regexp, syntax);
2351       tree = create_token_tree (dfa, NULL, NULL, token);
2365 				 token->type == OP_NOTWORD, err);
2374 				 token->type == OP_NOTSPACE, err);
2391   fetch_token (token, regexp, syntax);
2393   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2394 	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2396       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2401 	  && (token->type == OP_DUP_ASTERISK
2402 	      || token->type == OP_OPEN_DUP_NUM))
2420 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2428   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2431   if (token->type == OP_CLOSE_SUBEXP)
2435       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2436       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2451   tree->token.opr.idx = cur_nsub;
2459 	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2463   re_token_t start_token = *token;
2465   if (token->type == OP_OPEN_DUP_NUM)
2468       start = fetch_number (regexp, token, syntax);
2471 	  if (token->type == CHARACTER && token->opr.c == ',')
2482 	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2483 		 : ((token->type == CHARACTER && token->opr.c == ',')
2484 		    ? fetch_number (regexp, token, syntax) : REG_ERROR));
2491 	      if (token->type == END_OF_RE)
2501 	  *token = start_token;
2502 	  token->type = CHARACTER;
2517       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2518       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2521   fetch_token (token, regexp, syntax);
2553   if (elem->token.type == SUBEXP)
2554     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2743 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
3051   token_len = peek_token_bracket (token, regexp, syntax);
3052   if (BE (token->type == END_OF_RE, 0))
3057   if (token->type == OP_NON_MATCH_LIST)
3065       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3066       token_len = peek_token_bracket (token, regexp, syntax);
3067       if (BE (token->type == END_OF_RE, 0))
3075   if (token->type == OP_CLOSE_BRACKET)
3076     token->type = CHARACTER;
3089       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3098       /* Get information about the next token.  We need it in any case.  */
3099       token_len = peek_token_bracket (token, regexp, syntax);
3104 	  if (BE (token->type == END_OF_RE, 0))
3109 	  if (token->type == OP_CHARSET_RANGE)
3122 		  token->type = CHARACTER;
3140 	  token_len = peek_token_bracket (token, regexp, syntax);
3215       if (BE (token->type == END_OF_RE, 0))
3220       if (token->type == OP_CLOSE_BRACKET)
3224   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3302 		       re_token_t *token, int token_len, re_dfa_t *dfa,
3316   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3317   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3318       || token->type == OP_OPEN_EQUIV_CLASS)
3319     return parse_bracket_symbol (elem, regexp, token);
3320   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3332   elem->opr.ch = token->opr.c;
3342 		      re_token_t *token)
3344   unsigned char ch, delim = token->opr.c;
3352       if (token->type == OP_OPEN_CHAR_CLASS)
3364   switch (token->type)
3676 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3682       fetch_token (token, input, syntax);
3683       c = token->opr.c;
3684       if (BE (token->type == END_OF_RE, 0))
3686       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3688       num = ((token->type != CHARACTER || c < '0' || '9' < c
3730 		   const re_token_t *token)
3748   tree->token = *token;
3749   tree->token.duplicated = 0;
3750   tree->token.opt_subexp = 0;
3769   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3770     node->token.opt_subexp = 1;
3795   free_token (&node->token);
3815       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3819       (*p_new)->token.duplicated = 1;