contrib/libgnuregex/regcomp.c

55 static int fetch_number (re_string_t *input, re_token_t *token,
57 static int peek_token (re_token_t *token, re_string_t *input,
62 				  re_token_t *token, reg_syntax_t syntax,
65 				 re_token_t *token, reg_syntax_t syntax,
68 				     re_token_t *token, reg_syntax_t syntax,
71 				  re_token_t *token, reg_syntax_t syntax,
74 				 re_dfa_t *dfa, re_token_t *token,
77 				      re_token_t *token, reg_syntax_t syntax,
81 					    re_token_t *token, int token_len,
87 					  re_token_t *token);
117 				      const re_token_t *token);
1285   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1287       int idx = node->token.opr.idx;
1288       node->token.opr.idx = dfa->subexp_map[idx];
1289       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1292   else if (node->token.type == SUBEXP
1293 	   && node->left && node->left->token.type == SUBEXP)
1295       int other_idx = node->left->token.opr.idx;
1301       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1317   if (node->left && node->left->token.type == SUBEXP)
1323   if (node->right && node->right->token.type == SUBEXP)
1346       && (node->token.opr.idx >= BITSET_WORD_BITS
1348 	       & ((bitset_word_t) 1 << node->token.opr.idx))))
1363   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1364   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1374   if (node->token.type == CONCAT)
1382       node->node_idx = re_dfa_add_node (dfa, node->token);
1385       if (node->token.type == ANCHOR)
1386 	dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1395   switch (node->token.type)
1422   switch (node->token.type)
1458       if (node->token.type == OP_BACK_REF)
1463       assert (!IS_EPSILON_NODE (node->token.type));
1760 /* Functions for token which are used in the parser.  */
1762 /* Fetch a token from INPUT.
1772 /* Peek a token from INPUT, and return the length of the token.
1777 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1783       token->type = END_OF_RE;
1788   token->opr.c = c;
1790   token->word_char = 0;
1792   token->mb_partial = 0;
1796       token->type = CHARACTER;
1797       token->mb_partial = 1;
1806 	  token->type = BACK_SLASH;
1811       token->opr.c = c2;
1812       token->type = CHARACTER;
1818 	  token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1822 	token->word_char = IS_WORD_CHAR (c2) != 0;
1828 	    token->type = OP_ALT;
1834 	      token->type = OP_BACK_REF;
1835 	      token->opr.idx = c2 - '1';
1841 	      token->type = ANCHOR;
1842 	      token->opr.ctx_type = WORD_FIRST;
1848 	      token->type = ANCHOR;
1849 	      token->opr.ctx_type = WORD_LAST;
1855 	      token->type = ANCHOR;
1856 	      token->opr.ctx_type = WORD_DELIM;
1862 	      token->type = ANCHOR;
1863 	      token->opr.ctx_type = NOT_WORD_DELIM;
1868 	    token->type = OP_WORD;
1872 	    token->type = OP_NOTWORD;
1876 	    token->type = OP_SPACE;
1880 	    token->type = OP_NOTSPACE;
1885 	      token->type = ANCHOR;
1886 	      token->opr.ctx_type = BUF_FIRST;
1892 	      token->type = ANCHOR;
1893 	      token->opr.ctx_type = BUF_LAST;
1898 	    token->type = OP_OPEN_SUBEXP;
1902 	    token->type = OP_CLOSE_SUBEXP;
1906 	    token->type = OP_DUP_PLUS;
1910 	    token->type = OP_DUP_QUESTION;
1914 	    token->type = OP_OPEN_DUP_NUM;
1918 	    token->type = OP_CLOSE_DUP_NUM;
1926   token->type = CHARACTER;
1931       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1935     token->word_char = IS_WORD_CHAR (token->opr.c);
1941 	token->type = OP_ALT;
1945 	token->type = OP_ALT;
1948       token->type = OP_DUP_ASTERISK;
1952 	token->type = OP_DUP_PLUS;
1956 	token->type = OP_DUP_QUESTION;
1960 	token->type = OP_OPEN_DUP_NUM;
1964 	token->type = OP_CLOSE_DUP_NUM;
1968 	token->type = OP_OPEN_SUBEXP;
1972 	token->type = OP_CLOSE_SUBEXP;
1975       token->type = OP_OPEN_BRACKET;
1978       token->type = OP_PERIOD;
1988       token->type = ANCHOR;
1989       token->opr.ctx_type = LINE_FIRST;
2002       token->type = ANCHOR;
2003       token->opr.ctx_type = LINE_LAST;
2011 /* Peek a token from INPUT, and return the length of the token.
2016 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2021       token->type = END_OF_RE;
2025   token->opr.c = c;
2031       token->type = CHARACTER;
2043       token->opr.c = c2;
2044       token->type = CHARACTER;
2055       token->opr.c = c2;
2060 	  token->type = OP_OPEN_COLL_ELEM;
2063 	  token->type = OP_OPEN_EQUIV_CLASS;
2068 	      token->type = OP_OPEN_CHAR_CLASS;
2073 	  token->type = CHARACTER;
2074 	  token->opr.c = c;
2083       token->type = OP_CHARSET_RANGE;
2086       token->type = OP_CLOSE_BRACKET;
2089       token->type = OP_NON_MATCH_LIST;
2092       token->type = CHARACTER;
2147 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2152   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2156   while (token->type == OP_ALT)
2158       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2159       if (token->type != OP_ALT && token->type != END_OF_RE
2160 	  && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2162 	  branch = parse_branch (regexp, preg, token, syntax, nest, err);
2188 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2193   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2197   while (token->type != OP_ALT && token->type != END_OF_RE
2198 	 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2200       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2233 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2238   switch (token->type)
2241       tree = create_token_tree (dfa, NULL, NULL, token);
2254 	      fetch_token (token, regexp, syntax);
2255 	      mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2267       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2272       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2277       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2282       dfa->used_bkref_map |= 1 << token->opr.idx;
2283       tree = create_token_tree (dfa, NULL, NULL, token);
2309 	  fetch_token (token, regexp, syntax);
2310 	  return parse_expression (regexp, preg, token, syntax, nest, err);
2314       if ((token->type == OP_CLOSE_SUBEXP) &&
2325       token->type = CHARACTER;
2328       tree = create_token_tree (dfa, NULL, NULL, token);
2336       if ((token->opr.ctx_type
2340       if (token->opr.ctx_type == WORD_DELIM
2341 	  || token->opr.ctx_type == NOT_WORD_DELIM)
2344 	  if (token->opr.ctx_type == WORD_DELIM)
2346 	      token->opr.ctx_type = WORD_FIRST;
2347 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2348 	      token->opr.ctx_type = WORD_LAST;
2352 	      token->opr.ctx_type = INSIDE_WORD;
2353 	      tree_first = create_token_tree (dfa, NULL, NULL, token);
2354 	      token->opr.ctx_type = INSIDE_NOTWORD;
2356 	  tree_last = create_token_tree (dfa, NULL, NULL, token);
2366 	  tree = create_token_tree (dfa, NULL, NULL, token);
2377       fetch_token (token, regexp, syntax);
2380       tree = create_token_tree (dfa, NULL, NULL, token);
2394 				 token->type == OP_NOTWORD, err);
2403 				 token->type == OP_NOTSPACE, err);
2420   fetch_token (token, regexp, syntax);
2422   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2423 	 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2425       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2430 	  && (token->type == OP_DUP_ASTERISK
2431 	      || token->type == OP_OPEN_DUP_NUM))
2449 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2457   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2460   if (token->type == OP_CLOSE_SUBEXP)
2464       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2465       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2484   tree->token.opr.idx = cur_nsub;
2492 	      re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2496   re_token_t start_token = *token;
2498   if (token->type == OP_OPEN_DUP_NUM)
2501       start = fetch_number (regexp, token, syntax);
2504 	  if (token->type == CHARACTER && token->opr.c == ',')
2515 	  end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2516 		 : ((token->type == CHARACTER && token->opr.c == ',')
2517 		    ? fetch_number (regexp, token, syntax) : -2));
2524 	      if (token->type == END_OF_RE)
2534 	  *token = start_token;
2535 	  token->type = CHARACTER;
2541       if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2550       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2551       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2554   fetch_token (token, regexp, syntax);
2586   if (elem->token.type == SUBEXP)
2587     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2775 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
3070   token_len = peek_token_bracket (token, regexp, syntax);
3071   if (BE (token->type == END_OF_RE, 0))
3076   if (token->type == OP_NON_MATCH_LIST)
3084       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3085       token_len = peek_token_bracket (token, regexp, syntax);
3086       if (BE (token->type == END_OF_RE, 0))
3094   if (token->type == OP_CLOSE_BRACKET)
3095     token->type = CHARACTER;
3107       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3116       /* Get information about the next token.  We need it in any case.  */
3117       token_len = peek_token_bracket (token, regexp, syntax);
3122 	  if (BE (token->type == END_OF_RE, 0))
3127 	  if (token->type == OP_CHARSET_RANGE)
3140 		  token->type = CHARACTER;
3158 	  token_len = peek_token_bracket (token, regexp, syntax);
3233       if (BE (token->type == END_OF_RE, 0))
3238       if (token->type == OP_CLOSE_BRACKET)
3242   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3320 		       re_token_t *token, int token_len, re_dfa_t *dfa,
3334   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3335   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3336       || token->type == OP_OPEN_EQUIV_CLASS)
3337     return parse_bracket_symbol (elem, regexp, token);
3338   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3350   elem->opr.ch = token->opr.c;
3360 		      re_token_t *token)
3362   unsigned char ch, delim = token->opr.c;
3370       if (token->type == OP_OPEN_CHAR_CLASS)
3382   switch (token->type)
3697 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3703       fetch_token (token, input, syntax);
3704       c = token->opr.c;
3705       if (BE (token->type == END_OF_RE, 0))
3707       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3709       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3749 		   const re_token_t *token)
3767   tree->token = *token;
3768   tree->token.duplicated = 0;
3769   tree->token.opt_subexp = 0;
3788   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3789     node->token.opt_subexp = 1;
3814   free_token (&node->token);
3834       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3838       (*p_new)->token.duplicated = 1;