regex_internal.h revision 146040
1/* Extended regular expression matching and search library. 2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. 5 6 The GNU C Library is free software; you can redistribute it and/or 7 modify it under the terms of the GNU Lesser General Public 8 License as published by the Free Software Foundation; either 9 version 2.1 of the License, or (at your option) any later version. 10 11 The GNU C Library is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 Lesser General Public License for more details. 15 16 You should have received a copy of the GNU Lesser General Public 17 License along with the GNU C Library; if not, write to the Free 18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 19 02111-1307 USA. */ 20 21#ifndef _REGEX_INTERNAL_H 22#define _REGEX_INTERNAL_H 1 23 24#include <assert.h> 25#include <ctype.h> 26#include <stdio.h> 27#include <stdlib.h> 28#include <string.h> 29 30#if defined HAVE_LANGINFO_H || defined HAVE_LANGINFO_CODESET || defined _LIBC 31# include <langinfo.h> 32#endif 33#if defined HAVE_LOCALE_H || defined _LIBC 34# include <locale.h> 35#endif 36#if defined HAVE_WCHAR_H || defined _LIBC 37# include <wchar.h> 38#endif /* HAVE_WCHAR_H || _LIBC */ 39#if defined HAVE_WCTYPE_H || defined _LIBC 40# include <wctype.h> 41#endif /* HAVE_WCTYPE_H || _LIBC */ 42 43/* In case that the system doesn't have isblank(). */ 44#if !defined _LIBC && !defined HAVE_ISBLANK && !defined isblank 45# define isblank(ch) ((ch) == ' ' || (ch) == '\t') 46#endif 47 48#ifdef _LIBC 49# ifndef _RE_DEFINE_LOCALE_FUNCTIONS 50# define _RE_DEFINE_LOCALE_FUNCTIONS 1 51# include <locale/localeinfo.h> 52# include <locale/elem-hash.h> 53# include <locale/coll-lookup.h> 54# endif 55#endif 56 57/* This is for other GNU distributions with internationalized messages. */ 58#if (HAVE_LIBINTL_H && ENABLE_NLS) || defined _LIBC 59# include <libintl.h> 60# ifdef _LIBC 61# undef gettext 62# define gettext(msgid) \ 63 INTUSE(__dcgettext) (_libc_intl_domainname, msgid, LC_MESSAGES) 64# endif 65#else 66# define gettext(msgid) (msgid) 67#endif 68 69#ifndef gettext_noop 70/* This define is so xgettext can find the internationalizable 71 strings. */ 72# define gettext_noop(String) String 73#endif 74 75#if (defined MB_CUR_MAX && HAVE_LOCALE_H && HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_WCRTOMB && HAVE_MBRTOWC && HAVE_WCSCOLL) || _LIBC 76# define RE_ENABLE_I18N 77#endif 78 79#if __GNUC__ >= 3 80# define BE(expr, val) __builtin_expect (expr, val) 81#else 82# define BE(expr, val) (expr) 83# define inline 84#endif 85 86/* Number of bits in a byte. */ 87#define BYTE_BITS 8 88/* Number of single byte character. */ 89#define SBC_MAX 256 90 91#define COLL_ELEM_LEN_MAX 8 92 93/* The character which represents newline. */ 94#define NEWLINE_CHAR '\n' 95#define WIDE_NEWLINE_CHAR L'\n' 96 97/* Rename to standard API for using out of glibc. */ 98#ifndef _LIBC 99# define __wctype wctype 100# define __iswctype iswctype 101# define __btowc btowc 102# define __mempcpy mempcpy 103# define __wcrtomb wcrtomb 104# define __regfree regfree 105# define attribute_hidden 106#endif /* not _LIBC */ 107 108#ifdef __GNUC__ 109# define __attribute(arg) __attribute__ (arg) 110#else 111# define __attribute(arg) 112#endif 113 114extern const char __re_error_msgid[] attribute_hidden; 115extern const size_t __re_error_msgid_idx[] attribute_hidden; 116 117/* Number of bits in an unsinged int. */ 118#define UINT_BITS (sizeof (unsigned int) * BYTE_BITS) 119/* Number of unsigned int in an bit_set. */ 120#define BITSET_UINTS ((SBC_MAX + UINT_BITS - 1) / UINT_BITS) 121typedef unsigned int bitset[BITSET_UINTS]; 122typedef unsigned int *re_bitset_ptr_t; 123typedef const unsigned int *re_const_bitset_ptr_t; 124 125#define bitset_set(set,i) (set[i / UINT_BITS] |= 1 << i % UINT_BITS) 126#define bitset_clear(set,i) (set[i / UINT_BITS] &= ~(1 << i % UINT_BITS)) 127#define bitset_contain(set,i) (set[i / UINT_BITS] & (1 << i % UINT_BITS)) 128#define bitset_empty(set) memset (set, 0, sizeof (unsigned int) * BITSET_UINTS) 129#define bitset_set_all(set) \ 130 memset (set, 255, sizeof (unsigned int) * BITSET_UINTS) 131#define bitset_copy(dest,src) \ 132 memcpy (dest, src, sizeof (unsigned int) * BITSET_UINTS) 133static inline void bitset_not (bitset set); 134static inline void bitset_merge (bitset dest, const bitset src); 135static inline void bitset_not_merge (bitset dest, const bitset src); 136static inline void bitset_mask (bitset dest, const bitset src); 137 138#define PREV_WORD_CONSTRAINT 0x0001 139#define PREV_NOTWORD_CONSTRAINT 0x0002 140#define NEXT_WORD_CONSTRAINT 0x0004 141#define NEXT_NOTWORD_CONSTRAINT 0x0008 142#define PREV_NEWLINE_CONSTRAINT 0x0010 143#define NEXT_NEWLINE_CONSTRAINT 0x0020 144#define PREV_BEGBUF_CONSTRAINT 0x0040 145#define NEXT_ENDBUF_CONSTRAINT 0x0080 146#define WORD_DELIM_CONSTRAINT 0x0100 147#define NOT_WORD_DELIM_CONSTRAINT 0x0200 148 149typedef enum 150{ 151 INSIDE_WORD = PREV_WORD_CONSTRAINT | NEXT_WORD_CONSTRAINT, 152 WORD_FIRST = PREV_NOTWORD_CONSTRAINT | NEXT_WORD_CONSTRAINT, 153 WORD_LAST = PREV_WORD_CONSTRAINT | NEXT_NOTWORD_CONSTRAINT, 154 INSIDE_NOTWORD = PREV_NOTWORD_CONSTRAINT | NEXT_NOTWORD_CONSTRAINT, 155 LINE_FIRST = PREV_NEWLINE_CONSTRAINT, 156 LINE_LAST = NEXT_NEWLINE_CONSTRAINT, 157 BUF_FIRST = PREV_BEGBUF_CONSTRAINT, 158 BUF_LAST = NEXT_ENDBUF_CONSTRAINT, 159 WORD_DELIM = WORD_DELIM_CONSTRAINT, 160 NOT_WORD_DELIM = NOT_WORD_DELIM_CONSTRAINT 161} re_context_type; 162 163typedef struct 164{ 165 int alloc; 166 int nelem; 167 int *elems; 168} re_node_set; 169 170typedef enum 171{ 172 NON_TYPE = 0, 173 174 /* Node type, These are used by token, node, tree. */ 175 CHARACTER = 1, 176 END_OF_RE = 2, 177 SIMPLE_BRACKET = 3, 178 OP_BACK_REF = 4, 179 OP_PERIOD = 5, 180#ifdef RE_ENABLE_I18N 181 COMPLEX_BRACKET = 6, 182 OP_UTF8_PERIOD = 7, 183#endif /* RE_ENABLE_I18N */ 184 185 /* We define EPSILON_BIT as a macro so that OP_OPEN_SUBEXP is used 186 when the debugger shows values of this enum type. */ 187#define EPSILON_BIT 8 188 OP_OPEN_SUBEXP = EPSILON_BIT | 0, 189 OP_CLOSE_SUBEXP = EPSILON_BIT | 1, 190 OP_ALT = EPSILON_BIT | 2, 191 OP_DUP_ASTERISK = EPSILON_BIT | 3, 192 ANCHOR = EPSILON_BIT | 4, 193 194 /* Tree type, these are used only by tree. */ 195 CONCAT = 16, 196 SUBEXP = 17, 197 198 /* Token type, these are used only by token. */ 199 OP_DUP_PLUS = 18, 200 OP_DUP_QUESTION, 201 OP_OPEN_BRACKET, 202 OP_CLOSE_BRACKET, 203 OP_CHARSET_RANGE, 204 OP_OPEN_DUP_NUM, 205 OP_CLOSE_DUP_NUM, 206 OP_NON_MATCH_LIST, 207 OP_OPEN_COLL_ELEM, 208 OP_CLOSE_COLL_ELEM, 209 OP_OPEN_EQUIV_CLASS, 210 OP_CLOSE_EQUIV_CLASS, 211 OP_OPEN_CHAR_CLASS, 212 OP_CLOSE_CHAR_CLASS, 213 OP_WORD, 214 OP_NOTWORD, 215 OP_SPACE, 216 OP_NOTSPACE, 217 BACK_SLASH 218 219} re_token_type_t; 220 221#ifdef RE_ENABLE_I18N 222typedef struct 223{ 224 /* Multibyte characters. */ 225 wchar_t *mbchars; 226 227 /* Collating symbols. */ 228# ifdef _LIBC 229 int32_t *coll_syms; 230# endif 231 232 /* Equivalence classes. */ 233# ifdef _LIBC 234 int32_t *equiv_classes; 235# endif 236 237 /* Range expressions. */ 238# ifdef _LIBC 239 uint32_t *range_starts; 240 uint32_t *range_ends; 241# else /* not _LIBC */ 242 wchar_t *range_starts; 243 wchar_t *range_ends; 244# endif /* not _LIBC */ 245 246 /* Character classes. */ 247 wctype_t *char_classes; 248 249 /* If this character set is the non-matching list. */ 250 unsigned int non_match : 1; 251 252 /* # of multibyte characters. */ 253 int nmbchars; 254 255 /* # of collating symbols. */ 256 int ncoll_syms; 257 258 /* # of equivalence classes. */ 259 int nequiv_classes; 260 261 /* # of range expressions. */ 262 int nranges; 263 264 /* # of character classes. */ 265 int nchar_classes; 266} re_charset_t; 267#endif /* RE_ENABLE_I18N */ 268 269typedef struct 270{ 271 union 272 { 273 unsigned char c; /* for CHARACTER */ 274 re_bitset_ptr_t sbcset; /* for SIMPLE_BRACKET */ 275#ifdef RE_ENABLE_I18N 276 re_charset_t *mbcset; /* for COMPLEX_BRACKET */ 277#endif /* RE_ENABLE_I18N */ 278 int idx; /* for BACK_REF */ 279 re_context_type ctx_type; /* for ANCHOR */ 280 } opr; 281#if __GNUC__ >= 2 282 re_token_type_t type : 8; 283#else 284 re_token_type_t type; 285#endif 286 unsigned int constraint : 10; /* context constraint */ 287 unsigned int duplicated : 1; 288 unsigned int opt_subexp : 1; 289#ifdef RE_ENABLE_I18N 290 unsigned int accept_mb : 1; 291 /* These 2 bits can be moved into the union if needed (e.g. if running out 292 of bits; move opr.c to opr.c.c and move the flags to opr.c.flags). */ 293 unsigned int mb_partial : 1; 294#endif 295 unsigned int word_char : 1; 296} re_token_t; 297 298#define IS_EPSILON_NODE(type) ((type) & EPSILON_BIT) 299 300struct re_string_t 301{ 302 /* Indicate the raw buffer which is the original string passed as an 303 argument of regexec(), re_search(), etc.. */ 304 const unsigned char *raw_mbs; 305 /* Store the multibyte string. In case of "case insensitive mode" like 306 REG_ICASE, upper cases of the string are stored, otherwise MBS points 307 the same address that RAW_MBS points. */ 308 unsigned char *mbs; 309#ifdef RE_ENABLE_I18N 310 /* Store the wide character string which is corresponding to MBS. */ 311 wint_t *wcs; 312 int *offsets; 313 mbstate_t cur_state; 314#endif 315 /* Index in RAW_MBS. Each character mbs[i] corresponds to 316 raw_mbs[raw_mbs_idx + i]. */ 317 int raw_mbs_idx; 318 /* The length of the valid characters in the buffers. */ 319 int valid_len; 320 /* The corresponding number of bytes in raw_mbs array. */ 321 int valid_raw_len; 322 /* The length of the buffers MBS and WCS. */ 323 int bufs_len; 324 /* The index in MBS, which is updated by re_string_fetch_byte. */ 325 int cur_idx; 326 /* length of RAW_MBS array. */ 327 int raw_len; 328 /* This is RAW_LEN - RAW_MBS_IDX + VALID_LEN - VALID_RAW_LEN. */ 329 int len; 330 /* End of the buffer may be shorter than its length in the cases such 331 as re_match_2, re_search_2. Then, we use STOP for end of the buffer 332 instead of LEN. */ 333 int raw_stop; 334 /* This is RAW_STOP - RAW_MBS_IDX adjusted through OFFSETS. */ 335 int stop; 336 337 /* The context of mbs[0]. We store the context independently, since 338 the context of mbs[0] may be different from raw_mbs[0], which is 339 the beginning of the input string. */ 340 unsigned int tip_context; 341 /* The translation passed as a part of an argument of re_compile_pattern. */ 342 unsigned RE_TRANSLATE_TYPE trans; 343 /* Copy of re_dfa_t's word_char. */ 344 re_const_bitset_ptr_t word_char; 345 /* 1 if REG_ICASE. */ 346 unsigned char icase; 347 unsigned char is_utf8; 348 unsigned char map_notascii; 349 unsigned char mbs_allocated; 350 unsigned char offsets_needed; 351 unsigned char newline_anchor; 352 unsigned char word_ops_used; 353 int mb_cur_max; 354}; 355typedef struct re_string_t re_string_t; 356 357 358struct re_dfa_t; 359typedef struct re_dfa_t re_dfa_t; 360 361#ifndef _LIBC 362# ifdef __i386__ 363# define internal_function __attribute ((regparm (3), stdcall)) 364# else 365# define internal_function 366# endif 367#endif 368 369#ifndef RE_NO_INTERNAL_PROTOTYPES 370static reg_errcode_t re_string_allocate (re_string_t *pstr, const char *str, 371 int len, int init_len, 372 RE_TRANSLATE_TYPE trans, int icase, 373 const re_dfa_t *dfa) 374 internal_function; 375static reg_errcode_t re_string_construct (re_string_t *pstr, const char *str, 376 int len, RE_TRANSLATE_TYPE trans, 377 int icase, const re_dfa_t *dfa) 378 internal_function; 379static reg_errcode_t re_string_reconstruct (re_string_t *pstr, int idx, 380 int eflags) internal_function; 381static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr, 382 int new_buf_len) 383 internal_function; 384# ifdef RE_ENABLE_I18N 385static void build_wcs_buffer (re_string_t *pstr) internal_function; 386static int build_wcs_upper_buffer (re_string_t *pstr) internal_function; 387# endif /* RE_ENABLE_I18N */ 388static void build_upper_buffer (re_string_t *pstr) internal_function; 389static void re_string_translate_buffer (re_string_t *pstr) internal_function; 390static void re_string_destruct (re_string_t *pstr) internal_function; 391# ifdef RE_ENABLE_I18N 392static int re_string_elem_size_at (const re_string_t *pstr, int idx) 393 internal_function __attribute ((pure)); 394static inline int re_string_char_size_at (const re_string_t *pstr, int idx) 395 internal_function __attribute ((pure)); 396static inline wint_t re_string_wchar_at (const re_string_t *pstr, int idx) 397 internal_function __attribute ((pure)); 398# endif /* RE_ENABLE_I18N */ 399static unsigned int re_string_context_at (const re_string_t *input, int idx, 400 int eflags) 401 internal_function __attribute ((pure)); 402static unsigned char re_string_peek_byte_case (const re_string_t *pstr, 403 int idx) 404 internal_function __attribute ((pure)); 405static unsigned char re_string_fetch_byte_case (re_string_t *pstr) 406 internal_function __attribute ((pure)); 407#endif 408#define re_string_peek_byte(pstr, offset) \ 409 ((pstr)->mbs[(pstr)->cur_idx + offset]) 410#define re_string_fetch_byte(pstr) \ 411 ((pstr)->mbs[(pstr)->cur_idx++]) 412#define re_string_first_byte(pstr, idx) \ 413 ((idx) == (pstr)->valid_len || (pstr)->wcs[idx] != WEOF) 414#define re_string_is_single_byte_char(pstr, idx) \ 415 ((pstr)->wcs[idx] != WEOF && ((pstr)->valid_len == (idx) + 1 \ 416 || (pstr)->wcs[(idx) + 1] != WEOF)) 417#define re_string_eoi(pstr) ((pstr)->stop <= (pstr)->cur_idx) 418#define re_string_cur_idx(pstr) ((pstr)->cur_idx) 419#define re_string_get_buffer(pstr) ((pstr)->mbs) 420#define re_string_length(pstr) ((pstr)->len) 421#define re_string_byte_at(pstr,idx) ((pstr)->mbs[idx]) 422#define re_string_skip_bytes(pstr,idx) ((pstr)->cur_idx += (idx)) 423#define re_string_set_index(pstr,idx) ((pstr)->cur_idx = (idx)) 424 425#define re_malloc(t,n) ((t *) malloc ((n) * sizeof (t))) 426#define re_realloc(p,t,n) ((t *) realloc (p, (n) * sizeof (t))) 427#define re_free(p) free (p) 428 429struct bin_tree_t 430{ 431 struct bin_tree_t *parent; 432 struct bin_tree_t *left; 433 struct bin_tree_t *right; 434 struct bin_tree_t *first; 435 struct bin_tree_t *next; 436 437 re_token_t token; 438 439 /* `node_idx' is the index in dfa->nodes, if `type' == 0. 440 Otherwise `type' indicate the type of this node. */ 441 int node_idx; 442}; 443typedef struct bin_tree_t bin_tree_t; 444 445#define BIN_TREE_STORAGE_SIZE \ 446 ((1024 - sizeof (void *)) / sizeof (bin_tree_t)) 447 448struct bin_tree_storage_t 449{ 450 struct bin_tree_storage_t *next; 451 bin_tree_t data[BIN_TREE_STORAGE_SIZE]; 452}; 453typedef struct bin_tree_storage_t bin_tree_storage_t; 454 455#define CONTEXT_WORD 1 456#define CONTEXT_NEWLINE (CONTEXT_WORD << 1) 457#define CONTEXT_BEGBUF (CONTEXT_NEWLINE << 1) 458#define CONTEXT_ENDBUF (CONTEXT_BEGBUF << 1) 459 460#define IS_WORD_CONTEXT(c) ((c) & CONTEXT_WORD) 461#define IS_NEWLINE_CONTEXT(c) ((c) & CONTEXT_NEWLINE) 462#define IS_BEGBUF_CONTEXT(c) ((c) & CONTEXT_BEGBUF) 463#define IS_ENDBUF_CONTEXT(c) ((c) & CONTEXT_ENDBUF) 464#define IS_ORDINARY_CONTEXT(c) ((c) == 0) 465 466#define IS_WORD_CHAR(ch) (isalnum (ch) || (ch) == '_') 467#define IS_NEWLINE(ch) ((ch) == NEWLINE_CHAR) 468#define IS_WIDE_WORD_CHAR(ch) (iswalnum (ch) || (ch) == L'_') 469#define IS_WIDE_NEWLINE(ch) ((ch) == WIDE_NEWLINE_CHAR) 470 471#define NOT_SATISFY_PREV_CONSTRAINT(constraint,context) \ 472 ((((constraint) & PREV_WORD_CONSTRAINT) && !IS_WORD_CONTEXT (context)) \ 473 || ((constraint & PREV_NOTWORD_CONSTRAINT) && IS_WORD_CONTEXT (context)) \ 474 || ((constraint & PREV_NEWLINE_CONSTRAINT) && !IS_NEWLINE_CONTEXT (context))\ 475 || ((constraint & PREV_BEGBUF_CONSTRAINT) && !IS_BEGBUF_CONTEXT (context))) 476 477#define NOT_SATISFY_NEXT_CONSTRAINT(constraint,context) \ 478 ((((constraint) & NEXT_WORD_CONSTRAINT) && !IS_WORD_CONTEXT (context)) \ 479 || (((constraint) & NEXT_NOTWORD_CONSTRAINT) && IS_WORD_CONTEXT (context)) \ 480 || (((constraint) & NEXT_NEWLINE_CONSTRAINT) && !IS_NEWLINE_CONTEXT (context)) \ 481 || (((constraint) & NEXT_ENDBUF_CONSTRAINT) && !IS_ENDBUF_CONTEXT (context))) 482 483struct re_dfastate_t 484{ 485 unsigned int hash; 486 re_node_set nodes; 487 re_node_set non_eps_nodes; 488 re_node_set inveclosure; 489 re_node_set *entrance_nodes; 490 struct re_dfastate_t **trtable, **word_trtable; 491 unsigned int context : 4; 492 unsigned int halt : 1; 493 /* If this state can accept `multi byte'. 494 Note that we refer to multibyte characters, and multi character 495 collating elements as `multi byte'. */ 496 unsigned int accept_mb : 1; 497 /* If this state has backreference node(s). */ 498 unsigned int has_backref : 1; 499 unsigned int has_constraint : 1; 500}; 501typedef struct re_dfastate_t re_dfastate_t; 502 503struct re_state_table_entry 504{ 505 int num; 506 int alloc; 507 re_dfastate_t **array; 508}; 509 510/* Array type used in re_sub_match_last_t and re_sub_match_top_t. */ 511 512typedef struct 513{ 514 int next_idx; 515 int alloc; 516 re_dfastate_t **array; 517} state_array_t; 518 519/* Store information about the node NODE whose type is OP_CLOSE_SUBEXP. */ 520 521typedef struct 522{ 523 int node; 524 int str_idx; /* The position NODE match at. */ 525 state_array_t path; 526} re_sub_match_last_t; 527 528/* Store information about the node NODE whose type is OP_OPEN_SUBEXP. 529 And information about the node, whose type is OP_CLOSE_SUBEXP, 530 corresponding to NODE is stored in LASTS. */ 531 532typedef struct 533{ 534 int str_idx; 535 int node; 536 int next_last_offset; 537 state_array_t *path; 538 int alasts; /* Allocation size of LASTS. */ 539 int nlasts; /* The number of LASTS. */ 540 re_sub_match_last_t **lasts; 541} re_sub_match_top_t; 542 543struct re_backref_cache_entry 544{ 545 int node; 546 int str_idx; 547 int subexp_from; 548 int subexp_to; 549 char more; 550 char unused; 551 unsigned short int eps_reachable_subexps_map; 552}; 553 554typedef struct 555{ 556 /* The string object corresponding to the input string. */ 557 re_string_t input; 558#if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L) 559 re_dfa_t *const dfa; 560#else 561 re_dfa_t *dfa; 562#endif 563 /* EFLAGS of the argument of regexec. */ 564 int eflags; 565 /* Where the matching ends. */ 566 int match_last; 567 int last_node; 568 /* The state log used by the matcher. */ 569 re_dfastate_t **state_log; 570 int state_log_top; 571 /* Back reference cache. */ 572 int nbkref_ents; 573 int abkref_ents; 574 struct re_backref_cache_entry *bkref_ents; 575 int max_mb_elem_len; 576 int nsub_tops; 577 int asub_tops; 578 re_sub_match_top_t **sub_tops; 579} re_match_context_t; 580 581typedef struct 582{ 583 re_dfastate_t **sifted_states; 584 re_dfastate_t **limited_states; 585 int last_node; 586 int last_str_idx; 587 re_node_set limits; 588} re_sift_context_t; 589 590struct re_fail_stack_ent_t 591{ 592 int idx; 593 int node; 594 regmatch_t *regs; 595 re_node_set eps_via_nodes; 596}; 597 598struct re_fail_stack_t 599{ 600 int num; 601 int alloc; 602 struct re_fail_stack_ent_t *stack; 603}; 604 605struct re_dfa_t 606{ 607 re_token_t *nodes; 608 int nodes_alloc; 609 int nodes_len; 610 int *nexts; 611 int *org_indices; 612 re_node_set *edests; 613 re_node_set *eclosures; 614 re_node_set *inveclosures; 615 struct re_state_table_entry *state_table; 616 re_dfastate_t *init_state; 617 re_dfastate_t *init_state_word; 618 re_dfastate_t *init_state_nl; 619 re_dfastate_t *init_state_begbuf; 620 bin_tree_t *str_tree; 621 bin_tree_storage_t *str_tree_storage; 622 re_bitset_ptr_t sb_char; 623 int str_tree_storage_idx; 624 625 /* number of subexpressions `re_nsub' is in regex_t. */ 626 unsigned int state_hash_mask; 627 int states_alloc; 628 int init_node; 629 int nbackref; /* The number of backreference in this dfa. */ 630 631 /* Bitmap expressing which backreference is used. */ 632 unsigned int used_bkref_map; 633 unsigned int completed_bkref_map; 634 635 unsigned int has_plural_match : 1; 636 /* If this dfa has "multibyte node", which is a backreference or 637 a node which can accept multibyte character or multi character 638 collating element. */ 639 unsigned int has_mb_node : 1; 640 unsigned int is_utf8 : 1; 641 unsigned int map_notascii : 1; 642 unsigned int word_ops_used : 1; 643 int mb_cur_max; 644 bitset word_char; 645 reg_syntax_t syntax; 646 int *subexp_map; 647#ifdef DEBUG 648 char* re_str; 649#endif 650}; 651 652#ifndef RE_NO_INTERNAL_PROTOTYPES 653static reg_errcode_t re_node_set_alloc (re_node_set *set, int size) internal_function; 654static reg_errcode_t re_node_set_init_1 (re_node_set *set, int elem) internal_function; 655static reg_errcode_t re_node_set_init_2 (re_node_set *set, int elem1, 656 int elem2) internal_function; 657#define re_node_set_init_empty(set) memset (set, '\0', sizeof (re_node_set)) 658static reg_errcode_t re_node_set_init_copy (re_node_set *dest, 659 const re_node_set *src) internal_function; 660static reg_errcode_t re_node_set_add_intersect (re_node_set *dest, 661 const re_node_set *src1, 662 const re_node_set *src2) internal_function; 663static reg_errcode_t re_node_set_init_union (re_node_set *dest, 664 const re_node_set *src1, 665 const re_node_set *src2) internal_function; 666static reg_errcode_t re_node_set_merge (re_node_set *dest, 667 const re_node_set *src) internal_function; 668static int re_node_set_insert (re_node_set *set, int elem) internal_function; 669static int re_node_set_insert_last (re_node_set *set, 670 int elem) internal_function; 671static int re_node_set_compare (const re_node_set *set1, 672 const re_node_set *set2) 673 internal_function __attribute ((pure)); 674static int re_node_set_contains (const re_node_set *set, int elem) 675 internal_function __attribute ((pure)); 676static void re_node_set_remove_at (re_node_set *set, int idx) internal_function; 677#define re_node_set_remove(set,id) \ 678 (re_node_set_remove_at (set, re_node_set_contains (set, id) - 1)) 679#define re_node_set_empty(p) ((p)->nelem = 0) 680#define re_node_set_free(set) re_free ((set)->elems) 681static int re_dfa_add_node (re_dfa_t *dfa, re_token_t token) internal_function; 682static re_dfastate_t *re_acquire_state (reg_errcode_t *err, re_dfa_t *dfa, 683 const re_node_set *nodes) internal_function; 684static re_dfastate_t *re_acquire_state_context (reg_errcode_t *err, 685 re_dfa_t *dfa, 686 const re_node_set *nodes, 687 unsigned int context) internal_function; 688static void free_state (re_dfastate_t *state) internal_function; 689#endif 690 691 692typedef enum 693{ 694 SB_CHAR, 695 MB_CHAR, 696 EQUIV_CLASS, 697 COLL_SYM, 698 CHAR_CLASS 699} bracket_elem_type; 700 701typedef struct 702{ 703 bracket_elem_type type; 704 union 705 { 706 unsigned char ch; 707 unsigned char *name; 708 wchar_t wch; 709 } opr; 710} bracket_elem_t; 711 712 713/* Inline functions for bitset operation. */ 714static inline void 715bitset_not (bitset set) 716{ 717 int bitset_i; 718 for (bitset_i = 0; bitset_i < BITSET_UINTS; ++bitset_i) 719 set[bitset_i] = ~set[bitset_i]; 720} 721 722static inline void 723bitset_merge (bitset dest, const bitset src) 724{ 725 int bitset_i; 726 for (bitset_i = 0; bitset_i < BITSET_UINTS; ++bitset_i) 727 dest[bitset_i] |= src[bitset_i]; 728} 729 730static inline void 731bitset_not_merge (bitset dest, const bitset src) 732{ 733 int i; 734 for (i = 0; i < BITSET_UINTS; ++i) 735 dest[i] |= ~src[i]; 736} 737 738static inline void 739bitset_mask (bitset dest, const bitset src) 740{ 741 int bitset_i; 742 for (bitset_i = 0; bitset_i < BITSET_UINTS; ++bitset_i) 743 dest[bitset_i] &= src[bitset_i]; 744} 745 746#if defined RE_ENABLE_I18N && !defined RE_NO_INTERNAL_PROTOTYPES 747/* Inline functions for re_string. */ 748static inline int 749internal_function 750re_string_char_size_at (const re_string_t *pstr, int idx) 751{ 752 int byte_idx; 753 if (pstr->mb_cur_max == 1) 754 return 1; 755 for (byte_idx = 1; idx + byte_idx < pstr->valid_len; ++byte_idx) 756 if (pstr->wcs[idx + byte_idx] != WEOF) 757 break; 758 return byte_idx; 759} 760 761static inline wint_t 762internal_function 763re_string_wchar_at (const re_string_t *pstr, int idx) 764{ 765 if (pstr->mb_cur_max == 1) 766 return (wint_t) pstr->mbs[idx]; 767 return (wint_t) pstr->wcs[idx]; 768} 769 770static int 771internal_function 772re_string_elem_size_at (const re_string_t *pstr, int idx) 773{ 774#ifdef _LIBC 775 const unsigned char *p, *extra; 776 const int32_t *table, *indirect; 777 int32_t tmp; 778# include <locale/weight.h> 779 uint_fast32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 780 781 if (nrules != 0) 782 { 783 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 784 extra = (const unsigned char *) 785 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB); 786 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, 787 _NL_COLLATE_INDIRECTMB); 788 p = pstr->mbs + idx; 789 tmp = findidx (&p); 790 return p - pstr->mbs - idx; 791 } 792 else 793#endif /* _LIBC */ 794 return 1; 795} 796#endif /* RE_ENABLE_I18N */ 797 798#endif /* _REGEX_INTERNAL_H */ 799