1/*********************************************************************** 2* * 3* This software is part of the ast package * 4* Copyright (c) 1985-2012 AT&T Intellectual Property * 5* and is licensed under the * 6* Eclipse Public License, Version 1.0 * 7* by AT&T Intellectual Property * 8* * 9* A copy of the License is available at * 10* http://www.eclipse.org/org/documents/epl-v10.html * 11* (with md5 checksum b35adb5213ca9657e911e9befb180842) * 12* * 13* Information and Software Systems Research * 14* AT&T Research * 15* Florham Park NJ * 16* * 17* Glenn Fowler <gsf@research.att.com> * 18* David Korn <dgk@research.att.com> * 19* Phong Vo <kpv@research.att.com> * 20* * 21***********************************************************************/ 22#pragma prototyped 23 24/* 25 * posix regex implementation 26 * 27 * based on Doug McIlroy's C++ implementation 28 * Knuth-Morris-Pratt adapted from Corman-Leiserson-Rivest 29 * Boyer-Moore from conversations with David Korn, Phong Vo, Andrew Hume 30 */ 31 32#ifndef _REGLIB_H 33#define _REGLIB_H 34 35#define REG_VERSION_EXEC 20020509L 36#define REG_VERSION_MAP 20030916L /* regdisc_t.re_map */ 37 38#define re_info env 39 40#define alloc _reg_alloc 41#define classfun _reg_classfun 42#define drop _reg_drop 43#define fatal _reg_fatal 44#define state _reg_state 45 46typedef struct regsubop_s 47{ 48 int op; /* REG_SUB_LOWER,REG_SUB_UPPER */ 49 int off; /* re_rhs or match[] offset */ 50 int len; /* re_rhs len or len==0 match[] */ 51} regsubop_t; 52 53#define _REG_SUB_PRIVATE_ \ 54 char* re_cur; /* re_buf cursor */ \ 55 char* re_end; /* re_buf end */ \ 56 regsubop_t* re_ops; /* rhs ops */ \ 57 char re_rhs[1]; /* substitution rhs */ 58 59#include <ast.h> 60#include <cdt.h> 61#include <stk.h> 62 63#include "regex.h" 64 65#include <ctype.h> 66#include <errno.h> 67 68#if _BLD_DEBUG && !defined(_AST_REGEX_DEBUG) 69#define _AST_REGEX_DEBUG 1 70#endif 71 72#define MBSIZE(p) ((ast.tmp_int=mbsize(p))>0?ast.tmp_int:1) 73 74#undef RE_DUP_MAX /* posix puts this in limits.h! */ 75#define RE_DUP_MAX (INT_MAX/2-1) /* 2*RE_DUP_MAX won't overflow */ 76#define RE_DUP_INF (RE_DUP_MAX+1) /* infinity, for * */ 77#define BACK_REF_MAX 9 78 79#define REG_COMP (~REG_EXEC) 80#define REG_EXEC (REG_ADVANCE|REG_INVERT|REG_NOTBOL|REG_NOTEOL|REG_STARTEND) 81 82#define REX_NULL 0 /* null string (internal) */ 83#define REX_ALT 1 /* a|b */ 84#define REX_ALT_CATCH 2 /* REX_ALT catcher */ 85#define REX_BACK 3 /* \1, \2, etc */ 86#define REX_BEG 4 /* initial ^ */ 87#define REX_BEG_STR 5 /* initial ^ w/ no newline */ 88#define REX_BM 6 /* Boyer-Moore */ 89#define REX_CAT 7 /* catenation catcher */ 90#define REX_CLASS 8 /* [...] */ 91#define REX_COLL_CLASS 9 /* collation order [...] */ 92#define REX_CONJ 10 /* a&b */ 93#define REX_CONJ_LEFT 11 /* REX_CONJ left catcher */ 94#define REX_CONJ_RIGHT 12 /* REX_CONJ right catcher */ 95#define REX_DONE 13 /* completed match (internal) */ 96#define REX_DOT 14 /* . */ 97#define REX_END 15 /* final $ */ 98#define REX_END_STR 16 /* final $ before tail newline */ 99#define REX_EXEC 17 /* call re.re_exec() */ 100#define REX_FIN_STR 18 /* final $ w/ no newline */ 101#define REX_GROUP 19 /* \(...\) */ 102#define REX_GROUP_CATCH 20 /* REX_GROUP catcher */ 103#define REX_GROUP_AHEAD 21 /* 0-width lookahead */ 104#define REX_GROUP_AHEAD_CATCH 22 /* REX_GROUP_AHEAD catcher */ 105#define REX_GROUP_AHEAD_NOT 23 /* inverted 0-width lookahead */ 106#define REX_GROUP_BEHIND 24 /* 0-width lookbehind */ 107#define REX_GROUP_BEHIND_CATCH 25 /* REX_GROUP_BEHIND catcher */ 108#define REX_GROUP_BEHIND_NOT 26 /* inverted 0-width lookbehind */ 109#define REX_GROUP_BEHIND_NOT_CATCH 27 /* REX_GROUP_BEHIND_NOT catcher */ 110#define REX_GROUP_COND 28 /* conditional group */ 111#define REX_GROUP_COND_CATCH 29 /* conditional group catcher */ 112#define REX_GROUP_CUT 30 /* don't backtrack over this */ 113#define REX_GROUP_CUT_CATCH 31 /* REX_GROUP_CUT catcher */ 114#define REX_KMP 32 /* Knuth-Morris-Pratt */ 115#define REX_NEG 33 /* negation */ 116#define REX_NEG_CATCH 34 /* REX_NEG catcher */ 117#define REX_NEST 35 /* nested match */ 118#define REX_ONECHAR 36 /* a single-character literal */ 119#define REX_REP 37 /* Kleene closure */ 120#define REX_REP_CATCH 38 /* REX_REP catcher */ 121#define REX_STRING 39 /* some chars */ 122#define REX_TRIE 40 /* alternation of strings */ 123#define REX_WBEG 41 /* \< */ 124#define REX_WEND 42 /* \> */ 125#define REX_WORD 43 /* word boundary */ 126#define REX_WORD_NOT 44 /* not word boundary */ 127 128#define T_META ((int)UCHAR_MAX+1) 129#define T_STAR (T_META+0) 130#define T_PLUS (T_META+1) 131#define T_QUES (T_META+2) 132#define T_BANG (T_META+3) 133#define T_AT (T_META+4) 134#define T_TILDE (T_META+5) 135#define T_PERCENT (T_META+6) 136#define T_LEFT (T_META+7) 137#define T_OPEN (T_META+8) 138#define T_CLOSE (T_OPEN+1) 139#define T_RIGHT (T_OPEN+2) 140#define T_CFLX (T_OPEN+3) 141#define T_DOT (T_OPEN+4) 142#define T_DOTSTAR (T_OPEN+5) 143#define T_END (T_OPEN+6) 144#define T_BAD (T_OPEN+7) 145#define T_DOLL (T_OPEN+8) 146#define T_BRA (T_OPEN+9) 147#define T_BAR (T_OPEN+10) 148#define T_AND (T_OPEN+11) 149#define T_LT (T_OPEN+12) 150#define T_GT (T_OPEN+13) 151#define T_SLASHPLUS (T_OPEN+14) 152#define T_GROUP (T_OPEN+15) 153#define T_WORD (T_OPEN+16) 154#define T_WORD_NOT (T_WORD+1) 155#define T_BEG_STR (T_WORD+2) 156#define T_END_STR (T_WORD+3) 157#define T_FIN_STR (T_WORD+4) 158#define T_ESCAPE (T_WORD+5) 159#define T_ALNUM (T_WORD+6) 160#define T_ALNUM_NOT (T_ALNUM+1) 161#define T_DIGIT (T_ALNUM+2) 162#define T_DIGIT_NOT (T_ALNUM+3) 163#define T_SPACE (T_ALNUM+4) 164#define T_SPACE_NOT (T_ALNUM+5) 165#define T_BACK (T_ALNUM+6) 166 167#define BRE 0 168#define ERE 3 169#define ARE 6 170#define SRE 9 171#define KRE 12 172 173#define HIT SSIZE_MAX 174 175#define bitclr(p,c) ((p)[((c)>>3)&037]&=(~(1<<((c)&07)))) 176#define bitset(p,c) ((p)[((c)>>3)&037]|=(1<<((c)&07))) 177#define bittst(p,c) ((p)[((c)>>3)&037]&(1<<((c)&07))) 178 179#define setadd(p,c) bitset((p)->bits,c) 180#define setclr(p,c) bitclr((p)->bits,c) 181#define settst(p,c) bittst((p)->bits,c) 182 183#if _hdr_wchar && _lib_wctype && _lib_iswctype 184 185#include <stdio.h> /* because <wchar.h> includes it and we generate it */ 186#include <wchar.h> 187#if _hdr_wctype 188#include <wctype.h> 189#endif 190 191#if !defined(iswblank) && !_lib_iswblank 192#define _need_iswblank 1 193#define iswblank(x) _reg_iswblank(x) 194extern int _reg_iswblank(wint_t); 195#endif 196 197#if !defined(towupper) && !_lib_towupper 198#define towupper(x) toupper(x) 199#endif 200 201#if !defined(towlower) && !_lib_towlower 202#define towlower(x) tolower(x) 203#endif 204 205#else 206 207#undef _lib_wctype 208 209#ifndef iswalnum 210#define iswalnum(x) isalnum(x) 211#endif 212#ifndef iswalpha 213#define iswalpha(x) isalpha(x) 214#endif 215#ifndef iswcntrl 216#define iswcntrl(x) iscntrl(x) 217#endif 218#ifndef iswdigit 219#define iswdigit(x) isdigit(x) 220#endif 221#ifndef iswgraph 222#define iswgraph(x) isgraph(x) 223#endif 224#ifndef iswlower 225#define iswlower(x) islower(x) 226#endif 227#ifndef iswprint 228#define iswprint(x) isprint(x) 229#endif 230#ifndef iswpunct 231#define iswpunct(x) ispunct(x) 232#endif 233#ifndef iswspace 234#define iswspace(x) isspace(x) 235#endif 236#ifndef iswupper 237#define iswupper(x) isupper(x) 238#endif 239#ifndef iswxdigit 240#define iswxdigit(x) isxdigit(x) 241#endif 242 243#ifndef towlower 244#define towlower(x) tolower(x) 245#endif 246#ifndef towupper 247#define towupper(x) toupper(x) 248#endif 249 250#endif 251 252#ifndef iswblank 253#define iswblank(x) ((x)==' '||(x)=='\t') 254#endif 255 256#ifndef iswgraph 257#define iswgraph(x) (iswprint(x)&&!iswblank(x)) 258#endif 259 260#define isword(x) (isalnum(x)||(x)=='_') 261 262/* 263 * collation element support 264 */ 265 266#define COLL_KEY_MAX 32 267 268#if COLL_KEY_MAX < MB_LEN_MAX 269#undef COLL_KEY_MAX 270#define COLL_KEY_MAX MB_LEN_MAX 271#endif 272 273typedef unsigned char Ckey_t[COLL_KEY_MAX+1]; 274 275#define COLL_end 0 276#define COLL_call 1 277#define COLL_char 2 278#define COLL_range 3 279#define COLL_range_lc 4 280#define COLL_range_uc 5 281 282typedef struct Celt_s 283{ 284 short typ; 285 short min; 286 short max; 287 regclass_t fun; 288 Ckey_t beg; 289 Ckey_t end; 290} Celt_t; 291 292/* 293 * private stuff hanging off regex_t 294 */ 295 296typedef struct Stk_pos_s 297{ 298 off_t offset; 299 char* base; 300} Stk_pos_t; 301 302typedef struct Vector_s 303{ 304 Stk_t* stk; /* stack pointer */ 305 char* vec; /* the data */ 306 int inc; /* growth increment */ 307 int siz; /* element size */ 308 ssize_t max; /* max index */ 309 ssize_t cur; /* current index -- user domain */ 310} Vector_t; 311 312/* 313 * Rex_t subtypes 314 */ 315 316typedef struct Cond_s 317{ 318 unsigned char* beg; /* beginning of next match */ 319 struct Rex_s* next[2]; /* 0:no 1:yes next pattern */ 320 struct Rex_s* cont; /* right catcher */ 321 int yes; /* yes condition hit */ 322} Cond_t; 323 324typedef struct Conj_left_s 325{ 326 unsigned char* beg; /* beginning of left match */ 327 struct Rex_s* right; /* right pattern */ 328 struct Rex_s* cont; /* right catcher */ 329} Conj_left_t; 330 331typedef struct Conj_right_s 332{ 333 unsigned char* end; /* end of left match */ 334 struct Rex_s* cont; /* ambient continuation */ 335} Conj_right_t; 336 337typedef unsigned int Bm_mask_t; 338 339typedef struct Bm_s 340{ 341 Bm_mask_t** mask; 342 size_t* skip; 343 size_t* fail; 344 size_t size; 345 ssize_t back; 346 ssize_t left; 347 ssize_t right; 348 size_t complete; 349} Bm_t; 350 351typedef struct String_s 352{ 353 int* fail; 354 unsigned char* base; 355 size_t size; 356} String_t; 357 358typedef struct Set_s 359{ 360 unsigned char bits[(UCHAR_MAX+1)/CHAR_BIT]; 361} Set_t; 362 363typedef struct Collate_s 364{ 365 int invert; 366 Celt_t* elements; 367} Collate_t; 368 369typedef struct Binary_s 370{ 371 struct Rex_s* left; 372 struct Rex_s* right; 373 int serial; 374} Binary_t; 375 376typedef struct Group_s 377{ 378 int number; /* group number */ 379 int last; /* last contained group number */ 380 int size; /* lookbehind size */ 381 int back; /* backreferenced */ 382 regflags_t flags; /* group flags */ 383 union 384 { 385 Binary_t binary; 386 struct Rex_s* rex; 387 } expr; 388} Group_t; 389 390typedef struct Exec_s 391{ 392 void* data; 393 const char* text; 394 size_t size; 395} Exec_t; 396 397#define REX_NEST_open 0x01 398#define REX_NEST_close 0x02 399#define REX_NEST_escape 0x04 400#define REX_NEST_quote 0x08 401#define REX_NEST_literal 0x10 402#define REX_NEST_delimiter 0x20 403#define REX_NEST_terminator 0x40 404#define REX_NEST_separator 0x80 405 406#define REX_NEST_SHIFT 8 407 408typedef struct Nest_s 409{ 410 int primary; 411 unsigned short none; /* for Nest_t.type[-1] */ 412 unsigned short type[1]; 413} Nest_t; 414 415/* 416 * REX_ALT catcher, solely to get control at the end of an 417 * alternative to keep records for comparing matches. 418 */ 419 420typedef struct Alt_catch_s 421{ 422 struct Rex_s* cont; 423} Alt_catch_t; 424 425typedef struct Group_catch_s 426{ 427 struct Rex_s* cont; 428 regoff_t* eo; 429} Group_catch_t; 430 431typedef struct Behind_catch_s 432{ 433 struct Rex_s* cont; 434 unsigned char* beg; 435 unsigned char* end; 436} Behind_catch_t; 437 438/* 439 * REX_NEG catcher determines what string lengths can be matched, 440 * then Neg investigates continuations of other lengths. 441 * This is inefficient. For !POSITIONS expressions, we can do better: 442 * since matches to rex will be enumerated in decreasing order, 443 * we can investigate continuations whenever a length is skipped. 444 */ 445 446typedef struct Neg_catch_s 447{ 448 unsigned char* beg; 449 unsigned char* index; 450} Neg_catch_t; 451 452/* 453 * REX_REP catcher. One is created on the stack for 454 * each iteration of a complex repetition. 455 */ 456 457typedef struct Rep_catch_s 458{ 459 struct Rex_s* cont; 460 struct Rex_s* ref; 461 unsigned char* beg; 462 int n; 463} Rep_catch_t; 464 465/* 466 * data structure for an alternation of pure strings 467 * son points to a subtree of all strings with a common 468 * prefix ending in character c. sib links alternate 469 * letters in the same position of a word. end=1 if 470 * some word ends with c. the order of strings is 471 * irrelevant, except long words must be investigated 472 * before short ones. 473 */ 474 475typedef struct Trie_node_s 476{ 477 unsigned char c; 478 unsigned char end; 479 struct Trie_node_s* son; 480 struct Trie_node_s* sib; 481} Trie_node_t; 482 483typedef struct Trie_s 484{ 485 Trie_node_t** root; 486 int min; 487 int max; 488} Trie_t; 489 490/* 491 * Rex_t is a node in a regular expression 492 */ 493 494typedef struct Rex_s 495{ 496 unsigned char type; /* node type */ 497 unsigned char marked; /* already marked */ 498 short serial; /* subpattern number */ 499 regflags_t flags; /* scoped flags */ 500 int explicit; /* scoped explicit match*/ 501 struct Rex_s* next; /* remaining parts */ 502 int lo; /* lo dup count */ 503 int hi; /* hi dup count */ 504 unsigned char* map; /* fold and/or ccode map*/ 505 union 506 { 507 Alt_catch_t alt_catch; /* alt catcher */ 508 Bm_t bm; /* bm */ 509 Behind_catch_t behind_catch; /* behind catcher */ 510 Set_t* charclass; /* char class */ 511 Collate_t collate; /* collation class */ 512 Cond_t cond_catch; /* cond catcher */ 513 Conj_left_t conj_left; /* conj left catcher */ 514 Conj_right_t conj_right; /* conj right catcher */ 515 void* data; /* data after Rex_t */ 516 Exec_t exec; /* re.re_exec() args */ 517 Group_t group; /* a|b or rep */ 518 Group_catch_t group_catch; /* group catcher */ 519 Neg_catch_t neg_catch; /* neg catcher */ 520 Nest_t nest; /* nested match */ 521 unsigned char onechar; /* single char */ 522 Rep_catch_t rep_catch; /* rep catcher */ 523 String_t string; /* string/kmp */ 524 Trie_t trie; /* trie */ 525 } re; 526} Rex_t; 527 528typedef struct reglib_s /* library private regex_t info */ 529{ 530 struct Rex_s* rex; /* compiled expression */ 531 regdisc_t* disc; /* REG_DISCIPLINE discipline */ 532 const regex_t* regex; /* from regexec */ 533 unsigned char* beg; /* beginning of string */ 534 unsigned char* end; /* end of string */ 535 Vector_t* pos; /* posns of certain subpatterns */ 536 Vector_t* bestpos; /* ditto for best match */ 537 regmatch_t* match; /* subexrs in current match */ 538 regmatch_t* best; /* ditto in best match yet */ 539 Stk_pos_t stk; /* exec stack pos */ 540 size_t min; /* minimum match length */ 541 size_t nsub; /* internal re_nsub */ 542 regflags_t flags; /* flags from regcomp() */ 543 int error; /* last error */ 544 int explicit; /* explicit match on this char */ 545 int leading; /* leading match on this char */ 546 int refs; /* regcomp()+regdup() references*/ 547 Rex_t done; /* the last continuation */ 548 regstat_t stats; /* for regstat() */ 549 unsigned char fold[UCHAR_MAX+1]; /* REG_ICASE map */ 550 unsigned char hard; /* hard comp */ 551 unsigned char once; /* if 1st parse fails, quit */ 552 unsigned char separate; /* cannot combine */ 553 unsigned char stack; /* hard comp or exec */ 554 unsigned char sub; /* re_sub is valid */ 555 unsigned char test; /* debug/test bitmask */ 556} Env_t; 557 558typedef struct oldregmatch_s /* pre-20120528 regmatch_t */ 559{ 560 int rm_so; /* offset of start */ 561 int rm_eo; /* offset of end */ 562} oldregmatch_t; 563 564typedef struct State_s /* shared state */ 565{ 566 regmatch_t nomatch; 567 struct 568 { 569 unsigned char key; 570 short val[15]; 571 } escape[52]; 572 short* magic[UCHAR_MAX+1]; 573 regdisc_t disc; 574 int fatal; 575 int initialized; 576 Dt_t* attrs; 577 Dt_t* names; 578 Dtdisc_t dtdisc; 579} State_t; 580 581extern State_t state; 582 583extern void* alloc(regdisc_t*, void*, size_t); 584extern regclass_t classfun(int); 585extern void drop(regdisc_t*, Rex_t*); 586extern int fatal(regdisc_t*, int, const char*); 587 588#endif 589