1101282Smdodd/* $NetBSD: regcomp.c,v 1.48 2023/08/30 20:37:24 christos Exp $ */ 2204977Simp 3101282Smdodd/*- 4101282Smdodd * SPDX-License-Identifier: BSD-3-Clause 5101282Smdodd * 6101282Smdodd * Copyright (c) 1992, 1993, 1994 Henry Spencer. 7101282Smdodd * Copyright (c) 1992, 1993, 1994 8101282Smdodd * The Regents of the University of California. All rights reserved. 9101282Smdodd * 10101282Smdodd * Copyright (c) 2011 The FreeBSD Foundation 11101282Smdodd * All rights reserved. 12101282Smdodd * Portions of this software were developed by David Chisnall 13101282Smdodd * under sponsorship from the FreeBSD Foundation. 14101282Smdodd * 15101282Smdodd * This code is derived from software contributed to Berkeley by 16101282Smdodd * Henry Spencer. 17101282Smdodd * 18101282Smdodd * Redistribution and use in source and binary forms, with or without 19101282Smdodd * modification, are permitted provided that the following conditions 20101282Smdodd * are met: 21101282Smdodd * 1. Redistributions of source code must retain the above copyright 22101282Smdodd * notice, this list of conditions and the following disclaimer. 23101282Smdodd * 2. Redistributions in binary form must reproduce the above copyright 24101282Smdodd * notice, this list of conditions and the following disclaimer in the 25101282Smdodd * documentation and/or other materials provided with the distribution. 26101282Smdodd * 3. Neither the name of the University nor the names of its contributors 27101282Smdodd * may be used to endorse or promote products derived from this software 28288424Sjhb * without specific prior written permission. 29168569Sdelphij * 30168569Sdelphij * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 31240005Szont * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 32240005Szont * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 33240005Szont * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 34240005Szont * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 35240005Szont * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 36240005Szont * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 37240005Szont * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 38295930Sjhb * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 39101282Smdodd * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 40288424Sjhb * SUCH DAMAGE. 41288424Sjhb * 42288424Sjhb * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 43288424Sjhb */ 44288424Sjhb 45294849Sjhb#if HAVE_NBTOOL_CONFIG_H 46288424Sjhb#include "nbtool_config.h" 47288424Sjhb#endif 48288424Sjhb 49288424Sjhb#include <sys/cdefs.h> 50288424Sjhb#if 0 51288424Sjhbstatic char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 52288424Sjhb__FBSDID("$FreeBSD: head/lib/libc/regex/regcomp.c 368359 2020-12-05 03:18:48Z kevans $"); 53288424Sjhb#endif 54288997Sbdrewery__RCSID("$NetBSD: regcomp.c,v 1.48 2023/08/30 20:37:24 christos Exp $"); 55288424Sjhb 56288424Sjhb#ifndef LIBHACK 57288424Sjhb#define REGEX_GNU_EXTENSIONS 58288424Sjhb 59288424Sjhb#include "namespace.h" 60288424Sjhb#endif 61288424Sjhb#include <sys/types.h> 62288424Sjhb#include <stdio.h> 63288424Sjhb#include <string.h> 64288424Sjhb#include <ctype.h> 65288424Sjhb#include <limits.h> 66288424Sjhb#include <stdlib.h> 67288424Sjhb#include <regex.h> 68288424Sjhb#include <stdbool.h> 69288424Sjhb 70288424Sjhb#if defined(__weak_alias) && !defined(LIBHACK) 71288424Sjhb__weak_alias(regcomp,_regcomp) 72288424Sjhb#endif 73288424Sjhb 74168569Sdelphij#ifdef REGEX_LIBC_COLLATE 75168569Sdelphij#include "collate.h" 76296571Sjhb#endif 77288424Sjhb 78168569Sdelphij#include "utils.h" 79168569Sdelphij#include "regex2.h" 80288424Sjhb 81240562Szont#include "cname.h" 82240562Szont 83168569Sdelphij/* 84168569Sdelphij * Branching context, used to keep track of branch state for all of the branch- 85288424Sjhb * aware functions. In addition to keeping track of branch positions for the 86288424Sjhb * p_branch_* functions, we use this to simplify some clumsiness in BREs for 87288424Sjhb * detection of whether ^ is acting as an anchor or being used erroneously and 88288424Sjhb * also for whether we're in a sub-expression or not. 89288424Sjhb */ 90296571Sjhbstruct branchc { 91288424Sjhb sopno start; 92288424Sjhb sopno back; 93101282Smdodd sopno fwd; 94101282Smdodd 95101282Smdodd int nbranch; 96153963Sbrian int nchain; 97101282Smdodd bool outer; 98101285Smdodd bool terminate; 99101373Smdodd}; 100168569Sdelphij 101168569Sdelphij/* 102240005Szont * parse structure, passed up and down to avoid global variables and 103288424Sjhb * other clumsinesses 104101282Smdodd */ 105158630Spavstruct parse { 106247338Sdelphij const char *next; /* next character in RE */ 107158630Spav const char *end; /* end of string (-> NUL normally) */ 108158630Spav int error; /* has an error been seen? */ 109158630Spav int gnuext; 110158630Spav sop *strip; /* malloced strip */ 111158630Spav sopno ssize; /* malloced strip size (allocated) */ 112158630Spav sopno slen; /* malloced strip length (used) */ 113158630Spav size_t ncsalloc; /* number of csets allocated */ 114158630Spav struct re_guts *g; 115168569Sdelphij# define NPAREN 10 /* we need to remember () 1-9 for back refs */ 116247338Sdelphij sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 117192025Sdds sopno pend[NPAREN]; /* -> ) ([0] unused) */ 118192025Sdds bool allowbranch; /* can this expression branch? */ 119192025Sdds bool bre; /* convenience; is this a BRE? */ 120192025Sdds int pflags; /* other parsing flags -- legacy escapes? */ 121192025Sdds bool (*parse_expr)(struct parse *, struct branchc *); 122192025Sdds void (*pre_parse)(struct parse *, struct branchc *); 123192025Sdds void (*post_parse)(struct parse *, struct branchc *); 124192025Sdds}; 125 126#define PFLAG_LEGACY_ESC 0x00000001 127 128/* ========= begin header generated by ./mkh ========= */ 129#ifdef __cplusplus 130extern "C" { 131#endif 132 133/* === regcomp.c === */ 134static bool p_ere_exp(struct parse *p, struct branchc *bc); 135static void p_str(struct parse *p); 136static int p_branch_eat_delim(struct parse *p, struct branchc *bc); 137static void p_branch_ins_offset(struct parse *p, struct branchc *bc); 138static void p_branch_fix_tail(struct parse *p, struct branchc *bc); 139static bool p_branch_empty(struct parse *p, struct branchc *bc); 140static bool p_branch_do(struct parse *p, struct branchc *bc); 141static void p_bre_pre_parse(struct parse *p, struct branchc *bc); 142static void p_bre_post_parse(struct parse *p, struct branchc *bc); 143static void p_re(struct parse *p, int end1, int end2); 144static bool p_simp_re(struct parse *p, struct branchc *bc); 145static int p_count(struct parse *p); 146static void p_bracket(struct parse *p); 147static int p_range_cmp(wchar_t c1, wchar_t c2); 148static void p_b_term(struct parse *p, cset *cs); 149#ifdef REGEX_GNU_EXTENSIONS 150static int p_b_pseudoclass(struct parse *p, char c); 151#endif 152static void p_b_cclass(struct parse *p, cset *cs); 153static void p_b_cclass_named(struct parse *p, cset *cs, const char[]); 154static void p_b_eclass(struct parse *p, cset *cs); 155static wint_t p_b_symbol(struct parse *p); 156static wint_t p_b_coll_elem(struct parse *p, wint_t endc); 157static bool may_escape(struct parse *p, const wint_t ch); 158static wint_t othercase(wint_t ch); 159static void bothcases(struct parse *p, wint_t ch); 160static void ordinary(struct parse *p, wint_t ch); 161static void nonnewline(struct parse *p); 162static void repeat(struct parse *p, sopno start, int from, int to); 163static int seterr(struct parse *p, int e); 164static cset *allocset(struct parse *p); 165static void freeset(struct parse *p, cset *cs); 166static void CHadd(struct parse *p, cset *cs, wint_t ch); 167static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max); 168static void CHaddtype(struct parse *p, cset *cs, wctype_t wct); 169static wint_t singleton(cset *cs); 170static sopno dupl(struct parse *p, sopno start, sopno finish); 171static void doemit(struct parse *p, sop op, size_t opnd); 172static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 173static void dofwd(struct parse *p, sopno pos, sop value); 174static int enlarge(struct parse *p, sopno size); 175static void stripsnug(struct parse *p, struct re_guts *g); 176static void findmust(struct parse *p, struct re_guts *g); 177static int altoffset(sop *scan, int offset); 178static void computejumps(struct parse *p, struct re_guts *g); 179static void computematchjumps(struct parse *p, struct re_guts *g); 180static sopno pluscount(struct parse *p, struct re_guts *g); 181static wint_t wgetnext(struct parse *p); 182 183#ifdef __cplusplus 184} 185#endif 186/* ========= end header generated by ./mkh ========= */ 187 188static char nuls[10]; /* place to point scanner in event of error */ 189 190/* 191 * macros for use with parse structure 192 * BEWARE: these know that the parse structure is named `p' !!! 193 */ 194#define PEEK() (*p->next) 195#define PEEK2() (*(p->next+1)) 196#define MORE() (p->next < p->end) 197#define MORE2() (p->next+1 < p->end) 198#define SEE(c) (MORE() && PEEK() == (c)) 199#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 200#define SEESPEC(a) (p->bre ? SEETWO('\\', a) : SEE(a)) 201#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 202#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 203#define EATSPEC(a) (p->bre ? EATTWO('\\', a) : EAT(a)) 204#define NEXT() (p->next++) 205#define NEXT2() (p->next += 2) 206#define NEXTn(n) (p->next += (n)) 207#define GETNEXT() (*p->next++) 208#define WGETNEXT() wgetnext(p) 209#define SETERROR(e) seterr(p, (e)) 210#define REQUIRE(co, e) ((co) || SETERROR(e)) 211#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 212#define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 213#define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 214#define EMIT(op, sopnd) doemit(p, (op), (sopnd)) 215#define INSERT(op, pos) doinsert(p, (op), HERE()-(pos)+1, pos) 216#define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 217#define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 218#define HERE() (p->slen) 219#define THERE() (p->slen - 1) 220#define THERETHERE() (p->slen - 2) 221#define DROP(n) (p->slen -= (n)) 222 223/* Macro used by computejump()/computematchjump() */ 224#ifndef MIN 225#define MIN(a,b) ((a)<(b)?(a):(b)) 226#endif 227 228#ifndef NLS 229static const struct { 230 const char *name; 231 int (*func)(int); 232} wctypes[] = { 233#define ADD(x) { .name = # x, .func = is ## x } 234 ADD(alnum), 235 ADD(alpha), 236 ADD(blank), 237 ADD(cntrl), 238 ADD(digit), 239 ADD(graph), 240 ADD(lower), 241 ADD(print), 242 ADD(punct), 243 ADD(space), 244 ADD(upper), 245 ADD(xdigit), 246#undef ADD 247}; 248 249wctype_t 250__regex_wctype(const char *str) 251{ 252 for (size_t i = 0; i < __arraycount(wctypes); i++) { 253 if (strcmp(wctypes[i].name, str) == 0) 254 return (wctype_t)(i + 1); 255 } 256 return (wctype_t)0; 257} 258 259int 260__regex_iswctype(wint_t c, wctype_t ct) 261{ 262 if (ct == 0) 263 return 0; 264 return (*wctypes[ct - 1].func)(c); 265} 266#endif 267 268static int /* 0 success, otherwise REG_something */ 269regcomp_internal(regex_t * __restrict preg, 270 const char * __restrict pattern, 271 int cflags, int pflags) 272{ 273 struct parse pa; 274 struct re_guts *g; 275 struct parse *p = &pa; 276 int i; 277 size_t len; 278 size_t maxlen; 279#ifdef REDEBUG 280# define GOODFLAGS(f) (f) 281#else 282# define GOODFLAGS(f) ((f)&~REG_DUMP) 283#endif 284 285 _DIAGASSERT(preg != NULL); 286 _DIAGASSERT(pattern != NULL); 287 288 cflags = GOODFLAGS(cflags); 289 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 290 return(REG_INVARG); 291 292 if (cflags®_PEND) { 293 if (preg->re_endp < pattern) 294 return(REG_INVARG); 295 len = preg->re_endp - pattern; 296 } else 297 len = strlen(pattern); 298 299 /* do the mallocs early so failure handling is easy */ 300 g = malloc(sizeof(*g)); 301 if (g == NULL) 302 return(REG_ESPACE); 303 /* 304 * Limit the pattern space to avoid a 32-bit overflow on buffer 305 * extension. Also avoid any signed overflow in case of conversion 306 * so make the real limit based on a 31-bit overflow. 307 * 308 * Likely not applicable on 64-bit systems but handle the case 309 * generically (who are we to stop people from using ~715MB+ 310 * patterns?). 311 */ 312 maxlen = ((size_t)-1 >> 1) / sizeof(*p->strip) * 2 / 3; 313 if (len >= maxlen) { 314 free(g); 315 return(REG_ESPACE); 316 } 317 p->ssize = (sopno)(len / 2 * 3 + 1); /* ugh */ 318 assert(p->ssize >= len); 319 320 p->strip = calloc(p->ssize, sizeof(*p->strip)); 321 p->slen = 0; 322 if (p->strip == NULL) { 323 free(g); 324 return(REG_ESPACE); 325 } 326 327 /* set things up */ 328 p->g = g; 329 p->next = pattern; /* convenience; we do not modify it */ 330 p->end = p->next + len; 331 p->error = 0; 332 p->ncsalloc = 0; 333 p->pflags = pflags; 334 for (i = 0; i < NPAREN; i++) { 335 p->pbegin[i] = 0; 336 p->pend[i] = 0; 337 } 338#ifdef REGEX_GNU_EXTENSIONS 339 if ((cflags & REG_GNU) == 0) { 340 p->gnuext = false; 341 p->allowbranch = (cflags & REG_EXTENDED) != 0; 342 } else 343 p->gnuext = p->allowbranch = true; 344#else 345 p->gnuext = false; 346 p->allowbranch = (cflags & REG_EXTENDED) != 0; 347#endif 348 if (cflags & REG_EXTENDED) { 349 p->bre = false; 350 p->parse_expr = p_ere_exp; 351 p->pre_parse = NULL; 352 p->post_parse = NULL; 353 } else { 354 p->bre = true; 355 p->parse_expr = p_simp_re; 356 p->pre_parse = p_bre_pre_parse; 357 p->post_parse = p_bre_post_parse; 358 } 359 g->sets = NULL; 360 g->ncsets = 0; 361 g->cflags = cflags; 362 g->iflags = 0; 363 g->nbol = 0; 364 g->neol = 0; 365 g->must = NULL; 366 g->moffset = -1; 367 g->charjump = NULL; 368 g->matchjump = NULL; 369 g->mlen = 0; 370 g->nsub = 0; 371 g->backrefs = 0; 372 373 /* do it */ 374 EMIT(OEND, 0); 375 g->firststate = THERE(); 376 if (cflags & REG_NOSPEC) 377 p_str(p); 378 else 379 p_re(p, OUT, OUT); 380 EMIT(OEND, 0); 381 g->laststate = THERE(); 382 383 /* tidy up loose ends and fill things in */ 384 stripsnug(p, g); 385 findmust(p, g); 386 /* only use Boyer-Moore algorithm if the pattern is bigger 387 * than three characters 388 */ 389 if(g->mlen > 3) { 390 computejumps(p, g); 391 computematchjumps(p, g); 392 if(g->matchjump == NULL && g->charjump != NULL) { 393 free(g->charjump); 394 g->charjump = NULL; 395 } 396 } 397 g->nplus = pluscount(p, g); 398 g->magic = MAGIC2; 399 preg->re_nsub = g->nsub; 400 preg->re_g = g; 401 preg->re_magic = MAGIC1; 402#ifndef REDEBUG 403 /* not debugging, so can't rely on the assert() in regexec() */ 404 if (g->iflags&BAD) 405 SETERROR(REG_ASSERT); 406#endif 407 408 /* win or lose, we're done */ 409 if (p->error != 0) /* lose */ 410 regfree(preg); 411 return(p->error); 412} 413 414/* 415 - regcomp - interface for parser and compilation 416 = extern int regcomp(regex_t *, const char *, int); 417 = #define REG_BASIC 0000 418 = #define REG_EXTENDED 0001 419 = #define REG_ICASE 0002 420 = #define REG_NOSUB 0004 421 = #define REG_NEWLINE 0010 422 = #define REG_NOSPEC 0020 423 = #define REG_PEND 0040 424 = #define REG_DUMP 0200 425 */ 426int /* 0 success, otherwise REG_something */ 427regcomp(regex_t * __restrict preg, 428 const char * __restrict pattern, 429 int cflags) 430{ 431 432 return (regcomp_internal(preg, pattern, cflags, 0)); 433} 434 435/* 436 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op, 437 - return whether we should terminate or not 438 == static bool p_ere_exp(struct parse *p); 439 */ 440static bool 441p_ere_exp(struct parse *p, struct branchc *bc) 442{ 443 char c; 444 wint_t wc; 445 sopno pos; 446 int count; 447 int count2; 448#ifdef REGEX_GNU_EXTENSIONS 449 size_t i; 450 int handled; 451#endif 452 sopno subno; 453 int wascaret = 0; 454 455 _DIAGASSERT(p != NULL); 456 457 (void)bc; 458 assert(MORE()); /* caller should have ensured this */ 459 c = GETNEXT(); 460 461#ifdef REGEX_GNU_EXTENSIONS 462 handled = 0; 463#endif 464 pos = HERE(); 465 switch (c) { 466 case '(': 467 (void)REQUIRE(MORE(), REG_EPAREN); 468 p->g->nsub++; 469 subno = (sopno)p->g->nsub; 470 if (subno < NPAREN) 471 p->pbegin[subno] = HERE(); 472 EMIT(OLPAREN, subno); 473 if (!SEE(')')) 474 p_re(p, ')', IGN); 475 if (subno < NPAREN) { 476 p->pend[subno] = HERE(); 477 assert(p->pend[subno] != 0); 478 } 479 EMIT(ORPAREN, subno); 480 (void)MUSTEAT(')', REG_EPAREN); 481 break; 482#ifndef POSIX_MISTAKE 483 case ')': /* happens only if no current unmatched ( */ 484 /* 485 * You may ask, why the ifndef? Because I didn't notice 486 * this until slightly too late for 1003.2, and none of the 487 * other 1003.2 regular-expression reviewers noticed it at 488 * all. So an unmatched ) is legal POSIX, at least until 489 * we can get it fixed. 490 */ 491 SETERROR(REG_EPAREN); 492 break; 493#endif 494 case '^': 495 EMIT(OBOL, 0); 496 p->g->iflags |= USEBOL; 497 p->g->nbol++; 498 wascaret = 1; 499 break; 500 case '$': 501 EMIT(OEOL, 0); 502 p->g->iflags |= USEEOL; 503 p->g->neol++; 504 break; 505 case '|': 506 SETERROR(REG_EMPTY); 507 break; 508 case '*': 509 case '+': 510 case '?': 511 case '{': 512 SETERROR(REG_BADRPT); 513 break; 514 case '.': 515 if (p->g->cflags®_NEWLINE) 516 nonnewline(p); 517 else 518 EMIT(OANY, 0); 519 break; 520 case '[': 521 p_bracket(p); 522 break; 523 case '\\': 524 (void)REQUIRE(MORE(), REG_EESCAPE); 525 wc = WGETNEXT(); 526#ifdef REGEX_GNU_EXTENSIONS 527 if (p->gnuext) { 528 handled = 1; 529 switch (wc) { 530 case '`': 531 EMIT(OBOS, 0); 532 break; 533 case '\'': 534 EMIT(OEOS, 0); 535 break; 536 case 'B': 537 EMIT(ONWBND, 0); 538 break; 539 case 'b': 540 EMIT(OWBND, 0); 541 break; 542 case 'W': 543 case 'w': 544 case 'S': 545 case 's': 546 p_b_pseudoclass(p, wc); 547 break; 548 case 'a': 549 ordinary(p, '\a'); 550 break; 551 case 'e': 552 ordinary(p, '\e'); 553 break; 554 case 'f': 555 ordinary(p, '\f'); 556 break; 557 case 'n': 558 ordinary(p, '\n'); 559 break; 560 case 'r': 561 ordinary(p, '\r'); 562 break; 563 case 't': 564 ordinary(p, '\t'); 565 break; 566 case 'v': 567 ordinary(p, '\v'); 568 break; 569 case '1': 570 case '2': 571 case '3': 572 case '4': 573 case '5': 574 case '6': 575 case '7': 576 case '8': 577 case '9': 578 i = wc - '0'; 579 assert(i < NPAREN); 580 if (p->pend[i] != 0) { 581 assert(i <= p->g->nsub); 582 EMIT(OBACK_, i); 583 assert(p->pbegin[i] != 0); 584 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 585 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 586 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 587 EMIT(O_BACK, i); 588 } else 589 SETERROR(REG_ESUBREG); 590 p->g->backrefs = 1; 591 break; 592 default: 593 handled = 0; 594 } 595 /* Don't proceed to the POSIX bits if we've already handled it */ 596 if (handled) 597 break; 598 } 599#endif 600 switch (wc) { 601 case '<': 602 EMIT(OBOW, 0); 603 break; 604 case '>': 605 EMIT(OEOW, 0); 606 break; 607 default: 608 if (may_escape(p, wc)) 609 ordinary(p, wc); 610 else 611 SETERROR(REG_EESCAPE); 612 break; 613 } 614 break; 615 default: 616 if (p->error != 0) 617 return (false); 618 p->next--; 619 wc = WGETNEXT(); 620 ordinary(p, wc); 621 break; 622 } 623 624 if (!MORE()) 625 return (false); 626 c = PEEK(); 627 /* we call { a repetition if followed by a digit */ 628 if (!( c == '*' || c == '+' || c == '?' || c == '{')) 629 return (false); /* no repetition, we're done */ 630 else if (c == '{') 631 (void)REQUIRE(MORE2() && \ 632 (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT); 633 NEXT(); 634 635 (void)REQUIRE(!wascaret, REG_BADRPT); 636 switch (c) { 637 case '*': /* implemented as +? */ 638 /* this case does not require the (y|) trick, noKLUDGE */ 639 INSERT(OPLUS_, pos); 640 ASTERN(O_PLUS, pos); 641 INSERT(OQUEST_, pos); 642 ASTERN(O_QUEST, pos); 643 break; 644 case '+': 645 INSERT(OPLUS_, pos); 646 ASTERN(O_PLUS, pos); 647 break; 648 case '?': 649 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 650 INSERT(OCH_, pos); /* offset slightly wrong */ 651 ASTERN(OOR1, pos); /* this one's right */ 652 AHEAD(pos); /* fix the OCH_ */ 653 EMIT(OOR2, 0); /* offset very wrong... */ 654 AHEAD(THERE()); /* ...so fix it */ 655 ASTERN(O_CH, THERETHERE()); 656 break; 657 case '{': 658 count = p_count(p); 659 if (EAT(',')) { 660 if (isdigit((uch)PEEK())) { 661 count2 = p_count(p); 662 (void)REQUIRE(count <= count2, REG_BADBR); 663 } else /* single number with comma */ 664 count2 = INFINITY; 665 } else /* just a single number */ 666 count2 = count; 667 repeat(p, pos, count, count2); 668 if (!EAT('}')) { /* error heuristics */ 669 while (MORE() && PEEK() != '}') 670 NEXT(); 671 (void)REQUIRE(MORE(), REG_EBRACE); 672 SETERROR(REG_BADBR); 673 } 674 break; 675 } 676 677 if (!MORE()) 678 return (false); 679 c = PEEK(); 680 if (!( c == '*' || c == '+' || c == '?' || 681 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) 682 return (false); 683 SETERROR(REG_BADRPT); 684 return (false); 685} 686 687/* 688 - p_str - string (no metacharacters) "parser" 689 == static void p_str(struct parse *p); 690 */ 691static void 692p_str(struct parse *p) 693{ 694 (void)REQUIRE(MORE(), REG_EMPTY); 695 while (MORE()) 696 ordinary(p, WGETNEXT()); 697} 698 699/* 700 * Eat consecutive branch delimiters for the kind of expression that we are 701 * parsing, return the number of delimiters that we ate. 702 */ 703static int 704p_branch_eat_delim(struct parse *p, struct branchc *bc) 705{ 706 int nskip; 707 708 (void)bc; 709 nskip = 0; 710 while (EATSPEC('|')) 711 ++nskip; 712 return (nskip); 713} 714 715/* 716 * Insert necessary branch book-keeping operations. This emits a 717 * bogus 'next' offset, since we still have more to parse 718 */ 719static void 720p_branch_ins_offset(struct parse *p, struct branchc *bc) 721{ 722 723 if (bc->nbranch == 0) { 724 INSERT(OCH_, bc->start); /* offset is wrong */ 725 bc->fwd = bc->start; 726 bc->back = bc->start; 727 } 728 729 ASTERN(OOR1, bc->back); 730 bc->back = THERE(); 731 AHEAD(bc->fwd); /* fix previous offset */ 732 bc->fwd = HERE(); 733 EMIT(OOR2, 0); /* offset is very wrong */ 734 ++bc->nbranch; 735} 736 737/* 738 * Fix the offset of the tail branch, if we actually had any branches. 739 * This is to correct the bogus placeholder offset that we use. 740 */ 741static void 742p_branch_fix_tail(struct parse *p, struct branchc *bc) 743{ 744 745 /* Fix bogus offset at the tail if we actually have branches */ 746 if (bc->nbranch > 0) { 747 AHEAD(bc->fwd); 748 ASTERN(O_CH, bc->back); 749 } 750} 751 752/* 753 * Signal to the parser that an empty branch has been encountered; this will, 754 * in the future, be used to allow for more permissive behavior with empty 755 * branches. The return value should indicate whether parsing may continue 756 * or not. 757 */ 758static bool 759p_branch_empty(struct parse *p, struct branchc *bc) 760{ 761 762 (void)bc; 763 SETERROR(REG_EMPTY); 764 return (false); 765} 766 767/* 768 * Take care of any branching requirements. This includes inserting the 769 * appropriate branching instructions as well as eating all of the branch 770 * delimiters until we either run out of pattern or need to parse more pattern. 771 */ 772static bool 773p_branch_do(struct parse *p, struct branchc *bc) 774{ 775 int ate = 0; 776 777 ate = p_branch_eat_delim(p, bc); 778 if (ate == 0) 779 return (false); 780 else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc)) 781 /* 782 * Halt parsing only if we have an empty branch and p_branch_empty 783 * indicates that we must not continue. In the future, this will not 784 * necessarily be an error. 785 */ 786 return (false); 787 p_branch_ins_offset(p, bc); 788 789 return (true); 790} 791 792static void 793p_bre_pre_parse(struct parse *p, struct branchc *bc) 794{ 795 796 (void)bc; 797 /* 798 * Does not move cleanly into expression parser because of 799 * ordinary interpration of * at the beginning position of 800 * an expression. 801 */ 802 if (EAT('^')) { 803 EMIT(OBOL, 0); 804 p->g->iflags |= USEBOL; 805 p->g->nbol++; 806 } 807} 808 809static void 810p_bre_post_parse(struct parse *p, struct branchc *bc) 811{ 812 813 /* Expression is terminating due to EOL token */ 814 if (bc->terminate) { 815 DROP(1); 816 EMIT(OEOL, 0); 817 p->g->iflags |= USEEOL; 818 p->g->neol++; 819 } 820} 821 822/* 823 - p_re - Top level parser, concatenation and BRE anchoring 824 == static void p_re(struct parse *p, int end1, int end2); 825 * Giving end1 as OUT essentially eliminates the end1/end2 check. 826 * 827 * This implementation is a bit of a kludge, in that a trailing $ is first 828 * taken as an ordinary character and then revised to be an anchor. 829 * The amount of lookahead needed to avoid this kludge is excessive. 830 */ 831static void 832p_re(struct parse *p, 833 int end1, /* first terminating character */ 834 int end2) /* second terminating character; ignored for EREs */ 835{ 836 struct branchc bc; 837 838 bc.nbranch = 0; 839 if (end1 == OUT && end2 == OUT) 840 bc.outer = true; 841 else 842 bc.outer = false; 843#define SEEEND() (!p->bre ? SEE(end1) : SEETWO(end1, end2)) 844 for (;;) { 845 bc.start = HERE(); 846 bc.nchain = 0; 847 bc.terminate = false; 848 if (p->pre_parse != NULL) 849 p->pre_parse(p, &bc); 850 while (MORE() && (!p->allowbranch || !SEESPEC('|')) && !SEEEND()) { 851 bc.terminate = p->parse_expr(p, &bc); 852 ++bc.nchain; 853 } 854 if (p->post_parse != NULL) 855 p->post_parse(p, &bc); 856 (void) REQUIRE(p->gnuext || HERE() != bc.start, REG_EMPTY); 857#ifdef REGEX_GNU_EXTENSIONS 858 if (p->gnuext && HERE() == bc.start && !p_branch_empty(p, &bc)) 859 break; 860#endif 861 if (!p->allowbranch) 862 break; 863 /* 864 * p_branch_do's return value indicates whether we should 865 * continue parsing or not. This is both for correctness and 866 * a slight optimization, because it will check if we've 867 * encountered an empty branch or the end of the string 868 * immediately following a branch delimiter. 869 */ 870 if (!p_branch_do(p, &bc)) 871 break; 872 } 873#undef SEE_END 874 if (p->allowbranch) 875 p_branch_fix_tail(p, &bc); 876 assert(!MORE() || SEE(end1)); 877} 878 879/* 880 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 881 == static bool p_simp_re(struct parse *p, struct branchc *bc); 882 */ 883static bool /* was the simple RE an unbackslashed $? */ 884p_simp_re(struct parse *p, struct branchc *bc) 885{ 886 int c; 887 int cc; /* convenient/control character */ 888 int count; 889 int count2; 890 sopno pos; 891 bool handled; 892 size_t i; 893 wint_t wc; 894 sopno subno; 895# define BACKSL (1<<CHAR_BIT) 896 897 pos = HERE(); /* repetition op, if any, covers from here */ 898 handled = false; 899 900 assert(MORE()); /* caller should have ensured this */ 901 c = (uch)GETNEXT(); 902 if (c == '\\') { 903 (void)REQUIRE(MORE(), REG_EESCAPE); 904 cc = (uch)GETNEXT(); 905 c = BACKSL | cc; 906#ifdef REGEX_GNU_EXTENSIONS 907 if (p->gnuext) { 908 handled = true; 909 switch (c) { 910 case BACKSL|'`': 911 EMIT(OBOS, 0); 912 break; 913 case BACKSL|'\'': 914 EMIT(OEOS, 0); 915 break; 916 case BACKSL|'B': 917 EMIT(ONWBND, 0); 918 break; 919 case BACKSL|'b': 920 EMIT(OWBND, 0); 921 break; 922 case BACKSL|'W': 923 case BACKSL|'w': 924 case BACKSL|'S': 925 case BACKSL|'s': 926 p_b_pseudoclass(p, cc); 927 break; 928 case BACKSL|'a': 929 ordinary(p, '\a'); 930 break; 931 case BACKSL|'e': 932 ordinary(p, '\e'); 933 break; 934 case BACKSL|'f': 935 ordinary(p, '\f'); 936 break; 937 case BACKSL|'n': 938 ordinary(p, '\n'); 939 break; 940 case BACKSL|'r': 941 ordinary(p, '\r'); 942 break; 943 case BACKSL|'t': 944 ordinary(p, '\t'); 945 break; 946 case BACKSL|'v': 947 ordinary(p, '\v'); 948 break; 949 default: 950 handled = false; 951 } 952 } 953#endif 954 } 955 if (!handled) { 956 switch (c) { 957 case '.': 958 if (p->g->cflags®_NEWLINE) 959 nonnewline(p); 960 else 961 EMIT(OANY, 0); 962 break; 963 case '[': 964 p_bracket(p); 965 break; 966 case BACKSL|'<': 967 EMIT(OBOW, 0); 968 break; 969 case BACKSL|'>': 970 EMIT(OEOW, 0); 971 break; 972 case BACKSL|'{': 973 SETERROR(REG_BADRPT); 974 break; 975 case BACKSL|'(': 976 p->g->nsub++; 977 subno = (sopno)p->g->nsub; 978 if (subno < NPAREN) 979 p->pbegin[subno] = HERE(); 980 EMIT(OLPAREN, subno); 981 /* the MORE here is an error heuristic */ 982 if (MORE() && !SEETWO('\\', ')')) 983 p_re(p, '\\', ')'); 984 if (subno < NPAREN) { 985 p->pend[subno] = HERE(); 986 assert(p->pend[subno] != 0); 987 } 988 EMIT(ORPAREN, subno); 989 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 990 break; 991 case BACKSL|')': /* should not get here -- must be user */ 992 SETERROR(REG_EPAREN); 993 break; 994 case BACKSL|'1': 995 case BACKSL|'2': 996 case BACKSL|'3': 997 case BACKSL|'4': 998 case BACKSL|'5': 999 case BACKSL|'6': 1000 case BACKSL|'7': 1001 case BACKSL|'8': 1002 case BACKSL|'9': 1003 i = (c&~BACKSL) - '0'; 1004 assert(i < NPAREN); 1005 if (p->pend[i] != 0) { 1006 assert(i <= p->g->nsub); 1007 EMIT(OBACK_, i); 1008 assert(p->pbegin[i] != 0); 1009 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 1010 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 1011 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 1012 EMIT(O_BACK, i); 1013 } else 1014 SETERROR(REG_ESUBREG); 1015 p->g->backrefs = 1; 1016 break; 1017 case '*': 1018 /* 1019 * Ordinary if used as the first character beyond BOL anchor of 1020 * a (sub-)expression, counts as a bad repetition operator if it 1021 * appears otherwise. 1022 */ 1023 (void)REQUIRE(bc->nchain == 0, REG_BADRPT); 1024 /* FALLTHROUGH */ 1025 default: 1026 if (p->error != 0) 1027 return (false); /* Definitely not $... */ 1028 p->next--; 1029 wc = WGETNEXT(); 1030 if ((c & BACKSL) == 0 || may_escape(p, wc)) 1031 ordinary(p, wc); 1032 else 1033 SETERROR(REG_EESCAPE); 1034 break; 1035 } 1036 } 1037 1038 if (EAT('*')) { /* implemented as +? */ 1039 /* this case does not require the (y|) trick, noKLUDGE */ 1040 INSERT(OPLUS_, pos); 1041 ASTERN(O_PLUS, pos); 1042 INSERT(OQUEST_, pos); 1043 ASTERN(O_QUEST, pos); 1044#ifdef REGEX_GNU_EXTENSIONS 1045 } else if (p->gnuext && EATTWO('\\', '?')) { 1046 INSERT(OQUEST_, pos); 1047 ASTERN(O_QUEST, pos); 1048 } else if (p->gnuext && EATTWO('\\', '+')) { 1049 INSERT(OPLUS_, pos); 1050 ASTERN(O_PLUS, pos); 1051#endif 1052 } else if (EATTWO('\\', '{')) { 1053 count = p_count(p); 1054 if (EAT(',')) { 1055 if (MORE() && isdigit((uch)PEEK())) { 1056 count2 = p_count(p); 1057 (void)REQUIRE(count <= count2, REG_BADBR); 1058 } else /* single number with comma */ 1059 count2 = INFINITY; 1060 } else /* just a single number */ 1061 count2 = count; 1062 repeat(p, pos, count, count2); 1063 if (!EATTWO('\\', '}')) { /* error heuristics */ 1064 while (MORE() && !SEETWO('\\', '}')) 1065 NEXT(); 1066 (void)REQUIRE(MORE(), REG_EBRACE); 1067 SETERROR(REG_BADBR); 1068 } 1069 } else if (c == '$') /* $ (but not \$) ends it */ 1070 return (true); 1071 1072 return (false); 1073} 1074 1075/* 1076 - p_count - parse a repetition count 1077 == static int p_count(struct parse *p); 1078 */ 1079static int /* the value */ 1080p_count(struct parse *p) 1081{ 1082 int count = 0; 1083 int ndigits = 0; 1084 1085 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { 1086 count = count*10 + ((uch)GETNEXT() - '0'); 1087 ndigits++; 1088 } 1089 1090 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 1091 return(count); 1092} 1093 1094/* 1095 - p_bracket - parse a bracketed character list 1096 == static void p_bracket(struct parse *p); 1097 */ 1098static void 1099p_bracket(struct parse *p) 1100{ 1101 cset *cs; 1102 wint_t ch; 1103 1104 /* Dept of Truly Sickening Special-Case Kludges */ 1105 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 1106 EMIT(OBOW, 0); 1107 NEXTn(6); 1108 return; 1109 } 1110 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 1111 EMIT(OEOW, 0); 1112 NEXTn(6); 1113 return; 1114 } 1115 1116 if ((cs = allocset(p)) == NULL) 1117 return; 1118 1119 if (p->g->cflags®_ICASE) 1120 cs->icase = 1; 1121 if (EAT('^')) 1122 cs->invert = 1; 1123 if (EAT(']')) 1124 CHadd(p, cs, ']'); 1125 else if (EAT('-')) 1126 CHadd(p, cs, '-'); 1127 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 1128 p_b_term(p, cs); 1129 if (EAT('-')) 1130 CHadd(p, cs, '-'); 1131 (void)MUSTEAT(']', REG_EBRACK); 1132 1133 if (p->error != 0) /* don't mess things up further */ 1134 return; 1135 1136 if (cs->invert && p->g->cflags®_NEWLINE) 1137 cs->bmp['\n' >> 3] |= 1 << ('\n' & 7); 1138 1139 if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */ 1140 ordinary(p, ch); 1141 freeset(p, cs); 1142 } else 1143 EMIT(OANYOF, (size_t)(cs - p->g->sets)); 1144} 1145 1146static int 1147p_range_cmp(wchar_t c1, wchar_t c2) 1148{ 1149#ifdef REGEX_LIBC_COLLATE 1150 return __wcollate_range_cmp(c1, c2); 1151#elif defined(NLS) 1152 /* Copied from libc/collate __wcollate_range_cmp */ 1153 wchar_t s1[2], s2[2]; 1154 1155 s1[0] = c1; 1156 s1[1] = L'\0'; 1157 s2[0] = c2; 1158 s2[1] = L'\0'; 1159 return wcscoll(s1, s2); 1160#else 1161 char s1[2], s2[2]; 1162 1163 s1[0] = (char)c1; 1164 s1[1] = '\0'; 1165 s2[0] = (char)c2; 1166 s2[1] = '\0'; 1167 return strcoll(s1, s2); 1168#endif 1169} 1170 1171/* 1172 - p_b_term - parse one term of a bracketed character list 1173 == static void p_b_term(struct parse *p, cset *cs); 1174 */ 1175static void 1176p_b_term(struct parse *p, cset *cs) 1177{ 1178 char c; 1179 wint_t start, finish; 1180 wint_t i; 1181#ifdef REGEX_LIBC_COLLATE 1182 struct xlocale_collate *table = 1183 (struct xlocale_collate*)__get_locale()->components[XLC_COLLATE]; 1184#endif 1185 1186 _DIAGASSERT(p != NULL); 1187 _DIAGASSERT(cs != NULL); 1188 1189 /* classify what we've got */ 1190 switch ((MORE()) ? PEEK() : '\0') { 1191 case '[': 1192 c = (MORE2()) ? PEEK2() : '\0'; 1193 break; 1194 case '-': 1195 SETERROR(REG_ERANGE); 1196 return; /* NOTE RETURN */ 1197 default: 1198 c = '\0'; 1199 break; 1200 } 1201 1202 switch (c) { 1203 case ':': /* character class */ 1204 NEXT2(); 1205 (void)REQUIRE(MORE(), REG_EBRACK); 1206 c = PEEK(); 1207 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); 1208 p_b_cclass(p, cs); 1209 (void)REQUIRE(MORE(), REG_EBRACK); 1210 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 1211 break; 1212 case '=': /* equivalence class */ 1213 NEXT2(); 1214 (void)REQUIRE(MORE(), REG_EBRACK); 1215 c = PEEK(); 1216 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 1217 p_b_eclass(p, cs); 1218 (void)REQUIRE(MORE(), REG_EBRACK); 1219 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 1220 break; 1221 default: /* symbol, ordinary character, or range */ 1222 start = p_b_symbol(p); 1223 if (SEE('-') && MORE2() && PEEK2() != ']') { 1224 /* range */ 1225 NEXT(); 1226 if (EAT('-')) 1227 finish = '-'; 1228 else 1229 finish = p_b_symbol(p); 1230 } else 1231 finish = start; 1232 if (start == finish) 1233 CHadd(p, cs, start); 1234 else { 1235#ifdef REGEX_LIBC_COLLATE 1236 if (table->__collate_load_error || MB_CUR_MAX > 1) { 1237#else 1238 if (MB_CUR_MAX > 1) { 1239#endif 1240 (void)REQUIRE(start <= finish, REG_ERANGE); 1241 CHaddrange(p, cs, start, finish); 1242 } else { 1243 (void)REQUIRE(p_range_cmp(start, finish) <= 0, REG_ERANGE); 1244 for (i = 0; i <= UCHAR_MAX; i++) { 1245 if (p_range_cmp(start, i) <= 0 && 1246 p_range_cmp(i, finish) <= 0 ) 1247 CHadd(p, cs, i); 1248 } 1249 } 1250 } 1251 break; 1252 } 1253} 1254 1255#ifdef REGEX_GNU_EXTENSIONS 1256/* 1257 - p_b_pseudoclass - parse a pseudo-class (\w, \W, \s, \S) 1258 == static int p_b_pseudoclass(struct parse *p, char c) 1259 */ 1260static int 1261p_b_pseudoclass(struct parse *p, char c) { 1262 cset *cs; 1263 1264 if ((cs = allocset(p)) == NULL) 1265 return(0); 1266 1267 if (p->g->cflags®_ICASE) 1268 cs->icase = 1; 1269 1270 switch (c) { 1271 case 'W': 1272 cs->invert = 1; 1273 /* FALLTHROUGH */ 1274 case 'w': 1275 p_b_cclass_named(p, cs, "alnum"); 1276 break; 1277 case 'S': 1278 cs->invert = 1; 1279 /* FALLTHROUGH */ 1280 case 's': 1281 p_b_cclass_named(p, cs, "space"); 1282 break; 1283 default: 1284 return(0); 1285 } 1286 1287 EMIT(OANYOF, (size_t)(cs - p->g->sets)); 1288 return(1); 1289} 1290#endif 1291 1292/* 1293 - p_b_cclass - parse a character-class name and deal with it 1294 == static void p_b_cclass(struct parse *p, cset *cs); 1295 */ 1296static void 1297p_b_cclass(struct parse *p, cset *cs) 1298{ 1299 const char *sp = p->next; 1300 size_t len; 1301 char clname[16]; 1302 1303 while (MORE() && isalpha((uch)PEEK())) 1304 NEXT(); 1305 len = p->next - sp; 1306 if (len >= sizeof(clname) - 1) { 1307 SETERROR(REG_ECTYPE); 1308 return; 1309 } 1310 memcpy(clname, sp, len); 1311 clname[len] = '\0'; 1312 1313 p_b_cclass_named(p, cs, clname); 1314} 1315 1316/* 1317 - p_b_cclass_named - deal with a named character class 1318 == static void p_b_cclass_named(struct parse *p, cset *cs, const char []); 1319 */ 1320static void 1321p_b_cclass_named(struct parse *p, cset *cs, const char clname[]) { 1322 wctype_t wct; 1323 1324 if ((wct = wctype(clname)) == 0) { 1325 SETERROR(REG_ECTYPE); 1326 return; 1327 } 1328 CHaddtype(p, cs, wct); 1329} 1330 1331/* 1332 - p_b_eclass - parse an equivalence-class name and deal with it 1333 == static void p_b_eclass(struct parse *p, cset *cs); 1334 * 1335 * This implementation is incomplete. xxx 1336 */ 1337static void 1338p_b_eclass(struct parse *p, cset *cs) 1339{ 1340 wint_t c; 1341 1342 _DIAGASSERT(p != NULL); 1343 _DIAGASSERT(cs != NULL); 1344 1345 c = p_b_coll_elem(p, '='); 1346 CHadd(p, cs, c); 1347} 1348 1349/* 1350 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 1351 == static wint_t p_b_symbol(struct parse *p); 1352 */ 1353static wint_t /* value of symbol */ 1354p_b_symbol(struct parse *p) 1355{ 1356 wint_t value; 1357 1358 _DIAGASSERT(p != NULL); 1359 1360 (void)REQUIRE(MORE(), REG_EBRACK); 1361 if (!EATTWO('[', '.')) 1362 return(WGETNEXT()); 1363 1364 /* collating symbol */ 1365 value = p_b_coll_elem(p, '.'); 1366 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 1367 return(value); 1368} 1369 1370/* 1371 - p_b_coll_elem - parse a collating-element name and look it up 1372 == static wint_t p_b_coll_elem(struct parse *p, wint_t endc); 1373 */ 1374static wint_t /* value of collating element */ 1375p_b_coll_elem(struct parse *p, 1376 wint_t endc) /* name ended by endc,']' */ 1377{ 1378 const char *sp = p->next; 1379 struct cname *cp; 1380 size_t len; 1381 1382 _DIAGASSERT(p != NULL); 1383 1384 while (MORE() && !SEETWO(endc, ']')) 1385 NEXT(); 1386 if (!MORE()) { 1387 SETERROR(REG_EBRACK); 1388 return(0); 1389 } 1390 len = p->next - sp; 1391 for (cp = cnames; cp->name != NULL; cp++) 1392 if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len) 1393 return(cp->code); /* known name */ 1394#ifdef NLS 1395 mbstate_t mbs; 1396 wchar_t wc; 1397 size_t clen; 1398 1399 memset(&mbs, 0, sizeof(mbs)); 1400 if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len) 1401 return (wc); /* single character */ 1402 else if (clen == (size_t)-1 || clen == (size_t)-2) 1403 SETERROR(REG_ILLSEQ); 1404 else 1405 SETERROR(REG_ECOLLATE); /* neither */ 1406 return(0); 1407#else 1408 if (len == 1) 1409 return *sp; /* single character */ 1410 SETERROR(REG_ECOLLATE); /* neither */ 1411 return 0; 1412#endif 1413} 1414 1415/* 1416 - may_escape - determine whether 'ch' is escape-able in the current context 1417 == static int may_escape(struct parse *p, const wint_t ch) 1418 */ 1419static bool 1420may_escape(struct parse *p, const wint_t ch) 1421{ 1422 1423 if ((p->pflags & PFLAG_LEGACY_ESC) != 0) 1424 return (true); 1425 if (iswalpha(ch) || ch == '\'' || ch == '`') 1426 return (false); 1427 return (true); 1428#ifdef NOTYET 1429 /* 1430 * Build a whitelist of characters that may be escaped to produce an 1431 * ordinary in the current context. This assumes that these have not 1432 * been otherwise interpreted as a special character. Escaping an 1433 * ordinary character yields undefined results according to 1434 * IEEE 1003.1-2008. Some extensions (notably, some GNU extensions) take 1435 * advantage of this and use escaped ordinary characters to provide 1436 * special meaning, e.g. \b, \B, \w, \W, \s, \S. 1437 */ 1438 switch(ch) { 1439 case '|': 1440 case '+': 1441 case '?': 1442 /* The above characters may not be escaped in BREs */ 1443 if (!(p->g->cflags®_EXTENDED)) 1444 return (false); 1445 /* Fallthrough */ 1446 case '(': 1447 case ')': 1448 case '{': 1449 case '}': 1450 case '.': 1451 case '[': 1452 case ']': 1453 case '\\': 1454 case '*': 1455 case '^': 1456 case '$': 1457 return (true); 1458 default: 1459 return (false); 1460 } 1461#endif 1462} 1463 1464/* 1465 - othercase - return the case counterpart of an alphabetic 1466 == static wint_t othercase(wint_t ch); 1467 */ 1468static wint_t /* if no counterpart, return ch */ 1469othercase(wint_t ch) 1470{ 1471 assert(iswalpha(ch)); 1472 if (iswupper(ch)) 1473 return(towlower(ch)); 1474 else if (iswlower(ch)) 1475 return(towupper(ch)); 1476 else /* peculiar, but could happen */ 1477 return(ch); 1478} 1479 1480/* 1481 - bothcases - emit a dualcase version of a two-case character 1482 == static void bothcases(struct parse *p, wint_t ch); 1483 * 1484 * Boy, is this implementation ever a kludge... 1485 */ 1486static void 1487bothcases(struct parse *p, wint_t ch) 1488{ 1489 const char *oldnext = p->next; 1490 const char *oldend = p->end; 1491 char bracket[3 + MB_LEN_MAX]; 1492 size_t n; 1493 1494 _DIAGASSERT(p != NULL); 1495 1496 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 1497 p->next = bracket; 1498#ifdef NLS 1499 mbstate_t mbs; 1500 memset(&mbs, 0, sizeof(mbs)); 1501 n = wcrtomb(bracket, ch, &mbs); 1502 assert(n != (size_t)-1); 1503#else 1504 n = 0; 1505 bracket[n++] = ch; 1506#endif 1507 bracket[n] = ']'; 1508 bracket[n + 1] = '\0'; 1509 p->end = bracket+n+1; 1510 p_bracket(p); 1511 assert(p->next == p->end); 1512 p->next = oldnext; 1513 p->end = oldend; 1514} 1515 1516/* 1517 - ordinary - emit an ordinary character 1518 == static void ordinary(struct parse *p, wint_t ch); 1519 */ 1520static void 1521ordinary(struct parse *p, wint_t ch) 1522{ 1523 cset *cs; 1524 1525 _DIAGASSERT(p != NULL); 1526 1527 if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch) 1528 bothcases(p, ch); 1529 else if ((wint_t)(ch & OPDMASK) == ch) 1530 EMIT(OCHAR, (size_t)ch); 1531 else { 1532 /* 1533 * Kludge: character is too big to fit into an OCHAR operand. 1534 * Emit a singleton set. 1535 */ 1536 if ((cs = allocset(p)) == NULL) 1537 return; 1538 CHadd(p, cs, ch); 1539 EMIT(OANYOF, (size_t)(cs - p->g->sets)); 1540 } 1541} 1542 1543/* 1544 - nonnewline - emit REG_NEWLINE version of OANY 1545 == static void nonnewline(struct parse *p); 1546 * 1547 * Boy, is this implementation ever a kludge... 1548 */ 1549static void 1550nonnewline(struct parse *p) 1551{ 1552 const char *oldnext = p->next; 1553 const char *oldend = p->end; 1554 char bracket[4]; 1555 1556 _DIAGASSERT(p != NULL); 1557 1558 p->next = bracket; 1559 p->end = bracket+3; 1560 bracket[0] = '^'; 1561 bracket[1] = '\n'; 1562 bracket[2] = ']'; 1563 bracket[3] = '\0'; 1564 p_bracket(p); 1565 assert(p->next == bracket+3); 1566 p->next = oldnext; 1567 p->end = oldend; 1568} 1569 1570/* 1571 - repeat - generate code for a bounded repetition, recursively if needed 1572 == static void repeat(struct parse *p, sopno start, int from, int to); 1573 */ 1574static void 1575repeat(struct parse *p, 1576 sopno start, /* operand from here to end of strip */ 1577 int from, /* repeated from this number */ 1578 int to) /* to this number of times (maybe INFINITY) */ 1579{ 1580 sopno finish = HERE(); 1581# define N 2 1582# define INF 3 1583# define REP(f, t) ((f)*8 + (t)) 1584# define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1585 sopno copy; 1586 1587 _DIAGASSERT(p != NULL); 1588 1589 if (p->error != 0) /* head off possible runaway recursion */ 1590 return; 1591 1592 assert(from <= to); 1593 1594 switch (REP(MAP(from), MAP(to))) { 1595 case REP(0, 0): /* must be user doing this */ 1596 DROP(finish-start); /* drop the operand */ 1597 break; 1598 case REP(0, 1): /* as x{1,1}? */ 1599 case REP(0, N): /* as x{1,n}? */ 1600 case REP(0, INF): /* as x{1,}? */ 1601 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1602 INSERT(OCH_, start); /* offset is wrong... */ 1603 repeat(p, start+1, 1, to); 1604 ASTERN(OOR1, start); 1605 AHEAD(start); /* ... fix it */ 1606 EMIT(OOR2, 0); 1607 AHEAD(THERE()); 1608 ASTERN(O_CH, THERETHERE()); 1609 break; 1610 case REP(1, 1): /* trivial case */ 1611 /* done */ 1612 break; 1613 case REP(1, N): /* as x?x{1,n-1} */ 1614 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1615 INSERT(OCH_, start); 1616 ASTERN(OOR1, start); 1617 AHEAD(start); 1618 EMIT(OOR2, 0); /* offset very wrong... */ 1619 AHEAD(THERE()); /* ...so fix it */ 1620 ASTERN(O_CH, THERETHERE()); 1621 copy = dupl(p, start+1, finish+1); 1622 assert(copy == finish+4); 1623 repeat(p, copy, 1, to-1); 1624 break; 1625 case REP(1, INF): /* as x+ */ 1626 INSERT(OPLUS_, start); 1627 ASTERN(O_PLUS, start); 1628 break; 1629 case REP(N, N): /* as xx{m-1,n-1} */ 1630 copy = dupl(p, start, finish); 1631 repeat(p, copy, from-1, to-1); 1632 break; 1633 case REP(N, INF): /* as xx{n-1,INF} */ 1634 copy = dupl(p, start, finish); 1635 repeat(p, copy, from-1, to); 1636 break; 1637 default: /* "can't happen" */ 1638 SETERROR(REG_ASSERT); /* just in case */ 1639 break; 1640 } 1641} 1642 1643/* 1644 - wgetnext - helper function for WGETNEXT() macro. Gets the next wide 1645 - character from the parse struct, signals a REG_ILLSEQ error if the 1646 - character can't be converted. Returns the number of bytes consumed. 1647 */ 1648static wint_t 1649wgetnext(struct parse *p) 1650{ 1651#ifdef NLS 1652 mbstate_t mbs; 1653 wchar_t wc; 1654 size_t n; 1655 1656 memset(&mbs, 0, sizeof(mbs)); 1657 n = mbrtowc(&wc, p->next, (size_t)(p->end - p->next), &mbs); 1658 if (n == (size_t)-1 || n == (size_t)-2) { 1659 SETERROR(REG_ILLSEQ); 1660 return (0); 1661 } 1662 if (n == 0) 1663 n = 1; 1664 p->next += n; 1665 return wc; 1666#else 1667 return *p->next++; 1668#endif 1669} 1670 1671/* 1672 - seterr - set an error condition 1673 == static int seterr(struct parse *p, int e); 1674 */ 1675static int /* useless but makes type checking happy */ 1676seterr(struct parse *p, int e) 1677{ 1678 1679 _DIAGASSERT(p != NULL); 1680 1681 if (p->error == 0) /* keep earliest error condition */ 1682 p->error = e; 1683 p->next = nuls; /* try to bring things to a halt */ 1684 p->end = nuls; 1685 return(0); /* make the return value well-defined */ 1686} 1687 1688/* 1689 - allocset - allocate a set of characters for [] 1690 == static cset *allocset(struct parse *p); 1691 */ 1692static cset * 1693allocset(struct parse *p) 1694{ 1695 cset *cs, *ncs; 1696 1697 _DIAGASSERT(p != NULL); 1698 1699 ncs = reallocarray(p->g->sets, p->g->ncsets + 1, sizeof(*ncs)); 1700 if (ncs == NULL) { 1701 SETERROR(REG_ESPACE); 1702 return (NULL); 1703 } 1704 p->g->sets = ncs; 1705 cs = &p->g->sets[p->g->ncsets++]; 1706 memset(cs, 0, sizeof(*cs)); 1707 1708 return(cs); 1709} 1710 1711/* 1712 - freeset - free a now-unused set 1713 == static void freeset(struct parse *p, cset *cs); 1714 */ 1715static void 1716freeset(struct parse *p, cset *cs) 1717{ 1718 cset *top; 1719 1720 _DIAGASSERT(p != NULL); 1721 _DIAGASSERT(cs != NULL); 1722 1723 top = &p->g->sets[p->g->ncsets]; 1724 1725 free(cs->wides); 1726 free(cs->ranges); 1727 free(cs->types); 1728 memset(cs, 0, sizeof(*cs)); 1729 if (cs == top-1) /* recover only the easy case */ 1730 p->g->ncsets--; 1731} 1732 1733/* 1734 - singleton - Determine whether a set contains only one character, 1735 - returning it if so, otherwise returning OUT. 1736 */ 1737static wint_t 1738singleton(cset *cs) 1739{ 1740 wint_t i, s, n; 1741 1742 for (i = n = 0; i < NC; i++) 1743 if (CHIN(cs, i)) { 1744 n++; 1745 s = i; 1746 } 1747 if (n == 1) 1748 return (s); 1749 if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 && 1750 cs->icase == 0) 1751 return (cs->wides[0]); 1752 /* Don't bother handling the other cases. */ 1753 return (OUT); 1754} 1755 1756/* 1757 - CHadd - add character to character set. 1758 */ 1759static void 1760CHadd(struct parse *p, cset *cs, wint_t ch) 1761{ 1762 wint_t nch, *newwides; 1763 1764 _DIAGASSERT(p != NULL); 1765 _DIAGASSERT(cs != NULL); 1766 1767 assert(ch >= 0); 1768 if (ch < NC) 1769 cs->bmp[(unsigned)ch >> 3] |= 1 << (ch & 7); 1770 else { 1771 newwides = reallocarray(cs->wides, cs->nwides + 1, 1772 sizeof(*cs->wides)); 1773 if (newwides == NULL) { 1774 SETERROR(REG_ESPACE); 1775 return; 1776 } 1777 cs->wides = newwides; 1778 cs->wides[cs->nwides++] = ch; 1779 } 1780 if (cs->icase) { 1781 if ((nch = towlower(ch)) < NC) 1782 cs->bmp[(unsigned)nch >> 3] |= 1 << (nch & 7); 1783 if ((nch = towupper(ch)) < NC) 1784 cs->bmp[(unsigned)nch >> 3] |= 1 << (nch & 7); 1785 } 1786} 1787 1788/* 1789 - CHaddrange - add all characters in the range [min,max] to a character set. 1790 */ 1791static void 1792CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max) 1793{ 1794 crange *newranges; 1795 1796 _DIAGASSERT(p != NULL); 1797 _DIAGASSERT(cs != NULL); 1798 1799 for (; min < NC && min <= max; min++) 1800 CHadd(p, cs, min); 1801 if (min >= max) 1802 return; 1803 newranges = reallocarray(cs->ranges, cs->nranges + 1, 1804 sizeof(*cs->ranges)); 1805 if (newranges == NULL) { 1806 SETERROR(REG_ESPACE); 1807 return; 1808 } 1809 cs->ranges = newranges; 1810 cs->ranges[cs->nranges].min = min; 1811 cs->ranges[cs->nranges].max = max; 1812 cs->nranges++; 1813} 1814 1815/* 1816 - CHaddtype - add all characters of a certain type to a character set. 1817 */ 1818static void 1819CHaddtype(struct parse *p, cset *cs, wctype_t wct) 1820{ 1821 wint_t i; 1822 wctype_t *newtypes; 1823 1824 _DIAGASSERT(p != NULL); 1825 _DIAGASSERT(cs != NULL); 1826 1827 for (i = 0; i < NC; i++) 1828 if (iswctype(i, wct)) 1829 CHadd(p, cs, i); 1830 newtypes = reallocarray(cs->types, cs->ntypes + 1, 1831 sizeof(*cs->types)); 1832 if (newtypes == NULL) { 1833 SETERROR(REG_ESPACE); 1834 return; 1835 } 1836 cs->types = newtypes; 1837 cs->types[cs->ntypes++] = wct; 1838} 1839 1840/* 1841 - dupl - emit a duplicate of a bunch of sops 1842 == static sopno dupl(struct parse *p, sopno start, sopno finish); 1843 */ 1844static sopno /* start of duplicate */ 1845dupl(struct parse *p, 1846 sopno start, /* from here */ 1847 sopno finish) /* to this less one */ 1848{ 1849 sopno ret = HERE(); 1850 sopno len = finish - start; 1851 1852 _DIAGASSERT(p != NULL); 1853 1854 assert(finish >= start); 1855 if (len == 0) 1856 return(ret); 1857 if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */ 1858 return(ret); 1859 (void) memcpy(p->strip + p->slen, 1860 p->strip + start, len * sizeof(*p->strip)); 1861 p->slen += len; 1862 return(ret); 1863} 1864 1865/* 1866 - doemit - emit a strip operator 1867 == static void doemit(struct parse *p, sop op, size_t opnd); 1868 * 1869 * It might seem better to implement this as a macro with a function as 1870 * hard-case backup, but it's just too big and messy unless there are 1871 * some changes to the data structures. Maybe later. 1872 */ 1873static void 1874doemit(struct parse *p, sop op, size_t opnd) 1875{ 1876 /* avoid making error situations worse */ 1877 if (p->error != 0) 1878 return; 1879 1880 _DIAGASSERT(p != NULL); 1881 1882 /* deal with oversize operands ("can't happen", more or less) */ 1883 assert(opnd < 1<<OPSHIFT); 1884 1885 /* deal with undersized strip */ 1886 if (p->slen >= p->ssize) 1887 if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */ 1888 return; 1889 1890 /* finally, it's all reduced to the easy case */ 1891 p->strip[p->slen++] = (sopno)SOP(op, opnd); 1892} 1893 1894/* 1895 - doinsert - insert a sop into the strip 1896 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 1897 */ 1898static void 1899doinsert(struct parse *p, sop op, size_t opnd, sopno pos) 1900{ 1901 sopno sn; 1902 sop s; 1903 int i; 1904 1905 _DIAGASSERT(p != NULL); 1906 1907 /* avoid making error situations worse */ 1908 if (p->error != 0) 1909 return; 1910 1911 sn = HERE(); 1912 EMIT(op, opnd); /* do checks, ensure space */ 1913 assert(HERE() == sn+1); 1914 s = p->strip[sn]; 1915 1916 /* adjust paren pointers */ 1917 assert(pos > 0); 1918 for (i = 1; i < NPAREN; i++) { 1919 if (p->pbegin[i] >= pos) { 1920 p->pbegin[i]++; 1921 } 1922 if (p->pend[i] >= pos) { 1923 p->pend[i]++; 1924 } 1925 } 1926 1927 memmove(&p->strip[pos+1], &p->strip[pos], 1928 (HERE()-pos-1)*sizeof(*p->strip)); 1929 p->strip[pos] = s; 1930} 1931 1932/* 1933 - dofwd - complete a forward reference 1934 == static void dofwd(struct parse *p, sopno pos, sop value); 1935 */ 1936static void 1937dofwd(struct parse *p, sopno pos, sop value) 1938{ 1939 1940 _DIAGASSERT(p != NULL); 1941 1942 /* avoid making error situations worse */ 1943 if (p->error != 0) 1944 return; 1945 1946 assert(value < 1<<OPSHIFT); 1947 p->strip[pos] = OP(p->strip[pos]) | value; 1948} 1949 1950/* 1951 - enlarge - enlarge the strip 1952 == static int enlarge(struct parse *p, sopno size); 1953 */ 1954static int 1955enlarge(struct parse *p, sopno size) 1956{ 1957 sop *sp; 1958 1959 _DIAGASSERT(p != NULL); 1960 1961 if (p->ssize >= size) 1962 return 1; 1963 1964 sp = reallocarray(p->strip, size, sizeof(*p->strip)); 1965 if (sp == NULL) { 1966 SETERROR(REG_ESPACE); 1967 return 0; 1968 } 1969 p->strip = sp; 1970 p->ssize = size; 1971 return 1; 1972} 1973 1974/* 1975 - stripsnug - compact the strip 1976 == static void stripsnug(struct parse *p, struct re_guts *g); 1977 */ 1978static void 1979stripsnug(struct parse *p, struct re_guts *g) 1980{ 1981 1982 _DIAGASSERT(p != NULL); 1983 _DIAGASSERT(g != NULL); 1984 1985 g->nstates = p->slen; 1986 g->strip = reallocarray(p->strip, p->slen, sizeof(*p->strip)); 1987 if (g->strip == NULL) { 1988 SETERROR(REG_ESPACE); 1989 g->strip = p->strip; 1990 } 1991} 1992 1993/* 1994 - findmust - fill in must and mlen with longest mandatory literal string 1995 == static void findmust(struct parse *p, struct re_guts *g); 1996 * 1997 * This algorithm could do fancy things like analyzing the operands of | 1998 * for common subsequences. Someday. This code is simple and finds most 1999 * of the interesting cases. 2000 * 2001 * Note that must and mlen got initialized during setup. 2002 */ 2003static void 2004findmust(struct parse *p, struct re_guts *g) 2005{ 2006 sop *scan; 2007 sop *start = NULL; 2008 sop *newstart = NULL; 2009 sopno newlen; 2010 sop s; 2011 char *cp; 2012 int offset; 2013 mbstate_t mbs; 2014 2015 _DIAGASSERT(p != NULL); 2016 _DIAGASSERT(g != NULL); 2017 2018 /* avoid making error situations worse */ 2019 if (p->error != 0) 2020 return; 2021 2022#ifdef notyet 2023 /* 2024 * It's not generally safe to do a ``char'' substring search on 2025 * multibyte character strings, but it's safe for at least 2026 * UTF-8 (see RFC 3629). 2027 */ 2028 if (MB_CUR_MAX > 1 && 2029 strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0) 2030 return; 2031#endif 2032 2033 /* find the longest OCHAR sequence in strip */ 2034 newlen = 0; 2035 offset = 0; 2036 g->moffset = 0; 2037 scan = g->strip + 1; 2038 do { 2039 s = *scan++; 2040 switch (OP(s)) { 2041 case OCHAR: /* sequence member */ 2042 if (newlen == 0) { /* new sequence */ 2043 memset(&mbs, 0, sizeof(mbs)); 2044 newstart = scan - 1; 2045 } 2046#ifdef NLS 2047 char buf[MB_LEN_MAX]; 2048 size_t clen = wcrtomb(buf, (int)OPND(s), &mbs); 2049 if (clen == (size_t)-1) 2050 goto toohard; 2051 newlen += (sopno)clen; 2052#else 2053 newlen++; 2054#endif 2055 break; 2056 case OPLUS_: /* things that don't break one */ 2057 case OLPAREN: 2058 case ORPAREN: 2059 break; 2060 case OQUEST_: /* things that must be skipped */ 2061 case OCH_: 2062 offset = altoffset(scan, offset); 2063 scan--; 2064 do { 2065 scan += OPND(s); 2066 s = *scan; 2067 /* assert() interferes w debug printouts */ 2068 if (OP(s) != O_QUEST && 2069 OP(s) != O_CH && OP(s) != OOR2) { 2070 g->iflags |= BAD; 2071 return; 2072 } 2073 } while (OP(s) != O_QUEST && OP(s) != O_CH); 2074 /* FALLTHROUGH */ 2075 case OBOW: /* things that break a sequence */ 2076 case OEOW: 2077 case OBOL: 2078 case OEOL: 2079 case OBOS: 2080 case OEOS: 2081 case OWBND: 2082 case ONWBND: 2083 case O_QUEST: 2084 case O_CH: 2085 case OEND: 2086 if (newlen > (sopno)g->mlen) { /* ends one */ 2087 start = newstart; 2088 g->mlen = newlen; 2089 if (offset > -1) { 2090 g->moffset += offset; 2091 offset = newlen; 2092 } else 2093 g->moffset = offset; 2094 } else { 2095 if (offset > -1) 2096 offset += newlen; 2097 } 2098 newlen = 0; 2099 break; 2100 case OANY: 2101 if (newlen > (sopno)g->mlen) { /* ends one */ 2102 start = newstart; 2103 g->mlen = newlen; 2104 if (offset > -1) { 2105 g->moffset += offset; 2106 offset = newlen; 2107 } else 2108 g->moffset = offset; 2109 } else { 2110 if (offset > -1) 2111 offset += newlen; 2112 } 2113 if (offset > -1) 2114 offset++; 2115 newlen = 0; 2116 break; 2117 case OANYOF: /* may or may not invalidate offset */ 2118 /* First, everything as OANY */ 2119 if (newlen > (sopno)g->mlen) { /* ends one */ 2120 start = newstart; 2121 g->mlen = newlen; 2122 if (offset > -1) { 2123 g->moffset += offset; 2124 offset = newlen; 2125 } else 2126 g->moffset = offset; 2127 } else { 2128 if (offset > -1) 2129 offset += newlen; 2130 } 2131 if (offset > -1) 2132 offset++; 2133 newlen = 0; 2134 break; 2135#ifdef NLS 2136 toohard:/*FALLTHROUGH*/ 2137#endif 2138 default: 2139 /* Anything here makes it impossible or too hard 2140 * to calculate the offset -- so we give up; 2141 * save the last known good offset, in case the 2142 * must sequence doesn't occur later. 2143 */ 2144 if (newlen > (sopno)g->mlen) { /* ends one */ 2145 start = newstart; 2146 g->mlen = newlen; 2147 if (offset > -1) 2148 g->moffset += offset; 2149 else 2150 g->moffset = offset; 2151 } 2152 offset = -1; 2153 newlen = 0; 2154 break; 2155 } 2156 } while (OP(s) != OEND); 2157 2158 if (g->mlen == 0) { /* there isn't one */ 2159 g->moffset = -1; 2160 return; 2161 } 2162 2163 /* turn it into a character string */ 2164 g->must = malloc((size_t)g->mlen + 1); 2165 if (g->must == NULL) { /* argh; just forget it */ 2166 g->mlen = 0; 2167 g->moffset = -1; 2168 return; 2169 } 2170 cp = g->must; 2171 scan = start; 2172 memset(&mbs, 0, sizeof(mbs)); 2173 while (cp < g->must + g->mlen) { 2174 while (OP(s = *scan++) != OCHAR) 2175 continue; 2176#ifdef NLS 2177 size_t clen = wcrtomb(cp, (int)OPND(s), &mbs); 2178 assert(clen != (size_t)-1); 2179 cp += clen; 2180#else 2181 *cp++ = OPND(s); 2182#endif 2183 } 2184 assert(cp == g->must + g->mlen); 2185 *cp++ = '\0'; /* just on general principles */ 2186} 2187 2188/* 2189 - altoffset - choose biggest offset among multiple choices 2190 == static int altoffset(sop *scan, int offset); 2191 * 2192 * Compute, recursively if necessary, the largest offset among multiple 2193 * re paths. 2194 */ 2195static int 2196altoffset(sop *scan, int offset) 2197{ 2198 int largest; 2199 int try; 2200 sop s; 2201 2202 _DIAGASSERT(scan != NULL); 2203 2204 /* If we gave up already on offsets, return */ 2205 if (offset == -1) 2206 return -1; 2207 2208 largest = 0; 2209 try = 0; 2210 s = *scan++; 2211 while (OP(s) != O_QUEST && OP(s) != O_CH) { 2212 switch (OP(s)) { 2213 case OOR1: 2214 if (try > largest) 2215 largest = try; 2216 try = 0; 2217 break; 2218 case OQUEST_: 2219 case OCH_: 2220 try = altoffset(scan, try); 2221 if (try == -1) 2222 return -1; 2223 scan--; 2224 do { 2225 scan += OPND(s); 2226 s = *scan; 2227 if (OP(s) != O_QUEST && 2228 OP(s) != O_CH && OP(s) != OOR2) 2229 return -1; 2230 } while (OP(s) != O_QUEST && OP(s) != O_CH); 2231 /* We must skip to the next position, or we'll 2232 * leave altoffset() too early. 2233 */ 2234 scan++; 2235 break; 2236 case OANYOF: 2237 case OCHAR: 2238 case OANY: 2239 try++; 2240 /*FALLTHROUGH*/ 2241 case OBOW: 2242 case OEOW: 2243 case OWBND: 2244 case ONWBND: 2245 case OLPAREN: 2246 case ORPAREN: 2247 case OOR2: 2248 break; 2249 default: 2250 try = -1; 2251 break; 2252 } 2253 if (try == -1) 2254 return -1; 2255 s = *scan++; 2256 } 2257 2258 if (try > largest) 2259 largest = try; 2260 2261 return largest+offset; 2262} 2263 2264/* 2265 - computejumps - compute char jumps for BM scan 2266 == static void computejumps(struct parse *p, struct re_guts *g); 2267 * 2268 * This algorithm assumes g->must exists and is has size greater than 2269 * zero. It's based on the algorithm found on Computer Algorithms by 2270 * Sara Baase. 2271 * 2272 * A char jump is the number of characters one needs to jump based on 2273 * the value of the character from the text that was mismatched. 2274 */ 2275static void 2276computejumps(struct parse *p, struct re_guts *g) 2277{ 2278 int ch; 2279 size_t mindex; 2280 2281 _DIAGASSERT(p != NULL); 2282 _DIAGASSERT(g != NULL); 2283 2284 /* Avoid making errors worse */ 2285 if (p->error != 0) 2286 return; 2287 2288 g->charjump = calloc((NC_MAX + 1), sizeof(*g->charjump)); 2289 if (g->charjump == NULL) /* Not a fatal error */ 2290 return; 2291 /* Adjust for signed chars, if necessary */ 2292 g->charjump = &g->charjump[-(CHAR_MIN)]; 2293 2294 /* If the character does not exist in the pattern, the jump 2295 * is equal to the number of characters in the pattern. 2296 */ 2297 for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++) 2298 g->charjump[ch] = g->mlen; 2299 2300 /* If the character does exist, compute the jump that would 2301 * take us to the last character in the pattern equal to it 2302 * (notice that we match right to left, so that last character 2303 * is the first one that would be matched). 2304 */ 2305 for (mindex = 0; mindex < g->mlen; mindex++) 2306 g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1; 2307} 2308 2309/* 2310 - computematchjumps - compute match jumps for BM scan 2311 == static void computematchjumps(struct parse *p, struct re_guts *g); 2312 * 2313 * This algorithm assumes g->must exists and is has size greater than 2314 * zero. It's based on the algorithm found on Computer Algorithms by 2315 * Sara Baase. 2316 * 2317 * A match jump is the number of characters one needs to advance based 2318 * on the already-matched suffix. 2319 * Notice that all values here are minus (g->mlen-1), because of the way 2320 * the search algorithm works. 2321 */ 2322static void 2323computematchjumps(struct parse *p, struct re_guts *g) 2324{ 2325 size_t mindex; /* General "must" iterator */ 2326 size_t suffix; /* Keeps track of matching suffix */ 2327 size_t ssuffix; /* Keeps track of suffixes' suffix */ 2328 size_t* pmatches; /* pmatches[k] points to the next i 2329 * such that i+1...mlen is a substring 2330 * of k+1...k+mlen-i-1 2331 */ 2332 2333 _DIAGASSERT(p != NULL); 2334 _DIAGASSERT(g != NULL); 2335 2336 /* Avoid making errors worse */ 2337 if (p->error != 0) 2338 return; 2339 2340 pmatches = calloc(g->mlen, sizeof(*pmatches)); 2341 if (pmatches == NULL) { 2342 g->matchjump = NULL; 2343 return; 2344 } 2345 2346 g->matchjump = calloc(g->mlen, sizeof(*g->matchjump)); 2347 if (g->matchjump == NULL) { /* Not a fatal error */ 2348 free(pmatches); 2349 return; 2350 } 2351 2352 /* Set maximum possible jump for each character in the pattern */ 2353 for (mindex = 0; mindex < g->mlen; mindex++) 2354 g->matchjump[mindex] = 2 * g->mlen - mindex - 1; 2355 2356 /* Compute pmatches[] */ 2357 for (suffix = mindex = g->mlen; mindex-- > 0; suffix--) { 2358 pmatches[mindex] = suffix; 2359 2360 /* If a mismatch is found, interrupting the substring, 2361 * compute the matchjump for that position. If no 2362 * mismatch is found, then a text substring mismatched 2363 * against the suffix will also mismatch against the 2364 * substring. 2365 */ 2366 while (suffix < g->mlen 2367 && g->must[mindex] != g->must[suffix]) { 2368 g->matchjump[suffix] = MIN(g->matchjump[suffix], 2369 g->mlen - mindex - 1); 2370 suffix = pmatches[suffix]; 2371 } 2372 } 2373 2374 /* Compute the matchjump up to the last substring found to jump 2375 * to the beginning of the largest must pattern prefix matching 2376 * it's own suffix. 2377 */ 2378 for (mindex = 0; mindex <= suffix; mindex++) 2379 g->matchjump[mindex] = MIN(g->matchjump[mindex], 2380 g->mlen + suffix - mindex); 2381 2382 ssuffix = pmatches[suffix]; 2383 while (suffix < g->mlen) { 2384 while (suffix <= ssuffix && suffix < g->mlen) { 2385 g->matchjump[suffix] = MIN(g->matchjump[suffix], 2386 g->mlen + ssuffix - suffix); 2387 suffix++; 2388 } 2389 if (suffix < g->mlen) 2390 ssuffix = pmatches[ssuffix]; 2391 } 2392 2393 free(pmatches); 2394} 2395 2396/* 2397 - pluscount - count + nesting 2398 == static sopno pluscount(struct parse *p, struct re_guts *g); 2399 */ 2400static sopno /* nesting depth */ 2401pluscount(struct parse *p, struct re_guts *g) 2402{ 2403 sop *scan; 2404 sop s; 2405 sopno plusnest = 0; 2406 sopno maxnest = 0; 2407 2408 _DIAGASSERT(p != NULL); 2409 _DIAGASSERT(g != NULL); 2410 2411 if (p->error != 0) 2412 return(0); /* there may not be an OEND */ 2413 2414 scan = g->strip + 1; 2415 do { 2416 s = *scan++; 2417 switch (OP(s)) { 2418 case OPLUS_: 2419 plusnest++; 2420 break; 2421 case O_PLUS: 2422 if (plusnest > maxnest) 2423 maxnest = plusnest; 2424 plusnest--; 2425 break; 2426 } 2427 } while (OP(s) != OEND); 2428 if (plusnest != 0) 2429 g->iflags |= BAD; 2430 return(maxnest); 2431} 2432