regex.c revision 126209
1/* Extended regular expression matching and search library, 2 version 0.12. 3 (Implements POSIX draft P1003.2/D11.2, except for some of the 4 internationalization features.) 5 Copyright (C) 1993, 94, 95, 96, 97, 98, 99 Free Software Foundation, Inc. 6 7 The GNU C Library is free software; you can redistribute it and/or 8 modify it under the terms of the GNU Library General Public License as 9 published by the Free Software Foundation; either version 2 of the 10 License, or (at your option) any later version. 11 12 The GNU C Library is distributed in the hope that it will be useful, 13 but WITHOUT ANY WARRANTY; without even the implied warranty of 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 Library General Public License for more details. 16 17 You should have received a copy of the GNU Library General Public 18 License along with the GNU C Library; see the file COPYING.LIB. If not, 19 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, 20 Boston, MA 02111-1307, USA. */ 21 22/* AIX requires this to be the first thing in the file. */ 23#if defined _AIX && !defined REGEX_MALLOC 24 #pragma alloca 25#endif 26 27#undef _GNU_SOURCE 28#define _GNU_SOURCE 29 30#ifdef HAVE_CONFIG_H 31# include <config.h> 32#endif 33 34#ifndef PARAMS 35# if defined __GNUC__ || (defined __STDC__ && __STDC__) 36# define PARAMS(args) args 37# else 38# define PARAMS(args) () 39# endif /* GCC. */ 40#endif /* Not PARAMS. */ 41 42#if defined STDC_HEADERS && !defined emacs 43# include <stddef.h> 44#else 45/* We need this for `regex.h', and perhaps for the Emacs include files. */ 46# include <sys/types.h> 47#endif 48 49#define WIDE_CHAR_SUPPORT (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC) 50 51/* For platform which support the ISO C amendement 1 functionality we 52 support user defined character classes. */ 53#if defined _LIBC || WIDE_CHAR_SUPPORT 54/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */ 55# include <wchar.h> 56# include <wctype.h> 57#endif 58 59#ifdef _LIBC 60/* We have to keep the namespace clean. */ 61# define regfree(preg) __regfree (preg) 62# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef) 63# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags) 64# define regerror(errcode, preg, errbuf, errbuf_size) \ 65 __regerror(errcode, preg, errbuf, errbuf_size) 66# define re_set_registers(bu, re, nu, st, en) \ 67 __re_set_registers (bu, re, nu, st, en) 68# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \ 69 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop) 70# define re_match(bufp, string, size, pos, regs) \ 71 __re_match (bufp, string, size, pos, regs) 72# define re_search(bufp, string, size, startpos, range, regs) \ 73 __re_search (bufp, string, size, startpos, range, regs) 74# define re_compile_pattern(pattern, length, bufp) \ 75 __re_compile_pattern (pattern, length, bufp) 76# define re_set_syntax(syntax) __re_set_syntax (syntax) 77# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \ 78 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop) 79# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp) 80 81#define btowc __btowc 82#endif 83 84 85#ifndef _ 86/* This is for other GNU distributions with internationalized messages. 87 When compiling libc, the _ and N_ macros are predefined. */ 88# ifdef HAVE_LIBINTL_H 89# include <libintl.h> 90# else 91# define gettext(msgid) (msgid) 92# endif 93# define N_(msgid) (msgid) 94#endif 95 96/* The `emacs' switch turns on certain matching commands 97 that make sense only in Emacs. */ 98#ifdef emacs 99 100# include "lisp.h" 101# include "buffer.h" 102# include "syntax.h" 103 104#else /* not emacs */ 105 106/* If we are not linking with Emacs proper, 107 we can't use the relocating allocator 108 even if config.h says that we can. */ 109# undef REL_ALLOC 110 111# if defined STDC_HEADERS || defined _LIBC 112# include <stdlib.h> 113# else 114char *malloc (); 115char *realloc (); 116# endif 117 118/* When used in Emacs's lib-src, we need to get bzero and bcopy somehow. 119 If nothing else has been done, use the method below. */ 120# ifdef INHIBIT_STRING_HEADER 121# if !(defined HAVE_BZERO && defined HAVE_BCOPY) 122# if !defined bzero && !defined bcopy 123# undef INHIBIT_STRING_HEADER 124# endif 125# endif 126# endif 127 128/* This is the normal way of making sure we have a bcopy and a bzero. 129 This is used in most programs--a few other programs avoid this 130 by defining INHIBIT_STRING_HEADER. */ 131# ifndef INHIBIT_STRING_HEADER 132# if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC 133# include <string.h> 134# ifndef bzero 135# ifndef _LIBC 136# define bzero(s, n) (memset (s, '\0', n), (s)) 137# else 138# define bzero(s, n) __bzero (s, n) 139# endif 140# endif 141# else 142# include <strings.h> 143# ifndef memcmp 144# define memcmp(s1, s2, n) bcmp (s1, s2, n) 145# endif 146# ifndef memcpy 147# define memcpy(d, s, n) (bcopy (s, d, n), (d)) 148# endif 149# endif 150# endif 151 152/* Define the syntax stuff for \<, \>, etc. */ 153 154/* This must be nonzero for the wordchar and notwordchar pattern 155 commands in re_match_2. */ 156# ifndef Sword 157# define Sword 1 158# endif 159 160# ifdef SWITCH_ENUM_BUG 161# define SWITCH_ENUM_CAST(x) ((int)(x)) 162# else 163# define SWITCH_ENUM_CAST(x) (x) 164# endif 165 166#endif /* not emacs */ 167 168/* Get the interface, including the syntax bits. */ 169#include <regex.h> 170 171/* isalpha etc. are used for the character classes. */ 172#include <ctype.h> 173 174/* Jim Meyering writes: 175 176 "... Some ctype macros are valid only for character codes that 177 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when 178 using /bin/cc or gcc but without giving an ansi option). So, all 179 ctype uses should be through macros like ISPRINT... If 180 STDC_HEADERS is defined, then autoconf has verified that the ctype 181 macros don't need to be guarded with references to isascii. ... 182 Defining isascii to 1 should let any compiler worth its salt 183 eliminate the && through constant folding." 184 Solaris defines some of these symbols so we must undefine them first. */ 185 186#undef ISASCII 187#if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII) 188# define ISASCII(c) 1 189#else 190# define ISASCII(c) isascii(c) 191#endif 192 193#ifdef isblank 194# define ISBLANK(c) (ISASCII (c) && isblank (c)) 195#else 196# define ISBLANK(c) ((c) == ' ' || (c) == '\t') 197#endif 198#ifdef isgraph 199# define ISGRAPH(c) (ISASCII (c) && isgraph (c)) 200#else 201# define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c)) 202#endif 203 204#undef ISPRINT 205#define ISPRINT(c) (ISASCII (c) && isprint (c)) 206#define ISDIGIT(c) (ISASCII (c) && isdigit (c)) 207#define ISALNUM(c) (ISASCII (c) && isalnum (c)) 208#define ISALPHA(c) (ISASCII (c) && isalpha (c)) 209#define ISCNTRL(c) (ISASCII (c) && iscntrl (c)) 210#define ISLOWER(c) (ISASCII (c) && islower (c)) 211#define ISPUNCT(c) (ISASCII (c) && ispunct (c)) 212#define ISSPACE(c) (ISASCII (c) && isspace (c)) 213#define ISUPPER(c) (ISASCII (c) && isupper (c)) 214#define ISXDIGIT(c) (ISASCII (c) && isxdigit (c)) 215 216#ifdef _tolower 217# define TOLOWER(c) _tolower(c) 218#else 219# define TOLOWER(c) tolower(c) 220#endif 221 222#ifndef NULL 223# define NULL (void *)0 224#endif 225 226/* We remove any previous definition of `SIGN_EXTEND_CHAR', 227 since ours (we hope) works properly with all combinations of 228 machines, compilers, `char' and `unsigned char' argument types. 229 (Per Bothner suggested the basic approach.) */ 230#undef SIGN_EXTEND_CHAR 231#if __STDC__ 232# define SIGN_EXTEND_CHAR(c) ((signed char) (c)) 233#else /* not __STDC__ */ 234/* As in Harbison and Steele. */ 235# define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128) 236#endif 237 238#ifndef emacs 239/* How many characters in the character set. */ 240# define CHAR_SET_SIZE 256 241 242# ifdef SYNTAX_TABLE 243 244extern char *re_syntax_table; 245 246# else /* not SYNTAX_TABLE */ 247 248static char re_syntax_table[CHAR_SET_SIZE]; 249 250static void 251init_syntax_once () 252{ 253 register int c; 254 static int done = 0; 255 256 if (done) 257 return; 258 bzero (re_syntax_table, sizeof re_syntax_table); 259 260 for (c = 0; c < CHAR_SET_SIZE; ++c) 261 if (ISALNUM (c)) 262 re_syntax_table[c] = Sword; 263 264 re_syntax_table['_'] = Sword; 265 266 done = 1; 267} 268 269# endif /* not SYNTAX_TABLE */ 270 271# define SYNTAX(c) re_syntax_table[((c) & 0xFF)] 272 273#endif /* emacs */ 274 275/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we 276 use `alloca' instead of `malloc'. This is because using malloc in 277 re_search* or re_match* could cause memory leaks when C-g is used in 278 Emacs; also, malloc is slower and causes storage fragmentation. On 279 the other hand, malloc is more portable, and easier to debug. 280 281 Because we sometimes use alloca, some routines have to be macros, 282 not functions -- `alloca'-allocated space disappears at the end of the 283 function it is called in. */ 284 285#ifdef REGEX_MALLOC 286 287# define REGEX_ALLOCATE malloc 288# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize) 289# define REGEX_FREE free 290 291#else /* not REGEX_MALLOC */ 292 293/* Emacs already defines alloca, sometimes. */ 294# ifndef alloca 295 296/* Make alloca work the best possible way. */ 297# ifdef __GNUC__ 298# define alloca __builtin_alloca 299# else /* not __GNUC__ */ 300# if HAVE_ALLOCA_H 301# include <alloca.h> 302# endif /* HAVE_ALLOCA_H */ 303# endif /* not __GNUC__ */ 304 305# endif /* not alloca */ 306 307# define REGEX_ALLOCATE alloca 308 309/* Assumes a `char *destination' variable. */ 310# define REGEX_REALLOCATE(source, osize, nsize) \ 311 (destination = (char *) alloca (nsize), \ 312 memcpy (destination, source, osize)) 313 314/* No need to do anything to free, after alloca. */ 315# define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */ 316 317#endif /* not REGEX_MALLOC */ 318 319/* Define how to allocate the failure stack. */ 320 321#if defined REL_ALLOC && defined REGEX_MALLOC 322 323# define REGEX_ALLOCATE_STACK(size) \ 324 r_alloc (&failure_stack_ptr, (size)) 325# define REGEX_REALLOCATE_STACK(source, osize, nsize) \ 326 r_re_alloc (&failure_stack_ptr, (nsize)) 327# define REGEX_FREE_STACK(ptr) \ 328 r_alloc_free (&failure_stack_ptr) 329 330#else /* not using relocating allocator */ 331 332# ifdef REGEX_MALLOC 333 334# define REGEX_ALLOCATE_STACK malloc 335# define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize) 336# define REGEX_FREE_STACK free 337 338# else /* not REGEX_MALLOC */ 339 340# define REGEX_ALLOCATE_STACK alloca 341 342# define REGEX_REALLOCATE_STACK(source, osize, nsize) \ 343 REGEX_REALLOCATE (source, osize, nsize) 344/* No need to explicitly free anything. */ 345# define REGEX_FREE_STACK(arg) 346 347# endif /* not REGEX_MALLOC */ 348#endif /* not using relocating allocator */ 349 350 351/* True if `size1' is non-NULL and PTR is pointing anywhere inside 352 `string1' or just past its end. This works if PTR is NULL, which is 353 a good thing. */ 354#define FIRST_STRING_P(ptr) \ 355 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1) 356 357/* (Re)Allocate N items of type T using malloc, or fail. */ 358#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t))) 359#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t))) 360#define RETALLOC_IF(addr, n, t) \ 361 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t) 362#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t))) 363 364#define BYTEWIDTH 8 /* In bits. */ 365 366#define STREQ(s1, s2) ((strcmp (s1, s2) == 0)) 367 368#undef MAX 369#undef MIN 370#define MAX(a, b) ((a) > (b) ? (a) : (b)) 371#define MIN(a, b) ((a) < (b) ? (a) : (b)) 372 373typedef char boolean; 374#define false 0 375#define true 1 376 377static int re_match_2_internal PARAMS ((struct re_pattern_buffer *bufp, 378 const char *string1, int size1, 379 const char *string2, int size2, 380 int pos, 381 struct re_registers *regs, 382 int stop)); 383 384/* These are the command codes that appear in compiled regular 385 expressions. Some opcodes are followed by argument bytes. A 386 command code can specify any interpretation whatsoever for its 387 arguments. Zero bytes may appear in the compiled regular expression. */ 388 389typedef enum 390{ 391 no_op = 0, 392 393 /* Succeed right away--no more backtracking. */ 394 succeed, 395 396 /* Followed by one byte giving n, then by n literal bytes. */ 397 exactn, 398 399 /* Matches any (more or less) character. */ 400 anychar, 401 402 /* Matches any one char belonging to specified set. First 403 following byte is number of bitmap bytes. Then come bytes 404 for a bitmap saying which chars are in. Bits in each byte 405 are ordered low-bit-first. A character is in the set if its 406 bit is 1. A character too large to have a bit in the map is 407 automatically not in the set. */ 408 charset, 409 410 /* Same parameters as charset, but match any character that is 411 not one of those specified. */ 412 charset_not, 413 414 /* Start remembering the text that is matched, for storing in a 415 register. Followed by one byte with the register number, in 416 the range 0 to one less than the pattern buffer's re_nsub 417 field. Then followed by one byte with the number of groups 418 inner to this one. (This last has to be part of the 419 start_memory only because we need it in the on_failure_jump 420 of re_match_2.) */ 421 start_memory, 422 423 /* Stop remembering the text that is matched and store it in a 424 memory register. Followed by one byte with the register 425 number, in the range 0 to one less than `re_nsub' in the 426 pattern buffer, and one byte with the number of inner groups, 427 just like `start_memory'. (We need the number of inner 428 groups here because we don't have any easy way of finding the 429 corresponding start_memory when we're at a stop_memory.) */ 430 stop_memory, 431 432 /* Match a duplicate of something remembered. Followed by one 433 byte containing the register number. */ 434 duplicate, 435 436 /* Fail unless at beginning of line. */ 437 begline, 438 439 /* Fail unless at end of line. */ 440 endline, 441 442 /* Succeeds if at beginning of buffer (if emacs) or at beginning 443 of string to be matched (if not). */ 444 begbuf, 445 446 /* Analogously, for end of buffer/string. */ 447 endbuf, 448 449 /* Followed by two byte relative address to which to jump. */ 450 jump, 451 452 /* Same as jump, but marks the end of an alternative. */ 453 jump_past_alt, 454 455 /* Followed by two-byte relative address of place to resume at 456 in case of failure. */ 457 on_failure_jump, 458 459 /* Like on_failure_jump, but pushes a placeholder instead of the 460 current string position when executed. */ 461 on_failure_keep_string_jump, 462 463 /* Throw away latest failure point and then jump to following 464 two-byte relative address. */ 465 pop_failure_jump, 466 467 /* Change to pop_failure_jump if know won't have to backtrack to 468 match; otherwise change to jump. This is used to jump 469 back to the beginning of a repeat. If what follows this jump 470 clearly won't match what the repeat does, such that we can be 471 sure that there is no use backtracking out of repetitions 472 already matched, then we change it to a pop_failure_jump. 473 Followed by two-byte address. */ 474 maybe_pop_jump, 475 476 /* Jump to following two-byte address, and push a dummy failure 477 point. This failure point will be thrown away if an attempt 478 is made to use it for a failure. A `+' construct makes this 479 before the first repeat. Also used as an intermediary kind 480 of jump when compiling an alternative. */ 481 dummy_failure_jump, 482 483 /* Push a dummy failure point and continue. Used at the end of 484 alternatives. */ 485 push_dummy_failure, 486 487 /* Followed by two-byte relative address and two-byte number n. 488 After matching N times, jump to the address upon failure. */ 489 succeed_n, 490 491 /* Followed by two-byte relative address, and two-byte number n. 492 Jump to the address N times, then fail. */ 493 jump_n, 494 495 /* Set the following two-byte relative address to the 496 subsequent two-byte number. The address *includes* the two 497 bytes of number. */ 498 set_number_at, 499 500 wordchar, /* Matches any word-constituent character. */ 501 notwordchar, /* Matches any char that is not a word-constituent. */ 502 503 wordbeg, /* Succeeds if at word beginning. */ 504 wordend, /* Succeeds if at word end. */ 505 506 wordbound, /* Succeeds if at a word boundary. */ 507 notwordbound /* Succeeds if not at a word boundary. */ 508 509#ifdef emacs 510 ,before_dot, /* Succeeds if before point. */ 511 at_dot, /* Succeeds if at point. */ 512 after_dot, /* Succeeds if after point. */ 513 514 /* Matches any character whose syntax is specified. Followed by 515 a byte which contains a syntax code, e.g., Sword. */ 516 syntaxspec, 517 518 /* Matches any character whose syntax is not that specified. */ 519 notsyntaxspec 520#endif /* emacs */ 521} re_opcode_t; 522 523/* Common operations on the compiled pattern. */ 524 525/* Store NUMBER in two contiguous bytes starting at DESTINATION. */ 526 527#define STORE_NUMBER(destination, number) \ 528 do { \ 529 (destination)[0] = (number) & 0377; \ 530 (destination)[1] = (number) >> 8; \ 531 } while (0) 532 533/* Same as STORE_NUMBER, except increment DESTINATION to 534 the byte after where the number is stored. Therefore, DESTINATION 535 must be an lvalue. */ 536 537#define STORE_NUMBER_AND_INCR(destination, number) \ 538 do { \ 539 STORE_NUMBER (destination, number); \ 540 (destination) += 2; \ 541 } while (0) 542 543/* Put into DESTINATION a number stored in two contiguous bytes starting 544 at SOURCE. */ 545 546#define EXTRACT_NUMBER(destination, source) \ 547 do { \ 548 (destination) = *(source) & 0377; \ 549 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \ 550 } while (0) 551 552#ifdef DEBUG 553static void extract_number _RE_ARGS ((int *dest, unsigned char *source)); 554static void 555extract_number (dest, source) 556 int *dest; 557 unsigned char *source; 558{ 559 int temp = SIGN_EXTEND_CHAR (*(source + 1)); 560 *dest = *source & 0377; 561 *dest += temp << 8; 562} 563 564# ifndef EXTRACT_MACROS /* To debug the macros. */ 565# undef EXTRACT_NUMBER 566# define EXTRACT_NUMBER(dest, src) extract_number (&dest, src) 567# endif /* not EXTRACT_MACROS */ 568 569#endif /* DEBUG */ 570 571/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number. 572 SOURCE must be an lvalue. */ 573 574#define EXTRACT_NUMBER_AND_INCR(destination, source) \ 575 do { \ 576 EXTRACT_NUMBER (destination, source); \ 577 (source) += 2; \ 578 } while (0) 579 580#ifdef DEBUG 581static void extract_number_and_incr _RE_ARGS ((int *destination, 582 unsigned char **source)); 583static void 584extract_number_and_incr (destination, source) 585 int *destination; 586 unsigned char **source; 587{ 588 extract_number (destination, *source); 589 *source += 2; 590} 591 592# ifndef EXTRACT_MACROS 593# undef EXTRACT_NUMBER_AND_INCR 594# define EXTRACT_NUMBER_AND_INCR(dest, src) \ 595 extract_number_and_incr (&dest, &src) 596# endif /* not EXTRACT_MACROS */ 597 598#endif /* DEBUG */ 599 600/* If DEBUG is defined, Regex prints many voluminous messages about what 601 it is doing (if the variable `debug' is nonzero). If linked with the 602 main program in `iregex.c', you can enter patterns and strings 603 interactively. And if linked with the main program in `main.c' and 604 the other test files, you can run the already-written tests. */ 605 606#ifdef DEBUG 607 608/* We use standard I/O for debugging. */ 609# include <stdio.h> 610 611/* It is useful to test things that ``must'' be true when debugging. */ 612# include <assert.h> 613 614static int debug; 615 616# define DEBUG_STATEMENT(e) e 617# define DEBUG_PRINT1(x) if (debug) printf (x) 618# define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2) 619# define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3) 620# define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4) 621# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \ 622 if (debug) print_partial_compiled_pattern (s, e) 623# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \ 624 if (debug) print_double_string (w, s1, sz1, s2, sz2) 625 626 627/* Print the fastmap in human-readable form. */ 628 629void 630print_fastmap (fastmap) 631 char *fastmap; 632{ 633 unsigned was_a_range = 0; 634 unsigned i = 0; 635 636 while (i < (1 << BYTEWIDTH)) 637 { 638 if (fastmap[i++]) 639 { 640 was_a_range = 0; 641 putchar (i - 1); 642 while (i < (1 << BYTEWIDTH) && fastmap[i]) 643 { 644 was_a_range = 1; 645 i++; 646 } 647 if (was_a_range) 648 { 649 printf ("-"); 650 putchar (i - 1); 651 } 652 } 653 } 654 putchar ('\n'); 655} 656 657 658/* Print a compiled pattern string in human-readable form, starting at 659 the START pointer into it and ending just before the pointer END. */ 660 661void 662print_partial_compiled_pattern (start, end) 663 unsigned char *start; 664 unsigned char *end; 665{ 666 int mcnt, mcnt2; 667 unsigned char *p1; 668 unsigned char *p = start; 669 unsigned char *pend = end; 670 671 if (start == NULL) 672 { 673 printf ("(null)\n"); 674 return; 675 } 676 677 /* Loop over pattern commands. */ 678 while (p < pend) 679 { 680 printf ("%d:\t", p - start); 681 682 switch ((re_opcode_t) *p++) 683 { 684 case no_op: 685 printf ("/no_op"); 686 break; 687 688 case exactn: 689 mcnt = *p++; 690 printf ("/exactn/%d", mcnt); 691 do 692 { 693 putchar ('/'); 694 putchar (*p++); 695 } 696 while (--mcnt); 697 break; 698 699 case start_memory: 700 mcnt = *p++; 701 printf ("/start_memory/%d/%d", mcnt, *p++); 702 break; 703 704 case stop_memory: 705 mcnt = *p++; 706 printf ("/stop_memory/%d/%d", mcnt, *p++); 707 break; 708 709 case duplicate: 710 printf ("/duplicate/%d", *p++); 711 break; 712 713 case anychar: 714 printf ("/anychar"); 715 break; 716 717 case charset: 718 case charset_not: 719 { 720 register int c, last = -100; 721 register int in_range = 0; 722 723 printf ("/charset [%s", 724 (re_opcode_t) *(p - 1) == charset_not ? "^" : ""); 725 726 assert (p + *p < pend); 727 728 for (c = 0; c < 256; c++) 729 if (c / 8 < *p 730 && (p[1 + (c/8)] & (1 << (c % 8)))) 731 { 732 /* Are we starting a range? */ 733 if (last + 1 == c && ! in_range) 734 { 735 putchar ('-'); 736 in_range = 1; 737 } 738 /* Have we broken a range? */ 739 else if (last + 1 != c && in_range) 740 { 741 putchar (last); 742 in_range = 0; 743 } 744 745 if (! in_range) 746 putchar (c); 747 748 last = c; 749 } 750 751 if (in_range) 752 putchar (last); 753 754 putchar (']'); 755 756 p += 1 + *p; 757 } 758 break; 759 760 case begline: 761 printf ("/begline"); 762 break; 763 764 case endline: 765 printf ("/endline"); 766 break; 767 768 case on_failure_jump: 769 extract_number_and_incr (&mcnt, &p); 770 printf ("/on_failure_jump to %d", p + mcnt - start); 771 break; 772 773 case on_failure_keep_string_jump: 774 extract_number_and_incr (&mcnt, &p); 775 printf ("/on_failure_keep_string_jump to %d", p + mcnt - start); 776 break; 777 778 case dummy_failure_jump: 779 extract_number_and_incr (&mcnt, &p); 780 printf ("/dummy_failure_jump to %d", p + mcnt - start); 781 break; 782 783 case push_dummy_failure: 784 printf ("/push_dummy_failure"); 785 break; 786 787 case maybe_pop_jump: 788 extract_number_and_incr (&mcnt, &p); 789 printf ("/maybe_pop_jump to %d", p + mcnt - start); 790 break; 791 792 case pop_failure_jump: 793 extract_number_and_incr (&mcnt, &p); 794 printf ("/pop_failure_jump to %d", p + mcnt - start); 795 break; 796 797 case jump_past_alt: 798 extract_number_and_incr (&mcnt, &p); 799 printf ("/jump_past_alt to %d", p + mcnt - start); 800 break; 801 802 case jump: 803 extract_number_and_incr (&mcnt, &p); 804 printf ("/jump to %d", p + mcnt - start); 805 break; 806 807 case succeed_n: 808 extract_number_and_incr (&mcnt, &p); 809 p1 = p + mcnt; 810 extract_number_and_incr (&mcnt2, &p); 811 printf ("/succeed_n to %d, %d times", p1 - start, mcnt2); 812 break; 813 814 case jump_n: 815 extract_number_and_incr (&mcnt, &p); 816 p1 = p + mcnt; 817 extract_number_and_incr (&mcnt2, &p); 818 printf ("/jump_n to %d, %d times", p1 - start, mcnt2); 819 break; 820 821 case set_number_at: 822 extract_number_and_incr (&mcnt, &p); 823 p1 = p + mcnt; 824 extract_number_and_incr (&mcnt2, &p); 825 printf ("/set_number_at location %d to %d", p1 - start, mcnt2); 826 break; 827 828 case wordbound: 829 printf ("/wordbound"); 830 break; 831 832 case notwordbound: 833 printf ("/notwordbound"); 834 break; 835 836 case wordbeg: 837 printf ("/wordbeg"); 838 break; 839 840 case wordend: 841 printf ("/wordend"); 842 843# ifdef emacs 844 case before_dot: 845 printf ("/before_dot"); 846 break; 847 848 case at_dot: 849 printf ("/at_dot"); 850 break; 851 852 case after_dot: 853 printf ("/after_dot"); 854 break; 855 856 case syntaxspec: 857 printf ("/syntaxspec"); 858 mcnt = *p++; 859 printf ("/%d", mcnt); 860 break; 861 862 case notsyntaxspec: 863 printf ("/notsyntaxspec"); 864 mcnt = *p++; 865 printf ("/%d", mcnt); 866 break; 867# endif /* emacs */ 868 869 case wordchar: 870 printf ("/wordchar"); 871 break; 872 873 case notwordchar: 874 printf ("/notwordchar"); 875 break; 876 877 case begbuf: 878 printf ("/begbuf"); 879 break; 880 881 case endbuf: 882 printf ("/endbuf"); 883 break; 884 885 default: 886 printf ("?%d", *(p-1)); 887 } 888 889 putchar ('\n'); 890 } 891 892 printf ("%d:\tend of pattern.\n", p - start); 893} 894 895 896void 897print_compiled_pattern (bufp) 898 struct re_pattern_buffer *bufp; 899{ 900 unsigned char *buffer = bufp->buffer; 901 902 print_partial_compiled_pattern (buffer, buffer + bufp->used); 903 printf ("%ld bytes used/%ld bytes allocated.\n", 904 bufp->used, bufp->allocated); 905 906 if (bufp->fastmap_accurate && bufp->fastmap) 907 { 908 printf ("fastmap: "); 909 print_fastmap (bufp->fastmap); 910 } 911 912 printf ("re_nsub: %d\t", bufp->re_nsub); 913 printf ("regs_alloc: %d\t", bufp->regs_allocated); 914 printf ("can_be_null: %d\t", bufp->can_be_null); 915 printf ("newline_anchor: %d\n", bufp->newline_anchor); 916 printf ("no_sub: %d\t", bufp->no_sub); 917 printf ("not_bol: %d\t", bufp->not_bol); 918 printf ("not_eol: %d\t", bufp->not_eol); 919 printf ("syntax: %lx\n", bufp->syntax); 920 /* Perhaps we should print the translate table? */ 921} 922 923 924void 925print_double_string (where, string1, size1, string2, size2) 926 const char *where; 927 const char *string1; 928 const char *string2; 929 int size1; 930 int size2; 931{ 932 int this_char; 933 934 if (where == NULL) 935 printf ("(null)"); 936 else 937 { 938 if (FIRST_STRING_P (where)) 939 { 940 for (this_char = where - string1; this_char < size1; this_char++) 941 putchar (string1[this_char]); 942 943 where = string2; 944 } 945 946 for (this_char = where - string2; this_char < size2; this_char++) 947 putchar (string2[this_char]); 948 } 949} 950 951void 952printchar (c) 953 int c; 954{ 955 putc (c, stderr); 956} 957 958#else /* not DEBUG */ 959 960# undef assert 961# define assert(e) 962 963# define DEBUG_STATEMENT(e) 964# define DEBUG_PRINT1(x) 965# define DEBUG_PRINT2(x1, x2) 966# define DEBUG_PRINT3(x1, x2, x3) 967# define DEBUG_PRINT4(x1, x2, x3, x4) 968# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) 969# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) 970 971#endif /* not DEBUG */ 972 973/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can 974 also be assigned to arbitrarily: each pattern buffer stores its own 975 syntax, so it can be changed between regex compilations. */ 976/* This has no initializer because initialized variables in Emacs 977 become read-only after dumping. */ 978reg_syntax_t re_syntax_options; 979 980 981/* Specify the precise syntax of regexps for compilation. This provides 982 for compatibility for various utilities which historically have 983 different, incompatible syntaxes. 984 985 The argument SYNTAX is a bit mask comprised of the various bits 986 defined in regex.h. We return the old syntax. */ 987 988reg_syntax_t 989re_set_syntax (syntax) 990 reg_syntax_t syntax; 991{ 992 reg_syntax_t ret = re_syntax_options; 993 994 re_syntax_options = syntax; 995#ifdef DEBUG 996 if (syntax & RE_DEBUG) 997 debug = 1; 998 else if (debug) /* was on but now is not */ 999 debug = 0; 1000#endif /* DEBUG */ 1001 return ret; 1002} 1003#ifdef _LIBC 1004weak_alias (__re_set_syntax, re_set_syntax) 1005#endif 1006 1007/* This table gives an error message for each of the error codes listed 1008 in regex.h. Obviously the order here has to be same as there. 1009 POSIX doesn't require that we do anything for REG_NOERROR, 1010 but why not be nice? */ 1011 1012#if 0 1013 /* This section is for xgettext; it sees the strings wrapped inside 1014 N_() and marks them as needing translation. They should match 1015 the strings in re_error_msgid. We can't use the usual string 1016 concatenation trick to initialize re_error_msgid, since other GNU 1017 distributions use this file with traditional C, and traditional C 1018 lacks string concatenation. */ 1019 N_("Success") /* REG_NOERROR */ 1020 N_("No match") /* REG_NOMATCH */ 1021 N_("Invalid regular expression") /* REG_BADPAT */ 1022 N_("Invalid collation character") /* REG_ECOLLATE */ 1023 N_("Invalid character class name") /* REG_ECTYPE */ 1024 N_("Trailing backslash") /* REG_EESCAPE */ 1025 N_("Invalid back reference") /* REG_ESUBREG */ 1026 N_("Unmatched [ or [^") /* REG_EBRACK */ 1027 N_("Unmatched ( or \\(") /* REG_EPAREN */ 1028 N_("Unmatched \\{") /* REG_EBRACE */ 1029 N_("Invalid content of \\{\\}") /* REG_BADBR */ 1030 N_("Invalid range end") /* REG_ERANGE */ 1031 N_("Memory exhausted") /* REG_ESPACE */ 1032 N_("Invalid preceding regular expression") /* REG_BADRPT */ 1033 N_("Premature end of regular expression") /* REG_EEND */ 1034 N_("Regular expression too big") /* REG_ESIZE */ 1035 N_("Unmatched ) or \\)") /* REG_ERPAREN */ 1036#endif 1037 1038static const char re_error_msgid[] = "\ 1039Success\0\ 1040No match\0\ 1041Invalid regular expression\0\ 1042Invalid collation character\0\ 1043Invalid character class name\0\ 1044Trailing backslash\0\ 1045Invalid back reference\0\ 1046Unmatched [ or [^\0\ 1047Unmatched ( or \\(\0\ 1048Unmatched \\{\0\ 1049Invalid content of \\{\\}\0\ 1050Invalid range end\0\ 1051Memory exhausted\0\ 1052Invalid preceding regular expression\0\ 1053Premature end of regular expression\0\ 1054Regular expression too big\0\ 1055Unmatched ) or \\)"; 1056 1057#define REG_NOERROR_IDX 0 1058#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success") 1059#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match") 1060#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression") 1061#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character") 1062#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name") 1063#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash") 1064#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference") 1065#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^") 1066#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(") 1067#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{") 1068#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}") 1069#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end") 1070#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted") 1071#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression") 1072#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression") 1073#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big") 1074 1075static const size_t re_error_msgid_idx[] = 1076 { 1077 REG_NOERROR_IDX, 1078 REG_NOMATCH_IDX, 1079 REG_BADPAT_IDX, 1080 REG_ECOLLATE_IDX, 1081 REG_ECTYPE_IDX, 1082 REG_EESCAPE_IDX, 1083 REG_ESUBREG_IDX, 1084 REG_EBRACK_IDX, 1085 REG_EPAREN_IDX, 1086 REG_EBRACE_IDX, 1087 REG_BADBR_IDX, 1088 REG_ERANGE_IDX, 1089 REG_ESPACE_IDX, 1090 REG_BADRPT_IDX, 1091 REG_EEND_IDX, 1092 REG_ESIZE_IDX, 1093 REG_ERPAREN_IDX 1094 }; 1095 1096/* Avoiding alloca during matching, to placate r_alloc. */ 1097 1098/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the 1099 searching and matching functions should not call alloca. On some 1100 systems, alloca is implemented in terms of malloc, and if we're 1101 using the relocating allocator routines, then malloc could cause a 1102 relocation, which might (if the strings being searched are in the 1103 ralloc heap) shift the data out from underneath the regexp 1104 routines. 1105 1106 Here's another reason to avoid allocation: Emacs 1107 processes input from X in a signal handler; processing X input may 1108 call malloc; if input arrives while a matching routine is calling 1109 malloc, then we're scrod. But Emacs can't just block input while 1110 calling matching routines; then we don't notice interrupts when 1111 they come in. So, Emacs blocks input around all regexp calls 1112 except the matching calls, which it leaves unprotected, in the 1113 faith that they will not malloc. */ 1114 1115/* Normally, this is fine. */ 1116#define MATCH_MAY_ALLOCATE 1117 1118/* When using GNU C, we are not REALLY using the C alloca, no matter 1119 what config.h may say. So don't take precautions for it. */ 1120#ifdef __GNUC__ 1121# undef C_ALLOCA 1122#endif 1123 1124/* The match routines may not allocate if (1) they would do it with malloc 1125 and (2) it's not safe for them to use malloc. 1126 Note that if REL_ALLOC is defined, matching would not use malloc for the 1127 failure stack, but we would still use it for the register vectors; 1128 so REL_ALLOC should not affect this. */ 1129#if (defined C_ALLOCA || defined REGEX_MALLOC) && defined emacs 1130# undef MATCH_MAY_ALLOCATE 1131#endif 1132 1133 1134/* Failure stack declarations and macros; both re_compile_fastmap and 1135 re_match_2 use a failure stack. These have to be macros because of 1136 REGEX_ALLOCATE_STACK. */ 1137 1138 1139/* Number of failure points for which to initially allocate space 1140 when matching. If this number is exceeded, we allocate more 1141 space, so it is not a hard limit. */ 1142#ifndef INIT_FAILURE_ALLOC 1143# define INIT_FAILURE_ALLOC 5 1144#endif 1145 1146/* Roughly the maximum number of failure points on the stack. Would be 1147 exactly that if always used MAX_FAILURE_ITEMS items each time we failed. 1148 This is a variable only so users of regex can assign to it; we never 1149 change it ourselves. */ 1150 1151#ifdef INT_IS_16BIT 1152 1153# if defined MATCH_MAY_ALLOCATE 1154/* 4400 was enough to cause a crash on Alpha OSF/1, 1155 whose default stack limit is 2mb. */ 1156long int re_max_failures = 4000; 1157# else 1158long int re_max_failures = 2000; 1159# endif 1160 1161union fail_stack_elt 1162{ 1163 unsigned char *pointer; 1164 long int integer; 1165}; 1166 1167typedef union fail_stack_elt fail_stack_elt_t; 1168 1169typedef struct 1170{ 1171 fail_stack_elt_t *stack; 1172 unsigned long int size; 1173 unsigned long int avail; /* Offset of next open position. */ 1174} fail_stack_type; 1175 1176#else /* not INT_IS_16BIT */ 1177 1178# if defined MATCH_MAY_ALLOCATE 1179/* 4400 was enough to cause a crash on Alpha OSF/1, 1180 whose default stack limit is 2mb. */ 1181int re_max_failures = 20000; 1182# else 1183int re_max_failures = 2000; 1184# endif 1185 1186union fail_stack_elt 1187{ 1188 unsigned char *pointer; 1189 int integer; 1190}; 1191 1192typedef union fail_stack_elt fail_stack_elt_t; 1193 1194typedef struct 1195{ 1196 fail_stack_elt_t *stack; 1197 unsigned size; 1198 unsigned avail; /* Offset of next open position. */ 1199} fail_stack_type; 1200 1201#endif /* INT_IS_16BIT */ 1202 1203#define FAIL_STACK_EMPTY() (fail_stack.avail == 0) 1204#define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0) 1205#define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size) 1206 1207 1208/* Define macros to initialize and free the failure stack. 1209 Do `return -2' if the alloc fails. */ 1210 1211#ifdef MATCH_MAY_ALLOCATE 1212# define INIT_FAIL_STACK() \ 1213 do { \ 1214 fail_stack.stack = (fail_stack_elt_t *) \ 1215 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \ 1216 \ 1217 if (fail_stack.stack == NULL) \ 1218 return -2; \ 1219 \ 1220 fail_stack.size = INIT_FAILURE_ALLOC; \ 1221 fail_stack.avail = 0; \ 1222 } while (0) 1223 1224# define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack) 1225#else 1226# define INIT_FAIL_STACK() \ 1227 do { \ 1228 fail_stack.avail = 0; \ 1229 } while (0) 1230 1231# define RESET_FAIL_STACK() 1232#endif 1233 1234 1235/* Double the size of FAIL_STACK, up to approximately `re_max_failures' items. 1236 1237 Return 1 if succeeds, and 0 if either ran out of memory 1238 allocating space for it or it was already too large. 1239 1240 REGEX_REALLOCATE_STACK requires `destination' be declared. */ 1241 1242#define DOUBLE_FAIL_STACK(fail_stack) \ 1243 ((fail_stack).size > (unsigned) (re_max_failures * MAX_FAILURE_ITEMS) \ 1244 ? 0 \ 1245 : ((fail_stack).stack = (fail_stack_elt_t *) \ 1246 REGEX_REALLOCATE_STACK ((fail_stack).stack, \ 1247 (fail_stack).size * sizeof (fail_stack_elt_t), \ 1248 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \ 1249 \ 1250 (fail_stack).stack == NULL \ 1251 ? 0 \ 1252 : ((fail_stack).size <<= 1, \ 1253 1))) 1254 1255 1256/* Push pointer POINTER on FAIL_STACK. 1257 Return 1 if was able to do so and 0 if ran out of memory allocating 1258 space to do so. */ 1259#define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \ 1260 ((FAIL_STACK_FULL () \ 1261 && !DOUBLE_FAIL_STACK (FAIL_STACK)) \ 1262 ? 0 \ 1263 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \ 1264 1)) 1265 1266/* Push a pointer value onto the failure stack. 1267 Assumes the variable `fail_stack'. Probably should only 1268 be called from within `PUSH_FAILURE_POINT'. */ 1269#define PUSH_FAILURE_POINTER(item) \ 1270 fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item) 1271 1272/* This pushes an integer-valued item onto the failure stack. 1273 Assumes the variable `fail_stack'. Probably should only 1274 be called from within `PUSH_FAILURE_POINT'. */ 1275#define PUSH_FAILURE_INT(item) \ 1276 fail_stack.stack[fail_stack.avail++].integer = (item) 1277 1278/* Push a fail_stack_elt_t value onto the failure stack. 1279 Assumes the variable `fail_stack'. Probably should only 1280 be called from within `PUSH_FAILURE_POINT'. */ 1281#define PUSH_FAILURE_ELT(item) \ 1282 fail_stack.stack[fail_stack.avail++] = (item) 1283 1284/* These three POP... operations complement the three PUSH... operations. 1285 All assume that `fail_stack' is nonempty. */ 1286#define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer 1287#define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer 1288#define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail] 1289 1290/* Used to omit pushing failure point id's when we're not debugging. */ 1291#ifdef DEBUG 1292# define DEBUG_PUSH PUSH_FAILURE_INT 1293# define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT () 1294#else 1295# define DEBUG_PUSH(item) 1296# define DEBUG_POP(item_addr) 1297#endif 1298 1299 1300/* Push the information about the state we will need 1301 if we ever fail back to it. 1302 1303 Requires variables fail_stack, regstart, regend, reg_info, and 1304 num_regs_pushed be declared. DOUBLE_FAIL_STACK requires `destination' 1305 be declared. 1306 1307 Does `return FAILURE_CODE' if runs out of memory. */ 1308 1309#define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \ 1310 do { \ 1311 char *destination; \ 1312 /* Must be int, so when we don't save any registers, the arithmetic \ 1313 of 0 + -1 isn't done as unsigned. */ \ 1314 /* Can't be int, since there is not a shred of a guarantee that int \ 1315 is wide enough to hold a value of something to which pointer can \ 1316 be assigned */ \ 1317 active_reg_t this_reg; \ 1318 \ 1319 DEBUG_STATEMENT (failure_id++); \ 1320 DEBUG_STATEMENT (nfailure_points_pushed++); \ 1321 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \ 1322 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\ 1323 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\ 1324 \ 1325 DEBUG_PRINT2 (" slots needed: %ld\n", NUM_FAILURE_ITEMS); \ 1326 DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \ 1327 \ 1328 /* Ensure we have enough space allocated for what we will push. */ \ 1329 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \ 1330 { \ 1331 if (!DOUBLE_FAIL_STACK (fail_stack)) \ 1332 return failure_code; \ 1333 \ 1334 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \ 1335 (fail_stack).size); \ 1336 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\ 1337 } \ 1338 \ 1339 /* Push the info, starting with the registers. */ \ 1340 DEBUG_PRINT1 ("\n"); \ 1341 \ 1342 if (1) \ 1343 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \ 1344 this_reg++) \ 1345 { \ 1346 DEBUG_PRINT2 (" Pushing reg: %lu\n", this_reg); \ 1347 DEBUG_STATEMENT (num_regs_pushed++); \ 1348 \ 1349 DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \ 1350 PUSH_FAILURE_POINTER (regstart[this_reg]); \ 1351 \ 1352 DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \ 1353 PUSH_FAILURE_POINTER (regend[this_reg]); \ 1354 \ 1355 DEBUG_PRINT2 (" info: %p\n ", \ 1356 reg_info[this_reg].word.pointer); \ 1357 DEBUG_PRINT2 (" match_null=%d", \ 1358 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \ 1359 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \ 1360 DEBUG_PRINT2 (" matched_something=%d", \ 1361 MATCHED_SOMETHING (reg_info[this_reg])); \ 1362 DEBUG_PRINT2 (" ever_matched=%d", \ 1363 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \ 1364 DEBUG_PRINT1 ("\n"); \ 1365 PUSH_FAILURE_ELT (reg_info[this_reg].word); \ 1366 } \ 1367 \ 1368 DEBUG_PRINT2 (" Pushing low active reg: %ld\n", lowest_active_reg);\ 1369 PUSH_FAILURE_INT (lowest_active_reg); \ 1370 \ 1371 DEBUG_PRINT2 (" Pushing high active reg: %ld\n", highest_active_reg);\ 1372 PUSH_FAILURE_INT (highest_active_reg); \ 1373 \ 1374 DEBUG_PRINT2 (" Pushing pattern %p:\n", pattern_place); \ 1375 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \ 1376 PUSH_FAILURE_POINTER (pattern_place); \ 1377 \ 1378 DEBUG_PRINT2 (" Pushing string %p: `", string_place); \ 1379 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \ 1380 size2); \ 1381 DEBUG_PRINT1 ("'\n"); \ 1382 PUSH_FAILURE_POINTER (string_place); \ 1383 \ 1384 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \ 1385 DEBUG_PUSH (failure_id); \ 1386 } while (0) 1387 1388/* This is the number of items that are pushed and popped on the stack 1389 for each register. */ 1390#define NUM_REG_ITEMS 3 1391 1392/* Individual items aside from the registers. */ 1393#ifdef DEBUG 1394# define NUM_NONREG_ITEMS 5 /* Includes failure point id. */ 1395#else 1396# define NUM_NONREG_ITEMS 4 1397#endif 1398 1399/* We push at most this many items on the stack. */ 1400/* We used to use (num_regs - 1), which is the number of registers 1401 this regexp will save; but that was changed to 5 1402 to avoid stack overflow for a regexp with lots of parens. */ 1403#define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS) 1404 1405/* We actually push this many items. */ 1406#define NUM_FAILURE_ITEMS \ 1407 (((0 \ 1408 ? 0 : highest_active_reg - lowest_active_reg + 1) \ 1409 * NUM_REG_ITEMS) \ 1410 + NUM_NONREG_ITEMS) 1411 1412/* How many items can still be added to the stack without overflowing it. */ 1413#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail) 1414 1415 1416/* Pops what PUSH_FAIL_STACK pushes. 1417 1418 We restore into the parameters, all of which should be lvalues: 1419 STR -- the saved data position. 1420 PAT -- the saved pattern position. 1421 LOW_REG, HIGH_REG -- the highest and lowest active registers. 1422 REGSTART, REGEND -- arrays of string positions. 1423 REG_INFO -- array of information about each subexpression. 1424 1425 Also assumes the variables `fail_stack' and (if debugging), `bufp', 1426 `pend', `string1', `size1', `string2', and `size2'. */ 1427 1428#define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\ 1429{ \ 1430 DEBUG_STATEMENT (unsigned failure_id;) \ 1431 active_reg_t this_reg; \ 1432 const unsigned char *string_temp; \ 1433 \ 1434 assert (!FAIL_STACK_EMPTY ()); \ 1435 \ 1436 /* Remove failure points and point to how many regs pushed. */ \ 1437 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \ 1438 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \ 1439 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \ 1440 \ 1441 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \ 1442 \ 1443 DEBUG_POP (&failure_id); \ 1444 DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \ 1445 \ 1446 /* If the saved string location is NULL, it came from an \ 1447 on_failure_keep_string_jump opcode, and we want to throw away the \ 1448 saved NULL, thus retaining our current position in the string. */ \ 1449 string_temp = POP_FAILURE_POINTER (); \ 1450 if (string_temp != NULL) \ 1451 str = (const char *) string_temp; \ 1452 \ 1453 DEBUG_PRINT2 (" Popping string %p: `", str); \ 1454 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \ 1455 DEBUG_PRINT1 ("'\n"); \ 1456 \ 1457 pat = (unsigned char *) POP_FAILURE_POINTER (); \ 1458 DEBUG_PRINT2 (" Popping pattern %p:\n", pat); \ 1459 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \ 1460 \ 1461 /* Restore register info. */ \ 1462 high_reg = (active_reg_t) POP_FAILURE_INT (); \ 1463 DEBUG_PRINT2 (" Popping high active reg: %ld\n", high_reg); \ 1464 \ 1465 low_reg = (active_reg_t) POP_FAILURE_INT (); \ 1466 DEBUG_PRINT2 (" Popping low active reg: %ld\n", low_reg); \ 1467 \ 1468 if (1) \ 1469 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \ 1470 { \ 1471 DEBUG_PRINT2 (" Popping reg: %ld\n", this_reg); \ 1472 \ 1473 reg_info[this_reg].word = POP_FAILURE_ELT (); \ 1474 DEBUG_PRINT2 (" info: %p\n", \ 1475 reg_info[this_reg].word.pointer); \ 1476 \ 1477 regend[this_reg] = (const char *) POP_FAILURE_POINTER (); \ 1478 DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \ 1479 \ 1480 regstart[this_reg] = (const char *) POP_FAILURE_POINTER (); \ 1481 DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \ 1482 } \ 1483 else \ 1484 { \ 1485 for (this_reg = highest_active_reg; this_reg > high_reg; this_reg--) \ 1486 { \ 1487 reg_info[this_reg].word.integer = 0; \ 1488 regend[this_reg] = 0; \ 1489 regstart[this_reg] = 0; \ 1490 } \ 1491 highest_active_reg = high_reg; \ 1492 } \ 1493 \ 1494 set_regs_matched_done = 0; \ 1495 DEBUG_STATEMENT (nfailure_points_popped++); \ 1496} /* POP_FAILURE_POINT */ 1497 1498 1499 1500/* Structure for per-register (a.k.a. per-group) information. 1501 Other register information, such as the 1502 starting and ending positions (which are addresses), and the list of 1503 inner groups (which is a bits list) are maintained in separate 1504 variables. 1505 1506 We are making a (strictly speaking) nonportable assumption here: that 1507 the compiler will pack our bit fields into something that fits into 1508 the type of `word', i.e., is something that fits into one item on the 1509 failure stack. */ 1510 1511 1512/* Declarations and macros for re_match_2. */ 1513 1514typedef union 1515{ 1516 fail_stack_elt_t word; 1517 struct 1518 { 1519 /* This field is one if this group can match the empty string, 1520 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */ 1521#define MATCH_NULL_UNSET_VALUE 3 1522 unsigned match_null_string_p : 2; 1523 unsigned is_active : 1; 1524 unsigned matched_something : 1; 1525 unsigned ever_matched_something : 1; 1526 } bits; 1527} register_info_type; 1528 1529#define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p) 1530#define IS_ACTIVE(R) ((R).bits.is_active) 1531#define MATCHED_SOMETHING(R) ((R).bits.matched_something) 1532#define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something) 1533 1534 1535/* Call this when have matched a real character; it sets `matched' flags 1536 for the subexpressions which we are currently inside. Also records 1537 that those subexprs have matched. */ 1538#define SET_REGS_MATCHED() \ 1539 do \ 1540 { \ 1541 if (!set_regs_matched_done) \ 1542 { \ 1543 active_reg_t r; \ 1544 set_regs_matched_done = 1; \ 1545 for (r = lowest_active_reg; r <= highest_active_reg; r++) \ 1546 { \ 1547 MATCHED_SOMETHING (reg_info[r]) \ 1548 = EVER_MATCHED_SOMETHING (reg_info[r]) \ 1549 = 1; \ 1550 } \ 1551 } \ 1552 } \ 1553 while (0) 1554 1555/* Registers are set to a sentinel when they haven't yet matched. */ 1556static char reg_unset_dummy; 1557#define REG_UNSET_VALUE (®_unset_dummy) 1558#define REG_UNSET(e) ((e) == REG_UNSET_VALUE) 1559 1560/* Subroutine declarations and macros for regex_compile. */ 1561 1562static reg_errcode_t regex_compile _RE_ARGS ((const char *pattern, size_t size, 1563 reg_syntax_t syntax, 1564 struct re_pattern_buffer *bufp)); 1565static void store_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, int arg)); 1566static void store_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc, 1567 int arg1, int arg2)); 1568static void insert_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, 1569 int arg, unsigned char *end)); 1570static void insert_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc, 1571 int arg1, int arg2, unsigned char *end)); 1572static boolean at_begline_loc_p _RE_ARGS ((const char *pattern, const char *p, 1573 reg_syntax_t syntax)); 1574static boolean at_endline_loc_p _RE_ARGS ((const char *p, const char *pend, 1575 reg_syntax_t syntax)); 1576static reg_errcode_t compile_range _RE_ARGS ((const char **p_ptr, 1577 const char *pend, 1578 char *translate, 1579 reg_syntax_t syntax, 1580 unsigned char *b)); 1581 1582/* Fetch the next character in the uncompiled pattern---translating it 1583 if necessary. Also cast from a signed character in the constant 1584 string passed to us by the user to an unsigned char that we can use 1585 as an array index (in, e.g., `translate'). */ 1586#ifndef PATFETCH 1587# define PATFETCH(c) \ 1588 do {if (p == pend) return REG_EEND; \ 1589 c = (unsigned char) *p++; \ 1590 if (translate) c = (unsigned char) translate[c]; \ 1591 } while (0) 1592#endif 1593 1594/* Fetch the next character in the uncompiled pattern, with no 1595 translation. */ 1596#define PATFETCH_RAW(c) \ 1597 do {if (p == pend) return REG_EEND; \ 1598 c = (unsigned char) *p++; \ 1599 } while (0) 1600 1601/* Go backwards one character in the pattern. */ 1602#define PATUNFETCH p-- 1603 1604 1605/* If `translate' is non-null, return translate[D], else just D. We 1606 cast the subscript to translate because some data is declared as 1607 `char *', to avoid warnings when a string constant is passed. But 1608 when we use a character as a subscript we must make it unsigned. */ 1609#ifndef TRANSLATE 1610# define TRANSLATE(d) \ 1611 (translate ? (char) translate[(unsigned char) (d)] : (d)) 1612#endif 1613 1614 1615/* Macros for outputting the compiled pattern into `buffer'. */ 1616 1617/* If the buffer isn't allocated when it comes in, use this. */ 1618#define INIT_BUF_SIZE 32 1619 1620/* Make sure we have at least N more bytes of space in buffer. */ 1621#define GET_BUFFER_SPACE(n) \ 1622 while ((unsigned long) (b - bufp->buffer + (n)) > bufp->allocated) \ 1623 EXTEND_BUFFER () 1624 1625/* Make sure we have one more byte of buffer space and then add C to it. */ 1626#define BUF_PUSH(c) \ 1627 do { \ 1628 GET_BUFFER_SPACE (1); \ 1629 *b++ = (unsigned char) (c); \ 1630 } while (0) 1631 1632 1633/* Ensure we have two more bytes of buffer space and then append C1 and C2. */ 1634#define BUF_PUSH_2(c1, c2) \ 1635 do { \ 1636 GET_BUFFER_SPACE (2); \ 1637 *b++ = (unsigned char) (c1); \ 1638 *b++ = (unsigned char) (c2); \ 1639 } while (0) 1640 1641 1642/* As with BUF_PUSH_2, except for three bytes. */ 1643#define BUF_PUSH_3(c1, c2, c3) \ 1644 do { \ 1645 GET_BUFFER_SPACE (3); \ 1646 *b++ = (unsigned char) (c1); \ 1647 *b++ = (unsigned char) (c2); \ 1648 *b++ = (unsigned char) (c3); \ 1649 } while (0) 1650 1651 1652/* Store a jump with opcode OP at LOC to location TO. We store a 1653 relative address offset by the three bytes the jump itself occupies. */ 1654#define STORE_JUMP(op, loc, to) \ 1655 store_op1 (op, loc, (int) ((to) - (loc) - 3)) 1656 1657/* Likewise, for a two-argument jump. */ 1658#define STORE_JUMP2(op, loc, to, arg) \ 1659 store_op2 (op, loc, (int) ((to) - (loc) - 3), arg) 1660 1661/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */ 1662#define INSERT_JUMP(op, loc, to) \ 1663 insert_op1 (op, loc, (int) ((to) - (loc) - 3), b) 1664 1665/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */ 1666#define INSERT_JUMP2(op, loc, to, arg) \ 1667 insert_op2 (op, loc, (int) ((to) - (loc) - 3), arg, b) 1668 1669 1670/* This is not an arbitrary limit: the arguments which represent offsets 1671 into the pattern are two bytes long. So if 2^16 bytes turns out to 1672 be too small, many things would have to change. */ 1673/* Any other compiler which, like MSC, has allocation limit below 2^16 1674 bytes will have to use approach similar to what was done below for 1675 MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up 1676 reallocating to 0 bytes. Such thing is not going to work too well. 1677 You have been warned!! */ 1678#if defined _MSC_VER && !defined WIN32 1679/* Microsoft C 16-bit versions limit malloc to approx 65512 bytes. 1680 The REALLOC define eliminates a flurry of conversion warnings, 1681 but is not required. */ 1682# define MAX_BUF_SIZE 65500L 1683# define REALLOC(p,s) realloc ((p), (size_t) (s)) 1684#else 1685# define MAX_BUF_SIZE (1L << 16) 1686# define REALLOC(p,s) realloc ((p), (s)) 1687#endif 1688 1689/* Extend the buffer by twice its current size via realloc and 1690 reset the pointers that pointed into the old block to point to the 1691 correct places in the new one. If extending the buffer results in it 1692 being larger than MAX_BUF_SIZE, then flag memory exhausted. */ 1693#define EXTEND_BUFFER() \ 1694 do { \ 1695 unsigned char *old_buffer = bufp->buffer; \ 1696 if (bufp->allocated == MAX_BUF_SIZE) \ 1697 return REG_ESIZE; \ 1698 bufp->allocated <<= 1; \ 1699 if (bufp->allocated > MAX_BUF_SIZE) \ 1700 bufp->allocated = MAX_BUF_SIZE; \ 1701 bufp->buffer = (unsigned char *) REALLOC (bufp->buffer, bufp->allocated);\ 1702 if (bufp->buffer == NULL) \ 1703 return REG_ESPACE; \ 1704 /* If the buffer moved, move all the pointers into it. */ \ 1705 if (old_buffer != bufp->buffer) \ 1706 { \ 1707 b = (b - old_buffer) + bufp->buffer; \ 1708 begalt = (begalt - old_buffer) + bufp->buffer; \ 1709 if (fixup_alt_jump) \ 1710 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\ 1711 if (laststart) \ 1712 laststart = (laststart - old_buffer) + bufp->buffer; \ 1713 if (pending_exact) \ 1714 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \ 1715 } \ 1716 } while (0) 1717 1718 1719/* Since we have one byte reserved for the register number argument to 1720 {start,stop}_memory, the maximum number of groups we can report 1721 things about is what fits in that byte. */ 1722#define MAX_REGNUM 255 1723 1724/* But patterns can have more than `MAX_REGNUM' registers. We just 1725 ignore the excess. */ 1726typedef unsigned regnum_t; 1727 1728 1729/* Macros for the compile stack. */ 1730 1731/* Since offsets can go either forwards or backwards, this type needs to 1732 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */ 1733/* int may be not enough when sizeof(int) == 2. */ 1734typedef long pattern_offset_t; 1735 1736typedef struct 1737{ 1738 pattern_offset_t begalt_offset; 1739 pattern_offset_t fixup_alt_jump; 1740 pattern_offset_t inner_group_offset; 1741 pattern_offset_t laststart_offset; 1742 regnum_t regnum; 1743} compile_stack_elt_t; 1744 1745 1746typedef struct 1747{ 1748 compile_stack_elt_t *stack; 1749 unsigned size; 1750 unsigned avail; /* Offset of next open position. */ 1751} compile_stack_type; 1752 1753 1754#define INIT_COMPILE_STACK_SIZE 32 1755 1756#define COMPILE_STACK_EMPTY (compile_stack.avail == 0) 1757#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size) 1758 1759/* The next available element. */ 1760#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail]) 1761 1762 1763/* Set the bit for character C in a list. */ 1764#define SET_LIST_BIT(c) \ 1765 (b[((unsigned char) (c)) / BYTEWIDTH] \ 1766 |= 1 << (((unsigned char) c) % BYTEWIDTH)) 1767 1768 1769/* Get the next unsigned number in the uncompiled pattern. */ 1770#define GET_UNSIGNED_NUMBER(num) \ 1771 { if (p != pend) \ 1772 { \ 1773 PATFETCH (c); \ 1774 while ('0' <= c && c <= '9') \ 1775 { \ 1776 if (num < 0) \ 1777 num = 0; \ 1778 num = num * 10 + c - '0'; \ 1779 if (p == pend) \ 1780 break; \ 1781 PATFETCH (c); \ 1782 } \ 1783 } \ 1784 } 1785 1786#if defined _LIBC || WIDE_CHAR_SUPPORT 1787/* The GNU C library provides support for user-defined character classes 1788 and the functions from ISO C amendement 1. */ 1789# ifdef CHARCLASS_NAME_MAX 1790# define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX 1791# else 1792/* This shouldn't happen but some implementation might still have this 1793 problem. Use a reasonable default value. */ 1794# define CHAR_CLASS_MAX_LENGTH 256 1795# endif 1796 1797# ifdef _LIBC 1798# define IS_CHAR_CLASS(string) __wctype (string) 1799# else 1800# define IS_CHAR_CLASS(string) wctype (string) 1801# endif 1802#else 1803# define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */ 1804 1805# define IS_CHAR_CLASS(string) \ 1806 (STREQ (string, "alpha") || STREQ (string, "upper") \ 1807 || STREQ (string, "lower") || STREQ (string, "digit") \ 1808 || STREQ (string, "alnum") || STREQ (string, "xdigit") \ 1809 || STREQ (string, "space") || STREQ (string, "print") \ 1810 || STREQ (string, "punct") || STREQ (string, "graph") \ 1811 || STREQ (string, "cntrl") || STREQ (string, "blank")) 1812#endif 1813 1814#ifndef MATCH_MAY_ALLOCATE 1815 1816/* If we cannot allocate large objects within re_match_2_internal, 1817 we make the fail stack and register vectors global. 1818 The fail stack, we grow to the maximum size when a regexp 1819 is compiled. 1820 The register vectors, we adjust in size each time we 1821 compile a regexp, according to the number of registers it needs. */ 1822 1823static fail_stack_type fail_stack; 1824 1825/* Size with which the following vectors are currently allocated. 1826 That is so we can make them bigger as needed, 1827 but never make them smaller. */ 1828static int regs_allocated_size; 1829 1830static const char ** regstart, ** regend; 1831static const char ** old_regstart, ** old_regend; 1832static const char **best_regstart, **best_regend; 1833static register_info_type *reg_info; 1834static const char **reg_dummy; 1835static register_info_type *reg_info_dummy; 1836 1837/* Make the register vectors big enough for NUM_REGS registers, 1838 but don't make them smaller. */ 1839 1840static 1841regex_grow_registers (num_regs) 1842 int num_regs; 1843{ 1844 if (num_regs > regs_allocated_size) 1845 { 1846 RETALLOC_IF (regstart, num_regs, const char *); 1847 RETALLOC_IF (regend, num_regs, const char *); 1848 RETALLOC_IF (old_regstart, num_regs, const char *); 1849 RETALLOC_IF (old_regend, num_regs, const char *); 1850 RETALLOC_IF (best_regstart, num_regs, const char *); 1851 RETALLOC_IF (best_regend, num_regs, const char *); 1852 RETALLOC_IF (reg_info, num_regs, register_info_type); 1853 RETALLOC_IF (reg_dummy, num_regs, const char *); 1854 RETALLOC_IF (reg_info_dummy, num_regs, register_info_type); 1855 1856 regs_allocated_size = num_regs; 1857 } 1858} 1859 1860#endif /* not MATCH_MAY_ALLOCATE */ 1861 1862static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type 1863 compile_stack, 1864 regnum_t regnum)); 1865 1866/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX. 1867 Returns one of error codes defined in `regex.h', or zero for success. 1868 1869 Assumes the `allocated' (and perhaps `buffer') and `translate' 1870 fields are set in BUFP on entry. 1871 1872 If it succeeds, results are put in BUFP (if it returns an error, the 1873 contents of BUFP are undefined): 1874 `buffer' is the compiled pattern; 1875 `syntax' is set to SYNTAX; 1876 `used' is set to the length of the compiled pattern; 1877 `fastmap_accurate' is zero; 1878 `re_nsub' is the number of subexpressions in PATTERN; 1879 `not_bol' and `not_eol' are zero; 1880 1881 The `fastmap' and `newline_anchor' fields are neither 1882 examined nor set. */ 1883 1884/* Return, freeing storage we allocated. */ 1885#define FREE_STACK_RETURN(value) \ 1886 return (free (compile_stack.stack), value) 1887 1888static reg_errcode_t 1889regex_compile (pattern, size, syntax, bufp) 1890 const char *pattern; 1891 size_t size; 1892 reg_syntax_t syntax; 1893 struct re_pattern_buffer *bufp; 1894{ 1895 /* We fetch characters from PATTERN here. Even though PATTERN is 1896 `char *' (i.e., signed), we declare these variables as unsigned, so 1897 they can be reliably used as array indices. */ 1898 register unsigned char c, c1; 1899 1900 /* A random temporary spot in PATTERN. */ 1901 const char *p1; 1902 1903 /* Points to the end of the buffer, where we should append. */ 1904 register unsigned char *b; 1905 1906 /* Keeps track of unclosed groups. */ 1907 compile_stack_type compile_stack; 1908 1909 /* Points to the current (ending) position in the pattern. */ 1910 const char *p = pattern; 1911 const char *pend = pattern + size; 1912 1913 /* How to translate the characters in the pattern. */ 1914 RE_TRANSLATE_TYPE translate = bufp->translate; 1915 1916 /* Address of the count-byte of the most recently inserted `exactn' 1917 command. This makes it possible to tell if a new exact-match 1918 character can be added to that command or if the character requires 1919 a new `exactn' command. */ 1920 unsigned char *pending_exact = 0; 1921 1922 /* Address of start of the most recently finished expression. 1923 This tells, e.g., postfix * where to find the start of its 1924 operand. Reset at the beginning of groups and alternatives. */ 1925 unsigned char *laststart = 0; 1926 1927 /* Address of beginning of regexp, or inside of last group. */ 1928 unsigned char *begalt; 1929 1930 /* Place in the uncompiled pattern (i.e., the {) to 1931 which to go back if the interval is invalid. */ 1932 const char *beg_interval; 1933 1934 /* Address of the place where a forward jump should go to the end of 1935 the containing expression. Each alternative of an `or' -- except the 1936 last -- ends with a forward jump of this sort. */ 1937 unsigned char *fixup_alt_jump = 0; 1938 1939 /* Counts open-groups as they are encountered. Remembered for the 1940 matching close-group on the compile stack, so the same register 1941 number is put in the stop_memory as the start_memory. */ 1942 regnum_t regnum = 0; 1943 1944#ifdef DEBUG 1945 DEBUG_PRINT1 ("\nCompiling pattern: "); 1946 if (debug) 1947 { 1948 unsigned debug_count; 1949 1950 for (debug_count = 0; debug_count < size; debug_count++) 1951 putchar (pattern[debug_count]); 1952 putchar ('\n'); 1953 } 1954#endif /* DEBUG */ 1955 1956 /* Initialize the compile stack. */ 1957 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t); 1958 if (compile_stack.stack == NULL) 1959 return REG_ESPACE; 1960 1961 compile_stack.size = INIT_COMPILE_STACK_SIZE; 1962 compile_stack.avail = 0; 1963 1964 /* Initialize the pattern buffer. */ 1965 bufp->syntax = syntax; 1966 bufp->fastmap_accurate = 0; 1967 bufp->not_bol = bufp->not_eol = 0; 1968 1969 /* Set `used' to zero, so that if we return an error, the pattern 1970 printer (for debugging) will think there's no pattern. We reset it 1971 at the end. */ 1972 bufp->used = 0; 1973 1974 /* Always count groups, whether or not bufp->no_sub is set. */ 1975 bufp->re_nsub = 0; 1976 1977#if !defined emacs && !defined SYNTAX_TABLE 1978 /* Initialize the syntax table. */ 1979 init_syntax_once (); 1980#endif 1981 1982 if (bufp->allocated == 0) 1983 { 1984 if (bufp->buffer) 1985 { /* If zero allocated, but buffer is non-null, try to realloc 1986 enough space. This loses if buffer's address is bogus, but 1987 that is the user's responsibility. */ 1988 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char); 1989 } 1990 else 1991 { /* Caller did not allocate a buffer. Do it for them. */ 1992 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char); 1993 } 1994 if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE); 1995 1996 bufp->allocated = INIT_BUF_SIZE; 1997 } 1998 1999 begalt = b = bufp->buffer; 2000 2001 /* Loop through the uncompiled pattern until we're at the end. */ 2002 while (p != pend) 2003 { 2004 PATFETCH (c); 2005 2006 switch (c) 2007 { 2008 case '^': 2009 { 2010 if ( /* If at start of pattern, it's an operator. */ 2011 p == pattern + 1 2012 /* If context independent, it's an operator. */ 2013 || syntax & RE_CONTEXT_INDEP_ANCHORS 2014 /* Otherwise, depends on what's come before. */ 2015 || at_begline_loc_p (pattern, p, syntax)) 2016 BUF_PUSH (begline); 2017 else 2018 goto normal_char; 2019 } 2020 break; 2021 2022 2023 case '$': 2024 { 2025 if ( /* If at end of pattern, it's an operator. */ 2026 p == pend 2027 /* If context independent, it's an operator. */ 2028 || syntax & RE_CONTEXT_INDEP_ANCHORS 2029 /* Otherwise, depends on what's next. */ 2030 || at_endline_loc_p (p, pend, syntax)) 2031 BUF_PUSH (endline); 2032 else 2033 goto normal_char; 2034 } 2035 break; 2036 2037 2038 case '+': 2039 case '?': 2040 if ((syntax & RE_BK_PLUS_QM) 2041 || (syntax & RE_LIMITED_OPS)) 2042 goto normal_char; 2043 handle_plus: 2044 case '*': 2045 /* If there is no previous pattern... */ 2046 if (!laststart) 2047 { 2048 if (syntax & RE_CONTEXT_INVALID_OPS) 2049 FREE_STACK_RETURN (REG_BADRPT); 2050 else if (!(syntax & RE_CONTEXT_INDEP_OPS)) 2051 goto normal_char; 2052 } 2053 2054 { 2055 /* Are we optimizing this jump? */ 2056 boolean keep_string_p = false; 2057 2058 /* 1 means zero (many) matches is allowed. */ 2059 char zero_times_ok = 0, many_times_ok = 0; 2060 2061 /* If there is a sequence of repetition chars, collapse it 2062 down to just one (the right one). We can't combine 2063 interval operators with these because of, e.g., `a{2}*', 2064 which should only match an even number of `a's. */ 2065 2066 for (;;) 2067 { 2068 zero_times_ok |= c != '+'; 2069 many_times_ok |= c != '?'; 2070 2071 if (p == pend) 2072 break; 2073 2074 PATFETCH (c); 2075 2076 if (c == '*' 2077 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?'))) 2078 ; 2079 2080 else if (syntax & RE_BK_PLUS_QM && c == '\\') 2081 { 2082 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); 2083 2084 PATFETCH (c1); 2085 if (!(c1 == '+' || c1 == '?')) 2086 { 2087 PATUNFETCH; 2088 PATUNFETCH; 2089 break; 2090 } 2091 2092 c = c1; 2093 } 2094 else 2095 { 2096 PATUNFETCH; 2097 break; 2098 } 2099 2100 /* If we get here, we found another repeat character. */ 2101 } 2102 2103 /* Star, etc. applied to an empty pattern is equivalent 2104 to an empty pattern. */ 2105 if (!laststart) 2106 break; 2107 2108 /* Now we know whether or not zero matches is allowed 2109 and also whether or not two or more matches is allowed. */ 2110 if (many_times_ok) 2111 { /* More than one repetition is allowed, so put in at the 2112 end a backward relative jump from `b' to before the next 2113 jump we're going to put in below (which jumps from 2114 laststart to after this jump). 2115 2116 But if we are at the `*' in the exact sequence `.*\n', 2117 insert an unconditional jump backwards to the ., 2118 instead of the beginning of the loop. This way we only 2119 push a failure point once, instead of every time 2120 through the loop. */ 2121 assert (p - 1 > pattern); 2122 2123 /* Allocate the space for the jump. */ 2124 GET_BUFFER_SPACE (3); 2125 2126 /* We know we are not at the first character of the pattern, 2127 because laststart was nonzero. And we've already 2128 incremented `p', by the way, to be the character after 2129 the `*'. Do we have to do something analogous here 2130 for null bytes, because of RE_DOT_NOT_NULL? */ 2131 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.') 2132 && zero_times_ok 2133 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n') 2134 && !(syntax & RE_DOT_NEWLINE)) 2135 { /* We have .*\n. */ 2136 STORE_JUMP (jump, b, laststart); 2137 keep_string_p = true; 2138 } 2139 else 2140 /* Anything else. */ 2141 STORE_JUMP (maybe_pop_jump, b, laststart - 3); 2142 2143 /* We've added more stuff to the buffer. */ 2144 b += 3; 2145 } 2146 2147 /* On failure, jump from laststart to b + 3, which will be the 2148 end of the buffer after this jump is inserted. */ 2149 GET_BUFFER_SPACE (3); 2150 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump 2151 : on_failure_jump, 2152 laststart, b + 3); 2153 pending_exact = 0; 2154 b += 3; 2155 2156 if (!zero_times_ok) 2157 { 2158 /* At least one repetition is required, so insert a 2159 `dummy_failure_jump' before the initial 2160 `on_failure_jump' instruction of the loop. This 2161 effects a skip over that instruction the first time 2162 we hit that loop. */ 2163 GET_BUFFER_SPACE (3); 2164 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6); 2165 b += 3; 2166 } 2167 } 2168 break; 2169 2170 2171 case '.': 2172 laststart = b; 2173 BUF_PUSH (anychar); 2174 break; 2175 2176 2177 case '[': 2178 { 2179 boolean had_char_class = false; 2180 2181 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 2182 2183 /* Ensure that we have enough space to push a charset: the 2184 opcode, the length count, and the bitset; 34 bytes in all. */ 2185 GET_BUFFER_SPACE (34); 2186 2187 laststart = b; 2188 2189 /* We test `*p == '^' twice, instead of using an if 2190 statement, so we only need one BUF_PUSH. */ 2191 BUF_PUSH (*p == '^' ? charset_not : charset); 2192 if (*p == '^') 2193 p++; 2194 2195 /* Remember the first position in the bracket expression. */ 2196 p1 = p; 2197 2198 /* Push the number of bytes in the bitmap. */ 2199 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH); 2200 2201 /* Clear the whole map. */ 2202 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH); 2203 2204 /* charset_not matches newline according to a syntax bit. */ 2205 if ((re_opcode_t) b[-2] == charset_not 2206 && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) 2207 SET_LIST_BIT ('\n'); 2208 2209 /* Read in characters and ranges, setting map bits. */ 2210 for (;;) 2211 { 2212 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 2213 2214 PATFETCH (c); 2215 2216 /* \ might escape characters inside [...] and [^...]. */ 2217 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') 2218 { 2219 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); 2220 2221 PATFETCH (c1); 2222 SET_LIST_BIT (c1); 2223 continue; 2224 } 2225 2226 /* Could be the end of the bracket expression. If it's 2227 not (i.e., when the bracket expression is `[]' so 2228 far), the ']' character bit gets set way below. */ 2229 if (c == ']' && p != p1 + 1) 2230 break; 2231 2232 /* Look ahead to see if it's a range when the last thing 2233 was a character class. */ 2234 if (had_char_class && c == '-' && *p != ']') 2235 FREE_STACK_RETURN (REG_ERANGE); 2236 2237 /* Look ahead to see if it's a range when the last thing 2238 was a character: if this is a hyphen not at the 2239 beginning or the end of a list, then it's the range 2240 operator. */ 2241 if (c == '-' 2242 && !(p - 2 >= pattern && p[-2] == '[') 2243 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') 2244 && *p != ']') 2245 { 2246 reg_errcode_t ret 2247 = compile_range (&p, pend, translate, syntax, b); 2248 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); 2249 } 2250 2251 else if (p[0] == '-' && p[1] != ']') 2252 { /* This handles ranges made up of characters only. */ 2253 reg_errcode_t ret; 2254 2255 /* Move past the `-'. */ 2256 PATFETCH (c1); 2257 2258 ret = compile_range (&p, pend, translate, syntax, b); 2259 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); 2260 } 2261 2262 /* See if we're at the beginning of a possible character 2263 class. */ 2264 2265 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') 2266 { /* Leave room for the null. */ 2267 char str[CHAR_CLASS_MAX_LENGTH + 1]; 2268 2269 PATFETCH (c); 2270 c1 = 0; 2271 2272 /* If pattern is `[[:'. */ 2273 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 2274 2275 for (;;) 2276 { 2277 PATFETCH (c); 2278 if ((c == ':' && *p == ']') || p == pend) 2279 break; 2280 if (c1 < CHAR_CLASS_MAX_LENGTH) 2281 str[c1++] = c; 2282 else 2283 /* This is in any case an invalid class name. */ 2284 str[0] = '\0'; 2285 } 2286 str[c1] = '\0'; 2287 2288 /* If isn't a word bracketed by `[:' and `:]': 2289 undo the ending character, the letters, and leave 2290 the leading `:' and `[' (but set bits for them). */ 2291 if (c == ':' && *p == ']') 2292 { 2293#if defined _LIBC || WIDE_CHAR_SUPPORT 2294 boolean is_lower = STREQ (str, "lower"); 2295 boolean is_upper = STREQ (str, "upper"); 2296 wctype_t wt; 2297 int ch; 2298 2299 wt = IS_CHAR_CLASS (str); 2300 if (wt == 0) 2301 FREE_STACK_RETURN (REG_ECTYPE); 2302 2303 /* Throw away the ] at the end of the character 2304 class. */ 2305 PATFETCH (c); 2306 2307 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 2308 2309 for (ch = 0; ch < 1 << BYTEWIDTH; ++ch) 2310 { 2311# ifdef _LIBC 2312 if (__iswctype (__btowc (ch), wt)) 2313 SET_LIST_BIT (ch); 2314# else 2315 if (iswctype (btowc (ch), wt)) 2316 SET_LIST_BIT (ch); 2317# endif 2318 2319 if (translate && (is_upper || is_lower) 2320 && (ISUPPER (ch) || ISLOWER (ch))) 2321 SET_LIST_BIT (ch); 2322 } 2323 2324 had_char_class = true; 2325#else 2326 int ch; 2327 boolean is_alnum = STREQ (str, "alnum"); 2328 boolean is_alpha = STREQ (str, "alpha"); 2329 boolean is_blank = STREQ (str, "blank"); 2330 boolean is_cntrl = STREQ (str, "cntrl"); 2331 boolean is_digit = STREQ (str, "digit"); 2332 boolean is_graph = STREQ (str, "graph"); 2333 boolean is_lower = STREQ (str, "lower"); 2334 boolean is_print = STREQ (str, "print"); 2335 boolean is_punct = STREQ (str, "punct"); 2336 boolean is_space = STREQ (str, "space"); 2337 boolean is_upper = STREQ (str, "upper"); 2338 boolean is_xdigit = STREQ (str, "xdigit"); 2339 2340 if (!IS_CHAR_CLASS (str)) 2341 FREE_STACK_RETURN (REG_ECTYPE); 2342 2343 /* Throw away the ] at the end of the character 2344 class. */ 2345 PATFETCH (c); 2346 2347 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 2348 2349 for (ch = 0; ch < 1 << BYTEWIDTH; ch++) 2350 { 2351 /* This was split into 3 if's to 2352 avoid an arbitrary limit in some compiler. */ 2353 if ( (is_alnum && ISALNUM (ch)) 2354 || (is_alpha && ISALPHA (ch)) 2355 || (is_blank && ISBLANK (ch)) 2356 || (is_cntrl && ISCNTRL (ch))) 2357 SET_LIST_BIT (ch); 2358 if ( (is_digit && ISDIGIT (ch)) 2359 || (is_graph && ISGRAPH (ch)) 2360 || (is_lower && ISLOWER (ch)) 2361 || (is_print && ISPRINT (ch))) 2362 SET_LIST_BIT (ch); 2363 if ( (is_punct && ISPUNCT (ch)) 2364 || (is_space && ISSPACE (ch)) 2365 || (is_upper && ISUPPER (ch)) 2366 || (is_xdigit && ISXDIGIT (ch))) 2367 SET_LIST_BIT (ch); 2368 if ( translate && (is_upper || is_lower) 2369 && (ISUPPER (ch) || ISLOWER (ch))) 2370 SET_LIST_BIT (ch); 2371 } 2372 had_char_class = true; 2373#endif /* libc || wctype.h */ 2374 } 2375 else 2376 { 2377 c1++; 2378 while (c1--) 2379 PATUNFETCH; 2380 SET_LIST_BIT ('['); 2381 SET_LIST_BIT (':'); 2382 had_char_class = false; 2383 } 2384 } 2385 else 2386 { 2387 had_char_class = false; 2388 SET_LIST_BIT (c); 2389 } 2390 } 2391 2392 /* Discard any (non)matching list bytes that are all 0 at the 2393 end of the map. Decrease the map-length byte too. */ 2394 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0) 2395 b[-1]--; 2396 b += b[-1]; 2397 } 2398 break; 2399 2400 2401 case '(': 2402 if (syntax & RE_NO_BK_PARENS) 2403 goto handle_open; 2404 else 2405 goto normal_char; 2406 2407 2408 case ')': 2409 if (syntax & RE_NO_BK_PARENS) 2410 goto handle_close; 2411 else 2412 goto normal_char; 2413 2414 2415 case '\n': 2416 if (syntax & RE_NEWLINE_ALT) 2417 goto handle_alt; 2418 else 2419 goto normal_char; 2420 2421 2422 case '|': 2423 if (syntax & RE_NO_BK_VBAR) 2424 goto handle_alt; 2425 else 2426 goto normal_char; 2427 2428 2429 case '{': 2430 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES) 2431 goto handle_interval; 2432 else 2433 goto normal_char; 2434 2435 2436 case '\\': 2437 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); 2438 2439 /* Do not translate the character after the \, so that we can 2440 distinguish, e.g., \B from \b, even if we normally would 2441 translate, e.g., B to b. */ 2442 PATFETCH_RAW (c); 2443 2444 switch (c) 2445 { 2446 case '(': 2447 if (syntax & RE_NO_BK_PARENS) 2448 goto normal_backslash; 2449 2450 handle_open: 2451 bufp->re_nsub++; 2452 regnum++; 2453 2454 if (COMPILE_STACK_FULL) 2455 { 2456 RETALLOC (compile_stack.stack, compile_stack.size << 1, 2457 compile_stack_elt_t); 2458 if (compile_stack.stack == NULL) return REG_ESPACE; 2459 2460 compile_stack.size <<= 1; 2461 } 2462 2463 /* These are the values to restore when we hit end of this 2464 group. They are all relative offsets, so that if the 2465 whole pattern moves because of realloc, they will still 2466 be valid. */ 2467 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer; 2468 COMPILE_STACK_TOP.fixup_alt_jump 2469 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0; 2470 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer; 2471 COMPILE_STACK_TOP.regnum = regnum; 2472 2473 /* We will eventually replace the 0 with the number of 2474 groups inner to this one. But do not push a 2475 start_memory for groups beyond the last one we can 2476 represent in the compiled pattern. */ 2477 if (regnum <= MAX_REGNUM) 2478 { 2479 COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2; 2480 BUF_PUSH_3 (start_memory, regnum, 0); 2481 } 2482 2483 compile_stack.avail++; 2484 2485 fixup_alt_jump = 0; 2486 laststart = 0; 2487 begalt = b; 2488 /* If we've reached MAX_REGNUM groups, then this open 2489 won't actually generate any code, so we'll have to 2490 clear pending_exact explicitly. */ 2491 pending_exact = 0; 2492 break; 2493 2494 2495 case ')': 2496 if (syntax & RE_NO_BK_PARENS) goto normal_backslash; 2497 2498 if (COMPILE_STACK_EMPTY) 2499 { 2500 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) 2501 goto normal_backslash; 2502 else 2503 FREE_STACK_RETURN (REG_ERPAREN); 2504 } 2505 2506 handle_close: 2507 if (fixup_alt_jump) 2508 { /* Push a dummy failure point at the end of the 2509 alternative for a possible future 2510 `pop_failure_jump' to pop. See comments at 2511 `push_dummy_failure' in `re_match_2'. */ 2512 BUF_PUSH (push_dummy_failure); 2513 2514 /* We allocated space for this jump when we assigned 2515 to `fixup_alt_jump', in the `handle_alt' case below. */ 2516 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1); 2517 } 2518 2519 /* See similar code for backslashed left paren above. */ 2520 if (COMPILE_STACK_EMPTY) 2521 { 2522 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) 2523 goto normal_char; 2524 else 2525 FREE_STACK_RETURN (REG_ERPAREN); 2526 } 2527 2528 /* Since we just checked for an empty stack above, this 2529 ``can't happen''. */ 2530 assert (compile_stack.avail != 0); 2531 { 2532 /* We don't just want to restore into `regnum', because 2533 later groups should continue to be numbered higher, 2534 as in `(ab)c(de)' -- the second group is #2. */ 2535 regnum_t this_group_regnum; 2536 2537 compile_stack.avail--; 2538 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset; 2539 fixup_alt_jump 2540 = COMPILE_STACK_TOP.fixup_alt_jump 2541 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1 2542 : 0; 2543 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset; 2544 this_group_regnum = COMPILE_STACK_TOP.regnum; 2545 /* If we've reached MAX_REGNUM groups, then this open 2546 won't actually generate any code, so we'll have to 2547 clear pending_exact explicitly. */ 2548 pending_exact = 0; 2549 2550 /* We're at the end of the group, so now we know how many 2551 groups were inside this one. */ 2552 if (this_group_regnum <= MAX_REGNUM) 2553 { 2554 unsigned char *inner_group_loc 2555 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset; 2556 2557 *inner_group_loc = regnum - this_group_regnum; 2558 BUF_PUSH_3 (stop_memory, this_group_regnum, 2559 regnum - this_group_regnum); 2560 } 2561 } 2562 break; 2563 2564 2565 case '|': /* `\|'. */ 2566 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR) 2567 goto normal_backslash; 2568 handle_alt: 2569 if (syntax & RE_LIMITED_OPS) 2570 goto normal_char; 2571 2572 /* Insert before the previous alternative a jump which 2573 jumps to this alternative if the former fails. */ 2574 GET_BUFFER_SPACE (3); 2575 INSERT_JUMP (on_failure_jump, begalt, b + 6); 2576 pending_exact = 0; 2577 b += 3; 2578 2579 /* The alternative before this one has a jump after it 2580 which gets executed if it gets matched. Adjust that 2581 jump so it will jump to this alternative's analogous 2582 jump (put in below, which in turn will jump to the next 2583 (if any) alternative's such jump, etc.). The last such 2584 jump jumps to the correct final destination. A picture: 2585 _____ _____ 2586 | | | | 2587 | v | v 2588 a | b | c 2589 2590 If we are at `b', then fixup_alt_jump right now points to a 2591 three-byte space after `a'. We'll put in the jump, set 2592 fixup_alt_jump to right after `b', and leave behind three 2593 bytes which we'll fill in when we get to after `c'. */ 2594 2595 if (fixup_alt_jump) 2596 STORE_JUMP (jump_past_alt, fixup_alt_jump, b); 2597 2598 /* Mark and leave space for a jump after this alternative, 2599 to be filled in later either by next alternative or 2600 when know we're at the end of a series of alternatives. */ 2601 fixup_alt_jump = b; 2602 GET_BUFFER_SPACE (3); 2603 b += 3; 2604 2605 laststart = 0; 2606 begalt = b; 2607 break; 2608 2609 2610 case '{': 2611 /* If \{ is a literal. */ 2612 if (!(syntax & RE_INTERVALS) 2613 /* If we're at `\{' and it's not the open-interval 2614 operator. */ 2615 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) 2616 || (p - 2 == pattern && p == pend)) 2617 goto normal_backslash; 2618 2619 handle_interval: 2620 { 2621 /* If got here, then the syntax allows intervals. */ 2622 2623 /* At least (most) this many matches must be made. */ 2624 int lower_bound = -1, upper_bound = -1; 2625 2626 beg_interval = p - 1; 2627 2628 if (p == pend) 2629 { 2630 if (syntax & RE_NO_BK_BRACES) 2631 goto unfetch_interval; 2632 else 2633 FREE_STACK_RETURN (REG_EBRACE); 2634 } 2635 2636 GET_UNSIGNED_NUMBER (lower_bound); 2637 2638 if (c == ',') 2639 { 2640 GET_UNSIGNED_NUMBER (upper_bound); 2641 if (upper_bound < 0) upper_bound = RE_DUP_MAX; 2642 } 2643 else 2644 /* Interval such as `{1}' => match exactly once. */ 2645 upper_bound = lower_bound; 2646 2647 if (lower_bound < 0 || upper_bound > RE_DUP_MAX 2648 || lower_bound > upper_bound) 2649 { 2650 if (syntax & RE_NO_BK_BRACES) 2651 goto unfetch_interval; 2652 else 2653 FREE_STACK_RETURN (REG_BADBR); 2654 } 2655 2656 if (!(syntax & RE_NO_BK_BRACES)) 2657 { 2658 if (c != '\\') FREE_STACK_RETURN (REG_EBRACE); 2659 2660 PATFETCH (c); 2661 } 2662 2663 if (c != '}') 2664 { 2665 if (syntax & RE_NO_BK_BRACES) 2666 goto unfetch_interval; 2667 else 2668 FREE_STACK_RETURN (REG_BADBR); 2669 } 2670 2671 /* We just parsed a valid interval. */ 2672 2673 /* If it's invalid to have no preceding re. */ 2674 if (!laststart) 2675 { 2676 if (syntax & RE_CONTEXT_INVALID_OPS) 2677 FREE_STACK_RETURN (REG_BADRPT); 2678 else if (syntax & RE_CONTEXT_INDEP_OPS) 2679 laststart = b; 2680 else 2681 goto unfetch_interval; 2682 } 2683 2684 /* If the upper bound is zero, don't want to succeed at 2685 all; jump from `laststart' to `b + 3', which will be 2686 the end of the buffer after we insert the jump. */ 2687 if (upper_bound == 0) 2688 { 2689 GET_BUFFER_SPACE (3); 2690 INSERT_JUMP (jump, laststart, b + 3); 2691 b += 3; 2692 } 2693 2694 /* Otherwise, we have a nontrivial interval. When 2695 we're all done, the pattern will look like: 2696 set_number_at <jump count> <upper bound> 2697 set_number_at <succeed_n count> <lower bound> 2698 succeed_n <after jump addr> <succeed_n count> 2699 <body of loop> 2700 jump_n <succeed_n addr> <jump count> 2701 (The upper bound and `jump_n' are omitted if 2702 `upper_bound' is 1, though.) */ 2703 else 2704 { /* If the upper bound is > 1, we need to insert 2705 more at the end of the loop. */ 2706 unsigned nbytes = 10 + (upper_bound > 1) * 10; 2707 2708 GET_BUFFER_SPACE (nbytes); 2709 2710 /* Initialize lower bound of the `succeed_n', even 2711 though it will be set during matching by its 2712 attendant `set_number_at' (inserted next), 2713 because `re_compile_fastmap' needs to know. 2714 Jump to the `jump_n' we might insert below. */ 2715 INSERT_JUMP2 (succeed_n, laststart, 2716 b + 5 + (upper_bound > 1) * 5, 2717 lower_bound); 2718 b += 5; 2719 2720 /* Code to initialize the lower bound. Insert 2721 before the `succeed_n'. The `5' is the last two 2722 bytes of this `set_number_at', plus 3 bytes of 2723 the following `succeed_n'. */ 2724 insert_op2 (set_number_at, laststart, 5, lower_bound, b); 2725 b += 5; 2726 2727 if (upper_bound > 1) 2728 { /* More than one repetition is allowed, so 2729 append a backward jump to the `succeed_n' 2730 that starts this interval. 2731 2732 When we've reached this during matching, 2733 we'll have matched the interval once, so 2734 jump back only `upper_bound - 1' times. */ 2735 STORE_JUMP2 (jump_n, b, laststart + 5, 2736 upper_bound - 1); 2737 b += 5; 2738 2739 /* The location we want to set is the second 2740 parameter of the `jump_n'; that is `b-2' as 2741 an absolute address. `laststart' will be 2742 the `set_number_at' we're about to insert; 2743 `laststart+3' the number to set, the source 2744 for the relative address. But we are 2745 inserting into the middle of the pattern -- 2746 so everything is getting moved up by 5. 2747 Conclusion: (b - 2) - (laststart + 3) + 5, 2748 i.e., b - laststart. 2749 2750 We insert this at the beginning of the loop 2751 so that if we fail during matching, we'll 2752 reinitialize the bounds. */ 2753 insert_op2 (set_number_at, laststart, b - laststart, 2754 upper_bound - 1, b); 2755 b += 5; 2756 } 2757 } 2758 pending_exact = 0; 2759 beg_interval = NULL; 2760 } 2761 break; 2762 2763 unfetch_interval: 2764 /* If an invalid interval, match the characters as literals. */ 2765 assert (beg_interval); 2766 p = beg_interval; 2767 beg_interval = NULL; 2768 2769 /* normal_char and normal_backslash need `c'. */ 2770 PATFETCH (c); 2771 2772 if (!(syntax & RE_NO_BK_BRACES)) 2773 { 2774 if (p > pattern && p[-1] == '\\') 2775 goto normal_backslash; 2776 } 2777 goto normal_char; 2778 2779#ifdef emacs 2780 /* There is no way to specify the before_dot and after_dot 2781 operators. rms says this is ok. --karl */ 2782 case '=': 2783 BUF_PUSH (at_dot); 2784 break; 2785 2786 case 's': 2787 laststart = b; 2788 PATFETCH (c); 2789 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]); 2790 break; 2791 2792 case 'S': 2793 laststart = b; 2794 PATFETCH (c); 2795 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]); 2796 break; 2797#endif /* emacs */ 2798 2799 2800 case 'w': 2801 if (syntax & RE_NO_GNU_OPS) 2802 goto normal_char; 2803 laststart = b; 2804 BUF_PUSH (wordchar); 2805 break; 2806 2807 2808 case 'W': 2809 if (syntax & RE_NO_GNU_OPS) 2810 goto normal_char; 2811 laststart = b; 2812 BUF_PUSH (notwordchar); 2813 break; 2814 2815 2816 case '<': 2817 if (syntax & RE_NO_GNU_OPS) 2818 goto normal_char; 2819 BUF_PUSH (wordbeg); 2820 break; 2821 2822 case '>': 2823 if (syntax & RE_NO_GNU_OPS) 2824 goto normal_char; 2825 BUF_PUSH (wordend); 2826 break; 2827 2828 case 'b': 2829 if (syntax & RE_NO_GNU_OPS) 2830 goto normal_char; 2831 BUF_PUSH (wordbound); 2832 break; 2833 2834 case 'B': 2835 if (syntax & RE_NO_GNU_OPS) 2836 goto normal_char; 2837 BUF_PUSH (notwordbound); 2838 break; 2839 2840 case '`': 2841 if (syntax & RE_NO_GNU_OPS) 2842 goto normal_char; 2843 BUF_PUSH (begbuf); 2844 break; 2845 2846 case '\'': 2847 if (syntax & RE_NO_GNU_OPS) 2848 goto normal_char; 2849 BUF_PUSH (endbuf); 2850 break; 2851 2852 case '1': case '2': case '3': case '4': case '5': 2853 case '6': case '7': case '8': case '9': 2854 if (syntax & RE_NO_BK_REFS) 2855 goto normal_char; 2856 2857 c1 = c - '0'; 2858 2859 if (c1 > regnum) 2860 FREE_STACK_RETURN (REG_ESUBREG); 2861 2862 /* Can't back reference to a subexpression if inside of it. */ 2863 if (group_in_compile_stack (compile_stack, (regnum_t) c1)) 2864 goto normal_char; 2865 2866 laststart = b; 2867 BUF_PUSH_2 (duplicate, c1); 2868 break; 2869 2870 2871 case '+': 2872 case '?': 2873 if (syntax & RE_BK_PLUS_QM) 2874 goto handle_plus; 2875 else 2876 goto normal_backslash; 2877 2878 default: 2879 normal_backslash: 2880 /* You might think it would be useful for \ to mean 2881 not to translate; but if we don't translate it 2882 it will never match anything. */ 2883 c = TRANSLATE (c); 2884 goto normal_char; 2885 } 2886 break; 2887 2888 2889 default: 2890 /* Expects the character in `c'. */ 2891 normal_char: 2892 /* If no exactn currently being built. */ 2893 if (!pending_exact 2894 2895 /* If last exactn not at current position. */ 2896 || pending_exact + *pending_exact + 1 != b 2897 2898 /* We have only one byte following the exactn for the count. */ 2899 || *pending_exact == (1 << BYTEWIDTH) - 1 2900 2901 /* If followed by a repetition operator. */ 2902 || *p == '*' || *p == '^' 2903 || ((syntax & RE_BK_PLUS_QM) 2904 ? *p == '\\' && (p[1] == '+' || p[1] == '?') 2905 : (*p == '+' || *p == '?')) 2906 || ((syntax & RE_INTERVALS) 2907 && ((syntax & RE_NO_BK_BRACES) 2908 ? *p == '{' 2909 : (p[0] == '\\' && p[1] == '{')))) 2910 { 2911 /* Start building a new exactn. */ 2912 2913 laststart = b; 2914 2915 BUF_PUSH_2 (exactn, 0); 2916 pending_exact = b - 1; 2917 } 2918 2919 BUF_PUSH (c); 2920 (*pending_exact)++; 2921 break; 2922 } /* switch (c) */ 2923 } /* while p != pend */ 2924 2925 2926 /* Through the pattern now. */ 2927 2928 if (fixup_alt_jump) 2929 STORE_JUMP (jump_past_alt, fixup_alt_jump, b); 2930 2931 if (!COMPILE_STACK_EMPTY) 2932 FREE_STACK_RETURN (REG_EPAREN); 2933 2934 /* If we don't want backtracking, force success 2935 the first time we reach the end of the compiled pattern. */ 2936 if (syntax & RE_NO_POSIX_BACKTRACKING) 2937 BUF_PUSH (succeed); 2938 2939 free (compile_stack.stack); 2940 2941 /* We have succeeded; set the length of the buffer. */ 2942 bufp->used = b - bufp->buffer; 2943 2944#ifdef DEBUG 2945 if (debug) 2946 { 2947 DEBUG_PRINT1 ("\nCompiled pattern: \n"); 2948 print_compiled_pattern (bufp); 2949 } 2950#endif /* DEBUG */ 2951 2952#ifndef MATCH_MAY_ALLOCATE 2953 /* Initialize the failure stack to the largest possible stack. This 2954 isn't necessary unless we're trying to avoid calling alloca in 2955 the search and match routines. */ 2956 { 2957 int num_regs = bufp->re_nsub + 1; 2958 2959 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size 2960 is strictly greater than re_max_failures, the largest possible stack 2961 is 2 * re_max_failures failure points. */ 2962 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS)) 2963 { 2964 fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS); 2965 2966# ifdef emacs 2967 if (! fail_stack.stack) 2968 fail_stack.stack 2969 = (fail_stack_elt_t *) xmalloc (fail_stack.size 2970 * sizeof (fail_stack_elt_t)); 2971 else 2972 fail_stack.stack 2973 = (fail_stack_elt_t *) xrealloc (fail_stack.stack, 2974 (fail_stack.size 2975 * sizeof (fail_stack_elt_t))); 2976# else /* not emacs */ 2977 if (! fail_stack.stack) 2978 fail_stack.stack 2979 = (fail_stack_elt_t *) malloc (fail_stack.size 2980 * sizeof (fail_stack_elt_t)); 2981 else 2982 fail_stack.stack 2983 = (fail_stack_elt_t *) realloc (fail_stack.stack, 2984 (fail_stack.size 2985 * sizeof (fail_stack_elt_t))); 2986# endif /* not emacs */ 2987 } 2988 2989 regex_grow_registers (num_regs); 2990 } 2991#endif /* not MATCH_MAY_ALLOCATE */ 2992 2993 return REG_NOERROR; 2994} /* regex_compile */ 2995 2996/* Subroutines for `regex_compile'. */ 2997 2998/* Store OP at LOC followed by two-byte integer parameter ARG. */ 2999 3000static void 3001store_op1 (op, loc, arg) 3002 re_opcode_t op; 3003 unsigned char *loc; 3004 int arg; 3005{ 3006 *loc = (unsigned char) op; 3007 STORE_NUMBER (loc + 1, arg); 3008} 3009 3010 3011/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */ 3012 3013static void 3014store_op2 (op, loc, arg1, arg2) 3015 re_opcode_t op; 3016 unsigned char *loc; 3017 int arg1, arg2; 3018{ 3019 *loc = (unsigned char) op; 3020 STORE_NUMBER (loc + 1, arg1); 3021 STORE_NUMBER (loc + 3, arg2); 3022} 3023 3024 3025/* Copy the bytes from LOC to END to open up three bytes of space at LOC 3026 for OP followed by two-byte integer parameter ARG. */ 3027 3028static void 3029insert_op1 (op, loc, arg, end) 3030 re_opcode_t op; 3031 unsigned char *loc; 3032 int arg; 3033 unsigned char *end; 3034{ 3035 register unsigned char *pfrom = end; 3036 register unsigned char *pto = end + 3; 3037 3038 while (pfrom != loc) 3039 *--pto = *--pfrom; 3040 3041 store_op1 (op, loc, arg); 3042} 3043 3044 3045/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */ 3046 3047static void 3048insert_op2 (op, loc, arg1, arg2, end) 3049 re_opcode_t op; 3050 unsigned char *loc; 3051 int arg1, arg2; 3052 unsigned char *end; 3053{ 3054 register unsigned char *pfrom = end; 3055 register unsigned char *pto = end + 5; 3056 3057 while (pfrom != loc) 3058 *--pto = *--pfrom; 3059 3060 store_op2 (op, loc, arg1, arg2); 3061} 3062 3063 3064/* P points to just after a ^ in PATTERN. Return true if that ^ comes 3065 after an alternative or a begin-subexpression. We assume there is at 3066 least one character before the ^. */ 3067 3068static boolean 3069at_begline_loc_p (pattern, p, syntax) 3070 const char *pattern, *p; 3071 reg_syntax_t syntax; 3072{ 3073 const char *prev = p - 2; 3074 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\'; 3075 3076 return 3077 /* After a subexpression? */ 3078 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash)) 3079 /* After an alternative? */ 3080 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash)); 3081} 3082 3083 3084/* The dual of at_begline_loc_p. This one is for $. We assume there is 3085 at least one character after the $, i.e., `P < PEND'. */ 3086 3087static boolean 3088at_endline_loc_p (p, pend, syntax) 3089 const char *p, *pend; 3090 reg_syntax_t syntax; 3091{ 3092 const char *next = p; 3093 boolean next_backslash = *next == '\\'; 3094 const char *next_next = p + 1 < pend ? p + 1 : 0; 3095 3096 return 3097 /* Before a subexpression? */ 3098 (syntax & RE_NO_BK_PARENS ? *next == ')' 3099 : next_backslash && next_next && *next_next == ')') 3100 /* Before an alternative? */ 3101 || (syntax & RE_NO_BK_VBAR ? *next == '|' 3102 : next_backslash && next_next && *next_next == '|'); 3103} 3104 3105 3106/* Returns true if REGNUM is in one of COMPILE_STACK's elements and 3107 false if it's not. */ 3108 3109static boolean 3110group_in_compile_stack (compile_stack, regnum) 3111 compile_stack_type compile_stack; 3112 regnum_t regnum; 3113{ 3114 int this_element; 3115 3116 for (this_element = compile_stack.avail - 1; 3117 this_element >= 0; 3118 this_element--) 3119 if (compile_stack.stack[this_element].regnum == regnum) 3120 return true; 3121 3122 return false; 3123} 3124 3125 3126/* Read the ending character of a range (in a bracket expression) from the 3127 uncompiled pattern *P_PTR (which ends at PEND). We assume the 3128 starting character is in `P[-2]'. (`P[-1]' is the character `-'.) 3129 Then we set the translation of all bits between the starting and 3130 ending characters (inclusive) in the compiled pattern B. 3131 3132 Return an error code. 3133 3134 We use these short variable names so we can use the same macros as 3135 `regex_compile' itself. */ 3136 3137static reg_errcode_t 3138compile_range (p_ptr, pend, translate, syntax, b) 3139 const char **p_ptr, *pend; 3140 RE_TRANSLATE_TYPE translate; 3141 reg_syntax_t syntax; 3142 unsigned char *b; 3143{ 3144 unsigned this_char; 3145 3146 const char *p = *p_ptr; 3147 reg_errcode_t ret; 3148 char range_start[2]; 3149 char range_end[2]; 3150 3151 if (p == pend) 3152 return REG_ERANGE; 3153 3154 /* Fetch the endpoints without translating them; the 3155 appropriate translation is done in the bit-setting loop below. */ 3156 range_start[0] = p[-2]; range_start[1] = '\0'; 3157 range_end[0] = p[ 0]; range_end[1] = '\0'; 3158 3159 /* Have to increment the pointer into the pattern string, so the 3160 caller isn't still at the ending character. */ 3161 (*p_ptr)++; 3162 3163 /* Report an error if the range is empty and the syntax prohibits this. */ 3164 ret = syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR; 3165 3166 /* Here we see why `this_char' has to be larger than an `unsigned 3167 char' -- we would otherwise go into an infinite 3168 loop, since all characters <= 0xff. */ 3169 for (this_char = 0; this_char <= (unsigned char) -1; this_char++) 3170 { 3171 char ch[2]; 3172 ch[0] = this_char; ch[1] = '\0'; 3173 if (strcoll (range_start, ch) <= 0 && strcoll (ch, range_end) <= 0) 3174 { 3175 SET_LIST_BIT (TRANSLATE (this_char)); 3176 ret = REG_NOERROR; 3177 } 3178 } 3179 3180 return ret; 3181} 3182 3183/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in 3184 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible 3185 characters can start a string that matches the pattern. This fastmap 3186 is used by re_search to skip quickly over impossible starting points. 3187 3188 The caller must supply the address of a (1 << BYTEWIDTH)-byte data 3189 area as BUFP->fastmap. 3190 3191 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in 3192 the pattern buffer. 3193 3194 Returns 0 if we succeed, -2 if an internal error. */ 3195 3196int 3197re_compile_fastmap (bufp) 3198 struct re_pattern_buffer *bufp; 3199{ 3200 int j, k; 3201#ifdef MATCH_MAY_ALLOCATE 3202 fail_stack_type fail_stack; 3203#endif 3204#ifndef REGEX_MALLOC 3205 char *destination; 3206#endif 3207 3208 register char *fastmap = bufp->fastmap; 3209 unsigned char *pattern = bufp->buffer; 3210 unsigned char *p = pattern; 3211 register unsigned char *pend = pattern + bufp->used; 3212 3213#ifdef REL_ALLOC 3214 /* This holds the pointer to the failure stack, when 3215 it is allocated relocatably. */ 3216 fail_stack_elt_t *failure_stack_ptr; 3217#endif 3218 3219 /* Assume that each path through the pattern can be null until 3220 proven otherwise. We set this false at the bottom of switch 3221 statement, to which we get only if a particular path doesn't 3222 match the empty string. */ 3223 boolean path_can_be_null = true; 3224 3225 /* We aren't doing a `succeed_n' to begin with. */ 3226 boolean succeed_n_p = false; 3227 3228 assert (fastmap != NULL && p != NULL); 3229 3230 INIT_FAIL_STACK (); 3231 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */ 3232 bufp->fastmap_accurate = 1; /* It will be when we're done. */ 3233 bufp->can_be_null = 0; 3234 3235 while (1) 3236 { 3237 if (p == pend || *p == succeed) 3238 { 3239 /* We have reached the (effective) end of pattern. */ 3240 if (!FAIL_STACK_EMPTY ()) 3241 { 3242 bufp->can_be_null |= path_can_be_null; 3243 3244 /* Reset for next path. */ 3245 path_can_be_null = true; 3246 3247 p = fail_stack.stack[--fail_stack.avail].pointer; 3248 3249 continue; 3250 } 3251 else 3252 break; 3253 } 3254 3255 /* We should never be about to go beyond the end of the pattern. */ 3256 assert (p < pend); 3257 3258 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++)) 3259 { 3260 3261 /* I guess the idea here is to simply not bother with a fastmap 3262 if a backreference is used, since it's too hard to figure out 3263 the fastmap for the corresponding group. Setting 3264 `can_be_null' stops `re_search_2' from using the fastmap, so 3265 that is all we do. */ 3266 case duplicate: 3267 bufp->can_be_null = 1; 3268 goto done; 3269 3270 3271 /* Following are the cases which match a character. These end 3272 with `break'. */ 3273 3274 case exactn: 3275 fastmap[p[1]] = 1; 3276 break; 3277 3278 3279 case charset: 3280 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) 3281 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) 3282 fastmap[j] = 1; 3283 break; 3284 3285 3286 case charset_not: 3287 /* Chars beyond end of map must be allowed. */ 3288 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++) 3289 fastmap[j] = 1; 3290 3291 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) 3292 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))) 3293 fastmap[j] = 1; 3294 break; 3295 3296 3297 case wordchar: 3298 for (j = 0; j < (1 << BYTEWIDTH); j++) 3299 if (SYNTAX (j) == Sword) 3300 fastmap[j] = 1; 3301 break; 3302 3303 3304 case notwordchar: 3305 for (j = 0; j < (1 << BYTEWIDTH); j++) 3306 if (SYNTAX (j) != Sword) 3307 fastmap[j] = 1; 3308 break; 3309 3310 3311 case anychar: 3312 { 3313 int fastmap_newline = fastmap['\n']; 3314 3315 /* `.' matches anything ... */ 3316 for (j = 0; j < (1 << BYTEWIDTH); j++) 3317 fastmap[j] = 1; 3318 3319 /* ... except perhaps newline. */ 3320 if (!(bufp->syntax & RE_DOT_NEWLINE)) 3321 fastmap['\n'] = fastmap_newline; 3322 3323 /* Return if we have already set `can_be_null'; if we have, 3324 then the fastmap is irrelevant. Something's wrong here. */ 3325 else if (bufp->can_be_null) 3326 goto done; 3327 3328 /* Otherwise, have to check alternative paths. */ 3329 break; 3330 } 3331 3332#ifdef emacs 3333 case syntaxspec: 3334 k = *p++; 3335 for (j = 0; j < (1 << BYTEWIDTH); j++) 3336 if (SYNTAX (j) == (enum syntaxcode) k) 3337 fastmap[j] = 1; 3338 break; 3339 3340 3341 case notsyntaxspec: 3342 k = *p++; 3343 for (j = 0; j < (1 << BYTEWIDTH); j++) 3344 if (SYNTAX (j) != (enum syntaxcode) k) 3345 fastmap[j] = 1; 3346 break; 3347 3348 3349 /* All cases after this match the empty string. These end with 3350 `continue'. */ 3351 3352 3353 case before_dot: 3354 case at_dot: 3355 case after_dot: 3356 continue; 3357#endif /* emacs */ 3358 3359 3360 case no_op: 3361 case begline: 3362 case endline: 3363 case begbuf: 3364 case endbuf: 3365 case wordbound: 3366 case notwordbound: 3367 case wordbeg: 3368 case wordend: 3369 case push_dummy_failure: 3370 continue; 3371 3372 3373 case jump_n: 3374 case pop_failure_jump: 3375 case maybe_pop_jump: 3376 case jump: 3377 case jump_past_alt: 3378 case dummy_failure_jump: 3379 EXTRACT_NUMBER_AND_INCR (j, p); 3380 p += j; 3381 if (j > 0) 3382 continue; 3383 3384 /* Jump backward implies we just went through the body of a 3385 loop and matched nothing. Opcode jumped to should be 3386 `on_failure_jump' or `succeed_n'. Just treat it like an 3387 ordinary jump. For a * loop, it has pushed its failure 3388 point already; if so, discard that as redundant. */ 3389 if ((re_opcode_t) *p != on_failure_jump 3390 && (re_opcode_t) *p != succeed_n) 3391 continue; 3392 3393 p++; 3394 EXTRACT_NUMBER_AND_INCR (j, p); 3395 p += j; 3396 3397 /* If what's on the stack is where we are now, pop it. */ 3398 if (!FAIL_STACK_EMPTY () 3399 && fail_stack.stack[fail_stack.avail - 1].pointer == p) 3400 fail_stack.avail--; 3401 3402 continue; 3403 3404 3405 case on_failure_jump: 3406 case on_failure_keep_string_jump: 3407 handle_on_failure_jump: 3408 EXTRACT_NUMBER_AND_INCR (j, p); 3409 3410 /* For some patterns, e.g., `(a?)?', `p+j' here points to the 3411 end of the pattern. We don't want to push such a point, 3412 since when we restore it above, entering the switch will 3413 increment `p' past the end of the pattern. We don't need 3414 to push such a point since we obviously won't find any more 3415 fastmap entries beyond `pend'. Such a pattern can match 3416 the null string, though. */ 3417 if (p + j < pend) 3418 { 3419 if (!PUSH_PATTERN_OP (p + j, fail_stack)) 3420 { 3421 RESET_FAIL_STACK (); 3422 return -2; 3423 } 3424 } 3425 else 3426 bufp->can_be_null = 1; 3427 3428 if (succeed_n_p) 3429 { 3430 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */ 3431 succeed_n_p = false; 3432 } 3433 3434 continue; 3435 3436 3437 case succeed_n: 3438 /* Get to the number of times to succeed. */ 3439 p += 2; 3440 3441 /* Increment p past the n for when k != 0. */ 3442 EXTRACT_NUMBER_AND_INCR (k, p); 3443 if (k == 0) 3444 { 3445 p -= 4; 3446 succeed_n_p = true; /* Spaghetti code alert. */ 3447 goto handle_on_failure_jump; 3448 } 3449 continue; 3450 3451 3452 case set_number_at: 3453 p += 4; 3454 continue; 3455 3456 3457 case start_memory: 3458 case stop_memory: 3459 p += 2; 3460 continue; 3461 3462 3463 default: 3464 abort (); /* We have listed all the cases. */ 3465 } /* switch *p++ */ 3466 3467 /* Getting here means we have found the possible starting 3468 characters for one path of the pattern -- and that the empty 3469 string does not match. We need not follow this path further. 3470 Instead, look at the next alternative (remembered on the 3471 stack), or quit if no more. The test at the top of the loop 3472 does these things. */ 3473 path_can_be_null = false; 3474 p = pend; 3475 } /* while p */ 3476 3477 /* Set `can_be_null' for the last path (also the first path, if the 3478 pattern is empty). */ 3479 bufp->can_be_null |= path_can_be_null; 3480 3481 done: 3482 RESET_FAIL_STACK (); 3483 return 0; 3484} /* re_compile_fastmap */ 3485#ifdef _LIBC 3486weak_alias (__re_compile_fastmap, re_compile_fastmap) 3487#endif 3488 3489/* Set REGS to hold NUM_REGS registers, storing them in STARTS and 3490 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use 3491 this memory for recording register information. STARTS and ENDS 3492 must be allocated using the malloc library routine, and must each 3493 be at least NUM_REGS * sizeof (regoff_t) bytes long. 3494 3495 If NUM_REGS == 0, then subsequent matches should allocate their own 3496 register data. 3497 3498 Unless this function is called, the first search or match using 3499 PATTERN_BUFFER will allocate its own register data, without 3500 freeing the old data. */ 3501 3502void 3503re_set_registers (bufp, regs, num_regs, starts, ends) 3504 struct re_pattern_buffer *bufp; 3505 struct re_registers *regs; 3506 unsigned num_regs; 3507 regoff_t *starts, *ends; 3508{ 3509 if (num_regs) 3510 { 3511 bufp->regs_allocated = REGS_REALLOCATE; 3512 regs->num_regs = num_regs; 3513 regs->start = starts; 3514 regs->end = ends; 3515 } 3516 else 3517 { 3518 bufp->regs_allocated = REGS_UNALLOCATED; 3519 regs->num_regs = 0; 3520 regs->start = regs->end = (regoff_t *) 0; 3521 } 3522} 3523#ifdef _LIBC 3524weak_alias (__re_set_registers, re_set_registers) 3525#endif 3526 3527/* Searching routines. */ 3528 3529/* Like re_search_2, below, but only one string is specified, and 3530 doesn't let you say where to stop matching. */ 3531 3532int 3533re_search (bufp, string, size, startpos, range, regs) 3534 struct re_pattern_buffer *bufp; 3535 const char *string; 3536 int size, startpos, range; 3537 struct re_registers *regs; 3538{ 3539 return re_search_2 (bufp, NULL, 0, string, size, startpos, range, 3540 regs, size); 3541} 3542#ifdef _LIBC 3543weak_alias (__re_search, re_search) 3544#endif 3545 3546 3547/* Using the compiled pattern in BUFP->buffer, first tries to match the 3548 virtual concatenation of STRING1 and STRING2, starting first at index 3549 STARTPOS, then at STARTPOS + 1, and so on. 3550 3551 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively. 3552 3553 RANGE is how far to scan while trying to match. RANGE = 0 means try 3554 only at STARTPOS; in general, the last start tried is STARTPOS + 3555 RANGE. 3556 3557 In REGS, return the indices of the virtual concatenation of STRING1 3558 and STRING2 that matched the entire BUFP->buffer and its contained 3559 subexpressions. 3560 3561 Do not consider matching one past the index STOP in the virtual 3562 concatenation of STRING1 and STRING2. 3563 3564 We return either the position in the strings at which the match was 3565 found, -1 if no match, or -2 if error (such as failure 3566 stack overflow). */ 3567 3568int 3569re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) 3570 struct re_pattern_buffer *bufp; 3571 const char *string1, *string2; 3572 int size1, size2; 3573 int startpos; 3574 int range; 3575 struct re_registers *regs; 3576 int stop; 3577{ 3578 int val; 3579 register char *fastmap = bufp->fastmap; 3580 register RE_TRANSLATE_TYPE translate = bufp->translate; 3581 int total_size = size1 + size2; 3582 int endpos = startpos + range; 3583 3584 /* Check for out-of-range STARTPOS. */ 3585 if (startpos < 0 || startpos > total_size) 3586 return -1; 3587 3588 /* Fix up RANGE if it might eventually take us outside 3589 the virtual concatenation of STRING1 and STRING2. 3590 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */ 3591 if (endpos < 0) 3592 range = 0 - startpos; 3593 else if (endpos > total_size) 3594 range = total_size - startpos; 3595 3596 /* If the search isn't to be a backwards one, don't waste time in a 3597 search for a pattern that must be anchored. */ 3598 if (bufp->used > 0 && range > 0 3599 && ((re_opcode_t) bufp->buffer[0] == begbuf 3600 /* `begline' is like `begbuf' if it cannot match at newlines. */ 3601 || ((re_opcode_t) bufp->buffer[0] == begline 3602 && !bufp->newline_anchor))) 3603 { 3604 if (startpos > 0) 3605 return -1; 3606 else 3607 range = 1; 3608 } 3609 3610#ifdef emacs 3611 /* In a forward search for something that starts with \=. 3612 don't keep searching past point. */ 3613 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0) 3614 { 3615 range = PT - startpos; 3616 if (range <= 0) 3617 return -1; 3618 } 3619#endif /* emacs */ 3620 3621 /* Update the fastmap now if not correct already. */ 3622 if (fastmap && !bufp->fastmap_accurate) 3623 if (re_compile_fastmap (bufp) == -2) 3624 return -2; 3625 3626 /* Loop through the string, looking for a place to start matching. */ 3627 for (;;) 3628 { 3629 /* If a fastmap is supplied, skip quickly over characters that 3630 cannot be the start of a match. If the pattern can match the 3631 null string, however, we don't need to skip characters; we want 3632 the first null string. */ 3633 if (fastmap && startpos < total_size && !bufp->can_be_null) 3634 { 3635 if (range > 0) /* Searching forwards. */ 3636 { 3637 register const char *d; 3638 register int lim = 0; 3639 int irange = range; 3640 3641 if (startpos < size1 && startpos + range >= size1) 3642 lim = range - (size1 - startpos); 3643 3644 d = (startpos >= size1 ? string2 - size1 : string1) + startpos; 3645 3646 /* Written out as an if-else to avoid testing `translate' 3647 inside the loop. */ 3648 if (translate) 3649 while (range > lim 3650 && !fastmap[(unsigned char) 3651 translate[(unsigned char) *d++]]) 3652 range--; 3653 else 3654 while (range > lim && !fastmap[(unsigned char) *d++]) 3655 range--; 3656 3657 startpos += irange - range; 3658 } 3659 else /* Searching backwards. */ 3660 { 3661 register char c = (size1 == 0 || startpos >= size1 3662 ? string2[startpos - size1] 3663 : string1[startpos]); 3664 3665 if (!fastmap[(unsigned char) TRANSLATE (c)]) 3666 goto advance; 3667 } 3668 } 3669 3670 /* If can't match the null string, and that's all we have left, fail. */ 3671 if (range >= 0 && startpos == total_size && fastmap 3672 && !bufp->can_be_null) 3673 return -1; 3674 3675 val = re_match_2_internal (bufp, string1, size1, string2, size2, 3676 startpos, regs, stop); 3677#ifndef REGEX_MALLOC 3678# ifdef C_ALLOCA 3679 alloca (0); 3680# endif 3681#endif 3682 3683 if (val >= 0) 3684 return startpos; 3685 3686 if (val == -2) 3687 return -2; 3688 3689 advance: 3690 if (!range) 3691 break; 3692 else if (range > 0) 3693 { 3694 range--; 3695 startpos++; 3696 } 3697 else 3698 { 3699 range++; 3700 startpos--; 3701 } 3702 } 3703 return -1; 3704} /* re_search_2 */ 3705#ifdef _LIBC 3706weak_alias (__re_search_2, re_search_2) 3707#endif 3708 3709/* This converts PTR, a pointer into one of the search strings `string1' 3710 and `string2' into an offset from the beginning of that string. */ 3711#define POINTER_TO_OFFSET(ptr) \ 3712 (FIRST_STRING_P (ptr) \ 3713 ? ((regoff_t) ((ptr) - string1)) \ 3714 : ((regoff_t) ((ptr) - string2 + size1))) 3715 3716/* Macros for dealing with the split strings in re_match_2. */ 3717 3718#define MATCHING_IN_FIRST_STRING (dend == end_match_1) 3719 3720/* Call before fetching a character with *d. This switches over to 3721 string2 if necessary. */ 3722#define PREFETCH() \ 3723 while (d == dend) \ 3724 { \ 3725 /* End of string2 => fail. */ \ 3726 if (dend == end_match_2) \ 3727 goto fail; \ 3728 /* End of string1 => advance to string2. */ \ 3729 d = string2; \ 3730 dend = end_match_2; \ 3731 } 3732 3733 3734/* Test if at very beginning or at very end of the virtual concatenation 3735 of `string1' and `string2'. If only one string, it's `string2'. */ 3736#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2) 3737#define AT_STRINGS_END(d) ((d) == end2) 3738 3739 3740/* Test if D points to a character which is word-constituent. We have 3741 two special cases to check for: if past the end of string1, look at 3742 the first character in string2; and if before the beginning of 3743 string2, look at the last character in string1. */ 3744#define WORDCHAR_P(d) \ 3745 (SYNTAX ((d) == end1 ? *string2 \ 3746 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \ 3747 == Sword) 3748 3749/* Disabled due to a compiler bug -- see comment at case wordbound */ 3750#if 0 3751/* Test if the character before D and the one at D differ with respect 3752 to being word-constituent. */ 3753#define AT_WORD_BOUNDARY(d) \ 3754 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \ 3755 || WORDCHAR_P (d - 1) != WORDCHAR_P (d)) 3756#endif 3757 3758/* Free everything we malloc. */ 3759#ifdef MATCH_MAY_ALLOCATE 3760# define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL 3761# define FREE_VARIABLES() \ 3762 do { \ 3763 REGEX_FREE_STACK (fail_stack.stack); \ 3764 FREE_VAR (regstart); \ 3765 FREE_VAR (regend); \ 3766 FREE_VAR (old_regstart); \ 3767 FREE_VAR (old_regend); \ 3768 FREE_VAR (best_regstart); \ 3769 FREE_VAR (best_regend); \ 3770 FREE_VAR (reg_info); \ 3771 FREE_VAR (reg_dummy); \ 3772 FREE_VAR (reg_info_dummy); \ 3773 } while (0) 3774#else 3775# define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */ 3776#endif /* not MATCH_MAY_ALLOCATE */ 3777 3778/* These values must meet several constraints. They must not be valid 3779 register values; since we have a limit of 255 registers (because 3780 we use only one byte in the pattern for the register number), we can 3781 use numbers larger than 255. They must differ by 1, because of 3782 NUM_FAILURE_ITEMS above. And the value for the lowest register must 3783 be larger than the value for the highest register, so we do not try 3784 to actually save any registers when none are active. */ 3785#define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH) 3786#define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1) 3787 3788/* Matching routines. */ 3789 3790#ifndef emacs /* Emacs never uses this. */ 3791/* re_match is like re_match_2 except it takes only a single string. */ 3792 3793int 3794re_match (bufp, string, size, pos, regs) 3795 struct re_pattern_buffer *bufp; 3796 const char *string; 3797 int size, pos; 3798 struct re_registers *regs; 3799{ 3800 int result = re_match_2_internal (bufp, NULL, 0, string, size, 3801 pos, regs, size); 3802# ifndef REGEX_MALLOC 3803# ifdef C_ALLOCA 3804 alloca (0); 3805# endif 3806# endif 3807 return result; 3808} 3809# ifdef _LIBC 3810weak_alias (__re_match, re_match) 3811# endif 3812#endif /* not emacs */ 3813 3814static boolean group_match_null_string_p _RE_ARGS ((unsigned char **p, 3815 unsigned char *end, 3816 register_info_type *reg_info)); 3817static boolean alt_match_null_string_p _RE_ARGS ((unsigned char *p, 3818 unsigned char *end, 3819 register_info_type *reg_info)); 3820static boolean common_op_match_null_string_p _RE_ARGS ((unsigned char **p, 3821 unsigned char *end, 3822 register_info_type *reg_info)); 3823static int bcmp_translate _RE_ARGS ((const char *s1, const char *s2, 3824 int len, char *translate)); 3825 3826/* re_match_2 matches the compiled pattern in BUFP against the 3827 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1 3828 and SIZE2, respectively). We start matching at POS, and stop 3829 matching at STOP. 3830 3831 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we 3832 store offsets for the substring each group matched in REGS. See the 3833 documentation for exactly how many groups we fill. 3834 3835 We return -1 if no match, -2 if an internal error (such as the 3836 failure stack overflowing). Otherwise, we return the length of the 3837 matched substring. */ 3838 3839int 3840re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop) 3841 struct re_pattern_buffer *bufp; 3842 const char *string1, *string2; 3843 int size1, size2; 3844 int pos; 3845 struct re_registers *regs; 3846 int stop; 3847{ 3848 int result = re_match_2_internal (bufp, string1, size1, string2, size2, 3849 pos, regs, stop); 3850#ifndef REGEX_MALLOC 3851# ifdef C_ALLOCA 3852 alloca (0); 3853# endif 3854#endif 3855 return result; 3856} 3857#ifdef _LIBC 3858weak_alias (__re_match_2, re_match_2) 3859#endif 3860 3861/* This is a separate function so that we can force an alloca cleanup 3862 afterwards. */ 3863static int 3864re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop) 3865 struct re_pattern_buffer *bufp; 3866 const char *string1, *string2; 3867 int size1, size2; 3868 int pos; 3869 struct re_registers *regs; 3870 int stop; 3871{ 3872 /* General temporaries. */ 3873 int mcnt; 3874 unsigned char *p1; 3875 3876 /* Just past the end of the corresponding string. */ 3877 const char *end1, *end2; 3878 3879 /* Pointers into string1 and string2, just past the last characters in 3880 each to consider matching. */ 3881 const char *end_match_1, *end_match_2; 3882 3883 /* Where we are in the data, and the end of the current string. */ 3884 const char *d, *dend; 3885 3886 /* Where we are in the pattern, and the end of the pattern. */ 3887 unsigned char *p = bufp->buffer; 3888 register unsigned char *pend = p + bufp->used; 3889 3890 /* Mark the opcode just after a start_memory, so we can test for an 3891 empty subpattern when we get to the stop_memory. */ 3892 unsigned char *just_past_start_mem = 0; 3893 3894 /* We use this to map every character in the string. */ 3895 RE_TRANSLATE_TYPE translate = bufp->translate; 3896 3897 /* Failure point stack. Each place that can handle a failure further 3898 down the line pushes a failure point on this stack. It consists of 3899 restart, regend, and reg_info for all registers corresponding to 3900 the subexpressions we're currently inside, plus the number of such 3901 registers, and, finally, two char *'s. The first char * is where 3902 to resume scanning the pattern; the second one is where to resume 3903 scanning the strings. If the latter is zero, the failure point is 3904 a ``dummy''; if a failure happens and the failure point is a dummy, 3905 it gets discarded and the next next one is tried. */ 3906#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */ 3907 fail_stack_type fail_stack; 3908#endif 3909#ifdef DEBUG 3910 static unsigned failure_id; 3911 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0; 3912#endif 3913 3914#ifdef REL_ALLOC 3915 /* This holds the pointer to the failure stack, when 3916 it is allocated relocatably. */ 3917 fail_stack_elt_t *failure_stack_ptr; 3918#endif 3919 3920 /* We fill all the registers internally, independent of what we 3921 return, for use in backreferences. The number here includes 3922 an element for register zero. */ 3923 size_t num_regs = bufp->re_nsub + 1; 3924 3925 /* The currently active registers. */ 3926 active_reg_t lowest_active_reg = NO_LOWEST_ACTIVE_REG; 3927 active_reg_t highest_active_reg = NO_HIGHEST_ACTIVE_REG; 3928 3929 /* Information on the contents of registers. These are pointers into 3930 the input strings; they record just what was matched (on this 3931 attempt) by a subexpression part of the pattern, that is, the 3932 regnum-th regstart pointer points to where in the pattern we began 3933 matching and the regnum-th regend points to right after where we 3934 stopped matching the regnum-th subexpression. (The zeroth register 3935 keeps track of what the whole pattern matches.) */ 3936#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ 3937 const char **regstart, **regend; 3938#endif 3939 3940 /* If a group that's operated upon by a repetition operator fails to 3941 match anything, then the register for its start will need to be 3942 restored because it will have been set to wherever in the string we 3943 are when we last see its open-group operator. Similarly for a 3944 register's end. */ 3945#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ 3946 const char **old_regstart, **old_regend; 3947#endif 3948 3949 /* The is_active field of reg_info helps us keep track of which (possibly 3950 nested) subexpressions we are currently in. The matched_something 3951 field of reg_info[reg_num] helps us tell whether or not we have 3952 matched any of the pattern so far this time through the reg_num-th 3953 subexpression. These two fields get reset each time through any 3954 loop their register is in. */ 3955#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */ 3956 register_info_type *reg_info; 3957#endif 3958 3959 /* The following record the register info as found in the above 3960 variables when we find a match better than any we've seen before. 3961 This happens as we backtrack through the failure points, which in 3962 turn happens only if we have not yet matched the entire string. */ 3963 unsigned best_regs_set = false; 3964#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ 3965 const char **best_regstart, **best_regend; 3966#endif 3967 3968 /* Logically, this is `best_regend[0]'. But we don't want to have to 3969 allocate space for that if we're not allocating space for anything 3970 else (see below). Also, we never need info about register 0 for 3971 any of the other register vectors, and it seems rather a kludge to 3972 treat `best_regend' differently than the rest. So we keep track of 3973 the end of the best match so far in a separate variable. We 3974 initialize this to NULL so that when we backtrack the first time 3975 and need to test it, it's not garbage. */ 3976 const char *match_end = NULL; 3977 3978 /* This helps SET_REGS_MATCHED avoid doing redundant work. */ 3979 int set_regs_matched_done = 0; 3980 3981 /* Used when we pop values we don't care about. */ 3982#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ 3983 const char **reg_dummy; 3984 register_info_type *reg_info_dummy; 3985#endif 3986 3987#ifdef DEBUG 3988 /* Counts the total number of registers pushed. */ 3989 unsigned num_regs_pushed = 0; 3990#endif 3991 3992 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n"); 3993 3994 INIT_FAIL_STACK (); 3995 3996#ifdef MATCH_MAY_ALLOCATE 3997 /* Do not bother to initialize all the register variables if there are 3998 no groups in the pattern, as it takes a fair amount of time. If 3999 there are groups, we include space for register 0 (the whole 4000 pattern), even though we never use it, since it simplifies the 4001 array indexing. We should fix this. */ 4002 if (bufp->re_nsub) 4003 { 4004 regstart = REGEX_TALLOC (num_regs, const char *); 4005 regend = REGEX_TALLOC (num_regs, const char *); 4006 old_regstart = REGEX_TALLOC (num_regs, const char *); 4007 old_regend = REGEX_TALLOC (num_regs, const char *); 4008 best_regstart = REGEX_TALLOC (num_regs, const char *); 4009 best_regend = REGEX_TALLOC (num_regs, const char *); 4010 reg_info = REGEX_TALLOC (num_regs, register_info_type); 4011 reg_dummy = REGEX_TALLOC (num_regs, const char *); 4012 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type); 4013 4014 if (!(regstart && regend && old_regstart && old_regend && reg_info 4015 && best_regstart && best_regend && reg_dummy && reg_info_dummy)) 4016 { 4017 FREE_VARIABLES (); 4018 return -2; 4019 } 4020 } 4021 else 4022 { 4023 /* We must initialize all our variables to NULL, so that 4024 `FREE_VARIABLES' doesn't try to free them. */ 4025 regstart = regend = old_regstart = old_regend = best_regstart 4026 = best_regend = reg_dummy = NULL; 4027 reg_info = reg_info_dummy = (register_info_type *) NULL; 4028 } 4029#endif /* MATCH_MAY_ALLOCATE */ 4030 4031 /* The starting position is bogus. */ 4032 if (pos < 0 || pos > size1 + size2) 4033 { 4034 FREE_VARIABLES (); 4035 return -1; 4036 } 4037 4038 /* Initialize subexpression text positions to -1 to mark ones that no 4039 start_memory/stop_memory has been seen for. Also initialize the 4040 register information struct. */ 4041 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++) 4042 { 4043 regstart[mcnt] = regend[mcnt] 4044 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE; 4045 4046 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE; 4047 IS_ACTIVE (reg_info[mcnt]) = 0; 4048 MATCHED_SOMETHING (reg_info[mcnt]) = 0; 4049 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0; 4050 } 4051 4052 /* We move `string1' into `string2' if the latter's empty -- but not if 4053 `string1' is null. */ 4054 if (size2 == 0 && string1 != NULL) 4055 { 4056 string2 = string1; 4057 size2 = size1; 4058 string1 = 0; 4059 size1 = 0; 4060 } 4061 end1 = string1 + size1; 4062 end2 = string2 + size2; 4063 4064 /* Compute where to stop matching, within the two strings. */ 4065 if (stop <= size1) 4066 { 4067 end_match_1 = string1 + stop; 4068 end_match_2 = string2; 4069 } 4070 else 4071 { 4072 end_match_1 = end1; 4073 end_match_2 = string2 + stop - size1; 4074 } 4075 4076 /* `p' scans through the pattern as `d' scans through the data. 4077 `dend' is the end of the input string that `d' points within. `d' 4078 is advanced into the following input string whenever necessary, but 4079 this happens before fetching; therefore, at the beginning of the 4080 loop, `d' can be pointing at the end of a string, but it cannot 4081 equal `string2'. */ 4082 if (size1 > 0 && pos <= size1) 4083 { 4084 d = string1 + pos; 4085 dend = end_match_1; 4086 } 4087 else 4088 { 4089 d = string2 + pos - size1; 4090 dend = end_match_2; 4091 } 4092 4093 DEBUG_PRINT1 ("The compiled pattern is:\n"); 4094 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend); 4095 DEBUG_PRINT1 ("The string to match is: `"); 4096 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2); 4097 DEBUG_PRINT1 ("'\n"); 4098 4099 /* This loops over pattern commands. It exits by returning from the 4100 function if the match is complete, or it drops through if the match 4101 fails at this starting point in the input data. */ 4102 for (;;) 4103 { 4104#ifdef _LIBC 4105 DEBUG_PRINT2 ("\n%p: ", p); 4106#else 4107 DEBUG_PRINT2 ("\n0x%x: ", p); 4108#endif 4109 4110 if (p == pend) 4111 { /* End of pattern means we might have succeeded. */ 4112 DEBUG_PRINT1 ("end of pattern ... "); 4113 4114 /* If we haven't matched the entire string, and we want the 4115 longest match, try backtracking. */ 4116 if (d != end_match_2) 4117 { 4118 /* 1 if this match ends in the same string (string1 or string2) 4119 as the best previous match. */ 4120 boolean same_str_p = (FIRST_STRING_P (match_end) 4121 == MATCHING_IN_FIRST_STRING); 4122 /* 1 if this match is the best seen so far. */ 4123 boolean best_match_p; 4124 4125 /* AIX compiler got confused when this was combined 4126 with the previous declaration. */ 4127 if (same_str_p) 4128 best_match_p = d > match_end; 4129 else 4130 best_match_p = !MATCHING_IN_FIRST_STRING; 4131 4132 DEBUG_PRINT1 ("backtracking.\n"); 4133 4134 if (!FAIL_STACK_EMPTY ()) 4135 { /* More failure points to try. */ 4136 4137 /* If exceeds best match so far, save it. */ 4138 if (!best_regs_set || best_match_p) 4139 { 4140 best_regs_set = true; 4141 match_end = d; 4142 4143 DEBUG_PRINT1 ("\nSAVING match as best so far.\n"); 4144 4145 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++) 4146 { 4147 best_regstart[mcnt] = regstart[mcnt]; 4148 best_regend[mcnt] = regend[mcnt]; 4149 } 4150 } 4151 goto fail; 4152 } 4153 4154 /* If no failure points, don't restore garbage. And if 4155 last match is real best match, don't restore second 4156 best one. */ 4157 else if (best_regs_set && !best_match_p) 4158 { 4159 restore_best_regs: 4160 /* Restore best match. It may happen that `dend == 4161 end_match_1' while the restored d is in string2. 4162 For example, the pattern `x.*y.*z' against the 4163 strings `x-' and `y-z-', if the two strings are 4164 not consecutive in memory. */ 4165 DEBUG_PRINT1 ("Restoring best registers.\n"); 4166 4167 d = match_end; 4168 dend = ((d >= string1 && d <= end1) 4169 ? end_match_1 : end_match_2); 4170 4171 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++) 4172 { 4173 regstart[mcnt] = best_regstart[mcnt]; 4174 regend[mcnt] = best_regend[mcnt]; 4175 } 4176 } 4177 } /* d != end_match_2 */ 4178 4179 succeed_label: 4180 DEBUG_PRINT1 ("Accepting match.\n"); 4181 4182 /* If caller wants register contents data back, do it. */ 4183 if (regs && !bufp->no_sub) 4184 { 4185 /* Have the register data arrays been allocated? */ 4186 if (bufp->regs_allocated == REGS_UNALLOCATED) 4187 { /* No. So allocate them with malloc. We need one 4188 extra element beyond `num_regs' for the `-1' marker 4189 GNU code uses. */ 4190 regs->num_regs = MAX (RE_NREGS, num_regs + 1); 4191 regs->start = TALLOC (regs->num_regs, regoff_t); 4192 regs->end = TALLOC (regs->num_regs, regoff_t); 4193 if (regs->start == NULL || regs->end == NULL) 4194 { 4195 FREE_VARIABLES (); 4196 return -2; 4197 } 4198 bufp->regs_allocated = REGS_REALLOCATE; 4199 } 4200 else if (bufp->regs_allocated == REGS_REALLOCATE) 4201 { /* Yes. If we need more elements than were already 4202 allocated, reallocate them. If we need fewer, just 4203 leave it alone. */ 4204 if (regs->num_regs < num_regs + 1) 4205 { 4206 regs->num_regs = num_regs + 1; 4207 RETALLOC (regs->start, regs->num_regs, regoff_t); 4208 RETALLOC (regs->end, regs->num_regs, regoff_t); 4209 if (regs->start == NULL || regs->end == NULL) 4210 { 4211 FREE_VARIABLES (); 4212 return -2; 4213 } 4214 } 4215 } 4216 else 4217 { 4218 /* These braces fend off a "empty body in an else-statement" 4219 warning under GCC when assert expands to nothing. */ 4220 assert (bufp->regs_allocated == REGS_FIXED); 4221 } 4222 4223 /* Convert the pointer data in `regstart' and `regend' to 4224 indices. Register zero has to be set differently, 4225 since we haven't kept track of any info for it. */ 4226 if (regs->num_regs > 0) 4227 { 4228 regs->start[0] = pos; 4229 regs->end[0] = (MATCHING_IN_FIRST_STRING 4230 ? ((regoff_t) (d - string1)) 4231 : ((regoff_t) (d - string2 + size1))); 4232 } 4233 4234 /* Go through the first `min (num_regs, regs->num_regs)' 4235 registers, since that is all we initialized. */ 4236 for (mcnt = 1; (unsigned) mcnt < MIN (num_regs, regs->num_regs); 4237 mcnt++) 4238 { 4239 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt])) 4240 regs->start[mcnt] = regs->end[mcnt] = -1; 4241 else 4242 { 4243 regs->start[mcnt] 4244 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]); 4245 regs->end[mcnt] 4246 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]); 4247 } 4248 } 4249 4250 /* If the regs structure we return has more elements than 4251 were in the pattern, set the extra elements to -1. If 4252 we (re)allocated the registers, this is the case, 4253 because we always allocate enough to have at least one 4254 -1 at the end. */ 4255 for (mcnt = num_regs; (unsigned) mcnt < regs->num_regs; mcnt++) 4256 regs->start[mcnt] = regs->end[mcnt] = -1; 4257 } /* regs && !bufp->no_sub */ 4258 4259 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n", 4260 nfailure_points_pushed, nfailure_points_popped, 4261 nfailure_points_pushed - nfailure_points_popped); 4262 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed); 4263 4264 mcnt = d - pos - (MATCHING_IN_FIRST_STRING 4265 ? string1 4266 : string2 - size1); 4267 4268 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt); 4269 4270 FREE_VARIABLES (); 4271 return mcnt; 4272 } 4273 4274 /* Otherwise match next pattern command. */ 4275 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++)) 4276 { 4277 /* Ignore these. Used to ignore the n of succeed_n's which 4278 currently have n == 0. */ 4279 case no_op: 4280 DEBUG_PRINT1 ("EXECUTING no_op.\n"); 4281 break; 4282 4283 case succeed: 4284 DEBUG_PRINT1 ("EXECUTING succeed.\n"); 4285 goto succeed_label; 4286 4287 /* Match the next n pattern characters exactly. The following 4288 byte in the pattern defines n, and the n bytes after that 4289 are the characters to match. */ 4290 case exactn: 4291 mcnt = *p++; 4292 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt); 4293 4294 /* This is written out as an if-else so we don't waste time 4295 testing `translate' inside the loop. */ 4296 if (translate) 4297 { 4298 do 4299 { 4300 PREFETCH (); 4301 if ((unsigned char) translate[(unsigned char) *d++] 4302 != (unsigned char) *p++) 4303 goto fail; 4304 } 4305 while (--mcnt); 4306 } 4307 else 4308 { 4309 do 4310 { 4311 PREFETCH (); 4312 if (*d++ != (char) *p++) goto fail; 4313 } 4314 while (--mcnt); 4315 } 4316 SET_REGS_MATCHED (); 4317 break; 4318 4319 4320 /* Match any character except possibly a newline or a null. */ 4321 case anychar: 4322 DEBUG_PRINT1 ("EXECUTING anychar.\n"); 4323 4324 PREFETCH (); 4325 4326 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n') 4327 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000')) 4328 goto fail; 4329 4330 SET_REGS_MATCHED (); 4331 DEBUG_PRINT2 (" Matched `%d'.\n", *d); 4332 d++; 4333 break; 4334 4335 4336 case charset: 4337 case charset_not: 4338 { 4339 register unsigned char c; 4340 boolean not = (re_opcode_t) *(p - 1) == charset_not; 4341 4342 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : ""); 4343 4344 PREFETCH (); 4345 c = TRANSLATE (*d); /* The character to match. */ 4346 4347 /* Cast to `unsigned' instead of `unsigned char' in case the 4348 bit list is a full 32 bytes long. */ 4349 if (c < (unsigned) (*p * BYTEWIDTH) 4350 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) 4351 not = !not; 4352 4353 p += 1 + *p; 4354 4355 if (!not) goto fail; 4356 4357 SET_REGS_MATCHED (); 4358 d++; 4359 break; 4360 } 4361 4362 4363 /* The beginning of a group is represented by start_memory. 4364 The arguments are the register number in the next byte, and the 4365 number of groups inner to this one in the next. The text 4366 matched within the group is recorded (in the internal 4367 registers data structure) under the register number. */ 4368 case start_memory: 4369 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]); 4370 4371 /* Find out if this group can match the empty string. */ 4372 p1 = p; /* To send to group_match_null_string_p. */ 4373 4374 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE) 4375 REG_MATCH_NULL_STRING_P (reg_info[*p]) 4376 = group_match_null_string_p (&p1, pend, reg_info); 4377 4378 /* Save the position in the string where we were the last time 4379 we were at this open-group operator in case the group is 4380 operated upon by a repetition operator, e.g., with `(a*)*b' 4381 against `ab'; then we want to ignore where we are now in 4382 the string in case this attempt to match fails. */ 4383 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) 4384 ? REG_UNSET (regstart[*p]) ? d : regstart[*p] 4385 : regstart[*p]; 4386 DEBUG_PRINT2 (" old_regstart: %d\n", 4387 POINTER_TO_OFFSET (old_regstart[*p])); 4388 4389 regstart[*p] = d; 4390 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p])); 4391 4392 IS_ACTIVE (reg_info[*p]) = 1; 4393 MATCHED_SOMETHING (reg_info[*p]) = 0; 4394 4395 /* Clear this whenever we change the register activity status. */ 4396 set_regs_matched_done = 0; 4397 4398 /* This is the new highest active register. */ 4399 highest_active_reg = *p; 4400 4401 /* If nothing was active before, this is the new lowest active 4402 register. */ 4403 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) 4404 lowest_active_reg = *p; 4405 4406 /* Move past the register number and inner group count. */ 4407 p += 2; 4408 just_past_start_mem = p; 4409 4410 break; 4411 4412 4413 /* The stop_memory opcode represents the end of a group. Its 4414 arguments are the same as start_memory's: the register 4415 number, and the number of inner groups. */ 4416 case stop_memory: 4417 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]); 4418 4419 /* We need to save the string position the last time we were at 4420 this close-group operator in case the group is operated 4421 upon by a repetition operator, e.g., with `((a*)*(b*)*)*' 4422 against `aba'; then we want to ignore where we are now in 4423 the string in case this attempt to match fails. */ 4424 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) 4425 ? REG_UNSET (regend[*p]) ? d : regend[*p] 4426 : regend[*p]; 4427 DEBUG_PRINT2 (" old_regend: %d\n", 4428 POINTER_TO_OFFSET (old_regend[*p])); 4429 4430 regend[*p] = d; 4431 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p])); 4432 4433 /* This register isn't active anymore. */ 4434 IS_ACTIVE (reg_info[*p]) = 0; 4435 4436 /* Clear this whenever we change the register activity status. */ 4437 set_regs_matched_done = 0; 4438 4439 /* If this was the only register active, nothing is active 4440 anymore. */ 4441 if (lowest_active_reg == highest_active_reg) 4442 { 4443 lowest_active_reg = NO_LOWEST_ACTIVE_REG; 4444 highest_active_reg = NO_HIGHEST_ACTIVE_REG; 4445 } 4446 else 4447 { /* We must scan for the new highest active register, since 4448 it isn't necessarily one less than now: consider 4449 (a(b)c(d(e)f)g). When group 3 ends, after the f), the 4450 new highest active register is 1. */ 4451 unsigned char r = *p - 1; 4452 while (r > 0 && !IS_ACTIVE (reg_info[r])) 4453 r--; 4454 4455 /* If we end up at register zero, that means that we saved 4456 the registers as the result of an `on_failure_jump', not 4457 a `start_memory', and we jumped to past the innermost 4458 `stop_memory'. For example, in ((.)*) we save 4459 registers 1 and 2 as a result of the *, but when we pop 4460 back to the second ), we are at the stop_memory 1. 4461 Thus, nothing is active. */ 4462 if (r == 0) 4463 { 4464 lowest_active_reg = NO_LOWEST_ACTIVE_REG; 4465 highest_active_reg = NO_HIGHEST_ACTIVE_REG; 4466 } 4467 else 4468 highest_active_reg = r; 4469 } 4470 4471 /* If just failed to match something this time around with a 4472 group that's operated on by a repetition operator, try to 4473 force exit from the ``loop'', and restore the register 4474 information for this group that we had before trying this 4475 last match. */ 4476 if ((!MATCHED_SOMETHING (reg_info[*p]) 4477 || just_past_start_mem == p - 1) 4478 && (p + 2) < pend) 4479 { 4480 boolean is_a_jump_n = false; 4481 4482 p1 = p + 2; 4483 mcnt = 0; 4484 switch ((re_opcode_t) *p1++) 4485 { 4486 case jump_n: 4487 is_a_jump_n = true; 4488 case pop_failure_jump: 4489 case maybe_pop_jump: 4490 case jump: 4491 case dummy_failure_jump: 4492 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 4493 if (is_a_jump_n) 4494 p1 += 2; 4495 break; 4496 4497 default: 4498 /* do nothing */ ; 4499 } 4500 p1 += mcnt; 4501 4502 /* If the next operation is a jump backwards in the pattern 4503 to an on_failure_jump right before the start_memory 4504 corresponding to this stop_memory, exit from the loop 4505 by forcing a failure after pushing on the stack the 4506 on_failure_jump's jump in the pattern, and d. */ 4507 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump 4508 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p) 4509 { 4510 /* If this group ever matched anything, then restore 4511 what its registers were before trying this last 4512 failed match, e.g., with `(a*)*b' against `ab' for 4513 regstart[1], and, e.g., with `((a*)*(b*)*)*' 4514 against `aba' for regend[3]. 4515 4516 Also restore the registers for inner groups for, 4517 e.g., `((a*)(b*))*' against `aba' (register 3 would 4518 otherwise get trashed). */ 4519 4520 if (EVER_MATCHED_SOMETHING (reg_info[*p])) 4521 { 4522 unsigned r; 4523 4524 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0; 4525 4526 /* Restore this and inner groups' (if any) registers. */ 4527 for (r = *p; r < (unsigned) *p + (unsigned) *(p + 1); 4528 r++) 4529 { 4530 regstart[r] = old_regstart[r]; 4531 4532 /* xx why this test? */ 4533 if (old_regend[r] >= regstart[r]) 4534 regend[r] = old_regend[r]; 4535 } 4536 } 4537 p1++; 4538 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 4539 PUSH_FAILURE_POINT (p1 + mcnt, d, -2); 4540 4541 goto fail; 4542 } 4543 } 4544 4545 /* Move past the register number and the inner group count. */ 4546 p += 2; 4547 break; 4548 4549 4550 /* \<digit> has been turned into a `duplicate' command which is 4551 followed by the numeric value of <digit> as the register number. */ 4552 case duplicate: 4553 { 4554 register const char *d2, *dend2; 4555 int regno = *p++; /* Get which register to match against. */ 4556 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno); 4557 4558 /* Can't back reference a group which we've never matched. */ 4559 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno])) 4560 goto fail; 4561 4562 /* Where in input to try to start matching. */ 4563 d2 = regstart[regno]; 4564 4565 /* Where to stop matching; if both the place to start and 4566 the place to stop matching are in the same string, then 4567 set to the place to stop, otherwise, for now have to use 4568 the end of the first string. */ 4569 4570 dend2 = ((FIRST_STRING_P (regstart[regno]) 4571 == FIRST_STRING_P (regend[regno])) 4572 ? regend[regno] : end_match_1); 4573 for (;;) 4574 { 4575 /* If necessary, advance to next segment in register 4576 contents. */ 4577 while (d2 == dend2) 4578 { 4579 if (dend2 == end_match_2) break; 4580 if (dend2 == regend[regno]) break; 4581 4582 /* End of string1 => advance to string2. */ 4583 d2 = string2; 4584 dend2 = regend[regno]; 4585 } 4586 /* At end of register contents => success */ 4587 if (d2 == dend2) break; 4588 4589 /* If necessary, advance to next segment in data. */ 4590 PREFETCH (); 4591 4592 /* How many characters left in this segment to match. */ 4593 mcnt = dend - d; 4594 4595 /* Want how many consecutive characters we can match in 4596 one shot, so, if necessary, adjust the count. */ 4597 if (mcnt > dend2 - d2) 4598 mcnt = dend2 - d2; 4599 4600 /* Compare that many; failure if mismatch, else move 4601 past them. */ 4602 if (translate 4603 ? bcmp_translate (d, d2, mcnt, translate) 4604 : memcmp (d, d2, mcnt)) 4605 goto fail; 4606 d += mcnt, d2 += mcnt; 4607 4608 /* Do this because we've match some characters. */ 4609 SET_REGS_MATCHED (); 4610 } 4611 } 4612 break; 4613 4614 4615 /* begline matches the empty string at the beginning of the string 4616 (unless `not_bol' is set in `bufp'), and, if 4617 `newline_anchor' is set, after newlines. */ 4618 case begline: 4619 DEBUG_PRINT1 ("EXECUTING begline.\n"); 4620 4621 if (AT_STRINGS_BEG (d)) 4622 { 4623 if (!bufp->not_bol) break; 4624 } 4625 else if (d[-1] == '\n' && bufp->newline_anchor) 4626 { 4627 break; 4628 } 4629 /* In all other cases, we fail. */ 4630 goto fail; 4631 4632 4633 /* endline is the dual of begline. */ 4634 case endline: 4635 DEBUG_PRINT1 ("EXECUTING endline.\n"); 4636 4637 if (AT_STRINGS_END (d)) 4638 { 4639 if (!bufp->not_eol) break; 4640 } 4641 4642 /* We have to ``prefetch'' the next character. */ 4643 else if ((d == end1 ? *string2 : *d) == '\n' 4644 && bufp->newline_anchor) 4645 { 4646 break; 4647 } 4648 goto fail; 4649 4650 4651 /* Match at the very beginning of the data. */ 4652 case begbuf: 4653 DEBUG_PRINT1 ("EXECUTING begbuf.\n"); 4654 if (AT_STRINGS_BEG (d)) 4655 break; 4656 goto fail; 4657 4658 4659 /* Match at the very end of the data. */ 4660 case endbuf: 4661 DEBUG_PRINT1 ("EXECUTING endbuf.\n"); 4662 if (AT_STRINGS_END (d)) 4663 break; 4664 goto fail; 4665 4666 4667 /* on_failure_keep_string_jump is used to optimize `.*\n'. It 4668 pushes NULL as the value for the string on the stack. Then 4669 `pop_failure_point' will keep the current value for the 4670 string, instead of restoring it. To see why, consider 4671 matching `foo\nbar' against `.*\n'. The .* matches the foo; 4672 then the . fails against the \n. But the next thing we want 4673 to do is match the \n against the \n; if we restored the 4674 string value, we would be back at the foo. 4675 4676 Because this is used only in specific cases, we don't need to 4677 check all the things that `on_failure_jump' does, to make 4678 sure the right things get saved on the stack. Hence we don't 4679 share its code. The only reason to push anything on the 4680 stack at all is that otherwise we would have to change 4681 `anychar's code to do something besides goto fail in this 4682 case; that seems worse than this. */ 4683 case on_failure_keep_string_jump: 4684 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump"); 4685 4686 EXTRACT_NUMBER_AND_INCR (mcnt, p); 4687#ifdef _LIBC 4688 DEBUG_PRINT3 (" %d (to %p):\n", mcnt, p + mcnt); 4689#else 4690 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt); 4691#endif 4692 4693 PUSH_FAILURE_POINT (p + mcnt, NULL, -2); 4694 break; 4695 4696 4697 /* Uses of on_failure_jump: 4698 4699 Each alternative starts with an on_failure_jump that points 4700 to the beginning of the next alternative. Each alternative 4701 except the last ends with a jump that in effect jumps past 4702 the rest of the alternatives. (They really jump to the 4703 ending jump of the following alternative, because tensioning 4704 these jumps is a hassle.) 4705 4706 Repeats start with an on_failure_jump that points past both 4707 the repetition text and either the following jump or 4708 pop_failure_jump back to this on_failure_jump. */ 4709 case on_failure_jump: 4710 on_failure: 4711 DEBUG_PRINT1 ("EXECUTING on_failure_jump"); 4712 4713 EXTRACT_NUMBER_AND_INCR (mcnt, p); 4714#ifdef _LIBC 4715 DEBUG_PRINT3 (" %d (to %p)", mcnt, p + mcnt); 4716#else 4717 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt); 4718#endif 4719 4720 /* If this on_failure_jump comes right before a group (i.e., 4721 the original * applied to a group), save the information 4722 for that group and all inner ones, so that if we fail back 4723 to this point, the group's information will be correct. 4724 For example, in \(a*\)*\1, we need the preceding group, 4725 and in \(zz\(a*\)b*\)\2, we need the inner group. */ 4726 4727 /* We can't use `p' to check ahead because we push 4728 a failure point to `p + mcnt' after we do this. */ 4729 p1 = p; 4730 4731 /* We need to skip no_op's before we look for the 4732 start_memory in case this on_failure_jump is happening as 4733 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1 4734 against aba. */ 4735 while (p1 < pend && (re_opcode_t) *p1 == no_op) 4736 p1++; 4737 4738 if (p1 < pend && (re_opcode_t) *p1 == start_memory) 4739 { 4740 /* We have a new highest active register now. This will 4741 get reset at the start_memory we are about to get to, 4742 but we will have saved all the registers relevant to 4743 this repetition op, as described above. */ 4744 highest_active_reg = *(p1 + 1) + *(p1 + 2); 4745 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) 4746 lowest_active_reg = *(p1 + 1); 4747 } 4748 4749 DEBUG_PRINT1 (":\n"); 4750 PUSH_FAILURE_POINT (p + mcnt, d, -2); 4751 break; 4752 4753 4754 /* A smart repeat ends with `maybe_pop_jump'. 4755 We change it to either `pop_failure_jump' or `jump'. */ 4756 case maybe_pop_jump: 4757 EXTRACT_NUMBER_AND_INCR (mcnt, p); 4758 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt); 4759 { 4760 register unsigned char *p2 = p; 4761 4762 /* Compare the beginning of the repeat with what in the 4763 pattern follows its end. If we can establish that there 4764 is nothing that they would both match, i.e., that we 4765 would have to backtrack because of (as in, e.g., `a*a') 4766 then we can change to pop_failure_jump, because we'll 4767 never have to backtrack. 4768 4769 This is not true in the case of alternatives: in 4770 `(a|ab)*' we do need to backtrack to the `ab' alternative 4771 (e.g., if the string was `ab'). But instead of trying to 4772 detect that here, the alternative has put on a dummy 4773 failure point which is what we will end up popping. */ 4774 4775 /* Skip over open/close-group commands. 4776 If what follows this loop is a ...+ construct, 4777 look at what begins its body, since we will have to 4778 match at least one of that. */ 4779 while (1) 4780 { 4781 if (p2 + 2 < pend 4782 && ((re_opcode_t) *p2 == stop_memory 4783 || (re_opcode_t) *p2 == start_memory)) 4784 p2 += 3; 4785 else if (p2 + 6 < pend 4786 && (re_opcode_t) *p2 == dummy_failure_jump) 4787 p2 += 6; 4788 else 4789 break; 4790 } 4791 4792 p1 = p + mcnt; 4793 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding 4794 to the `maybe_finalize_jump' of this case. Examine what 4795 follows. */ 4796 4797 /* If we're at the end of the pattern, we can change. */ 4798 if (p2 == pend) 4799 { 4800 /* Consider what happens when matching ":\(.*\)" 4801 against ":/". I don't really understand this code 4802 yet. */ 4803 p[-3] = (unsigned char) pop_failure_jump; 4804 DEBUG_PRINT1 4805 (" End of pattern: change to `pop_failure_jump'.\n"); 4806 } 4807 4808 else if ((re_opcode_t) *p2 == exactn 4809 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline)) 4810 { 4811 register unsigned char c 4812 = *p2 == (unsigned char) endline ? '\n' : p2[2]; 4813 4814 if ((re_opcode_t) p1[3] == exactn && p1[5] != c) 4815 { 4816 p[-3] = (unsigned char) pop_failure_jump; 4817 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n", 4818 c, p1[5]); 4819 } 4820 4821 else if ((re_opcode_t) p1[3] == charset 4822 || (re_opcode_t) p1[3] == charset_not) 4823 { 4824 int not = (re_opcode_t) p1[3] == charset_not; 4825 4826 if (c < (unsigned char) (p1[4] * BYTEWIDTH) 4827 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) 4828 not = !not; 4829 4830 /* `not' is equal to 1 if c would match, which means 4831 that we can't change to pop_failure_jump. */ 4832 if (!not) 4833 { 4834 p[-3] = (unsigned char) pop_failure_jump; 4835 DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); 4836 } 4837 } 4838 } 4839 else if ((re_opcode_t) *p2 == charset) 4840 { 4841 /* We win if the first character of the loop is not part 4842 of the charset. */ 4843 if ((re_opcode_t) p1[3] == exactn 4844 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5] 4845 && (p2[2 + p1[5] / BYTEWIDTH] 4846 & (1 << (p1[5] % BYTEWIDTH))))) 4847 { 4848 p[-3] = (unsigned char) pop_failure_jump; 4849 DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); 4850 } 4851 4852 else if ((re_opcode_t) p1[3] == charset_not) 4853 { 4854 int idx; 4855 /* We win if the charset_not inside the loop 4856 lists every character listed in the charset after. */ 4857 for (idx = 0; idx < (int) p2[1]; idx++) 4858 if (! (p2[2 + idx] == 0 4859 || (idx < (int) p1[4] 4860 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0)))) 4861 break; 4862 4863 if (idx == p2[1]) 4864 { 4865 p[-3] = (unsigned char) pop_failure_jump; 4866 DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); 4867 } 4868 } 4869 else if ((re_opcode_t) p1[3] == charset) 4870 { 4871 int idx; 4872 /* We win if the charset inside the loop 4873 has no overlap with the one after the loop. */ 4874 for (idx = 0; 4875 idx < (int) p2[1] && idx < (int) p1[4]; 4876 idx++) 4877 if ((p2[2 + idx] & p1[5 + idx]) != 0) 4878 break; 4879 4880 if (idx == p2[1] || idx == p1[4]) 4881 { 4882 p[-3] = (unsigned char) pop_failure_jump; 4883 DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); 4884 } 4885 } 4886 } 4887 } 4888 p -= 2; /* Point at relative address again. */ 4889 if ((re_opcode_t) p[-1] != pop_failure_jump) 4890 { 4891 p[-1] = (unsigned char) jump; 4892 DEBUG_PRINT1 (" Match => jump.\n"); 4893 goto unconditional_jump; 4894 } 4895 /* Note fall through. */ 4896 4897 4898 /* The end of a simple repeat has a pop_failure_jump back to 4899 its matching on_failure_jump, where the latter will push a 4900 failure point. The pop_failure_jump takes off failure 4901 points put on by this pop_failure_jump's matching 4902 on_failure_jump; we got through the pattern to here from the 4903 matching on_failure_jump, so didn't fail. */ 4904 case pop_failure_jump: 4905 { 4906 /* We need to pass separate storage for the lowest and 4907 highest registers, even though we don't care about the 4908 actual values. Otherwise, we will restore only one 4909 register from the stack, since lowest will == highest in 4910 `pop_failure_point'. */ 4911 active_reg_t dummy_low_reg, dummy_high_reg; 4912 unsigned char *pdummy; 4913 const char *sdummy; 4914 4915 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n"); 4916 POP_FAILURE_POINT (sdummy, pdummy, 4917 dummy_low_reg, dummy_high_reg, 4918 reg_dummy, reg_dummy, reg_info_dummy); 4919 } 4920 /* Note fall through. */ 4921 4922 unconditional_jump: 4923#ifdef _LIBC 4924 DEBUG_PRINT2 ("\n%p: ", p); 4925#else 4926 DEBUG_PRINT2 ("\n0x%x: ", p); 4927#endif 4928 /* Note fall through. */ 4929 4930 /* Unconditionally jump (without popping any failure points). */ 4931 case jump: 4932 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */ 4933 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt); 4934 p += mcnt; /* Do the jump. */ 4935#ifdef _LIBC 4936 DEBUG_PRINT2 ("(to %p).\n", p); 4937#else 4938 DEBUG_PRINT2 ("(to 0x%x).\n", p); 4939#endif 4940 break; 4941 4942 4943 /* We need this opcode so we can detect where alternatives end 4944 in `group_match_null_string_p' et al. */ 4945 case jump_past_alt: 4946 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n"); 4947 goto unconditional_jump; 4948 4949 4950 /* Normally, the on_failure_jump pushes a failure point, which 4951 then gets popped at pop_failure_jump. We will end up at 4952 pop_failure_jump, also, and with a pattern of, say, `a+', we 4953 are skipping over the on_failure_jump, so we have to push 4954 something meaningless for pop_failure_jump to pop. */ 4955 case dummy_failure_jump: 4956 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n"); 4957 /* It doesn't matter what we push for the string here. What 4958 the code at `fail' tests is the value for the pattern. */ 4959 PUSH_FAILURE_POINT (NULL, NULL, -2); 4960 goto unconditional_jump; 4961 4962 4963 /* At the end of an alternative, we need to push a dummy failure 4964 point in case we are followed by a `pop_failure_jump', because 4965 we don't want the failure point for the alternative to be 4966 popped. For example, matching `(a|ab)*' against `aab' 4967 requires that we match the `ab' alternative. */ 4968 case push_dummy_failure: 4969 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n"); 4970 /* See comments just above at `dummy_failure_jump' about the 4971 two zeroes. */ 4972 PUSH_FAILURE_POINT (NULL, NULL, -2); 4973 break; 4974 4975 /* Have to succeed matching what follows at least n times. 4976 After that, handle like `on_failure_jump'. */ 4977 case succeed_n: 4978 EXTRACT_NUMBER (mcnt, p + 2); 4979 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt); 4980 4981 assert (mcnt >= 0); 4982 /* Originally, this is how many times we HAVE to succeed. */ 4983 if (mcnt > 0) 4984 { 4985 mcnt--; 4986 p += 2; 4987 STORE_NUMBER_AND_INCR (p, mcnt); 4988#ifdef _LIBC 4989 DEBUG_PRINT3 (" Setting %p to %d.\n", p - 2, mcnt); 4990#else 4991 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p - 2, mcnt); 4992#endif 4993 } 4994 else if (mcnt == 0) 4995 { 4996#ifdef _LIBC 4997 DEBUG_PRINT2 (" Setting two bytes from %p to no_op.\n", p+2); 4998#else 4999 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2); 5000#endif 5001 p[2] = (unsigned char) no_op; 5002 p[3] = (unsigned char) no_op; 5003 goto on_failure; 5004 } 5005 break; 5006 5007 case jump_n: 5008 EXTRACT_NUMBER (mcnt, p + 2); 5009 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt); 5010 5011 /* Originally, this is how many times we CAN jump. */ 5012 if (mcnt) 5013 { 5014 mcnt--; 5015 STORE_NUMBER (p + 2, mcnt); 5016#ifdef _LIBC 5017 DEBUG_PRINT3 (" Setting %p to %d.\n", p + 2, mcnt); 5018#else 5019 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p + 2, mcnt); 5020#endif 5021 goto unconditional_jump; 5022 } 5023 /* If don't have to jump any more, skip over the rest of command. */ 5024 else 5025 p += 4; 5026 break; 5027 5028 case set_number_at: 5029 { 5030 DEBUG_PRINT1 ("EXECUTING set_number_at.\n"); 5031 5032 EXTRACT_NUMBER_AND_INCR (mcnt, p); 5033 p1 = p + mcnt; 5034 EXTRACT_NUMBER_AND_INCR (mcnt, p); 5035#ifdef _LIBC 5036 DEBUG_PRINT3 (" Setting %p to %d.\n", p1, mcnt); 5037#else 5038 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt); 5039#endif 5040 STORE_NUMBER (p1, mcnt); 5041 break; 5042 } 5043 5044#if 0 5045 /* The DEC Alpha C compiler 3.x generates incorrect code for the 5046 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of 5047 AT_WORD_BOUNDARY, so this code is disabled. Expanding the 5048 macro and introducing temporary variables works around the bug. */ 5049 5050 case wordbound: 5051 DEBUG_PRINT1 ("EXECUTING wordbound.\n"); 5052 if (AT_WORD_BOUNDARY (d)) 5053 break; 5054 goto fail; 5055 5056 case notwordbound: 5057 DEBUG_PRINT1 ("EXECUTING notwordbound.\n"); 5058 if (AT_WORD_BOUNDARY (d)) 5059 goto fail; 5060 break; 5061#else 5062 case wordbound: 5063 { 5064 boolean prevchar, thischar; 5065 5066 DEBUG_PRINT1 ("EXECUTING wordbound.\n"); 5067 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d)) 5068 break; 5069 5070 prevchar = WORDCHAR_P (d - 1); 5071 thischar = WORDCHAR_P (d); 5072 if (prevchar != thischar) 5073 break; 5074 goto fail; 5075 } 5076 5077 case notwordbound: 5078 { 5079 boolean prevchar, thischar; 5080 5081 DEBUG_PRINT1 ("EXECUTING notwordbound.\n"); 5082 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d)) 5083 goto fail; 5084 5085 prevchar = WORDCHAR_P (d - 1); 5086 thischar = WORDCHAR_P (d); 5087 if (prevchar != thischar) 5088 goto fail; 5089 break; 5090 } 5091#endif 5092 5093 case wordbeg: 5094 DEBUG_PRINT1 ("EXECUTING wordbeg.\n"); 5095 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1))) 5096 break; 5097 goto fail; 5098 5099 case wordend: 5100 DEBUG_PRINT1 ("EXECUTING wordend.\n"); 5101 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1) 5102 && (!WORDCHAR_P (d) || AT_STRINGS_END (d))) 5103 break; 5104 goto fail; 5105 5106#ifdef emacs 5107 case before_dot: 5108 DEBUG_PRINT1 ("EXECUTING before_dot.\n"); 5109 if (PTR_CHAR_POS ((unsigned char *) d) >= point) 5110 goto fail; 5111 break; 5112 5113 case at_dot: 5114 DEBUG_PRINT1 ("EXECUTING at_dot.\n"); 5115 if (PTR_CHAR_POS ((unsigned char *) d) != point) 5116 goto fail; 5117 break; 5118 5119 case after_dot: 5120 DEBUG_PRINT1 ("EXECUTING after_dot.\n"); 5121 if (PTR_CHAR_POS ((unsigned char *) d) <= point) 5122 goto fail; 5123 break; 5124 5125 case syntaxspec: 5126 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt); 5127 mcnt = *p++; 5128 goto matchsyntax; 5129 5130 case wordchar: 5131 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n"); 5132 mcnt = (int) Sword; 5133 matchsyntax: 5134 PREFETCH (); 5135 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */ 5136 d++; 5137 if (SYNTAX (d[-1]) != (enum syntaxcode) mcnt) 5138 goto fail; 5139 SET_REGS_MATCHED (); 5140 break; 5141 5142 case notsyntaxspec: 5143 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt); 5144 mcnt = *p++; 5145 goto matchnotsyntax; 5146 5147 case notwordchar: 5148 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n"); 5149 mcnt = (int) Sword; 5150 matchnotsyntax: 5151 PREFETCH (); 5152 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */ 5153 d++; 5154 if (SYNTAX (d[-1]) == (enum syntaxcode) mcnt) 5155 goto fail; 5156 SET_REGS_MATCHED (); 5157 break; 5158 5159#else /* not emacs */ 5160 case wordchar: 5161 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n"); 5162 PREFETCH (); 5163 if (!WORDCHAR_P (d)) 5164 goto fail; 5165 SET_REGS_MATCHED (); 5166 d++; 5167 break; 5168 5169 case notwordchar: 5170 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n"); 5171 PREFETCH (); 5172 if (WORDCHAR_P (d)) 5173 goto fail; 5174 SET_REGS_MATCHED (); 5175 d++; 5176 break; 5177#endif /* not emacs */ 5178 5179 default: 5180 abort (); 5181 } 5182 continue; /* Successfully executed one pattern command; keep going. */ 5183 5184 5185 /* We goto here if a matching operation fails. */ 5186 fail: 5187 if (!FAIL_STACK_EMPTY ()) 5188 { /* A restart point is known. Restore to that state. */ 5189 DEBUG_PRINT1 ("\nFAIL:\n"); 5190 POP_FAILURE_POINT (d, p, 5191 lowest_active_reg, highest_active_reg, 5192 regstart, regend, reg_info); 5193 5194 /* If this failure point is a dummy, try the next one. */ 5195 if (!p) 5196 goto fail; 5197 5198 /* If we failed to the end of the pattern, don't examine *p. */ 5199 assert (p <= pend); 5200 if (p < pend) 5201 { 5202 boolean is_a_jump_n = false; 5203 5204 /* If failed to a backwards jump that's part of a repetition 5205 loop, need to pop this failure point and use the next one. */ 5206 switch ((re_opcode_t) *p) 5207 { 5208 case jump_n: 5209 is_a_jump_n = true; 5210 case maybe_pop_jump: 5211 case pop_failure_jump: 5212 case jump: 5213 p1 = p + 1; 5214 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 5215 p1 += mcnt; 5216 5217 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n) 5218 || (!is_a_jump_n 5219 && (re_opcode_t) *p1 == on_failure_jump)) 5220 goto fail; 5221 break; 5222 default: 5223 /* do nothing */ ; 5224 } 5225 } 5226 5227 if (d >= string1 && d <= end1) 5228 dend = end_match_1; 5229 } 5230 else 5231 break; /* Matching at this starting point really fails. */ 5232 } /* for (;;) */ 5233 5234 if (best_regs_set) 5235 goto restore_best_regs; 5236 5237 FREE_VARIABLES (); 5238 5239 return -1; /* Failure to match. */ 5240} /* re_match_2 */ 5241 5242/* Subroutine definitions for re_match_2. */ 5243 5244 5245/* We are passed P pointing to a register number after a start_memory. 5246 5247 Return true if the pattern up to the corresponding stop_memory can 5248 match the empty string, and false otherwise. 5249 5250 If we find the matching stop_memory, sets P to point to one past its number. 5251 Otherwise, sets P to an undefined byte less than or equal to END. 5252 5253 We don't handle duplicates properly (yet). */ 5254 5255static boolean 5256group_match_null_string_p (p, end, reg_info) 5257 unsigned char **p, *end; 5258 register_info_type *reg_info; 5259{ 5260 int mcnt; 5261 /* Point to after the args to the start_memory. */ 5262 unsigned char *p1 = *p + 2; 5263 5264 while (p1 < end) 5265 { 5266 /* Skip over opcodes that can match nothing, and return true or 5267 false, as appropriate, when we get to one that can't, or to the 5268 matching stop_memory. */ 5269 5270 switch ((re_opcode_t) *p1) 5271 { 5272 /* Could be either a loop or a series of alternatives. */ 5273 case on_failure_jump: 5274 p1++; 5275 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 5276 5277 /* If the next operation is not a jump backwards in the 5278 pattern. */ 5279 5280 if (mcnt >= 0) 5281 { 5282 /* Go through the on_failure_jumps of the alternatives, 5283 seeing if any of the alternatives cannot match nothing. 5284 The last alternative starts with only a jump, 5285 whereas the rest start with on_failure_jump and end 5286 with a jump, e.g., here is the pattern for `a|b|c': 5287 5288 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6 5289 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3 5290 /exactn/1/c 5291 5292 So, we have to first go through the first (n-1) 5293 alternatives and then deal with the last one separately. */ 5294 5295 5296 /* Deal with the first (n-1) alternatives, which start 5297 with an on_failure_jump (see above) that jumps to right 5298 past a jump_past_alt. */ 5299 5300 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt) 5301 { 5302 /* `mcnt' holds how many bytes long the alternative 5303 is, including the ending `jump_past_alt' and 5304 its number. */ 5305 5306 if (!alt_match_null_string_p (p1, p1 + mcnt - 3, 5307 reg_info)) 5308 return false; 5309 5310 /* Move to right after this alternative, including the 5311 jump_past_alt. */ 5312 p1 += mcnt; 5313 5314 /* Break if it's the beginning of an n-th alternative 5315 that doesn't begin with an on_failure_jump. */ 5316 if ((re_opcode_t) *p1 != on_failure_jump) 5317 break; 5318 5319 /* Still have to check that it's not an n-th 5320 alternative that starts with an on_failure_jump. */ 5321 p1++; 5322 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 5323 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt) 5324 { 5325 /* Get to the beginning of the n-th alternative. */ 5326 p1 -= 3; 5327 break; 5328 } 5329 } 5330 5331 /* Deal with the last alternative: go back and get number 5332 of the `jump_past_alt' just before it. `mcnt' contains 5333 the length of the alternative. */ 5334 EXTRACT_NUMBER (mcnt, p1 - 2); 5335 5336 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info)) 5337 return false; 5338 5339 p1 += mcnt; /* Get past the n-th alternative. */ 5340 } /* if mcnt > 0 */ 5341 break; 5342 5343 5344 case stop_memory: 5345 assert (p1[1] == **p); 5346 *p = p1 + 2; 5347 return true; 5348 5349 5350 default: 5351 if (!common_op_match_null_string_p (&p1, end, reg_info)) 5352 return false; 5353 } 5354 } /* while p1 < end */ 5355 5356 return false; 5357} /* group_match_null_string_p */ 5358 5359 5360/* Similar to group_match_null_string_p, but doesn't deal with alternatives: 5361 It expects P to be the first byte of a single alternative and END one 5362 byte past the last. The alternative can contain groups. */ 5363 5364static boolean 5365alt_match_null_string_p (p, end, reg_info) 5366 unsigned char *p, *end; 5367 register_info_type *reg_info; 5368{ 5369 int mcnt; 5370 unsigned char *p1 = p; 5371 5372 while (p1 < end) 5373 { 5374 /* Skip over opcodes that can match nothing, and break when we get 5375 to one that can't. */ 5376 5377 switch ((re_opcode_t) *p1) 5378 { 5379 /* It's a loop. */ 5380 case on_failure_jump: 5381 p1++; 5382 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 5383 p1 += mcnt; 5384 break; 5385 5386 default: 5387 if (!common_op_match_null_string_p (&p1, end, reg_info)) 5388 return false; 5389 } 5390 } /* while p1 < end */ 5391 5392 return true; 5393} /* alt_match_null_string_p */ 5394 5395 5396/* Deals with the ops common to group_match_null_string_p and 5397 alt_match_null_string_p. 5398 5399 Sets P to one after the op and its arguments, if any. */ 5400 5401static boolean 5402common_op_match_null_string_p (p, end, reg_info) 5403 unsigned char **p, *end; 5404 register_info_type *reg_info; 5405{ 5406 int mcnt; 5407 boolean ret; 5408 int reg_no; 5409 unsigned char *p1 = *p; 5410 5411 switch ((re_opcode_t) *p1++) 5412 { 5413 case no_op: 5414 case begline: 5415 case endline: 5416 case begbuf: 5417 case endbuf: 5418 case wordbeg: 5419 case wordend: 5420 case wordbound: 5421 case notwordbound: 5422#ifdef emacs 5423 case before_dot: 5424 case at_dot: 5425 case after_dot: 5426#endif 5427 break; 5428 5429 case start_memory: 5430 reg_no = *p1; 5431 assert (reg_no > 0 && reg_no <= MAX_REGNUM); 5432 ret = group_match_null_string_p (&p1, end, reg_info); 5433 5434 /* Have to set this here in case we're checking a group which 5435 contains a group and a back reference to it. */ 5436 5437 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE) 5438 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret; 5439 5440 if (!ret) 5441 return false; 5442 break; 5443 5444 /* If this is an optimized succeed_n for zero times, make the jump. */ 5445 case jump: 5446 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 5447 if (mcnt >= 0) 5448 p1 += mcnt; 5449 else 5450 return false; 5451 break; 5452 5453 case succeed_n: 5454 /* Get to the number of times to succeed. */ 5455 p1 += 2; 5456 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 5457 5458 if (mcnt == 0) 5459 { 5460 p1 -= 4; 5461 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 5462 p1 += mcnt; 5463 } 5464 else 5465 return false; 5466 break; 5467 5468 case duplicate: 5469 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1])) 5470 return false; 5471 break; 5472 5473 case set_number_at: 5474 p1 += 4; 5475 5476 default: 5477 /* All other opcodes mean we cannot match the empty string. */ 5478 return false; 5479 } 5480 5481 *p = p1; 5482 return true; 5483} /* common_op_match_null_string_p */ 5484 5485 5486/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN 5487 bytes; nonzero otherwise. */ 5488 5489static int 5490bcmp_translate (s1, s2, len, translate) 5491 const char *s1, *s2; 5492 register int len; 5493 RE_TRANSLATE_TYPE translate; 5494{ 5495 register const unsigned char *p1 = (const unsigned char *) s1; 5496 register const unsigned char *p2 = (const unsigned char *) s2; 5497 while (len) 5498 { 5499 if (translate[*p1++] != translate[*p2++]) return 1; 5500 len--; 5501 } 5502 return 0; 5503} 5504 5505/* Entry points for GNU code. */ 5506 5507/* re_compile_pattern is the GNU regular expression compiler: it 5508 compiles PATTERN (of length SIZE) and puts the result in BUFP. 5509 Returns 0 if the pattern was valid, otherwise an error string. 5510 5511 Assumes the `allocated' (and perhaps `buffer') and `translate' fields 5512 are set in BUFP on entry. 5513 5514 We call regex_compile to do the actual compilation. */ 5515 5516const char * 5517re_compile_pattern (pattern, length, bufp) 5518 const char *pattern; 5519 size_t length; 5520 struct re_pattern_buffer *bufp; 5521{ 5522 reg_errcode_t ret; 5523 5524 /* GNU code is written to assume at least RE_NREGS registers will be set 5525 (and at least one extra will be -1). */ 5526 bufp->regs_allocated = REGS_UNALLOCATED; 5527 5528 /* And GNU code determines whether or not to get register information 5529 by passing null for the REGS argument to re_match, etc., not by 5530 setting no_sub. */ 5531 bufp->no_sub = 0; 5532 5533 /* Match anchors at newline. */ 5534 bufp->newline_anchor = 1; 5535 5536 ret = regex_compile (pattern, length, re_syntax_options, bufp); 5537 5538 if (!ret) 5539 return NULL; 5540 return gettext (re_error_msgid + re_error_msgid_idx[(int) ret]); 5541} 5542#ifdef _LIBC 5543weak_alias (__re_compile_pattern, re_compile_pattern) 5544#endif 5545 5546/* Entry points compatible with 4.2 BSD regex library. We don't define 5547 them unless specifically requested. */ 5548 5549#if defined _REGEX_RE_COMP || defined _LIBC 5550 5551/* BSD has one and only one pattern buffer. */ 5552static struct re_pattern_buffer re_comp_buf; 5553 5554char * 5555#ifdef _LIBC 5556/* Make these definitions weak in libc, so POSIX programs can redefine 5557 these names if they don't use our functions, and still use 5558 regcomp/regexec below without link errors. */ 5559weak_function 5560#endif 5561re_comp (s) 5562 const char *s; 5563{ 5564 reg_errcode_t ret; 5565 5566 if (!s) 5567 { 5568 if (!re_comp_buf.buffer) 5569 return gettext ("No previous regular expression"); 5570 return 0; 5571 } 5572 5573 if (!re_comp_buf.buffer) 5574 { 5575 re_comp_buf.buffer = (unsigned char *) malloc (200); 5576 if (re_comp_buf.buffer == NULL) 5577 return (char *) gettext (re_error_msgid 5578 + re_error_msgid_idx[(int) REG_ESPACE]); 5579 re_comp_buf.allocated = 200; 5580 5581 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH); 5582 if (re_comp_buf.fastmap == NULL) 5583 return (char *) gettext (re_error_msgid 5584 + re_error_msgid_idx[(int) REG_ESPACE]); 5585 } 5586 5587 /* Since `re_exec' always passes NULL for the `regs' argument, we 5588 don't need to initialize the pattern buffer fields which affect it. */ 5589 5590 /* Match anchors at newlines. */ 5591 re_comp_buf.newline_anchor = 1; 5592 5593 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf); 5594 5595 if (!ret) 5596 return NULL; 5597 5598 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */ 5599 return (char *) gettext (re_error_msgid + re_error_msgid_idx[(int) ret]); 5600} 5601 5602 5603int 5604#ifdef _LIBC 5605weak_function 5606#endif 5607re_exec (s) 5608 const char *s; 5609{ 5610 const int len = strlen (s); 5611 return 5612 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0); 5613} 5614 5615#endif /* _REGEX_RE_COMP */ 5616 5617/* POSIX.2 functions. Don't define these for Emacs. */ 5618 5619#ifndef emacs 5620 5621/* regcomp takes a regular expression as a string and compiles it. 5622 5623 PREG is a regex_t *. We do not expect any fields to be initialized, 5624 since POSIX says we shouldn't. Thus, we set 5625 5626 `buffer' to the compiled pattern; 5627 `used' to the length of the compiled pattern; 5628 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the 5629 REG_EXTENDED bit in CFLAGS is set; otherwise, to 5630 RE_SYNTAX_POSIX_BASIC; 5631 `newline_anchor' to REG_NEWLINE being set in CFLAGS; 5632 `fastmap' to an allocated space for the fastmap; 5633 `fastmap_accurate' to zero; 5634 `re_nsub' to the number of subexpressions in PATTERN. 5635 5636 PATTERN is the address of the pattern string. 5637 5638 CFLAGS is a series of bits which affect compilation. 5639 5640 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we 5641 use POSIX basic syntax. 5642 5643 If REG_NEWLINE is set, then . and [^...] don't match newline. 5644 Also, regexec will try a match beginning after every newline. 5645 5646 If REG_ICASE is set, then we considers upper- and lowercase 5647 versions of letters to be equivalent when matching. 5648 5649 If REG_NOSUB is set, then when PREG is passed to regexec, that 5650 routine will report only success or failure, and nothing about the 5651 registers. 5652 5653 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for 5654 the return codes and their meanings.) */ 5655 5656int 5657regcomp (preg, pattern, cflags) 5658 regex_t *preg; 5659 const char *pattern; 5660 int cflags; 5661{ 5662 reg_errcode_t ret; 5663 reg_syntax_t syntax 5664 = (cflags & REG_EXTENDED) ? 5665 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC; 5666 5667 /* regex_compile will allocate the space for the compiled pattern. */ 5668 preg->buffer = 0; 5669 preg->allocated = 0; 5670 preg->used = 0; 5671 5672 /* Try to allocate space for the fastmap. */ 5673 preg->fastmap = (char *) malloc (1 << BYTEWIDTH); 5674 5675 if (cflags & REG_ICASE) 5676 { 5677 unsigned i; 5678 5679 preg->translate 5680 = (RE_TRANSLATE_TYPE) malloc (CHAR_SET_SIZE 5681 * sizeof (*(RE_TRANSLATE_TYPE)0)); 5682 if (preg->translate == NULL) 5683 return (int) REG_ESPACE; 5684 5685 /* Map uppercase characters to corresponding lowercase ones. */ 5686 for (i = 0; i < CHAR_SET_SIZE; i++) 5687 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i; 5688 } 5689 else 5690 preg->translate = NULL; 5691 5692 /* If REG_NEWLINE is set, newlines are treated differently. */ 5693 if (cflags & REG_NEWLINE) 5694 { /* REG_NEWLINE implies neither . nor [^...] match newline. */ 5695 syntax &= ~RE_DOT_NEWLINE; 5696 syntax |= RE_HAT_LISTS_NOT_NEWLINE; 5697 /* It also changes the matching behavior. */ 5698 preg->newline_anchor = 1; 5699 } 5700 else 5701 preg->newline_anchor = 0; 5702 5703 preg->no_sub = !!(cflags & REG_NOSUB); 5704 5705 /* POSIX says a null character in the pattern terminates it, so we 5706 can use strlen here in compiling the pattern. */ 5707 ret = regex_compile (pattern, strlen (pattern), syntax, preg); 5708 5709 /* POSIX doesn't distinguish between an unmatched open-group and an 5710 unmatched close-group: both are REG_EPAREN. */ 5711 if (ret == REG_ERPAREN) ret = REG_EPAREN; 5712 5713 if (ret == REG_NOERROR && preg->fastmap) 5714 { 5715 /* Compute the fastmap now, since regexec cannot modify the pattern 5716 buffer. */ 5717 if (re_compile_fastmap (preg) == -2) 5718 { 5719 /* Some error occured while computing the fastmap, just forget 5720 about it. */ 5721 free (preg->fastmap); 5722 preg->fastmap = NULL; 5723 } 5724 } 5725 5726 return (int) ret; 5727} 5728#ifdef _LIBC 5729weak_alias (__regcomp, regcomp) 5730#endif 5731 5732 5733/* regexec searches for a given pattern, specified by PREG, in the 5734 string STRING. 5735 5736 If NMATCH is zero or REG_NOSUB was set in the cflags argument to 5737 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at 5738 least NMATCH elements, and we set them to the offsets of the 5739 corresponding matched substrings. 5740 5741 EFLAGS specifies `execution flags' which affect matching: if 5742 REG_NOTBOL is set, then ^ does not match at the beginning of the 5743 string; if REG_NOTEOL is set, then $ does not match at the end. 5744 5745 We return 0 if we find a match and REG_NOMATCH if not. */ 5746 5747int 5748regexec (preg, string, nmatch, pmatch, eflags) 5749 const regex_t *preg; 5750 const char *string; 5751 size_t nmatch; 5752 regmatch_t pmatch[]; 5753 int eflags; 5754{ 5755 int ret; 5756 struct re_registers regs; 5757 regex_t private_preg; 5758 int len = strlen (string); 5759 boolean want_reg_info = !preg->no_sub && nmatch > 0; 5760 5761 private_preg = *preg; 5762 5763 private_preg.not_bol = !!(eflags & REG_NOTBOL); 5764 private_preg.not_eol = !!(eflags & REG_NOTEOL); 5765 5766 /* The user has told us exactly how many registers to return 5767 information about, via `nmatch'. We have to pass that on to the 5768 matching routines. */ 5769 private_preg.regs_allocated = REGS_FIXED; 5770 5771 if (want_reg_info) 5772 { 5773 regs.num_regs = nmatch; 5774 regs.start = TALLOC (nmatch * 2, regoff_t); 5775 if (regs.start == NULL) 5776 return (int) REG_NOMATCH; 5777 regs.end = regs.start + nmatch; 5778 } 5779 5780 /* Perform the searching operation. */ 5781 ret = re_search (&private_preg, string, len, 5782 /* start: */ 0, /* range: */ len, 5783 want_reg_info ? ®s : (struct re_registers *) 0); 5784 5785 /* Copy the register information to the POSIX structure. */ 5786 if (want_reg_info) 5787 { 5788 if (ret >= 0) 5789 { 5790 unsigned r; 5791 5792 for (r = 0; r < nmatch; r++) 5793 { 5794 pmatch[r].rm_so = regs.start[r]; 5795 pmatch[r].rm_eo = regs.end[r]; 5796 } 5797 } 5798 5799 /* If we needed the temporary register info, free the space now. */ 5800 free (regs.start); 5801 } 5802 5803 /* We want zero return to mean success, unlike `re_search'. */ 5804 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH; 5805} 5806#ifdef _LIBC 5807weak_alias (__regexec, regexec) 5808#endif 5809 5810 5811/* Returns a message corresponding to an error code, ERRCODE, returned 5812 from either regcomp or regexec. We don't use PREG here. */ 5813 5814size_t 5815regerror (errcode, preg, errbuf, errbuf_size) 5816 int errcode; 5817 const regex_t *preg; 5818 char *errbuf; 5819 size_t errbuf_size; 5820{ 5821 const char *msg; 5822 size_t msg_size; 5823 5824 if (errcode < 0 5825 || errcode >= (int) (sizeof (re_error_msgid_idx) 5826 / sizeof (re_error_msgid_idx[0]))) 5827 /* Only error codes returned by the rest of the code should be passed 5828 to this routine. If we are given anything else, or if other regex 5829 code generates an invalid error code, then the program has a bug. 5830 Dump core so we can fix it. */ 5831 abort (); 5832 5833 msg = gettext (re_error_msgid + re_error_msgid_idx[errcode]); 5834 5835 msg_size = strlen (msg) + 1; /* Includes the null. */ 5836 5837 if (errbuf_size != 0) 5838 { 5839 if (msg_size > errbuf_size) 5840 { 5841#if defined HAVE_MEMPCPY || defined _LIBC 5842 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0'; 5843#else 5844 memcpy (errbuf, msg, errbuf_size - 1); 5845 errbuf[errbuf_size - 1] = 0; 5846#endif 5847 } 5848 else 5849 memcpy (errbuf, msg, msg_size); 5850 } 5851 5852 return msg_size; 5853} 5854#ifdef _LIBC 5855weak_alias (__regerror, regerror) 5856#endif 5857 5858 5859/* Free dynamically allocated space used by PREG. */ 5860 5861void 5862regfree (preg) 5863 regex_t *preg; 5864{ 5865 if (preg->buffer != NULL) 5866 free (preg->buffer); 5867 preg->buffer = NULL; 5868 5869 preg->allocated = 0; 5870 preg->used = 0; 5871 5872 if (preg->fastmap != NULL) 5873 free (preg->fastmap); 5874 preg->fastmap = NULL; 5875 preg->fastmap_accurate = 0; 5876 5877 if (preg->translate != NULL) 5878 free (preg->translate); 5879 preg->translate = NULL; 5880} 5881#ifdef _LIBC 5882weak_alias (__regfree, regfree) 5883#endif 5884 5885#endif /* not emacs */ 5886