1/* Extended regular expression matching and search library, 2 version 0.12. 3 (Implements POSIX draft P1003.2/D11.2, except for some of the 4 internationalization features.) 5 6 Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 7 2002, 2003, 2004, 2006 Free Software Foundation, Inc. 8 9 This program is free software: you can redistribute it and/or modify 10 it under the terms of the GNU General Public License as published by 11 the Free Software Foundation; either version 3 of the License, or 12 (at your option) any later version. 13 14 This program is distributed in the hope that it will be useful, 15 but WITHOUT ANY WARRANTY; without even the implied warranty of 16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 GNU General Public License for more details. 18 19 You should have received a copy of the GNU General Public License 20 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 21 22/* AIX requires this to be the first thing in the file. */ 23#if defined _AIX && !defined REGEX_MALLOC 24 #pragma alloca 25#endif 26 27#undef _GNU_SOURCE 28#define _GNU_SOURCE 29 30#ifdef HAVE_CONFIG_H 31# include <config.h> 32#endif 33 34#ifndef INSIDE_RECURSION 35 36# include <stddef.h> 37 38# define WIDE_CHAR_SUPPORT (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC) 39 40/* For platform which support the ISO C amendement 1 functionality we 41 support user defined character classes. */ 42# if defined _LIBC || WIDE_CHAR_SUPPORT 43/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */ 44# include <wchar.h> 45# include <wctype.h> 46# endif 47 48# ifdef _LIBC 49/* We have to keep the namespace clean. 
*/ 50# define regfree(preg) __regfree (preg) 51# define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef) 52# define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags) 53# define regerror(errcode, preg, errbuf, errbuf_size) \ 54 __regerror(errcode, preg, errbuf, errbuf_size) 55# define re_set_registers(bu, re, nu, st, en) \ 56 __re_set_registers (bu, re, nu, st, en) 57# define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \ 58 __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop) 59# define re_match(bufp, string, size, pos, regs) \ 60 __re_match (bufp, string, size, pos, regs) 61# define re_search(bufp, string, size, startpos, range, regs) \ 62 __re_search (bufp, string, size, startpos, range, regs) 63# define re_compile_pattern(pattern, length, bufp) \ 64 __re_compile_pattern (pattern, length, bufp) 65# define re_set_syntax(syntax) __re_set_syntax (syntax) 66# define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \ 67 __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop) 68# define re_compile_fastmap(bufp) __re_compile_fastmap (bufp) 69 70# define btowc __btowc 71# define iswctype __iswctype 72# define mbrtowc __mbrtowc 73# define wcslen __wcslen 74# define wcscoll __wcscoll 75# define wcrtomb __wcrtomb 76# define mempcpy __mempcpy 77 78/* We are also using some library internals. */ 79# include <locale/localeinfo.h> 80# include <locale/elem-hash.h> 81# include <langinfo.h> 82# include <locale/coll-lookup.h> 83# endif 84 85# ifdef _LIBC 86# include <libintl.h> 87# undef gettext 88# define gettext(msgid) __dcgettext ("libc", msgid, LC_MESSAGES) 89 /* This define is so xgettext can find the internationalizable strings. */ 90# define gettext_noop(msgid) msgid 91# else 92/* This is for other GNU distributions with internationalized messages. */ 93# include "gettext.h" 94# endif 95 96/* Support for bounded pointers. 
*/ 97# if !defined _LIBC && !defined __BOUNDED_POINTERS__ 98# define __bounded /* nothing */ 99# define __unbounded /* nothing */ 100# define __ptrvalue /* nothing */ 101# endif 102 103/* The `emacs' switch turns on certain matching commands 104 that make sense only in Emacs. */ 105# ifdef emacs 106 107# include "lisp.h" 108# include "buffer.h" 109# include "syntax.h" 110 111# else /* not emacs */ 112 113/* If we are not linking with Emacs proper, 114 we can't use the relocating allocator 115 even if config.h says that we can. */ 116# undef REL_ALLOC 117 118# include <stdlib.h> 119 120/* When used in Emacs's lib-src, we need to get bzero and bcopy somehow. 121 If nothing else has been done, use the method below. */ 122# ifdef INHIBIT_STRING_HEADER 123# if !(defined HAVE_BZERO && defined HAVE_BCOPY) 124# if !defined bzero && !defined bcopy 125# undef INHIBIT_STRING_HEADER 126# endif 127# endif 128# endif 129 130/* This is the normal way of making sure we have a bcopy and a bzero. 131 This is used in most programs--a few other programs avoid this 132 by defining INHIBIT_STRING_HEADER. */ 133# ifndef INHIBIT_STRING_HEADER 134# include <string.h> 135# ifndef bzero 136# ifndef _LIBC 137# define bzero(s, n) (memset (s, '\0', n), (s)) 138# else 139# define bzero(s, n) __bzero (s, n) 140# endif 141# endif 142# endif 143 144/* Define the syntax stuff for \<, \>, etc. */ 145 146/* This must be nonzero for the wordchar and notwordchar pattern 147 commands in re_match_2. */ 148# ifndef Sword 149# define Sword 1 150# endif 151 152# ifdef SWITCH_ENUM_BUG 153# define SWITCH_ENUM_CAST(x) ((int)(x)) 154# else 155# define SWITCH_ENUM_CAST(x) (x) 156# endif 157 158# endif /* not emacs */ 159 160# include <limits.h> 161 162# ifndef MB_LEN_MAX 163# define MB_LEN_MAX 1 164# endif 165 166/* Get the interface, including the syntax bits. */ 167# include <regex.h> 168 169/* isalpha etc. are used for the character classes. */ 170# include <ctype.h> 171 172/* Jim Meyering writes: 173 174 "... 
Some ctype macros are valid only for character codes that 175 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when 176 using /bin/cc or gcc but without giving an ansi option). So, all 177 ctype uses should be through macros like ISPRINT... If 178 STDC_HEADERS is defined, then autoconf has verified that the ctype 179 macros don't need to be guarded with references to isascii. ... 180 Defining isascii to 1 should let any compiler worth its salt 181 eliminate the && through constant folding." 182 Solaris defines some of these symbols so we must undefine them first. */ 183 184# if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII) 185# define IN_CTYPE_DOMAIN(c) 1 186# else 187# define IN_CTYPE_DOMAIN(c) isascii(c) 188# endif 189 190# ifdef isblank 191# define ISBLANK(c) (IN_CTYPE_DOMAIN (c) && isblank (c)) 192# else 193# define ISBLANK(c) ((c) == ' ' || (c) == '\t') 194# endif 195# ifdef isgraph 196# define ISGRAPH(c) (IN_CTYPE_DOMAIN (c) && isgraph (c)) 197# else 198# define ISGRAPH(c) (IN_CTYPE_DOMAIN (c) && isprint (c) && !isspace (c)) 199# endif 200 201# undef ISPRINT 202# define ISPRINT(c) (IN_CTYPE_DOMAIN (c) && isprint (c)) 203# define ISDIGIT(c) (IN_CTYPE_DOMAIN (c) && isdigit (c)) 204# define ISALNUM(c) (IN_CTYPE_DOMAIN (c) && isalnum (c)) 205# define ISALPHA(c) (IN_CTYPE_DOMAIN (c) && isalpha (c)) 206# define ISCNTRL(c) (IN_CTYPE_DOMAIN (c) && iscntrl (c)) 207# define ISLOWER(c) (IN_CTYPE_DOMAIN (c) && islower (c)) 208# define ISPUNCT(c) (IN_CTYPE_DOMAIN (c) && ispunct (c)) 209# define ISSPACE(c) (IN_CTYPE_DOMAIN (c) && isspace (c)) 210# define ISUPPER(c) (IN_CTYPE_DOMAIN (c) && isupper (c)) 211# define ISXDIGIT(c) (IN_CTYPE_DOMAIN (c) && isxdigit (c)) 212 213# ifdef _tolower 214# define TOLOWER(c) _tolower(c) 215# else 216# define TOLOWER(c) tolower(c) 217# endif 218 219# ifndef emacs 220/* How many characters in the character set. 
*/ 221# define CHAR_SET_SIZE 256 222 223# ifdef SYNTAX_TABLE 224 225extern char *re_syntax_table; 226 227# else /* not SYNTAX_TABLE */ 228 229static char re_syntax_table[CHAR_SET_SIZE]; 230 231static void 232init_syntax_once (void) 233{ 234 register int c; 235 static int done = 0; 236 237 if (done) 238 return; 239 bzero (re_syntax_table, sizeof re_syntax_table); 240 241 for (c = 0; c < CHAR_SET_SIZE; ++c) 242 if (ISALNUM (c)) 243 re_syntax_table[c] = Sword; 244 245 re_syntax_table['_'] = Sword; 246 247 done = 1; 248} 249 250# endif /* not SYNTAX_TABLE */ 251 252# define SYNTAX(c) re_syntax_table[(unsigned char) (c)] 253 254# endif /* emacs */ 255 256/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we 257 use `alloca' instead of `malloc'. This is because using malloc in 258 re_search* or re_match* could cause memory leaks when C-g is used in 259 Emacs; also, malloc is slower and causes storage fragmentation. On 260 the other hand, malloc is more portable, and easier to debug. 261 262 Because we sometimes use alloca, some routines have to be macros, 263 not functions -- `alloca'-allocated space disappears at the end of the 264 function it is called in. */ 265 266# ifdef REGEX_MALLOC 267 268# define REGEX_ALLOCATE malloc 269# define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize) 270# define REGEX_FREE free 271 272# else /* not REGEX_MALLOC */ 273 274/* Emacs already defines alloca, sometimes. */ 275# ifndef alloca 276 277/* Make alloca work the best possible way. */ 278# include <alloca.h> 279 280# endif /* not alloca */ 281 282# define REGEX_ALLOCATE alloca 283 284/* Assumes a `char *destination' variable. */ 285# define REGEX_REALLOCATE(source, osize, nsize) \ 286 (destination = (char *) alloca (nsize), \ 287 memcpy (destination, source, osize)) 288 289/* No need to do anything to free, after alloca. */ 290# define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. 
*/ 291 292# endif /* not REGEX_MALLOC */ 293 294/* Define how to allocate the failure stack. */ 295 296# if defined REL_ALLOC && defined REGEX_MALLOC 297 298# define REGEX_ALLOCATE_STACK(size) \ 299 r_alloc (&failure_stack_ptr, (size)) 300# define REGEX_REALLOCATE_STACK(source, osize, nsize) \ 301 r_re_alloc (&failure_stack_ptr, (nsize)) 302# define REGEX_FREE_STACK(ptr) \ 303 r_alloc_free (&failure_stack_ptr) 304 305# else /* not using relocating allocator */ 306 307# ifdef REGEX_MALLOC 308 309# define REGEX_ALLOCATE_STACK malloc 310# define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize) 311# define REGEX_FREE_STACK free 312 313# else /* not REGEX_MALLOC */ 314 315# define REGEX_ALLOCATE_STACK alloca 316 317# define REGEX_REALLOCATE_STACK(source, osize, nsize) \ 318 REGEX_REALLOCATE (source, osize, nsize) 319/* No need to explicitly free anything. */ 320# define REGEX_FREE_STACK(arg) 321 322# endif /* not REGEX_MALLOC */ 323# endif /* not using relocating allocator */ 324 325 326/* True if `size1' is non-NULL and PTR is pointing anywhere inside 327 `string1' or just past its end. This works if PTR is NULL, which is 328 a good thing. */ 329# define FIRST_STRING_P(ptr) \ 330 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1) 331 332/* (Re)Allocate N items of type T using malloc, or fail. */ 333# define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t))) 334# define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t))) 335# define RETALLOC_IF(addr, n, t) \ 336 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t) 337# define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t))) 338 339# define BYTEWIDTH 8 /* In bits. */ 340 341# define STREQ(s1, s2) ((strcmp (s1, s2) == 0)) 342 343# undef MAX 344# undef MIN 345# define MAX(a, b) ((a) > (b) ? (a) : (b)) 346# define MIN(a, b) ((a) < (b) ? 
(a) : (b)) 347 348typedef char boolean; 349# define false 0 350# define true 1 351 352static reg_errcode_t byte_regex_compile (const char *pattern, size_t size, 353 reg_syntax_t syntax, 354 struct re_pattern_buffer *bufp); 355 356static int byte_re_match_2_internal (struct re_pattern_buffer *bufp, 357 const char *string1, int size1, 358 const char *string2, int size2, 359 int pos, 360 struct re_registers *regs, 361 int stop); 362static int byte_re_search_2 (struct re_pattern_buffer *bufp, 363 const char *string1, int size1, 364 const char *string2, int size2, 365 int startpos, int range, 366 struct re_registers *regs, int stop); 367static int byte_re_compile_fastmap (struct re_pattern_buffer *bufp); 368 369#ifdef MBS_SUPPORT 370static reg_errcode_t wcs_regex_compile (const char *pattern, size_t size, 371 reg_syntax_t syntax, 372 struct re_pattern_buffer *bufp); 373 374 375static int wcs_re_match_2_internal (struct re_pattern_buffer *bufp, 376 const char *cstring1, int csize1, 377 const char *cstring2, int csize2, 378 int pos, 379 struct re_registers *regs, 380 int stop, 381 wchar_t *string1, int size1, 382 wchar_t *string2, int size2, 383 int *mbs_offset1, int *mbs_offset2); 384static int wcs_re_search_2 (struct re_pattern_buffer *bufp, 385 const char *string1, int size1, 386 const char *string2, int size2, 387 int startpos, int range, 388 struct re_registers *regs, int stop); 389static int wcs_re_compile_fastmap (struct re_pattern_buffer *bufp); 390#endif 391 392/* These are the command codes that appear in compiled regular 393 expressions. Some opcodes are followed by argument bytes. A 394 command code can specify any interpretation whatsoever for its 395 arguments. Zero bytes may appear in the compiled regular expression. */ 396 397typedef enum 398{ 399 no_op = 0, 400 401 /* Succeed right away--no more backtracking. */ 402 succeed, 403 404 /* Followed by one byte giving n, then by n literal bytes. 
*/ 405 exactn, 406 407# ifdef MBS_SUPPORT 408 /* Same as exactn, but contains binary data. */ 409 exactn_bin, 410# endif 411 412 /* Matches any (more or less) character. */ 413 anychar, 414 415 /* Matches any one char belonging to specified set. First 416 following byte is number of bitmap bytes. Then come bytes 417 for a bitmap saying which chars are in. Bits in each byte 418 are ordered low-bit-first. A character is in the set if its 419 bit is 1. A character too large to have a bit in the map is 420 automatically not in the set. */ 421 /* ifdef MBS_SUPPORT, following element is length of character 422 classes, length of collating symbols, length of equivalence 423 classes, length of character ranges, and length of characters. 424 Next, character class element, collating symbols elements, 425 equivalence class elements, range elements, and character 426 elements follow. 427 See regex_compile function. */ 428 charset, 429 430 /* Same parameters as charset, but match any character that is 431 not one of those specified. */ 432 charset_not, 433 434 /* Start remembering the text that is matched, for storing in a 435 register. Followed by one byte with the register number, in 436 the range 0 to one less than the pattern buffer's re_nsub 437 field. Then followed by one byte with the number of groups 438 inner to this one. (This last has to be part of the 439 start_memory only because we need it in the on_failure_jump 440 of re_match_2.) */ 441 start_memory, 442 443 /* Stop remembering the text that is matched and store it in a 444 memory register. Followed by one byte with the register 445 number, in the range 0 to one less than `re_nsub' in the 446 pattern buffer, and one byte with the number of inner groups, 447 just like `start_memory'. (We need the number of inner 448 groups here because we don't have any easy way of finding the 449 corresponding start_memory when we're at a stop_memory.) */ 450 stop_memory, 451 452 /* Match a duplicate of something remembered. 
Followed by one 453 byte containing the register number. */ 454 duplicate, 455 456 /* Fail unless at beginning of line. */ 457 begline, 458 459 /* Fail unless at end of line. */ 460 endline, 461 462 /* Succeeds if at beginning of buffer (if emacs) or at beginning 463 of string to be matched (if not). */ 464 begbuf, 465 466 /* Analogously, for end of buffer/string. */ 467 endbuf, 468 469 /* Followed by two byte relative address to which to jump. */ 470 jump, 471 472 /* Same as jump, but marks the end of an alternative. */ 473 jump_past_alt, 474 475 /* Followed by two-byte relative address of place to resume at 476 in case of failure. */ 477 /* ifdef MBS_SUPPORT, the size of address is 1. */ 478 on_failure_jump, 479 480 /* Like on_failure_jump, but pushes a placeholder instead of the 481 current string position when executed. */ 482 on_failure_keep_string_jump, 483 484 /* Throw away latest failure point and then jump to following 485 two-byte relative address. */ 486 /* ifdef MBS_SUPPORT, the size of address is 1. */ 487 pop_failure_jump, 488 489 /* Change to pop_failure_jump if know won't have to backtrack to 490 match; otherwise change to jump. This is used to jump 491 back to the beginning of a repeat. If what follows this jump 492 clearly won't match what the repeat does, such that we can be 493 sure that there is no use backtracking out of repetitions 494 already matched, then we change it to a pop_failure_jump. 495 Followed by two-byte address. */ 496 /* ifdef MBS_SUPPORT, the size of address is 1. */ 497 maybe_pop_jump, 498 499 /* Jump to following two-byte address, and push a dummy failure 500 point. This failure point will be thrown away if an attempt 501 is made to use it for a failure. A `+' construct makes this 502 before the first repeat. Also used as an intermediary kind 503 of jump when compiling an alternative. */ 504 /* ifdef MBS_SUPPORT, the size of address is 1. */ 505 dummy_failure_jump, 506 507 /* Push a dummy failure point and continue. 
Used at the end of 508 alternatives. */ 509 push_dummy_failure, 510 511 /* Followed by two-byte relative address and two-byte number n. 512 After matching N times, jump to the address upon failure. */ 513 /* ifdef MBS_SUPPORT, the size of address is 1. */ 514 succeed_n, 515 516 /* Followed by two-byte relative address, and two-byte number n. 517 Jump to the address N times, then fail. */ 518 /* ifdef MBS_SUPPORT, the size of address is 1. */ 519 jump_n, 520 521 /* Set the following two-byte relative address to the 522 subsequent two-byte number. The address *includes* the two 523 bytes of number. */ 524 /* ifdef MBS_SUPPORT, the size of address is 1. */ 525 set_number_at, 526 527 wordchar, /* Matches any word-constituent character. */ 528 notwordchar, /* Matches any char that is not a word-constituent. */ 529 530 wordbeg, /* Succeeds if at word beginning. */ 531 wordend, /* Succeeds if at word end. */ 532 533 wordbound, /* Succeeds if at a word boundary. */ 534 notwordbound /* Succeeds if not at a word boundary. */ 535 536# ifdef emacs 537 ,before_dot, /* Succeeds if before point. */ 538 at_dot, /* Succeeds if at point. */ 539 after_dot, /* Succeeds if after point. */ 540 541 /* Matches any character whose syntax is specified. Followed by 542 a byte which contains a syntax code, e.g., Sword. */ 543 syntaxspec, 544 545 /* Matches any character whose syntax is not that specified. 
*/ 546 notsyntaxspec 547# endif /* emacs */ 548} re_opcode_t; 549#endif /* not INSIDE_RECURSION */ 550 551 552#ifdef BYTE 553# define CHAR_T char 554# define UCHAR_T unsigned char 555# define COMPILED_BUFFER_VAR bufp->buffer 556# define OFFSET_ADDRESS_SIZE 2 557# define PREFIX(name) byte_##name 558# define ARG_PREFIX(name) name 559# define PUT_CHAR(c) putchar (c) 560#else 561# ifdef WCHAR 562# define CHAR_T wchar_t 563# define UCHAR_T wchar_t 564# define COMPILED_BUFFER_VAR wc_buffer 565# define OFFSET_ADDRESS_SIZE 1 /* the size which STORE_NUMBER macro use */ 566# define CHAR_CLASS_SIZE ((__alignof__(wctype_t)+sizeof(wctype_t))/sizeof(CHAR_T)+1) 567# define PREFIX(name) wcs_##name 568# define ARG_PREFIX(name) c##name 569/* Should we use wide stream?? */ 570# define PUT_CHAR(c) printf ("%C", c); 571# define TRUE 1 572# define FALSE 0 573# else 574# ifdef MBS_SUPPORT 575# define WCHAR 576# define INSIDE_RECURSION 577# include "regex.c" 578# undef INSIDE_RECURSION 579# endif 580# define BYTE 581# define INSIDE_RECURSION 582# include "regex.c" 583# undef INSIDE_RECURSION 584# endif 585#endif 586 587#if USE_UNLOCKED_IO 588# include "unlocked-io.h" 589#endif 590 591#ifdef INSIDE_RECURSION 592/* Common operations on the compiled pattern. */ 593 594/* Store NUMBER in two contiguous bytes starting at DESTINATION. */ 595/* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */ 596 597# ifdef WCHAR 598# define STORE_NUMBER(destination, number) \ 599 do { \ 600 *(destination) = (UCHAR_T)(number); \ 601 } while (0) 602# else /* BYTE */ 603# define STORE_NUMBER(destination, number) \ 604 do { \ 605 (destination)[0] = (number) & 0377; \ 606 (destination)[1] = (number) >> 8; \ 607 } while (0) 608# endif /* WCHAR */ 609 610/* Same as STORE_NUMBER, except increment DESTINATION to 611 the byte after where the number is stored. Therefore, DESTINATION 612 must be an lvalue. */ 613/* ifdef MBS_SUPPORT, we store NUMBER in 1 element. 
*/ 614 615# define STORE_NUMBER_AND_INCR(destination, number) \ 616 do { \ 617 STORE_NUMBER (destination, number); \ 618 (destination) += OFFSET_ADDRESS_SIZE; \ 619 } while (0) 620 621/* Put into DESTINATION a number stored in two contiguous bytes starting 622 at SOURCE. */ 623/* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */ 624 625# ifdef WCHAR 626# define EXTRACT_NUMBER(destination, source) \ 627 do { \ 628 (destination) = *(source); \ 629 } while (0) 630# else /* BYTE */ 631# define EXTRACT_NUMBER(destination, source) \ 632 do { \ 633 (destination) = *(source) & 0377; \ 634 (destination) += (signed char) (*((source) + 1)) << 8; \ 635 } while (0) 636# endif 637 638# ifdef DEBUG 639static void 640PREFIX(extract_number) (int *dest, UCHAR_T *source) 641{ 642# ifdef WCHAR 643 *dest = *source; 644# else /* BYTE */ 645 signed char temp = source[1]; 646 *dest = *source & 0377; 647 *dest += temp << 8; 648# endif 649} 650 651# ifndef EXTRACT_MACROS /* To debug the macros. */ 652# undef EXTRACT_NUMBER 653# define EXTRACT_NUMBER(dest, src) PREFIX(extract_number) (&dest, src) 654# endif /* not EXTRACT_MACROS */ 655 656# endif /* DEBUG */ 657 658/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number. 659 SOURCE must be an lvalue. 
*/ 660 661# define EXTRACT_NUMBER_AND_INCR(destination, source) \ 662 do { \ 663 EXTRACT_NUMBER (destination, source); \ 664 (source) += OFFSET_ADDRESS_SIZE; \ 665 } while (0) 666 667# ifdef DEBUG 668static void 669PREFIX(extract_number_and_incr) (int *destination, UCHAR_T **source) 670{ 671 PREFIX(extract_number) (destination, *source); 672 *source += OFFSET_ADDRESS_SIZE; 673} 674 675# ifndef EXTRACT_MACROS 676# undef EXTRACT_NUMBER_AND_INCR 677# define EXTRACT_NUMBER_AND_INCR(dest, src) \ 678 PREFIX(extract_number_and_incr) (&dest, &src) 679# endif /* not EXTRACT_MACROS */ 680 681# endif /* DEBUG */ 682 683 684 685/* If DEBUG is defined, Regex prints many voluminous messages about what 686 it is doing (if the variable `debug' is nonzero). If linked with the 687 main program in `iregex.c', you can enter patterns and strings 688 interactively. And if linked with the main program in `main.c' and 689 the other test files, you can run the already-written tests. */ 690 691# ifdef DEBUG 692 693# ifndef DEFINED_ONCE 694 695/* We use standard I/O for debugging. */ 696# include <stdio.h> 697 698/* It is useful to test things that ``must'' be true when debugging. */ 699# include <assert.h> 700 701static int debug; 702 703# define DEBUG_STATEMENT(e) e 704# define DEBUG_PRINT1(x) if (debug) printf (x) 705# define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2) 706# define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3) 707# define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4) 708# endif /* not DEFINED_ONCE */ 709 710# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \ 711 if (debug) PREFIX(print_partial_compiled_pattern) (s, e) 712# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \ 713 if (debug) PREFIX(print_double_string) (w, s1, sz1, s2, sz2) 714 715 716/* Print the fastmap in human-readable form. 
*/ 717 718# ifndef DEFINED_ONCE 719void 720print_fastmap (char *fastmap) 721{ 722 unsigned was_a_range = 0; 723 unsigned i = 0; 724 725 while (i < (1 << BYTEWIDTH)) 726 { 727 if (fastmap[i++]) 728 { 729 was_a_range = 0; 730 putchar (i - 1); 731 while (i < (1 << BYTEWIDTH) && fastmap[i]) 732 { 733 was_a_range = 1; 734 i++; 735 } 736 if (was_a_range) 737 { 738 printf ("-"); 739 putchar (i - 1); 740 } 741 } 742 } 743 putchar ('\n'); 744} 745# endif /* not DEFINED_ONCE */ 746 747 748/* Print a compiled pattern string in human-readable form, starting at 749 the START pointer into it and ending just before the pointer END. */ 750 751void 752PREFIX(print_partial_compiled_pattern) (UCHAR_T *start, UCHAR_T *end) 753{ 754 int mcnt, mcnt2; 755 UCHAR_T *p1; 756 UCHAR_T *p = start; 757 UCHAR_T *pend = end; 758 759 if (start == NULL) 760 { 761 printf ("(null)\n"); 762 return; 763 } 764 765 /* Loop over pattern commands. */ 766 while (p < pend) 767 { 768# ifdef _LIBC 769 printf ("%td:\t", p - start); 770# else 771 printf ("%ld:\t", (long int) (p - start)); 772# endif 773 774 switch ((re_opcode_t) *p++) 775 { 776 case no_op: 777 printf ("/no_op"); 778 break; 779 780 case exactn: 781 mcnt = *p++; 782 printf ("/exactn/%d", mcnt); 783 do 784 { 785 putchar ('/'); 786 PUT_CHAR (*p++); 787 } 788 while (--mcnt); 789 break; 790 791# ifdef MBS_SUPPORT 792 case exactn_bin: 793 mcnt = *p++; 794 printf ("/exactn_bin/%d", mcnt); 795 do 796 { 797 printf("/%lx", (long int) *p++); 798 } 799 while (--mcnt); 800 break; 801# endif /* MBS_SUPPORT */ 802 803 case start_memory: 804 mcnt = *p++; 805 printf ("/start_memory/%d/%ld", mcnt, (long int) *p++); 806 break; 807 808 case stop_memory: 809 mcnt = *p++; 810 printf ("/stop_memory/%d/%ld", mcnt, (long int) *p++); 811 break; 812 813 case duplicate: 814 printf ("/duplicate/%ld", (long int) *p++); 815 break; 816 817 case anychar: 818 printf ("/anychar"); 819 break; 820 821 case charset: 822 case charset_not: 823 { 824# ifdef WCHAR 825 int i, length; 826 
wchar_t *workp = p; 827 printf ("/charset [%s", 828 (re_opcode_t) *(workp - 1) == charset_not ? "^" : ""); 829 p += 5; 830 length = *workp++; /* the length of char_classes */ 831 for (i=0 ; i<length ; i++) 832 printf("[:%lx:]", (long int) *p++); 833 length = *workp++; /* the length of collating_symbol */ 834 for (i=0 ; i<length ;) 835 { 836 printf("[."); 837 while(*p != 0) 838 PUT_CHAR((i++,*p++)); 839 i++,p++; 840 printf(".]"); 841 } 842 length = *workp++; /* the length of equivalence_class */ 843 for (i=0 ; i<length ;) 844 { 845 printf("[="); 846 while(*p != 0) 847 PUT_CHAR((i++,*p++)); 848 i++,p++; 849 printf("=]"); 850 } 851 length = *workp++; /* the length of char_range */ 852 for (i=0 ; i<length ; i++) 853 { 854 wchar_t range_start = *p++; 855 wchar_t range_end = *p++; 856 printf("%C-%C", range_start, range_end); 857 } 858 length = *workp++; /* the length of char */ 859 for (i=0 ; i<length ; i++) 860 printf("%C", *p++); 861 putchar (']'); 862# else 863 register int c, last = -100; 864 register int in_range = 0; 865 866 printf ("/charset [%s", 867 (re_opcode_t) *(p - 1) == charset_not ? "^" : ""); 868 869 assert (p + *p < pend); 870 871 for (c = 0; c < 256; c++) 872 if (c / 8 < *p 873 && (p[1 + (c/8)] & (1 << (c % 8)))) 874 { 875 /* Are we starting a range? */ 876 if (last + 1 == c && ! in_range) 877 { 878 putchar ('-'); 879 in_range = 1; 880 } 881 /* Have we broken a range? */ 882 else if (last + 1 != c && in_range) 883 { 884 putchar (last); 885 in_range = 0; 886 } 887 888 if (! 
in_range) 889 putchar (c); 890 891 last = c; 892 } 893 894 if (in_range) 895 putchar (last); 896 897 putchar (']'); 898 899 p += 1 + *p; 900# endif /* WCHAR */ 901 } 902 break; 903 904 case begline: 905 printf ("/begline"); 906 break; 907 908 case endline: 909 printf ("/endline"); 910 break; 911 912 case on_failure_jump: 913 PREFIX(extract_number_and_incr) (&mcnt, &p); 914# ifdef _LIBC 915 printf ("/on_failure_jump to %td", p + mcnt - start); 916# else 917 printf ("/on_failure_jump to %ld", (long int) (p + mcnt - start)); 918# endif 919 break; 920 921 case on_failure_keep_string_jump: 922 PREFIX(extract_number_and_incr) (&mcnt, &p); 923# ifdef _LIBC 924 printf ("/on_failure_keep_string_jump to %td", p + mcnt - start); 925# else 926 printf ("/on_failure_keep_string_jump to %ld", 927 (long int) (p + mcnt - start)); 928# endif 929 break; 930 931 case dummy_failure_jump: 932 PREFIX(extract_number_and_incr) (&mcnt, &p); 933# ifdef _LIBC 934 printf ("/dummy_failure_jump to %td", p + mcnt - start); 935# else 936 printf ("/dummy_failure_jump to %ld", (long int) (p + mcnt - start)); 937# endif 938 break; 939 940 case push_dummy_failure: 941 printf ("/push_dummy_failure"); 942 break; 943 944 case maybe_pop_jump: 945 PREFIX(extract_number_and_incr) (&mcnt, &p); 946# ifdef _LIBC 947 printf ("/maybe_pop_jump to %td", p + mcnt - start); 948# else 949 printf ("/maybe_pop_jump to %ld", (long int) (p + mcnt - start)); 950# endif 951 break; 952 953 case pop_failure_jump: 954 PREFIX(extract_number_and_incr) (&mcnt, &p); 955# ifdef _LIBC 956 printf ("/pop_failure_jump to %td", p + mcnt - start); 957# else 958 printf ("/pop_failure_jump to %ld", (long int) (p + mcnt - start)); 959# endif 960 break; 961 962 case jump_past_alt: 963 PREFIX(extract_number_and_incr) (&mcnt, &p); 964# ifdef _LIBC 965 printf ("/jump_past_alt to %td", p + mcnt - start); 966# else 967 printf ("/jump_past_alt to %ld", (long int) (p + mcnt - start)); 968# endif 969 break; 970 971 case jump: 972 
PREFIX(extract_number_and_incr) (&mcnt, &p); 973# ifdef _LIBC 974 printf ("/jump to %td", p + mcnt - start); 975# else 976 printf ("/jump to %ld", (long int) (p + mcnt - start)); 977# endif 978 break; 979 980 case succeed_n: 981 PREFIX(extract_number_and_incr) (&mcnt, &p); 982 p1 = p + mcnt; 983 PREFIX(extract_number_and_incr) (&mcnt2, &p); 984# ifdef _LIBC 985 printf ("/succeed_n to %td, %d times", p1 - start, mcnt2); 986# else 987 printf ("/succeed_n to %ld, %d times", 988 (long int) (p1 - start), mcnt2); 989# endif 990 break; 991 992 case jump_n: 993 PREFIX(extract_number_and_incr) (&mcnt, &p); 994 p1 = p + mcnt; 995 PREFIX(extract_number_and_incr) (&mcnt2, &p); 996 printf ("/jump_n to %d, %d times", p1 - start, mcnt2); 997 break; 998 999 case set_number_at: 1000 PREFIX(extract_number_and_incr) (&mcnt, &p); 1001 p1 = p + mcnt; 1002 PREFIX(extract_number_and_incr) (&mcnt2, &p); 1003# ifdef _LIBC 1004 printf ("/set_number_at location %td to %d", p1 - start, mcnt2); 1005# else 1006 printf ("/set_number_at location %ld to %d", 1007 (long int) (p1 - start), mcnt2); 1008# endif 1009 break; 1010 1011 case wordbound: 1012 printf ("/wordbound"); 1013 break; 1014 1015 case notwordbound: 1016 printf ("/notwordbound"); 1017 break; 1018 1019 case wordbeg: 1020 printf ("/wordbeg"); 1021 break; 1022 1023 case wordend: 1024 printf ("/wordend"); 1025 break; 1026 1027# ifdef emacs 1028 case before_dot: 1029 printf ("/before_dot"); 1030 break; 1031 1032 case at_dot: 1033 printf ("/at_dot"); 1034 break; 1035 1036 case after_dot: 1037 printf ("/after_dot"); 1038 break; 1039 1040 case syntaxspec: 1041 printf ("/syntaxspec"); 1042 mcnt = *p++; 1043 printf ("/%d", mcnt); 1044 break; 1045 1046 case notsyntaxspec: 1047 printf ("/notsyntaxspec"); 1048 mcnt = *p++; 1049 printf ("/%d", mcnt); 1050 break; 1051# endif /* emacs */ 1052 1053 case wordchar: 1054 printf ("/wordchar"); 1055 break; 1056 1057 case notwordchar: 1058 printf ("/notwordchar"); 1059 break; 1060 1061 case begbuf: 1062 
printf ("/begbuf"); 1063 break; 1064 1065 case endbuf: 1066 printf ("/endbuf"); 1067 break; 1068 1069 default: 1070 printf ("?%ld", (long int) *(p-1)); 1071 } 1072 1073 putchar ('\n'); 1074 } 1075 1076# ifdef _LIBC 1077 printf ("%td:\tend of pattern.\n", p - start); 1078# else 1079 printf ("%ld:\tend of pattern.\n", (long int) (p - start)); 1080# endif 1081} 1082 1083 1084void 1085PREFIX(print_compiled_pattern) (struct re_pattern_buffer *bufp) 1086{ 1087 UCHAR_T *buffer = (UCHAR_T*) bufp->buffer; 1088 1089 PREFIX(print_partial_compiled_pattern) (buffer, buffer 1090 + bufp->used / sizeof(UCHAR_T)); 1091 printf ("%ld bytes used/%ld bytes allocated.\n", 1092 bufp->used, bufp->allocated); 1093 1094 if (bufp->fastmap_accurate && bufp->fastmap) 1095 { 1096 printf ("fastmap: "); 1097 print_fastmap (bufp->fastmap); 1098 } 1099 1100# ifdef _LIBC 1101 printf ("re_nsub: %Zd\t", bufp->re_nsub); 1102# else 1103 printf ("re_nsub: %ld\t", (long int) bufp->re_nsub); 1104# endif 1105 printf ("regs_alloc: %d\t", bufp->regs_allocated); 1106 printf ("can_be_null: %d\t", bufp->can_be_null); 1107 printf ("newline_anchor: %d\n", bufp->newline_anchor); 1108 printf ("no_sub: %d\t", bufp->no_sub); 1109 printf ("not_bol: %d\t", bufp->not_bol); 1110 printf ("not_eol: %d\t", bufp->not_eol); 1111 printf ("syntax: %lx\n", bufp->syntax); 1112 /* Perhaps we should print the translate table? 
*/ 1113} 1114 1115 1116void 1117PREFIX(print_double_string) (const CHAR_T *where, 1118 const CHAR_T *string1, 1119 const CHAR_T *string2, 1120 int size1, 1121 int size2) 1122{ 1123 int this_char; 1124 1125 if (where == NULL) 1126 printf ("(null)"); 1127 else 1128 { 1129 int cnt; 1130 1131 if (FIRST_STRING_P (where)) 1132 { 1133 for (this_char = where - string1; this_char < size1; this_char++) 1134 PUT_CHAR (string1[this_char]); 1135 1136 where = string2; 1137 } 1138 1139 cnt = 0; 1140 for (this_char = where - string2; this_char < size2; this_char++) 1141 { 1142 PUT_CHAR (string2[this_char]); 1143 if (++cnt > 100) 1144 { 1145 fputs ("...", stdout); 1146 break; 1147 } 1148 } 1149 } 1150} 1151 1152# ifndef DEFINED_ONCE 1153void 1154printchar (c) 1155 int c; 1156{ 1157 putc (c, stderr); 1158} 1159# endif 1160 1161# else /* not DEBUG */ 1162 1163# ifndef DEFINED_ONCE 1164# undef assert 1165# define assert(e) 1166 1167# define DEBUG_STATEMENT(e) 1168# define DEBUG_PRINT1(x) 1169# define DEBUG_PRINT2(x1, x2) 1170# define DEBUG_PRINT3(x1, x2, x3) 1171# define DEBUG_PRINT4(x1, x2, x3, x4) 1172# endif /* not DEFINED_ONCE */ 1173# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) 1174# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) 1175 1176# endif /* not DEBUG */ 1177 1178 1179 1180# ifdef WCHAR 1181/* This convert a multibyte string to a wide character string. 1182 And write their correspondances to offset_buffer(see below) 1183 and write whether each wchar_t is binary data to is_binary. 1184 This assume invalid multibyte sequences as binary data. 1185 We assume offset_buffer and is_binary is already allocated 1186 enough space. */ 1187 1188static size_t 1189convert_mbs_to_wcs (CHAR_T *dest, 1190 const unsigned char* src, 1191 1192 /* The length of multibyte string. */ 1193 size_t len, 1194 1195 /* Correspondences between src(char string) and 1196 dest(wchar_t string) for optimization. 
E.g.: 1197 src = "xxxyzz" 1198 dest = {'X', 'Y', 'Z'} 1199 (each "xxx", "y" and "zz" represent one 1200 multibyte character corresponding to 'X', 1201 'Y' and 'Z'.) 1202 offset_buffer = {0, 0+3("xxx"), 0+3+1("y"), 1203 0+3+1+2("zz")} 1204 = {0, 3, 4, 6} */ 1205 int *offset_buffer, 1206 1207 char *is_binary) 1208{ 1209 wchar_t *pdest = dest; 1210 const unsigned char *psrc = src; 1211 size_t wc_count = 0; 1212 1213 mbstate_t mbs; 1214 int i, consumed; 1215 size_t mb_remain = len; 1216 size_t mb_count = 0; 1217 1218 /* Initialize the conversion state. */ 1219 memset (&mbs, 0, sizeof (mbstate_t)); 1220 1221 offset_buffer[0] = 0; 1222 for( ; mb_remain > 0 ; ++wc_count, ++pdest, mb_remain -= consumed, 1223 psrc += consumed) 1224 { 1225 consumed = mbrtowc (pdest, psrc, mb_remain, &mbs); 1226 1227 if (consumed <= 0) 1228 /* failed to convert. maybe src contains binary data. 1229 So we consume 1 byte manualy. */ 1230 { 1231 *pdest = *psrc; 1232 consumed = 1; 1233 is_binary[wc_count] = TRUE; 1234 } 1235 else 1236 is_binary[wc_count] = FALSE; 1237 /* In sjis encoding, we use yen sign as escape character in 1238 place of reverse solidus. So we convert 0x5c(yen sign in 1239 sjis) to not 0xa5(yen sign in UCS2) but 0x5c(reverse 1240 solidus in UCS2). */ 1241 if (consumed == 1 && (int) *psrc == 0x5c && (int) *pdest == 0xa5) 1242 *pdest = (wchar_t) *psrc; 1243 1244 offset_buffer[wc_count + 1] = mb_count += consumed; 1245 } 1246 1247 /* Fill remain of the buffer with sentinel. */ 1248 for (i = wc_count + 1 ; i <= len ; i++) 1249 offset_buffer[i] = mb_count + 1; 1250 1251 return wc_count; 1252} 1253 1254# endif /* WCHAR */ 1255 1256#else /* not INSIDE_RECURSION */ 1257 1258/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can 1259 also be assigned to arbitrarily: each pattern buffer stores its own 1260 syntax, so it can be changed between regex compilations. 
*/ 1261/* This has no initializer because initialized variables in Emacs 1262 become read-only after dumping. */ 1263reg_syntax_t re_syntax_options; 1264 1265 1266/* Specify the precise syntax of regexps for compilation. This provides 1267 for compatibility for various utilities which historically have 1268 different, incompatible syntaxes. 1269 1270 The argument SYNTAX is a bit mask comprised of the various bits 1271 defined in regex.h. We return the old syntax. */ 1272 1273reg_syntax_t 1274re_set_syntax (reg_syntax_t syntax) 1275{ 1276 reg_syntax_t ret = re_syntax_options; 1277 1278 re_syntax_options = syntax; 1279# ifdef DEBUG 1280 if (syntax & RE_DEBUG) 1281 debug = 1; 1282 else if (debug) /* was on but now is not */ 1283 debug = 0; 1284# endif /* DEBUG */ 1285 return ret; 1286} 1287# ifdef _LIBC 1288weak_alias (__re_set_syntax, re_set_syntax) 1289# endif 1290 1291/* This table gives an error message for each of the error codes listed 1292 in regex.h. Obviously the order here has to be same as there. 1293 POSIX doesn't require that we do anything for REG_NOERROR, 1294 but why not be nice? 
*/

/* All messages live in one character array, separated by explicit NUL
   characters.  Each REG_xxx_IDX macro is the offset of its message in
   that array: the previous offset plus `sizeof' of the previous
   message, which counts the separating NUL as well.  */
static const char re_error_msgid[] =
  {
# define REG_NOERROR_IDX 0
    gettext_noop ("Success") /* REG_NOERROR */
    "\0"
# define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
    gettext_noop ("No match") /* REG_NOMATCH */
    "\0"
# define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
    gettext_noop ("Invalid regular expression") /* REG_BADPAT */
    "\0"
# define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
    gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
    "\0"
# define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
    gettext_noop ("Invalid character class name") /* REG_ECTYPE */
    "\0"
# define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
    gettext_noop ("Trailing backslash") /* REG_EESCAPE */
    "\0"
# define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
    gettext_noop ("Invalid back reference") /* REG_ESUBREG */
    "\0"
# define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
    gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
    "\0"
# define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
    gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
    "\0"
# define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
    gettext_noop ("Unmatched \\{") /* REG_EBRACE */
    "\0"
# define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
    gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
    "\0"
# define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
    gettext_noop ("Invalid range end") /* REG_ERANGE */
    "\0"
# define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
    gettext_noop ("Memory exhausted") /* REG_ESPACE */
    "\0"
# define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
    gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
    "\0"
# define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
    gettext_noop ("Premature end of regular expression") /* REG_EEND */
    "\0"
# define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
    gettext_noop ("Regular expression too big") /* REG_ESIZE */
    "\0"
# define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
    gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
  };

/* Offsets into re_error_msgid, listed in the same order as the
   messages above.  */
static const size_t re_error_msgid_idx[] =
  {
    REG_NOERROR_IDX,
    REG_NOMATCH_IDX,
    REG_BADPAT_IDX,
    REG_ECOLLATE_IDX,
    REG_ECTYPE_IDX,
    REG_EESCAPE_IDX,
    REG_ESUBREG_IDX,
    REG_EBRACK_IDX,
    REG_EPAREN_IDX,
    REG_EBRACE_IDX,
    REG_BADBR_IDX,
    REG_ERANGE_IDX,
    REG_ESPACE_IDX,
    REG_BADRPT_IDX,
    REG_EEND_IDX,
    REG_ESIZE_IDX,
    REG_ERPAREN_IDX
  };

#endif /* INSIDE_RECURSION */

#ifndef DEFINED_ONCE
/* Avoiding alloca during matching, to placate r_alloc.  */

/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
   searching and matching functions should not call alloca.  On some
   systems, alloca is implemented in terms of malloc, and if we're
   using the relocating allocator routines, then malloc could cause a
   relocation, which might (if the strings being searched are in the
   ralloc heap) shift the data out from underneath the regexp
   routines.

   Here's another reason to avoid allocation: Emacs
   processes input from X in a signal handler; processing X input may
   call malloc; if input arrives while a matching routine is calling
   malloc, then we're scrod.  But Emacs can't just block input while
   calling matching routines; then we don't notice interrupts when
   they come in.  
So, Emacs blocks input around all regexp calls
   except the matching calls, which it leaves unprotected, in the
   faith that they will not malloc.  */

/* Normally, this is fine.  */
# define MATCH_MAY_ALLOCATE

/* When using GNU C, we are not REALLY using the C alloca, no matter
   what config.h may say.  So don't take precautions for it.  */
# ifdef __GNUC__
# undef C_ALLOCA
# endif

/* The match routines may not allocate if (1) they would do it with malloc
   and (2) it's not safe for them to use malloc.
   Note that if REL_ALLOC is defined, matching would not use malloc for the
   failure stack, but we would still use it for the register vectors;
   so REL_ALLOC should not affect this.  */
# if (defined C_ALLOCA || defined REGEX_MALLOC) && defined emacs
# undef MATCH_MAY_ALLOCATE
# endif
#endif /* not DEFINED_ONCE */

#ifdef INSIDE_RECURSION
/* Failure stack declarations and macros; both re_compile_fastmap and
   re_match_2 use a failure stack.  These have to be macros because of
   REGEX_ALLOCATE_STACK.  */


/* Number of failure points for which to initially allocate space
   when matching.  If this number is exceeded, we allocate more
   space, so it is not a hard limit.  */
# ifndef INIT_FAILURE_ALLOC
# define INIT_FAILURE_ALLOC 5
# endif

/* Roughly the maximum number of failure points on the stack.  Would be
   exactly that if always used MAX_FAILURE_ITEMS items each time we failed.
   This is a variable only so users of regex can assign to it; we never
   change it ourselves.  */

/* The two branches below differ only in integer width: with
   INT_IS_16BIT the limit, the integer stack element and the stack
   size/avail fields use `long', otherwise plain `int'/`unsigned'.  */
# ifdef INT_IS_16BIT

# ifndef DEFINED_ONCE
# if defined MATCH_MAY_ALLOCATE
/* 4400 was enough to cause a crash on Alpha OSF/1,
   whose default stack limit is 2mb.  */
long int re_max_failures = 4000;
# else
long int re_max_failures = 2000;
# endif
# endif

/* One slot of the failure stack: either a pointer or an integer.  */
union PREFIX(fail_stack_elt)
{
  UCHAR_T *pointer;
  long int integer;
};

typedef union PREFIX(fail_stack_elt) PREFIX(fail_stack_elt_t);

/* The failure stack itself: SIZE slots allocated, AVAIL of them used.  */
typedef struct
{
  PREFIX(fail_stack_elt_t) *stack;
  unsigned long int size;
  unsigned long int avail;		/* Offset of next open position.  */
} PREFIX(fail_stack_type);

# else /* not INT_IS_16BIT */

# ifndef DEFINED_ONCE
# if defined MATCH_MAY_ALLOCATE
/* 4400 was enough to cause a crash on Alpha OSF/1,
   whose default stack limit is 2mb.  */
int re_max_failures = 4000;
# else
int re_max_failures = 2000;
# endif
# endif

/* One slot of the failure stack: either a pointer or an integer.  */
union PREFIX(fail_stack_elt)
{
  UCHAR_T *pointer;
  int integer;
};

typedef union PREFIX(fail_stack_elt) PREFIX(fail_stack_elt_t);

/* The failure stack itself: SIZE slots allocated, AVAIL of them used.  */
typedef struct
{
  PREFIX(fail_stack_elt_t) *stack;
  unsigned size;
  unsigned avail;			/* Offset of next open position.  */
} PREFIX(fail_stack_type);

# endif /* INT_IS_16BIT */

/* These predicates assume a variable named `fail_stack' (or a pointer
   named `fail_stack_ptr') is in scope at the point of use.  */
# ifndef DEFINED_ONCE
# define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
# define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
# define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
# endif


/* Define macros to initialize and free the failure stack.
   Do `return -2' if the alloc fails.  
*/

/* When matching may allocate, INIT_FAIL_STACK obtains room for
   INIT_FAILURE_ALLOC elements and RESET_FAIL_STACK releases it.
   Otherwise INIT_FAIL_STACK only resets `avail' and RESET_FAIL_STACK
   does nothing.  Both assume a variable named `fail_stack'.  */
# ifdef MATCH_MAY_ALLOCATE
# define INIT_FAIL_STACK() \
  do { \
    fail_stack.stack = (PREFIX(fail_stack_elt_t) *) \
      REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (PREFIX(fail_stack_elt_t))); \
 \
    if (fail_stack.stack == NULL) \
      return -2; \
 \
    fail_stack.size = INIT_FAILURE_ALLOC; \
    fail_stack.avail = 0; \
  } while (0)

# define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
# else
# define INIT_FAIL_STACK() \
  do { \
    fail_stack.avail = 0; \
  } while (0)

# define RESET_FAIL_STACK()
# endif


/* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.

   Return 1 if succeeds, and 0 if either ran out of memory
   allocating space for it or it was already too large.

   REGEX_REALLOCATE_STACK requires `destination' be declared.

   NOTE(review): the result of REGEX_REALLOCATE_STACK is stored straight
   into (fail_stack).stack, so the previous pointer is overwritten when
   the reallocation yields NULL -- whether that loses memory depends on
   how REGEX_REALLOCATE_STACK is defined; confirm.  */

# define DOUBLE_FAIL_STACK(fail_stack) \
  ((fail_stack).size > (unsigned) (re_max_failures * MAX_FAILURE_ITEMS) \
   ? 0 \
   : ((fail_stack).stack = (PREFIX(fail_stack_elt_t) *) \
        REGEX_REALLOCATE_STACK ((fail_stack).stack, \
          (fail_stack).size * sizeof (PREFIX(fail_stack_elt_t)), \
          ((fail_stack).size << 1) * sizeof (PREFIX(fail_stack_elt_t))),\
 \
      (fail_stack).stack == NULL \
      ? 0 \
      : ((fail_stack).size <<= 1, \
         1)))


/* Push pointer POINTER on FAIL_STACK.
   Return 1 if was able to do so and 0 if ran out of memory allocating
   space to do so.  */
# define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
  ((FAIL_STACK_FULL () \
    && !DOUBLE_FAIL_STACK (FAIL_STACK)) \
   ? 0 \
   : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
      1))

/* Push a pointer value onto the failure stack.
   Assumes the variable `fail_stack'.  Probably should only
   be called from within `PUSH_FAILURE_POINT'.  */
# define PUSH_FAILURE_POINTER(item) \
  fail_stack.stack[fail_stack.avail++].pointer = (UCHAR_T *) (item)

/* This pushes an integer-valued item onto the failure stack.
   Assumes the variable `fail_stack'.  Probably should only
   be called from within `PUSH_FAILURE_POINT'.  */
# define PUSH_FAILURE_INT(item) \
  fail_stack.stack[fail_stack.avail++].integer = (item)

/* Push a fail_stack_elt_t value onto the failure stack.
   Assumes the variable `fail_stack'.  Probably should only
   be called from within `PUSH_FAILURE_POINT'.  */
# define PUSH_FAILURE_ELT(item) \
  fail_stack.stack[fail_stack.avail++] = (item)

/* These three POP... operations complement the three PUSH... operations.
   All assume that `fail_stack' is nonempty.  */
# define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
# define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
# define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]

/* Used to omit pushing failure point id's when we're not debugging.  */
# ifdef DEBUG
# define DEBUG_PUSH PUSH_FAILURE_INT
# define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
# else
# define DEBUG_PUSH(item)
# define DEBUG_POP(item_addr)
# endif


/* Push the information about the state we will need
   if we ever fail back to it.

   Requires variables fail_stack, regstart, regend, reg_info, and
   num_regs_pushed be declared.  DOUBLE_FAIL_STACK requires `destination'
   be declared.

   Does `return FAILURE_CODE' if runs out of memory.

   Push order: for every register from lowest_active_reg up to
   highest_active_reg its start, end and info word; then
   lowest_active_reg, highest_active_reg, PATTERN_PLACE, STRING_PLACE
   and, via DEBUG_PUSH, the failure id.  POP_FAILURE_POINT pops in the
   reverse order.  */

# define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
  do { \
    char *destination; \
    /* Must be int, so when we don't save any registers, the arithmetic \
       of 0 + -1 isn't done as unsigned.  */ \
    /* Can't be int, since there is not a shred of a guarantee that int \
       is wide enough to hold a value of something to which pointer can \
       be assigned */ \
    active_reg_t this_reg; \
 \
    DEBUG_STATEMENT (failure_id++); \
    DEBUG_STATEMENT (nfailure_points_pushed++); \
    DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
    DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\
    DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
 \
    DEBUG_PRINT2 (" slots needed: %ld\n", NUM_FAILURE_ITEMS); \
    DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \
 \
    /* Ensure we have enough space allocated for what we will push.  */ \
    while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
      { \
        if (!DOUBLE_FAIL_STACK (fail_stack)) \
          return failure_code; \
 \
        DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \
                      (fail_stack).size); \
        DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
      } \
 \
    /* Push the info, starting with the registers.  */ \
    DEBUG_PRINT1 ("\n"); \
 \
    if (1) \
      for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
           this_reg++) \
        { \
          DEBUG_PRINT2 (" Pushing reg: %lu\n", this_reg); \
          DEBUG_STATEMENT (num_regs_pushed++); \
 \
          DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \
          PUSH_FAILURE_POINTER (regstart[this_reg]); \
 \
          DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \
          PUSH_FAILURE_POINTER (regend[this_reg]); \
 \
          DEBUG_PRINT2 (" info: %p\n ", \
                        reg_info[this_reg].word.pointer); \
          DEBUG_PRINT2 (" match_null=%d", \
                        REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
          DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
          DEBUG_PRINT2 (" matched_something=%d", \
                        MATCHED_SOMETHING (reg_info[this_reg])); \
          DEBUG_PRINT2 (" ever_matched=%d", \
                        EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
          DEBUG_PRINT1 ("\n"); \
          PUSH_FAILURE_ELT (reg_info[this_reg].word); \
        } \
 \
    DEBUG_PRINT2 (" Pushing low active reg: %ld\n", lowest_active_reg);\
    PUSH_FAILURE_INT (lowest_active_reg); \
 \
    DEBUG_PRINT2 (" Pushing high active reg: %ld\n", highest_active_reg);\
    PUSH_FAILURE_INT (highest_active_reg); \
 \
    DEBUG_PRINT2 (" Pushing pattern %p:\n", pattern_place); \
    DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
    PUSH_FAILURE_POINTER (pattern_place); \
 \
    DEBUG_PRINT2 (" Pushing string %p: `", string_place); \
    DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
                               size2); \
    DEBUG_PRINT1 ("'\n"); \
    PUSH_FAILURE_POINTER (string_place); \
 \
    DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
    DEBUG_PUSH (failure_id); \
  } while (0)

# ifndef DEFINED_ONCE
/* This is the number of items that are pushed and popped on the stack
   for each register.  
*/
# define NUM_REG_ITEMS 3

/* Individual items aside from the registers.  */
# ifdef DEBUG
# define NUM_NONREG_ITEMS 5 /* Includes failure point id.  */
# else
# define NUM_NONREG_ITEMS 4
# endif

/* We push at most this many items on the stack.  */
/* We used to use (num_regs - 1), which is the number of registers
   this regexp will save; but that was changed to 5
   to avoid stack overflow for a regexp with lots of parens.  */
# define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS)

/* We actually push this many items.  */
# define NUM_FAILURE_ITEMS \
  (((0 \
     ? 0 : highest_active_reg - lowest_active_reg + 1) \
    * NUM_REG_ITEMS) \
   + NUM_NONREG_ITEMS)

/* How many items can still be added to the stack without overflowing it.  */
# define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
# endif /* not DEFINED_ONCE */


/* Pops what PUSH_FAILURE_POINT pushes.

   We restore into the parameters, all of which should be lvalues:
     STR -- the saved data position.
     PAT -- the saved pattern position.
     LOW_REG, HIGH_REG -- the lowest and highest active registers.
     REGSTART, REGEND -- arrays of string positions.
     REG_INFO -- array of information about each subexpression.

   Also assumes the variables `fail_stack' and (if debugging), `bufp',
   `pend', `string1', `size1', `string2', and `size2'.

   The expansion is a bare brace block, not a `do ... while (0)', and
   it also clears `set_regs_matched_done'.  */
# define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\
{ \
  DEBUG_STATEMENT (unsigned failure_id;) \
  active_reg_t this_reg; \
  const UCHAR_T *string_temp; \
 \
  assert (!FAIL_STACK_EMPTY ()); \
 \
  /* Remove failure points and point to how many regs pushed.  */ \
  DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
  DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
  DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
 \
  assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
 \
  DEBUG_POP (&failure_id); \
  DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \
 \
  /* If the saved string location is NULL, it came from an \
     on_failure_keep_string_jump opcode, and we want to throw away the \
     saved NULL, thus retaining our current position in the string.  */ \
  string_temp = POP_FAILURE_POINTER (); \
  if (string_temp != NULL) \
    str = (const CHAR_T *) string_temp; \
 \
  DEBUG_PRINT2 (" Popping string %p: `", str); \
  DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
  DEBUG_PRINT1 ("'\n"); \
 \
  pat = (UCHAR_T *) POP_FAILURE_POINTER (); \
  DEBUG_PRINT2 (" Popping pattern %p:\n", pat); \
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
 \
  /* Restore register info.  */ \
  high_reg = (active_reg_t) POP_FAILURE_INT (); \
  DEBUG_PRINT2 (" Popping high active reg: %ld\n", high_reg); \
 \
  low_reg = (active_reg_t) POP_FAILURE_INT (); \
  DEBUG_PRINT2 (" Popping low active reg: %ld\n", low_reg); \
 \
  /* Registers come off the stack from HIGH_REG down to LOW_REG, the \
     reverse of the order in which they were pushed.  */ \
  if (1) \
    for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
      { \
        DEBUG_PRINT2 (" Popping reg: %ld\n", this_reg); \
 \
        reg_info[this_reg].word = POP_FAILURE_ELT (); \
        DEBUG_PRINT2 (" info: %p\n", \
                      reg_info[this_reg].word.pointer); \
 \
        regend[this_reg] = (const CHAR_T *) POP_FAILURE_POINTER (); \
        DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \
 \
        regstart[this_reg] = (const CHAR_T *) POP_FAILURE_POINTER (); \
        DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \
      } \
  else \
    { \
      /* NOTE(review): the condition above is the constant 1, so this \
         branch is never executed.  */ \
      for (this_reg = highest_active_reg; this_reg > high_reg; this_reg--) \
        { \
          reg_info[this_reg].word.integer = 0; \
          regend[this_reg] = 0; \
          regstart[this_reg] = 0; \
        } \
      highest_active_reg = high_reg; \
    } \
 \
  set_regs_matched_done = 0; \
  DEBUG_STATEMENT (nfailure_points_popped++); \
} /* POP_FAILURE_POINT */

/* Structure for per-register (a.k.a. per-group) information.
   Other register information, such as the
   starting and ending positions (which are addresses), and the list of
   inner groups (which is a bits list) are maintained in separate
   variables.

   We are making a (strictly speaking) nonportable assumption here: that
   the compiler will pack our bit fields into something that fits into
   the type of `word', i.e., is something that fits into one item on the
   failure stack.  */


/* Declarations and macros for re_match_2.  */

typedef union
{
  PREFIX(fail_stack_elt_t) word;
  struct
  {
      /* This field is one if this group can match the empty string,
         zero if not.  If not yet determined,  `MATCH_NULL_UNSET_VALUE'.  
*/
# define MATCH_NULL_UNSET_VALUE 3
    unsigned match_null_string_p : 2;
    unsigned is_active : 1;
    unsigned matched_something : 1;
    unsigned ever_matched_something : 1;
  } bits;
} PREFIX(register_info_type);

/* Accessors for the bit fields of a register_info_type value R.  */
# ifndef DEFINED_ONCE
# define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
# define IS_ACTIVE(R) ((R).bits.is_active)
# define MATCHED_SOMETHING(R) ((R).bits.matched_something)
# define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)


/* Call this when have matched a real character; it sets `matched' flags
   for the subexpressions which we are currently inside.  Also records
   that those subexprs have matched.

   The work is skipped while `set_regs_matched_done' is nonzero.
   Assumes the variables `set_regs_matched_done', `lowest_active_reg',
   `highest_active_reg' and `reg_info'.  */
# define SET_REGS_MATCHED() \
  do \
    { \
      if (!set_regs_matched_done) \
        { \
          active_reg_t r; \
          set_regs_matched_done = 1; \
          for (r = lowest_active_reg; r <= highest_active_reg; r++) \
            { \
              MATCHED_SOMETHING (reg_info[r]) \
                = EVER_MATCHED_SOMETHING (reg_info[r]) \
                = 1; \
            } \
        } \
    } \
  while (0)
# endif /* not DEFINED_ONCE */

/* Registers are set to a sentinel when they haven't yet matched.
   The sentinel is the address of a dummy object, so it cannot compare
   equal to a position inside a subject string.  */
static CHAR_T PREFIX(reg_unset_dummy);
# define REG_UNSET_VALUE (&PREFIX(reg_unset_dummy))
# define REG_UNSET(e) ((e) == REG_UNSET_VALUE)

/* Subroutine declarations and macros for regex_compile.  */
static void PREFIX(store_op1) (re_opcode_t op, UCHAR_T *loc, int arg);
static void PREFIX(store_op2) (re_opcode_t op, UCHAR_T *loc,
                               int arg1, int arg2);
static void PREFIX(insert_op1) (re_opcode_t op, UCHAR_T *loc,
                                int arg, UCHAR_T *end);
static void PREFIX(insert_op2) (re_opcode_t op, UCHAR_T *loc,
                                int arg1, int arg2, UCHAR_T *end);
static boolean PREFIX(at_begline_loc_p) (const CHAR_T *pattern,
                                         const CHAR_T *p,
                                         reg_syntax_t syntax);
static boolean PREFIX(at_endline_loc_p) (const CHAR_T *p,
                                         const CHAR_T *pend,
                                         reg_syntax_t syntax);
# ifdef WCHAR
static reg_errcode_t wcs_compile_range (CHAR_T range_start,
                                        const CHAR_T **p_ptr,
                                        const CHAR_T *pend,
                                        char *translate,
                                        reg_syntax_t syntax,
                                        UCHAR_T *b,
                                        CHAR_T *char_set);
static void insert_space (int num, CHAR_T *loc, CHAR_T *end);
# else /* BYTE */
static reg_errcode_t byte_compile_range (unsigned int range_start,
                                         const char **p_ptr,
                                         const char *pend,
                                         char *translate,
                                         reg_syntax_t syntax,
                                         unsigned char *b);
# endif /* WCHAR */

/* Fetch the next character in the uncompiled pattern---translating it
   if necessary.  Also cast from a signed character in the constant
   string passed to us by the user to an unsigned char that we can use
   as an array index (in, e.g., `translate').  */
/* ifdef MBS_SUPPORT, we translate only if character <= 0xff,
   because it is impossible to allocate 4GB array for some encodings
   which have 4 byte character_set like UCS4.  
*/
/* PATFETCH assumes the variables `p', `pend' and `translate', and does
   `return REG_EEND' from the enclosing function when the pattern is
   exhausted.  */
# ifndef PATFETCH
# ifdef WCHAR
# define PATFETCH(c) \
  do {if (p == pend) return REG_EEND; \
    c = (UCHAR_T) *p++; \
    if (translate && (c <= 0xff)) c = (UCHAR_T) translate[c]; \
  } while (0)
# else /* BYTE */
# define PATFETCH(c) \
  do {if (p == pend) return REG_EEND; \
    c = (unsigned char) *p++; \
    if (translate) c = (unsigned char) translate[c]; \
  } while (0)
# endif /* WCHAR */
# endif

/* Fetch the next character in the uncompiled pattern, with no
   translation.  */
# define PATFETCH_RAW(c) \
  do {if (p == pend) return REG_EEND; \
    c = (UCHAR_T) *p++; \
  } while (0)

/* Go backwards one character in the pattern.  */
# define PATUNFETCH p--


/* If `translate' is non-null, return translate[D], else just D.  We
   cast the subscript to translate because some data is declared as
   `char *', to avoid warnings when a string constant is passed.  But
   when we use a character as a subscript we must make it unsigned.  */
/* ifdef MBS_SUPPORT, we translate only if character <= 0xff,
   because it is impossible to allocate 4GB array for some encodings
   which have 4 byte character_set like UCS4.  */

# ifndef TRANSLATE
# ifdef WCHAR
# define TRANSLATE(d) \
  ((translate && ((UCHAR_T) (d)) <= 0xff) \
   ? (char) translate[(unsigned char) (d)] : (d))
# else /* BYTE */
# define TRANSLATE(d) \
  (translate ? (char) translate[(unsigned char) (d)] : (d))
# endif /* WCHAR */
# endif


/* Macros for outputting the compiled pattern into `buffer'.  */

/* If the buffer isn't allocated when it comes in, use this.  */
# define INIT_BUF_SIZE (32 * sizeof(UCHAR_T))

/* Make sure we have at least N more units of space in buffer: the
   WCHAR variant scales N by sizeof (CHAR_T), the BYTE variant counts
   bytes.  Keeps calling EXTEND_BUFFER until the request fits.  */
# ifdef WCHAR
# define GET_BUFFER_SPACE(n) \
    while (((unsigned long)b - (unsigned long)COMPILED_BUFFER_VAR \
            + (n)*sizeof(CHAR_T)) > bufp->allocated) \
      EXTEND_BUFFER ()
# else /* BYTE */
# define GET_BUFFER_SPACE(n) \
    while ((unsigned long) (b - bufp->buffer + (n)) > bufp->allocated) \
      EXTEND_BUFFER ()
# endif /* WCHAR */

/* Make sure we have one more unit of buffer space and then add C to it.  */
# define BUF_PUSH(c) \
  do { \
    GET_BUFFER_SPACE (1); \
    *b++ = (UCHAR_T) (c); \
  } while (0)


/* Ensure we have two more units of buffer space and then append C1 and C2.  */
# define BUF_PUSH_2(c1, c2) \
  do { \
    GET_BUFFER_SPACE (2); \
    *b++ = (UCHAR_T) (c1); \
    *b++ = (UCHAR_T) (c2); \
  } while (0)


/* As with BUF_PUSH_2, except for three units.  */
# define BUF_PUSH_3(c1, c2, c3) \
  do { \
    GET_BUFFER_SPACE (3); \
    *b++ = (UCHAR_T) (c1); \
    *b++ = (UCHAR_T) (c2); \
    *b++ = (UCHAR_T) (c3); \
  } while (0)

/* Store a jump with opcode OP at LOC to location TO.  We store a
   relative address offset by the size of the jump itself, i.e. the
   opcode plus OFFSET_ADDRESS_SIZE.  */
# define STORE_JUMP(op, loc, to) \
 PREFIX(store_op1) (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)))

/* Likewise, for a two-argument jump.  */
# define STORE_JUMP2(op, loc, to, arg) \
 PREFIX(store_op2) (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)), arg)

/* Like `STORE_JUMP', but for inserting.  Assume `b' is the buffer end.  */
# define INSERT_JUMP(op, loc, to) \
 PREFIX(insert_op1) (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)), b)

/* Like `STORE_JUMP2', but for inserting.  Assume `b' is the buffer end.  */
# define INSERT_JUMP2(op, loc, to, arg) \
 PREFIX(insert_op2) (op, loc, (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)),\
                     arg, b)

/* This is not an arbitrary limit: the arguments which represent offsets
   into the pattern are two bytes long.  So if 2^16 bytes turns out to
   be too small, many things would have to change.  */
/* Any other compiler which, like MSC, has allocation limit below 2^16
   bytes will have to use approach similar to what was done below for
   MSC and drop MAX_BUF_SIZE a bit.  Otherwise you may end up
   reallocating to 0 bytes.  Such thing is not going to work too well.
   You have been warned!!  */
# ifndef DEFINED_ONCE
# if defined _MSC_VER && !defined WIN32
/* Microsoft C 16-bit versions limit malloc to approx 65512 bytes.
   The REALLOC define eliminates a flurry of conversion warnings,
   but is not required. */
# define MAX_BUF_SIZE 65500L
# define REALLOC(p,s) realloc ((p), (size_t) (s))
# else
# define MAX_BUF_SIZE (1L << 16)
# define REALLOC(p,s) realloc ((p), (s))
# endif

/* Extend the buffer by twice its current size via realloc and
   reset the pointers that pointed into the old block to point to the
   correct places in the new one.  If extending the buffer results in it
   being larger than MAX_BUF_SIZE, then flag memory exhausted.  
*/ 2017# if __BOUNDED_POINTERS__ 2018# define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated) 2019# define MOVE_BUFFER_POINTER(P) \ 2020 (__ptrlow (P) += incr, SET_HIGH_BOUND (P), __ptrvalue (P) += incr) 2021# define ELSE_EXTEND_BUFFER_HIGH_BOUND \ 2022 else \ 2023 { \ 2024 SET_HIGH_BOUND (b); \ 2025 SET_HIGH_BOUND (begalt); \ 2026 if (fixup_alt_jump) \ 2027 SET_HIGH_BOUND (fixup_alt_jump); \ 2028 if (laststart) \ 2029 SET_HIGH_BOUND (laststart); \ 2030 if (pending_exact) \ 2031 SET_HIGH_BOUND (pending_exact); \ 2032 } 2033# else 2034# define MOVE_BUFFER_POINTER(P) (P) += incr 2035# define ELSE_EXTEND_BUFFER_HIGH_BOUND 2036# endif 2037# endif /* not DEFINED_ONCE */ 2038 2039# ifdef WCHAR 2040# define EXTEND_BUFFER() \ 2041 do { \ 2042 UCHAR_T *old_buffer = COMPILED_BUFFER_VAR; \ 2043 int wchar_count; \ 2044 if (bufp->allocated + sizeof(UCHAR_T) > MAX_BUF_SIZE) \ 2045 return REG_ESIZE; \ 2046 bufp->allocated <<= 1; \ 2047 if (bufp->allocated > MAX_BUF_SIZE) \ 2048 bufp->allocated = MAX_BUF_SIZE; \ 2049 /* How many characters the new buffer can have? */ \ 2050 wchar_count = bufp->allocated / sizeof(UCHAR_T); \ 2051 if (wchar_count == 0) wchar_count = 1; \ 2052 /* Truncate the buffer to CHAR_T align. */ \ 2053 bufp->allocated = wchar_count * sizeof(UCHAR_T); \ 2054 RETALLOC (COMPILED_BUFFER_VAR, wchar_count, UCHAR_T); \ 2055 bufp->buffer = (char*)COMPILED_BUFFER_VAR; \ 2056 if (COMPILED_BUFFER_VAR == NULL) \ 2057 return REG_ESPACE; \ 2058 /* If the buffer moved, move all the pointers into it. 
*/ \ 2059 if (old_buffer != COMPILED_BUFFER_VAR) \ 2060 { \ 2061 int incr = COMPILED_BUFFER_VAR - old_buffer; \ 2062 MOVE_BUFFER_POINTER (b); \ 2063 MOVE_BUFFER_POINTER (begalt); \ 2064 if (fixup_alt_jump) \ 2065 MOVE_BUFFER_POINTER (fixup_alt_jump); \ 2066 if (laststart) \ 2067 MOVE_BUFFER_POINTER (laststart); \ 2068 if (pending_exact) \ 2069 MOVE_BUFFER_POINTER (pending_exact); \ 2070 } \ 2071 ELSE_EXTEND_BUFFER_HIGH_BOUND \ 2072 } while (0) 2073# else /* BYTE */ 2074# define EXTEND_BUFFER() \ 2075 do { \ 2076 UCHAR_T *old_buffer = COMPILED_BUFFER_VAR; \ 2077 if (bufp->allocated == MAX_BUF_SIZE) \ 2078 return REG_ESIZE; \ 2079 bufp->allocated <<= 1; \ 2080 if (bufp->allocated > MAX_BUF_SIZE) \ 2081 bufp->allocated = MAX_BUF_SIZE; \ 2082 bufp->buffer = (UCHAR_T *) REALLOC (COMPILED_BUFFER_VAR, bufp->allocated); \ 2083 if (COMPILED_BUFFER_VAR == NULL) \ 2084 return REG_ESPACE; \ 2085 /* If the buffer moved, move all the pointers into it. */ \ 2086 if (old_buffer != COMPILED_BUFFER_VAR) \ 2087 { \ 2088 int incr = COMPILED_BUFFER_VAR - old_buffer; \ 2089 MOVE_BUFFER_POINTER (b); \ 2090 MOVE_BUFFER_POINTER (begalt); \ 2091 if (fixup_alt_jump) \ 2092 MOVE_BUFFER_POINTER (fixup_alt_jump); \ 2093 if (laststart) \ 2094 MOVE_BUFFER_POINTER (laststart); \ 2095 if (pending_exact) \ 2096 MOVE_BUFFER_POINTER (pending_exact); \ 2097 } \ 2098 ELSE_EXTEND_BUFFER_HIGH_BOUND \ 2099 } while (0) 2100# endif /* WCHAR */ 2101 2102# ifndef DEFINED_ONCE 2103/* Since we have one byte reserved for the register number argument to 2104 {start,stop}_memory, the maximum number of groups we can report 2105 things about is what fits in that byte. */ 2106# define MAX_REGNUM 255 2107 2108/* But patterns can have more than `MAX_REGNUM' registers. We just 2109 ignore the excess. */ 2110typedef unsigned regnum_t; 2111 2112 2113/* Macros for the compile stack. 
*/ 2114 2115/* Since offsets can go either forwards or backwards, this type needs to 2116 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */ 2117/* int may be not enough when sizeof(int) == 2. */ 2118typedef long pattern_offset_t; 2119 2120typedef struct 2121{ 2122 pattern_offset_t begalt_offset; 2123 pattern_offset_t fixup_alt_jump; 2124 pattern_offset_t inner_group_offset; 2125 pattern_offset_t laststart_offset; 2126 regnum_t regnum; 2127} compile_stack_elt_t; 2128 2129 2130typedef struct 2131{ 2132 compile_stack_elt_t *stack; 2133 unsigned size; 2134 unsigned avail; /* Offset of next open position. */ 2135} compile_stack_type; 2136 2137 2138# define INIT_COMPILE_STACK_SIZE 32 2139 2140# define COMPILE_STACK_EMPTY (compile_stack.avail == 0) 2141# define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size) 2142 2143/* The next available element. */ 2144# define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail]) 2145 2146# endif /* not DEFINED_ONCE */ 2147 2148/* Set the bit for character C in a list. */ 2149# ifndef DEFINED_ONCE 2150# define SET_LIST_BIT(c) \ 2151 (b[((unsigned char) (c)) / BYTEWIDTH] \ 2152 |= 1 << (((unsigned char) c) % BYTEWIDTH)) 2153# endif /* DEFINED_ONCE */ 2154 2155/* Get the next unsigned number in the uncompiled pattern. */ 2156# define GET_UNSIGNED_NUMBER(num) \ 2157 { \ 2158 while (p != pend) \ 2159 { \ 2160 PATFETCH (c); \ 2161 if (c < '0' || c > '9') \ 2162 break; \ 2163 if (num <= RE_DUP_MAX) \ 2164 { \ 2165 if (num < 0) \ 2166 num = 0; \ 2167 num = num * 10 + c - '0'; \ 2168 } \ 2169 } \ 2170 } 2171 2172# ifndef DEFINED_ONCE 2173# if defined _LIBC || WIDE_CHAR_SUPPORT 2174/* The GNU C library provides support for user-defined character classes 2175 and the functions from ISO C amendement 1. */ 2176# ifdef CHARCLASS_NAME_MAX 2177# define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX 2178# else 2179/* This shouldn't happen but some implementation might still have this 2180 problem. 
Use a reasonable default value. */ 2181# define CHAR_CLASS_MAX_LENGTH 256 2182# endif 2183 2184# ifdef _LIBC 2185# define IS_CHAR_CLASS(string) __wctype (string) 2186# else 2187# define IS_CHAR_CLASS(string) wctype (string) 2188# endif 2189# else 2190# define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */ 2191 2192# define IS_CHAR_CLASS(string) \ 2193 (STREQ (string, "alpha") || STREQ (string, "upper") \ 2194 || STREQ (string, "lower") || STREQ (string, "digit") \ 2195 || STREQ (string, "alnum") || STREQ (string, "xdigit") \ 2196 || STREQ (string, "space") || STREQ (string, "print") \ 2197 || STREQ (string, "punct") || STREQ (string, "graph") \ 2198 || STREQ (string, "cntrl") || STREQ (string, "blank")) 2199# endif 2200# endif /* DEFINED_ONCE */ 2201 2202# ifndef MATCH_MAY_ALLOCATE 2203 2204/* If we cannot allocate large objects within re_match_2_internal, 2205 we make the fail stack and register vectors global. 2206 The fail stack, we grow to the maximum size when a regexp 2207 is compiled. 2208 The register vectors, we adjust in size each time we 2209 compile a regexp, according to the number of registers it needs. */ 2210 2211static PREFIX(fail_stack_type) fail_stack; 2212 2213/* Size with which the following vectors are currently allocated. 2214 That is so we can make them bigger as needed, 2215 but never make them smaller. */ 2216# ifdef DEFINED_ONCE 2217static int regs_allocated_size; 2218 2219static const char ** regstart, ** regend; 2220static const char ** old_regstart, ** old_regend; 2221static const char **best_regstart, **best_regend; 2222static const char **reg_dummy; 2223# endif /* DEFINED_ONCE */ 2224 2225static PREFIX(register_info_type) *PREFIX(reg_info); 2226static PREFIX(register_info_type) *PREFIX(reg_info_dummy); 2227 2228/* Make the register vectors big enough for NUM_REGS registers, 2229 but don't make them smaller. 
*/ 2230 2231static void 2232PREFIX(regex_grow_registers) (int num_regs) 2233{ 2234 if (num_regs > regs_allocated_size) 2235 { 2236 RETALLOC_IF (regstart, num_regs, const char *); 2237 RETALLOC_IF (regend, num_regs, const char *); 2238 RETALLOC_IF (old_regstart, num_regs, const char *); 2239 RETALLOC_IF (old_regend, num_regs, const char *); 2240 RETALLOC_IF (best_regstart, num_regs, const char *); 2241 RETALLOC_IF (best_regend, num_regs, const char *); 2242 RETALLOC_IF (PREFIX(reg_info), num_regs, PREFIX(register_info_type)); 2243 RETALLOC_IF (reg_dummy, num_regs, const char *); 2244 RETALLOC_IF (PREFIX(reg_info_dummy), num_regs, PREFIX(register_info_type)); 2245 2246 regs_allocated_size = num_regs; 2247 } 2248} 2249 2250# endif /* not MATCH_MAY_ALLOCATE */ 2251 2252# ifndef DEFINED_ONCE 2253static boolean group_in_compile_stack (compile_stack_type 2254 compile_stack, 2255 regnum_t regnum); 2256# endif /* not DEFINED_ONCE */ 2257 2258/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX. 2259 Returns one of error codes defined in `regex.h', or zero for success. 2260 2261 Assumes the `allocated' (and perhaps `buffer') and `translate' 2262 fields are set in BUFP on entry. 2263 2264 If it succeeds, results are put in BUFP (if it returns an error, the 2265 contents of BUFP are undefined): 2266 `buffer' is the compiled pattern; 2267 `syntax' is set to SYNTAX; 2268 `used' is set to the length of the compiled pattern; 2269 `fastmap_accurate' is zero; 2270 `re_nsub' is the number of subexpressions in PATTERN; 2271 `not_bol' and `not_eol' are zero; 2272 2273 The `fastmap' and `newline_anchor' fields are neither 2274 examined nor set. */ 2275 2276/* Return, freeing storage we allocated. 
*/ 2277# ifdef WCHAR 2278# define FREE_STACK_RETURN(value) \ 2279 return (free(pattern), free(mbs_offset), free(is_binary), free (compile_stack.stack), value) 2280# else 2281# define FREE_STACK_RETURN(value) \ 2282 return (free (compile_stack.stack), value) 2283# endif /* WCHAR */ 2284 2285static reg_errcode_t 2286PREFIX(regex_compile) (const char *ARG_PREFIX(pattern), 2287 size_t ARG_PREFIX(size), 2288 reg_syntax_t syntax, 2289 struct re_pattern_buffer *bufp) 2290{ 2291 /* We fetch characters from PATTERN here. Even though PATTERN is 2292 `char *' (i.e., signed), we declare these variables as unsigned, so 2293 they can be reliably used as array indices. */ 2294 register UCHAR_T c, c1; 2295 2296#ifdef WCHAR 2297 /* A temporary space to keep wchar_t pattern and compiled pattern. */ 2298 CHAR_T *pattern, *COMPILED_BUFFER_VAR; 2299 size_t size; 2300 /* offset buffer for optimization. See convert_mbs_to_wc. */ 2301 int *mbs_offset = NULL; 2302 /* It hold whether each wchar_t is binary data or not. */ 2303 char *is_binary = NULL; 2304 /* A flag whether exactn is handling binary data or not. */ 2305 char is_exactn_bin = FALSE; 2306#endif /* WCHAR */ 2307 2308 /* A random temporary spot in PATTERN. */ 2309 const CHAR_T *p1; 2310 2311 /* Points to the end of the buffer, where we should append. */ 2312 register UCHAR_T *b; 2313 2314 /* Keeps track of unclosed groups. */ 2315 compile_stack_type compile_stack; 2316 2317 /* Points to the current (ending) position in the pattern. */ 2318#ifdef WCHAR 2319 const CHAR_T *p; 2320 const CHAR_T *pend; 2321#else /* BYTE */ 2322 const CHAR_T *p = pattern; 2323 const CHAR_T *pend = pattern + size; 2324#endif /* WCHAR */ 2325 2326 /* How to translate the characters in the pattern. */ 2327 RE_TRANSLATE_TYPE translate = bufp->translate; 2328 2329 /* Address of the count-byte of the most recently inserted `exactn' 2330 command. 
This makes it possible to tell if a new exact-match 2331 character can be added to that command or if the character requires 2332 a new `exactn' command. */ 2333 UCHAR_T *pending_exact = 0; 2334 2335 /* Address of start of the most recently finished expression. 2336 This tells, e.g., postfix * where to find the start of its 2337 operand. Reset at the beginning of groups and alternatives. */ 2338 UCHAR_T *laststart = 0; 2339 2340 /* Address of beginning of regexp, or inside of last group. */ 2341 UCHAR_T *begalt; 2342 2343 /* Address of the place where a forward jump should go to the end of 2344 the containing expression. Each alternative of an `or' -- except the 2345 last -- ends with a forward jump of this sort. */ 2346 UCHAR_T *fixup_alt_jump = 0; 2347 2348 /* Counts open-groups as they are encountered. Remembered for the 2349 matching close-group on the compile stack, so the same register 2350 number is put in the stop_memory as the start_memory. */ 2351 regnum_t regnum = 0; 2352 2353#ifdef WCHAR 2354 /* Initialize the wchar_t PATTERN and offset_buffer. */ 2355 p = pend = pattern = TALLOC(csize + 1, CHAR_T); 2356 mbs_offset = TALLOC(csize + 1, int); 2357 is_binary = TALLOC(csize + 1, char); 2358 if (pattern == NULL || mbs_offset == NULL || is_binary == NULL) 2359 { 2360 free(pattern); 2361 free(mbs_offset); 2362 free(is_binary); 2363 return REG_ESPACE; 2364 } 2365 pattern[csize] = L'\0'; /* sentinel */ 2366 size = convert_mbs_to_wcs(pattern, cpattern, csize, mbs_offset, is_binary); 2367 pend = p + size; 2368 if (size < 0) 2369 { 2370 free(pattern); 2371 free(mbs_offset); 2372 free(is_binary); 2373 return REG_BADPAT; 2374 } 2375#endif 2376 2377#ifdef DEBUG 2378 DEBUG_PRINT1 ("\nCompiling pattern: "); 2379 if (debug) 2380 { 2381 unsigned debug_count; 2382 2383 for (debug_count = 0; debug_count < size; debug_count++) 2384 PUT_CHAR (pattern[debug_count]); 2385 putchar ('\n'); 2386 } 2387#endif /* DEBUG */ 2388 2389 /* Initialize the compile stack. 
*/ 2390 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t); 2391 if (compile_stack.stack == NULL) 2392 { 2393#ifdef WCHAR 2394 free(pattern); 2395 free(mbs_offset); 2396 free(is_binary); 2397#endif 2398 return REG_ESPACE; 2399 } 2400 2401 compile_stack.size = INIT_COMPILE_STACK_SIZE; 2402 compile_stack.avail = 0; 2403 2404 /* Initialize the pattern buffer. */ 2405 bufp->syntax = syntax; 2406 bufp->fastmap_accurate = 0; 2407 bufp->not_bol = bufp->not_eol = 0; 2408 2409 /* Set `used' to zero, so that if we return an error, the pattern 2410 printer (for debugging) will think there's no pattern. We reset it 2411 at the end. */ 2412 bufp->used = 0; 2413 2414 /* Always count groups, whether or not bufp->no_sub is set. */ 2415 bufp->re_nsub = 0; 2416 2417#if !defined emacs && !defined SYNTAX_TABLE 2418 /* Initialize the syntax table. */ 2419 init_syntax_once (); 2420#endif 2421 2422 if (bufp->allocated == 0) 2423 { 2424 if (bufp->buffer) 2425 { /* If zero allocated, but buffer is non-null, try to realloc 2426 enough space. This loses if buffer's address is bogus, but 2427 that is the user's responsibility. */ 2428#ifdef WCHAR 2429 /* Free bufp->buffer and allocate an array for wchar_t pattern 2430 buffer. */ 2431 free(bufp->buffer); 2432 COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE/sizeof(UCHAR_T), 2433 UCHAR_T); 2434#else 2435 RETALLOC (COMPILED_BUFFER_VAR, INIT_BUF_SIZE, UCHAR_T); 2436#endif /* WCHAR */ 2437 } 2438 else 2439 { /* Caller did not allocate a buffer. Do it for them. 
*/ 2440 COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE / sizeof(UCHAR_T), 2441 UCHAR_T); 2442 } 2443 2444 if (!COMPILED_BUFFER_VAR) FREE_STACK_RETURN (REG_ESPACE); 2445#ifdef WCHAR 2446 bufp->buffer = (char*)COMPILED_BUFFER_VAR; 2447#endif /* WCHAR */ 2448 bufp->allocated = INIT_BUF_SIZE; 2449 } 2450#ifdef WCHAR 2451 else 2452 COMPILED_BUFFER_VAR = (UCHAR_T*) bufp->buffer; 2453#endif 2454 2455 begalt = b = COMPILED_BUFFER_VAR; 2456 2457 /* Loop through the uncompiled pattern until we're at the end. */ 2458 while (p != pend) 2459 { 2460 PATFETCH (c); 2461 2462 switch (c) 2463 { 2464 case '^': 2465 { 2466 if ( /* If at start of pattern, it's an operator. */ 2467 p == pattern + 1 2468 /* If context independent, it's an operator. */ 2469 || syntax & RE_CONTEXT_INDEP_ANCHORS 2470 /* Otherwise, depends on what's come before. */ 2471 || PREFIX(at_begline_loc_p) (pattern, p, syntax)) 2472 BUF_PUSH (begline); 2473 else 2474 goto normal_char; 2475 } 2476 break; 2477 2478 2479 case '$': 2480 { 2481 if ( /* If at end of pattern, it's an operator. */ 2482 p == pend 2483 /* If context independent, it's an operator. */ 2484 || syntax & RE_CONTEXT_INDEP_ANCHORS 2485 /* Otherwise, depends on what's next. */ 2486 || PREFIX(at_endline_loc_p) (p, pend, syntax)) 2487 BUF_PUSH (endline); 2488 else 2489 goto normal_char; 2490 } 2491 break; 2492 2493 2494 case '+': 2495 case '?': 2496 if ((syntax & RE_BK_PLUS_QM) 2497 || (syntax & RE_LIMITED_OPS)) 2498 goto normal_char; 2499 handle_plus: 2500 case '*': 2501 /* If there is no previous pattern... */ 2502 if (!laststart) 2503 { 2504 if (syntax & RE_CONTEXT_INVALID_OPS) 2505 FREE_STACK_RETURN (REG_BADRPT); 2506 else if (!(syntax & RE_CONTEXT_INDEP_OPS)) 2507 goto normal_char; 2508 } 2509 2510 { 2511 /* Are we optimizing this jump? */ 2512 boolean keep_string_p = false; 2513 2514 /* 1 means zero (many) matches is allowed. 
*/ 2515 char zero_times_ok = 0, many_times_ok = 0; 2516 2517 /* If there is a sequence of repetition chars, collapse it 2518 down to just one (the right one). We can't combine 2519 interval operators with these because of, e.g., `a{2}*', 2520 which should only match an even number of `a's. */ 2521 2522 for (;;) 2523 { 2524 zero_times_ok |= c != '+'; 2525 many_times_ok |= c != '?'; 2526 2527 if (p == pend) 2528 break; 2529 2530 PATFETCH (c); 2531 2532 if (c == '*' 2533 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?'))) 2534 ; 2535 2536 else if (syntax & RE_BK_PLUS_QM && c == '\\') 2537 { 2538 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); 2539 2540 PATFETCH (c1); 2541 if (!(c1 == '+' || c1 == '?')) 2542 { 2543 PATUNFETCH; 2544 PATUNFETCH; 2545 break; 2546 } 2547 2548 c = c1; 2549 } 2550 else 2551 { 2552 PATUNFETCH; 2553 break; 2554 } 2555 2556 /* If we get here, we found another repeat character. */ 2557 } 2558 2559 /* Star, etc. applied to an empty pattern is equivalent 2560 to an empty pattern. */ 2561 if (!laststart) 2562 break; 2563 2564 /* Now we know whether or not zero matches is allowed 2565 and also whether or not two or more matches is allowed. */ 2566 if (many_times_ok) 2567 { /* More than one repetition is allowed, so put in at the 2568 end a backward relative jump from `b' to before the next 2569 jump we're going to put in below (which jumps from 2570 laststart to after this jump). 2571 2572 But if we are at the `*' in the exact sequence `.*\n', 2573 insert an unconditional jump backwards to the ., 2574 instead of the beginning of the loop. This way we only 2575 push a failure point once, instead of every time 2576 through the loop. */ 2577 assert (p - 1 > pattern); 2578 2579 /* Allocate the space for the jump. */ 2580 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE); 2581 2582 /* We know we are not at the first character of the pattern, 2583 because laststart was nonzero. 
And we've already 2584 incremented `p', by the way, to be the character after 2585 the `*'. Do we have to do something analogous here 2586 for null bytes, because of RE_DOT_NOT_NULL? */ 2587 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.') 2588 && zero_times_ok 2589 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n') 2590 && !(syntax & RE_DOT_NEWLINE)) 2591 { /* We have .*\n. */ 2592 STORE_JUMP (jump, b, laststart); 2593 keep_string_p = true; 2594 } 2595 else 2596 /* Anything else. */ 2597 STORE_JUMP (maybe_pop_jump, b, laststart - 2598 (1 + OFFSET_ADDRESS_SIZE)); 2599 2600 /* We've added more stuff to the buffer. */ 2601 b += 1 + OFFSET_ADDRESS_SIZE; 2602 } 2603 2604 /* On failure, jump from laststart to b + 3, which will be the 2605 end of the buffer after this jump is inserted. */ 2606 /* ifdef WCHAR, 'b + 1 + OFFSET_ADDRESS_SIZE' instead of 2607 'b + 3'. */ 2608 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE); 2609 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump 2610 : on_failure_jump, 2611 laststart, b + 1 + OFFSET_ADDRESS_SIZE); 2612 pending_exact = 0; 2613 b += 1 + OFFSET_ADDRESS_SIZE; 2614 2615 if (!zero_times_ok) 2616 { 2617 /* At least one repetition is required, so insert a 2618 `dummy_failure_jump' before the initial 2619 `on_failure_jump' instruction of the loop. This 2620 effects a skip over that instruction the first time 2621 we hit that loop. */ 2622 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE); 2623 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 2624 2 + 2 * OFFSET_ADDRESS_SIZE); 2625 b += 1 + OFFSET_ADDRESS_SIZE; 2626 } 2627 } 2628 break; 2629 2630 2631 case '.': 2632 laststart = b; 2633 BUF_PUSH (anychar); 2634 break; 2635 2636 2637 case '[': 2638 { 2639 boolean had_char_class = false; 2640#ifdef WCHAR 2641 CHAR_T range_start = 0xffffffff; 2642#else 2643 unsigned int range_start = 0xffffffff; 2644#endif 2645 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 2646 2647#ifdef WCHAR 2648 /* We assume a charset(_not) structure as a wchar_t array. 
2649 charset[0] = (re_opcode_t) charset(_not) 2650 charset[1] = l (= length of char_classes) 2651 charset[2] = m (= length of collating_symbols) 2652 charset[3] = n (= length of equivalence_classes) 2653 charset[4] = o (= length of char_ranges) 2654 charset[5] = p (= length of chars) 2655 2656 charset[6] = char_class (wctype_t) 2657 charset[6+CHAR_CLASS_SIZE] = char_class (wctype_t) 2658 ... 2659 charset[l+5] = char_class (wctype_t) 2660 2661 charset[l+6] = collating_symbol (wchar_t) 2662 ... 2663 charset[l+m+5] = collating_symbol (wchar_t) 2664 ifdef _LIBC we use the index if 2665 _NL_COLLATE_SYMB_EXTRAMB instead of 2666 wchar_t string. 2667 2668 charset[l+m+6] = equivalence_classes (wchar_t) 2669 ... 2670 charset[l+m+n+5] = equivalence_classes (wchar_t) 2671 ifdef _LIBC we use the index in 2672 _NL_COLLATE_WEIGHT instead of 2673 wchar_t string. 2674 2675 charset[l+m+n+6] = range_start 2676 charset[l+m+n+7] = range_end 2677 ... 2678 charset[l+m+n+2o+4] = range_start 2679 charset[l+m+n+2o+5] = range_end 2680 ifdef _LIBC we use the value looked up 2681 in _NL_COLLATE_COLLSEQ instead of 2682 wchar_t character. 2683 2684 charset[l+m+n+2o+6] = char 2685 ... 2686 charset[l+m+n+2o+p+5] = char 2687 2688 */ 2689 2690 /* We need at least 6 spaces: the opcode, the length of 2691 char_classes, the length of collating_symbols, the length of 2692 equivalence_classes, the length of char_ranges, the length of 2693 chars. */ 2694 GET_BUFFER_SPACE (6); 2695 2696 /* Save b as laststart. And We use laststart as the pointer 2697 to the first element of the charset here. 2698 In other words, laststart[i] indicates charset[i]. */ 2699 laststart = b; 2700 2701 /* We test `*p == '^' twice, instead of using an if 2702 statement, so we only need one BUF_PUSH. */ 2703 BUF_PUSH (*p == '^' ? 
charset_not : charset); 2704 if (*p == '^') 2705 p++; 2706 2707 /* Push the length of char_classes, the length of 2708 collating_symbols, the length of equivalence_classes, the 2709 length of char_ranges and the length of chars. */ 2710 BUF_PUSH_3 (0, 0, 0); 2711 BUF_PUSH_2 (0, 0); 2712 2713 /* Remember the first position in the bracket expression. */ 2714 p1 = p; 2715 2716 /* charset_not matches newline according to a syntax bit. */ 2717 if ((re_opcode_t) b[-6] == charset_not 2718 && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) 2719 { 2720 BUF_PUSH('\n'); 2721 laststart[5]++; /* Update the length of characters */ 2722 } 2723 2724 /* Read in characters and ranges, setting map bits. */ 2725 for (;;) 2726 { 2727 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 2728 2729 PATFETCH (c); 2730 2731 /* \ might escape characters inside [...] and [^...]. */ 2732 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') 2733 { 2734 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); 2735 2736 PATFETCH (c1); 2737 BUF_PUSH(c1); 2738 laststart[5]++; /* Update the length of chars */ 2739 range_start = c1; 2740 continue; 2741 } 2742 2743 /* Could be the end of the bracket expression. If it's 2744 not (i.e., when the bracket expression is `[]' so 2745 far), the ']' character bit gets set way below. */ 2746 if (c == ']' && p != p1 + 1) 2747 break; 2748 2749 /* Look ahead to see if it's a range when the last thing 2750 was a character class. */ 2751 if (had_char_class && c == '-' && *p != ']') 2752 FREE_STACK_RETURN (REG_ERANGE); 2753 2754 /* Look ahead to see if it's a range when the last thing 2755 was a character: if this is a hyphen not at the 2756 beginning or the end of a list, then it's the range 2757 operator. */ 2758 if (c == '-' 2759 && !(p - 2 >= pattern && p[-2] == '[') 2760 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') 2761 && *p != ']') 2762 { 2763 reg_errcode_t ret; 2764 /* Allocate the space for range_start and range_end. 
*/ 2765 GET_BUFFER_SPACE (2); 2766 /* Update the pointer to indicate end of buffer. */ 2767 b += 2; 2768 ret = wcs_compile_range (range_start, &p, pend, translate, 2769 syntax, b, laststart); 2770 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); 2771 range_start = 0xffffffff; 2772 } 2773 else if (p[0] == '-' && p[1] != ']') 2774 { /* This handles ranges made up of characters only. */ 2775 reg_errcode_t ret; 2776 2777 /* Move past the `-'. */ 2778 PATFETCH (c1); 2779 /* Allocate the space for range_start and range_end. */ 2780 GET_BUFFER_SPACE (2); 2781 /* Update the pointer to indicate end of buffer. */ 2782 b += 2; 2783 ret = wcs_compile_range (c, &p, pend, translate, syntax, b, 2784 laststart); 2785 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); 2786 range_start = 0xffffffff; 2787 } 2788 2789 /* See if we're at the beginning of a possible character 2790 class. */ 2791 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') 2792 { /* Leave room for the null. */ 2793 char str[CHAR_CLASS_MAX_LENGTH + 1]; 2794 2795 PATFETCH (c); 2796 c1 = 0; 2797 2798 /* If pattern is `[[:'. */ 2799 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 2800 2801 for (;;) 2802 { 2803 PATFETCH (c); 2804 if ((c == ':' && *p == ']') || p == pend) 2805 break; 2806 if (c1 < CHAR_CLASS_MAX_LENGTH) 2807 str[c1++] = c; 2808 else 2809 /* This is in any case an invalid class name. */ 2810 str[0] = '\0'; 2811 } 2812 str[c1] = '\0'; 2813 2814 /* If isn't a word bracketed by `[:' and `:]': 2815 undo the ending character, the letters, and leave 2816 the leading `:' and `[' (but store them as character). */ 2817 if (c == ':' && *p == ']') 2818 { 2819 wctype_t wt; 2820 uintptr_t alignedp; 2821 2822 /* Query the character class as wctype_t. */ 2823 wt = IS_CHAR_CLASS (str); 2824 if (wt == 0) 2825 FREE_STACK_RETURN (REG_ECTYPE); 2826 2827 /* Throw away the ] at the end of the character 2828 class. 
*/ 2829 PATFETCH (c); 2830 2831 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 2832 2833 /* Allocate the space for character class. */ 2834 GET_BUFFER_SPACE(CHAR_CLASS_SIZE); 2835 /* Update the pointer to indicate end of buffer. */ 2836 b += CHAR_CLASS_SIZE; 2837 /* Move data which follow character classes 2838 not to violate the data. */ 2839 insert_space(CHAR_CLASS_SIZE, 2840 laststart + 6 + laststart[1], 2841 b - 1); 2842 alignedp = ((uintptr_t)(laststart + 6 + laststart[1]) 2843 + __alignof__(wctype_t) - 1) 2844 & ~(uintptr_t)(__alignof__(wctype_t) - 1); 2845 /* Store the character class. */ 2846 *((wctype_t*)alignedp) = wt; 2847 /* Update length of char_classes */ 2848 laststart[1] += CHAR_CLASS_SIZE; 2849 2850 had_char_class = true; 2851 } 2852 else 2853 { 2854 c1++; 2855 while (c1--) 2856 PATUNFETCH; 2857 BUF_PUSH ('['); 2858 BUF_PUSH (':'); 2859 laststart[5] += 2; /* Update the length of characters */ 2860 range_start = ':'; 2861 had_char_class = false; 2862 } 2863 } 2864 else if (syntax & RE_CHAR_CLASSES && c == '[' && (*p == '=' 2865 || *p == '.')) 2866 { 2867 CHAR_T str[128]; /* Should be large enough. */ 2868 CHAR_T delim = *p; /* '=' or '.' */ 2869# ifdef _LIBC 2870 uint32_t nrules = 2871 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 2872# endif 2873 PATFETCH (c); 2874 c1 = 0; 2875 2876 /* If pattern is `[[=' or '[[.'. */ 2877 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 2878 2879 for (;;) 2880 { 2881 PATFETCH (c); 2882 if ((c == delim && *p == ']') || p == pend) 2883 break; 2884 if (c1 < sizeof (str) - 1) 2885 str[c1++] = c; 2886 else 2887 /* This is in any case an invalid class name. */ 2888 str[0] = '\0'; 2889 } 2890 str[c1] = '\0'; 2891 2892 if (c == delim && *p == ']' && str[0] != '\0') 2893 { 2894 unsigned int i, offset; 2895 /* If we have no collation data we use the default 2896 collation in which each character is in a class 2897 by itself. 
It also means that ASCII is the 2898 character set and therefore we cannot have character 2899 with more than one byte in the multibyte 2900 representation. */ 2901 2902 /* If not defined _LIBC, we push the name and 2903 `\0' for the sake of matching performance. */ 2904 int datasize = c1 + 1; 2905 2906# ifdef _LIBC 2907 int32_t idx = 0; 2908 if (nrules == 0) 2909# endif 2910 { 2911 if (c1 != 1) 2912 FREE_STACK_RETURN (REG_ECOLLATE); 2913 } 2914# ifdef _LIBC 2915 else 2916 { 2917 const int32_t *table; 2918 const int32_t *weights; 2919 const int32_t *extra; 2920 const int32_t *indirect; 2921 wint_t *cp; 2922 2923 /* This #include defines a local function! */ 2924# include <locale/weightwc.h> 2925 2926 if(delim == '=') 2927 { 2928 /* We push the index for equivalence class. */ 2929 cp = (wint_t*)str; 2930 2931 table = (const int32_t *) 2932 _NL_CURRENT (LC_COLLATE, 2933 _NL_COLLATE_TABLEWC); 2934 weights = (const int32_t *) 2935 _NL_CURRENT (LC_COLLATE, 2936 _NL_COLLATE_WEIGHTWC); 2937 extra = (const int32_t *) 2938 _NL_CURRENT (LC_COLLATE, 2939 _NL_COLLATE_EXTRAWC); 2940 indirect = (const int32_t *) 2941 _NL_CURRENT (LC_COLLATE, 2942 _NL_COLLATE_INDIRECTWC); 2943 2944 idx = findidx ((const wint_t**)&cp); 2945 if (idx == 0 || cp < (wint_t*) str + c1) 2946 /* This is no valid character. */ 2947 FREE_STACK_RETURN (REG_ECOLLATE); 2948 2949 str[0] = (wchar_t)idx; 2950 } 2951 else /* delim == '.' */ 2952 { 2953 /* We push collation sequence value 2954 for collating symbol. */ 2955 int32_t table_size; 2956 const int32_t *symb_table; 2957 const unsigned char *extra; 2958 int32_t idx; 2959 int32_t elem; 2960 int32_t second; 2961 int32_t hash; 2962 char char_str[c1]; 2963 2964 /* We have to convert the name to a single-byte 2965 string. This is possible since the names 2966 consist of ASCII characters and the internal 2967 representation is UCS4. 
*/ 2968 for (i = 0; i < c1; ++i) 2969 char_str[i] = str[i]; 2970 2971 table_size = 2972 _NL_CURRENT_WORD (LC_COLLATE, 2973 _NL_COLLATE_SYMB_HASH_SIZEMB); 2974 symb_table = (const int32_t *) 2975 _NL_CURRENT (LC_COLLATE, 2976 _NL_COLLATE_SYMB_TABLEMB); 2977 extra = (const unsigned char *) 2978 _NL_CURRENT (LC_COLLATE, 2979 _NL_COLLATE_SYMB_EXTRAMB); 2980 2981 /* Locate the character in the hashing table. */ 2982 hash = elem_hash (char_str, c1); 2983 2984 idx = 0; 2985 elem = hash % table_size; 2986 second = hash % (table_size - 2); 2987 while (symb_table[2 * elem] != 0) 2988 { 2989 /* First compare the hashing value. */ 2990 if (symb_table[2 * elem] == hash 2991 && c1 == extra[symb_table[2 * elem + 1]] 2992 && memcmp (char_str, 2993 &extra[symb_table[2 * elem + 1] 2994 + 1], c1) == 0) 2995 { 2996 /* Yep, this is the entry. */ 2997 idx = symb_table[2 * elem + 1]; 2998 idx += 1 + extra[idx]; 2999 break; 3000 } 3001 3002 /* Next entry. */ 3003 elem += second; 3004 } 3005 3006 if (symb_table[2 * elem] != 0) 3007 { 3008 /* Compute the index of the byte sequence 3009 in the table. */ 3010 idx += 1 + extra[idx]; 3011 /* Adjust for the alignment. */ 3012 idx = (idx + 3) & ~3; 3013 3014 str[0] = (wchar_t) idx + 4; 3015 } 3016 else if (symb_table[2 * elem] == 0 && c1 == 1) 3017 { 3018 /* No valid character. Match it as a 3019 single byte character. */ 3020 had_char_class = false; 3021 BUF_PUSH(str[0]); 3022 /* Update the length of characters */ 3023 laststart[5]++; 3024 range_start = str[0]; 3025 3026 /* Throw away the ] at the end of the 3027 collating symbol. */ 3028 PATFETCH (c); 3029 /* exit from the switch block. */ 3030 continue; 3031 } 3032 else 3033 FREE_STACK_RETURN (REG_ECOLLATE); 3034 } 3035 datasize = 1; 3036 } 3037# endif 3038 /* Throw away the ] at the end of the equivalence 3039 class (or collating symbol). */ 3040 PATFETCH (c); 3041 3042 /* Allocate the space for the equivalence class 3043 (or collating symbol) (and '\0' if needed). 
*/ 3044 GET_BUFFER_SPACE(datasize); 3045 /* Update the pointer to indicate end of buffer. */ 3046 b += datasize; 3047 3048 if (delim == '=') 3049 { /* equivalence class */ 3050 /* Calculate the offset of char_ranges, 3051 which is next to equivalence_classes. */ 3052 offset = laststart[1] + laststart[2] 3053 + laststart[3] +6; 3054 /* Insert space. */ 3055 insert_space(datasize, laststart + offset, b - 1); 3056 3057 /* Write the equivalence_class and \0. */ 3058 for (i = 0 ; i < datasize ; i++) 3059 laststart[offset + i] = str[i]; 3060 3061 /* Update the length of equivalence_classes. */ 3062 laststart[3] += datasize; 3063 had_char_class = true; 3064 } 3065 else /* delim == '.' */ 3066 { /* collating symbol */ 3067 /* Calculate the offset of the equivalence_classes, 3068 which is next to collating_symbols. */ 3069 offset = laststart[1] + laststart[2] + 6; 3070 /* Insert space and write the collationg_symbol 3071 and \0. */ 3072 insert_space(datasize, laststart + offset, b-1); 3073 for (i = 0 ; i < datasize ; i++) 3074 laststart[offset + i] = str[i]; 3075 3076 /* In re_match_2_internal if range_start < -1, we 3077 assume -range_start is the offset of the 3078 collating symbol which is specified as 3079 the character of the range start. So we assign 3080 -(laststart[1] + laststart[2] + 6) to 3081 range_start. */ 3082 range_start = -(laststart[1] + laststart[2] + 6); 3083 /* Update the length of collating_symbol. 
*/ 3084 laststart[2] += datasize; 3085 had_char_class = false; 3086 } 3087 } 3088 else 3089 { 3090 c1++; 3091 while (c1--) 3092 PATUNFETCH; 3093 BUF_PUSH ('['); 3094 BUF_PUSH (delim); 3095 laststart[5] += 2; /* Update the length of characters */ 3096 range_start = delim; 3097 had_char_class = false; 3098 } 3099 } 3100 else 3101 { 3102 had_char_class = false; 3103 BUF_PUSH(c); 3104 laststart[5]++; /* Update the length of characters */ 3105 range_start = c; 3106 } 3107 } 3108 3109#else /* BYTE */ 3110 /* Ensure that we have enough space to push a charset: the 3111 opcode, the length count, and the bitset; 34 bytes in all. */ 3112 GET_BUFFER_SPACE (34); 3113 3114 laststart = b; 3115 3116 /* We test `*p == '^' twice, instead of using an if 3117 statement, so we only need one BUF_PUSH. */ 3118 BUF_PUSH (*p == '^' ? charset_not : charset); 3119 if (*p == '^') 3120 p++; 3121 3122 /* Remember the first position in the bracket expression. */ 3123 p1 = p; 3124 3125 /* Push the number of bytes in the bitmap. */ 3126 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH); 3127 3128 /* Clear the whole map. */ 3129 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH); 3130 3131 /* charset_not matches newline according to a syntax bit. */ 3132 if ((re_opcode_t) b[-2] == charset_not 3133 && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) 3134 SET_LIST_BIT ('\n'); 3135 3136 /* Read in characters and ranges, setting map bits. */ 3137 for (;;) 3138 { 3139 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 3140 3141 PATFETCH (c); 3142 3143 /* \ might escape characters inside [...] and [^...]. */ 3144 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') 3145 { 3146 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); 3147 3148 PATFETCH (c1); 3149 SET_LIST_BIT (c1); 3150 range_start = c1; 3151 continue; 3152 } 3153 3154 /* Could be the end of the bracket expression. If it's 3155 not (i.e., when the bracket expression is `[]' so 3156 far), the ']' character bit gets set way below. 
*/ 3157 if (c == ']' && p != p1 + 1) 3158 break; 3159 3160 /* Look ahead to see if it's a range when the last thing 3161 was a character class. */ 3162 if (had_char_class && c == '-' && *p != ']') 3163 FREE_STACK_RETURN (REG_ERANGE); 3164 3165 /* Look ahead to see if it's a range when the last thing 3166 was a character: if this is a hyphen not at the 3167 beginning or the end of a list, then it's the range 3168 operator. */ 3169 if (c == '-' 3170 && !(p - 2 >= pattern && p[-2] == '[') 3171 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') 3172 && *p != ']') 3173 { 3174 reg_errcode_t ret 3175 = byte_compile_range (range_start, &p, pend, translate, 3176 syntax, b); 3177 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); 3178 range_start = 0xffffffff; 3179 } 3180 3181 else if (p[0] == '-' && p[1] != ']') 3182 { /* This handles ranges made up of characters only. */ 3183 reg_errcode_t ret; 3184 3185 /* Move past the `-'. */ 3186 PATFETCH (c1); 3187 3188 ret = byte_compile_range (c, &p, pend, translate, syntax, b); 3189 if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); 3190 range_start = 0xffffffff; 3191 } 3192 3193 /* See if we're at the beginning of a possible character 3194 class. */ 3195 3196 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') 3197 { /* Leave room for the null. */ 3198 char str[CHAR_CLASS_MAX_LENGTH + 1]; 3199 3200 PATFETCH (c); 3201 c1 = 0; 3202 3203 /* If pattern is `[[:'. */ 3204 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 3205 3206 for (;;) 3207 { 3208 PATFETCH (c); 3209 if ((c == ':' && *p == ']') || p == pend) 3210 break; 3211 if (c1 < CHAR_CLASS_MAX_LENGTH) 3212 str[c1++] = c; 3213 else 3214 /* This is in any case an invalid class name. */ 3215 str[0] = '\0'; 3216 } 3217 str[c1] = '\0'; 3218 3219 /* If isn't a word bracketed by `[:' and `:]': 3220 undo the ending character, the letters, and leave 3221 the leading `:' and `[' (but set bits for them). 
*/ 3222 if (c == ':' && *p == ']') 3223 { 3224# if defined _LIBC || WIDE_CHAR_SUPPORT 3225 boolean is_lower = STREQ (str, "lower"); 3226 boolean is_upper = STREQ (str, "upper"); 3227 wctype_t wt; 3228 int ch; 3229 3230 wt = IS_CHAR_CLASS (str); 3231 if (wt == 0) 3232 FREE_STACK_RETURN (REG_ECTYPE); 3233 3234 /* Throw away the ] at the end of the character 3235 class. */ 3236 PATFETCH (c); 3237 3238 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 3239 3240 for (ch = 0; ch < 1 << BYTEWIDTH; ++ch) 3241 { 3242 if (iswctype (btowc (ch), wt)) 3243 SET_LIST_BIT (ch); 3244 3245 if (translate && (is_upper || is_lower) 3246 && (ISUPPER (ch) || ISLOWER (ch))) 3247 SET_LIST_BIT (ch); 3248 } 3249 3250 had_char_class = true; 3251# else 3252 int ch; 3253 boolean is_alnum = STREQ (str, "alnum"); 3254 boolean is_alpha = STREQ (str, "alpha"); 3255 boolean is_blank = STREQ (str, "blank"); 3256 boolean is_cntrl = STREQ (str, "cntrl"); 3257 boolean is_digit = STREQ (str, "digit"); 3258 boolean is_graph = STREQ (str, "graph"); 3259 boolean is_lower = STREQ (str, "lower"); 3260 boolean is_print = STREQ (str, "print"); 3261 boolean is_punct = STREQ (str, "punct"); 3262 boolean is_space = STREQ (str, "space"); 3263 boolean is_upper = STREQ (str, "upper"); 3264 boolean is_xdigit = STREQ (str, "xdigit"); 3265 3266 if (!IS_CHAR_CLASS (str)) 3267 FREE_STACK_RETURN (REG_ECTYPE); 3268 3269 /* Throw away the ] at the end of the character 3270 class. */ 3271 PATFETCH (c); 3272 3273 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 3274 3275 for (ch = 0; ch < 1 << BYTEWIDTH; ch++) 3276 { 3277 /* This was split into 3 if's to 3278 avoid an arbitrary limit in some compiler. 
*/ 3279 if ( (is_alnum && ISALNUM (ch)) 3280 || (is_alpha && ISALPHA (ch)) 3281 || (is_blank && ISBLANK (ch)) 3282 || (is_cntrl && ISCNTRL (ch))) 3283 SET_LIST_BIT (ch); 3284 if ( (is_digit && ISDIGIT (ch)) 3285 || (is_graph && ISGRAPH (ch)) 3286 || (is_lower && ISLOWER (ch)) 3287 || (is_print && ISPRINT (ch))) 3288 SET_LIST_BIT (ch); 3289 if ( (is_punct && ISPUNCT (ch)) 3290 || (is_space && ISSPACE (ch)) 3291 || (is_upper && ISUPPER (ch)) 3292 || (is_xdigit && ISXDIGIT (ch))) 3293 SET_LIST_BIT (ch); 3294 if ( translate && (is_upper || is_lower) 3295 && (ISUPPER (ch) || ISLOWER (ch))) 3296 SET_LIST_BIT (ch); 3297 } 3298 had_char_class = true; 3299# endif /* libc || wctype.h */ 3300 } 3301 else 3302 { 3303 c1++; 3304 while (c1--) 3305 PATUNFETCH; 3306 SET_LIST_BIT ('['); 3307 SET_LIST_BIT (':'); 3308 range_start = ':'; 3309 had_char_class = false; 3310 } 3311 } 3312 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '=') 3313 { 3314 unsigned char str[MB_LEN_MAX + 1]; 3315# ifdef _LIBC 3316 uint32_t nrules = 3317 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3318# endif 3319 3320 PATFETCH (c); 3321 c1 = 0; 3322 3323 /* If pattern is `[[='. */ 3324 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 3325 3326 for (;;) 3327 { 3328 PATFETCH (c); 3329 if ((c == '=' && *p == ']') || p == pend) 3330 break; 3331 if (c1 < MB_LEN_MAX) 3332 str[c1++] = c; 3333 else 3334 /* This is in any case an invalid class name. */ 3335 str[0] = '\0'; 3336 } 3337 str[c1] = '\0'; 3338 3339 if (c == '=' && *p == ']' && str[0] != '\0') 3340 { 3341 /* If we have no collation data we use the default 3342 collation in which each character is in a class 3343 by itself. It also means that ASCII is the 3344 character set and therefore we cannot have character 3345 with more than one byte in the multibyte 3346 representation. 
*/ 3347# ifdef _LIBC 3348 if (nrules == 0) 3349# endif 3350 { 3351 if (c1 != 1) 3352 FREE_STACK_RETURN (REG_ECOLLATE); 3353 3354 /* Throw away the ] at the end of the equivalence 3355 class. */ 3356 PATFETCH (c); 3357 3358 /* Set the bit for the character. */ 3359 SET_LIST_BIT (str[0]); 3360 } 3361# ifdef _LIBC 3362 else 3363 { 3364 /* Try to match the byte sequence in `str' against 3365 those known to the collate implementation. 3366 First find out whether the bytes in `str' are 3367 actually from exactly one character. */ 3368 const int32_t *table; 3369 const unsigned char *weights; 3370 const unsigned char *extra; 3371 const int32_t *indirect; 3372 int32_t idx; 3373 const unsigned char *cp = str; 3374 int ch; 3375 3376 /* This #include defines a local function! */ 3377# include <locale/weight.h> 3378 3379 table = (const int32_t *) 3380 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); 3381 weights = (const unsigned char *) 3382 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB); 3383 extra = (const unsigned char *) 3384 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB); 3385 indirect = (const int32_t *) 3386 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB); 3387 3388 idx = findidx (&cp); 3389 if (idx == 0 || cp < str + c1) 3390 /* This is no valid character. */ 3391 FREE_STACK_RETURN (REG_ECOLLATE); 3392 3393 /* Throw away the ] at the end of the equivalence 3394 class. */ 3395 PATFETCH (c); 3396 3397 /* Now we have to go throught the whole table 3398 and find all characters which have the same 3399 first level weight. 3400 3401 XXX Note that this is not entirely correct. 3402 we would have to match multibyte sequences 3403 but this is not possible with the current 3404 implementation. */ 3405 for (ch = 1; ch < 256; ++ch) 3406 /* XXX This test would have to be changed if we 3407 would allow matching multibyte sequences. */ 3408 if (table[ch] > 0) 3409 { 3410 int32_t idx2 = table[ch]; 3411 size_t len = weights[idx2]; 3412 3413 /* Test whether the lenghts match. 
*/ 3414 if (weights[idx] == len) 3415 { 3416 /* They do. New compare the bytes of 3417 the weight. */ 3418 size_t cnt = 0; 3419 3420 while (cnt < len 3421 && (weights[idx + 1 + cnt] 3422 == weights[idx2 + 1 + cnt])) 3423 ++cnt; 3424 3425 if (cnt == len) 3426 /* They match. Mark the character as 3427 acceptable. */ 3428 SET_LIST_BIT (ch); 3429 } 3430 } 3431 } 3432# endif 3433 had_char_class = true; 3434 } 3435 else 3436 { 3437 c1++; 3438 while (c1--) 3439 PATUNFETCH; 3440 SET_LIST_BIT ('['); 3441 SET_LIST_BIT ('='); 3442 range_start = '='; 3443 had_char_class = false; 3444 } 3445 } 3446 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '.') 3447 { 3448 unsigned char str[128]; /* Should be large enough. */ 3449# ifdef _LIBC 3450 uint32_t nrules = 3451 _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 3452# endif 3453 3454 PATFETCH (c); 3455 c1 = 0; 3456 3457 /* If pattern is `[[.'. */ 3458 if (p == pend) FREE_STACK_RETURN (REG_EBRACK); 3459 3460 for (;;) 3461 { 3462 PATFETCH (c); 3463 if ((c == '.' && *p == ']') || p == pend) 3464 break; 3465 if (c1 < sizeof (str)) 3466 str[c1++] = c; 3467 else 3468 /* This is in any case an invalid class name. */ 3469 str[0] = '\0'; 3470 } 3471 str[c1] = '\0'; 3472 3473 if (c == '.' && *p == ']' && str[0] != '\0') 3474 { 3475 /* If we have no collation data we use the default 3476 collation in which each character is the name 3477 for its own class which contains only the one 3478 character. It also means that ASCII is the 3479 character set and therefore we cannot have character 3480 with more than one byte in the multibyte 3481 representation. */ 3482# ifdef _LIBC 3483 if (nrules == 0) 3484# endif 3485 { 3486 if (c1 != 1) 3487 FREE_STACK_RETURN (REG_ECOLLATE); 3488 3489 /* Throw away the ] at the end of the equivalence 3490 class. */ 3491 PATFETCH (c); 3492 3493 /* Set the bit for the character. 
*/ 3494 SET_LIST_BIT (str[0]); 3495 range_start = ((const unsigned char *) str)[0]; 3496 } 3497# ifdef _LIBC 3498 else 3499 { 3500 /* Try to match the byte sequence in `str' against 3501 those known to the collate implementation. 3502 First find out whether the bytes in `str' are 3503 actually from exactly one character. */ 3504 int32_t table_size; 3505 const int32_t *symb_table; 3506 const unsigned char *extra; 3507 int32_t idx; 3508 int32_t elem; 3509 int32_t second; 3510 int32_t hash; 3511 3512 table_size = 3513 _NL_CURRENT_WORD (LC_COLLATE, 3514 _NL_COLLATE_SYMB_HASH_SIZEMB); 3515 symb_table = (const int32_t *) 3516 _NL_CURRENT (LC_COLLATE, 3517 _NL_COLLATE_SYMB_TABLEMB); 3518 extra = (const unsigned char *) 3519 _NL_CURRENT (LC_COLLATE, 3520 _NL_COLLATE_SYMB_EXTRAMB); 3521 3522 /* Locate the character in the hashing table. */ 3523 hash = elem_hash (str, c1); 3524 3525 idx = 0; 3526 elem = hash % table_size; 3527 second = hash % (table_size - 2); 3528 while (symb_table[2 * elem] != 0) 3529 { 3530 /* First compare the hashing value. */ 3531 if (symb_table[2 * elem] == hash 3532 && c1 == extra[symb_table[2 * elem + 1]] 3533 && memcmp (str, 3534 &extra[symb_table[2 * elem + 1] 3535 + 1], 3536 c1) == 0) 3537 { 3538 /* Yep, this is the entry. */ 3539 idx = symb_table[2 * elem + 1]; 3540 idx += 1 + extra[idx]; 3541 break; 3542 } 3543 3544 /* Next entry. */ 3545 elem += second; 3546 } 3547 3548 if (symb_table[2 * elem] == 0) 3549 /* This is no valid character. */ 3550 FREE_STACK_RETURN (REG_ECOLLATE); 3551 3552 /* Throw away the ] at the end of the equivalence 3553 class. */ 3554 PATFETCH (c); 3555 3556 /* Now add the multibyte character(s) we found 3557 to the accept list. 3558 3559 XXX Note that this is not entirely correct. 3560 we would have to match multibyte sequences 3561 but this is not possible with the current 3562 implementation. 
Also, we have to match 3563 collating symbols, which expand to more than 3564 one file, as a whole and not allow the 3565 individual bytes. */ 3566 c1 = extra[idx++]; 3567 if (c1 == 1) 3568 range_start = extra[idx]; 3569 while (c1-- > 0) 3570 { 3571 SET_LIST_BIT (extra[idx]); 3572 ++idx; 3573 } 3574 } 3575# endif 3576 had_char_class = false; 3577 } 3578 else 3579 { 3580 c1++; 3581 while (c1--) 3582 PATUNFETCH; 3583 SET_LIST_BIT ('['); 3584 SET_LIST_BIT ('.'); 3585 range_start = '.'; 3586 had_char_class = false; 3587 } 3588 } 3589 else 3590 { 3591 had_char_class = false; 3592 SET_LIST_BIT (c); 3593 range_start = c; 3594 } 3595 } 3596 3597 /* Discard any (non)matching list bytes that are all 0 at the 3598 end of the map. Decrease the map-length byte too. */ 3599 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0) 3600 b[-1]--; 3601 b += b[-1]; 3602#endif /* WCHAR */ 3603 } 3604 break; 3605 3606 3607 case '(': 3608 if (syntax & RE_NO_BK_PARENS) 3609 goto handle_open; 3610 else 3611 goto normal_char; 3612 3613 3614 case ')': 3615 if (syntax & RE_NO_BK_PARENS) 3616 goto handle_close; 3617 else 3618 goto normal_char; 3619 3620 3621 case '\n': 3622 if (syntax & RE_NEWLINE_ALT) 3623 goto handle_alt; 3624 else 3625 goto normal_char; 3626 3627 3628 case '|': 3629 if (syntax & RE_NO_BK_VBAR) 3630 goto handle_alt; 3631 else 3632 goto normal_char; 3633 3634 3635 case '{': 3636 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES) 3637 goto handle_interval; 3638 else 3639 goto normal_char; 3640 3641 3642 case '\\': 3643 if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); 3644 3645 /* Do not translate the character after the \, so that we can 3646 distinguish, e.g., \B from \b, even if we normally would 3647 translate, e.g., B to b. 
*/ 3648 PATFETCH_RAW (c); 3649 3650 switch (c) 3651 { 3652 case '(': 3653 if (syntax & RE_NO_BK_PARENS) 3654 goto normal_backslash; 3655 3656 handle_open: 3657 bufp->re_nsub++; 3658 regnum++; 3659 3660 if (COMPILE_STACK_FULL) 3661 { 3662 RETALLOC (compile_stack.stack, compile_stack.size << 1, 3663 compile_stack_elt_t); 3664 if (compile_stack.stack == NULL) return REG_ESPACE; 3665 3666 compile_stack.size <<= 1; 3667 } 3668 3669 /* These are the values to restore when we hit end of this 3670 group. They are all relative offsets, so that if the 3671 whole pattern moves because of realloc, they will still 3672 be valid. */ 3673 COMPILE_STACK_TOP.begalt_offset = begalt - COMPILED_BUFFER_VAR; 3674 COMPILE_STACK_TOP.fixup_alt_jump 3675 = fixup_alt_jump ? fixup_alt_jump - COMPILED_BUFFER_VAR + 1 : 0; 3676 COMPILE_STACK_TOP.laststart_offset = b - COMPILED_BUFFER_VAR; 3677 COMPILE_STACK_TOP.regnum = regnum; 3678 3679 /* We will eventually replace the 0 with the number of 3680 groups inner to this one. But do not push a 3681 start_memory for groups beyond the last one we can 3682 represent in the compiled pattern. */ 3683 if (regnum <= MAX_REGNUM) 3684 { 3685 COMPILE_STACK_TOP.inner_group_offset = b 3686 - COMPILED_BUFFER_VAR + 2; 3687 BUF_PUSH_3 (start_memory, regnum, 0); 3688 } 3689 3690 compile_stack.avail++; 3691 3692 fixup_alt_jump = 0; 3693 laststart = 0; 3694 begalt = b; 3695 /* If we've reached MAX_REGNUM groups, then this open 3696 won't actually generate any code, so we'll have to 3697 clear pending_exact explicitly. 
*/ 3698 pending_exact = 0; 3699 break; 3700 3701 3702 case ')': 3703 if (syntax & RE_NO_BK_PARENS) goto normal_backslash; 3704 3705 if (COMPILE_STACK_EMPTY) 3706 { 3707 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) 3708 goto normal_backslash; 3709 else 3710 FREE_STACK_RETURN (REG_ERPAREN); 3711 } 3712 3713 handle_close: 3714 if (fixup_alt_jump) 3715 { /* Push a dummy failure point at the end of the 3716 alternative for a possible future 3717 `pop_failure_jump' to pop. See comments at 3718 `push_dummy_failure' in `re_match_2'. */ 3719 BUF_PUSH (push_dummy_failure); 3720 3721 /* We allocated space for this jump when we assigned 3722 to `fixup_alt_jump', in the `handle_alt' case below. */ 3723 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1); 3724 } 3725 3726 /* See similar code for backslashed left paren above. */ 3727 if (COMPILE_STACK_EMPTY) 3728 { 3729 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) 3730 goto normal_char; 3731 else 3732 FREE_STACK_RETURN (REG_ERPAREN); 3733 } 3734 3735 /* Since we just checked for an empty stack above, this 3736 ``can't happen''. */ 3737 assert (compile_stack.avail != 0); 3738 { 3739 /* We don't just want to restore into `regnum', because 3740 later groups should continue to be numbered higher, 3741 as in `(ab)c(de)' -- the second group is #2. */ 3742 regnum_t this_group_regnum; 3743 3744 compile_stack.avail--; 3745 begalt = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.begalt_offset; 3746 fixup_alt_jump 3747 = COMPILE_STACK_TOP.fixup_alt_jump 3748 ? COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.fixup_alt_jump - 1 3749 : 0; 3750 laststart = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.laststart_offset; 3751 this_group_regnum = COMPILE_STACK_TOP.regnum; 3752 /* If we've reached MAX_REGNUM groups, then this open 3753 won't actually generate any code, so we'll have to 3754 clear pending_exact explicitly. */ 3755 pending_exact = 0; 3756 3757 /* We're at the end of the group, so now we know how many 3758 groups were inside this one. 
*/ 3759 if (this_group_regnum <= MAX_REGNUM) 3760 { 3761 UCHAR_T *inner_group_loc 3762 = COMPILED_BUFFER_VAR + COMPILE_STACK_TOP.inner_group_offset; 3763 3764 *inner_group_loc = regnum - this_group_regnum; 3765 BUF_PUSH_3 (stop_memory, this_group_regnum, 3766 regnum - this_group_regnum); 3767 } 3768 } 3769 break; 3770 3771 3772 case '|': /* `\|'. */ 3773 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR) 3774 goto normal_backslash; 3775 handle_alt: 3776 if (syntax & RE_LIMITED_OPS) 3777 goto normal_char; 3778 3779 /* Insert before the previous alternative a jump which 3780 jumps to this alternative if the former fails. */ 3781 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE); 3782 INSERT_JUMP (on_failure_jump, begalt, 3783 b + 2 + 2 * OFFSET_ADDRESS_SIZE); 3784 pending_exact = 0; 3785 b += 1 + OFFSET_ADDRESS_SIZE; 3786 3787 /* The alternative before this one has a jump after it 3788 which gets executed if it gets matched. Adjust that 3789 jump so it will jump to this alternative's analogous 3790 jump (put in below, which in turn will jump to the next 3791 (if any) alternative's such jump, etc.). The last such 3792 jump jumps to the correct final destination. A picture: 3793 _____ _____ 3794 | | | | 3795 | v | v 3796 a | b | c 3797 3798 If we are at `b', then fixup_alt_jump right now points to a 3799 three-byte space after `a'. We'll put in the jump, set 3800 fixup_alt_jump to right after `b', and leave behind three 3801 bytes which we'll fill in when we get to after `c'. */ 3802 3803 if (fixup_alt_jump) 3804 STORE_JUMP (jump_past_alt, fixup_alt_jump, b); 3805 3806 /* Mark and leave space for a jump after this alternative, 3807 to be filled in later either by next alternative or 3808 when know we're at the end of a series of alternatives. */ 3809 fixup_alt_jump = b; 3810 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE); 3811 b += 1 + OFFSET_ADDRESS_SIZE; 3812 3813 laststart = 0; 3814 begalt = b; 3815 break; 3816 3817 3818 case '{': 3819 /* If \{ is a literal. 
*/ 3820 if (!(syntax & RE_INTERVALS) 3821 /* If we're at `\{' and it's not the open-interval 3822 operator. */ 3823 || (syntax & RE_NO_BK_BRACES)) 3824 goto normal_backslash; 3825 3826 handle_interval: 3827 { 3828 /* If got here, then the syntax allows intervals. */ 3829 3830 /* At least (most) this many matches must be made. */ 3831 int lower_bound = -1, upper_bound = -1; 3832 3833 /* Place in the uncompiled pattern (i.e., just after 3834 the '{') to go back to if the interval is invalid. */ 3835 const CHAR_T *beg_interval = p; 3836 3837 if (p == pend) 3838 goto invalid_interval; 3839 3840 GET_UNSIGNED_NUMBER (lower_bound); 3841 3842 if (c == ',') 3843 { 3844 GET_UNSIGNED_NUMBER (upper_bound); 3845 if (upper_bound < 0) 3846 upper_bound = RE_DUP_MAX; 3847 } 3848 else 3849 /* Interval such as `{1}' => match exactly once. */ 3850 upper_bound = lower_bound; 3851 3852 if (! (0 <= lower_bound && lower_bound <= upper_bound)) 3853 goto invalid_interval; 3854 3855 if (!(syntax & RE_NO_BK_BRACES)) 3856 { 3857 if (c != '\\' || p == pend) 3858 goto invalid_interval; 3859 PATFETCH (c); 3860 } 3861 3862 if (c != '}') 3863 goto invalid_interval; 3864 3865 /* If it's invalid to have no preceding re. */ 3866 if (!laststart) 3867 { 3868 if (syntax & RE_CONTEXT_INVALID_OPS 3869 && !(syntax & RE_INVALID_INTERVAL_ORD)) 3870 FREE_STACK_RETURN (REG_BADRPT); 3871 else if (syntax & RE_CONTEXT_INDEP_OPS) 3872 laststart = b; 3873 else 3874 goto unfetch_interval; 3875 } 3876 3877 /* We just parsed a valid interval. */ 3878 3879 if (RE_DUP_MAX < upper_bound) 3880 FREE_STACK_RETURN (REG_BADBR); 3881 3882 /* If the upper bound is zero, don't want to succeed at 3883 all; jump from `laststart' to `b + 3', which will be 3884 the end of the buffer after we insert the jump. */ 3885 /* ifdef WCHAR, 'b + 1 + OFFSET_ADDRESS_SIZE' 3886 instead of 'b + 3'. 
*/ 3887 if (upper_bound == 0) 3888 { 3889 GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE); 3890 INSERT_JUMP (jump, laststart, b + 1 3891 + OFFSET_ADDRESS_SIZE); 3892 b += 1 + OFFSET_ADDRESS_SIZE; 3893 } 3894 3895 /* Otherwise, we have a nontrivial interval. When 3896 we're all done, the pattern will look like: 3897 set_number_at <jump count> <upper bound> 3898 set_number_at <succeed_n count> <lower bound> 3899 succeed_n <after jump addr> <succeed_n count> 3900 <body of loop> 3901 jump_n <succeed_n addr> <jump count> 3902 (The upper bound and `jump_n' are omitted if 3903 `upper_bound' is 1, though.) */ 3904 else 3905 { /* If the upper bound is > 1, we need to insert 3906 more at the end of the loop. */ 3907 unsigned nbytes = 2 + 4 * OFFSET_ADDRESS_SIZE + 3908 (upper_bound > 1) * (2 + 4 * OFFSET_ADDRESS_SIZE); 3909 3910 GET_BUFFER_SPACE (nbytes); 3911 3912 /* Initialize lower bound of the `succeed_n', even 3913 though it will be set during matching by its 3914 attendant `set_number_at' (inserted next), 3915 because `re_compile_fastmap' needs to know. 3916 Jump to the `jump_n' we might insert below. */ 3917 INSERT_JUMP2 (succeed_n, laststart, 3918 b + 1 + 2 * OFFSET_ADDRESS_SIZE 3919 + (upper_bound > 1) * (1 + 2 * OFFSET_ADDRESS_SIZE) 3920 , lower_bound); 3921 b += 1 + 2 * OFFSET_ADDRESS_SIZE; 3922 3923 /* Code to initialize the lower bound. Insert 3924 before the `succeed_n'. The `5' is the last two 3925 bytes of this `set_number_at', plus 3 bytes of 3926 the following `succeed_n'. */ 3927 /* ifdef WCHAR, The '1+2*OFFSET_ADDRESS_SIZE' 3928 is the 'set_number_at', plus '1+OFFSET_ADDRESS_SIZE' 3929 of the following `succeed_n'. */ 3930 PREFIX(insert_op2) (set_number_at, laststart, 1 3931 + 2 * OFFSET_ADDRESS_SIZE, lower_bound, b); 3932 b += 1 + 2 * OFFSET_ADDRESS_SIZE; 3933 3934 if (upper_bound > 1) 3935 { /* More than one repetition is allowed, so 3936 append a backward jump to the `succeed_n' 3937 that starts this interval. 
3938 3939 When we've reached this during matching, 3940 we'll have matched the interval once, so 3941 jump back only `upper_bound - 1' times. */ 3942 STORE_JUMP2 (jump_n, b, laststart 3943 + 2 * OFFSET_ADDRESS_SIZE + 1, 3944 upper_bound - 1); 3945 b += 1 + 2 * OFFSET_ADDRESS_SIZE; 3946 3947 /* The location we want to set is the second 3948 parameter of the `jump_n'; that is `b-2' as 3949 an absolute address. `laststart' will be 3950 the `set_number_at' we're about to insert; 3951 `laststart+3' the number to set, the source 3952 for the relative address. But we are 3953 inserting into the middle of the pattern -- 3954 so everything is getting moved up by 5. 3955 Conclusion: (b - 2) - (laststart + 3) + 5, 3956 i.e., b - laststart. 3957 3958 We insert this at the beginning of the loop 3959 so that if we fail during matching, we'll 3960 reinitialize the bounds. */ 3961 PREFIX(insert_op2) (set_number_at, laststart, 3962 b - laststart, 3963 upper_bound - 1, b); 3964 b += 1 + 2 * OFFSET_ADDRESS_SIZE; 3965 } 3966 } 3967 pending_exact = 0; 3968 break; 3969 3970 invalid_interval: 3971 if (!(syntax & RE_INVALID_INTERVAL_ORD)) 3972 FREE_STACK_RETURN (p == pend ? REG_EBRACE : REG_BADBR); 3973 unfetch_interval: 3974 /* Match the characters as literals. */ 3975 p = beg_interval; 3976 c = '{'; 3977 if (syntax & RE_NO_BK_BRACES) 3978 goto normal_char; 3979 else 3980 goto normal_backslash; 3981 } 3982 3983#ifdef emacs 3984 /* There is no way to specify the before_dot and after_dot 3985 operators. rms says this is ok. 
--karl */ 3986 case '=': 3987 BUF_PUSH (at_dot); 3988 break; 3989 3990 case 's': 3991 laststart = b; 3992 PATFETCH (c); 3993 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]); 3994 break; 3995 3996 case 'S': 3997 laststart = b; 3998 PATFETCH (c); 3999 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]); 4000 break; 4001#endif /* emacs */ 4002 4003 4004 case 'w': 4005 if (syntax & RE_NO_GNU_OPS) 4006 goto normal_char; 4007 laststart = b; 4008 BUF_PUSH (wordchar); 4009 break; 4010 4011 4012 case 'W': 4013 if (syntax & RE_NO_GNU_OPS) 4014 goto normal_char; 4015 laststart = b; 4016 BUF_PUSH (notwordchar); 4017 break; 4018 4019 4020 case '<': 4021 if (syntax & RE_NO_GNU_OPS) 4022 goto normal_char; 4023 BUF_PUSH (wordbeg); 4024 break; 4025 4026 case '>': 4027 if (syntax & RE_NO_GNU_OPS) 4028 goto normal_char; 4029 BUF_PUSH (wordend); 4030 break; 4031 4032 case 'b': 4033 if (syntax & RE_NO_GNU_OPS) 4034 goto normal_char; 4035 BUF_PUSH (wordbound); 4036 break; 4037 4038 case 'B': 4039 if (syntax & RE_NO_GNU_OPS) 4040 goto normal_char; 4041 BUF_PUSH (notwordbound); 4042 break; 4043 4044 case '`': 4045 if (syntax & RE_NO_GNU_OPS) 4046 goto normal_char; 4047 BUF_PUSH (begbuf); 4048 break; 4049 4050 case '\'': 4051 if (syntax & RE_NO_GNU_OPS) 4052 goto normal_char; 4053 BUF_PUSH (endbuf); 4054 break; 4055 4056 case '1': case '2': case '3': case '4': case '5': 4057 case '6': case '7': case '8': case '9': 4058 if (syntax & RE_NO_BK_REFS) 4059 goto normal_char; 4060 4061 c1 = c - '0'; 4062 4063 if (c1 > regnum) 4064 FREE_STACK_RETURN (REG_ESUBREG); 4065 4066 /* Can't back reference to a subexpression if inside of it. 
*/ 4067 if (group_in_compile_stack (compile_stack, (regnum_t) c1)) 4068 goto normal_char; 4069 4070 laststart = b; 4071 BUF_PUSH_2 (duplicate, c1); 4072 break; 4073 4074 4075 case '+': 4076 case '?': 4077 if (syntax & RE_BK_PLUS_QM) 4078 goto handle_plus; 4079 else 4080 goto normal_backslash; 4081 4082 default: 4083 normal_backslash: 4084 /* You might think it would be useful for \ to mean 4085 not to translate; but if we don't translate it 4086 it will never match anything. */ 4087 c = TRANSLATE (c); 4088 goto normal_char; 4089 } 4090 break; 4091 4092 4093 default: 4094 /* Expects the character in `c'. */ 4095 normal_char: 4096 /* If no exactn currently being built. */ 4097 if (!pending_exact 4098#ifdef WCHAR 4099 /* If last exactn handle binary(or character) and 4100 new exactn handle character(or binary). */ 4101 || is_exactn_bin != is_binary[p - 1 - pattern] 4102#endif /* WCHAR */ 4103 4104 /* If last exactn not at current position. */ 4105 || pending_exact + *pending_exact + 1 != b 4106 4107 /* We have only one byte following the exactn for the count. */ 4108 || *pending_exact == (1 << BYTEWIDTH) - 1 4109 4110 /* If followed by a repetition operator. */ 4111 || *p == '*' || *p == '^' 4112 || ((syntax & RE_BK_PLUS_QM) 4113 ? *p == '\\' && (p[1] == '+' || p[1] == '?') 4114 : (*p == '+' || *p == '?')) 4115 || ((syntax & RE_INTERVALS) 4116 && ((syntax & RE_NO_BK_BRACES) 4117 ? *p == '{' 4118 : (p[0] == '\\' && p[1] == '{')))) 4119 { 4120 /* Start building a new exactn. */ 4121 4122 laststart = b; 4123 4124#ifdef WCHAR 4125 /* Is this exactn binary data or character? */ 4126 is_exactn_bin = is_binary[p - 1 - pattern]; 4127 if (is_exactn_bin) 4128 BUF_PUSH_2 (exactn_bin, 0); 4129 else 4130 BUF_PUSH_2 (exactn, 0); 4131#else 4132 BUF_PUSH_2 (exactn, 0); 4133#endif /* WCHAR */ 4134 pending_exact = b - 1; 4135 } 4136 4137 BUF_PUSH (c); 4138 (*pending_exact)++; 4139 break; 4140 } /* switch (c) */ 4141 } /* while p != pend */ 4142 4143 4144 /* Through the pattern now. 
*/ 4145 4146 if (fixup_alt_jump) 4147 STORE_JUMP (jump_past_alt, fixup_alt_jump, b); 4148 4149 if (!COMPILE_STACK_EMPTY) 4150 FREE_STACK_RETURN (REG_EPAREN); 4151 4152 /* If we don't want backtracking, force success 4153 the first time we reach the end of the compiled pattern. */ 4154 if (syntax & RE_NO_POSIX_BACKTRACKING) 4155 BUF_PUSH (succeed); 4156 4157#ifdef WCHAR 4158 free (pattern); 4159 free (mbs_offset); 4160 free (is_binary); 4161#endif 4162 free (compile_stack.stack); 4163 4164 /* We have succeeded; set the length of the buffer. */ 4165#ifdef WCHAR 4166 bufp->used = (uintptr_t) b - (uintptr_t) COMPILED_BUFFER_VAR; 4167#else 4168 bufp->used = b - bufp->buffer; 4169#endif 4170 4171#ifdef DEBUG 4172 if (debug) 4173 { 4174 DEBUG_PRINT1 ("\nCompiled pattern: \n"); 4175 PREFIX(print_compiled_pattern) (bufp); 4176 } 4177#endif /* DEBUG */ 4178 4179#ifndef MATCH_MAY_ALLOCATE 4180 /* Initialize the failure stack to the largest possible stack. This 4181 isn't necessary unless we're trying to avoid calling alloca in 4182 the search and match routines. */ 4183 { 4184 int num_regs = bufp->re_nsub + 1; 4185 4186 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size 4187 is strictly greater than re_max_failures, the largest possible stack 4188 is 2 * re_max_failures failure points. */ 4189 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS)) 4190 { 4191 fail_stack.size = (2 * re_max_failures * MAX_FAILURE_ITEMS); 4192 4193# ifdef emacs 4194 if (! fail_stack.stack) 4195 fail_stack.stack 4196 = (PREFIX(fail_stack_elt_t) *) xmalloc (fail_stack.size 4197 * sizeof (PREFIX(fail_stack_elt_t))); 4198 else 4199 fail_stack.stack 4200 = (PREFIX(fail_stack_elt_t) *) xrealloc (fail_stack.stack, 4201 (fail_stack.size 4202 * sizeof (PREFIX(fail_stack_elt_t)))); 4203# else /* not emacs */ 4204 if (! 
fail_stack.stack) 4205 fail_stack.stack 4206 = malloc (fail_stack.size * sizeof (PREFIX(fail_stack_elt_t))); 4207 else 4208 fail_stack.stack 4209 = realloc (fail_stack.stack, 4210 fail_stack.size * sizeof (PREFIX(fail_stack_elt_t))); 4211# endif /* not emacs */ 4212 } 4213 4214 PREFIX(regex_grow_registers) (num_regs); 4215 } 4216#endif /* not MATCH_MAY_ALLOCATE */ 4217 4218 return REG_NOERROR; 4219} /* regex_compile */ 4220 4221/* Subroutines for `regex_compile'. */ 4222 4223/* Store OP at LOC followed by two-byte integer parameter ARG. */ 4224/* ifdef WCHAR, integer parameter is 1 wchar_t. */ 4225 4226static void 4227PREFIX(store_op1) (re_opcode_t op, UCHAR_T *loc, int arg) 4228{ 4229 *loc = (UCHAR_T) op; 4230 STORE_NUMBER (loc + 1, arg); 4231} 4232 4233 4234/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */ 4235/* ifdef WCHAR, integer parameter is 1 wchar_t. */ 4236 4237static void 4238PREFIX(store_op2) (re_opcode_t op, UCHAR_T *loc, int arg1, int arg2) 4239{ 4240 *loc = (UCHAR_T) op; 4241 STORE_NUMBER (loc + 1, arg1); 4242 STORE_NUMBER (loc + 1 + OFFSET_ADDRESS_SIZE, arg2); 4243} 4244 4245 4246/* Copy the bytes from LOC to END to open up three bytes of space at LOC 4247 for OP followed by two-byte integer parameter ARG. */ 4248/* ifdef WCHAR, integer parameter is 1 wchar_t. */ 4249 4250static void 4251PREFIX(insert_op1) (re_opcode_t op, UCHAR_T *loc, int arg, UCHAR_T *end) 4252{ 4253 register UCHAR_T *pfrom = end; 4254 register UCHAR_T *pto = end + 1 + OFFSET_ADDRESS_SIZE; 4255 4256 while (pfrom != loc) 4257 *--pto = *--pfrom; 4258 4259 PREFIX(store_op1) (op, loc, arg); 4260} 4261 4262 4263/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */ 4264/* ifdef WCHAR, integer parameter is 1 wchar_t. 
*/ 4265 4266static void 4267PREFIX(insert_op2) (re_opcode_t op, UCHAR_T *loc, int arg1, int arg2, 4268 UCHAR_T *end) 4269{ 4270 register UCHAR_T *pfrom = end; 4271 register UCHAR_T *pto = end + 1 + 2 * OFFSET_ADDRESS_SIZE; 4272 4273 while (pfrom != loc) 4274 *--pto = *--pfrom; 4275 4276 PREFIX(store_op2) (op, loc, arg1, arg2); 4277} 4278 4279 4280/* P points to just after a ^ in PATTERN. Return true if that ^ comes 4281 after an alternative or a begin-subexpression. We assume there is at 4282 least one character before the ^. */ 4283 4284static boolean 4285PREFIX(at_begline_loc_p) (const CHAR_T *pattern, const CHAR_T *p, 4286 reg_syntax_t syntax) 4287{ 4288 const CHAR_T *prev = p - 2; 4289 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\'; 4290 4291 return 4292 /* After a subexpression? */ 4293 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash)) 4294 /* After an alternative? */ 4295 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash)); 4296} 4297 4298 4299/* The dual of at_begline_loc_p. This one is for $. We assume there is 4300 at least one character after the $, i.e., `P < PEND'. */ 4301 4302static boolean 4303PREFIX(at_endline_loc_p) (const CHAR_T *p, const CHAR_T *pend, 4304 reg_syntax_t syntax) 4305{ 4306 const CHAR_T *next = p; 4307 boolean next_backslash = *next == '\\'; 4308 const CHAR_T *next_next = p + 1 < pend ? p + 1 : 0; 4309 4310 return 4311 /* Before a subexpression? */ 4312 (syntax & RE_NO_BK_PARENS ? *next == ')' 4313 : next_backslash && next_next && *next_next == ')') 4314 /* Before an alternative? */ 4315 || (syntax & RE_NO_BK_VBAR ? *next == '|' 4316 : next_backslash && next_next && *next_next == '|'); 4317} 4318 4319#else /* not INSIDE_RECURSION */ 4320 4321/* Returns true if REGNUM is in one of COMPILE_STACK's elements and 4322 false if it's not. 
*/ 4323 4324static boolean 4325group_in_compile_stack (compile_stack_type compile_stack, 4326 regnum_t regnum) 4327{ 4328 int this_element; 4329 4330 for (this_element = compile_stack.avail - 1; 4331 this_element >= 0; 4332 this_element--) 4333 if (compile_stack.stack[this_element].regnum == regnum) 4334 return true; 4335 4336 return false; 4337} 4338#endif /* not INSIDE_RECURSION */ 4339 4340#ifdef INSIDE_RECURSION 4341 4342#ifdef WCHAR 4343/* This insert space, which size is "num", into the pattern at "loc". 4344 "end" must point the end of the allocated buffer. */ 4345static void 4346insert_space (int num, CHAR_T *loc, CHAR_T *end) 4347{ 4348 register CHAR_T *pto = end; 4349 register CHAR_T *pfrom = end - num; 4350 4351 while (pfrom >= loc) 4352 *pto-- = *pfrom--; 4353} 4354#endif /* WCHAR */ 4355 4356#ifdef WCHAR 4357static reg_errcode_t 4358wcs_compile_range (CHAR_T range_start_char, 4359 const CHAR_T **p_ptr, const CHAR_T *pend, 4360 RE_TRANSLATE_TYPE translate, reg_syntax_t syntax, 4361 CHAR_T *b, CHAR_T *char_set) 4362{ 4363 const CHAR_T *p = *p_ptr; 4364 CHAR_T range_start, range_end; 4365 reg_errcode_t ret; 4366# ifdef _LIBC 4367 uint32_t nrules; 4368 uint32_t start_val, end_val; 4369# endif 4370 if (p == pend) 4371 return REG_ERANGE; 4372 4373# ifdef _LIBC 4374 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 4375 if (nrules != 0) 4376 { 4377 const char *collseq = (const char *) _NL_CURRENT(LC_COLLATE, 4378 _NL_COLLATE_COLLSEQWC); 4379 const unsigned char *extra = (const unsigned char *) 4380 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB); 4381 4382 if (range_start_char < -1) 4383 { 4384 /* range_start is a collating symbol. */ 4385 int32_t *wextra; 4386 /* Retreive the index and get collation sequence value. 
*/ 4387 wextra = (int32_t*)(extra + char_set[-range_start_char]); 4388 start_val = wextra[1 + *wextra]; 4389 } 4390 else 4391 start_val = collseq_table_lookup(collseq, TRANSLATE(range_start_char)); 4392 4393 end_val = collseq_table_lookup (collseq, TRANSLATE (p[0])); 4394 4395 /* Report an error if the range is empty and the syntax prohibits 4396 this. */ 4397 ret = ((syntax & RE_NO_EMPTY_RANGES) 4398 && (start_val > end_val))? REG_ERANGE : REG_NOERROR; 4399 4400 /* Insert space to the end of the char_ranges. */ 4401 insert_space(2, b - char_set[5] - 2, b - 1); 4402 *(b - char_set[5] - 2) = (wchar_t)start_val; 4403 *(b - char_set[5] - 1) = (wchar_t)end_val; 4404 char_set[4]++; /* ranges_index */ 4405 } 4406 else 4407# endif 4408 { 4409 range_start = (range_start_char >= 0)? TRANSLATE (range_start_char): 4410 range_start_char; 4411 range_end = TRANSLATE (p[0]); 4412 /* Report an error if the range is empty and the syntax prohibits 4413 this. */ 4414 ret = ((syntax & RE_NO_EMPTY_RANGES) 4415 && (range_start > range_end))? REG_ERANGE : REG_NOERROR; 4416 4417 /* Insert space to the end of the char_ranges. */ 4418 insert_space(2, b - char_set[5] - 2, b - 1); 4419 *(b - char_set[5] - 2) = range_start; 4420 *(b - char_set[5] - 1) = range_end; 4421 char_set[4]++; /* ranges_index */ 4422 } 4423 /* Have to increment the pointer into the pattern string, so the 4424 caller isn't still at the ending character. */ 4425 (*p_ptr)++; 4426 4427 return ret; 4428} 4429#else /* BYTE */ 4430/* Read the ending character of a range (in a bracket expression) from the 4431 uncompiled pattern *P_PTR (which ends at PEND). We assume the 4432 starting character is in `P[-2]'. (`P[-1]' is the character `-'.) 4433 Then we set the translation of all bits between the starting and 4434 ending characters (inclusive) in the compiled pattern B. 4435 4436 Return an error code. 4437 4438 We use these short variable names so we can use the same macros as 4439 `regex_compile' itself. 
*/ 4440 4441static reg_errcode_t 4442byte_compile_range (unsigned int range_start_char, 4443 const char **p_ptr, const char *pend, 4444 RE_TRANSLATE_TYPE translate, reg_syntax_t syntax, 4445 unsigned char *b) 4446{ 4447 unsigned this_char; 4448 const char *p = *p_ptr; 4449 reg_errcode_t ret; 4450# if _LIBC 4451 const unsigned char *collseq; 4452 unsigned int start_colseq; 4453 unsigned int end_colseq; 4454# else 4455 unsigned end_char; 4456# endif 4457 4458 if (p == pend) 4459 return REG_ERANGE; 4460 4461 /* Have to increment the pointer into the pattern string, so the 4462 caller isn't still at the ending character. */ 4463 (*p_ptr)++; 4464 4465 /* Report an error if the range is empty and the syntax prohibits this. */ 4466 ret = syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR; 4467 4468# if _LIBC 4469 collseq = (const unsigned char *) _NL_CURRENT (LC_COLLATE, 4470 _NL_COLLATE_COLLSEQMB); 4471 4472 start_colseq = collseq[(unsigned char) TRANSLATE (range_start_char)]; 4473 end_colseq = collseq[(unsigned char) TRANSLATE (p[0])]; 4474 for (this_char = 0; this_char <= (unsigned char) -1; ++this_char) 4475 { 4476 unsigned int this_colseq = collseq[(unsigned char) TRANSLATE (this_char)]; 4477 4478 if (start_colseq <= this_colseq && this_colseq <= end_colseq) 4479 { 4480 SET_LIST_BIT (TRANSLATE (this_char)); 4481 ret = REG_NOERROR; 4482 } 4483 } 4484# else 4485 /* Here we see why `this_char' has to be larger than an `unsigned 4486 char' -- we would otherwise go into an infinite loop, since all 4487 characters <= 0xff. */ 4488 range_start_char = TRANSLATE (range_start_char); 4489 /* TRANSLATE(p[0]) is casted to char (not unsigned char) in TRANSLATE, 4490 and some compilers cast it to int implicitly, so following for_loop 4491 may fall to (almost) infinite loop. 4492 e.g. If translate[p[0]] = 0xff, end_char may equals to 0xffffffff. 4493 To avoid this, we cast p[0] to unsigned int and truncate it. 
*/ 4494 end_char = ((unsigned)TRANSLATE(p[0]) & ((1 << BYTEWIDTH) - 1)); 4495 4496 for (this_char = range_start_char; this_char <= end_char; ++this_char) 4497 { 4498 SET_LIST_BIT (TRANSLATE (this_char)); 4499 ret = REG_NOERROR; 4500 } 4501# endif 4502 4503 return ret; 4504} 4505#endif /* WCHAR */ 4506 4507/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in 4508 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible 4509 characters can start a string that matches the pattern. This fastmap 4510 is used by re_search to skip quickly over impossible starting points. 4511 4512 The caller must supply the address of a (1 << BYTEWIDTH)-byte data 4513 area as BUFP->fastmap. 4514 4515 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in 4516 the pattern buffer. 4517 4518 Returns 0 if we succeed, -2 if an internal error. */ 4519 4520#ifdef WCHAR 4521/* local function for re_compile_fastmap. 4522 truncate wchar_t character to char. */ 4523 4524static unsigned char 4525truncate_wchar (CHAR_T c) 4526{ 4527 unsigned char buf[MB_CUR_MAX]; 4528 mbstate_t state; 4529 int retval; 4530 memset (&state, '\0', sizeof (state)); 4531 retval = wcrtomb (buf, c, &state); 4532 return retval > 0 ? buf[0] : (unsigned char) c; 4533} 4534#endif /* WCHAR */ 4535 4536static int 4537PREFIX(re_compile_fastmap) (struct re_pattern_buffer *bufp) 4538{ 4539 int j, k; 4540#ifdef MATCH_MAY_ALLOCATE 4541 PREFIX(fail_stack_type) fail_stack; 4542#endif 4543#ifndef REGEX_MALLOC 4544 char *destination; 4545#endif 4546 4547 register char *fastmap = bufp->fastmap; 4548 4549#ifdef WCHAR 4550 /* We need to cast pattern to (wchar_t*), because we casted this compiled 4551 pattern to (char*) in regex_compile. 
*/ 4552 UCHAR_T *pattern = (UCHAR_T*)bufp->buffer; 4553 register UCHAR_T *pend = (UCHAR_T*) (bufp->buffer + bufp->used); 4554#else /* BYTE */ 4555 UCHAR_T *pattern = bufp->buffer; 4556 register UCHAR_T *pend = pattern + bufp->used; 4557#endif /* WCHAR */ 4558 UCHAR_T *p = pattern; 4559 4560#ifdef REL_ALLOC 4561 /* This holds the pointer to the failure stack, when 4562 it is allocated relocatably. */ 4563 fail_stack_elt_t *failure_stack_ptr; 4564#endif 4565 4566 /* Assume that each path through the pattern can be null until 4567 proven otherwise. We set this false at the bottom of switch 4568 statement, to which we get only if a particular path doesn't 4569 match the empty string. */ 4570 boolean path_can_be_null = true; 4571 4572 /* We aren't doing a `succeed_n' to begin with. */ 4573 boolean succeed_n_p = false; 4574 4575 assert (fastmap != NULL && p != NULL); 4576 4577 INIT_FAIL_STACK (); 4578 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */ 4579 bufp->fastmap_accurate = 1; /* It will be when we're done. */ 4580 bufp->can_be_null = 0; 4581 4582 while (1) 4583 { 4584 if (p == pend || *p == succeed) 4585 { 4586 /* We have reached the (effective) end of pattern. */ 4587 if (!FAIL_STACK_EMPTY ()) 4588 { 4589 bufp->can_be_null |= path_can_be_null; 4590 4591 /* Reset for next path. */ 4592 path_can_be_null = true; 4593 4594 p = fail_stack.stack[--fail_stack.avail].pointer; 4595 4596 continue; 4597 } 4598 else 4599 break; 4600 } 4601 4602 /* We should never be about to go beyond the end of the pattern. */ 4603 assert (p < pend); 4604 4605 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++)) 4606 { 4607 4608 /* I guess the idea here is to simply not bother with a fastmap 4609 if a backreference is used, since it's too hard to figure out 4610 the fastmap for the corresponding group. Setting 4611 `can_be_null' stops `re_search_2' from using the fastmap, so 4612 that is all we do. 
*/ 4613 case duplicate: 4614 bufp->can_be_null = 1; 4615 goto done; 4616 4617 4618 /* Following are the cases which match a character. These end 4619 with `break'. */ 4620 4621#ifdef WCHAR 4622 case exactn: 4623 fastmap[truncate_wchar(p[1])] = 1; 4624 break; 4625#else /* BYTE */ 4626 case exactn: 4627 fastmap[p[1]] = 1; 4628 break; 4629#endif /* WCHAR */ 4630#ifdef MBS_SUPPORT 4631 case exactn_bin: 4632 fastmap[p[1]] = 1; 4633 break; 4634#endif 4635 4636#ifdef WCHAR 4637 /* It is hard to distinguish fastmap from (multi byte) characters 4638 which depends on current locale. */ 4639 case charset: 4640 case charset_not: 4641 case wordchar: 4642 case notwordchar: 4643 bufp->can_be_null = 1; 4644 goto done; 4645#else /* BYTE */ 4646 case charset: 4647 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) 4648 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) 4649 fastmap[j] = 1; 4650 break; 4651 4652 4653 case charset_not: 4654 /* Chars beyond end of map must be allowed. */ 4655 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++) 4656 fastmap[j] = 1; 4657 4658 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) 4659 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))) 4660 fastmap[j] = 1; 4661 break; 4662 4663 4664 case wordchar: 4665 for (j = 0; j < (1 << BYTEWIDTH); j++) 4666 if (SYNTAX (j) == Sword) 4667 fastmap[j] = 1; 4668 break; 4669 4670 4671 case notwordchar: 4672 for (j = 0; j < (1 << BYTEWIDTH); j++) 4673 if (SYNTAX (j) != Sword) 4674 fastmap[j] = 1; 4675 break; 4676#endif /* WCHAR */ 4677 4678 case anychar: 4679 { 4680 int fastmap_newline = fastmap['\n']; 4681 4682 /* `.' matches anything ... */ 4683 for (j = 0; j < (1 << BYTEWIDTH); j++) 4684 fastmap[j] = 1; 4685 4686 /* ... except perhaps newline. */ 4687 if (!(bufp->syntax & RE_DOT_NEWLINE)) 4688 fastmap['\n'] = fastmap_newline; 4689 4690 /* Return if we have already set `can_be_null'; if we have, 4691 then the fastmap is irrelevant. Something's wrong here. 
*/ 4692 else if (bufp->can_be_null) 4693 goto done; 4694 4695 /* Otherwise, have to check alternative paths. */ 4696 break; 4697 } 4698 4699#ifdef emacs 4700 case syntaxspec: 4701 k = *p++; 4702 for (j = 0; j < (1 << BYTEWIDTH); j++) 4703 if (SYNTAX (j) == (enum syntaxcode) k) 4704 fastmap[j] = 1; 4705 break; 4706 4707 4708 case notsyntaxspec: 4709 k = *p++; 4710 for (j = 0; j < (1 << BYTEWIDTH); j++) 4711 if (SYNTAX (j) != (enum syntaxcode) k) 4712 fastmap[j] = 1; 4713 break; 4714 4715 4716 /* All cases after this match the empty string. These end with 4717 `continue'. */ 4718 4719 4720 case before_dot: 4721 case at_dot: 4722 case after_dot: 4723 continue; 4724#endif /* emacs */ 4725 4726 4727 case no_op: 4728 case begline: 4729 case endline: 4730 case begbuf: 4731 case endbuf: 4732 case wordbound: 4733 case notwordbound: 4734 case wordbeg: 4735 case wordend: 4736 case push_dummy_failure: 4737 continue; 4738 4739 4740 case jump_n: 4741 case pop_failure_jump: 4742 case maybe_pop_jump: 4743 case jump: 4744 case jump_past_alt: 4745 case dummy_failure_jump: 4746 EXTRACT_NUMBER_AND_INCR (j, p); 4747 p += j; 4748 if (j > 0) 4749 continue; 4750 4751 /* Jump backward implies we just went through the body of a 4752 loop and matched nothing. Opcode jumped to should be 4753 `on_failure_jump' or `succeed_n'. Just treat it like an 4754 ordinary jump. For a * loop, it has pushed its failure 4755 point already; if so, discard that as redundant. */ 4756 if ((re_opcode_t) *p != on_failure_jump 4757 && (re_opcode_t) *p != succeed_n) 4758 continue; 4759 4760 p++; 4761 EXTRACT_NUMBER_AND_INCR (j, p); 4762 p += j; 4763 4764 /* If what's on the stack is where we are now, pop it. 
*/ 4765 if (!FAIL_STACK_EMPTY () 4766 && fail_stack.stack[fail_stack.avail - 1].pointer == p) 4767 fail_stack.avail--; 4768 4769 continue; 4770 4771 4772 case on_failure_jump: 4773 case on_failure_keep_string_jump: 4774 handle_on_failure_jump: 4775 EXTRACT_NUMBER_AND_INCR (j, p); 4776 4777 /* For some patterns, e.g., `(a?)?', `p+j' here points to the 4778 end of the pattern. We don't want to push such a point, 4779 since when we restore it above, entering the switch will 4780 increment `p' past the end of the pattern. We don't need 4781 to push such a point since we obviously won't find any more 4782 fastmap entries beyond `pend'. Such a pattern can match 4783 the null string, though. */ 4784 if (p + j < pend) 4785 { 4786 if (!PUSH_PATTERN_OP (p + j, fail_stack)) 4787 { 4788 RESET_FAIL_STACK (); 4789 return -2; 4790 } 4791 } 4792 else 4793 bufp->can_be_null = 1; 4794 4795 if (succeed_n_p) 4796 { 4797 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */ 4798 succeed_n_p = false; 4799 } 4800 4801 continue; 4802 4803 4804 case succeed_n: 4805 /* Get to the number of times to succeed. */ 4806 p += OFFSET_ADDRESS_SIZE; 4807 4808 /* Increment p past the n for when k != 0. */ 4809 EXTRACT_NUMBER_AND_INCR (k, p); 4810 if (k == 0) 4811 { 4812 p -= 2 * OFFSET_ADDRESS_SIZE; 4813 succeed_n_p = true; /* Spaghetti code alert. */ 4814 goto handle_on_failure_jump; 4815 } 4816 continue; 4817 4818 4819 case set_number_at: 4820 p += 2 * OFFSET_ADDRESS_SIZE; 4821 continue; 4822 4823 4824 case start_memory: 4825 case stop_memory: 4826 p += 2; 4827 continue; 4828 4829 4830 default: 4831 abort (); /* We have listed all the cases. */ 4832 } /* switch *p++ */ 4833 4834 /* Getting here means we have found the possible starting 4835 characters for one path of the pattern -- and that the empty 4836 string does not match. We need not follow this path further. 4837 Instead, look at the next alternative (remembered on the 4838 stack), or quit if no more. 
The test at the top of the loop 4839 does these things. */ 4840 path_can_be_null = false; 4841 p = pend; 4842 } /* while p */ 4843 4844 /* Set `can_be_null' for the last path (also the first path, if the 4845 pattern is empty). */ 4846 bufp->can_be_null |= path_can_be_null; 4847 4848 done: 4849 RESET_FAIL_STACK (); 4850 return 0; 4851} 4852 4853#else /* not INSIDE_RECURSION */ 4854 4855int 4856re_compile_fastmap (struct re_pattern_buffer *bufp) 4857{ 4858# ifdef MBS_SUPPORT 4859 if (MB_CUR_MAX != 1) 4860 return wcs_re_compile_fastmap(bufp); 4861 else 4862# endif 4863 return byte_re_compile_fastmap(bufp); 4864} /* re_compile_fastmap */ 4865#ifdef _LIBC 4866weak_alias (__re_compile_fastmap, re_compile_fastmap) 4867#endif 4868 4869 4870/* Set REGS to hold NUM_REGS registers, storing them in STARTS and 4871 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use 4872 this memory for recording register information. STARTS and ENDS 4873 must be allocated using the malloc library routine, and must each 4874 be at least NUM_REGS * sizeof (regoff_t) bytes long. 4875 4876 If NUM_REGS == 0, then subsequent matches should allocate their own 4877 register data. 4878 4879 Unless this function is called, the first search or match using 4880 PATTERN_BUFFER will allocate its own register data, without 4881 freeing the old data. */ 4882 4883void 4884re_set_registers (struct re_pattern_buffer *bufp, 4885 struct re_registers *regs, 4886 unsigned int num_regs, 4887 regoff_t *starts, regoff_t *ends) 4888{ 4889 if (num_regs) 4890 { 4891 bufp->regs_allocated = REGS_REALLOCATE; 4892 regs->num_regs = num_regs; 4893 regs->start = starts; 4894 regs->end = ends; 4895 } 4896 else 4897 { 4898 bufp->regs_allocated = REGS_UNALLOCATED; 4899 regs->num_regs = 0; 4900 regs->start = regs->end = (regoff_t *) 0; 4901 } 4902} 4903#ifdef _LIBC 4904weak_alias (__re_set_registers, re_set_registers) 4905#endif 4906 4907/* Searching routines. 
*/ 4908 4909/* Like re_search_2, below, but only one string is specified, and 4910 doesn't let you say where to stop matching. */ 4911 4912int 4913re_search (struct re_pattern_buffer *bufp, 4914 const char *string, 4915 int size, int startpos, int range, 4916 struct re_registers *regs) 4917{ 4918 return re_search_2 (bufp, NULL, 0, string, size, startpos, range, 4919 regs, size); 4920} 4921#ifdef _LIBC 4922weak_alias (__re_search, re_search) 4923#endif 4924 4925 4926/* Using the compiled pattern in BUFP->buffer, first tries to match the 4927 virtual concatenation of STRING1 and STRING2, starting first at index 4928 STARTPOS, then at STARTPOS + 1, and so on. 4929 4930 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively. 4931 4932 RANGE is how far to scan while trying to match. RANGE = 0 means try 4933 only at STARTPOS; in general, the last start tried is STARTPOS + 4934 RANGE. 4935 4936 In REGS, return the indices of the virtual concatenation of STRING1 4937 and STRING2 that matched the entire BUFP->buffer and its contained 4938 subexpressions. 4939 4940 Do not consider matching one past the index STOP in the virtual 4941 concatenation of STRING1 and STRING2. 4942 4943 We return either the position in the strings at which the match was 4944 found, -1 if no match, or -2 if error (such as failure 4945 stack overflow). 
*/ 4946 4947int 4948re_search_2 (struct re_pattern_buffer *bufp, 4949 const char *string1, int size1, 4950 const char *string2, int size2, 4951 int startpos, int range, 4952 struct re_registers *regs, 4953 int stop) 4954{ 4955# ifdef MBS_SUPPORT 4956 if (MB_CUR_MAX != 1) 4957 return wcs_re_search_2 (bufp, string1, size1, string2, size2, startpos, 4958 range, regs, stop); 4959 else 4960# endif 4961 return byte_re_search_2 (bufp, string1, size1, string2, size2, startpos, 4962 range, regs, stop); 4963} /* re_search_2 */ 4964#ifdef _LIBC 4965weak_alias (__re_search_2, re_search_2) 4966#endif 4967 4968#endif /* not INSIDE_RECURSION */ 4969 4970#ifdef INSIDE_RECURSION 4971 4972#ifdef MATCH_MAY_ALLOCATE 4973# define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL 4974#else 4975# define FREE_VAR(var) if (var) free (var); var = NULL 4976#endif 4977 4978#ifdef WCHAR 4979# define MAX_ALLOCA_SIZE 2000 4980 4981# define FREE_WCS_BUFFERS() \ 4982 do { \ 4983 if (size1 > MAX_ALLOCA_SIZE) \ 4984 { \ 4985 free (wcs_string1); \ 4986 free (mbs_offset1); \ 4987 } \ 4988 else \ 4989 { \ 4990 FREE_VAR (wcs_string1); \ 4991 FREE_VAR (mbs_offset1); \ 4992 } \ 4993 if (size2 > MAX_ALLOCA_SIZE) \ 4994 { \ 4995 free (wcs_string2); \ 4996 free (mbs_offset2); \ 4997 } \ 4998 else \ 4999 { \ 5000 FREE_VAR (wcs_string2); \ 5001 FREE_VAR (mbs_offset2); \ 5002 } \ 5003 } while (0) 5004 5005#endif 5006 5007 5008static int 5009PREFIX(re_search_2) (struct re_pattern_buffer *bufp, 5010 const char *string1, int size1, 5011 const char *string2, int size2, 5012 int startpos, int range, 5013 struct re_registers *regs, 5014 int stop) 5015{ 5016 int val; 5017 register char *fastmap = bufp->fastmap; 5018 register RE_TRANSLATE_TYPE translate = bufp->translate; 5019 int total_size = size1 + size2; 5020 int endpos = startpos + range; 5021#ifdef WCHAR 5022 /* We need wchar_t* buffers correspond to cstring1, cstring2. 
*/ 5023 wchar_t *wcs_string1 = NULL, *wcs_string2 = NULL; 5024 /* We need the size of wchar_t buffers correspond to csize1, csize2. */ 5025 int wcs_size1 = 0, wcs_size2 = 0; 5026 /* offset buffer for optimizatoin. See convert_mbs_to_wc. */ 5027 int *mbs_offset1 = NULL, *mbs_offset2 = NULL; 5028 /* They hold whether each wchar_t is binary data or not. */ 5029 char *is_binary = NULL; 5030#endif /* WCHAR */ 5031 5032 /* Check for out-of-range STARTPOS. */ 5033 if (startpos < 0 || startpos > total_size) 5034 return -1; 5035 5036 /* Fix up RANGE if it might eventually take us outside 5037 the virtual concatenation of STRING1 and STRING2. 5038 Make sure we won't move STARTPOS below 0 or above TOTAL_SIZE. */ 5039 if (endpos < 0) 5040 range = 0 - startpos; 5041 else if (endpos > total_size) 5042 range = total_size - startpos; 5043 5044 /* If the search isn't to be a backwards one, don't waste time in a 5045 search for a pattern that must be anchored. */ 5046 if (bufp->used > 0 && range > 0 5047 && ((re_opcode_t) bufp->buffer[0] == begbuf 5048 /* `begline' is like `begbuf' if it cannot match at newlines. */ 5049 || ((re_opcode_t) bufp->buffer[0] == begline 5050 && !bufp->newline_anchor))) 5051 { 5052 if (startpos > 0) 5053 return -1; 5054 else 5055 range = 1; 5056 } 5057 5058#ifdef emacs 5059 /* In a forward search for something that starts with \=. 5060 don't keep searching past point. */ 5061 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot && range > 0) 5062 { 5063 range = PT - startpos; 5064 if (range <= 0) 5065 return -1; 5066 } 5067#endif /* emacs */ 5068 5069 /* Update the fastmap now if not correct already. */ 5070 if (fastmap && !bufp->fastmap_accurate) 5071 if (re_compile_fastmap (bufp) == -2) 5072 return -2; 5073 5074#ifdef WCHAR 5075 /* Allocate wchar_t array for wcs_string1 and wcs_string2 and 5076 fill them with converted string. 
*/ 5077 if (size1 != 0) 5078 { 5079 if (size1 > MAX_ALLOCA_SIZE) 5080 { 5081 wcs_string1 = TALLOC (size1 + 1, CHAR_T); 5082 mbs_offset1 = TALLOC (size1 + 1, int); 5083 is_binary = TALLOC (size1 + 1, char); 5084 } 5085 else 5086 { 5087 wcs_string1 = REGEX_TALLOC (size1 + 1, CHAR_T); 5088 mbs_offset1 = REGEX_TALLOC (size1 + 1, int); 5089 is_binary = REGEX_TALLOC (size1 + 1, char); 5090 } 5091 if (!wcs_string1 || !mbs_offset1 || !is_binary) 5092 { 5093 if (size1 > MAX_ALLOCA_SIZE) 5094 { 5095 free (wcs_string1); 5096 free (mbs_offset1); 5097 free (is_binary); 5098 } 5099 else 5100 { 5101 FREE_VAR (wcs_string1); 5102 FREE_VAR (mbs_offset1); 5103 FREE_VAR (is_binary); 5104 } 5105 return -2; 5106 } 5107 wcs_size1 = convert_mbs_to_wcs(wcs_string1, string1, size1, 5108 mbs_offset1, is_binary); 5109 wcs_string1[wcs_size1] = L'\0'; /* for a sentinel */ 5110 if (size1 > MAX_ALLOCA_SIZE) 5111 free (is_binary); 5112 else 5113 FREE_VAR (is_binary); 5114 } 5115 if (size2 != 0) 5116 { 5117 if (size2 > MAX_ALLOCA_SIZE) 5118 { 5119 wcs_string2 = TALLOC (size2 + 1, CHAR_T); 5120 mbs_offset2 = TALLOC (size2 + 1, int); 5121 is_binary = TALLOC (size2 + 1, char); 5122 } 5123 else 5124 { 5125 wcs_string2 = REGEX_TALLOC (size2 + 1, CHAR_T); 5126 mbs_offset2 = REGEX_TALLOC (size2 + 1, int); 5127 is_binary = REGEX_TALLOC (size2 + 1, char); 5128 } 5129 if (!wcs_string2 || !mbs_offset2 || !is_binary) 5130 { 5131 FREE_WCS_BUFFERS (); 5132 if (size2 > MAX_ALLOCA_SIZE) 5133 free (is_binary); 5134 else 5135 FREE_VAR (is_binary); 5136 return -2; 5137 } 5138 wcs_size2 = convert_mbs_to_wcs(wcs_string2, string2, size2, 5139 mbs_offset2, is_binary); 5140 wcs_string2[wcs_size2] = L'\0'; /* for a sentinel */ 5141 if (size2 > MAX_ALLOCA_SIZE) 5142 free (is_binary); 5143 else 5144 FREE_VAR (is_binary); 5145 } 5146#endif /* WCHAR */ 5147 5148 5149 /* Loop through the string, looking for a place to start matching. 
*/ 5150 for (;;) 5151 { 5152 /* If a fastmap is supplied, skip quickly over characters that 5153 cannot be the start of a match. If the pattern can match the 5154 null string, however, we don't need to skip characters; we want 5155 the first null string. */ 5156 if (fastmap && startpos < total_size && !bufp->can_be_null) 5157 { 5158 if (range > 0) /* Searching forwards. */ 5159 { 5160 register const char *d; 5161 register int lim = 0; 5162 int irange = range; 5163 5164 if (startpos < size1 && startpos + range >= size1) 5165 lim = range - (size1 - startpos); 5166 5167 d = (startpos >= size1 ? string2 - size1 : string1) + startpos; 5168 5169 /* Written out as an if-else to avoid testing `translate' 5170 inside the loop. */ 5171 if (translate) 5172 while (range > lim 5173 && !fastmap[(unsigned char) 5174 translate[(unsigned char) *d++]]) 5175 range--; 5176 else 5177 while (range > lim && !fastmap[(unsigned char) *d++]) 5178 range--; 5179 5180 startpos += irange - range; 5181 } 5182 else /* Searching backwards. */ 5183 { 5184 register CHAR_T c = (size1 == 0 || startpos >= size1 5185 ? string2[startpos - size1] 5186 : string1[startpos]); 5187 5188 if (!fastmap[(unsigned char) TRANSLATE (c)]) 5189 goto advance; 5190 } 5191 } 5192 5193 /* If can't match the null string, and that's all we have left, fail. 
*/ 5194 if (range >= 0 && startpos == total_size && fastmap 5195 && !bufp->can_be_null) 5196 { 5197#ifdef WCHAR 5198 FREE_WCS_BUFFERS (); 5199#endif 5200 return -1; 5201 } 5202 5203#ifdef WCHAR 5204 val = wcs_re_match_2_internal (bufp, string1, size1, string2, 5205 size2, startpos, regs, stop, 5206 wcs_string1, wcs_size1, 5207 wcs_string2, wcs_size2, 5208 mbs_offset1, mbs_offset2); 5209#else /* BYTE */ 5210 val = byte_re_match_2_internal (bufp, string1, size1, string2, 5211 size2, startpos, regs, stop); 5212#endif /* BYTE */ 5213 5214#ifndef REGEX_MALLOC 5215# ifdef C_ALLOCA 5216 alloca (0); 5217# endif 5218#endif 5219 5220 if (val >= 0) 5221 { 5222#ifdef WCHAR 5223 FREE_WCS_BUFFERS (); 5224#endif 5225 return startpos; 5226 } 5227 5228 if (val == -2) 5229 { 5230#ifdef WCHAR 5231 FREE_WCS_BUFFERS (); 5232#endif 5233 return -2; 5234 } 5235 5236 advance: 5237 if (!range) 5238 break; 5239 else if (range > 0) 5240 { 5241 range--; 5242 startpos++; 5243 } 5244 else 5245 { 5246 range++; 5247 startpos--; 5248 } 5249 } 5250#ifdef WCHAR 5251 FREE_WCS_BUFFERS (); 5252#endif 5253 return -1; 5254} 5255 5256#ifdef WCHAR 5257/* This converts PTR, a pointer into one of the search wchar_t strings 5258 `string1' and `string2' into an multibyte string offset from the 5259 beginning of that string. We use mbs_offset to optimize. 5260 See convert_mbs_to_wcs. */ 5261# define POINTER_TO_OFFSET(ptr) \ 5262 (FIRST_STRING_P (ptr) \ 5263 ? ((regoff_t)(mbs_offset1 != NULL? mbs_offset1[(ptr)-string1] : 0)) \ 5264 : ((regoff_t)((mbs_offset2 != NULL? mbs_offset2[(ptr)-string2] : 0) \ 5265 + csize1))) 5266#else /* BYTE */ 5267/* This converts PTR, a pointer into one of the search strings `string1' 5268 and `string2' into an offset from the beginning of that string. */ 5269# define POINTER_TO_OFFSET(ptr) \ 5270 (FIRST_STRING_P (ptr) \ 5271 ? 
((regoff_t) ((ptr) - string1)) \ 5272 : ((regoff_t) ((ptr) - string2 + size1))) 5273#endif /* WCHAR */ 5274 5275/* Macros for dealing with the split strings in re_match_2. */ 5276 5277#define MATCHING_IN_FIRST_STRING (dend == end_match_1) 5278 5279/* Call before fetching a character with *d. This switches over to 5280 string2 if necessary. */ 5281#define PREFETCH() \ 5282 while (d == dend) \ 5283 { \ 5284 /* End of string2 => fail. */ \ 5285 if (dend == end_match_2) \ 5286 goto fail; \ 5287 /* End of string1 => advance to string2. */ \ 5288 d = string2; \ 5289 dend = end_match_2; \ 5290 } 5291 5292/* Test if at very beginning or at very end of the virtual concatenation 5293 of `string1' and `string2'. If only one string, it's `string2'. */ 5294#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2) 5295#define AT_STRINGS_END(d) ((d) == end2) 5296 5297 5298/* Test if D points to a character which is word-constituent. We have 5299 two special cases to check for: if past the end of string1, look at 5300 the first character in string2; and if before the beginning of 5301 string2, look at the last character in string1. */ 5302#ifdef WCHAR 5303/* Use internationalized API instead of SYNTAX. */ 5304# define WORDCHAR_P(d) \ 5305 (iswalnum ((wint_t)((d) == end1 ? *string2 \ 5306 : (d) == string2 - 1 ? *(end1 - 1) : *(d))) != 0 \ 5307 || ((d) == end1 ? *string2 \ 5308 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) == L'_') 5309#else /* BYTE */ 5310# define WORDCHAR_P(d) \ 5311 (SYNTAX ((d) == end1 ? *string2 \ 5312 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \ 5313 == Sword) 5314#endif /* WCHAR */ 5315 5316/* Disabled due to a compiler bug -- see comment at case wordbound */ 5317#if 0 5318/* Test if the character before D and the one at D differ with respect 5319 to being word-constituent. 
*/ 5320#define AT_WORD_BOUNDARY(d) \ 5321 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \ 5322 || WORDCHAR_P (d - 1) != WORDCHAR_P (d)) 5323#endif 5324 5325/* Free everything we malloc. */ 5326#ifdef MATCH_MAY_ALLOCATE 5327# ifdef WCHAR 5328# define FREE_VARIABLES() \ 5329 do { \ 5330 REGEX_FREE_STACK (fail_stack.stack); \ 5331 FREE_VAR (regstart); \ 5332 FREE_VAR (regend); \ 5333 FREE_VAR (old_regstart); \ 5334 FREE_VAR (old_regend); \ 5335 FREE_VAR (best_regstart); \ 5336 FREE_VAR (best_regend); \ 5337 FREE_VAR (reg_info); \ 5338 FREE_VAR (reg_dummy); \ 5339 FREE_VAR (reg_info_dummy); \ 5340 if (!cant_free_wcs_buf) \ 5341 { \ 5342 FREE_VAR (string1); \ 5343 FREE_VAR (string2); \ 5344 FREE_VAR (mbs_offset1); \ 5345 FREE_VAR (mbs_offset2); \ 5346 } \ 5347 } while (0) 5348# else /* BYTE */ 5349# define FREE_VARIABLES() \ 5350 do { \ 5351 REGEX_FREE_STACK (fail_stack.stack); \ 5352 FREE_VAR (regstart); \ 5353 FREE_VAR (regend); \ 5354 FREE_VAR (old_regstart); \ 5355 FREE_VAR (old_regend); \ 5356 FREE_VAR (best_regstart); \ 5357 FREE_VAR (best_regend); \ 5358 FREE_VAR (reg_info); \ 5359 FREE_VAR (reg_dummy); \ 5360 FREE_VAR (reg_info_dummy); \ 5361 } while (0) 5362# endif /* WCHAR */ 5363#else 5364# ifdef WCHAR 5365# define FREE_VARIABLES() \ 5366 do { \ 5367 if (!cant_free_wcs_buf) \ 5368 { \ 5369 FREE_VAR (string1); \ 5370 FREE_VAR (string2); \ 5371 FREE_VAR (mbs_offset1); \ 5372 FREE_VAR (mbs_offset2); \ 5373 } \ 5374 } while (0) 5375# else /* BYTE */ 5376# define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */ 5377# endif /* WCHAR */ 5378#endif /* not MATCH_MAY_ALLOCATE */ 5379 5380/* These values must meet several constraints. They must not be valid 5381 register values; since we have a limit of 255 registers (because 5382 we use only one byte in the pattern for the register number), we can 5383 use numbers larger than 255. They must differ by 1, because of 5384 NUM_FAILURE_ITEMS above. 
And the value for the lowest register must 5385 be larger than the value for the highest register, so we do not try 5386 to actually save any registers when none are active. */ 5387#define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH) 5388#define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1) 5389 5390#else /* not INSIDE_RECURSION */ 5391/* Matching routines. */ 5392 5393#ifndef emacs /* Emacs never uses this. */ 5394/* re_match is like re_match_2 except it takes only a single string. */ 5395 5396int 5397re_match (struct re_pattern_buffer *bufp, 5398 const char *string, 5399 int size, int pos, 5400 struct re_registers *regs) 5401{ 5402 int result; 5403# ifdef MBS_SUPPORT 5404 if (MB_CUR_MAX != 1) 5405 result = wcs_re_match_2_internal (bufp, NULL, 0, string, size, 5406 pos, regs, size, 5407 NULL, 0, NULL, 0, NULL, NULL); 5408 else 5409# endif 5410 result = byte_re_match_2_internal (bufp, NULL, 0, string, size, 5411 pos, regs, size); 5412# ifndef REGEX_MALLOC 5413# ifdef C_ALLOCA 5414 alloca (0); 5415# endif 5416# endif 5417 return result; 5418} 5419# ifdef _LIBC 5420weak_alias (__re_match, re_match) 5421# endif 5422#endif /* not emacs */ 5423 5424#endif /* not INSIDE_RECURSION */ 5425 5426#ifdef INSIDE_RECURSION 5427static boolean PREFIX(group_match_null_string_p) (UCHAR_T **p, 5428 UCHAR_T *end, 5429 PREFIX(register_info_type) *reg_info); 5430static boolean PREFIX(alt_match_null_string_p) (UCHAR_T *p, 5431 UCHAR_T *end, 5432 PREFIX(register_info_type) *reg_info); 5433static boolean PREFIX(common_op_match_null_string_p) (UCHAR_T **p, 5434 UCHAR_T *end, 5435 PREFIX(register_info_type) *reg_info); 5436static int PREFIX(bcmp_translate) (const CHAR_T *s1, const CHAR_T *s2, 5437 int len, char *translate); 5438#else /* not INSIDE_RECURSION */ 5439 5440/* re_match_2 matches the compiled pattern in BUFP against the 5441 the (virtual) concatenation of STRING1 and STRING2 (of length SIZE1 5442 and SIZE2, respectively). We start matching at POS, and stop 5443 matching at STOP. 
/* Convert a byte length into a wide-character count.

   OFFSET_BUFFER is the table built alongside the wide-character copy of
   a multibyte string (see convert_mbs_to_wcs): entry I holds the byte
   offset at which wide character I starts, so the entries are
   increasing and OFFSET_BUFFER[I] >= I for every I.  LENGTH is a byte
   length measured from the start of that multibyte string; entry
   LENGTH of the table must be readable.

   Returns the number of wide characters that the first LENGTH bytes
   occupy, i.e. the index I with OFFSET_BUFFER[I] == LENGTH.

   Returns 0 when OFFSET_BUFFER is NULL, and -1 when LENGTH is negative
   or does not coincide with any recorded offset (for instance when it
   falls in the middle of a multibyte character).  */

static int
count_mbs_length (int *offset_buffer, int length)
{
  int lower, upper;

  /* Check whether the size is valid.  */
  if (length < 0)
    return -1;

  if (offset_buffer == NULL)
    return 0;

  /* If there are no multibyte characters, offset_buffer[i] == i.
     Optimize for this case.  */
  if (offset_buffer[length] == length)
    return length;

  /* Binary search in the open interval (lower, upper).  LENGTH is a
     valid upper bound because offset_buffer[i] >= i for all i.  */
  lower = 0;
  upper = length;

  while (upper - lower > 1)
    {
      /* Written as lower + half the width so the sum of two large
	 indices can never overflow a signed int.  */
      int middle = lower + (upper - lower) / 2;
      int offset = offset_buffer[middle];

      if (offset == length)
	return middle;

      if (offset > length)
	upper = middle;
      else
	lower = middle;
    }

  /* No entry equals LENGTH.  */
  return -1;
}
*/ 5577 const CHAR_T *end_match_1, *end_match_2; 5578 5579 /* Where we are in the data, and the end of the current string. */ 5580 const CHAR_T *d, *dend; 5581 5582 /* Where we are in the pattern, and the end of the pattern. */ 5583#ifdef WCHAR 5584 UCHAR_T *pattern, *p; 5585 register UCHAR_T *pend; 5586#else /* BYTE */ 5587 UCHAR_T *p = bufp->buffer; 5588 register UCHAR_T *pend = p + bufp->used; 5589#endif /* WCHAR */ 5590 5591 /* Mark the opcode just after a start_memory, so we can test for an 5592 empty subpattern when we get to the stop_memory. */ 5593 UCHAR_T *just_past_start_mem = 0; 5594 5595 /* We use this to map every character in the string. */ 5596 RE_TRANSLATE_TYPE translate = bufp->translate; 5597 5598 /* Failure point stack. Each place that can handle a failure further 5599 down the line pushes a failure point on this stack. It consists of 5600 restart, regend, and reg_info for all registers corresponding to 5601 the subexpressions we're currently inside, plus the number of such 5602 registers, and, finally, two char *'s. The first char * is where 5603 to resume scanning the pattern; the second one is where to resume 5604 scanning the strings. If the latter is zero, the failure point is 5605 a ``dummy''; if a failure happens and the failure point is a dummy, 5606 it gets discarded and the next next one is tried. */ 5607#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */ 5608 PREFIX(fail_stack_type) fail_stack; 5609#endif 5610#ifdef DEBUG 5611 static unsigned failure_id; 5612 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0; 5613#endif 5614 5615#ifdef REL_ALLOC 5616 /* This holds the pointer to the failure stack, when 5617 it is allocated relocatably. */ 5618 fail_stack_elt_t *failure_stack_ptr; 5619#endif 5620 5621 /* We fill all the registers internally, independent of what we 5622 return, for use in backreferences. The number here includes 5623 an element for register zero. 
*/ 5624 size_t num_regs = bufp->re_nsub + 1; 5625 5626 /* The currently active registers. */ 5627 active_reg_t lowest_active_reg = NO_LOWEST_ACTIVE_REG; 5628 active_reg_t highest_active_reg = NO_HIGHEST_ACTIVE_REG; 5629 5630 /* Information on the contents of registers. These are pointers into 5631 the input strings; they record just what was matched (on this 5632 attempt) by a subexpression part of the pattern, that is, the 5633 regnum-th regstart pointer points to where in the pattern we began 5634 matching and the regnum-th regend points to right after where we 5635 stopped matching the regnum-th subexpression. (The zeroth register 5636 keeps track of what the whole pattern matches.) */ 5637#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ 5638 const CHAR_T **regstart, **regend; 5639#endif 5640 5641 /* If a group that's operated upon by a repetition operator fails to 5642 match anything, then the register for its start will need to be 5643 restored because it will have been set to wherever in the string we 5644 are when we last see its open-group operator. Similarly for a 5645 register's end. */ 5646#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ 5647 const CHAR_T **old_regstart, **old_regend; 5648#endif 5649 5650 /* The is_active field of reg_info helps us keep track of which (possibly 5651 nested) subexpressions we are currently in. The matched_something 5652 field of reg_info[reg_num] helps us tell whether or not we have 5653 matched any of the pattern so far this time through the reg_num-th 5654 subexpression. These two fields get reset each time through any 5655 loop their register is in. */ 5656#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */ 5657 PREFIX(register_info_type) *reg_info; 5658#endif 5659 5660 /* The following record the register info as found in the above 5661 variables when we find a match better than any we've seen before. 
5662 This happens as we backtrack through the failure points, which in 5663 turn happens only if we have not yet matched the entire string. */ 5664 unsigned best_regs_set = false; 5665#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ 5666 const CHAR_T **best_regstart, **best_regend; 5667#endif 5668 5669 /* Logically, this is `best_regend[0]'. But we don't want to have to 5670 allocate space for that if we're not allocating space for anything 5671 else (see below). Also, we never need info about register 0 for 5672 any of the other register vectors, and it seems rather a kludge to 5673 treat `best_regend' differently than the rest. So we keep track of 5674 the end of the best match so far in a separate variable. We 5675 initialize this to NULL so that when we backtrack the first time 5676 and need to test it, it's not garbage. */ 5677 const CHAR_T *match_end = NULL; 5678 5679 /* This helps SET_REGS_MATCHED avoid doing redundant work. */ 5680 int set_regs_matched_done = 0; 5681 5682 /* Used when we pop values we don't care about. */ 5683#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */ 5684 const CHAR_T **reg_dummy; 5685 PREFIX(register_info_type) *reg_info_dummy; 5686#endif 5687 5688#ifdef DEBUG 5689 /* Counts the total number of registers pushed. */ 5690 unsigned num_regs_pushed = 0; 5691#endif 5692 5693 /* Definitions for state transitions. More efficiently for gcc. */ 5694#ifdef __GNUC__ 5695# if defined HAVE_SUBTRACT_LOCAL_LABELS && defined SHARED 5696# define NEXT \ 5697 do \ 5698 { \ 5699 int offset; \ 5700 const void *__unbounded ptr; \ 5701 offset = (p == pend \ 5702 ? 0 : jmptable[SWITCH_ENUM_CAST ((re_opcode_t) *p++)]); \ 5703 ptr = &&end_of_pattern + offset; \ 5704 goto *ptr; \ 5705 } \ 5706 while (0) 5707# define REF(x) \ 5708 &&label_##x - &&end_of_pattern 5709# define JUMP_TABLE_TYPE const int 5710# else 5711# define NEXT \ 5712 do \ 5713 { \ 5714 const void *__unbounded ptr; \ 5715 ptr = (p == pend ? 
&&end_of_pattern \ 5716 : jmptable[SWITCH_ENUM_CAST ((re_opcode_t) *p++)]); \ 5717 goto *ptr; \ 5718 } \ 5719 while (0) 5720# define REF(x) \ 5721 &&label_##x 5722# define JUMP_TABLE_TYPE const void *const 5723# endif 5724# define CASE(x) label_##x 5725 static JUMP_TABLE_TYPE jmptable[] = 5726 { 5727 REF (no_op), 5728 REF (succeed), 5729 REF (exactn), 5730# ifdef MBS_SUPPORT 5731 REF (exactn_bin), 5732# endif 5733 REF (anychar), 5734 REF (charset), 5735 REF (charset_not), 5736 REF (start_memory), 5737 REF (stop_memory), 5738 REF (duplicate), 5739 REF (begline), 5740 REF (endline), 5741 REF (begbuf), 5742 REF (endbuf), 5743 REF (jump), 5744 REF (jump_past_alt), 5745 REF (on_failure_jump), 5746 REF (on_failure_keep_string_jump), 5747 REF (pop_failure_jump), 5748 REF (maybe_pop_jump), 5749 REF (dummy_failure_jump), 5750 REF (push_dummy_failure), 5751 REF (succeed_n), 5752 REF (jump_n), 5753 REF (set_number_at), 5754 REF (wordchar), 5755 REF (notwordchar), 5756 REF (wordbeg), 5757 REF (wordend), 5758 REF (wordbound), 5759 REF (notwordbound) 5760# ifdef emacs 5761 ,REF (before_dot), 5762 REF (at_dot), 5763 REF (after_dot), 5764 REF (syntaxspec), 5765 REF (notsyntaxspec) 5766# endif 5767 }; 5768#else 5769# define NEXT \ 5770 break 5771# define CASE(x) \ 5772 case x 5773#endif 5774 5775 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n"); 5776 5777 INIT_FAIL_STACK (); 5778 5779#ifdef MATCH_MAY_ALLOCATE 5780 /* Do not bother to initialize all the register variables if there are 5781 no groups in the pattern, as it takes a fair amount of time. If 5782 there are groups, we include space for register 0 (the whole 5783 pattern), even though we never use it, since it simplifies the 5784 array indexing. We should fix this. 
*/ 5785 if (bufp->re_nsub) 5786 { 5787 regstart = REGEX_TALLOC (num_regs, const CHAR_T *); 5788 regend = REGEX_TALLOC (num_regs, const CHAR_T *); 5789 old_regstart = REGEX_TALLOC (num_regs, const CHAR_T *); 5790 old_regend = REGEX_TALLOC (num_regs, const CHAR_T *); 5791 best_regstart = REGEX_TALLOC (num_regs, const CHAR_T *); 5792 best_regend = REGEX_TALLOC (num_regs, const CHAR_T *); 5793 reg_info = REGEX_TALLOC (num_regs, PREFIX(register_info_type)); 5794 reg_dummy = REGEX_TALLOC (num_regs, const CHAR_T *); 5795 reg_info_dummy = REGEX_TALLOC (num_regs, PREFIX(register_info_type)); 5796 5797 if (!(regstart && regend && old_regstart && old_regend && reg_info 5798 && best_regstart && best_regend && reg_dummy && reg_info_dummy)) 5799 { 5800 FREE_VARIABLES (); 5801 return -2; 5802 } 5803 } 5804 else 5805 { 5806 /* We must initialize all our variables to NULL, so that 5807 `FREE_VARIABLES' doesn't try to free them. */ 5808 regstart = regend = old_regstart = old_regend = best_regstart 5809 = best_regend = reg_dummy = NULL; 5810 reg_info = reg_info_dummy = (PREFIX(register_info_type) *) NULL; 5811 } 5812#endif /* MATCH_MAY_ALLOCATE */ 5813 5814 /* The starting position is bogus. */ 5815#ifdef WCHAR 5816 if (pos < 0 || pos > csize1 + csize2) 5817#else /* BYTE */ 5818 if (pos < 0 || pos > size1 + size2) 5819#endif 5820 { 5821 FREE_VARIABLES (); 5822 return -1; 5823 } 5824 5825#ifdef WCHAR 5826 /* Allocate wchar_t array for string1 and string2 and 5827 fill them with converted string. */ 5828 if (string1 == NULL && string2 == NULL) 5829 { 5830 /* We need seting up buffers here. */ 5831 5832 /* We must free wcs buffers in this function. 
*/ 5833 cant_free_wcs_buf = 0; 5834 5835 if (csize1 != 0) 5836 { 5837 string1 = REGEX_TALLOC (csize1 + 1, CHAR_T); 5838 mbs_offset1 = REGEX_TALLOC (csize1 + 1, int); 5839 is_binary = REGEX_TALLOC (csize1 + 1, char); 5840 if (!string1 || !mbs_offset1 || !is_binary) 5841 { 5842 FREE_VAR (string1); 5843 FREE_VAR (mbs_offset1); 5844 FREE_VAR (is_binary); 5845 return -2; 5846 } 5847 } 5848 if (csize2 != 0) 5849 { 5850 string2 = REGEX_TALLOC (csize2 + 1, CHAR_T); 5851 mbs_offset2 = REGEX_TALLOC (csize2 + 1, int); 5852 is_binary = REGEX_TALLOC (csize2 + 1, char); 5853 if (!string2 || !mbs_offset2 || !is_binary) 5854 { 5855 FREE_VAR (string1); 5856 FREE_VAR (mbs_offset1); 5857 FREE_VAR (string2); 5858 FREE_VAR (mbs_offset2); 5859 FREE_VAR (is_binary); 5860 return -2; 5861 } 5862 size2 = convert_mbs_to_wcs(string2, cstring2, csize2, 5863 mbs_offset2, is_binary); 5864 string2[size2] = L'\0'; /* for a sentinel */ 5865 FREE_VAR (is_binary); 5866 } 5867 } 5868 5869 /* We need to cast pattern to (wchar_t*), because we casted this compiled 5870 pattern to (char*) in regex_compile. */ 5871 p = pattern = (CHAR_T*)bufp->buffer; 5872 pend = (CHAR_T*)(bufp->buffer + bufp->used); 5873 5874#endif /* WCHAR */ 5875 5876 /* Initialize subexpression text positions to -1 to mark ones that no 5877 start_memory/stop_memory has been seen for. Also initialize the 5878 register information struct. */ 5879 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++) 5880 { 5881 regstart[mcnt] = regend[mcnt] 5882 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE; 5883 5884 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE; 5885 IS_ACTIVE (reg_info[mcnt]) = 0; 5886 MATCHED_SOMETHING (reg_info[mcnt]) = 0; 5887 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0; 5888 } 5889 5890 /* We move `string1' into `string2' if the latter's empty -- but not if 5891 `string1' is null. 
*/ 5892 if (size2 == 0 && string1 != NULL) 5893 { 5894 string2 = string1; 5895 size2 = size1; 5896 string1 = 0; 5897 size1 = 0; 5898#ifdef WCHAR 5899 mbs_offset2 = mbs_offset1; 5900 csize2 = csize1; 5901 mbs_offset1 = NULL; 5902 csize1 = 0; 5903#endif 5904 } 5905 end1 = string1 + size1; 5906 end2 = string2 + size2; 5907 5908 /* Compute where to stop matching, within the two strings. */ 5909#ifdef WCHAR 5910 if (stop <= csize1) 5911 { 5912 mcnt = count_mbs_length(mbs_offset1, stop); 5913 end_match_1 = string1 + mcnt; 5914 end_match_2 = string2; 5915 } 5916 else 5917 { 5918 if (stop > csize1 + csize2) 5919 stop = csize1 + csize2; 5920 end_match_1 = end1; 5921 mcnt = count_mbs_length(mbs_offset2, stop-csize1); 5922 end_match_2 = string2 + mcnt; 5923 } 5924 if (mcnt < 0) 5925 { /* count_mbs_length return error. */ 5926 FREE_VARIABLES (); 5927 return -1; 5928 } 5929#else 5930 if (stop <= size1) 5931 { 5932 end_match_1 = string1 + stop; 5933 end_match_2 = string2; 5934 } 5935 else 5936 { 5937 end_match_1 = end1; 5938 end_match_2 = string2 + stop - size1; 5939 } 5940#endif /* WCHAR */ 5941 5942 /* `p' scans through the pattern as `d' scans through the data. 5943 `dend' is the end of the input string that `d' points within. `d' 5944 is advanced into the following input string whenever necessary, but 5945 this happens before fetching; therefore, at the beginning of the 5946 loop, `d' can be pointing at the end of a string, but it cannot 5947 equal `string2'. */ 5948#ifdef WCHAR 5949 if (size1 > 0 && pos <= csize1) 5950 { 5951 mcnt = count_mbs_length(mbs_offset1, pos); 5952 d = string1 + mcnt; 5953 dend = end_match_1; 5954 } 5955 else 5956 { 5957 mcnt = count_mbs_length(mbs_offset2, pos-csize1); 5958 d = string2 + mcnt; 5959 dend = end_match_2; 5960 } 5961 5962 if (mcnt < 0) 5963 { /* count_mbs_length return error. 
*/ 5964 FREE_VARIABLES (); 5965 return -1; 5966 } 5967#else 5968 if (size1 > 0 && pos <= size1) 5969 { 5970 d = string1 + pos; 5971 dend = end_match_1; 5972 } 5973 else 5974 { 5975 d = string2 + pos - size1; 5976 dend = end_match_2; 5977 } 5978#endif /* WCHAR */ 5979 5980 DEBUG_PRINT1 ("The compiled pattern is:\n"); 5981 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend); 5982 DEBUG_PRINT1 ("The string to match is: `"); 5983 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2); 5984 DEBUG_PRINT1 ("'\n"); 5985 5986 /* This loops over pattern commands. It exits by returning from the 5987 function if the match is complete, or it drops through if the match 5988 fails at this starting point in the input data. */ 5989 for (;;) 5990 { 5991#ifdef _LIBC 5992 DEBUG_PRINT2 ("\n%p: ", p); 5993#else 5994 DEBUG_PRINT2 ("\n0x%x: ", p); 5995#endif 5996 5997#ifdef __GNUC__ 5998 NEXT; 5999#else 6000 if (p == pend) 6001#endif 6002 { 6003#ifdef __GNUC__ 6004 end_of_pattern: 6005#endif 6006 /* End of pattern means we might have succeeded. */ 6007 DEBUG_PRINT1 ("end of pattern ... "); 6008 6009 /* If we haven't matched the entire string, and we want the 6010 longest match, try backtracking. */ 6011 if (d != end_match_2) 6012 { 6013 /* 1 if this match is the best seen so far. */ 6014 boolean best_match_p; 6015 { 6016 /* 1 if this match ends in the same string (string1 or string2) 6017 as the best previous match. */ 6018 boolean same_str_p = (FIRST_STRING_P (match_end) 6019 == MATCHING_IN_FIRST_STRING); 6020 6021 /* AIX compiler got confused when this was combined 6022 with the previous declaration. */ 6023 if (same_str_p) 6024 best_match_p = d > match_end; 6025 else 6026 best_match_p = !MATCHING_IN_FIRST_STRING; 6027 } 6028 6029 DEBUG_PRINT1 ("backtracking.\n"); 6030 6031 if (!FAIL_STACK_EMPTY ()) 6032 { /* More failure points to try. */ 6033 6034 /* If exceeds best match so far, save it. 
*/ 6035 if (!best_regs_set || best_match_p) 6036 { 6037 best_regs_set = true; 6038 match_end = d; 6039 6040 DEBUG_PRINT1 ("\nSAVING match as best so far.\n"); 6041 6042 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++) 6043 { 6044 best_regstart[mcnt] = regstart[mcnt]; 6045 best_regend[mcnt] = regend[mcnt]; 6046 } 6047 } 6048 goto fail; 6049 } 6050 6051 /* If no failure points, don't restore garbage. And if 6052 last match is real best match, don't restore second 6053 best one. */ 6054 else if (best_regs_set && !best_match_p) 6055 { 6056 restore_best_regs: 6057 /* Restore best match. It may happen that `dend == 6058 end_match_1' while the restored d is in string2. 6059 For example, the pattern `x.*y.*z' against the 6060 strings `x-' and `y-z-', if the two strings are 6061 not consecutive in memory. */ 6062 DEBUG_PRINT1 ("Restoring best registers.\n"); 6063 6064 d = match_end; 6065 dend = ((d >= string1 && d <= end1) 6066 ? end_match_1 : end_match_2); 6067 6068 for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++) 6069 { 6070 regstart[mcnt] = best_regstart[mcnt]; 6071 regend[mcnt] = best_regend[mcnt]; 6072 } 6073 } 6074 } /* d != end_match_2 */ 6075 6076 succeed_label: 6077 DEBUG_PRINT1 ("Accepting match.\n"); 6078 /* If caller wants register contents data back, do it. */ 6079 if (regs && !bufp->no_sub) 6080 { 6081 /* Have the register data arrays been allocated? */ 6082 if (bufp->regs_allocated == REGS_UNALLOCATED) 6083 { /* No. So allocate them with malloc. We need one 6084 extra element beyond `num_regs' for the `-1' marker 6085 GNU code uses. */ 6086 regs->num_regs = MAX (RE_NREGS, num_regs + 1); 6087 regs->start = TALLOC (regs->num_regs, regoff_t); 6088 regs->end = TALLOC (regs->num_regs, regoff_t); 6089 if (regs->start == NULL || regs->end == NULL) 6090 { 6091 FREE_VARIABLES (); 6092 return -2; 6093 } 6094 bufp->regs_allocated = REGS_REALLOCATE; 6095 } 6096 else if (bufp->regs_allocated == REGS_REALLOCATE) 6097 { /* Yes. 
If we need more elements than were already 6098 allocated, reallocate them. If we need fewer, just 6099 leave it alone. */ 6100 if (regs->num_regs < num_regs + 1) 6101 { 6102 regs->num_regs = num_regs + 1; 6103 RETALLOC (regs->start, regs->num_regs, regoff_t); 6104 RETALLOC (regs->end, regs->num_regs, regoff_t); 6105 if (regs->start == NULL || regs->end == NULL) 6106 { 6107 FREE_VARIABLES (); 6108 return -2; 6109 } 6110 } 6111 } 6112 else 6113 { 6114 /* These braces fend off a "empty body in an else-statement" 6115 warning under GCC when assert expands to nothing. */ 6116 assert (bufp->regs_allocated == REGS_FIXED); 6117 } 6118 6119 /* Convert the pointer data in `regstart' and `regend' to 6120 indices. Register zero has to be set differently, 6121 since we haven't kept track of any info for it. */ 6122 if (regs->num_regs > 0) 6123 { 6124 regs->start[0] = pos; 6125#ifdef WCHAR 6126 if (MATCHING_IN_FIRST_STRING) 6127 regs->end[0] = (mbs_offset1 != NULL ? 6128 mbs_offset1[d-string1] : 0); 6129 else 6130 regs->end[0] = csize1 + (mbs_offset2 != NULL 6131 ? mbs_offset2[d-string2] : 0); 6132#else 6133 regs->end[0] = (MATCHING_IN_FIRST_STRING 6134 ? ((regoff_t) (d - string1)) 6135 : ((regoff_t) (d - string2 + size1))); 6136#endif /* WCHAR */ 6137 } 6138 6139 /* Go through the first `min (num_regs, regs->num_regs)' 6140 registers, since that is all we initialized. */ 6141 for (mcnt = 1; (unsigned) mcnt < MIN (num_regs, regs->num_regs); 6142 mcnt++) 6143 { 6144 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt])) 6145 regs->start[mcnt] = regs->end[mcnt] = -1; 6146 else 6147 { 6148 regs->start[mcnt] 6149 = (regoff_t) POINTER_TO_OFFSET (regstart[mcnt]); 6150 regs->end[mcnt] 6151 = (regoff_t) POINTER_TO_OFFSET (regend[mcnt]); 6152 } 6153 } 6154 6155 /* If the regs structure we return has more elements than 6156 were in the pattern, set the extra elements to -1. 
If 6157 we (re)allocated the registers, this is the case, 6158 because we always allocate enough to have at least one 6159 -1 at the end. */ 6160 for (mcnt = num_regs; (unsigned) mcnt < regs->num_regs; mcnt++) 6161 regs->start[mcnt] = regs->end[mcnt] = -1; 6162 } /* regs && !bufp->no_sub */ 6163 6164 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n", 6165 nfailure_points_pushed, nfailure_points_popped, 6166 nfailure_points_pushed - nfailure_points_popped); 6167 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed); 6168 6169#ifdef WCHAR 6170 if (MATCHING_IN_FIRST_STRING) 6171 mcnt = mbs_offset1 != NULL ? mbs_offset1[d-string1] : 0; 6172 else 6173 mcnt = (mbs_offset2 != NULL ? mbs_offset2[d-string2] : 0) + 6174 csize1; 6175 mcnt -= pos; 6176#else 6177 mcnt = d - pos - (MATCHING_IN_FIRST_STRING 6178 ? string1 : string2 - size1); 6179#endif /* WCHAR */ 6180 6181 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt); 6182 6183 FREE_VARIABLES (); 6184 return mcnt; 6185 } 6186 6187#ifndef __GNUC__ 6188 /* Otherwise match next pattern command. */ 6189 switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++)) 6190 { 6191#endif 6192 /* Ignore these. Used to ignore the n of succeed_n's which 6193 currently have n == 0. */ 6194 CASE (no_op): 6195 DEBUG_PRINT1 ("EXECUTING no_op.\n"); 6196 NEXT; 6197 6198 CASE (succeed): 6199 DEBUG_PRINT1 ("EXECUTING succeed.\n"); 6200 goto succeed_label; 6201 6202 /* Match the next n pattern characters exactly. The following 6203 byte in the pattern defines n, and the n bytes after that 6204 are the characters to match. */ 6205 CASE (exactn): 6206#ifdef MBS_SUPPORT 6207 CASE (exactn_bin): 6208#endif 6209 mcnt = *p++; 6210 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt); 6211 6212 /* This is written out as an if-else so we don't waste time 6213 testing `translate' inside the loop. 
*/ 6214 if (translate) 6215 { 6216 do 6217 { 6218 PREFETCH (); 6219#ifdef WCHAR 6220 if (*d <= 0xff) 6221 { 6222 if ((UCHAR_T) translate[(unsigned char) *d++] 6223 != (UCHAR_T) *p++) 6224 goto fail; 6225 } 6226 else 6227 { 6228 if (*d++ != (CHAR_T) *p++) 6229 goto fail; 6230 } 6231#else 6232 if ((UCHAR_T) translate[(unsigned char) *d++] 6233 != (UCHAR_T) *p++) 6234 goto fail; 6235#endif /* WCHAR */ 6236 } 6237 while (--mcnt); 6238 } 6239 else 6240 { 6241 do 6242 { 6243 PREFETCH (); 6244 if (*d++ != (CHAR_T) *p++) goto fail; 6245 } 6246 while (--mcnt); 6247 } 6248 SET_REGS_MATCHED (); 6249 NEXT; 6250 6251 6252 /* Match any character except possibly a newline or a null. */ 6253 CASE (anychar): 6254 DEBUG_PRINT1 ("EXECUTING anychar.\n"); 6255 6256 PREFETCH (); 6257 6258 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n') 6259 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000')) 6260 goto fail; 6261 6262 SET_REGS_MATCHED (); 6263 DEBUG_PRINT2 (" Matched `%ld'.\n", (long int) *d); 6264 d++; 6265 NEXT; 6266 6267 6268 CASE (charset): 6269 CASE (charset_not): 6270 { 6271 register UCHAR_T c; 6272#ifdef WCHAR 6273 unsigned int i, char_class_length, coll_symbol_length, 6274 equiv_class_length, ranges_length, chars_length, length; 6275 CHAR_T *workp, *workp2, *charset_top; 6276#define WORK_BUFFER_SIZE 128 6277 CHAR_T str_buf[WORK_BUFFER_SIZE]; 6278# ifdef _LIBC 6279 uint32_t nrules; 6280# endif /* _LIBC */ 6281#endif /* WCHAR */ 6282 boolean invert = (re_opcode_t) *(p - 1) == charset_not; 6283 6284 DEBUG_PRINT2 ("EXECUTING charset%s.\n", invert ? "_not" : ""); 6285 PREFETCH (); 6286 c = TRANSLATE (*d); /* The character to match. 
*/ 6287#ifdef WCHAR 6288# ifdef _LIBC 6289 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); 6290# endif /* _LIBC */ 6291 charset_top = p - 1; 6292 char_class_length = *p++; 6293 coll_symbol_length = *p++; 6294 equiv_class_length = *p++; 6295 ranges_length = *p++; 6296 chars_length = *p++; 6297 /* p points charset[6], so the address of the next instruction 6298 (charset[l+m+n+2o+k+p']) equals p[l+m+n+2*o+p'], 6299 where l=length of char_classes, m=length of collating_symbol, 6300 n=equivalence_class, o=length of char_range, 6301 p'=length of character. */ 6302 workp = p; 6303 /* Update p to indicate the next instruction. */ 6304 p += char_class_length + coll_symbol_length+ equiv_class_length + 6305 2*ranges_length + chars_length; 6306 6307 /* match with char_class? */ 6308 for (i = 0; i < char_class_length ; i += CHAR_CLASS_SIZE) 6309 { 6310 wctype_t wctype; 6311 uintptr_t alignedp = ((uintptr_t)workp 6312 + __alignof__(wctype_t) - 1) 6313 & ~(uintptr_t)(__alignof__(wctype_t) - 1); 6314 wctype = *((wctype_t*)alignedp); 6315 workp += CHAR_CLASS_SIZE; 6316 if (iswctype((wint_t)c, wctype)) 6317 goto char_set_matched; 6318 } 6319 6320 /* match with collating_symbol? */ 6321# ifdef _LIBC 6322 if (nrules != 0) 6323 { 6324 const unsigned char *extra = (const unsigned char *) 6325 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB); 6326 6327 for (workp2 = workp + coll_symbol_length ; workp < workp2 ; 6328 workp++) 6329 { 6330 int32_t *wextra; 6331 wextra = (int32_t*)(extra + *workp++); 6332 for (i = 0; i < *wextra; ++i) 6333 if (TRANSLATE(d[i]) != wextra[1 + i]) 6334 break; 6335 6336 if (i == *wextra) 6337 { 6338 /* Update d, however d will be incremented at 6339 char_set_matched:, we decrement d here. */ 6340 d += i - 1; 6341 goto char_set_matched; 6342 } 6343 } 6344 } 6345 else /* (nrules == 0) */ 6346# endif 6347 /* If we can't look up collation data, we use wcscoll 6348 instead. 
*/ 6349 { 6350 for (workp2 = workp + coll_symbol_length ; workp < workp2 ;) 6351 { 6352 const CHAR_T *backup_d = d, *backup_dend = dend; 6353 length = wcslen (workp); 6354 6355 /* If wcscoll(the collating symbol, whole string) > 0, 6356 any substring of the string never match with the 6357 collating symbol. */ 6358 if (wcscoll (workp, d) > 0) 6359 { 6360 workp += length + 1; 6361 continue; 6362 } 6363 6364 /* First, we compare the collating symbol with 6365 the first character of the string. 6366 If it don't match, we add the next character to 6367 the compare buffer in turn. */ 6368 for (i = 0 ; i < WORK_BUFFER_SIZE-1 ; i++, d++) 6369 { 6370 int match; 6371 if (d == dend) 6372 { 6373 if (dend == end_match_2) 6374 break; 6375 d = string2; 6376 dend = end_match_2; 6377 } 6378 6379 /* add next character to the compare buffer. */ 6380 str_buf[i] = TRANSLATE(*d); 6381 str_buf[i+1] = '\0'; 6382 6383 match = wcscoll (workp, str_buf); 6384 if (match == 0) 6385 goto char_set_matched; 6386 6387 if (match < 0) 6388 /* (str_buf > workp) indicate (str_buf + X > workp), 6389 because for all X (str_buf + X > str_buf). 6390 So we don't need continue this loop. */ 6391 break; 6392 6393 /* Otherwise(str_buf < workp), 6394 (str_buf+next_character) may equals (workp). 6395 So we continue this loop. */ 6396 } 6397 /* not matched */ 6398 d = backup_d; 6399 dend = backup_dend; 6400 workp += length + 1; 6401 } 6402 } 6403 /* match with equivalence_class? */ 6404# ifdef _LIBC 6405 if (nrules != 0) 6406 { 6407 const CHAR_T *backup_d = d, *backup_dend = dend; 6408 /* Try to match the equivalence class against 6409 those known to the collate implementation. */ 6410 const int32_t *table; 6411 const int32_t *weights; 6412 const int32_t *extra; 6413 const int32_t *indirect; 6414 int32_t idx, idx2; 6415 wint_t *cp; 6416 size_t len; 6417 6418 /* This #include defines a local function! 
*/ 6419# include <locale/weightwc.h> 6420 6421 table = (const int32_t *) 6422 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEWC); 6423 weights = (const wint_t *) 6424 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTWC); 6425 extra = (const wint_t *) 6426 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAWC); 6427 indirect = (const int32_t *) 6428 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTWC); 6429 6430 /* Write 1 collating element to str_buf, and 6431 get its index. */ 6432 idx2 = 0; 6433 6434 for (i = 0 ; idx2 == 0 && i < WORK_BUFFER_SIZE - 1; i++) 6435 { 6436 cp = (wint_t*)str_buf; 6437 if (d == dend) 6438 { 6439 if (dend == end_match_2) 6440 break; 6441 d = string2; 6442 dend = end_match_2; 6443 } 6444 str_buf[i] = TRANSLATE(*(d+i)); 6445 str_buf[i+1] = '\0'; /* sentinel */ 6446 idx2 = findidx ((const wint_t**)&cp); 6447 } 6448 6449 /* Update d, however d will be incremented at 6450 char_set_matched:, we decrement d here. */ 6451 d = backup_d + ((wchar_t*)cp - (wchar_t*)str_buf - 1); 6452 if (d >= dend) 6453 { 6454 if (dend == end_match_2) 6455 d = dend; 6456 else 6457 { 6458 d = string2; 6459 dend = end_match_2; 6460 } 6461 } 6462 6463 len = weights[idx2]; 6464 6465 for (workp2 = workp + equiv_class_length ; workp < workp2 ; 6466 workp++) 6467 { 6468 idx = (int32_t)*workp; 6469 /* We already checked idx != 0 in regex_compile. */ 6470 6471 if (idx2 != 0 && len == weights[idx]) 6472 { 6473 int cnt = 0; 6474 while (cnt < len && (weights[idx + 1 + cnt] 6475 == weights[idx2 + 1 + cnt])) 6476 ++cnt; 6477 6478 if (cnt == len) 6479 goto char_set_matched; 6480 } 6481 } 6482 /* not matched */ 6483 d = backup_d; 6484 dend = backup_dend; 6485 } 6486 else /* (nrules == 0) */ 6487# endif 6488 /* If we can't look up collation data, we use wcscoll 6489 instead. 
*/ 6490 { 6491 for (workp2 = workp + equiv_class_length ; workp < workp2 ;) 6492 { 6493 const CHAR_T *backup_d = d, *backup_dend = dend; 6494 length = wcslen (workp); 6495 6496 /* If wcscoll(the collating symbol, whole string) > 0, 6497 any substring of the string never match with the 6498 collating symbol. */ 6499 if (wcscoll (workp, d) > 0) 6500 { 6501 workp += length + 1; 6502 break; 6503 } 6504 6505 /* First, we compare the equivalence class with 6506 the first character of the string. 6507 If it don't match, we add the next character to 6508 the compare buffer in turn. */ 6509 for (i = 0 ; i < WORK_BUFFER_SIZE - 1 ; i++, d++) 6510 { 6511 int match; 6512 if (d == dend) 6513 { 6514 if (dend == end_match_2) 6515 break; 6516 d = string2; 6517 dend = end_match_2; 6518 } 6519 6520 /* add next character to the compare buffer. */ 6521 str_buf[i] = TRANSLATE(*d); 6522 str_buf[i+1] = '\0'; 6523 6524 match = wcscoll (workp, str_buf); 6525 6526 if (match == 0) 6527 goto char_set_matched; 6528 6529 if (match < 0) 6530 /* (str_buf > workp) indicate (str_buf + X > workp), 6531 because for all X (str_buf + X > str_buf). 6532 So we don't need continue this loop. */ 6533 break; 6534 6535 /* Otherwise(str_buf < workp), 6536 (str_buf+next_character) may equals (workp). 6537 So we continue this loop. */ 6538 } 6539 /* not matched */ 6540 d = backup_d; 6541 dend = backup_dend; 6542 workp += length + 1; 6543 } 6544 } 6545 6546 /* match with char_range? */ 6547# ifdef _LIBC 6548 if (nrules != 0) 6549 { 6550 uint32_t collseqval; 6551 const char *collseq = (const char *) 6552 _NL_CURRENT(LC_COLLATE, _NL_COLLATE_COLLSEQWC); 6553 6554 collseqval = collseq_table_lookup (collseq, c); 6555 6556 for (; workp < p - chars_length ;) 6557 { 6558 uint32_t start_val, end_val; 6559 6560 /* We already compute the collation sequence value 6561 of the characters (or collating symbols). 
*/ 6562 start_val = (uint32_t) *workp++; /* range_start */ 6563 end_val = (uint32_t) *workp++; /* range_end */ 6564 6565 if (start_val <= collseqval && collseqval <= end_val) 6566 goto char_set_matched; 6567 } 6568 } 6569 else 6570# endif 6571 { 6572 /* We set range_start_char at str_buf[0], range_end_char 6573 at str_buf[4], and compared char at str_buf[2]. */ 6574 str_buf[1] = 0; 6575 str_buf[2] = c; 6576 str_buf[3] = 0; 6577 str_buf[5] = 0; 6578 for (; workp < p - chars_length ;) 6579 { 6580 wchar_t *range_start_char, *range_end_char; 6581 6582 /* match if (range_start_char <= c <= range_end_char). */ 6583 6584 /* If range_start(or end) < 0, we assume -range_start(end) 6585 is the offset of the collating symbol which is specified 6586 as the character of the range start(end). */ 6587 6588 /* range_start */ 6589 if (*workp < 0) 6590 range_start_char = charset_top - (*workp++); 6591 else 6592 { 6593 str_buf[0] = *workp++; 6594 range_start_char = str_buf; 6595 } 6596 6597 /* range_end */ 6598 if (*workp < 0) 6599 range_end_char = charset_top - (*workp++); 6600 else 6601 { 6602 str_buf[4] = *workp++; 6603 range_end_char = str_buf + 4; 6604 } 6605 6606 if (wcscoll (range_start_char, str_buf+2) <= 0 6607 && wcscoll (str_buf+2, range_end_char) <= 0) 6608 goto char_set_matched; 6609 } 6610 } 6611 6612 /* match with char? */ 6613 for (; workp < p ; workp++) 6614 if (c == *workp) 6615 goto char_set_matched; 6616 6617 invert = !invert; 6618 6619 char_set_matched: 6620 if (invert) goto fail; 6621#else 6622 /* Cast to `unsigned' instead of `unsigned char' in case the 6623 bit list is a full 32 bytes long. */ 6624 if (c < (unsigned) (*p * BYTEWIDTH) 6625 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) 6626 invert = !invert; 6627 6628 p += 1 + *p; 6629 6630 if (!invert) goto fail; 6631#undef WORK_BUFFER_SIZE 6632#endif /* WCHAR */ 6633 SET_REGS_MATCHED (); 6634 d++; 6635 NEXT; 6636 } 6637 6638 6639 /* The beginning of a group is represented by start_memory. 
6640 The arguments are the register number in the next byte, and the 6641 number of groups inner to this one in the next. The text 6642 matched within the group is recorded (in the internal 6643 registers data structure) under the register number. */ 6644 CASE (start_memory): 6645 DEBUG_PRINT3 ("EXECUTING start_memory %ld (%ld):\n", 6646 (long int) *p, (long int) p[1]); 6647 6648 /* Find out if this group can match the empty string. */ 6649 p1 = p; /* To send to group_match_null_string_p. */ 6650 6651 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE) 6652 REG_MATCH_NULL_STRING_P (reg_info[*p]) 6653 = PREFIX(group_match_null_string_p) (&p1, pend, reg_info); 6654 6655 /* Save the position in the string where we were the last time 6656 we were at this open-group operator in case the group is 6657 operated upon by a repetition operator, e.g., with `(a*)*b' 6658 against `ab'; then we want to ignore where we are now in 6659 the string in case this attempt to match fails. */ 6660 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) 6661 ? REG_UNSET (regstart[*p]) ? d : regstart[*p] 6662 : regstart[*p]; 6663 DEBUG_PRINT2 (" old_regstart: %d\n", 6664 POINTER_TO_OFFSET (old_regstart[*p])); 6665 6666 regstart[*p] = d; 6667 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p])); 6668 6669 IS_ACTIVE (reg_info[*p]) = 1; 6670 MATCHED_SOMETHING (reg_info[*p]) = 0; 6671 6672 /* Clear this whenever we change the register activity status. */ 6673 set_regs_matched_done = 0; 6674 6675 /* This is the new highest active register. */ 6676 highest_active_reg = *p; 6677 6678 /* If nothing was active before, this is the new lowest active 6679 register. */ 6680 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) 6681 lowest_active_reg = *p; 6682 6683 /* Move past the register number and inner group count. */ 6684 p += 2; 6685 just_past_start_mem = p; 6686 6687 NEXT; 6688 6689 6690 /* The stop_memory opcode represents the end of a group. 
Its 6691 arguments are the same as start_memory's: the register 6692 number, and the number of inner groups. */ 6693 CASE (stop_memory): 6694 DEBUG_PRINT3 ("EXECUTING stop_memory %ld (%ld):\n", 6695 (long int) *p, (long int) p[1]); 6696 6697 /* We need to save the string position the last time we were at 6698 this close-group operator in case the group is operated 6699 upon by a repetition operator, e.g., with `((a*)*(b*)*)*' 6700 against `aba'; then we want to ignore where we are now in 6701 the string in case this attempt to match fails. */ 6702 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) 6703 ? REG_UNSET (regend[*p]) ? d : regend[*p] 6704 : regend[*p]; 6705 DEBUG_PRINT2 (" old_regend: %d\n", 6706 POINTER_TO_OFFSET (old_regend[*p])); 6707 6708 regend[*p] = d; 6709 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p])); 6710 6711 /* This register isn't active anymore. */ 6712 IS_ACTIVE (reg_info[*p]) = 0; 6713 6714 /* Clear this whenever we change the register activity status. */ 6715 set_regs_matched_done = 0; 6716 6717 /* If this was the only register active, nothing is active 6718 anymore. */ 6719 if (lowest_active_reg == highest_active_reg) 6720 { 6721 lowest_active_reg = NO_LOWEST_ACTIVE_REG; 6722 highest_active_reg = NO_HIGHEST_ACTIVE_REG; 6723 } 6724 else 6725 { /* We must scan for the new highest active register, since 6726 it isn't necessarily one less than now: consider 6727 (a(b)c(d(e)f)g). When group 3 ends, after the f), the 6728 new highest active register is 1. */ 6729 UCHAR_T r = *p - 1; 6730 while (r > 0 && !IS_ACTIVE (reg_info[r])) 6731 r--; 6732 6733 /* If we end up at register zero, that means that we saved 6734 the registers as the result of an `on_failure_jump', not 6735 a `start_memory', and we jumped to past the innermost 6736 `stop_memory'. For example, in ((.)*) we save 6737 registers 1 and 2 as a result of the *, but when we pop 6738 back to the second ), we are at the stop_memory 1. 6739 Thus, nothing is active. 
*/ 6740 if (r == 0) 6741 { 6742 lowest_active_reg = NO_LOWEST_ACTIVE_REG; 6743 highest_active_reg = NO_HIGHEST_ACTIVE_REG; 6744 } 6745 else 6746 highest_active_reg = r; 6747 } 6748 6749 /* If just failed to match something this time around with a 6750 group that's operated on by a repetition operator, try to 6751 force exit from the ``loop'', and restore the register 6752 information for this group that we had before trying this 6753 last match. */ 6754 if ((!MATCHED_SOMETHING (reg_info[*p]) 6755 || just_past_start_mem == p - 1) 6756 && (p + 2) < pend) 6757 { 6758 boolean is_a_jump_n = false; 6759 6760 p1 = p + 2; 6761 mcnt = 0; 6762 switch ((re_opcode_t) *p1++) 6763 { 6764 case jump_n: 6765 is_a_jump_n = true; 6766 case pop_failure_jump: 6767 case maybe_pop_jump: 6768 case jump: 6769 case dummy_failure_jump: 6770 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 6771 if (is_a_jump_n) 6772 p1 += OFFSET_ADDRESS_SIZE; 6773 break; 6774 6775 default: 6776 /* do nothing */ ; 6777 } 6778 p1 += mcnt; 6779 6780 /* If the next operation is a jump backwards in the pattern 6781 to an on_failure_jump right before the start_memory 6782 corresponding to this stop_memory, exit from the loop 6783 by forcing a failure after pushing on the stack the 6784 on_failure_jump's jump in the pattern, and d. */ 6785 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump 6786 && (re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == start_memory 6787 && p1[2+OFFSET_ADDRESS_SIZE] == *p) 6788 { 6789 /* If this group ever matched anything, then restore 6790 what its registers were before trying this last 6791 failed match, e.g., with `(a*)*b' against `ab' for 6792 regstart[1], and, e.g., with `((a*)*(b*)*)*' 6793 against `aba' for regend[3]. 6794 6795 Also restore the registers for inner groups for, 6796 e.g., `((a*)(b*))*' against `aba' (register 3 would 6797 otherwise get trashed). 
*/ 6798 6799 if (EVER_MATCHED_SOMETHING (reg_info[*p])) 6800 { 6801 unsigned r; 6802 6803 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0; 6804 6805 /* Restore this and inner groups' (if any) registers. */ 6806 for (r = *p; r < (unsigned) *p + (unsigned) *(p + 1); 6807 r++) 6808 { 6809 regstart[r] = old_regstart[r]; 6810 6811 /* xx why this test? */ 6812 if (old_regend[r] >= regstart[r]) 6813 regend[r] = old_regend[r]; 6814 } 6815 } 6816 p1++; 6817 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 6818 PUSH_FAILURE_POINT (p1 + mcnt, d, -2); 6819 6820 goto fail; 6821 } 6822 } 6823 6824 /* Move past the register number and the inner group count. */ 6825 p += 2; 6826 NEXT; 6827 6828 6829 /* \<digit> has been turned into a `duplicate' command which is 6830 followed by the numeric value of <digit> as the register number. */ 6831 CASE (duplicate): 6832 { 6833 register const CHAR_T *d2, *dend2; 6834 int regno = *p++; /* Get which register to match against. */ 6835 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno); 6836 6837 /* Can't back reference a group which we've never matched. */ 6838 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno])) 6839 goto fail; 6840 6841 /* Where in input to try to start matching. */ 6842 d2 = regstart[regno]; 6843 6844 /* Where to stop matching; if both the place to start and 6845 the place to stop matching are in the same string, then 6846 set to the place to stop, otherwise, for now have to use 6847 the end of the first string. */ 6848 6849 dend2 = ((FIRST_STRING_P (regstart[regno]) 6850 == FIRST_STRING_P (regend[regno])) 6851 ? regend[regno] : end_match_1); 6852 for (;;) 6853 { 6854 /* If necessary, advance to next segment in register 6855 contents. */ 6856 while (d2 == dend2) 6857 { 6858 if (dend2 == end_match_2) break; 6859 if (dend2 == regend[regno]) break; 6860 6861 /* End of string1 => advance to string2. 
*/ 6862 d2 = string2; 6863 dend2 = regend[regno]; 6864 } 6865 /* At end of register contents => success */ 6866 if (d2 == dend2) break; 6867 6868 /* If necessary, advance to next segment in data. */ 6869 PREFETCH (); 6870 6871 /* How many characters left in this segment to match. */ 6872 mcnt = dend - d; 6873 6874 /* Want how many consecutive characters we can match in 6875 one shot, so, if necessary, adjust the count. */ 6876 if (mcnt > dend2 - d2) 6877 mcnt = dend2 - d2; 6878 6879 /* Compare that many; failure if mismatch, else move 6880 past them. */ 6881 if (translate 6882 ? PREFIX(bcmp_translate) (d, d2, mcnt, translate) 6883 : memcmp (d, d2, mcnt*sizeof(UCHAR_T))) 6884 goto fail; 6885 d += mcnt, d2 += mcnt; 6886 6887 /* Do this because we've match some characters. */ 6888 SET_REGS_MATCHED (); 6889 } 6890 } 6891 NEXT; 6892 6893 6894 /* begline matches the empty string at the beginning of the string 6895 (unless `not_bol' is set in `bufp'), and, if 6896 `newline_anchor' is set, after newlines. */ 6897 CASE (begline): 6898 DEBUG_PRINT1 ("EXECUTING begline.\n"); 6899 6900 if (AT_STRINGS_BEG (d)) 6901 { 6902 if (!bufp->not_bol) 6903 { 6904 NEXT; 6905 } 6906 } 6907 else if (d[-1] == '\n' && bufp->newline_anchor) 6908 { 6909 NEXT; 6910 } 6911 /* In all other cases, we fail. */ 6912 goto fail; 6913 6914 6915 /* endline is the dual of begline. */ 6916 CASE (endline): 6917 DEBUG_PRINT1 ("EXECUTING endline.\n"); 6918 6919 if (AT_STRINGS_END (d)) 6920 { 6921 if (!bufp->not_eol) 6922 { 6923 NEXT; 6924 } 6925 } 6926 6927 /* We have to ``prefetch'' the next character. */ 6928 else if ((d == end1 ? *string2 : *d) == '\n' 6929 && bufp->newline_anchor) 6930 { 6931 NEXT; 6932 } 6933 goto fail; 6934 6935 6936 /* Match at the very beginning of the data. */ 6937 CASE (begbuf): 6938 DEBUG_PRINT1 ("EXECUTING begbuf.\n"); 6939 if (AT_STRINGS_BEG (d)) 6940 { 6941 NEXT; 6942 } 6943 goto fail; 6944 6945 6946 /* Match at the very end of the data. 
*/ 6947 CASE (endbuf): 6948 DEBUG_PRINT1 ("EXECUTING endbuf.\n"); 6949 if (AT_STRINGS_END (d)) 6950 { 6951 NEXT; 6952 } 6953 goto fail; 6954 6955 6956 /* on_failure_keep_string_jump is used to optimize `.*\n'. It 6957 pushes NULL as the value for the string on the stack. Then 6958 `pop_failure_point' will keep the current value for the 6959 string, instead of restoring it. To see why, consider 6960 matching `foo\nbar' against `.*\n'. The .* matches the foo; 6961 then the . fails against the \n. But the next thing we want 6962 to do is match the \n against the \n; if we restored the 6963 string value, we would be back at the foo. 6964 6965 Because this is used only in specific cases, we don't need to 6966 check all the things that `on_failure_jump' does, to make 6967 sure the right things get saved on the stack. Hence we don't 6968 share its code. The only reason to push anything on the 6969 stack at all is that otherwise we would have to change 6970 `anychar's code to do something besides goto fail in this 6971 case; that seems worse than this. */ 6972 CASE (on_failure_keep_string_jump): 6973 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump"); 6974 6975 EXTRACT_NUMBER_AND_INCR (mcnt, p); 6976#ifdef _LIBC 6977 DEBUG_PRINT3 (" %d (to %p):\n", mcnt, p + mcnt); 6978#else 6979 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt); 6980#endif 6981 6982 PUSH_FAILURE_POINT (p + mcnt, NULL, -2); 6983 NEXT; 6984 6985 6986 /* Uses of on_failure_jump: 6987 6988 Each alternative starts with an on_failure_jump that points 6989 to the beginning of the next alternative. Each alternative 6990 except the last ends with a jump that in effect jumps past 6991 the rest of the alternatives. (They really jump to the 6992 ending jump of the following alternative, because tensioning 6993 these jumps is a hassle.) 6994 6995 Repeats start with an on_failure_jump that points past both 6996 the repetition text and either the following jump or 6997 pop_failure_jump back to this on_failure_jump. 
*/ 6998 CASE (on_failure_jump): 6999 on_failure: 7000 DEBUG_PRINT1 ("EXECUTING on_failure_jump"); 7001 7002 EXTRACT_NUMBER_AND_INCR (mcnt, p); 7003#ifdef _LIBC 7004 DEBUG_PRINT3 (" %d (to %p)", mcnt, p + mcnt); 7005#else 7006 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt); 7007#endif 7008 7009 /* If this on_failure_jump comes right before a group (i.e., 7010 the original * applied to a group), save the information 7011 for that group and all inner ones, so that if we fail back 7012 to this point, the group's information will be correct. 7013 For example, in \(a*\)*\1, we need the preceding group, 7014 and in \(zz\(a*\)b*\)\2, we need the inner group. */ 7015 7016 /* We can't use `p' to check ahead because we push 7017 a failure point to `p + mcnt' after we do this. */ 7018 p1 = p; 7019 7020 /* We need to skip no_op's before we look for the 7021 start_memory in case this on_failure_jump is happening as 7022 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1 7023 against aba. */ 7024 while (p1 < pend && (re_opcode_t) *p1 == no_op) 7025 p1++; 7026 7027 if (p1 < pend && (re_opcode_t) *p1 == start_memory) 7028 { 7029 /* We have a new highest active register now. This will 7030 get reset at the start_memory we are about to get to, 7031 but we will have saved all the registers relevant to 7032 this repetition op, as described above. */ 7033 highest_active_reg = *(p1 + 1) + *(p1 + 2); 7034 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) 7035 lowest_active_reg = *(p1 + 1); 7036 } 7037 7038 DEBUG_PRINT1 (":\n"); 7039 PUSH_FAILURE_POINT (p + mcnt, d, -2); 7040 NEXT; 7041 7042 7043 /* A smart repeat ends with `maybe_pop_jump'. 7044 We change it to either `pop_failure_jump' or `jump'. */ 7045 CASE (maybe_pop_jump): 7046 EXTRACT_NUMBER_AND_INCR (mcnt, p); 7047 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt); 7048 { 7049 register UCHAR_T *p2 = p; 7050 7051 /* Compare the beginning of the repeat with what in the 7052 pattern follows its end. 
If we can establish that there 7053 is nothing that they would both match, i.e., that we 7054 would have to backtrack because of (as in, e.g., `a*a') 7055 then we can change to pop_failure_jump, because we'll 7056 never have to backtrack. 7057 7058 This is not true in the case of alternatives: in 7059 `(a|ab)*' we do need to backtrack to the `ab' alternative 7060 (e.g., if the string was `ab'). But instead of trying to 7061 detect that here, the alternative has put on a dummy 7062 failure point which is what we will end up popping. */ 7063 7064 /* Skip over open/close-group commands. 7065 If what follows this loop is a ...+ construct, 7066 look at what begins its body, since we will have to 7067 match at least one of that. */ 7068 while (1) 7069 { 7070 if (p2 + 2 < pend 7071 && ((re_opcode_t) *p2 == stop_memory 7072 || (re_opcode_t) *p2 == start_memory)) 7073 p2 += 3; 7074 else if (p2 + 2 + 2 * OFFSET_ADDRESS_SIZE < pend 7075 && (re_opcode_t) *p2 == dummy_failure_jump) 7076 p2 += 2 + 2 * OFFSET_ADDRESS_SIZE; 7077 else 7078 break; 7079 } 7080 7081 p1 = p + mcnt; 7082 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding 7083 to the `maybe_finalize_jump' of this case. Examine what 7084 follows. */ 7085 7086 /* If we're at the end of the pattern, we can change. */ 7087 if (p2 == pend) 7088 { 7089 /* Consider what happens when matching ":\(.*\)" 7090 against ":/". I don't really understand this code 7091 yet. */ 7092 p[-(1+OFFSET_ADDRESS_SIZE)] = (UCHAR_T) 7093 pop_failure_jump; 7094 DEBUG_PRINT1 7095 (" End of pattern: change to `pop_failure_jump'.\n"); 7096 } 7097 7098 else if ((re_opcode_t) *p2 == exactn 7099#ifdef MBS_SUPPORT 7100 || (re_opcode_t) *p2 == exactn_bin 7101#endif 7102 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline)) 7103 { 7104 register UCHAR_T c 7105 = *p2 == (UCHAR_T) endline ? 
'\n' : p2[2]; 7106 7107 if (((re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == exactn 7108#ifdef MBS_SUPPORT 7109 || (re_opcode_t) p1[1+OFFSET_ADDRESS_SIZE] == exactn_bin 7110#endif 7111 ) && p1[3+OFFSET_ADDRESS_SIZE] != c) 7112 { 7113 p[-(1+OFFSET_ADDRESS_SIZE)] = (UCHAR_T) 7114 pop_failure_jump; 7115#ifdef WCHAR 7116 DEBUG_PRINT3 (" %C != %C => pop_failure_jump.\n", 7117 (wint_t) c, 7118 (wint_t) p1[3+OFFSET_ADDRESS_SIZE]); 7119#else 7120 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n", 7121 (char) c, 7122 (char) p1[3+OFFSET_ADDRESS_SIZE]); 7123#endif 7124 } 7125 7126#ifndef WCHAR 7127 else if ((re_opcode_t) p1[3] == charset 7128 || (re_opcode_t) p1[3] == charset_not) 7129 { 7130 int invert = (re_opcode_t) p1[3] == charset_not; 7131 7132 if (c < (unsigned) (p1[4] * BYTEWIDTH) 7133 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) 7134 invert = !invert; 7135 7136 /* `not' is equal to 1 if c would match, which means 7137 that we can't change to pop_failure_jump. */ 7138 if (!invert) 7139 { 7140 p[-3] = (unsigned char) pop_failure_jump; 7141 DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); 7142 } 7143 } 7144#endif /* not WCHAR */ 7145 } 7146#ifndef WCHAR 7147 else if ((re_opcode_t) *p2 == charset) 7148 { 7149 /* We win if the first character of the loop is not part 7150 of the charset. */ 7151 if ((re_opcode_t) p1[3] == exactn 7152 && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5] 7153 && (p2[2 + p1[5] / BYTEWIDTH] 7154 & (1 << (p1[5] % BYTEWIDTH))))) 7155 { 7156 p[-3] = (unsigned char) pop_failure_jump; 7157 DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); 7158 } 7159 7160 else if ((re_opcode_t) p1[3] == charset_not) 7161 { 7162 int idx; 7163 /* We win if the charset_not inside the loop 7164 lists every character listed in the charset after. */ 7165 for (idx = 0; idx < (int) p2[1]; idx++) 7166 if (! 
(p2[2 + idx] == 0 7167 || (idx < (int) p1[4] 7168 && ((p2[2 + idx] & ~ p1[5 + idx]) == 0)))) 7169 break; 7170 7171 if (idx == p2[1]) 7172 { 7173 p[-3] = (unsigned char) pop_failure_jump; 7174 DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); 7175 } 7176 } 7177 else if ((re_opcode_t) p1[3] == charset) 7178 { 7179 int idx; 7180 /* We win if the charset inside the loop 7181 has no overlap with the one after the loop. */ 7182 for (idx = 0; 7183 idx < (int) p2[1] && idx < (int) p1[4]; 7184 idx++) 7185 if ((p2[2 + idx] & p1[5 + idx]) != 0) 7186 break; 7187 7188 if (idx == p2[1] || idx == p1[4]) 7189 { 7190 p[-3] = (unsigned char) pop_failure_jump; 7191 DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); 7192 } 7193 } 7194 } 7195#endif /* not WCHAR */ 7196 } 7197 p -= OFFSET_ADDRESS_SIZE; /* Point at relative address again. */ 7198 if ((re_opcode_t) p[-1] != pop_failure_jump) 7199 { 7200 p[-1] = (UCHAR_T) jump; 7201 DEBUG_PRINT1 (" Match => jump.\n"); 7202 goto unconditional_jump; 7203 } 7204 /* Note fall through. */ 7205 7206 7207 /* The end of a simple repeat has a pop_failure_jump back to 7208 its matching on_failure_jump, where the latter will push a 7209 failure point. The pop_failure_jump takes off failure 7210 points put on by this pop_failure_jump's matching 7211 on_failure_jump; we got through the pattern to here from the 7212 matching on_failure_jump, so didn't fail. */ 7213 CASE (pop_failure_jump): 7214 { 7215 /* We need to pass separate storage for the lowest and 7216 highest registers, even though we don't care about the 7217 actual values. Otherwise, we will restore only one 7218 register from the stack, since lowest will == highest in 7219 `pop_failure_point'. 
*/ 7220 active_reg_t dummy_low_reg, dummy_high_reg; 7221 UCHAR_T *pdummy = NULL; 7222 const CHAR_T *sdummy = NULL; 7223 7224 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n"); 7225 POP_FAILURE_POINT (sdummy, pdummy, 7226 dummy_low_reg, dummy_high_reg, 7227 reg_dummy, reg_dummy, reg_info_dummy); 7228 } 7229 /* Note fall through. */ 7230 7231 unconditional_jump: 7232#ifdef _LIBC 7233 DEBUG_PRINT2 ("\n%p: ", p); 7234#else 7235 DEBUG_PRINT2 ("\n0x%x: ", p); 7236#endif 7237 /* Note fall through. */ 7238 7239 /* Unconditionally jump (without popping any failure points). */ 7240 CASE (jump): 7241 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */ 7242 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt); 7243 p += mcnt; /* Do the jump. */ 7244#ifdef _LIBC 7245 DEBUG_PRINT2 ("(to %p).\n", p); 7246#else 7247 DEBUG_PRINT2 ("(to 0x%x).\n", p); 7248#endif 7249 NEXT; 7250 7251 7252 /* We need this opcode so we can detect where alternatives end 7253 in `group_match_null_string_p' et al. */ 7254 CASE (jump_past_alt): 7255 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n"); 7256 goto unconditional_jump; 7257 7258 7259 /* Normally, the on_failure_jump pushes a failure point, which 7260 then gets popped at pop_failure_jump. We will end up at 7261 pop_failure_jump, also, and with a pattern of, say, `a+', we 7262 are skipping over the on_failure_jump, so we have to push 7263 something meaningless for pop_failure_jump to pop. */ 7264 CASE (dummy_failure_jump): 7265 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n"); 7266 /* It doesn't matter what we push for the string here. What 7267 the code at `fail' tests is the value for the pattern. */ 7268 PUSH_FAILURE_POINT (NULL, NULL, -2); 7269 goto unconditional_jump; 7270 7271 7272 /* At the end of an alternative, we need to push a dummy failure 7273 point in case we are followed by a `pop_failure_jump', because 7274 we don't want the failure point for the alternative to be 7275 popped. 
For example, matching `(a|ab)*' against `aab' 7276 requires that we match the `ab' alternative. */ 7277 CASE (push_dummy_failure): 7278 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n"); 7279 /* See comments just above at `dummy_failure_jump' about the 7280 two zeroes. */ 7281 PUSH_FAILURE_POINT (NULL, NULL, -2); 7282 NEXT; 7283 7284 /* Have to succeed matching what follows at least n times. 7285 After that, handle like `on_failure_jump'. */ 7286 CASE (succeed_n): 7287 EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE); 7288 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt); 7289 7290 assert (mcnt >= 0); 7291 /* Originally, this is how many times we HAVE to succeed. */ 7292 if (mcnt > 0) 7293 { 7294 mcnt--; 7295 p += OFFSET_ADDRESS_SIZE; 7296 STORE_NUMBER_AND_INCR (p, mcnt); 7297#ifdef _LIBC 7298 DEBUG_PRINT3 (" Setting %p to %d.\n", p - OFFSET_ADDRESS_SIZE 7299 , mcnt); 7300#else 7301 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p - OFFSET_ADDRESS_SIZE 7302 , mcnt); 7303#endif 7304 } 7305 else if (mcnt == 0) 7306 { 7307#ifdef _LIBC 7308 DEBUG_PRINT2 (" Setting two bytes from %p to no_op.\n", 7309 p + OFFSET_ADDRESS_SIZE); 7310#else 7311 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", 7312 p + OFFSET_ADDRESS_SIZE); 7313#endif /* _LIBC */ 7314 7315#ifdef WCHAR 7316 p[1] = (UCHAR_T) no_op; 7317#else 7318 p[2] = (UCHAR_T) no_op; 7319 p[3] = (UCHAR_T) no_op; 7320#endif /* WCHAR */ 7321 goto on_failure; 7322 } 7323 NEXT; 7324 7325 CASE (jump_n): 7326 EXTRACT_NUMBER (mcnt, p + OFFSET_ADDRESS_SIZE); 7327 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt); 7328 7329 /* Originally, this is how many times we CAN jump. 
*/ 7330 if (mcnt) 7331 { 7332 mcnt--; 7333 STORE_NUMBER (p + OFFSET_ADDRESS_SIZE, mcnt); 7334 7335#ifdef _LIBC 7336 DEBUG_PRINT3 (" Setting %p to %d.\n", p + OFFSET_ADDRESS_SIZE, 7337 mcnt); 7338#else 7339 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p + OFFSET_ADDRESS_SIZE, 7340 mcnt); 7341#endif /* _LIBC */ 7342 goto unconditional_jump; 7343 } 7344 /* If don't have to jump any more, skip over the rest of command. */ 7345 else 7346 p += 2 * OFFSET_ADDRESS_SIZE; 7347 NEXT; 7348 7349 CASE (set_number_at): 7350 { 7351 DEBUG_PRINT1 ("EXECUTING set_number_at.\n"); 7352 7353 EXTRACT_NUMBER_AND_INCR (mcnt, p); 7354 p1 = p + mcnt; 7355 EXTRACT_NUMBER_AND_INCR (mcnt, p); 7356#ifdef _LIBC 7357 DEBUG_PRINT3 (" Setting %p to %d.\n", p1, mcnt); 7358#else 7359 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt); 7360#endif 7361 STORE_NUMBER (p1, mcnt); 7362 NEXT; 7363 } 7364 7365#if 0 7366 /* The DEC Alpha C compiler 3.x generates incorrect code for the 7367 test WORDCHAR_P (d - 1) != WORDCHAR_P (d) in the expansion of 7368 AT_WORD_BOUNDARY, so this code is disabled. Expanding the 7369 macro and introducing temporary variables works around the bug. 
*/ 7370 7371 CASE (wordbound): 7372 DEBUG_PRINT1 ("EXECUTING wordbound.\n"); 7373 if (AT_WORD_BOUNDARY (d)) 7374 { 7375 NEXT; 7376 } 7377 goto fail; 7378 7379 CASE (notwordbound): 7380 DEBUG_PRINT1 ("EXECUTING notwordbound.\n"); 7381 if (AT_WORD_BOUNDARY (d)) 7382 goto fail; 7383 NEXT; 7384#else 7385 CASE (wordbound): 7386 { 7387 boolean prevchar, thischar; 7388 7389 DEBUG_PRINT1 ("EXECUTING wordbound.\n"); 7390 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d)) 7391 { 7392 NEXT; 7393 } 7394 7395 prevchar = WORDCHAR_P (d - 1); 7396 thischar = WORDCHAR_P (d); 7397 if (prevchar != thischar) 7398 { 7399 NEXT; 7400 } 7401 goto fail; 7402 } 7403 7404 CASE (notwordbound): 7405 { 7406 boolean prevchar, thischar; 7407 7408 DEBUG_PRINT1 ("EXECUTING notwordbound.\n"); 7409 if (AT_STRINGS_BEG (d) || AT_STRINGS_END (d)) 7410 goto fail; 7411 7412 prevchar = WORDCHAR_P (d - 1); 7413 thischar = WORDCHAR_P (d); 7414 if (prevchar != thischar) 7415 goto fail; 7416 NEXT; 7417 } 7418#endif 7419 7420 CASE (wordbeg): 7421 DEBUG_PRINT1 ("EXECUTING wordbeg.\n"); 7422 if (!AT_STRINGS_END (d) && WORDCHAR_P (d) 7423 && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1))) 7424 { 7425 NEXT; 7426 } 7427 goto fail; 7428 7429 CASE (wordend): 7430 DEBUG_PRINT1 ("EXECUTING wordend.\n"); 7431 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1) 7432 && (AT_STRINGS_END (d) || !WORDCHAR_P (d))) 7433 { 7434 NEXT; 7435 } 7436 goto fail; 7437 7438#ifdef emacs 7439 CASE (before_dot): 7440 DEBUG_PRINT1 ("EXECUTING before_dot.\n"); 7441 if (PTR_CHAR_POS ((unsigned char *) d) >= point) 7442 goto fail; 7443 NEXT; 7444 7445 CASE (at_dot): 7446 DEBUG_PRINT1 ("EXECUTING at_dot.\n"); 7447 if (PTR_CHAR_POS ((unsigned char *) d) != point) 7448 goto fail; 7449 NEXT; 7450 7451 CASE (after_dot): 7452 DEBUG_PRINT1 ("EXECUTING after_dot.\n"); 7453 if (PTR_CHAR_POS ((unsigned char *) d) <= point) 7454 goto fail; 7455 NEXT; 7456 7457 CASE (syntaxspec): 7458 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt); 7459 mcnt = *p++; 7460 goto 
matchsyntax; 7461 7462 CASE (wordchar): 7463 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n"); 7464 mcnt = (int) Sword; 7465 matchsyntax: 7466 PREFETCH (); 7467 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */ 7468 d++; 7469 if (SYNTAX (d[-1]) != (enum syntaxcode) mcnt) 7470 goto fail; 7471 SET_REGS_MATCHED (); 7472 NEXT; 7473 7474 CASE (notsyntaxspec): 7475 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt); 7476 mcnt = *p++; 7477 goto matchnotsyntax; 7478 7479 CASE (notwordchar): 7480 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n"); 7481 mcnt = (int) Sword; 7482 matchnotsyntax: 7483 PREFETCH (); 7484 /* Can't use *d++ here; SYNTAX may be an unsafe macro. */ 7485 d++; 7486 if (SYNTAX (d[-1]) == (enum syntaxcode) mcnt) 7487 goto fail; 7488 SET_REGS_MATCHED (); 7489 NEXT; 7490 7491#else /* not emacs */ 7492 CASE (wordchar): 7493 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n"); 7494 PREFETCH (); 7495 if (!WORDCHAR_P (d)) 7496 goto fail; 7497 SET_REGS_MATCHED (); 7498 d++; 7499 NEXT; 7500 7501 CASE (notwordchar): 7502 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n"); 7503 PREFETCH (); 7504 if (WORDCHAR_P (d)) 7505 goto fail; 7506 SET_REGS_MATCHED (); 7507 d++; 7508 NEXT; 7509#endif /* not emacs */ 7510 7511#ifndef __GNUC__ 7512 default: 7513 abort (); 7514 } 7515 continue; /* Successfully executed one pattern command; keep going. */ 7516#endif 7517 7518 7519 /* We goto here if a matching operation fails. */ 7520 fail: 7521 if (!FAIL_STACK_EMPTY ()) 7522 { /* A restart point is known. Restore to that state. */ 7523 DEBUG_PRINT1 ("\nFAIL:\n"); 7524 POP_FAILURE_POINT (d, p, 7525 lowest_active_reg, highest_active_reg, 7526 regstart, regend, reg_info); 7527 7528 /* If this failure point is a dummy, try the next one. */ 7529 if (!p) 7530 goto fail; 7531 7532 /* If we failed to the end of the pattern, don't examine *p. 
*/ 7533 assert (p <= pend); 7534 if (p < pend) 7535 { 7536 boolean is_a_jump_n = false; 7537 7538 /* If failed to a backwards jump that's part of a repetition 7539 loop, need to pop this failure point and use the next one. */ 7540 switch ((re_opcode_t) *p) 7541 { 7542 case jump_n: 7543 is_a_jump_n = true; 7544 case maybe_pop_jump: 7545 case pop_failure_jump: 7546 case jump: 7547 p1 = p + 1; 7548 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 7549 p1 += mcnt; 7550 7551 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n) 7552 || (!is_a_jump_n 7553 && (re_opcode_t) *p1 == on_failure_jump)) 7554 goto fail; 7555 break; 7556 default: 7557 /* do nothing */ ; 7558 } 7559 } 7560 7561 if (d >= string1 && d <= end1) 7562 dend = end_match_1; 7563 } 7564 else 7565 break; /* Matching at this starting point really fails. */ 7566 } /* for (;;) */ 7567 7568 if (best_regs_set) 7569 goto restore_best_regs; 7570 7571 FREE_VARIABLES (); 7572 7573 return -1; /* Failure to match. */ 7574} /* re_match_2 */ 7575 7576/* Subroutine definitions for re_match_2. */ 7577 7578 7579/* We are passed P pointing to a register number after a start_memory. 7580 7581 Return true if the pattern up to the corresponding stop_memory can 7582 match the empty string, and false otherwise. 7583 7584 If we find the matching stop_memory, sets P to point to one past its number. 7585 Otherwise, sets P to an undefined byte less than or equal to END. 7586 7587 We don't handle duplicates properly (yet). */ 7588 7589static boolean 7590PREFIX(group_match_null_string_p) (UCHAR_T **p, UCHAR_T *end, 7591 PREFIX(register_info_type) *reg_info) 7592{ 7593 int mcnt; 7594 /* Point to after the args to the start_memory. */ 7595 UCHAR_T *p1 = *p + 2; 7596 7597 while (p1 < end) 7598 { 7599 /* Skip over opcodes that can match nothing, and return true or 7600 false, as appropriate, when we get to one that can't, or to the 7601 matching stop_memory. 
*/ 7602 7603 switch ((re_opcode_t) *p1) 7604 { 7605 /* Could be either a loop or a series of alternatives. */ 7606 case on_failure_jump: 7607 p1++; 7608 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 7609 7610 /* If the next operation is not a jump backwards in the 7611 pattern. */ 7612 7613 if (mcnt >= 0) 7614 { 7615 /* Go through the on_failure_jumps of the alternatives, 7616 seeing if any of the alternatives cannot match nothing. 7617 The last alternative starts with only a jump, 7618 whereas the rest start with on_failure_jump and end 7619 with a jump, e.g., here is the pattern for `a|b|c': 7620 7621 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6 7622 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3 7623 /exactn/1/c 7624 7625 So, we have to first go through the first (n-1) 7626 alternatives and then deal with the last one separately. */ 7627 7628 7629 /* Deal with the first (n-1) alternatives, which start 7630 with an on_failure_jump (see above) that jumps to right 7631 past a jump_past_alt. */ 7632 7633 while ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] == 7634 jump_past_alt) 7635 { 7636 /* `mcnt' holds how many bytes long the alternative 7637 is, including the ending `jump_past_alt' and 7638 its number. */ 7639 7640 if (!PREFIX(alt_match_null_string_p) (p1, p1 + mcnt - 7641 (1 + OFFSET_ADDRESS_SIZE), 7642 reg_info)) 7643 return false; 7644 7645 /* Move to right after this alternative, including the 7646 jump_past_alt. */ 7647 p1 += mcnt; 7648 7649 /* Break if it's the beginning of an n-th alternative 7650 that doesn't begin with an on_failure_jump. */ 7651 if ((re_opcode_t) *p1 != on_failure_jump) 7652 break; 7653 7654 /* Still have to check that it's not an n-th 7655 alternative that starts with an on_failure_jump. */ 7656 p1++; 7657 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 7658 if ((re_opcode_t) p1[mcnt-(1+OFFSET_ADDRESS_SIZE)] != 7659 jump_past_alt) 7660 { 7661 /* Get to the beginning of the n-th alternative. 
*/ 7662 p1 -= 1 + OFFSET_ADDRESS_SIZE; 7663 break; 7664 } 7665 } 7666 7667 /* Deal with the last alternative: go back and get number 7668 of the `jump_past_alt' just before it. `mcnt' contains 7669 the length of the alternative. */ 7670 EXTRACT_NUMBER (mcnt, p1 - OFFSET_ADDRESS_SIZE); 7671 7672 if (!PREFIX(alt_match_null_string_p) (p1, p1 + mcnt, reg_info)) 7673 return false; 7674 7675 p1 += mcnt; /* Get past the n-th alternative. */ 7676 } /* if mcnt > 0 */ 7677 break; 7678 7679 7680 case stop_memory: 7681 assert (p1[1] == **p); 7682 *p = p1 + 2; 7683 return true; 7684 7685 7686 default: 7687 if (!PREFIX(common_op_match_null_string_p) (&p1, end, reg_info)) 7688 return false; 7689 } 7690 } /* while p1 < end */ 7691 7692 return false; 7693} /* group_match_null_string_p */ 7694 7695 7696/* Similar to group_match_null_string_p, but doesn't deal with alternatives: 7697 It expects P to be the first byte of a single alternative and END one 7698 byte past the last. The alternative can contain groups. */ 7699 7700static boolean 7701PREFIX(alt_match_null_string_p) (UCHAR_T *p, UCHAR_T *end, 7702 PREFIX(register_info_type) *reg_info) 7703{ 7704 int mcnt; 7705 UCHAR_T *p1 = p; 7706 7707 while (p1 < end) 7708 { 7709 /* Skip over opcodes that can match nothing, and break when we get 7710 to one that can't. */ 7711 7712 switch ((re_opcode_t) *p1) 7713 { 7714 /* It's a loop. */ 7715 case on_failure_jump: 7716 p1++; 7717 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 7718 p1 += mcnt; 7719 break; 7720 7721 default: 7722 if (!PREFIX(common_op_match_null_string_p) (&p1, end, reg_info)) 7723 return false; 7724 } 7725 } /* while p1 < end */ 7726 7727 return true; 7728} /* alt_match_null_string_p */ 7729 7730 7731/* Deals with the ops common to group_match_null_string_p and 7732 alt_match_null_string_p. 7733 7734 Sets P to one after the op and its arguments, if any. 
*/ 7735 7736static boolean 7737PREFIX(common_op_match_null_string_p) (UCHAR_T **p, UCHAR_T *end, 7738 PREFIX(register_info_type) *reg_info) 7739{ 7740 int mcnt; 7741 boolean ret; 7742 int reg_no; 7743 UCHAR_T *p1 = *p; 7744 7745 switch ((re_opcode_t) *p1++) 7746 { 7747 case no_op: 7748 case begline: 7749 case endline: 7750 case begbuf: 7751 case endbuf: 7752 case wordbeg: 7753 case wordend: 7754 case wordbound: 7755 case notwordbound: 7756#ifdef emacs 7757 case before_dot: 7758 case at_dot: 7759 case after_dot: 7760#endif 7761 break; 7762 7763 case start_memory: 7764 reg_no = *p1; 7765 assert (reg_no > 0 && reg_no <= MAX_REGNUM); 7766 ret = PREFIX(group_match_null_string_p) (&p1, end, reg_info); 7767 7768 /* Have to set this here in case we're checking a group which 7769 contains a group and a back reference to it. */ 7770 7771 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE) 7772 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret; 7773 7774 if (!ret) 7775 return false; 7776 break; 7777 7778 /* If this is an optimized succeed_n for zero times, make the jump. */ 7779 case jump: 7780 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 7781 if (mcnt >= 0) 7782 p1 += mcnt; 7783 else 7784 return false; 7785 break; 7786 7787 case succeed_n: 7788 /* Get to the number of times to succeed. */ 7789 p1 += OFFSET_ADDRESS_SIZE; 7790 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 7791 7792 if (mcnt == 0) 7793 { 7794 p1 -= 2 * OFFSET_ADDRESS_SIZE; 7795 EXTRACT_NUMBER_AND_INCR (mcnt, p1); 7796 p1 += mcnt; 7797 } 7798 else 7799 return false; 7800 break; 7801 7802 case duplicate: 7803 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1])) 7804 return false; 7805 break; 7806 7807 case set_number_at: 7808 p1 += 2 * OFFSET_ADDRESS_SIZE; 7809 7810 default: 7811 /* All other opcodes mean we cannot match the empty string. 
*/ 7812 return false; 7813 } 7814 7815 *p = p1; 7816 return true; 7817} /* common_op_match_null_string_p */ 7818 7819 7820/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN 7821 bytes; nonzero otherwise. */ 7822 7823static int 7824PREFIX(bcmp_translate) (const CHAR_T *s1, const CHAR_T *s2, 7825 register int len, 7826 RE_TRANSLATE_TYPE translate) 7827{ 7828 register const UCHAR_T *p1 = (const UCHAR_T *) s1; 7829 register const UCHAR_T *p2 = (const UCHAR_T *) s2; 7830 while (len) 7831 { 7832#ifdef WCHAR 7833 if (((*p1<=0xff)?translate[*p1++]:*p1++) 7834 != ((*p2<=0xff)?translate[*p2++]:*p2++)) 7835 return 1; 7836#else /* BYTE */ 7837 if (translate[*p1++] != translate[*p2++]) return 1; 7838#endif /* WCHAR */ 7839 len--; 7840 } 7841 return 0; 7842} 7843 7844 7845#else /* not INSIDE_RECURSION */ 7846 7847/* Entry points for GNU code. */ 7848 7849/* re_compile_pattern is the GNU regular expression compiler: it 7850 compiles PATTERN (of length SIZE) and puts the result in BUFP. 7851 Returns 0 if the pattern was valid, otherwise an error string. 7852 7853 Assumes the `allocated' (and perhaps `buffer') and `translate' fields 7854 are set in BUFP on entry. 7855 7856 We call regex_compile to do the actual compilation. */ 7857 7858const char * 7859re_compile_pattern (const char *pattern, 7860 size_t length, 7861 struct re_pattern_buffer *bufp) 7862{ 7863 reg_errcode_t ret; 7864 7865 /* GNU code is written to assume at least RE_NREGS registers will be set 7866 (and at least one extra will be -1). */ 7867 bufp->regs_allocated = REGS_UNALLOCATED; 7868 7869 /* And GNU code determines whether or not to get register information 7870 by passing null for the REGS argument to re_match, etc., not by 7871 setting no_sub. */ 7872 bufp->no_sub = 0; 7873 7874 /* Match anchors at newline. 
*/ 7875 bufp->newline_anchor = 1; 7876 7877# ifdef MBS_SUPPORT 7878 if (MB_CUR_MAX != 1) 7879 ret = wcs_regex_compile (pattern, length, re_syntax_options, bufp); 7880 else 7881# endif 7882 ret = byte_regex_compile (pattern, length, re_syntax_options, bufp); 7883 7884 if (!ret) 7885 return NULL; 7886 return gettext (re_error_msgid + re_error_msgid_idx[(int) ret]); 7887} 7888#ifdef _LIBC 7889weak_alias (__re_compile_pattern, re_compile_pattern) 7890#endif 7891 7892/* Entry points compatible with 4.2 BSD regex library. We don't define 7893 them unless specifically requested. */ 7894 7895#if defined _REGEX_RE_COMP || defined _LIBC 7896 7897/* BSD has one and only one pattern buffer. */ 7898static struct re_pattern_buffer re_comp_buf; 7899 7900char * 7901#ifdef _LIBC 7902/* Make these definitions weak in libc, so POSIX programs can redefine 7903 these names if they don't use our functions, and still use 7904 regcomp/regexec below without link errors. */ 7905weak_function 7906#endif 7907re_comp (const char *s) 7908{ 7909 reg_errcode_t ret; 7910 7911 if (!s) 7912 { 7913 if (!re_comp_buf.buffer) 7914 return (char *) gettext ("No previous regular expression"); 7915 return 0; 7916 } 7917 7918 if (!re_comp_buf.buffer) 7919 { 7920 re_comp_buf.buffer = malloc (200); 7921 if (re_comp_buf.buffer == NULL) 7922 return (char *) gettext (re_error_msgid 7923 + re_error_msgid_idx[(int) REG_ESPACE]); 7924 re_comp_buf.allocated = 200; 7925 7926 re_comp_buf.fastmap = malloc (1 << BYTEWIDTH); 7927 if (re_comp_buf.fastmap == NULL) 7928 return (char *) gettext (re_error_msgid 7929 + re_error_msgid_idx[(int) REG_ESPACE]); 7930 } 7931 7932 /* Since `re_exec' always passes NULL for the `regs' argument, we 7933 don't need to initialize the pattern buffer fields which affect it. */ 7934 7935 /* Match anchors at newlines. 
*/ 7936 re_comp_buf.newline_anchor = 1; 7937 7938# ifdef MBS_SUPPORT 7939 if (MB_CUR_MAX != 1) 7940 ret = wcs_regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf); 7941 else 7942# endif 7943 ret = byte_regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf); 7944 7945 if (!ret) 7946 return NULL; 7947 7948 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */ 7949 return (char *) gettext (re_error_msgid + re_error_msgid_idx[(int) ret]); 7950} 7951 7952 7953int 7954#ifdef _LIBC 7955weak_function 7956#endif 7957re_exec (const char *s) 7958{ 7959 const int len = strlen (s); 7960 return 7961 0 <= re_search (&re_comp_buf, s, len, 0, len, 0); 7962} 7963 7964#endif /* _REGEX_RE_COMP */ 7965 7966/* POSIX.2 functions. Don't define these for Emacs. */ 7967 7968#ifndef emacs 7969 7970/* regcomp takes a regular expression as a string and compiles it. 7971 7972 PREG is a regex_t *. We do not expect any fields to be initialized, 7973 since POSIX says we shouldn't. Thus, we set 7974 7975 `buffer' to the compiled pattern; 7976 `used' to the length of the compiled pattern; 7977 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the 7978 REG_EXTENDED bit in CFLAGS is set; otherwise, to 7979 RE_SYNTAX_POSIX_BASIC; 7980 `newline_anchor' to REG_NEWLINE being set in CFLAGS; 7981 `fastmap' to an allocated space for the fastmap; 7982 `fastmap_accurate' to zero; 7983 `re_nsub' to the number of subexpressions in PATTERN. 7984 7985 PATTERN is the address of the pattern string. 7986 7987 CFLAGS is a series of bits which affect compilation. 7988 7989 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we 7990 use POSIX basic syntax. 7991 7992 If REG_NEWLINE is set, then . and [^...] don't match newline. 7993 Also, regexec will try a match beginning after every newline. 7994 7995 If REG_ICASE is set, then we considers upper- and lowercase 7996 versions of letters to be equivalent when matching. 
7997 7998 If REG_NOSUB is set, then when PREG is passed to regexec, that 7999 routine will report only success or failure, and nothing about the 8000 registers. 8001 8002 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for 8003 the return codes and their meanings.) */ 8004 8005int 8006regcomp (regex_t *preg, const char *pattern, int cflags) 8007{ 8008 reg_errcode_t ret; 8009 reg_syntax_t syntax 8010 = (cflags & REG_EXTENDED) ? 8011 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC; 8012 8013 /* regex_compile will allocate the space for the compiled pattern. */ 8014 preg->buffer = 0; 8015 preg->allocated = 0; 8016 preg->used = 0; 8017 8018 /* Try to allocate space for the fastmap. */ 8019 preg->fastmap = (char *) malloc (1 << BYTEWIDTH); 8020 8021 if (cflags & REG_ICASE) 8022 { 8023 unsigned i; 8024 8025 preg->translate = 8026 (RE_TRANSLATE_TYPE) 8027 malloc (CHAR_SET_SIZE * sizeof (*(RE_TRANSLATE_TYPE)0)); 8028 if (preg->translate == NULL) 8029 return (int) REG_ESPACE; 8030 8031 /* Map uppercase characters to corresponding lowercase ones. */ 8032 for (i = 0; i < CHAR_SET_SIZE; i++) 8033 preg->translate[i] = ISUPPER (i) ? TOLOWER (i) : i; 8034 } 8035 else 8036 preg->translate = NULL; 8037 8038 /* If REG_NEWLINE is set, newlines are treated differently. */ 8039 if (cflags & REG_NEWLINE) 8040 { /* REG_NEWLINE implies neither . nor [^...] match newline. */ 8041 syntax &= ~RE_DOT_NEWLINE; 8042 syntax |= RE_HAT_LISTS_NOT_NEWLINE; 8043 /* It also changes the matching behavior. */ 8044 preg->newline_anchor = 1; 8045 } 8046 else 8047 preg->newline_anchor = 0; 8048 8049 preg->no_sub = !!(cflags & REG_NOSUB); 8050 8051 /* POSIX says a null character in the pattern terminates it, so we 8052 can use strlen here in compiling the pattern. 
*/ 8053# ifdef MBS_SUPPORT 8054 if (MB_CUR_MAX != 1) 8055 ret = wcs_regex_compile (pattern, strlen (pattern), syntax, preg); 8056 else 8057# endif 8058 ret = byte_regex_compile (pattern, strlen (pattern), syntax, preg); 8059 8060 /* POSIX doesn't distinguish between an unmatched open-group and an 8061 unmatched close-group: both are REG_EPAREN. */ 8062 if (ret == REG_ERPAREN) ret = REG_EPAREN; 8063 8064 if (ret == REG_NOERROR && preg->fastmap) 8065 { 8066 /* Compute the fastmap now, since regexec cannot modify the pattern 8067 buffer. */ 8068 if (re_compile_fastmap (preg) == -2) 8069 { 8070 /* Some error occurred while computing the fastmap, just forget 8071 about it. */ 8072 free (preg->fastmap); 8073 preg->fastmap = NULL; 8074 } 8075 } 8076 8077 return (int) ret; 8078} 8079#ifdef _LIBC 8080weak_alias (__regcomp, regcomp) 8081#endif 8082 8083 8084/* regexec searches for a given pattern, specified by PREG, in the 8085 string STRING. 8086 8087 If NMATCH is zero or REG_NOSUB was set in the cflags argument to 8088 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at 8089 least NMATCH elements, and we set them to the offsets of the 8090 corresponding matched substrings. 8091 8092 EFLAGS specifies `execution flags' which affect matching: if 8093 REG_NOTBOL is set, then ^ does not match at the beginning of the 8094 string; if REG_NOTEOL is set, then $ does not match at the end. 8095 8096 We return 0 if we find a match and REG_NOMATCH if not. 
*/ 8097 8098int 8099regexec (const regex_t *preg, const char *string, 8100 size_t nmatch, regmatch_t pmatch[], int eflags) 8101{ 8102 int ret; 8103 struct re_registers regs; 8104 regex_t private_preg; 8105 int len = strlen (string); 8106 boolean want_reg_info = !preg->no_sub && nmatch > 0; 8107 8108 private_preg = *preg; 8109 8110 private_preg.not_bol = !!(eflags & REG_NOTBOL); 8111 private_preg.not_eol = !!(eflags & REG_NOTEOL); 8112 8113 /* The user has told us exactly how many registers to return 8114 information about, via `nmatch'. We have to pass that on to the 8115 matching routines. */ 8116 private_preg.regs_allocated = REGS_FIXED; 8117 8118 if (want_reg_info) 8119 { 8120 regs.num_regs = nmatch; 8121 regs.start = TALLOC (nmatch * 2, regoff_t); 8122 if (regs.start == NULL) 8123 return (int) REG_NOMATCH; 8124 regs.end = regs.start + nmatch; 8125 } 8126 8127 /* Perform the searching operation. */ 8128 ret = re_search (&private_preg, string, len, 8129 /* start: */ 0, /* range: */ len, 8130 want_reg_info ? ®s : 0); 8131 8132 /* Copy the register information to the POSIX structure. */ 8133 if (want_reg_info) 8134 { 8135 if (ret >= 0) 8136 { 8137 unsigned r; 8138 8139 for (r = 0; r < nmatch; r++) 8140 { 8141 pmatch[r].rm_so = regs.start[r]; 8142 pmatch[r].rm_eo = regs.end[r]; 8143 } 8144 } 8145 8146 /* If we needed the temporary register info, free the space now. */ 8147 free (regs.start); 8148 } 8149 8150 /* We want zero return to mean success, unlike `re_search'. */ 8151 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH; 8152} 8153#ifdef _LIBC 8154weak_alias (__regexec, regexec) 8155#endif 8156 8157 8158/* Returns a message corresponding to an error code, ERRCODE, returned 8159 from either regcomp or regexec. We don't use PREG here. 
*/ 8160 8161size_t 8162regerror (int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size) 8163{ 8164 const char *msg; 8165 size_t msg_size; 8166 8167 if (errcode < 0 8168 || errcode >= (int) (sizeof (re_error_msgid_idx) 8169 / sizeof (re_error_msgid_idx[0]))) 8170 /* Only error codes returned by the rest of the code should be passed 8171 to this routine. If we are given anything else, or if other regex 8172 code generates an invalid error code, then the program has a bug. 8173 Dump core so we can fix it. */ 8174 abort (); 8175 8176 msg = gettext (re_error_msgid + re_error_msgid_idx[errcode]); 8177 8178 msg_size = strlen (msg) + 1; /* Includes the null. */ 8179 8180 if (errbuf_size != 0) 8181 { 8182 if (msg_size > errbuf_size) 8183 { 8184#if defined HAVE_MEMPCPY || defined _LIBC 8185 *((char *) mempcpy (errbuf, msg, errbuf_size - 1)) = '\0'; 8186#else 8187 memcpy (errbuf, msg, errbuf_size - 1); 8188 errbuf[errbuf_size - 1] = 0; 8189#endif 8190 } 8191 else 8192 memcpy (errbuf, msg, msg_size); 8193 } 8194 8195 return msg_size; 8196} 8197#ifdef _LIBC 8198weak_alias (__regerror, regerror) 8199#endif 8200 8201 8202/* Free dynamically allocated space used by PREG. 
*/ 8203 8204void 8205regfree (regex_t *preg) 8206{ 8207 if (preg->buffer != NULL) 8208 free (preg->buffer); 8209 preg->buffer = NULL; 8210 8211 preg->allocated = 0; 8212 preg->used = 0; 8213 8214 if (preg->fastmap != NULL) 8215 free (preg->fastmap); 8216 preg->fastmap = NULL; 8217 preg->fastmap_accurate = 0; 8218 8219 if (preg->translate != NULL) 8220 free (preg->translate); 8221 preg->translate = NULL; 8222} 8223#ifdef _LIBC 8224weak_alias (__regfree, regfree) 8225#endif 8226 8227#endif /* not emacs */ 8228 8229#endif /* not INSIDE_RECURSION */ 8230 8231 8232#undef STORE_NUMBER 8233#undef STORE_NUMBER_AND_INCR 8234#undef EXTRACT_NUMBER 8235#undef EXTRACT_NUMBER_AND_INCR 8236 8237#undef DEBUG_PRINT_COMPILED_PATTERN 8238#undef DEBUG_PRINT_DOUBLE_STRING 8239 8240#undef INIT_FAIL_STACK 8241#undef RESET_FAIL_STACK 8242#undef DOUBLE_FAIL_STACK 8243#undef PUSH_PATTERN_OP 8244#undef PUSH_FAILURE_POINTER 8245#undef PUSH_FAILURE_INT 8246#undef PUSH_FAILURE_ELT 8247#undef POP_FAILURE_POINTER 8248#undef POP_FAILURE_INT 8249#undef POP_FAILURE_ELT 8250#undef DEBUG_PUSH 8251#undef DEBUG_POP 8252#undef PUSH_FAILURE_POINT 8253#undef POP_FAILURE_POINT 8254 8255#undef REG_UNSET_VALUE 8256#undef REG_UNSET 8257 8258#undef PATFETCH 8259#undef PATFETCH_RAW 8260#undef PATUNFETCH 8261#undef TRANSLATE 8262 8263#undef INIT_BUF_SIZE 8264#undef GET_BUFFER_SPACE 8265#undef BUF_PUSH 8266#undef BUF_PUSH_2 8267#undef BUF_PUSH_3 8268#undef STORE_JUMP 8269#undef STORE_JUMP2 8270#undef INSERT_JUMP 8271#undef INSERT_JUMP2 8272#undef EXTEND_BUFFER 8273#undef GET_UNSIGNED_NUMBER 8274#undef FREE_STACK_RETURN 8275 8276# undef POINTER_TO_OFFSET 8277# undef MATCHING_IN_FRST_STRING 8278# undef PREFETCH 8279# undef AT_STRINGS_BEG 8280# undef AT_STRINGS_END 8281# undef WORDCHAR_P 8282# undef FREE_VAR 8283# undef FREE_VARIABLES 8284# undef NO_HIGHEST_ACTIVE_REG 8285# undef NO_LOWEST_ACTIVE_REG 8286 8287# undef CHAR_T 8288# undef UCHAR_T 8289# undef COMPILED_BUFFER_VAR 8290# undef OFFSET_ADDRESS_SIZE 8291# 
undef CHAR_CLASS_SIZE 8292# undef PREFIX 8293# undef ARG_PREFIX 8294# undef PUT_CHAR 8295# undef BYTE 8296# undef WCHAR 8297 8298# define DEFINED_ONCE 8299