1/************************************************* 2* Perl-Compatible Regular Expressions * 3*************************************************/ 4 5/* PCRE is a library of functions to support regular expressions whose syntax 6and semantics are as close as possible to those of the Perl 5 language. 7 8 Written by Philip Hazel 9 Copyright (c) 1997-2016 University of Cambridge 10 11----------------------------------------------------------------------------- 12Redistribution and use in source and binary forms, with or without 13modification, are permitted provided that the following conditions are met: 14 15 * Redistributions of source code must retain the above copyright notice, 16 this list of conditions and the following disclaimer. 17 18 * Redistributions in binary form must reproduce the above copyright 19 notice, this list of conditions and the following disclaimer in the 20 documentation and/or other materials provided with the distribution. 21 22 * Neither the name of the University of Cambridge nor the names of its 23 contributors may be used to endorse or promote products derived from 24 this software without specific prior written permission. 25 26THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 27AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 30LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 31CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 32SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 33INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 34CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 35ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 36POSSIBILITY OF SUCH DAMAGE. 
37----------------------------------------------------------------------------- 38*/ 39 40 41/* This module contains the external function pcre_compile(), along with 42supporting internal functions that are not used by other modules. */ 43 44 45#ifdef HAVE_CONFIG_H 46#include "config.h" 47#endif 48 49#define NLBLOCK cd /* Block containing newline information */ 50#define PSSTART start_pattern /* Field containing pattern start */ 51#define PSEND end_pattern /* Field containing pattern end */ 52 53#include "pcre_internal.h" 54 55 56/* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which 57is also used by pcretest. PCRE_DEBUG is not defined when building a production 58library. We do not need to select pcre16_printint.c specially, because the 59COMPILE_PCREx macro will already be appropriately set. */ 60 61#ifdef PCRE_DEBUG 62/* pcre_printint.c should not include any headers */ 63#define PCRE_INCLUDED 64#include "pcre_printint.c" 65#undef PCRE_INCLUDED 66#endif 67 68 69/* Macro for setting individual bits in class bitmaps. */ 70 71#define SETBIT(a,b) a[(b)/8] |= (1 << ((b)&7)) 72 73/* Maximum length value to check against when making sure that the integer that 74holds the compiled pattern length does not overflow. We make it a bit less than 75INT_MAX to allow for adding in group terminating bytes, so that we don't have 76to check them every time. 
*/ 77 78#define OFLOW_MAX (INT_MAX - 20) 79 80/* Definitions to allow mutual recursion */ 81 82static int 83 add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *, 84 const pcre_uint32 *, unsigned int); 85 86static BOOL 87 compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int, 88 pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *, 89 compile_data *, int *); 90 91 92 93/************************************************* 94* Code parameters and static tables * 95*************************************************/ 96 97/* This value specifies the size of stack workspace that is used during the 98first pre-compile phase that determines how much memory is required. The regex 99is partly compiled into this space, but the compiled parts are discarded as 100soon as they can be, so that hopefully there will never be an overrun. The code 101does, however, check for an overrun. The largest amount I've seen used is 218, 102so this number is very generous. 103 104The same workspace is used during the second, actual compile phase for 105remembering forward references to groups so that they can be filled in at the 106end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE 107is 4 there is plenty of room for most patterns. However, the memory can get 108filled up by repetitions of forward references, for example patterns like 109/(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so 110that the workspace is expanded using malloc() in this situation. The value 111below is therefore a minimum, and we put a maximum on it for safety. The 112minimum is now also defined in terms of LINK_SIZE so that the use of malloc() 113kicks in at the same number of forward references in all cases. 
*/ 114 115#define COMPILE_WORK_SIZE (2048*LINK_SIZE) 116#define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE) 117 118/* This value determines the size of the initial vector that is used for 119remembering named groups during the pre-compile. It is allocated on the stack, 120but if it is too small, it is expanded using malloc(), in a similar way to the 121workspace. The value is the number of slots in the list. */ 122 123#define NAMED_GROUP_LIST_SIZE 20 124 125/* The overrun tests check for a slightly smaller size so that they detect the 126overrun before it actually does run off the end of the data block. */ 127 128#define WORK_SIZE_SAFETY_MARGIN (100) 129 130/* Private flags added to firstchar and reqchar. */ 131 132#define REQ_CASELESS (1 << 0) /* Indicates caselessness */ 133#define REQ_VARY (1 << 1) /* Reqchar followed non-literal item */ 134/* Negative values for the firstchar and reqchar flags */ 135#define REQ_UNSET (-2) 136#define REQ_NONE (-1) 137 138/* Repeated character flags. */ 139 140#define UTF_LENGTH 0x10000000l /* The char contains its length. */ 141 142/* Table for handling escaped characters in the range '0'-'z'. Positive returns 143are simple data values; negative values are for special things like \d and so 144on. Zero means further processing is needed (for things like \x), or the escape 145is invalid. */ 146 147#ifndef EBCDIC 148 149/* This is the "normal" table for ASCII systems or for EBCDIC systems running 150in UTF-8 mode. 
*/ 151 152static const short int escapes[] = { 153 0, 0, 154 0, 0, 155 0, 0, 156 0, 0, 157 0, 0, 158 CHAR_COLON, CHAR_SEMICOLON, 159 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, 160 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK, 161 CHAR_COMMERCIAL_AT, -ESC_A, 162 -ESC_B, -ESC_C, 163 -ESC_D, -ESC_E, 164 0, -ESC_G, 165 -ESC_H, 0, 166 0, -ESC_K, 167 0, 0, 168 -ESC_N, 0, 169 -ESC_P, -ESC_Q, 170 -ESC_R, -ESC_S, 171 0, 0, 172 -ESC_V, -ESC_W, 173 -ESC_X, 0, 174 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET, 175 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET, 176 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE, 177 CHAR_GRAVE_ACCENT, ESC_a, 178 -ESC_b, 0, 179 -ESC_d, ESC_e, 180 ESC_f, 0, 181 -ESC_h, 0, 182 0, -ESC_k, 183 0, 0, 184 ESC_n, 0, 185 -ESC_p, 0, 186 ESC_r, -ESC_s, 187 ESC_tee, 0, 188 -ESC_v, -ESC_w, 189 0, 0, 190 -ESC_z 191}; 192 193#else 194 195/* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */ 196 197static const short int escapes[] = { 198/* 48 */ 0, 0, 0, '.', '<', '(', '+', '|', 199/* 50 */ '&', 0, 0, 0, 0, 0, 0, 0, 200/* 58 */ 0, 0, '!', '$', '*', ')', ';', '~', 201/* 60 */ '-', '/', 0, 0, 0, 0, 0, 0, 202/* 68 */ 0, 0, '|', ',', '%', '_', '>', '?', 203/* 70 */ 0, 0, 0, 0, 0, 0, 0, 0, 204/* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"', 205/* 80 */ 0, ESC_a, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, 206/* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0, 207/* 90 */ 0, 0, -ESC_k, 0, 0, ESC_n, 0, -ESC_p, 208/* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0, 209/* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0, 210/* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0, 211/* B0 */ 0, 0, 0, 0, 0, 0, 0, 0, 212/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-', 213/* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G, 214/* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0, 215/* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P, 216/* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0, 217/* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X, 218/* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0, 219/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0, 220/* F8 */ 0, 0, 0, 0, 0, 0, 0, 0 
221}; 222 223/* We also need a table of characters that may follow \c in an EBCDIC 224environment for characters 0-31. */ 225 226static unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_"; 227 228#endif 229 230 231/* Table of special "verbs" like (*PRUNE). This is a short table, so it is 232searched linearly. Put all the names into a single string, in order to reduce 233the number of relocations when a shared library is dynamically linked. The 234string is built from string macros so that it works in UTF-8 mode on EBCDIC 235platforms. */ 236 237typedef struct verbitem { 238 int len; /* Length of verb name */ 239 int op; /* Op when no arg, or -1 if arg mandatory */ 240 int op_arg; /* Op when arg present, or -1 if not allowed */ 241} verbitem; 242 243static const char verbnames[] = 244 "\0" /* Empty name is a shorthand for MARK */ 245 STRING_MARK0 246 STRING_ACCEPT0 247 STRING_COMMIT0 248 STRING_F0 249 STRING_FAIL0 250 STRING_PRUNE0 251 STRING_SKIP0 252 STRING_THEN; 253 254static const verbitem verbs[] = { 255 { 0, -1, OP_MARK }, 256 { 4, -1, OP_MARK }, 257 { 6, OP_ACCEPT, -1 }, 258 { 6, OP_COMMIT, -1 }, 259 { 1, OP_FAIL, -1 }, 260 { 4, OP_FAIL, -1 }, 261 { 5, OP_PRUNE, OP_PRUNE_ARG }, 262 { 4, OP_SKIP, OP_SKIP_ARG }, 263 { 4, OP_THEN, OP_THEN_ARG } 264}; 265 266static const int verbcount = sizeof(verbs)/sizeof(verbitem); 267 268 269/* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in 270another regex library. */ 271 272static const pcre_uchar sub_start_of_word[] = { 273 CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK, 274 CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' }; 275 276static const pcre_uchar sub_end_of_word[] = { 277 CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK, 278 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, 279 CHAR_RIGHT_PARENTHESIS, '\0' }; 280 281 282/* Tables of names of POSIX character classes and their lengths. 
The names are
now all in a single string, to reduce the number of relocations when a shared
library is dynamically loaded. The list of lengths is terminated by a zero
length entry. The first three must be alpha, lower, upper, as this is assumed
for handling case independence. The indices for graph, print, and punct are
needed, so identify them. */

static const char posix_names[] =
  STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
  STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
  STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
  STRING_word0  STRING_xdigit;

/* One length per name above, in the same order (word is 4, xdigit is 6, all
the others are 5), followed by the terminating zero. */

static const pcre_uint8 posix_name_lengths[] = {
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };

/* Zero-based positions of graph, print, and punct in the list of names. */

#define PC_GRAPH  8
#define PC_PRINT  9
#define PC_PUNCT 10


/* Table of class bit maps for each POSIX class. Each class is formed from a
base map, with an optional addition or removal of another map. Then, for some
classes, there is some additional tweaking: for [:blank:] the vertical space
characters are removed, and for [:alpha:] and [:alnum:] the underscore
character is removed. The triples in the table consist of the base map offset,
second map offset or -1 if no second map, and a non-negative value for map
addition or a negative value for map subtraction (if there are two maps). The
absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
remove vertical space characters, 2 => remove underscore.
*/

/* Three values per class, in the same order as the POSIX class names. */

static const int posix_class_maps[] = {
  cbit_word,  cbit_digit, -2,             /* alpha */
  cbit_lower, -1,          0,             /* lower */
  cbit_upper, -1,          0,             /* upper */
  cbit_word,  -1,          2,             /* alnum - word without underscore */
  cbit_print, cbit_cntrl,  0,             /* ascii */
  cbit_space, -1,          1,             /* blank - a GNU extension */
  cbit_cntrl, -1,          0,             /* cntrl */
  cbit_digit, -1,          0,             /* digit */
  cbit_graph, -1,          0,             /* graph */
  cbit_print, -1,          0,             /* print */
  cbit_punct, -1,          0,             /* punct */
  cbit_space, -1,          0,             /* space */
  cbit_word,  -1,          0,             /* word - a Perl extension */
  cbit_xdigit,-1,          0              /* xdigit */
};

/* Table of substitutes for \d etc when PCRE_UCP is set. They are replaced by
Unicode property escapes. Each string below is the zero-terminated text of the
replacement, for example string_PNd is \P{Nd} and string_pXsp is \p{Xsp}. */

#ifdef SUPPORT_UCP
static const pcre_uchar string_PNd[]  = {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pNd[]  = {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PXsp[] = {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pXsp[] = {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PXwd[] = {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pXwd[] = {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };

/* The entries alternate between the negated escape and the plain one, in the
order shown in the comments. */

static const pcre_uchar *substitutes[] = {
  string_PNd,           /* \D */
  string_pNd,           /* \d */
  string_PXsp,          /* \S */   /* Xsp is Perl space, but from 8.34, Perl */
  string_pXsp,          /* \s */   /* space and POSIX space are the same. */
  string_PXwd,          /* \W */
  string_pXwd           /* \w */
};

/* The POSIX class substitutes must be in the order of the POSIX class names,
defined above, and there are both positive and negative cases. NULL means no
general substitute of a Unicode property escape (\p or \P). However, for some
POSIX classes (e.g. graph, print, punct) a special property code is compiled
directly. */

static const pcre_uchar string_pL[] =   {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pLl[] =  {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pLu[] =  {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pXan[] = {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_h[] =    {
  CHAR_BACKSLASH, CHAR_h, '\0' };
static const pcre_uchar string_pXps[] = {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PL[] =   {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PLl[] =  {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PLu[] =  {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PXan[] = {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_H[] =    {
  CHAR_BACKSLASH, CHAR_H, '\0' };
static const pcre_uchar string_PXps[] = {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };

/* The first 14 entries are the positive cases and the second 14 are the
negated cases, each group in POSIX class name order. */

static const pcre_uchar *posix_substitutes[] = {
  string_pL,            /* alpha */
  string_pLl,           /* lower */
  string_pLu,           /* upper */
  string_pXan,          /* alnum */
  NULL,                 /* ascii */
  string_h,             /* blank */
  NULL,                 /* cntrl */
  string_pNd,           /* digit */
  NULL,                 /* graph */
  NULL,                 /* print */
  NULL,                 /* punct */
  string_pXps,          /* space */   /* Xps is POSIX space, but from 8.34 */
  string_pXwd,          /* word  */   /* Perl and POSIX space are the same */
  NULL,                 /* xdigit */
  /* Negated cases */
  string_PL,            /* ^alpha */
  string_PLl,           /* ^lower */
  string_PLu,           /* ^upper */
  string_PXan,          /* ^alnum */
  NULL,                 /* ^ascii */
  string_H,             /* ^blank */
  NULL,                 /* ^cntrl */
  string_PNd,           /* ^digit */
  NULL,                 /* ^graph */
  NULL,                 /* ^print */
  NULL,                 /* ^punct */
  string_PXps,          /* ^space */  /* Xps is POSIX space, but from 8.34 */
  string_PXwd,          /* ^word */   /* Perl and POSIX space are the same */
  NULL                  /* ^xdigit */
};
/* Number of entries in posix_substitutes[]. */
#define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
#endif

/* Two-level stringification: XSTRING(s) expands the macro s first and then
turns the result into a string literal. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* The texts of compile-time error messages. These are "char *" because they
are passed to the outside world. Do not ever re-use any error number, because
they are documented. Always add a new error instead. Messages marked DEAD below
are no longer used. This used to be a table of strings, but in order to reduce
the number of relocations needed when a shared library is loaded dynamically,
it is now one long string. We cannot use a table of offsets, because the
lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
simply count through to the one we want - this isn't a performance issue
because these strings are used only when there is a compilation error.

Each substring ends with \0 to insert a null character.
This includes the final
substring, so that the whole string ends with \0\0, which can be detected when
counting through. */

/* The numeric comments inside the initializer give the error number of the
message that immediately follows them. */

static const char error_texts[] =
  "no error\0"
  "\\ at end of pattern\0"
  "\\c at end of pattern\0"
  "unrecognized character follows \\\0"
  "numbers out of order in {} quantifier\0"
  /* 5 */
  "number too big in {} quantifier\0"
  "missing terminating ] for character class\0"
  "invalid escape sequence in character class\0"
  "range out of order in character class\0"
  "nothing to repeat\0"
  /* 10 */
  "internal error: invalid forward reference offset\0"
  "internal error: unexpected repeat\0"
  "unrecognized character after (? or (?-\0"
  "POSIX named classes are supported only within a class\0"
  "missing )\0"
  /* 15 */
  "reference to non-existent subpattern\0"
  "erroffset passed as NULL\0"
  "unknown option bit(s) set\0"
  "missing ) after comment\0"
  "parentheses nested too deeply\0"  /** DEAD **/
  /* 20 */
  "regular expression is too large\0"
  "failed to get memory\0"
  "unmatched parentheses\0"
  "internal error: code overflow\0"
  "unrecognized character after (?<\0"
  /* 25 */
  "lookbehind assertion is not fixed length\0"
  "malformed number or name after (?(\0"
  "conditional group contains more than two branches\0"
  "assertion expected after (?( or (?(?C)\0"
  "(?R or (?[+-]digits must be followed by )\0"
  /* 30 */
  "unknown POSIX class name\0"
  "POSIX collating elements are not supported\0"
  "this version of PCRE is compiled without UTF support\0"
  "spare error\0"  /** DEAD **/
  "character value in \\x{} or \\o{} is too large\0"
  /* 35 */
  "invalid condition (?(0)\0"
  "\\C not allowed in lookbehind assertion\0"
  "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
  "number after (?C is > 255\0"
  "closing ) for (?C expected\0"
  /* 40 */
  "recursive call could loop indefinitely\0"
  "unrecognized character after (?P\0"
  "syntax error in subpattern name (missing terminator)\0"
  "two named subpatterns have the same name\0"
  "invalid UTF-8 string\0"
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled\0"
  "malformed \\P or \\p sequence\0"
  "unknown property name after \\P or \\p\0"
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
  /* 50 */
  "repeated subpattern is too long\0"    /** DEAD **/
  "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
  "internal error: overran compiling workspace\0"
  "internal error: previously-checked referenced subpattern not found\0"
  "DEFINE group contains more than one branch\0"
  /* 55 */
  "repeating a DEFINE group is not allowed\0"  /** DEAD **/
  "inconsistent NEWLINE options\0"
  "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
  "a numbered reference must not be zero\0"
  "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
  /* 60 */
  "(*VERB) not recognized or malformed\0"
  "number is too big\0"
  "subpattern name expected\0"
  "digit expected after (?+\0"
  "] is an invalid data character in JavaScript compatibility mode\0"
  /* 65 */
  "different names for subpatterns of the same number are not allowed\0"
  "(*MARK) must have an argument\0"
  "this version of PCRE is not compiled with Unicode property support\0"
#ifndef EBCDIC
  "\\c must be followed by an ASCII character\0"
#else
  "\\c must be followed by a letter or one of [\\]^_?\0"
#endif
  "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
  /* 70 */
  "internal error: unknown opcode in find_fixedlength()\0"
  "\\N is not supported in a class\0"
  "too many forward references\0"
  "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
  "invalid UTF-16 string\0"
  /* 75 */
  "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
  "character value in \\u.... sequence is too large\0"
  "invalid UTF-32 string\0"
  "setting UTF is disabled by the application\0"
  "non-hex character in \\x{} (closing brace missing?)\0"
  /* 80 */
  "non-octal character in \\o{} (closing brace missing?)\0"
  "missing opening brace after \\o\0"
  "parentheses are too deeply nested\0"
  "invalid range in character class\0"
  "group name must start with a non-digit\0"
  /* 85 */
  "parentheses are too deeply nested (stack check)\0"
  "digits missing in \\x{} or \\o{}\0"
  "regular expression is too complicated\0"
  ;

/* Table to identify digits and hex digits. This is used when compiling
patterns. Note that the tables in chartables are dependent on the locale, and
may mark arbitrary characters as digits - but the PCRE compiling code expects
to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
a private table here. It costs 256 bytes, but it is a lot faster than doing
character value tests (at least in some simple cases I timed), and in some
applications one wants PCRE to compile efficiently as well as match
efficiently.

For convenience, we use the same bit definitions as in chartables:

  0x04   decimal digit
  0x08   hexadecimal digit

Then we can use ctype_digit and ctype_xdigit in the code. */

/* Using a simple comparison for decimal numbers rather than a memory read
is much faster, and the resulting code is simpler (the compiler turns it
into a subtraction and unsigned comparison). Note that the argument is
expanded twice, so it must not have side effects. */

#define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)

#ifndef EBCDIC

/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
UTF-8 mode.
*/

/* Each row holds eight entries; the comment at the end of a row gives the
range of character codes it covers. An entry of 0x0c marks a character that is
both a decimal digit (0x04) and a hexadecimal digit (0x08); an entry of 0x08
marks a character that is a hexadecimal digit only. */

static const pcre_uint8 digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */

#else

/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode.
The trailing two-character value in every second row comment is the
hexadecimal code of the first entry in that row. */

static const pcre_uint8 digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */

/* NOTE(review): the values here presumably use the ctype bit layout of the
main character tables (of which this is described as a partial duplicate);
the bit meanings other than 0x04 and 0x08 are not visible in this part of the
file - confirm against the chartables definitions. */

static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
#endif


/* This table is used to check whether auto-possessification is possible
between adjacent character-type opcodes. The left-hand (repeated) opcode is
used to select the row, and the right-hand opcode is used to select the column.
A value of 1 means that auto-possessification is OK. For example, the second
value in the first row means that \D+\d can be turned into \D++\d.

The Unicode property types (\P and \p) have to be present to fill out the table
because of what their opcode values are, but the table values should always be
zero because property types are handled separately in the code. The last four
columns apply to items that cannot be repeated, so there is no need to have
rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
*not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */

/* Table dimensions: both are counted from FIRST_AUTOTAB_OP, so an opcode is
turned into a row or column index by subtracting FIRST_AUTOTAB_OP from it. */

#define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
#define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)

static const pcre_uint8 autoposstab[APTROWS][APTCOLS] = {
/* \D \d \S \s \W \w  . .+ \C \P \p \R \H \h \V \v \X \Z \z  $ $M */
  { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \D */
  { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \d */
  { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \S */
  { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \s */
  { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \W */
  { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \w */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .  */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .+ */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \C */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \P */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \p */
  { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \R */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \H */
  { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \h */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \V */
  { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 },  /* \v */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }   /* \X */
};

#ifdef SUPPORT_UCP

/* This table is used to check whether auto-possessification is possible
between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
left-hand (repeated) opcode is used to select the row, and the right-hand
opcode is used to select the column.
The values are as follows: 746 747 0 Always return FALSE (never auto-possessify) 748 1 Character groups are distinct (possessify if both are OP_PROP) 749 2 Check character categories in the same group (general or particular) 750 3 TRUE if the two opcodes are not the same (PROP vs NOTPROP) 751 752 4 Check left general category vs right particular category 753 5 Check right general category vs left particular category 754 755 6 Left alphanum vs right general category 756 7 Left space vs right general category 757 8 Left word vs right general category 758 759 9 Right alphanum vs left general category 760 10 Right space vs left general category 761 11 Right word vs left general category 762 763 12 Left alphanum vs right particular category 764 13 Left space vs right particular category 765 14 Left word vs right particular category 766 767 15 Right alphanum vs left particular category 768 16 Right space vs left particular category 769 17 Right word vs left particular category 770*/ 771 772static const pcre_uint8 propposstab[PT_TABSIZE][PT_TABSIZE] = { 773/* ANY LAMP GC PC SC ALNUM SPACE PXSPACE WORD CLIST UCNC */ 774 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_ANY */ 775 { 0, 3, 0, 0, 0, 3, 1, 1, 0, 0, 0 }, /* PT_LAMP */ 776 { 0, 0, 2, 4, 0, 9, 10, 10, 11, 0, 0 }, /* PT_GC */ 777 { 0, 0, 5, 2, 0, 15, 16, 16, 17, 0, 0 }, /* PT_PC */ 778 { 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 }, /* PT_SC */ 779 { 0, 3, 6, 12, 0, 3, 1, 1, 0, 0, 0 }, /* PT_ALNUM */ 780 { 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_SPACE */ 781 { 0, 1, 7, 13, 0, 1, 3, 3, 1, 0, 0 }, /* PT_PXSPACE */ 782 { 0, 0, 8, 14, 0, 0, 1, 1, 3, 0, 0 }, /* PT_WORD */ 783 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* PT_CLIST */ 784 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 } /* PT_UCNC */ 785}; 786 787/* This table is used to check whether auto-possessification is possible 788between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one 789specifies a general category and the other specifies a particular category. 
The 790row is selected by the general category and the column by the particular 791category. The value is 1 if the particular category is not part of the general 792category. */ 793 794static const pcre_uint8 catposstab[7][30] = { 795/* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */ 796 { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* C */ 797 { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* L */ 798 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* M */ 799 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, /* N */ 800 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 }, /* P */ 801 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 }, /* S */ 802 { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 } /* Z */ 803}; 804 805/* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against 806a general or particular category. The properties in each row are those 807that apply to the character set in question. Duplication means that a little 808unnecessary work is done when checking, but this keeps things much simpler 809because they can all use the same code. For more details see the comment where 810this table is used. 811 812Note: SPACE and PXSPACE used to be different because Perl excluded VT from 813"space", but from Perl 5.18 it's included, so both categories are treated the 814same here. 
*/ 815 816static const pcre_uint8 posspropstab[3][4] = { 817 { ucp_L, ucp_N, ucp_N, ucp_Nl }, /* ALNUM, 3rd and 4th values redundant */ 818 { ucp_Z, ucp_Z, ucp_C, ucp_Cc }, /* SPACE and PXSPACE, 2nd value redundant */ 819 { ucp_L, ucp_N, ucp_P, ucp_Po } /* WORD */ 820}; 821#endif 822 823/* This table is used when converting repeating opcodes into possessified 824versions as a result of an explicit possessive quantifier such as ++. A zero 825value means there is no possessified version - in those cases the item in 826question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT 827because all relevant opcodes are less than that. */ 828 829static const pcre_uint8 opcode_possessify[] = { 830 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 15 */ 831 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 - 31 */ 832 833 0, /* NOTI */ 834 OP_POSSTAR, 0, /* STAR, MINSTAR */ 835 OP_POSPLUS, 0, /* PLUS, MINPLUS */ 836 OP_POSQUERY, 0, /* QUERY, MINQUERY */ 837 OP_POSUPTO, 0, /* UPTO, MINUPTO */ 838 0, /* EXACT */ 839 0, 0, 0, 0, /* POS{STAR,PLUS,QUERY,UPTO} */ 840 841 OP_POSSTARI, 0, /* STARI, MINSTARI */ 842 OP_POSPLUSI, 0, /* PLUSI, MINPLUSI */ 843 OP_POSQUERYI, 0, /* QUERYI, MINQUERYI */ 844 OP_POSUPTOI, 0, /* UPTOI, MINUPTOI */ 845 0, /* EXACTI */ 846 0, 0, 0, 0, /* POS{STARI,PLUSI,QUERYI,UPTOI} */ 847 848 OP_NOTPOSSTAR, 0, /* NOTSTAR, NOTMINSTAR */ 849 OP_NOTPOSPLUS, 0, /* NOTPLUS, NOTMINPLUS */ 850 OP_NOTPOSQUERY, 0, /* NOTQUERY, NOTMINQUERY */ 851 OP_NOTPOSUPTO, 0, /* NOTUPTO, NOTMINUPTO */ 852 0, /* NOTEXACT */ 853 0, 0, 0, 0, /* NOTPOS{STAR,PLUS,QUERY,UPTO} */ 854 855 OP_NOTPOSSTARI, 0, /* NOTSTARI, NOTMINSTARI */ 856 OP_NOTPOSPLUSI, 0, /* NOTPLUSI, NOTMINPLUSI */ 857 OP_NOTPOSQUERYI, 0, /* NOTQUERYI, NOTMINQUERYI */ 858 OP_NOTPOSUPTOI, 0, /* NOTUPTOI, NOTMINUPTOI */ 859 0, /* NOTEXACTI */ 860 0, 0, 0, 0, /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */ 861 862 OP_TYPEPOSSTAR, 0, /* TYPESTAR, TYPEMINSTAR */ 863 OP_TYPEPOSPLUS, 0, /* TYPEPLUS, 
TYPEMINPLUS */ 864 OP_TYPEPOSQUERY, 0, /* TYPEQUERY, TYPEMINQUERY */ 865 OP_TYPEPOSUPTO, 0, /* TYPEUPTO, TYPEMINUPTO */ 866 0, /* TYPEEXACT */ 867 0, 0, 0, 0, /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */ 868 869 OP_CRPOSSTAR, 0, /* CRSTAR, CRMINSTAR */ 870 OP_CRPOSPLUS, 0, /* CRPLUS, CRMINPLUS */ 871 OP_CRPOSQUERY, 0, /* CRQUERY, CRMINQUERY */ 872 OP_CRPOSRANGE, 0, /* CRRANGE, CRMINRANGE */ 873 0, 0, 0, 0, /* CRPOS{STAR,PLUS,QUERY,RANGE} */ 874 875 0, 0, 0, /* CLASS, NCLASS, XCLASS */ 876 0, 0, /* REF, REFI */ 877 0, 0, /* DNREF, DNREFI */ 878 0, 0 /* RECURSE, CALLOUT */ 879}; 880 881 882 883/************************************************* 884* Find an error text * 885*************************************************/ 886 887/* The error texts are now all in one long string, to save on relocations. As 888some of the text is of unknown length, we can't use a table of offsets. 889Instead, just count through the strings. This is not a performance issue 890because it happens only when there has been a compilation error. 891 892Argument: the error number 893Returns: pointer to the error string 894*/ 895 896static const char * 897find_error_text(int n) 898{ 899const char *s = error_texts; 900for (; n > 0; n--) 901 { 902 while (*s++ != CHAR_NULL) {}; 903 if (*s == CHAR_NULL) return "Error text not found (please report)"; 904 } 905return s; 906} 907 908 909 910/************************************************* 911* Expand the workspace * 912*************************************************/ 913 914/* This function is called during the second compiling phase, if the number of 915forward references fills the existing workspace, which is originally a block on 916the stack. A larger block is obtained from malloc() unless the ultimate limit 917has been reached or the increase will be rather small. 
918 919Argument: pointer to the compile data block 920Returns: 0 if all went well, else an error number 921*/ 922 923static int 924expand_workspace(compile_data *cd) 925{ 926pcre_uchar *newspace; 927int newsize = cd->workspace_size * 2; 928 929if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX; 930if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX || 931 newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN) 932 return ERR72; 933 934newspace = (PUBL(malloc))(IN_UCHARS(newsize)); 935if (newspace == NULL) return ERR21; 936memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar)); 937cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace); 938if (cd->workspace_size > COMPILE_WORK_SIZE) 939 (PUBL(free))((void *)cd->start_workspace); 940cd->start_workspace = newspace; 941cd->workspace_size = newsize; 942return 0; 943} 944 945 946 947/************************************************* 948* Check for counted repeat * 949*************************************************/ 950 951/* This function is called when a '{' is encountered in a place where it might 952start a quantifier. It looks ahead to see if it really is a quantifier or not. 953It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd} 954where the ddds are digits. 
955 956Arguments: 957 p pointer to the first char after '{' 958 959Returns: TRUE or FALSE 960*/ 961 962static BOOL 963is_counted_repeat(const pcre_uchar *p) 964{ 965if (!IS_DIGIT(*p)) return FALSE; 966p++; 967while (IS_DIGIT(*p)) p++; 968if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE; 969 970if (*p++ != CHAR_COMMA) return FALSE; 971if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE; 972 973if (!IS_DIGIT(*p)) return FALSE; 974p++; 975while (IS_DIGIT(*p)) p++; 976 977return (*p == CHAR_RIGHT_CURLY_BRACKET); 978} 979 980 981 982/************************************************* 983* Handle escapes * 984*************************************************/ 985 986/* This function is called when a \ has been encountered. It either returns a 987positive value for a simple escape such as \n, or 0 for a data character which 988will be placed in chptr. A backreference to group n is returned as negative n. 989When UTF-8 is enabled, a positive value greater than 255 may be returned in 990chptr. On entry, ptr is pointing at the \. On exit, it is on the final 991character of the escape sequence. 992 993Arguments: 994 ptrptr points to the pattern position pointer 995 chptr points to a returned data character 996 errorcodeptr points to the errorcode variable 997 bracount number of previous extracting brackets 998 options the options bits 999 isclass TRUE if inside a character class 1000 1001Returns: zero => a data character 1002 positive => a special escape sequence 1003 negative => a back reference 1004 on error, errorcodeptr is set 1005*/ 1006 1007static int 1008check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr, 1009 int bracount, int options, BOOL isclass) 1010{ 1011/* PCRE_UTF16 has the same value as PCRE_UTF8. 
*/ 1012BOOL utf = (options & PCRE_UTF8) != 0; 1013const pcre_uchar *ptr = *ptrptr + 1; 1014pcre_uint32 c; 1015int escape = 0; 1016int i; 1017 1018GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */ 1019ptr--; /* Set pointer back to the last byte */ 1020 1021/* If backslash is at the end of the pattern, it's an error. */ 1022 1023if (c == CHAR_NULL) *errorcodeptr = ERR1; 1024 1025/* Non-alphanumerics are literals. For digits or letters, do an initial lookup 1026in a table. A non-zero result is something that can be returned immediately. 1027Otherwise further processing may be required. */ 1028 1029#ifndef EBCDIC /* ASCII/UTF-8 coding */ 1030/* Not alphanumeric */ 1031else if (c < CHAR_0 || c > CHAR_z) {} 1032else if ((i = escapes[c - CHAR_0]) != 0) 1033 { if (i > 0) c = (pcre_uint32)i; else escape = -i; } 1034 1035#else /* EBCDIC coding */ 1036/* Not alphanumeric */ 1037else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {} 1038else if ((i = escapes[c - 0x48]) != 0) { if (i > 0) c = (pcre_uint32)i; else escape = -i; } 1039#endif 1040 1041/* Escapes that need further processing, or are illegal. */ 1042 1043else 1044 { 1045 const pcre_uchar *oldptr; 1046 BOOL braced, negated, overflow; 1047 int s; 1048 1049 switch (c) 1050 { 1051 /* A number of Perl escapes are not handled by PCRE. We give an explicit 1052 error. */ 1053 1054 case CHAR_l: 1055 case CHAR_L: 1056 *errorcodeptr = ERR37; 1057 break; 1058 1059 case CHAR_u: 1060 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0) 1061 { 1062 /* In JavaScript, \u must be followed by four hexadecimal numbers. 1063 Otherwise it is a lowercase u letter. 
*/ 1064 if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0 1065 && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0 1066 && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0 1067 && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0) 1068 { 1069 c = 0; 1070 for (i = 0; i < 4; ++i) 1071 { 1072 register pcre_uint32 cc = *(++ptr); 1073#ifndef EBCDIC /* ASCII/UTF-8 coding */ 1074 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */ 1075 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10)); 1076#else /* EBCDIC coding */ 1077 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */ 1078 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10)); 1079#endif 1080 } 1081 1082#if defined COMPILE_PCRE8 1083 if (c > (utf ? 0x10ffffU : 0xffU)) 1084#elif defined COMPILE_PCRE16 1085 if (c > (utf ? 0x10ffffU : 0xffffU)) 1086#elif defined COMPILE_PCRE32 1087 if (utf && c > 0x10ffffU) 1088#endif 1089 { 1090 *errorcodeptr = ERR76; 1091 } 1092 else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73; 1093 } 1094 } 1095 else 1096 *errorcodeptr = ERR37; 1097 break; 1098 1099 case CHAR_U: 1100 /* In JavaScript, \U is an uppercase U letter. */ 1101 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37; 1102 break; 1103 1104 /* In a character class, \g is just a literal "g". Outside a character 1105 class, \g must be followed by one of a number of specific things: 1106 1107 (1) A number, either plain or braced. If positive, it is an absolute 1108 backreference. If negative, it is a relative backreference. This is a Perl 1109 5.10 feature. 1110 1111 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This 1112 is part of Perl's movement towards a unified syntax for back references. As 1113 this is synonymous with \k{name}, we fudge it up by pretending it really 1114 was \k. 
1115 1116 (3) For Oniguruma compatibility we also support \g followed by a name or a 1117 number either in angle brackets or in single quotes. However, these are 1118 (possibly recursive) subroutine calls, _not_ backreferences. Just return 1119 the ESC_g code (cf \k). */ 1120 1121 case CHAR_g: 1122 if (isclass) break; 1123 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE) 1124 { 1125 escape = ESC_g; 1126 break; 1127 } 1128 1129 /* Handle the Perl-compatible cases */ 1130 1131 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET) 1132 { 1133 const pcre_uchar *p; 1134 for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++) 1135 if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break; 1136 if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET) 1137 { 1138 escape = ESC_k; 1139 break; 1140 } 1141 braced = TRUE; 1142 ptr++; 1143 } 1144 else braced = FALSE; 1145 1146 if (ptr[1] == CHAR_MINUS) 1147 { 1148 negated = TRUE; 1149 ptr++; 1150 } 1151 else negated = FALSE; 1152 1153 /* The integer range is limited by the machine's int representation. */ 1154 s = 0; 1155 overflow = FALSE; 1156 while (IS_DIGIT(ptr[1])) 1157 { 1158 if (s > INT_MAX / 10 - 1) /* Integer overflow */ 1159 { 1160 overflow = TRUE; 1161 break; 1162 } 1163 s = s * 10 + (int)(*(++ptr) - CHAR_0); 1164 } 1165 if (overflow) /* Integer overflow */ 1166 { 1167 while (IS_DIGIT(ptr[1])) 1168 ptr++; 1169 *errorcodeptr = ERR61; 1170 break; 1171 } 1172 1173 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET) 1174 { 1175 *errorcodeptr = ERR57; 1176 break; 1177 } 1178 1179 if (s == 0) 1180 { 1181 *errorcodeptr = ERR58; 1182 break; 1183 } 1184 1185 if (negated) 1186 { 1187 if (s > bracount) 1188 { 1189 *errorcodeptr = ERR15; 1190 break; 1191 } 1192 s = bracount - (s - 1); 1193 } 1194 1195 escape = -s; 1196 break; 1197 1198 /* The handling of escape sequences consisting of a string of digits 1199 starting with one that is not zero is not straightforward. Perl has changed 1200 over the years. 
Nowadays \g{} for backreferences and \o{} for octal are 1201 recommended to avoid the ambiguities in the old syntax. 1202 1203 Outside a character class, the digits are read as a decimal number. If the 1204 number is less than 8 (used to be 10), or if there are that many previous 1205 extracting left brackets, then it is a back reference. Otherwise, up to 1206 three octal digits are read to form an escaped byte. Thus \123 is likely to 1207 be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If 1208 the octal value is greater than 377, the least significant 8 bits are 1209 taken. \8 and \9 are treated as the literal characters 8 and 9. 1210 1211 Inside a character class, \ followed by a digit is always either a literal 1212 8 or 9 or an octal number. */ 1213 1214 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5: 1215 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9: 1216 1217 if (!isclass) 1218 { 1219 oldptr = ptr; 1220 /* The integer range is limited by the machine's int representation. */ 1221 s = (int)(c -CHAR_0); 1222 overflow = FALSE; 1223 while (IS_DIGIT(ptr[1])) 1224 { 1225 if (s > INT_MAX / 10 - 1) /* Integer overflow */ 1226 { 1227 overflow = TRUE; 1228 break; 1229 } 1230 s = s * 10 + (int)(*(++ptr) - CHAR_0); 1231 } 1232 if (overflow) /* Integer overflow */ 1233 { 1234 while (IS_DIGIT(ptr[1])) 1235 ptr++; 1236 *errorcodeptr = ERR61; 1237 break; 1238 } 1239 if (s < 8 || s <= bracount) /* Check for back reference */ 1240 { 1241 escape = -s; 1242 break; 1243 } 1244 ptr = oldptr; /* Put the pointer back and fall through */ 1245 } 1246 1247 /* Handle a digit following \ when the number is not a back reference. If 1248 the first digit is 8 or 9, Perl used to generate a binary zero byte and 1249 then treat the digit as a following literal. At least by Perl 5.18 this 1250 changed so as not to insert the binary zero. 
*/ 1251 1252 if ((c = *ptr) >= CHAR_8) break; 1253 1254 /* Fall through with a digit less than 8 */ 1255 1256 /* \0 always starts an octal number, but we may drop through to here with a 1257 larger first octal digit. The original code used just to take the least 1258 significant 8 bits of octal numbers (I think this is what early Perls used 1259 to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode, 1260 but no more than 3 octal digits. */ 1261 1262 case CHAR_0: 1263 c -= CHAR_0; 1264 while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7) 1265 c = c * 8 + *(++ptr) - CHAR_0; 1266#ifdef COMPILE_PCRE8 1267 if (!utf && c > 0xff) *errorcodeptr = ERR51; 1268#endif 1269 break; 1270 1271 /* \o is a relatively new Perl feature, supporting a more general way of 1272 specifying character codes in octal. The only supported form is \o{ddd}. */ 1273 1274 case CHAR_o: 1275 if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else 1276 if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR86; else 1277 { 1278 ptr += 2; 1279 c = 0; 1280 overflow = FALSE; 1281 while (*ptr >= CHAR_0 && *ptr <= CHAR_7) 1282 { 1283 register pcre_uint32 cc = *ptr++; 1284 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */ 1285#ifdef COMPILE_PCRE32 1286 if (c >= 0x20000000l) { overflow = TRUE; break; } 1287#endif 1288 c = (c << 3) + cc - CHAR_0 ; 1289#if defined COMPILE_PCRE8 1290 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; } 1291#elif defined COMPILE_PCRE16 1292 if (c > (utf ? 
0x10ffffU : 0xffffU)) { overflow = TRUE; break; } 1293#elif defined COMPILE_PCRE32 1294 if (utf && c > 0x10ffffU) { overflow = TRUE; break; } 1295#endif 1296 } 1297 if (overflow) 1298 { 1299 while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++; 1300 *errorcodeptr = ERR34; 1301 } 1302 else if (*ptr == CHAR_RIGHT_CURLY_BRACKET) 1303 { 1304 if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73; 1305 } 1306 else *errorcodeptr = ERR80; 1307 } 1308 break; 1309 1310 /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal 1311 numbers. Otherwise it is a lowercase x letter. */ 1312 1313 case CHAR_x: 1314 if ((options & PCRE_JAVASCRIPT_COMPAT) != 0) 1315 { 1316 if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0 1317 && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0) 1318 { 1319 c = 0; 1320 for (i = 0; i < 2; ++i) 1321 { 1322 register pcre_uint32 cc = *(++ptr); 1323#ifndef EBCDIC /* ASCII/UTF-8 coding */ 1324 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */ 1325 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10)); 1326#else /* EBCDIC coding */ 1327 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */ 1328 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10)); 1329#endif 1330 } 1331 } 1332 } /* End JavaScript handling */ 1333 1334 /* Handle \x in Perl's style. \x{ddd} is a character number which can be 1335 greater than 0xff in utf or non-8bit mode, but only if the ddd are hex 1336 digits. If not, { used to be treated as a data character. However, Perl 1337 seems to read hex digits up to the first non-such, and ignore the rest, so 1338 that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE 1339 now gives an error. 
*/ 1340 1341 else 1342 { 1343 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET) 1344 { 1345 ptr += 2; 1346 if (*ptr == CHAR_RIGHT_CURLY_BRACKET) 1347 { 1348 *errorcodeptr = ERR86; 1349 break; 1350 } 1351 c = 0; 1352 overflow = FALSE; 1353 while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) 1354 { 1355 register pcre_uint32 cc = *ptr++; 1356 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */ 1357 1358#ifdef COMPILE_PCRE32 1359 if (c >= 0x10000000l) { overflow = TRUE; break; } 1360#endif 1361 1362#ifndef EBCDIC /* ASCII/UTF-8 coding */ 1363 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */ 1364 c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10)); 1365#else /* EBCDIC coding */ 1366 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */ 1367 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10)); 1368#endif 1369 1370#if defined COMPILE_PCRE8 1371 if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; } 1372#elif defined COMPILE_PCRE16 1373 if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; } 1374#elif defined COMPILE_PCRE32 1375 if (utf && c > 0x10ffffU) { overflow = TRUE; break; } 1376#endif 1377 } 1378 1379 if (overflow) 1380 { 1381 while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++; 1382 *errorcodeptr = ERR34; 1383 } 1384 1385 else if (*ptr == CHAR_RIGHT_CURLY_BRACKET) 1386 { 1387 if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73; 1388 } 1389 1390 /* If the sequence of hex digits does not end with '}', give an error. 1391 We used just to recognize this construct and fall through to the normal 1392 \x handling, but nowadays Perl gives an error, which seems much more 1393 sensible, so we do too. 
*/ 1394 1395 else *errorcodeptr = ERR79; 1396 } /* End of \x{} processing */ 1397 1398 /* Read a single-byte hex-defined char (up to two hex digits after \x) */ 1399 1400 else 1401 { 1402 c = 0; 1403 while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0) 1404 { 1405 pcre_uint32 cc; /* Some compilers don't like */ 1406 cc = *(++ptr); /* ++ in initializers */ 1407#ifndef EBCDIC /* ASCII/UTF-8 coding */ 1408 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */ 1409 c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10)); 1410#else /* EBCDIC coding */ 1411 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */ 1412 c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10)); 1413#endif 1414 } 1415 } /* End of \xdd handling */ 1416 } /* End of Perl-style \x handling */ 1417 break; 1418 1419 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped. 1420 An error is given if the byte following \c is not an ASCII character. This 1421 coding is ASCII-specific, but then the whole concept of \cx is 1422 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */ 1423 1424 case CHAR_c: 1425 c = *(++ptr); 1426 if (c == CHAR_NULL) 1427 { 1428 *errorcodeptr = ERR2; 1429 break; 1430 } 1431#ifndef EBCDIC /* ASCII/UTF-8 coding */ 1432 if (c > 127) /* Excludes all non-ASCII in either mode */ 1433 { 1434 *errorcodeptr = ERR68; 1435 break; 1436 } 1437 if (c >= CHAR_a && c <= CHAR_z) c -= 32; 1438 c ^= 0x40; 1439#else /* EBCDIC coding */ 1440 if (c >= CHAR_a && c <= CHAR_z) c += 64; 1441 if (c == CHAR_QUESTION_MARK) 1442 c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff; 1443 else 1444 { 1445 for (i = 0; i < 32; i++) 1446 { 1447 if (c == ebcdic_escape_c[i]) break; 1448 } 1449 if (i < 32) c = i; else *errorcodeptr = ERR68; 1450 } 1451#endif 1452 break; 1453 1454 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. 
Any 1455 other alphanumeric following \ is an error if PCRE_EXTRA was set; 1456 otherwise, for Perl compatibility, it is a literal. This code looks a bit 1457 odd, but there used to be some cases other than the default, and there may 1458 be again in future, so I haven't "optimized" it. */ 1459 1460 default: 1461 if ((options & PCRE_EXTRA) != 0) switch(c) 1462 { 1463 default: 1464 *errorcodeptr = ERR3; 1465 break; 1466 } 1467 break; 1468 } 1469 } 1470 1471/* Perl supports \N{name} for character names, as well as plain \N for "not 1472newline". PCRE does not support \N{name}. However, it does support 1473quantification such as \N{2,3}. */ 1474 1475if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET && 1476 !is_counted_repeat(ptr+2)) 1477 *errorcodeptr = ERR37; 1478 1479/* If PCRE_UCP is set, we change the values for \d etc. */ 1480 1481if ((options & PCRE_UCP) != 0 && escape >= ESC_D && escape <= ESC_w) 1482 escape += (ESC_DU - ESC_D); 1483 1484/* Set the pointer to the final character before returning. */ 1485 1486*ptrptr = ptr; 1487*chptr = c; 1488return escape; 1489} 1490 1491 1492 1493#ifdef SUPPORT_UCP 1494/************************************************* 1495* Handle \P and \p * 1496*************************************************/ 1497 1498/* This function is called after \P or \p has been encountered, provided that 1499PCRE is compiled with support for Unicode properties. On entry, ptrptr is 1500pointing at the P or p. On exit, it is pointing at the final character of the 1501escape sequence. 
1502 1503Argument: 1504 ptrptr points to the pattern position pointer 1505 negptr points to a boolean that is set TRUE for negation else FALSE 1506 ptypeptr points to an unsigned int that is set to the type value 1507 pdataptr points to an unsigned int that is set to the detailed property value 1508 errorcodeptr points to the error code variable 1509 1510Returns: TRUE if the type value was found, or FALSE for an invalid type 1511*/ 1512 1513static BOOL 1514get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr, 1515 unsigned int *pdataptr, int *errorcodeptr) 1516{ 1517pcre_uchar c; 1518int i, bot, top; 1519const pcre_uchar *ptr = *ptrptr; 1520pcre_uchar name[32]; 1521 1522c = *(++ptr); 1523if (c == CHAR_NULL) goto ERROR_RETURN; 1524 1525*negptr = FALSE; 1526 1527/* \P or \p can be followed by a name in {}, optionally preceded by ^ for 1528negation. */ 1529 1530if (c == CHAR_LEFT_CURLY_BRACKET) 1531 { 1532 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT) 1533 { 1534 *negptr = TRUE; 1535 ptr++; 1536 } 1537 for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++) 1538 { 1539 c = *(++ptr); 1540 if (c == CHAR_NULL) goto ERROR_RETURN; 1541 if (c == CHAR_RIGHT_CURLY_BRACKET) break; 1542 name[i] = c; 1543 } 1544 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN; 1545 name[i] = 0; 1546 } 1547 1548/* Otherwise there is just one following character */ 1549 1550else 1551 { 1552 name[0] = c; 1553 name[1] = 0; 1554 } 1555 1556*ptrptr = ptr; 1557 1558/* Search for a recognized property name using binary chop */ 1559 1560bot = 0; 1561top = PRIV(utt_size); 1562 1563while (bot < top) 1564 { 1565 int r; 1566 i = (bot + top) >> 1; 1567 r = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset); 1568 if (r == 0) 1569 { 1570 *ptypeptr = PRIV(utt)[i].type; 1571 *pdataptr = PRIV(utt)[i].value; 1572 return TRUE; 1573 } 1574 if (r > 0) bot = i + 1; else top = i; 1575 } 1576 1577*errorcodeptr = ERR47; 1578*ptrptr = ptr; 1579return FALSE; 1580 1581ERROR_RETURN: 
1582*errorcodeptr = ERR46; 1583*ptrptr = ptr; 1584return FALSE; 1585} 1586#endif 1587 1588 1589 1590/************************************************* 1591* Read repeat counts * 1592*************************************************/ 1593 1594/* Read an item of the form {n,m} and return the values. This is called only 1595after is_counted_repeat() has confirmed that a repeat-count quantifier exists, 1596so the syntax is guaranteed to be correct, but we need to check the values. 1597 1598Arguments: 1599 p pointer to first char after '{' 1600 minp pointer to int for min 1601 maxp pointer to int for max 1602 returned as -1 if no max 1603 errorcodeptr points to error code variable 1604 1605Returns: pointer to '}' on success; 1606 current ptr on error, with errorcodeptr set non-zero 1607*/ 1608 1609static const pcre_uchar * 1610read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr) 1611{ 1612int min = 0; 1613int max = -1; 1614 1615while (IS_DIGIT(*p)) 1616 { 1617 min = min * 10 + (int)(*p++ - CHAR_0); 1618 if (min > 65535) 1619 { 1620 *errorcodeptr = ERR5; 1621 return p; 1622 } 1623 } 1624 1625if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else 1626 { 1627 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET) 1628 { 1629 max = 0; 1630 while(IS_DIGIT(*p)) 1631 { 1632 max = max * 10 + (int)(*p++ - CHAR_0); 1633 if (max > 65535) 1634 { 1635 *errorcodeptr = ERR5; 1636 return p; 1637 } 1638 } 1639 if (max < min) 1640 { 1641 *errorcodeptr = ERR4; 1642 return p; 1643 } 1644 } 1645 } 1646 1647*minp = min; 1648*maxp = max; 1649return p; 1650} 1651 1652 1653 1654/************************************************* 1655* Find first significant op code * 1656*************************************************/ 1657 1658/* This is called by several functions that scan a compiled expression looking 1659for a fixed first character, or an anchoring op code etc. It skips over things 1660that do not influence this. 
For some calls, it makes sense to skip negative 1661forward and all backward assertions, and also the \b assertion; for others it 1662does not. 1663 1664Arguments: 1665 code pointer to the start of the group 1666 skipassert TRUE if certain assertions are to be skipped 1667 1668Returns: pointer to the first significant opcode 1669*/ 1670 1671static const pcre_uchar* 1672first_significant_code(const pcre_uchar *code, BOOL skipassert) 1673{ 1674for (;;) 1675 { 1676 switch ((int)*code) 1677 { 1678 case OP_ASSERT_NOT: 1679 case OP_ASSERTBACK: 1680 case OP_ASSERTBACK_NOT: 1681 if (!skipassert) return code; 1682 do code += GET(code, 1); while (*code == OP_ALT); 1683 code += PRIV(OP_lengths)[*code]; 1684 break; 1685 1686 case OP_WORD_BOUNDARY: 1687 case OP_NOT_WORD_BOUNDARY: 1688 if (!skipassert) return code; 1689 /* Fall through */ 1690 1691 case OP_CALLOUT: 1692 case OP_CREF: 1693 case OP_DNCREF: 1694 case OP_RREF: 1695 case OP_DNRREF: 1696 case OP_DEF: 1697 code += PRIV(OP_lengths)[*code]; 1698 break; 1699 1700 default: 1701 return code; 1702 } 1703 } 1704/* Control never reaches here */ 1705} 1706 1707 1708 1709/************************************************* 1710* Find the fixed length of a branch * 1711*************************************************/ 1712 1713/* Scan a branch and compute the fixed length of subject that will match it, 1714if the length is fixed. This is needed for dealing with backward assertions. 1715In UTF8 mode, the result is in characters rather than bytes. The branch is 1716temporarily terminated with OP_END when this function is called. 1717 1718This function is called when a backward assertion is encountered, so that if it 1719fails, the error message can point to the correct place in the pattern. 1720However, we cannot do this when the assertion contains subroutine calls, 1721because they can be forward references. We solve this by remembering this case 1722and doing the check at the end; a flag specifies which mode we are running in. 
1723 1724Arguments: 1725 code points to the start of the pattern (the bracket) 1726 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode 1727 atend TRUE if called when the pattern is complete 1728 cd the "compile data" structure 1729 recurses chain of recurse_check to catch mutual recursion 1730 1731Returns: the fixed length, 1732 or -1 if there is no fixed length, 1733 or -2 if \C was encountered (in UTF-8 mode only) 1734 or -3 if an OP_RECURSE item was encountered and atend is FALSE 1735 or -4 if an unknown opcode was encountered (internal error) 1736*/ 1737 1738static int 1739find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd, 1740 recurse_check *recurses) 1741{ 1742int length = -1; 1743recurse_check this_recurse; 1744register int branchlength = 0; 1745register pcre_uchar *cc = code + 1 + LINK_SIZE; 1746 1747/* Scan along the opcodes for this branch. If we get to the end of the 1748branch, check the length against that of the other branches. */ 1749 1750for (;;) 1751 { 1752 int d; 1753 pcre_uchar *ce, *cs; 1754 register pcre_uchar op = *cc; 1755 1756 switch (op) 1757 { 1758 /* We only need to continue for OP_CBRA (normal capturing bracket) and 1759 OP_BRA (normal non-capturing bracket) because the other variants of these 1760 opcodes are all concerned with unlimited repeated groups, which of course 1761 are not of fixed length. */ 1762 1763 case OP_CBRA: 1764 case OP_BRA: 1765 case OP_ONCE: 1766 case OP_ONCE_NC: 1767 case OP_COND: 1768 d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd, 1769 recurses); 1770 if (d < 0) return d; 1771 branchlength += d; 1772 do cc += GET(cc, 1); while (*cc == OP_ALT); 1773 cc += 1 + LINK_SIZE; 1774 break; 1775 1776 /* Reached end of a branch; if it's a ket it is the end of a nested call. 1777 If it's ALT it is an alternation in a nested call. An ACCEPT is effectively 1778 an ALT. If it is END it's the end of the outer call. All can be handled by 1779 the same code. 
Note that we must not include the OP_KETRxxx opcodes here, 1780 because they all imply an unlimited repeat. */ 1781 1782 case OP_ALT: 1783 case OP_KET: 1784 case OP_END: 1785 case OP_ACCEPT: 1786 case OP_ASSERT_ACCEPT: 1787 if (length < 0) length = branchlength; 1788 else if (length != branchlength) return -1; 1789 if (*cc != OP_ALT) return length; 1790 cc += 1 + LINK_SIZE; 1791 branchlength = 0; 1792 break; 1793 1794 /* A true recursion implies not fixed length, but a subroutine call may 1795 be OK. If the subroutine is a forward reference, we can't deal with 1796 it until the end of the pattern, so return -3. */ 1797 1798 case OP_RECURSE: 1799 if (!atend) return -3; 1800 cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1); /* Start subpattern */ 1801 do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */ 1802 if (cc > cs && cc < ce) return -1; /* Recursion */ 1803 else /* Check for mutual recursion */ 1804 { 1805 recurse_check *r = recurses; 1806 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break; 1807 if (r != NULL) return -1; /* Mutual recursion */ 1808 } 1809 this_recurse.prev = recurses; 1810 this_recurse.group = cs; 1811 d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd, &this_recurse); 1812 if (d < 0) return d; 1813 branchlength += d; 1814 cc += 1 + LINK_SIZE; 1815 break; 1816 1817 /* Skip over assertive subpatterns */ 1818 1819 case OP_ASSERT: 1820 case OP_ASSERT_NOT: 1821 case OP_ASSERTBACK: 1822 case OP_ASSERTBACK_NOT: 1823 do cc += GET(cc, 1); while (*cc == OP_ALT); 1824 cc += 1 + LINK_SIZE; 1825 break; 1826 1827 /* Skip over things that don't match chars */ 1828 1829 case OP_MARK: 1830 case OP_PRUNE_ARG: 1831 case OP_SKIP_ARG: 1832 case OP_THEN_ARG: 1833 cc += cc[1] + PRIV(OP_lengths)[*cc]; 1834 break; 1835 1836 case OP_CALLOUT: 1837 case OP_CIRC: 1838 case OP_CIRCM: 1839 case OP_CLOSE: 1840 case OP_COMMIT: 1841 case OP_CREF: 1842 case OP_DEF: 1843 case OP_DNCREF: 1844 case OP_DNRREF: 1845 case OP_DOLL: 1846 case OP_DOLLM: 
1847 case OP_EOD: 1848 case OP_EODN: 1849 case OP_FAIL: 1850 case OP_NOT_WORD_BOUNDARY: 1851 case OP_PRUNE: 1852 case OP_REVERSE: 1853 case OP_RREF: 1854 case OP_SET_SOM: 1855 case OP_SKIP: 1856 case OP_SOD: 1857 case OP_SOM: 1858 case OP_THEN: 1859 case OP_WORD_BOUNDARY: 1860 cc += PRIV(OP_lengths)[*cc]; 1861 break; 1862 1863 /* Handle literal characters */ 1864 1865 case OP_CHAR: 1866 case OP_CHARI: 1867 case OP_NOT: 1868 case OP_NOTI: 1869 branchlength++; 1870 cc += 2; 1871#ifdef SUPPORT_UTF 1872 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); 1873#endif 1874 break; 1875 1876 /* Handle exact repetitions. The count is already in characters, but we 1877 need to skip over a multibyte character in UTF8 mode. */ 1878 1879 case OP_EXACT: 1880 case OP_EXACTI: 1881 case OP_NOTEXACT: 1882 case OP_NOTEXACTI: 1883 branchlength += (int)GET2(cc,1); 1884 cc += 2 + IMM2_SIZE; 1885#ifdef SUPPORT_UTF 1886 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); 1887#endif 1888 break; 1889 1890 case OP_TYPEEXACT: 1891 branchlength += GET2(cc,1); 1892 if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP) 1893 cc += 2; 1894 cc += 1 + IMM2_SIZE + 1; 1895 break; 1896 1897 /* Handle single-char matchers */ 1898 1899 case OP_PROP: 1900 case OP_NOTPROP: 1901 cc += 2; 1902 /* Fall through */ 1903 1904 case OP_HSPACE: 1905 case OP_VSPACE: 1906 case OP_NOT_HSPACE: 1907 case OP_NOT_VSPACE: 1908 case OP_NOT_DIGIT: 1909 case OP_DIGIT: 1910 case OP_NOT_WHITESPACE: 1911 case OP_WHITESPACE: 1912 case OP_NOT_WORDCHAR: 1913 case OP_WORDCHAR: 1914 case OP_ANY: 1915 case OP_ALLANY: 1916 branchlength++; 1917 cc++; 1918 break; 1919 1920 /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode; 1921 otherwise \C is coded as OP_ALLANY. 
*/ 1922 1923 case OP_ANYBYTE: 1924 return -2; 1925 1926 /* Check a class for variable quantification */ 1927 1928 case OP_CLASS: 1929 case OP_NCLASS: 1930#if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32 1931 case OP_XCLASS: 1932 /* The original code caused an unsigned overflow in 64 bit systems, 1933 so now we use a conditional statement. */ 1934 if (op == OP_XCLASS) 1935 cc += GET(cc, 1); 1936 else 1937 cc += PRIV(OP_lengths)[OP_CLASS]; 1938#else 1939 cc += PRIV(OP_lengths)[OP_CLASS]; 1940#endif 1941 1942 switch (*cc) 1943 { 1944 case OP_CRSTAR: 1945 case OP_CRMINSTAR: 1946 case OP_CRPLUS: 1947 case OP_CRMINPLUS: 1948 case OP_CRQUERY: 1949 case OP_CRMINQUERY: 1950 case OP_CRPOSSTAR: 1951 case OP_CRPOSPLUS: 1952 case OP_CRPOSQUERY: 1953 return -1; 1954 1955 case OP_CRRANGE: 1956 case OP_CRMINRANGE: 1957 case OP_CRPOSRANGE: 1958 if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1; 1959 branchlength += (int)GET2(cc,1); 1960 cc += 1 + 2 * IMM2_SIZE; 1961 break; 1962 1963 default: 1964 branchlength++; 1965 } 1966 break; 1967 1968 /* Anything else is variable length */ 1969 1970 case OP_ANYNL: 1971 case OP_BRAMINZERO: 1972 case OP_BRAPOS: 1973 case OP_BRAPOSZERO: 1974 case OP_BRAZERO: 1975 case OP_CBRAPOS: 1976 case OP_EXTUNI: 1977 case OP_KETRMAX: 1978 case OP_KETRMIN: 1979 case OP_KETRPOS: 1980 case OP_MINPLUS: 1981 case OP_MINPLUSI: 1982 case OP_MINQUERY: 1983 case OP_MINQUERYI: 1984 case OP_MINSTAR: 1985 case OP_MINSTARI: 1986 case OP_MINUPTO: 1987 case OP_MINUPTOI: 1988 case OP_NOTMINPLUS: 1989 case OP_NOTMINPLUSI: 1990 case OP_NOTMINQUERY: 1991 case OP_NOTMINQUERYI: 1992 case OP_NOTMINSTAR: 1993 case OP_NOTMINSTARI: 1994 case OP_NOTMINUPTO: 1995 case OP_NOTMINUPTOI: 1996 case OP_NOTPLUS: 1997 case OP_NOTPLUSI: 1998 case OP_NOTPOSPLUS: 1999 case OP_NOTPOSPLUSI: 2000 case OP_NOTPOSQUERY: 2001 case OP_NOTPOSQUERYI: 2002 case OP_NOTPOSSTAR: 2003 case OP_NOTPOSSTARI: 2004 case OP_NOTPOSUPTO: 2005 case OP_NOTPOSUPTOI: 2006 case OP_NOTQUERY: 2007 
case OP_NOTQUERYI: 2008 case OP_NOTSTAR: 2009 case OP_NOTSTARI: 2010 case OP_NOTUPTO: 2011 case OP_NOTUPTOI: 2012 case OP_PLUS: 2013 case OP_PLUSI: 2014 case OP_POSPLUS: 2015 case OP_POSPLUSI: 2016 case OP_POSQUERY: 2017 case OP_POSQUERYI: 2018 case OP_POSSTAR: 2019 case OP_POSSTARI: 2020 case OP_POSUPTO: 2021 case OP_POSUPTOI: 2022 case OP_QUERY: 2023 case OP_QUERYI: 2024 case OP_REF: 2025 case OP_REFI: 2026 case OP_DNREF: 2027 case OP_DNREFI: 2028 case OP_SBRA: 2029 case OP_SBRAPOS: 2030 case OP_SCBRA: 2031 case OP_SCBRAPOS: 2032 case OP_SCOND: 2033 case OP_SKIPZERO: 2034 case OP_STAR: 2035 case OP_STARI: 2036 case OP_TYPEMINPLUS: 2037 case OP_TYPEMINQUERY: 2038 case OP_TYPEMINSTAR: 2039 case OP_TYPEMINUPTO: 2040 case OP_TYPEPLUS: 2041 case OP_TYPEPOSPLUS: 2042 case OP_TYPEPOSQUERY: 2043 case OP_TYPEPOSSTAR: 2044 case OP_TYPEPOSUPTO: 2045 case OP_TYPEQUERY: 2046 case OP_TYPESTAR: 2047 case OP_TYPEUPTO: 2048 case OP_UPTO: 2049 case OP_UPTOI: 2050 return -1; 2051 2052 /* Catch unrecognized opcodes so that when new ones are added they 2053 are not forgotten, as has happened in the past. */ 2054 2055 default: 2056 return -4; 2057 } 2058 } 2059/* Control never gets here */ 2060} 2061 2062 2063 2064/************************************************* 2065* Scan compiled regex for specific bracket * 2066*************************************************/ 2067 2068/* This little function scans through a compiled pattern until it finds a 2069capturing bracket with the given number, or, if the number is negative, an 2070instance of OP_REVERSE for a lookbehind. The function is global in the C sense 2071so that it can be called from pcre_study() when finding the minimum matching 2072length. 
2073 2074Arguments: 2075 code points to start of expression 2076 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode 2077 number the required bracket number or negative to find a lookbehind 2078 2079Returns: pointer to the opcode for the bracket, or NULL if not found 2080*/ 2081 2082const pcre_uchar * 2083PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number) 2084{ 2085for (;;) 2086 { 2087 register pcre_uchar c = *code; 2088 2089 if (c == OP_END) return NULL; 2090 2091 /* XCLASS is used for classes that cannot be represented just by a bit 2092 map. This includes negated single high-valued characters. The length in 2093 the table is zero; the actual length is stored in the compiled code. */ 2094 2095 if (c == OP_XCLASS) code += GET(code, 1); 2096 2097 /* Handle recursion */ 2098 2099 else if (c == OP_REVERSE) 2100 { 2101 if (number < 0) return (pcre_uchar *)code; 2102 code += PRIV(OP_lengths)[c]; 2103 } 2104 2105 /* Handle capturing bracket */ 2106 2107 else if (c == OP_CBRA || c == OP_SCBRA || 2108 c == OP_CBRAPOS || c == OP_SCBRAPOS) 2109 { 2110 int n = (int)GET2(code, 1+LINK_SIZE); 2111 if (n == number) return (pcre_uchar *)code; 2112 code += PRIV(OP_lengths)[c]; 2113 } 2114 2115 /* Otherwise, we can get the item's length from the table, except that for 2116 repeated character types, we have to test for \p and \P, which have an extra 2117 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we 2118 must add in its length. 
*/ 2119 2120 else 2121 { 2122 switch(c) 2123 { 2124 case OP_TYPESTAR: 2125 case OP_TYPEMINSTAR: 2126 case OP_TYPEPLUS: 2127 case OP_TYPEMINPLUS: 2128 case OP_TYPEQUERY: 2129 case OP_TYPEMINQUERY: 2130 case OP_TYPEPOSSTAR: 2131 case OP_TYPEPOSPLUS: 2132 case OP_TYPEPOSQUERY: 2133 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; 2134 break; 2135 2136 case OP_TYPEUPTO: 2137 case OP_TYPEMINUPTO: 2138 case OP_TYPEEXACT: 2139 case OP_TYPEPOSUPTO: 2140 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP) 2141 code += 2; 2142 break; 2143 2144 case OP_MARK: 2145 case OP_PRUNE_ARG: 2146 case OP_SKIP_ARG: 2147 case OP_THEN_ARG: 2148 code += code[1]; 2149 break; 2150 } 2151 2152 /* Add in the fixed length from the table */ 2153 2154 code += PRIV(OP_lengths)[c]; 2155 2156 /* In UTF-8 mode, opcodes that are followed by a character may be followed by 2157 a multi-byte character. The length in the table is a minimum, so we have to 2158 arrange to skip the extra bytes. */ 2159 2160#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 2161 if (utf) switch(c) 2162 { 2163 case OP_CHAR: 2164 case OP_CHARI: 2165 case OP_NOT: 2166 case OP_NOTI: 2167 case OP_EXACT: 2168 case OP_EXACTI: 2169 case OP_NOTEXACT: 2170 case OP_NOTEXACTI: 2171 case OP_UPTO: 2172 case OP_UPTOI: 2173 case OP_NOTUPTO: 2174 case OP_NOTUPTOI: 2175 case OP_MINUPTO: 2176 case OP_MINUPTOI: 2177 case OP_NOTMINUPTO: 2178 case OP_NOTMINUPTOI: 2179 case OP_POSUPTO: 2180 case OP_POSUPTOI: 2181 case OP_NOTPOSUPTO: 2182 case OP_NOTPOSUPTOI: 2183 case OP_STAR: 2184 case OP_STARI: 2185 case OP_NOTSTAR: 2186 case OP_NOTSTARI: 2187 case OP_MINSTAR: 2188 case OP_MINSTARI: 2189 case OP_NOTMINSTAR: 2190 case OP_NOTMINSTARI: 2191 case OP_POSSTAR: 2192 case OP_POSSTARI: 2193 case OP_NOTPOSSTAR: 2194 case OP_NOTPOSSTARI: 2195 case OP_PLUS: 2196 case OP_PLUSI: 2197 case OP_NOTPLUS: 2198 case OP_NOTPLUSI: 2199 case OP_MINPLUS: 2200 case OP_MINPLUSI: 2201 case OP_NOTMINPLUS: 2202 case OP_NOTMINPLUSI: 2203 case 
OP_POSPLUS: 2204 case OP_POSPLUSI: 2205 case OP_NOTPOSPLUS: 2206 case OP_NOTPOSPLUSI: 2207 case OP_QUERY: 2208 case OP_QUERYI: 2209 case OP_NOTQUERY: 2210 case OP_NOTQUERYI: 2211 case OP_MINQUERY: 2212 case OP_MINQUERYI: 2213 case OP_NOTMINQUERY: 2214 case OP_NOTMINQUERYI: 2215 case OP_POSQUERY: 2216 case OP_POSQUERYI: 2217 case OP_NOTPOSQUERY: 2218 case OP_NOTPOSQUERYI: 2219 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); 2220 break; 2221 } 2222#else 2223 (void)(utf); /* Keep compiler happy by referencing function argument */ 2224#endif 2225 } 2226 } 2227} 2228 2229 2230 2231/************************************************* 2232* Scan compiled regex for recursion reference * 2233*************************************************/ 2234 2235/* This little function scans through a compiled pattern until it finds an 2236instance of OP_RECURSE. 2237 2238Arguments: 2239 code points to start of expression 2240 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode 2241 2242Returns: pointer to the opcode for OP_RECURSE, or NULL if not found 2243*/ 2244 2245static const pcre_uchar * 2246find_recurse(const pcre_uchar *code, BOOL utf) 2247{ 2248for (;;) 2249 { 2250 register pcre_uchar c = *code; 2251 if (c == OP_END) return NULL; 2252 if (c == OP_RECURSE) return code; 2253 2254 /* XCLASS is used for classes that cannot be represented just by a bit 2255 map. This includes negated single high-valued characters. The length in 2256 the table is zero; the actual length is stored in the compiled code. */ 2257 2258 if (c == OP_XCLASS) code += GET(code, 1); 2259 2260 /* Otherwise, we can get the item's length from the table, except that for 2261 repeated character types, we have to test for \p and \P, which have an extra 2262 two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we 2263 must add in its length. 
*/ 2264 2265 else 2266 { 2267 switch(c) 2268 { 2269 case OP_TYPESTAR: 2270 case OP_TYPEMINSTAR: 2271 case OP_TYPEPLUS: 2272 case OP_TYPEMINPLUS: 2273 case OP_TYPEQUERY: 2274 case OP_TYPEMINQUERY: 2275 case OP_TYPEPOSSTAR: 2276 case OP_TYPEPOSPLUS: 2277 case OP_TYPEPOSQUERY: 2278 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; 2279 break; 2280 2281 case OP_TYPEPOSUPTO: 2282 case OP_TYPEUPTO: 2283 case OP_TYPEMINUPTO: 2284 case OP_TYPEEXACT: 2285 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP) 2286 code += 2; 2287 break; 2288 2289 case OP_MARK: 2290 case OP_PRUNE_ARG: 2291 case OP_SKIP_ARG: 2292 case OP_THEN_ARG: 2293 code += code[1]; 2294 break; 2295 } 2296 2297 /* Add in the fixed length from the table */ 2298 2299 code += PRIV(OP_lengths)[c]; 2300 2301 /* In UTF-8 mode, opcodes that are followed by a character may be followed 2302 by a multi-byte character. The length in the table is a minimum, so we have 2303 to arrange to skip the extra bytes. */ 2304 2305#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 2306 if (utf) switch(c) 2307 { 2308 case OP_CHAR: 2309 case OP_CHARI: 2310 case OP_NOT: 2311 case OP_NOTI: 2312 case OP_EXACT: 2313 case OP_EXACTI: 2314 case OP_NOTEXACT: 2315 case OP_NOTEXACTI: 2316 case OP_UPTO: 2317 case OP_UPTOI: 2318 case OP_NOTUPTO: 2319 case OP_NOTUPTOI: 2320 case OP_MINUPTO: 2321 case OP_MINUPTOI: 2322 case OP_NOTMINUPTO: 2323 case OP_NOTMINUPTOI: 2324 case OP_POSUPTO: 2325 case OP_POSUPTOI: 2326 case OP_NOTPOSUPTO: 2327 case OP_NOTPOSUPTOI: 2328 case OP_STAR: 2329 case OP_STARI: 2330 case OP_NOTSTAR: 2331 case OP_NOTSTARI: 2332 case OP_MINSTAR: 2333 case OP_MINSTARI: 2334 case OP_NOTMINSTAR: 2335 case OP_NOTMINSTARI: 2336 case OP_POSSTAR: 2337 case OP_POSSTARI: 2338 case OP_NOTPOSSTAR: 2339 case OP_NOTPOSSTARI: 2340 case OP_PLUS: 2341 case OP_PLUSI: 2342 case OP_NOTPLUS: 2343 case OP_NOTPLUSI: 2344 case OP_MINPLUS: 2345 case OP_MINPLUSI: 2346 case OP_NOTMINPLUS: 2347 case OP_NOTMINPLUSI: 2348 case 
OP_POSPLUS: 2349 case OP_POSPLUSI: 2350 case OP_NOTPOSPLUS: 2351 case OP_NOTPOSPLUSI: 2352 case OP_QUERY: 2353 case OP_QUERYI: 2354 case OP_NOTQUERY: 2355 case OP_NOTQUERYI: 2356 case OP_MINQUERY: 2357 case OP_MINQUERYI: 2358 case OP_NOTMINQUERY: 2359 case OP_NOTMINQUERYI: 2360 case OP_POSQUERY: 2361 case OP_POSQUERYI: 2362 case OP_NOTPOSQUERY: 2363 case OP_NOTPOSQUERYI: 2364 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); 2365 break; 2366 } 2367#else 2368 (void)(utf); /* Keep compiler happy by referencing function argument */ 2369#endif 2370 } 2371 } 2372} 2373 2374 2375 2376/************************************************* 2377* Scan compiled branch for non-emptiness * 2378*************************************************/ 2379 2380/* This function scans through a branch of a compiled pattern to see whether it 2381can match the empty string or not. It is called from could_be_empty() 2382below and from compile_branch() when checking for an unlimited repeat of a 2383group that can match nothing. Note that first_significant_code() skips over 2384backward and negative forward assertions when its final argument is TRUE. If we 2385hit an unclosed bracket, we return "empty" - this means we've struck an inner 2386bracket whose current branch will already have been scanned. 2387 2388Arguments: 2389 code points to start of search 2390 endcode points to where to stop 2391 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode 2392 cd contains pointers to tables etc. 
2393 recurses chain of recurse_check to catch mutual recursion 2394 2395Returns: TRUE if what is matched could be empty 2396*/ 2397 2398static BOOL 2399could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode, 2400 BOOL utf, compile_data *cd, recurse_check *recurses) 2401{ 2402register pcre_uchar c; 2403recurse_check this_recurse; 2404 2405for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); 2406 code < endcode; 2407 code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE)) 2408 { 2409 const pcre_uchar *ccode; 2410 2411 c = *code; 2412 2413 /* Skip over forward assertions; the other assertions are skipped by 2414 first_significant_code() with a TRUE final argument. */ 2415 2416 if (c == OP_ASSERT) 2417 { 2418 do code += GET(code, 1); while (*code == OP_ALT); 2419 c = *code; 2420 continue; 2421 } 2422 2423 /* For a recursion/subroutine call, if its end has been reached, which 2424 implies a backward reference subroutine call, we can scan it. If it's a 2425 forward reference subroutine call, we can't. To detect forward reference 2426 we have to scan up the list that is kept in the workspace. This function is 2427 called only when doing the real compile, not during the pre-compile that 2428 measures the size of the compiled pattern. */ 2429 2430 if (c == OP_RECURSE) 2431 { 2432 const pcre_uchar *scode = cd->start_code + GET(code, 1); 2433 const pcre_uchar *endgroup = scode; 2434 BOOL empty_branch; 2435 2436 /* Test for forward reference or uncompleted reference. This is disabled 2437 when called to scan a completed pattern by setting cd->start_workspace to 2438 NULL. 
*/ 2439 2440 if (cd->start_workspace != NULL) 2441 { 2442 const pcre_uchar *tcode; 2443 for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE) 2444 if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE; 2445 if (GET(scode, 1) == 0) return TRUE; /* Unclosed */ 2446 } 2447 2448 /* If the reference is to a completed group, we need to detect whether this 2449 is a recursive call, as otherwise there will be an infinite loop. If it is 2450 a recursion, just skip over it. Simple recursions are easily detected. For 2451 mutual recursions we keep a chain on the stack. */ 2452 2453 do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT); 2454 if (code >= scode && code <= endgroup) continue; /* Simple recursion */ 2455 else 2456 { 2457 recurse_check *r = recurses; 2458 for (r = recurses; r != NULL; r = r->prev) 2459 if (r->group == scode) break; 2460 if (r != NULL) continue; /* Mutual recursion */ 2461 } 2462 2463 /* Completed reference; scan the referenced group, remembering it on the 2464 stack chain to detect mutual recursions. */ 2465 2466 empty_branch = FALSE; 2467 this_recurse.prev = recurses; 2468 this_recurse.group = scode; 2469 2470 do 2471 { 2472 if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse)) 2473 { 2474 empty_branch = TRUE; 2475 break; 2476 } 2477 scode += GET(scode, 1); 2478 } 2479 while (*scode == OP_ALT); 2480 2481 if (!empty_branch) return FALSE; /* All branches are non-empty */ 2482 continue; 2483 } 2484 2485 /* Groups with zero repeats can of course be empty; skip them. */ 2486 2487 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO || 2488 c == OP_BRAPOSZERO) 2489 { 2490 code += PRIV(OP_lengths)[c]; 2491 do code += GET(code, 1); while (*code == OP_ALT); 2492 c = *code; 2493 continue; 2494 } 2495 2496 /* A nested group that is already marked as "could be empty" can just be 2497 skipped. 
*/ 2498 2499 if (c == OP_SBRA || c == OP_SBRAPOS || 2500 c == OP_SCBRA || c == OP_SCBRAPOS) 2501 { 2502 do code += GET(code, 1); while (*code == OP_ALT); 2503 c = *code; 2504 continue; 2505 } 2506 2507 /* For other groups, scan the branches. */ 2508 2509 if (c == OP_BRA || c == OP_BRAPOS || 2510 c == OP_CBRA || c == OP_CBRAPOS || 2511 c == OP_ONCE || c == OP_ONCE_NC || 2512 c == OP_COND || c == OP_SCOND) 2513 { 2514 BOOL empty_branch; 2515 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */ 2516 2517 /* If a conditional group has only one branch, there is a second, implied, 2518 empty branch, so just skip over the conditional, because it could be empty. 2519 Otherwise, scan the individual branches of the group. */ 2520 2521 if (c == OP_COND && code[GET(code, 1)] != OP_ALT) 2522 code += GET(code, 1); 2523 else 2524 { 2525 empty_branch = FALSE; 2526 do 2527 { 2528 if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd, 2529 recurses)) empty_branch = TRUE; 2530 code += GET(code, 1); 2531 } 2532 while (*code == OP_ALT); 2533 if (!empty_branch) return FALSE; /* All branches are non-empty */ 2534 } 2535 2536 c = *code; 2537 continue; 2538 } 2539 2540 /* Handle the other opcodes */ 2541 2542 switch (c) 2543 { 2544 /* Check for quantifiers after a class. XCLASS is used for classes that 2545 cannot be represented just by a bit map. This includes negated single 2546 high-valued characters. The length in PRIV(OP_lengths)[] is zero; the 2547 actual length is stored in the compiled code, so we must update "code" 2548 here. 
*/ 2549 2550#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 2551 case OP_XCLASS: 2552 ccode = code += GET(code, 1); 2553 goto CHECK_CLASS_REPEAT; 2554#endif 2555 2556 case OP_CLASS: 2557 case OP_NCLASS: 2558 ccode = code + PRIV(OP_lengths)[OP_CLASS]; 2559 2560#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 2561 CHECK_CLASS_REPEAT: 2562#endif 2563 2564 switch (*ccode) 2565 { 2566 case OP_CRSTAR: /* These could be empty; continue */ 2567 case OP_CRMINSTAR: 2568 case OP_CRQUERY: 2569 case OP_CRMINQUERY: 2570 case OP_CRPOSSTAR: 2571 case OP_CRPOSQUERY: 2572 break; 2573 2574 default: /* Non-repeat => class must match */ 2575 case OP_CRPLUS: /* These repeats aren't empty */ 2576 case OP_CRMINPLUS: 2577 case OP_CRPOSPLUS: 2578 return FALSE; 2579 2580 case OP_CRRANGE: 2581 case OP_CRMINRANGE: 2582 case OP_CRPOSRANGE: 2583 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */ 2584 break; 2585 } 2586 break; 2587 2588 /* Opcodes that must match a character */ 2589 2590 case OP_ANY: 2591 case OP_ALLANY: 2592 case OP_ANYBYTE: 2593 2594 case OP_PROP: 2595 case OP_NOTPROP: 2596 case OP_ANYNL: 2597 2598 case OP_NOT_HSPACE: 2599 case OP_HSPACE: 2600 case OP_NOT_VSPACE: 2601 case OP_VSPACE: 2602 case OP_EXTUNI: 2603 2604 case OP_NOT_DIGIT: 2605 case OP_DIGIT: 2606 case OP_NOT_WHITESPACE: 2607 case OP_WHITESPACE: 2608 case OP_NOT_WORDCHAR: 2609 case OP_WORDCHAR: 2610 2611 case OP_CHAR: 2612 case OP_CHARI: 2613 case OP_NOT: 2614 case OP_NOTI: 2615 2616 case OP_PLUS: 2617 case OP_PLUSI: 2618 case OP_MINPLUS: 2619 case OP_MINPLUSI: 2620 2621 case OP_NOTPLUS: 2622 case OP_NOTPLUSI: 2623 case OP_NOTMINPLUS: 2624 case OP_NOTMINPLUSI: 2625 2626 case OP_POSPLUS: 2627 case OP_POSPLUSI: 2628 case OP_NOTPOSPLUS: 2629 case OP_NOTPOSPLUSI: 2630 2631 case OP_EXACT: 2632 case OP_EXACTI: 2633 case OP_NOTEXACT: 2634 case OP_NOTEXACTI: 2635 2636 case OP_TYPEPLUS: 2637 case OP_TYPEMINPLUS: 2638 case OP_TYPEPOSPLUS: 2639 case OP_TYPEEXACT: 2640 2641 return FALSE; 2642 2643 /* These are going to 
continue, as they may be empty, but we have to 2644 fudge the length for the \p and \P cases. */ 2645 2646 case OP_TYPESTAR: 2647 case OP_TYPEMINSTAR: 2648 case OP_TYPEPOSSTAR: 2649 case OP_TYPEQUERY: 2650 case OP_TYPEMINQUERY: 2651 case OP_TYPEPOSQUERY: 2652 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; 2653 break; 2654 2655 /* Same for these */ 2656 2657 case OP_TYPEUPTO: 2658 case OP_TYPEMINUPTO: 2659 case OP_TYPEPOSUPTO: 2660 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP) 2661 code += 2; 2662 break; 2663 2664 /* End of branch */ 2665 2666 case OP_KET: 2667 case OP_KETRMAX: 2668 case OP_KETRMIN: 2669 case OP_KETRPOS: 2670 case OP_ALT: 2671 return TRUE; 2672 2673 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO, 2674 MINUPTO, and POSUPTO and their caseless and negative versions may be 2675 followed by a multibyte character. */ 2676 2677#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 2678 case OP_STAR: 2679 case OP_STARI: 2680 case OP_NOTSTAR: 2681 case OP_NOTSTARI: 2682 2683 case OP_MINSTAR: 2684 case OP_MINSTARI: 2685 case OP_NOTMINSTAR: 2686 case OP_NOTMINSTARI: 2687 2688 case OP_POSSTAR: 2689 case OP_POSSTARI: 2690 case OP_NOTPOSSTAR: 2691 case OP_NOTPOSSTARI: 2692 2693 case OP_QUERY: 2694 case OP_QUERYI: 2695 case OP_NOTQUERY: 2696 case OP_NOTQUERYI: 2697 2698 case OP_MINQUERY: 2699 case OP_MINQUERYI: 2700 case OP_NOTMINQUERY: 2701 case OP_NOTMINQUERYI: 2702 2703 case OP_POSQUERY: 2704 case OP_POSQUERYI: 2705 case OP_NOTPOSQUERY: 2706 case OP_NOTPOSQUERYI: 2707 2708 if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]); 2709 break; 2710 2711 case OP_UPTO: 2712 case OP_UPTOI: 2713 case OP_NOTUPTO: 2714 case OP_NOTUPTOI: 2715 2716 case OP_MINUPTO: 2717 case OP_MINUPTOI: 2718 case OP_NOTMINUPTO: 2719 case OP_NOTMINUPTOI: 2720 2721 case OP_POSUPTO: 2722 case OP_POSUPTOI: 2723 case OP_NOTPOSUPTO: 2724 case OP_NOTPOSUPTOI: 2725 2726 if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += 
GET_EXTRALEN(code[1 + IMM2_SIZE]); 2727 break; 2728#endif 2729 2730 /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument 2731 string. */ 2732 2733 case OP_MARK: 2734 case OP_PRUNE_ARG: 2735 case OP_SKIP_ARG: 2736 case OP_THEN_ARG: 2737 code += code[1]; 2738 break; 2739 2740 /* None of the remaining opcodes are required to match a character. */ 2741 2742 default: 2743 break; 2744 } 2745 } 2746 2747return TRUE; 2748} 2749 2750 2751 2752/************************************************* 2753* Scan compiled regex for non-emptiness * 2754*************************************************/ 2755 2756/* This function is called to check for left recursive calls. We want to check 2757the current branch of the current pattern to see if it could match the empty 2758string. If it could, we must look outwards for branches at other levels, 2759stopping when we pass beyond the bracket which is the subject of the recursion. 2760This function is called only during the real compile, not during the 2761pre-compile. 2762 2763Arguments: 2764 code points to start of the recursion 2765 endcode points to where to stop (current RECURSE item) 2766 bcptr points to the chain of current (unclosed) branch starts 2767 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode 2768 cd pointers to tables etc 2769 2770Returns: TRUE if what is matched could be empty 2771*/ 2772 2773static BOOL 2774could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode, 2775 branch_chain *bcptr, BOOL utf, compile_data *cd) 2776{ 2777while (bcptr != NULL && bcptr->current_branch >= code) 2778 { 2779 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd, NULL)) 2780 return FALSE; 2781 bcptr = bcptr->outer; 2782 } 2783return TRUE; 2784} 2785 2786 2787 2788/************************************************* 2789* Base opcode of repeated opcodes * 2790*************************************************/ 2791 2792/* Returns the base opcode for repeated single character type opcodes. 
If the 2793opcode is not a repeated character type, it returns with the original value. 2794 2795Arguments: c opcode 2796Returns: base opcode for the type 2797*/ 2798 2799static pcre_uchar 2800get_repeat_base(pcre_uchar c) 2801{ 2802return (c > OP_TYPEPOSUPTO)? c : 2803 (c >= OP_TYPESTAR)? OP_TYPESTAR : 2804 (c >= OP_NOTSTARI)? OP_NOTSTARI : 2805 (c >= OP_NOTSTAR)? OP_NOTSTAR : 2806 (c >= OP_STARI)? OP_STARI : 2807 OP_STAR; 2808} 2809 2810 2811 2812#ifdef SUPPORT_UCP 2813/************************************************* 2814* Check a character and a property * 2815*************************************************/ 2816 2817/* This function is called by check_auto_possessive() when a property item 2818is adjacent to a fixed character. 2819 2820Arguments: 2821 c the character 2822 ptype the property type 2823 pdata the data for the type 2824 negated TRUE if it's a negated property (\P or \p{^) 2825 2826Returns: TRUE if auto-possessifying is OK 2827*/ 2828 2829static BOOL 2830check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata, 2831 BOOL negated) 2832{ 2833const pcre_uint32 *p; 2834const ucd_record *prop = GET_UCD(c); 2835 2836switch(ptype) 2837 { 2838 case PT_LAMP: 2839 return (prop->chartype == ucp_Lu || 2840 prop->chartype == ucp_Ll || 2841 prop->chartype == ucp_Lt) == negated; 2842 2843 case PT_GC: 2844 return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated; 2845 2846 case PT_PC: 2847 return (pdata == prop->chartype) == negated; 2848 2849 case PT_SC: 2850 return (pdata == prop->script) == negated; 2851 2852 /* These are specials */ 2853 2854 case PT_ALNUM: 2855 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L || 2856 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated; 2857 2858 /* Perl space used to exclude VT, but from Perl 5.18 it is included, which 2859 means that Perl space and POSIX space are now identical. PCRE was changed 2860 at release 8.34. 
*/ 2861 2862 case PT_SPACE: /* Perl space */ 2863 case PT_PXSPACE: /* POSIX space */ 2864 switch(c) 2865 { 2866 HSPACE_CASES: 2867 VSPACE_CASES: 2868 return negated; 2869 2870 default: 2871 return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated; 2872 } 2873 break; /* Control never reaches here */ 2874 2875 case PT_WORD: 2876 return (PRIV(ucp_gentype)[prop->chartype] == ucp_L || 2877 PRIV(ucp_gentype)[prop->chartype] == ucp_N || 2878 c == CHAR_UNDERSCORE) == negated; 2879 2880 case PT_CLIST: 2881 p = PRIV(ucd_caseless_sets) + prop->caseset; 2882 for (;;) 2883 { 2884 if (c < *p) return !negated; 2885 if (c == *p++) return negated; 2886 } 2887 break; /* Control never reaches here */ 2888 } 2889 2890return FALSE; 2891} 2892#endif /* SUPPORT_UCP */ 2893 2894 2895 2896/************************************************* 2897* Fill the character property list * 2898*************************************************/ 2899 2900/* Checks whether the code points to an opcode that can take part in auto- 2901possessification, and if so, fills a list with its properties. 
2902 2903Arguments: 2904 code points to start of expression 2905 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode 2906 fcc points to case-flipping table 2907 list points to output list 2908 list[0] will be filled with the opcode 2909 list[1] will be non-zero if this opcode 2910 can match an empty character string 2911 list[2..7] depends on the opcode 2912 2913Returns: points to the start of the next opcode if *code is accepted 2914 NULL if *code is not accepted 2915*/ 2916 2917static const pcre_uchar * 2918get_chr_property_list(const pcre_uchar *code, BOOL utf, 2919 const pcre_uint8 *fcc, pcre_uint32 *list) 2920{ 2921pcre_uchar c = *code; 2922pcre_uchar base; 2923const pcre_uchar *end; 2924pcre_uint32 chr; 2925 2926#ifdef SUPPORT_UCP 2927pcre_uint32 *clist_dest; 2928const pcre_uint32 *clist_src; 2929#else 2930utf = utf; /* Suppress "unused parameter" compiler warning */ 2931#endif 2932 2933list[0] = c; 2934list[1] = FALSE; 2935code++; 2936 2937if (c >= OP_STAR && c <= OP_TYPEPOSUPTO) 2938 { 2939 base = get_repeat_base(c); 2940 c -= (base - OP_STAR); 2941 2942 if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO) 2943 code += IMM2_SIZE; 2944 2945 list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && c != OP_POSPLUS); 2946 2947 switch(base) 2948 { 2949 case OP_STAR: 2950 list[0] = OP_CHAR; 2951 break; 2952 2953 case OP_STARI: 2954 list[0] = OP_CHARI; 2955 break; 2956 2957 case OP_NOTSTAR: 2958 list[0] = OP_NOT; 2959 break; 2960 2961 case OP_NOTSTARI: 2962 list[0] = OP_NOTI; 2963 break; 2964 2965 case OP_TYPESTAR: 2966 list[0] = *code; 2967 code++; 2968 break; 2969 } 2970 c = list[0]; 2971 } 2972 2973switch(c) 2974 { 2975 case OP_NOT_DIGIT: 2976 case OP_DIGIT: 2977 case OP_NOT_WHITESPACE: 2978 case OP_WHITESPACE: 2979 case OP_NOT_WORDCHAR: 2980 case OP_WORDCHAR: 2981 case OP_ANY: 2982 case OP_ALLANY: 2983 case OP_ANYNL: 2984 case OP_NOT_HSPACE: 2985 case OP_HSPACE: 2986 case OP_NOT_VSPACE: 2987 case OP_VSPACE: 2988 case OP_EXTUNI: 2989 case 
OP_EODN: 2990 case OP_EOD: 2991 case OP_DOLL: 2992 case OP_DOLLM: 2993 return code; 2994 2995 case OP_CHAR: 2996 case OP_NOT: 2997 GETCHARINCTEST(chr, code); 2998 list[2] = chr; 2999 list[3] = NOTACHAR; 3000 return code; 3001 3002 case OP_CHARI: 3003 case OP_NOTI: 3004 list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT; 3005 GETCHARINCTEST(chr, code); 3006 list[2] = chr; 3007 3008#ifdef SUPPORT_UCP 3009 if (chr < 128 || (chr < 256 && !utf)) 3010 list[3] = fcc[chr]; 3011 else 3012 list[3] = UCD_OTHERCASE(chr); 3013#elif defined SUPPORT_UTF || !defined COMPILE_PCRE8 3014 list[3] = (chr < 256) ? fcc[chr] : chr; 3015#else 3016 list[3] = fcc[chr]; 3017#endif 3018 3019 /* The othercase might be the same value. */ 3020 3021 if (chr == list[3]) 3022 list[3] = NOTACHAR; 3023 else 3024 list[4] = NOTACHAR; 3025 return code; 3026 3027#ifdef SUPPORT_UCP 3028 case OP_PROP: 3029 case OP_NOTPROP: 3030 if (code[0] != PT_CLIST) 3031 { 3032 list[2] = code[0]; 3033 list[3] = code[1]; 3034 return code + 2; 3035 } 3036 3037 /* Convert only if we have enough space. */ 3038 3039 clist_src = PRIV(ucd_caseless_sets) + code[1]; 3040 clist_dest = list + 2; 3041 code += 2; 3042 3043 do { 3044 if (clist_dest >= list + 8) 3045 { 3046 /* Early return if there is not enough space. This should never 3047 happen, since all clists are shorter than 5 character now. */ 3048 list[2] = code[0]; 3049 list[3] = code[1]; 3050 return code; 3051 } 3052 *clist_dest++ = *clist_src; 3053 } 3054 while(*clist_src++ != NOTACHAR); 3055 3056 /* All characters are stored. The terminating NOTACHAR 3057 is copied form the clist itself. */ 3058 3059 list[0] = (c == OP_PROP) ? 
OP_CHAR : OP_NOT; 3060 return code; 3061#endif 3062 3063 case OP_NCLASS: 3064 case OP_CLASS: 3065#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 3066 case OP_XCLASS: 3067 if (c == OP_XCLASS) 3068 end = code + GET(code, 0) - 1; 3069 else 3070#endif 3071 end = code + 32 / sizeof(pcre_uchar); 3072 3073 switch(*end) 3074 { 3075 case OP_CRSTAR: 3076 case OP_CRMINSTAR: 3077 case OP_CRQUERY: 3078 case OP_CRMINQUERY: 3079 case OP_CRPOSSTAR: 3080 case OP_CRPOSQUERY: 3081 list[1] = TRUE; 3082 end++; 3083 break; 3084 3085 case OP_CRPLUS: 3086 case OP_CRMINPLUS: 3087 case OP_CRPOSPLUS: 3088 end++; 3089 break; 3090 3091 case OP_CRRANGE: 3092 case OP_CRMINRANGE: 3093 case OP_CRPOSRANGE: 3094 list[1] = (GET2(end, 1) == 0); 3095 end += 1 + 2 * IMM2_SIZE; 3096 break; 3097 } 3098 list[2] = (pcre_uint32)(end - code); 3099 return end; 3100 } 3101return NULL; /* Opcode not accepted */ 3102} 3103 3104 3105 3106/************************************************* 3107* Scan further character sets for match * 3108*************************************************/ 3109 3110/* Checks whether the base and the current opcode have a common character, in 3111which case the base cannot be possessified. 
3112 3113Arguments: 3114 code points to the byte code 3115 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode 3116 cd static compile data 3117 base_list the data list of the base opcode 3118 3119Returns: TRUE if the auto-possessification is possible 3120*/ 3121 3122static BOOL 3123compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd, 3124 const pcre_uint32 *base_list, const pcre_uchar *base_end, int *rec_limit) 3125{ 3126pcre_uchar c; 3127pcre_uint32 list[8]; 3128const pcre_uint32 *chr_ptr; 3129const pcre_uint32 *ochr_ptr; 3130const pcre_uint32 *list_ptr; 3131const pcre_uchar *next_code; 3132#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 3133const pcre_uchar *xclass_flags; 3134#endif 3135const pcre_uint8 *class_bitset; 3136const pcre_uint8 *set1, *set2, *set_end; 3137pcre_uint32 chr; 3138BOOL accepted, invert_bits; 3139BOOL entered_a_group = FALSE; 3140 3141if (*rec_limit == 0) return FALSE; 3142--(*rec_limit); 3143 3144/* Note: the base_list[1] contains whether the current opcode has greedy 3145(represented by a non-zero value) quantifier. This is a different from 3146other character type lists, which stores here that the character iterator 3147matches to an empty string (also represented by a non-zero value). */ 3148 3149for(;;) 3150 { 3151 /* All operations move the code pointer forward. 3152 Therefore infinite recursions are not possible. */ 3153 3154 c = *code; 3155 3156 /* Skip over callouts */ 3157 3158 if (c == OP_CALLOUT) 3159 { 3160 code += PRIV(OP_lengths)[c]; 3161 continue; 3162 } 3163 3164 if (c == OP_ALT) 3165 { 3166 do code += GET(code, 1); while (*code == OP_ALT); 3167 c = *code; 3168 } 3169 3170 switch(c) 3171 { 3172 case OP_END: 3173 case OP_KETRPOS: 3174 /* TRUE only in greedy case. The non-greedy case could be replaced by 3175 an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT 3176 uses more memory, which we cannot get at this stage.) 
*/ 3177 3178 return base_list[1] != 0; 3179 3180 case OP_KET: 3181 /* If the bracket is capturing, and referenced by an OP_RECURSE, or 3182 it is an atomic sub-pattern (assert, once, etc.) the non-greedy case 3183 cannot be converted to a possessive form. */ 3184 3185 if (base_list[1] == 0) return FALSE; 3186 3187 switch(*(code - GET(code, 1))) 3188 { 3189 case OP_ASSERT: 3190 case OP_ASSERT_NOT: 3191 case OP_ASSERTBACK: 3192 case OP_ASSERTBACK_NOT: 3193 case OP_ONCE: 3194 case OP_ONCE_NC: 3195 /* Atomic sub-patterns and assertions can always auto-possessify their 3196 last iterator. However, if the group was entered as a result of checking 3197 a previous iterator, this is not possible. */ 3198 3199 return !entered_a_group; 3200 } 3201 3202 code += PRIV(OP_lengths)[c]; 3203 continue; 3204 3205 case OP_ONCE: 3206 case OP_ONCE_NC: 3207 case OP_BRA: 3208 case OP_CBRA: 3209 next_code = code + GET(code, 1); 3210 code += PRIV(OP_lengths)[c]; 3211 3212 while (*next_code == OP_ALT) 3213 { 3214 if (!compare_opcodes(code, utf, cd, base_list, base_end, rec_limit)) 3215 return FALSE; 3216 code = next_code + 1 + LINK_SIZE; 3217 next_code += GET(next_code, 1); 3218 } 3219 3220 entered_a_group = TRUE; 3221 continue; 3222 3223 case OP_BRAZERO: 3224 case OP_BRAMINZERO: 3225 3226 next_code = code + 1; 3227 if (*next_code != OP_BRA && *next_code != OP_CBRA 3228 && *next_code != OP_ONCE && *next_code != OP_ONCE_NC) return FALSE; 3229 3230 do next_code += GET(next_code, 1); while (*next_code == OP_ALT); 3231 3232 /* The bracket content will be checked by the 3233 OP_BRA/OP_CBRA case above. */ 3234 next_code += 1 + LINK_SIZE; 3235 if (!compare_opcodes(next_code, utf, cd, base_list, base_end, rec_limit)) 3236 return FALSE; 3237 3238 code += PRIV(OP_lengths)[c]; 3239 continue; 3240 3241 default: 3242 break; 3243 } 3244 3245 /* Check for a supported opcode, and load its properties. 
*/ 3246 3247 code = get_chr_property_list(code, utf, cd->fcc, list); 3248 if (code == NULL) return FALSE; /* Unsupported */ 3249 3250 /* If either opcode is a small character list, set pointers for comparing 3251 characters from that list with another list, or with a property. */ 3252 3253 if (base_list[0] == OP_CHAR) 3254 { 3255 chr_ptr = base_list + 2; 3256 list_ptr = list; 3257 } 3258 else if (list[0] == OP_CHAR) 3259 { 3260 chr_ptr = list + 2; 3261 list_ptr = base_list; 3262 } 3263 3264 /* Character bitsets can also be compared to certain opcodes. */ 3265 3266 else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS 3267#ifdef COMPILE_PCRE8 3268 /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */ 3269 || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS)) 3270#endif 3271 ) 3272 { 3273#ifdef COMPILE_PCRE8 3274 if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS)) 3275#else 3276 if (base_list[0] == OP_CLASS) 3277#endif 3278 { 3279 set1 = (pcre_uint8 *)(base_end - base_list[2]); 3280 list_ptr = list; 3281 } 3282 else 3283 { 3284 set1 = (pcre_uint8 *)(code - list[2]); 3285 list_ptr = base_list; 3286 } 3287 3288 invert_bits = FALSE; 3289 switch(list_ptr[0]) 3290 { 3291 case OP_CLASS: 3292 case OP_NCLASS: 3293 set2 = (pcre_uint8 *) 3294 ((list_ptr == list ? code : base_end) - list_ptr[2]); 3295 break; 3296 3297#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 3298 case OP_XCLASS: 3299 xclass_flags = (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE; 3300 if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE; 3301 if ((*xclass_flags & XCL_MAP) == 0) 3302 { 3303 /* No bits are set for characters < 256. */ 3304 if (list[1] == 0) return TRUE; 3305 /* Might be an empty repeat. 
*/ 3306 continue; 3307 } 3308 set2 = (pcre_uint8 *)(xclass_flags + 1); 3309 break; 3310#endif 3311 3312 case OP_NOT_DIGIT: 3313 invert_bits = TRUE; 3314 /* Fall through */ 3315 case OP_DIGIT: 3316 set2 = (pcre_uint8 *)(cd->cbits + cbit_digit); 3317 break; 3318 3319 case OP_NOT_WHITESPACE: 3320 invert_bits = TRUE; 3321 /* Fall through */ 3322 case OP_WHITESPACE: 3323 set2 = (pcre_uint8 *)(cd->cbits + cbit_space); 3324 break; 3325 3326 case OP_NOT_WORDCHAR: 3327 invert_bits = TRUE; 3328 /* Fall through */ 3329 case OP_WORDCHAR: 3330 set2 = (pcre_uint8 *)(cd->cbits + cbit_word); 3331 break; 3332 3333 default: 3334 return FALSE; 3335 } 3336 3337 /* Because the sets are unaligned, we need 3338 to perform byte comparison here. */ 3339 set_end = set1 + 32; 3340 if (invert_bits) 3341 { 3342 do 3343 { 3344 if ((*set1++ & ~(*set2++)) != 0) return FALSE; 3345 } 3346 while (set1 < set_end); 3347 } 3348 else 3349 { 3350 do 3351 { 3352 if ((*set1++ & *set2++) != 0) return FALSE; 3353 } 3354 while (set1 < set_end); 3355 } 3356 3357 if (list[1] == 0) return TRUE; 3358 /* Might be an empty repeat. */ 3359 continue; 3360 } 3361 3362 /* Some property combinations also acceptable. Unicode property opcodes are 3363 processed specially; the rest can be handled with a lookup table. */ 3364 3365 else 3366 { 3367 pcre_uint32 leftop, rightop; 3368 3369 leftop = base_list[0]; 3370 rightop = list[0]; 3371 3372#ifdef SUPPORT_UCP 3373 accepted = FALSE; /* Always set in non-unicode case. 
*/ 3374 if (leftop == OP_PROP || leftop == OP_NOTPROP) 3375 { 3376 if (rightop == OP_EOD) 3377 accepted = TRUE; 3378 else if (rightop == OP_PROP || rightop == OP_NOTPROP) 3379 { 3380 int n; 3381 const pcre_uint8 *p; 3382 BOOL same = leftop == rightop; 3383 BOOL lisprop = leftop == OP_PROP; 3384 BOOL risprop = rightop == OP_PROP; 3385 BOOL bothprop = lisprop && risprop; 3386 3387 /* There's a table that specifies how each combination is to be 3388 processed: 3389 0 Always return FALSE (never auto-possessify) 3390 1 Character groups are distinct (possessify if both are OP_PROP) 3391 2 Check character categories in the same group (general or particular) 3392 3 Return TRUE if the two opcodes are not the same 3393 ... see comments below 3394 */ 3395 3396 n = propposstab[base_list[2]][list[2]]; 3397 switch(n) 3398 { 3399 case 0: break; 3400 case 1: accepted = bothprop; break; 3401 case 2: accepted = (base_list[3] == list[3]) != same; break; 3402 case 3: accepted = !same; break; 3403 3404 case 4: /* Left general category, right particular category */ 3405 accepted = risprop && catposstab[base_list[3]][list[3]] == same; 3406 break; 3407 3408 case 5: /* Right general category, left particular category */ 3409 accepted = lisprop && catposstab[list[3]][base_list[3]] == same; 3410 break; 3411 3412 /* This code is logically tricky. Think hard before fiddling with it. 3413 The posspropstab table has four entries per row. Each row relates to 3414 one of PCRE's special properties such as ALNUM or SPACE or WORD. 3415 Only WORD actually needs all four entries, but using repeats for the 3416 others means they can all use the same code below. 3417 3418 The first two entries in each row are Unicode general categories, and 3419 apply always, because all the characters they include are part of the 3420 PCRE character set. The third and fourth entries are a general and a 3421 particular category, respectively, that include one or more relevant 3422 characters. 
One or the other is used, depending on whether the check 3423 is for a general or a particular category. However, in both cases the 3424 category contains more characters than the specials that are defined 3425 for the property being tested against. Therefore, it cannot be used 3426 in a NOTPROP case. 3427 3428 Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po. 3429 Underscore is covered by ucp_P or ucp_Po. */ 3430 3431 case 6: /* Left alphanum vs right general category */ 3432 case 7: /* Left space vs right general category */ 3433 case 8: /* Left word vs right general category */ 3434 p = posspropstab[n-6]; 3435 accepted = risprop && lisprop == 3436 (list[3] != p[0] && 3437 list[3] != p[1] && 3438 (list[3] != p[2] || !lisprop)); 3439 break; 3440 3441 case 9: /* Right alphanum vs left general category */ 3442 case 10: /* Right space vs left general category */ 3443 case 11: /* Right word vs left general category */ 3444 p = posspropstab[n-9]; 3445 accepted = lisprop && risprop == 3446 (base_list[3] != p[0] && 3447 base_list[3] != p[1] && 3448 (base_list[3] != p[2] || !risprop)); 3449 break; 3450 3451 case 12: /* Left alphanum vs right particular category */ 3452 case 13: /* Left space vs right particular category */ 3453 case 14: /* Left word vs right particular category */ 3454 p = posspropstab[n-12]; 3455 accepted = risprop && lisprop == 3456 (catposstab[p[0]][list[3]] && 3457 catposstab[p[1]][list[3]] && 3458 (list[3] != p[3] || !lisprop)); 3459 break; 3460 3461 case 15: /* Right alphanum vs left particular category */ 3462 case 16: /* Right space vs left particular category */ 3463 case 17: /* Right word vs left particular category */ 3464 p = posspropstab[n-15]; 3465 accepted = lisprop && risprop == 3466 (catposstab[p[0]][base_list[3]] && 3467 catposstab[p[1]][base_list[3]] && 3468 (base_list[3] != p[3] || !risprop)); 3469 break; 3470 } 3471 } 3472 } 3473 3474 else 3475#endif /* SUPPORT_UCP */ 3476 3477 accepted = leftop >= FIRST_AUTOTAB_OP && 
leftop <= LAST_AUTOTAB_LEFT_OP && 3478 rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP && 3479 autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP]; 3480 3481 if (!accepted) return FALSE; 3482 3483 if (list[1] == 0) return TRUE; 3484 /* Might be an empty repeat. */ 3485 continue; 3486 } 3487 3488 /* Control reaches here only if one of the items is a small character list. 3489 All characters are checked against the other side. */ 3490 3491 do 3492 { 3493 chr = *chr_ptr; 3494 3495 switch(list_ptr[0]) 3496 { 3497 case OP_CHAR: 3498 ochr_ptr = list_ptr + 2; 3499 do 3500 { 3501 if (chr == *ochr_ptr) return FALSE; 3502 ochr_ptr++; 3503 } 3504 while(*ochr_ptr != NOTACHAR); 3505 break; 3506 3507 case OP_NOT: 3508 ochr_ptr = list_ptr + 2; 3509 do 3510 { 3511 if (chr == *ochr_ptr) 3512 break; 3513 ochr_ptr++; 3514 } 3515 while(*ochr_ptr != NOTACHAR); 3516 if (*ochr_ptr == NOTACHAR) return FALSE; /* Not found */ 3517 break; 3518 3519 /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* 3520 set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. 
*/ 3521 3522 case OP_DIGIT: 3523 if (chr < 256 && (cd->ctypes[chr] & ctype_digit) != 0) return FALSE; 3524 break; 3525 3526 case OP_NOT_DIGIT: 3527 if (chr > 255 || (cd->ctypes[chr] & ctype_digit) == 0) return FALSE; 3528 break; 3529 3530 case OP_WHITESPACE: 3531 if (chr < 256 && (cd->ctypes[chr] & ctype_space) != 0) return FALSE; 3532 break; 3533 3534 case OP_NOT_WHITESPACE: 3535 if (chr > 255 || (cd->ctypes[chr] & ctype_space) == 0) return FALSE; 3536 break; 3537 3538 case OP_WORDCHAR: 3539 if (chr < 255 && (cd->ctypes[chr] & ctype_word) != 0) return FALSE; 3540 break; 3541 3542 case OP_NOT_WORDCHAR: 3543 if (chr > 255 || (cd->ctypes[chr] & ctype_word) == 0) return FALSE; 3544 break; 3545 3546 case OP_HSPACE: 3547 switch(chr) 3548 { 3549 HSPACE_CASES: return FALSE; 3550 default: break; 3551 } 3552 break; 3553 3554 case OP_NOT_HSPACE: 3555 switch(chr) 3556 { 3557 HSPACE_CASES: break; 3558 default: return FALSE; 3559 } 3560 break; 3561 3562 case OP_ANYNL: 3563 case OP_VSPACE: 3564 switch(chr) 3565 { 3566 VSPACE_CASES: return FALSE; 3567 default: break; 3568 } 3569 break; 3570 3571 case OP_NOT_VSPACE: 3572 switch(chr) 3573 { 3574 VSPACE_CASES: break; 3575 default: return FALSE; 3576 } 3577 break; 3578 3579 case OP_DOLL: 3580 case OP_EODN: 3581 switch (chr) 3582 { 3583 case CHAR_CR: 3584 case CHAR_LF: 3585 case CHAR_VT: 3586 case CHAR_FF: 3587 case CHAR_NEL: 3588#ifndef EBCDIC 3589 case 0x2028: 3590 case 0x2029: 3591#endif /* Not EBCDIC */ 3592 return FALSE; 3593 } 3594 break; 3595 3596 case OP_EOD: /* Can always possessify before \z */ 3597 break; 3598 3599#ifdef SUPPORT_UCP 3600 case OP_PROP: 3601 case OP_NOTPROP: 3602 if (!check_char_prop(chr, list_ptr[2], list_ptr[3], 3603 list_ptr[0] == OP_NOTPROP)) 3604 return FALSE; 3605 break; 3606#endif 3607 3608 case OP_NCLASS: 3609 if (chr > 255) return FALSE; 3610 /* Fall through */ 3611 3612 case OP_CLASS: 3613 if (chr > 255) break; 3614 class_bitset = (pcre_uint8 *) 3615 ((list_ptr == list ? 
code : base_end) - list_ptr[2]); 3616 if ((class_bitset[chr >> 3] & (1 << (chr & 7))) != 0) return FALSE; 3617 break; 3618 3619#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 3620 case OP_XCLASS: 3621 if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) - 3622 list_ptr[2] + LINK_SIZE, utf)) return FALSE; 3623 break; 3624#endif 3625 3626 default: 3627 return FALSE; 3628 } 3629 3630 chr_ptr++; 3631 } 3632 while(*chr_ptr != NOTACHAR); 3633 3634 /* At least one character must be matched from this opcode. */ 3635 3636 if (list[1] == 0) return TRUE; 3637 } 3638 3639/* Control never reaches here. There used to be a fail-save return FALSE; here, 3640but some compilers complain about an unreachable statement. */ 3641 3642} 3643 3644 3645 3646/************************************************* 3647* Scan compiled regex for auto-possession * 3648*************************************************/ 3649 3650/* Replaces single character iterations with their possessive alternatives 3651if appropriate. This function modifies the compiled opcode! 3652 3653Arguments: 3654 code points to start of the byte code 3655 utf TRUE in UTF-8 / UTF-16 / UTF-32 mode 3656 cd static compile data 3657 3658Returns: nothing 3659*/ 3660 3661static void 3662auto_possessify(pcre_uchar *code, BOOL utf, const compile_data *cd) 3663{ 3664register pcre_uchar c; 3665const pcre_uchar *end; 3666pcre_uchar *repeat_opcode; 3667pcre_uint32 list[8]; 3668int rec_limit; 3669 3670for (;;) 3671 { 3672 c = *code; 3673 3674 /* When a pattern with bad UTF-8 encoding is compiled with NO_UTF_CHECK, 3675 it may compile without complaining, but may get into a loop here if the code 3676 pointer points to a bad value. This is, of course a documentated possibility, 3677 when NO_UTF_CHECK is set, so it isn't a bug, but we can detect this case and 3678 just give up on this optimization. 
*/ 3679 3680 if (c >= OP_TABLE_LENGTH) return; 3681 3682 if (c >= OP_STAR && c <= OP_TYPEPOSUPTO) 3683 { 3684 c -= get_repeat_base(c) - OP_STAR; 3685 end = (c <= OP_MINUPTO) ? 3686 get_chr_property_list(code, utf, cd->fcc, list) : NULL; 3687 list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO; 3688 3689 rec_limit = 1000; 3690 if (end != NULL && compare_opcodes(end, utf, cd, list, end, &rec_limit)) 3691 { 3692 switch(c) 3693 { 3694 case OP_STAR: 3695 *code += OP_POSSTAR - OP_STAR; 3696 break; 3697 3698 case OP_MINSTAR: 3699 *code += OP_POSSTAR - OP_MINSTAR; 3700 break; 3701 3702 case OP_PLUS: 3703 *code += OP_POSPLUS - OP_PLUS; 3704 break; 3705 3706 case OP_MINPLUS: 3707 *code += OP_POSPLUS - OP_MINPLUS; 3708 break; 3709 3710 case OP_QUERY: 3711 *code += OP_POSQUERY - OP_QUERY; 3712 break; 3713 3714 case OP_MINQUERY: 3715 *code += OP_POSQUERY - OP_MINQUERY; 3716 break; 3717 3718 case OP_UPTO: 3719 *code += OP_POSUPTO - OP_UPTO; 3720 break; 3721 3722 case OP_MINUPTO: 3723 *code += OP_POSUPTO - OP_MINUPTO; 3724 break; 3725 } 3726 } 3727 c = *code; 3728 } 3729 else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS) 3730 { 3731#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 3732 if (c == OP_XCLASS) 3733 repeat_opcode = code + GET(code, 1); 3734 else 3735#endif 3736 repeat_opcode = code + 1 + (32 / sizeof(pcre_uchar)); 3737 3738 c = *repeat_opcode; 3739 if (c >= OP_CRSTAR && c <= OP_CRMINRANGE) 3740 { 3741 /* end must not be NULL. 
*/ 3742 end = get_chr_property_list(code, utf, cd->fcc, list); 3743 3744 list[1] = (c & 1) == 0; 3745 3746 rec_limit = 1000; 3747 if (compare_opcodes(end, utf, cd, list, end, &rec_limit)) 3748 { 3749 switch (c) 3750 { 3751 case OP_CRSTAR: 3752 case OP_CRMINSTAR: 3753 *repeat_opcode = OP_CRPOSSTAR; 3754 break; 3755 3756 case OP_CRPLUS: 3757 case OP_CRMINPLUS: 3758 *repeat_opcode = OP_CRPOSPLUS; 3759 break; 3760 3761 case OP_CRQUERY: 3762 case OP_CRMINQUERY: 3763 *repeat_opcode = OP_CRPOSQUERY; 3764 break; 3765 3766 case OP_CRRANGE: 3767 case OP_CRMINRANGE: 3768 *repeat_opcode = OP_CRPOSRANGE; 3769 break; 3770 } 3771 } 3772 } 3773 c = *code; 3774 } 3775 3776 switch(c) 3777 { 3778 case OP_END: 3779 return; 3780 3781 case OP_TYPESTAR: 3782 case OP_TYPEMINSTAR: 3783 case OP_TYPEPLUS: 3784 case OP_TYPEMINPLUS: 3785 case OP_TYPEQUERY: 3786 case OP_TYPEMINQUERY: 3787 case OP_TYPEPOSSTAR: 3788 case OP_TYPEPOSPLUS: 3789 case OP_TYPEPOSQUERY: 3790 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; 3791 break; 3792 3793 case OP_TYPEUPTO: 3794 case OP_TYPEMINUPTO: 3795 case OP_TYPEEXACT: 3796 case OP_TYPEPOSUPTO: 3797 if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP) 3798 code += 2; 3799 break; 3800 3801#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 3802 case OP_XCLASS: 3803 code += GET(code, 1); 3804 break; 3805#endif 3806 3807 case OP_MARK: 3808 case OP_PRUNE_ARG: 3809 case OP_SKIP_ARG: 3810 case OP_THEN_ARG: 3811 code += code[1]; 3812 break; 3813 } 3814 3815 /* Add in the fixed length from the table */ 3816 3817 code += PRIV(OP_lengths)[c]; 3818 3819 /* In UTF-8 mode, opcodes that are followed by a character may be followed by 3820 a multi-byte character. The length in the table is a minimum, so we have to 3821 arrange to skip the extra bytes. 
*/ 3822 3823#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 3824 if (utf) switch(c) 3825 { 3826 case OP_CHAR: 3827 case OP_CHARI: 3828 case OP_NOT: 3829 case OP_NOTI: 3830 case OP_STAR: 3831 case OP_MINSTAR: 3832 case OP_PLUS: 3833 case OP_MINPLUS: 3834 case OP_QUERY: 3835 case OP_MINQUERY: 3836 case OP_UPTO: 3837 case OP_MINUPTO: 3838 case OP_EXACT: 3839 case OP_POSSTAR: 3840 case OP_POSPLUS: 3841 case OP_POSQUERY: 3842 case OP_POSUPTO: 3843 case OP_STARI: 3844 case OP_MINSTARI: 3845 case OP_PLUSI: 3846 case OP_MINPLUSI: 3847 case OP_QUERYI: 3848 case OP_MINQUERYI: 3849 case OP_UPTOI: 3850 case OP_MINUPTOI: 3851 case OP_EXACTI: 3852 case OP_POSSTARI: 3853 case OP_POSPLUSI: 3854 case OP_POSQUERYI: 3855 case OP_POSUPTOI: 3856 case OP_NOTSTAR: 3857 case OP_NOTMINSTAR: 3858 case OP_NOTPLUS: 3859 case OP_NOTMINPLUS: 3860 case OP_NOTQUERY: 3861 case OP_NOTMINQUERY: 3862 case OP_NOTUPTO: 3863 case OP_NOTMINUPTO: 3864 case OP_NOTEXACT: 3865 case OP_NOTPOSSTAR: 3866 case OP_NOTPOSPLUS: 3867 case OP_NOTPOSQUERY: 3868 case OP_NOTPOSUPTO: 3869 case OP_NOTSTARI: 3870 case OP_NOTMINSTARI: 3871 case OP_NOTPLUSI: 3872 case OP_NOTMINPLUSI: 3873 case OP_NOTQUERYI: 3874 case OP_NOTMINQUERYI: 3875 case OP_NOTUPTOI: 3876 case OP_NOTMINUPTOI: 3877 case OP_NOTEXACTI: 3878 case OP_NOTPOSSTARI: 3879 case OP_NOTPOSPLUSI: 3880 case OP_NOTPOSQUERYI: 3881 case OP_NOTPOSUPTOI: 3882 if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); 3883 break; 3884 } 3885#else 3886 (void)(utf); /* Keep compiler happy by referencing function argument */ 3887#endif 3888 } 3889} 3890 3891 3892 3893/************************************************* 3894* Check for POSIX class syntax * 3895*************************************************/ 3896 3897/* This function is called when the sequence "[:" or "[." or "[=" is 3898encountered in a character class. It checks whether this is followed by a 3899sequence of characters terminated by a matching ":]" or ".]" or "=]". 
If we 3900reach an unescaped ']' without the special preceding character, return FALSE. 3901 3902Originally, this function only recognized a sequence of letters between the 3903terminators, but it seems that Perl recognizes any sequence of characters, 3904though of course unknown POSIX names are subsequently rejected. Perl gives an 3905"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE 3906didn't consider this to be a POSIX class. Likewise for [:1234:]. 3907 3908The problem in trying to be exactly like Perl is in the handling of escapes. We 3909have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX 3910class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code 3911below handles the special cases \\ and \], but does not try to do any other 3912escape processing. This makes it different from Perl for cases such as 3913[:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does 3914not recognize "l\ower". This is a lesser evil than not diagnosing bad classes 3915when Perl does, I think. 3916 3917A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not. 3918It seems that the appearance of a nested POSIX class supersedes an apparent 3919external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or 3920a digit. 3921 3922In Perl, unescaped square brackets may also appear as part of class names. For 3923example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for 3924[:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not 3925seem right at all. PCRE does not allow closing square brackets in POSIX class 3926names. 
3927 3928Arguments: 3929 ptr pointer to the initial [ 3930 endptr where to return the end pointer 3931 3932Returns: TRUE or FALSE 3933*/ 3934 3935static BOOL 3936check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr) 3937{ 3938pcre_uchar terminator; /* Don't combine these lines; the Solaris cc */ 3939terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */ 3940for (++ptr; *ptr != CHAR_NULL; ptr++) 3941 { 3942 if (*ptr == CHAR_BACKSLASH && 3943 (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || 3944 ptr[1] == CHAR_BACKSLASH)) 3945 ptr++; 3946 else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) || 3947 *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE; 3948 else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) 3949 { 3950 *endptr = ptr; 3951 return TRUE; 3952 } 3953 } 3954return FALSE; 3955} 3956 3957 3958 3959 3960/************************************************* 3961* Check POSIX class name * 3962*************************************************/ 3963 3964/* This function is called to check the name given in a POSIX-style class entry 3965such as [:alnum:]. 3966 3967Arguments: 3968 ptr points to the first letter 3969 len the length of the name 3970 3971Returns: a value representing the name, or -1 if unknown 3972*/ 3973 3974static int 3975check_posix_name(const pcre_uchar *ptr, int len) 3976{ 3977const char *pn = posix_names; 3978register int yield = 0; 3979while (posix_name_lengths[yield] != 0) 3980 { 3981 if (len == posix_name_lengths[yield] && 3982 STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield; 3983 pn += posix_name_lengths[yield] + 1; 3984 yield++; 3985 } 3986return -1; 3987} 3988 3989 3990/************************************************* 3991* Adjust OP_RECURSE items in repeated group * 3992*************************************************/ 3993 3994/* OP_RECURSE items contain an offset from the start of the regex to the group 3995that is referenced. 
This means that groups can be replicated for fixed
repetition simply by copying (because the recursion is allowed to refer to
earlier groups that are outside the current group). However, when a group is
optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
inserted before it, after it has been compiled. This means that any OP_RECURSE
items within it that refer to the group itself or any contained groups have to
have their offsets adjusted. That is one of the jobs of this function. Before it
is called, the partially compiled regex must be temporarily terminated with
OP_END.

This function has been extended to cope with forward references for recursions
and subroutine calls. It must check the list of such references for the
group we are dealing with. If it finds that one of the recursions in the
current group is on this list, it does not adjust the value in the reference
(which is a group number). After the group has been scanned, all the offsets in
the forward reference list for the group are adjusted.

Arguments:
  group      points to the start of the group
  adjust     the amount by which the group is to be moved
  utf        TRUE in UTF-8 / UTF-16 / UTF-32 mode
  cd         contains pointers to tables etc.
4017 save_hwm_offset the hwm forward reference offset at the start of the group 4018 4019Returns: nothing 4020*/ 4021 4022static void 4023adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd, 4024 size_t save_hwm_offset) 4025{ 4026int offset; 4027pcre_uchar *hc; 4028pcre_uchar *ptr = group; 4029 4030while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL) 4031 { 4032 for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm; 4033 hc += LINK_SIZE) 4034 { 4035 offset = (int)GET(hc, 0); 4036 if (cd->start_code + offset == ptr + 1) break; 4037 } 4038 4039 /* If we have not found this recursion on the forward reference list, adjust 4040 the recursion's offset if it's after the start of this group. */ 4041 4042 if (hc >= cd->hwm) 4043 { 4044 offset = (int)GET(ptr, 1); 4045 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust); 4046 } 4047 4048 ptr += 1 + LINK_SIZE; 4049 } 4050 4051/* Now adjust all forward reference offsets for the group. */ 4052 4053for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm; 4054 hc += LINK_SIZE) 4055 { 4056 offset = (int)GET(hc, 0); 4057 PUT(hc, 0, offset + adjust); 4058 } 4059} 4060 4061 4062 4063/************************************************* 4064* Insert an automatic callout point * 4065*************************************************/ 4066 4067/* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert 4068callout points before each pattern item. 
4069 4070Arguments: 4071 code current code pointer 4072 ptr current pattern pointer 4073 cd pointers to tables etc 4074 4075Returns: new code pointer 4076*/ 4077 4078static pcre_uchar * 4079auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd) 4080{ 4081*code++ = OP_CALLOUT; 4082*code++ = 255; 4083PUT(code, 0, (int)(ptr - cd->start_pattern)); /* Pattern offset */ 4084PUT(code, LINK_SIZE, 0); /* Default length */ 4085return code + 2 * LINK_SIZE; 4086} 4087 4088 4089 4090/************************************************* 4091* Complete a callout item * 4092*************************************************/ 4093 4094/* A callout item contains the length of the next item in the pattern, which 4095we can't fill in till after we have reached the relevant point. This is used 4096for both automatic and manual callouts. 4097 4098Arguments: 4099 previous_callout points to previous callout item 4100 ptr current pattern pointer 4101 cd pointers to tables etc 4102 4103Returns: nothing 4104*/ 4105 4106static void 4107complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd) 4108{ 4109int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2)); 4110PUT(previous_callout, 2 + LINK_SIZE, length); 4111} 4112 4113 4114 4115#ifdef SUPPORT_UCP 4116/************************************************* 4117* Get othercase range * 4118*************************************************/ 4119 4120/* This function is passed the start and end of a class range, in UTF-8 mode 4121with UCP support. It searches up the characters, looking for ranges of 4122characters in the "other" case. Each call returns the next one, updating the 4123start address. A character with multiple other cases is returned on its own 4124with a special return value. 
4125 4126Arguments: 4127 cptr points to starting character value; updated 4128 d end value 4129 ocptr where to put start of othercase range 4130 odptr where to put end of othercase range 4131 4132Yield: -1 when no more 4133 0 when a range is returned 4134 >0 the CASESET offset for char with multiple other cases 4135 in this case, ocptr contains the original 4136*/ 4137 4138static int 4139get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr, 4140 pcre_uint32 *odptr) 4141{ 4142pcre_uint32 c, othercase, next; 4143unsigned int co; 4144 4145/* Find the first character that has an other case. If it has multiple other 4146cases, return its case offset value. */ 4147 4148for (c = *cptr; c <= d; c++) 4149 { 4150 if ((co = UCD_CASESET(c)) != 0) 4151 { 4152 *ocptr = c++; /* Character that has the set */ 4153 *cptr = c; /* Rest of input range */ 4154 return (int)co; 4155 } 4156 if ((othercase = UCD_OTHERCASE(c)) != c) break; 4157 } 4158 4159if (c > d) return -1; /* Reached end of range */ 4160 4161/* Found a character that has a single other case. Search for the end of the 4162range, which is either the end of the input range, or a character that has zero 4163or more than one other cases. */ 4164 4165*ocptr = othercase; 4166next = othercase + 1; 4167 4168for (++c; c <= d; c++) 4169 { 4170 if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break; 4171 next++; 4172 } 4173 4174*odptr = next - 1; /* End of othercase range */ 4175*cptr = c; /* Rest of input range */ 4176return 0; 4177} 4178#endif /* SUPPORT_UCP */ 4179 4180 4181 4182/************************************************* 4183* Add a character or range to a class * 4184*************************************************/ 4185 4186/* This function packages up the logic of adding a character or range of 4187characters to a class. The character values in the arguments will be within the 4188valid values for the current mode (8-bit, 16-bit, UTF, etc). 
This function is 4189mutually recursive with the function immediately below. 4190 4191Arguments: 4192 classbits the bit map for characters < 256 4193 uchardptr points to the pointer for extra data 4194 options the options word 4195 cd contains pointers to tables etc. 4196 start start of range character 4197 end end of range character 4198 4199Returns: the number of < 256 characters added 4200 the pointer to extra data is updated 4201*/ 4202 4203static int 4204add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options, 4205 compile_data *cd, pcre_uint32 start, pcre_uint32 end) 4206{ 4207pcre_uint32 c; 4208pcre_uint32 classbits_end = (end <= 0xff ? end : 0xff); 4209int n8 = 0; 4210 4211/* If caseless matching is required, scan the range and process alternate 4212cases. In Unicode, there are 8-bit characters that have alternate cases that 4213are greater than 255 and vice-versa. Sometimes we can just extend the original 4214range. */ 4215 4216if ((options & PCRE_CASELESS) != 0) 4217 { 4218#ifdef SUPPORT_UCP 4219 if ((options & PCRE_UTF8) != 0) 4220 { 4221 int rc; 4222 pcre_uint32 oc, od; 4223 4224 options &= ~PCRE_CASELESS; /* Remove for recursive calls */ 4225 c = start; 4226 4227 while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0) 4228 { 4229 /* Handle a single character that has more than one other case. */ 4230 4231 if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd, 4232 PRIV(ucd_caseless_sets) + rc, oc); 4233 4234 /* Do nothing if the other case range is within the original range. */ 4235 4236 else if (oc >= start && od <= end) continue; 4237 4238 /* Extend the original range if there is overlap, noting that if oc < c, we 4239 can't have od > end because a subrange is always shorter than the basic 4240 range. Otherwise, use a recursive call to add the additional range. 
*/ 4241 4242 else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */ 4243 else if (od > end && oc <= end + 1) 4244 { 4245 end = od; /* Extend upwards */ 4246 if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff); 4247 } 4248 else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od); 4249 } 4250 } 4251 else 4252#endif /* SUPPORT_UCP */ 4253 4254 /* Not UTF-mode, or no UCP */ 4255 4256 for (c = start; c <= classbits_end; c++) 4257 { 4258 SETBIT(classbits, cd->fcc[c]); 4259 n8++; 4260 } 4261 } 4262 4263/* Now handle the original range. Adjust the final value according to the bit 4264length - this means that the same lists of (e.g.) horizontal spaces can be used 4265in all cases. */ 4266 4267#if defined COMPILE_PCRE8 4268#ifdef SUPPORT_UTF 4269 if ((options & PCRE_UTF8) == 0) 4270#endif 4271 if (end > 0xff) end = 0xff; 4272 4273#elif defined COMPILE_PCRE16 4274#ifdef SUPPORT_UTF 4275 if ((options & PCRE_UTF16) == 0) 4276#endif 4277 if (end > 0xffff) end = 0xffff; 4278 4279#endif /* COMPILE_PCRE[8|16] */ 4280 4281/* Use the bitmap for characters < 256. Otherwise use extra data.*/ 4282 4283for (c = start; c <= classbits_end; c++) 4284 { 4285 /* Regardless of start, c will always be <= 255. 
*/ 4286 SETBIT(classbits, c); 4287 n8++; 4288 } 4289 4290#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 4291if (start <= 0xff) start = 0xff + 1; 4292 4293if (end >= start) 4294 { 4295 pcre_uchar *uchardata = *uchardptr; 4296#ifdef SUPPORT_UTF 4297 if ((options & PCRE_UTF8) != 0) /* All UTFs use the same flag bit */ 4298 { 4299 if (start < end) 4300 { 4301 *uchardata++ = XCL_RANGE; 4302 uchardata += PRIV(ord2utf)(start, uchardata); 4303 uchardata += PRIV(ord2utf)(end, uchardata); 4304 } 4305 else if (start == end) 4306 { 4307 *uchardata++ = XCL_SINGLE; 4308 uchardata += PRIV(ord2utf)(start, uchardata); 4309 } 4310 } 4311 else 4312#endif /* SUPPORT_UTF */ 4313 4314 /* Without UTF support, character values are constrained by the bit length, 4315 and can only be > 256 for 16-bit and 32-bit libraries. */ 4316 4317#ifdef COMPILE_PCRE8 4318 {} 4319#else 4320 if (start < end) 4321 { 4322 *uchardata++ = XCL_RANGE; 4323 *uchardata++ = start; 4324 *uchardata++ = end; 4325 } 4326 else if (start == end) 4327 { 4328 *uchardata++ = XCL_SINGLE; 4329 *uchardata++ = start; 4330 } 4331#endif 4332 4333 *uchardptr = uchardata; /* Updata extra data pointer */ 4334 } 4335#endif /* SUPPORT_UTF || !COMPILE_PCRE8 */ 4336 4337return n8; /* Number of 8-bit characters */ 4338} 4339 4340 4341 4342 4343/************************************************* 4344* Add a list of characters to a class * 4345*************************************************/ 4346 4347/* This function is used for adding a list of case-equivalent characters to a 4348class, and also for adding a list of horizontal or vertical whitespace. If the 4349list is in order (which it should be), ranges of characters are detected and 4350handled appropriately. This function is mutually recursive with the function 4351above. 4352 4353Arguments: 4354 classbits the bit map for characters < 256 4355 uchardptr points to the pointer for extra data 4356 options the options word 4357 cd contains pointers to tables etc. 
4358 p points to row of 32-bit values, terminated by NOTACHAR 4359 except character to omit; this is used when adding lists of 4360 case-equivalent characters to avoid including the one we 4361 already know about 4362 4363Returns: the number of < 256 characters added 4364 the pointer to extra data is updated 4365*/ 4366 4367static int 4368add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options, 4369 compile_data *cd, const pcre_uint32 *p, unsigned int except) 4370{ 4371int n8 = 0; 4372while (p[0] < NOTACHAR) 4373 { 4374 int n = 0; 4375 if (p[0] != except) 4376 { 4377 while(p[n+1] == p[0] + n + 1) n++; 4378 n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]); 4379 } 4380 p += n + 1; 4381 } 4382return n8; 4383} 4384 4385 4386 4387/************************************************* 4388* Add characters not in a list to a class * 4389*************************************************/ 4390 4391/* This function is used for adding the complement of a list of horizontal or 4392vertical whitespace to a class. The list must be in order. 4393 4394Arguments: 4395 classbits the bit map for characters < 256 4396 uchardptr points to the pointer for extra data 4397 options the options word 4398 cd contains pointers to tables etc. 4399 p points to row of 32-bit values, terminated by NOTACHAR 4400 4401Returns: the number of < 256 characters added 4402 the pointer to extra data is updated 4403*/ 4404 4405static int 4406add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, 4407 int options, compile_data *cd, const pcre_uint32 *p) 4408{ 4409BOOL utf = (options & PCRE_UTF8) != 0; 4410int n8 = 0; 4411if (p[0] > 0) 4412 n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1); 4413while (p[0] < NOTACHAR) 4414 { 4415 while (p[1] == p[0] + 1) p++; 4416 n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1, 4417 (p[1] == NOTACHAR) ? (utf ? 
0x10ffffu : 0xffffffffu) : p[1] - 1); 4418 p++; 4419 } 4420return n8; 4421} 4422 4423 4424 4425/************************************************* 4426* Compile one branch * 4427*************************************************/ 4428 4429/* Scan the pattern, compiling it into the a vector. If the options are 4430changed during the branch, the pointer is used to change the external options 4431bits. This function is used during the pre-compile phase when we are trying 4432to find out the amount of memory needed, as well as during the real compile 4433phase. The value of lengthptr distinguishes the two phases. 4434 4435Arguments: 4436 optionsptr pointer to the option bits 4437 codeptr points to the pointer to the current code point 4438 ptrptr points to the current pattern pointer 4439 errorcodeptr points to error code variable 4440 firstcharptr place to put the first required character 4441 firstcharflagsptr place to put the first character flags, or a negative number 4442 reqcharptr place to put the last required character 4443 reqcharflagsptr place to put the last required character flags, or a negative number 4444 bcptr points to current branch chain 4445 cond_depth conditional nesting depth 4446 cd contains pointers to tables etc. 
4447 lengthptr NULL during the real compile phase 4448 points to length accumulator during pre-compile phase 4449 4450Returns: TRUE on success 4451 FALSE, with *errorcodeptr set non-zero on error 4452*/ 4453 4454static BOOL 4455compile_branch(int *optionsptr, pcre_uchar **codeptr, 4456 const pcre_uchar **ptrptr, int *errorcodeptr, 4457 pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr, 4458 pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr, 4459 branch_chain *bcptr, int cond_depth, 4460 compile_data *cd, int *lengthptr) 4461{ 4462int repeat_type, op_type; 4463int repeat_min = 0, repeat_max = 0; /* To please picky compilers */ 4464int bravalue = 0; 4465int greedy_default, greedy_non_default; 4466pcre_uint32 firstchar, reqchar; 4467pcre_int32 firstcharflags, reqcharflags; 4468pcre_uint32 zeroreqchar, zerofirstchar; 4469pcre_int32 zeroreqcharflags, zerofirstcharflags; 4470pcre_int32 req_caseopt, reqvary, tempreqvary; 4471int options = *optionsptr; /* May change dynamically */ 4472int after_manual_callout = 0; 4473int length_prevgroup = 0; 4474register pcre_uint32 c; 4475int escape; 4476register pcre_uchar *code = *codeptr; 4477pcre_uchar *last_code = code; 4478pcre_uchar *orig_code = code; 4479pcre_uchar *tempcode; 4480BOOL inescq = FALSE; 4481BOOL groupsetfirstchar = FALSE; 4482const pcre_uchar *ptr = *ptrptr; 4483const pcre_uchar *tempptr; 4484const pcre_uchar *nestptr = NULL; 4485pcre_uchar *previous = NULL; 4486pcre_uchar *previous_callout = NULL; 4487size_t item_hwm_offset = 0; 4488pcre_uint8 classbits[32]; 4489 4490/* We can fish out the UTF-8 setting once and for all into a BOOL, but we 4491must not do this for other options (e.g. PCRE_EXTENDED) because they may change 4492dynamically as we process the pattern. */ 4493 4494#ifdef SUPPORT_UTF 4495/* PCRE_UTF[16|32] have the same value as PCRE_UTF8. 
*/ 4496BOOL utf = (options & PCRE_UTF8) != 0; 4497#ifndef COMPILE_PCRE32 4498pcre_uchar utf_chars[6]; 4499#endif 4500#else 4501BOOL utf = FALSE; 4502#endif 4503 4504/* Helper variables for OP_XCLASS opcode (for characters > 255). We define 4505class_uchardata always so that it can be passed to add_to_class() always, 4506though it will not be used in non-UTF 8-bit cases. This avoids having to supply 4507alternative calls for the different cases. */ 4508 4509pcre_uchar *class_uchardata; 4510#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 4511BOOL xclass; 4512pcre_uchar *class_uchardata_base; 4513#endif 4514 4515#ifdef PCRE_DEBUG 4516if (lengthptr != NULL) DPRINTF((">> start branch\n")); 4517#endif 4518 4519/* Set up the default and non-default settings for greediness */ 4520 4521greedy_default = ((options & PCRE_UNGREEDY) != 0); 4522greedy_non_default = greedy_default ^ 1; 4523 4524/* Initialize no first byte, no required byte. REQ_UNSET means "no char 4525matching encountered yet". It gets changed to REQ_NONE if we hit something that 4526matches a non-fixed char first char; reqchar just remains unset if we never 4527find one. 4528 4529When we hit a repeat whose minimum is zero, we may have to adjust these values 4530to take the zero repeat into account. This is implemented by setting them to 4531zerofirstbyte and zeroreqchar when such a repeat is encountered. The individual 4532item types that can be repeated set these backoff variables appropriately. */ 4533 4534firstchar = reqchar = zerofirstchar = zeroreqchar = 0; 4535firstcharflags = reqcharflags = zerofirstcharflags = zeroreqcharflags = REQ_UNSET; 4536 4537/* The variable req_caseopt contains either the REQ_CASELESS value 4538or zero, according to the current setting of the caseless flag. The 4539REQ_CASELESS leaves the lower 28 bit empty. It is added into the 4540firstchar or reqchar variables to record the case status of the 4541value. This is used only for ASCII characters. 
*/ 4542 4543req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0; 4544 4545/* Switch on next character until the end of the branch */ 4546 4547for (;; ptr++) 4548 { 4549 BOOL negate_class; 4550 BOOL should_flip_negation; 4551 BOOL possessive_quantifier; 4552 BOOL is_quantifier; 4553 BOOL is_recurse; 4554 BOOL reset_bracount; 4555 int class_has_8bitchar; 4556 int class_one_char; 4557#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 4558 BOOL xclass_has_prop; 4559#endif 4560 int newoptions; 4561 int recno; 4562 int refsign; 4563 int skipbytes; 4564 pcre_uint32 subreqchar, subfirstchar; 4565 pcre_int32 subreqcharflags, subfirstcharflags; 4566 int terminator; 4567 unsigned int mclength; 4568 unsigned int tempbracount; 4569 pcre_uint32 ec; 4570 pcre_uchar mcbuffer[8]; 4571 4572 /* Come here to restart the loop without advancing the pointer. */ 4573 4574 REDO_LOOP: 4575 4576 /* Get next character in the pattern */ 4577 4578 c = *ptr; 4579 4580 /* If we are at the end of a nested substitution, revert to the outer level 4581 string. Nesting only happens one level deep. */ 4582 4583 if (c == CHAR_NULL && nestptr != NULL) 4584 { 4585 ptr = nestptr; 4586 nestptr = NULL; 4587 c = *ptr; 4588 } 4589 4590 /* If we are in the pre-compile phase, accumulate the length used for the 4591 previous cycle of this loop. */ 4592 4593 if (lengthptr != NULL) 4594 { 4595#ifdef PCRE_DEBUG 4596 if (code > cd->hwm) cd->hwm = code; /* High water info */ 4597#endif 4598 if (code > cd->start_workspace + cd->workspace_size - 4599 WORK_SIZE_SAFETY_MARGIN) /* Check for overrun */ 4600 { 4601 *errorcodeptr = (code >= cd->start_workspace + cd->workspace_size)? 4602 ERR52 : ERR87; 4603 goto FAILED; 4604 } 4605 4606 /* There is at least one situation where code goes backwards: this is the 4607 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time, 4608 the class is simply eliminated. However, it is created first, so we have to 4609 allow memory for it. 
Therefore, don't ever reduce the length at this point. 4610 */ 4611 4612 if (code < last_code) code = last_code; 4613 4614 /* Paranoid check for integer overflow */ 4615 4616 if (OFLOW_MAX - *lengthptr < code - last_code) 4617 { 4618 *errorcodeptr = ERR20; 4619 goto FAILED; 4620 } 4621 4622 *lengthptr += (int)(code - last_code); 4623 DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr, 4624 (int)(code - last_code), c, c)); 4625 4626 /* If "previous" is set and it is not at the start of the work space, move 4627 it back to there, in order to avoid filling up the work space. Otherwise, 4628 if "previous" is NULL, reset the current code pointer to the start. */ 4629 4630 if (previous != NULL) 4631 { 4632 if (previous > orig_code) 4633 { 4634 memmove(orig_code, previous, IN_UCHARS(code - previous)); 4635 code -= previous - orig_code; 4636 previous = orig_code; 4637 } 4638 } 4639 else code = orig_code; 4640 4641 /* Remember where this code item starts so we can pick up the length 4642 next time round. */ 4643 4644 last_code = code; 4645 } 4646 4647 /* In the real compile phase, just check the workspace used by the forward 4648 reference list. */ 4649 4650 else if (cd->hwm > cd->start_workspace + cd->workspace_size) 4651 { 4652 *errorcodeptr = ERR52; 4653 goto FAILED; 4654 } 4655 4656 /* If in \Q...\E, check for the end; if not, we have a literal. Otherwise an 4657 isolated \E is ignored. 
*/ 4658 4659 if (c != CHAR_NULL) 4660 { 4661 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) 4662 { 4663 inescq = FALSE; 4664 ptr++; 4665 continue; 4666 } 4667 else if (inescq) 4668 { 4669 if (previous_callout != NULL) 4670 { 4671 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */ 4672 complete_callout(previous_callout, ptr, cd); 4673 previous_callout = NULL; 4674 } 4675 if ((options & PCRE_AUTO_CALLOUT) != 0) 4676 { 4677 previous_callout = code; 4678 code = auto_callout(code, ptr, cd); 4679 } 4680 goto NORMAL_CHAR; 4681 } 4682 4683 /* Check for the start of a \Q...\E sequence. We must do this here rather 4684 than later in case it is immediately followed by \E, which turns it into a 4685 "do nothing" sequence. */ 4686 4687 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_Q) 4688 { 4689 inescq = TRUE; 4690 ptr++; 4691 continue; 4692 } 4693 } 4694 4695 /* In extended mode, skip white space and comments. */ 4696 4697 if ((options & PCRE_EXTENDED) != 0) 4698 { 4699 const pcre_uchar *wscptr = ptr; 4700 while (MAX_255(c) && (cd->ctypes[c] & ctype_space) != 0) c = *(++ptr); 4701 if (c == CHAR_NUMBER_SIGN) 4702 { 4703 ptr++; 4704 while (*ptr != CHAR_NULL) 4705 { 4706 if (IS_NEWLINE(ptr)) /* For non-fixed-length newline cases, */ 4707 { /* IS_NEWLINE sets cd->nllen. */ 4708 ptr += cd->nllen; 4709 break; 4710 } 4711 ptr++; 4712#ifdef SUPPORT_UTF 4713 if (utf) FORWARDCHAR(ptr); 4714#endif 4715 } 4716 } 4717 4718 /* If we skipped any characters, restart the loop. Otherwise, we didn't see 4719 a comment. */ 4720 4721 if (ptr > wscptr) goto REDO_LOOP; 4722 } 4723 4724 /* Skip over (?# comments. We need to do this here because we want to know if 4725 the next thing is a quantifier, and these comments may come between an item 4726 and its quantifier. 
*/ 4727 4728 if (c == CHAR_LEFT_PARENTHESIS && ptr[1] == CHAR_QUESTION_MARK && 4729 ptr[2] == CHAR_NUMBER_SIGN) 4730 { 4731 ptr += 3; 4732 while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++; 4733 if (*ptr == CHAR_NULL) 4734 { 4735 *errorcodeptr = ERR18; 4736 goto FAILED; 4737 } 4738 continue; 4739 } 4740 4741 /* See if the next thing is a quantifier. */ 4742 4743 is_quantifier = 4744 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK || 4745 (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1)); 4746 4747 /* Fill in length of a previous callout, except when the next thing is a 4748 quantifier or when processing a property substitution string in UCP mode. */ 4749 4750 if (!is_quantifier && previous_callout != NULL && nestptr == NULL && 4751 after_manual_callout-- <= 0) 4752 { 4753 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */ 4754 complete_callout(previous_callout, ptr, cd); 4755 previous_callout = NULL; 4756 } 4757 4758 /* Create auto callout, except for quantifiers, or while processing property 4759 strings that are substituted for \w etc in UCP mode. */ 4760 4761 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL) 4762 { 4763 previous_callout = code; 4764 code = auto_callout(code, ptr, cd); 4765 } 4766 4767 /* Process the next pattern item. 
*/ 4768 4769 switch(c) 4770 { 4771 /* ===================================================================*/ 4772 case CHAR_NULL: /* The branch terminates at string end */ 4773 case CHAR_VERTICAL_LINE: /* or | or ) */ 4774 case CHAR_RIGHT_PARENTHESIS: 4775 *firstcharptr = firstchar; 4776 *firstcharflagsptr = firstcharflags; 4777 *reqcharptr = reqchar; 4778 *reqcharflagsptr = reqcharflags; 4779 *codeptr = code; 4780 *ptrptr = ptr; 4781 if (lengthptr != NULL) 4782 { 4783 if (OFLOW_MAX - *lengthptr < code - last_code) 4784 { 4785 *errorcodeptr = ERR20; 4786 goto FAILED; 4787 } 4788 *lengthptr += (int)(code - last_code); /* To include callout length */ 4789 DPRINTF((">> end branch\n")); 4790 } 4791 return TRUE; 4792 4793 4794 /* ===================================================================*/ 4795 /* Handle single-character metacharacters. In multiline mode, ^ disables 4796 the setting of any following char as a first character. */ 4797 4798 case CHAR_CIRCUMFLEX_ACCENT: 4799 previous = NULL; 4800 if ((options & PCRE_MULTILINE) != 0) 4801 { 4802 if (firstcharflags == REQ_UNSET) 4803 zerofirstcharflags = firstcharflags = REQ_NONE; 4804 *code++ = OP_CIRCM; 4805 } 4806 else *code++ = OP_CIRC; 4807 break; 4808 4809 case CHAR_DOLLAR_SIGN: 4810 previous = NULL; 4811 *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL; 4812 break; 4813 4814 /* There can never be a first char if '.' is first, whatever happens about 4815 repeats. The value of reqchar doesn't change either. */ 4816 4817 case CHAR_DOT: 4818 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE; 4819 zerofirstchar = firstchar; 4820 zerofirstcharflags = firstcharflags; 4821 zeroreqchar = reqchar; 4822 zeroreqcharflags = reqcharflags; 4823 previous = code; 4824 item_hwm_offset = cd->hwm - cd->start_workspace; 4825 *code++ = ((options & PCRE_DOTALL) != 0)? 
OP_ALLANY: OP_ANY; 4826 break; 4827 4828 4829 /* ===================================================================*/ 4830 /* Character classes. If the included characters are all < 256, we build a 4831 32-byte bitmap of the permitted characters, except in the special case 4832 where there is only one such character. For negated classes, we build the 4833 map as usual, then invert it at the end. However, we use a different opcode 4834 so that data characters > 255 can be handled correctly. 4835 4836 If the class contains characters outside the 0-255 range, a different 4837 opcode is compiled. It may optionally have a bit map for characters < 256, 4838 but those above are are explicitly listed afterwards. A flag byte tells 4839 whether the bitmap is present, and whether this is a negated class or not. 4840 4841 In JavaScript compatibility mode, an isolated ']' causes an error. In 4842 default (Perl) mode, it is treated as a data character. */ 4843 4844 case CHAR_RIGHT_SQUARE_BRACKET: 4845 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0) 4846 { 4847 *errorcodeptr = ERR64; 4848 goto FAILED; 4849 } 4850 goto NORMAL_CHAR; 4851 4852 /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is 4853 used for "start of word" and "end of word". As these are otherwise illegal 4854 sequences, we don't break anything by recognizing them. They are replaced 4855 by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are 4856 erroneous and are handled by the normal code below. */ 4857 4858 case CHAR_LEFT_SQUARE_BRACKET: 4859 if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0) 4860 { 4861 nestptr = ptr + 7; 4862 ptr = sub_start_of_word; 4863 goto REDO_LOOP; 4864 } 4865 4866 if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0) 4867 { 4868 nestptr = ptr + 7; 4869 ptr = sub_end_of_word; 4870 goto REDO_LOOP; 4871 } 4872 4873 /* Handle a real character class. 
*/ 4874 4875 previous = code; 4876 item_hwm_offset = cd->hwm - cd->start_workspace; 4877 4878 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if 4879 they are encountered at the top level, so we'll do that too. */ 4880 4881 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT || 4882 ptr[1] == CHAR_EQUALS_SIGN) && 4883 check_posix_syntax(ptr, &tempptr)) 4884 { 4885 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31; 4886 goto FAILED; 4887 } 4888 4889 /* If the first character is '^', set the negation flag and skip it. Also, 4890 if the first few characters (either before or after ^) are \Q\E or \E we 4891 skip them too. This makes for compatibility with Perl. */ 4892 4893 negate_class = FALSE; 4894 for (;;) 4895 { 4896 c = *(++ptr); 4897 if (c == CHAR_BACKSLASH) 4898 { 4899 if (ptr[1] == CHAR_E) 4900 ptr++; 4901 else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0) 4902 ptr += 3; 4903 else 4904 break; 4905 } 4906 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT) 4907 negate_class = TRUE; 4908 else break; 4909 } 4910 4911 /* Empty classes are allowed in JavaScript compatibility mode. Otherwise, 4912 an initial ']' is taken as a data character -- the code below handles 4913 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas 4914 [^] must match any character, so generate OP_ALLANY. */ 4915 4916 if (c == CHAR_RIGHT_SQUARE_BRACKET && 4917 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0) 4918 { 4919 *code++ = negate_class? OP_ALLANY : OP_FAIL; 4920 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE; 4921 zerofirstchar = firstchar; 4922 zerofirstcharflags = firstcharflags; 4923 break; 4924 } 4925 4926 /* If a class contains a negative special such as \S, we need to flip the 4927 negation flag at the end, so that support for characters > 255 works 4928 correctly (they are all included in the class). 
*/ 4929 4930 should_flip_negation = FALSE; 4931 4932 /* Extended class (xclass) will be used when characters > 255 4933 might match. */ 4934 4935#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 4936 xclass = FALSE; 4937 class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */ 4938 class_uchardata_base = class_uchardata; /* Save the start */ 4939#endif 4940 4941 /* For optimization purposes, we track some properties of the class: 4942 class_has_8bitchar will be non-zero if the class contains at least one < 4943 256 character; class_one_char will be 1 if the class contains just one 4944 character; xclass_has_prop will be TRUE if unicode property checks 4945 are present in the class. */ 4946 4947 class_has_8bitchar = 0; 4948 class_one_char = 0; 4949#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 4950 xclass_has_prop = FALSE; 4951#endif 4952 4953 /* Initialize the 32-char bit map to all zeros. We build the map in a 4954 temporary bit of memory, in case the class contains fewer than two 4955 8-bit characters because in that case the compiled code doesn't use the bit 4956 map. */ 4957 4958 memset(classbits, 0, 32 * sizeof(pcre_uint8)); 4959 4960 /* Process characters until ] is reached. By writing this as a "do" it 4961 means that an initial ] is taken as a data character. At the start of the 4962 loop, c contains the first byte of the character. */ 4963 4964 if (c != CHAR_NULL) do 4965 { 4966 const pcre_uchar *oldptr; 4967 4968#ifdef SUPPORT_UTF 4969 if (utf && HAS_EXTRALEN(c)) 4970 { /* Braces are required because the */ 4971 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */ 4972 } 4973#endif 4974 4975#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 4976 /* In the pre-compile phase, accumulate the length of any extra 4977 data and reset the pointer. This is so that very large classes that 4978 contain a zillion > 255 characters no longer overwrite the work space 4979 (which is on the stack). 
We have to remember that there was XCLASS data, 4980 however. */ 4981 4982 if (class_uchardata > class_uchardata_base) xclass = TRUE; 4983 4984 if (lengthptr != NULL && class_uchardata > class_uchardata_base) 4985 { 4986 *lengthptr += (int)(class_uchardata - class_uchardata_base); 4987 class_uchardata = class_uchardata_base; 4988 } 4989#endif 4990 4991 /* Inside \Q...\E everything is literal except \E */ 4992 4993 if (inescq) 4994 { 4995 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */ 4996 { 4997 inescq = FALSE; /* Reset literal state */ 4998 ptr++; /* Skip the 'E' */ 4999 continue; /* Carry on with next */ 5000 } 5001 goto CHECK_RANGE; /* Could be range if \E follows */ 5002 } 5003 5004 /* Handle POSIX class names. Perl allows a negation extension of the 5005 form [:^name:]. A square bracket that doesn't match the syntax is 5006 treated as a literal. We also recognize the POSIX constructions 5007 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl 5008 5.6 and 5.8 do. */ 5009 5010 if (c == CHAR_LEFT_SQUARE_BRACKET && 5011 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT || 5012 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr)) 5013 { 5014 BOOL local_negate = FALSE; 5015 int posix_class, taboffset, tabopt; 5016 register const pcre_uint8 *cbits = cd->cbits; 5017 pcre_uint8 pbits[32]; 5018 5019 if (ptr[1] != CHAR_COLON) 5020 { 5021 *errorcodeptr = ERR31; 5022 goto FAILED; 5023 } 5024 5025 ptr += 2; 5026 if (*ptr == CHAR_CIRCUMFLEX_ACCENT) 5027 { 5028 local_negate = TRUE; 5029 should_flip_negation = TRUE; /* Note negative special */ 5030 ptr++; 5031 } 5032 5033 posix_class = check_posix_name(ptr, (int)(tempptr - ptr)); 5034 if (posix_class < 0) 5035 { 5036 *errorcodeptr = ERR30; 5037 goto FAILED; 5038 } 5039 5040 /* If matching is caseless, upper and lower are converted to 5041 alpha. This relies on the fact that the class table starts with 5042 alpha, lower, upper as the first 3 entries. 
*/ 5043 5044 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2) 5045 posix_class = 0; 5046 5047 /* When PCRE_UCP is set, some of the POSIX classes are converted to 5048 different escape sequences that use Unicode properties \p or \P. Others 5049 that are not available via \p or \P generate XCL_PROP/XCL_NOTPROP 5050 directly. */ 5051 5052#ifdef SUPPORT_UCP 5053 if ((options & PCRE_UCP) != 0) 5054 { 5055 unsigned int ptype = 0; 5056 int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0); 5057 5058 /* The posix_substitutes table specifies which POSIX classes can be 5059 converted to \p or \P items. */ 5060 5061 if (posix_substitutes[pc] != NULL) 5062 { 5063 nestptr = tempptr + 1; 5064 ptr = posix_substitutes[pc] - 1; 5065 continue; 5066 } 5067 5068 /* There are three other classes that generate special property calls 5069 that are recognized only in an XCLASS. */ 5070 5071 else switch(posix_class) 5072 { 5073 case PC_GRAPH: 5074 ptype = PT_PXGRAPH; 5075 /* Fall through */ 5076 case PC_PRINT: 5077 if (ptype == 0) ptype = PT_PXPRINT; 5078 /* Fall through */ 5079 case PC_PUNCT: 5080 if (ptype == 0) ptype = PT_PXPUNCT; 5081 *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP; 5082 *class_uchardata++ = ptype; 5083 *class_uchardata++ = 0; 5084 xclass_has_prop = TRUE; 5085 ptr = tempptr + 1; 5086 continue; 5087 5088 /* For the other POSIX classes (ascii, cntrl, xdigit) we are going 5089 to fall through to the non-UCP case and build a bit map for 5090 characters with code points less than 256. If we are in a negated 5091 POSIX class, characters with code points greater than 255 must 5092 either all match or all not match. In the special case where we 5093 have not yet generated any xclass data, and this is the final item 5094 in the overall class, we need do nothing: later on, the opcode 5095 OP_NCLASS will be used to indicate that characters greater than 255 5096 are acceptable. 
If we have already seen an xclass item or one may 5097 follow (we have to assume that it might if this is not the end of 5098 the class), explicitly list all wide codepoints, which will then 5099 either not match or match, depending on whether the class is or is 5100 not negated. */ 5101 5102 default: 5103 if (local_negate && 5104 (xclass || tempptr[2] != CHAR_RIGHT_SQUARE_BRACKET)) 5105 { 5106 *class_uchardata++ = XCL_RANGE; 5107 class_uchardata += PRIV(ord2utf)(0x100, class_uchardata); 5108 class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata); 5109 } 5110 break; 5111 } 5112 } 5113#endif 5114 /* In the non-UCP case, or when UCP makes no difference, we build the 5115 bit map for the POSIX class in a chunk of local store because we may be 5116 adding and subtracting from it, and we don't want to subtract bits that 5117 may be in the main map already. At the end we or the result into the 5118 bit map that is being built. */ 5119 5120 posix_class *= 3; 5121 5122 /* Copy in the first table (always present) */ 5123 5124 memcpy(pbits, cbits + posix_class_maps[posix_class], 5125 32 * sizeof(pcre_uint8)); 5126 5127 /* If there is a second table, add or remove it as required. */ 5128 5129 taboffset = posix_class_maps[posix_class + 1]; 5130 tabopt = posix_class_maps[posix_class + 2]; 5131 5132 if (taboffset >= 0) 5133 { 5134 if (tabopt >= 0) 5135 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset]; 5136 else 5137 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset]; 5138 } 5139 5140 /* Now see if we need to remove any special characters. An option 5141 value of 1 removes vertical space and 2 removes underscore. */ 5142 5143 if (tabopt < 0) tabopt = -tabopt; 5144 if (tabopt == 1) pbits[1] &= ~0x3c; 5145 else if (tabopt == 2) pbits[11] &= 0x7f; 5146 5147 /* Add the POSIX table or its complement into the main table that is 5148 being built and we are done. 
*/ 5149 5150 if (local_negate) 5151 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c]; 5152 else 5153 for (c = 0; c < 32; c++) classbits[c] |= pbits[c]; 5154 5155 ptr = tempptr + 1; 5156 /* Every class contains at least one < 256 character. */ 5157 class_has_8bitchar = 1; 5158 /* Every class contains at least two characters. */ 5159 class_one_char = 2; 5160 continue; /* End of POSIX syntax handling */ 5161 } 5162 5163 /* Backslash may introduce a single character, or it may introduce one 5164 of the specials, which just set a flag. The sequence \b is a special 5165 case. Inside a class (and only there) it is treated as backspace. We 5166 assume that other escapes have more than one character in them, so 5167 speculatively set both class_has_8bitchar and class_one_char bigger 5168 than one. Unrecognized escapes fall through and are either treated 5169 as literal characters (by default), or are faulted if 5170 PCRE_EXTRA is set. */ 5171 5172 if (c == CHAR_BACKSLASH) 5173 { 5174 escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options, 5175 TRUE); 5176 if (*errorcodeptr != 0) goto FAILED; 5177 if (escape == 0) c = ec; 5178 else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */ 5179 else if (escape == ESC_N) /* \N is not supported in a class */ 5180 { 5181 *errorcodeptr = ERR71; 5182 goto FAILED; 5183 } 5184 else if (escape == ESC_Q) /* Handle start of quoted string */ 5185 { 5186 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E) 5187 { 5188 ptr += 2; /* avoid empty string */ 5189 } 5190 else inescq = TRUE; 5191 continue; 5192 } 5193 else if (escape == ESC_E) continue; /* Ignore orphan \E */ 5194 5195 else 5196 { 5197 register const pcre_uint8 *cbits = cd->cbits; 5198 /* Every class contains at least two < 256 characters. */ 5199 class_has_8bitchar++; 5200 /* Every class contains at least two characters. 
*/ 5201 class_one_char += 2; 5202 5203 switch (escape) 5204 { 5205#ifdef SUPPORT_UCP 5206 case ESC_du: /* These are the values given for \d etc */ 5207 case ESC_DU: /* when PCRE_UCP is set. We replace the */ 5208 case ESC_wu: /* escape sequence with an appropriate \p */ 5209 case ESC_WU: /* or \P to test Unicode properties instead */ 5210 case ESC_su: /* of the default ASCII testing. */ 5211 case ESC_SU: 5212 nestptr = ptr; 5213 ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */ 5214 class_has_8bitchar--; /* Undo! */ 5215 continue; 5216#endif 5217 case ESC_d: 5218 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit]; 5219 continue; 5220 5221 case ESC_D: 5222 should_flip_negation = TRUE; 5223 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit]; 5224 continue; 5225 5226 case ESC_w: 5227 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word]; 5228 continue; 5229 5230 case ESC_W: 5231 should_flip_negation = TRUE; 5232 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word]; 5233 continue; 5234 5235 /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl 5236 5.18. Before PCRE 8.34, we had to preserve the VT bit if it was 5237 previously set by something earlier in the character class. 5238 Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so 5239 we could just adjust the appropriate bit. From PCRE 8.34 we no 5240 longer treat \s and \S specially. */ 5241 5242 case ESC_s: 5243 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space]; 5244 continue; 5245 5246 case ESC_S: 5247 should_flip_negation = TRUE; 5248 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space]; 5249 continue; 5250 5251 /* The rest apply in both UCP and non-UCP cases. 
*/ 5252 5253 case ESC_h: 5254 (void)add_list_to_class(classbits, &class_uchardata, options, cd, 5255 PRIV(hspace_list), NOTACHAR); 5256 continue; 5257 5258 case ESC_H: 5259 (void)add_not_list_to_class(classbits, &class_uchardata, options, 5260 cd, PRIV(hspace_list)); 5261 continue; 5262 5263 case ESC_v: 5264 (void)add_list_to_class(classbits, &class_uchardata, options, cd, 5265 PRIV(vspace_list), NOTACHAR); 5266 continue; 5267 5268 case ESC_V: 5269 (void)add_not_list_to_class(classbits, &class_uchardata, options, 5270 cd, PRIV(vspace_list)); 5271 continue; 5272 5273 case ESC_p: 5274 case ESC_P: 5275#ifdef SUPPORT_UCP 5276 { 5277 BOOL negated; 5278 unsigned int ptype = 0, pdata = 0; 5279 if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr)) 5280 goto FAILED; 5281 *class_uchardata++ = ((escape == ESC_p) != negated)? 5282 XCL_PROP : XCL_NOTPROP; 5283 *class_uchardata++ = ptype; 5284 *class_uchardata++ = pdata; 5285 xclass_has_prop = TRUE; 5286 class_has_8bitchar--; /* Undo! */ 5287 continue; 5288 } 5289#else 5290 *errorcodeptr = ERR45; 5291 goto FAILED; 5292#endif 5293 /* Unrecognized escapes are faulted if PCRE is running in its 5294 strict mode. By default, for compatibility with Perl, they are 5295 treated as literals. */ 5296 5297 default: 5298 if ((options & PCRE_EXTRA) != 0) 5299 { 5300 *errorcodeptr = ERR7; 5301 goto FAILED; 5302 } 5303 class_has_8bitchar--; /* Undo the speculative increase. */ 5304 class_one_char -= 2; /* Undo the speculative increase. */ 5305 c = *ptr; /* Get the final character and fall through */ 5306 break; 5307 } 5308 } 5309 5310 /* Fall through if the escape just defined a single character (c >= 0). 5311 This may be greater than 256. */ 5312 5313 escape = 0; 5314 5315 } /* End of backslash handling */ 5316 5317 /* A character may be followed by '-' to form a range. However, Perl does 5318 not permit ']' to be the end of the range. A '-' character at the end is 5319 treated as a literal. Perl ignores orphaned \E sequences entirely. 
The 5320 code for handling \Q and \E is messy. */ 5321 5322 CHECK_RANGE: 5323 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E) 5324 { 5325 inescq = FALSE; 5326 ptr += 2; 5327 } 5328 oldptr = ptr; 5329 5330 /* Remember if \r or \n were explicitly used */ 5331 5332 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF; 5333 5334 /* Check for range */ 5335 5336 if (!inescq && ptr[1] == CHAR_MINUS) 5337 { 5338 pcre_uint32 d; 5339 ptr += 2; 5340 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2; 5341 5342 /* If we hit \Q (not followed by \E) at this point, go into escaped 5343 mode. */ 5344 5345 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q) 5346 { 5347 ptr += 2; 5348 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) 5349 { ptr += 2; continue; } 5350 inescq = TRUE; 5351 break; 5352 } 5353 5354 /* Minus (hyphen) at the end of a class is treated as a literal, so put 5355 back the pointer and jump to handle the character that preceded it. */ 5356 5357 if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET)) 5358 { 5359 ptr = oldptr; 5360 goto CLASS_SINGLE_CHARACTER; 5361 } 5362 5363 /* Otherwise, we have a potential range; pick up the next character */ 5364 5365#ifdef SUPPORT_UTF 5366 if (utf) 5367 { /* Braces are required because the */ 5368 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */ 5369 } 5370 else 5371#endif 5372 d = *ptr; /* Not UTF-8 mode */ 5373 5374 /* The second part of a range can be a single-character escape 5375 sequence, but not any of the other escapes. Perl treats a hyphen as a 5376 literal in such circumstances. However, in Perl's warning mode, a 5377 warning is given, so PCRE now faults it as it is almost certainly a 5378 mistake on the user's part. 
*/ 5379 5380 if (!inescq) 5381 { 5382 if (d == CHAR_BACKSLASH) 5383 { 5384 int descape; 5385 descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE); 5386 if (*errorcodeptr != 0) goto FAILED; 5387 5388 /* 0 means a character was put into d; \b is backspace; any other 5389 special causes an error. */ 5390 5391 if (descape != 0) 5392 { 5393 if (descape == ESC_b) d = CHAR_BS; else 5394 { 5395 *errorcodeptr = ERR83; 5396 goto FAILED; 5397 } 5398 } 5399 } 5400 5401 /* A hyphen followed by a POSIX class is treated in the same way. */ 5402 5403 else if (d == CHAR_LEFT_SQUARE_BRACKET && 5404 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT || 5405 ptr[1] == CHAR_EQUALS_SIGN) && 5406 check_posix_syntax(ptr, &tempptr)) 5407 { 5408 *errorcodeptr = ERR83; 5409 goto FAILED; 5410 } 5411 } 5412 5413 /* Check that the two values are in the correct order. Optimize 5414 one-character ranges. */ 5415 5416 if (d < c) 5417 { 5418 *errorcodeptr = ERR8; 5419 goto FAILED; 5420 } 5421 if (d == c) goto CLASS_SINGLE_CHARACTER; /* A few lines below */ 5422 5423 /* We have found a character range, so single character optimizations 5424 cannot be done anymore. Any value greater than 1 indicates that there 5425 is more than one character. */ 5426 5427 class_one_char = 2; 5428 5429 /* Remember an explicit \r or \n, and add the range to the class. */ 5430 5431 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF; 5432 5433 class_has_8bitchar += 5434 add_to_class(classbits, &class_uchardata, options, cd, c, d); 5435 5436 continue; /* Go get the next char in the class */ 5437 } 5438 5439 /* Handle a single character - we can get here for a normal non-escape 5440 char, or after \ that introduces a single character or for an apparent 5441 range that isn't. Only the value 1 matters for class_one_char, so don't 5442 increase it if it is already 2 or more ... just in case there's a class 5443 with a zillion characters in it. 
*/ 5444 5445 CLASS_SINGLE_CHARACTER: 5446 if (class_one_char < 2) class_one_char++; 5447 5448 /* If xclass_has_prop is false and class_one_char is 1, we have the first 5449 single character in the class, and there have been no prior ranges, or 5450 XCLASS items generated by escapes. If this is the final character in the 5451 class, we can optimize by turning the item into a 1-character OP_CHAR[I] 5452 if it's positive, or OP_NOT[I] if it's negative. In the positive case, it 5453 can cause firstchar to be set. Otherwise, there can be no first char if 5454 this item is first, whatever repeat count may follow. In the case of 5455 reqchar, save the previous value for reinstating. */ 5456 5457 if (!inescq && 5458#ifdef SUPPORT_UCP 5459 !xclass_has_prop && 5460#endif 5461 class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) 5462 { 5463 ptr++; 5464 zeroreqchar = reqchar; 5465 zeroreqcharflags = reqcharflags; 5466 5467 if (negate_class) 5468 { 5469#ifdef SUPPORT_UCP 5470 int d; 5471#endif 5472 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE; 5473 zerofirstchar = firstchar; 5474 zerofirstcharflags = firstcharflags; 5475 5476 /* For caseless UTF-8 mode when UCP support is available, check 5477 whether this character has more than one other case. If so, generate 5478 a special OP_NOTPROP item instead of OP_NOTI. */ 5479 5480#ifdef SUPPORT_UCP 5481 if (utf && (options & PCRE_CASELESS) != 0 && 5482 (d = UCD_CASESET(c)) != 0) 5483 { 5484 *code++ = OP_NOTPROP; 5485 *code++ = PT_CLIST; 5486 *code++ = d; 5487 } 5488 else 5489#endif 5490 /* Char has only one other case, or UCP not available */ 5491 5492 { 5493 *code++ = ((options & PCRE_CASELESS) != 0)? 
OP_NOTI: OP_NOT; 5494#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 5495 if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR) 5496 code += PRIV(ord2utf)(c, code); 5497 else 5498#endif 5499 *code++ = c; 5500 } 5501 5502 /* We are finished with this character class */ 5503 5504 goto END_CLASS; 5505 } 5506 5507 /* For a single, positive character, get the value into mcbuffer, and 5508 then we can handle this with the normal one-character code. */ 5509 5510#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 5511 if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR) 5512 mclength = PRIV(ord2utf)(c, mcbuffer); 5513 else 5514#endif 5515 { 5516 mcbuffer[0] = c; 5517 mclength = 1; 5518 } 5519 goto ONE_CHAR; 5520 } /* End of 1-char optimization */ 5521 5522 /* There is more than one character in the class, or an XCLASS item 5523 has been generated. Add this character to the class. */ 5524 5525 class_has_8bitchar += 5526 add_to_class(classbits, &class_uchardata, options, cd, c, c); 5527 } 5528 5529 /* Loop until ']' reached. This "while" is the end of the "do" far above. 5530 If we are at the end of an internal nested string, revert to the outer 5531 string. */ 5532 5533 while (((c = *(++ptr)) != CHAR_NULL || 5534 (nestptr != NULL && 5535 (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != CHAR_NULL)) && 5536 (c != CHAR_RIGHT_SQUARE_BRACKET || inescq)); 5537 5538 /* Check for missing terminating ']' */ 5539 5540 if (c == CHAR_NULL) 5541 { 5542 *errorcodeptr = ERR6; 5543 goto FAILED; 5544 } 5545 5546 /* We will need an XCLASS if data has been placed in class_uchardata. In 5547 the second phase this is a sufficient test. However, in the pre-compile 5548 phase, class_uchardata gets emptied to prevent workspace overflow, so it 5549 only if the very last character in the class needs XCLASS will it contain 5550 anything at this point. 
For this reason, xclass gets set TRUE above when 5551 uchar_classdata is emptied, and that's why this code is the way it is here 5552 instead of just doing a test on class_uchardata below. */ 5553 5554#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 5555 if (class_uchardata > class_uchardata_base) xclass = TRUE; 5556#endif 5557 5558 /* If this is the first thing in the branch, there can be no first char 5559 setting, whatever the repeat count. Any reqchar setting must remain 5560 unchanged after any kind of repeat. */ 5561 5562 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE; 5563 zerofirstchar = firstchar; 5564 zerofirstcharflags = firstcharflags; 5565 zeroreqchar = reqchar; 5566 zeroreqcharflags = reqcharflags; 5567 5568 /* If there are characters with values > 255, we have to compile an 5569 extended class, with its own opcode, unless there was a negated special 5570 such as \S in the class, and PCRE_UCP is not set, because in that case all 5571 characters > 255 are in the class, so any that were explicitly given as 5572 well can be ignored. If (when there are explicit characters > 255 that must 5573 be listed) there are no characters < 256, we can omit the bitmap in the 5574 actual compiled code. */ 5575 5576#ifdef SUPPORT_UTF 5577 if (xclass && (xclass_has_prop || !should_flip_negation || 5578 (options & PCRE_UCP) != 0)) 5579#elif !defined COMPILE_PCRE8 5580 if (xclass && (xclass_has_prop || !should_flip_negation)) 5581#endif 5582#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 5583 { 5584 *class_uchardata++ = XCL_END; /* Marks the end of extra data */ 5585 *code++ = OP_XCLASS; 5586 code += LINK_SIZE; 5587 *code = negate_class? XCL_NOT:0; 5588 if (xclass_has_prop) *code |= XCL_HASPROP; 5589 5590 /* If the map is required, move up the extra data to make room for it; 5591 otherwise just move the code pointer to the end of the extra data. 
*/ 5592 5593 if (class_has_8bitchar > 0) 5594 { 5595 *code++ |= XCL_MAP; 5596 memmove(code + (32 / sizeof(pcre_uchar)), code, 5597 IN_UCHARS(class_uchardata - code)); 5598 if (negate_class && !xclass_has_prop) 5599 for (c = 0; c < 32; c++) classbits[c] = ~classbits[c]; 5600 memcpy(code, classbits, 32); 5601 code = class_uchardata + (32 / sizeof(pcre_uchar)); 5602 } 5603 else code = class_uchardata; 5604 5605 /* Now fill in the complete length of the item */ 5606 5607 PUT(previous, 1, (int)(code - previous)); 5608 break; /* End of class handling */ 5609 } 5610 5611 /* Even though any XCLASS list is now discarded, we must allow for 5612 its memory. */ 5613 5614 if (lengthptr != NULL) 5615 *lengthptr += (int)(class_uchardata - class_uchardata_base); 5616#endif 5617 5618 /* If there are no characters > 255, or they are all to be included or 5619 excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the 5620 whole class was negated and whether there were negative specials such as \S 5621 (non-UCP) in the class. Then copy the 32-byte map into the code vector, 5622 negating it if necessary. */ 5623 5624 *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS; 5625 if (lengthptr == NULL) /* Save time in the pre-compile phase */ 5626 { 5627 if (negate_class) 5628 for (c = 0; c < 32; c++) classbits[c] = ~classbits[c]; 5629 memcpy(code, classbits, 32); 5630 } 5631 code += 32 / sizeof(pcre_uchar); 5632 5633 END_CLASS: 5634 break; 5635 5636 5637 /* ===================================================================*/ 5638 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this 5639 has been tested above. 
*/ 5640 5641 case CHAR_LEFT_CURLY_BRACKET: 5642 if (!is_quantifier) goto NORMAL_CHAR; 5643 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr); 5644 if (*errorcodeptr != 0) goto FAILED; 5645 goto REPEAT; 5646 5647 case CHAR_ASTERISK: 5648 repeat_min = 0; 5649 repeat_max = -1; 5650 goto REPEAT; 5651 5652 case CHAR_PLUS: 5653 repeat_min = 1; 5654 repeat_max = -1; 5655 goto REPEAT; 5656 5657 case CHAR_QUESTION_MARK: 5658 repeat_min = 0; 5659 repeat_max = 1; 5660 5661 REPEAT: 5662 if (previous == NULL) 5663 { 5664 *errorcodeptr = ERR9; 5665 goto FAILED; 5666 } 5667 5668 if (repeat_min == 0) 5669 { 5670 firstchar = zerofirstchar; /* Adjust for zero repeat */ 5671 firstcharflags = zerofirstcharflags; 5672 reqchar = zeroreqchar; /* Ditto */ 5673 reqcharflags = zeroreqcharflags; 5674 } 5675 5676 /* Remember whether this is a variable length repeat */ 5677 5678 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY; 5679 5680 op_type = 0; /* Default single-char op codes */ 5681 possessive_quantifier = FALSE; /* Default not possessive quantifier */ 5682 5683 /* Save start of previous item, in case we have to move it up in order to 5684 insert something before it. */ 5685 5686 tempcode = previous; 5687 5688 /* Before checking for a possessive quantifier, we must skip over 5689 whitespace and comments in extended mode because Perl allows white space at 5690 this point. */ 5691 5692 if ((options & PCRE_EXTENDED) != 0) 5693 { 5694 const pcre_uchar *p = ptr + 1; 5695 for (;;) 5696 { 5697 while (MAX_255(*p) && (cd->ctypes[*p] & ctype_space) != 0) p++; 5698 if (*p != CHAR_NUMBER_SIGN) break; 5699 p++; 5700 while (*p != CHAR_NULL) 5701 { 5702 if (IS_NEWLINE(p)) /* For non-fixed-length newline cases, */ 5703 { /* IS_NEWLINE sets cd->nllen. 
*/ 5704 p += cd->nllen; 5705 break; 5706 } 5707 p++; 5708#ifdef SUPPORT_UTF 5709 if (utf) FORWARDCHAR(p); 5710#endif 5711 } /* Loop for comment characters */ 5712 } /* Loop for multiple comments */ 5713 ptr = p - 1; /* Character before the next significant one. */ 5714 } 5715 5716 /* If the next character is '+', we have a possessive quantifier. This 5717 implies greediness, whatever the setting of the PCRE_UNGREEDY option. 5718 If the next character is '?' this is a minimizing repeat, by default, 5719 but if PCRE_UNGREEDY is set, it works the other way round. We change the 5720 repeat type to the non-default. */ 5721 5722 if (ptr[1] == CHAR_PLUS) 5723 { 5724 repeat_type = 0; /* Force greedy */ 5725 possessive_quantifier = TRUE; 5726 ptr++; 5727 } 5728 else if (ptr[1] == CHAR_QUESTION_MARK) 5729 { 5730 repeat_type = greedy_non_default; 5731 ptr++; 5732 } 5733 else repeat_type = greedy_default; 5734 5735 /* If previous was a recursion call, wrap it in atomic brackets so that 5736 previous becomes the atomic group. All recursions were so wrapped in the 5737 past, but it no longer happens for non-repeated recursions. In fact, the 5738 repeated ones could be re-implemented independently so as not to need this, 5739 but for the moment we rely on the code for repeating groups. */ 5740 5741 if (*previous == OP_RECURSE) 5742 { 5743 memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE)); 5744 *previous = OP_ONCE; 5745 PUT(previous, 1, 2 + 2*LINK_SIZE); 5746 previous[2 + 2*LINK_SIZE] = OP_KET; 5747 PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE); 5748 code += 2 + 2 * LINK_SIZE; 5749 length_prevgroup = 3 + 3*LINK_SIZE; 5750 5751 /* When actually compiling, we need to check whether this was a forward 5752 reference, and if so, adjust the offset. 
*/ 5753 5754 if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE) 5755 { 5756 int offset = GET(cd->hwm, -LINK_SIZE); 5757 if (offset == previous + 1 - cd->start_code) 5758 PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE); 5759 } 5760 } 5761 5762 /* Now handle repetition for the different types of item. */ 5763 5764 /* If previous was a character or negated character match, abolish the item 5765 and generate a repeat item instead. If a char item has a minimum of more 5766 than one, ensure that it is set in reqchar - it might not be if a sequence 5767 such as x{3} is the first thing in a branch because the x will have gone 5768 into firstchar instead. */ 5769 5770 if (*previous == OP_CHAR || *previous == OP_CHARI 5771 || *previous == OP_NOT || *previous == OP_NOTI) 5772 { 5773 switch (*previous) 5774 { 5775 default: /* Make compiler happy. */ 5776 case OP_CHAR: op_type = OP_STAR - OP_STAR; break; 5777 case OP_CHARI: op_type = OP_STARI - OP_STAR; break; 5778 case OP_NOT: op_type = OP_NOTSTAR - OP_STAR; break; 5779 case OP_NOTI: op_type = OP_NOTSTARI - OP_STAR; break; 5780 } 5781 5782 /* Deal with UTF characters that take up more than one character. It's 5783 easier to write this out separately than try to macrify it. Use c to 5784 hold the length of the character in bytes, plus UTF_LENGTH to flag that 5785 it's a length rather than a small character. */ 5786 5787#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 5788 if (utf && NOT_FIRSTCHAR(code[-1])) 5789 { 5790 pcre_uchar *lastchar = code - 1; 5791 BACKCHAR(lastchar); 5792 c = (int)(code - lastchar); /* Length of UTF-8 character */ 5793 memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */ 5794 c |= UTF_LENGTH; /* Flag c as a length */ 5795 } 5796 else 5797#endif /* SUPPORT_UTF */ 5798 5799 /* Handle the case of a single charater - either with no UTF support, or 5800 with UTF disabled, or for a single character UTF character. 
*/ 5801 { 5802 c = code[-1]; 5803 if (*previous <= OP_CHARI && repeat_min > 1) 5804 { 5805 reqchar = c; 5806 reqcharflags = req_caseopt | cd->req_varyopt; 5807 } 5808 } 5809 5810 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */ 5811 } 5812 5813 /* If previous was a character type match (\d or similar), abolish it and 5814 create a suitable repeat item. The code is shared with single-character 5815 repeats by setting op_type to add a suitable offset into repeat_type. Note 5816 the the Unicode property types will be present only when SUPPORT_UCP is 5817 defined, but we don't wrap the little bits of code here because it just 5818 makes it horribly messy. */ 5819 5820 else if (*previous < OP_EODN) 5821 { 5822 pcre_uchar *oldcode; 5823 int prop_type, prop_value; 5824 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */ 5825 c = *previous; 5826 5827 OUTPUT_SINGLE_REPEAT: 5828 if (*previous == OP_PROP || *previous == OP_NOTPROP) 5829 { 5830 prop_type = previous[1]; 5831 prop_value = previous[2]; 5832 } 5833 else prop_type = prop_value = -1; 5834 5835 oldcode = code; 5836 code = previous; /* Usually overwrite previous item */ 5837 5838 /* If the maximum is zero then the minimum must also be zero; Perl allows 5839 this case, so we do too - by simply omitting the item altogether. */ 5840 5841 if (repeat_max == 0) goto END_REPEAT; 5842 5843 /* Combine the op_type with the repeat_type */ 5844 5845 repeat_type += op_type; 5846 5847 /* A minimum of zero is handled either as the special case * or ?, or as 5848 an UPTO, with the maximum given. */ 5849 5850 if (repeat_min == 0) 5851 { 5852 if (repeat_max == -1) *code++ = OP_STAR + repeat_type; 5853 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type; 5854 else 5855 { 5856 *code++ = OP_UPTO + repeat_type; 5857 PUT2INC(code, 0, repeat_max); 5858 } 5859 } 5860 5861 /* A repeat minimum of 1 is optimized into some special cases. If the 5862 maximum is unlimited, we use OP_PLUS. 
Otherwise, the original item is 5863 left in place and, if the maximum is greater than 1, we use OP_UPTO with 5864 one less than the maximum. */ 5865 5866 else if (repeat_min == 1) 5867 { 5868 if (repeat_max == -1) 5869 *code++ = OP_PLUS + repeat_type; 5870 else 5871 { 5872 code = oldcode; /* leave previous item in place */ 5873 if (repeat_max == 1) goto END_REPEAT; 5874 *code++ = OP_UPTO + repeat_type; 5875 PUT2INC(code, 0, repeat_max - 1); 5876 } 5877 } 5878 5879 /* The case {n,n} is just an EXACT, while the general case {n,m} is 5880 handled as an EXACT followed by an UPTO. */ 5881 5882 else 5883 { 5884 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */ 5885 PUT2INC(code, 0, repeat_min); 5886 5887 /* If the maximum is unlimited, insert an OP_STAR. Before doing so, 5888 we have to insert the character for the previous code. For a repeated 5889 Unicode property match, there are two extra bytes that define the 5890 required property. In UTF-8 mode, long characters have their length in 5891 c, with the UTF_LENGTH bit as a flag. */ 5892 5893 if (repeat_max < 0) 5894 { 5895#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 5896 if (utf && (c & UTF_LENGTH) != 0) 5897 { 5898 memcpy(code, utf_chars, IN_UCHARS(c & 7)); 5899 code += c & 7; 5900 } 5901 else 5902#endif 5903 { 5904 *code++ = c; 5905 if (prop_type >= 0) 5906 { 5907 *code++ = prop_type; 5908 *code++ = prop_value; 5909 } 5910 } 5911 *code++ = OP_STAR + repeat_type; 5912 } 5913 5914 /* Else insert an UPTO if the max is greater than the min, again 5915 preceded by the character, for the previously inserted code. If the 5916 UPTO is just for 1 instance, we can use QUERY instead. 
*/ 5917 5918 else if (repeat_max != repeat_min) 5919 { 5920#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 5921 if (utf && (c & UTF_LENGTH) != 0) 5922 { 5923 memcpy(code, utf_chars, IN_UCHARS(c & 7)); 5924 code += c & 7; 5925 } 5926 else 5927#endif 5928 *code++ = c; 5929 if (prop_type >= 0) 5930 { 5931 *code++ = prop_type; 5932 *code++ = prop_value; 5933 } 5934 repeat_max -= repeat_min; 5935 5936 if (repeat_max == 1) 5937 { 5938 *code++ = OP_QUERY + repeat_type; 5939 } 5940 else 5941 { 5942 *code++ = OP_UPTO + repeat_type; 5943 PUT2INC(code, 0, repeat_max); 5944 } 5945 } 5946 } 5947 5948 /* The character or character type itself comes last in all cases. */ 5949 5950#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 5951 if (utf && (c & UTF_LENGTH) != 0) 5952 { 5953 memcpy(code, utf_chars, IN_UCHARS(c & 7)); 5954 code += c & 7; 5955 } 5956 else 5957#endif 5958 *code++ = c; 5959 5960 /* For a repeated Unicode property match, there are two extra bytes that 5961 define the required property. */ 5962 5963#ifdef SUPPORT_UCP 5964 if (prop_type >= 0) 5965 { 5966 *code++ = prop_type; 5967 *code++ = prop_value; 5968 } 5969#endif 5970 } 5971 5972 /* If previous was a character class or a back reference, we put the repeat 5973 stuff after it, but just skip the item if the repeat was {0,0}. 
*/ 5974 5975 else if (*previous == OP_CLASS || *previous == OP_NCLASS || 5976#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 5977 *previous == OP_XCLASS || 5978#endif 5979 *previous == OP_REF || *previous == OP_REFI || 5980 *previous == OP_DNREF || *previous == OP_DNREFI) 5981 { 5982 if (repeat_max == 0) 5983 { 5984 code = previous; 5985 goto END_REPEAT; 5986 } 5987 5988 if (repeat_min == 0 && repeat_max == -1) 5989 *code++ = OP_CRSTAR + repeat_type; 5990 else if (repeat_min == 1 && repeat_max == -1) 5991 *code++ = OP_CRPLUS + repeat_type; 5992 else if (repeat_min == 0 && repeat_max == 1) 5993 *code++ = OP_CRQUERY + repeat_type; 5994 else 5995 { 5996 *code++ = OP_CRRANGE + repeat_type; 5997 PUT2INC(code, 0, repeat_min); 5998 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */ 5999 PUT2INC(code, 0, repeat_max); 6000 } 6001 } 6002 6003 /* If previous was a bracket group, we may have to replicate it in certain 6004 cases. Note that at this point we can encounter only the "basic" bracket 6005 opcodes such as BRA and CBRA, as this is the place where they get converted 6006 into the more special varieties such as BRAPOS and SBRA. A test for >= 6007 OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK, 6008 ASSERTBACK_NOT, ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND. 6009 Originally, PCRE did not allow repetition of assertions, but now it does, 6010 for Perl compatibility. */ 6011 6012 else if (*previous >= OP_ASSERT && *previous <= OP_COND) 6013 { 6014 register int i; 6015 int len = (int)(code - previous); 6016 size_t base_hwm_offset = item_hwm_offset; 6017 pcre_uchar *bralink = NULL; 6018 pcre_uchar *brazeroptr = NULL; 6019 6020 /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so 6021 we just ignore the repeat. */ 6022 6023 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF) 6024 goto END_REPEAT; 6025 6026 /* There is no sense in actually repeating assertions. 
The only potential 6027 use of repetition is in cases when the assertion is optional. Therefore, 6028 if the minimum is greater than zero, just ignore the repeat. If the 6029 maximum is not zero or one, set it to 1. */ 6030 6031 if (*previous < OP_ONCE) /* Assertion */ 6032 { 6033 if (repeat_min > 0) goto END_REPEAT; 6034 if (repeat_max < 0 || repeat_max > 1) repeat_max = 1; 6035 } 6036 6037 /* The case of a zero minimum is special because of the need to stick 6038 OP_BRAZERO in front of it, and because the group appears once in the 6039 data, whereas in other cases it appears the minimum number of times. For 6040 this reason, it is simplest to treat this case separately, as otherwise 6041 the code gets far too messy. There are several special subcases when the 6042 minimum is zero. */ 6043 6044 if (repeat_min == 0) 6045 { 6046 /* If the maximum is also zero, we used to just omit the group from the 6047 output altogether, like this: 6048 6049 ** if (repeat_max == 0) 6050 ** { 6051 ** code = previous; 6052 ** goto END_REPEAT; 6053 ** } 6054 6055 However, that fails when a group or a subgroup within it is referenced 6056 as a subroutine from elsewhere in the pattern, so now we stick in 6057 OP_SKIPZERO in front of it so that it is skipped on execution. As we 6058 don't have a list of which groups are referenced, we cannot do this 6059 selectively. 6060 6061 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO 6062 and do no more at this point. However, we do need to adjust any 6063 OP_RECURSE calls inside the group that refer to the group itself or any 6064 internal or forward referenced group, because the offset is from the 6065 start of the whole regex. Temporarily terminate the pattern while doing 6066 this. 
*/ 6067 6068 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */ 6069 { 6070 *code = OP_END; 6071 adjust_recurse(previous, 1, utf, cd, item_hwm_offset); 6072 memmove(previous + 1, previous, IN_UCHARS(len)); 6073 code++; 6074 if (repeat_max == 0) 6075 { 6076 *previous++ = OP_SKIPZERO; 6077 goto END_REPEAT; 6078 } 6079 brazeroptr = previous; /* Save for possessive optimizing */ 6080 *previous++ = OP_BRAZERO + repeat_type; 6081 } 6082 6083 /* If the maximum is greater than 1 and limited, we have to replicate 6084 in a nested fashion, sticking OP_BRAZERO before each set of brackets. 6085 The first one has to be handled carefully because it's the original 6086 copy, which has to be moved up. The remainder can be handled by code 6087 that is common with the non-zero minimum case below. We have to 6088 adjust the value or repeat_max, since one less copy is required. Once 6089 again, we may have to adjust any OP_RECURSE calls inside the group. */ 6090 6091 else 6092 { 6093 int offset; 6094 *code = OP_END; 6095 adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, item_hwm_offset); 6096 memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len)); 6097 code += 2 + LINK_SIZE; 6098 *previous++ = OP_BRAZERO + repeat_type; 6099 *previous++ = OP_BRA; 6100 6101 /* We chain together the bracket offset fields that have to be 6102 filled in later when the ends of the brackets are reached. */ 6103 6104 offset = (bralink == NULL)? 0 : (int)(previous - bralink); 6105 bralink = previous; 6106 PUTINC(previous, 0, offset); 6107 } 6108 6109 repeat_max--; 6110 } 6111 6112 /* If the minimum is greater than zero, replicate the group as many 6113 times as necessary, and adjust the maximum to the number of subsequent 6114 copies that we need. If we set a first char from the group, and didn't 6115 set a required char, copy the latter from the former. 
If there are any 6116 forward reference subroutine calls in the group, there will be entries on 6117 the workspace list; replicate these with an appropriate increment. */ 6118 6119 else 6120 { 6121 if (repeat_min > 1) 6122 { 6123 /* In the pre-compile phase, we don't actually do the replication. We 6124 just adjust the length as if we had. Do some paranoid checks for 6125 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit 6126 integer type when available, otherwise double. */ 6127 6128 if (lengthptr != NULL) 6129 { 6130 int delta = (repeat_min - 1)*length_prevgroup; 6131 if ((INT64_OR_DOUBLE)(repeat_min - 1)* 6132 (INT64_OR_DOUBLE)length_prevgroup > 6133 (INT64_OR_DOUBLE)INT_MAX || 6134 OFLOW_MAX - *lengthptr < delta) 6135 { 6136 *errorcodeptr = ERR20; 6137 goto FAILED; 6138 } 6139 *lengthptr += delta; 6140 } 6141 6142 /* This is compiling for real. If there is a set first byte for 6143 the group, and we have not yet set a "required byte", set it. Make 6144 sure there is enough workspace for copying forward references before 6145 doing the copy. 
*/ 6146 6147 else 6148 { 6149 if (groupsetfirstchar && reqcharflags < 0) 6150 { 6151 reqchar = firstchar; 6152 reqcharflags = firstcharflags; 6153 } 6154 6155 for (i = 1; i < repeat_min; i++) 6156 { 6157 pcre_uchar *hc; 6158 size_t this_hwm_offset = cd->hwm - cd->start_workspace; 6159 memcpy(code, previous, IN_UCHARS(len)); 6160 6161 while (cd->hwm > cd->start_workspace + cd->workspace_size - 6162 WORK_SIZE_SAFETY_MARGIN - 6163 (this_hwm_offset - base_hwm_offset)) 6164 { 6165 *errorcodeptr = expand_workspace(cd); 6166 if (*errorcodeptr != 0) goto FAILED; 6167 } 6168 6169 for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset; 6170 hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset; 6171 hc += LINK_SIZE) 6172 { 6173 PUT(cd->hwm, 0, GET(hc, 0) + len); 6174 cd->hwm += LINK_SIZE; 6175 } 6176 base_hwm_offset = this_hwm_offset; 6177 code += len; 6178 } 6179 } 6180 } 6181 6182 if (repeat_max > 0) repeat_max -= repeat_min; 6183 } 6184 6185 /* This code is common to both the zero and non-zero minimum cases. If 6186 the maximum is limited, it replicates the group in a nested fashion, 6187 remembering the bracket starts on a stack. In the case of a zero minimum, 6188 the first one was set up above. In all cases the repeat_max now specifies 6189 the number of additional copies needed. Again, we must remember to 6190 replicate entries on the forward reference list. */ 6191 6192 if (repeat_max >= 0) 6193 { 6194 /* In the pre-compile phase, we don't actually do the replication. We 6195 just adjust the length as if we had. For each repetition we must add 1 6196 to the length for BRAZERO and for all but the last repetition we must 6197 add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some 6198 paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is 6199 a 64-bit integer type when available, otherwise double. 
*/ 6200 6201 if (lengthptr != NULL && repeat_max > 0) 6202 { 6203 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) - 6204 2 - 2*LINK_SIZE; /* Last one doesn't nest */ 6205 if ((INT64_OR_DOUBLE)repeat_max * 6206 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE) 6207 > (INT64_OR_DOUBLE)INT_MAX || 6208 OFLOW_MAX - *lengthptr < delta) 6209 { 6210 *errorcodeptr = ERR20; 6211 goto FAILED; 6212 } 6213 *lengthptr += delta; 6214 } 6215 6216 /* This is compiling for real */ 6217 6218 else for (i = repeat_max - 1; i >= 0; i--) 6219 { 6220 pcre_uchar *hc; 6221 size_t this_hwm_offset = cd->hwm - cd->start_workspace; 6222 6223 *code++ = OP_BRAZERO + repeat_type; 6224 6225 /* All but the final copy start a new nesting, maintaining the 6226 chain of brackets outstanding. */ 6227 6228 if (i != 0) 6229 { 6230 int offset; 6231 *code++ = OP_BRA; 6232 offset = (bralink == NULL)? 0 : (int)(code - bralink); 6233 bralink = code; 6234 PUTINC(code, 0, offset); 6235 } 6236 6237 memcpy(code, previous, IN_UCHARS(len)); 6238 6239 /* Ensure there is enough workspace for forward references before 6240 copying them. */ 6241 6242 while (cd->hwm > cd->start_workspace + cd->workspace_size - 6243 WORK_SIZE_SAFETY_MARGIN - 6244 (this_hwm_offset - base_hwm_offset)) 6245 { 6246 *errorcodeptr = expand_workspace(cd); 6247 if (*errorcodeptr != 0) goto FAILED; 6248 } 6249 6250 for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset; 6251 hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset; 6252 hc += LINK_SIZE) 6253 { 6254 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1)); 6255 cd->hwm += LINK_SIZE; 6256 } 6257 base_hwm_offset = this_hwm_offset; 6258 code += len; 6259 } 6260 6261 /* Now chain through the pending brackets, and fill in their length 6262 fields (which are holding the chain links pro tem). 
*/ 6263 6264 while (bralink != NULL) 6265 { 6266 int oldlinkoffset; 6267 int offset = (int)(code - bralink + 1); 6268 pcre_uchar *bra = code - offset; 6269 oldlinkoffset = GET(bra, 1); 6270 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset; 6271 *code++ = OP_KET; 6272 PUTINC(code, 0, offset); 6273 PUT(bra, 1, offset); 6274 } 6275 } 6276 6277 /* If the maximum is unlimited, set a repeater in the final copy. For 6278 ONCE brackets, that's all we need to do. However, possessively repeated 6279 ONCE brackets can be converted into non-capturing brackets, as the 6280 behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to 6281 deal with possessive ONCEs specially. 6282 6283 Otherwise, when we are doing the actual compile phase, check to see 6284 whether this group is one that could match an empty string. If so, 6285 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so 6286 that runtime checking can be done. [This check is also applied to ONCE 6287 groups at runtime, but in a different way.] 6288 6289 Then, if the quantifier was possessive and the bracket is not a 6290 conditional, we convert the BRA code to the POS form, and the KET code to 6291 KETRPOS. (It turns out to be convenient at runtime to detect this kind of 6292 subpattern at both the start and at the end.) The use of special opcodes 6293 makes it possible to reduce greatly the stack usage in pcre_exec(). If 6294 the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO. 6295 6296 Then, if the minimum number of matches is 1 or 0, cancel the possessive 6297 flag so that the default action below, of wrapping everything inside 6298 atomic brackets, does not happen. When the minimum is greater than 1, 6299 there will be earlier copies of the group, and so we still have to wrap 6300 the whole thing. 
*/ 6301 6302 else 6303 { 6304 pcre_uchar *ketcode = code - 1 - LINK_SIZE; 6305 pcre_uchar *bracode = ketcode - GET(ketcode, 1); 6306 6307 /* Convert possessive ONCE brackets to non-capturing */ 6308 6309 if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) && 6310 possessive_quantifier) *bracode = OP_BRA; 6311 6312 /* For non-possessive ONCE brackets, all we need to do is to 6313 set the KET. */ 6314 6315 if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC) 6316 *ketcode = OP_KETRMAX + repeat_type; 6317 6318 /* Handle non-ONCE brackets and possessive ONCEs (which have been 6319 converted to non-capturing above). */ 6320 6321 else 6322 { 6323 /* In the compile phase, check for empty string matching. */ 6324 6325 if (lengthptr == NULL) 6326 { 6327 pcre_uchar *scode = bracode; 6328 do 6329 { 6330 if (could_be_empty_branch(scode, ketcode, utf, cd, NULL)) 6331 { 6332 *bracode += OP_SBRA - OP_BRA; 6333 break; 6334 } 6335 scode += GET(scode, 1); 6336 } 6337 while (*scode == OP_ALT); 6338 } 6339 6340 /* A conditional group with only one branch has an implicit empty 6341 alternative branch. */ 6342 6343 if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT) 6344 *bracode = OP_SCOND; 6345 6346 /* Handle possessive quantifiers. */ 6347 6348 if (possessive_quantifier) 6349 { 6350 /* For COND brackets, we wrap the whole thing in a possessively 6351 repeated non-capturing bracket, because we have not invented POS 6352 versions of the COND opcodes. Because we are moving code along, we 6353 must ensure that any pending recursive references are updated. */ 6354 6355 if (*bracode == OP_COND || *bracode == OP_SCOND) 6356 { 6357 int nlen = (int)(code - bracode); 6358 *code = OP_END; 6359 adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, item_hwm_offset); 6360 memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen)); 6361 code += 1 + LINK_SIZE; 6362 nlen += 1 + LINK_SIZE; 6363 *bracode = (*bracode == OP_COND)? 
OP_BRAPOS : OP_SBRAPOS; 6364 *code++ = OP_KETRPOS; 6365 PUTINC(code, 0, nlen); 6366 PUT(bracode, 1, nlen); 6367 } 6368 6369 /* For non-COND brackets, we modify the BRA code and use KETRPOS. */ 6370 6371 else 6372 { 6373 *bracode += 1; /* Switch to xxxPOS opcodes */ 6374 *ketcode = OP_KETRPOS; 6375 } 6376 6377 /* If the minimum is zero, mark it as possessive, then unset the 6378 possessive flag when the minimum is 0 or 1. */ 6379 6380 if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO; 6381 if (repeat_min < 2) possessive_quantifier = FALSE; 6382 } 6383 6384 /* Non-possessive quantifier */ 6385 6386 else *ketcode = OP_KETRMAX + repeat_type; 6387 } 6388 } 6389 } 6390 6391 /* If previous is OP_FAIL, it was generated by an empty class [] in 6392 JavaScript mode. The other ways in which OP_FAIL can be generated, that is 6393 by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat" 6394 error above. We can just ignore the repeat in JS case. */ 6395 6396 else if (*previous == OP_FAIL) goto END_REPEAT; 6397 6398 /* Else there's some kind of shambles */ 6399 6400 else 6401 { 6402 *errorcodeptr = ERR11; 6403 goto FAILED; 6404 } 6405 6406 /* If the character following a repeat is '+', possessive_quantifier is 6407 TRUE. For some opcodes, there are special alternative opcodes for this 6408 case. For anything else, we wrap the entire repeated item inside OP_ONCE 6409 brackets. Logically, the '+' notation is just syntactic sugar, taken from 6410 Sun's Java package, but the special opcodes can optimize it. 6411 6412 Some (but not all) possessively repeated subpatterns have already been 6413 completely handled in the code just above. For them, possessive_quantifier 6414 is always FALSE at this stage. Note that the repeated item starts at 6415 tempcode, not at previous, which might be the first part of a string whose 6416 (former) last char we repeated. 
*/ 6417 6418 if (possessive_quantifier) 6419 { 6420 int len; 6421 6422 /* Possessifying an EXACT quantifier has no effect, so we can ignore it. 6423 However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6}, 6424 {5,}, or {5,10}). We skip over an EXACT item; if the length of what 6425 remains is greater than zero, there's a further opcode that can be 6426 handled. If not, do nothing, leaving the EXACT alone. */ 6427 6428 switch(*tempcode) 6429 { 6430 case OP_TYPEEXACT: 6431 tempcode += PRIV(OP_lengths)[*tempcode] + 6432 ((tempcode[1 + IMM2_SIZE] == OP_PROP 6433 || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0); 6434 break; 6435 6436 /* CHAR opcodes are used for exacts whose count is 1. */ 6437 6438 case OP_CHAR: 6439 case OP_CHARI: 6440 case OP_NOT: 6441 case OP_NOTI: 6442 case OP_EXACT: 6443 case OP_EXACTI: 6444 case OP_NOTEXACT: 6445 case OP_NOTEXACTI: 6446 tempcode += PRIV(OP_lengths)[*tempcode]; 6447#ifdef SUPPORT_UTF 6448 if (utf && HAS_EXTRALEN(tempcode[-1])) 6449 tempcode += GET_EXTRALEN(tempcode[-1]); 6450#endif 6451 break; 6452 6453 /* For the class opcodes, the repeat operator appears at the end; 6454 adjust tempcode to point to it. */ 6455 6456 case OP_CLASS: 6457 case OP_NCLASS: 6458 tempcode += 1 + 32/sizeof(pcre_uchar); 6459 break; 6460 6461#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 6462 case OP_XCLASS: 6463 tempcode += GET(tempcode, 1); 6464 break; 6465#endif 6466 } 6467 6468 /* If tempcode is equal to code (which points to the end of the repeated 6469 item), it means we have skipped an EXACT item but there is no following 6470 QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In 6471 all other cases, tempcode will be pointing to the repeat opcode, and will 6472 be less than code, so the value of len will be greater than 0. 
*/ 6473 6474 len = (int)(code - tempcode); 6475 if (len > 0) 6476 { 6477 unsigned int repcode = *tempcode; 6478 6479 /* There is a table for possessifying opcodes, all of which are less 6480 than OP_CALLOUT. A zero entry means there is no possessified version. 6481 */ 6482 6483 if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0) 6484 *tempcode = opcode_possessify[repcode]; 6485 6486 /* For opcode without a special possessified version, wrap the item in 6487 ONCE brackets. Because we are moving code along, we must ensure that any 6488 pending recursive references are updated. */ 6489 6490 else 6491 { 6492 *code = OP_END; 6493 adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset); 6494 memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len)); 6495 code += 1 + LINK_SIZE; 6496 len += 1 + LINK_SIZE; 6497 tempcode[0] = OP_ONCE; 6498 *code++ = OP_KET; 6499 PUTINC(code, 0, len); 6500 PUT(tempcode, 1, len); 6501 } 6502 } 6503 6504#ifdef NEVER 6505 if (len > 0) switch (*tempcode) 6506 { 6507 case OP_STAR: *tempcode = OP_POSSTAR; break; 6508 case OP_PLUS: *tempcode = OP_POSPLUS; break; 6509 case OP_QUERY: *tempcode = OP_POSQUERY; break; 6510 case OP_UPTO: *tempcode = OP_POSUPTO; break; 6511 6512 case OP_STARI: *tempcode = OP_POSSTARI; break; 6513 case OP_PLUSI: *tempcode = OP_POSPLUSI; break; 6514 case OP_QUERYI: *tempcode = OP_POSQUERYI; break; 6515 case OP_UPTOI: *tempcode = OP_POSUPTOI; break; 6516 6517 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break; 6518 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break; 6519 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break; 6520 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break; 6521 6522 case OP_NOTSTARI: *tempcode = OP_NOTPOSSTARI; break; 6523 case OP_NOTPLUSI: *tempcode = OP_NOTPOSPLUSI; break; 6524 case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break; 6525 case OP_NOTUPTOI: *tempcode = OP_NOTPOSUPTOI; break; 6526 6527 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break; 6528 case OP_TYPEPLUS: *tempcode = 
OP_TYPEPOSPLUS; break; 6529 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break; 6530 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break; 6531 6532 case OP_CRSTAR: *tempcode = OP_CRPOSSTAR; break; 6533 case OP_CRPLUS: *tempcode = OP_CRPOSPLUS; break; 6534 case OP_CRQUERY: *tempcode = OP_CRPOSQUERY; break; 6535 case OP_CRRANGE: *tempcode = OP_CRPOSRANGE; break; 6536 6537 /* Because we are moving code along, we must ensure that any 6538 pending recursive references are updated. */ 6539 6540 default: 6541 *code = OP_END; 6542 adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset); 6543 memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len)); 6544 code += 1 + LINK_SIZE; 6545 len += 1 + LINK_SIZE; 6546 tempcode[0] = OP_ONCE; 6547 *code++ = OP_KET; 6548 PUTINC(code, 0, len); 6549 PUT(tempcode, 1, len); 6550 break; 6551 } 6552#endif 6553 } 6554 6555 /* In all case we no longer have a previous item. We also set the 6556 "follows varying string" flag for subsequently encountered reqchars if 6557 it isn't already set and we have just passed a varying length item. */ 6558 6559 END_REPEAT: 6560 previous = NULL; 6561 cd->req_varyopt |= reqvary; 6562 break; 6563 6564 6565 /* ===================================================================*/ 6566 /* Start of nested parenthesized sub-expression, or comment or lookahead or 6567 lookbehind or option setting or condition or all the other extended 6568 parenthesis forms. */ 6569 6570 case CHAR_LEFT_PARENTHESIS: 6571 ptr++; 6572 6573 /* Now deal with various "verbs" that can be introduced by '*'. 
*/ 6574 6575 if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':' 6576 || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0)))) 6577 { 6578 int i, namelen; 6579 int arglen = 0; 6580 const char *vn = verbnames; 6581 const pcre_uchar *name = ptr + 1; 6582 const pcre_uchar *arg = NULL; 6583 previous = NULL; 6584 ptr++; 6585 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++; 6586 namelen = (int)(ptr - name); 6587 6588 /* It appears that Perl allows any characters whatsoever, other than 6589 a closing parenthesis, to appear in arguments, so we no longer insist on 6590 letters, digits, and underscores. */ 6591 6592 if (*ptr == CHAR_COLON) 6593 { 6594 arg = ++ptr; 6595 while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++; 6596 arglen = (int)(ptr - arg); 6597 if ((unsigned int)arglen > MAX_MARK) 6598 { 6599 *errorcodeptr = ERR75; 6600 goto FAILED; 6601 } 6602 } 6603 6604 if (*ptr != CHAR_RIGHT_PARENTHESIS) 6605 { 6606 *errorcodeptr = ERR60; 6607 goto FAILED; 6608 } 6609 6610 /* Scan the table of verb names */ 6611 6612 for (i = 0; i < verbcount; i++) 6613 { 6614 if (namelen == verbs[i].len && 6615 STRNCMP_UC_C8(name, vn, namelen) == 0) 6616 { 6617 int setverb; 6618 6619 /* Check for open captures before ACCEPT and convert it to 6620 ASSERT_ACCEPT if in an assertion. */ 6621 6622 if (verbs[i].op == OP_ACCEPT) 6623 { 6624 open_capitem *oc; 6625 if (arglen != 0) 6626 { 6627 *errorcodeptr = ERR59; 6628 goto FAILED; 6629 } 6630 cd->had_accept = TRUE; 6631 for (oc = cd->open_caps; oc != NULL; oc = oc->next) 6632 { 6633 if (lengthptr != NULL) 6634 { 6635#ifdef COMPILE_PCRE8 6636 *lengthptr += 1 + IMM2_SIZE; 6637#elif defined COMPILE_PCRE16 6638 *lengthptr += 2 + IMM2_SIZE; 6639#elif defined COMPILE_PCRE32 6640 *lengthptr += 4 + IMM2_SIZE; 6641#endif 6642 } 6643 else 6644 { 6645 *code++ = OP_CLOSE; 6646 PUT2INC(code, 0, oc->number); 6647 } 6648 } 6649 setverb = *code++ = 6650 (cd->assert_depth > 0)? 
OP_ASSERT_ACCEPT : OP_ACCEPT; 6651 6652 /* Do not set firstchar after *ACCEPT */ 6653 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE; 6654 } 6655 6656 /* Handle other cases with/without an argument */ 6657 6658 else if (arglen == 0) 6659 { 6660 if (verbs[i].op < 0) /* Argument is mandatory */ 6661 { 6662 *errorcodeptr = ERR66; 6663 goto FAILED; 6664 } 6665 setverb = *code++ = verbs[i].op; 6666 } 6667 6668 else 6669 { 6670 if (verbs[i].op_arg < 0) /* Argument is forbidden */ 6671 { 6672 *errorcodeptr = ERR59; 6673 goto FAILED; 6674 } 6675 setverb = *code++ = verbs[i].op_arg; 6676 if (lengthptr != NULL) /* In pass 1 just add in the length */ 6677 { /* to avoid potential workspace */ 6678 *lengthptr += arglen; /* overflow. */ 6679 *code++ = 0; 6680 } 6681 else 6682 { 6683 *code++ = arglen; 6684 memcpy(code, arg, IN_UCHARS(arglen)); 6685 code += arglen; 6686 } 6687 *code++ = 0; 6688 } 6689 6690 switch (setverb) 6691 { 6692 case OP_THEN: 6693 case OP_THEN_ARG: 6694 cd->external_flags |= PCRE_HASTHEN; 6695 break; 6696 6697 case OP_PRUNE: 6698 case OP_PRUNE_ARG: 6699 case OP_SKIP: 6700 case OP_SKIP_ARG: 6701 cd->had_pruneorskip = TRUE; 6702 break; 6703 } 6704 6705 break; /* Found verb, exit loop */ 6706 } 6707 6708 vn += verbs[i].len + 1; 6709 } 6710 6711 if (i < verbcount) continue; /* Successfully handled a verb */ 6712 *errorcodeptr = ERR60; /* Verb not recognized */ 6713 goto FAILED; 6714 } 6715 6716 /* Initialize for "real" parentheses */ 6717 6718 newoptions = options; 6719 skipbytes = 0; 6720 bravalue = OP_CBRA; 6721 item_hwm_offset = cd->hwm - cd->start_workspace; 6722 reset_bracount = FALSE; 6723 6724 /* Deal with the extended parentheses; all are introduced by '?', and the 6725 appearance of any of them means that this is not a capturing group. 
*/ 6726 6727 if (*ptr == CHAR_QUESTION_MARK) 6728 { 6729 int i, set, unset, namelen; 6730 int *optset; 6731 const pcre_uchar *name; 6732 pcre_uchar *slot; 6733 6734 switch (*(++ptr)) 6735 { 6736 /* ------------------------------------------------------------ */ 6737 case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */ 6738 reset_bracount = TRUE; 6739 cd->dupgroups = TRUE; /* Record (?| encountered */ 6740 /* Fall through */ 6741 6742 /* ------------------------------------------------------------ */ 6743 case CHAR_COLON: /* Non-capturing bracket */ 6744 bravalue = OP_BRA; 6745 ptr++; 6746 break; 6747 6748 6749 /* ------------------------------------------------------------ */ 6750 case CHAR_LEFT_PARENTHESIS: 6751 bravalue = OP_COND; /* Conditional group */ 6752 tempptr = ptr; 6753 6754 /* A condition can be an assertion, a number (referring to a numbered 6755 group's having been set), a name (referring to a named group), or 'R', 6756 referring to recursion. R<digits> and R&name are also permitted for 6757 recursion tests. 6758 6759 There are ways of testing a named group: (?(name)) is used by Python; 6760 Perl 5.10 onwards uses (?(<name>) or (?('name')). 6761 6762 There is one unfortunate ambiguity, caused by history. 'R' can be the 6763 recursive thing or the name 'R' (and similarly for 'R' followed by 6764 digits). We look for a name first; if not found, we try the other case. 6765 6766 For compatibility with auto-callouts, we allow a callout to be 6767 specified before a condition that is an assertion. First, check for the 6768 syntax of a callout; if found, adjust the temporary pointer that is 6769 used to check for an assertion condition. That's all that is needed! 
*/ 6770 6771 if (ptr[1] == CHAR_QUESTION_MARK && ptr[2] == CHAR_C) 6772 { 6773 for (i = 3;; i++) if (!IS_DIGIT(ptr[i])) break; 6774 if (ptr[i] == CHAR_RIGHT_PARENTHESIS) 6775 tempptr += i + 1; 6776 6777 /* tempptr should now be pointing to the opening parenthesis of the 6778 assertion condition. */ 6779 6780 if (*tempptr != CHAR_LEFT_PARENTHESIS) 6781 { 6782 *errorcodeptr = ERR28; 6783 goto FAILED; 6784 } 6785 } 6786 6787 /* For conditions that are assertions, check the syntax, and then exit 6788 the switch. This will take control down to where bracketed groups, 6789 including assertions, are processed. */ 6790 6791 if (tempptr[1] == CHAR_QUESTION_MARK && 6792 (tempptr[2] == CHAR_EQUALS_SIGN || 6793 tempptr[2] == CHAR_EXCLAMATION_MARK || 6794 (tempptr[2] == CHAR_LESS_THAN_SIGN && 6795 (tempptr[3] == CHAR_EQUALS_SIGN || 6796 tempptr[3] == CHAR_EXCLAMATION_MARK)))) 6797 { 6798 cd->iscondassert = TRUE; 6799 break; 6800 } 6801 6802 /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all 6803 need to skip at least 1+IMM2_SIZE bytes at the start of the group. */ 6804 6805 code[1+LINK_SIZE] = OP_CREF; 6806 skipbytes = 1+IMM2_SIZE; 6807 refsign = -1; /* => not a number */ 6808 namelen = -1; /* => not a name; must set to avoid warning */ 6809 name = NULL; /* Always set to avoid warning */ 6810 recno = 0; /* Always set to avoid warning */ 6811 6812 /* Check for a test for recursion in a named group. */ 6813 6814 ptr++; 6815 if (*ptr == CHAR_R && ptr[1] == CHAR_AMPERSAND) 6816 { 6817 terminator = -1; 6818 ptr += 2; 6819 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */ 6820 } 6821 6822 /* Check for a test for a named group's having been set, using the Perl 6823 syntax (?(<name>) or (?('name'), and also allow for the original PCRE 6824 syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). 
*/ 6825 6826 else if (*ptr == CHAR_LESS_THAN_SIGN) 6827 { 6828 terminator = CHAR_GREATER_THAN_SIGN; 6829 ptr++; 6830 } 6831 else if (*ptr == CHAR_APOSTROPHE) 6832 { 6833 terminator = CHAR_APOSTROPHE; 6834 ptr++; 6835 } 6836 else 6837 { 6838 terminator = CHAR_NULL; 6839 if (*ptr == CHAR_MINUS || *ptr == CHAR_PLUS) refsign = *ptr++; 6840 else if (IS_DIGIT(*ptr)) refsign = 0; 6841 } 6842 6843 /* Handle a number */ 6844 6845 if (refsign >= 0) 6846 { 6847 while (IS_DIGIT(*ptr)) 6848 { 6849 if (recno > INT_MAX / 10 - 1) /* Integer overflow */ 6850 { 6851 while (IS_DIGIT(*ptr)) ptr++; 6852 *errorcodeptr = ERR61; 6853 goto FAILED; 6854 } 6855 recno = recno * 10 + (int)(*ptr - CHAR_0); 6856 ptr++; 6857 } 6858 } 6859 6860 /* Otherwise we expect to read a name; anything else is an error. When 6861 a name is one of a number of duplicates, a different opcode is used and 6862 it needs more memory. Unfortunately we cannot tell whether a name is a 6863 duplicate in the first pass, so we have to allow for more memory. */ 6864 6865 else 6866 { 6867 if (IS_DIGIT(*ptr)) 6868 { 6869 *errorcodeptr = ERR84; 6870 goto FAILED; 6871 } 6872 if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_word) == 0) 6873 { 6874 *errorcodeptr = ERR28; /* Assertion expected */ 6875 goto FAILED; 6876 } 6877 name = ptr++; 6878 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) 6879 { 6880 ptr++; 6881 } 6882 namelen = (int)(ptr - name); 6883 if (lengthptr != NULL) skipbytes += IMM2_SIZE; 6884 } 6885 6886 /* Check the terminator */ 6887 6888 if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) || 6889 *ptr++ != CHAR_RIGHT_PARENTHESIS) 6890 { 6891 ptr--; /* Error offset */ 6892 *errorcodeptr = ERR26; /* Malformed number or name */ 6893 goto FAILED; 6894 } 6895 6896 /* Do no further checking in the pre-compile phase. */ 6897 6898 if (lengthptr != NULL) break; 6899 6900 /* In the real compile we do the work of looking for the actual 6901 reference. 
If refsign is not negative, it means we have a number in 6902 recno. */ 6903 6904 if (refsign >= 0) 6905 { 6906 if (recno <= 0) 6907 { 6908 *errorcodeptr = ERR35; 6909 goto FAILED; 6910 } 6911 if (refsign != 0) recno = (refsign == CHAR_MINUS)? 6912 cd->bracount - recno + 1 : recno + cd->bracount; 6913 if (recno <= 0 || recno > cd->final_bracount) 6914 { 6915 *errorcodeptr = ERR15; 6916 goto FAILED; 6917 } 6918 PUT2(code, 2+LINK_SIZE, recno); 6919 if (recno > cd->top_backref) cd->top_backref = recno; 6920 break; 6921 } 6922 6923 /* Otherwise look for the name. */ 6924 6925 slot = cd->name_table; 6926 for (i = 0; i < cd->names_found; i++) 6927 { 6928 if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0) break; 6929 slot += cd->name_entry_size; 6930 } 6931 6932 /* Found the named subpattern. If the name is duplicated, add one to 6933 the opcode to change CREF/RREF into DNCREF/DNRREF and insert 6934 appropriate data values. Otherwise, just insert the unique subpattern 6935 number. */ 6936 6937 if (i < cd->names_found) 6938 { 6939 int offset = i++; 6940 int count = 1; 6941 recno = GET2(slot, 0); /* Number from first found */ 6942 if (recno > cd->top_backref) cd->top_backref = recno; 6943 for (; i < cd->names_found; i++) 6944 { 6945 slot += cd->name_entry_size; 6946 if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0 || 6947 (slot+IMM2_SIZE)[namelen] != 0) break; 6948 count++; 6949 } 6950 6951 if (count > 1) 6952 { 6953 PUT2(code, 2+LINK_SIZE, offset); 6954 PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count); 6955 skipbytes += IMM2_SIZE; 6956 code[1+LINK_SIZE]++; 6957 } 6958 else /* Not a duplicated name */ 6959 { 6960 PUT2(code, 2+LINK_SIZE, recno); 6961 } 6962 } 6963 6964 /* If terminator == CHAR_NULL it means that the name followed directly 6965 after the opening parenthesis [e.g. (?(abc)...] and in this case there 6966 are some further alternatives to try. For the cases where terminator != 6967 CHAR_NULL [things like (?(<name>... or (?('name')... or (?(R&name)... 
] 6968 we have now checked all the possibilities, so give an error. */ 6969 6970 else if (terminator != CHAR_NULL) 6971 { 6972 *errorcodeptr = ERR15; 6973 goto FAILED; 6974 } 6975 6976 /* Check for (?(R) for recursion. Allow digits after R to specify a 6977 specific group number. */ 6978 6979 else if (*name == CHAR_R) 6980 { 6981 recno = 0; 6982 for (i = 1; i < namelen; i++) 6983 { 6984 if (!IS_DIGIT(name[i])) 6985 { 6986 *errorcodeptr = ERR15; 6987 goto FAILED; 6988 } 6989 if (recno > INT_MAX / 10 - 1) /* Integer overflow */ 6990 { 6991 *errorcodeptr = ERR61; 6992 goto FAILED; 6993 } 6994 recno = recno * 10 + name[i] - CHAR_0; 6995 } 6996 if (recno == 0) recno = RREF_ANY; 6997 code[1+LINK_SIZE] = OP_RREF; /* Change test type */ 6998 PUT2(code, 2+LINK_SIZE, recno); 6999 } 7000 7001 /* Similarly, check for the (?(DEFINE) "condition", which is always 7002 false. */ 7003 7004 else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0) 7005 { 7006 code[1+LINK_SIZE] = OP_DEF; 7007 skipbytes = 1; 7008 } 7009 7010 /* Reference to an unidentified subpattern. */ 7011 7012 else 7013 { 7014 *errorcodeptr = ERR15; 7015 goto FAILED; 7016 } 7017 break; 7018 7019 7020 /* ------------------------------------------------------------ */ 7021 case CHAR_EQUALS_SIGN: /* Positive lookahead */ 7022 bravalue = OP_ASSERT; 7023 cd->assert_depth += 1; 7024 ptr++; 7025 break; 7026 7027 /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird 7028 thing to do, but Perl allows all assertions to be quantified, and when 7029 they contain capturing parentheses there may be a potential use for 7030 this feature. Not that that applies to a quantified (?!) but we allow 7031 it for uniformity. 
*/ 7032 7033 /* ------------------------------------------------------------ */ 7034 case CHAR_EXCLAMATION_MARK: /* Negative lookahead */ 7035 ptr++; 7036 if (*ptr == CHAR_RIGHT_PARENTHESIS && ptr[1] != CHAR_ASTERISK && 7037 ptr[1] != CHAR_PLUS && ptr[1] != CHAR_QUESTION_MARK && 7038 (ptr[1] != CHAR_LEFT_CURLY_BRACKET || !is_counted_repeat(ptr+2))) 7039 { 7040 *code++ = OP_FAIL; 7041 previous = NULL; 7042 continue; 7043 } 7044 bravalue = OP_ASSERT_NOT; 7045 cd->assert_depth += 1; 7046 break; 7047 7048 7049 /* ------------------------------------------------------------ */ 7050 case CHAR_LESS_THAN_SIGN: /* Lookbehind or named define */ 7051 switch (ptr[1]) 7052 { 7053 case CHAR_EQUALS_SIGN: /* Positive lookbehind */ 7054 bravalue = OP_ASSERTBACK; 7055 cd->assert_depth += 1; 7056 ptr += 2; 7057 break; 7058 7059 case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */ 7060 bravalue = OP_ASSERTBACK_NOT; 7061 cd->assert_depth += 1; 7062 ptr += 2; 7063 break; 7064 7065 default: /* Could be name define, else bad */ 7066 if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0) 7067 goto DEFINE_NAME; 7068 ptr++; /* Correct offset for error */ 7069 *errorcodeptr = ERR24; 7070 goto FAILED; 7071 } 7072 break; 7073 7074 7075 /* ------------------------------------------------------------ */ 7076 case CHAR_GREATER_THAN_SIGN: /* One-time brackets */ 7077 bravalue = OP_ONCE; 7078 ptr++; 7079 break; 7080 7081 7082 /* ------------------------------------------------------------ */ 7083 case CHAR_C: /* Callout - may be followed by digits; */ 7084 previous_callout = code; /* Save for later completion */ 7085 after_manual_callout = 1; /* Skip one item before completing */ 7086 *code++ = OP_CALLOUT; 7087 { 7088 int n = 0; 7089 ptr++; 7090 while(IS_DIGIT(*ptr)) 7091 n = n * 10 + *ptr++ - CHAR_0; 7092 if (*ptr != CHAR_RIGHT_PARENTHESIS) 7093 { 7094 *errorcodeptr = ERR39; 7095 goto FAILED; 7096 } 7097 if (n > 255) 7098 { 7099 *errorcodeptr = ERR38; 7100 goto FAILED; 7101 } 7102 *code++ 
= n; 7103 PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */ 7104 PUT(code, LINK_SIZE, 0); /* Default length */ 7105 code += 2 * LINK_SIZE; 7106 } 7107 previous = NULL; 7108 continue; 7109 7110 7111 /* ------------------------------------------------------------ */ 7112 case CHAR_P: /* Python-style named subpattern handling */ 7113 if (*(++ptr) == CHAR_EQUALS_SIGN || 7114 *ptr == CHAR_GREATER_THAN_SIGN) /* Reference or recursion */ 7115 { 7116 is_recurse = *ptr == CHAR_GREATER_THAN_SIGN; 7117 terminator = CHAR_RIGHT_PARENTHESIS; 7118 goto NAMED_REF_OR_RECURSE; 7119 } 7120 else if (*ptr != CHAR_LESS_THAN_SIGN) /* Test for Python-style defn */ 7121 { 7122 *errorcodeptr = ERR41; 7123 goto FAILED; 7124 } 7125 /* Fall through to handle (?P< as (?< is handled */ 7126 7127 7128 /* ------------------------------------------------------------ */ 7129 DEFINE_NAME: /* Come here from (?< handling */ 7130 case CHAR_APOSTROPHE: 7131 terminator = (*ptr == CHAR_LESS_THAN_SIGN)? 7132 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE; 7133 name = ++ptr; 7134 if (IS_DIGIT(*ptr)) 7135 { 7136 *errorcodeptr = ERR84; /* Group name must start with non-digit */ 7137 goto FAILED; 7138 } 7139 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++; 7140 namelen = (int)(ptr - name); 7141 7142 /* In the pre-compile phase, do a syntax check, remember the longest 7143 name, and then remember the group in a vector, expanding it if 7144 necessary. Duplicates for the same number are skipped; other duplicates 7145 are checked for validity. In the actual compile, there is nothing to 7146 do. 
*/ 7147 7148 if (lengthptr != NULL) 7149 { 7150 named_group *ng; 7151 pcre_uint32 number = cd->bracount + 1; 7152 7153 if (*ptr != (pcre_uchar)terminator) 7154 { 7155 *errorcodeptr = ERR42; 7156 goto FAILED; 7157 } 7158 7159 if (cd->names_found >= MAX_NAME_COUNT) 7160 { 7161 *errorcodeptr = ERR49; 7162 goto FAILED; 7163 } 7164 7165 if (namelen + IMM2_SIZE + 1 > cd->name_entry_size) 7166 { 7167 cd->name_entry_size = namelen + IMM2_SIZE + 1; 7168 if (namelen > MAX_NAME_SIZE) 7169 { 7170 *errorcodeptr = ERR48; 7171 goto FAILED; 7172 } 7173 } 7174 7175 /* Scan the list to check for duplicates. For duplicate names, if the 7176 number is the same, break the loop, which causes the name to be 7177 discarded; otherwise, if DUPNAMES is not set, give an error. 7178 If it is set, allow the name with a different number, but continue 7179 scanning in case this is a duplicate with the same number. For 7180 non-duplicate names, give an error if the number is duplicated. */ 7181 7182 ng = cd->named_groups; 7183 for (i = 0; i < cd->names_found; i++, ng++) 7184 { 7185 if (namelen == ng->length && 7186 STRNCMP_UC_UC(name, ng->name, namelen) == 0) 7187 { 7188 if (ng->number == number) break; 7189 if ((options & PCRE_DUPNAMES) == 0) 7190 { 7191 *errorcodeptr = ERR43; 7192 goto FAILED; 7193 } 7194 cd->dupnames = TRUE; /* Duplicate names exist */ 7195 } 7196 else if (ng->number == number) 7197 { 7198 *errorcodeptr = ERR65; 7199 goto FAILED; 7200 } 7201 } 7202 7203 if (i >= cd->names_found) /* Not a duplicate with same number */ 7204 { 7205 /* Increase the list size if necessary */ 7206 7207 if (cd->names_found >= cd->named_group_list_size) 7208 { 7209 int newsize = cd->named_group_list_size * 2; 7210 named_group *newspace = (PUBL(malloc)) 7211 (newsize * sizeof(named_group)); 7212 7213 if (newspace == NULL) 7214 { 7215 *errorcodeptr = ERR21; 7216 goto FAILED; 7217 } 7218 7219 memcpy(newspace, cd->named_groups, 7220 cd->named_group_list_size * sizeof(named_group)); 7221 if 
(cd->named_group_list_size > NAMED_GROUP_LIST_SIZE) 7222 (PUBL(free))((void *)cd->named_groups); 7223 cd->named_groups = newspace; 7224 cd->named_group_list_size = newsize; 7225 } 7226 7227 cd->named_groups[cd->names_found].name = name; 7228 cd->named_groups[cd->names_found].length = namelen; 7229 cd->named_groups[cd->names_found].number = number; 7230 cd->names_found++; 7231 } 7232 } 7233 7234 ptr++; /* Move past > or ' in both passes. */ 7235 goto NUMBERED_GROUP; 7236 7237 7238 /* ------------------------------------------------------------ */ 7239 case CHAR_AMPERSAND: /* Perl recursion/subroutine syntax */ 7240 terminator = CHAR_RIGHT_PARENTHESIS; 7241 is_recurse = TRUE; 7242 /* Fall through */ 7243 7244 /* We come here from the Python syntax above that handles both 7245 references (?P=name) and recursion (?P>name), as well as falling 7246 through from the Perl recursion syntax (?&name). We also come here from 7247 the Perl \k<name> or \k'name' back reference syntax and the \k{name} 7248 .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */ 7249 7250 NAMED_REF_OR_RECURSE: 7251 name = ++ptr; 7252 if (IS_DIGIT(*ptr)) 7253 { 7254 *errorcodeptr = ERR84; /* Group name must start with non-digit */ 7255 goto FAILED; 7256 } 7257 while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++; 7258 namelen = (int)(ptr - name); 7259 7260 /* In the pre-compile phase, do a syntax check. We used to just set 7261 a dummy reference number, because it was not used in the first pass. 7262 However, with the change of recursive back references to be atomic, 7263 we have to look for the number so that this state can be identified, as 7264 otherwise the incorrect length is computed. If it's not a backwards 7265 reference, the dummy number will do. 
*/ 7266 7267 if (lengthptr != NULL) 7268 { 7269 named_group *ng; 7270 recno = 0; 7271 7272 if (namelen == 0) 7273 { 7274 *errorcodeptr = ERR62; 7275 goto FAILED; 7276 } 7277 if (*ptr != (pcre_uchar)terminator) 7278 { 7279 *errorcodeptr = ERR42; 7280 goto FAILED; 7281 } 7282 if (namelen > MAX_NAME_SIZE) 7283 { 7284 *errorcodeptr = ERR48; 7285 goto FAILED; 7286 } 7287 7288 /* Count named back references. */ 7289 7290 if (!is_recurse) cd->namedrefcount++; 7291 7292 /* We have to allow for a named reference to a duplicated name (this 7293 cannot be determined until the second pass). This needs an extra 7294 16-bit data item. */ 7295 7296 *lengthptr += IMM2_SIZE; 7297 7298 /* If this is a forward reference and we are within a (?|...) group, 7299 the reference may end up as the number of a group which we are 7300 currently inside, that is, it could be a recursive reference. In the 7301 real compile this will be picked up and the reference wrapped with 7302 OP_ONCE to make it atomic, so we must space in case this occurs. */ 7303 7304 /* In fact, this can happen for a non-forward reference because 7305 another group with the same number might be created later. This 7306 issue is fixed "properly" in PCRE2. As PCRE1 is now in maintenance 7307 only mode, we finesse the bug by allowing more memory always. */ 7308 7309 *lengthptr += 4 + 4*LINK_SIZE; 7310 7311 /* It is even worse than that. The current reference may be to an 7312 existing named group with a different number (so apparently not 7313 recursive) but which later on is also attached to a group with the 7314 current number. This can only happen if $(| has been previous 7315 encountered. In that case, we allow yet more memory, just in case. 7316 (Again, this is fixed "properly" in PCRE2. */ 7317 7318 if (cd->dupgroups) *lengthptr += 4 + 4*LINK_SIZE; 7319 7320 /* Otherwise, check for recursion here. 
The name table does not exist 7321 in the first pass; instead we must scan the list of names encountered 7322 so far in order to get the number. If the name is not found, leave 7323 the value of recno as 0 for a forward reference. */ 7324 7325 /* This patch (removing "else") fixes a problem when a reference is 7326 to multiple identically named nested groups from within the nest. 7327 Once again, it is not the "proper" fix, and it results in an 7328 over-allocation of memory. */ 7329 7330 /* else */ 7331 { 7332 ng = cd->named_groups; 7333 for (i = 0; i < cd->names_found; i++, ng++) 7334 { 7335 if (namelen == ng->length && 7336 STRNCMP_UC_UC(name, ng->name, namelen) == 0) 7337 { 7338 open_capitem *oc; 7339 recno = ng->number; 7340 if (is_recurse) break; 7341 for (oc = cd->open_caps; oc != NULL; oc = oc->next) 7342 { 7343 if (oc->number == recno) 7344 { 7345 oc->flag = TRUE; 7346 break; 7347 } 7348 } 7349 } 7350 } 7351 } 7352 } 7353 7354 /* In the real compile, search the name table. We check the name 7355 first, and then check that we have reached the end of the name in the 7356 table. That way, if the name is longer than any in the table, the 7357 comparison will fail without reading beyond the table entry. */ 7358 7359 else 7360 { 7361 slot = cd->name_table; 7362 for (i = 0; i < cd->names_found; i++) 7363 { 7364 if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 && 7365 slot[IMM2_SIZE+namelen] == 0) 7366 break; 7367 slot += cd->name_entry_size; 7368 } 7369 7370 if (i < cd->names_found) 7371 { 7372 recno = GET2(slot, 0); 7373 } 7374 else 7375 { 7376 *errorcodeptr = ERR15; 7377 goto FAILED; 7378 } 7379 } 7380 7381 /* In both phases, for recursions, we can now go to the code than 7382 handles numerical recursion. */ 7383 7384 if (is_recurse) goto HANDLE_RECURSION; 7385 7386 /* In the second pass we must see if the name is duplicated. If so, we 7387 generate a different opcode. 
*/ 7388 7389 if (lengthptr == NULL && cd->dupnames) 7390 { 7391 int count = 1; 7392 unsigned int index = i; 7393 pcre_uchar *cslot = slot + cd->name_entry_size; 7394 7395 for (i++; i < cd->names_found; i++) 7396 { 7397 if (STRCMP_UC_UC(slot + IMM2_SIZE, cslot + IMM2_SIZE) != 0) break; 7398 count++; 7399 cslot += cd->name_entry_size; 7400 } 7401 7402 if (count > 1) 7403 { 7404 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE; 7405 previous = code; 7406 item_hwm_offset = cd->hwm - cd->start_workspace; 7407 *code++ = ((options & PCRE_CASELESS) != 0)? OP_DNREFI : OP_DNREF; 7408 PUT2INC(code, 0, index); 7409 PUT2INC(code, 0, count); 7410 7411 /* Process each potentially referenced group. */ 7412 7413 for (; slot < cslot; slot += cd->name_entry_size) 7414 { 7415 open_capitem *oc; 7416 recno = GET2(slot, 0); 7417 cd->backref_map |= (recno < 32)? (1 << recno) : 1; 7418 if (recno > cd->top_backref) cd->top_backref = recno; 7419 7420 /* Check to see if this back reference is recursive, that it, it 7421 is inside the group that it references. A flag is set so that the 7422 group can be made atomic. */ 7423 7424 for (oc = cd->open_caps; oc != NULL; oc = oc->next) 7425 { 7426 if (oc->number == recno) 7427 { 7428 oc->flag = TRUE; 7429 break; 7430 } 7431 } 7432 } 7433 7434 continue; /* End of back ref handling */ 7435 } 7436 } 7437 7438 /* First pass, or a non-duplicated name. 
*/ 7439 7440 goto HANDLE_REFERENCE; 7441 7442 7443 /* ------------------------------------------------------------ */ 7444 case CHAR_R: /* Recursion, same as (?0) */ 7445 recno = 0; 7446 if (*(++ptr) != CHAR_RIGHT_PARENTHESIS) 7447 { 7448 *errorcodeptr = ERR29; 7449 goto FAILED; 7450 } 7451 goto HANDLE_RECURSION; 7452 7453 7454 /* ------------------------------------------------------------ */ 7455 case CHAR_MINUS: case CHAR_PLUS: /* Recursion or subroutine */ 7456 case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: 7457 case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9: 7458 { 7459 const pcre_uchar *called; 7460 terminator = CHAR_RIGHT_PARENTHESIS; 7461 7462 /* Come here from the \g<...> and \g'...' code (Oniguruma 7463 compatibility). However, the syntax has been checked to ensure that 7464 the ... are a (signed) number, so that neither ERR63 nor ERR29 will 7465 be called on this path, nor with the jump to OTHER_CHAR_AFTER_QUERY 7466 ever be taken. */ 7467 7468 HANDLE_NUMERICAL_RECURSION: 7469 7470 if ((refsign = *ptr) == CHAR_PLUS) 7471 { 7472 ptr++; 7473 if (!IS_DIGIT(*ptr)) 7474 { 7475 *errorcodeptr = ERR63; 7476 goto FAILED; 7477 } 7478 } 7479 else if (refsign == CHAR_MINUS) 7480 { 7481 if (!IS_DIGIT(ptr[1])) 7482 goto OTHER_CHAR_AFTER_QUERY; 7483 ptr++; 7484 } 7485 7486 recno = 0; 7487 while(IS_DIGIT(*ptr)) 7488 { 7489 if (recno > INT_MAX / 10 - 1) /* Integer overflow */ 7490 { 7491 while (IS_DIGIT(*ptr)) ptr++; 7492 *errorcodeptr = ERR61; 7493 goto FAILED; 7494 } 7495 recno = recno * 10 + *ptr++ - CHAR_0; 7496 } 7497 7498 if (*ptr != (pcre_uchar)terminator) 7499 { 7500 *errorcodeptr = ERR29; 7501 goto FAILED; 7502 } 7503 7504 if (refsign == CHAR_MINUS) 7505 { 7506 if (recno == 0) 7507 { 7508 *errorcodeptr = ERR58; 7509 goto FAILED; 7510 } 7511 recno = cd->bracount - recno + 1; 7512 if (recno <= 0) 7513 { 7514 *errorcodeptr = ERR15; 7515 goto FAILED; 7516 } 7517 } 7518 else if (refsign == CHAR_PLUS) 7519 { 7520 if (recno == 0) 7521 
{ 7522 *errorcodeptr = ERR58; 7523 goto FAILED; 7524 } 7525 recno += cd->bracount; 7526 } 7527 7528 /* Come here from code above that handles a named recursion */ 7529 7530 HANDLE_RECURSION: 7531 7532 previous = code; 7533 item_hwm_offset = cd->hwm - cd->start_workspace; 7534 called = cd->start_code; 7535 7536 /* When we are actually compiling, find the bracket that is being 7537 referenced. Temporarily end the regex in case it doesn't exist before 7538 this point. If we end up with a forward reference, first check that 7539 the bracket does occur later so we can give the error (and position) 7540 now. Then remember this forward reference in the workspace so it can 7541 be filled in at the end. */ 7542 7543 if (lengthptr == NULL) 7544 { 7545 *code = OP_END; 7546 if (recno != 0) 7547 called = PRIV(find_bracket)(cd->start_code, utf, recno); 7548 7549 /* Forward reference */ 7550 7551 if (called == NULL) 7552 { 7553 if (recno > cd->final_bracount) 7554 { 7555 *errorcodeptr = ERR15; 7556 goto FAILED; 7557 } 7558 7559 /* Fudge the value of "called" so that when it is inserted as an 7560 offset below, what it actually inserted is the reference number 7561 of the group. Then remember the forward reference. */ 7562 7563 called = cd->start_code + recno; 7564 if (cd->hwm >= cd->start_workspace + cd->workspace_size - 7565 WORK_SIZE_SAFETY_MARGIN) 7566 { 7567 *errorcodeptr = expand_workspace(cd); 7568 if (*errorcodeptr != 0) goto FAILED; 7569 } 7570 PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code)); 7571 } 7572 7573 /* If not a forward reference, and the subpattern is still open, 7574 this is a recursive call. We check to see if this is a left 7575 recursion that could loop for ever, and diagnose that case. We 7576 must not, however, do this check if we are in a conditional 7577 subpattern because the condition might be testing for recursion in 7578 a pattern such as /(?(R)a+|(?R)b)/, which is perfectly valid. 
7579 Forever loops are also detected at runtime, so those that occur in 7580 conditional subpatterns will be picked up then. */ 7581 7582 else if (GET(called, 1) == 0 && cond_depth <= 0 && 7583 could_be_empty(called, code, bcptr, utf, cd)) 7584 { 7585 *errorcodeptr = ERR40; 7586 goto FAILED; 7587 } 7588 } 7589 7590 /* Insert the recursion/subroutine item. It does not have a set first 7591 character (relevant if it is repeated, because it will then be 7592 wrapped with ONCE brackets). */ 7593 7594 *code = OP_RECURSE; 7595 PUT(code, 1, (int)(called - cd->start_code)); 7596 code += 1 + LINK_SIZE; 7597 groupsetfirstchar = FALSE; 7598 } 7599 7600 /* Can't determine a first byte now */ 7601 7602 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE; 7603 continue; 7604 7605 7606 /* ------------------------------------------------------------ */ 7607 default: /* Other characters: check option setting */ 7608 OTHER_CHAR_AFTER_QUERY: 7609 set = unset = 0; 7610 optset = &set; 7611 7612 while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON) 7613 { 7614 switch (*ptr++) 7615 { 7616 case CHAR_MINUS: optset = &unset; break; 7617 7618 case CHAR_J: /* Record that it changed in the external options */ 7619 *optset |= PCRE_DUPNAMES; 7620 cd->external_flags |= PCRE_JCHANGED; 7621 break; 7622 7623 case CHAR_i: *optset |= PCRE_CASELESS; break; 7624 case CHAR_m: *optset |= PCRE_MULTILINE; break; 7625 case CHAR_s: *optset |= PCRE_DOTALL; break; 7626 case CHAR_x: *optset |= PCRE_EXTENDED; break; 7627 case CHAR_U: *optset |= PCRE_UNGREEDY; break; 7628 case CHAR_X: *optset |= PCRE_EXTRA; break; 7629 7630 default: *errorcodeptr = ERR12; 7631 ptr--; /* Correct the offset */ 7632 goto FAILED; 7633 } 7634 } 7635 7636 /* Set up the changed option bits, but don't change anything yet. */ 7637 7638 newoptions = (options | set) & (~unset); 7639 7640 /* If the options ended with ')' this is not the start of a nested 7641 group with option changes, so the options change at this level. 
7642 If we are not at the pattern start, reset the greedy defaults and the 7643 case value for firstchar and reqchar. */ 7644 7645 if (*ptr == CHAR_RIGHT_PARENTHESIS) 7646 { 7647 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0); 7648 greedy_non_default = greedy_default ^ 1; 7649 req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS:0; 7650 7651 /* Change options at this level, and pass them back for use 7652 in subsequent branches. */ 7653 7654 *optionsptr = options = newoptions; 7655 previous = NULL; /* This item can't be repeated */ 7656 continue; /* It is complete */ 7657 } 7658 7659 /* If the options ended with ':' we are heading into a nested group 7660 with possible change of options. Such groups are non-capturing and are 7661 not assertions of any kind. All we need to do is skip over the ':'; 7662 the newoptions value is handled below. */ 7663 7664 bravalue = OP_BRA; 7665 ptr++; 7666 } /* End of switch for character following (? */ 7667 } /* End of (? handling */ 7668 7669 /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE 7670 is set, all unadorned brackets become non-capturing and behave like (?:...) 7671 brackets. */ 7672 7673 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0) 7674 { 7675 bravalue = OP_BRA; 7676 } 7677 7678 /* Else we have a capturing group. */ 7679 7680 else 7681 { 7682 NUMBERED_GROUP: 7683 cd->bracount += 1; 7684 PUT2(code, 1+LINK_SIZE, cd->bracount); 7685 skipbytes = IMM2_SIZE; 7686 } 7687 7688 /* Process nested bracketed regex. First check for parentheses nested too 7689 deeply. */ 7690 7691 if ((cd->parens_depth += 1) > PARENS_NEST_LIMIT) 7692 { 7693 *errorcodeptr = ERR82; 7694 goto FAILED; 7695 } 7696 7697 /* All assertions used not to be repeatable, but this was changed for Perl 7698 compatibility. All kinds can now be repeated except for assertions that are 7699 conditions (Perl also forbids these to be repeated). 
We copy code into a 7700 non-register variable (tempcode) in order to be able to pass its address 7701 because some compilers complain otherwise. At the start of a conditional 7702 group whose condition is an assertion, cd->iscondassert is set. We unset it 7703 here so as to allow assertions later in the group to be quantified. */ 7704 7705 if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT && 7706 cd->iscondassert) 7707 { 7708 previous = NULL; 7709 cd->iscondassert = FALSE; 7710 } 7711 else 7712 { 7713 previous = code; 7714 item_hwm_offset = cd->hwm - cd->start_workspace; 7715 } 7716 7717 *code = bravalue; 7718 tempcode = code; 7719 tempreqvary = cd->req_varyopt; /* Save value before bracket */ 7720 tempbracount = cd->bracount; /* Save value before bracket */ 7721 length_prevgroup = 0; /* Initialize for pre-compile phase */ 7722 7723 if (!compile_regex( 7724 newoptions, /* The complete new option state */ 7725 &tempcode, /* Where to put code (updated) */ 7726 &ptr, /* Input pointer (updated) */ 7727 errorcodeptr, /* Where to put an error message */ 7728 (bravalue == OP_ASSERTBACK || 7729 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */ 7730 reset_bracount, /* True if (?| group */ 7731 skipbytes, /* Skip over bracket number */ 7732 cond_depth + 7733 ((bravalue == OP_COND)?1:0), /* Depth of condition subpatterns */ 7734 &subfirstchar, /* For possible first char */ 7735 &subfirstcharflags, 7736 &subreqchar, /* For possible last char */ 7737 &subreqcharflags, 7738 bcptr, /* Current branch chain */ 7739 cd, /* Tables block */ 7740 (lengthptr == NULL)? NULL : /* Actual compile phase */ 7741 &length_prevgroup /* Pre-compile phase */ 7742 )) 7743 goto FAILED; 7744 7745 cd->parens_depth -= 1; 7746 7747 /* If this was an atomic group and there are no capturing groups within it, 7748 generate OP_ONCE_NC instead of OP_ONCE. 
*/ 7749 7750 if (bravalue == OP_ONCE && cd->bracount <= tempbracount) 7751 *code = OP_ONCE_NC; 7752 7753 if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT) 7754 cd->assert_depth -= 1; 7755 7756 /* At the end of compiling, code is still pointing to the start of the 7757 group, while tempcode has been updated to point past the end of the group. 7758 The pattern pointer (ptr) is on the bracket. 7759 7760 If this is a conditional bracket, check that there are no more than 7761 two branches in the group, or just one if it's a DEFINE group. We do this 7762 in the real compile phase, not in the pre-pass, where the whole group may 7763 not be available. */ 7764 7765 if (bravalue == OP_COND && lengthptr == NULL) 7766 { 7767 pcre_uchar *tc = code; 7768 int condcount = 0; 7769 7770 do { 7771 condcount++; 7772 tc += GET(tc,1); 7773 } 7774 while (*tc != OP_KET); 7775 7776 /* A DEFINE group is never obeyed inline (the "condition" is always 7777 false). It must have only one branch. */ 7778 7779 if (code[LINK_SIZE+1] == OP_DEF) 7780 { 7781 if (condcount > 1) 7782 { 7783 *errorcodeptr = ERR54; 7784 goto FAILED; 7785 } 7786 bravalue = OP_DEF; /* Just a flag to suppress char handling below */ 7787 } 7788 7789 /* A "normal" conditional group. If there is just one branch, we must not 7790 make use of its firstchar or reqchar, because this is equivalent to an 7791 empty second branch. */ 7792 7793 else 7794 { 7795 if (condcount > 2) 7796 { 7797 *errorcodeptr = ERR27; 7798 goto FAILED; 7799 } 7800 if (condcount == 1) subfirstcharflags = subreqcharflags = REQ_NONE; 7801 } 7802 } 7803 7804 /* Error if hit end of pattern */ 7805 7806 if (*ptr != CHAR_RIGHT_PARENTHESIS) 7807 { 7808 *errorcodeptr = ERR14; 7809 goto FAILED; 7810 } 7811 7812 /* In the pre-compile phase, update the length by the length of the group, 7813 less the brackets at either end. 
Then reduce the compiled code to just a 7814 set of non-capturing brackets so that it doesn't use much memory if it is 7815 duplicated by a quantifier.*/ 7816 7817 if (lengthptr != NULL) 7818 { 7819 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE) 7820 { 7821 *errorcodeptr = ERR20; 7822 goto FAILED; 7823 } 7824 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE; 7825 code++; /* This already contains bravalue */ 7826 PUTINC(code, 0, 1 + LINK_SIZE); 7827 *code++ = OP_KET; 7828 PUTINC(code, 0, 1 + LINK_SIZE); 7829 break; /* No need to waste time with special character handling */ 7830 } 7831 7832 /* Otherwise update the main code pointer to the end of the group. */ 7833 7834 code = tempcode; 7835 7836 /* For a DEFINE group, required and first character settings are not 7837 relevant. */ 7838 7839 if (bravalue == OP_DEF) break; 7840 7841 /* Handle updating of the required and first characters for other types of 7842 group. Update for normal brackets of all kinds, and conditions with two 7843 branches (see code above). If the bracket is followed by a quantifier with 7844 zero repeat, we have to back off. Hence the definition of zeroreqchar and 7845 zerofirstchar outside the main loop so that they can be accessed for the 7846 back off. */ 7847 7848 zeroreqchar = reqchar; 7849 zeroreqcharflags = reqcharflags; 7850 zerofirstchar = firstchar; 7851 zerofirstcharflags = firstcharflags; 7852 groupsetfirstchar = FALSE; 7853 7854 if (bravalue >= OP_ONCE) 7855 { 7856 /* If we have not yet set a firstchar in this branch, take it from the 7857 subpattern, remembering that it was set here so that a repeat of more 7858 than one can replicate it as reqchar if necessary. If the subpattern has 7859 no firstchar, set "none" for the whole branch. In both cases, a zero 7860 repeat forces firstchar to "none". 
*/ 7861 7862 if (firstcharflags == REQ_UNSET) 7863 { 7864 if (subfirstcharflags >= 0) 7865 { 7866 firstchar = subfirstchar; 7867 firstcharflags = subfirstcharflags; 7868 groupsetfirstchar = TRUE; 7869 } 7870 else firstcharflags = REQ_NONE; 7871 zerofirstcharflags = REQ_NONE; 7872 } 7873 7874 /* If firstchar was previously set, convert the subpattern's firstchar 7875 into reqchar if there wasn't one, using the vary flag that was in 7876 existence beforehand. */ 7877 7878 else if (subfirstcharflags >= 0 && subreqcharflags < 0) 7879 { 7880 subreqchar = subfirstchar; 7881 subreqcharflags = subfirstcharflags | tempreqvary; 7882 } 7883 7884 /* If the subpattern set a required byte (or set a first byte that isn't 7885 really the first byte - see above), set it. */ 7886 7887 if (subreqcharflags >= 0) 7888 { 7889 reqchar = subreqchar; 7890 reqcharflags = subreqcharflags; 7891 } 7892 } 7893 7894 /* For a forward assertion, we take the reqchar, if set. This can be 7895 helpful if the pattern that follows the assertion doesn't set a different 7896 char. For example, it's useful for /(?=abcde).+/. We can't set firstchar 7897 for an assertion, however because it leads to incorrect effect for patterns 7898 such as /(?=a)a.+/ when the "real" "a" would then become a reqchar instead 7899 of a firstchar. This is overcome by a scan at the end if there's no 7900 firstchar, looking for an asserted first char. */ 7901 7902 else if (bravalue == OP_ASSERT && subreqcharflags >= 0) 7903 { 7904 reqchar = subreqchar; 7905 reqcharflags = subreqcharflags; 7906 } 7907 break; /* End of processing '(' */ 7908 7909 7910 /* ===================================================================*/ 7911 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values 7912 are arranged to be the negation of the corresponding OP_values in the 7913 default case when PCRE_UCP is not set. For the back references, the values 7914 are negative the reference number. 
Only back references and those types 7915 that consume a character may be repeated. We can test for values between 7916 ESC_b and ESC_Z for the latter; this may have to change if any new ones are 7917 ever created. */ 7918 7919 case CHAR_BACKSLASH: 7920 tempptr = ptr; 7921 escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options, FALSE); 7922 if (*errorcodeptr != 0) goto FAILED; 7923 7924 if (escape == 0) /* The escape coded a single character */ 7925 c = ec; 7926 else 7927 { 7928 /* For metasequences that actually match a character, we disable the 7929 setting of a first character if it hasn't already been set. */ 7930 7931 if (firstcharflags == REQ_UNSET && escape > ESC_b && escape < ESC_Z) 7932 firstcharflags = REQ_NONE; 7933 7934 /* Set values to reset to if this is followed by a zero repeat. */ 7935 7936 zerofirstchar = firstchar; 7937 zerofirstcharflags = firstcharflags; 7938 zeroreqchar = reqchar; 7939 zeroreqcharflags = reqcharflags; 7940 7941 /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n' 7942 is a subroutine call by number (Oniguruma syntax). In fact, the value 7943 ESC_g is returned only for these cases. So we don't need to check for < 7944 or ' if the value is ESC_g. For the Perl syntax \g{n} the value is 7945 -n, and for the Perl syntax \g{name} the result is ESC_k (as 7946 that is a synonym for a named back reference). */ 7947 7948 if (escape == ESC_g) 7949 { 7950 const pcre_uchar *p; 7951 pcre_uint32 cf; 7952 7953 item_hwm_offset = cd->hwm - cd->start_workspace; /* Normally this is set when '(' is read */ 7954 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)? 7955 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE; 7956 7957 /* These two statements stop the compiler for warning about possibly 7958 unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In 7959 fact, because we do the check for a number below, the paths that 7960 would actually be in error are never taken. 
*/ 7961 7962 skipbytes = 0; 7963 reset_bracount = FALSE; 7964 7965 /* If it's not a signed or unsigned number, treat it as a name. */ 7966 7967 cf = ptr[1]; 7968 if (cf != CHAR_PLUS && cf != CHAR_MINUS && !IS_DIGIT(cf)) 7969 { 7970 is_recurse = TRUE; 7971 goto NAMED_REF_OR_RECURSE; 7972 } 7973 7974 /* Signed or unsigned number (cf = ptr[1]) is known to be plus or minus 7975 or a digit. */ 7976 7977 p = ptr + 2; 7978 while (IS_DIGIT(*p)) p++; 7979 if (*p != (pcre_uchar)terminator) 7980 { 7981 *errorcodeptr = ERR57; 7982 goto FAILED; 7983 } 7984 ptr++; 7985 goto HANDLE_NUMERICAL_RECURSION; 7986 } 7987 7988 /* \k<name> or \k'name' is a back reference by name (Perl syntax). 7989 We also support \k{name} (.NET syntax). */ 7990 7991 if (escape == ESC_k) 7992 { 7993 if ((ptr[1] != CHAR_LESS_THAN_SIGN && 7994 ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET)) 7995 { 7996 *errorcodeptr = ERR69; 7997 goto FAILED; 7998 } 7999 is_recurse = FALSE; 8000 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)? 8001 CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)? 8002 CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET; 8003 goto NAMED_REF_OR_RECURSE; 8004 } 8005 8006 /* Back references are handled specially; must disable firstchar if 8007 not set to cope with cases like (?=(\w+))\1: which would otherwise set 8008 ':' later. */ 8009 8010 if (escape < 0) 8011 { 8012 open_capitem *oc; 8013 recno = -escape; 8014 8015 /* Come here from named backref handling when the reference is to a 8016 single group (i.e. not to a duplicated name. */ 8017 8018 HANDLE_REFERENCE: 8019 if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE; 8020 previous = code; 8021 item_hwm_offset = cd->hwm - cd->start_workspace; 8022 *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF; 8023 PUT2INC(code, 0, recno); 8024 cd->backref_map |= (recno < 32)? 
(1 << recno) : 1; 8025 if (recno > cd->top_backref) cd->top_backref = recno; 8026 8027 /* Check to see if this back reference is recursive, that it, it 8028 is inside the group that it references. A flag is set so that the 8029 group can be made atomic. */ 8030 8031 for (oc = cd->open_caps; oc != NULL; oc = oc->next) 8032 { 8033 if (oc->number == recno) 8034 { 8035 oc->flag = TRUE; 8036 break; 8037 } 8038 } 8039 } 8040 8041 /* So are Unicode property matches, if supported. */ 8042 8043#ifdef SUPPORT_UCP 8044 else if (escape == ESC_P || escape == ESC_p) 8045 { 8046 BOOL negated; 8047 unsigned int ptype = 0, pdata = 0; 8048 if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr)) 8049 goto FAILED; 8050 previous = code; 8051 item_hwm_offset = cd->hwm - cd->start_workspace; 8052 *code++ = ((escape == ESC_p) != negated)? OP_PROP : OP_NOTPROP; 8053 *code++ = ptype; 8054 *code++ = pdata; 8055 } 8056#else 8057 8058 /* If Unicode properties are not supported, \X, \P, and \p are not 8059 allowed. */ 8060 8061 else if (escape == ESC_X || escape == ESC_P || escape == ESC_p) 8062 { 8063 *errorcodeptr = ERR45; 8064 goto FAILED; 8065 } 8066#endif 8067 8068 /* For the rest (including \X when Unicode properties are supported), we 8069 can obtain the OP value by negating the escape value in the default 8070 situation when PCRE_UCP is not set. When it *is* set, we substitute 8071 Unicode property tests. Note that \b and \B do a one-character 8072 lookbehind, and \A also behaves as if it does. 
*/ 8073 8074 else 8075 { 8076 if ((escape == ESC_b || escape == ESC_B || escape == ESC_A) && 8077 cd->max_lookbehind == 0) 8078 cd->max_lookbehind = 1; 8079#ifdef SUPPORT_UCP 8080 if (escape >= ESC_DU && escape <= ESC_wu) 8081 { 8082 nestptr = ptr + 1; /* Where to resume */ 8083 ptr = substitutes[escape - ESC_DU] - 1; /* Just before substitute */ 8084 } 8085 else 8086#endif 8087 /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE 8088 so that it works in DFA mode and in lookbehinds. */ 8089 8090 { 8091 previous = (escape > ESC_b && escape < ESC_Z)? code : NULL; 8092 item_hwm_offset = cd->hwm - cd->start_workspace; 8093 *code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape; 8094 } 8095 } 8096 continue; 8097 } 8098 8099 /* We have a data character whose value is in c. In UTF-8 mode it may have 8100 a value > 127. We set its representation in the length/buffer, and then 8101 handle it as a data character. */ 8102 8103#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 8104 if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR) 8105 mclength = PRIV(ord2utf)(c, mcbuffer); 8106 else 8107#endif 8108 8109 { 8110 mcbuffer[0] = c; 8111 mclength = 1; 8112 } 8113 goto ONE_CHAR; 8114 8115 8116 /* ===================================================================*/ 8117 /* Handle a literal character. It is guaranteed not to be whitespace or # 8118 when the extended flag is set. If we are in a UTF mode, it may be a 8119 multi-unit literal character. */ 8120 8121 default: 8122 NORMAL_CHAR: 8123 mclength = 1; 8124 mcbuffer[0] = c; 8125 8126#ifdef SUPPORT_UTF 8127 if (utf && HAS_EXTRALEN(c)) 8128 ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr)); 8129#endif 8130 8131 /* At this point we have the character's bytes in mcbuffer, and the length 8132 in mclength. When not in UTF-8 mode, the length is always 1. 
*/ 8133 8134 ONE_CHAR: 8135 previous = code; 8136 item_hwm_offset = cd->hwm - cd->start_workspace; 8137 8138 /* For caseless UTF-8 mode when UCP support is available, check whether 8139 this character has more than one other case. If so, generate a special 8140 OP_PROP item instead of OP_CHARI. */ 8141 8142#ifdef SUPPORT_UCP 8143 if (utf && (options & PCRE_CASELESS) != 0) 8144 { 8145 GETCHAR(c, mcbuffer); 8146 if ((c = UCD_CASESET(c)) != 0) 8147 { 8148 *code++ = OP_PROP; 8149 *code++ = PT_CLIST; 8150 *code++ = c; 8151 if (firstcharflags == REQ_UNSET) 8152 firstcharflags = zerofirstcharflags = REQ_NONE; 8153 break; 8154 } 8155 } 8156#endif 8157 8158 /* Caseful matches, or not one of the multicase characters. */ 8159 8160 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARI : OP_CHAR; 8161 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c]; 8162 8163 /* Remember if \r or \n were seen */ 8164 8165 if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL) 8166 cd->external_flags |= PCRE_HASCRORLF; 8167 8168 /* Set the first and required bytes appropriately. If no previous first 8169 byte, set it from this character, but revert to none on a zero repeat. 8170 Otherwise, leave the firstchar value alone, and don't change it on a zero 8171 repeat. */ 8172 8173 if (firstcharflags == REQ_UNSET) 8174 { 8175 zerofirstcharflags = REQ_NONE; 8176 zeroreqchar = reqchar; 8177 zeroreqcharflags = reqcharflags; 8178 8179 /* If the character is more than one byte long, we can set firstchar 8180 only if it is not to be matched caselessly. */ 8181 8182 if (mclength == 1 || req_caseopt == 0) 8183 { 8184 firstchar = mcbuffer[0] | req_caseopt; 8185 firstchar = mcbuffer[0]; 8186 firstcharflags = req_caseopt; 8187 8188 if (mclength != 1) 8189 { 8190 reqchar = code[-1]; 8191 reqcharflags = cd->req_varyopt; 8192 } 8193 } 8194 else firstcharflags = reqcharflags = REQ_NONE; 8195 } 8196 8197 /* firstchar was previously set; we can set reqchar only if the length is 8198 1 or the matching is caseful. 
*/ 8199 8200 else 8201 { 8202 zerofirstchar = firstchar; 8203 zerofirstcharflags = firstcharflags; 8204 zeroreqchar = reqchar; 8205 zeroreqcharflags = reqcharflags; 8206 if (mclength == 1 || req_caseopt == 0) 8207 { 8208 reqchar = code[-1]; 8209 reqcharflags = req_caseopt | cd->req_varyopt; 8210 } 8211 } 8212 8213 break; /* End of literal character handling */ 8214 } 8215 } /* end of big loop */ 8216 8217 8218/* Control never reaches here by falling through, only by a goto for all the 8219error states. Pass back the position in the pattern so that it can be displayed 8220to the user for diagnosing the error. */ 8221 8222FAILED: 8223*ptrptr = ptr; 8224return FALSE; 8225} 8226 8227 8228 8229/************************************************* 8230* Compile sequence of alternatives * 8231*************************************************/ 8232 8233/* On entry, ptr is pointing past the bracket character, but on return it 8234points to the closing bracket, or vertical bar, or end of string. The code 8235variable is pointing at the byte into which the BRA operator has been stored. 8236This function is used during the pre-compile phase when we are trying to find 8237out the amount of memory needed, as well as during the real compile phase. The 8238value of lengthptr distinguishes the two phases. 
Arguments:
  options           option bits, including any changes for this subpattern
  codeptr           -> the address of the current code pointer
  ptrptr            -> the address of the current pattern pointer
  errorcodeptr      -> pointer to error code variable
  lookbehind        TRUE if this is a lookbehind assertion
  reset_bracount    TRUE to reset the count for each branch
  skipbytes         skip this many bytes at start (for brackets and OP_COND)
  cond_depth        depth of nesting for conditional subpatterns
  firstcharptr      place to put the first required character
  firstcharflagsptr place to put the first character flags, or a negative number
  reqcharptr        place to put the last required character
  reqcharflagsptr   place to put the last required character flags, or a negative number
  bcptr             pointer to the chain of currently open branches
  cd                points to the data block with tables pointers etc.
  lengthptr         NULL during the real compile phase
                    points to length accumulator during pre-compile phase

Returns:            TRUE on success; FALSE on failure. On the failure paths
                    inside the branch loop *ptrptr is updated to the current
                    pattern position before returning; the stack-guard failure
                    returns without touching *ptrptr. The output pointers
                    (codeptr, firstcharptr, etc.) are written only on success.
*/

static BOOL
compile_regex(int options, pcre_uchar **codeptr, const pcre_uchar **ptrptr,
  int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
  int cond_depth,
  pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
  pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
  branch_chain *bcptr, compile_data *cd, int *lengthptr)
{
const pcre_uchar *ptr = *ptrptr;
pcre_uchar *code = *codeptr;
pcre_uchar *last_branch = code;       /* Start of the most recent branch */
pcre_uchar *start_bracket = code;     /* The opening bracket item itself */
pcre_uchar *reverse_count = NULL;     /* Where to store a lookbehind length */
open_capitem capitem;
int capnumber = 0;                    /* Stays 0 unless *code is OP_CBRA */
pcre_uint32 firstchar, reqchar;
pcre_int32 firstcharflags, reqcharflags;
pcre_uint32 branchfirstchar, branchreqchar;
pcre_int32 branchfirstcharflags, branchreqcharflags;
int length;
unsigned int orig_bracount;
unsigned int max_bracount;
branch_chain bc;
size_t save_hwm_offset;

/* If set, call the external function that checks for stack availability. A
non-zero result from it is treated as failure (ERR85). */

if (PUBL(stack_guard) != NULL && PUBL(stack_guard)())
  {
  *errorcodeptr= ERR85;
  return FALSE;
  }

/* Miscellaneous initialization */

bc.outer = bcptr;
bc.current_branch = code;

firstchar = reqchar = 0;
firstcharflags = reqcharflags = REQ_UNSET;

/* Remember the workspace high-water mark as an offset; it is passed to
adjust_recurse() below if the group has to be wrapped in atomic brackets. */

save_hwm_offset = cd->hwm - cd->start_workspace;

/* Accumulate the length for use in the pre-compile phase. Start with the
length of the BRA and KET and any extra bytes that are required at the
beginning. We accumulate in a local variable to save frequent testing of
lengthptr for NULL. We cannot do this by looking at the value of code at the
start and end of each alternative, because compiled items are discarded during
the pre-compile phase so that the work space is not exceeded. */

length = 2 + 2*LINK_SIZE + skipbytes;

/* WARNING: If the above line is changed for any reason, you must also change
the code that abstracts option settings at the start of the pattern and makes
them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
pre-compile phase to find out whether anything has yet been compiled or not. */

/* If this is a capturing subpattern, add to the chain of open capturing items
so that we can detect them if (*ACCEPT) is encountered. This is also used to
detect groups that contain recursive back references to themselves. Note that
only OP_CBRA need be tested here; changing this opcode to one of its variants,
e.g. OP_SCBRAPOS, happens later, after the group has been compiled. The chain
entry is a local variable of this function; it is unlinked again on the
successful return path below. */

if (*code == OP_CBRA)
  {
  capnumber = GET2(code, 1 + LINK_SIZE);
  capitem.number = capnumber;
  capitem.next = cd->open_caps;
  capitem.flag = FALSE;
  cd->open_caps = &capitem;
  }

/* Offset is set zero to mark that this bracket is still open */

PUT(code, 1, 0);
code += 1 + LINK_SIZE + skipbytes;

/* Loop for each alternative branch */

orig_bracount = max_bracount = cd->bracount;
for (;;)
  {
  /* For a (?| group, reset the capturing bracket count so that each branch
  uses the same numbers. */

  if (reset_bracount) cd->bracount = orig_bracount;

  /* Set up dummy OP_REVERSE if lookbehind assertion. The count field is
  written as zero here and filled in after the branch has been compiled. */

  if (lookbehind)
    {
    *code++ = OP_REVERSE;
    reverse_count = code;
    PUTINC(code, 0, 0);
    length += 1 + LINK_SIZE;
    }

  /* Now compile the branch; in the pre-compile phase its length gets added
  into the length. */

  if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstchar,
        &branchfirstcharflags, &branchreqchar, &branchreqcharflags, &bc,
        cond_depth, cd, (lengthptr == NULL)? NULL : &length))
    {
    *ptrptr = ptr;
    return FALSE;
    }

  /* Keep the highest bracket count in case (?| was used and some branch
  has fewer than the rest. */

  if (cd->bracount > max_bracount) max_bracount = cd->bracount;

  /* In the real compile phase, there is some post-processing to be done. */

  if (lengthptr == NULL)
    {
    /* If this is the first branch, the firstchar and reqchar values for the
    branch become the values for the regex. (last_branch still points at the
    opening bracket, not at an OP_ALT, while the first branch is current.) */

    if (*last_branch != OP_ALT)
      {
      firstchar = branchfirstchar;
      firstcharflags = branchfirstcharflags;
      reqchar = branchreqchar;
      reqcharflags = branchreqcharflags;
      }

    /* If this is not the first branch, the first char and reqchar have to
    match the values from all the previous branches, except that if the
    previous value for reqchar didn't have REQ_VARY set, it can still match,
    and we set REQ_VARY for the regex. */

    else
      {
      /* If we previously had a firstchar, but it doesn't match the new branch,
      we have to abandon the firstchar for the regex, but if there was
      previously no reqchar, it takes on the value of the old firstchar. */

      if (firstcharflags >= 0 &&
          (firstcharflags != branchfirstcharflags || firstchar != branchfirstchar))
        {
        if (reqcharflags < 0)
          {
          reqchar = firstchar;
          reqcharflags = firstcharflags;
          }
        firstcharflags = REQ_NONE;
        }

      /* If we (now or from before) have no firstchar, a firstchar from the
      branch becomes a reqchar if there isn't a branch reqchar. */

      if (firstcharflags < 0 && branchfirstcharflags >= 0 && branchreqcharflags < 0)
        {
        branchreqchar = branchfirstchar;
        branchreqcharflags = branchfirstcharflags;
        }

      /* Now ensure that the reqchars match */

      if (((reqcharflags & ~REQ_VARY) != (branchreqcharflags & ~REQ_VARY)) ||
          reqchar != branchreqchar)
        reqcharflags = REQ_NONE;
      else
        {
        reqchar = branchreqchar;
        reqcharflags |= branchreqcharflags; /* To "or" REQ_VARY */
        }
      }

    /* If lookbehind, check that this branch matches a fixed-length string, and
    put the length into the OP_REVERSE item. Temporarily mark the end of the
    branch with OP_END. If the branch contains OP_RECURSE, the result is -3
    because there may be forward references that we can't check here. Set a
    flag to cause another lookbehind check at the end. Why not do it all at the
    end? Because common, erroneous checks are picked up here and the offset of
    the problem can be shown. Other negative results are errors: -2 gives
    ERR36, -4 gives ERR70, and anything else gives ERR25. */

    if (lookbehind)
      {
      int fixed_length;
      *code = OP_END;
      fixed_length = find_fixedlength(last_branch,  (options & PCRE_UTF8) != 0,
        FALSE, cd, NULL);
      DPRINTF(("fixed length = %d\n", fixed_length));
      if (fixed_length == -3)
        {
        cd->check_lookbehind = TRUE;
        }
      else if (fixed_length < 0)
        {
        *errorcodeptr = (fixed_length == -2)? ERR36 :
                        (fixed_length == -4)? ERR70: ERR25;
        *ptrptr = ptr;
        return FALSE;
        }
      else
        {
        if (fixed_length > cd->max_lookbehind)
          cd->max_lookbehind = fixed_length;
        PUT(reverse_count, 0, fixed_length);
        }
      }
    }

  /* Reached end of expression, either ')' or end of pattern. In the real
  compile phase, go back through the alternative branches and reverse the chain
  of offsets, with the field in the BRA item now becoming an offset to the
  first alternative. If there are no alternatives, it points to the end of the
  group. The length in the terminating ket is always the length of the whole
  bracketed item. Return leaving the pointer at the terminating char. */

  if (*ptr != CHAR_VERTICAL_LINE)
    {
    if (lengthptr == NULL)
      {
      /* Each link field currently holds the distance back to the previous
      branch (zero in the opening bracket, which ends the loop). Replace it
      with the distance forward to the next branch or the ket. */

      int branch_length = (int)(code - last_branch);
      do
        {
        int prev_length = GET(last_branch, 1);
        PUT(last_branch, 1, branch_length);
        branch_length = prev_length;
        last_branch -= branch_length;
        }
      while (branch_length > 0);
      }

    /* Fill in the ket */

    *code = OP_KET;
    PUT(code, 1, (int)(code - start_bracket));
    code += 1 + LINK_SIZE;

    /* If it was a capturing subpattern, check to see if it contained any
    recursive back references. If so, we must wrap it in atomic brackets.
    Because we are moving code along, we must ensure that any pending recursive
    references are updated. In any event, remove the block from the chain. */

    if (capnumber > 0)
      {
      if (cd->open_caps->flag)
        {
        /* Shift the whole group up by 1 + LINK_SIZE units to make room for
        an OP_ONCE header in front of it, then append a second OP_KET that
        closes the OP_ONCE. The extra items are also added to length. */

        *code = OP_END;
        adjust_recurse(start_bracket, 1 + LINK_SIZE,
          (options & PCRE_UTF8) != 0, cd, save_hwm_offset);
        memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
          IN_UCHARS(code - start_bracket));
        *start_bracket = OP_ONCE;
        code += 1 + LINK_SIZE;
        PUT(start_bracket, 1, (int)(code - start_bracket));
        *code = OP_KET;
        PUT(code, 1, (int)(code - start_bracket));
        code += 1 + LINK_SIZE;
        length += 2 + 2*LINK_SIZE;
        }
      cd->open_caps = cd->open_caps->next;
      }

    /* Retain the highest bracket number, in case resetting was used. */

    cd->bracount = max_bracount;

    /* Set values to pass back */

    *codeptr = code;
    *ptrptr = ptr;
    *firstcharptr = firstchar;
    *firstcharflagsptr = firstcharflags;
    *reqcharptr = reqchar;
    *reqcharflagsptr = reqcharflags;
    if (lengthptr != NULL)
      {
      /* Refuse to let the accumulated pattern length exceed OFLOW_MAX. */

      if (OFLOW_MAX - *lengthptr < length)
        {
        *errorcodeptr = ERR20;
        return FALSE;
        }
      *lengthptr += length;
      }
    return TRUE;
    }

  /* Another branch follows. In the pre-compile phase, we can move the code
  pointer back to where it was for the start of the first branch. (That is,
  pretend that each branch is the only one.)

  In the real compile phase, insert an ALT node. Its length field points back
  to the previous branch while the bracket remains open. At the end the chain
  is reversed. It's done like this so that the start of the bracket has a
  zero offset until it is closed, making it possible to detect recursion. */

  if (lengthptr != NULL)
    {
    code = *codeptr + 1 + LINK_SIZE + skipbytes;
    length += 1 + LINK_SIZE;
    }
  else
    {
    *code = OP_ALT;
    PUT(code, 1, (int)(code - last_branch));
    bc.current_branch = last_branch = code;
    code += 1 + LINK_SIZE;
    }

  ptr++;   /* Step past the vertical bar */
  }
/* Control never reaches here */
}




/*************************************************
*        Check for anchored expression           *
*************************************************/

/* Try to find out if this is an anchored regular expression. Consider each
alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
it's anchored. However, if this is a multiline pattern, then only OP_SOD will
be found, because ^ generates OP_CIRCM in that mode.

We can also consider a regex to be anchored if OP_SOM starts all its branches.
This is the code for \G, which means "match at start of match position, taking
into account the match offset".

A branch is also implicitly anchored if it starts with .* and DOTALL is set,
because that will try the rest of the pattern at all possible matching points,
so there is no point trying again.... er ....

.... except when the .* appears inside capturing parentheses, and there is a
subsequent back reference to those parentheses. We haven't enough information
to catch that case precisely.

At first, the best we could do was to detect when .* was in capturing brackets
and the highest back reference was greater than or equal to that level.
However, by keeping a bitmap of the first 31 back references, we can catch some
of the more common cases more precisely.

...
A second exception is when the .* appears inside an atomic group, because 8602this prevents the number of characters it matches from being adjusted. 8603 8604Arguments: 8605 code points to start of expression (the bracket) 8606 bracket_map a bitmap of which brackets we are inside while testing; this 8607 handles up to substring 31; after that we just have to take 8608 the less precise approach 8609 cd points to the compile data block 8610 atomcount atomic group level 8611 8612Returns: TRUE or FALSE 8613*/ 8614 8615static BOOL 8616is_anchored(register const pcre_uchar *code, unsigned int bracket_map, 8617 compile_data *cd, int atomcount) 8618{ 8619do { 8620 const pcre_uchar *scode = first_significant_code( 8621 code + PRIV(OP_lengths)[*code], FALSE); 8622 register int op = *scode; 8623 8624 /* Non-capturing brackets */ 8625 8626 if (op == OP_BRA || op == OP_BRAPOS || 8627 op == OP_SBRA || op == OP_SBRAPOS) 8628 { 8629 if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE; 8630 } 8631 8632 /* Capturing brackets */ 8633 8634 else if (op == OP_CBRA || op == OP_CBRAPOS || 8635 op == OP_SCBRA || op == OP_SCBRAPOS) 8636 { 8637 int n = GET2(scode, 1+LINK_SIZE); 8638 int new_map = bracket_map | ((n < 32)? (1 << n) : 1); 8639 if (!is_anchored(scode, new_map, cd, atomcount)) return FALSE; 8640 } 8641 8642 /* Positive forward assertions and conditions */ 8643 8644 else if (op == OP_ASSERT || op == OP_COND) 8645 { 8646 if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE; 8647 } 8648 8649 /* Atomic groups */ 8650 8651 else if (op == OP_ONCE || op == OP_ONCE_NC) 8652 { 8653 if (!is_anchored(scode, bracket_map, cd, atomcount + 1)) 8654 return FALSE; 8655 } 8656 8657 /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and 8658 it isn't in brackets that are or may be referenced or inside an atomic 8659 group. 
*/ 8660 8661 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR || 8662 op == OP_TYPEPOSSTAR)) 8663 { 8664 if (scode[1] != OP_ALLANY || (bracket_map & cd->backref_map) != 0 || 8665 atomcount > 0 || cd->had_pruneorskip) 8666 return FALSE; 8667 } 8668 8669 /* Check for explicit anchoring */ 8670 8671 else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE; 8672 8673 code += GET(code, 1); 8674 } 8675while (*code == OP_ALT); /* Loop for each alternative */ 8676return TRUE; 8677} 8678 8679 8680 8681/************************************************* 8682* Check for starting with ^ or .* * 8683*************************************************/ 8684 8685/* This is called to find out if every branch starts with ^ or .* so that 8686"first char" processing can be done to speed things up in multiline 8687matching and for non-DOTALL patterns that start with .* (which must start at 8688the beginning or after \n). As in the case of is_anchored() (see above), we 8689have to take account of back references to capturing brackets that contain .* 8690because in that case we can't make the assumption. Also, the appearance of .* 8691inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not 8692count, because once again the assumption no longer holds. 
8693 8694Arguments: 8695 code points to start of expression (the bracket) 8696 bracket_map a bitmap of which brackets we are inside while testing; this 8697 handles up to substring 31; after that we just have to take 8698 the less precise approach 8699 cd points to the compile data 8700 atomcount atomic group level 8701 8702Returns: TRUE or FALSE 8703*/ 8704 8705static BOOL 8706is_startline(const pcre_uchar *code, unsigned int bracket_map, 8707 compile_data *cd, int atomcount) 8708{ 8709do { 8710 const pcre_uchar *scode = first_significant_code( 8711 code + PRIV(OP_lengths)[*code], FALSE); 8712 register int op = *scode; 8713 8714 /* If we are at the start of a conditional assertion group, *both* the 8715 conditional assertion *and* what follows the condition must satisfy the test 8716 for start of line. Other kinds of condition fail. Note that there may be an 8717 auto-callout at the start of a condition. */ 8718 8719 if (op == OP_COND) 8720 { 8721 scode += 1 + LINK_SIZE; 8722 if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT]; 8723 switch (*scode) 8724 { 8725 case OP_CREF: 8726 case OP_DNCREF: 8727 case OP_RREF: 8728 case OP_DNRREF: 8729 case OP_DEF: 8730 case OP_FAIL: 8731 return FALSE; 8732 8733 default: /* Assertion */ 8734 if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE; 8735 do scode += GET(scode, 1); while (*scode == OP_ALT); 8736 scode += 1 + LINK_SIZE; 8737 break; 8738 } 8739 scode = first_significant_code(scode, FALSE); 8740 op = *scode; 8741 } 8742 8743 /* Non-capturing brackets */ 8744 8745 if (op == OP_BRA || op == OP_BRAPOS || 8746 op == OP_SBRA || op == OP_SBRAPOS) 8747 { 8748 if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE; 8749 } 8750 8751 /* Capturing brackets */ 8752 8753 else if (op == OP_CBRA || op == OP_CBRAPOS || 8754 op == OP_SCBRA || op == OP_SCBRAPOS) 8755 { 8756 int n = GET2(scode, 1+LINK_SIZE); 8757 int new_map = bracket_map | ((n < 32)? 
(1 << n) : 1); 8758 if (!is_startline(scode, new_map, cd, atomcount)) return FALSE; 8759 } 8760 8761 /* Positive forward assertions */ 8762 8763 else if (op == OP_ASSERT) 8764 { 8765 if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE; 8766 } 8767 8768 /* Atomic brackets */ 8769 8770 else if (op == OP_ONCE || op == OP_ONCE_NC) 8771 { 8772 if (!is_startline(scode, bracket_map, cd, atomcount + 1)) return FALSE; 8773 } 8774 8775 /* .* means "start at start or after \n" if it isn't in atomic brackets or 8776 brackets that may be referenced, as long as the pattern does not contain 8777 *PRUNE or *SKIP, because these break the feature. Consider, for example, 8778 /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the 8779 start of a line. */ 8780 8781 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR) 8782 { 8783 if (scode[1] != OP_ANY || (bracket_map & cd->backref_map) != 0 || 8784 atomcount > 0 || cd->had_pruneorskip) 8785 return FALSE; 8786 } 8787 8788 /* Check for explicit circumflex; anything else gives a FALSE result. Note 8789 in particular that this includes atomic brackets OP_ONCE and OP_ONCE_NC 8790 because the number of characters matched by .* cannot be adjusted inside 8791 them. */ 8792 8793 else if (op != OP_CIRC && op != OP_CIRCM) return FALSE; 8794 8795 /* Move on to the next alternative */ 8796 8797 code += GET(code, 1); 8798 } 8799while (*code == OP_ALT); /* Loop for each alternative */ 8800return TRUE; 8801} 8802 8803 8804 8805/************************************************* 8806* Check for asserted fixed first char * 8807*************************************************/ 8808 8809/* During compilation, the "first char" settings from forward assertions are 8810discarded, because they can cause conflicts with actual literals that follow. 
8811However, if we end up without a first char setting for an unanchored pattern, 8812it is worth scanning the regex to see if there is an initial asserted first 8813char. If all branches start with the same asserted char, or with a 8814non-conditional bracket all of whose alternatives start with the same asserted 8815char (recurse ad lib), then we return that char, with the flags set to zero or 8816REQ_CASELESS; otherwise return zero with REQ_NONE in the flags. 8817 8818Arguments: 8819 code points to start of expression (the bracket) 8820 flags points to the first char flags, or to REQ_NONE 8821 inassert TRUE if in an assertion 8822 8823Returns: the fixed first char, or 0 with REQ_NONE in flags 8824*/ 8825 8826static pcre_uint32 8827find_firstassertedchar(const pcre_uchar *code, pcre_int32 *flags, 8828 BOOL inassert) 8829{ 8830register pcre_uint32 c = 0; 8831int cflags = REQ_NONE; 8832 8833*flags = REQ_NONE; 8834do { 8835 pcre_uint32 d; 8836 int dflags; 8837 int xl = (*code == OP_CBRA || *code == OP_SCBRA || 8838 *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? 
IMM2_SIZE:0; 8839 const pcre_uchar *scode = first_significant_code(code + 1+LINK_SIZE + xl, 8840 TRUE); 8841 register pcre_uchar op = *scode; 8842 8843 switch(op) 8844 { 8845 default: 8846 return 0; 8847 8848 case OP_BRA: 8849 case OP_BRAPOS: 8850 case OP_CBRA: 8851 case OP_SCBRA: 8852 case OP_CBRAPOS: 8853 case OP_SCBRAPOS: 8854 case OP_ASSERT: 8855 case OP_ONCE: 8856 case OP_ONCE_NC: 8857 d = find_firstassertedchar(scode, &dflags, op == OP_ASSERT); 8858 if (dflags < 0) 8859 return 0; 8860 if (cflags < 0) { c = d; cflags = dflags; } else if (c != d || cflags != dflags) return 0; 8861 break; 8862 8863 case OP_EXACT: 8864 scode += IMM2_SIZE; 8865 /* Fall through */ 8866 8867 case OP_CHAR: 8868 case OP_PLUS: 8869 case OP_MINPLUS: 8870 case OP_POSPLUS: 8871 if (!inassert) return 0; 8872 if (cflags < 0) { c = scode[1]; cflags = 0; } 8873 else if (c != scode[1]) return 0; 8874 break; 8875 8876 case OP_EXACTI: 8877 scode += IMM2_SIZE; 8878 /* Fall through */ 8879 8880 case OP_CHARI: 8881 case OP_PLUSI: 8882 case OP_MINPLUSI: 8883 case OP_POSPLUSI: 8884 if (!inassert) return 0; 8885 if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; } 8886 else if (c != scode[1]) return 0; 8887 break; 8888 } 8889 8890 code += GET(code, 1); 8891 } 8892while (*code == OP_ALT); 8893 8894*flags = cflags; 8895return c; 8896} 8897 8898 8899 8900/************************************************* 8901* Add an entry to the name/number table * 8902*************************************************/ 8903 8904/* This function is called between compiling passes to add an entry to the 8905name/number table, maintaining alphabetical order. Checking for permitted 8906and forbidden duplicates has already been done. 
8907 8908Arguments: 8909 cd the compile data block 8910 name the name to add 8911 length the length of the name 8912 groupno the group number 8913 8914Returns: nothing 8915*/ 8916 8917static void 8918add_name(compile_data *cd, const pcre_uchar *name, int length, 8919 unsigned int groupno) 8920{ 8921int i; 8922pcre_uchar *slot = cd->name_table; 8923 8924for (i = 0; i < cd->names_found; i++) 8925 { 8926 int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(length)); 8927 if (crc == 0 && slot[IMM2_SIZE+length] != 0) 8928 crc = -1; /* Current name is a substring */ 8929 8930 /* Make space in the table and break the loop for an earlier name. For a 8931 duplicate or later name, carry on. We do this for duplicates so that in the 8932 simple case (when ?(| is not used) they are in order of their numbers. In all 8933 cases they are in the order in which they appear in the pattern. */ 8934 8935 if (crc < 0) 8936 { 8937 memmove(slot + cd->name_entry_size, slot, 8938 IN_UCHARS((cd->names_found - i) * cd->name_entry_size)); 8939 break; 8940 } 8941 8942 /* Continue the loop for a later or duplicate name */ 8943 8944 slot += cd->name_entry_size; 8945 } 8946 8947PUT2(slot, 0, groupno); 8948memcpy(slot + IMM2_SIZE, name, IN_UCHARS(length)); 8949slot[IMM2_SIZE + length] = 0; 8950cd->names_found++; 8951} 8952 8953 8954 8955/************************************************* 8956* Compile a Regular Expression * 8957*************************************************/ 8958 8959/* This function takes a string and returns a pointer to a block of store 8960holding a compiled version of the expression. The original API for this 8961function had no error code return variable; it is retained for backwards 8962compatibility. The new function is given a new name. 
8963 8964Arguments: 8965 pattern the regular expression 8966 options various option bits 8967 errorcodeptr pointer to error code variable (pcre_compile2() only) 8968 can be NULL if you don't want a code value 8969 errorptr pointer to pointer to error text 8970 erroroffset ptr offset in pattern where error was detected 8971 tables pointer to character tables or NULL 8972 8973Returns: pointer to compiled data block, or NULL on error, 8974 with errorptr and erroroffset set 8975*/ 8976 8977#if defined COMPILE_PCRE8 8978PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION 8979pcre_compile(const char *pattern, int options, const char **errorptr, 8980 int *erroroffset, const unsigned char *tables) 8981#elif defined COMPILE_PCRE16 8982PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION 8983pcre16_compile(PCRE_SPTR16 pattern, int options, const char **errorptr, 8984 int *erroroffset, const unsigned char *tables) 8985#elif defined COMPILE_PCRE32 8986PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION 8987pcre32_compile(PCRE_SPTR32 pattern, int options, const char **errorptr, 8988 int *erroroffset, const unsigned char *tables) 8989#endif 8990{ 8991#if defined COMPILE_PCRE8 8992return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables); 8993#elif defined COMPILE_PCRE16 8994return pcre16_compile2(pattern, options, NULL, errorptr, erroroffset, tables); 8995#elif defined COMPILE_PCRE32 8996return pcre32_compile2(pattern, options, NULL, errorptr, erroroffset, tables); 8997#endif 8998} 8999 9000 9001#if defined COMPILE_PCRE8 9002PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION 9003pcre_compile2(const char *pattern, int options, int *errorcodeptr, 9004 const char **errorptr, int *erroroffset, const unsigned char *tables) 9005#elif defined COMPILE_PCRE16 9006PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION 9007pcre16_compile2(PCRE_SPTR16 pattern, int options, int *errorcodeptr, 9008 const char **errorptr, int *erroroffset, const unsigned char *tables) 9009#elif defined COMPILE_PCRE32 9010PCRE_EXP_DEFN 
pcre32 * PCRE_CALL_CONVENTION 9011pcre32_compile2(PCRE_SPTR32 pattern, int options, int *errorcodeptr, 9012 const char **errorptr, int *erroroffset, const unsigned char *tables) 9013#endif 9014{ 9015REAL_PCRE *re; 9016int length = 1; /* For final END opcode */ 9017pcre_int32 firstcharflags, reqcharflags; 9018pcre_uint32 firstchar, reqchar; 9019pcre_uint32 limit_match = PCRE_UINT32_MAX; 9020pcre_uint32 limit_recursion = PCRE_UINT32_MAX; 9021int newline; 9022int errorcode = 0; 9023int skipatstart = 0; 9024BOOL utf; 9025BOOL never_utf = FALSE; 9026size_t size; 9027pcre_uchar *code; 9028const pcre_uchar *codestart; 9029const pcre_uchar *ptr; 9030compile_data compile_block; 9031compile_data *cd = &compile_block; 9032 9033/* This space is used for "compiling" into during the first phase, when we are 9034computing the amount of memory that is needed. Compiled items are thrown away 9035as soon as possible, so that a fairly large buffer should be sufficient for 9036this purpose. The same space is used in the second phase for remembering where 9037to fill in forward references to subpatterns. That may overflow, in which case 9038new memory is obtained from malloc(). */ 9039 9040pcre_uchar cworkspace[COMPILE_WORK_SIZE]; 9041 9042/* This vector is used for remembering name groups during the pre-compile. In a 9043similar way to cworkspace, it can be expanded using malloc() if necessary. */ 9044 9045named_group named_groups[NAMED_GROUP_LIST_SIZE]; 9046 9047/* Set this early so that early errors get offset 0. */ 9048 9049ptr = (const pcre_uchar *)pattern; 9050 9051/* We can't pass back an error message if errorptr is NULL; I guess the best we 9052can do is just return NULL, but we can set a code value if there is a code 9053pointer. 
*/ 9054 9055if (errorptr == NULL) 9056 { 9057 if (errorcodeptr != NULL) *errorcodeptr = 99; 9058 return NULL; 9059 } 9060 9061*errorptr = NULL; 9062if (errorcodeptr != NULL) *errorcodeptr = ERR0; 9063 9064/* However, we can give a message for this error */ 9065 9066if (erroroffset == NULL) 9067 { 9068 errorcode = ERR16; 9069 goto PCRE_EARLY_ERROR_RETURN2; 9070 } 9071 9072*erroroffset = 0; 9073 9074/* Set up pointers to the individual character tables */ 9075 9076if (tables == NULL) tables = PRIV(default_tables); 9077cd->lcc = tables + lcc_offset; 9078cd->fcc = tables + fcc_offset; 9079cd->cbits = tables + cbits_offset; 9080cd->ctypes = tables + ctypes_offset; 9081 9082/* Check that all undefined public option bits are zero */ 9083 9084if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0) 9085 { 9086 errorcode = ERR17; 9087 goto PCRE_EARLY_ERROR_RETURN; 9088 } 9089 9090/* If PCRE_NEVER_UTF is set, remember it. */ 9091 9092if ((options & PCRE_NEVER_UTF) != 0) never_utf = TRUE; 9093 9094/* Check for global one-time settings at the start of the pattern, and remember 9095the offset for later. */ 9096 9097cd->external_flags = 0; /* Initialize here for LIMIT_MATCH/RECURSION */ 9098 9099while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS && 9100 ptr[skipatstart+1] == CHAR_ASTERISK) 9101 { 9102 int newnl = 0; 9103 int newbsr = 0; 9104 9105/* For completeness and backward compatibility, (*UTFn) is supported in the 9106relevant libraries, but (*UTF) is generic and always supported. Note that 9107PCRE_UTF8 == PCRE_UTF16 == PCRE_UTF32. 
*/ 9108 9109#ifdef COMPILE_PCRE8 9110 if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF8_RIGHTPAR, 5) == 0) 9111 { skipatstart += 7; options |= PCRE_UTF8; continue; } 9112#endif 9113#ifdef COMPILE_PCRE16 9114 if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF16_RIGHTPAR, 6) == 0) 9115 { skipatstart += 8; options |= PCRE_UTF16; continue; } 9116#endif 9117#ifdef COMPILE_PCRE32 9118 if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF32_RIGHTPAR, 6) == 0) 9119 { skipatstart += 8; options |= PCRE_UTF32; continue; } 9120#endif 9121 9122 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 4) == 0) 9123 { skipatstart += 6; options |= PCRE_UTF8; continue; } 9124 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0) 9125 { skipatstart += 6; options |= PCRE_UCP; continue; } 9126 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_AUTO_POSSESS_RIGHTPAR, 16) == 0) 9127 { skipatstart += 18; options |= PCRE_NO_AUTO_POSSESS; continue; } 9128 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0) 9129 { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; } 9130 9131 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LIMIT_MATCH_EQ, 12) == 0) 9132 { 9133 pcre_uint32 c = 0; 9134 int p = skipatstart + 14; 9135 while (isdigit(ptr[p])) 9136 { 9137 if (c > PCRE_UINT32_MAX / 10 - 1) break; /* Integer overflow */ 9138 c = c*10 + ptr[p++] - CHAR_0; 9139 } 9140 if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break; 9141 if (c < limit_match) 9142 { 9143 limit_match = c; 9144 cd->external_flags |= PCRE_MLSET; 9145 } 9146 skipatstart = p; 9147 continue; 9148 } 9149 9150 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LIMIT_RECURSION_EQ, 16) == 0) 9151 { 9152 pcre_uint32 c = 0; 9153 int p = skipatstart + 18; 9154 while (isdigit(ptr[p])) 9155 { 9156 if (c > PCRE_UINT32_MAX / 10 - 1) break; /* Integer overflow check */ 9157 c = c*10 + ptr[p++] - CHAR_0; 9158 } 9159 if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break; 9160 if (c < limit_recursion) 9161 { 
9162 limit_recursion = c; 9163 cd->external_flags |= PCRE_RLSET; 9164 } 9165 skipatstart = p; 9166 continue; 9167 } 9168 9169 if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CR_RIGHTPAR, 3) == 0) 9170 { skipatstart += 5; newnl = PCRE_NEWLINE_CR; } 9171 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LF_RIGHTPAR, 3) == 0) 9172 { skipatstart += 5; newnl = PCRE_NEWLINE_LF; } 9173 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CRLF_RIGHTPAR, 5) == 0) 9174 { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; } 9175 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANY_RIGHTPAR, 4) == 0) 9176 { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; } 9177 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANYCRLF_RIGHTPAR, 8) == 0) 9178 { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; } 9179 9180 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0) 9181 { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; } 9182 else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_UNICODE_RIGHTPAR, 12) == 0) 9183 { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; } 9184 9185 if (newnl != 0) 9186 options = (options & ~PCRE_NEWLINE_BITS) | newnl; 9187 else if (newbsr != 0) 9188 options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr; 9189 else break; 9190 } 9191 9192/* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */ 9193utf = (options & PCRE_UTF8) != 0; 9194if (utf && never_utf) 9195 { 9196 errorcode = ERR78; 9197 goto PCRE_EARLY_ERROR_RETURN2; 9198 } 9199 9200/* Can't support UTF unless PCRE has been compiled to include the code. The 9201return of an error code from PRIV(valid_utf)() is a new feature, introduced in 9202release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is 9203not used here. 
*/ 9204 9205#ifdef SUPPORT_UTF 9206if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 && 9207 (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0) 9208 { 9209#if defined COMPILE_PCRE8 9210 errorcode = ERR44; 9211#elif defined COMPILE_PCRE16 9212 errorcode = ERR74; 9213#elif defined COMPILE_PCRE32 9214 errorcode = ERR77; 9215#endif 9216 goto PCRE_EARLY_ERROR_RETURN2; 9217 } 9218#else 9219if (utf) 9220 { 9221 errorcode = ERR32; 9222 goto PCRE_EARLY_ERROR_RETURN; 9223 } 9224#endif 9225 9226/* Can't support UCP unless PCRE has been compiled to include the code. */ 9227 9228#ifndef SUPPORT_UCP 9229if ((options & PCRE_UCP) != 0) 9230 { 9231 errorcode = ERR67; 9232 goto PCRE_EARLY_ERROR_RETURN; 9233 } 9234#endif 9235 9236/* Check validity of \R options. */ 9237 9238if ((options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 9239 (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) 9240 { 9241 errorcode = ERR56; 9242 goto PCRE_EARLY_ERROR_RETURN; 9243 } 9244 9245/* Handle different types of newline. The three bits give seven cases. The 9246current code allows for fixed one- or two-byte sequences, plus "any" and 9247"anycrlf". 
*/ 9248 9249switch (options & PCRE_NEWLINE_BITS) 9250 { 9251 case 0: newline = NEWLINE; break; /* Build-time default */ 9252 case PCRE_NEWLINE_CR: newline = CHAR_CR; break; 9253 case PCRE_NEWLINE_LF: newline = CHAR_NL; break; 9254 case PCRE_NEWLINE_CR+ 9255 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break; 9256 case PCRE_NEWLINE_ANY: newline = -1; break; 9257 case PCRE_NEWLINE_ANYCRLF: newline = -2; break; 9258 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN; 9259 } 9260 9261if (newline == -2) 9262 { 9263 cd->nltype = NLTYPE_ANYCRLF; 9264 } 9265else if (newline < 0) 9266 { 9267 cd->nltype = NLTYPE_ANY; 9268 } 9269else 9270 { 9271 cd->nltype = NLTYPE_FIXED; 9272 if (newline > 255) 9273 { 9274 cd->nllen = 2; 9275 cd->nl[0] = (newline >> 8) & 255; 9276 cd->nl[1] = newline & 255; 9277 } 9278 else 9279 { 9280 cd->nllen = 1; 9281 cd->nl[0] = newline; 9282 } 9283 } 9284 9285/* Maximum back reference and backref bitmap. The bitmap records up to 31 back 9286references to help in deciding whether (.*) can be treated as anchored or not. 9287*/ 9288 9289cd->top_backref = 0; 9290cd->backref_map = 0; 9291 9292/* Reflect pattern for debugging output */ 9293 9294DPRINTF(("------------------------------------------------------------------\n")); 9295#ifdef PCRE_DEBUG 9296print_puchar(stdout, (PCRE_PUCHAR)pattern); 9297#endif 9298DPRINTF(("\n")); 9299 9300/* Pretend to compile the pattern while actually just accumulating the length 9301of memory required. This behaviour is triggered by passing a non-NULL final 9302argument to compile_regex(). We pass a block of workspace (cworkspace) for it 9303to compile parts of the pattern into; the compiled code is discarded when it is 9304no longer needed, so hopefully this workspace will never overflow, though there 9305is a test for its doing so. 
*/ 9306 9307cd->bracount = cd->final_bracount = 0; 9308cd->names_found = 0; 9309cd->name_entry_size = 0; 9310cd->name_table = NULL; 9311cd->dupnames = FALSE; 9312cd->dupgroups = FALSE; 9313cd->namedrefcount = 0; 9314cd->start_code = cworkspace; 9315cd->hwm = cworkspace; 9316cd->iscondassert = FALSE; 9317cd->start_workspace = cworkspace; 9318cd->workspace_size = COMPILE_WORK_SIZE; 9319cd->named_groups = named_groups; 9320cd->named_group_list_size = NAMED_GROUP_LIST_SIZE; 9321cd->start_pattern = (const pcre_uchar *)pattern; 9322cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern)); 9323cd->req_varyopt = 0; 9324cd->parens_depth = 0; 9325cd->assert_depth = 0; 9326cd->max_lookbehind = 0; 9327cd->external_options = options; 9328cd->open_caps = NULL; 9329 9330/* Now do the pre-compile. On error, errorcode will be set non-zero, so we 9331don't need to look at the result of the function here. The initial options have 9332been put into the cd block so that they can be changed if an option setting is 9333found within the regex right at the beginning. Bringing initial option settings 9334outside can help speed up starting point checks. */ 9335 9336ptr += skipatstart; 9337code = cworkspace; 9338*code = OP_BRA; 9339 9340(void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE, 9341 FALSE, 0, 0, &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL, 9342 cd, &length); 9343if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN; 9344 9345DPRINTF(("end pre-compile: length=%d workspace=%d\n", length, 9346 (int)(cd->hwm - cworkspace))); 9347 9348if (length > MAX_PATTERN_SIZE) 9349 { 9350 errorcode = ERR20; 9351 goto PCRE_EARLY_ERROR_RETURN; 9352 } 9353 9354/* Compute the size of the data block for storing the compiled pattern. Integer 9355overflow should no longer be possible because nowadays we limit the maximum 9356value of cd->names_found and cd->name_entry_size. 
*/ 9357 9358size = sizeof(REAL_PCRE) + 9359 (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar); 9360 9361/* Get the memory. */ 9362 9363re = (REAL_PCRE *)(PUBL(malloc))(size); 9364if (re == NULL) 9365 { 9366 errorcode = ERR21; 9367 goto PCRE_EARLY_ERROR_RETURN; 9368 } 9369 9370/* Put in the magic number, and save the sizes, initial options, internal 9371flags, and character table pointer. NULL is used for the default character 9372tables. The nullpad field is at the end; it's there to help in the case when a 9373regex compiled on a system with 4-byte pointers is run on another with 8-byte 9374pointers. */ 9375 9376re->magic_number = MAGIC_NUMBER; 9377re->size = (int)size; 9378re->options = cd->external_options; 9379re->flags = cd->external_flags; 9380re->limit_match = limit_match; 9381re->limit_recursion = limit_recursion; 9382re->first_char = 0; 9383re->req_char = 0; 9384re->name_table_offset = sizeof(REAL_PCRE) / sizeof(pcre_uchar); 9385re->name_entry_size = cd->name_entry_size; 9386re->name_count = cd->names_found; 9387re->ref_count = 0; 9388re->tables = (tables == PRIV(default_tables))? NULL : tables; 9389re->nullpad = NULL; 9390#ifdef COMPILE_PCRE32 9391re->dummy = 0; 9392#else 9393re->dummy1 = re->dummy2 = re->dummy3 = 0; 9394#endif 9395 9396/* The starting points of the name/number translation table and of the code are 9397passed around in the compile data block. The start/end pattern and initial 9398options are already set from the pre-compile phase, as is the name_entry_size 9399field. Reset the bracket count and the names_found field. Also reset the hwm 9400field; this time it's used for remembering forward references to subpatterns. 
9401*/ 9402 9403cd->final_bracount = cd->bracount; /* Save for checking forward references */ 9404cd->parens_depth = 0; 9405cd->assert_depth = 0; 9406cd->bracount = 0; 9407cd->max_lookbehind = 0; 9408cd->name_table = (pcre_uchar *)re + re->name_table_offset; 9409codestart = cd->name_table + re->name_entry_size * re->name_count; 9410cd->start_code = codestart; 9411cd->hwm = (pcre_uchar *)(cd->start_workspace); 9412cd->iscondassert = FALSE; 9413cd->req_varyopt = 0; 9414cd->had_accept = FALSE; 9415cd->had_pruneorskip = FALSE; 9416cd->check_lookbehind = FALSE; 9417cd->open_caps = NULL; 9418 9419/* If any named groups were found, create the name/number table from the list 9420created in the first pass. */ 9421 9422if (cd->names_found > 0) 9423 { 9424 int i = cd->names_found; 9425 named_group *ng = cd->named_groups; 9426 cd->names_found = 0; 9427 for (; i > 0; i--, ng++) 9428 add_name(cd, ng->name, ng->length, ng->number); 9429 if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE) 9430 (PUBL(free))((void *)cd->named_groups); 9431 } 9432 9433/* Set up a starting, non-extracting bracket, then compile the expression. On 9434error, errorcode will be set non-zero, so we don't need to look at the result 9435of the function here. */ 9436 9437ptr = (const pcre_uchar *)pattern + skipatstart; 9438code = (pcre_uchar *)codestart; 9439*code = OP_BRA; 9440(void)compile_regex(re->options, &code, &ptr, &errorcode, FALSE, FALSE, 0, 0, 9441 &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL, cd, NULL); 9442re->top_bracket = cd->bracount; 9443re->top_backref = cd->top_backref; 9444re->max_lookbehind = cd->max_lookbehind; 9445re->flags = cd->external_flags | PCRE_MODE; 9446 9447if (cd->had_accept) 9448 { 9449 reqchar = 0; /* Must disable after (*ACCEPT) */ 9450 reqcharflags = REQ_NONE; 9451 } 9452 9453/* If not reached end of pattern on success, there's an excess bracket. 
*/ 9454 9455if (errorcode == 0 && *ptr != CHAR_NULL) errorcode = ERR22; 9456 9457/* Fill in the terminating state and check for disastrous overflow, but 9458if debugging, leave the test till after things are printed out. */ 9459 9460*code++ = OP_END; 9461 9462#ifndef PCRE_DEBUG 9463if (code - codestart > length) errorcode = ERR23; 9464#endif 9465 9466#ifdef SUPPORT_VALGRIND 9467/* If the estimated length exceeds the really used length, mark the extra 9468allocated memory as unaddressable, so that any out-of-bound reads can be 9469detected. */ 9470VALGRIND_MAKE_MEM_NOACCESS(code, (length - (code - codestart)) * sizeof(pcre_uchar)); 9471#endif 9472 9473/* Fill in any forward references that are required. There may be repeated 9474references; optimize for them, as searching a large regex takes time. */ 9475 9476if (cd->hwm > cd->start_workspace) 9477 { 9478 int prev_recno = -1; 9479 const pcre_uchar *groupptr = NULL; 9480 while (errorcode == 0 && cd->hwm > cd->start_workspace) 9481 { 9482 int offset, recno; 9483 cd->hwm -= LINK_SIZE; 9484 offset = GET(cd->hwm, 0); 9485 9486 /* Check that the hwm handling hasn't gone wrong. This whole area is 9487 rewritten in PCRE2 because there are some obscure cases. */ 9488 9489 if (offset == 0 || codestart[offset-1] != OP_RECURSE) 9490 { 9491 errorcode = ERR10; 9492 break; 9493 } 9494 9495 recno = GET(codestart, offset); 9496 if (recno != prev_recno) 9497 { 9498 groupptr = PRIV(find_bracket)(codestart, utf, recno); 9499 prev_recno = recno; 9500 } 9501 if (groupptr == NULL) errorcode = ERR53; 9502 else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart)); 9503 } 9504 } 9505 9506/* If the workspace had to be expanded, free the new memory. Set the pointer to 9507NULL to indicate that forward references have been filled in. 
*/ 9508 9509if (cd->workspace_size > COMPILE_WORK_SIZE) 9510 (PUBL(free))((void *)cd->start_workspace); 9511cd->start_workspace = NULL; 9512 9513/* Give an error if there's back reference to a non-existent capturing 9514subpattern. */ 9515 9516if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15; 9517 9518/* Unless disabled, check whether any single character iterators can be 9519auto-possessified. The function overwrites the appropriate opcode values, so 9520the type of the pointer must be cast. NOTE: the intermediate variable "temp" is 9521used in this code because at least one compiler gives a warning about loss of 9522"const" attribute if the cast (pcre_uchar *)codestart is used directly in the 9523function call. */ 9524 9525if (errorcode == 0 && (options & PCRE_NO_AUTO_POSSESS) == 0) 9526 { 9527 pcre_uchar *temp = (pcre_uchar *)codestart; 9528 auto_possessify(temp, utf, cd); 9529 } 9530 9531/* If there were any lookbehind assertions that contained OP_RECURSE 9532(recursions or subroutine calls), a flag is set for them to be checked here, 9533because they may contain forward references. Actual recursions cannot be fixed 9534length, but subroutine calls can. It is done like this so that those without 9535OP_RECURSE that are not fixed length get a diagnosic with a useful offset. The 9536exceptional ones forgo this. We scan the pattern to check that they are fixed 9537length, and set their lengths. */ 9538 9539if (errorcode == 0 && cd->check_lookbehind) 9540 { 9541 pcre_uchar *cc = (pcre_uchar *)codestart; 9542 9543 /* Loop, searching for OP_REVERSE items, and process those that do not have 9544 their length set. (Actually, it will also re-process any that have a length 9545 of zero, but that is a pathological case, and it does no harm.) When we find 9546 one, we temporarily terminate the branch it is in while we scan it. 
*/ 9547 9548 for (cc = (pcre_uchar *)PRIV(find_bracket)(codestart, utf, -1); 9549 cc != NULL; 9550 cc = (pcre_uchar *)PRIV(find_bracket)(cc, utf, -1)) 9551 { 9552 if (GET(cc, 1) == 0) 9553 { 9554 int fixed_length; 9555 pcre_uchar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE); 9556 int end_op = *be; 9557 *be = OP_END; 9558 fixed_length = find_fixedlength(cc, (re->options & PCRE_UTF8) != 0, TRUE, 9559 cd, NULL); 9560 *be = end_op; 9561 DPRINTF(("fixed length = %d\n", fixed_length)); 9562 if (fixed_length < 0) 9563 { 9564 errorcode = (fixed_length == -2)? ERR36 : 9565 (fixed_length == -4)? ERR70 : ERR25; 9566 break; 9567 } 9568 if (fixed_length > cd->max_lookbehind) cd->max_lookbehind = fixed_length; 9569 PUT(cc, 1, fixed_length); 9570 } 9571 cc += 1 + LINK_SIZE; 9572 } 9573 } 9574 9575/* Failed to compile, or error while post-processing */ 9576 9577if (errorcode != 0) 9578 { 9579 (PUBL(free))(re); 9580 PCRE_EARLY_ERROR_RETURN: 9581 *erroroffset = (int)(ptr - (const pcre_uchar *)pattern); 9582 PCRE_EARLY_ERROR_RETURN2: 9583 *errorptr = find_error_text(errorcode); 9584 if (errorcodeptr != NULL) *errorcodeptr = errorcode; 9585 return NULL; 9586 } 9587 9588/* If the anchored option was not passed, set the flag if we can determine that 9589the pattern is anchored by virtue of ^ characters or \A or anything else, such 9590as starting with non-atomic .* when DOTALL is set and there are no occurrences 9591of *PRUNE or *SKIP. 9592 9593Otherwise, if we know what the first byte has to be, save it, because that 9594speeds up unanchored matches no end. If not, see if we can set the 9595PCRE_STARTLINE flag. This is helpful for multiline matches when all branches 9596start with ^. and also when all branches start with non-atomic .* for 9597non-DOTALL matches when *PRUNE and SKIP are not present. 
*/ 9598 9599if ((re->options & PCRE_ANCHORED) == 0) 9600 { 9601 if (is_anchored(codestart, 0, cd, 0)) re->options |= PCRE_ANCHORED; 9602 else 9603 { 9604 if (firstcharflags < 0) 9605 firstchar = find_firstassertedchar(codestart, &firstcharflags, FALSE); 9606 if (firstcharflags >= 0) /* Remove caseless flag for non-caseable chars */ 9607 { 9608#if defined COMPILE_PCRE8 9609 re->first_char = firstchar & 0xff; 9610#elif defined COMPILE_PCRE16 9611 re->first_char = firstchar & 0xffff; 9612#elif defined COMPILE_PCRE32 9613 re->first_char = firstchar; 9614#endif 9615 if ((firstcharflags & REQ_CASELESS) != 0) 9616 { 9617#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8) 9618 /* We ignore non-ASCII first chars in 8 bit mode. */ 9619 if (utf) 9620 { 9621 if (re->first_char < 128) 9622 { 9623 if (cd->fcc[re->first_char] != re->first_char) 9624 re->flags |= PCRE_FCH_CASELESS; 9625 } 9626 else if (UCD_OTHERCASE(re->first_char) != re->first_char) 9627 re->flags |= PCRE_FCH_CASELESS; 9628 } 9629 else 9630#endif 9631 if (MAX_255(re->first_char) 9632 && cd->fcc[re->first_char] != re->first_char) 9633 re->flags |= PCRE_FCH_CASELESS; 9634 } 9635 9636 re->flags |= PCRE_FIRSTSET; 9637 } 9638 9639 else if (is_startline(codestart, 0, cd, 0)) re->flags |= PCRE_STARTLINE; 9640 } 9641 } 9642 9643/* For an anchored pattern, we use the "required byte" only if it follows a 9644variable length item in the regex. Remove the caseless flag for non-caseable 9645bytes. */ 9646 9647if (reqcharflags >= 0 && 9648 ((re->options & PCRE_ANCHORED) == 0 || (reqcharflags & REQ_VARY) != 0)) 9649 { 9650#if defined COMPILE_PCRE8 9651 re->req_char = reqchar & 0xff; 9652#elif defined COMPILE_PCRE16 9653 re->req_char = reqchar & 0xffff; 9654#elif defined COMPILE_PCRE32 9655 re->req_char = reqchar; 9656#endif 9657 if ((reqcharflags & REQ_CASELESS) != 0) 9658 { 9659#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8) 9660 /* We ignore non-ASCII first chars in 8 bit mode. 
*/ 9661 if (utf) 9662 { 9663 if (re->req_char < 128) 9664 { 9665 if (cd->fcc[re->req_char] != re->req_char) 9666 re->flags |= PCRE_RCH_CASELESS; 9667 } 9668 else if (UCD_OTHERCASE(re->req_char) != re->req_char) 9669 re->flags |= PCRE_RCH_CASELESS; 9670 } 9671 else 9672#endif 9673 if (MAX_255(re->req_char) && cd->fcc[re->req_char] != re->req_char) 9674 re->flags |= PCRE_RCH_CASELESS; 9675 } 9676 9677 re->flags |= PCRE_REQCHSET; 9678 } 9679 9680/* Print out the compiled data if debugging is enabled. This is never the 9681case when building a production library. */ 9682 9683#ifdef PCRE_DEBUG 9684printf("Length = %d top_bracket = %d top_backref = %d\n", 9685 length, re->top_bracket, re->top_backref); 9686 9687printf("Options=%08x\n", re->options); 9688 9689if ((re->flags & PCRE_FIRSTSET) != 0) 9690 { 9691 pcre_uchar ch = re->first_char; 9692 const char *caseless = 9693 ((re->flags & PCRE_FCH_CASELESS) == 0)? "" : " (caseless)"; 9694 if (PRINTABLE(ch)) printf("First char = %c%s\n", ch, caseless); 9695 else printf("First char = \\x%02x%s\n", ch, caseless); 9696 } 9697 9698if ((re->flags & PCRE_REQCHSET) != 0) 9699 { 9700 pcre_uchar ch = re->req_char; 9701 const char *caseless = 9702 ((re->flags & PCRE_RCH_CASELESS) == 0)? "" : " (caseless)"; 9703 if (PRINTABLE(ch)) printf("Req char = %c%s\n", ch, caseless); 9704 else printf("Req char = \\x%02x%s\n", ch, caseless); 9705 } 9706 9707#if defined COMPILE_PCRE8 9708pcre_printint((pcre *)re, stdout, TRUE); 9709#elif defined COMPILE_PCRE16 9710pcre16_printint((pcre *)re, stdout, TRUE); 9711#elif defined COMPILE_PCRE32 9712pcre32_printint((pcre *)re, stdout, TRUE); 9713#endif 9714 9715/* This check is done here in the debugging case so that the code that 9716was compiled can be seen. 
*/ 9717 9718if (code - codestart > length) 9719 { 9720 (PUBL(free))(re); 9721 *errorptr = find_error_text(ERR23); 9722 *erroroffset = ptr - (pcre_uchar *)pattern; 9723 if (errorcodeptr != NULL) *errorcodeptr = ERR23; 9724 return NULL; 9725 } 9726#endif /* PCRE_DEBUG */ 9727 9728/* Check for a pattern than can match an empty string, so that this information 9729can be provided to applications. */ 9730 9731do 9732 { 9733 if (could_be_empty_branch(codestart, code, utf, cd, NULL)) 9734 { 9735 re->flags |= PCRE_MATCH_EMPTY; 9736 break; 9737 } 9738 codestart += GET(codestart, 1); 9739 } 9740while (*codestart == OP_ALT); 9741 9742#if defined COMPILE_PCRE8 9743return (pcre *)re; 9744#elif defined COMPILE_PCRE16 9745return (pcre16 *)re; 9746#elif defined COMPILE_PCRE32 9747return (pcre32 *)re; 9748#endif 9749} 9750 9751/* End of pcre_compile.c */ 9752