/*************************************************
*      Perl-Compatible Regular Expressions       *
*************************************************/

/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
           Copyright (c) 1997-2010 University of Cambridge

-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of the University of Cambridge nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
37----------------------------------------------------------------------------- 38*/ 39 40 41/* This module contains the external function pcre_compile(), along with 42supporting internal functions that are not used by other modules. */ 43 44 45#ifdef HAVE_CONFIG_H 46#include "config.h" 47#endif 48 49#define NLBLOCK cd /* Block containing newline information */ 50#define PSSTART start_pattern /* Field containing processed string start */ 51#define PSEND end_pattern /* Field containing processed string end */ 52 53#include "pcre_internal.h" 54 55 56/* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is 57also used by pcretest. PCRE_DEBUG is not defined when building a production 58library. */ 59 60#ifdef PCRE_DEBUG 61#include "pcre_printint.src" 62#endif 63 64 65/* Macro for setting individual bits in class bitmaps. */ 66 67#define SETBIT(a,b) a[b/8] |= (1 << (b%8)) 68 69/* Maximum length value to check against when making sure that the integer that 70holds the compiled pattern length does not overflow. We make it a bit less than 71INT_MAX to allow for adding in group terminating bytes, so that we don't have 72to check them every time. */ 73 74#define OFLOW_MAX (INT_MAX - 20) 75 76 77/************************************************* 78* Code parameters and static tables * 79*************************************************/ 80 81/* This value specifies the size of stack workspace that is used during the 82first pre-compile phase that determines how much memory is required. The regex 83is partly compiled into this space, but the compiled parts are discarded as 84soon as they can be, so that hopefully there will never be an overrun. The code 85does, however, check for an overrun. The largest amount I've seen used is 218, 86so this number is very generous. 87 88The same workspace is used during the second, actual compile phase for 89remembering forward references to groups so that they can be filled in at the 90end. 
Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE 91is 4 there is plenty of room. */ 92 93#define COMPILE_WORK_SIZE (4096) 94 95/* The overrun tests check for a slightly smaller size so that they detect the 96overrun before it actually does run off the end of the data block. */ 97 98#define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100) 99 100 101/* Table for handling escaped characters in the range '0'-'z'. Positive returns 102are simple data values; negative values are for special things like \d and so 103on. Zero means further processing is needed (for things like \x), or the escape 104is invalid. */ 105 106#ifndef EBCDIC 107 108/* This is the "normal" table for ASCII systems or for EBCDIC systems running 109in UTF-8 mode. */ 110 111static const short int escapes[] = { 112 0, 0, 113 0, 0, 114 0, 0, 115 0, 0, 116 0, 0, 117 CHAR_COLON, CHAR_SEMICOLON, 118 CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, 119 CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK, 120 CHAR_COMMERCIAL_AT, -ESC_A, 121 -ESC_B, -ESC_C, 122 -ESC_D, -ESC_E, 123 0, -ESC_G, 124 -ESC_H, 0, 125 0, -ESC_K, 126 0, 0, 127 0, 0, 128 -ESC_P, -ESC_Q, 129 -ESC_R, -ESC_S, 130 0, 0, 131 -ESC_V, -ESC_W, 132 -ESC_X, 0, 133 -ESC_Z, CHAR_LEFT_SQUARE_BRACKET, 134 CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET, 135 CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE, 136 CHAR_GRAVE_ACCENT, 7, 137 -ESC_b, 0, 138 -ESC_d, ESC_e, 139 ESC_f, 0, 140 -ESC_h, 0, 141 0, -ESC_k, 142 0, 0, 143 ESC_n, 0, 144 -ESC_p, 0, 145 ESC_r, -ESC_s, 146 ESC_tee, 0, 147 -ESC_v, -ESC_w, 148 0, 0, 149 -ESC_z 150}; 151 152#else 153 154/* This is the "abnormal" table for EBCDIC systems without UTF-8 support. 
*/ 155 156static const short int escapes[] = { 157/* 48 */ 0, 0, 0, '.', '<', '(', '+', '|', 158/* 50 */ '&', 0, 0, 0, 0, 0, 0, 0, 159/* 58 */ 0, 0, '!', '$', '*', ')', ';', '~', 160/* 60 */ '-', '/', 0, 0, 0, 0, 0, 0, 161/* 68 */ 0, 0, '|', ',', '%', '_', '>', '?', 162/* 70 */ 0, 0, 0, 0, 0, 0, 0, 0, 163/* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"', 164/* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, 165/* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0, 166/* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p, 167/* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0, 168/* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0, 169/* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0, 170/* B0 */ 0, 0, 0, 0, 0, 0, 0, 0, 171/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-', 172/* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G, 173/* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0, 174/* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P, 175/* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0, 176/* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X, 177/* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0, 178/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0, 179/* F8 */ 0, 0, 0, 0, 0, 0, 0, 0 180}; 181#endif 182 183 184/* Table of special "verbs" like (*PRUNE). This is a short table, so it is 185searched linearly. Put all the names into a single string, in order to reduce 186the number of relocations when a shared library is dynamically linked. The 187string is built from string macros so that it works in UTF-8 mode on EBCDIC 188platforms. 
*/ 189 190typedef struct verbitem { 191 int len; 192 int op; 193} verbitem; 194 195static const char verbnames[] = 196 STRING_ACCEPT0 197 STRING_COMMIT0 198 STRING_F0 199 STRING_FAIL0 200 STRING_PRUNE0 201 STRING_SKIP0 202 STRING_THEN; 203 204static const verbitem verbs[] = { 205 { 6, OP_ACCEPT }, 206 { 6, OP_COMMIT }, 207 { 1, OP_FAIL }, 208 { 4, OP_FAIL }, 209 { 5, OP_PRUNE }, 210 { 4, OP_SKIP }, 211 { 4, OP_THEN } 212}; 213 214static const int verbcount = sizeof(verbs)/sizeof(verbitem); 215 216 217/* Tables of names of POSIX character classes and their lengths. The names are 218now all in a single string, to reduce the number of relocations when a shared 219library is dynamically loaded. The list of lengths is terminated by a zero 220length entry. The first three must be alpha, lower, upper, as this is assumed 221for handling case independence. */ 222 223static const char posix_names[] = 224 STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0 225 STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0 226 STRING_graph0 STRING_print0 STRING_punct0 STRING_space0 227 STRING_word0 STRING_xdigit; 228 229static const uschar posix_name_lengths[] = { 230 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 }; 231 232/* Table of class bit maps for each POSIX class. Each class is formed from a 233base map, with an optional addition or removal of another map. Then, for some 234classes, there is some additional tweaking: for [:blank:] the vertical space 235characters are removed, and for [:alpha:] and [:alnum:] the underscore 236character is removed. The triples in the table consist of the base map offset, 237second map offset or -1 if no second map, and a non-negative value for map 238addition or a negative value for map subtraction (if there are two maps). The 239absolute value of the third field has these meanings: 0 => no tweaking, 1 => 240remove vertical space characters, 2 => remove underscore. 
*/ 241 242static const int posix_class_maps[] = { 243 cbit_word, cbit_digit, -2, /* alpha */ 244 cbit_lower, -1, 0, /* lower */ 245 cbit_upper, -1, 0, /* upper */ 246 cbit_word, -1, 2, /* alnum - word without underscore */ 247 cbit_print, cbit_cntrl, 0, /* ascii */ 248 cbit_space, -1, 1, /* blank - a GNU extension */ 249 cbit_cntrl, -1, 0, /* cntrl */ 250 cbit_digit, -1, 0, /* digit */ 251 cbit_graph, -1, 0, /* graph */ 252 cbit_print, -1, 0, /* print */ 253 cbit_punct, -1, 0, /* punct */ 254 cbit_space, -1, 0, /* space */ 255 cbit_word, -1, 0, /* word - a Perl extension */ 256 cbit_xdigit,-1, 0 /* xdigit */ 257}; 258 259 260#define STRING(a) # a 261#define XSTRING(s) STRING(s) 262 263/* The texts of compile-time error messages. These are "char *" because they 264are passed to the outside world. Do not ever re-use any error number, because 265they are documented. Always add a new error instead. Messages marked DEAD below 266are no longer used. This used to be a table of strings, but in order to reduce 267the number of relocations needed when a shared library is loaded dynamically, 268it is now one long string. We cannot use a table of offsets, because the 269lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we 270simply count through to the one we want - this isn't a performance issue 271because these strings are used only when there is a compilation error. 272 273Each substring ends with \0 to insert a null character. This includes the final 274substring, so that the whole string ends with \0\0, which can be detected when 275counting through. 
*/ 276 277static const char error_texts[] = 278 "no error\0" 279 "\\ at end of pattern\0" 280 "\\c at end of pattern\0" 281 "unrecognized character follows \\\0" 282 "numbers out of order in {} quantifier\0" 283 /* 5 */ 284 "number too big in {} quantifier\0" 285 "missing terminating ] for character class\0" 286 "invalid escape sequence in character class\0" 287 "range out of order in character class\0" 288 "nothing to repeat\0" 289 /* 10 */ 290 "operand of unlimited repeat could match the empty string\0" /** DEAD **/ 291 "internal error: unexpected repeat\0" 292 "unrecognized character after (? or (?-\0" 293 "POSIX named classes are supported only within a class\0" 294 "missing )\0" 295 /* 15 */ 296 "reference to non-existent subpattern\0" 297 "erroffset passed as NULL\0" 298 "unknown option bit(s) set\0" 299 "missing ) after comment\0" 300 "parentheses nested too deeply\0" /** DEAD **/ 301 /* 20 */ 302 "regular expression is too large\0" 303 "failed to get memory\0" 304 "unmatched parentheses\0" 305 "internal error: code overflow\0" 306 "unrecognized character after (?<\0" 307 /* 25 */ 308 "lookbehind assertion is not fixed length\0" 309 "malformed number or name after (?(\0" 310 "conditional group contains more than two branches\0" 311 "assertion expected after (?(\0" 312 "(?R or (?[+-]digits must be followed by )\0" 313 /* 30 */ 314 "unknown POSIX class name\0" 315 "POSIX collating elements are not supported\0" 316 "this version of PCRE is not compiled with PCRE_UTF8 support\0" 317 "spare error\0" /** DEAD **/ 318 "character value in \\x{...} sequence is too large\0" 319 /* 35 */ 320 "invalid condition (?(0)\0" 321 "\\C not allowed in lookbehind assertion\0" 322 "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0" 323 "number after (?C is > 255\0" 324 "closing ) for (?C expected\0" 325 /* 40 */ 326 "recursive call could loop indefinitely\0" 327 "unrecognized character after (?P\0" 328 "syntax error in subpattern name (missing terminator)\0" 329 "two named 
subpatterns have the same name\0" 330 "invalid UTF-8 string\0" 331 /* 45 */ 332 "support for \\P, \\p, and \\X has not been compiled\0" 333 "malformed \\P or \\p sequence\0" 334 "unknown property name after \\P or \\p\0" 335 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0" 336 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0" 337 /* 50 */ 338 "repeated subpattern is too long\0" /** DEAD **/ 339 "octal value is greater than \\377 (not in UTF-8 mode)\0" 340 "internal error: overran compiling workspace\0" 341 "internal error: previously-checked referenced subpattern not found\0" 342 "DEFINE group contains more than one branch\0" 343 /* 55 */ 344 "repeating a DEFINE group is not allowed\0" 345 "inconsistent NEWLINE options\0" 346 "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0" 347 "a numbered reference must not be zero\0" 348 "(*VERB) with an argument is not supported\0" 349 /* 60 */ 350 "(*VERB) not recognized\0" 351 "number is too big\0" 352 "subpattern name expected\0" 353 "digit expected after (?+\0" 354 "] is an invalid data character in JavaScript compatibility mode\0" 355 /* 65 */ 356 "different names for subpatterns of the same number are not allowed\0"; 357 358/* Table to identify digits and hex digits. This is used when compiling 359patterns. Note that the tables in chartables are dependent on the locale, and 360may mark arbitrary characters as digits - but the PCRE compiling code expects 361to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have 362a private table here. It costs 256 bytes, but it is a lot faster than doing 363character value tests (at least in some simple cases I timed), and in some 364applications one wants PCRE to compile efficiently as well as match 365efficiently. 
366 367For convenience, we use the same bit definitions as in chartables: 368 369 0x04 decimal digit 370 0x08 hexadecimal digit 371 372Then we can use ctype_digit and ctype_xdigit in the code. */ 373 374#ifndef EBCDIC 375 376/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in 377UTF-8 mode. */ 378 379static const unsigned char digitab[] = 380 { 381 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */ 382 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */ 383 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */ 384 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ 385 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */ 386 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */ 387 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */ 388 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */ 389 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */ 390 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */ 391 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */ 392 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */ 393 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */ 394 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */ 395 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */ 396 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */ 397 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */ 398 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */ 399 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */ 400 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */ 401 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */ 402 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */ 403 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */ 404 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */ 405 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */ 406 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */ 407 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */ 408 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */ 409 
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */ 410 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */ 411 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */ 412 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */ 413 414#else 415 416/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */ 417 418static const unsigned char digitab[] = 419 { 420 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */ 421 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */ 422 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */ 423 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ 424 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */ 425 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */ 426 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */ 427 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */ 428 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */ 429 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */ 430 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */ 431 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */ 432 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */ 433 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? 
*/ 434 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */ 435 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */ 436 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */ 437 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */ 438 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */ 439 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */ 440 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */ 441 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */ 442 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */ 443 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */ 444 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */ 445 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */ 446 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */ 447 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */ 448 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */ 449 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */ 450 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */ 451 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */ 452 453static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */ 454 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */ 455 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */ 456 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */ 457 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ 458 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */ 459 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */ 460 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */ 461 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */ 462 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */ 463 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */ 464 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */ 465 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */ 466 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */ 467 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? 
*/ 468 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */ 469 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */ 470 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */ 471 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */ 472 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */ 473 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */ 474 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */ 475 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */ 476 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */ 477 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */ 478 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */ 479 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */ 480 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */ 481 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */ 482 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */ 483 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */ 484 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */ 485 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */ 486#endif 487 488 489/* Definition to allow mutual recursion */ 490 491static BOOL 492 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int, 493 int *, int *, branch_chain *, compile_data *, int *); 494 495 496 497/************************************************* 498* Find an error text * 499*************************************************/ 500 501/* The error texts are now all in one long string, to save on relocations. As 502some of the text is of unknown length, we can't use a table of offsets. 503Instead, just count through the strings. This is not a performance issue 504because it happens only when there has been a compilation error. 
505 506Argument: the error number 507Returns: pointer to the error string 508*/ 509 510static const char * 511find_error_text(int n) 512{ 513const char *s = error_texts; 514for (; n > 0; n--) 515 { 516 while (*s++ != 0) {}; 517 if (*s == 0) return "Error text not found (please report)"; 518 } 519return s; 520} 521 522 523/************************************************* 524* Handle escapes * 525*************************************************/ 526 527/* This function is called when a \ has been encountered. It either returns a 528positive value for a simple escape such as \n, or a negative value which 529encodes one of the more complicated things such as \d. A backreference to group 530n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When 531UTF-8 is enabled, a positive value greater than 255 may be returned. On entry, 532ptr is pointing at the \. On exit, it is on the final character of the escape 533sequence. 534 535Arguments: 536 ptrptr points to the pattern position pointer 537 errorcodeptr points to the errorcode variable 538 bracount number of previous extracting brackets 539 options the options bits 540 isclass TRUE if inside a character class 541 542Returns: zero or positive => a data character 543 negative => a special escape sequence 544 on error, errorcodeptr is set 545*/ 546 547static int 548check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount, 549 int options, BOOL isclass) 550{ 551BOOL utf8 = (options & PCRE_UTF8) != 0; 552const uschar *ptr = *ptrptr + 1; 553int c, i; 554 555GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */ 556ptr--; /* Set pointer back to the last byte */ 557 558/* If backslash is at the end of the pattern, it's an error. */ 559 560if (c == 0) *errorcodeptr = ERR1; 561 562/* Non-alphanumerics are literals. For digits or letters, do an initial lookup 563in a table. A non-zero result is something that can be returned immediately. 564Otherwise further processing may be required. 
*/ 565 566#ifndef EBCDIC /* ASCII/UTF-8 coding */ 567else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */ 568else if ((i = escapes[c - CHAR_0]) != 0) c = i; 569 570#else /* EBCDIC coding */ 571else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */ 572else if ((i = escapes[c - 0x48]) != 0) c = i; 573#endif 574 575/* Escapes that need further processing, or are illegal. */ 576 577else 578 { 579 const uschar *oldptr; 580 BOOL braced, negated; 581 582 switch (c) 583 { 584 /* A number of Perl escapes are not handled by PCRE. We give an explicit 585 error. */ 586 587 case CHAR_l: 588 case CHAR_L: 589 case CHAR_N: 590 case CHAR_u: 591 case CHAR_U: 592 *errorcodeptr = ERR37; 593 break; 594 595 /* \g must be followed by one of a number of specific things: 596 597 (1) A number, either plain or braced. If positive, it is an absolute 598 backreference. If negative, it is a relative backreference. This is a Perl 599 5.10 feature. 600 601 (2) Perl 5.10 also supports \g{name} as a reference to a named group. This 602 is part of Perl's movement towards a unified syntax for back references. As 603 this is synonymous with \k{name}, we fudge it up by pretending it really 604 was \k. 605 606 (3) For Oniguruma compatibility we also support \g followed by a name or a 607 number either in angle brackets or in single quotes. However, these are 608 (possibly recursive) subroutine calls, _not_ backreferences. Just return 609 the -ESC_g code (cf \k). 
*/ 610 611 case CHAR_g: 612 if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE) 613 { 614 c = -ESC_g; 615 break; 616 } 617 618 /* Handle the Perl-compatible cases */ 619 620 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET) 621 { 622 const uschar *p; 623 for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++) 624 if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break; 625 if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET) 626 { 627 c = -ESC_k; 628 break; 629 } 630 braced = TRUE; 631 ptr++; 632 } 633 else braced = FALSE; 634 635 if (ptr[1] == CHAR_MINUS) 636 { 637 negated = TRUE; 638 ptr++; 639 } 640 else negated = FALSE; 641 642 c = 0; 643 while ((digitab[ptr[1]] & ctype_digit) != 0) 644 c = c * 10 + *(++ptr) - CHAR_0; 645 646 if (c < 0) /* Integer overflow */ 647 { 648 *errorcodeptr = ERR61; 649 break; 650 } 651 652 if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET) 653 { 654 *errorcodeptr = ERR57; 655 break; 656 } 657 658 if (c == 0) 659 { 660 *errorcodeptr = ERR58; 661 break; 662 } 663 664 if (negated) 665 { 666 if (c > bracount) 667 { 668 *errorcodeptr = ERR15; 669 break; 670 } 671 c = bracount - (c - 1); 672 } 673 674 c = -(ESC_REF + c); 675 break; 676 677 /* The handling of escape sequences consisting of a string of digits 678 starting with one that is not zero is not straightforward. By experiment, 679 the way Perl works seems to be as follows: 680 681 Outside a character class, the digits are read as a decimal number. If the 682 number is less than 10, or if there are that many previous extracting 683 left brackets, then it is a back reference. Otherwise, up to three octal 684 digits are read to form an escaped byte. Thus \123 is likely to be octal 685 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal 686 value is greater than 377, the least significant 8 bits are taken. Inside a 687 character class, \ followed by a digit is always an octal number. 
*/ 688 689 case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5: 690 case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9: 691 692 if (!isclass) 693 { 694 oldptr = ptr; 695 c -= CHAR_0; 696 while ((digitab[ptr[1]] & ctype_digit) != 0) 697 c = c * 10 + *(++ptr) - CHAR_0; 698 if (c < 0) /* Integer overflow */ 699 { 700 *errorcodeptr = ERR61; 701 break; 702 } 703 if (c < 10 || c <= bracount) 704 { 705 c = -(ESC_REF + c); 706 break; 707 } 708 ptr = oldptr; /* Put the pointer back and fall through */ 709 } 710 711 /* Handle an octal number following \. If the first digit is 8 or 9, Perl 712 generates a binary zero byte and treats the digit as a following literal. 713 Thus we have to pull back the pointer by one. */ 714 715 if ((c = *ptr) >= CHAR_8) 716 { 717 ptr--; 718 c = 0; 719 break; 720 } 721 722 /* \0 always starts an octal number, but we may drop through to here with a 723 larger first octal digit. The original code used just to take the least 724 significant 8 bits of octal numbers (I think this is what early Perls used 725 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more 726 than 3 octal digits. */ 727 728 case CHAR_0: 729 c -= CHAR_0; 730 while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7) 731 c = c * 8 + *(++ptr) - CHAR_0; 732 if (!utf8 && c > 255) *errorcodeptr = ERR51; 733 break; 734 735 /* \x is complicated. \x{ddd} is a character number which can be greater 736 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is 737 treated as a data character. */ 738 739 case CHAR_x: 740 if (ptr[1] == CHAR_LEFT_CURLY_BRACKET) 741 { 742 const uschar *pt = ptr + 2; 743 int count = 0; 744 745 c = 0; 746 while ((digitab[*pt] & ctype_xdigit) != 0) 747 { 748 register int cc = *pt++; 749 if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */ 750 count++; 751 752#ifndef EBCDIC /* ASCII/UTF-8 coding */ 753 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */ 754 c = (c << 4) + cc - ((cc < CHAR_A)? 
CHAR_0 : (CHAR_A - 10)); 755#else /* EBCDIC coding */ 756 if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */ 757 c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10)); 758#endif 759 } 760 761 if (*pt == CHAR_RIGHT_CURLY_BRACKET) 762 { 763 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34; 764 ptr = pt; 765 break; 766 } 767 768 /* If the sequence of hex digits does not end with '}', then we don't 769 recognize this construct; fall through to the normal \x handling. */ 770 } 771 772 /* Read just a single-byte hex-defined char */ 773 774 c = 0; 775 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0) 776 { 777 int cc; /* Some compilers don't like */ 778 cc = *(++ptr); /* ++ in initializers */ 779#ifndef EBCDIC /* ASCII/UTF-8 coding */ 780 if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */ 781 c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10)); 782#else /* EBCDIC coding */ 783 if (cc <= CHAR_z) cc += 64; /* Convert to upper case */ 784 c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10)); 785#endif 786 } 787 break; 788 789 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped. 790 This coding is ASCII-specific, but then the whole concept of \cx is 791 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */ 792 793 case CHAR_c: 794 c = *(++ptr); 795 if (c == 0) 796 { 797 *errorcodeptr = ERR2; 798 break; 799 } 800 801#ifndef EBCDIC /* ASCII/UTF-8 coding */ 802 if (c >= CHAR_a && c <= CHAR_z) c -= 32; 803 c ^= 0x40; 804#else /* EBCDIC coding */ 805 if (c >= CHAR_a && c <= CHAR_z) c += 64; 806 c ^= 0xC0; 807#endif 808 break; 809 810 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any 811 other alphanumeric following \ is an error if PCRE_EXTRA was set; 812 otherwise, for Perl compatibility, it is a literal. This code looks a bit 813 odd, but there used to be some cases other than the default, and there may 814 be again in future, so I haven't "optimized" it. 
*/ 815 816 default: 817 if ((options & PCRE_EXTRA) != 0) switch(c) 818 { 819 default: 820 *errorcodeptr = ERR3; 821 break; 822 } 823 break; 824 } 825 } 826 827*ptrptr = ptr; 828return c; 829} 830 831 832 833#ifdef SUPPORT_UCP 834/************************************************* 835* Handle \P and \p * 836*************************************************/ 837 838/* This function is called after \P or \p has been encountered, provided that 839PCRE is compiled with support for Unicode properties. On entry, ptrptr is 840pointing at the P or p. On exit, it is pointing at the final character of the 841escape sequence. 842 843Argument: 844 ptrptr points to the pattern position pointer 845 negptr points to a boolean that is set TRUE for negation else FALSE 846 dptr points to an int that is set to the detailed property value 847 errorcodeptr points to the error code variable 848 849Returns: type value from ucp_type_table, or -1 for an invalid type 850*/ 851 852static int 853get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr) 854{ 855int c, i, bot, top; 856const uschar *ptr = *ptrptr; 857char name[32]; 858 859c = *(++ptr); 860if (c == 0) goto ERROR_RETURN; 861 862*negptr = FALSE; 863 864/* \P or \p can be followed by a name in {}, optionally preceded by ^ for 865negation. 
*/ 866 867if (c == CHAR_LEFT_CURLY_BRACKET) 868 { 869 if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT) 870 { 871 *negptr = TRUE; 872 ptr++; 873 } 874 for (i = 0; i < (int)sizeof(name) - 1; i++) 875 { 876 c = *(++ptr); 877 if (c == 0) goto ERROR_RETURN; 878 if (c == CHAR_RIGHT_CURLY_BRACKET) break; 879 name[i] = c; 880 } 881 if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN; 882 name[i] = 0; 883 } 884 885/* Otherwise there is just one following character */ 886 887else 888 { 889 name[0] = c; 890 name[1] = 0; 891 } 892 893*ptrptr = ptr; 894 895/* Search for a recognized property name using binary chop */ 896 897bot = 0; 898top = _pcre_utt_size; 899 900while (bot < top) 901 { 902 i = (bot + top) >> 1; 903 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset); 904 if (c == 0) 905 { 906 *dptr = _pcre_utt[i].value; 907 return _pcre_utt[i].type; 908 } 909 if (c > 0) bot = i + 1; else top = i; 910 } 911 912*errorcodeptr = ERR47; 913*ptrptr = ptr; 914return -1; 915 916ERROR_RETURN: 917*errorcodeptr = ERR46; 918*ptrptr = ptr; 919return -1; 920} 921#endif 922 923 924 925 926/************************************************* 927* Check for counted repeat * 928*************************************************/ 929 930/* This function is called when a '{' is encountered in a place where it might 931start a quantifier. It looks ahead to see if it really is a quantifier or not. 932It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd} 933where the ddds are digits. 
934 935Arguments: 936 p pointer to the first char after '{' 937 938Returns: TRUE or FALSE 939*/ 940 941static BOOL 942is_counted_repeat(const uschar *p) 943{ 944if ((digitab[*p++] & ctype_digit) == 0) return FALSE; 945while ((digitab[*p] & ctype_digit) != 0) p++; 946if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE; 947 948if (*p++ != CHAR_COMMA) return FALSE; 949if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE; 950 951if ((digitab[*p++] & ctype_digit) == 0) return FALSE; 952while ((digitab[*p] & ctype_digit) != 0) p++; 953 954return (*p == CHAR_RIGHT_CURLY_BRACKET); 955} 956 957 958 959/************************************************* 960* Read repeat counts * 961*************************************************/ 962 963/* Read an item of the form {n,m} and return the values. This is called only 964after is_counted_repeat() has confirmed that a repeat-count quantifier exists, 965so the syntax is guaranteed to be correct, but we need to check the values. 966 967Arguments: 968 p pointer to first char after '{' 969 minp pointer to int for min 970 maxp pointer to int for max 971 returned as -1 if no max 972 errorcodeptr points to error code variable 973 974Returns: pointer to '}' on success; 975 current ptr on error, with errorcodeptr set non-zero 976*/ 977 978static const uschar * 979read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr) 980{ 981int min = 0; 982int max = -1; 983 984/* Read the minimum value and do a paranoid check: a negative value indicates 985an integer overflow. */ 986 987while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0; 988if (min < 0 || min > 65535) 989 { 990 *errorcodeptr = ERR5; 991 return p; 992 } 993 994/* Read the maximum value if there is one, and again do a paranoid on its size. 995Also, max must not be less than min. 
*/ 996 997if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else 998 { 999 if (*(++p) != CHAR_RIGHT_CURLY_BRACKET) 1000 { 1001 max = 0; 1002 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0; 1003 if (max < 0 || max > 65535) 1004 { 1005 *errorcodeptr = ERR5; 1006 return p; 1007 } 1008 if (max < min) 1009 { 1010 *errorcodeptr = ERR4; 1011 return p; 1012 } 1013 } 1014 } 1015 1016/* Fill in the required variables, and pass back the pointer to the terminating 1017'}'. */ 1018 1019*minp = min; 1020*maxp = max; 1021return p; 1022} 1023 1024 1025 1026/************************************************* 1027* Subroutine for finding forward reference * 1028*************************************************/ 1029 1030/* This recursive function is called only from find_parens() below. The 1031top-level call starts at the beginning of the pattern. All other calls must 1032start at a parenthesis. It scans along a pattern's text looking for capturing 1033subpatterns, and counting them. If it finds a named pattern that matches the 1034name it is given, it returns its number. Alternatively, if the name is NULL, it 1035returns when it reaches a given numbered subpattern. We know that if (?P< is 1036encountered, the name will be terminated by '>' because that is checked in the 1037first pass. Recursion is used to keep track of subpatterns that reset the 1038capturing group numbers - the (?| feature. 
1039 1040Arguments: 1041 ptrptr address of the current character pointer (updated) 1042 cd compile background data 1043 name name to seek, or NULL if seeking a numbered subpattern 1044 lorn name length, or subpattern number if name is NULL 1045 xmode TRUE if we are in /x mode 1046 count pointer to the current capturing subpattern number (updated) 1047 1048Returns: the number of the named subpattern, or -1 if not found 1049*/ 1050 1051static int 1052find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn, 1053 BOOL xmode, int *count) 1054{ 1055uschar *ptr = *ptrptr; 1056int start_count = *count; 1057int hwm_count = start_count; 1058BOOL dup_parens = FALSE; 1059 1060/* If the first character is a parenthesis, check on the type of group we are 1061dealing with. The very first call may not start with a parenthesis. */ 1062 1063if (ptr[0] == CHAR_LEFT_PARENTHESIS) 1064 { 1065 if (ptr[1] == CHAR_QUESTION_MARK && 1066 ptr[2] == CHAR_VERTICAL_LINE) 1067 { 1068 ptr += 3; 1069 dup_parens = TRUE; 1070 } 1071 1072 /* Handle a normal, unnamed capturing parenthesis */ 1073 1074 else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK) 1075 { 1076 *count += 1; 1077 if (name == NULL && *count == lorn) return *count; 1078 ptr++; 1079 } 1080 1081 /* Handle a condition. If it is an assertion, just carry on so that it 1082 is processed as normal. If not, skip to the closing parenthesis of the 1083 condition (there can't be any nested parens. */ 1084 1085 else if (ptr[2] == CHAR_LEFT_PARENTHESIS) 1086 { 1087 ptr += 2; 1088 if (ptr[1] != CHAR_QUESTION_MARK) 1089 { 1090 while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++; 1091 if (*ptr != 0) ptr++; 1092 } 1093 } 1094 1095 /* We have either (? or (* and not a condition */ 1096 1097 else 1098 { 1099 ptr += 2; 1100 if (*ptr == CHAR_P) ptr++; /* Allow optional P */ 1101 1102 /* We have to disambiguate (?<! 
and (?<= from (?<name> for named groups */ 1103 1104 if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK && 1105 ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE) 1106 { 1107 int term; 1108 const uschar *thisname; 1109 *count += 1; 1110 if (name == NULL && *count == lorn) return *count; 1111 term = *ptr++; 1112 if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN; 1113 thisname = ptr; 1114 while (*ptr != term) ptr++; 1115 if (name != NULL && lorn == ptr - thisname && 1116 strncmp((const char *)name, (const char *)thisname, lorn) == 0) 1117 return *count; 1118 term++; 1119 } 1120 } 1121 } 1122 1123/* Past any initial parenthesis handling, scan for parentheses or vertical 1124bars. */ 1125 1126for (; *ptr != 0; ptr++) 1127 { 1128 /* Skip over backslashed characters and also entire \Q...\E */ 1129 1130 if (*ptr == CHAR_BACKSLASH) 1131 { 1132 if (*(++ptr) == 0) goto FAIL_EXIT; 1133 if (*ptr == CHAR_Q) for (;;) 1134 { 1135 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {}; 1136 if (*ptr == 0) goto FAIL_EXIT; 1137 if (*(++ptr) == CHAR_E) break; 1138 } 1139 continue; 1140 } 1141 1142 /* Skip over character classes; this logic must be similar to the way they 1143 are handled for real. If the first character is '^', skip it. Also, if the 1144 first few characters (either before or after ^) are \Q\E or \E we skip them 1145 too. This makes for compatibility with Perl. Note the use of STR macros to 1146 encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. 
*/ 1147 1148 if (*ptr == CHAR_LEFT_SQUARE_BRACKET) 1149 { 1150 BOOL negate_class = FALSE; 1151 for (;;) 1152 { 1153 if (ptr[1] == CHAR_BACKSLASH) 1154 { 1155 if (ptr[2] == CHAR_E) 1156 ptr+= 2; 1157 else if (strncmp((const char *)ptr+2, 1158 STR_Q STR_BACKSLASH STR_E, 3) == 0) 1159 ptr += 4; 1160 else 1161 break; 1162 } 1163 else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT) 1164 { 1165 negate_class = TRUE; 1166 ptr++; 1167 } 1168 else break; 1169 } 1170 1171 /* If the next character is ']', it is a data character that must be 1172 skipped, except in JavaScript compatibility mode. */ 1173 1174 if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET && 1175 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0) 1176 ptr++; 1177 1178 while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET) 1179 { 1180 if (*ptr == 0) return -1; 1181 if (*ptr == CHAR_BACKSLASH) 1182 { 1183 if (*(++ptr) == 0) goto FAIL_EXIT; 1184 if (*ptr == CHAR_Q) for (;;) 1185 { 1186 while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {}; 1187 if (*ptr == 0) goto FAIL_EXIT; 1188 if (*(++ptr) == CHAR_E) break; 1189 } 1190 continue; 1191 } 1192 } 1193 continue; 1194 } 1195 1196 /* Skip comments in /x mode */ 1197 1198 if (xmode && *ptr == CHAR_NUMBER_SIGN) 1199 { 1200 while (*(++ptr) != 0 && *ptr != CHAR_NL) {}; 1201 if (*ptr == 0) goto FAIL_EXIT; 1202 continue; 1203 } 1204 1205 /* Check for the special metacharacters */ 1206 1207 if (*ptr == CHAR_LEFT_PARENTHESIS) 1208 { 1209 int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count); 1210 if (rc > 0) return rc; 1211 if (*ptr == 0) goto FAIL_EXIT; 1212 } 1213 1214 else if (*ptr == CHAR_RIGHT_PARENTHESIS) 1215 { 1216 if (dup_parens && *count < hwm_count) *count = hwm_count; 1217 *ptrptr = ptr; 1218 return -1; 1219 } 1220 1221 else if (*ptr == CHAR_VERTICAL_LINE && dup_parens) 1222 { 1223 if (*count > hwm_count) hwm_count = *count; 1224 *count = start_count; 1225 } 1226 } 1227 1228FAIL_EXIT: 1229*ptrptr = ptr; 1230return -1; 1231} 1232 1233 1234 1235 
1236/************************************************* 1237* Find forward referenced subpattern * 1238*************************************************/ 1239 1240/* This function scans along a pattern's text looking for capturing 1241subpatterns, and counting them. If it finds a named pattern that matches the 1242name it is given, it returns its number. Alternatively, if the name is NULL, it 1243returns when it reaches a given numbered subpattern. This is used for forward 1244references to subpatterns. We used to be able to start this scan from the 1245current compiling point, using the current count value from cd->bracount, and 1246do it all in a single loop, but the addition of the possibility of duplicate 1247subpattern numbers means that we have to scan from the very start, in order to 1248take account of such duplicates, and to use a recursive function to keep track 1249of the different types of group. 1250 1251Arguments: 1252 cd compile background data 1253 name name to seek, or NULL if seeking a numbered subpattern 1254 lorn name length, or subpattern number if name is NULL 1255 xmode TRUE if we are in /x mode 1256 1257Returns: the number of the found subpattern, or -1 if not found 1258*/ 1259 1260static int 1261find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode) 1262{ 1263uschar *ptr = (uschar *)cd->start_pattern; 1264int count = 0; 1265int rc; 1266 1267/* If the pattern does not start with an opening parenthesis, the first call 1268to find_parens_sub() will scan right to the end (if necessary). However, if it 1269does start with a parenthesis, find_parens_sub() will return when it hits the 1270matching closing parens. That is why we have to have a loop. 
*/ 1271 1272for (;;) 1273 { 1274 rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count); 1275 if (rc > 0 || *ptr++ == 0) break; 1276 } 1277 1278return rc; 1279} 1280 1281 1282 1283 1284/************************************************* 1285* Find first significant op code * 1286*************************************************/ 1287 1288/* This is called by several functions that scan a compiled expression looking 1289for a fixed first character, or an anchoring op code etc. It skips over things 1290that do not influence this. For some calls, a change of option is important. 1291For some calls, it makes sense to skip negative forward and all backward 1292assertions, and also the \b assertion; for others it does not. 1293 1294Arguments: 1295 code pointer to the start of the group 1296 options pointer to external options 1297 optbit the option bit whose changing is significant, or 1298 zero if none are 1299 skipassert TRUE if certain assertions are to be skipped 1300 1301Returns: pointer to the first significant opcode 1302*/ 1303 1304static const uschar* 1305first_significant_code(const uschar *code, int *options, int optbit, 1306 BOOL skipassert) 1307{ 1308for (;;) 1309 { 1310 switch ((int)*code) 1311 { 1312 case OP_OPT: 1313 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit)) 1314 *options = (int)code[1]; 1315 code += 2; 1316 break; 1317 1318 case OP_ASSERT_NOT: 1319 case OP_ASSERTBACK: 1320 case OP_ASSERTBACK_NOT: 1321 if (!skipassert) return code; 1322 do code += GET(code, 1); while (*code == OP_ALT); 1323 code += _pcre_OP_lengths[*code]; 1324 break; 1325 1326 case OP_WORD_BOUNDARY: 1327 case OP_NOT_WORD_BOUNDARY: 1328 if (!skipassert) return code; 1329 /* Fall through */ 1330 1331 case OP_CALLOUT: 1332 case OP_CREF: 1333 case OP_NCREF: 1334 case OP_RREF: 1335 case OP_NRREF: 1336 case OP_DEF: 1337 code += _pcre_OP_lengths[*code]; 1338 break; 1339 1340 default: 1341 return code; 1342 } 1343 } 1344/* Control never reaches here */ 1345} 1346 1347 
1348 1349 1350/************************************************* 1351* Find the fixed length of a branch * 1352*************************************************/ 1353 1354/* Scan a branch and compute the fixed length of subject that will match it, 1355if the length is fixed. This is needed for dealing with backward assertions. 1356In UTF8 mode, the result is in characters rather than bytes. The branch is 1357temporarily terminated with OP_END when this function is called. 1358 1359This function is called when a backward assertion is encountered, so that if it 1360fails, the error message can point to the correct place in the pattern. 1361However, we cannot do this when the assertion contains subroutine calls, 1362because they can be forward references. We solve this by remembering this case 1363and doing the check at the end; a flag specifies which mode we are running in. 1364 1365Arguments: 1366 code points to the start of the pattern (the bracket) 1367 options the compiling options 1368 atend TRUE if called when the pattern is complete 1369 cd the "compile data" structure 1370 1371Returns: the fixed length, 1372 or -1 if there is no fixed length, 1373 or -2 if \C was encountered 1374 or -3 if an OP_RECURSE item was encountered and atend is FALSE 1375*/ 1376 1377static int 1378find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd) 1379{ 1380int length = -1; 1381 1382register int branchlength = 0; 1383register uschar *cc = code + 1 + LINK_SIZE; 1384 1385/* Scan along the opcodes for this branch. If we get to the end of the 1386branch, check the length against that of the other branches. */ 1387 1388for (;;) 1389 { 1390 int d; 1391 uschar *ce, *cs; 1392 register int op = *cc; 1393 switch (op) 1394 { 1395 case OP_CBRA: 1396 case OP_BRA: 1397 case OP_ONCE: 1398 case OP_COND: 1399 d = find_fixedlength(cc + ((op == OP_CBRA)? 
2:0), options, atend, cd); 1400 if (d < 0) return d; 1401 branchlength += d; 1402 do cc += GET(cc, 1); while (*cc == OP_ALT); 1403 cc += 1 + LINK_SIZE; 1404 break; 1405 1406 /* Reached end of a branch; if it's a ket it is the end of a nested 1407 call. If it's ALT it is an alternation in a nested call. If it is 1408 END it's the end of the outer call. All can be handled by the same code. */ 1409 1410 case OP_ALT: 1411 case OP_KET: 1412 case OP_KETRMAX: 1413 case OP_KETRMIN: 1414 case OP_END: 1415 if (length < 0) length = branchlength; 1416 else if (length != branchlength) return -1; 1417 if (*cc != OP_ALT) return length; 1418 cc += 1 + LINK_SIZE; 1419 branchlength = 0; 1420 break; 1421 1422 /* A true recursion implies not fixed length, but a subroutine call may 1423 be OK. If the subroutine is a forward reference, we can't deal with 1424 it until the end of the pattern, so return -3. */ 1425 1426 case OP_RECURSE: 1427 if (!atend) return -3; 1428 cs = ce = (uschar *)cd->start_code + GET(cc, 1); /* Start subpattern */ 1429 do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */ 1430 if (cc > cs && cc < ce) return -1; /* Recursion */ 1431 d = find_fixedlength(cs + 2, options, atend, cd); 1432 if (d < 0) return d; 1433 branchlength += d; 1434 cc += 1 + LINK_SIZE; 1435 break; 1436 1437 /* Skip over assertive subpatterns */ 1438 1439 case OP_ASSERT: 1440 case OP_ASSERT_NOT: 1441 case OP_ASSERTBACK: 1442 case OP_ASSERTBACK_NOT: 1443 do cc += GET(cc, 1); while (*cc == OP_ALT); 1444 /* Fall through */ 1445 1446 /* Skip over things that don't match chars */ 1447 1448 case OP_REVERSE: 1449 case OP_CREF: 1450 case OP_NCREF: 1451 case OP_RREF: 1452 case OP_NRREF: 1453 case OP_DEF: 1454 case OP_OPT: 1455 case OP_CALLOUT: 1456 case OP_SOD: 1457 case OP_SOM: 1458 case OP_SET_SOM: 1459 case OP_EOD: 1460 case OP_EODN: 1461 case OP_CIRC: 1462 case OP_DOLL: 1463 case OP_NOT_WORD_BOUNDARY: 1464 case OP_WORD_BOUNDARY: 1465 cc += _pcre_OP_lengths[*cc]; 1466 break; 1467 1468 /* 
Handle literal characters */ 1469 1470 case OP_CHAR: 1471 case OP_CHARNC: 1472 case OP_NOT: 1473 branchlength++; 1474 cc += 2; 1475#ifdef SUPPORT_UTF8 1476 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0) 1477 cc += _pcre_utf8_table4[cc[-1] & 0x3f]; 1478#endif 1479 break; 1480 1481 /* Handle exact repetitions. The count is already in characters, but we 1482 need to skip over a multibyte character in UTF8 mode. */ 1483 1484 case OP_EXACT: 1485 branchlength += GET2(cc,1); 1486 cc += 4; 1487#ifdef SUPPORT_UTF8 1488 if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0) 1489 cc += _pcre_utf8_table4[cc[-1] & 0x3f]; 1490#endif 1491 break; 1492 1493 case OP_TYPEEXACT: 1494 branchlength += GET2(cc,1); 1495 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2; 1496 cc += 4; 1497 break; 1498 1499 /* Handle single-char matchers */ 1500 1501 case OP_PROP: 1502 case OP_NOTPROP: 1503 cc += 2; 1504 /* Fall through */ 1505 1506 case OP_NOT_DIGIT: 1507 case OP_DIGIT: 1508 case OP_NOT_WHITESPACE: 1509 case OP_WHITESPACE: 1510 case OP_NOT_WORDCHAR: 1511 case OP_WORDCHAR: 1512 case OP_ANY: 1513 case OP_ALLANY: 1514 branchlength++; 1515 cc++; 1516 break; 1517 1518 /* The single-byte matcher isn't allowed */ 1519 1520 case OP_ANYBYTE: 1521 return -2; 1522 1523 /* Check a class for variable quantification */ 1524 1525#ifdef SUPPORT_UTF8 1526 case OP_XCLASS: 1527 cc += GET(cc, 1) - 33; 1528 /* Fall through */ 1529#endif 1530 1531 case OP_CLASS: 1532 case OP_NCLASS: 1533 cc += 33; 1534 1535 switch (*cc) 1536 { 1537 case OP_CRSTAR: 1538 case OP_CRMINSTAR: 1539 case OP_CRQUERY: 1540 case OP_CRMINQUERY: 1541 return -1; 1542 1543 case OP_CRRANGE: 1544 case OP_CRMINRANGE: 1545 if (GET2(cc,1) != GET2(cc,3)) return -1; 1546 branchlength += GET2(cc,1); 1547 cc += 5; 1548 break; 1549 1550 default: 1551 branchlength++; 1552 } 1553 break; 1554 1555 /* Anything else is variable length */ 1556 1557 default: 1558 return -1; 1559 } 1560 } 1561/* Control never gets here */ 1562} 1563 1564 1565 1566 
1567/************************************************* 1568* Scan compiled regex for specific bracket * 1569*************************************************/ 1570 1571/* This little function scans through a compiled pattern until it finds a 1572capturing bracket with the given number, or, if the number is negative, an 1573instance of OP_REVERSE for a lookbehind. The function is global in the C sense 1574so that it can be called from pcre_study() when finding the minimum matching 1575length. 1576 1577Arguments: 1578 code points to start of expression 1579 utf8 TRUE in UTF-8 mode 1580 number the required bracket number or negative to find a lookbehind 1581 1582Returns: pointer to the opcode for the bracket, or NULL if not found 1583*/ 1584 1585const uschar * 1586_pcre_find_bracket(const uschar *code, BOOL utf8, int number) 1587{ 1588for (;;) 1589 { 1590 register int c = *code; 1591 if (c == OP_END) return NULL; 1592 1593 /* XCLASS is used for classes that cannot be represented just by a bit 1594 map. This includes negated single high-valued characters. The length in 1595 the table is zero; the actual length is stored in the compiled code. */ 1596 1597 if (c == OP_XCLASS) code += GET(code, 1); 1598 1599 /* Handle recursion */ 1600 1601 else if (c == OP_REVERSE) 1602 { 1603 if (number < 0) return (uschar *)code; 1604 code += _pcre_OP_lengths[c]; 1605 } 1606 1607 /* Handle capturing bracket */ 1608 1609 else if (c == OP_CBRA) 1610 { 1611 int n = GET2(code, 1+LINK_SIZE); 1612 if (n == number) return (uschar *)code; 1613 code += _pcre_OP_lengths[c]; 1614 } 1615 1616 /* Otherwise, we can get the item's length from the table, except that for 1617 repeated character types, we have to test for \p and \P, which have an extra 1618 two bytes of parameters. 
*/ 1619 1620 else 1621 { 1622 switch(c) 1623 { 1624 case OP_TYPESTAR: 1625 case OP_TYPEMINSTAR: 1626 case OP_TYPEPLUS: 1627 case OP_TYPEMINPLUS: 1628 case OP_TYPEQUERY: 1629 case OP_TYPEMINQUERY: 1630 case OP_TYPEPOSSTAR: 1631 case OP_TYPEPOSPLUS: 1632 case OP_TYPEPOSQUERY: 1633 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; 1634 break; 1635 1636 case OP_TYPEUPTO: 1637 case OP_TYPEMINUPTO: 1638 case OP_TYPEEXACT: 1639 case OP_TYPEPOSUPTO: 1640 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2; 1641 break; 1642 } 1643 1644 /* Add in the fixed length from the table */ 1645 1646 code += _pcre_OP_lengths[c]; 1647 1648 /* In UTF-8 mode, opcodes that are followed by a character may be followed by 1649 a multi-byte character. The length in the table is a minimum, so we have to 1650 arrange to skip the extra bytes. */ 1651 1652#ifdef SUPPORT_UTF8 1653 if (utf8) switch(c) 1654 { 1655 case OP_CHAR: 1656 case OP_CHARNC: 1657 case OP_EXACT: 1658 case OP_UPTO: 1659 case OP_MINUPTO: 1660 case OP_POSUPTO: 1661 case OP_STAR: 1662 case OP_MINSTAR: 1663 case OP_POSSTAR: 1664 case OP_PLUS: 1665 case OP_MINPLUS: 1666 case OP_POSPLUS: 1667 case OP_QUERY: 1668 case OP_MINQUERY: 1669 case OP_POSQUERY: 1670 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f]; 1671 break; 1672 } 1673#else 1674 (void)(utf8); /* Keep compiler happy by referencing function argument */ 1675#endif 1676 } 1677 } 1678} 1679 1680 1681 1682/************************************************* 1683* Scan compiled regex for recursion reference * 1684*************************************************/ 1685 1686/* This little function scans through a compiled pattern until it finds an 1687instance of OP_RECURSE. 
1688 1689Arguments: 1690 code points to start of expression 1691 utf8 TRUE in UTF-8 mode 1692 1693Returns: pointer to the opcode for OP_RECURSE, or NULL if not found 1694*/ 1695 1696static const uschar * 1697find_recurse(const uschar *code, BOOL utf8) 1698{ 1699for (;;) 1700 { 1701 register int c = *code; 1702 if (c == OP_END) return NULL; 1703 if (c == OP_RECURSE) return code; 1704 1705 /* XCLASS is used for classes that cannot be represented just by a bit 1706 map. This includes negated single high-valued characters. The length in 1707 the table is zero; the actual length is stored in the compiled code. */ 1708 1709 if (c == OP_XCLASS) code += GET(code, 1); 1710 1711 /* Otherwise, we can get the item's length from the table, except that for 1712 repeated character types, we have to test for \p and \P, which have an extra 1713 two bytes of parameters. */ 1714 1715 else 1716 { 1717 switch(c) 1718 { 1719 case OP_TYPESTAR: 1720 case OP_TYPEMINSTAR: 1721 case OP_TYPEPLUS: 1722 case OP_TYPEMINPLUS: 1723 case OP_TYPEQUERY: 1724 case OP_TYPEMINQUERY: 1725 case OP_TYPEPOSSTAR: 1726 case OP_TYPEPOSPLUS: 1727 case OP_TYPEPOSQUERY: 1728 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; 1729 break; 1730 1731 case OP_TYPEPOSUPTO: 1732 case OP_TYPEUPTO: 1733 case OP_TYPEMINUPTO: 1734 case OP_TYPEEXACT: 1735 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2; 1736 break; 1737 } 1738 1739 /* Add in the fixed length from the table */ 1740 1741 code += _pcre_OP_lengths[c]; 1742 1743 /* In UTF-8 mode, opcodes that are followed by a character may be followed 1744 by a multi-byte character. The length in the table is a minimum, so we have 1745 to arrange to skip the extra bytes. 
*/ 1746 1747#ifdef SUPPORT_UTF8 1748 if (utf8) switch(c) 1749 { 1750 case OP_CHAR: 1751 case OP_CHARNC: 1752 case OP_EXACT: 1753 case OP_UPTO: 1754 case OP_MINUPTO: 1755 case OP_POSUPTO: 1756 case OP_STAR: 1757 case OP_MINSTAR: 1758 case OP_POSSTAR: 1759 case OP_PLUS: 1760 case OP_MINPLUS: 1761 case OP_POSPLUS: 1762 case OP_QUERY: 1763 case OP_MINQUERY: 1764 case OP_POSQUERY: 1765 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f]; 1766 break; 1767 } 1768#else 1769 (void)(utf8); /* Keep compiler happy by referencing function argument */ 1770#endif 1771 } 1772 } 1773} 1774 1775 1776 1777/************************************************* 1778* Scan compiled branch for non-emptiness * 1779*************************************************/ 1780 1781/* This function scans through a branch of a compiled pattern to see whether it 1782can match the empty string or not. It is called from could_be_empty() 1783below and from compile_branch() when checking for an unlimited repeat of a 1784group that can match nothing. Note that first_significant_code() skips over 1785backward and negative forward assertions when its final argument is TRUE. If we 1786hit an unclosed bracket, we return "empty" - this means we've struck an inner 1787bracket whose current branch will already have been scanned. 1788 1789Arguments: 1790 code points to start of search 1791 endcode points to where to stop 1792 utf8 TRUE if in UTF8 mode 1793 cd contains pointers to tables etc. 
1794 1795Returns: TRUE if what is matched could be empty 1796*/ 1797 1798static BOOL 1799could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8, 1800 compile_data *cd) 1801{ 1802register int c; 1803for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE); 1804 code < endcode; 1805 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE)) 1806 { 1807 const uschar *ccode; 1808 1809 c = *code; 1810 1811 /* Skip over forward assertions; the other assertions are skipped by 1812 first_significant_code() with a TRUE final argument. */ 1813 1814 if (c == OP_ASSERT) 1815 { 1816 do code += GET(code, 1); while (*code == OP_ALT); 1817 c = *code; 1818 continue; 1819 } 1820 1821 /* Groups with zero repeats can of course be empty; skip them. */ 1822 1823 if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO) 1824 { 1825 code += _pcre_OP_lengths[c]; 1826 do code += GET(code, 1); while (*code == OP_ALT); 1827 c = *code; 1828 continue; 1829 } 1830 1831 /* For a recursion/subroutine call, if its end has been reached, which 1832 implies a subroutine call, we can scan it. */ 1833 1834 if (c == OP_RECURSE) 1835 { 1836 BOOL empty_branch = FALSE; 1837 const uschar *scode = cd->start_code + GET(code, 1); 1838 if (GET(scode, 1) == 0) return TRUE; /* Unclosed */ 1839 do 1840 { 1841 if (could_be_empty_branch(scode, endcode, utf8, cd)) 1842 { 1843 empty_branch = TRUE; 1844 break; 1845 } 1846 scode += GET(scode, 1); 1847 } 1848 while (*scode == OP_ALT); 1849 if (!empty_branch) return FALSE; /* All branches are non-empty */ 1850 continue; 1851 } 1852 1853 /* For other groups, scan the branches. 
*/ 1854 1855 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND) 1856 { 1857 BOOL empty_branch; 1858 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */ 1859 1860 /* If a conditional group has only one branch, there is a second, implied, 1861 empty branch, so just skip over the conditional, because it could be empty. 1862 Otherwise, scan the individual branches of the group. */ 1863 1864 if (c == OP_COND && code[GET(code, 1)] != OP_ALT) 1865 code += GET(code, 1); 1866 else 1867 { 1868 empty_branch = FALSE; 1869 do 1870 { 1871 if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd)) 1872 empty_branch = TRUE; 1873 code += GET(code, 1); 1874 } 1875 while (*code == OP_ALT); 1876 if (!empty_branch) return FALSE; /* All branches are non-empty */ 1877 } 1878 1879 c = *code; 1880 continue; 1881 } 1882 1883 /* Handle the other opcodes */ 1884 1885 switch (c) 1886 { 1887 /* Check for quantifiers after a class. XCLASS is used for classes that 1888 cannot be represented just by a bit map. This includes negated single 1889 high-valued characters. The length in _pcre_OP_lengths[] is zero; the 1890 actual length is stored in the compiled code, so we must update "code" 1891 here. 
*/ 1892 1893#ifdef SUPPORT_UTF8 1894 case OP_XCLASS: 1895 ccode = code += GET(code, 1); 1896 goto CHECK_CLASS_REPEAT; 1897#endif 1898 1899 case OP_CLASS: 1900 case OP_NCLASS: 1901 ccode = code + 33; 1902 1903#ifdef SUPPORT_UTF8 1904 CHECK_CLASS_REPEAT: 1905#endif 1906 1907 switch (*ccode) 1908 { 1909 case OP_CRSTAR: /* These could be empty; continue */ 1910 case OP_CRMINSTAR: 1911 case OP_CRQUERY: 1912 case OP_CRMINQUERY: 1913 break; 1914 1915 default: /* Non-repeat => class must match */ 1916 case OP_CRPLUS: /* These repeats aren't empty */ 1917 case OP_CRMINPLUS: 1918 return FALSE; 1919 1920 case OP_CRRANGE: 1921 case OP_CRMINRANGE: 1922 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */ 1923 break; 1924 } 1925 break; 1926 1927 /* Opcodes that must match a character */ 1928 1929 case OP_PROP: 1930 case OP_NOTPROP: 1931 case OP_EXTUNI: 1932 case OP_NOT_DIGIT: 1933 case OP_DIGIT: 1934 case OP_NOT_WHITESPACE: 1935 case OP_WHITESPACE: 1936 case OP_NOT_WORDCHAR: 1937 case OP_WORDCHAR: 1938 case OP_ANY: 1939 case OP_ALLANY: 1940 case OP_ANYBYTE: 1941 case OP_CHAR: 1942 case OP_CHARNC: 1943 case OP_NOT: 1944 case OP_PLUS: 1945 case OP_MINPLUS: 1946 case OP_POSPLUS: 1947 case OP_EXACT: 1948 case OP_NOTPLUS: 1949 case OP_NOTMINPLUS: 1950 case OP_NOTPOSPLUS: 1951 case OP_NOTEXACT: 1952 case OP_TYPEPLUS: 1953 case OP_TYPEMINPLUS: 1954 case OP_TYPEPOSPLUS: 1955 case OP_TYPEEXACT: 1956 return FALSE; 1957 1958 /* These are going to continue, as they may be empty, but we have to 1959 fudge the length for the \p and \P cases. 
*/ 1960 1961 case OP_TYPESTAR: 1962 case OP_TYPEMINSTAR: 1963 case OP_TYPEPOSSTAR: 1964 case OP_TYPEQUERY: 1965 case OP_TYPEMINQUERY: 1966 case OP_TYPEPOSQUERY: 1967 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; 1968 break; 1969 1970 /* Same for these */ 1971 1972 case OP_TYPEUPTO: 1973 case OP_TYPEMINUPTO: 1974 case OP_TYPEPOSUPTO: 1975 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2; 1976 break; 1977 1978 /* End of branch */ 1979 1980 case OP_KET: 1981 case OP_KETRMAX: 1982 case OP_KETRMIN: 1983 case OP_ALT: 1984 return TRUE; 1985 1986 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO, 1987 MINUPTO, and POSUPTO may be followed by a multibyte character */ 1988 1989#ifdef SUPPORT_UTF8 1990 case OP_STAR: 1991 case OP_MINSTAR: 1992 case OP_POSSTAR: 1993 case OP_QUERY: 1994 case OP_MINQUERY: 1995 case OP_POSQUERY: 1996 if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f]; 1997 break; 1998 1999 case OP_UPTO: 2000 case OP_MINUPTO: 2001 case OP_POSUPTO: 2002 if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f]; 2003 break; 2004#endif 2005 2006 /* None of the remaining opcodes are required to match a character. */ 2007 2008 default: 2009 break; 2010 } 2011 } 2012 2013return TRUE; 2014} 2015 2016 2017 2018/************************************************* 2019* Scan compiled regex for non-emptiness * 2020*************************************************/ 2021 2022/* This function is called to check for left recursive calls. We want to check 2023the current branch of the current pattern to see if it could match the empty 2024string. If it could, we must look outwards for branches at other levels, 2025stopping when we pass beyond the bracket which is the subject of the recursion. 
2026 2027Arguments: 2028 code points to start of the recursion 2029 endcode points to where to stop (current RECURSE item) 2030 bcptr points to the chain of current (unclosed) branch starts 2031 utf8 TRUE if in UTF-8 mode 2032 cd pointers to tables etc 2033 2034Returns: TRUE if what is matched could be empty 2035*/ 2036 2037static BOOL 2038could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr, 2039 BOOL utf8, compile_data *cd) 2040{ 2041while (bcptr != NULL && bcptr->current_branch >= code) 2042 { 2043 if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd)) 2044 return FALSE; 2045 bcptr = bcptr->outer; 2046 } 2047return TRUE; 2048} 2049 2050 2051 2052/************************************************* 2053* Check for POSIX class syntax * 2054*************************************************/ 2055 2056/* This function is called when the sequence "[:" or "[." or "[=" is 2057encountered in a character class. It checks whether this is followed by a 2058sequence of characters terminated by a matching ":]" or ".]" or "=]". If we 2059reach an unescaped ']' without the special preceding character, return FALSE. 2060 2061Originally, this function only recognized a sequence of letters between the 2062terminators, but it seems that Perl recognizes any sequence of characters, 2063though of course unknown POSIX names are subsequently rejected. Perl gives an 2064"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE 2065didn't consider this to be a POSIX class. Likewise for [:1234:]. 2066 2067The problem in trying to be exactly like Perl is in the handling of escapes. We 2068have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX 2069class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code 2070below handles the special case of \], but does not try to do any other escape 2071processing. 
This makes it different from Perl for cases such as [:l\ower:] 2072where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize 2073"l\ower". This is a lesser evil that not diagnosing bad classes when Perl does, 2074I think. 2075 2076Arguments: 2077 ptr pointer to the initial [ 2078 endptr where to return the end pointer 2079 2080Returns: TRUE or FALSE 2081*/ 2082 2083static BOOL 2084check_posix_syntax(const uschar *ptr, const uschar **endptr) 2085{ 2086int terminator; /* Don't combine these lines; the Solaris cc */ 2087terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */ 2088for (++ptr; *ptr != 0; ptr++) 2089 { 2090 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else 2091 { 2092 if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE; 2093 if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) 2094 { 2095 *endptr = ptr; 2096 return TRUE; 2097 } 2098 } 2099 } 2100return FALSE; 2101} 2102 2103 2104 2105 2106/************************************************* 2107* Check POSIX class name * 2108*************************************************/ 2109 2110/* This function is called to check the name given in a POSIX-style class entry 2111such as [:alnum:]. 
2112 2113Arguments: 2114 ptr points to the first letter 2115 len the length of the name 2116 2117Returns: a value representing the name, or -1 if unknown 2118*/ 2119 2120static int 2121check_posix_name(const uschar *ptr, int len) 2122{ 2123const char *pn = posix_names; 2124register int yield = 0; 2125while (posix_name_lengths[yield] != 0) 2126 { 2127 if (len == posix_name_lengths[yield] && 2128 strncmp((const char *)ptr, pn, len) == 0) return yield; 2129 pn += posix_name_lengths[yield] + 1; 2130 yield++; 2131 } 2132return -1; 2133} 2134 2135 2136/************************************************* 2137* Adjust OP_RECURSE items in repeated group * 2138*************************************************/ 2139 2140/* OP_RECURSE items contain an offset from the start of the regex to the group 2141that is referenced. This means that groups can be replicated for fixed 2142repetition simply by copying (because the recursion is allowed to refer to 2143earlier groups that are outside the current group). However, when a group is 2144optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is 2145inserted before it, after it has been compiled. This means that any OP_RECURSE 2146items within it that refer to the group itself or any contained groups have to 2147have their offsets adjusted. That one of the jobs of this function. Before it 2148is called, the partially compiled regex must be temporarily terminated with 2149OP_END. 2150 2151This function has been extended with the possibility of forward references for 2152recursions and subroutine calls. It must also check the list of such references 2153for the group we are dealing with. If it finds that one of the recursions in 2154the current group is on this list, it adjusts the offset in the list, not the 2155value in the reference (which is a group number). 
2156 2157Arguments: 2158 group points to the start of the group 2159 adjust the amount by which the group is to be moved 2160 utf8 TRUE in UTF-8 mode 2161 cd contains pointers to tables etc. 2162 save_hwm the hwm forward reference pointer at the start of the group 2163 2164Returns: nothing 2165*/ 2166 2167static void 2168adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd, 2169 uschar *save_hwm) 2170{ 2171uschar *ptr = group; 2172 2173while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL) 2174 { 2175 int offset; 2176 uschar *hc; 2177 2178 /* See if this recursion is on the forward reference list. If so, adjust the 2179 reference. */ 2180 2181 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE) 2182 { 2183 offset = GET(hc, 0); 2184 if (cd->start_code + offset == ptr + 1) 2185 { 2186 PUT(hc, 0, offset + adjust); 2187 break; 2188 } 2189 } 2190 2191 /* Otherwise, adjust the recursion offset if it's after the start of this 2192 group. */ 2193 2194 if (hc >= cd->hwm) 2195 { 2196 offset = GET(ptr, 1); 2197 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust); 2198 } 2199 2200 ptr += 1 + LINK_SIZE; 2201 } 2202} 2203 2204 2205 2206/************************************************* 2207* Insert an automatic callout point * 2208*************************************************/ 2209 2210/* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert 2211callout points before each pattern item. 
2212 2213Arguments: 2214 code current code pointer 2215 ptr current pattern pointer 2216 cd pointers to tables etc 2217 2218Returns: new code pointer 2219*/ 2220 2221static uschar * 2222auto_callout(uschar *code, const uschar *ptr, compile_data *cd) 2223{ 2224*code++ = OP_CALLOUT; 2225*code++ = 255; 2226PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */ 2227PUT(code, LINK_SIZE, 0); /* Default length */ 2228return code + 2*LINK_SIZE; 2229} 2230 2231 2232 2233/************************************************* 2234* Complete a callout item * 2235*************************************************/ 2236 2237/* A callout item contains the length of the next item in the pattern, which 2238we can't fill in till after we have reached the relevant point. This is used 2239for both automatic and manual callouts. 2240 2241Arguments: 2242 previous_callout points to previous callout item 2243 ptr current pattern pointer 2244 cd pointers to tables etc 2245 2246Returns: nothing 2247*/ 2248 2249static void 2250complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd) 2251{ 2252int length = ptr - cd->start_pattern - GET(previous_callout, 2); 2253PUT(previous_callout, 2 + LINK_SIZE, length); 2254} 2255 2256 2257 2258#ifdef SUPPORT_UCP 2259/************************************************* 2260* Get othercase range * 2261*************************************************/ 2262 2263/* This function is passed the start and end of a class range, in UTF-8 mode 2264with UCP support. It searches up the characters, looking for internal ranges of 2265characters in the "other" case. Each call returns the next one, updating the 2266start address. 
2267 2268Arguments: 2269 cptr points to starting character value; updated 2270 d end value 2271 ocptr where to put start of othercase range 2272 odptr where to put end of othercase range 2273 2274Yield: TRUE when range returned; FALSE when no more 2275*/ 2276 2277static BOOL 2278get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr, 2279 unsigned int *odptr) 2280{ 2281unsigned int c, othercase, next; 2282 2283for (c = *cptr; c <= d; c++) 2284 { if ((othercase = UCD_OTHERCASE(c)) != c) break; } 2285 2286if (c > d) return FALSE; 2287 2288*ocptr = othercase; 2289next = othercase + 1; 2290 2291for (++c; c <= d; c++) 2292 { 2293 if (UCD_OTHERCASE(c) != next) break; 2294 next++; 2295 } 2296 2297*odptr = next - 1; 2298*cptr = c; 2299 2300return TRUE; 2301} 2302#endif /* SUPPORT_UCP */ 2303 2304 2305 2306/************************************************* 2307* Check if auto-possessifying is possible * 2308*************************************************/ 2309 2310/* This function is called for unlimited repeats of certain items, to see 2311whether the next thing could possibly match the repeated item. If not, it makes 2312sense to automatically possessify the repeated item. 2313 2314Arguments: 2315 op_code the repeated op code 2316 this data for this item, depends on the opcode 2317 utf8 TRUE in UTF-8 mode 2318 utf8_char used for utf8 character bytes, NULL if not relevant 2319 ptr next character in pattern 2320 options options bits 2321 cd contains pointers to tables etc. 
2322 2323Returns: TRUE if possessifying is wanted 2324*/ 2325 2326static BOOL 2327check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char, 2328 const uschar *ptr, int options, compile_data *cd) 2329{ 2330int next; 2331 2332/* Skip whitespace and comments in extended mode */ 2333 2334if ((options & PCRE_EXTENDED) != 0) 2335 { 2336 for (;;) 2337 { 2338 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++; 2339 if (*ptr == CHAR_NUMBER_SIGN) 2340 { 2341 while (*(++ptr) != 0) 2342 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } 2343 } 2344 else break; 2345 } 2346 } 2347 2348/* If the next item is one that we can handle, get its value. A non-negative 2349value is a character, a negative value is an escape value. */ 2350 2351if (*ptr == CHAR_BACKSLASH) 2352 { 2353 int temperrorcode = 0; 2354 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE); 2355 if (temperrorcode != 0) return FALSE; 2356 ptr++; /* Point after the escape sequence */ 2357 } 2358 2359else if ((cd->ctypes[*ptr] & ctype_meta) == 0) 2360 { 2361#ifdef SUPPORT_UTF8 2362 if (utf8) { GETCHARINC(next, ptr); } else 2363#endif 2364 next = *ptr++; 2365 } 2366 2367else return FALSE; 2368 2369/* Skip whitespace and comments in extended mode */ 2370 2371if ((options & PCRE_EXTENDED) != 0) 2372 { 2373 for (;;) 2374 { 2375 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++; 2376 if (*ptr == CHAR_NUMBER_SIGN) 2377 { 2378 while (*(++ptr) != 0) 2379 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } 2380 } 2381 else break; 2382 } 2383 } 2384 2385/* If the next thing is itself optional, we have to give up. */ 2386 2387if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK || 2388 strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0) 2389 return FALSE; 2390 2391/* Now compare the next item with the previous opcode. 
If the previous is a 2392positive single character match, "item" either contains the character or, if 2393"item" is greater than 127 in utf8 mode, the character's bytes are in 2394utf8_char. */ 2395 2396 2397/* Handle cases when the next item is a character. */ 2398 2399if (next >= 0) switch(op_code) 2400 { 2401 case OP_CHAR: 2402#ifdef SUPPORT_UTF8 2403 if (utf8 && item > 127) { GETCHAR(item, utf8_char); } 2404#else 2405 (void)(utf8_char); /* Keep compiler happy by referencing function argument */ 2406#endif 2407 return item != next; 2408 2409 /* For CHARNC (caseless character) we must check the other case. If we have 2410 Unicode property support, we can use it to test the other case of 2411 high-valued characters. */ 2412 2413 case OP_CHARNC: 2414#ifdef SUPPORT_UTF8 2415 if (utf8 && item > 127) { GETCHAR(item, utf8_char); } 2416#endif 2417 if (item == next) return FALSE; 2418#ifdef SUPPORT_UTF8 2419 if (utf8) 2420 { 2421 unsigned int othercase; 2422 if (next < 128) othercase = cd->fcc[next]; else 2423#ifdef SUPPORT_UCP 2424 othercase = UCD_OTHERCASE((unsigned int)next); 2425#else 2426 othercase = NOTACHAR; 2427#endif 2428 return (unsigned int)item != othercase; 2429 } 2430 else 2431#endif /* SUPPORT_UTF8 */ 2432 return (item != cd->fcc[next]); /* Non-UTF-8 mode */ 2433 2434 /* For OP_NOT, "item" must be a single-byte character. 
*/ 2435 2436 case OP_NOT: 2437 if (item == next) return TRUE; 2438 if ((options & PCRE_CASELESS) == 0) return FALSE; 2439#ifdef SUPPORT_UTF8 2440 if (utf8) 2441 { 2442 unsigned int othercase; 2443 if (next < 128) othercase = cd->fcc[next]; else 2444#ifdef SUPPORT_UCP 2445 othercase = UCD_OTHERCASE(next); 2446#else 2447 othercase = NOTACHAR; 2448#endif 2449 return (unsigned int)item == othercase; 2450 } 2451 else 2452#endif /* SUPPORT_UTF8 */ 2453 return (item == cd->fcc[next]); /* Non-UTF-8 mode */ 2454 2455 case OP_DIGIT: 2456 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0; 2457 2458 case OP_NOT_DIGIT: 2459 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0; 2460 2461 case OP_WHITESPACE: 2462 return next > 127 || (cd->ctypes[next] & ctype_space) == 0; 2463 2464 case OP_NOT_WHITESPACE: 2465 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0; 2466 2467 case OP_WORDCHAR: 2468 return next > 127 || (cd->ctypes[next] & ctype_word) == 0; 2469 2470 case OP_NOT_WORDCHAR: 2471 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0; 2472 2473 case OP_HSPACE: 2474 case OP_NOT_HSPACE: 2475 switch(next) 2476 { 2477 case 0x09: 2478 case 0x20: 2479 case 0xa0: 2480 case 0x1680: 2481 case 0x180e: 2482 case 0x2000: 2483 case 0x2001: 2484 case 0x2002: 2485 case 0x2003: 2486 case 0x2004: 2487 case 0x2005: 2488 case 0x2006: 2489 case 0x2007: 2490 case 0x2008: 2491 case 0x2009: 2492 case 0x200A: 2493 case 0x202f: 2494 case 0x205f: 2495 case 0x3000: 2496 return op_code != OP_HSPACE; 2497 default: 2498 return op_code == OP_HSPACE; 2499 } 2500 2501 case OP_VSPACE: 2502 case OP_NOT_VSPACE: 2503 switch(next) 2504 { 2505 case 0x0a: 2506 case 0x0b: 2507 case 0x0c: 2508 case 0x0d: 2509 case 0x85: 2510 case 0x2028: 2511 case 0x2029: 2512 return op_code != OP_VSPACE; 2513 default: 2514 return op_code == OP_VSPACE; 2515 } 2516 2517 default: 2518 return FALSE; 2519 } 2520 2521 2522/* Handle the case when the next item is \d, \s, etc. 
*/ 2523 2524switch(op_code) 2525 { 2526 case OP_CHAR: 2527 case OP_CHARNC: 2528#ifdef SUPPORT_UTF8 2529 if (utf8 && item > 127) { GETCHAR(item, utf8_char); } 2530#endif 2531 switch(-next) 2532 { 2533 case ESC_d: 2534 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0; 2535 2536 case ESC_D: 2537 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0; 2538 2539 case ESC_s: 2540 return item > 127 || (cd->ctypes[item] & ctype_space) == 0; 2541 2542 case ESC_S: 2543 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0; 2544 2545 case ESC_w: 2546 return item > 127 || (cd->ctypes[item] & ctype_word) == 0; 2547 2548 case ESC_W: 2549 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0; 2550 2551 case ESC_h: 2552 case ESC_H: 2553 switch(item) 2554 { 2555 case 0x09: 2556 case 0x20: 2557 case 0xa0: 2558 case 0x1680: 2559 case 0x180e: 2560 case 0x2000: 2561 case 0x2001: 2562 case 0x2002: 2563 case 0x2003: 2564 case 0x2004: 2565 case 0x2005: 2566 case 0x2006: 2567 case 0x2007: 2568 case 0x2008: 2569 case 0x2009: 2570 case 0x200A: 2571 case 0x202f: 2572 case 0x205f: 2573 case 0x3000: 2574 return -next != ESC_h; 2575 default: 2576 return -next == ESC_h; 2577 } 2578 2579 case ESC_v: 2580 case ESC_V: 2581 switch(item) 2582 { 2583 case 0x0a: 2584 case 0x0b: 2585 case 0x0c: 2586 case 0x0d: 2587 case 0x85: 2588 case 0x2028: 2589 case 0x2029: 2590 return -next != ESC_v; 2591 default: 2592 return -next == ESC_v; 2593 } 2594 2595 default: 2596 return FALSE; 2597 } 2598 2599 case OP_DIGIT: 2600 return next == -ESC_D || next == -ESC_s || next == -ESC_W || 2601 next == -ESC_h || next == -ESC_v; 2602 2603 case OP_NOT_DIGIT: 2604 return next == -ESC_d; 2605 2606 case OP_WHITESPACE: 2607 return next == -ESC_S || next == -ESC_d || next == -ESC_w; 2608 2609 case OP_NOT_WHITESPACE: 2610 return next == -ESC_s || next == -ESC_h || next == -ESC_v; 2611 2612 case OP_HSPACE: 2613 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w; 2614 2615 case 
OP_NOT_HSPACE: 2616 return next == -ESC_h; 2617 2618 /* Can't have \S in here because VT matches \S (Perl anomaly) */ 2619 case OP_VSPACE: 2620 return next == -ESC_V || next == -ESC_d || next == -ESC_w; 2621 2622 case OP_NOT_VSPACE: 2623 return next == -ESC_v; 2624 2625 case OP_WORDCHAR: 2626 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v; 2627 2628 case OP_NOT_WORDCHAR: 2629 return next == -ESC_w || next == -ESC_d; 2630 2631 default: 2632 return FALSE; 2633 } 2634 2635/* Control does not reach here */ 2636} 2637 2638 2639 2640/************************************************* 2641* Compile one branch * 2642*************************************************/ 2643 2644/* Scan the pattern, compiling it into the a vector. If the options are 2645changed during the branch, the pointer is used to change the external options 2646bits. This function is used during the pre-compile phase when we are trying 2647to find out the amount of memory needed, as well as during the real compile 2648phase. The value of lengthptr distinguishes the two phases. 2649 2650Arguments: 2651 optionsptr pointer to the option bits 2652 codeptr points to the pointer to the current code point 2653 ptrptr points to the current pattern pointer 2654 errorcodeptr points to error code variable 2655 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE) 2656 reqbyteptr set to the last literal character required, else < 0 2657 bcptr points to current branch chain 2658 cd contains pointers to tables etc. 
2659 lengthptr NULL during the real compile phase 2660 points to length accumulator during pre-compile phase 2661 2662Returns: TRUE on success 2663 FALSE, with *errorcodeptr set non-zero on error 2664*/ 2665 2666static BOOL 2667compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr, 2668 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, 2669 compile_data *cd, int *lengthptr) 2670{ 2671int repeat_type, op_type; 2672int repeat_min = 0, repeat_max = 0; /* To please picky compilers */ 2673int bravalue = 0; 2674int greedy_default, greedy_non_default; 2675int firstbyte, reqbyte; 2676int zeroreqbyte, zerofirstbyte; 2677int req_caseopt, reqvary, tempreqvary; 2678int options = *optionsptr; 2679int after_manual_callout = 0; 2680int length_prevgroup = 0; 2681register int c; 2682register uschar *code = *codeptr; 2683uschar *last_code = code; 2684uschar *orig_code = code; 2685uschar *tempcode; 2686BOOL inescq = FALSE; 2687BOOL groupsetfirstbyte = FALSE; 2688const uschar *ptr = *ptrptr; 2689const uschar *tempptr; 2690uschar *previous = NULL; 2691uschar *previous_callout = NULL; 2692uschar *save_hwm = NULL; 2693uschar classbits[32]; 2694 2695#ifdef SUPPORT_UTF8 2696BOOL class_utf8; 2697BOOL utf8 = (options & PCRE_UTF8) != 0; 2698uschar *class_utf8data; 2699uschar *class_utf8data_base; 2700uschar utf8_char[6]; 2701#else 2702BOOL utf8 = FALSE; 2703uschar *utf8_char = NULL; 2704#endif 2705 2706#ifdef PCRE_DEBUG 2707if (lengthptr != NULL) DPRINTF((">> start branch\n")); 2708#endif 2709 2710/* Set up the default and non-default settings for greediness */ 2711 2712greedy_default = ((options & PCRE_UNGREEDY) != 0); 2713greedy_non_default = greedy_default ^ 1; 2714 2715/* Initialize no first byte, no required byte. REQ_UNSET means "no char 2716matching encountered yet". It gets changed to REQ_NONE if we hit something that 2717matches a non-fixed char first char; reqbyte just remains unset if we never 2718find one. 
2719 2720When we hit a repeat whose minimum is zero, we may have to adjust these values 2721to take the zero repeat into account. This is implemented by setting them to 2722zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual 2723item types that can be repeated set these backoff variables appropriately. */ 2724 2725firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET; 2726 2727/* The variable req_caseopt contains either the REQ_CASELESS value or zero, 2728according to the current setting of the caseless flag. REQ_CASELESS is a bit 2729value > 255. It is added into the firstbyte or reqbyte variables to record the 2730case status of the value. This is used only for ASCII characters. */ 2731 2732req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0; 2733 2734/* Switch on next character until the end of the branch */ 2735 2736for (;; ptr++) 2737 { 2738 BOOL negate_class; 2739 BOOL should_flip_negation; 2740 BOOL possessive_quantifier; 2741 BOOL is_quantifier; 2742 BOOL is_recurse; 2743 BOOL reset_bracount; 2744 int class_charcount; 2745 int class_lastchar; 2746 int newoptions; 2747 int recno; 2748 int refsign; 2749 int skipbytes; 2750 int subreqbyte; 2751 int subfirstbyte; 2752 int terminator; 2753 int mclength; 2754 uschar mcbuffer[8]; 2755 2756 /* Get next byte in the pattern */ 2757 2758 c = *ptr; 2759 2760 /* If we are in the pre-compile phase, accumulate the length used for the 2761 previous cycle of this loop. */ 2762 2763 if (lengthptr != NULL) 2764 { 2765#ifdef PCRE_DEBUG 2766 if (code > cd->hwm) cd->hwm = code; /* High water info */ 2767#endif 2768 if (code > cd->start_workspace + WORK_SIZE_CHECK) /* Check for overrun */ 2769 { 2770 *errorcodeptr = ERR52; 2771 goto FAILED; 2772 } 2773 2774 /* There is at least one situation where code goes backwards: this is the 2775 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time, 2776 the class is simply eliminated. 
However, it is created first, so we have to 2777 allow memory for it. Therefore, don't ever reduce the length at this point. 2778 */ 2779 2780 if (code < last_code) code = last_code; 2781 2782 /* Paranoid check for integer overflow */ 2783 2784 if (OFLOW_MAX - *lengthptr < code - last_code) 2785 { 2786 *errorcodeptr = ERR20; 2787 goto FAILED; 2788 } 2789 2790 *lengthptr += code - last_code; 2791 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c)); 2792 2793 /* If "previous" is set and it is not at the start of the work space, move 2794 it back to there, in order to avoid filling up the work space. Otherwise, 2795 if "previous" is NULL, reset the current code pointer to the start. */ 2796 2797 if (previous != NULL) 2798 { 2799 if (previous > orig_code) 2800 { 2801 memmove(orig_code, previous, code - previous); 2802 code -= previous - orig_code; 2803 previous = orig_code; 2804 } 2805 } 2806 else code = orig_code; 2807 2808 /* Remember where this code item starts so we can pick up the length 2809 next time round. */ 2810 2811 last_code = code; 2812 } 2813 2814 /* In the real compile phase, just check the workspace used by the forward 2815 reference list. 
*/ 2816 2817 else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK) 2818 { 2819 *errorcodeptr = ERR52; 2820 goto FAILED; 2821 } 2822 2823 /* If in \Q...\E, check for the end; if not, we have a literal */ 2824 2825 if (inescq && c != 0) 2826 { 2827 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) 2828 { 2829 inescq = FALSE; 2830 ptr++; 2831 continue; 2832 } 2833 else 2834 { 2835 if (previous_callout != NULL) 2836 { 2837 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */ 2838 complete_callout(previous_callout, ptr, cd); 2839 previous_callout = NULL; 2840 } 2841 if ((options & PCRE_AUTO_CALLOUT) != 0) 2842 { 2843 previous_callout = code; 2844 code = auto_callout(code, ptr, cd); 2845 } 2846 goto NORMAL_CHAR; 2847 } 2848 } 2849 2850 /* Fill in length of a previous callout, except when the next thing is 2851 a quantifier. */ 2852 2853 is_quantifier = 2854 c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK || 2855 (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1)); 2856 2857 if (!is_quantifier && previous_callout != NULL && 2858 after_manual_callout-- <= 0) 2859 { 2860 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */ 2861 complete_callout(previous_callout, ptr, cd); 2862 previous_callout = NULL; 2863 } 2864 2865 /* In extended mode, skip white space and comments */ 2866 2867 if ((options & PCRE_EXTENDED) != 0) 2868 { 2869 if ((cd->ctypes[c] & ctype_space) != 0) continue; 2870 if (c == CHAR_NUMBER_SIGN) 2871 { 2872 while (*(++ptr) != 0) 2873 { 2874 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; } 2875 } 2876 if (*ptr != 0) continue; 2877 2878 /* Else fall through to handle end of string */ 2879 c = 0; 2880 } 2881 } 2882 2883 /* No auto callout for quantifiers. 
*/ 2884 2885 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier) 2886 { 2887 previous_callout = code; 2888 code = auto_callout(code, ptr, cd); 2889 } 2890 2891 switch(c) 2892 { 2893 /* ===================================================================*/ 2894 case 0: /* The branch terminates at string end */ 2895 case CHAR_VERTICAL_LINE: /* or | or ) */ 2896 case CHAR_RIGHT_PARENTHESIS: 2897 *firstbyteptr = firstbyte; 2898 *reqbyteptr = reqbyte; 2899 *codeptr = code; 2900 *ptrptr = ptr; 2901 if (lengthptr != NULL) 2902 { 2903 if (OFLOW_MAX - *lengthptr < code - last_code) 2904 { 2905 *errorcodeptr = ERR20; 2906 goto FAILED; 2907 } 2908 *lengthptr += code - last_code; /* To include callout length */ 2909 DPRINTF((">> end branch\n")); 2910 } 2911 return TRUE; 2912 2913 2914 /* ===================================================================*/ 2915 /* Handle single-character metacharacters. In multiline mode, ^ disables 2916 the setting of any following char as a first character. */ 2917 2918 case CHAR_CIRCUMFLEX_ACCENT: 2919 if ((options & PCRE_MULTILINE) != 0) 2920 { 2921 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; 2922 } 2923 previous = NULL; 2924 *code++ = OP_CIRC; 2925 break; 2926 2927 case CHAR_DOLLAR_SIGN: 2928 previous = NULL; 2929 *code++ = OP_DOLL; 2930 break; 2931 2932 /* There can never be a first char if '.' is first, whatever happens about 2933 repeats. The value of reqbyte doesn't change either. */ 2934 2935 case CHAR_DOT: 2936 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; 2937 zerofirstbyte = firstbyte; 2938 zeroreqbyte = reqbyte; 2939 previous = code; 2940 *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY; 2941 break; 2942 2943 2944 /* ===================================================================*/ 2945 /* Character classes. If the included characters are all < 256, we build a 2946 32-byte bitmap of the permitted characters, except in the special case 2947 where there is only one such character. 
For negated classes, we build the 2948 map as usual, then invert it at the end. However, we use a different opcode 2949 so that data characters > 255 can be handled correctly. 2950 2951 If the class contains characters outside the 0-255 range, a different 2952 opcode is compiled. It may optionally have a bit map for characters < 256, 2953 but those above are are explicitly listed afterwards. A flag byte tells 2954 whether the bitmap is present, and whether this is a negated class or not. 2955 2956 In JavaScript compatibility mode, an isolated ']' causes an error. In 2957 default (Perl) mode, it is treated as a data character. */ 2958 2959 case CHAR_RIGHT_SQUARE_BRACKET: 2960 if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0) 2961 { 2962 *errorcodeptr = ERR64; 2963 goto FAILED; 2964 } 2965 goto NORMAL_CHAR; 2966 2967 case CHAR_LEFT_SQUARE_BRACKET: 2968 previous = code; 2969 2970 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if 2971 they are encountered at the top level, so we'll do that too. */ 2972 2973 if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT || 2974 ptr[1] == CHAR_EQUALS_SIGN) && 2975 check_posix_syntax(ptr, &tempptr)) 2976 { 2977 *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31; 2978 goto FAILED; 2979 } 2980 2981 /* If the first character is '^', set the negation flag and skip it. Also, 2982 if the first few characters (either before or after ^) are \Q\E or \E we 2983 skip them too. This makes for compatibility with Perl. */ 2984 2985 negate_class = FALSE; 2986 for (;;) 2987 { 2988 c = *(++ptr); 2989 if (c == CHAR_BACKSLASH) 2990 { 2991 if (ptr[1] == CHAR_E) 2992 ptr++; 2993 else if (strncmp((const char *)ptr+1, 2994 STR_Q STR_BACKSLASH STR_E, 3) == 0) 2995 ptr += 3; 2996 else 2997 break; 2998 } 2999 else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT) 3000 negate_class = TRUE; 3001 else break; 3002 } 3003 3004 /* Empty classes are allowed in JavaScript compatibility mode. 
Otherwise, 3005 an initial ']' is taken as a data character -- the code below handles 3006 that. In JS mode, [] must always fail, so generate OP_FAIL, whereas 3007 [^] must match any character, so generate OP_ALLANY. */ 3008 3009 if (c == CHAR_RIGHT_SQUARE_BRACKET && 3010 (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0) 3011 { 3012 *code++ = negate_class? OP_ALLANY : OP_FAIL; 3013 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; 3014 zerofirstbyte = firstbyte; 3015 break; 3016 } 3017 3018 /* If a class contains a negative special such as \S, we need to flip the 3019 negation flag at the end, so that support for characters > 255 works 3020 correctly (they are all included in the class). */ 3021 3022 should_flip_negation = FALSE; 3023 3024 /* Keep a count of chars with values < 256 so that we can optimize the case 3025 of just a single character (as long as it's < 256). However, For higher 3026 valued UTF-8 characters, we don't yet do any optimization. */ 3027 3028 class_charcount = 0; 3029 class_lastchar = -1; 3030 3031 /* Initialize the 32-char bit map to all zeros. We build the map in a 3032 temporary bit of memory, in case the class contains only 1 character (less 3033 than 256), because in that case the compiled code doesn't use the bit map. 3034 */ 3035 3036 memset(classbits, 0, 32 * sizeof(uschar)); 3037 3038#ifdef SUPPORT_UTF8 3039 class_utf8 = FALSE; /* No chars >= 256 */ 3040 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */ 3041 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */ 3042#endif 3043 3044 /* Process characters until ] is reached. By writing this as a "do" it 3045 means that an initial ] is taken as a data character. At the start of the 3046 loop, c contains the first byte of the character. 
*/ 3047 3048 if (c != 0) do 3049 { 3050 const uschar *oldptr; 3051 3052#ifdef SUPPORT_UTF8 3053 if (utf8 && c > 127) 3054 { /* Braces are required because the */ 3055 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */ 3056 } 3057 3058 /* In the pre-compile phase, accumulate the length of any UTF-8 extra 3059 data and reset the pointer. This is so that very large classes that 3060 contain a zillion UTF-8 characters no longer overwrite the work space 3061 (which is on the stack). */ 3062 3063 if (lengthptr != NULL) 3064 { 3065 *lengthptr += class_utf8data - class_utf8data_base; 3066 class_utf8data = class_utf8data_base; 3067 } 3068 3069#endif 3070 3071 /* Inside \Q...\E everything is literal except \E */ 3072 3073 if (inescq) 3074 { 3075 if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */ 3076 { 3077 inescq = FALSE; /* Reset literal state */ 3078 ptr++; /* Skip the 'E' */ 3079 continue; /* Carry on with next */ 3080 } 3081 goto CHECK_RANGE; /* Could be range if \E follows */ 3082 } 3083 3084 /* Handle POSIX class names. Perl allows a negation extension of the 3085 form [:^name:]. A square bracket that doesn't match the syntax is 3086 treated as a literal. We also recognize the POSIX constructions 3087 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl 3088 5.6 and 5.8 do. 
*/ 3089 3090 if (c == CHAR_LEFT_SQUARE_BRACKET && 3091 (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT || 3092 ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr)) 3093 { 3094 BOOL local_negate = FALSE; 3095 int posix_class, taboffset, tabopt; 3096 register const uschar *cbits = cd->cbits; 3097 uschar pbits[32]; 3098 3099 if (ptr[1] != CHAR_COLON) 3100 { 3101 *errorcodeptr = ERR31; 3102 goto FAILED; 3103 } 3104 3105 ptr += 2; 3106 if (*ptr == CHAR_CIRCUMFLEX_ACCENT) 3107 { 3108 local_negate = TRUE; 3109 should_flip_negation = TRUE; /* Note negative special */ 3110 ptr++; 3111 } 3112 3113 posix_class = check_posix_name(ptr, tempptr - ptr); 3114 if (posix_class < 0) 3115 { 3116 *errorcodeptr = ERR30; 3117 goto FAILED; 3118 } 3119 3120 /* If matching is caseless, upper and lower are converted to 3121 alpha. This relies on the fact that the class table starts with 3122 alpha, lower, upper as the first 3 entries. */ 3123 3124 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2) 3125 posix_class = 0; 3126 3127 /* We build the bit map for the POSIX class in a chunk of local store 3128 because we may be adding and subtracting from it, and we don't want to 3129 subtract bits that may be in the main map already. At the end we or the 3130 result into the bit map that is being built. */ 3131 3132 posix_class *= 3; 3133 3134 /* Copy in the first table (always present) */ 3135 3136 memcpy(pbits, cbits + posix_class_maps[posix_class], 3137 32 * sizeof(uschar)); 3138 3139 /* If there is a second table, add or remove it as required. */ 3140 3141 taboffset = posix_class_maps[posix_class + 1]; 3142 tabopt = posix_class_maps[posix_class + 2]; 3143 3144 if (taboffset >= 0) 3145 { 3146 if (tabopt >= 0) 3147 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset]; 3148 else 3149 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset]; 3150 } 3151 3152 /* Not see if we need to remove any special characters. 
An option 3153 value of 1 removes vertical space and 2 removes underscore. */ 3154 3155 if (tabopt < 0) tabopt = -tabopt; 3156 if (tabopt == 1) pbits[1] &= ~0x3c; 3157 else if (tabopt == 2) pbits[11] &= 0x7f; 3158 3159 /* Add the POSIX table or its complement into the main table that is 3160 being built and we are done. */ 3161 3162 if (local_negate) 3163 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c]; 3164 else 3165 for (c = 0; c < 32; c++) classbits[c] |= pbits[c]; 3166 3167 ptr = tempptr + 1; 3168 class_charcount = 10; /* Set > 1; assumes more than 1 per class */ 3169 continue; /* End of POSIX syntax handling */ 3170 } 3171 3172 /* Backslash may introduce a single character, or it may introduce one 3173 of the specials, which just set a flag. The sequence \b is a special 3174 case. Inside a class (and only there) it is treated as backspace. 3175 Elsewhere it marks a word boundary. Other escapes have preset maps ready 3176 to 'or' into the one we are building. We assume they have more than one 3177 character in them, so set class_charcount bigger than one. */ 3178 3179 if (c == CHAR_BACKSLASH) 3180 { 3181 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE); 3182 if (*errorcodeptr != 0) goto FAILED; 3183 3184 if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */ 3185 else if (-c == ESC_X) c = CHAR_X; /* \X is literal X in a class */ 3186 else if (-c == ESC_R) c = CHAR_R; /* \R is literal R in a class */ 3187 else if (-c == ESC_Q) /* Handle start of quoted string */ 3188 { 3189 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E) 3190 { 3191 ptr += 2; /* avoid empty string */ 3192 } 3193 else inescq = TRUE; 3194 continue; 3195 } 3196 else if (-c == ESC_E) continue; /* Ignore orphan \E */ 3197 3198 if (c < 0) 3199 { 3200 register const uschar *cbits = cd->cbits; 3201 class_charcount += 2; /* Greater than 1 is what matters */ 3202 3203 /* Save time by not doing this in the pre-compile phase. 
*/ 3204 3205 if (lengthptr == NULL) switch (-c) 3206 { 3207 case ESC_d: 3208 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit]; 3209 continue; 3210 3211 case ESC_D: 3212 should_flip_negation = TRUE; 3213 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit]; 3214 continue; 3215 3216 case ESC_w: 3217 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word]; 3218 continue; 3219 3220 case ESC_W: 3221 should_flip_negation = TRUE; 3222 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word]; 3223 continue; 3224 3225 case ESC_s: 3226 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space]; 3227 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */ 3228 continue; 3229 3230 case ESC_S: 3231 should_flip_negation = TRUE; 3232 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space]; 3233 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */ 3234 continue; 3235 3236 default: /* Not recognized; fall through */ 3237 break; /* Need "default" setting to stop compiler warning. */ 3238 } 3239 3240 /* In the pre-compile phase, just do the recognition. */ 3241 3242 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w || 3243 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue; 3244 3245 /* We need to deal with \H, \h, \V, and \v in both phases because 3246 they use extra memory. 
*/ 3247 3248 if (-c == ESC_h) 3249 { 3250 SETBIT(classbits, 0x09); /* VT */ 3251 SETBIT(classbits, 0x20); /* SPACE */ 3252 SETBIT(classbits, 0xa0); /* NSBP */ 3253#ifdef SUPPORT_UTF8 3254 if (utf8) 3255 { 3256 class_utf8 = TRUE; 3257 *class_utf8data++ = XCL_SINGLE; 3258 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data); 3259 *class_utf8data++ = XCL_SINGLE; 3260 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data); 3261 *class_utf8data++ = XCL_RANGE; 3262 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data); 3263 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data); 3264 *class_utf8data++ = XCL_SINGLE; 3265 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data); 3266 *class_utf8data++ = XCL_SINGLE; 3267 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data); 3268 *class_utf8data++ = XCL_SINGLE; 3269 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data); 3270 } 3271#endif 3272 continue; 3273 } 3274 3275 if (-c == ESC_H) 3276 { 3277 for (c = 0; c < 32; c++) 3278 { 3279 int x = 0xff; 3280 switch (c) 3281 { 3282 case 0x09/8: x ^= 1 << (0x09%8); break; 3283 case 0x20/8: x ^= 1 << (0x20%8); break; 3284 case 0xa0/8: x ^= 1 << (0xa0%8); break; 3285 default: break; 3286 } 3287 classbits[c] |= x; 3288 } 3289 3290#ifdef SUPPORT_UTF8 3291 if (utf8) 3292 { 3293 class_utf8 = TRUE; 3294 *class_utf8data++ = XCL_RANGE; 3295 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data); 3296 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data); 3297 *class_utf8data++ = XCL_RANGE; 3298 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data); 3299 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data); 3300 *class_utf8data++ = XCL_RANGE; 3301 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data); 3302 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data); 3303 *class_utf8data++ = XCL_RANGE; 3304 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data); 3305 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data); 3306 *class_utf8data++ = XCL_RANGE; 
3307 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data); 3308 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data); 3309 *class_utf8data++ = XCL_RANGE; 3310 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data); 3311 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data); 3312 *class_utf8data++ = XCL_RANGE; 3313 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data); 3314 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data); 3315 } 3316#endif 3317 continue; 3318 } 3319 3320 if (-c == ESC_v) 3321 { 3322 SETBIT(classbits, 0x0a); /* LF */ 3323 SETBIT(classbits, 0x0b); /* VT */ 3324 SETBIT(classbits, 0x0c); /* FF */ 3325 SETBIT(classbits, 0x0d); /* CR */ 3326 SETBIT(classbits, 0x85); /* NEL */ 3327#ifdef SUPPORT_UTF8 3328 if (utf8) 3329 { 3330 class_utf8 = TRUE; 3331 *class_utf8data++ = XCL_RANGE; 3332 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data); 3333 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data); 3334 } 3335#endif 3336 continue; 3337 } 3338 3339 if (-c == ESC_V) 3340 { 3341 for (c = 0; c < 32; c++) 3342 { 3343 int x = 0xff; 3344 switch (c) 3345 { 3346 case 0x0a/8: x ^= 1 << (0x0a%8); 3347 x ^= 1 << (0x0b%8); 3348 x ^= 1 << (0x0c%8); 3349 x ^= 1 << (0x0d%8); 3350 break; 3351 case 0x85/8: x ^= 1 << (0x85%8); break; 3352 default: break; 3353 } 3354 classbits[c] |= x; 3355 } 3356 3357#ifdef SUPPORT_UTF8 3358 if (utf8) 3359 { 3360 class_utf8 = TRUE; 3361 *class_utf8data++ = XCL_RANGE; 3362 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data); 3363 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data); 3364 *class_utf8data++ = XCL_RANGE; 3365 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data); 3366 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data); 3367 } 3368#endif 3369 continue; 3370 } 3371 3372 /* We need to deal with \P and \p in both phases. 
*/ 3373 3374#ifdef SUPPORT_UCP 3375 if (-c == ESC_p || -c == ESC_P) 3376 { 3377 BOOL negated; 3378 int pdata; 3379 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr); 3380 if (ptype < 0) goto FAILED; 3381 class_utf8 = TRUE; 3382 *class_utf8data++ = ((-c == ESC_p) != negated)? 3383 XCL_PROP : XCL_NOTPROP; 3384 *class_utf8data++ = ptype; 3385 *class_utf8data++ = pdata; 3386 class_charcount -= 2; /* Not a < 256 character */ 3387 continue; 3388 } 3389#endif 3390 /* Unrecognized escapes are faulted if PCRE is running in its 3391 strict mode. By default, for compatibility with Perl, they are 3392 treated as literals. */ 3393 3394 if ((options & PCRE_EXTRA) != 0) 3395 { 3396 *errorcodeptr = ERR7; 3397 goto FAILED; 3398 } 3399 3400 class_charcount -= 2; /* Undo the default count from above */ 3401 c = *ptr; /* Get the final character and fall through */ 3402 } 3403 3404 /* Fall through if we have a single character (c >= 0). This may be 3405 greater than 256 in UTF-8 mode. */ 3406 3407 } /* End of backslash handling */ 3408 3409 /* A single character may be followed by '-' to form a range. However, 3410 Perl does not permit ']' to be the end of the range. A '-' character 3411 at the end is treated as a literal. Perl ignores orphaned \E sequences 3412 entirely. The code for handling \Q and \E is messy. */ 3413 3414 CHECK_RANGE: 3415 while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E) 3416 { 3417 inescq = FALSE; 3418 ptr += 2; 3419 } 3420 3421 oldptr = ptr; 3422 3423 /* Remember \r or \n */ 3424 3425 if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF; 3426 3427 /* Check for range */ 3428 3429 if (!inescq && ptr[1] == CHAR_MINUS) 3430 { 3431 int d; 3432 ptr += 2; 3433 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2; 3434 3435 /* If we hit \Q (not followed by \E) at this point, go into escaped 3436 mode. 
*/ 3437 3438 while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q) 3439 { 3440 ptr += 2; 3441 if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) 3442 { ptr += 2; continue; } 3443 inescq = TRUE; 3444 break; 3445 } 3446 3447 if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET)) 3448 { 3449 ptr = oldptr; 3450 goto LONE_SINGLE_CHARACTER; 3451 } 3452 3453#ifdef SUPPORT_UTF8 3454 if (utf8) 3455 { /* Braces are required because the */ 3456 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */ 3457 } 3458 else 3459#endif 3460 d = *ptr; /* Not UTF-8 mode */ 3461 3462 /* The second part of a range can be a single-character escape, but 3463 not any of the other escapes. Perl 5.6 treats a hyphen as a literal 3464 in such circumstances. */ 3465 3466 if (!inescq && d == CHAR_BACKSLASH) 3467 { 3468 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE); 3469 if (*errorcodeptr != 0) goto FAILED; 3470 3471 /* \b is backspace; \X is literal X; \R is literal R; any other 3472 special means the '-' was literal */ 3473 3474 if (d < 0) 3475 { 3476 if (d == -ESC_b) d = CHAR_BS; 3477 else if (d == -ESC_X) d = CHAR_X; 3478 else if (d == -ESC_R) d = CHAR_R; else 3479 { 3480 ptr = oldptr; 3481 goto LONE_SINGLE_CHARACTER; /* A few lines below */ 3482 } 3483 } 3484 } 3485 3486 /* Check that the two values are in the correct order. Optimize 3487 one-character ranges */ 3488 3489 if (d < c) 3490 { 3491 *errorcodeptr = ERR8; 3492 goto FAILED; 3493 } 3494 3495 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */ 3496 3497 /* Remember \r or \n */ 3498 3499 if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF; 3500 3501 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless 3502 matching, we have to use an XCLASS with extra data items. Caseless 3503 matching for characters > 127 is available only if UCP support is 3504 available. 
*/ 3505 3506#ifdef SUPPORT_UTF8 3507 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127))) 3508 { 3509 class_utf8 = TRUE; 3510 3511 /* With UCP support, we can find the other case equivalents of 3512 the relevant characters. There may be several ranges. Optimize how 3513 they fit with the basic range. */ 3514 3515#ifdef SUPPORT_UCP 3516 if ((options & PCRE_CASELESS) != 0) 3517 { 3518 unsigned int occ, ocd; 3519 unsigned int cc = c; 3520 unsigned int origd = d; 3521 while (get_othercase_range(&cc, origd, &occ, &ocd)) 3522 { 3523 if (occ >= (unsigned int)c && 3524 ocd <= (unsigned int)d) 3525 continue; /* Skip embedded ranges */ 3526 3527 if (occ < (unsigned int)c && 3528 ocd >= (unsigned int)c - 1) /* Extend the basic range */ 3529 { /* if there is overlap, */ 3530 c = occ; /* noting that if occ < c */ 3531 continue; /* we can't have ocd > d */ 3532 } /* because a subrange is */ 3533 if (ocd > (unsigned int)d && 3534 occ <= (unsigned int)d + 1) /* always shorter than */ 3535 { /* the basic range. */ 3536 d = ocd; 3537 continue; 3538 } 3539 3540 if (occ == ocd) 3541 { 3542 *class_utf8data++ = XCL_SINGLE; 3543 } 3544 else 3545 { 3546 *class_utf8data++ = XCL_RANGE; 3547 class_utf8data += _pcre_ord2utf8(occ, class_utf8data); 3548 } 3549 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data); 3550 } 3551 } 3552#endif /* SUPPORT_UCP */ 3553 3554 /* Now record the original range, possibly modified for UCP caseless 3555 overlapping ranges. */ 3556 3557 *class_utf8data++ = XCL_RANGE; 3558 class_utf8data += _pcre_ord2utf8(c, class_utf8data); 3559 class_utf8data += _pcre_ord2utf8(d, class_utf8data); 3560 3561 /* With UCP support, we are done. Without UCP support, there is no 3562 caseless matching for UTF-8 characters > 127; we can use the bit map 3563 for the smaller ones. 
*/ 3564 3565#ifdef SUPPORT_UCP 3566 continue; /* With next character in the class */ 3567#else 3568 if ((options & PCRE_CASELESS) == 0 || c > 127) continue; 3569 3570 /* Adjust upper limit and fall through to set up the map */ 3571 3572 d = 127; 3573 3574#endif /* SUPPORT_UCP */ 3575 } 3576#endif /* SUPPORT_UTF8 */ 3577 3578 /* We use the bit map for all cases when not in UTF-8 mode; else 3579 ranges that lie entirely within 0-127 when there is UCP support; else 3580 for partial ranges without UCP support. */ 3581 3582 class_charcount += d - c + 1; 3583 class_lastchar = d; 3584 3585 /* We can save a bit of time by skipping this in the pre-compile. */ 3586 3587 if (lengthptr == NULL) for (; c <= d; c++) 3588 { 3589 classbits[c/8] |= (1 << (c&7)); 3590 if ((options & PCRE_CASELESS) != 0) 3591 { 3592 int uc = cd->fcc[c]; /* flip case */ 3593 classbits[uc/8] |= (1 << (uc&7)); 3594 } 3595 } 3596 3597 continue; /* Go get the next char in the class */ 3598 } 3599 3600 /* Handle a lone single character - we can get here for a normal 3601 non-escape char, or after \ that introduces a single character or for an 3602 apparent range that isn't. 
*/ 3603 3604 LONE_SINGLE_CHARACTER: 3605 3606 /* Handle a character that cannot go in the bit map */ 3607 3608#ifdef SUPPORT_UTF8 3609 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127))) 3610 { 3611 class_utf8 = TRUE; 3612 *class_utf8data++ = XCL_SINGLE; 3613 class_utf8data += _pcre_ord2utf8(c, class_utf8data); 3614 3615#ifdef SUPPORT_UCP 3616 if ((options & PCRE_CASELESS) != 0) 3617 { 3618 unsigned int othercase; 3619 if ((othercase = UCD_OTHERCASE(c)) != c) 3620 { 3621 *class_utf8data++ = XCL_SINGLE; 3622 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data); 3623 } 3624 } 3625#endif /* SUPPORT_UCP */ 3626 3627 } 3628 else 3629#endif /* SUPPORT_UTF8 */ 3630 3631 /* Handle a single-byte character */ 3632 { 3633 classbits[c/8] |= (1 << (c&7)); 3634 if ((options & PCRE_CASELESS) != 0) 3635 { 3636 c = cd->fcc[c]; /* flip case */ 3637 classbits[c/8] |= (1 << (c&7)); 3638 } 3639 class_charcount++; 3640 class_lastchar = c; 3641 } 3642 } 3643 3644 /* Loop until ']' reached. This "while" is the end of the "do" above. */ 3645 3646 while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq)); 3647 3648 if (c == 0) /* Missing terminating ']' */ 3649 { 3650 *errorcodeptr = ERR6; 3651 goto FAILED; 3652 } 3653 3654 3655/* This code has been disabled because it would mean that \s counts as 3656an explicit \r or \n reference, and that's not really what is wanted. Now 3657we set the flag only if there is a literal "\r" or "\n" in the class. */ 3658 3659#if 0 3660 /* Remember whether \r or \n are in this class */ 3661 3662 if (negate_class) 3663 { 3664 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF; 3665 } 3666 else 3667 { 3668 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF; 3669 } 3670#endif 3671 3672 3673 /* If class_charcount is 1, we saw precisely one character whose value is 3674 less than 256. 
As long as there were no characters >= 128 and there was no 3675 use of \p or \P, in other words, no use of any XCLASS features, we can 3676 optimize. 3677 3678 In UTF-8 mode, we can optimize the negative case only if there were no 3679 characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR 3680 operate on single-bytes only. This is an historical hangover. Maybe one day 3681 we can tidy these opcodes to handle multi-byte characters. 3682 3683 The optimization throws away the bit map. We turn the item into a 3684 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note 3685 that OP_NOT does not support multibyte characters. In the positive case, it 3686 can cause firstbyte to be set. Otherwise, there can be no first char if 3687 this item is first, whatever repeat count may follow. In the case of 3688 reqbyte, save the previous value for reinstating. */ 3689 3690#ifdef SUPPORT_UTF8 3691 if (class_charcount == 1 && !class_utf8 && 3692 (!utf8 || !negate_class || class_lastchar < 128)) 3693#else 3694 if (class_charcount == 1) 3695#endif 3696 { 3697 zeroreqbyte = reqbyte; 3698 3699 /* The OP_NOT opcode works on one-byte characters only. */ 3700 3701 if (negate_class) 3702 { 3703 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; 3704 zerofirstbyte = firstbyte; 3705 *code++ = OP_NOT; 3706 *code++ = class_lastchar; 3707 break; 3708 } 3709 3710 /* For a single, positive character, get the value into mcbuffer, and 3711 then we can handle this with the normal one-character code. */ 3712 3713#ifdef SUPPORT_UTF8 3714 if (utf8 && class_lastchar > 127) 3715 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer); 3716 else 3717#endif 3718 { 3719 mcbuffer[0] = class_lastchar; 3720 mclength = 1; 3721 } 3722 goto ONE_CHAR; 3723 } /* End of 1-char optimization */ 3724 3725 /* The general case - not the one-char optimization. If this is the first 3726 thing in the branch, there can be no first char setting, whatever the 3727 repeat count. 
Any reqbyte setting must remain unchanged after any kind of 3728 repeat. */ 3729 3730 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; 3731 zerofirstbyte = firstbyte; 3732 zeroreqbyte = reqbyte; 3733 3734 /* If there are characters with values > 255, we have to compile an 3735 extended class, with its own opcode, unless there was a negated special 3736 such as \S in the class, because in that case all characters > 255 are in 3737 the class, so any that were explicitly given as well can be ignored. If 3738 (when there are explicit characters > 255 that must be listed) there are no 3739 characters < 256, we can omit the bitmap in the actual compiled code. */ 3740 3741#ifdef SUPPORT_UTF8 3742 if (class_utf8 && !should_flip_negation) 3743 { 3744 *class_utf8data++ = XCL_END; /* Marks the end of extra data */ 3745 *code++ = OP_XCLASS; 3746 code += LINK_SIZE; 3747 *code = negate_class? XCL_NOT : 0; 3748 3749 /* If the map is required, move up the extra data to make room for it; 3750 otherwise just move the code pointer to the end of the extra data. */ 3751 3752 if (class_charcount > 0) 3753 { 3754 *code++ |= XCL_MAP; 3755 memmove(code + 32, code, class_utf8data - code); 3756 memcpy(code, classbits, 32); 3757 code = class_utf8data + 32; 3758 } 3759 else code = class_utf8data; 3760 3761 /* Now fill in the complete length of the item */ 3762 3763 PUT(previous, 1, code - previous); 3764 break; /* End of class handling */ 3765 } 3766#endif 3767 3768 /* If there are no characters > 255, set the opcode to OP_CLASS or 3769 OP_NCLASS, depending on whether the whole class was negated and whether 3770 there were negative specials such as \S in the class. Then copy the 32-byte 3771 map into the code vector, negating it if necessary. */ 3772 3773 *code++ = (negate_class == should_flip_negation) ? 
OP_CLASS : OP_NCLASS; 3774 if (negate_class) 3775 { 3776 if (lengthptr == NULL) /* Save time in the pre-compile phase */ 3777 for (c = 0; c < 32; c++) code[c] = ~classbits[c]; 3778 } 3779 else 3780 { 3781 memcpy(code, classbits, 32); 3782 } 3783 code += 32; 3784 break; 3785 3786 3787 /* ===================================================================*/ 3788 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this 3789 has been tested above. */ 3790 3791 case CHAR_LEFT_CURLY_BRACKET: 3792 if (!is_quantifier) goto NORMAL_CHAR; 3793 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr); 3794 if (*errorcodeptr != 0) goto FAILED; 3795 goto REPEAT; 3796 3797 case CHAR_ASTERISK: 3798 repeat_min = 0; 3799 repeat_max = -1; 3800 goto REPEAT; 3801 3802 case CHAR_PLUS: 3803 repeat_min = 1; 3804 repeat_max = -1; 3805 goto REPEAT; 3806 3807 case CHAR_QUESTION_MARK: 3808 repeat_min = 0; 3809 repeat_max = 1; 3810 3811 REPEAT: 3812 if (previous == NULL) 3813 { 3814 *errorcodeptr = ERR9; 3815 goto FAILED; 3816 } 3817 3818 if (repeat_min == 0) 3819 { 3820 firstbyte = zerofirstbyte; /* Adjust for zero repeat */ 3821 reqbyte = zeroreqbyte; /* Ditto */ 3822 } 3823 3824 /* Remember whether this is a variable length repeat */ 3825 3826 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY; 3827 3828 op_type = 0; /* Default single-char op codes */ 3829 possessive_quantifier = FALSE; /* Default not possessive quantifier */ 3830 3831 /* Save start of previous item, in case we have to move it up to make space 3832 for an inserted OP_ONCE for the additional '+' extension. */ 3833 3834 tempcode = previous; 3835 3836 /* If the next character is '+', we have a possessive quantifier. This 3837 implies greediness, whatever the setting of the PCRE_UNGREEDY option. 3838 If the next character is '?' this is a minimizing repeat, by default, 3839 but if PCRE_UNGREEDY is set, it works the other way round. We change the 3840 repeat type to the non-default. 
*/ 3841 3842 if (ptr[1] == CHAR_PLUS) 3843 { 3844 repeat_type = 0; /* Force greedy */ 3845 possessive_quantifier = TRUE; 3846 ptr++; 3847 } 3848 else if (ptr[1] == CHAR_QUESTION_MARK) 3849 { 3850 repeat_type = greedy_non_default; 3851 ptr++; 3852 } 3853 else repeat_type = greedy_default; 3854 3855 /* If previous was a character match, abolish the item and generate a 3856 repeat item instead. If a char item has a minumum of more than one, ensure 3857 that it is set in reqbyte - it might not be if a sequence such as x{3} is 3858 the first thing in a branch because the x will have gone into firstbyte 3859 instead. */ 3860 3861 if (*previous == OP_CHAR || *previous == OP_CHARNC) 3862 { 3863 /* Deal with UTF-8 characters that take up more than one byte. It's 3864 easier to write this out separately than try to macrify it. Use c to 3865 hold the length of the character in bytes, plus 0x80 to flag that it's a 3866 length rather than a small character. */ 3867 3868#ifdef SUPPORT_UTF8 3869 if (utf8 && (code[-1] & 0x80) != 0) 3870 { 3871 uschar *lastchar = code - 1; 3872 while((*lastchar & 0xc0) == 0x80) lastchar--; 3873 c = code - lastchar; /* Length of UTF-8 character */ 3874 memcpy(utf8_char, lastchar, c); /* Save the char */ 3875 c |= 0x80; /* Flag c as a length */ 3876 } 3877 else 3878#endif 3879 3880 /* Handle the case of a single byte - either with no UTF8 support, or 3881 with UTF-8 disabled, or for a UTF-8 character < 128. */ 3882 3883 { 3884 c = code[-1]; 3885 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt; 3886 } 3887 3888 /* If the repetition is unlimited, it pays to see if the next thing on 3889 the line is something that cannot possibly match this character. If so, 3890 automatically possessifying this item gains some performance in the case 3891 where the match fails. 
*/ 3892 3893 if (!possessive_quantifier && 3894 repeat_max < 0 && 3895 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1, 3896 options, cd)) 3897 { 3898 repeat_type = 0; /* Force greedy */ 3899 possessive_quantifier = TRUE; 3900 } 3901 3902 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */ 3903 } 3904 3905 /* If previous was a single negated character ([^a] or similar), we use 3906 one of the special opcodes, replacing it. The code is shared with single- 3907 character repeats by setting opt_type to add a suitable offset into 3908 repeat_type. We can also test for auto-possessification. OP_NOT is 3909 currently used only for single-byte chars. */ 3910 3911 else if (*previous == OP_NOT) 3912 { 3913 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */ 3914 c = previous[1]; 3915 if (!possessive_quantifier && 3916 repeat_max < 0 && 3917 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd)) 3918 { 3919 repeat_type = 0; /* Force greedy */ 3920 possessive_quantifier = TRUE; 3921 } 3922 goto OUTPUT_SINGLE_REPEAT; 3923 } 3924 3925 /* If previous was a character type match (\d or similar), abolish it and 3926 create a suitable repeat item. The code is shared with single-character 3927 repeats by setting op_type to add a suitable offset into repeat_type. Note 3928 the the Unicode property types will be present only when SUPPORT_UCP is 3929 defined, but we don't wrap the little bits of code here because it just 3930 makes it horribly messy. 
*/ 3931 3932 else if (*previous < OP_EODN) 3933 { 3934 uschar *oldcode; 3935 int prop_type, prop_value; 3936 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */ 3937 c = *previous; 3938 3939 if (!possessive_quantifier && 3940 repeat_max < 0 && 3941 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd)) 3942 { 3943 repeat_type = 0; /* Force greedy */ 3944 possessive_quantifier = TRUE; 3945 } 3946 3947 OUTPUT_SINGLE_REPEAT: 3948 if (*previous == OP_PROP || *previous == OP_NOTPROP) 3949 { 3950 prop_type = previous[1]; 3951 prop_value = previous[2]; 3952 } 3953 else prop_type = prop_value = -1; 3954 3955 oldcode = code; 3956 code = previous; /* Usually overwrite previous item */ 3957 3958 /* If the maximum is zero then the minimum must also be zero; Perl allows 3959 this case, so we do too - by simply omitting the item altogether. */ 3960 3961 if (repeat_max == 0) goto END_REPEAT; 3962 3963 /*--------------------------------------------------------------------*/ 3964 /* This code is obsolete from release 8.00; the restriction was finally 3965 removed: */ 3966 3967 /* All real repeats make it impossible to handle partial matching (maybe 3968 one day we will be able to remove this restriction). */ 3969 3970 /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */ 3971 /*--------------------------------------------------------------------*/ 3972 3973 /* Combine the op_type with the repeat_type */ 3974 3975 repeat_type += op_type; 3976 3977 /* A minimum of zero is handled either as the special case * or ?, or as 3978 an UPTO, with the maximum given. */ 3979 3980 if (repeat_min == 0) 3981 { 3982 if (repeat_max == -1) *code++ = OP_STAR + repeat_type; 3983 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type; 3984 else 3985 { 3986 *code++ = OP_UPTO + repeat_type; 3987 PUT2INC(code, 0, repeat_max); 3988 } 3989 } 3990 3991 /* A repeat minimum of 1 is optimized into some special cases. If the 3992 maximum is unlimited, we use OP_PLUS. 
Otherwise, the original item is 3993 left in place and, if the maximum is greater than 1, we use OP_UPTO with 3994 one less than the maximum. */ 3995 3996 else if (repeat_min == 1) 3997 { 3998 if (repeat_max == -1) 3999 *code++ = OP_PLUS + repeat_type; 4000 else 4001 { 4002 code = oldcode; /* leave previous item in place */ 4003 if (repeat_max == 1) goto END_REPEAT; 4004 *code++ = OP_UPTO + repeat_type; 4005 PUT2INC(code, 0, repeat_max - 1); 4006 } 4007 } 4008 4009 /* The case {n,n} is just an EXACT, while the general case {n,m} is 4010 handled as an EXACT followed by an UPTO. */ 4011 4012 else 4013 { 4014 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */ 4015 PUT2INC(code, 0, repeat_min); 4016 4017 /* If the maximum is unlimited, insert an OP_STAR. Before doing so, 4018 we have to insert the character for the previous code. For a repeated 4019 Unicode property match, there are two extra bytes that define the 4020 required property. In UTF-8 mode, long characters have their length in 4021 c, with the 0x80 bit as a flag. */ 4022 4023 if (repeat_max < 0) 4024 { 4025#ifdef SUPPORT_UTF8 4026 if (utf8 && c >= 128) 4027 { 4028 memcpy(code, utf8_char, c & 7); 4029 code += c & 7; 4030 } 4031 else 4032#endif 4033 { 4034 *code++ = c; 4035 if (prop_type >= 0) 4036 { 4037 *code++ = prop_type; 4038 *code++ = prop_value; 4039 } 4040 } 4041 *code++ = OP_STAR + repeat_type; 4042 } 4043 4044 /* Else insert an UPTO if the max is greater than the min, again 4045 preceded by the character, for the previously inserted code. If the 4046 UPTO is just for 1 instance, we can use QUERY instead. 
*/ 4047 4048 else if (repeat_max != repeat_min) 4049 { 4050#ifdef SUPPORT_UTF8 4051 if (utf8 && c >= 128) 4052 { 4053 memcpy(code, utf8_char, c & 7); 4054 code += c & 7; 4055 } 4056 else 4057#endif 4058 *code++ = c; 4059 if (prop_type >= 0) 4060 { 4061 *code++ = prop_type; 4062 *code++ = prop_value; 4063 } 4064 repeat_max -= repeat_min; 4065 4066 if (repeat_max == 1) 4067 { 4068 *code++ = OP_QUERY + repeat_type; 4069 } 4070 else 4071 { 4072 *code++ = OP_UPTO + repeat_type; 4073 PUT2INC(code, 0, repeat_max); 4074 } 4075 } 4076 } 4077 4078 /* The character or character type itself comes last in all cases. */ 4079 4080#ifdef SUPPORT_UTF8 4081 if (utf8 && c >= 128) 4082 { 4083 memcpy(code, utf8_char, c & 7); 4084 code += c & 7; 4085 } 4086 else 4087#endif 4088 *code++ = c; 4089 4090 /* For a repeated Unicode property match, there are two extra bytes that 4091 define the required property. */ 4092 4093#ifdef SUPPORT_UCP 4094 if (prop_type >= 0) 4095 { 4096 *code++ = prop_type; 4097 *code++ = prop_value; 4098 } 4099#endif 4100 } 4101 4102 /* If previous was a character class or a back reference, we put the repeat 4103 stuff after it, but just skip the item if the repeat was {0,0}. */ 4104 4105 else if (*previous == OP_CLASS || 4106 *previous == OP_NCLASS || 4107#ifdef SUPPORT_UTF8 4108 *previous == OP_XCLASS || 4109#endif 4110 *previous == OP_REF) 4111 { 4112 if (repeat_max == 0) 4113 { 4114 code = previous; 4115 goto END_REPEAT; 4116 } 4117 4118 /*--------------------------------------------------------------------*/ 4119 /* This code is obsolete from release 8.00; the restriction was finally 4120 removed: */ 4121 4122 /* All real repeats make it impossible to handle partial matching (maybe 4123 one day we will be able to remove this restriction). 
*/ 4124 4125 /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */ 4126 /*--------------------------------------------------------------------*/ 4127 4128 if (repeat_min == 0 && repeat_max == -1) 4129 *code++ = OP_CRSTAR + repeat_type; 4130 else if (repeat_min == 1 && repeat_max == -1) 4131 *code++ = OP_CRPLUS + repeat_type; 4132 else if (repeat_min == 0 && repeat_max == 1) 4133 *code++ = OP_CRQUERY + repeat_type; 4134 else 4135 { 4136 *code++ = OP_CRRANGE + repeat_type; 4137 PUT2INC(code, 0, repeat_min); 4138 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */ 4139 PUT2INC(code, 0, repeat_max); 4140 } 4141 } 4142 4143 /* If previous was a bracket group, we may have to replicate it in certain 4144 cases. */ 4145 4146 else if (*previous == OP_BRA || *previous == OP_CBRA || 4147 *previous == OP_ONCE || *previous == OP_COND) 4148 { 4149 register int i; 4150 int ketoffset = 0; 4151 int len = code - previous; 4152 uschar *bralink = NULL; 4153 4154 /* Repeating a DEFINE group is pointless */ 4155 4156 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF) 4157 { 4158 *errorcodeptr = ERR55; 4159 goto FAILED; 4160 } 4161 4162 /* If the maximum repeat count is unlimited, find the end of the bracket 4163 by scanning through from the start, and compute the offset back to it 4164 from the current code pointer. There may be an OP_OPT setting following 4165 the final KET, so we can't find the end just by going back from the code 4166 pointer. */ 4167 4168 if (repeat_max == -1) 4169 { 4170 register uschar *ket = previous; 4171 do ket += GET(ket, 1); while (*ket != OP_KET); 4172 ketoffset = code - ket; 4173 } 4174 4175 /* The case of a zero minimum is special because of the need to stick 4176 OP_BRAZERO in front of it, and because the group appears once in the 4177 data, whereas in other cases it appears the minimum number of times. For 4178 this reason, it is simplest to treat this case separately, as otherwise 4179 the code gets far too messy. 
There are several special subcases when the 4180 minimum is zero. */ 4181 4182 if (repeat_min == 0) 4183 { 4184 /* If the maximum is also zero, we used to just omit the group from the 4185 output altogether, like this: 4186 4187 ** if (repeat_max == 0) 4188 ** { 4189 ** code = previous; 4190 ** goto END_REPEAT; 4191 ** } 4192 4193 However, that fails when a group is referenced as a subroutine from 4194 elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it 4195 so that it is skipped on execution. As we don't have a list of which 4196 groups are referenced, we cannot do this selectively. 4197 4198 If the maximum is 1 or unlimited, we just have to stick in the BRAZERO 4199 and do no more at this point. However, we do need to adjust any 4200 OP_RECURSE calls inside the group that refer to the group itself or any 4201 internal or forward referenced group, because the offset is from the 4202 start of the whole regex. Temporarily terminate the pattern while doing 4203 this. */ 4204 4205 if (repeat_max <= 1) /* Covers 0, 1, and unlimited */ 4206 { 4207 *code = OP_END; 4208 adjust_recurse(previous, 1, utf8, cd, save_hwm); 4209 memmove(previous+1, previous, len); 4210 code++; 4211 if (repeat_max == 0) 4212 { 4213 *previous++ = OP_SKIPZERO; 4214 goto END_REPEAT; 4215 } 4216 *previous++ = OP_BRAZERO + repeat_type; 4217 } 4218 4219 /* If the maximum is greater than 1 and limited, we have to replicate 4220 in a nested fashion, sticking OP_BRAZERO before each set of brackets. 4221 The first one has to be handled carefully because it's the original 4222 copy, which has to be moved up. The remainder can be handled by code 4223 that is common with the non-zero minimum case below. We have to 4224 adjust the value or repeat_max, since one less copy is required. Once 4225 again, we may have to adjust any OP_RECURSE calls inside the group. 
*/ 4226 4227 else 4228 { 4229 int offset; 4230 *code = OP_END; 4231 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm); 4232 memmove(previous + 2 + LINK_SIZE, previous, len); 4233 code += 2 + LINK_SIZE; 4234 *previous++ = OP_BRAZERO + repeat_type; 4235 *previous++ = OP_BRA; 4236 4237 /* We chain together the bracket offset fields that have to be 4238 filled in later when the ends of the brackets are reached. */ 4239 4240 offset = (bralink == NULL)? 0 : previous - bralink; 4241 bralink = previous; 4242 PUTINC(previous, 0, offset); 4243 } 4244 4245 repeat_max--; 4246 } 4247 4248 /* If the minimum is greater than zero, replicate the group as many 4249 times as necessary, and adjust the maximum to the number of subsequent 4250 copies that we need. If we set a first char from the group, and didn't 4251 set a required char, copy the latter from the former. If there are any 4252 forward reference subroutine calls in the group, there will be entries on 4253 the workspace list; replicate these with an appropriate increment. */ 4254 4255 else 4256 { 4257 if (repeat_min > 1) 4258 { 4259 /* In the pre-compile phase, we don't actually do the replication. We 4260 just adjust the length as if we had. Do some paranoid checks for 4261 potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit 4262 integer type when available, otherwise double. 
*/ 4263 4264 if (lengthptr != NULL) 4265 { 4266 int delta = (repeat_min - 1)*length_prevgroup; 4267 if ((INT64_OR_DOUBLE)(repeat_min - 1)* 4268 (INT64_OR_DOUBLE)length_prevgroup > 4269 (INT64_OR_DOUBLE)INT_MAX || 4270 OFLOW_MAX - *lengthptr < delta) 4271 { 4272 *errorcodeptr = ERR20; 4273 goto FAILED; 4274 } 4275 *lengthptr += delta; 4276 } 4277 4278 /* This is compiling for real */ 4279 4280 else 4281 { 4282 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte; 4283 for (i = 1; i < repeat_min; i++) 4284 { 4285 uschar *hc; 4286 uschar *this_hwm = cd->hwm; 4287 memcpy(code, previous, len); 4288 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE) 4289 { 4290 PUT(cd->hwm, 0, GET(hc, 0) + len); 4291 cd->hwm += LINK_SIZE; 4292 } 4293 save_hwm = this_hwm; 4294 code += len; 4295 } 4296 } 4297 } 4298 4299 if (repeat_max > 0) repeat_max -= repeat_min; 4300 } 4301 4302 /* This code is common to both the zero and non-zero minimum cases. If 4303 the maximum is limited, it replicates the group in a nested fashion, 4304 remembering the bracket starts on a stack. In the case of a zero minimum, 4305 the first one was set up above. In all cases the repeat_max now specifies 4306 the number of additional copies needed. Again, we must remember to 4307 replicate entries on the forward reference list. */ 4308 4309 if (repeat_max >= 0) 4310 { 4311 /* In the pre-compile phase, we don't actually do the replication. We 4312 just adjust the length as if we had. For each repetition we must add 1 4313 to the length for BRAZERO and for all but the last repetition we must 4314 add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some 4315 paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is 4316 a 64-bit integer type when available, otherwise double. 
*/ 4317 4318 if (lengthptr != NULL && repeat_max > 0) 4319 { 4320 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) - 4321 2 - 2*LINK_SIZE; /* Last one doesn't nest */ 4322 if ((INT64_OR_DOUBLE)repeat_max * 4323 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE) 4324 > (INT64_OR_DOUBLE)INT_MAX || 4325 OFLOW_MAX - *lengthptr < delta) 4326 { 4327 *errorcodeptr = ERR20; 4328 goto FAILED; 4329 } 4330 *lengthptr += delta; 4331 } 4332 4333 /* This is compiling for real */ 4334 4335 else for (i = repeat_max - 1; i >= 0; i--) 4336 { 4337 uschar *hc; 4338 uschar *this_hwm = cd->hwm; 4339 4340 *code++ = OP_BRAZERO + repeat_type; 4341 4342 /* All but the final copy start a new nesting, maintaining the 4343 chain of brackets outstanding. */ 4344 4345 if (i != 0) 4346 { 4347 int offset; 4348 *code++ = OP_BRA; 4349 offset = (bralink == NULL)? 0 : code - bralink; 4350 bralink = code; 4351 PUTINC(code, 0, offset); 4352 } 4353 4354 memcpy(code, previous, len); 4355 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE) 4356 { 4357 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1)); 4358 cd->hwm += LINK_SIZE; 4359 } 4360 save_hwm = this_hwm; 4361 code += len; 4362 } 4363 4364 /* Now chain through the pending brackets, and fill in their length 4365 fields (which are holding the chain links pro tem). */ 4366 4367 while (bralink != NULL) 4368 { 4369 int oldlinkoffset; 4370 int offset = code - bralink + 1; 4371 uschar *bra = code - offset; 4372 oldlinkoffset = GET(bra, 1); 4373 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset; 4374 *code++ = OP_KET; 4375 PUTINC(code, 0, offset); 4376 PUT(bra, 1, offset); 4377 } 4378 } 4379 4380 /* If the maximum is unlimited, set a repeater in the final copy. We 4381 can't just offset backwards from the current code point, because we 4382 don't know if there's been an options resetting after the ket. The 4383 correct offset was computed above. 
4384 4385 Then, when we are doing the actual compile phase, check to see whether 4386 this group is a non-atomic one that could match an empty string. If so, 4387 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so 4388 that runtime checking can be done. [This check is also applied to 4389 atomic groups at runtime, but in a different way.] */ 4390 4391 else 4392 { 4393 uschar *ketcode = code - ketoffset; 4394 uschar *bracode = ketcode - GET(ketcode, 1); 4395 *ketcode = OP_KETRMAX + repeat_type; 4396 if (lengthptr == NULL && *bracode != OP_ONCE) 4397 { 4398 uschar *scode = bracode; 4399 do 4400 { 4401 if (could_be_empty_branch(scode, ketcode, utf8, cd)) 4402 { 4403 *bracode += OP_SBRA - OP_BRA; 4404 break; 4405 } 4406 scode += GET(scode, 1); 4407 } 4408 while (*scode == OP_ALT); 4409 } 4410 } 4411 } 4412 4413 /* If previous is OP_FAIL, it was generated by an empty class [] in 4414 JavaScript mode. The other ways in which OP_FAIL can be generated, that is 4415 by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat" 4416 error above. We can just ignore the repeat in JS case. */ 4417 4418 else if (*previous == OP_FAIL) goto END_REPEAT; 4419 4420 /* Else there's some kind of shambles */ 4421 4422 else 4423 { 4424 *errorcodeptr = ERR11; 4425 goto FAILED; 4426 } 4427 4428 /* If the character following a repeat is '+', or if certain optimization 4429 tests above succeeded, possessive_quantifier is TRUE. For some of the 4430 simpler opcodes, there is an special alternative opcode for this. For 4431 anything else, we wrap the entire repeated item inside OP_ONCE brackets. 4432 The '+' notation is just syntactic sugar, taken from Sun's Java package, 4433 but the special opcodes can optimize it a bit. The repeated item starts at 4434 tempcode, not at previous, which might be the first part of a string whose 4435 (former) last char we repeated. 4436 4437 Possessifying an 'exact' quantifier has no effect, so we can ignore it. 
But 4438 an 'upto' may follow. We skip over an 'exact' item, and then test the 4439 length of what remains before proceeding. */ 4440 4441 if (possessive_quantifier) 4442 { 4443 int len; 4444 4445 if (*tempcode == OP_TYPEEXACT) 4446 tempcode += _pcre_OP_lengths[*tempcode] + 4447 ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 2 : 0); 4448 4449 else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT) 4450 { 4451 tempcode += _pcre_OP_lengths[*tempcode]; 4452#ifdef SUPPORT_UTF8 4453 if (utf8 && tempcode[-1] >= 0xc0) 4454 tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f]; 4455#endif 4456 } 4457 4458 len = code - tempcode; 4459 if (len > 0) switch (*tempcode) 4460 { 4461 case OP_STAR: *tempcode = OP_POSSTAR; break; 4462 case OP_PLUS: *tempcode = OP_POSPLUS; break; 4463 case OP_QUERY: *tempcode = OP_POSQUERY; break; 4464 case OP_UPTO: *tempcode = OP_POSUPTO; break; 4465 4466 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break; 4467 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break; 4468 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break; 4469 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break; 4470 4471 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break; 4472 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break; 4473 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break; 4474 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break; 4475 4476 /* Because we are moving code along, we must ensure that any 4477 pending recursive references are updated. */ 4478 4479 default: 4480 *code = OP_END; 4481 adjust_recurse(tempcode, 1 + LINK_SIZE, utf8, cd, save_hwm); 4482 memmove(tempcode + 1+LINK_SIZE, tempcode, len); 4483 code += 1 + LINK_SIZE; 4484 len += 1 + LINK_SIZE; 4485 tempcode[0] = OP_ONCE; 4486 *code++ = OP_KET; 4487 PUTINC(code, 0, len); 4488 PUT(tempcode, 1, len); 4489 break; 4490 } 4491 } 4492 4493 /* In all case we no longer have a previous item. 
We also set the 4494 "follows varying string" flag for subsequently encountered reqbytes if 4495 it isn't already set and we have just passed a varying length item. */ 4496 4497 END_REPEAT: 4498 previous = NULL; 4499 cd->req_varyopt |= reqvary; 4500 break; 4501 4502 4503 /* ===================================================================*/ 4504 /* Start of nested parenthesized sub-expression, or comment or lookahead or 4505 lookbehind or option setting or condition or all the other extended 4506 parenthesis forms. */ 4507 4508 case CHAR_LEFT_PARENTHESIS: 4509 newoptions = options; 4510 skipbytes = 0; 4511 bravalue = OP_CBRA; 4512 save_hwm = cd->hwm; 4513 reset_bracount = FALSE; 4514 4515 /* First deal with various "verbs" that can be introduced by '*'. */ 4516 4517 if (*(++ptr) == CHAR_ASTERISK && (cd->ctypes[ptr[1]] & ctype_letter) != 0) 4518 { 4519 int i, namelen; 4520 const char *vn = verbnames; 4521 const uschar *name = ++ptr; 4522 previous = NULL; 4523 while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {}; 4524 if (*ptr == CHAR_COLON) 4525 { 4526 *errorcodeptr = ERR59; /* Not supported */ 4527 goto FAILED; 4528 } 4529 if (*ptr != CHAR_RIGHT_PARENTHESIS) 4530 { 4531 *errorcodeptr = ERR60; 4532 goto FAILED; 4533 } 4534 namelen = ptr - name; 4535 for (i = 0; i < verbcount; i++) 4536 { 4537 if (namelen == verbs[i].len && 4538 strncmp((char *)name, vn, namelen) == 0) 4539 { 4540 /* Check for open captures before ACCEPT */ 4541 4542 if (verbs[i].op == OP_ACCEPT) 4543 { 4544 open_capitem *oc; 4545 cd->had_accept = TRUE; 4546 for (oc = cd->open_caps; oc != NULL; oc = oc->next) 4547 { 4548 *code++ = OP_CLOSE; 4549 PUT2INC(code, 0, oc->number); 4550 } 4551 } 4552 *code++ = verbs[i].op; 4553 break; 4554 } 4555 vn += verbs[i].len + 1; 4556 } 4557 if (i < verbcount) continue; 4558 *errorcodeptr = ERR60; 4559 goto FAILED; 4560 } 4561 4562 /* Deal with the extended parentheses; all are introduced by '?', and the 4563 appearance of any of them means that this is not a 
capturing group. */ 4564 4565 else if (*ptr == CHAR_QUESTION_MARK) 4566 { 4567 int i, set, unset, namelen; 4568 int *optset; 4569 const uschar *name; 4570 uschar *slot; 4571 4572 switch (*(++ptr)) 4573 { 4574 case CHAR_NUMBER_SIGN: /* Comment; skip to ket */ 4575 ptr++; 4576 while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++; 4577 if (*ptr == 0) 4578 { 4579 *errorcodeptr = ERR18; 4580 goto FAILED; 4581 } 4582 continue; 4583 4584 4585 /* ------------------------------------------------------------ */ 4586 case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */ 4587 reset_bracount = TRUE; 4588 /* Fall through */ 4589 4590 /* ------------------------------------------------------------ */ 4591 case CHAR_COLON: /* Non-capturing bracket */ 4592 bravalue = OP_BRA; 4593 ptr++; 4594 break; 4595 4596 4597 /* ------------------------------------------------------------ */ 4598 case CHAR_LEFT_PARENTHESIS: 4599 bravalue = OP_COND; /* Conditional group */ 4600 4601 /* A condition can be an assertion, a number (referring to a numbered 4602 group), a name (referring to a named group), or 'R', referring to 4603 recursion. R<digits> and R&name are also permitted for recursion tests. 4604 4605 There are several syntaxes for testing a named group: (?(name)) is used 4606 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')). 4607 4608 There are two unfortunate ambiguities, caused by history. (a) 'R' can 4609 be the recursive thing or the name 'R' (and similarly for 'R' followed 4610 by digits), and (b) a number could be a name that consists of digits. 4611 In both cases, we look for a name first; if not found, we try the other 4612 cases. */ 4613 4614 /* For conditions that are assertions, check the syntax, and then exit 4615 the switch. This will take control down to where bracketed groups, 4616 including assertions, are processed. 
*/ 4617 4618 if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN || 4619 ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN)) 4620 break; 4621 4622 /* Most other conditions use OP_CREF (a couple change to OP_RREF 4623 below), and all need to skip 3 bytes at the start of the group. */ 4624 4625 code[1+LINK_SIZE] = OP_CREF; 4626 skipbytes = 3; 4627 refsign = -1; 4628 4629 /* Check for a test for recursion in a named group. */ 4630 4631 if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND) 4632 { 4633 terminator = -1; 4634 ptr += 2; 4635 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */ 4636 } 4637 4638 /* Check for a test for a named group's having been set, using the Perl 4639 syntax (?(<name>) or (?('name') */ 4640 4641 else if (ptr[1] == CHAR_LESS_THAN_SIGN) 4642 { 4643 terminator = CHAR_GREATER_THAN_SIGN; 4644 ptr++; 4645 } 4646 else if (ptr[1] == CHAR_APOSTROPHE) 4647 { 4648 terminator = CHAR_APOSTROPHE; 4649 ptr++; 4650 } 4651 else 4652 { 4653 terminator = 0; 4654 if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr); 4655 } 4656 4657 /* We now expect to read a name; any thing else is an error */ 4658 4659 if ((cd->ctypes[ptr[1]] & ctype_word) == 0) 4660 { 4661 ptr += 1; /* To get the right offset */ 4662 *errorcodeptr = ERR28; 4663 goto FAILED; 4664 } 4665 4666 /* Read the name, but also get it as a number if it's all digits */ 4667 4668 recno = 0; 4669 name = ++ptr; 4670 while ((cd->ctypes[*ptr] & ctype_word) != 0) 4671 { 4672 if (recno >= 0) 4673 recno = ((digitab[*ptr] & ctype_digit) != 0)? 4674 recno * 10 + *ptr - CHAR_0 : -1; 4675 ptr++; 4676 } 4677 namelen = ptr - name; 4678 4679 if ((terminator > 0 && *ptr++ != terminator) || 4680 *ptr++ != CHAR_RIGHT_PARENTHESIS) 4681 { 4682 ptr--; /* Error offset */ 4683 *errorcodeptr = ERR26; 4684 goto FAILED; 4685 } 4686 4687 /* Do no further checking in the pre-compile phase. 
*/ 4688 4689 if (lengthptr != NULL) break; 4690 4691 /* In the real compile we do the work of looking for the actual 4692 reference. If the string started with "+" or "-" we require the rest to 4693 be digits, in which case recno will be set. */ 4694 4695 if (refsign > 0) 4696 { 4697 if (recno <= 0) 4698 { 4699 *errorcodeptr = ERR58; 4700 goto FAILED; 4701 } 4702 recno = (refsign == CHAR_MINUS)? 4703 cd->bracount - recno + 1 : recno +cd->bracount; 4704 if (recno <= 0 || recno > cd->final_bracount) 4705 { 4706 *errorcodeptr = ERR15; 4707 goto FAILED; 4708 } 4709 PUT2(code, 2+LINK_SIZE, recno); 4710 break; 4711 } 4712 4713 /* Otherwise (did not start with "+" or "-"), start by looking for the 4714 name. If we find a name, add one to the opcode to change OP_CREF or 4715 OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same, 4716 except they record that the reference was originally to a name. The 4717 information is used to check duplicate names. */ 4718 4719 slot = cd->name_table; 4720 for (i = 0; i < cd->names_found; i++) 4721 { 4722 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break; 4723 slot += cd->name_entry_size; 4724 } 4725 4726 /* Found a previous named subpattern */ 4727 4728 if (i < cd->names_found) 4729 { 4730 recno = GET2(slot, 0); 4731 PUT2(code, 2+LINK_SIZE, recno); 4732 code[1+LINK_SIZE]++; 4733 } 4734 4735 /* Search the pattern for a forward reference */ 4736 4737 else if ((i = find_parens(cd, name, namelen, 4738 (options & PCRE_EXTENDED) != 0)) > 0) 4739 { 4740 PUT2(code, 2+LINK_SIZE, i); 4741 code[1+LINK_SIZE]++; 4742 } 4743 4744 /* If terminator == 0 it means that the name followed directly after 4745 the opening parenthesis [e.g. (?(abc)...] and in this case there are 4746 some further alternatives to try. For the cases where terminator != 0 4747 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have 4748 now checked all the possibilities, so give an error. 
*/ 4749 4750 else if (terminator != 0) 4751 { 4752 *errorcodeptr = ERR15; 4753 goto FAILED; 4754 } 4755 4756 /* Check for (?(R) for recursion. Allow digits after R to specify a 4757 specific group number. */ 4758 4759 else if (*name == CHAR_R) 4760 { 4761 recno = 0; 4762 for (i = 1; i < namelen; i++) 4763 { 4764 if ((digitab[name[i]] & ctype_digit) == 0) 4765 { 4766 *errorcodeptr = ERR15; 4767 goto FAILED; 4768 } 4769 recno = recno * 10 + name[i] - CHAR_0; 4770 } 4771 if (recno == 0) recno = RREF_ANY; 4772 code[1+LINK_SIZE] = OP_RREF; /* Change test type */ 4773 PUT2(code, 2+LINK_SIZE, recno); 4774 } 4775 4776 /* Similarly, check for the (?(DEFINE) "condition", which is always 4777 false. */ 4778 4779 else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0) 4780 { 4781 code[1+LINK_SIZE] = OP_DEF; 4782 skipbytes = 1; 4783 } 4784 4785 /* Check for the "name" actually being a subpattern number. We are 4786 in the second pass here, so final_bracount is set. */ 4787 4788 else if (recno > 0 && recno <= cd->final_bracount) 4789 { 4790 PUT2(code, 2+LINK_SIZE, recno); 4791 } 4792 4793 /* Either an unidentified subpattern, or a reference to (?(0) */ 4794 4795 else 4796 { 4797 *errorcodeptr = (recno == 0)? ERR35: ERR15; 4798 goto FAILED; 4799 } 4800 break; 4801 4802 4803 /* ------------------------------------------------------------ */ 4804 case CHAR_EQUALS_SIGN: /* Positive lookahead */ 4805 bravalue = OP_ASSERT; 4806 ptr++; 4807 break; 4808 4809 4810 /* ------------------------------------------------------------ */ 4811 case CHAR_EXCLAMATION_MARK: /* Negative lookahead */ 4812 ptr++; 4813 if (*ptr == CHAR_RIGHT_PARENTHESIS) /* Optimize (?!) 
*/ 4814 { 4815 *code++ = OP_FAIL; 4816 previous = NULL; 4817 continue; 4818 } 4819 bravalue = OP_ASSERT_NOT; 4820 break; 4821 4822 4823 /* ------------------------------------------------------------ */ 4824 case CHAR_LESS_THAN_SIGN: /* Lookbehind or named define */ 4825 switch (ptr[1]) 4826 { 4827 case CHAR_EQUALS_SIGN: /* Positive lookbehind */ 4828 bravalue = OP_ASSERTBACK; 4829 ptr += 2; 4830 break; 4831 4832 case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */ 4833 bravalue = OP_ASSERTBACK_NOT; 4834 ptr += 2; 4835 break; 4836 4837 default: /* Could be name define, else bad */ 4838 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME; 4839 ptr++; /* Correct offset for error */ 4840 *errorcodeptr = ERR24; 4841 goto FAILED; 4842 } 4843 break; 4844 4845 4846 /* ------------------------------------------------------------ */ 4847 case CHAR_GREATER_THAN_SIGN: /* One-time brackets */ 4848 bravalue = OP_ONCE; 4849 ptr++; 4850 break; 4851 4852 4853 /* ------------------------------------------------------------ */ 4854 case CHAR_C: /* Callout - may be followed by digits; */ 4855 previous_callout = code; /* Save for later completion */ 4856 after_manual_callout = 1; /* Skip one item before completing */ 4857 *code++ = OP_CALLOUT; 4858 { 4859 int n = 0; 4860 while ((digitab[*(++ptr)] & ctype_digit) != 0) 4861 n = n * 10 + *ptr - CHAR_0; 4862 if (*ptr != CHAR_RIGHT_PARENTHESIS) 4863 { 4864 *errorcodeptr = ERR39; 4865 goto FAILED; 4866 } 4867 if (n > 255) 4868 { 4869 *errorcodeptr = ERR38; 4870 goto FAILED; 4871 } 4872 *code++ = n; 4873 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */ 4874 PUT(code, LINK_SIZE, 0); /* Default length */ 4875 code += 2 * LINK_SIZE; 4876 } 4877 previous = NULL; 4878 continue; 4879 4880 4881 /* ------------------------------------------------------------ */ 4882 case CHAR_P: /* Python-style named subpattern handling */ 4883 if (*(++ptr) == CHAR_EQUALS_SIGN || 4884 *ptr == CHAR_GREATER_THAN_SIGN) /* Reference or recursion 
*/ 4885 { 4886 is_recurse = *ptr == CHAR_GREATER_THAN_SIGN; 4887 terminator = CHAR_RIGHT_PARENTHESIS; 4888 goto NAMED_REF_OR_RECURSE; 4889 } 4890 else if (*ptr != CHAR_LESS_THAN_SIGN) /* Test for Python-style defn */ 4891 { 4892 *errorcodeptr = ERR41; 4893 goto FAILED; 4894 } 4895 /* Fall through to handle (?P< as (?< is handled */ 4896 4897 4898 /* ------------------------------------------------------------ */ 4899 DEFINE_NAME: /* Come here from (?< handling */ 4900 case CHAR_APOSTROPHE: 4901 { 4902 terminator = (*ptr == CHAR_LESS_THAN_SIGN)? 4903 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE; 4904 name = ++ptr; 4905 4906 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++; 4907 namelen = ptr - name; 4908 4909 /* In the pre-compile phase, just do a syntax check. */ 4910 4911 if (lengthptr != NULL) 4912 { 4913 if (*ptr != terminator) 4914 { 4915 *errorcodeptr = ERR42; 4916 goto FAILED; 4917 } 4918 if (cd->names_found >= MAX_NAME_COUNT) 4919 { 4920 *errorcodeptr = ERR49; 4921 goto FAILED; 4922 } 4923 if (namelen + 3 > cd->name_entry_size) 4924 { 4925 cd->name_entry_size = namelen + 3; 4926 if (namelen > MAX_NAME_SIZE) 4927 { 4928 *errorcodeptr = ERR48; 4929 goto FAILED; 4930 } 4931 } 4932 } 4933 4934 /* In the real compile, create the entry in the table, maintaining 4935 alphabetical order. Duplicate names for different numbers are 4936 permitted only if PCRE_DUPNAMES is set. Duplicate names for the same 4937 number are always OK. (An existing number can be re-used if (?| 4938 appears in the pattern.) In either event, a duplicate name results in 4939 a duplicate entry in the table, even if the number is the same. This 4940 is because the number of names, and hence the table size, is computed 4941 in the pre-compile, and it affects various numbers and pointers which 4942 would all have to be modified, and the compiled code moved down, if 4943 duplicates with the same number were omitted from the table. This 4944 doesn't seem worth the hassle. 
However, *different* names for the 4945 same number are not permitted. */ 4946 4947 else 4948 { 4949 BOOL dupname = FALSE; 4950 slot = cd->name_table; 4951 4952 for (i = 0; i < cd->names_found; i++) 4953 { 4954 int crc = memcmp(name, slot+2, namelen); 4955 if (crc == 0) 4956 { 4957 if (slot[2+namelen] == 0) 4958 { 4959 if (GET2(slot, 0) != cd->bracount + 1 && 4960 (options & PCRE_DUPNAMES) == 0) 4961 { 4962 *errorcodeptr = ERR43; 4963 goto FAILED; 4964 } 4965 else dupname = TRUE; 4966 } 4967 else crc = -1; /* Current name is a substring */ 4968 } 4969 4970 /* Make space in the table and break the loop for an earlier 4971 name. For a duplicate or later name, carry on. We do this for 4972 duplicates so that in the simple case (when ?(| is not used) they 4973 are in order of their numbers. */ 4974 4975 if (crc < 0) 4976 { 4977 memmove(slot + cd->name_entry_size, slot, 4978 (cd->names_found - i) * cd->name_entry_size); 4979 break; 4980 } 4981 4982 /* Continue the loop for a later or duplicate name */ 4983 4984 slot += cd->name_entry_size; 4985 } 4986 4987 /* For non-duplicate names, check for a duplicate number before 4988 adding the new name. */ 4989 4990 if (!dupname) 4991 { 4992 uschar *cslot = cd->name_table; 4993 for (i = 0; i < cd->names_found; i++) 4994 { 4995 if (cslot != slot) 4996 { 4997 if (GET2(cslot, 0) == cd->bracount + 1) 4998 { 4999 *errorcodeptr = ERR65; 5000 goto FAILED; 5001 } 5002 } 5003 else i--; 5004 cslot += cd->name_entry_size; 5005 } 5006 } 5007 5008 PUT2(slot, 0, cd->bracount + 1); 5009 memcpy(slot + 2, name, namelen); 5010 slot[2+namelen] = 0; 5011 } 5012 } 5013 5014 /* In both pre-compile and compile, count the number of names we've 5015 encountered. 
*/ 5016 5017 cd->names_found++; 5018 ptr++; /* Move past > or ' */ 5019 goto NUMBERED_GROUP; 5020 5021 5022 /* ------------------------------------------------------------ */ 5023 case CHAR_AMPERSAND: /* Perl recursion/subroutine syntax */ 5024 terminator = CHAR_RIGHT_PARENTHESIS; 5025 is_recurse = TRUE; 5026 /* Fall through */ 5027 5028 /* We come here from the Python syntax above that handles both 5029 references (?P=name) and recursion (?P>name), as well as falling 5030 through from the Perl recursion syntax (?&name). We also come here from 5031 the Perl \k<name> or \k'name' back reference syntax and the \k{name} 5032 .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */ 5033 5034 NAMED_REF_OR_RECURSE: 5035 name = ++ptr; 5036 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++; 5037 namelen = ptr - name; 5038 5039 /* In the pre-compile phase, do a syntax check and set a dummy 5040 reference number. */ 5041 5042 if (lengthptr != NULL) 5043 { 5044 if (namelen == 0) 5045 { 5046 *errorcodeptr = ERR62; 5047 goto FAILED; 5048 } 5049 if (*ptr != terminator) 5050 { 5051 *errorcodeptr = ERR42; 5052 goto FAILED; 5053 } 5054 if (namelen > MAX_NAME_SIZE) 5055 { 5056 *errorcodeptr = ERR48; 5057 goto FAILED; 5058 } 5059 recno = 0; 5060 } 5061 5062 /* In the real compile, seek the name in the table. We check the name 5063 first, and then check that we have reached the end of the name in the 5064 table. That way, if the name that is longer than any in the table, 5065 the comparison will fail without reading beyond the table entry. 
*/ 5066 5067 else 5068 { 5069 slot = cd->name_table; 5070 for (i = 0; i < cd->names_found; i++) 5071 { 5072 if (strncmp((char *)name, (char *)slot+2, namelen) == 0 && 5073 slot[2+namelen] == 0) 5074 break; 5075 slot += cd->name_entry_size; 5076 } 5077 5078 if (i < cd->names_found) /* Back reference */ 5079 { 5080 recno = GET2(slot, 0); 5081 } 5082 else if ((recno = /* Forward back reference */ 5083 find_parens(cd, name, namelen, 5084 (options & PCRE_EXTENDED) != 0)) <= 0) 5085 { 5086 *errorcodeptr = ERR15; 5087 goto FAILED; 5088 } 5089 } 5090 5091 /* In both phases, we can now go to the code than handles numerical 5092 recursion or backreferences. */ 5093 5094 if (is_recurse) goto HANDLE_RECURSION; 5095 else goto HANDLE_REFERENCE; 5096 5097 5098 /* ------------------------------------------------------------ */ 5099 case CHAR_R: /* Recursion */ 5100 ptr++; /* Same as (?0) */ 5101 /* Fall through */ 5102 5103 5104 /* ------------------------------------------------------------ */ 5105 case CHAR_MINUS: case CHAR_PLUS: /* Recursion or subroutine */ 5106 case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: 5107 case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9: 5108 { 5109 const uschar *called; 5110 terminator = CHAR_RIGHT_PARENTHESIS; 5111 5112 /* Come here from the \g<...> and \g'...' code (Oniguruma 5113 compatibility). However, the syntax has been checked to ensure that 5114 the ... are a (signed) number, so that neither ERR63 nor ERR29 will 5115 be called on this path, nor with the jump to OTHER_CHAR_AFTER_QUERY 5116 ever be taken. 
*/ 5117 5118 HANDLE_NUMERICAL_RECURSION: 5119 5120 if ((refsign = *ptr) == CHAR_PLUS) 5121 { 5122 ptr++; 5123 if ((digitab[*ptr] & ctype_digit) == 0) 5124 { 5125 *errorcodeptr = ERR63; 5126 goto FAILED; 5127 } 5128 } 5129 else if (refsign == CHAR_MINUS) 5130 { 5131 if ((digitab[ptr[1]] & ctype_digit) == 0) 5132 goto OTHER_CHAR_AFTER_QUERY; 5133 ptr++; 5134 } 5135 5136 recno = 0; 5137 while((digitab[*ptr] & ctype_digit) != 0) 5138 recno = recno * 10 + *ptr++ - CHAR_0; 5139 5140 if (*ptr != terminator) 5141 { 5142 *errorcodeptr = ERR29; 5143 goto FAILED; 5144 } 5145 5146 if (refsign == CHAR_MINUS) 5147 { 5148 if (recno == 0) 5149 { 5150 *errorcodeptr = ERR58; 5151 goto FAILED; 5152 } 5153 recno = cd->bracount - recno + 1; 5154 if (recno <= 0) 5155 { 5156 *errorcodeptr = ERR15; 5157 goto FAILED; 5158 } 5159 } 5160 else if (refsign == CHAR_PLUS) 5161 { 5162 if (recno == 0) 5163 { 5164 *errorcodeptr = ERR58; 5165 goto FAILED; 5166 } 5167 recno += cd->bracount; 5168 } 5169 5170 /* Come here from code above that handles a named recursion */ 5171 5172 HANDLE_RECURSION: 5173 5174 previous = code; 5175 called = cd->start_code; 5176 5177 /* When we are actually compiling, find the bracket that is being 5178 referenced. Temporarily end the regex in case it doesn't exist before 5179 this point. If we end up with a forward reference, first check that 5180 the bracket does occur later so we can give the error (and position) 5181 now. Then remember this forward reference in the workspace so it can 5182 be filled in at the end. 
*/ 5183 5184 if (lengthptr == NULL) 5185 { 5186 *code = OP_END; 5187 if (recno != 0) 5188 called = _pcre_find_bracket(cd->start_code, utf8, recno); 5189 5190 /* Forward reference */ 5191 5192 if (called == NULL) 5193 { 5194 if (find_parens(cd, NULL, recno, 5195 (options & PCRE_EXTENDED) != 0) < 0) 5196 { 5197 *errorcodeptr = ERR15; 5198 goto FAILED; 5199 } 5200 5201 /* Fudge the value of "called" so that when it is inserted as an 5202 offset below, what it actually inserted is the reference number 5203 of the group. */ 5204 5205 called = cd->start_code + recno; 5206 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code); 5207 } 5208 5209 /* If not a forward reference, and the subpattern is still open, 5210 this is a recursive call. We check to see if this is a left 5211 recursion that could loop for ever, and diagnose that case. */ 5212 5213 else if (GET(called, 1) == 0 && 5214 could_be_empty(called, code, bcptr, utf8, cd)) 5215 { 5216 *errorcodeptr = ERR40; 5217 goto FAILED; 5218 } 5219 } 5220 5221 /* Insert the recursion/subroutine item, automatically wrapped inside 5222 "once" brackets. Set up a "previous group" length so that a 5223 subsequent quantifier will work. 
*/ 5224 5225 *code = OP_ONCE; 5226 PUT(code, 1, 2 + 2*LINK_SIZE); 5227 code += 1 + LINK_SIZE; 5228 5229 *code = OP_RECURSE; 5230 PUT(code, 1, called - cd->start_code); 5231 code += 1 + LINK_SIZE; 5232 5233 *code = OP_KET; 5234 PUT(code, 1, 2 + 2*LINK_SIZE); 5235 code += 1 + LINK_SIZE; 5236 5237 length_prevgroup = 3 + 3*LINK_SIZE; 5238 } 5239 5240 /* Can't determine a first byte now */ 5241 5242 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; 5243 continue; 5244 5245 5246 /* ------------------------------------------------------------ */ 5247 default: /* Other characters: check option setting */ 5248 OTHER_CHAR_AFTER_QUERY: 5249 set = unset = 0; 5250 optset = &set; 5251 5252 while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON) 5253 { 5254 switch (*ptr++) 5255 { 5256 case CHAR_MINUS: optset = &unset; break; 5257 5258 case CHAR_J: /* Record that it changed in the external options */ 5259 *optset |= PCRE_DUPNAMES; 5260 cd->external_flags |= PCRE_JCHANGED; 5261 break; 5262 5263 case CHAR_i: *optset |= PCRE_CASELESS; break; 5264 case CHAR_m: *optset |= PCRE_MULTILINE; break; 5265 case CHAR_s: *optset |= PCRE_DOTALL; break; 5266 case CHAR_x: *optset |= PCRE_EXTENDED; break; 5267 case CHAR_U: *optset |= PCRE_UNGREEDY; break; 5268 case CHAR_X: *optset |= PCRE_EXTRA; break; 5269 5270 default: *errorcodeptr = ERR12; 5271 ptr--; /* Correct the offset */ 5272 goto FAILED; 5273 } 5274 } 5275 5276 /* Set up the changed option bits, but don't change anything yet. */ 5277 5278 newoptions = (options | set) & (~unset); 5279 5280 /* If the options ended with ')' this is not the start of a nested 5281 group with option changes, so the options change at this level. If this 5282 item is right at the start of the pattern, the options can be 5283 abstracted and made external in the pre-compile phase, and ignored in 5284 the compile phase. This can be helpful when matching -- for instance in 5285 caseless checking of required bytes. 
5286 5287 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are 5288 definitely *not* at the start of the pattern because something has been 5289 compiled. In the pre-compile phase, however, the code pointer can have 5290 that value after the start, because it gets reset as code is discarded 5291 during the pre-compile. However, this can happen only at top level - if 5292 we are within parentheses, the starting BRA will still be present. At 5293 any parenthesis level, the length value can be used to test if anything 5294 has been compiled at that level. Thus, a test for both these conditions 5295 is necessary to ensure we correctly detect the start of the pattern in 5296 both phases. 5297 5298 If we are not at the pattern start, compile code to change the ims 5299 options if this setting actually changes any of them, and reset the 5300 greedy defaults and the case value for firstbyte and reqbyte. */ 5301 5302 if (*ptr == CHAR_RIGHT_PARENTHESIS) 5303 { 5304 if (code == cd->start_code + 1 + LINK_SIZE && 5305 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE)) 5306 { 5307 cd->external_options = newoptions; 5308 } 5309 else 5310 { 5311 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS)) 5312 { 5313 *code++ = OP_OPT; 5314 *code++ = newoptions & PCRE_IMS; 5315 } 5316 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0); 5317 greedy_non_default = greedy_default ^ 1; 5318 req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0; 5319 } 5320 5321 /* Change options at this level, and pass them back for use 5322 in subsequent branches. When not at the start of the pattern, this 5323 information is also necessary so that a resetting item can be 5324 compiled at the end of a group (if we are in a group). 
*/ 5325 5326 *optionsptr = options = newoptions; 5327 previous = NULL; /* This item can't be repeated */ 5328 continue; /* It is complete */ 5329 } 5330 5331 /* If the options ended with ':' we are heading into a nested group 5332 with possible change of options. Such groups are non-capturing and are 5333 not assertions of any kind. All we need to do is skip over the ':'; 5334 the newoptions value is handled below. */ 5335 5336 bravalue = OP_BRA; 5337 ptr++; 5338 } /* End of switch for character following (? */ 5339 } /* End of (? handling */ 5340 5341 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set, 5342 all unadorned brackets become non-capturing and behave like (?:...) 5343 brackets. */ 5344 5345 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0) 5346 { 5347 bravalue = OP_BRA; 5348 } 5349 5350 /* Else we have a capturing group. */ 5351 5352 else 5353 { 5354 NUMBERED_GROUP: 5355 cd->bracount += 1; 5356 PUT2(code, 1+LINK_SIZE, cd->bracount); 5357 skipbytes = 2; 5358 } 5359 5360 /* Process nested bracketed regex. Assertions may not be repeated, but 5361 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a 5362 non-register variable in order to be able to pass its address because some 5363 compilers complain otherwise. Pass in a new setting for the ims options if 5364 they have changed. */ 5365 5366 previous = (bravalue >= OP_ONCE)? 
code : NULL; 5367 *code = bravalue; 5368 tempcode = code; 5369 tempreqvary = cd->req_varyopt; /* Save value before bracket */ 5370 length_prevgroup = 0; /* Initialize for pre-compile phase */ 5371 5372 if (!compile_regex( 5373 newoptions, /* The complete new option state */ 5374 options & PCRE_IMS, /* The previous ims option state */ 5375 &tempcode, /* Where to put code (updated) */ 5376 &ptr, /* Input pointer (updated) */ 5377 errorcodeptr, /* Where to put an error message */ 5378 (bravalue == OP_ASSERTBACK || 5379 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */ 5380 reset_bracount, /* True if (?| group */ 5381 skipbytes, /* Skip over bracket number */ 5382 &subfirstbyte, /* For possible first char */ 5383 &subreqbyte, /* For possible last char */ 5384 bcptr, /* Current branch chain */ 5385 cd, /* Tables block */ 5386 (lengthptr == NULL)? NULL : /* Actual compile phase */ 5387 &length_prevgroup /* Pre-compile phase */ 5388 )) 5389 goto FAILED; 5390 5391 /* At the end of compiling, code is still pointing to the start of the 5392 group, while tempcode has been updated to point past the end of the group 5393 and any option resetting that may follow it. The pattern pointer (ptr) 5394 is on the bracket. */ 5395 5396 /* If this is a conditional bracket, check that there are no more than 5397 two branches in the group, or just one if it's a DEFINE group. We do this 5398 in the real compile phase, not in the pre-pass, where the whole group may 5399 not be available. */ 5400 5401 if (bravalue == OP_COND && lengthptr == NULL) 5402 { 5403 uschar *tc = code; 5404 int condcount = 0; 5405 5406 do { 5407 condcount++; 5408 tc += GET(tc,1); 5409 } 5410 while (*tc != OP_KET); 5411 5412 /* A DEFINE group is never obeyed inline (the "condition" is always 5413 false). It must have only one branch. 
*/ 5414 5415 if (code[LINK_SIZE+1] == OP_DEF) 5416 { 5417 if (condcount > 1) 5418 { 5419 *errorcodeptr = ERR54; 5420 goto FAILED; 5421 } 5422 bravalue = OP_DEF; /* Just a flag to suppress char handling below */ 5423 } 5424 5425 /* A "normal" conditional group. If there is just one branch, we must not 5426 make use of its firstbyte or reqbyte, because this is equivalent to an 5427 empty second branch. */ 5428 5429 else 5430 { 5431 if (condcount > 2) 5432 { 5433 *errorcodeptr = ERR27; 5434 goto FAILED; 5435 } 5436 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE; 5437 } 5438 } 5439 5440 /* Error if hit end of pattern */ 5441 5442 if (*ptr != CHAR_RIGHT_PARENTHESIS) 5443 { 5444 *errorcodeptr = ERR14; 5445 goto FAILED; 5446 } 5447 5448 /* In the pre-compile phase, update the length by the length of the group, 5449 less the brackets at either end. Then reduce the compiled code to just a 5450 set of non-capturing brackets so that it doesn't use much memory if it is 5451 duplicated by a quantifier.*/ 5452 5453 if (lengthptr != NULL) 5454 { 5455 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE) 5456 { 5457 *errorcodeptr = ERR20; 5458 goto FAILED; 5459 } 5460 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE; 5461 *code++ = OP_BRA; 5462 PUTINC(code, 0, 1 + LINK_SIZE); 5463 *code++ = OP_KET; 5464 PUTINC(code, 0, 1 + LINK_SIZE); 5465 break; /* No need to waste time with special character handling */ 5466 } 5467 5468 /* Otherwise update the main code pointer to the end of the group. */ 5469 5470 code = tempcode; 5471 5472 /* For a DEFINE group, required and first character settings are not 5473 relevant. */ 5474 5475 if (bravalue == OP_DEF) break; 5476 5477 /* Handle updating of the required and first characters for other types of 5478 group. Update for normal brackets of all kinds, and conditions with two 5479 branches (see code above). If the bracket is followed by a quantifier with 5480 zero repeat, we have to back off. 
Hence the definition of zeroreqbyte and 5481 zerofirstbyte outside the main loop so that they can be accessed for the 5482 back off. */ 5483 5484 zeroreqbyte = reqbyte; 5485 zerofirstbyte = firstbyte; 5486 groupsetfirstbyte = FALSE; 5487 5488 if (bravalue >= OP_ONCE) 5489 { 5490 /* If we have not yet set a firstbyte in this branch, take it from the 5491 subpattern, remembering that it was set here so that a repeat of more 5492 than one can replicate it as reqbyte if necessary. If the subpattern has 5493 no firstbyte, set "none" for the whole branch. In both cases, a zero 5494 repeat forces firstbyte to "none". */ 5495 5496 if (firstbyte == REQ_UNSET) 5497 { 5498 if (subfirstbyte >= 0) 5499 { 5500 firstbyte = subfirstbyte; 5501 groupsetfirstbyte = TRUE; 5502 } 5503 else firstbyte = REQ_NONE; 5504 zerofirstbyte = REQ_NONE; 5505 } 5506 5507 /* If firstbyte was previously set, convert the subpattern's firstbyte 5508 into reqbyte if there wasn't one, using the vary flag that was in 5509 existence beforehand. */ 5510 5511 else if (subfirstbyte >= 0 && subreqbyte < 0) 5512 subreqbyte = subfirstbyte | tempreqvary; 5513 5514 /* If the subpattern set a required byte (or set a first byte that isn't 5515 really the first byte - see above), set it. */ 5516 5517 if (subreqbyte >= 0) reqbyte = subreqbyte; 5518 } 5519 5520 /* For a forward assertion, we take the reqbyte, if set. This can be 5521 helpful if the pattern that follows the assertion doesn't set a different 5522 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte 5523 for an assertion, however because it leads to incorrect effect for patterns 5524 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead 5525 of a firstbyte. This is overcome by a scan at the end if there's no 5526 firstbyte, looking for an asserted first char. 
*/ 5527 5528 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte; 5529 break; /* End of processing '(' */ 5530 5531 5532 /* ===================================================================*/ 5533 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values 5534 are arranged to be the negation of the corresponding OP_values. For the 5535 back references, the values are ESC_REF plus the reference number. Only 5536 back references and those types that consume a character may be repeated. 5537 We can test for values between ESC_b and ESC_Z for the latter; this may 5538 have to change if any new ones are ever created. */ 5539 5540 case CHAR_BACKSLASH: 5541 tempptr = ptr; 5542 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE); 5543 if (*errorcodeptr != 0) goto FAILED; 5544 5545 if (c < 0) 5546 { 5547 if (-c == ESC_Q) /* Handle start of quoted string */ 5548 { 5549 if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E) 5550 ptr += 2; /* avoid empty string */ 5551 else inescq = TRUE; 5552 continue; 5553 } 5554 5555 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */ 5556 5557 /* For metasequences that actually match a character, we disable the 5558 setting of a first character if it hasn't already been set. */ 5559 5560 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z) 5561 firstbyte = REQ_NONE; 5562 5563 /* Set values to reset to if this is followed by a zero repeat. */ 5564 5565 zerofirstbyte = firstbyte; 5566 zeroreqbyte = reqbyte; 5567 5568 /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n' 5569 is a subroutine call by number (Oniguruma syntax). In fact, the value 5570 -ESC_g is returned only for these cases. So we don't need to check for < 5571 or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is 5572 -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as 5573 that is a synonym for a named back reference). 
*/ 5574 5575 if (-c == ESC_g) 5576 { 5577 const uschar *p; 5578 save_hwm = cd->hwm; /* Normally this is set when '(' is read */ 5579 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)? 5580 CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE; 5581 5582 /* These two statements stop the compiler for warning about possibly 5583 unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In 5584 fact, because we actually check for a number below, the paths that 5585 would actually be in error are never taken. */ 5586 5587 skipbytes = 0; 5588 reset_bracount = FALSE; 5589 5590 /* Test for a name */ 5591 5592 if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS) 5593 { 5594 BOOL isnumber = TRUE; 5595 for (p = ptr + 1; *p != 0 && *p != terminator; p++) 5596 { 5597 if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE; 5598 if ((cd->ctypes[*p] & ctype_word) == 0) break; 5599 } 5600 if (*p != terminator) 5601 { 5602 *errorcodeptr = ERR57; 5603 break; 5604 } 5605 if (isnumber) 5606 { 5607 ptr++; 5608 goto HANDLE_NUMERICAL_RECURSION; 5609 } 5610 is_recurse = TRUE; 5611 goto NAMED_REF_OR_RECURSE; 5612 } 5613 5614 /* Test a signed number in angle brackets or quotes. */ 5615 5616 p = ptr + 2; 5617 while ((digitab[*p] & ctype_digit) != 0) p++; 5618 if (*p != terminator) 5619 { 5620 *errorcodeptr = ERR57; 5621 break; 5622 } 5623 ptr++; 5624 goto HANDLE_NUMERICAL_RECURSION; 5625 } 5626 5627 /* \k<name> or \k'name' is a back reference by name (Perl syntax). 5628 We also support \k{name} (.NET syntax) */ 5629 5630 if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN || 5631 ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET)) 5632 { 5633 is_recurse = FALSE; 5634 terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)? 5635 CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)? 
5636 CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET; 5637 goto NAMED_REF_OR_RECURSE; 5638 } 5639 5640 /* Back references are handled specially; must disable firstbyte if 5641 not set to cope with cases like (?=(\w+))\1: which would otherwise set 5642 ':' later. */ 5643 5644 if (-c >= ESC_REF) 5645 { 5646 open_capitem *oc; 5647 recno = -c - ESC_REF; 5648 5649 HANDLE_REFERENCE: /* Come here from named backref handling */ 5650 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; 5651 previous = code; 5652 *code++ = OP_REF; 5653 PUT2INC(code, 0, recno); 5654 cd->backref_map |= (recno < 32)? (1 << recno) : 1; 5655 if (recno > cd->top_backref) cd->top_backref = recno; 5656 5657 /* Check to see if this back reference is recursive, that it, it 5658 is inside the group that it references. A flag is set so that the 5659 group can be made atomic. */ 5660 5661 for (oc = cd->open_caps; oc != NULL; oc = oc->next) 5662 { 5663 if (oc->number == recno) 5664 { 5665 oc->flag = TRUE; 5666 break; 5667 } 5668 } 5669 } 5670 5671 /* So are Unicode property matches, if supported. */ 5672 5673#ifdef SUPPORT_UCP 5674 else if (-c == ESC_P || -c == ESC_p) 5675 { 5676 BOOL negated; 5677 int pdata; 5678 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr); 5679 if (ptype < 0) goto FAILED; 5680 previous = code; 5681 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP; 5682 *code++ = ptype; 5683 *code++ = pdata; 5684 } 5685#else 5686 5687 /* If Unicode properties are not supported, \X, \P, and \p are not 5688 allowed. */ 5689 5690 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p) 5691 { 5692 *errorcodeptr = ERR45; 5693 goto FAILED; 5694 } 5695#endif 5696 5697 /* For the rest (including \X when Unicode properties are supported), we 5698 can obtain the OP value by negating the escape value. */ 5699 5700 else 5701 { 5702 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL; 5703 *code++ = -c; 5704 } 5705 continue; 5706 } 5707 5708 /* We have a data character whose value is in c. 
In UTF-8 mode it may have 5709 a value > 127. We set its representation in the length/buffer, and then 5710 handle it as a data character. */ 5711 5712#ifdef SUPPORT_UTF8 5713 if (utf8 && c > 127) 5714 mclength = _pcre_ord2utf8(c, mcbuffer); 5715 else 5716#endif 5717 5718 { 5719 mcbuffer[0] = c; 5720 mclength = 1; 5721 } 5722 goto ONE_CHAR; 5723 5724 5725 /* ===================================================================*/ 5726 /* Handle a literal character. It is guaranteed not to be whitespace or # 5727 when the extended flag is set. If we are in UTF-8 mode, it may be a 5728 multi-byte literal character. */ 5729 5730 default: 5731 NORMAL_CHAR: 5732 mclength = 1; 5733 mcbuffer[0] = c; 5734 5735#ifdef SUPPORT_UTF8 5736 if (utf8 && c >= 0xc0) 5737 { 5738 while ((ptr[1] & 0xc0) == 0x80) 5739 mcbuffer[mclength++] = *(++ptr); 5740 } 5741#endif 5742 5743 /* At this point we have the character's bytes in mcbuffer, and the length 5744 in mclength. When not in UTF-8 mode, the length is always 1. */ 5745 5746 ONE_CHAR: 5747 previous = code; 5748 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR; 5749 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c]; 5750 5751 /* Remember if \r or \n were seen */ 5752 5753 if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL) 5754 cd->external_flags |= PCRE_HASCRORLF; 5755 5756 /* Set the first and required bytes appropriately. If no previous first 5757 byte, set it from this character, but revert to none on a zero repeat. 5758 Otherwise, leave the firstbyte value alone, and don't change it on a zero 5759 repeat. */ 5760 5761 if (firstbyte == REQ_UNSET) 5762 { 5763 zerofirstbyte = REQ_NONE; 5764 zeroreqbyte = reqbyte; 5765 5766 /* If the character is more than one byte long, we can set firstbyte 5767 only if it is not to be matched caselessly. 
*/ 5768 5769 if (mclength == 1 || req_caseopt == 0) 5770 { 5771 firstbyte = mcbuffer[0] | req_caseopt; 5772 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt; 5773 } 5774 else firstbyte = reqbyte = REQ_NONE; 5775 } 5776 5777 /* firstbyte was previously set; we can set reqbyte only the length is 5778 1 or the matching is caseful. */ 5779 5780 else 5781 { 5782 zerofirstbyte = firstbyte; 5783 zeroreqbyte = reqbyte; 5784 if (mclength == 1 || req_caseopt == 0) 5785 reqbyte = code[-1] | req_caseopt | cd->req_varyopt; 5786 } 5787 5788 break; /* End of literal character handling */ 5789 } 5790 } /* end of big loop */ 5791 5792 5793/* Control never reaches here by falling through, only by a goto for all the 5794error states. Pass back the position in the pattern so that it can be displayed 5795to the user for diagnosing the error. */ 5796 5797FAILED: 5798*ptrptr = ptr; 5799return FALSE; 5800} 5801 5802 5803 5804 5805/************************************************* 5806* Compile sequence of alternatives * 5807*************************************************/ 5808 5809/* On entry, ptr is pointing past the bracket character, but on return it 5810points to the closing bracket, or vertical bar, or end of string. The code 5811variable is pointing at the byte into which the BRA operator has been stored. 5812If the ims options are changed at the start (for a (?ims: group) or during any 5813branch, we need to insert an OP_OPT item at the start of every following branch 5814to ensure they get set correctly at run time, and also pass the new options 5815into every subsequent branch compile. 5816 5817This function is used during the pre-compile phase when we are trying to find 5818out the amount of memory needed, as well as during the real compile phase. The 5819value of lengthptr distinguishes the two phases. 
5820 5821Arguments: 5822 options option bits, including any changes for this subpattern 5823 oldims previous settings of ims option bits 5824 codeptr -> the address of the current code pointer 5825 ptrptr -> the address of the current pattern pointer 5826 errorcodeptr -> pointer to error code variable 5827 lookbehind TRUE if this is a lookbehind assertion 5828 reset_bracount TRUE to reset the count for each branch 5829 skipbytes skip this many bytes at start (for brackets and OP_COND) 5830 firstbyteptr place to put the first required character, or a negative number 5831 reqbyteptr place to put the last required character, or a negative number 5832 bcptr pointer to the chain of currently open branches 5833 cd points to the data block with tables pointers etc. 5834 lengthptr NULL during the real compile phase 5835 points to length accumulator during pre-compile phase 5836 5837Returns: TRUE on success 5838*/ 5839 5840static BOOL 5841compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr, 5842 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes, 5843 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd, 5844 int *lengthptr) 5845{ 5846const uschar *ptr = *ptrptr; 5847uschar *code = *codeptr; 5848uschar *last_branch = code; 5849uschar *start_bracket = code; 5850uschar *reverse_count = NULL; 5851open_capitem capitem; 5852int capnumber = 0; 5853int firstbyte, reqbyte; 5854int branchfirstbyte, branchreqbyte; 5855int length; 5856int orig_bracount; 5857int max_bracount; 5858int old_external_options = cd->external_options; 5859branch_chain bc; 5860 5861bc.outer = bcptr; 5862bc.current_branch = code; 5863 5864firstbyte = reqbyte = REQ_UNSET; 5865 5866/* Accumulate the length for use in the pre-compile phase. Start with the 5867length of the BRA and KET and any extra bytes that are required at the 5868beginning. We accumulate in a local variable to save frequent testing of 5869lenthptr for NULL. 
We cannot do this by looking at the value of code at the 5870start and end of each alternative, because compiled items are discarded during 5871the pre-compile phase so that the work space is not exceeded. */ 5872 5873length = 2 + 2*LINK_SIZE + skipbytes; 5874 5875/* WARNING: If the above line is changed for any reason, you must also change 5876the code that abstracts option settings at the start of the pattern and makes 5877them global. It tests the value of length for (2 + 2*LINK_SIZE) in the 5878pre-compile phase to find out whether anything has yet been compiled or not. */ 5879 5880/* If this is a capturing subpattern, add to the chain of open capturing items 5881so that we can detect them if (*ACCEPT) is encountered. This is also used to 5882detect groups that contain recursive back references to themselves. */ 5883 5884if (*code == OP_CBRA) 5885 { 5886 capnumber = GET2(code, 1 + LINK_SIZE); 5887 capitem.number = capnumber; 5888 capitem.next = cd->open_caps; 5889 capitem.flag = FALSE; 5890 cd->open_caps = &capitem; 5891 } 5892 5893/* Offset is set zero to mark that this bracket is still open */ 5894 5895PUT(code, 1, 0); 5896code += 1 + LINK_SIZE + skipbytes; 5897 5898/* Loop for each alternative branch */ 5899 5900orig_bracount = max_bracount = cd->bracount; 5901for (;;) 5902 { 5903 /* For a (?| group, reset the capturing bracket count so that each branch 5904 uses the same numbers. 
*/ 5905 5906 if (reset_bracount) cd->bracount = orig_bracount; 5907 5908 /* Handle a change of ims options at the start of the branch */ 5909 5910 if ((options & PCRE_IMS) != oldims) 5911 { 5912 *code++ = OP_OPT; 5913 *code++ = options & PCRE_IMS; 5914 length += 2; 5915 } 5916 5917 /* Set up dummy OP_REVERSE if lookbehind assertion */ 5918 5919 if (lookbehind) 5920 { 5921 *code++ = OP_REVERSE; 5922 reverse_count = code; 5923 PUTINC(code, 0, 0); 5924 length += 1 + LINK_SIZE; 5925 } 5926 5927 /* Now compile the branch; in the pre-compile phase its length gets added 5928 into the length. */ 5929 5930 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte, 5931 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length)) 5932 { 5933 *ptrptr = ptr; 5934 return FALSE; 5935 } 5936 5937 /* If the external options have changed during this branch, it means that we 5938 are at the top level, and a leading option setting has been encountered. We 5939 need to re-set the original option values to take account of this so that, 5940 during the pre-compile phase, we know to allow for a re-set at the start of 5941 subsequent branches. */ 5942 5943 if (old_external_options != cd->external_options) 5944 oldims = cd->external_options & PCRE_IMS; 5945 5946 /* Keep the highest bracket count in case (?| was used and some branch 5947 has fewer than the rest. */ 5948 5949 if (cd->bracount > max_bracount) max_bracount = cd->bracount; 5950 5951 /* In the real compile phase, there is some post-processing to be done. */ 5952 5953 if (lengthptr == NULL) 5954 { 5955 /* If this is the first branch, the firstbyte and reqbyte values for the 5956 branch become the values for the regex. 
*/ 5957 5958 if (*last_branch != OP_ALT) 5959 { 5960 firstbyte = branchfirstbyte; 5961 reqbyte = branchreqbyte; 5962 } 5963 5964 /* If this is not the first branch, the first char and reqbyte have to 5965 match the values from all the previous branches, except that if the 5966 previous value for reqbyte didn't have REQ_VARY set, it can still match, 5967 and we set REQ_VARY for the regex. */ 5968 5969 else 5970 { 5971 /* If we previously had a firstbyte, but it doesn't match the new branch, 5972 we have to abandon the firstbyte for the regex, but if there was 5973 previously no reqbyte, it takes on the value of the old firstbyte. */ 5974 5975 if (firstbyte >= 0 && firstbyte != branchfirstbyte) 5976 { 5977 if (reqbyte < 0) reqbyte = firstbyte; 5978 firstbyte = REQ_NONE; 5979 } 5980 5981 /* If we (now or from before) have no firstbyte, a firstbyte from the 5982 branch becomes a reqbyte if there isn't a branch reqbyte. */ 5983 5984 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0) 5985 branchreqbyte = branchfirstbyte; 5986 5987 /* Now ensure that the reqbytes match */ 5988 5989 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY)) 5990 reqbyte = REQ_NONE; 5991 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */ 5992 } 5993 5994 /* If lookbehind, check that this branch matches a fixed-length string, and 5995 put the length into the OP_REVERSE item. Temporarily mark the end of the 5996 branch with OP_END. If the branch contains OP_RECURSE, the result is -3 5997 because there may be forward references that we can't check here. Set a 5998 flag to cause another lookbehind check at the end. Why not do it all at the 5999 end? Because common, erroneous checks are picked up here and the offset of 6000 the problem can be shown. 
*/ 6001 6002 if (lookbehind) 6003 { 6004 int fixed_length; 6005 *code = OP_END; 6006 fixed_length = find_fixedlength(last_branch, options, FALSE, cd); 6007 DPRINTF(("fixed length = %d\n", fixed_length)); 6008 if (fixed_length == -3) 6009 { 6010 cd->check_lookbehind = TRUE; 6011 } 6012 else if (fixed_length < 0) 6013 { 6014 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25; 6015 *ptrptr = ptr; 6016 return FALSE; 6017 } 6018 else { PUT(reverse_count, 0, fixed_length); } 6019 } 6020 } 6021 6022 /* Reached end of expression, either ')' or end of pattern. In the real 6023 compile phase, go back through the alternative branches and reverse the chain 6024 of offsets, with the field in the BRA item now becoming an offset to the 6025 first alternative. If there are no alternatives, it points to the end of the 6026 group. The length in the terminating ket is always the length of the whole 6027 bracketed item. If any of the ims options were changed inside the group, 6028 compile a resetting op-code following, except at the very end of the pattern. 6029 Return leaving the pointer at the terminating char. */ 6030 6031 if (*ptr != CHAR_VERTICAL_LINE) 6032 { 6033 if (lengthptr == NULL) 6034 { 6035 int branch_length = code - last_branch; 6036 do 6037 { 6038 int prev_length = GET(last_branch, 1); 6039 PUT(last_branch, 1, branch_length); 6040 branch_length = prev_length; 6041 last_branch -= branch_length; 6042 } 6043 while (branch_length > 0); 6044 } 6045 6046 /* Fill in the ket */ 6047 6048 *code = OP_KET; 6049 PUT(code, 1, code - start_bracket); 6050 code += 1 + LINK_SIZE; 6051 6052 /* If it was a capturing subpattern, check to see if it contained any 6053 recursive back references. If so, we must wrap it in atomic brackets. 6054 In any event, remove the block from the chain. 
*/ 6055 6056 if (capnumber > 0) 6057 { 6058 if (cd->open_caps->flag) 6059 { 6060 memmove(start_bracket + 1 + LINK_SIZE, start_bracket, 6061 code - start_bracket); 6062 *start_bracket = OP_ONCE; 6063 code += 1 + LINK_SIZE; 6064 PUT(start_bracket, 1, code - start_bracket); 6065 *code = OP_KET; 6066 PUT(code, 1, code - start_bracket); 6067 code += 1 + LINK_SIZE; 6068 length += 2 + 2*LINK_SIZE; 6069 } 6070 cd->open_caps = cd->open_caps->next; 6071 } 6072 6073 /* Reset options if needed. */ 6074 6075 if ((options & PCRE_IMS) != oldims && *ptr == CHAR_RIGHT_PARENTHESIS) 6076 { 6077 *code++ = OP_OPT; 6078 *code++ = oldims; 6079 length += 2; 6080 } 6081 6082 /* Retain the highest bracket number, in case resetting was used. */ 6083 6084 cd->bracount = max_bracount; 6085 6086 /* Set values to pass back */ 6087 6088 *codeptr = code; 6089 *ptrptr = ptr; 6090 *firstbyteptr = firstbyte; 6091 *reqbyteptr = reqbyte; 6092 if (lengthptr != NULL) 6093 { 6094 if (OFLOW_MAX - *lengthptr < length) 6095 { 6096 *errorcodeptr = ERR20; 6097 return FALSE; 6098 } 6099 *lengthptr += length; 6100 } 6101 return TRUE; 6102 } 6103 6104 /* Another branch follows. In the pre-compile phase, we can move the code 6105 pointer back to where it was for the start of the first branch. (That is, 6106 pretend that each branch is the only one.) 6107 6108 In the real compile phase, insert an ALT node. Its length field points back 6109 to the previous branch while the bracket remains open. At the end the chain 6110 is reversed. It's done like this so that the start of the bracket has a 6111 zero offset until it is closed, making it possible to detect recursion. 
*/ 6112 6113 if (lengthptr != NULL) 6114 { 6115 code = *codeptr + 1 + LINK_SIZE + skipbytes; 6116 length += 1 + LINK_SIZE; 6117 } 6118 else 6119 { 6120 *code = OP_ALT; 6121 PUT(code, 1, code - last_branch); 6122 bc.current_branch = last_branch = code; 6123 code += 1 + LINK_SIZE; 6124 } 6125 6126 ptr++; 6127 } 6128/* Control never reaches here */ 6129} 6130 6131 6132 6133 6134/************************************************* 6135* Check for anchored expression * 6136*************************************************/ 6137 6138/* Try to find out if this is an anchored regular expression. Consider each 6139alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket 6140all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then 6141it's anchored. However, if this is a multiline pattern, then only OP_SOD 6142counts, since OP_CIRC can match in the middle. 6143 6144We can also consider a regex to be anchored if OP_SOM starts all its branches. 6145This is the code for \G, which means "match at start of match position, taking 6146into account the match offset". 6147 6148A branch is also implicitly anchored if it starts with .* and DOTALL is set, 6149because that will try the rest of the pattern at all possible matching points, 6150so there is no point trying again.... er .... 6151 6152.... except when the .* appears inside capturing parentheses, and there is a 6153subsequent back reference to those parentheses. We haven't enough information 6154to catch that case precisely. 6155 6156At first, the best we could do was to detect when .* was in capturing brackets 6157and the highest back reference was greater than or equal to that level. 6158However, by keeping a bitmap of the first 31 back references, we can catch some 6159of the more common cases more precisely. 
6160 6161Arguments: 6162 code points to start of expression (the bracket) 6163 options points to the options setting 6164 bracket_map a bitmap of which brackets we are inside while testing; this 6165 handles up to substring 31; after that we just have to take 6166 the less precise approach 6167 backref_map the back reference bitmap 6168 6169Returns: TRUE or FALSE 6170*/ 6171 6172static BOOL 6173is_anchored(register const uschar *code, int *options, unsigned int bracket_map, 6174 unsigned int backref_map) 6175{ 6176do { 6177 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code], 6178 options, PCRE_MULTILINE, FALSE); 6179 register int op = *scode; 6180 6181 /* Non-capturing brackets */ 6182 6183 if (op == OP_BRA) 6184 { 6185 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE; 6186 } 6187 6188 /* Capturing brackets */ 6189 6190 else if (op == OP_CBRA) 6191 { 6192 int n = GET2(scode, 1+LINK_SIZE); 6193 int new_map = bracket_map | ((n < 32)? (1 << n) : 1); 6194 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE; 6195 } 6196 6197 /* Other brackets */ 6198 6199 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND) 6200 { 6201 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE; 6202 } 6203 6204 /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and 6205 it isn't in brackets that are or may be referenced. 
*/ 6206 6207 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR || 6208 op == OP_TYPEPOSSTAR)) 6209 { 6210 if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0) 6211 return FALSE; 6212 } 6213 6214 /* Check for explicit anchoring */ 6215 6216 else if (op != OP_SOD && op != OP_SOM && 6217 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC)) 6218 return FALSE; 6219 code += GET(code, 1); 6220 } 6221while (*code == OP_ALT); /* Loop for each alternative */ 6222return TRUE; 6223} 6224 6225 6226 6227/************************************************* 6228* Check for starting with ^ or .* * 6229*************************************************/ 6230 6231/* This is called to find out if every branch starts with ^ or .* so that 6232"first char" processing can be done to speed things up in multiline 6233matching and for non-DOTALL patterns that start with .* (which must start at 6234the beginning or after \n). As in the case of is_anchored() (see above), we 6235have to take account of back references to capturing brackets that contain .* 6236because in that case we can't make the assumption. 6237 6238Arguments: 6239 code points to start of expression (the bracket) 6240 bracket_map a bitmap of which brackets we are inside while testing; this 6241 handles up to substring 31; after that we just have to take 6242 the less precise approach 6243 backref_map the back reference bitmap 6244 6245Returns: TRUE or FALSE 6246*/ 6247 6248static BOOL 6249is_startline(const uschar *code, unsigned int bracket_map, 6250 unsigned int backref_map) 6251{ 6252do { 6253 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code], 6254 NULL, 0, FALSE); 6255 register int op = *scode; 6256 6257 /* If we are at the start of a conditional assertion group, *both* the 6258 conditional assertion *and* what follows the condition must satisfy the test 6259 for start of line. Other kinds of condition fail. Note that there may be an 6260 auto-callout at the start of a condition. 
*/ 6261 6262 if (op == OP_COND) 6263 { 6264 scode += 1 + LINK_SIZE; 6265 if (*scode == OP_CALLOUT) scode += _pcre_OP_lengths[OP_CALLOUT]; 6266 switch (*scode) 6267 { 6268 case OP_CREF: 6269 case OP_NCREF: 6270 case OP_RREF: 6271 case OP_NRREF: 6272 case OP_DEF: 6273 return FALSE; 6274 6275 default: /* Assertion */ 6276 if (!is_startline(scode, bracket_map, backref_map)) return FALSE; 6277 do scode += GET(scode, 1); while (*scode == OP_ALT); 6278 scode += 1 + LINK_SIZE; 6279 break; 6280 } 6281 scode = first_significant_code(scode, NULL, 0, FALSE); 6282 op = *scode; 6283 } 6284 6285 /* Non-capturing brackets */ 6286 6287 if (op == OP_BRA) 6288 { 6289 if (!is_startline(scode, bracket_map, backref_map)) return FALSE; 6290 } 6291 6292 /* Capturing brackets */ 6293 6294 else if (op == OP_CBRA) 6295 { 6296 int n = GET2(scode, 1+LINK_SIZE); 6297 int new_map = bracket_map | ((n < 32)? (1 << n) : 1); 6298 if (!is_startline(scode, new_map, backref_map)) return FALSE; 6299 } 6300 6301 /* Other brackets */ 6302 6303 else if (op == OP_ASSERT || op == OP_ONCE) 6304 { 6305 if (!is_startline(scode, bracket_map, backref_map)) return FALSE; 6306 } 6307 6308 /* .* means "start at start or after \n" if it isn't in brackets that 6309 may be referenced. 
*/ 6310 6311 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR) 6312 { 6313 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE; 6314 } 6315 6316 /* Check for explicit circumflex */ 6317 6318 else if (op != OP_CIRC) return FALSE; 6319 6320 /* Move on to the next alternative */ 6321 6322 code += GET(code, 1); 6323 } 6324while (*code == OP_ALT); /* Loop for each alternative */ 6325return TRUE; 6326} 6327 6328 6329 6330/************************************************* 6331* Check for asserted fixed first char * 6332*************************************************/ 6333 6334/* During compilation, the "first char" settings from forward assertions are 6335discarded, because they can cause conflicts with actual literals that follow. 6336However, if we end up without a first char setting for an unanchored pattern, 6337it is worth scanning the regex to see if there is an initial asserted first 6338char. If all branches start with the same asserted char, or with a bracket all 6339of whose alternatives start with the same asserted char (recurse ad lib), then 6340we return that char, otherwise -1. 
6341 6342Arguments: 6343 code points to start of expression (the bracket) 6344 options pointer to the options (used to check casing changes) 6345 inassert TRUE if in an assertion 6346 6347Returns: -1 or the fixed first char 6348*/ 6349 6350static int 6351find_firstassertedchar(const uschar *code, int *options, BOOL inassert) 6352{ 6353register int c = -1; 6354do { 6355 int d; 6356 const uschar *scode = 6357 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE); 6358 register int op = *scode; 6359 6360 switch(op) 6361 { 6362 default: 6363 return -1; 6364 6365 case OP_BRA: 6366 case OP_CBRA: 6367 case OP_ASSERT: 6368 case OP_ONCE: 6369 case OP_COND: 6370 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0) 6371 return -1; 6372 if (c < 0) c = d; else if (c != d) return -1; 6373 break; 6374 6375 case OP_EXACT: /* Fall through */ 6376 scode += 2; 6377 6378 case OP_CHAR: 6379 case OP_CHARNC: 6380 case OP_PLUS: 6381 case OP_MINPLUS: 6382 case OP_POSPLUS: 6383 if (!inassert) return -1; 6384 if (c < 0) 6385 { 6386 c = scode[1]; 6387 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS; 6388 } 6389 else if (c != scode[1]) return -1; 6390 break; 6391 } 6392 6393 code += GET(code, 1); 6394 } 6395while (*code == OP_ALT); 6396return c; 6397} 6398 6399 6400 6401/************************************************* 6402* Compile a Regular Expression * 6403*************************************************/ 6404 6405/* This function takes a string and returns a pointer to a block of store 6406holding a compiled version of the expression. The original API for this 6407function had no error code return variable; it is retained for backwards 6408compatibility. The new function is given a new name. 
6409 6410Arguments: 6411 pattern the regular expression 6412 options various option bits 6413 errorcodeptr pointer to error code variable (pcre_compile2() only) 6414 can be NULL if you don't want a code value 6415 errorptr pointer to pointer to error text 6416 erroroffset ptr offset in pattern where error was detected 6417 tables pointer to character tables or NULL 6418 6419Returns: pointer to compiled data block, or NULL on error, 6420 with errorptr and erroroffset set 6421*/ 6422 6423PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION 6424pcre_compile(const char *pattern, int options, const char **errorptr, 6425 int *erroroffset, const unsigned char *tables) 6426{ 6427return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables); 6428} 6429 6430 6431PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION 6432pcre_compile2(const char *pattern, int options, int *errorcodeptr, 6433 const char **errorptr, int *erroroffset, const unsigned char *tables) 6434{ 6435real_pcre *re; 6436int length = 1; /* For final END opcode */ 6437int firstbyte, reqbyte, newline; 6438int errorcode = 0; 6439int skipatstart = 0; 6440BOOL utf8 = (options & PCRE_UTF8) != 0; 6441size_t size; 6442uschar *code; 6443const uschar *codestart; 6444const uschar *ptr; 6445compile_data compile_block; 6446compile_data *cd = &compile_block; 6447 6448/* This space is used for "compiling" into during the first phase, when we are 6449computing the amount of memory that is needed. Compiled items are thrown away 6450as soon as possible, so that a fairly large buffer should be sufficient for 6451this purpose. The same space is used in the second phase for remembering where 6452to fill in forward references to subpatterns. */ 6453 6454uschar cworkspace[COMPILE_WORK_SIZE]; 6455 6456/* Set this early so that early errors get offset 0. 
*/ 6457 6458ptr = (const uschar *)pattern; 6459 6460/* We can't pass back an error message if errorptr is NULL; I guess the best we 6461can do is just return NULL, but we can set a code value if there is a code 6462pointer. */ 6463 6464if (errorptr == NULL) 6465 { 6466 if (errorcodeptr != NULL) *errorcodeptr = 99; 6467 return NULL; 6468 } 6469 6470*errorptr = NULL; 6471if (errorcodeptr != NULL) *errorcodeptr = ERR0; 6472 6473/* However, we can give a message for this error */ 6474 6475if (erroroffset == NULL) 6476 { 6477 errorcode = ERR16; 6478 goto PCRE_EARLY_ERROR_RETURN2; 6479 } 6480 6481*erroroffset = 0; 6482 6483/* Set up pointers to the individual character tables */ 6484 6485if (tables == NULL) tables = _pcre_default_tables; 6486cd->lcc = tables + lcc_offset; 6487cd->fcc = tables + fcc_offset; 6488cd->cbits = tables + cbits_offset; 6489cd->ctypes = tables + ctypes_offset; 6490 6491/* Check that all undefined public option bits are zero */ 6492 6493if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0) 6494 { 6495 errorcode = ERR17; 6496 goto PCRE_EARLY_ERROR_RETURN; 6497 } 6498 6499/* Check for global one-time settings at the start of the pattern, and remember 6500the offset for later. 
*/ 6501 6502while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS && 6503 ptr[skipatstart+1] == CHAR_ASTERISK) 6504 { 6505 int newnl = 0; 6506 int newbsr = 0; 6507 6508 if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0) 6509 { skipatstart += 7; options |= PCRE_UTF8; continue; } 6510 6511 if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0) 6512 { skipatstart += 5; newnl = PCRE_NEWLINE_CR; } 6513 else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3) == 0) 6514 { skipatstart += 5; newnl = PCRE_NEWLINE_LF; } 6515 else if (strncmp((char *)(ptr+skipatstart+2), STRING_CRLF_RIGHTPAR, 5) == 0) 6516 { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; } 6517 else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANY_RIGHTPAR, 4) == 0) 6518 { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; } 6519 else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANYCRLF_RIGHTPAR, 8) == 0) 6520 { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; } 6521 6522 else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0) 6523 { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; } 6524 else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_UNICODE_RIGHTPAR, 12) == 0) 6525 { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; } 6526 6527 if (newnl != 0) 6528 options = (options & ~PCRE_NEWLINE_BITS) | newnl; 6529 else if (newbsr != 0) 6530 options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr; 6531 else break; 6532 } 6533 6534/* Can't support UTF8 unless PCRE has been compiled to include the code. */ 6535 6536#ifdef SUPPORT_UTF8 6537if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 && 6538 (*erroroffset = _pcre_valid_utf8((USPTR)pattern, -1)) >= 0) 6539 { 6540 errorcode = ERR44; 6541 goto PCRE_EARLY_ERROR_RETURN2; 6542 } 6543#else 6544if (utf8) 6545 { 6546 errorcode = ERR32; 6547 goto PCRE_EARLY_ERROR_RETURN; 6548 } 6549#endif 6550 6551/* Check validity of \R options. 
*/ 6552 6553switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) 6554 { 6555 case 0: 6556 case PCRE_BSR_ANYCRLF: 6557 case PCRE_BSR_UNICODE: 6558 break; 6559 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN; 6560 } 6561 6562/* Handle different types of newline. The three bits give seven cases. The 6563current code allows for fixed one- or two-byte sequences, plus "any" and 6564"anycrlf". */ 6565 6566switch (options & PCRE_NEWLINE_BITS) 6567 { 6568 case 0: newline = NEWLINE; break; /* Build-time default */ 6569 case PCRE_NEWLINE_CR: newline = CHAR_CR; break; 6570 case PCRE_NEWLINE_LF: newline = CHAR_NL; break; 6571 case PCRE_NEWLINE_CR+ 6572 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break; 6573 case PCRE_NEWLINE_ANY: newline = -1; break; 6574 case PCRE_NEWLINE_ANYCRLF: newline = -2; break; 6575 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN; 6576 } 6577 6578if (newline == -2) 6579 { 6580 cd->nltype = NLTYPE_ANYCRLF; 6581 } 6582else if (newline < 0) 6583 { 6584 cd->nltype = NLTYPE_ANY; 6585 } 6586else 6587 { 6588 cd->nltype = NLTYPE_FIXED; 6589 if (newline > 255) 6590 { 6591 cd->nllen = 2; 6592 cd->nl[0] = (newline >> 8) & 255; 6593 cd->nl[1] = newline & 255; 6594 } 6595 else 6596 { 6597 cd->nllen = 1; 6598 cd->nl[0] = newline; 6599 } 6600 } 6601 6602/* Maximum back reference and backref bitmap. The bitmap records up to 31 back 6603references to help in deciding whether (.*) can be treated as anchored or not. 6604*/ 6605 6606cd->top_backref = 0; 6607cd->backref_map = 0; 6608 6609/* Reflect pattern for debugging output */ 6610 6611DPRINTF(("------------------------------------------------------------------\n")); 6612DPRINTF(("%s\n", pattern)); 6613 6614/* Pretend to compile the pattern while actually just accumulating the length 6615of memory required. This behaviour is triggered by passing a non-NULL final 6616argument to compile_regex(). 
We pass a block of workspace (cworkspace) for it 6617to compile parts of the pattern into; the compiled code is discarded when it is 6618no longer needed, so hopefully this workspace will never overflow, though there 6619is a test for its doing so. */ 6620 6621cd->bracount = cd->final_bracount = 0; 6622cd->names_found = 0; 6623cd->name_entry_size = 0; 6624cd->name_table = NULL; 6625cd->start_workspace = cworkspace; 6626cd->start_code = cworkspace; 6627cd->hwm = cworkspace; 6628cd->start_pattern = (const uschar *)pattern; 6629cd->end_pattern = (const uschar *)(pattern + strlen(pattern)); 6630cd->req_varyopt = 0; 6631cd->external_options = options; 6632cd->external_flags = 0; 6633cd->open_caps = NULL; 6634 6635/* Now do the pre-compile. On error, errorcode will be set non-zero, so we 6636don't need to look at the result of the function here. The initial options have 6637been put into the cd block so that they can be changed if an option setting is 6638found within the regex right at the beginning. Bringing initial option settings 6639outside can help speed up starting point checks. */ 6640 6641ptr += skipatstart; 6642code = cworkspace; 6643*code = OP_BRA; 6644(void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS, 6645 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, 6646 &length); 6647if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN; 6648 6649DPRINTF(("end pre-compile: length=%d workspace=%d\n", length, 6650 cd->hwm - cworkspace)); 6651 6652if (length > MAX_PATTERN_SIZE) 6653 { 6654 errorcode = ERR20; 6655 goto PCRE_EARLY_ERROR_RETURN; 6656 } 6657 6658/* Compute the size of data block needed and get it, either from malloc or 6659externally provided function. Integer overflow should no longer be possible 6660because nowadays we limit the maximum value of cd->names_found and 6661cd->name_entry_size. 
*/ 6662 6663size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3); 6664re = (real_pcre *)(pcre_malloc)(size); 6665 6666if (re == NULL) 6667 { 6668 errorcode = ERR21; 6669 goto PCRE_EARLY_ERROR_RETURN; 6670 } 6671 6672/* Put in the magic number, and save the sizes, initial options, internal 6673flags, and character table pointer. NULL is used for the default character 6674tables. The nullpad field is at the end; it's there to help in the case when a 6675regex compiled on a system with 4-byte pointers is run on another with 8-byte 6676pointers. */ 6677 6678re->magic_number = MAGIC_NUMBER; 6679re->size = size; 6680re->options = cd->external_options; 6681re->flags = cd->external_flags; 6682re->dummy1 = 0; 6683re->first_byte = 0; 6684re->req_byte = 0; 6685re->name_table_offset = sizeof(real_pcre); 6686re->name_entry_size = cd->name_entry_size; 6687re->name_count = cd->names_found; 6688re->ref_count = 0; 6689re->tables = (tables == _pcre_default_tables)? NULL : tables; 6690re->nullpad = NULL; 6691 6692/* The starting points of the name/number translation table and of the code are 6693passed around in the compile data block. The start/end pattern and initial 6694options are already set from the pre-compile phase, as is the name_entry_size 6695field. Reset the bracket count and the names_found field. Also reset the hwm 6696field; this time it's used for remembering forward references to subpatterns. 6697*/ 6698 6699cd->final_bracount = cd->bracount; /* Save for checking forward references */ 6700cd->bracount = 0; 6701cd->names_found = 0; 6702cd->name_table = (uschar *)re + re->name_table_offset; 6703codestart = cd->name_table + re->name_entry_size * re->name_count; 6704cd->start_code = codestart; 6705cd->hwm = cworkspace; 6706cd->req_varyopt = 0; 6707cd->had_accept = FALSE; 6708cd->check_lookbehind = FALSE; 6709cd->open_caps = NULL; 6710 6711/* Set up a starting, non-extracting bracket, then compile the expression. 
On 6712error, errorcode will be set non-zero, so we don't need to look at the result 6713of the function here. */ 6714 6715ptr = (const uschar *)pattern + skipatstart; 6716code = (uschar *)codestart; 6717*code = OP_BRA; 6718(void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr, 6719 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL); 6720re->top_bracket = cd->bracount; 6721re->top_backref = cd->top_backref; 6722re->flags = cd->external_flags; 6723 6724if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */ 6725 6726/* If not reached end of pattern on success, there's an excess bracket. */ 6727 6728if (errorcode == 0 && *ptr != 0) errorcode = ERR22; 6729 6730/* Fill in the terminating state and check for disastrous overflow, but 6731if debugging, leave the test till after things are printed out. */ 6732 6733*code++ = OP_END; 6734 6735#ifndef PCRE_DEBUG 6736if (code - codestart > length) errorcode = ERR23; 6737#endif 6738 6739/* Fill in any forward references that are required. */ 6740 6741while (errorcode == 0 && cd->hwm > cworkspace) 6742 { 6743 int offset, recno; 6744 const uschar *groupptr; 6745 cd->hwm -= LINK_SIZE; 6746 offset = GET(cd->hwm, 0); 6747 recno = GET(codestart, offset); 6748 groupptr = _pcre_find_bracket(codestart, utf8, recno); 6749 if (groupptr == NULL) errorcode = ERR53; 6750 else PUT(((uschar *)codestart), offset, groupptr - codestart); 6751 } 6752 6753/* Give an error if there's back reference to a non-existent capturing 6754subpattern. */ 6755 6756if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15; 6757 6758/* If there were any lookbehind assertions that contained OP_RECURSE 6759(recursions or subroutine calls), a flag is set for them to be checked here, 6760because they may contain forward references. Actual recursions can't be fixed 6761length, but subroutine calls can. 
It is done like this so that those without 6762OP_RECURSE that are not fixed length get a diagnosic with a useful offset. The 6763exceptional ones forgo this. We scan the pattern to check that they are fixed 6764length, and set their lengths. */ 6765 6766if (cd->check_lookbehind) 6767 { 6768 uschar *cc = (uschar *)codestart; 6769 6770 /* Loop, searching for OP_REVERSE items, and process those that do not have 6771 their length set. (Actually, it will also re-process any that have a length 6772 of zero, but that is a pathological case, and it does no harm.) When we find 6773 one, we temporarily terminate the branch it is in while we scan it. */ 6774 6775 for (cc = (uschar *)_pcre_find_bracket(codestart, utf8, -1); 6776 cc != NULL; 6777 cc = (uschar *)_pcre_find_bracket(cc, utf8, -1)) 6778 { 6779 if (GET(cc, 1) == 0) 6780 { 6781 int fixed_length; 6782 uschar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE); 6783 int end_op = *be; 6784 *be = OP_END; 6785 fixed_length = find_fixedlength(cc, re->options, TRUE, cd); 6786 *be = end_op; 6787 DPRINTF(("fixed length = %d\n", fixed_length)); 6788 if (fixed_length < 0) 6789 { 6790 errorcode = (fixed_length == -2)? ERR36 : ERR25; 6791 break; 6792 } 6793 PUT(cc, 1, fixed_length); 6794 } 6795 cc += 1 + LINK_SIZE; 6796 } 6797 } 6798 6799/* Failed to compile, or error while post-processing */ 6800 6801if (errorcode != 0) 6802 { 6803 (pcre_free)(re); 6804 PCRE_EARLY_ERROR_RETURN: 6805 *erroroffset = ptr - (const uschar *)pattern; 6806 PCRE_EARLY_ERROR_RETURN2: 6807 *errorptr = find_error_text(errorcode); 6808 if (errorcodeptr != NULL) *errorcodeptr = errorcode; 6809 return NULL; 6810 } 6811 6812/* If the anchored option was not passed, set the flag if we can determine that 6813the pattern is anchored by virtue of ^ characters or \A or anything else (such 6814as starting with .* when DOTALL is set). 6815 6816Otherwise, if we know what the first byte has to be, save it, because that 6817speeds up unanchored matches no end. 
If not, see if we can set the 6818PCRE_STARTLINE flag. This is helpful for multiline matches when all branches 6819start with ^. and also when all branches start with .* for non-DOTALL matches. 6820*/ 6821 6822if ((re->options & PCRE_ANCHORED) == 0) 6823 { 6824 int temp_options = re->options; /* May get changed during these scans */ 6825 if (is_anchored(codestart, &temp_options, 0, cd->backref_map)) 6826 re->options |= PCRE_ANCHORED; 6827 else 6828 { 6829 if (firstbyte < 0) 6830 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE); 6831 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */ 6832 { 6833 int ch = firstbyte & 255; 6834 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 && 6835 cd->fcc[ch] == ch)? ch : firstbyte; 6836 re->flags |= PCRE_FIRSTSET; 6837 } 6838 else if (is_startline(codestart, 0, cd->backref_map)) 6839 re->flags |= PCRE_STARTLINE; 6840 } 6841 } 6842 6843/* For an anchored pattern, we use the "required byte" only if it follows a 6844variable length item in the regex. Remove the caseless flag for non-caseable 6845bytes. */ 6846 6847if (reqbyte >= 0 && 6848 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0)) 6849 { 6850 int ch = reqbyte & 255; 6851 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 && 6852 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte; 6853 re->flags |= PCRE_REQCHSET; 6854 } 6855 6856/* Print out the compiled data if debugging is enabled. This is never the 6857case when building a production library. */ 6858 6859#ifdef PCRE_DEBUG 6860printf("Length = %d top_bracket = %d top_backref = %d\n", 6861 length, re->top_bracket, re->top_backref); 6862 6863printf("Options=%08x\n", re->options); 6864 6865if ((re->flags & PCRE_FIRSTSET) != 0) 6866 { 6867 int ch = re->first_byte & 255; 6868 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? 
6869 "" : " (caseless)"; 6870 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless); 6871 else printf("First char = \\x%02x%s\n", ch, caseless); 6872 } 6873 6874if ((re->flags & PCRE_REQCHSET) != 0) 6875 { 6876 int ch = re->req_byte & 255; 6877 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? 6878 "" : " (caseless)"; 6879 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless); 6880 else printf("Req char = \\x%02x%s\n", ch, caseless); 6881 } 6882 6883pcre_printint(re, stdout, TRUE); 6884 6885/* This check is done here in the debugging case so that the code that 6886was compiled can be seen. */ 6887 6888if (code - codestart > length) 6889 { 6890 (pcre_free)(re); 6891 *errorptr = find_error_text(ERR23); 6892 *erroroffset = ptr - (uschar *)pattern; 6893 if (errorcodeptr != NULL) *errorcodeptr = ERR23; 6894 return NULL; 6895 } 6896#endif /* PCRE_DEBUG */ 6897 6898return (pcre *)re; 6899} 6900 6901/* End of pcre_compile.c */ 6902