1/************************************************* 2* Perl-Compatible Regular Expressions * 3*************************************************/ 4 5/* 6This is a library of functions to support regular expressions whose syntax 7and semantics are as close as possible to those of the Perl 5 language. See 8the file Tech.Notes for some information on the internals. 9 10Written by: Philip Hazel <ph10@cam.ac.uk> 11 12 Copyright (c) 1997-2004 University of Cambridge 13 14----------------------------------------------------------------------------- 15Redistribution and use in source and binary forms, with or without 16modification, are permitted provided that the following conditions are met: 17 18 * Redistributions of source code must retain the above copyright notice, 19 this list of conditions and the following disclaimer. 20 21 * Redistributions in binary form must reproduce the above copyright 22 notice, this list of conditions and the following disclaimer in the 23 documentation and/or other materials provided with the distribution. 24 25 * Neither the name of the University of Cambridge nor the names of its 26 contributors may be used to endorse or promote products derived from 27 this software without specific prior written permission. 28 29THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 30AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 31IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 32ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 33LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 34CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 35SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 36INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 37CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 38ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 39POSSIBILITY OF SUCH DAMAGE. 40----------------------------------------------------------------------------- 41*/ 42 43 44/* Define DEBUG to get debugging output on stdout. */ 45/* #define DEBUG */ 46 47/* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef 48inline, and there are *still* stupid compilers about that don't like indented 49pre-processor statements. I suppose it's only been 10 years... */ 50 51#ifdef DEBUG 52#define DPRINTF(p) printf p 53#else 54#define DPRINTF(p) /*nothing*/ 55#endif 56 57/* Include the internals header, which itself includes "config.h", the Standard 58C headers, and the external pcre header. */ 59 60#include "internal.h" 61 62/* If Unicode Property support is wanted, include a private copy of the 63function that does it, and the table that translates names to numbers. */ 64 65#ifdef SUPPORT_UCP 66#include "ucp.c" 67#include "ucptypetable.c" 68#endif 69 70/* Maximum number of items on the nested bracket stacks at compile time. This 71applies to the nesting of all kinds of parentheses. It does not limit 72un-nested, non-capturing parentheses. This number can be made bigger if 73necessary - it is used to dimension one int and one unsigned char vector at 74compile time. */ 75 76#define BRASTACK_SIZE 200 77 78 79/* Maximum number of ints of offset to save on the stack for recursive calls. 80If the offset vector is bigger, malloc is used. This should be a multiple of 3, 81because the offset vector is always a multiple of 3 long. */ 82 83#define REC_STACK_SAVE_MAX 30 84 85 86/* The maximum remaining length of subject we are prepared to search for a 87req_byte match. */ 88 89#define REQ_BYTE_MAX 1000 90 91 92/* Table of sizes for the fixed-length opcodes. It's defined in a macro so that 93the definition is next to the definition of the opcodes in internal.h. */ 94 95static const uschar OP_lengths[] = { OP_LENGTHS }; 96 97/* Min and max values for the common repeats; for the maxima, 0 => infinity */ 98 99static const char rep_min[] = { 0, 0, 1, 1, 0, 0 }; 100static const char rep_max[] = { 0, 0, 0, 0, 1, 1 }; 101 102/* Table for handling escaped characters in the range '0'-'z'. Positive returns 103are simple data values; negative values are for special things like \d and so 104on. Zero means further processing is needed (for things like \x), or the escape 105is invalid. */ 106 107#if !EBCDIC /* This is the "normal" table for ASCII systems */ 108static const short int escapes[] = { 109 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */ 110 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */ 111 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */ 112 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */ 113-ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */ 114-ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */ 115 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */ 116 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */ 117-ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */ 118 0, 0, -ESC_z /* x - z */ 119}; 120 121#else /* This is the "abnormal" table for EBCDIC systems */ 122static const short int escapes[] = { 123/* 48 */ 0, 0, 0, '.', '<', '(', '+', '|', 124/* 50 */ '&', 0, 0, 0, 0, 0, 0, 0, 125/* 58 */ 0, 0, '!', '$', '*', ')', ';', '~', 126/* 60 */ '-', '/', 0, 0, 0, 0, 0, 0, 127/* 68 */ 0, 0, '|', ',', '%', '_', '>', '?', 128/* 70 */ 0, 0, 0, 0, 0, 0, 0, 0, 129/* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"', 130/* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, 131/* 88 */ 0, 0, 0, '{', 0, 0, 0, 0, 132/* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p, 133/* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0, 134/* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0, 135/* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0, 136/* B0 */ 0, 0, 0, 0, 0, 0, 0, 0, 137/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-', 138/* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G, 139/* C8 */ 0, 0, 0, 0, 0, 0, 0, 0, 140/* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P, 141/* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0, 142/* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X, 143/* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0, 144/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0, 145/* F8 */ 0, 0, 0, 0, 0, 0, 0, 0 146}; 147#endif 148 149 150/* Tables of names of POSIX character classes and their lengths. The list is 151terminated by a zero length entry. The first three must be alpha, upper, lower, 152as this is assumed for handling case independence. */ 153 154static const char *const posix_names[] = { 155 "alpha", "lower", "upper", 156 "alnum", "ascii", "blank", "cntrl", "digit", "graph", 157 "print", "punct", "space", "word", "xdigit" }; 158 159static const uschar posix_name_lengths[] = { 160 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 }; 161 162/* Table of class bit maps for each POSIX class; up to three may be combined 163to form the class. The table for [:blank:] is dynamically modified to remove 164the vertical space characters. */ 165 166static const int posix_class_maps[] = { 167 cbit_lower, cbit_upper, -1, /* alpha */ 168 cbit_lower, -1, -1, /* lower */ 169 cbit_upper, -1, -1, /* upper */ 170 cbit_digit, cbit_lower, cbit_upper, /* alnum */ 171 cbit_print, cbit_cntrl, -1, /* ascii */ 172 cbit_space, -1, -1, /* blank - a GNU extension */ 173 cbit_cntrl, -1, -1, /* cntrl */ 174 cbit_digit, -1, -1, /* digit */ 175 cbit_graph, -1, -1, /* graph */ 176 cbit_print, -1, -1, /* print */ 177 cbit_punct, -1, -1, /* punct */ 178 cbit_space, -1, -1, /* space */ 179 cbit_word, -1, -1, /* word - a Perl extension */ 180 cbit_xdigit,-1, -1 /* xdigit */ 181}; 182 183/* Table to identify digits and hex digits. This is used when compiling 184patterns. Note that the tables in chartables are dependent on the locale, and 185may mark arbitrary characters as digits - but the PCRE compiling code expects 186to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have 187a private table here. It costs 256 bytes, but it is a lot faster than doing 188character value tests (at least in some simple cases I timed), and in some 189applications one wants PCRE to compile efficiently as well as match 190efficiently. 191 192For convenience, we use the same bit definitions as in chartables: 193 194 0x04 decimal digit 195 0x08 hexadecimal digit 196 197Then we can use ctype_digit and ctype_xdigit in the code. */ 198 199#if !EBCDIC /* This is the "normal" case, for ASCII systems */ 200static const unsigned char digitab[] = 201 { 202 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */ 203 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */ 204 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */ 205 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ 206 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */ 207 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */ 208 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */ 209 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */ 210 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */ 211 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */ 212 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */ 213 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */ 214 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */ 215 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */ 216 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */ 217 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */ 218 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */ 219 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */ 220 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */ 221 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */ 222 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */ 223 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */ 224 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */ 225 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */ 226 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */ 227 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */ 228 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */ 229 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */ 230 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */ 231 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */ 232 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */ 233 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */ 234 235#else /* This is the "abnormal" case, for EBCDIC systems */ 236static const unsigned char digitab[] = 237 { 238 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */ 239 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */ 240 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */ 241 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ 242 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */ 243 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */ 244 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */ 245 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */ 246 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */ 247 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */ 248 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */ 249 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- � */ 250 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */ 251 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */ 252 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */ 253 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */ 254 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */ 255 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */ 256 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */ 257 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */ 258 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */ 259 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */ 260 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */ 261 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */ 262 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */ 263 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */ 264 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */ 265 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */ 266 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */ 267 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */ 268 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */ 269 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */ 270 271static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */ 272 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */ 273 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */ 274 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */ 275 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */ 276 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */ 277 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */ 278 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */ 279 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */ 280 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */ 281 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */ 282 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */ 283 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- � */ 284 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */ 285 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */ 286 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */ 287 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */ 288 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */ 289 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */ 290 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */ 291 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */ 292 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */ 293 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */ 294 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */ 295 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */ 296 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */ 297 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */ 298 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */ 299 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */ 300 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */ 301 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */ 302 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */ 303 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */ 304#endif 305 306 307/* Definition to allow mutual recursion */ 308 309static BOOL 310 compile_regex(int, int, int *, uschar **, const uschar **, const char **, 311 BOOL, int, int *, int *, branch_chain *, compile_data *); 312 313/* Structure for building a chain of data that actually lives on the 314stack, for holding the values of the subject pointer at the start of each 315subpattern, so as to detect when an empty string has been matched by a 316subpattern - to break infinite loops. When NO_RECURSE is set, these blocks 317are on the heap, not on the stack. */ 318 319typedef struct eptrblock { 320 struct eptrblock *epb_prev; 321 const uschar *epb_saved_eptr; 322} eptrblock; 323 324/* Flag bits for the match() function */ 325 326#define match_condassert 0x01 /* Called to check a condition assertion */ 327#define match_isgroup 0x02 /* Set if start of bracketed group */ 328 329/* Non-error returns from the match() function. Error returns are externally 330defined PCRE_ERROR_xxx codes, which are all negative. */ 331 332#define MATCH_MATCH 1 333#define MATCH_NOMATCH 0 334 335 336 337/************************************************* 338* Global variables * 339*************************************************/ 340 341/* PCRE is thread-clean and doesn't use any global variables in the normal 342sense. However, it calls memory allocation and free functions via the four 343indirections below, and it can optionally do callouts. These values can be 344changed by the caller, but are shared between all threads. However, when 345compiling for Virtual Pascal, things are done differently (see pcre.in). */ 346 347#ifndef VPCOMPAT 348#ifdef __cplusplus 349extern "C" void *(*pcre_malloc)(size_t) = malloc; 350extern "C" void (*pcre_free)(void *) = free; 351extern "C" void *(*pcre_stack_malloc)(size_t) = malloc; 352extern "C" void (*pcre_stack_free)(void *) = free; 353extern "C" int (*pcre_callout)(pcre_callout_block *) = NULL; 354#else 355void *(*pcre_malloc)(size_t) = malloc; 356void (*pcre_free)(void *) = free; 357void *(*pcre_stack_malloc)(size_t) = malloc; 358void (*pcre_stack_free)(void *) = free; 359int (*pcre_callout)(pcre_callout_block *) = NULL; 360#endif 361#endif 362 363 364/************************************************* 365* Macros and tables for character handling * 366*************************************************/ 367 368/* When UTF-8 encoding is being used, a character is no longer just a single 369byte. The macros for character handling generate simple sequences when used in 370byte-mode, and more complicated ones for UTF-8 characters. */ 371 372#ifndef SUPPORT_UTF8 373#define GETCHAR(c, eptr) c = *eptr; 374#define GETCHARINC(c, eptr) c = *eptr++; 375#define GETCHARINCTEST(c, eptr) c = *eptr++; 376#define GETCHARLEN(c, eptr, len) c = *eptr; 377#define BACKCHAR(eptr) 378 379#else /* SUPPORT_UTF8 */ 380 381/* Get the next UTF-8 character, not advancing the pointer. This is called when 382we know we are in UTF-8 mode. */ 383 384#define GETCHAR(c, eptr) \ 385 c = *eptr; \ 386 if ((c & 0xc0) == 0xc0) \ 387 { \ 388 int gcii; \ 389 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ 390 int gcss = 6*gcaa; \ 391 c = (c & utf8_table3[gcaa]) << gcss; \ 392 for (gcii = 1; gcii <= gcaa; gcii++) \ 393 { \ 394 gcss -= 6; \ 395 c |= (eptr[gcii] & 0x3f) << gcss; \ 396 } \ 397 } 398 399/* Get the next UTF-8 character, advancing the pointer. This is called when we 400know we are in UTF-8 mode. */ 401 402#define GETCHARINC(c, eptr) \ 403 c = *eptr++; \ 404 if ((c & 0xc0) == 0xc0) \ 405 { \ 406 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ 407 int gcss = 6*gcaa; \ 408 c = (c & utf8_table3[gcaa]) << gcss; \ 409 while (gcaa-- > 0) \ 410 { \ 411 gcss -= 6; \ 412 c |= (*eptr++ & 0x3f) << gcss; \ 413 } \ 414 } 415 416/* Get the next character, testing for UTF-8 mode, and advancing the pointer */ 417 418#define GETCHARINCTEST(c, eptr) \ 419 c = *eptr++; \ 420 if (md->utf8 && (c & 0xc0) == 0xc0) \ 421 { \ 422 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ 423 int gcss = 6*gcaa; \ 424 c = (c & utf8_table3[gcaa]) << gcss; \ 425 while (gcaa-- > 0) \ 426 { \ 427 gcss -= 6; \ 428 c |= (*eptr++ & 0x3f) << gcss; \ 429 } \ 430 } 431 432/* Get the next UTF-8 character, not advancing the pointer, incrementing length 433if there are extra bytes. This is called when we know we are in UTF-8 mode. */ 434 435#define GETCHARLEN(c, eptr, len) \ 436 c = *eptr; \ 437 if ((c & 0xc0) == 0xc0) \ 438 { \ 439 int gcii; \ 440 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ 441 int gcss = 6*gcaa; \ 442 c = (c & utf8_table3[gcaa]) << gcss; \ 443 for (gcii = 1; gcii <= gcaa; gcii++) \ 444 { \ 445 gcss -= 6; \ 446 c |= (eptr[gcii] & 0x3f) << gcss; \ 447 } \ 448 len += gcaa; \ 449 } 450 451/* If the pointer is not at the start of a character, move it back until 452it is. Called only in UTF-8 mode. */ 453 454#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--; 455 456#endif 457 458 459 460/************************************************* 461* Default character tables * 462*************************************************/ 463 464/* A default set of character tables is included in the PCRE binary. Its source 465is built by the maketables auxiliary program, which uses the default C ctypes 466functions, and put in the file chartables.c. These tables are used by PCRE 467whenever the caller of pcre_compile() does not provide an alternate set of 468tables. */ 469 470#include "chartables.c" 471 472 473 474#ifdef SUPPORT_UTF8 475/************************************************* 476* Tables for UTF-8 support * 477*************************************************/ 478 479/* These are the breakpoints for different numbers of bytes in a UTF-8 480character. */ 481 482static const int utf8_table1[] = 483 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff}; 484 485/* These are the indicator bits and the mask for the data bits to set in the 486first byte of a character, indexed by the number of additional bytes. */ 487 488static const int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc}; 489static const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01}; 490 491/* Table of the number of extra characters, indexed by the first character 492masked with 0x3f. The highest number for a valid UTF-8 character is in fact 4930x3d. */ 494 495static const uschar utf8_table4[] = { 496 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 497 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 498 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 499 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 }; 500 501 502/************************************************* 503* Convert character value to UTF-8 * 504*************************************************/ 505 506/* This function takes an integer value in the range 0 - 0x7fffffff 507and encodes it as a UTF-8 character in 0 to 6 bytes. 508 509Arguments: 510 cvalue the character value 511 buffer pointer to buffer for result - at least 6 bytes long 512 513Returns: number of characters placed in the buffer 514*/ 515 516static int 517ord2utf8(int cvalue, uschar *buffer) 518{ 519register int i, j; 520for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++) 521 if (cvalue <= utf8_table1[i]) break; 522buffer += i; 523for (j = i; j > 0; j--) 524 { 525 *buffer-- = 0x80 | (cvalue & 0x3f); 526 cvalue >>= 6; 527 } 528*buffer = utf8_table2[i] | cvalue; 529return i + 1; 530} 531#endif 532 533 534 535/************************************************* 536* Print compiled regex * 537*************************************************/ 538 539/* The code for doing this is held in a separate file that is also included in 540pcretest.c. It defines a function called print_internals(). */ 541 542#ifdef DEBUG 543#include "printint.c" 544#endif 545 546 547 548/************************************************* 549* Return version string * 550*************************************************/ 551 552#define STRING(a) # a 553#define XSTRING(s) STRING(s) 554 555EXPORT const char * 556pcre_version(void) 557{ 558return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE); 559} 560 561 562 563 564/************************************************* 565* Flip bytes in an integer * 566*************************************************/ 567 568/* This function is called when the magic number in a regex doesn't match in 569order to flip its bytes to see if we are dealing with a pattern that was 570compiled on a host of different endianness. If so, this function is used to 571flip other byte values. 572 573Arguments: 574 value the number to flip 575 n the number of bytes to flip (assumed to be 2 or 4) 576 577Returns: the flipped value 578*/ 579 580static pcre_uint16 581byteflip2(pcre_uint16 value) 582{ 583return ((value & 0x00ff) << 8) | 584 ((value & 0xff00) >> 8); 585} 586 587static pcre_uint32 588byteflip4(pcre_uint32 value) 589{ 590return ((value & 0x000000ff) << 24) | 591 ((value & 0x0000ff00) << 8) | 592 ((value & 0x00ff0000) >> 8) | 593 ((value & 0xff000000) >> 24); 594} 595 596/************************************************* 597* Test for a byte-flipped compiled regex * 598*************************************************/ 599 600/* This function is called from pce_exec() and also from pcre_fullinfo(). Its 601job is to test whether the regex is byte-flipped - that is, it was compiled on 602a system of opposite endianness. The function is called only when the native 603MAGIC_NUMBER test fails. If the regex is indeed flipped, we flip all the 604relevant values into a different data block, and return it. 605 606Arguments: 607 re points to the regex 608 study points to study data, or NULL 609 internal_re points to a new regex block 610 internal_study points to a new study block 611 612Returns: the new block if is is indeed a byte-flipped regex 613 NULL if it is not 614*/ 615 616static real_pcre * 617try_flipped(const real_pcre *re, real_pcre *internal_re, 618 const pcre_study_data *study, pcre_study_data *internal_study) 619{ 620if (byteflip4(re->magic_number) != MAGIC_NUMBER) 621 return NULL; 622 623*internal_re = *re; /* To copy other fields */ 624internal_re->size = byteflip4(re->size); 625internal_re->options = byteflip4(re->options); 626internal_re->top_bracket = byteflip2(re->top_bracket); 627internal_re->top_backref = byteflip2(re->top_backref); 628internal_re->first_byte = byteflip2(re->first_byte); 629internal_re->req_byte = byteflip2(re->req_byte); 630internal_re->name_table_offset = byteflip2(re->name_table_offset); 631internal_re->name_entry_size = byteflip2(re->name_entry_size); 632internal_re->name_count = byteflip2(re->name_count); 633 634if (study != NULL) 635 { 636 *internal_study = *study; /* To copy other fields */ 637 internal_study->size = byteflip4(study->size); 638 internal_study->options = byteflip4(study->options); 639 } 640 641return internal_re; 642} 643 644 645 646/************************************************* 647* (Obsolete) Return info about compiled pattern * 648*************************************************/ 649 650/* This is the original "info" function. It picks potentially useful data out 651of the private structure, but its interface was too rigid. It remains for 652backwards compatibility. The public options are passed back in an int - though 653the re->options field has been expanded to a long int, all the public options 654at the low end of it, and so even on 16-bit systems this will still be OK. 655Therefore, I haven't changed the API for pcre_info(). 656 657Arguments: 658 argument_re points to compiled code 659 optptr where to pass back the options 660 first_byte where to pass back the first character, 661 or -1 if multiline and all branches start ^, 662 or -2 otherwise 663 664Returns: number of capturing subpatterns 665 or negative values on error 666*/ 667 668EXPORT int 669pcre_info(const pcre *argument_re, int *optptr, int *first_byte) 670{ 671real_pcre internal_re; 672const real_pcre *re = (const real_pcre *)argument_re; 673if (re == NULL) return PCRE_ERROR_NULL; 674if (re->magic_number != MAGIC_NUMBER) 675 { 676 re = try_flipped(re, &internal_re, NULL, NULL); 677 if (re == NULL) return PCRE_ERROR_BADMAGIC; 678 } 679if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS); 680if (first_byte != NULL) 681 *first_byte = ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte : 682 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2; 683return re->top_bracket; 684} 685 686 687 688/************************************************* 689* Return info about compiled pattern * 690*************************************************/ 691 692/* This is a newer "info" function which has an extensible interface so 693that additional items can be added compatibly. 694 695Arguments: 696 argument_re points to compiled code 697 extra_data points extra data, or NULL 698 what what information is required 699 where where to put the information 700 701Returns: 0 if data returned, negative on error 702*/ 703 704EXPORT int 705pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what, 706 void *where) 707{ 708real_pcre internal_re; 709pcre_study_data internal_study; 710const real_pcre *re = (const real_pcre *)argument_re; 711const pcre_study_data *study = NULL; 712 713if (re == NULL || where == NULL) return PCRE_ERROR_NULL; 714 715if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0) 716 study = (const pcre_study_data *)extra_data->study_data; 717 718if (re->magic_number != MAGIC_NUMBER) 719 { 720 re = try_flipped(re, &internal_re, study, &internal_study); 721 if (re == NULL) return PCRE_ERROR_BADMAGIC; 722 if (study != NULL) study = &internal_study; 723 } 724 725switch (what) 726 { 727 case PCRE_INFO_OPTIONS: 728 *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS; 729 break; 730 731 case PCRE_INFO_SIZE: 732 *((size_t *)where) = re->size; 733 break; 734 735 case PCRE_INFO_STUDYSIZE: 736 *((size_t *)where) = (study == NULL)? 0 : study->size; 737 break; 738 739 case PCRE_INFO_CAPTURECOUNT: 740 *((int *)where) = re->top_bracket; 741 break; 742 743 case PCRE_INFO_BACKREFMAX: 744 *((int *)where) = re->top_backref; 745 break; 746 747 case PCRE_INFO_FIRSTBYTE: 748 *((int *)where) = 749 ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte : 750 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2; 751 break; 752 753 /* Make sure we pass back the pointer to the bit vector in the external 754 block, not the internal copy (with flipped integer fields). */ 755 756 case PCRE_INFO_FIRSTTABLE: 757 *((const uschar **)where) = 758 (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)? 759 ((const pcre_study_data *)extra_data->study_data)->start_bits : NULL; 760 break; 761 762 case PCRE_INFO_LASTLITERAL: 763 *((int *)where) = 764 ((re->options & PCRE_REQCHSET) != 0)? re->req_byte : -1; 765 break; 766 767 case PCRE_INFO_NAMEENTRYSIZE: 768 *((int *)where) = re->name_entry_size; 769 break; 770 771 case PCRE_INFO_NAMECOUNT: 772 *((int *)where) = re->name_count; 773 break; 774 775 case PCRE_INFO_NAMETABLE: 776 *((const uschar **)where) = (const uschar *)re + re->name_table_offset; 777 break; 778 779 case PCRE_INFO_DEFAULT_TABLES: 780 *((const uschar **)where) = (const uschar *)pcre_default_tables; 781 break; 782 783 default: return PCRE_ERROR_BADOPTION; 784 } 785 786return 0; 787} 788 789 790 791/************************************************* 792* Return info about what features are configured * 793*************************************************/ 794 795/* This is function which has an extensible interface so that additional items 796can be added compatibly. 797 798Arguments: 799 what what information is required 800 where where to put the information 801 802Returns: 0 if data returned, negative on error 803*/ 804 805EXPORT int 806pcre_config(int what, void *where) 807{ 808switch (what) 809 { 810 case PCRE_CONFIG_UTF8: 811#ifdef SUPPORT_UTF8 812 *((int *)where) = 1; 813#else 814 *((int *)where) = 0; 815#endif 816 break; 817 818 case PCRE_CONFIG_UNICODE_PROPERTIES: 819#ifdef SUPPORT_UCP 820 *((int *)where) = 1; 821#else 822 *((int *)where) = 0; 823#endif 824 break; 825 826 case PCRE_CONFIG_NEWLINE: 827 *((int *)where) = NEWLINE; 828 break; 829 830 case PCRE_CONFIG_LINK_SIZE: 831 *((int *)where) = LINK_SIZE; 832 break; 833 834 case PCRE_CONFIG_POSIX_MALLOC_THRESHOLD: 835 *((int *)where) = POSIX_MALLOC_THRESHOLD; 836 break; 837 838 case PCRE_CONFIG_MATCH_LIMIT: 839 *((unsigned int *)where) = MATCH_LIMIT; 840 break; 841 842 case PCRE_CONFIG_STACKRECURSE: 843#ifdef NO_RECURSE 844 *((int *)where) = 0; 845#else 846 *((int *)where) = 1; 847#endif 848 break; 849 850 default: return PCRE_ERROR_BADOPTION; 851 } 852 853return 0; 854} 855 856 857 858#ifdef DEBUG 859/************************************************* 860* Debugging function to print chars * 861*************************************************/ 862 863/* Print a sequence of chars in printable format, stopping at the end of the 864subject if the requested. 865 866Arguments: 867 p points to characters 868 length number to print 869 is_subject TRUE if printing from within md->start_subject 870 md pointer to matching data block, if is_subject is TRUE 871 872Returns: nothing 873*/ 874 875static void 876pchars(const uschar *p, int length, BOOL is_subject, match_data *md) 877{ 878int c; 879if (is_subject && length > md->end_subject - p) length = md->end_subject - p; 880while (length-- > 0) 881 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c); 882} 883#endif 884 885 886 887 888/************************************************* 889* Handle escapes * 890*************************************************/ 891 892/* This function is called when a \ has been encountered. It either returns a 893positive value for a simple escape such as \n, or a negative value which 894encodes one of the more complicated things such as \d. When UTF-8 is enabled, 895a positive value greater than 255 may be returned. On entry, ptr is pointing at 896the \. On exit, it is on the final character of the escape sequence. 897 898Arguments: 899 ptrptr points to the pattern position pointer 900 errorptr points to the pointer to the error message 901 bracount number of previous extracting brackets 902 options the options bits 903 isclass TRUE if inside a character class 904 905Returns: zero or positive => a data character 906 negative => a special escape sequence 907 on error, errorptr is set 908*/ 909 910static int 911check_escape(const uschar **ptrptr, const char **errorptr, int bracount, 912 int options, BOOL isclass) 913{ 914const uschar *ptr = *ptrptr; 915int c, i; 916 917/* If backslash is at the end of the pattern, it's an error. */ 918 919c = *(++ptr); 920if (c == 0) *errorptr = ERR1; 921 922/* Non-alphamerics are literals. For digits or letters, do an initial lookup in 923a table. A non-zero result is something that can be returned immediately. 924Otherwise further processing may be required. */ 925 926#if !EBCDIC /* ASCII coding */ 927else if (c < '0' || c > 'z') {} /* Not alphameric */ 928else if ((i = escapes[c - '0']) != 0) c = i; 929 930#else /* EBCDIC coding */ 931else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */ 932else if ((i = escapes[c - 0x48]) != 0) c = i; 933#endif 934 935/* Escapes that need further processing, or are illegal. */ 936 937else 938 { 939 const uschar *oldptr; 940 switch (c) 941 { 942 /* A number of Perl escapes are not handled by PCRE. We give an explicit 943 error. */ 944 945 case 'l': 946 case 'L': 947 case 'N': 948 case 'u': 949 case 'U': 950 *errorptr = ERR37; 951 break; 952 953 /* The handling of escape sequences consisting of a string of digits 954 starting with one that is not zero is not straightforward. By experiment, 955 the way Perl works seems to be as follows: 956 957 Outside a character class, the digits are read as a decimal number. If the 958 number is less than 10, or if there are that many previous extracting 959 left brackets, then it is a back reference. Otherwise, up to three octal 960 digits are read to form an escaped byte. Thus \123 is likely to be octal 961 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal 962 value is greater than 377, the least significant 8 bits are taken. Inside a 963 character class, \ followed by a digit is always an octal number. */ 964 965 case '1': case '2': case '3': case '4': case '5': 966 case '6': case '7': case '8': case '9': 967 968 if (!isclass) 969 { 970 oldptr = ptr; 971 c -= '0'; 972 while ((digitab[ptr[1]] & ctype_digit) != 0) 973 c = c * 10 + *(++ptr) - '0'; 974 if (c < 10 || c <= bracount) 975 { 976 c = -(ESC_REF + c); 977 break; 978 } 979 ptr = oldptr; /* Put the pointer back and fall through */ 980 } 981 982 /* Handle an octal number following \. If the first digit is 8 or 9, Perl 983 generates a binary zero byte and treats the digit as a following literal. 984 Thus we have to pull back the pointer by one. */ 985 986 if ((c = *ptr) >= '8') 987 { 988 ptr--; 989 c = 0; 990 break; 991 } 992 993 /* \0 always starts an octal number, but we may drop through to here with a 994 larger first octal digit. */ 995 996 case '0': 997 c -= '0'; 998 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7') 999 c = c * 8 + *(++ptr) - '0'; 1000 c &= 255; /* Take least significant 8 bits */ 1001 break; 1002 1003 /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number 1004 which can be greater than 0xff, but only if the ddd are hex digits. */ 1005 1006 case 'x': 1007#ifdef SUPPORT_UTF8 1008 if (ptr[1] == '{' && (options & PCRE_UTF8) != 0) 1009 { 1010 const uschar *pt = ptr + 2; 1011 register int count = 0; 1012 c = 0; 1013 while ((digitab[*pt] & ctype_xdigit) != 0) 1014 { 1015 int cc = *pt++; 1016 count++; 1017#if !EBCDIC /* ASCII coding */ 1018 if (cc >= 'a') cc -= 32; /* Convert to upper case */ 1019 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10)); 1020#else /* EBCDIC coding */ 1021 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */ 1022 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10)); 1023#endif 1024 } 1025 if (*pt == '}') 1026 { 1027 if (c < 0 || count > 8) *errorptr = ERR34; 1028 ptr = pt; 1029 break; 1030 } 1031 /* If the sequence of hex digits does not end with '}', then we don't 1032 recognize this construct; fall through to the normal \x handling. */ 1033 } 1034#endif 1035 1036 /* Read just a single hex char */ 1037 1038 c = 0; 1039 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0) 1040 { 1041 int cc; /* Some compilers don't like ++ */ 1042 cc = *(++ptr); /* in initializers */ 1043#if !EBCDIC /* ASCII coding */ 1044 if (cc >= 'a') cc -= 32; /* Convert to upper case */ 1045 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10)); 1046#else /* EBCDIC coding */ 1047 if (cc <= 'z') cc += 64; /* Convert to upper case */ 1048 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10)); 1049#endif 1050 } 1051 break; 1052 1053 /* Other special escapes not starting with a digit are straightforward */ 1054 1055 case 'c': 1056 c = *(++ptr); 1057 if (c == 0) 1058 { 1059 *errorptr = ERR2; 1060 return 0; 1061 } 1062 1063 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding 1064 is ASCII-specific, but then the whole concept of \cx is ASCII-specific. 1065 (However, an EBCDIC equivalent has now been added.) */ 1066 1067#if !EBCDIC /* ASCII coding */ 1068 if (c >= 'a' && c <= 'z') c -= 32; 1069 c ^= 0x40; 1070#else /* EBCDIC coding */ 1071 if (c >= 'a' && c <= 'z') c += 64; 1072 c ^= 0xC0; 1073#endif 1074 break; 1075 1076 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any 1077 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise, 1078 for Perl compatibility, it is a literal. This code looks a bit odd, but 1079 there used to be some cases other than the default, and there may be again 1080 in future, so I haven't "optimized" it. */ 1081 1082 default: 1083 if ((options & PCRE_EXTRA) != 0) switch(c) 1084 { 1085 default: 1086 *errorptr = ERR3; 1087 break; 1088 } 1089 break; 1090 } 1091 } 1092 1093*ptrptr = ptr; 1094return c; 1095} 1096 1097 1098 1099#ifdef SUPPORT_UCP 1100/************************************************* 1101* Handle \P and \p * 1102*************************************************/ 1103 1104/* This function is called after \P or \p has been encountered, provided that 1105PCRE is compiled with support for Unicode properties. On entry, ptrptr is 1106pointing at the P or p. On exit, it is pointing at the final character of the 1107escape sequence. 1108 1109Argument: 1110 ptrptr points to the pattern position pointer 1111 negptr points to a boolean that is set TRUE for negation else FALSE 1112 errorptr points to the pointer to the error message 1113 1114Returns: value from ucp_type_table, or -1 for an invalid type 1115*/ 1116 1117static int 1118get_ucp(const uschar **ptrptr, BOOL *negptr, const char **errorptr) 1119{ 1120int c, i, bot, top; 1121const uschar *ptr = *ptrptr; 1122char name[4]; 1123 1124c = *(++ptr); 1125if (c == 0) goto ERROR_RETURN; 1126 1127*negptr = FALSE; 1128 1129/* \P or \p can be followed by a one- or two-character name in {}, optionally 1130preceded by ^ for negation. */ 1131 1132if (c == '{') 1133 { 1134 if (ptr[1] == '^') 1135 { 1136 *negptr = TRUE; 1137 ptr++; 1138 } 1139 for (i = 0; i <= 2; i++) 1140 { 1141 c = *(++ptr); 1142 if (c == 0) goto ERROR_RETURN; 1143 if (c == '}') break; 1144 name[i] = c; 1145 } 1146 if (c !='}') /* Try to distinguish error cases */ 1147 { 1148 while (*(++ptr) != 0 && *ptr != '}'); 1149 if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN; 1150 } 1151 name[i] = 0; 1152 } 1153 1154/* Otherwise there is just one following character */ 1155 1156else 1157 { 1158 name[0] = c; 1159 name[1] = 0; 1160 } 1161 1162*ptrptr = ptr; 1163 1164/* Search for a recognized property name using binary chop */ 1165 1166bot = 0; 1167top = sizeof(utt)/sizeof(ucp_type_table); 1168 1169while (bot < top) 1170 { 1171 i = (bot + top)/2; 1172 c = strcmp(name, utt[i].name); 1173 if (c == 0) return utt[i].value; 1174 if (c > 0) bot = i + 1; else top = i; 1175 } 1176 1177UNKNOWN_RETURN: 1178*errorptr = ERR47; 1179*ptrptr = ptr; 1180return -1; 1181 1182ERROR_RETURN: 1183*errorptr = ERR46; 1184*ptrptr = ptr; 1185return -1; 1186} 1187#endif 1188 1189 1190 1191 1192/************************************************* 1193* Check for counted repeat * 1194*************************************************/ 1195 1196/* This function is called when a '{' is encountered in a place where it might 1197start a quantifier. It looks ahead to see if it really is a quantifier or not. 1198It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd} 1199where the ddds are digits. 1200 1201Arguments: 1202 p pointer to the first char after '{' 1203 1204Returns: TRUE or FALSE 1205*/ 1206 1207static BOOL 1208is_counted_repeat(const uschar *p) 1209{ 1210if ((digitab[*p++] & ctype_digit) == 0) return FALSE; 1211while ((digitab[*p] & ctype_digit) != 0) p++; 1212if (*p == '}') return TRUE; 1213 1214if (*p++ != ',') return FALSE; 1215if (*p == '}') return TRUE; 1216 1217if ((digitab[*p++] & ctype_digit) == 0) return FALSE; 1218while ((digitab[*p] & ctype_digit) != 0) p++; 1219 1220return (*p == '}'); 1221} 1222 1223 1224 1225/************************************************* 1226* Read repeat counts * 1227*************************************************/ 1228 1229/* Read an item of the form {n,m} and return the values. This is called only 1230after is_counted_repeat() has confirmed that a repeat-count quantifier exists, 1231so the syntax is guaranteed to be correct, but we need to check the values. 1232 1233Arguments: 1234 p pointer to first char after '{' 1235 minp pointer to int for min 1236 maxp pointer to int for max 1237 returned as -1 if no max 1238 errorptr points to pointer to error message 1239 1240Returns: pointer to '}' on success; 1241 current ptr on error, with errorptr set 1242*/ 1243 1244static const uschar * 1245read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr) 1246{ 1247int min = 0; 1248int max = -1; 1249 1250/* Read the minimum value and do a paranoid check: a negative value indicates 1251an integer overflow. */ 1252 1253while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0'; 1254if (min < 0 || min > 65535) 1255 { 1256 *errorptr = ERR5; 1257 return p; 1258 } 1259 1260/* Read the maximum value if there is one, and again do a paranoid on its size. 1261Also, max must not be less than min. */ 1262 1263if (*p == '}') max = min; else 1264 { 1265 if (*(++p) != '}') 1266 { 1267 max = 0; 1268 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0'; 1269 if (max < 0 || max > 65535) 1270 { 1271 *errorptr = ERR5; 1272 return p; 1273 } 1274 if (max < min) 1275 { 1276 *errorptr = ERR4; 1277 return p; 1278 } 1279 } 1280 } 1281 1282/* Fill in the required variables, and pass back the pointer to the terminating 1283'}'. */ 1284 1285*minp = min; 1286*maxp = max; 1287return p; 1288} 1289 1290 1291 1292/************************************************* 1293* Find first significant op code * 1294*************************************************/ 1295 1296/* This is called by several functions that scan a compiled expression looking 1297for a fixed first character, or an anchoring op code etc. It skips over things 1298that do not influence this. For some calls, a change of option is important. 1299For some calls, it makes sense to skip negative forward and all backward 1300assertions, and also the \b assertion; for others it does not. 1301 1302Arguments: 1303 code pointer to the start of the group 1304 options pointer to external options 1305 optbit the option bit whose changing is significant, or 1306 zero if none are 1307 skipassert TRUE if certain assertions are to be skipped 1308 1309Returns: pointer to the first significant opcode 1310*/ 1311 1312static const uschar* 1313first_significant_code(const uschar *code, int *options, int optbit, 1314 BOOL skipassert) 1315{ 1316for (;;) 1317 { 1318 switch ((int)*code) 1319 { 1320 case OP_OPT: 1321 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit)) 1322 *options = (int)code[1]; 1323 code += 2; 1324 break; 1325 1326 case OP_ASSERT_NOT: 1327 case OP_ASSERTBACK: 1328 case OP_ASSERTBACK_NOT: 1329 if (!skipassert) return code; 1330 do code += GET(code, 1); while (*code == OP_ALT); 1331 code += OP_lengths[*code]; 1332 break; 1333 1334 case OP_WORD_BOUNDARY: 1335 case OP_NOT_WORD_BOUNDARY: 1336 if (!skipassert) return code; 1337 /* Fall through */ 1338 1339 case OP_CALLOUT: 1340 case OP_CREF: 1341 case OP_BRANUMBER: 1342 code += OP_lengths[*code]; 1343 break; 1344 1345 default: 1346 return code; 1347 } 1348 } 1349/* Control never reaches here */ 1350} 1351 1352 1353 1354 1355/************************************************* 1356* Find the fixed length of a pattern * 1357*************************************************/ 1358 1359/* Scan a pattern and compute the fixed length of subject that will match it, 1360if the length is fixed. This is needed for dealing with backward assertions. 1361In UTF8 mode, the result is in characters rather than bytes. 1362 1363Arguments: 1364 code points to the start of the pattern (the bracket) 1365 options the compiling options 1366 1367Returns: the fixed length, or -1 if there is no fixed length, 1368 or -2 if \C was encountered 1369*/ 1370 1371static int 1372find_fixedlength(uschar *code, int options) 1373{ 1374int length = -1; 1375 1376register int branchlength = 0; 1377register uschar *cc = code + 1 + LINK_SIZE; 1378 1379/* Scan along the opcodes for this branch. If we get to the end of the 1380branch, check the length against that of the other branches. */ 1381 1382for (;;) 1383 { 1384 int d; 1385 register int op = *cc; 1386 if (op >= OP_BRA) op = OP_BRA; 1387 1388 switch (op) 1389 { 1390 case OP_BRA: 1391 case OP_ONCE: 1392 case OP_COND: 1393 d = find_fixedlength(cc, options); 1394 if (d < 0) return d; 1395 branchlength += d; 1396 do cc += GET(cc, 1); while (*cc == OP_ALT); 1397 cc += 1 + LINK_SIZE; 1398 break; 1399 1400 /* Reached end of a branch; if it's a ket it is the end of a nested 1401 call. If it's ALT it is an alternation in a nested call. If it is 1402 END it's the end of the outer call. All can be handled by the same code. */ 1403 1404 case OP_ALT: 1405 case OP_KET: 1406 case OP_KETRMAX: 1407 case OP_KETRMIN: 1408 case OP_END: 1409 if (length < 0) length = branchlength; 1410 else if (length != branchlength) return -1; 1411 if (*cc != OP_ALT) return length; 1412 cc += 1 + LINK_SIZE; 1413 branchlength = 0; 1414 break; 1415 1416 /* Skip over assertive subpatterns */ 1417 1418 case OP_ASSERT: 1419 case OP_ASSERT_NOT: 1420 case OP_ASSERTBACK: 1421 case OP_ASSERTBACK_NOT: 1422 do cc += GET(cc, 1); while (*cc == OP_ALT); 1423 /* Fall through */ 1424 1425 /* Skip over things that don't match chars */ 1426 1427 case OP_REVERSE: 1428 case OP_BRANUMBER: 1429 case OP_CREF: 1430 case OP_OPT: 1431 case OP_CALLOUT: 1432 case OP_SOD: 1433 case OP_SOM: 1434 case OP_EOD: 1435 case OP_EODN: 1436 case OP_CIRC: 1437 case OP_DOLL: 1438 case OP_NOT_WORD_BOUNDARY: 1439 case OP_WORD_BOUNDARY: 1440 cc += OP_lengths[*cc]; 1441 break; 1442 1443 /* Handle literal characters */ 1444 1445 case OP_CHAR: 1446 case OP_CHARNC: 1447 branchlength++; 1448 cc += 2; 1449#ifdef SUPPORT_UTF8 1450 if ((options & PCRE_UTF8) != 0) 1451 { 1452 while ((*cc & 0xc0) == 0x80) cc++; 1453 } 1454#endif 1455 break; 1456 1457 /* Handle exact repetitions. The count is already in characters, but we 1458 need to skip over a multibyte character in UTF8 mode. */ 1459 1460 case OP_EXACT: 1461 branchlength += GET2(cc,1); 1462 cc += 4; 1463#ifdef SUPPORT_UTF8 1464 if ((options & PCRE_UTF8) != 0) 1465 { 1466 while((*cc & 0x80) == 0x80) cc++; 1467 } 1468#endif 1469 break; 1470 1471 case OP_TYPEEXACT: 1472 branchlength += GET2(cc,1); 1473 cc += 4; 1474 break; 1475 1476 /* Handle single-char matchers */ 1477 1478 case OP_PROP: 1479 case OP_NOTPROP: 1480 cc++; 1481 /* Fall through */ 1482 1483 case OP_NOT_DIGIT: 1484 case OP_DIGIT: 1485 case OP_NOT_WHITESPACE: 1486 case OP_WHITESPACE: 1487 case OP_NOT_WORDCHAR: 1488 case OP_WORDCHAR: 1489 case OP_ANY: 1490 branchlength++; 1491 cc++; 1492 break; 1493 1494 /* The single-byte matcher isn't allowed */ 1495 1496 case OP_ANYBYTE: 1497 return -2; 1498 1499 /* Check a class for variable quantification */ 1500 1501#ifdef SUPPORT_UTF8 1502 case OP_XCLASS: 1503 cc += GET(cc, 1) - 33; 1504 /* Fall through */ 1505#endif 1506 1507 case OP_CLASS: 1508 case OP_NCLASS: 1509 cc += 33; 1510 1511 switch (*cc) 1512 { 1513 case OP_CRSTAR: 1514 case OP_CRMINSTAR: 1515 case OP_CRQUERY: 1516 case OP_CRMINQUERY: 1517 return -1; 1518 1519 case OP_CRRANGE: 1520 case OP_CRMINRANGE: 1521 if (GET2(cc,1) != GET2(cc,3)) return -1; 1522 branchlength += GET2(cc,1); 1523 cc += 5; 1524 break; 1525 1526 default: 1527 branchlength++; 1528 } 1529 break; 1530 1531 /* Anything else is variable length */ 1532 1533 default: 1534 return -1; 1535 } 1536 } 1537/* Control never gets here */ 1538} 1539 1540 1541 1542 1543/************************************************* 1544* Scan compiled regex for numbered bracket * 1545*************************************************/ 1546 1547/* This little function scans through a compiled pattern until it finds a 1548capturing bracket with the given number. 1549 1550Arguments: 1551 code points to start of expression 1552 utf8 TRUE in UTF-8 mode 1553 number the required bracket number 1554 1555Returns: pointer to the opcode for the bracket, or NULL if not found 1556*/ 1557 1558static const uschar * 1559find_bracket(const uschar *code, BOOL utf8, int number) 1560{ 1561#ifndef SUPPORT_UTF8 1562utf8 = utf8; /* Stop pedantic compilers complaining */ 1563#endif 1564 1565for (;;) 1566 { 1567 register int c = *code; 1568 if (c == OP_END) return NULL; 1569 else if (c > OP_BRA) 1570 { 1571 int n = c - OP_BRA; 1572 if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE); 1573 if (n == number) return (uschar *)code; 1574 code += OP_lengths[OP_BRA]; 1575 } 1576 else 1577 { 1578 code += OP_lengths[c]; 1579 1580#ifdef SUPPORT_UTF8 1581 1582 /* In UTF-8 mode, opcodes that are followed by a character may be followed 1583 by a multi-byte character. The length in the table is a minimum, so we have 1584 to scan along to skip the extra bytes. All opcodes are less than 128, so we 1585 can use relatively efficient code. */ 1586 1587 if (utf8) switch(c) 1588 { 1589 case OP_CHAR: 1590 case OP_CHARNC: 1591 case OP_EXACT: 1592 case OP_UPTO: 1593 case OP_MINUPTO: 1594 case OP_STAR: 1595 case OP_MINSTAR: 1596 case OP_PLUS: 1597 case OP_MINPLUS: 1598 case OP_QUERY: 1599 case OP_MINQUERY: 1600 while ((*code & 0xc0) == 0x80) code++; 1601 break; 1602 1603 /* XCLASS is used for classes that cannot be represented just by a bit 1604 map. This includes negated single high-valued characters. The length in 1605 the table is zero; the actual length is stored in the compiled code. */ 1606 1607 case OP_XCLASS: 1608 code += GET(code, 1) + 1; 1609 break; 1610 } 1611#endif 1612 } 1613 } 1614} 1615 1616 1617 1618/************************************************* 1619* Scan compiled regex for recursion reference * 1620*************************************************/ 1621 1622/* This little function scans through a compiled pattern until it finds an 1623instance of OP_RECURSE. 1624 1625Arguments: 1626 code points to start of expression 1627 utf8 TRUE in UTF-8 mode 1628 1629Returns: pointer to the opcode for OP_RECURSE, or NULL if not found 1630*/ 1631 1632static const uschar * 1633find_recurse(const uschar *code, BOOL utf8) 1634{ 1635#ifndef SUPPORT_UTF8 1636utf8 = utf8; /* Stop pedantic compilers complaining */ 1637#endif 1638 1639for (;;) 1640 { 1641 register int c = *code; 1642 if (c == OP_END) return NULL; 1643 else if (c == OP_RECURSE) return code; 1644 else if (c > OP_BRA) 1645 { 1646 code += OP_lengths[OP_BRA]; 1647 } 1648 else 1649 { 1650 code += OP_lengths[c]; 1651 1652#ifdef SUPPORT_UTF8 1653 1654 /* In UTF-8 mode, opcodes that are followed by a character may be followed 1655 by a multi-byte character. The length in the table is a minimum, so we have 1656 to scan along to skip the extra bytes. All opcodes are less than 128, so we 1657 can use relatively efficient code. */ 1658 1659 if (utf8) switch(c) 1660 { 1661 case OP_CHAR: 1662 case OP_CHARNC: 1663 case OP_EXACT: 1664 case OP_UPTO: 1665 case OP_MINUPTO: 1666 case OP_STAR: 1667 case OP_MINSTAR: 1668 case OP_PLUS: 1669 case OP_MINPLUS: 1670 case OP_QUERY: 1671 case OP_MINQUERY: 1672 while ((*code & 0xc0) == 0x80) code++; 1673 break; 1674 1675 /* XCLASS is used for classes that cannot be represented just by a bit 1676 map. This includes negated single high-valued characters. The length in 1677 the table is zero; the actual length is stored in the compiled code. */ 1678 1679 case OP_XCLASS: 1680 code += GET(code, 1) + 1; 1681 break; 1682 } 1683#endif 1684 } 1685 } 1686} 1687 1688 1689 1690/************************************************* 1691* Scan compiled branch for non-emptiness * 1692*************************************************/ 1693 1694/* This function scans through a branch of a compiled pattern to see whether it 1695can match the empty string or not. It is called only from could_be_empty() 1696below. Note that first_significant_code() skips over assertions. If we hit an 1697unclosed bracket, we return "empty" - this means we've struck an inner bracket 1698whose current branch will already have been scanned. 1699 1700Arguments: 1701 code points to start of search 1702 endcode points to where to stop 1703 utf8 TRUE if in UTF8 mode 1704 1705Returns: TRUE if what is matched could be empty 1706*/ 1707 1708static BOOL 1709could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8) 1710{ 1711register int c; 1712for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE); 1713 code < endcode; 1714 code = first_significant_code(code + OP_lengths[c], NULL, 0, TRUE)) 1715 { 1716 const uschar *ccode; 1717 1718 c = *code; 1719 1720 if (c >= OP_BRA) 1721 { 1722 BOOL empty_branch; 1723 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */ 1724 1725 /* Scan a closed bracket */ 1726 1727 empty_branch = FALSE; 1728 do 1729 { 1730 if (!empty_branch && could_be_empty_branch(code, endcode, utf8)) 1731 empty_branch = TRUE; 1732 code += GET(code, 1); 1733 } 1734 while (*code == OP_ALT); 1735 if (!empty_branch) return FALSE; /* All branches are non-empty */ 1736 code += 1 + LINK_SIZE; 1737 c = *code; 1738 } 1739 1740 else switch (c) 1741 { 1742 /* Check for quantifiers after a class */ 1743 1744#ifdef SUPPORT_UTF8 1745 case OP_XCLASS: 1746 ccode = code + GET(code, 1); 1747 goto CHECK_CLASS_REPEAT; 1748#endif 1749 1750 case OP_CLASS: 1751 case OP_NCLASS: 1752 ccode = code + 33; 1753 1754#ifdef SUPPORT_UTF8 1755 CHECK_CLASS_REPEAT: 1756#endif 1757 1758 switch (*ccode) 1759 { 1760 case OP_CRSTAR: /* These could be empty; continue */ 1761 case OP_CRMINSTAR: 1762 case OP_CRQUERY: 1763 case OP_CRMINQUERY: 1764 break; 1765 1766 default: /* Non-repeat => class must match */ 1767 case OP_CRPLUS: /* These repeats aren't empty */ 1768 case OP_CRMINPLUS: 1769 return FALSE; 1770 1771 case OP_CRRANGE: 1772 case OP_CRMINRANGE: 1773 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */ 1774 break; 1775 } 1776 break; 1777 1778 /* Opcodes that must match a character */ 1779 1780 case OP_PROP: 1781 case OP_NOTPROP: 1782 case OP_EXTUNI: 1783 case OP_NOT_DIGIT: 1784 case OP_DIGIT: 1785 case OP_NOT_WHITESPACE: 1786 case OP_WHITESPACE: 1787 case OP_NOT_WORDCHAR: 1788 case OP_WORDCHAR: 1789 case OP_ANY: 1790 case OP_ANYBYTE: 1791 case OP_CHAR: 1792 case OP_CHARNC: 1793 case OP_NOT: 1794 case OP_PLUS: 1795 case OP_MINPLUS: 1796 case OP_EXACT: 1797 case OP_NOTPLUS: 1798 case OP_NOTMINPLUS: 1799 case OP_NOTEXACT: 1800 case OP_TYPEPLUS: 1801 case OP_TYPEMINPLUS: 1802 case OP_TYPEEXACT: 1803 return FALSE; 1804 1805 /* End of branch */ 1806 1807 case OP_KET: 1808 case OP_KETRMAX: 1809 case OP_KETRMIN: 1810 case OP_ALT: 1811 return TRUE; 1812 1813 /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be 1814 followed by a multibyte character */ 1815 1816#ifdef SUPPORT_UTF8 1817 case OP_STAR: 1818 case OP_MINSTAR: 1819 case OP_QUERY: 1820 case OP_MINQUERY: 1821 case OP_UPTO: 1822 case OP_MINUPTO: 1823 if (utf8) while ((code[2] & 0xc0) == 0x80) code++; 1824 break; 1825#endif 1826 } 1827 } 1828 1829return TRUE; 1830} 1831 1832 1833 1834/************************************************* 1835* Scan compiled regex for non-emptiness * 1836*************************************************/ 1837 1838/* This function is called to check for left recursive calls. We want to check 1839the current branch of the current pattern to see if it could match the empty 1840string. If it could, we must look outwards for branches at other levels, 1841stopping when we pass beyond the bracket which is the subject of the recursion. 1842 1843Arguments: 1844 code points to start of the recursion 1845 endcode points to where to stop (current RECURSE item) 1846 bcptr points to the chain of current (unclosed) branch starts 1847 utf8 TRUE if in UTF-8 mode 1848 1849Returns: TRUE if what is matched could be empty 1850*/ 1851 1852static BOOL 1853could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr, 1854 BOOL utf8) 1855{ 1856while (bcptr != NULL && bcptr->current >= code) 1857 { 1858 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE; 1859 bcptr = bcptr->outer; 1860 } 1861return TRUE; 1862} 1863 1864 1865 1866/************************************************* 1867* Check for POSIX class syntax * 1868*************************************************/ 1869 1870/* This function is called when the sequence "[:" or "[." or "[=" is 1871encountered in a character class. It checks whether this is followed by an 1872optional ^ and then a sequence of letters, terminated by a matching ":]" or 1873".]" or "=]". 1874 1875Argument: 1876 ptr pointer to the initial [ 1877 endptr where to return the end pointer 1878 cd pointer to compile data 1879 1880Returns: TRUE or FALSE 1881*/ 1882 1883static BOOL 1884check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd) 1885{ 1886int terminator; /* Don't combine these lines; the Solaris cc */ 1887terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */ 1888if (*(++ptr) == '^') ptr++; 1889while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++; 1890if (*ptr == terminator && ptr[1] == ']') 1891 { 1892 *endptr = ptr; 1893 return TRUE; 1894 } 1895return FALSE; 1896} 1897 1898 1899 1900 1901/************************************************* 1902* Check POSIX class name * 1903*************************************************/ 1904 1905/* This function is called to check the name given in a POSIX-style class entry 1906such as [:alnum:]. 1907 1908Arguments: 1909 ptr points to the first letter 1910 len the length of the name 1911 1912Returns: a value representing the name, or -1 if unknown 1913*/ 1914 1915static int 1916check_posix_name(const uschar *ptr, int len) 1917{ 1918register int yield = 0; 1919while (posix_name_lengths[yield] != 0) 1920 { 1921 if (len == posix_name_lengths[yield] && 1922 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield; 1923 yield++; 1924 } 1925return -1; 1926} 1927 1928 1929/************************************************* 1930* Adjust OP_RECURSE items in repeated group * 1931*************************************************/ 1932 1933/* OP_RECURSE items contain an offset from the start of the regex to the group 1934that is referenced. This means that groups can be replicated for fixed 1935repetition simply by copying (because the recursion is allowed to refer to 1936earlier groups that are outside the current group). However, when a group is 1937optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before 1938it, after it has been compiled. This means that any OP_RECURSE items within it 1939that refer to the group itself or any contained groups have to have their 1940offsets adjusted. That is the job of this function. Before it is called, the 1941partially compiled regex must be temporarily terminated with OP_END. 1942 1943Arguments: 1944 group points to the start of the group 1945 adjust the amount by which the group is to be moved 1946 utf8 TRUE in UTF-8 mode 1947 cd contains pointers to tables etc. 1948 1949Returns: nothing 1950*/ 1951 1952static void 1953adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd) 1954{ 1955uschar *ptr = group; 1956while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL) 1957 { 1958 int offset = GET(ptr, 1); 1959 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust); 1960 ptr += 1 + LINK_SIZE; 1961 } 1962} 1963 1964 1965 1966/************************************************* 1967* Insert an automatic callout point * 1968*************************************************/ 1969 1970/* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert 1971callout points before each pattern item. 1972 1973Arguments: 1974 code current code pointer 1975 ptr current pattern pointer 1976 cd pointers to tables etc 1977 1978Returns: new code pointer 1979*/ 1980 1981static uschar * 1982auto_callout(uschar *code, const uschar *ptr, compile_data *cd) 1983{ 1984*code++ = OP_CALLOUT; 1985*code++ = 255; 1986PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */ 1987PUT(code, LINK_SIZE, 0); /* Default length */ 1988return code + 2*LINK_SIZE; 1989} 1990 1991 1992 1993/************************************************* 1994* Complete a callout item * 1995*************************************************/ 1996 1997/* A callout item contains the length of the next item in the pattern, which 1998we can't fill in till after we have reached the relevant point. This is used 1999for both automatic and manual callouts. 2000 2001Arguments: 2002 previous_callout points to previous callout item 2003 ptr current pattern pointer 2004 cd pointers to tables etc 2005 2006Returns: nothing 2007*/ 2008 2009static void 2010complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd) 2011{ 2012int length = ptr - cd->start_pattern - GET(previous_callout, 2); 2013PUT(previous_callout, 2 + LINK_SIZE, length); 2014} 2015 2016 2017 2018#ifdef SUPPORT_UCP 2019/************************************************* 2020* Get othercase range * 2021*************************************************/ 2022 2023/* This function is passed the start and end of a class range, in UTF-8 mode 2024with UCP support. It searches up the characters, looking for internal ranges of 2025characters in the "other" case. Each call returns the next one, updating the 2026start address. 2027 2028Arguments: 2029 cptr points to starting character value; updated 2030 d end value 2031 ocptr where to put start of othercase range 2032 odptr where to put end of othercase range 2033 2034Yield: TRUE when range returned; FALSE when no more 2035*/ 2036 2037static BOOL 2038get_othercase_range(int *cptr, int d, int *ocptr, int *odptr) 2039{ 2040int c, chartype, othercase, next; 2041 2042for (c = *cptr; c <= d; c++) 2043 { 2044 if (ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0) break; 2045 } 2046 2047if (c > d) return FALSE; 2048 2049*ocptr = othercase; 2050next = othercase + 1; 2051 2052for (++c; c <= d; c++) 2053 { 2054 if (ucp_findchar(c, &chartype, &othercase) != ucp_L || othercase != next) 2055 break; 2056 next++; 2057 } 2058 2059*odptr = next - 1; 2060*cptr = c; 2061 2062return TRUE; 2063} 2064#endif /* SUPPORT_UCP */ 2065 2066 2067/************************************************* 2068* Compile one branch * 2069*************************************************/ 2070 2071/* Scan the pattern, compiling it into the code vector. If the options are 2072changed during the branch, the pointer is used to change the external options 2073bits. 2074 2075Arguments: 2076 optionsptr pointer to the option bits 2077 brackets points to number of extracting brackets used 2078 codeptr points to the pointer to the current code point 2079 ptrptr points to the current pattern pointer 2080 errorptr points to pointer to error message 2081 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE) 2082 reqbyteptr set to the last literal character required, else < 0 2083 bcptr points to current branch chain 2084 cd contains pointers to tables etc. 2085 2086Returns: TRUE on success 2087 FALSE, with *errorptr set on error 2088*/ 2089 2090static BOOL 2091compile_branch(int *optionsptr, int *brackets, uschar **codeptr, 2092 const uschar **ptrptr, const char **errorptr, int *firstbyteptr, 2093 int *reqbyteptr, branch_chain *bcptr, compile_data *cd) 2094{ 2095int repeat_type, op_type; 2096int repeat_min = 0, repeat_max = 0; /* To please picky compilers */ 2097int bravalue = 0; 2098int greedy_default, greedy_non_default; 2099int firstbyte, reqbyte; 2100int zeroreqbyte, zerofirstbyte; 2101int req_caseopt, reqvary, tempreqvary; 2102int condcount = 0; 2103int options = *optionsptr; 2104int after_manual_callout = 0; 2105register int c; 2106register uschar *code = *codeptr; 2107uschar *tempcode; 2108BOOL inescq = FALSE; 2109BOOL groupsetfirstbyte = FALSE; 2110const uschar *ptr = *ptrptr; 2111const uschar *tempptr; 2112uschar *previous = NULL; 2113uschar *previous_callout = NULL; 2114uschar classbits[32]; 2115 2116#ifdef SUPPORT_UTF8 2117BOOL class_utf8; 2118BOOL utf8 = (options & PCRE_UTF8) != 0; 2119uschar *class_utf8data; 2120uschar utf8_char[6]; 2121#else 2122BOOL utf8 = FALSE; 2123#endif 2124 2125/* Set up the default and non-default settings for greediness */ 2126 2127greedy_default = ((options & PCRE_UNGREEDY) != 0); 2128greedy_non_default = greedy_default ^ 1; 2129 2130/* Initialize no first byte, no required byte. REQ_UNSET means "no char 2131matching encountered yet". It gets changed to REQ_NONE if we hit something that 2132matches a non-fixed char first char; reqbyte just remains unset if we never 2133find one. 2134 2135When we hit a repeat whose minimum is zero, we may have to adjust these values 2136to take the zero repeat into account. This is implemented by setting them to 2137zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual 2138item types that can be repeated set these backoff variables appropriately. */ 2139 2140firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET; 2141 2142/* The variable req_caseopt contains either the REQ_CASELESS value or zero, 2143according to the current setting of the caseless flag. REQ_CASELESS is a bit 2144value > 255. It is added into the firstbyte or reqbyte variables to record the 2145case status of the value. This is used only for ASCII characters. */ 2146 2147req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0; 2148 2149/* Switch on next character until the end of the branch */ 2150 2151for (;; ptr++) 2152 { 2153 BOOL negate_class; 2154 BOOL possessive_quantifier; 2155 BOOL is_quantifier; 2156 int class_charcount; 2157 int class_lastchar; 2158 int newoptions; 2159 int recno; 2160 int skipbytes; 2161 int subreqbyte; 2162 int subfirstbyte; 2163 int mclength; 2164 uschar mcbuffer[8]; 2165 2166 /* Next byte in the pattern */ 2167 2168 c = *ptr; 2169 2170 /* If in \Q...\E, check for the end; if not, we have a literal */ 2171 2172 if (inescq && c != 0) 2173 { 2174 if (c == '\\' && ptr[1] == 'E') 2175 { 2176 inescq = FALSE; 2177 ptr++; 2178 continue; 2179 } 2180 else 2181 { 2182 if (previous_callout != NULL) 2183 { 2184 complete_callout(previous_callout, ptr, cd); 2185 previous_callout = NULL; 2186 } 2187 if ((options & PCRE_AUTO_CALLOUT) != 0) 2188 { 2189 previous_callout = code; 2190 code = auto_callout(code, ptr, cd); 2191 } 2192 goto NORMAL_CHAR; 2193 } 2194 } 2195 2196 /* Fill in length of a previous callout, except when the next thing is 2197 a quantifier. */ 2198 2199 is_quantifier = c == '*' || c == '+' || c == '?' || 2200 (c == '{' && is_counted_repeat(ptr+1)); 2201 2202 if (!is_quantifier && previous_callout != NULL && 2203 after_manual_callout-- <= 0) 2204 { 2205 complete_callout(previous_callout, ptr, cd); 2206 previous_callout = NULL; 2207 } 2208 2209 /* In extended mode, skip white space and comments */ 2210 2211 if ((options & PCRE_EXTENDED) != 0) 2212 { 2213 if ((cd->ctypes[c] & ctype_space) != 0) continue; 2214 if (c == '#') 2215 { 2216 /* The space before the ; is to avoid a warning on a silly compiler 2217 on the Macintosh. */ 2218 while ((c = *(++ptr)) != 0 && c != NEWLINE) ; 2219 if (c != 0) continue; /* Else fall through to handle end of string */ 2220 } 2221 } 2222 2223 /* No auto callout for quantifiers. */ 2224 2225 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier) 2226 { 2227 previous_callout = code; 2228 code = auto_callout(code, ptr, cd); 2229 } 2230 2231 switch(c) 2232 { 2233 /* The branch terminates at end of string, |, or ). */ 2234 2235 case 0: 2236 case '|': 2237 case ')': 2238 *firstbyteptr = firstbyte; 2239 *reqbyteptr = reqbyte; 2240 *codeptr = code; 2241 *ptrptr = ptr; 2242 return TRUE; 2243 2244 /* Handle single-character metacharacters. In multiline mode, ^ disables 2245 the setting of any following char as a first character. */ 2246 2247 case '^': 2248 if ((options & PCRE_MULTILINE) != 0) 2249 { 2250 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; 2251 } 2252 previous = NULL; 2253 *code++ = OP_CIRC; 2254 break; 2255 2256 case '$': 2257 previous = NULL; 2258 *code++ = OP_DOLL; 2259 break; 2260 2261 /* There can never be a first char if '.' is first, whatever happens about 2262 repeats. The value of reqbyte doesn't change either. */ 2263 2264 case '.': 2265 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; 2266 zerofirstbyte = firstbyte; 2267 zeroreqbyte = reqbyte; 2268 previous = code; 2269 *code++ = OP_ANY; 2270 break; 2271 2272 /* Character classes. If the included characters are all < 255 in value, we 2273 build a 32-byte bitmap of the permitted characters, except in the special 2274 case where there is only one such character. For negated classes, we build 2275 the map as usual, then invert it at the end. However, we use a different 2276 opcode so that data characters > 255 can be handled correctly. 2277 2278 If the class contains characters outside the 0-255 range, a different 2279 opcode is compiled. It may optionally have a bit map for characters < 256, 2280 but those above are are explicitly listed afterwards. A flag byte tells 2281 whether the bitmap is present, and whether this is a negated class or not. 2282 */ 2283 2284 case '[': 2285 previous = code; 2286 2287 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if 2288 they are encountered at the top level, so we'll do that too. */ 2289 2290 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') && 2291 check_posix_syntax(ptr, &tempptr, cd)) 2292 { 2293 *errorptr = (ptr[1] == ':')? ERR13 : ERR31; 2294 goto FAILED; 2295 } 2296 2297 /* If the first character is '^', set the negation flag and skip it. */ 2298 2299 if ((c = *(++ptr)) == '^') 2300 { 2301 negate_class = TRUE; 2302 c = *(++ptr); 2303 } 2304 else 2305 { 2306 negate_class = FALSE; 2307 } 2308 2309 /* Keep a count of chars with values < 256 so that we can optimize the case 2310 of just a single character (as long as it's < 256). For higher valued UTF-8 2311 characters, we don't yet do any optimization. */ 2312 2313 class_charcount = 0; 2314 class_lastchar = -1; 2315 2316#ifdef SUPPORT_UTF8 2317 class_utf8 = FALSE; /* No chars >= 256 */ 2318 class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */ 2319#endif 2320 2321 /* Initialize the 32-char bit map to all zeros. We have to build the 2322 map in a temporary bit of store, in case the class contains only 1 2323 character (< 256), because in that case the compiled code doesn't use the 2324 bit map. */ 2325 2326 memset(classbits, 0, 32 * sizeof(uschar)); 2327 2328 /* Process characters until ] is reached. By writing this as a "do" it 2329 means that an initial ] is taken as a data character. The first pass 2330 through the regex checked the overall syntax, so we don't need to be very 2331 strict here. At the start of the loop, c contains the first byte of the 2332 character. */ 2333 2334 do 2335 { 2336#ifdef SUPPORT_UTF8 2337 if (utf8 && c > 127) 2338 { /* Braces are required because the */ 2339 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */ 2340 } 2341#endif 2342 2343 /* Inside \Q...\E everything is literal except \E */ 2344 2345 if (inescq) 2346 { 2347 if (c == '\\' && ptr[1] == 'E') 2348 { 2349 inescq = FALSE; 2350 ptr++; 2351 continue; 2352 } 2353 else goto LONE_SINGLE_CHARACTER; 2354 } 2355 2356 /* Handle POSIX class names. Perl allows a negation extension of the 2357 form [:^name:]. A square bracket that doesn't match the syntax is 2358 treated as a literal. We also recognize the POSIX constructions 2359 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl 2360 5.6 and 5.8 do. */ 2361 2362 if (c == '[' && 2363 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') && 2364 check_posix_syntax(ptr, &tempptr, cd)) 2365 { 2366 BOOL local_negate = FALSE; 2367 int posix_class, i; 2368 register const uschar *cbits = cd->cbits; 2369 2370 if (ptr[1] != ':') 2371 { 2372 *errorptr = ERR31; 2373 goto FAILED; 2374 } 2375 2376 ptr += 2; 2377 if (*ptr == '^') 2378 { 2379 local_negate = TRUE; 2380 ptr++; 2381 } 2382 2383 posix_class = check_posix_name(ptr, tempptr - ptr); 2384 if (posix_class < 0) 2385 { 2386 *errorptr = ERR30; 2387 goto FAILED; 2388 } 2389 2390 /* If matching is caseless, upper and lower are converted to 2391 alpha. This relies on the fact that the class table starts with 2392 alpha, lower, upper as the first 3 entries. */ 2393 2394 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2) 2395 posix_class = 0; 2396 2397 /* Or into the map we are building up to 3 of the static class 2398 tables, or their negations. The [:blank:] class sets up the same 2399 chars as the [:space:] class (all white space). We remove the vertical 2400 white space chars afterwards. */ 2401 2402 posix_class *= 3; 2403 for (i = 0; i < 3; i++) 2404 { 2405 BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0; 2406 int taboffset = posix_class_maps[posix_class + i]; 2407 if (taboffset < 0) break; 2408 if (local_negate) 2409 { 2410 if (i == 0) 2411 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset]; 2412 else 2413 for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset]; 2414 if (blankclass) classbits[1] |= 0x3c; 2415 } 2416 else 2417 { 2418 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset]; 2419 if (blankclass) classbits[1] &= ~0x3c; 2420 } 2421 } 2422 2423 ptr = tempptr + 1; 2424 class_charcount = 10; /* Set > 1; assumes more than 1 per class */ 2425 continue; /* End of POSIX syntax handling */ 2426 } 2427 2428 /* Backslash may introduce a single character, or it may introduce one 2429 of the specials, which just set a flag. Escaped items are checked for 2430 validity in the pre-compiling pass. The sequence \b is a special case. 2431 Inside a class (and only there) it is treated as backspace. Elsewhere 2432 it marks a word boundary. Other escapes have preset maps ready to 2433 or into the one we are building. We assume they have more than one 2434 character in them, so set class_charcount bigger than one. */ 2435 2436 if (c == '\\') 2437 { 2438 c = check_escape(&ptr, errorptr, *brackets, options, TRUE); 2439 2440 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */ 2441 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */ 2442 else if (-c == ESC_Q) /* Handle start of quoted string */ 2443 { 2444 if (ptr[1] == '\\' && ptr[2] == 'E') 2445 { 2446 ptr += 2; /* avoid empty string */ 2447 } 2448 else inescq = TRUE; 2449 continue; 2450 } 2451 2452 if (c < 0) 2453 { 2454 register const uschar *cbits = cd->cbits; 2455 class_charcount += 2; /* Greater than 1 is what matters */ 2456 switch (-c) 2457 { 2458 case ESC_d: 2459 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit]; 2460 continue; 2461 2462 case ESC_D: 2463 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit]; 2464 continue; 2465 2466 case ESC_w: 2467 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word]; 2468 continue; 2469 2470 case ESC_W: 2471 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word]; 2472 continue; 2473 2474 case ESC_s: 2475 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space]; 2476 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */ 2477 continue; 2478 2479 case ESC_S: 2480 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space]; 2481 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */ 2482 continue; 2483 2484#ifdef SUPPORT_UCP 2485 case ESC_p: 2486 case ESC_P: 2487 { 2488 BOOL negated; 2489 int property = get_ucp(&ptr, &negated, errorptr); 2490 if (property < 0) goto FAILED; 2491 class_utf8 = TRUE; 2492 *class_utf8data++ = ((-c == ESC_p) != negated)? 2493 XCL_PROP : XCL_NOTPROP; 2494 *class_utf8data++ = property; 2495 class_charcount -= 2; /* Not a < 256 character */ 2496 } 2497 continue; 2498#endif 2499 2500 /* Unrecognized escapes are faulted if PCRE is running in its 2501 strict mode. By default, for compatibility with Perl, they are 2502 treated as literals. */ 2503 2504 default: 2505 if ((options & PCRE_EXTRA) != 0) 2506 { 2507 *errorptr = ERR7; 2508 goto FAILED; 2509 } 2510 c = *ptr; /* The final character */ 2511 class_charcount -= 2; /* Undo the default count from above */ 2512 } 2513 } 2514 2515 /* Fall through if we have a single character (c >= 0). This may be 2516 > 256 in UTF-8 mode. */ 2517 2518 } /* End of backslash handling */ 2519 2520 /* A single character may be followed by '-' to form a range. However, 2521 Perl does not permit ']' to be the end of the range. A '-' character 2522 here is treated as a literal. */ 2523 2524 if (ptr[1] == '-' && ptr[2] != ']') 2525 { 2526 int d; 2527 ptr += 2; 2528 2529#ifdef SUPPORT_UTF8 2530 if (utf8) 2531 { /* Braces are required because the */ 2532 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */ 2533 } 2534 else 2535#endif 2536 d = *ptr; /* Not UTF-8 mode */ 2537 2538 /* The second part of a range can be a single-character escape, but 2539 not any of the other escapes. Perl 5.6 treats a hyphen as a literal 2540 in such circumstances. */ 2541 2542 if (d == '\\') 2543 { 2544 const uschar *oldptr = ptr; 2545 d = check_escape(&ptr, errorptr, *brackets, options, TRUE); 2546 2547 /* \b is backslash; \X is literal X; any other special means the '-' 2548 was literal */ 2549 2550 if (d < 0) 2551 { 2552 if (d == -ESC_b) d = '\b'; 2553 else if (d == -ESC_X) d = 'X'; else 2554 { 2555 ptr = oldptr - 2; 2556 goto LONE_SINGLE_CHARACTER; /* A few lines below */ 2557 } 2558 } 2559 } 2560 2561 /* The check that the two values are in the correct order happens in 2562 the pre-pass. Optimize one-character ranges */ 2563 2564 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */ 2565 2566 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless 2567 matching, we have to use an XCLASS with extra data items. Caseless 2568 matching for characters > 127 is available only if UCP support is 2569 available. */ 2570 2571#ifdef SUPPORT_UTF8 2572 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127))) 2573 { 2574 class_utf8 = TRUE; 2575 2576 /* With UCP support, we can find the other case equivalents of 2577 the relevant characters. There may be several ranges. Optimize how 2578 they fit with the basic range. */ 2579 2580#ifdef SUPPORT_UCP 2581 if ((options & PCRE_CASELESS) != 0) 2582 { 2583 int occ, ocd; 2584 int cc = c; 2585 int origd = d; 2586 while (get_othercase_range(&cc, origd, &occ, &ocd)) 2587 { 2588 if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */ 2589 2590 if (occ < c && ocd >= c - 1) /* Extend the basic range */ 2591 { /* if there is overlap, */ 2592 c = occ; /* noting that if occ < c */ 2593 continue; /* we can't have ocd > d */ 2594 } /* because a subrange is */ 2595 if (ocd > d && occ <= d + 1) /* always shorter than */ 2596 { /* the basic range. */ 2597 d = ocd; 2598 continue; 2599 } 2600 2601 if (occ == ocd) 2602 { 2603 *class_utf8data++ = XCL_SINGLE; 2604 } 2605 else 2606 { 2607 *class_utf8data++ = XCL_RANGE; 2608 class_utf8data += ord2utf8(occ, class_utf8data); 2609 } 2610 class_utf8data += ord2utf8(ocd, class_utf8data); 2611 } 2612 } 2613#endif /* SUPPORT_UCP */ 2614 2615 /* Now record the original range, possibly modified for UCP caseless 2616 overlapping ranges. */ 2617 2618 *class_utf8data++ = XCL_RANGE; 2619 class_utf8data += ord2utf8(c, class_utf8data); 2620 class_utf8data += ord2utf8(d, class_utf8data); 2621 2622 /* With UCP support, we are done. Without UCP support, there is no 2623 caseless matching for UTF-8 characters > 127; we can use the bit map 2624 for the smaller ones. */ 2625 2626#ifdef SUPPORT_UCP 2627 continue; /* With next character in the class */ 2628#else 2629 if ((options & PCRE_CASELESS) == 0 || c > 127) continue; 2630 2631 /* Adjust upper limit and fall through to set up the map */ 2632 2633 d = 127; 2634 2635#endif /* SUPPORT_UCP */ 2636 } 2637#endif /* SUPPORT_UTF8 */ 2638 2639 /* We use the bit map for all cases when not in UTF-8 mode; else 2640 ranges that lie entirely within 0-127 when there is UCP support; else 2641 for partial ranges without UCP support. */ 2642 2643 for (; c <= d; c++) 2644 { 2645 classbits[c/8] |= (1 << (c&7)); 2646 if ((options & PCRE_CASELESS) != 0) 2647 { 2648 int uc = cd->fcc[c]; /* flip case */ 2649 classbits[uc/8] |= (1 << (uc&7)); 2650 } 2651 class_charcount++; /* in case a one-char range */ 2652 class_lastchar = c; 2653 } 2654 2655 continue; /* Go get the next char in the class */ 2656 } 2657 2658 /* Handle a lone single character - we can get here for a normal 2659 non-escape char, or after \ that introduces a single character or for an 2660 apparent range that isn't. */ 2661 2662 LONE_SINGLE_CHARACTER: 2663 2664 /* Handle a character that cannot go in the bit map */ 2665 2666#ifdef SUPPORT_UTF8 2667 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127))) 2668 { 2669 class_utf8 = TRUE; 2670 *class_utf8data++ = XCL_SINGLE; 2671 class_utf8data += ord2utf8(c, class_utf8data); 2672 2673#ifdef SUPPORT_UCP 2674 if ((options & PCRE_CASELESS) != 0) 2675 { 2676 int chartype; 2677 int othercase; 2678 if (ucp_findchar(c, &chartype, &othercase) >= 0 && othercase > 0) 2679 { 2680 *class_utf8data++ = XCL_SINGLE; 2681 class_utf8data += ord2utf8(othercase, class_utf8data); 2682 } 2683 } 2684#endif /* SUPPORT_UCP */ 2685 2686 } 2687 else 2688#endif /* SUPPORT_UTF8 */ 2689 2690 /* Handle a single-byte character */ 2691 { 2692 classbits[c/8] |= (1 << (c&7)); 2693 if ((options & PCRE_CASELESS) != 0) 2694 { 2695 c = cd->fcc[c]; /* flip case */ 2696 classbits[c/8] |= (1 << (c&7)); 2697 } 2698 class_charcount++; 2699 class_lastchar = c; 2700 } 2701 } 2702 2703 /* Loop until ']' reached; the check for end of string happens inside the 2704 loop. This "while" is the end of the "do" above. */ 2705 2706 while ((c = *(++ptr)) != ']' || inescq); 2707 2708 /* If class_charcount is 1, we saw precisely one character whose value is 2709 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we 2710 can optimize the negative case only if there were no characters >= 128 2711 because OP_NOT and the related opcodes like OP_NOTSTAR operate on 2712 single-bytes only. This is an historical hangover. Maybe one day we can 2713 tidy these opcodes to handle multi-byte characters. 2714 2715 The optimization throws away the bit map. We turn the item into a 2716 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note 2717 that OP_NOT does not support multibyte characters. In the positive case, it 2718 can cause firstbyte to be set. Otherwise, there can be no first char if 2719 this item is first, whatever repeat count may follow. In the case of 2720 reqbyte, save the previous value for reinstating. */ 2721 2722#ifdef SUPPORT_UTF8 2723 if (class_charcount == 1 && 2724 (!utf8 || 2725 (!class_utf8 && (!negate_class || class_lastchar < 128)))) 2726 2727#else 2728 if (class_charcount == 1) 2729#endif 2730 { 2731 zeroreqbyte = reqbyte; 2732 2733 /* The OP_NOT opcode works on one-byte characters only. */ 2734 2735 if (negate_class) 2736 { 2737 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; 2738 zerofirstbyte = firstbyte; 2739 *code++ = OP_NOT; 2740 *code++ = class_lastchar; 2741 break; 2742 } 2743 2744 /* For a single, positive character, get the value into mcbuffer, and 2745 then we can handle this with the normal one-character code. */ 2746 2747#ifdef SUPPORT_UTF8 2748 if (utf8 && class_lastchar > 127) 2749 mclength = ord2utf8(class_lastchar, mcbuffer); 2750 else 2751#endif 2752 { 2753 mcbuffer[0] = class_lastchar; 2754 mclength = 1; 2755 } 2756 goto ONE_CHAR; 2757 } /* End of 1-char optimization */ 2758 2759 /* The general case - not the one-char optimization. If this is the first 2760 thing in the branch, there can be no first char setting, whatever the 2761 repeat count. Any reqbyte setting must remain unchanged after any kind of 2762 repeat. */ 2763 2764 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; 2765 zerofirstbyte = firstbyte; 2766 zeroreqbyte = reqbyte; 2767 2768 /* If there are characters with values > 255, we have to compile an 2769 extended class, with its own opcode. If there are no characters < 256, 2770 we can omit the bitmap. */ 2771 2772#ifdef SUPPORT_UTF8 2773 if (class_utf8) 2774 { 2775 *class_utf8data++ = XCL_END; /* Marks the end of extra data */ 2776 *code++ = OP_XCLASS; 2777 code += LINK_SIZE; 2778 *code = negate_class? XCL_NOT : 0; 2779 2780 /* If the map is required, install it, and move on to the end of 2781 the extra data */ 2782 2783 if (class_charcount > 0) 2784 { 2785 *code++ |= XCL_MAP; 2786 memcpy(code, classbits, 32); 2787 code = class_utf8data; 2788 } 2789 2790 /* If the map is not required, slide down the extra data. */ 2791 2792 else 2793 { 2794 int len = class_utf8data - (code + 33); 2795 memmove(code + 1, code + 33, len); 2796 code += len + 1; 2797 } 2798 2799 /* Now fill in the complete length of the item */ 2800 2801 PUT(previous, 1, code - previous); 2802 break; /* End of class handling */ 2803 } 2804#endif 2805 2806 /* If there are no characters > 255, negate the 32-byte map if necessary, 2807 and copy it into the code vector. If this is the first thing in the branch, 2808 there can be no first char setting, whatever the repeat count. Any reqbyte 2809 setting must remain unchanged after any kind of repeat. */ 2810 2811 if (negate_class) 2812 { 2813 *code++ = OP_NCLASS; 2814 for (c = 0; c < 32; c++) code[c] = ~classbits[c]; 2815 } 2816 else 2817 { 2818 *code++ = OP_CLASS; 2819 memcpy(code, classbits, 32); 2820 } 2821 code += 32; 2822 break; 2823 2824 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this 2825 has been tested above. */ 2826 2827 case '{': 2828 if (!is_quantifier) goto NORMAL_CHAR; 2829 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr); 2830 if (*errorptr != NULL) goto FAILED; 2831 goto REPEAT; 2832 2833 case '*': 2834 repeat_min = 0; 2835 repeat_max = -1; 2836 goto REPEAT; 2837 2838 case '+': 2839 repeat_min = 1; 2840 repeat_max = -1; 2841 goto REPEAT; 2842 2843 case '?': 2844 repeat_min = 0; 2845 repeat_max = 1; 2846 2847 REPEAT: 2848 if (previous == NULL) 2849 { 2850 *errorptr = ERR9; 2851 goto FAILED; 2852 } 2853 2854 if (repeat_min == 0) 2855 { 2856 firstbyte = zerofirstbyte; /* Adjust for zero repeat */ 2857 reqbyte = zeroreqbyte; /* Ditto */ 2858 } 2859 2860 /* Remember whether this is a variable length repeat */ 2861 2862 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY; 2863 2864 op_type = 0; /* Default single-char op codes */ 2865 possessive_quantifier = FALSE; /* Default not possessive quantifier */ 2866 2867 /* Save start of previous item, in case we have to move it up to make space 2868 for an inserted OP_ONCE for the additional '+' extension. */ 2869 2870 tempcode = previous; 2871 2872 /* If the next character is '+', we have a possessive quantifier. This 2873 implies greediness, whatever the setting of the PCRE_UNGREEDY option. 2874 If the next character is '?' this is a minimizing repeat, by default, 2875 but if PCRE_UNGREEDY is set, it works the other way round. We change the 2876 repeat type to the non-default. */ 2877 2878 if (ptr[1] == '+') 2879 { 2880 repeat_type = 0; /* Force greedy */ 2881 possessive_quantifier = TRUE; 2882 ptr++; 2883 } 2884 else if (ptr[1] == '?') 2885 { 2886 repeat_type = greedy_non_default; 2887 ptr++; 2888 } 2889 else repeat_type = greedy_default; 2890 2891 /* If previous was a recursion, we need to wrap it inside brackets so that 2892 it can be replicated if necessary. */ 2893 2894 if (*previous == OP_RECURSE) 2895 { 2896 memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE); 2897 code += 1 + LINK_SIZE; 2898 *previous = OP_BRA; 2899 PUT(previous, 1, code - previous); 2900 *code = OP_KET; 2901 PUT(code, 1, code - previous); 2902 code += 1 + LINK_SIZE; 2903 } 2904 2905 /* If previous was a character match, abolish the item and generate a 2906 repeat item instead. If a char item has a minumum of more than one, ensure 2907 that it is set in reqbyte - it might not be if a sequence such as x{3} is 2908 the first thing in a branch because the x will have gone into firstbyte 2909 instead. */ 2910 2911 if (*previous == OP_CHAR || *previous == OP_CHARNC) 2912 { 2913 /* Deal with UTF-8 characters that take up more than one byte. It's 2914 easier to write this out separately than try to macrify it. Use c to 2915 hold the length of the character in bytes, plus 0x80 to flag that it's a 2916 length rather than a small character. */ 2917 2918#ifdef SUPPORT_UTF8 2919 if (utf8 && (code[-1] & 0x80) != 0) 2920 { 2921 uschar *lastchar = code - 1; 2922 while((*lastchar & 0xc0) == 0x80) lastchar--; 2923 c = code - lastchar; /* Length of UTF-8 character */ 2924 memcpy(utf8_char, lastchar, c); /* Save the char */ 2925 c |= 0x80; /* Flag c as a length */ 2926 } 2927 else 2928#endif 2929 2930 /* Handle the case of a single byte - either with no UTF8 support, or 2931 with UTF-8 disabled, or for a UTF-8 character < 128. */ 2932 2933 { 2934 c = code[-1]; 2935 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt; 2936 } 2937 2938 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */ 2939 } 2940 2941 /* If previous was a single negated character ([^a] or similar), we use 2942 one of the special opcodes, replacing it. The code is shared with single- 2943 character repeats by setting opt_type to add a suitable offset into 2944 repeat_type. OP_NOT is currently used only for single-byte chars. */ 2945 2946 else if (*previous == OP_NOT) 2947 { 2948 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */ 2949 c = previous[1]; 2950 goto OUTPUT_SINGLE_REPEAT; 2951 } 2952 2953 /* If previous was a character type match (\d or similar), abolish it and 2954 create a suitable repeat item. The code is shared with single-character 2955 repeats by setting op_type to add a suitable offset into repeat_type. Note 2956 the the Unicode property types will be present only when SUPPORT_UCP is 2957 defined, but we don't wrap the little bits of code here because it just 2958 makes it horribly messy. */ 2959 2960 else if (*previous < OP_EODN) 2961 { 2962 uschar *oldcode; 2963 int prop_type; 2964 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */ 2965 c = *previous; 2966 2967 OUTPUT_SINGLE_REPEAT: 2968 prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)? 2969 previous[1] : -1; 2970 2971 oldcode = code; 2972 code = previous; /* Usually overwrite previous item */ 2973 2974 /* If the maximum is zero then the minimum must also be zero; Perl allows 2975 this case, so we do too - by simply omitting the item altogether. */ 2976 2977 if (repeat_max == 0) goto END_REPEAT; 2978 2979 /* All real repeats make it impossible to handle partial matching (maybe 2980 one day we will be able to remove this restriction). */ 2981 2982 if (repeat_max != 1) cd->nopartial = TRUE; 2983 2984 /* Combine the op_type with the repeat_type */ 2985 2986 repeat_type += op_type; 2987 2988 /* A minimum of zero is handled either as the special case * or ?, or as 2989 an UPTO, with the maximum given. */ 2990 2991 if (repeat_min == 0) 2992 { 2993 if (repeat_max == -1) *code++ = OP_STAR + repeat_type; 2994 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type; 2995 else 2996 { 2997 *code++ = OP_UPTO + repeat_type; 2998 PUT2INC(code, 0, repeat_max); 2999 } 3000 } 3001 3002 /* A repeat minimum of 1 is optimized into some special cases. If the 3003 maximum is unlimited, we use OP_PLUS. Otherwise, the original item it 3004 left in place and, if the maximum is greater than 1, we use OP_UPTO with 3005 one less than the maximum. */ 3006 3007 else if (repeat_min == 1) 3008 { 3009 if (repeat_max == -1) 3010 *code++ = OP_PLUS + repeat_type; 3011 else 3012 { 3013 code = oldcode; /* leave previous item in place */ 3014 if (repeat_max == 1) goto END_REPEAT; 3015 *code++ = OP_UPTO + repeat_type; 3016 PUT2INC(code, 0, repeat_max - 1); 3017 } 3018 } 3019 3020 /* The case {n,n} is just an EXACT, while the general case {n,m} is 3021 handled as an EXACT followed by an UPTO. */ 3022 3023 else 3024 { 3025 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */ 3026 PUT2INC(code, 0, repeat_min); 3027 3028 /* If the maximum is unlimited, insert an OP_STAR. Before doing so, 3029 we have to insert the character for the previous code. For a repeated 3030 Unicode property match, there is an extra byte that defines the 3031 required property. In UTF-8 mode, long characters have their length in 3032 c, with the 0x80 bit as a flag. */ 3033 3034 if (repeat_max < 0) 3035 { 3036#ifdef SUPPORT_UTF8 3037 if (utf8 && c >= 128) 3038 { 3039 memcpy(code, utf8_char, c & 7); 3040 code += c & 7; 3041 } 3042 else 3043#endif 3044 { 3045 *code++ = c; 3046 if (prop_type >= 0) *code++ = prop_type; 3047 } 3048 *code++ = OP_STAR + repeat_type; 3049 } 3050 3051 /* Else insert an UPTO if the max is greater than the min, again 3052 preceded by the character, for the previously inserted code. */ 3053 3054 else if (repeat_max != repeat_min) 3055 { 3056#ifdef SUPPORT_UTF8 3057 if (utf8 && c >= 128) 3058 { 3059 memcpy(code, utf8_char, c & 7); 3060 code += c & 7; 3061 } 3062 else 3063#endif 3064 *code++ = c; 3065 if (prop_type >= 0) *code++ = prop_type; 3066 repeat_max -= repeat_min; 3067 *code++ = OP_UPTO + repeat_type; 3068 PUT2INC(code, 0, repeat_max); 3069 } 3070 } 3071 3072 /* The character or character type itself comes last in all cases. */ 3073 3074#ifdef SUPPORT_UTF8 3075 if (utf8 && c >= 128) 3076 { 3077 memcpy(code, utf8_char, c & 7); 3078 code += c & 7; 3079 } 3080 else 3081#endif 3082 *code++ = c; 3083 3084 /* For a repeated Unicode property match, there is an extra byte that 3085 defines the required property. */ 3086 3087#ifdef SUPPORT_UCP 3088 if (prop_type >= 0) *code++ = prop_type; 3089#endif 3090 } 3091 3092 /* If previous was a character class or a back reference, we put the repeat 3093 stuff after it, but just skip the item if the repeat was {0,0}. */ 3094 3095 else if (*previous == OP_CLASS || 3096 *previous == OP_NCLASS || 3097#ifdef SUPPORT_UTF8 3098 *previous == OP_XCLASS || 3099#endif 3100 *previous == OP_REF) 3101 { 3102 if (repeat_max == 0) 3103 { 3104 code = previous; 3105 goto END_REPEAT; 3106 } 3107 3108 /* All real repeats make it impossible to handle partial matching (maybe 3109 one day we will be able to remove this restriction). */ 3110 3111 if (repeat_max != 1) cd->nopartial = TRUE; 3112 3113 if (repeat_min == 0 && repeat_max == -1) 3114 *code++ = OP_CRSTAR + repeat_type; 3115 else if (repeat_min == 1 && repeat_max == -1) 3116 *code++ = OP_CRPLUS + repeat_type; 3117 else if (repeat_min == 0 && repeat_max == 1) 3118 *code++ = OP_CRQUERY + repeat_type; 3119 else 3120 { 3121 *code++ = OP_CRRANGE + repeat_type; 3122 PUT2INC(code, 0, repeat_min); 3123 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */ 3124 PUT2INC(code, 0, repeat_max); 3125 } 3126 } 3127 3128 /* If previous was a bracket group, we may have to replicate it in certain 3129 cases. */ 3130 3131 else if (*previous >= OP_BRA || *previous == OP_ONCE || 3132 *previous == OP_COND) 3133 { 3134 register int i; 3135 int ketoffset = 0; 3136 int len = code - previous; 3137 uschar *bralink = NULL; 3138 3139 /* If the maximum repeat count is unlimited, find the end of the bracket 3140 by scanning through from the start, and compute the offset back to it 3141 from the current code pointer. There may be an OP_OPT setting following 3142 the final KET, so we can't find the end just by going back from the code 3143 pointer. */ 3144 3145 if (repeat_max == -1) 3146 { 3147 register uschar *ket = previous; 3148 do ket += GET(ket, 1); while (*ket != OP_KET); 3149 ketoffset = code - ket; 3150 } 3151 3152 /* The case of a zero minimum is special because of the need to stick 3153 OP_BRAZERO in front of it, and because the group appears once in the 3154 data, whereas in other cases it appears the minimum number of times. For 3155 this reason, it is simplest to treat this case separately, as otherwise 3156 the code gets far too messy. There are several special subcases when the 3157 minimum is zero. */ 3158 3159 if (repeat_min == 0) 3160 { 3161 /* If the maximum is also zero, we just omit the group from the output 3162 altogether. */ 3163 3164 if (repeat_max == 0) 3165 { 3166 code = previous; 3167 goto END_REPEAT; 3168 } 3169 3170 /* If the maximum is 1 or unlimited, we just have to stick in the 3171 BRAZERO and do no more at this point. However, we do need to adjust 3172 any OP_RECURSE calls inside the group that refer to the group itself or 3173 any internal group, because the offset is from the start of the whole 3174 regex. Temporarily terminate the pattern while doing this. */ 3175 3176 if (repeat_max <= 1) 3177 { 3178 *code = OP_END; 3179 adjust_recurse(previous, 1, utf8, cd); 3180 memmove(previous+1, previous, len); 3181 code++; 3182 *previous++ = OP_BRAZERO + repeat_type; 3183 } 3184 3185 /* If the maximum is greater than 1 and limited, we have to replicate 3186 in a nested fashion, sticking OP_BRAZERO before each set of brackets. 3187 The first one has to be handled carefully because it's the original 3188 copy, which has to be moved up. The remainder can be handled by code 3189 that is common with the non-zero minimum case below. We have to 3190 adjust the value or repeat_max, since one less copy is required. Once 3191 again, we may have to adjust any OP_RECURSE calls inside the group. */ 3192 3193 else 3194 { 3195 int offset; 3196 *code = OP_END; 3197 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd); 3198 memmove(previous + 2 + LINK_SIZE, previous, len); 3199 code += 2 + LINK_SIZE; 3200 *previous++ = OP_BRAZERO + repeat_type; 3201 *previous++ = OP_BRA; 3202 3203 /* We chain together the bracket offset fields that have to be 3204 filled in later when the ends of the brackets are reached. */ 3205 3206 offset = (bralink == NULL)? 0 : previous - bralink; 3207 bralink = previous; 3208 PUTINC(previous, 0, offset); 3209 } 3210 3211 repeat_max--; 3212 } 3213 3214 /* If the minimum is greater than zero, replicate the group as many 3215 times as necessary, and adjust the maximum to the number of subsequent 3216 copies that we need. If we set a first char from the group, and didn't 3217 set a required char, copy the latter from the former. */ 3218 3219 else 3220 { 3221 if (repeat_min > 1) 3222 { 3223 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte; 3224 for (i = 1; i < repeat_min; i++) 3225 { 3226 memcpy(code, previous, len); 3227 code += len; 3228 } 3229 } 3230 if (repeat_max > 0) repeat_max -= repeat_min; 3231 } 3232 3233 /* This code is common to both the zero and non-zero minimum cases. If 3234 the maximum is limited, it replicates the group in a nested fashion, 3235 remembering the bracket starts on a stack. In the case of a zero minimum, 3236 the first one was set up above. In all cases the repeat_max now specifies 3237 the number of additional copies needed. */ 3238 3239 if (repeat_max >= 0) 3240 { 3241 for (i = repeat_max - 1; i >= 0; i--) 3242 { 3243 *code++ = OP_BRAZERO + repeat_type; 3244 3245 /* All but the final copy start a new nesting, maintaining the 3246 chain of brackets outstanding. */ 3247 3248 if (i != 0) 3249 { 3250 int offset; 3251 *code++ = OP_BRA; 3252 offset = (bralink == NULL)? 0 : code - bralink; 3253 bralink = code; 3254 PUTINC(code, 0, offset); 3255 } 3256 3257 memcpy(code, previous, len); 3258 code += len; 3259 } 3260 3261 /* Now chain through the pending brackets, and fill in their length 3262 fields (which are holding the chain links pro tem). */ 3263 3264 while (bralink != NULL) 3265 { 3266 int oldlinkoffset; 3267 int offset = code - bralink + 1; 3268 uschar *bra = code - offset; 3269 oldlinkoffset = GET(bra, 1); 3270 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset; 3271 *code++ = OP_KET; 3272 PUTINC(code, 0, offset); 3273 PUT(bra, 1, offset); 3274 } 3275 } 3276 3277 /* If the maximum is unlimited, set a repeater in the final copy. We 3278 can't just offset backwards from the current code point, because we 3279 don't know if there's been an options resetting after the ket. The 3280 correct offset was computed above. */ 3281 3282 else code[-ketoffset] = OP_KETRMAX + repeat_type; 3283 } 3284 3285 /* Else there's some kind of shambles */ 3286 3287 else 3288 { 3289 *errorptr = ERR11; 3290 goto FAILED; 3291 } 3292 3293 /* If the character following a repeat is '+', we wrap the entire repeated 3294 item inside OP_ONCE brackets. This is just syntactic sugar, taken from 3295 Sun's Java package. The repeated item starts at tempcode, not at previous, 3296 which might be the first part of a string whose (former) last char we 3297 repeated. However, we don't support '+' after a greediness '?'. */ 3298 3299 if (possessive_quantifier) 3300 { 3301 int len = code - tempcode; 3302 memmove(tempcode + 1+LINK_SIZE, tempcode, len); 3303 code += 1 + LINK_SIZE; 3304 len += 1 + LINK_SIZE; 3305 tempcode[0] = OP_ONCE; 3306 *code++ = OP_KET; 3307 PUTINC(code, 0, len); 3308 PUT(tempcode, 1, len); 3309 } 3310 3311 /* In all case we no longer have a previous item. We also set the 3312 "follows varying string" flag for subsequently encountered reqbytes if 3313 it isn't already set and we have just passed a varying length item. */ 3314 3315 END_REPEAT: 3316 previous = NULL; 3317 cd->req_varyopt |= reqvary; 3318 break; 3319 3320 3321 /* Start of nested bracket sub-expression, or comment or lookahead or 3322 lookbehind or option setting or condition. First deal with special things 3323 that can come after a bracket; all are introduced by ?, and the appearance 3324 of any of them means that this is not a referencing group. They were 3325 checked for validity in the first pass over the string, so we don't have to 3326 check for syntax errors here. */ 3327 3328 case '(': 3329 newoptions = options; 3330 skipbytes = 0; 3331 3332 if (*(++ptr) == '?') 3333 { 3334 int set, unset; 3335 int *optset; 3336 3337 switch (*(++ptr)) 3338 { 3339 case '#': /* Comment; skip to ket */ 3340 ptr++; 3341 while (*ptr != ')') ptr++; 3342 continue; 3343 3344 case ':': /* Non-extracting bracket */ 3345 bravalue = OP_BRA; 3346 ptr++; 3347 break; 3348 3349 case '(': 3350 bravalue = OP_COND; /* Conditional group */ 3351 3352 /* Condition to test for recursion */ 3353 3354 if (ptr[1] == 'R') 3355 { 3356 code[1+LINK_SIZE] = OP_CREF; 3357 PUT2(code, 2+LINK_SIZE, CREF_RECURSE); 3358 skipbytes = 3; 3359 ptr += 3; 3360 } 3361 3362 /* Condition to test for a numbered subpattern match. We know that 3363 if a digit follows ( then there will just be digits until ) because 3364 the syntax was checked in the first pass. */ 3365 3366 else if ((digitab[ptr[1]] && ctype_digit) != 0) 3367 { 3368 int condref; /* Don't amalgamate; some compilers */ 3369 condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */ 3370 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0'; 3371 if (condref == 0) 3372 { 3373 *errorptr = ERR35; 3374 goto FAILED; 3375 } 3376 ptr++; 3377 code[1+LINK_SIZE] = OP_CREF; 3378 PUT2(code, 2+LINK_SIZE, condref); 3379 skipbytes = 3; 3380 } 3381 /* For conditions that are assertions, we just fall through, having 3382 set bravalue above. */ 3383 break; 3384 3385 case '=': /* Positive lookahead */ 3386 bravalue = OP_ASSERT; 3387 ptr++; 3388 break; 3389 3390 case '!': /* Negative lookahead */ 3391 bravalue = OP_ASSERT_NOT; 3392 ptr++; 3393 break; 3394 3395 case '<': /* Lookbehinds */ 3396 switch (*(++ptr)) 3397 { 3398 case '=': /* Positive lookbehind */ 3399 bravalue = OP_ASSERTBACK; 3400 ptr++; 3401 break; 3402 3403 case '!': /* Negative lookbehind */ 3404 bravalue = OP_ASSERTBACK_NOT; 3405 ptr++; 3406 break; 3407 } 3408 break; 3409 3410 case '>': /* One-time brackets */ 3411 bravalue = OP_ONCE; 3412 ptr++; 3413 break; 3414 3415 case 'C': /* Callout - may be followed by digits; */ 3416 previous_callout = code; /* Save for later completion */ 3417 after_manual_callout = 1; /* Skip one item before completing */ 3418 *code++ = OP_CALLOUT; /* Already checked that the terminating */ 3419 { /* closing parenthesis is present. */ 3420 int n = 0; 3421 while ((digitab[*(++ptr)] & ctype_digit) != 0) 3422 n = n * 10 + *ptr - '0'; 3423 if (n > 255) 3424 { 3425 *errorptr = ERR38; 3426 goto FAILED; 3427 } 3428 *code++ = n; 3429 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */ 3430 PUT(code, LINK_SIZE, 0); /* Default length */ 3431 code += 2 * LINK_SIZE; 3432 } 3433 previous = NULL; 3434 continue; 3435 3436 case 'P': /* Named subpattern handling */ 3437 if (*(++ptr) == '<') /* Definition */ 3438 { 3439 int i, namelen; 3440 uschar *slot = cd->name_table; 3441 const uschar *name; /* Don't amalgamate; some compilers */ 3442 name = ++ptr; /* grumble at autoincrement in declaration */ 3443 3444 while (*ptr++ != '>'); 3445 namelen = ptr - name - 1; 3446 3447 for (i = 0; i < cd->names_found; i++) 3448 { 3449 int crc = memcmp(name, slot+2, namelen); 3450 if (crc == 0) 3451 { 3452 if (slot[2+namelen] == 0) 3453 { 3454 *errorptr = ERR43; 3455 goto FAILED; 3456 } 3457 crc = -1; /* Current name is substring */ 3458 } 3459 if (crc < 0) 3460 { 3461 memmove(slot + cd->name_entry_size, slot, 3462 (cd->names_found - i) * cd->name_entry_size); 3463 break; 3464 } 3465 slot += cd->name_entry_size; 3466 } 3467 3468 PUT2(slot, 0, *brackets + 1); 3469 memcpy(slot + 2, name, namelen); 3470 slot[2+namelen] = 0; 3471 cd->names_found++; 3472 goto NUMBERED_GROUP; 3473 } 3474 3475 if (*ptr == '=' || *ptr == '>') /* Reference or recursion */ 3476 { 3477 int i, namelen; 3478 int type = *ptr++; 3479 const uschar *name = ptr; 3480 uschar *slot = cd->name_table; 3481 3482 while (*ptr != ')') ptr++; 3483 namelen = ptr - name; 3484 3485 for (i = 0; i < cd->names_found; i++) 3486 { 3487 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break; 3488 slot += cd->name_entry_size; 3489 } 3490 if (i >= cd->names_found) 3491 { 3492 *errorptr = ERR15; 3493 goto FAILED; 3494 } 3495 3496 recno = GET2(slot, 0); 3497 3498 if (type == '>') goto HANDLE_RECURSION; /* A few lines below */ 3499 3500 /* Back reference */ 3501 3502 previous = code; 3503 *code++ = OP_REF; 3504 PUT2INC(code, 0, recno); 3505 cd->backref_map |= (recno < 32)? (1 << recno) : 1; 3506 if (recno > cd->top_backref) cd->top_backref = recno; 3507 continue; 3508 } 3509 3510 /* Should never happen */ 3511 break; 3512 3513 case 'R': /* Pattern recursion */ 3514 ptr++; /* Same as (?0) */ 3515 /* Fall through */ 3516 3517 /* Recursion or "subroutine" call */ 3518 3519 case '0': case '1': case '2': case '3': case '4': 3520 case '5': case '6': case '7': case '8': case '9': 3521 { 3522 const uschar *called; 3523 recno = 0; 3524 while((digitab[*ptr] & ctype_digit) != 0) 3525 recno = recno * 10 + *ptr++ - '0'; 3526 3527 /* Come here from code above that handles a named recursion */ 3528 3529 HANDLE_RECURSION: 3530 3531 previous = code; 3532 3533 /* Find the bracket that is being referenced. Temporarily end the 3534 regex in case it doesn't exist. */ 3535 3536 *code = OP_END; 3537 called = (recno == 0)? 3538 cd->start_code : find_bracket(cd->start_code, utf8, recno); 3539 3540 if (called == NULL) 3541 { 3542 *errorptr = ERR15; 3543 goto FAILED; 3544 } 3545 3546 /* If the subpattern is still open, this is a recursive call. We 3547 check to see if this is a left recursion that could loop for ever, 3548 and diagnose that case. */ 3549 3550 if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8)) 3551 { 3552 *errorptr = ERR40; 3553 goto FAILED; 3554 } 3555 3556 /* Insert the recursion/subroutine item */ 3557 3558 *code = OP_RECURSE; 3559 PUT(code, 1, called - cd->start_code); 3560 code += 1 + LINK_SIZE; 3561 } 3562 continue; 3563 3564 /* Character after (? not specially recognized */ 3565 3566 default: /* Option setting */ 3567 set = unset = 0; 3568 optset = &set; 3569 3570 while (*ptr != ')' && *ptr != ':') 3571 { 3572 switch (*ptr++) 3573 { 3574 case '-': optset = &unset; break; 3575 3576 case 'i': *optset |= PCRE_CASELESS; break; 3577 case 'm': *optset |= PCRE_MULTILINE; break; 3578 case 's': *optset |= PCRE_DOTALL; break; 3579 case 'x': *optset |= PCRE_EXTENDED; break; 3580 case 'U': *optset |= PCRE_UNGREEDY; break; 3581 case 'X': *optset |= PCRE_EXTRA; break; 3582 } 3583 } 3584 3585 /* Set up the changed option bits, but don't change anything yet. */ 3586 3587 newoptions = (options | set) & (~unset); 3588 3589 /* If the options ended with ')' this is not the start of a nested 3590 group with option changes, so the options change at this level. Compile 3591 code to change the ims options if this setting actually changes any of 3592 them. We also pass the new setting back so that it can be put at the 3593 start of any following branches, and when this group ends (if we are in 3594 a group), a resetting item can be compiled. 3595 3596 Note that if this item is right at the start of the pattern, the 3597 options will have been abstracted and made global, so there will be no 3598 change to compile. */ 3599 3600 if (*ptr == ')') 3601 { 3602 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS)) 3603 { 3604 *code++ = OP_OPT; 3605 *code++ = newoptions & PCRE_IMS; 3606 } 3607 3608 /* Change options at this level, and pass them back for use 3609 in subsequent branches. Reset the greedy defaults and the case 3610 value for firstbyte and reqbyte. */ 3611 3612 *optionsptr = options = newoptions; 3613 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0); 3614 greedy_non_default = greedy_default ^ 1; 3615 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0; 3616 3617 previous = NULL; /* This item can't be repeated */ 3618 continue; /* It is complete */ 3619 } 3620 3621 /* If the options ended with ':' we are heading into a nested group 3622 with possible change of options. Such groups are non-capturing and are 3623 not assertions of any kind. All we need to do is skip over the ':'; 3624 the newoptions value is handled below. */ 3625 3626 bravalue = OP_BRA; 3627 ptr++; 3628 } 3629 } 3630 3631 /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become 3632 non-capturing and behave like (?:...) brackets */ 3633 3634 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0) 3635 { 3636 bravalue = OP_BRA; 3637 } 3638 3639 /* Else we have a referencing group; adjust the opcode. If the bracket 3640 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and 3641 arrange for the true number to follow later, in an OP_BRANUMBER item. */ 3642 3643 else 3644 { 3645 NUMBERED_GROUP: 3646 if (++(*brackets) > EXTRACT_BASIC_MAX) 3647 { 3648 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1; 3649 code[1+LINK_SIZE] = OP_BRANUMBER; 3650 PUT2(code, 2+LINK_SIZE, *brackets); 3651 skipbytes = 3; 3652 } 3653 else bravalue = OP_BRA + *brackets; 3654 } 3655 3656 /* Process nested bracketed re. Assertions may not be repeated, but other 3657 kinds can be. We copy code into a non-register variable in order to be able 3658 to pass its address because some compilers complain otherwise. Pass in a 3659 new setting for the ims options if they have changed. */ 3660 3661 previous = (bravalue >= OP_ONCE)? code : NULL; 3662 *code = bravalue; 3663 tempcode = code; 3664 tempreqvary = cd->req_varyopt; /* Save value before bracket */ 3665 3666 if (!compile_regex( 3667 newoptions, /* The complete new option state */ 3668 options & PCRE_IMS, /* The previous ims option state */ 3669 brackets, /* Extracting bracket count */ 3670 &tempcode, /* Where to put code (updated) */ 3671 &ptr, /* Input pointer (updated) */ 3672 errorptr, /* Where to put an error message */ 3673 (bravalue == OP_ASSERTBACK || 3674 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */ 3675 skipbytes, /* Skip over OP_COND/OP_BRANUMBER */ 3676 &subfirstbyte, /* For possible first char */ 3677 &subreqbyte, /* For possible last char */ 3678 bcptr, /* Current branch chain */ 3679 cd)) /* Tables block */ 3680 goto FAILED; 3681 3682 /* At the end of compiling, code is still pointing to the start of the 3683 group, while tempcode has been updated to point past the end of the group 3684 and any option resetting that may follow it. The pattern pointer (ptr) 3685 is on the bracket. */ 3686 3687 /* If this is a conditional bracket, check that there are no more than 3688 two branches in the group. */ 3689 3690 else if (bravalue == OP_COND) 3691 { 3692 uschar *tc = code; 3693 condcount = 0; 3694 3695 do { 3696 condcount++; 3697 tc += GET(tc,1); 3698 } 3699 while (*tc != OP_KET); 3700 3701 if (condcount > 2) 3702 { 3703 *errorptr = ERR27; 3704 goto FAILED; 3705 } 3706 3707 /* If there is just one branch, we must not make use of its firstbyte or 3708 reqbyte, because this is equivalent to an empty second branch. */ 3709 3710 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE; 3711 } 3712 3713 /* Handle updating of the required and first characters. Update for normal 3714 brackets of all kinds, and conditions with two branches (see code above). 3715 If the bracket is followed by a quantifier with zero repeat, we have to 3716 back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the 3717 main loop so that they can be accessed for the back off. */ 3718 3719 zeroreqbyte = reqbyte; 3720 zerofirstbyte = firstbyte; 3721 groupsetfirstbyte = FALSE; 3722 3723 if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND) 3724 { 3725 /* If we have not yet set a firstbyte in this branch, take it from the 3726 subpattern, remembering that it was set here so that a repeat of more 3727 than one can replicate it as reqbyte if necessary. If the subpattern has 3728 no firstbyte, set "none" for the whole branch. In both cases, a zero 3729 repeat forces firstbyte to "none". */ 3730 3731 if (firstbyte == REQ_UNSET) 3732 { 3733 if (subfirstbyte >= 0) 3734 { 3735 firstbyte = subfirstbyte; 3736 groupsetfirstbyte = TRUE; 3737 } 3738 else firstbyte = REQ_NONE; 3739 zerofirstbyte = REQ_NONE; 3740 } 3741 3742 /* If firstbyte was previously set, convert the subpattern's firstbyte 3743 into reqbyte if there wasn't one, using the vary flag that was in 3744 existence beforehand. */ 3745 3746 else if (subfirstbyte >= 0 && subreqbyte < 0) 3747 subreqbyte = subfirstbyte | tempreqvary; 3748 3749 /* If the subpattern set a required byte (or set a first byte that isn't 3750 really the first byte - see above), set it. */ 3751 3752 if (subreqbyte >= 0) reqbyte = subreqbyte; 3753 } 3754 3755 /* For a forward assertion, we take the reqbyte, if set. This can be 3756 helpful if the pattern that follows the assertion doesn't set a different 3757 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte 3758 for an assertion, however because it leads to incorrect effect for patterns 3759 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead 3760 of a firstbyte. This is overcome by a scan at the end if there's no 3761 firstbyte, looking for an asserted first char. */ 3762 3763 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte; 3764 3765 /* Now update the main code pointer to the end of the group. */ 3766 3767 code = tempcode; 3768 3769 /* Error if hit end of pattern */ 3770 3771 if (*ptr != ')') 3772 { 3773 *errorptr = ERR14; 3774 goto FAILED; 3775 } 3776 break; 3777 3778 /* Check \ for being a real metacharacter; if not, fall through and handle 3779 it as a data character at the start of a string. Escape items are checked 3780 for validity in the pre-compiling pass. */ 3781 3782 case '\\': 3783 tempptr = ptr; 3784 c = check_escape(&ptr, errorptr, *brackets, options, FALSE); 3785 3786 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values 3787 are arranged to be the negation of the corresponding OP_values. For the 3788 back references, the values are ESC_REF plus the reference number. Only 3789 back references and those types that consume a character may be repeated. 3790 We can test for values between ESC_b and ESC_Z for the latter; this may 3791 have to change if any new ones are ever created. */ 3792 3793 if (c < 0) 3794 { 3795 if (-c == ESC_Q) /* Handle start of quoted string */ 3796 { 3797 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */ 3798 else inescq = TRUE; 3799 continue; 3800 } 3801 3802 /* For metasequences that actually match a character, we disable the 3803 setting of a first character if it hasn't already been set. */ 3804 3805 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z) 3806 firstbyte = REQ_NONE; 3807 3808 /* Set values to reset to if this is followed by a zero repeat. */ 3809 3810 zerofirstbyte = firstbyte; 3811 zeroreqbyte = reqbyte; 3812 3813 /* Back references are handled specially */ 3814 3815 if (-c >= ESC_REF) 3816 { 3817 int number = -c - ESC_REF; 3818 previous = code; 3819 *code++ = OP_REF; 3820 PUT2INC(code, 0, number); 3821 } 3822 3823 /* So are Unicode property matches, if supported. We know that get_ucp 3824 won't fail because it was tested in the pre-pass. */ 3825 3826#ifdef SUPPORT_UCP 3827 else if (-c == ESC_P || -c == ESC_p) 3828 { 3829 BOOL negated; 3830 int value = get_ucp(&ptr, &negated, errorptr); 3831 previous = code; 3832 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP; 3833 *code++ = value; 3834 } 3835#endif 3836 3837 /* For the rest, we can obtain the OP value by negating the escape 3838 value */ 3839 3840 else 3841 { 3842 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL; 3843 *code++ = -c; 3844 } 3845 continue; 3846 } 3847 3848 /* We have a data character whose value is in c. In UTF-8 mode it may have 3849 a value > 127. We set its representation in the length/buffer, and then 3850 handle it as a data character. */ 3851 3852#ifdef SUPPORT_UTF8 3853 if (utf8 && c > 127) 3854 mclength = ord2utf8(c, mcbuffer); 3855 else 3856#endif 3857 3858 { 3859 mcbuffer[0] = c; 3860 mclength = 1; 3861 } 3862 3863 goto ONE_CHAR; 3864 3865 /* Handle a literal character. It is guaranteed not to be whitespace or # 3866 when the extended flag is set. If we are in UTF-8 mode, it may be a 3867 multi-byte literal character. */ 3868 3869 default: 3870 NORMAL_CHAR: 3871 mclength = 1; 3872 mcbuffer[0] = c; 3873 3874#ifdef SUPPORT_UTF8 3875 if (utf8 && (c & 0xc0) == 0xc0) 3876 { 3877 while ((ptr[1] & 0xc0) == 0x80) 3878 mcbuffer[mclength++] = *(++ptr); 3879 } 3880#endif 3881 3882 /* At this point we have the character's bytes in mcbuffer, and the length 3883 in mclength. When not in UTF-8 mode, the length is always 1. */ 3884 3885 ONE_CHAR: 3886 previous = code; 3887 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR; 3888 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c]; 3889 3890 /* Set the first and required bytes appropriately. If no previous first 3891 byte, set it from this character, but revert to none on a zero repeat. 3892 Otherwise, leave the firstbyte value alone, and don't change it on a zero 3893 repeat. */ 3894 3895 if (firstbyte == REQ_UNSET) 3896 { 3897 zerofirstbyte = REQ_NONE; 3898 zeroreqbyte = reqbyte; 3899 3900 /* If the character is more than one byte long, we can set firstbyte 3901 only if it is not to be matched caselessly. */ 3902 3903 if (mclength == 1 || req_caseopt == 0) 3904 { 3905 firstbyte = mcbuffer[0] | req_caseopt; 3906 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt; 3907 } 3908 else firstbyte = reqbyte = REQ_NONE; 3909 } 3910 3911 /* firstbyte was previously set; we can set reqbyte only the length is 3912 1 or the matching is caseful. */ 3913 3914 else 3915 { 3916 zerofirstbyte = firstbyte; 3917 zeroreqbyte = reqbyte; 3918 if (mclength == 1 || req_caseopt == 0) 3919 reqbyte = code[-1] | req_caseopt | cd->req_varyopt; 3920 } 3921 3922 break; /* End of literal character handling */ 3923 } 3924 } /* end of big loop */ 3925 3926/* Control never reaches here by falling through, only by a goto for all the 3927error states. Pass back the position in the pattern so that it can be displayed 3928to the user for diagnosing the error. */ 3929 3930FAILED: 3931*ptrptr = ptr; 3932return FALSE; 3933} 3934 3935 3936 3937 3938/************************************************* 3939* Compile sequence of alternatives * 3940*************************************************/ 3941 3942/* On entry, ptr is pointing past the bracket character, but on return 3943it points to the closing bracket, or vertical bar, or end of string. 3944The code variable is pointing at the byte into which the BRA operator has been 3945stored. If the ims options are changed at the start (for a (?ims: group) or 3946during any branch, we need to insert an OP_OPT item at the start of every 3947following branch to ensure they get set correctly at run time, and also pass 3948the new options into every subsequent branch compile. 3949 3950Argument: 3951 options option bits, including any changes for this subpattern 3952 oldims previous settings of ims option bits 3953 brackets -> int containing the number of extracting brackets used 3954 codeptr -> the address of the current code pointer 3955 ptrptr -> the address of the current pattern pointer 3956 errorptr -> pointer to error message 3957 lookbehind TRUE if this is a lookbehind assertion 3958 skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER) 3959 firstbyteptr place to put the first required character, or a negative number 3960 reqbyteptr place to put the last required character, or a negative number 3961 bcptr pointer to the chain of currently open branches 3962 cd points to the data block with tables pointers etc. 3963 3964Returns: TRUE on success 3965*/ 3966 3967static BOOL 3968compile_regex(int options, int oldims, int *brackets, uschar **codeptr, 3969 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes, 3970 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd) 3971{ 3972const uschar *ptr = *ptrptr; 3973uschar *code = *codeptr; 3974uschar *last_branch = code; 3975uschar *start_bracket = code; 3976uschar *reverse_count = NULL; 3977int firstbyte, reqbyte; 3978int branchfirstbyte, branchreqbyte; 3979branch_chain bc; 3980 3981bc.outer = bcptr; 3982bc.current = code; 3983 3984firstbyte = reqbyte = REQ_UNSET; 3985 3986/* Offset is set zero to mark that this bracket is still open */ 3987 3988PUT(code, 1, 0); 3989code += 1 + LINK_SIZE + skipbytes; 3990 3991/* Loop for each alternative branch */ 3992 3993for (;;) 3994 { 3995 /* Handle a change of ims options at the start of the branch */ 3996 3997 if ((options & PCRE_IMS) != oldims) 3998 { 3999 *code++ = OP_OPT; 4000 *code++ = options & PCRE_IMS; 4001 } 4002 4003 /* Set up dummy OP_REVERSE if lookbehind assertion */ 4004 4005 if (lookbehind) 4006 { 4007 *code++ = OP_REVERSE; 4008 reverse_count = code; 4009 PUTINC(code, 0, 0); 4010 } 4011 4012 /* Now compile the branch */ 4013 4014 if (!compile_branch(&options, brackets, &code, &ptr, errorptr, 4015 &branchfirstbyte, &branchreqbyte, &bc, cd)) 4016 { 4017 *ptrptr = ptr; 4018 return FALSE; 4019 } 4020 4021 /* If this is the first branch, the firstbyte and reqbyte values for the 4022 branch become the values for the regex. */ 4023 4024 if (*last_branch != OP_ALT) 4025 { 4026 firstbyte = branchfirstbyte; 4027 reqbyte = branchreqbyte; 4028 } 4029 4030 /* If this is not the first branch, the first char and reqbyte have to 4031 match the values from all the previous branches, except that if the previous 4032 value for reqbyte didn't have REQ_VARY set, it can still match, and we set 4033 REQ_VARY for the regex. */ 4034 4035 else 4036 { 4037 /* If we previously had a firstbyte, but it doesn't match the new branch, 4038 we have to abandon the firstbyte for the regex, but if there was previously 4039 no reqbyte, it takes on the value of the old firstbyte. */ 4040 4041 if (firstbyte >= 0 && firstbyte != branchfirstbyte) 4042 { 4043 if (reqbyte < 0) reqbyte = firstbyte; 4044 firstbyte = REQ_NONE; 4045 } 4046 4047 /* If we (now or from before) have no firstbyte, a firstbyte from the 4048 branch becomes a reqbyte if there isn't a branch reqbyte. */ 4049 4050 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0) 4051 branchreqbyte = branchfirstbyte; 4052 4053 /* Now ensure that the reqbytes match */ 4054 4055 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY)) 4056 reqbyte = REQ_NONE; 4057 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */ 4058 } 4059 4060 /* If lookbehind, check that this branch matches a fixed-length string, 4061 and put the length into the OP_REVERSE item. Temporarily mark the end of 4062 the branch with OP_END. */ 4063 4064 if (lookbehind) 4065 { 4066 int length; 4067 *code = OP_END; 4068 length = find_fixedlength(last_branch, options); 4069 DPRINTF(("fixed length = %d\n", length)); 4070 if (length < 0) 4071 { 4072 *errorptr = (length == -2)? ERR36 : ERR25; 4073 *ptrptr = ptr; 4074 return FALSE; 4075 } 4076 PUT(reverse_count, 0, length); 4077 } 4078 4079 /* Reached end of expression, either ')' or end of pattern. Go back through 4080 the alternative branches and reverse the chain of offsets, with the field in 4081 the BRA item now becoming an offset to the first alternative. If there are 4082 no alternatives, it points to the end of the group. The length in the 4083 terminating ket is always the length of the whole bracketed item. If any of 4084 the ims options were changed inside the group, compile a resetting op-code 4085 following, except at the very end of the pattern. Return leaving the pointer 4086 at the terminating char. */ 4087 4088 if (*ptr != '|') 4089 { 4090 int length = code - last_branch; 4091 do 4092 { 4093 int prev_length = GET(last_branch, 1); 4094 PUT(last_branch, 1, length); 4095 length = prev_length; 4096 last_branch -= length; 4097 } 4098 while (length > 0); 4099 4100 /* Fill in the ket */ 4101 4102 *code = OP_KET; 4103 PUT(code, 1, code - start_bracket); 4104 code += 1 + LINK_SIZE; 4105 4106 /* Resetting option if needed */ 4107 4108 if ((options & PCRE_IMS) != oldims && *ptr == ')') 4109 { 4110 *code++ = OP_OPT; 4111 *code++ = oldims; 4112 } 4113 4114 /* Set values to pass back */ 4115 4116 *codeptr = code; 4117 *ptrptr = ptr; 4118 *firstbyteptr = firstbyte; 4119 *reqbyteptr = reqbyte; 4120 return TRUE; 4121 } 4122 4123 /* Another branch follows; insert an "or" node. Its length field points back 4124 to the previous branch while the bracket remains open. At the end the chain 4125 is reversed. It's done like this so that the start of the bracket has a 4126 zero offset until it is closed, making it possible to detect recursion. */ 4127 4128 *code = OP_ALT; 4129 PUT(code, 1, code - last_branch); 4130 bc.current = last_branch = code; 4131 code += 1 + LINK_SIZE; 4132 ptr++; 4133 } 4134/* Control never reaches here */ 4135} 4136 4137 4138 4139 4140/************************************************* 4141* Check for anchored expression * 4142*************************************************/ 4143 4144/* Try to find out if this is an anchored regular expression. Consider each 4145alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket 4146all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then 4147it's anchored. However, if this is a multiline pattern, then only OP_SOD 4148counts, since OP_CIRC can match in the middle. 4149 4150We can also consider a regex to be anchored if OP_SOM starts all its branches. 4151This is the code for \G, which means "match at start of match position, taking 4152into account the match offset". 4153 4154A branch is also implicitly anchored if it starts with .* and DOTALL is set, 4155because that will try the rest of the pattern at all possible matching points, 4156so there is no point trying again.... er .... 4157 4158.... except when the .* appears inside capturing parentheses, and there is a 4159subsequent back reference to those parentheses. We haven't enough information 4160to catch that case precisely. 4161 4162At first, the best we could do was to detect when .* was in capturing brackets 4163and the highest back reference was greater than or equal to that level. 4164However, by keeping a bitmap of the first 31 back references, we can catch some 4165of the more common cases more precisely. 4166 4167Arguments: 4168 code points to start of expression (the bracket) 4169 options points to the options setting 4170 bracket_map a bitmap of which brackets we are inside while testing; this 4171 handles up to substring 31; after that we just have to take 4172 the less precise approach 4173 backref_map the back reference bitmap 4174 4175Returns: TRUE or FALSE 4176*/ 4177 4178static BOOL 4179is_anchored(register const uschar *code, int *options, unsigned int bracket_map, 4180 unsigned int backref_map) 4181{ 4182do { 4183 const uschar *scode = 4184 first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE); 4185 register int op = *scode; 4186 4187 /* Capturing brackets */ 4188 4189 if (op > OP_BRA) 4190 { 4191 int new_map; 4192 op -= OP_BRA; 4193 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE); 4194 new_map = bracket_map | ((op < 32)? (1 << op) : 1); 4195 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE; 4196 } 4197 4198 /* Other brackets */ 4199 4200 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND) 4201 { 4202 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE; 4203 } 4204 4205 /* .* is not anchored unless DOTALL is set and it isn't in brackets that 4206 are or may be referenced. */ 4207 4208 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) && 4209 (*options & PCRE_DOTALL) != 0) 4210 { 4211 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE; 4212 } 4213 4214 /* Check for explicit anchoring */ 4215 4216 else if (op != OP_SOD && op != OP_SOM && 4217 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC)) 4218 return FALSE; 4219 code += GET(code, 1); 4220 } 4221while (*code == OP_ALT); /* Loop for each alternative */ 4222return TRUE; 4223} 4224 4225 4226 4227/************************************************* 4228* Check for starting with ^ or .* * 4229*************************************************/ 4230 4231/* This is called to find out if every branch starts with ^ or .* so that 4232"first char" processing can be done to speed things up in multiline 4233matching and for non-DOTALL patterns that start with .* (which must start at 4234the beginning or after \n). As in the case of is_anchored() (see above), we 4235have to take account of back references to capturing brackets that contain .* 4236because in that case we can't make the assumption. 4237 4238Arguments: 4239 code points to start of expression (the bracket) 4240 bracket_map a bitmap of which brackets we are inside while testing; this 4241 handles up to substring 31; after that we just have to take 4242 the less precise approach 4243 backref_map the back reference bitmap 4244 4245Returns: TRUE or FALSE 4246*/ 4247 4248static BOOL 4249is_startline(const uschar *code, unsigned int bracket_map, 4250 unsigned int backref_map) 4251{ 4252do { 4253 const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0, 4254 FALSE); 4255 register int op = *scode; 4256 4257 /* Capturing brackets */ 4258 4259 if (op > OP_BRA) 4260 { 4261 int new_map; 4262 op -= OP_BRA; 4263 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE); 4264 new_map = bracket_map | ((op < 32)? (1 << op) : 1); 4265 if (!is_startline(scode, new_map, backref_map)) return FALSE; 4266 } 4267 4268 /* Other brackets */ 4269 4270 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND) 4271 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; } 4272 4273 /* .* means "start at start or after \n" if it isn't in brackets that 4274 may be referenced. */ 4275 4276 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR) 4277 { 4278 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE; 4279 } 4280 4281 /* Check for explicit circumflex */ 4282 4283 else if (op != OP_CIRC) return FALSE; 4284 4285 /* Move on to the next alternative */ 4286 4287 code += GET(code, 1); 4288 } 4289while (*code == OP_ALT); /* Loop for each alternative */ 4290return TRUE; 4291} 4292 4293 4294 4295/************************************************* 4296* Check for asserted fixed first char * 4297*************************************************/ 4298 4299/* During compilation, the "first char" settings from forward assertions are 4300discarded, because they can cause conflicts with actual literals that follow. 4301However, if we end up without a first char setting for an unanchored pattern, 4302it is worth scanning the regex to see if there is an initial asserted first 4303char. If all branches start with the same asserted char, or with a bracket all 4304of whose alternatives start with the same asserted char (recurse ad lib), then 4305we return that char, otherwise -1. 4306 4307Arguments: 4308 code points to start of expression (the bracket) 4309 options pointer to the options (used to check casing changes) 4310 inassert TRUE if in an assertion 4311 4312Returns: -1 or the fixed first char 4313*/ 4314 4315static int 4316find_firstassertedchar(const uschar *code, int *options, BOOL inassert) 4317{ 4318register int c = -1; 4319do { 4320 int d; 4321 const uschar *scode = 4322 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE); 4323 register int op = *scode; 4324 4325 if (op >= OP_BRA) op = OP_BRA; 4326 4327 switch(op) 4328 { 4329 default: 4330 return -1; 4331 4332 case OP_BRA: 4333 case OP_ASSERT: 4334 case OP_ONCE: 4335 case OP_COND: 4336 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0) 4337 return -1; 4338 if (c < 0) c = d; else if (c != d) return -1; 4339 break; 4340 4341 case OP_EXACT: /* Fall through */ 4342 scode += 2; 4343 4344 case OP_CHAR: 4345 case OP_CHARNC: 4346 case OP_PLUS: 4347 case OP_MINPLUS: 4348 if (!inassert) return -1; 4349 if (c < 0) 4350 { 4351 c = scode[1]; 4352 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS; 4353 } 4354 else if (c != scode[1]) return -1; 4355 break; 4356 } 4357 4358 code += GET(code, 1); 4359 } 4360while (*code == OP_ALT); 4361return c; 4362} 4363 4364 4365 4366 4367#ifdef SUPPORT_UTF8 4368/************************************************* 4369* Validate a UTF-8 string * 4370*************************************************/ 4371 4372/* This function is called (optionally) at the start of compile or match, to 4373validate that a supposed UTF-8 string is actually valid. The early check means 4374that subsequent code can assume it is dealing with a valid string. The check 4375can be turned off for maximum performance, but then consequences of supplying 4376an invalid string are then undefined. 4377 4378Arguments: 4379 string points to the string 4380 length length of string, or -1 if the string is zero-terminated 4381 4382Returns: < 0 if the string is a valid UTF-8 string 4383 >= 0 otherwise; the value is the offset of the bad byte 4384*/ 4385 4386static int 4387valid_utf8(const uschar *string, int length) 4388{ 4389register const uschar *p; 4390 4391if (length < 0) 4392 { 4393 for (p = string; *p != 0; p++); 4394 length = p - string; 4395 } 4396 4397for (p = string; length-- > 0; p++) 4398 { 4399 register int ab; 4400 register int c = *p; 4401 if (c < 128) continue; 4402 if ((c & 0xc0) != 0xc0) return p - string; 4403 ab = utf8_table4[c & 0x3f]; /* Number of additional bytes */ 4404 if (length < ab) return p - string; 4405 length -= ab; 4406 4407 /* Check top bits in the second byte */ 4408 if ((*(++p) & 0xc0) != 0x80) return p - string; 4409 4410 /* Check for overlong sequences for each different length */ 4411 switch (ab) 4412 { 4413 /* Check for xx00 000x */ 4414 case 1: 4415 if ((c & 0x3e) == 0) return p - string; 4416 continue; /* We know there aren't any more bytes to check */ 4417 4418 /* Check for 1110 0000, xx0x xxxx */ 4419 case 2: 4420 if (c == 0xe0 && (*p & 0x20) == 0) return p - string; 4421 break; 4422 4423 /* Check for 1111 0000, xx00 xxxx */ 4424 case 3: 4425 if (c == 0xf0 && (*p & 0x30) == 0) return p - string; 4426 break; 4427 4428 /* Check for 1111 1000, xx00 0xxx */ 4429 case 4: 4430 if (c == 0xf8 && (*p & 0x38) == 0) return p - string; 4431 break; 4432 4433 /* Check for leading 0xfe or 0xff, and then for 1111 1100, xx00 00xx */ 4434 case 5: 4435 if (c == 0xfe || c == 0xff || 4436 (c == 0xfc && (*p & 0x3c) == 0)) return p - string; 4437 break; 4438 } 4439 4440 /* Check for valid bytes after the 2nd, if any; all must start 10 */ 4441 while (--ab > 0) 4442 { 4443 if ((*(++p) & 0xc0) != 0x80) return p - string; 4444 } 4445 } 4446 4447return -1; 4448} 4449#endif 4450 4451 4452 4453/************************************************* 4454* Compile a Regular Expression * 4455*************************************************/ 4456 4457/* This function takes a string and returns a pointer to a block of store 4458holding a compiled version of the expression. 4459 4460Arguments: 4461 pattern the regular expression 4462 options various option bits 4463 errorptr pointer to pointer to error text 4464 erroroffset ptr offset in pattern where error was detected 4465 tables pointer to character tables or NULL 4466 4467Returns: pointer to compiled data block, or NULL on error, 4468 with errorptr and erroroffset set 4469*/ 4470 4471EXPORT pcre * 4472pcre_compile(const char *pattern, int options, const char **errorptr, 4473 int *erroroffset, const unsigned char *tables) 4474{ 4475real_pcre *re; 4476int length = 1 + LINK_SIZE; /* For initial BRA plus length */ 4477int c, firstbyte, reqbyte; 4478int bracount = 0; 4479int branch_extra = 0; 4480int branch_newextra; 4481int item_count = -1; 4482int name_count = 0; 4483int max_name_size = 0; 4484int lastitemlength = 0; 4485#ifdef SUPPORT_UTF8 4486BOOL utf8; 4487BOOL class_utf8; 4488#endif 4489BOOL inescq = FALSE; 4490unsigned int brastackptr = 0; 4491size_t size; 4492uschar *code; 4493const uschar *codestart; 4494const uschar *ptr; 4495compile_data compile_block; 4496int brastack[BRASTACK_SIZE]; 4497uschar bralenstack[BRASTACK_SIZE]; 4498 4499/* We can't pass back an error message if errorptr is NULL; I guess the best we 4500can do is just return NULL. */ 4501 4502if (errorptr == NULL) return NULL; 4503*errorptr = NULL; 4504 4505/* However, we can give a message for this error */ 4506 4507if (erroroffset == NULL) 4508 { 4509 *errorptr = ERR16; 4510 return NULL; 4511 } 4512*erroroffset = 0; 4513 4514/* Can't support UTF8 unless PCRE has been compiled to include the code. */ 4515 4516#ifdef SUPPORT_UTF8 4517utf8 = (options & PCRE_UTF8) != 0; 4518if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 && 4519 (*erroroffset = valid_utf8((uschar *)pattern, -1)) >= 0) 4520 { 4521 *errorptr = ERR44; 4522 return NULL; 4523 } 4524#else 4525if ((options & PCRE_UTF8) != 0) 4526 { 4527 *errorptr = ERR32; 4528 return NULL; 4529 } 4530#endif 4531 4532if ((options & ~PUBLIC_OPTIONS) != 0) 4533 { 4534 *errorptr = ERR17; 4535 return NULL; 4536 } 4537 4538/* Set up pointers to the individual character tables */ 4539 4540if (tables == NULL) tables = pcre_default_tables; 4541compile_block.lcc = tables + lcc_offset; 4542compile_block.fcc = tables + fcc_offset; 4543compile_block.cbits = tables + cbits_offset; 4544compile_block.ctypes = tables + ctypes_offset; 4545 4546/* Maximum back reference and backref bitmap. This is updated for numeric 4547references during the first pass, but for named references during the actual 4548compile pass. The bitmap records up to 31 back references to help in deciding 4549whether (.*) can be treated as anchored or not. */ 4550 4551compile_block.top_backref = 0; 4552compile_block.backref_map = 0; 4553 4554/* Reflect pattern for debugging output */ 4555 4556DPRINTF(("------------------------------------------------------------------\n")); 4557DPRINTF(("%s\n", pattern)); 4558 4559/* The first thing to do is to make a pass over the pattern to compute the 4560amount of store required to hold the compiled code. This does not have to be 4561perfect as long as errors are overestimates. At the same time we can detect any 4562flag settings right at the start, and extract them. Make an attempt to correct 4563for any counted white space if an "extended" flag setting appears late in the 4564pattern. We can't be so clever for #-comments. */ 4565 4566ptr = (const uschar *)(pattern - 1); 4567while ((c = *(++ptr)) != 0) 4568 { 4569 int min, max; 4570 int class_optcount; 4571 int bracket_length; 4572 int duplength; 4573 4574 /* If we are inside a \Q...\E sequence, all chars are literal */ 4575 4576 if (inescq) 4577 { 4578 if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE; 4579 goto NORMAL_CHAR; 4580 } 4581 4582 /* Otherwise, first check for ignored whitespace and comments */ 4583 4584 if ((options & PCRE_EXTENDED) != 0) 4585 { 4586 if ((compile_block.ctypes[c] & ctype_space) != 0) continue; 4587 if (c == '#') 4588 { 4589 /* The space before the ; is to avoid a warning on a silly compiler 4590 on the Macintosh. */ 4591 while ((c = *(++ptr)) != 0 && c != NEWLINE) ; 4592 if (c == 0) break; 4593 continue; 4594 } 4595 } 4596 4597 item_count++; /* Is zero for the first non-comment item */ 4598 4599 /* Allow space for auto callout before every item except quantifiers. */ 4600 4601 if ((options & PCRE_AUTO_CALLOUT) != 0 && 4602 c != '*' && c != '+' && c != '?' && 4603 (c != '{' || !is_counted_repeat(ptr + 1))) 4604 length += 2 + 2*LINK_SIZE; 4605 4606 switch(c) 4607 { 4608 /* A backslashed item may be an escaped data character or it may be a 4609 character type. */ 4610 4611 case '\\': 4612 c = check_escape(&ptr, errorptr, bracount, options, FALSE); 4613 if (*errorptr != NULL) goto PCRE_ERROR_RETURN; 4614 4615 lastitemlength = 1; /* Default length of last item for repeats */ 4616 4617 if (c >= 0) /* Data character */ 4618 { 4619 length += 2; /* For a one-byte character */ 4620 4621#ifdef SUPPORT_UTF8 4622 if (utf8 && c > 127) 4623 { 4624 int i; 4625 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++) 4626 if (c <= utf8_table1[i]) break; 4627 length += i; 4628 lastitemlength += i; 4629 } 4630#endif 4631 4632 continue; 4633 } 4634 4635 /* If \Q, enter "literal" mode */ 4636 4637 if (-c == ESC_Q) 4638 { 4639 inescq = TRUE; 4640 continue; 4641 } 4642 4643 /* \X is supported only if Unicode property support is compiled */ 4644 4645#ifndef SUPPORT_UCP 4646 if (-c == ESC_X) 4647 { 4648 *errorptr = ERR45; 4649 goto PCRE_ERROR_RETURN; 4650 } 4651#endif 4652 4653 /* \P and \p are for Unicode properties, but only when the support has 4654 been compiled. Each item needs 2 bytes. */ 4655 4656 else if (-c == ESC_P || -c == ESC_p) 4657 { 4658#ifdef SUPPORT_UCP 4659 BOOL negated; 4660 length += 2; 4661 lastitemlength = 2; 4662 if (get_ucp(&ptr, &negated, errorptr) < 0) goto PCRE_ERROR_RETURN; 4663 continue; 4664#else 4665 *errorptr = ERR45; 4666 goto PCRE_ERROR_RETURN; 4667#endif 4668 } 4669 4670 /* Other escapes need one byte */ 4671 4672 length++; 4673 4674 /* A back reference needs an additional 2 bytes, plus either one or 5 4675 bytes for a repeat. We also need to keep the value of the highest 4676 back reference. */ 4677 4678 if (c <= -ESC_REF) 4679 { 4680 int refnum = -c - ESC_REF; 4681 compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1; 4682 if (refnum > compile_block.top_backref) 4683 compile_block.top_backref = refnum; 4684 length += 2; /* For single back reference */ 4685 if (ptr[1] == '{' && is_counted_repeat(ptr+2)) 4686 { 4687 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr); 4688 if (*errorptr != NULL) goto PCRE_ERROR_RETURN; 4689 if ((min == 0 && (max == 1 || max == -1)) || 4690 (min == 1 && max == -1)) 4691 length++; 4692 else length += 5; 4693 if (ptr[1] == '?') ptr++; 4694 } 4695 } 4696 continue; 4697 4698 case '^': /* Single-byte metacharacters */ 4699 case '.': 4700 case '$': 4701 length++; 4702 lastitemlength = 1; 4703 continue; 4704 4705 case '*': /* These repeats won't be after brackets; */ 4706 case '+': /* those are handled separately */ 4707 case '?': 4708 length++; 4709 goto POSESSIVE; /* A few lines below */ 4710 4711 /* This covers the cases of braced repeats after a single char, metachar, 4712 class, or back reference. */ 4713 4714 case '{': 4715 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR; 4716 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr); 4717 if (*errorptr != NULL) goto PCRE_ERROR_RETURN; 4718 4719 /* These special cases just insert one extra opcode */ 4720 4721 if ((min == 0 && (max == 1 || max == -1)) || 4722 (min == 1 && max == -1)) 4723 length++; 4724 4725 /* These cases might insert additional copies of a preceding character. */ 4726 4727 else 4728 { 4729 if (min != 1) 4730 { 4731 length -= lastitemlength; /* Uncount the original char or metachar */ 4732 if (min > 0) length += 3 + lastitemlength; 4733 } 4734 length += lastitemlength + ((max > 0)? 3 : 1); 4735 } 4736 4737 if (ptr[1] == '?') ptr++; /* Needs no extra length */ 4738 4739 POSESSIVE: /* Test for possessive quantifier */ 4740 if (ptr[1] == '+') 4741 { 4742 ptr++; 4743 length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */ 4744 } 4745 continue; 4746 4747 /* An alternation contains an offset to the next branch or ket. If any ims 4748 options changed in the previous branch(es), and/or if we are in a 4749 lookbehind assertion, extra space will be needed at the start of the 4750 branch. This is handled by branch_extra. */ 4751 4752 case '|': 4753 length += 1 + LINK_SIZE + branch_extra; 4754 continue; 4755 4756 /* A character class uses 33 characters provided that all the character 4757 values are less than 256. Otherwise, it uses a bit map for low valued 4758 characters, and individual items for others. Don't worry about character 4759 types that aren't allowed in classes - they'll get picked up during the 4760 compile. A character class that contains only one single-byte character 4761 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this 4762 where we can. (In UTF-8 mode we can do this only for chars < 128.) */ 4763 4764 case '[': 4765 if (*(++ptr) == '^') 4766 { 4767 class_optcount = 10; /* Greater than one */ 4768 ptr++; 4769 } 4770 else class_optcount = 0; 4771 4772#ifdef SUPPORT_UTF8 4773 class_utf8 = FALSE; 4774#endif 4775 4776 /* Written as a "do" so that an initial ']' is taken as data */ 4777 4778 if (*ptr != 0) do 4779 { 4780 /* Inside \Q...\E everything is literal except \E */ 4781 4782 if (inescq) 4783 { 4784 if (*ptr != '\\' || ptr[1] != 'E') goto GET_ONE_CHARACTER; 4785 inescq = FALSE; 4786 ptr += 1; 4787 continue; 4788 } 4789 4790 /* Outside \Q...\E, check for escapes */ 4791 4792 if (*ptr == '\\') 4793 { 4794 c = check_escape(&ptr, errorptr, bracount, options, TRUE); 4795 if (*errorptr != NULL) goto PCRE_ERROR_RETURN; 4796 4797 /* \b is backspace inside a class; \X is literal */ 4798 4799 if (-c == ESC_b) c = '\b'; 4800 else if (-c == ESC_X) c = 'X'; 4801 4802 /* \Q enters quoting mode */ 4803 4804 else if (-c == ESC_Q) 4805 { 4806 inescq = TRUE; 4807 continue; 4808 } 4809 4810 /* Handle escapes that turn into characters */ 4811 4812 if (c >= 0) goto NON_SPECIAL_CHARACTER; 4813 4814 /* Escapes that are meta-things. The normal ones just affect the 4815 bit map, but Unicode properties require an XCLASS extended item. */ 4816 4817 else 4818 { 4819 class_optcount = 10; /* \d, \s etc; make sure > 1 */ 4820#ifdef SUPPORT_UTF8 4821 if (-c == ESC_p || -c == ESC_P) 4822 { 4823 if (!class_utf8) 4824 { 4825 class_utf8 = TRUE; 4826 length += LINK_SIZE + 2; 4827 } 4828 length += 2; 4829 } 4830#endif 4831 } 4832 } 4833 4834 /* Check the syntax for POSIX stuff. The bits we actually handle are 4835 checked during the real compile phase. */ 4836 4837 else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block)) 4838 { 4839 ptr++; 4840 class_optcount = 10; /* Make sure > 1 */ 4841 } 4842 4843 /* Anything else increments the possible optimization count. We have to 4844 detect ranges here so that we can compute the number of extra ranges for 4845 caseless wide characters when UCP support is available. If there are wide 4846 characters, we are going to have to use an XCLASS, even for single 4847 characters. */ 4848 4849 else 4850 { 4851 int d; 4852 4853 GET_ONE_CHARACTER: 4854 4855#ifdef SUPPORT_UTF8 4856 if (utf8) 4857 { 4858 int extra = 0; 4859 GETCHARLEN(c, ptr, extra); 4860 ptr += extra; 4861 } 4862 else c = *ptr; 4863#else 4864 c = *ptr; 4865#endif 4866 4867 /* Come here from handling \ above when it escapes to a char value */ 4868 4869 NON_SPECIAL_CHARACTER: 4870 class_optcount++; 4871 4872 d = -1; 4873 if (ptr[1] == '-') 4874 { 4875 uschar const *hyptr = ptr++; 4876 if (ptr[1] == '\\') 4877 { 4878 ptr++; 4879 d = check_escape(&ptr, errorptr, bracount, options, TRUE); 4880 if (*errorptr != NULL) goto PCRE_ERROR_RETURN; 4881 if (-d == ESC_b) d = '\b'; /* backspace */ 4882 else if (-d == ESC_X) d = 'X'; /* literal X in a class */ 4883 } 4884 else if (ptr[1] != 0 && ptr[1] != ']') 4885 { 4886 ptr++; 4887#ifdef SUPPORT_UTF8 4888 if (utf8) 4889 { 4890 int extra = 0; 4891 GETCHARLEN(d, ptr, extra); 4892 ptr += extra; 4893 } 4894 else 4895#endif 4896 d = *ptr; 4897 } 4898 if (d < 0) ptr = hyptr; /* go back to hyphen as data */ 4899 } 4900 4901 /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or > 4902 127 for caseless matching, we will need to use an XCLASS. */ 4903 4904 if (d >= 0) 4905 { 4906 class_optcount = 10; /* Ensure > 1 */ 4907 if (d < c) 4908 { 4909 *errorptr = ERR8; 4910 goto PCRE_ERROR_RETURN; 4911 } 4912 4913#ifdef SUPPORT_UTF8 4914 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127))) 4915 { 4916 uschar buffer[6]; 4917 if (!class_utf8) /* Allow for XCLASS overhead */ 4918 { 4919 class_utf8 = TRUE; 4920 length += LINK_SIZE + 2; 4921 } 4922 4923#ifdef SUPPORT_UCP 4924 /* If we have UCP support, find out how many extra ranges are 4925 needed to map the other case of characters within this range. We 4926 have to mimic the range optimization here, because extending the 4927 range upwards might push d over a boundary that makes is use 4928 another byte in the UTF-8 representation. */ 4929 4930 if ((options & PCRE_CASELESS) != 0) 4931 { 4932 int occ, ocd; 4933 int cc = c; 4934 int origd = d; 4935 while (get_othercase_range(&cc, origd, &occ, &ocd)) 4936 { 4937 if (occ >= c && ocd <= d) continue; /* Skip embedded */ 4938 4939 if (occ < c && ocd >= c - 1) /* Extend the basic range */ 4940 { /* if there is overlap, */ 4941 c = occ; /* noting that if occ < c */ 4942 continue; /* we can't have ocd > d */ 4943 } /* because a subrange is */ 4944 if (ocd > d && occ <= d + 1) /* always shorter than */ 4945 { /* the basic range. */ 4946 d = ocd; 4947 continue; 4948 } 4949 4950 /* An extra item is needed */ 4951 4952 length += 1 + ord2utf8(occ, buffer) + 4953 ((occ == ocd)? 0 : ord2utf8(ocd, buffer)); 4954 } 4955 } 4956#endif /* SUPPORT_UCP */ 4957 4958 /* The length of the (possibly extended) range */ 4959 4960 length += 1 + ord2utf8(c, buffer) + ord2utf8(d, buffer); 4961 } 4962#endif /* SUPPORT_UTF8 */ 4963 4964 } 4965 4966 /* We have a single character. There is nothing to be done unless we 4967 are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must 4968 allow for an XCL_SINGLE item, doubled for caselessness if there is UCP 4969 support. */ 4970 4971 else 4972 { 4973#ifdef SUPPORT_UTF8 4974 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127))) 4975 { 4976 uschar buffer[6]; 4977 class_optcount = 10; /* Ensure > 1 */ 4978 if (!class_utf8) /* Allow for XCLASS overhead */ 4979 { 4980 class_utf8 = TRUE; 4981 length += LINK_SIZE + 2; 4982 } 4983#ifdef SUPPORT_UCP 4984 length += (((options & PCRE_CASELESS) != 0)? 2 : 1) * 4985 (1 + ord2utf8(c, buffer)); 4986#else /* SUPPORT_UCP */ 4987 length += 1 + ord2utf8(c, buffer); 4988#endif /* SUPPORT_UCP */ 4989 } 4990#endif /* SUPPORT_UTF8 */ 4991 } 4992 } 4993 } 4994 while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */ 4995 4996 if (*ptr == 0) /* Missing terminating ']' */ 4997 { 4998 *errorptr = ERR6; 4999 goto PCRE_ERROR_RETURN; 5000 } 5001 5002 /* We can optimize when there was only one optimizable character. Repeats 5003 for positive and negated single one-byte chars are handled by the general 5004 code. Here, we handle repeats for the class opcodes. */ 5005 5006 if (class_optcount == 1) length += 3; else 5007 { 5008 length += 33; 5009 5010 /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier, 5011 we also need extra for wrapping the whole thing in a sub-pattern. */ 5012 5013 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2)) 5014 { 5015 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr); 5016 if (*errorptr != NULL) goto PCRE_ERROR_RETURN; 5017 if ((min == 0 && (max == 1 || max == -1)) || 5018 (min == 1 && max == -1)) 5019 length++; 5020 else length += 5; 5021 if (ptr[1] == '+') 5022 { 5023 ptr++; 5024 length += 2 + 2*LINK_SIZE; 5025 } 5026 else if (ptr[1] == '?') ptr++; 5027 } 5028 } 5029 continue; 5030 5031 /* Brackets may be genuine groups or special things */ 5032 5033 case '(': 5034 branch_newextra = 0; 5035 bracket_length = 1 + LINK_SIZE; 5036 5037 /* Handle special forms of bracket, which all start (? */ 5038 5039 if (ptr[1] == '?') 5040 { 5041 int set, unset; 5042 int *optset; 5043 5044 switch (c = ptr[2]) 5045 { 5046 /* Skip over comments entirely */ 5047 case '#': 5048 ptr += 3; 5049 while (*ptr != 0 && *ptr != ')') ptr++; 5050 if (*ptr == 0) 5051 { 5052 *errorptr = ERR18; 5053 goto PCRE_ERROR_RETURN; 5054 } 5055 continue; 5056 5057 /* Non-referencing groups and lookaheads just move the pointer on, and 5058 then behave like a non-special bracket, except that they don't increment 5059 the count of extracting brackets. Ditto for the "once only" bracket, 5060 which is in Perl from version 5.005. */ 5061 5062 case ':': 5063 case '=': 5064 case '!': 5065 case '>': 5066 ptr += 2; 5067 break; 5068 5069 /* (?R) specifies a recursive call to the regex, which is an extension 5070 to provide the facility which can be obtained by (?p{perl-code}) in 5071 Perl 5.6. In Perl 5.8 this has become (??{perl-code}). 5072 5073 From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to 5074 the appropriate numbered brackets. This includes both recursive and 5075 non-recursive calls. (?R) is now synonymous with (?0). */ 5076 5077 case 'R': 5078 ptr++; 5079 5080 case '0': case '1': case '2': case '3': case '4': 5081 case '5': case '6': case '7': case '8': case '9': 5082 ptr += 2; 5083 if (c != 'R') 5084 while ((digitab[*(++ptr)] & ctype_digit) != 0); 5085 if (*ptr != ')') 5086 { 5087 *errorptr = ERR29; 5088 goto PCRE_ERROR_RETURN; 5089 } 5090 length += 1 + LINK_SIZE; 5091 5092 /* If this item is quantified, it will get wrapped inside brackets so 5093 as to use the code for quantified brackets. We jump down and use the 5094 code that handles this for real brackets. */ 5095 5096 if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{') 5097 { 5098 length += 2 + 2 * LINK_SIZE; /* to make bracketed */ 5099 duplength = 5 + 3 * LINK_SIZE; 5100 goto HANDLE_QUANTIFIED_BRACKETS; 5101 } 5102 continue; 5103 5104 /* (?C) is an extension which provides "callout" - to provide a bit of 5105 the functionality of the Perl (?{...}) feature. An optional number may 5106 follow (default is zero). */ 5107 5108 case 'C': 5109 ptr += 2; 5110 while ((digitab[*(++ptr)] & ctype_digit) != 0); 5111 if (*ptr != ')') 5112 { 5113 *errorptr = ERR39; 5114 goto PCRE_ERROR_RETURN; 5115 } 5116 length += 2 + 2*LINK_SIZE; 5117 continue; 5118 5119 /* Named subpatterns are an extension copied from Python */ 5120 5121 case 'P': 5122 ptr += 3; 5123 if (*ptr == '<') 5124 { 5125 const uschar *p; /* Don't amalgamate; some compilers */ 5126 p = ++ptr; /* grumble at autoincrement in declaration */ 5127 while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++; 5128 if (*ptr != '>') 5129 { 5130 *errorptr = ERR42; 5131 goto PCRE_ERROR_RETURN; 5132 } 5133 name_count++; 5134 if (ptr - p > max_name_size) max_name_size = (ptr - p); 5135 break; 5136 } 5137 5138 if (*ptr == '=' || *ptr == '>') 5139 { 5140 while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0); 5141 if (*ptr != ')') 5142 { 5143 *errorptr = ERR42; 5144 goto PCRE_ERROR_RETURN; 5145 } 5146 break; 5147 } 5148 5149 /* Unknown character after (?P */ 5150 5151 *errorptr = ERR41; 5152 goto PCRE_ERROR_RETURN; 5153 5154 /* Lookbehinds are in Perl from version 5.005 */ 5155 5156 case '<': 5157 ptr += 3; 5158 if (*ptr == '=' || *ptr == '!') 5159 { 5160 branch_newextra = 1 + LINK_SIZE; 5161 length += 1 + LINK_SIZE; /* For the first branch */ 5162 break; 5163 } 5164 *errorptr = ERR24; 5165 goto PCRE_ERROR_RETURN; 5166 5167 /* Conditionals are in Perl from version 5.005. The bracket must either 5168 be followed by a number (for bracket reference) or by an assertion 5169 group, or (a PCRE extension) by 'R' for a recursion test. */ 5170 5171 case '(': 5172 if (ptr[3] == 'R' && ptr[4] == ')') 5173 { 5174 ptr += 4; 5175 length += 3; 5176 } 5177 else if ((digitab[ptr[3]] & ctype_digit) != 0) 5178 { 5179 ptr += 4; 5180 length += 3; 5181 while ((digitab[*ptr] & ctype_digit) != 0) ptr++; 5182 if (*ptr != ')') 5183 { 5184 *errorptr = ERR26; 5185 goto PCRE_ERROR_RETURN; 5186 } 5187 } 5188 else /* An assertion must follow */ 5189 { 5190 ptr++; /* Can treat like ':' as far as spacing is concerned */ 5191 if (ptr[2] != '?' || 5192 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') ) 5193 { 5194 ptr += 2; /* To get right offset in message */ 5195 *errorptr = ERR28; 5196 goto PCRE_ERROR_RETURN; 5197 } 5198 } 5199 break; 5200 5201 /* Else loop checking valid options until ) is met. Anything else is an 5202 error. If we are without any brackets, i.e. at top level, the settings 5203 act as if specified in the options, so massage the options immediately. 5204 This is for backward compatibility with Perl 5.004. */ 5205 5206 default: 5207 set = unset = 0; 5208 optset = &set; 5209 ptr += 2; 5210 5211 for (;; ptr++) 5212 { 5213 c = *ptr; 5214 switch (c) 5215 { 5216 case 'i': 5217 *optset |= PCRE_CASELESS; 5218 continue; 5219 5220 case 'm': 5221 *optset |= PCRE_MULTILINE; 5222 continue; 5223 5224 case 's': 5225 *optset |= PCRE_DOTALL; 5226 continue; 5227 5228 case 'x': 5229 *optset |= PCRE_EXTENDED; 5230 continue; 5231 5232 case 'X': 5233 *optset |= PCRE_EXTRA; 5234 continue; 5235 5236 case 'U': 5237 *optset |= PCRE_UNGREEDY; 5238 continue; 5239 5240 case '-': 5241 optset = &unset; 5242 continue; 5243 5244 /* A termination by ')' indicates an options-setting-only item; if 5245 this is at the very start of the pattern (indicated by item_count 5246 being zero), we use it to set the global options. This is helpful 5247 when analyzing the pattern for first characters, etc. Otherwise 5248 nothing is done here and it is handled during the compiling 5249 process. 5250 5251 [Historical note: Up to Perl 5.8, options settings at top level 5252 were always global settings, wherever they appeared in the pattern. 5253 That is, they were equivalent to an external setting. From 5.8 5254 onwards, they apply only to what follows (which is what you might 5255 expect).] */ 5256 5257 case ')': 5258 if (item_count == 0) 5259 { 5260 options = (options | set) & (~unset); 5261 set = unset = 0; /* To save length */ 5262 item_count--; /* To allow for several */ 5263 } 5264 5265 /* Fall through */ 5266 5267 /* A termination by ':' indicates the start of a nested group with 5268 the given options set. This is again handled at compile time, but 5269 we must allow for compiled space if any of the ims options are 5270 set. We also have to allow for resetting space at the end of 5271 the group, which is why 4 is added to the length and not just 2. 5272 If there are several changes of options within the same group, this 5273 will lead to an over-estimate on the length, but this shouldn't 5274 matter very much. We also have to allow for resetting options at 5275 the start of any alternations, which we do by setting 5276 branch_newextra to 2. Finally, we record whether the case-dependent 5277 flag ever changes within the regex. This is used by the "required 5278 character" code. */ 5279 5280 case ':': 5281 if (((set|unset) & PCRE_IMS) != 0) 5282 { 5283 length += 4; 5284 branch_newextra = 2; 5285 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED; 5286 } 5287 goto END_OPTIONS; 5288 5289 /* Unrecognized option character */ 5290 5291 default: 5292 *errorptr = ERR12; 5293 goto PCRE_ERROR_RETURN; 5294 } 5295 } 5296 5297 /* If we hit a closing bracket, that's it - this is a freestanding 5298 option-setting. We need to ensure that branch_extra is updated if 5299 necessary. The only values branch_newextra can have here are 0 or 2. 5300 If the value is 2, then branch_extra must either be 2 or 5, depending 5301 on whether this is a lookbehind group or not. */ 5302 5303 END_OPTIONS: 5304 if (c == ')') 5305 { 5306 if (branch_newextra == 2 && 5307 (branch_extra == 0 || branch_extra == 1+LINK_SIZE)) 5308 branch_extra += branch_newextra; 5309 continue; 5310 } 5311 5312 /* If options were terminated by ':' control comes here. Fall through 5313 to handle the group below. */ 5314 } 5315 } 5316 5317 /* Extracting brackets must be counted so we can process escapes in a 5318 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to 5319 need an additional 3 bytes of store per extracting bracket. However, if 5320 PCRE_NO_AUTO)CAPTURE is set, unadorned brackets become non-capturing, so we 5321 must leave the count alone (it will aways be zero). */ 5322 5323 else if ((options & PCRE_NO_AUTO_CAPTURE) == 0) 5324 { 5325 bracount++; 5326 if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3; 5327 } 5328 5329 /* Save length for computing whole length at end if there's a repeat that 5330 requires duplication of the group. Also save the current value of 5331 branch_extra, and start the new group with the new value. If non-zero, this 5332 will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */ 5333 5334 if (brastackptr >= sizeof(brastack)/sizeof(int)) 5335 { 5336 *errorptr = ERR19; 5337 goto PCRE_ERROR_RETURN; 5338 } 5339 5340 bralenstack[brastackptr] = branch_extra; 5341 branch_extra = branch_newextra; 5342 5343 brastack[brastackptr++] = length; 5344 length += bracket_length; 5345 continue; 5346 5347 /* Handle ket. Look for subsequent max/min; for certain sets of values we 5348 have to replicate this bracket up to that many times. If brastackptr is 5349 0 this is an unmatched bracket which will generate an error, but take care 5350 not to try to access brastack[-1] when computing the length and restoring 5351 the branch_extra value. */ 5352 5353 case ')': 5354 length += 1 + LINK_SIZE; 5355 if (brastackptr > 0) 5356 { 5357 duplength = length - brastack[--brastackptr]; 5358 branch_extra = bralenstack[brastackptr]; 5359 } 5360 else duplength = 0; 5361 5362 /* The following code is also used when a recursion such as (?3) is 5363 followed by a quantifier, because in that case, it has to be wrapped inside 5364 brackets so that the quantifier works. The value of duplength must be 5365 set before arrival. */ 5366 5367 HANDLE_QUANTIFIED_BRACKETS: 5368 5369 /* Leave ptr at the final char; for read_repeat_counts this happens 5370 automatically; for the others we need an increment. */ 5371 5372 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2)) 5373 { 5374 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr); 5375 if (*errorptr != NULL) goto PCRE_ERROR_RETURN; 5376 } 5377 else if (c == '*') { min = 0; max = -1; ptr++; } 5378 else if (c == '+') { min = 1; max = -1; ptr++; } 5379 else if (c == '?') { min = 0; max = 1; ptr++; } 5380 else { min = 1; max = 1; } 5381 5382 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the 5383 group, and if the maximum is greater than zero, we have to replicate 5384 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting 5385 bracket set. */ 5386 5387 if (min == 0) 5388 { 5389 length++; 5390 if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE); 5391 } 5392 5393 /* When the minimum is greater than zero, we have to replicate up to 5394 minval-1 times, with no additions required in the copies. Then, if there 5395 is a limited maximum we have to replicate up to maxval-1 times allowing 5396 for a BRAZERO item before each optional copy and nesting brackets for all 5397 but one of the optional copies. */ 5398 5399 else 5400 { 5401 length += (min - 1) * duplength; 5402 if (max > min) /* Need this test as max=-1 means no limit */ 5403 length += (max - min) * (duplength + 3 + 2*LINK_SIZE) 5404 - (2 + 2*LINK_SIZE); 5405 } 5406 5407 /* Allow space for once brackets for "possessive quantifier" */ 5408 5409 if (ptr[1] == '+') 5410 { 5411 ptr++; 5412 length += 2 + 2*LINK_SIZE; 5413 } 5414 continue; 5415 5416 /* Non-special character. It won't be space or # in extended mode, so it is 5417 always a genuine character. If we are in a \Q...\E sequence, check for the 5418 end; if not, we have a literal. */ 5419 5420 default: 5421 NORMAL_CHAR: 5422 5423 if (inescq && c == '\\' && ptr[1] == 'E') 5424 { 5425 inescq = FALSE; 5426 ptr++; 5427 continue; 5428 } 5429 5430 length += 2; /* For a one-byte character */ 5431 lastitemlength = 1; /* Default length of last item for repeats */ 5432 5433 /* In UTF-8 mode, check for additional bytes. */ 5434 5435#ifdef SUPPORT_UTF8 5436 if (utf8 && (c & 0xc0) == 0xc0) 5437 { 5438 while ((ptr[1] & 0xc0) == 0x80) /* Can't flow over the end */ 5439 { /* because the end is marked */ 5440 lastitemlength++; /* by a zero byte. */ 5441 length++; 5442 ptr++; 5443 } 5444 } 5445#endif 5446 5447 continue; 5448 } 5449 } 5450 5451length += 2 + LINK_SIZE; /* For final KET and END */ 5452 5453if ((options & PCRE_AUTO_CALLOUT) != 0) 5454 length += 2 + 2*LINK_SIZE; /* For final callout */ 5455 5456if (length > MAX_PATTERN_SIZE) 5457 { 5458 *errorptr = ERR20; 5459 return NULL; 5460 } 5461 5462/* Compute the size of data block needed and get it, either from malloc or 5463externally provided function. */ 5464 5465size = length + sizeof(real_pcre) + name_count * (max_name_size + 3); 5466re = (real_pcre *)(pcre_malloc)(size); 5467 5468if (re == NULL) 5469 { 5470 *errorptr = ERR21; 5471 return NULL; 5472 } 5473 5474/* Put in the magic number, and save the sizes, options, and character table 5475pointer. NULL is used for the default character tables. The nullpad field is at 5476the end; it's there to help in the case when a regex compiled on a system with 54774-byte pointers is run on another with 8-byte pointers. */ 5478 5479re->magic_number = MAGIC_NUMBER; 5480re->size = size; 5481re->options = options; 5482re->dummy1 = re->dummy2 = 0; 5483re->name_table_offset = sizeof(real_pcre); 5484re->name_entry_size = max_name_size + 3; 5485re->name_count = name_count; 5486re->tables = (tables == pcre_default_tables)? NULL : tables; 5487re->nullpad = NULL; 5488 5489/* The starting points of the name/number translation table and of the code are 5490passed around in the compile data block. */ 5491 5492compile_block.names_found = 0; 5493compile_block.name_entry_size = max_name_size + 3; 5494compile_block.name_table = (uschar *)re + re->name_table_offset; 5495codestart = compile_block.name_table + re->name_entry_size * re->name_count; 5496compile_block.start_code = codestart; 5497compile_block.start_pattern = (const uschar *)pattern; 5498compile_block.req_varyopt = 0; 5499compile_block.nopartial = FALSE; 5500 5501/* Set up a starting, non-extracting bracket, then compile the expression. On 5502error, *errorptr will be set non-NULL, so we don't need to look at the result 5503of the function here. */ 5504 5505ptr = (const uschar *)pattern; 5506code = (uschar *)codestart; 5507*code = OP_BRA; 5508bracount = 0; 5509(void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr, 5510 errorptr, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block); 5511re->top_bracket = bracount; 5512re->top_backref = compile_block.top_backref; 5513 5514if (compile_block.nopartial) re->options |= PCRE_NOPARTIAL; 5515 5516/* If not reached end of pattern on success, there's an excess bracket. */ 5517 5518if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22; 5519 5520/* Fill in the terminating state and check for disastrous overflow, but 5521if debugging, leave the test till after things are printed out. */ 5522 5523*code++ = OP_END; 5524 5525#ifndef DEBUG 5526if (code - codestart > length) *errorptr = ERR23; 5527#endif 5528 5529/* Give an error if there's back reference to a non-existent capturing 5530subpattern. */ 5531 5532if (re->top_backref > re->top_bracket) *errorptr = ERR15; 5533 5534/* Failed to compile, or error while post-processing */ 5535 5536if (*errorptr != NULL) 5537 { 5538 (pcre_free)(re); 5539 PCRE_ERROR_RETURN: 5540 *erroroffset = ptr - (const uschar *)pattern; 5541 return NULL; 5542 } 5543 5544/* If the anchored option was not passed, set the flag if we can determine that 5545the pattern is anchored by virtue of ^ characters or \A or anything else (such 5546as starting with .* when DOTALL is set). 5547 5548Otherwise, if we know what the first character has to be, save it, because that 5549speeds up unanchored matches no end. If not, see if we can set the 5550PCRE_STARTLINE flag. This is helpful for multiline matches when all branches 5551start with ^. and also when all branches start with .* for non-DOTALL matches. 5552*/ 5553 5554if ((options & PCRE_ANCHORED) == 0) 5555 { 5556 int temp_options = options; 5557 if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map)) 5558 re->options |= PCRE_ANCHORED; 5559 else 5560 { 5561 if (firstbyte < 0) 5562 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE); 5563 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */ 5564 { 5565 int ch = firstbyte & 255; 5566 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 && 5567 compile_block.fcc[ch] == ch)? ch : firstbyte; 5568 re->options |= PCRE_FIRSTSET; 5569 } 5570 else if (is_startline(codestart, 0, compile_block.backref_map)) 5571 re->options |= PCRE_STARTLINE; 5572 } 5573 } 5574 5575/* For an anchored pattern, we use the "required byte" only if it follows a 5576variable length item in the regex. Remove the caseless flag for non-caseable 5577bytes. */ 5578 5579if (reqbyte >= 0 && 5580 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0)) 5581 { 5582 int ch = reqbyte & 255; 5583 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 && 5584 compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte; 5585 re->options |= PCRE_REQCHSET; 5586 } 5587 5588/* Print out the compiled data for debugging */ 5589 5590#ifdef DEBUG 5591 5592printf("Length = %d top_bracket = %d top_backref = %d\n", 5593 length, re->top_bracket, re->top_backref); 5594 5595if (re->options != 0) 5596 { 5597 printf("%s%s%s%s%s%s%s%s%s%s\n", 5598 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "", 5599 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "", 5600 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "", 5601 ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "", 5602 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "", 5603 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "", 5604 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "", 5605 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "", 5606 ((re->options & PCRE_EXTRA) != 0)? "extra " : "", 5607 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : ""); 5608 } 5609 5610if ((re->options & PCRE_FIRSTSET) != 0) 5611 { 5612 int ch = re->first_byte & 255; 5613 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)"; 5614 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless); 5615 else printf("First char = \\x%02x%s\n", ch, caseless); 5616 } 5617 5618if ((re->options & PCRE_REQCHSET) != 0) 5619 { 5620 int ch = re->req_byte & 255; 5621 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)"; 5622 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless); 5623 else printf("Req char = \\x%02x%s\n", ch, caseless); 5624 } 5625 5626print_internals(re, stdout); 5627 5628/* This check is done here in the debugging case so that the code that 5629was compiled can be seen. */ 5630 5631if (code - codestart > length) 5632 { 5633 *errorptr = ERR23; 5634 (pcre_free)(re); 5635 *erroroffset = ptr - (uschar *)pattern; 5636 return NULL; 5637 } 5638#endif 5639 5640return (pcre *)re; 5641} 5642 5643 5644 5645/************************************************* 5646* Match a back-reference * 5647*************************************************/ 5648 5649/* If a back reference hasn't been set, the length that is passed is greater 5650than the number of characters left in the string, so the match fails. 5651 5652Arguments: 5653 offset index into the offset vector 5654 eptr points into the subject 5655 length length to be matched 5656 md points to match data block 5657 ims the ims flags 5658 5659Returns: TRUE if matched 5660*/ 5661 5662static BOOL 5663match_ref(int offset, register const uschar *eptr, int length, match_data *md, 5664 unsigned long int ims) 5665{ 5666const uschar *p = md->start_subject + md->offset_vector[offset]; 5667 5668#ifdef DEBUG 5669if (eptr >= md->end_subject) 5670 printf("matching subject <null>"); 5671else 5672 { 5673 printf("matching subject "); 5674 pchars(eptr, length, TRUE, md); 5675 } 5676printf(" against backref "); 5677pchars(p, length, FALSE, md); 5678printf("\n"); 5679#endif 5680 5681/* Always fail if not enough characters left */ 5682 5683if (length > md->end_subject - eptr) return FALSE; 5684 5685/* Separate the caselesss case for speed */ 5686 5687if ((ims & PCRE_CASELESS) != 0) 5688 { 5689 while (length-- > 0) 5690 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; 5691 } 5692else 5693 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; } 5694 5695return TRUE; 5696} 5697 5698 5699#ifdef SUPPORT_UTF8 5700/************************************************* 5701* Match character against an XCLASS * 5702*************************************************/ 5703 5704/* This function is called from within the XCLASS code below, to match a 5705character against an extended class which might match values > 255. 5706 5707Arguments: 5708 c the character 5709 data points to the flag byte of the XCLASS data 5710 5711Returns: TRUE if character matches, else FALSE 5712*/ 5713 5714static BOOL 5715match_xclass(int c, const uschar *data) 5716{ 5717int t; 5718BOOL negated = (*data & XCL_NOT) != 0; 5719 5720/* Character values < 256 are matched against a bitmap, if one is present. If 5721not, we still carry on, because there may be ranges that start below 256 in the 5722additional data. */ 5723 5724if (c < 256) 5725 { 5726 if ((*data & XCL_MAP) != 0 && (data[1 + c/8] & (1 << (c&7))) != 0) 5727 return !negated; /* char found */ 5728 } 5729 5730/* First skip the bit map if present. Then match against the list of Unicode 5731properties or large chars or ranges that end with a large char. We won't ever 5732encounter XCL_PROP or XCL_NOTPROP when UCP support is not compiled. */ 5733 5734if ((*data++ & XCL_MAP) != 0) data += 32; 5735 5736while ((t = *data++) != XCL_END) 5737 { 5738 int x, y; 5739 if (t == XCL_SINGLE) 5740 { 5741 GETCHARINC(x, data); 5742 if (c == x) return !negated; 5743 } 5744 else if (t == XCL_RANGE) 5745 { 5746 GETCHARINC(x, data); 5747 GETCHARINC(y, data); 5748 if (c >= x && c <= y) return !negated; 5749 } 5750 5751#ifdef SUPPORT_UCP 5752 else /* XCL_PROP & XCL_NOTPROP */ 5753 { 5754 int chartype, othercase; 5755 int rqdtype = *data++; 5756 int category = ucp_findchar(c, &chartype, &othercase); 5757 if (rqdtype >= 128) 5758 { 5759 if ((rqdtype - 128 == category) == (t == XCL_PROP)) return !negated; 5760 } 5761 else 5762 { 5763 if ((rqdtype == chartype) == (t == XCL_PROP)) return !negated; 5764 } 5765 } 5766#endif /* SUPPORT_UCP */ 5767 } 5768 5769return negated; /* char did not match */ 5770} 5771#endif 5772 5773 5774/*************************************************************************** 5775**************************************************************************** 5776 RECURSION IN THE match() FUNCTION 5777 5778The match() function is highly recursive. Some regular expressions can cause 5779it to recurse thousands of times. I was writing for Unix, so I just let it 5780call itself recursively. This uses the stack for saving everything that has 5781to be saved for a recursive call. On Unix, the stack can be large, and this 5782works fine. 5783 5784It turns out that on non-Unix systems there are problems with programs that 5785use a lot of stack. (This despite the fact that every last chip has oodles 5786of memory these days, and techniques for extending the stack have been known 5787for decades.) So.... 5788 5789There is a fudge, triggered by defining NO_RECURSE, which avoids recursive 5790calls by keeping local variables that need to be preserved in blocks of memory 5791obtained from malloc instead instead of on the stack. Macros are used to 5792achieve this so that the actual code doesn't look very different to what it 5793always used to. 5794**************************************************************************** 5795***************************************************************************/ 5796 5797 5798/* These versions of the macros use the stack, as normal */ 5799 5800#ifndef NO_RECURSE 5801#define REGISTER register 5802#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) rx = match(ra,rb,rc,rd,re,rf,rg) 5803#define RRETURN(ra) return ra 5804#else 5805 5806 5807/* These versions of the macros manage a private stack on the heap. Note 5808that the rd argument of RMATCH isn't actually used. It's the md argument of 5809match(), which never changes. */ 5810 5811#define REGISTER 5812 5813#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg)\ 5814 {\ 5815 heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\ 5816 if (setjmp(frame->Xwhere) == 0)\ 5817 {\ 5818 newframe->Xeptr = ra;\ 5819 newframe->Xecode = rb;\ 5820 newframe->Xoffset_top = rc;\ 5821 newframe->Xims = re;\ 5822 newframe->Xeptrb = rf;\ 5823 newframe->Xflags = rg;\ 5824 newframe->Xprevframe = frame;\ 5825 frame = newframe;\ 5826 DPRINTF(("restarting from line %d\n", __LINE__));\ 5827 goto HEAP_RECURSE;\ 5828 }\ 5829 else\ 5830 {\ 5831 DPRINTF(("longjumped back to line %d\n", __LINE__));\ 5832 frame = md->thisframe;\ 5833 rx = frame->Xresult;\ 5834 }\ 5835 } 5836 5837#define RRETURN(ra)\ 5838 {\ 5839 heapframe *newframe = frame;\ 5840 frame = newframe->Xprevframe;\ 5841 (pcre_stack_free)(newframe);\ 5842 if (frame != NULL)\ 5843 {\ 5844 frame->Xresult = ra;\ 5845 md->thisframe = frame;\ 5846 longjmp(frame->Xwhere, 1);\ 5847 }\ 5848 return ra;\ 5849 } 5850 5851 5852/* Structure for remembering the local variables in a private frame */ 5853 5854typedef struct heapframe { 5855 struct heapframe *Xprevframe; 5856 5857 /* Function arguments that may change */ 5858 5859 const uschar *Xeptr; 5860 const uschar *Xecode; 5861 int Xoffset_top; 5862 long int Xims; 5863 eptrblock *Xeptrb; 5864 int Xflags; 5865 5866 /* Function local variables */ 5867 5868 const uschar *Xcallpat; 5869 const uschar *Xcharptr; 5870 const uschar *Xdata; 5871 const uschar *Xnext; 5872 const uschar *Xpp; 5873 const uschar *Xprev; 5874 const uschar *Xsaved_eptr; 5875 5876 recursion_info Xnew_recursive; 5877 5878 BOOL Xcur_is_word; 5879 BOOL Xcondition; 5880 BOOL Xminimize; 5881 BOOL Xprev_is_word; 5882 5883 unsigned long int Xoriginal_ims; 5884 5885#ifdef SUPPORT_UCP 5886 int Xprop_type; 5887 int Xprop_fail_result; 5888 int Xprop_category; 5889 int Xprop_chartype; 5890 int Xprop_othercase; 5891 int Xprop_test_against; 5892 int *Xprop_test_variable; 5893#endif 5894 5895 int Xctype; 5896 int Xfc; 5897 int Xfi; 5898 int Xlength; 5899 int Xmax; 5900 int Xmin; 5901 int Xnumber; 5902 int Xoffset; 5903 int Xop; 5904 int Xsave_capture_last; 5905 int Xsave_offset1, Xsave_offset2, Xsave_offset3; 5906 int Xstacksave[REC_STACK_SAVE_MAX]; 5907 5908 eptrblock Xnewptrb; 5909 5910 /* Place to pass back result, and where to jump back to */ 5911 5912 int Xresult; 5913 jmp_buf Xwhere; 5914 5915} heapframe; 5916 5917#endif 5918 5919 5920/*************************************************************************** 5921***************************************************************************/ 5922 5923 5924 5925/************************************************* 5926* Match from current position * 5927*************************************************/ 5928 5929/* On entry ecode points to the first opcode, and eptr to the first character 5930in the subject string, while eptrb holds the value of eptr at the start of the 5931last bracketed group - used for breaking infinite loops matching zero-length 5932strings. This function is called recursively in many circumstances. Whenever it 5933returns a negative (error) response, the outer incarnation must also return the 5934same response. 5935 5936Performance note: It might be tempting to extract commonly used fields from the 5937md structure (e.g. utf8, end_subject) into individual variables to improve 5938performance. Tests using gcc on a SPARC disproved this; in the first case, it 5939made performance worse. 5940 5941Arguments: 5942 eptr pointer in subject 5943 ecode position in code 5944 offset_top current top pointer 5945 md pointer to "static" info for the match 5946 ims current /i, /m, and /s options 5947 eptrb pointer to chain of blocks containing eptr at start of 5948 brackets - for testing for empty matches 5949 flags can contain 5950 match_condassert - this is an assertion condition 5951 match_isgroup - this is the start of a bracketed group 5952 5953Returns: MATCH_MATCH if matched ) these values are >= 0 5954 MATCH_NOMATCH if failed to match ) 5955 a negative PCRE_ERROR_xxx value if aborted by an error condition 5956 (e.g. stopped by recursion limit) 5957*/ 5958 5959static int 5960match(REGISTER const uschar *eptr, REGISTER const uschar *ecode, 5961 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb, 5962 int flags) 5963{ 5964/* These variables do not need to be preserved over recursion in this function, 5965so they can be ordinary variables in all cases. Mark them with "register" 5966because they are used a lot in loops. */ 5967 5968register int rrc; /* Returns from recursive calls */ 5969register int i; /* Used for loops not involving calls to RMATCH() */ 5970register int c; /* Character values not kept over RMATCH() calls */ 5971 5972/* When recursion is not being used, all "local" variables that have to be 5973preserved over calls to RMATCH() are part of a "frame" which is obtained from 5974heap storage. Set up the top-level frame here; others are obtained from the 5975heap whenever RMATCH() does a "recursion". See the macro definitions above. */ 5976 5977#ifdef NO_RECURSE 5978heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe)); 5979frame->Xprevframe = NULL; /* Marks the top level */ 5980 5981/* Copy in the original argument variables */ 5982 5983frame->Xeptr = eptr; 5984frame->Xecode = ecode; 5985frame->Xoffset_top = offset_top; 5986frame->Xims = ims; 5987frame->Xeptrb = eptrb; 5988frame->Xflags = flags; 5989 5990/* This is where control jumps back to to effect "recursion" */ 5991 5992HEAP_RECURSE: 5993 5994/* Macros make the argument variables come from the current frame */ 5995 5996#define eptr frame->Xeptr 5997#define ecode frame->Xecode 5998#define offset_top frame->Xoffset_top 5999#define ims frame->Xims 6000#define eptrb frame->Xeptrb 6001#define flags frame->Xflags 6002 6003/* Ditto for the local variables */ 6004 6005#ifdef SUPPORT_UTF8 6006#define charptr frame->Xcharptr 6007#endif 6008#define callpat frame->Xcallpat 6009#define data frame->Xdata 6010#define next frame->Xnext 6011#define pp frame->Xpp 6012#define prev frame->Xprev 6013#define saved_eptr frame->Xsaved_eptr 6014 6015#define new_recursive frame->Xnew_recursive 6016 6017#define cur_is_word frame->Xcur_is_word 6018#define condition frame->Xcondition 6019#define minimize frame->Xminimize 6020#define prev_is_word frame->Xprev_is_word 6021 6022#define original_ims frame->Xoriginal_ims 6023 6024#ifdef SUPPORT_UCP 6025#define prop_type frame->Xprop_type 6026#define prop_fail_result frame->Xprop_fail_result 6027#define prop_category frame->Xprop_category 6028#define prop_chartype frame->Xprop_chartype 6029#define prop_othercase frame->Xprop_othercase 6030#define prop_test_against frame->Xprop_test_against 6031#define prop_test_variable frame->Xprop_test_variable 6032#endif 6033 6034#define ctype frame->Xctype 6035#define fc frame->Xfc 6036#define fi frame->Xfi 6037#define length frame->Xlength 6038#define max frame->Xmax 6039#define min frame->Xmin 6040#define number frame->Xnumber 6041#define offset frame->Xoffset 6042#define op frame->Xop 6043#define save_capture_last frame->Xsave_capture_last 6044#define save_offset1 frame->Xsave_offset1 6045#define save_offset2 frame->Xsave_offset2 6046#define save_offset3 frame->Xsave_offset3 6047#define stacksave frame->Xstacksave 6048 6049#define newptrb frame->Xnewptrb 6050 6051/* When recursion is being used, local variables are allocated on the stack and 6052get preserved during recursion in the normal way. In this environment, fi and 6053i, and fc and c, can be the same variables. */ 6054 6055#else 6056#define fi i 6057#define fc c 6058 6059 6060#ifdef SUPPORT_UTF8 /* Many of these variables are used ony */ 6061const uschar *charptr; /* small blocks of the code. My normal */ 6062#endif /* style of coding would have declared */ 6063const uschar *callpat; /* them within each of those blocks. */ 6064const uschar *data; /* However, in order to accommodate the */ 6065const uschar *next; /* version of this code that uses an */ 6066const uschar *pp; /* external "stack" implemented on the */ 6067const uschar *prev; /* heap, it is easier to declare them */ 6068const uschar *saved_eptr; /* all here, so the declarations can */ 6069 /* be cut out in a block. The only */ 6070recursion_info new_recursive; /* declarations within blocks below are */ 6071 /* for variables that do not have to */ 6072BOOL cur_is_word; /* be preserved over a recursive call */ 6073BOOL condition; /* to RMATCH(). */ 6074BOOL minimize; 6075BOOL prev_is_word; 6076 6077unsigned long int original_ims; 6078 6079#ifdef SUPPORT_UCP 6080int prop_type; 6081int prop_fail_result; 6082int prop_category; 6083int prop_chartype; 6084int prop_othercase; 6085int prop_test_against; 6086int *prop_test_variable; 6087#endif 6088 6089int ctype; 6090int length; 6091int max; 6092int min; 6093int number; 6094int offset; 6095int op; 6096int save_capture_last; 6097int save_offset1, save_offset2, save_offset3; 6098int stacksave[REC_STACK_SAVE_MAX]; 6099 6100eptrblock newptrb; 6101#endif 6102 6103/* These statements are here to stop the compiler complaining about unitialized 6104variables. */ 6105 6106#ifdef SUPPORT_UCP 6107prop_fail_result = 0; 6108prop_test_against = 0; 6109prop_test_variable = NULL; 6110#endif 6111 6112/* OK, now we can get on with the real code of the function. Recursion is 6113specified by the macros RMATCH and RRETURN. When NO_RECURSE is *not* defined, 6114these just turn into a recursive call to match() and a "return", respectively. 6115However, RMATCH isn't like a function call because it's quite a complicated 6116macro. It has to be used in one particular way. This shouldn't, however, impact 6117performance when true recursion is being used. */ 6118 6119if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT); 6120 6121original_ims = ims; /* Save for resetting on ')' */ 6122 6123/* At the start of a bracketed group, add the current subject pointer to the 6124stack of such pointers, to be re-instated at the end of the group when we hit 6125the closing ket. When match() is called in other circumstances, we don't add to 6126this stack. */ 6127 6128if ((flags & match_isgroup) != 0) 6129 { 6130 newptrb.epb_prev = eptrb; 6131 newptrb.epb_saved_eptr = eptr; 6132 eptrb = &newptrb; 6133 } 6134 6135/* Now start processing the operations. */ 6136 6137for (;;) 6138 { 6139 op = *ecode; 6140 minimize = FALSE; 6141 6142 /* For partial matching, remember if we ever hit the end of the subject after 6143 matching at least one subject character. */ 6144 6145 if (md->partial && 6146 eptr >= md->end_subject && 6147 eptr > md->start_match) 6148 md->hitend = TRUE; 6149 6150 /* Opening capturing bracket. If there is space in the offset vector, save 6151 the current subject position in the working slot at the top of the vector. We 6152 mustn't change the current values of the data slot, because they may be set 6153 from a previous iteration of this group, and be referred to by a reference 6154 inside the group. 6155 6156 If the bracket fails to match, we need to restore this value and also the 6157 values of the final offsets, in case they were set by a previous iteration of 6158 the same bracket. 6159 6160 If there isn't enough space in the offset vector, treat this as if it were a 6161 non-capturing bracket. Don't worry about setting the flag for the error case 6162 here; that is handled in the code for KET. */ 6163 6164 if (op > OP_BRA) 6165 { 6166 number = op - OP_BRA; 6167 6168 /* For extended extraction brackets (large number), we have to fish out the 6169 number from a dummy opcode at the start. */ 6170 6171 if (number > EXTRACT_BASIC_MAX) 6172 number = GET2(ecode, 2+LINK_SIZE); 6173 offset = number << 1; 6174 6175#ifdef DEBUG 6176 printf("start bracket %d subject=", number); 6177 pchars(eptr, 16, TRUE, md); 6178 printf("\n"); 6179#endif 6180 6181 if (offset < md->offset_max) 6182 { 6183 save_offset1 = md->offset_vector[offset]; 6184 save_offset2 = md->offset_vector[offset+1]; 6185 save_offset3 = md->offset_vector[md->offset_end - number]; 6186 save_capture_last = md->capture_last; 6187 6188 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3)); 6189 md->offset_vector[md->offset_end - number] = eptr - md->start_subject; 6190 6191 do 6192 { 6193 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 6194 match_isgroup); 6195 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 6196 md->capture_last = save_capture_last; 6197 ecode += GET(ecode, 1); 6198 } 6199 while (*ecode == OP_ALT); 6200 6201 DPRINTF(("bracket %d failed\n", number)); 6202 6203 md->offset_vector[offset] = save_offset1; 6204 md->offset_vector[offset+1] = save_offset2; 6205 md->offset_vector[md->offset_end - number] = save_offset3; 6206 6207 RRETURN(MATCH_NOMATCH); 6208 } 6209 6210 /* Insufficient room for saving captured contents */ 6211 6212 else op = OP_BRA; 6213 } 6214 6215 /* Other types of node can be handled by a switch */ 6216 6217 switch(op) 6218 { 6219 case OP_BRA: /* Non-capturing bracket: optimized */ 6220 DPRINTF(("start bracket 0\n")); 6221 do 6222 { 6223 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 6224 match_isgroup); 6225 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 6226 ecode += GET(ecode, 1); 6227 } 6228 while (*ecode == OP_ALT); 6229 DPRINTF(("bracket 0 failed\n")); 6230 RRETURN(MATCH_NOMATCH); 6231 6232 /* Conditional group: compilation checked that there are no more than 6233 two branches. If the condition is false, skipping the first branch takes us 6234 past the end if there is only one branch, but that's OK because that is 6235 exactly what going to the ket would do. */ 6236 6237 case OP_COND: 6238 if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */ 6239 { 6240 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */ 6241 condition = (offset == CREF_RECURSE * 2)? 6242 (md->recursive != NULL) : 6243 (offset < offset_top && md->offset_vector[offset] >= 0); 6244 RMATCH(rrc, eptr, ecode + (condition? 6245 (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))), 6246 offset_top, md, ims, eptrb, match_isgroup); 6247 RRETURN(rrc); 6248 } 6249 6250 /* The condition is an assertion. Call match() to evaluate it - setting 6251 the final argument TRUE causes it to stop at the end of an assertion. */ 6252 6253 else 6254 { 6255 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 6256 match_condassert | match_isgroup); 6257 if (rrc == MATCH_MATCH) 6258 { 6259 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2); 6260 while (*ecode == OP_ALT) ecode += GET(ecode, 1); 6261 } 6262 else if (rrc != MATCH_NOMATCH) 6263 { 6264 RRETURN(rrc); /* Need braces because of following else */ 6265 } 6266 else ecode += GET(ecode, 1); 6267 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 6268 match_isgroup); 6269 RRETURN(rrc); 6270 } 6271 /* Control never reaches here */ 6272 6273 /* Skip over conditional reference or large extraction number data if 6274 encountered. */ 6275 6276 case OP_CREF: 6277 case OP_BRANUMBER: 6278 ecode += 3; 6279 break; 6280 6281 /* End of the pattern. If we are in a recursion, we should restore the 6282 offsets appropriately and continue from after the call. */ 6283 6284 case OP_END: 6285 if (md->recursive != NULL && md->recursive->group_num == 0) 6286 { 6287 recursion_info *rec = md->recursive; 6288 DPRINTF(("Hit the end in a (?0) recursion\n")); 6289 md->recursive = rec->prevrec; 6290 memmove(md->offset_vector, rec->offset_save, 6291 rec->saved_max * sizeof(int)); 6292 md->start_match = rec->save_start; 6293 ims = original_ims; 6294 ecode = rec->after_call; 6295 break; 6296 } 6297 6298 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty 6299 string - backtracking will then try other alternatives, if any. */ 6300 6301 if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH); 6302 md->end_match_ptr = eptr; /* Record where we ended */ 6303 md->end_offset_top = offset_top; /* and how many extracts were taken */ 6304 RRETURN(MATCH_MATCH); 6305 6306 /* Change option settings */ 6307 6308 case OP_OPT: 6309 ims = ecode[1]; 6310 ecode += 2; 6311 DPRINTF(("ims set to %02lx\n", ims)); 6312 break; 6313 6314 /* Assertion brackets. Check the alternative branches in turn - the 6315 matching won't pass the KET for an assertion. If any one branch matches, 6316 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the 6317 start of each branch to move the current point backwards, so the code at 6318 this level is identical to the lookahead case. */ 6319 6320 case OP_ASSERT: 6321 case OP_ASSERTBACK: 6322 do 6323 { 6324 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 6325 match_isgroup); 6326 if (rrc == MATCH_MATCH) break; 6327 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 6328 ecode += GET(ecode, 1); 6329 } 6330 while (*ecode == OP_ALT); 6331 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH); 6332 6333 /* If checking an assertion for a condition, return MATCH_MATCH. */ 6334 6335 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH); 6336 6337 /* Continue from after the assertion, updating the offsets high water 6338 mark, since extracts may have been taken during the assertion. */ 6339 6340 do ecode += GET(ecode,1); while (*ecode == OP_ALT); 6341 ecode += 1 + LINK_SIZE; 6342 offset_top = md->end_offset_top; 6343 continue; 6344 6345 /* Negative assertion: all branches must fail to match */ 6346 6347 case OP_ASSERT_NOT: 6348 case OP_ASSERTBACK_NOT: 6349 do 6350 { 6351 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 6352 match_isgroup); 6353 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH); 6354 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 6355 ecode += GET(ecode,1); 6356 } 6357 while (*ecode == OP_ALT); 6358 6359 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH); 6360 6361 ecode += 1 + LINK_SIZE; 6362 continue; 6363 6364 /* Move the subject pointer back. This occurs only at the start of 6365 each branch of a lookbehind assertion. If we are too close to the start to 6366 move back, this match function fails. When working with UTF-8 we move 6367 back a number of characters, not bytes. */ 6368 6369 case OP_REVERSE: 6370#ifdef SUPPORT_UTF8 6371 if (md->utf8) 6372 { 6373 c = GET(ecode,1); 6374 for (i = 0; i < c; i++) 6375 { 6376 eptr--; 6377 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); 6378 BACKCHAR(eptr) 6379 } 6380 } 6381 else 6382#endif 6383 6384 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */ 6385 6386 { 6387 eptr -= GET(ecode,1); 6388 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); 6389 } 6390 6391 /* Skip to next op code */ 6392 6393 ecode += 1 + LINK_SIZE; 6394 break; 6395 6396 /* The callout item calls an external function, if one is provided, passing 6397 details of the match so far. This is mainly for debugging, though the 6398 function is able to force a failure. */ 6399 6400 case OP_CALLOUT: 6401 if (pcre_callout != NULL) 6402 { 6403 pcre_callout_block cb; 6404 cb.version = 1; /* Version 1 of the callout block */ 6405 cb.callout_number = ecode[1]; 6406 cb.offset_vector = md->offset_vector; 6407 cb.subject = (const char *)md->start_subject; 6408 cb.subject_length = md->end_subject - md->start_subject; 6409 cb.start_match = md->start_match - md->start_subject; 6410 cb.current_position = eptr - md->start_subject; 6411 cb.pattern_position = GET(ecode, 2); 6412 cb.next_item_length = GET(ecode, 2 + LINK_SIZE); 6413 cb.capture_top = offset_top/2; 6414 cb.capture_last = md->capture_last; 6415 cb.callout_data = md->callout_data; 6416 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH); 6417 if (rrc < 0) RRETURN(rrc); 6418 } 6419 ecode += 2 + 2*LINK_SIZE; 6420 break; 6421 6422 /* Recursion either matches the current regex, or some subexpression. The 6423 offset data is the offset to the starting bracket from the start of the 6424 whole pattern. (This is so that it works from duplicated subpatterns.) 6425 6426 If there are any capturing brackets started but not finished, we have to 6427 save their starting points and reinstate them after the recursion. However, 6428 we don't know how many such there are (offset_top records the completed 6429 total) so we just have to save all the potential data. There may be up to 6430 65535 such values, which is too large to put on the stack, but using malloc 6431 for small numbers seems expensive. As a compromise, the stack is used when 6432 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc 6433 is used. A problem is what to do if the malloc fails ... there is no way of 6434 returning to the top level with an error. Save the top REC_STACK_SAVE_MAX 6435 values on the stack, and accept that the rest may be wrong. 6436 6437 There are also other values that have to be saved. We use a chained 6438 sequence of blocks that actually live on the stack. Thanks to Robin Houston 6439 for the original version of this logic. */ 6440 6441 case OP_RECURSE: 6442 { 6443 callpat = md->start_code + GET(ecode, 1); 6444 new_recursive.group_num = *callpat - OP_BRA; 6445 6446 /* For extended extraction brackets (large number), we have to fish out 6447 the number from a dummy opcode at the start. */ 6448 6449 if (new_recursive.group_num > EXTRACT_BASIC_MAX) 6450 new_recursive.group_num = GET2(callpat, 2+LINK_SIZE); 6451 6452 /* Add to "recursing stack" */ 6453 6454 new_recursive.prevrec = md->recursive; 6455 md->recursive = &new_recursive; 6456 6457 /* Find where to continue from afterwards */ 6458 6459 ecode += 1 + LINK_SIZE; 6460 new_recursive.after_call = ecode; 6461 6462 /* Now save the offset data. */ 6463 6464 new_recursive.saved_max = md->offset_end; 6465 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX) 6466 new_recursive.offset_save = stacksave; 6467 else 6468 { 6469 new_recursive.offset_save = 6470 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int)); 6471 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY); 6472 } 6473 6474 memcpy(new_recursive.offset_save, md->offset_vector, 6475 new_recursive.saved_max * sizeof(int)); 6476 new_recursive.save_start = md->start_match; 6477 md->start_match = eptr; 6478 6479 /* OK, now we can do the recursion. For each top-level alternative we 6480 restore the offset and recursion data. */ 6481 6482 DPRINTF(("Recursing into group %d\n", new_recursive.group_num)); 6483 do 6484 { 6485 RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims, 6486 eptrb, match_isgroup); 6487 if (rrc == MATCH_MATCH) 6488 { 6489 md->recursive = new_recursive.prevrec; 6490 if (new_recursive.offset_save != stacksave) 6491 (pcre_free)(new_recursive.offset_save); 6492 RRETURN(MATCH_MATCH); 6493 } 6494 else if (rrc != MATCH_NOMATCH) RRETURN(rrc); 6495 6496 md->recursive = &new_recursive; 6497 memcpy(md->offset_vector, new_recursive.offset_save, 6498 new_recursive.saved_max * sizeof(int)); 6499 callpat += GET(callpat, 1); 6500 } 6501 while (*callpat == OP_ALT); 6502 6503 DPRINTF(("Recursion didn't match\n")); 6504 md->recursive = new_recursive.prevrec; 6505 if (new_recursive.offset_save != stacksave) 6506 (pcre_free)(new_recursive.offset_save); 6507 RRETURN(MATCH_NOMATCH); 6508 } 6509 /* Control never reaches here */ 6510 6511 /* "Once" brackets are like assertion brackets except that after a match, 6512 the point in the subject string is not moved back. Thus there can never be 6513 a move back into the brackets. Friedl calls these "atomic" subpatterns. 6514 Check the alternative branches in turn - the matching won't pass the KET 6515 for this kind of subpattern. If any one branch matches, we carry on as at 6516 the end of a normal bracket, leaving the subject pointer. */ 6517 6518 case OP_ONCE: 6519 { 6520 prev = ecode; 6521 saved_eptr = eptr; 6522 6523 do 6524 { 6525 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, 6526 eptrb, match_isgroup); 6527 if (rrc == MATCH_MATCH) break; 6528 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 6529 ecode += GET(ecode,1); 6530 } 6531 while (*ecode == OP_ALT); 6532 6533 /* If hit the end of the group (which could be repeated), fail */ 6534 6535 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH); 6536 6537 /* Continue as from after the assertion, updating the offsets high water 6538 mark, since extracts may have been taken. */ 6539 6540 do ecode += GET(ecode,1); while (*ecode == OP_ALT); 6541 6542 offset_top = md->end_offset_top; 6543 eptr = md->end_match_ptr; 6544 6545 /* For a non-repeating ket, just continue at this level. This also 6546 happens for a repeating ket if no characters were matched in the group. 6547 This is the forcible breaking of infinite loops as implemented in Perl 6548 5.005. If there is an options reset, it will get obeyed in the normal 6549 course of events. */ 6550 6551 if (*ecode == OP_KET || eptr == saved_eptr) 6552 { 6553 ecode += 1+LINK_SIZE; 6554 break; 6555 } 6556 6557 /* The repeating kets try the rest of the pattern or restart from the 6558 preceding bracket, in the appropriate order. We need to reset any options 6559 that changed within the bracket before re-running it, so check the next 6560 opcode. */ 6561 6562 if (ecode[1+LINK_SIZE] == OP_OPT) 6563 { 6564 ims = (ims & ~PCRE_IMS) | ecode[4]; 6565 DPRINTF(("ims set to %02lx at group repeat\n", ims)); 6566 } 6567 6568 if (*ecode == OP_KETRMIN) 6569 { 6570 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0); 6571 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 6572 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup); 6573 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 6574 } 6575 else /* OP_KETRMAX */ 6576 { 6577 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup); 6578 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 6579 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0); 6580 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 6581 } 6582 } 6583 RRETURN(MATCH_NOMATCH); 6584 6585 /* An alternation is the end of a branch; scan along to find the end of the 6586 bracketed group and go to there. */ 6587 6588 case OP_ALT: 6589 do ecode += GET(ecode,1); while (*ecode == OP_ALT); 6590 break; 6591 6592 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating 6593 that it may occur zero times. It may repeat infinitely, or not at all - 6594 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper 6595 repeat limits are compiled as a number of copies, with the optional ones 6596 preceded by BRAZERO or BRAMINZERO. */ 6597 6598 case OP_BRAZERO: 6599 { 6600 next = ecode+1; 6601 RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup); 6602 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 6603 do next += GET(next,1); while (*next == OP_ALT); 6604 ecode = next + 1+LINK_SIZE; 6605 } 6606 break; 6607 6608 case OP_BRAMINZERO: 6609 { 6610 next = ecode+1; 6611 do next += GET(next,1); while (*next == OP_ALT); 6612 RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 6613 match_isgroup); 6614 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 6615 ecode++; 6616 } 6617 break; 6618 6619 /* End of a group, repeated or non-repeating. If we are at the end of 6620 an assertion "group", stop matching and return MATCH_MATCH, but record the 6621 current high water mark for use by positive assertions. Do this also 6622 for the "once" (not-backup up) groups. */ 6623 6624 case OP_KET: 6625 case OP_KETRMIN: 6626 case OP_KETRMAX: 6627 { 6628 prev = ecode - GET(ecode, 1); 6629 saved_eptr = eptrb->epb_saved_eptr; 6630 6631 /* Back up the stack of bracket start pointers. */ 6632 6633 eptrb = eptrb->epb_prev; 6634 6635 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || 6636 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT || 6637 *prev == OP_ONCE) 6638 { 6639 md->end_match_ptr = eptr; /* For ONCE */ 6640 md->end_offset_top = offset_top; 6641 RRETURN(MATCH_MATCH); 6642 } 6643 6644 /* In all other cases except a conditional group we have to check the 6645 group number back at the start and if necessary complete handling an 6646 extraction by setting the offsets and bumping the high water mark. */ 6647 6648 if (*prev != OP_COND) 6649 { 6650 number = *prev - OP_BRA; 6651 6652 /* For extended extraction brackets (large number), we have to fish out 6653 the number from a dummy opcode at the start. */ 6654 6655 if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE); 6656 offset = number << 1; 6657 6658#ifdef DEBUG 6659 printf("end bracket %d", number); 6660 printf("\n"); 6661#endif 6662 6663 /* Test for a numbered group. This includes groups called as a result 6664 of recursion. Note that whole-pattern recursion is coded as a recurse 6665 into group 0, so it won't be picked up here. Instead, we catch it when 6666 the OP_END is reached. */ 6667 6668 if (number > 0) 6669 { 6670 md->capture_last = number; 6671 if (offset >= md->offset_max) md->offset_overflow = TRUE; else 6672 { 6673 md->offset_vector[offset] = 6674 md->offset_vector[md->offset_end - number]; 6675 md->offset_vector[offset+1] = eptr - md->start_subject; 6676 if (offset_top <= offset) offset_top = offset + 2; 6677 } 6678 6679 /* Handle a recursively called group. Restore the offsets 6680 appropriately and continue from after the call. */ 6681 6682 if (md->recursive != NULL && md->recursive->group_num == number) 6683 { 6684 recursion_info *rec = md->recursive; 6685 DPRINTF(("Recursion (%d) succeeded - continuing\n", number)); 6686 md->recursive = rec->prevrec; 6687 md->start_match = rec->save_start; 6688 memcpy(md->offset_vector, rec->offset_save, 6689 rec->saved_max * sizeof(int)); 6690 ecode = rec->after_call; 6691 ims = original_ims; 6692 break; 6693 } 6694 } 6695 } 6696 6697 /* Reset the value of the ims flags, in case they got changed during 6698 the group. */ 6699 6700 ims = original_ims; 6701 DPRINTF(("ims reset to %02lx\n", ims)); 6702 6703 /* For a non-repeating ket, just continue at this level. This also 6704 happens for a repeating ket if no characters were matched in the group. 6705 This is the forcible breaking of infinite loops as implemented in Perl 6706 5.005. If there is an options reset, it will get obeyed in the normal 6707 course of events. */ 6708 6709 if (*ecode == OP_KET || eptr == saved_eptr) 6710 { 6711 ecode += 1 + LINK_SIZE; 6712 break; 6713 } 6714 6715 /* The repeating kets try the rest of the pattern or restart from the 6716 preceding bracket, in the appropriate order. */ 6717 6718 if (*ecode == OP_KETRMIN) 6719 { 6720 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0); 6721 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 6722 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup); 6723 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 6724 } 6725 else /* OP_KETRMAX */ 6726 { 6727 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup); 6728 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 6729 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0); 6730 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 6731 } 6732 } 6733 6734 RRETURN(MATCH_NOMATCH); 6735 6736 /* Start of subject unless notbol, or after internal newline if multiline */ 6737 6738 case OP_CIRC: 6739 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH); 6740 if ((ims & PCRE_MULTILINE) != 0) 6741 { 6742 if (eptr != md->start_subject && eptr[-1] != NEWLINE) 6743 RRETURN(MATCH_NOMATCH); 6744 ecode++; 6745 break; 6746 } 6747 /* ... else fall through */ 6748 6749 /* Start of subject assertion */ 6750 6751 case OP_SOD: 6752 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH); 6753 ecode++; 6754 break; 6755 6756 /* Start of match assertion */ 6757 6758 case OP_SOM: 6759 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH); 6760 ecode++; 6761 break; 6762 6763 /* Assert before internal newline if multiline, or before a terminating 6764 newline unless endonly is set, else end of subject unless noteol is set. */ 6765 6766 case OP_DOLL: 6767 if ((ims & PCRE_MULTILINE) != 0) 6768 { 6769 if (eptr < md->end_subject) 6770 { if (*eptr != NEWLINE) RRETURN(MATCH_NOMATCH); } 6771 else 6772 { if (md->noteol) RRETURN(MATCH_NOMATCH); } 6773 ecode++; 6774 break; 6775 } 6776 else 6777 { 6778 if (md->noteol) RRETURN(MATCH_NOMATCH); 6779 if (!md->endonly) 6780 { 6781 if (eptr < md->end_subject - 1 || 6782 (eptr == md->end_subject - 1 && *eptr != NEWLINE)) 6783 RRETURN(MATCH_NOMATCH); 6784 ecode++; 6785 break; 6786 } 6787 } 6788 /* ... else fall through */ 6789 6790 /* End of subject assertion (\z) */ 6791 6792 case OP_EOD: 6793 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH); 6794 ecode++; 6795 break; 6796 6797 /* End of subject or ending \n assertion (\Z) */ 6798 6799 case OP_EODN: 6800 if (eptr < md->end_subject - 1 || 6801 (eptr == md->end_subject - 1 && *eptr != NEWLINE)) RRETURN(MATCH_NOMATCH); 6802 ecode++; 6803 break; 6804 6805 /* Word boundary assertions */ 6806 6807 case OP_NOT_WORD_BOUNDARY: 6808 case OP_WORD_BOUNDARY: 6809 { 6810 6811 /* Find out if the previous and current characters are "word" characters. 6812 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to 6813 be "non-word" characters. */ 6814 6815#ifdef SUPPORT_UTF8 6816 if (md->utf8) 6817 { 6818 if (eptr == md->start_subject) prev_is_word = FALSE; else 6819 { 6820 const uschar *lastptr = eptr - 1; 6821 while((*lastptr & 0xc0) == 0x80) lastptr--; 6822 GETCHAR(c, lastptr); 6823 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0; 6824 } 6825 if (eptr >= md->end_subject) cur_is_word = FALSE; else 6826 { 6827 GETCHAR(c, eptr); 6828 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0; 6829 } 6830 } 6831 else 6832#endif 6833 6834 /* More streamlined when not in UTF-8 mode */ 6835 6836 { 6837 prev_is_word = (eptr != md->start_subject) && 6838 ((md->ctypes[eptr[-1]] & ctype_word) != 0); 6839 cur_is_word = (eptr < md->end_subject) && 6840 ((md->ctypes[*eptr] & ctype_word) != 0); 6841 } 6842 6843 /* Now see if the situation is what we want */ 6844 6845 if ((*ecode++ == OP_WORD_BOUNDARY)? 6846 cur_is_word == prev_is_word : cur_is_word != prev_is_word) 6847 RRETURN(MATCH_NOMATCH); 6848 } 6849 break; 6850 6851 /* Match a single character type; inline for speed */ 6852 6853 case OP_ANY: 6854 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE) 6855 RRETURN(MATCH_NOMATCH); 6856 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH); 6857#ifdef SUPPORT_UTF8 6858 if (md->utf8) 6859 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; 6860#endif 6861 ecode++; 6862 break; 6863 6864 /* Match a single byte, even in UTF-8 mode. This opcode really does match 6865 any byte, even newline, independent of the setting of PCRE_DOTALL. */ 6866 6867 case OP_ANYBYTE: 6868 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH); 6869 ecode++; 6870 break; 6871 6872 case OP_NOT_DIGIT: 6873 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); 6874 GETCHARINCTEST(c, eptr); 6875 if ( 6876#ifdef SUPPORT_UTF8 6877 c < 256 && 6878#endif 6879 (md->ctypes[c] & ctype_digit) != 0 6880 ) 6881 RRETURN(MATCH_NOMATCH); 6882 ecode++; 6883 break; 6884 6885 case OP_DIGIT: 6886 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); 6887 GETCHARINCTEST(c, eptr); 6888 if ( 6889#ifdef SUPPORT_UTF8 6890 c >= 256 || 6891#endif 6892 (md->ctypes[c] & ctype_digit) == 0 6893 ) 6894 RRETURN(MATCH_NOMATCH); 6895 ecode++; 6896 break; 6897 6898 case OP_NOT_WHITESPACE: 6899 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); 6900 GETCHARINCTEST(c, eptr); 6901 if ( 6902#ifdef SUPPORT_UTF8 6903 c < 256 && 6904#endif 6905 (md->ctypes[c] & ctype_space) != 0 6906 ) 6907 RRETURN(MATCH_NOMATCH); 6908 ecode++; 6909 break; 6910 6911 case OP_WHITESPACE: 6912 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); 6913 GETCHARINCTEST(c, eptr); 6914 if ( 6915#ifdef SUPPORT_UTF8 6916 c >= 256 || 6917#endif 6918 (md->ctypes[c] & ctype_space) == 0 6919 ) 6920 RRETURN(MATCH_NOMATCH); 6921 ecode++; 6922 break; 6923 6924 case OP_NOT_WORDCHAR: 6925 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); 6926 GETCHARINCTEST(c, eptr); 6927 if ( 6928#ifdef SUPPORT_UTF8 6929 c < 256 && 6930#endif 6931 (md->ctypes[c] & ctype_word) != 0 6932 ) 6933 RRETURN(MATCH_NOMATCH); 6934 ecode++; 6935 break; 6936 6937 case OP_WORDCHAR: 6938 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); 6939 GETCHARINCTEST(c, eptr); 6940 if ( 6941#ifdef SUPPORT_UTF8 6942 c >= 256 || 6943#endif 6944 (md->ctypes[c] & ctype_word) == 0 6945 ) 6946 RRETURN(MATCH_NOMATCH); 6947 ecode++; 6948 break; 6949 6950#ifdef SUPPORT_UCP 6951 /* Check the next character by Unicode property. We will get here only 6952 if the support is in the binary; otherwise a compile-time error occurs. */ 6953 6954 case OP_PROP: 6955 case OP_NOTPROP: 6956 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); 6957 GETCHARINCTEST(c, eptr); 6958 { 6959 int chartype, rqdtype; 6960 int othercase; 6961 int category = ucp_findchar(c, &chartype, &othercase); 6962 6963 rqdtype = *(++ecode); 6964 ecode++; 6965 6966 if (rqdtype >= 128) 6967 { 6968 if ((rqdtype - 128 != category) == (op == OP_PROP)) 6969 RRETURN(MATCH_NOMATCH); 6970 } 6971 else 6972 { 6973 if ((rqdtype != chartype) == (op == OP_PROP)) 6974 RRETURN(MATCH_NOMATCH); 6975 } 6976 } 6977 break; 6978 6979 /* Match an extended Unicode sequence. We will get here only if the support 6980 is in the binary; otherwise a compile-time error occurs. */ 6981 6982 case OP_EXTUNI: 6983 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); 6984 GETCHARINCTEST(c, eptr); 6985 { 6986 int chartype; 6987 int othercase; 6988 int category = ucp_findchar(c, &chartype, &othercase); 6989 if (category == ucp_M) RRETURN(MATCH_NOMATCH); 6990 while (eptr < md->end_subject) 6991 { 6992 int len = 1; 6993 if (!md->utf8) c = *eptr; else 6994 { 6995 GETCHARLEN(c, eptr, len); 6996 } 6997 category = ucp_findchar(c, &chartype, &othercase); 6998 if (category != ucp_M) break; 6999 eptr += len; 7000 } 7001 } 7002 ecode++; 7003 break; 7004#endif 7005 7006 7007 /* Match a back reference, possibly repeatedly. Look past the end of the 7008 item to see if there is repeat information following. The code is similar 7009 to that for character classes, but repeated for efficiency. Then obey 7010 similar code to character type repeats - written out again for speed. 7011 However, if the referenced string is the empty string, always treat 7012 it as matched, any number of times (otherwise there could be infinite 7013 loops). */ 7014 7015 case OP_REF: 7016 { 7017 offset = GET2(ecode, 1) << 1; /* Doubled ref number */ 7018 ecode += 3; /* Advance past item */ 7019 7020 /* If the reference is unset, set the length to be longer than the amount 7021 of subject left; this ensures that every attempt at a match fails. We 7022 can't just fail here, because of the possibility of quantifiers with zero 7023 minima. */ 7024 7025 length = (offset >= offset_top || md->offset_vector[offset] < 0)? 7026 md->end_subject - eptr + 1 : 7027 md->offset_vector[offset+1] - md->offset_vector[offset]; 7028 7029 /* Set up for repetition, or handle the non-repeated case */ 7030 7031 switch (*ecode) 7032 { 7033 case OP_CRSTAR: 7034 case OP_CRMINSTAR: 7035 case OP_CRPLUS: 7036 case OP_CRMINPLUS: 7037 case OP_CRQUERY: 7038 case OP_CRMINQUERY: 7039 c = *ecode++ - OP_CRSTAR; 7040 minimize = (c & 1) != 0; 7041 min = rep_min[c]; /* Pick up values from tables; */ 7042 max = rep_max[c]; /* zero for max => infinity */ 7043 if (max == 0) max = INT_MAX; 7044 break; 7045 7046 case OP_CRRANGE: 7047 case OP_CRMINRANGE: 7048 minimize = (*ecode == OP_CRMINRANGE); 7049 min = GET2(ecode, 1); 7050 max = GET2(ecode, 3); 7051 if (max == 0) max = INT_MAX; 7052 ecode += 5; 7053 break; 7054 7055 default: /* No repeat follows */ 7056 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH); 7057 eptr += length; 7058 continue; /* With the main loop */ 7059 } 7060 7061 /* If the length of the reference is zero, just continue with the 7062 main loop. */ 7063 7064 if (length == 0) continue; 7065 7066 /* First, ensure the minimum number of matches are present. We get back 7067 the length of the reference string explicitly rather than passing the 7068 address of eptr, so that eptr can be a register variable. */ 7069 7070 for (i = 1; i <= min; i++) 7071 { 7072 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH); 7073 eptr += length; 7074 } 7075 7076 /* If min = max, continue at the same level without recursion. 7077 They are not both allowed to be zero. */ 7078 7079 if (min == max) continue; 7080 7081 /* If minimizing, keep trying and advancing the pointer */ 7082 7083 if (minimize) 7084 { 7085 for (fi = min;; fi++) 7086 { 7087 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 7088 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 7089 if (fi >= max || !match_ref(offset, eptr, length, md, ims)) 7090 RRETURN(MATCH_NOMATCH); 7091 eptr += length; 7092 } 7093 /* Control never gets here */ 7094 } 7095 7096 /* If maximizing, find the longest string and work backwards */ 7097 7098 else 7099 { 7100 pp = eptr; 7101 for (i = min; i < max; i++) 7102 { 7103 if (!match_ref(offset, eptr, length, md, ims)) break; 7104 eptr += length; 7105 } 7106 while (eptr >= pp) 7107 { 7108 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 7109 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 7110 eptr -= length; 7111 } 7112 RRETURN(MATCH_NOMATCH); 7113 } 7114 } 7115 /* Control never gets here */ 7116 7117 7118 7119 /* Match a bit-mapped character class, possibly repeatedly. This op code is 7120 used when all the characters in the class have values in the range 0-255, 7121 and either the matching is caseful, or the characters are in the range 7122 0-127 when UTF-8 processing is enabled. The only difference between 7123 OP_CLASS and OP_NCLASS occurs when a data character outside the range is 7124 encountered. 7125 7126 First, look past the end of the item to see if there is repeat information 7127 following. Then obey similar code to character type repeats - written out 7128 again for speed. */ 7129 7130 case OP_NCLASS: 7131 case OP_CLASS: 7132 { 7133 data = ecode + 1; /* Save for matching */ 7134 ecode += 33; /* Advance past the item */ 7135 7136 switch (*ecode) 7137 { 7138 case OP_CRSTAR: 7139 case OP_CRMINSTAR: 7140 case OP_CRPLUS: 7141 case OP_CRMINPLUS: 7142 case OP_CRQUERY: 7143 case OP_CRMINQUERY: 7144 c = *ecode++ - OP_CRSTAR; 7145 minimize = (c & 1) != 0; 7146 min = rep_min[c]; /* Pick up values from tables; */ 7147 max = rep_max[c]; /* zero for max => infinity */ 7148 if (max == 0) max = INT_MAX; 7149 break; 7150 7151 case OP_CRRANGE: 7152 case OP_CRMINRANGE: 7153 minimize = (*ecode == OP_CRMINRANGE); 7154 min = GET2(ecode, 1); 7155 max = GET2(ecode, 3); 7156 if (max == 0) max = INT_MAX; 7157 ecode += 5; 7158 break; 7159 7160 default: /* No repeat follows */ 7161 min = max = 1; 7162 break; 7163 } 7164 7165 /* First, ensure the minimum number of matches are present. */ 7166 7167#ifdef SUPPORT_UTF8 7168 /* UTF-8 mode */ 7169 if (md->utf8) 7170 { 7171 for (i = 1; i <= min; i++) 7172 { 7173 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); 7174 GETCHARINC(c, eptr); 7175 if (c > 255) 7176 { 7177 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); 7178 } 7179 else 7180 { 7181 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); 7182 } 7183 } 7184 } 7185 else 7186#endif 7187 /* Not UTF-8 mode */ 7188 { 7189 for (i = 1; i <= min; i++) 7190 { 7191 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); 7192 c = *eptr++; 7193 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); 7194 } 7195 } 7196 7197 /* If max == min we can continue with the main loop without the 7198 need to recurse. */ 7199 7200 if (min == max) continue; 7201 7202 /* If minimizing, keep testing the rest of the expression and advancing 7203 the pointer while it matches the class. */ 7204 7205 if (minimize) 7206 { 7207#ifdef SUPPORT_UTF8 7208 /* UTF-8 mode */ 7209 if (md->utf8) 7210 { 7211 for (fi = min;; fi++) 7212 { 7213 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 7214 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 7215 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); 7216 GETCHARINC(c, eptr); 7217 if (c > 255) 7218 { 7219 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); 7220 } 7221 else 7222 { 7223 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); 7224 } 7225 } 7226 } 7227 else 7228#endif 7229 /* Not UTF-8 mode */ 7230 { 7231 for (fi = min;; fi++) 7232 { 7233 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 7234 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 7235 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); 7236 c = *eptr++; 7237 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); 7238 } 7239 } 7240 /* Control never gets here */ 7241 } 7242 7243 /* If maximizing, find the longest possible run, then work backwards. */ 7244 7245 else 7246 { 7247 pp = eptr; 7248 7249#ifdef SUPPORT_UTF8 7250 /* UTF-8 mode */ 7251 if (md->utf8) 7252 { 7253 for (i = min; i < max; i++) 7254 { 7255 int len = 1; 7256 if (eptr >= md->end_subject) break; 7257 GETCHARLEN(c, eptr, len); 7258 if (c > 255) 7259 { 7260 if (op == OP_CLASS) break; 7261 } 7262 else 7263 { 7264 if ((data[c/8] & (1 << (c&7))) == 0) break; 7265 } 7266 eptr += len; 7267 } 7268 for (;;) 7269 { 7270 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 7271 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 7272 if (eptr-- == pp) break; /* Stop if tried at original pos */ 7273 BACKCHAR(eptr); 7274 } 7275 } 7276 else 7277#endif 7278 /* Not UTF-8 mode */ 7279 { 7280 for (i = min; i < max; i++) 7281 { 7282 if (eptr >= md->end_subject) break; 7283 c = *eptr; 7284 if ((data[c/8] & (1 << (c&7))) == 0) break; 7285 eptr++; 7286 } 7287 while (eptr >= pp) 7288 { 7289 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 7290 eptr--; 7291 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 7292 } 7293 } 7294 7295 RRETURN(MATCH_NOMATCH); 7296 } 7297 } 7298 /* Control never gets here */ 7299 7300 7301 /* Match an extended character class. This opcode is encountered only 7302 in UTF-8 mode, because that's the only time it is compiled. */ 7303 7304#ifdef SUPPORT_UTF8 7305 case OP_XCLASS: 7306 { 7307 data = ecode + 1 + LINK_SIZE; /* Save for matching */ 7308 ecode += GET(ecode, 1); /* Advance past the item */ 7309 7310 switch (*ecode) 7311 { 7312 case OP_CRSTAR: 7313 case OP_CRMINSTAR: 7314 case OP_CRPLUS: 7315 case OP_CRMINPLUS: 7316 case OP_CRQUERY: 7317 case OP_CRMINQUERY: 7318 c = *ecode++ - OP_CRSTAR; 7319 minimize = (c & 1) != 0; 7320 min = rep_min[c]; /* Pick up values from tables; */ 7321 max = rep_max[c]; /* zero for max => infinity */ 7322 if (max == 0) max = INT_MAX; 7323 break; 7324 7325 case OP_CRRANGE: 7326 case OP_CRMINRANGE: 7327 minimize = (*ecode == OP_CRMINRANGE); 7328 min = GET2(ecode, 1); 7329 max = GET2(ecode, 3); 7330 if (max == 0) max = INT_MAX; 7331 ecode += 5; 7332 break; 7333 7334 default: /* No repeat follows */ 7335 min = max = 1; 7336 break; 7337 } 7338 7339 /* First, ensure the minimum number of matches are present. */ 7340 7341 for (i = 1; i <= min; i++) 7342 { 7343 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); 7344 GETCHARINC(c, eptr); 7345 if (!match_xclass(c, data)) RRETURN(MATCH_NOMATCH); 7346 } 7347 7348 /* If max == min we can continue with the main loop without the 7349 need to recurse. */ 7350 7351 if (min == max) continue; 7352 7353 /* If minimizing, keep testing the rest of the expression and advancing 7354 the pointer while it matches the class. */ 7355 7356 if (minimize) 7357 { 7358 for (fi = min;; fi++) 7359 { 7360 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 7361 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 7362 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); 7363 GETCHARINC(c, eptr); 7364 if (!match_xclass(c, data)) RRETURN(MATCH_NOMATCH); 7365 } 7366 /* Control never gets here */ 7367 } 7368 7369 /* If maximizing, find the longest possible run, then work backwards. */ 7370 7371 else 7372 { 7373 pp = eptr; 7374 for (i = min; i < max; i++) 7375 { 7376 int len = 1; 7377 if (eptr >= md->end_subject) break; 7378 GETCHARLEN(c, eptr, len); 7379 if (!match_xclass(c, data)) break; 7380 eptr += len; 7381 } 7382 for(;;) 7383 { 7384 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 7385 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 7386 if (eptr-- == pp) break; /* Stop if tried at original pos */ 7387 BACKCHAR(eptr) 7388 } 7389 RRETURN(MATCH_NOMATCH); 7390 } 7391 7392 /* Control never gets here */ 7393 } 7394#endif /* End of XCLASS */ 7395 7396 /* Match a single character, casefully */ 7397 7398 case OP_CHAR: 7399#ifdef SUPPORT_UTF8 7400 if (md->utf8) 7401 { 7402 length = 1; 7403 ecode++; 7404 GETCHARLEN(fc, ecode, length); 7405 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); 7406 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH); 7407 } 7408 else 7409#endif 7410 7411 /* Non-UTF-8 mode */ 7412 { 7413 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH); 7414 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH); 7415 ecode += 2; 7416 } 7417 break; 7418 7419 /* Match a single character, caselessly */ 7420 7421 case OP_CHARNC: 7422#ifdef SUPPORT_UTF8 7423 if (md->utf8) 7424 { 7425 length = 1; 7426 ecode++; 7427 GETCHARLEN(fc, ecode, length); 7428 7429 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); 7430 7431 /* If the pattern character's value is < 128, we have only one byte, and 7432 can use the fast lookup table. */ 7433 7434 if (fc < 128) 7435 { 7436 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); 7437 } 7438 7439 /* Otherwise we must pick up the subject character */ 7440 7441 else 7442 { 7443 int dc; 7444 GETCHARINC(dc, eptr); 7445 ecode += length; 7446 7447 /* If we have Unicode property support, we can use it to test the other 7448 case of the character, if there is one. The result of ucp_findchar() is 7449 < 0 if the char isn't found, and othercase is returned as zero if there 7450 isn't one. */ 7451 7452 if (fc != dc) 7453 { 7454#ifdef SUPPORT_UCP 7455 int chartype; 7456 int othercase; 7457 if (ucp_findchar(fc, &chartype, &othercase) < 0 || dc != othercase) 7458#endif 7459 RRETURN(MATCH_NOMATCH); 7460 } 7461 } 7462 } 7463 else 7464#endif /* SUPPORT_UTF8 */ 7465 7466 /* Non-UTF-8 mode */ 7467 { 7468 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH); 7469 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); 7470 ecode += 2; 7471 } 7472 break; 7473 7474 /* Match a single character repeatedly; different opcodes share code. */ 7475 7476 case OP_EXACT: 7477 min = max = GET2(ecode, 1); 7478 ecode += 3; 7479 goto REPEATCHAR; 7480 7481 case OP_UPTO: 7482 case OP_MINUPTO: 7483 min = 0; 7484 max = GET2(ecode, 1); 7485 minimize = *ecode == OP_MINUPTO; 7486 ecode += 3; 7487 goto REPEATCHAR; 7488 7489 case OP_STAR: 7490 case OP_MINSTAR: 7491 case OP_PLUS: 7492 case OP_MINPLUS: 7493 case OP_QUERY: 7494 case OP_MINQUERY: 7495 c = *ecode++ - OP_STAR; 7496 minimize = (c & 1) != 0; 7497 min = rep_min[c]; /* Pick up values from tables; */ 7498 max = rep_max[c]; /* zero for max => infinity */ 7499 if (max == 0) max = INT_MAX; 7500 7501 /* Common code for all repeated single-character matches. We can give 7502 up quickly if there are fewer than the minimum number of characters left in 7503 the subject. */ 7504 7505 REPEATCHAR: 7506#ifdef SUPPORT_UTF8 7507 if (md->utf8) 7508 { 7509 length = 1; 7510 charptr = ecode; 7511 GETCHARLEN(fc, ecode, length); 7512 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); 7513 ecode += length; 7514 7515 /* Handle multibyte character matching specially here. There is 7516 support for caseless matching if UCP support is present. */ 7517 7518 if (length > 1) 7519 { 7520 int oclength = 0; 7521 uschar occhars[8]; 7522 7523#ifdef SUPPORT_UCP 7524 int othercase; 7525 int chartype; 7526 if ((ims & PCRE_CASELESS) != 0 && 7527 ucp_findchar(fc, &chartype, &othercase) >= 0 && 7528 othercase > 0) 7529 oclength = ord2utf8(othercase, occhars); 7530#endif /* SUPPORT_UCP */ 7531 7532 for (i = 1; i <= min; i++) 7533 { 7534 if (memcmp(eptr, charptr, length) == 0) eptr += length; 7535 /* Need braces because of following else */ 7536 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); } 7537 else 7538 { 7539 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH); 7540 eptr += oclength; 7541 } 7542 } 7543 7544 if (min == max) continue; 7545 7546 if (minimize) 7547 { 7548 for (fi = min;; fi++) 7549 { 7550 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 7551 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 7552 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); 7553 if (memcmp(eptr, charptr, length) == 0) eptr += length; 7554 /* Need braces because of following else */ 7555 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); } 7556 else 7557 { 7558 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH); 7559 eptr += oclength; 7560 } 7561 } 7562 /* Control never gets here */ 7563 } 7564 else 7565 { 7566 pp = eptr; 7567 for (i = min; i < max; i++) 7568 { 7569 if (eptr > md->end_subject - length) break; 7570 if (memcmp(eptr, charptr, length) == 0) eptr += length; 7571 else if (oclength == 0) break; 7572 else 7573 { 7574 if (memcmp(eptr, occhars, oclength) != 0) break; 7575 eptr += oclength; 7576 } 7577 } 7578 while (eptr >= pp) 7579 { 7580 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 7581 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 7582 eptr -= length; 7583 } 7584 RRETURN(MATCH_NOMATCH); 7585 } 7586 /* Control never gets here */ 7587 } 7588 7589 /* If the length of a UTF-8 character is 1, we fall through here, and 7590 obey the code as for non-UTF-8 characters below, though in this case the 7591 value of fc will always be < 128. */ 7592 } 7593 else 7594#endif /* SUPPORT_UTF8 */ 7595 7596 /* When not in UTF-8 mode, load a single-byte character. */ 7597 { 7598 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); 7599 fc = *ecode++; 7600 } 7601 7602 /* The value of fc at this point is always less than 256, though we may or 7603 may not be in UTF-8 mode. The code is duplicated for the caseless and 7604 caseful cases, for speed, since matching characters is likely to be quite 7605 common. First, ensure the minimum number of matches are present. If min = 7606 max, continue at the same level without recursing. Otherwise, if 7607 minimizing, keep trying the rest of the expression and advancing one 7608 matching character if failing, up to the maximum. Alternatively, if 7609 maximizing, find the maximum number of characters and work backwards. */ 7610 7611 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max, 7612 max, eptr)); 7613 7614 if ((ims & PCRE_CASELESS) != 0) 7615 { 7616 fc = md->lcc[fc]; 7617 for (i = 1; i <= min; i++) 7618 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); 7619 if (min == max) continue; 7620 if (minimize) 7621 { 7622 for (fi = min;; fi++) 7623 { 7624 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 7625 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 7626 if (fi >= max || eptr >= md->end_subject || 7627 fc != md->lcc[*eptr++]) 7628 RRETURN(MATCH_NOMATCH); 7629 } 7630 /* Control never gets here */ 7631 } 7632 else 7633 { 7634 pp = eptr; 7635 for (i = min; i < max; i++) 7636 { 7637 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break; 7638 eptr++; 7639 } 7640 while (eptr >= pp) 7641 { 7642 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 7643 eptr--; 7644 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 7645 } 7646 RRETURN(MATCH_NOMATCH); 7647 } 7648 /* Control never gets here */ 7649 } 7650 7651 /* Caseful comparisons (includes all multi-byte characters) */ 7652 7653 else 7654 { 7655 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH); 7656 if (min == max) continue; 7657 if (minimize) 7658 { 7659 for (fi = min;; fi++) 7660 { 7661 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 7662 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 7663 if (fi >= max || eptr >= md->end_subject || fc != *eptr++) 7664 RRETURN(MATCH_NOMATCH); 7665 } 7666 /* Control never gets here */ 7667 } 7668 else 7669 { 7670 pp = eptr; 7671 for (i = min; i < max; i++) 7672 { 7673 if (eptr >= md->end_subject || fc != *eptr) break; 7674 eptr++; 7675 } 7676 while (eptr >= pp) 7677 { 7678 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 7679 eptr--; 7680 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 7681 } 7682 RRETURN(MATCH_NOMATCH); 7683 } 7684 } 7685 /* Control never gets here */ 7686 7687 /* Match a negated single one-byte character. The character we are 7688 checking can be multibyte. */ 7689 7690 case OP_NOT: 7691 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); 7692 ecode++; 7693 GETCHARINCTEST(c, eptr); 7694 if ((ims & PCRE_CASELESS) != 0) 7695 { 7696#ifdef SUPPORT_UTF8 7697 if (c < 256) 7698#endif 7699 c = md->lcc[c]; 7700 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH); 7701 } 7702 else 7703 { 7704 if (*ecode++ == c) RRETURN(MATCH_NOMATCH); 7705 } 7706 break; 7707 7708 /* Match a negated single one-byte character repeatedly. This is almost a 7709 repeat of the code for a repeated single character, but I haven't found a 7710 nice way of commoning these up that doesn't require a test of the 7711 positive/negative option for each character match. Maybe that wouldn't add 7712 very much to the time taken, but character matching *is* what this is all 7713 about... */ 7714 7715 case OP_NOTEXACT: 7716 min = max = GET2(ecode, 1); 7717 ecode += 3; 7718 goto REPEATNOTCHAR; 7719 7720 case OP_NOTUPTO: 7721 case OP_NOTMINUPTO: 7722 min = 0; 7723 max = GET2(ecode, 1); 7724 minimize = *ecode == OP_NOTMINUPTO; 7725 ecode += 3; 7726 goto REPEATNOTCHAR; 7727 7728 case OP_NOTSTAR: 7729 case OP_NOTMINSTAR: 7730 case OP_NOTPLUS: 7731 case OP_NOTMINPLUS: 7732 case OP_NOTQUERY: 7733 case OP_NOTMINQUERY: 7734 c = *ecode++ - OP_NOTSTAR; 7735 minimize = (c & 1) != 0; 7736 min = rep_min[c]; /* Pick up values from tables; */ 7737 max = rep_max[c]; /* zero for max => infinity */ 7738 if (max == 0) max = INT_MAX; 7739 7740 /* Common code for all repeated single-byte matches. We can give up quickly 7741 if there are fewer than the minimum number of bytes left in the 7742 subject. */ 7743 7744 REPEATNOTCHAR: 7745 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); 7746 fc = *ecode++; 7747 7748 /* The code is duplicated for the caseless and caseful cases, for speed, 7749 since matching characters is likely to be quite common. First, ensure the 7750 minimum number of matches are present. If min = max, continue at the same 7751 level without recursing. Otherwise, if minimizing, keep trying the rest of 7752 the expression and advancing one matching character if failing, up to the 7753 maximum. Alternatively, if maximizing, find the maximum number of 7754 characters and work backwards. */ 7755 7756 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max, 7757 max, eptr)); 7758 7759 if ((ims & PCRE_CASELESS) != 0) 7760 { 7761 fc = md->lcc[fc]; 7762 7763#ifdef SUPPORT_UTF8 7764 /* UTF-8 mode */ 7765 if (md->utf8) 7766 { 7767 register int d; 7768 for (i = 1; i <= min; i++) 7769 { 7770 GETCHARINC(d, eptr); 7771 if (d < 256) d = md->lcc[d]; 7772 if (fc == d) RRETURN(MATCH_NOMATCH); 7773 } 7774 } 7775 else 7776#endif 7777 7778 /* Not UTF-8 mode */ 7779 { 7780 for (i = 1; i <= min; i++) 7781 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); 7782 } 7783 7784 if (min == max) continue; 7785 7786 if (minimize) 7787 { 7788#ifdef SUPPORT_UTF8 7789 /* UTF-8 mode */ 7790 if (md->utf8) 7791 { 7792 register int d; 7793 for (fi = min;; fi++) 7794 { 7795 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 7796 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 7797 GETCHARINC(d, eptr); 7798 if (d < 256) d = md->lcc[d]; 7799 if (fi >= max || eptr >= md->end_subject || fc == d) 7800 RRETURN(MATCH_NOMATCH); 7801 } 7802 } 7803 else 7804#endif 7805 /* Not UTF-8 mode */ 7806 { 7807 for (fi = min;; fi++) 7808 { 7809 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 7810 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 7811 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++]) 7812 RRETURN(MATCH_NOMATCH); 7813 } 7814 } 7815 /* Control never gets here */ 7816 } 7817 7818 /* Maximize case */ 7819 7820 else 7821 { 7822 pp = eptr; 7823 7824#ifdef SUPPORT_UTF8 7825 /* UTF-8 mode */ 7826 if (md->utf8) 7827 { 7828 register int d; 7829 for (i = min; i < max; i++) 7830 { 7831 int len = 1; 7832 if (eptr >= md->end_subject) break; 7833 GETCHARLEN(d, eptr, len); 7834 if (d < 256) d = md->lcc[d]; 7835 if (fc == d) break; 7836 eptr += len; 7837 } 7838 for(;;) 7839 { 7840 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 7841 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 7842 if (eptr-- == pp) break; /* Stop if tried at original pos */ 7843 BACKCHAR(eptr); 7844 } 7845 } 7846 else 7847#endif 7848 /* Not UTF-8 mode */ 7849 { 7850 for (i = min; i < max; i++) 7851 { 7852 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break; 7853 eptr++; 7854 } 7855 while (eptr >= pp) 7856 { 7857 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 7858 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 7859 eptr--; 7860 } 7861 } 7862 7863 RRETURN(MATCH_NOMATCH); 7864 } 7865 /* Control never gets here */ 7866 } 7867 7868 /* Caseful comparisons */ 7869 7870 else 7871 { 7872#ifdef SUPPORT_UTF8 7873 /* UTF-8 mode */ 7874 if (md->utf8) 7875 { 7876 register int d; 7877 for (i = 1; i <= min; i++) 7878 { 7879 GETCHARINC(d, eptr); 7880 if (fc == d) RRETURN(MATCH_NOMATCH); 7881 } 7882 } 7883 else 7884#endif 7885 /* Not UTF-8 mode */ 7886 { 7887 for (i = 1; i <= min; i++) 7888 if (fc == *eptr++) RRETURN(MATCH_NOMATCH); 7889 } 7890 7891 if (min == max) continue; 7892 7893 if (minimize) 7894 { 7895#ifdef SUPPORT_UTF8 7896 /* UTF-8 mode */ 7897 if (md->utf8) 7898 { 7899 register int d; 7900 for (fi = min;; fi++) 7901 { 7902 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 7903 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 7904 GETCHARINC(d, eptr); 7905 if (fi >= max || eptr >= md->end_subject || fc == d) 7906 RRETURN(MATCH_NOMATCH); 7907 } 7908 } 7909 else 7910#endif 7911 /* Not UTF-8 mode */ 7912 { 7913 for (fi = min;; fi++) 7914 { 7915 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 7916 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 7917 if (fi >= max || eptr >= md->end_subject || fc == *eptr++) 7918 RRETURN(MATCH_NOMATCH); 7919 } 7920 } 7921 /* Control never gets here */ 7922 } 7923 7924 /* Maximize case */ 7925 7926 else 7927 { 7928 pp = eptr; 7929 7930#ifdef SUPPORT_UTF8 7931 /* UTF-8 mode */ 7932 if (md->utf8) 7933 { 7934 register int d; 7935 for (i = min; i < max; i++) 7936 { 7937 int len = 1; 7938 if (eptr >= md->end_subject) break; 7939 GETCHARLEN(d, eptr, len); 7940 if (fc == d) break; 7941 eptr += len; 7942 } 7943 for(;;) 7944 { 7945 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 7946 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 7947 if (eptr-- == pp) break; /* Stop if tried at original pos */ 7948 BACKCHAR(eptr); 7949 } 7950 } 7951 else 7952#endif 7953 /* Not UTF-8 mode */ 7954 { 7955 for (i = min; i < max; i++) 7956 { 7957 if (eptr >= md->end_subject || fc == *eptr) break; 7958 eptr++; 7959 } 7960 while (eptr >= pp) 7961 { 7962 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 7963 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 7964 eptr--; 7965 } 7966 } 7967 7968 RRETURN(MATCH_NOMATCH); 7969 } 7970 } 7971 /* Control never gets here */ 7972 7973 /* Match a single character type repeatedly; several different opcodes 7974 share code. This is very similar to the code for single characters, but we 7975 repeat it in the interests of efficiency. */ 7976 7977 case OP_TYPEEXACT: 7978 min = max = GET2(ecode, 1); 7979 minimize = TRUE; 7980 ecode += 3; 7981 goto REPEATTYPE; 7982 7983 case OP_TYPEUPTO: 7984 case OP_TYPEMINUPTO: 7985 min = 0; 7986 max = GET2(ecode, 1); 7987 minimize = *ecode == OP_TYPEMINUPTO; 7988 ecode += 3; 7989 goto REPEATTYPE; 7990 7991 case OP_TYPESTAR: 7992 case OP_TYPEMINSTAR: 7993 case OP_TYPEPLUS: 7994 case OP_TYPEMINPLUS: 7995 case OP_TYPEQUERY: 7996 case OP_TYPEMINQUERY: 7997 c = *ecode++ - OP_TYPESTAR; 7998 minimize = (c & 1) != 0; 7999 min = rep_min[c]; /* Pick up values from tables; */ 8000 max = rep_max[c]; /* zero for max => infinity */ 8001 if (max == 0) max = INT_MAX; 8002 8003 /* Common code for all repeated single character type matches. Note that 8004 in UTF-8 mode, '.' matches a character of any length, but for the other 8005 character types, the valid characters are all one-byte long. */ 8006 8007 REPEATTYPE: 8008 ctype = *ecode++; /* Code for the character type */ 8009 8010#ifdef SUPPORT_UCP 8011 if (ctype == OP_PROP || ctype == OP_NOTPROP) 8012 { 8013 prop_fail_result = ctype == OP_NOTPROP; 8014 prop_type = *ecode++; 8015 if (prop_type >= 128) 8016 { 8017 prop_test_against = prop_type - 128; 8018 prop_test_variable = &prop_category; 8019 } 8020 else 8021 { 8022 prop_test_against = prop_type; 8023 prop_test_variable = &prop_chartype; 8024 } 8025 } 8026 else prop_type = -1; 8027#endif 8028 8029 /* First, ensure the minimum number of matches are present. Use inline 8030 code for maximizing the speed, and do the type test once at the start 8031 (i.e. keep it out of the loop). Also we can test that there are at least 8032 the minimum number of bytes before we start. This isn't as effective in 8033 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that 8034 is tidier. Also separate the UCP code, which can be the same for both UTF-8 8035 and single-bytes. */ 8036 8037 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); 8038 if (min > 0) 8039 { 8040#ifdef SUPPORT_UCP 8041 if (prop_type > 0) 8042 { 8043 for (i = 1; i <= min; i++) 8044 { 8045 GETCHARINC(c, eptr); 8046 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase); 8047 if ((*prop_test_variable == prop_test_against) == prop_fail_result) 8048 RRETURN(MATCH_NOMATCH); 8049 } 8050 } 8051 8052 /* Match extended Unicode sequences. We will get here only if the 8053 support is in the binary; otherwise a compile-time error occurs. */ 8054 8055 else if (ctype == OP_EXTUNI) 8056 { 8057 for (i = 1; i <= min; i++) 8058 { 8059 GETCHARINCTEST(c, eptr); 8060 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase); 8061 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH); 8062 while (eptr < md->end_subject) 8063 { 8064 int len = 1; 8065 if (!md->utf8) c = *eptr; else 8066 { 8067 GETCHARLEN(c, eptr, len); 8068 } 8069 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase); 8070 if (prop_category != ucp_M) break; 8071 eptr += len; 8072 } 8073 } 8074 } 8075 8076 else 8077#endif /* SUPPORT_UCP */ 8078 8079/* Handle all other cases when the coding is UTF-8 */ 8080 8081#ifdef SUPPORT_UTF8 8082 if (md->utf8) switch(ctype) 8083 { 8084 case OP_ANY: 8085 for (i = 1; i <= min; i++) 8086 { 8087 if (eptr >= md->end_subject || 8088 (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0)) 8089 RRETURN(MATCH_NOMATCH); 8090 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; 8091 } 8092 break; 8093 8094 case OP_ANYBYTE: 8095 eptr += min; 8096 break; 8097 8098 case OP_NOT_DIGIT: 8099 for (i = 1; i <= min; i++) 8100 { 8101 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); 8102 GETCHARINC(c, eptr); 8103 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0) 8104 RRETURN(MATCH_NOMATCH); 8105 } 8106 break; 8107 8108 case OP_DIGIT: 8109 for (i = 1; i <= min; i++) 8110 { 8111 if (eptr >= md->end_subject || 8112 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0) 8113 RRETURN(MATCH_NOMATCH); 8114 /* No need to skip more bytes - we know it's a 1-byte character */ 8115 } 8116 break; 8117 8118 case OP_NOT_WHITESPACE: 8119 for (i = 1; i <= min; i++) 8120 { 8121 if (eptr >= md->end_subject || 8122 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0)) 8123 RRETURN(MATCH_NOMATCH); 8124 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; 8125 } 8126 break; 8127 8128 case OP_WHITESPACE: 8129 for (i = 1; i <= min; i++) 8130 { 8131 if (eptr >= md->end_subject || 8132 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0) 8133 RRETURN(MATCH_NOMATCH); 8134 /* No need to skip more bytes - we know it's a 1-byte character */ 8135 } 8136 break; 8137 8138 case OP_NOT_WORDCHAR: 8139 for (i = 1; i <= min; i++) 8140 { 8141 if (eptr >= md->end_subject || 8142 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0)) 8143 RRETURN(MATCH_NOMATCH); 8144 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; 8145 } 8146 break; 8147 8148 case OP_WORDCHAR: 8149 for (i = 1; i <= min; i++) 8150 { 8151 if (eptr >= md->end_subject || 8152 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0) 8153 RRETURN(MATCH_NOMATCH); 8154 /* No need to skip more bytes - we know it's a 1-byte character */ 8155 } 8156 break; 8157 8158 default: 8159 RRETURN(PCRE_ERROR_INTERNAL); 8160 } /* End switch(ctype) */ 8161 8162 else 8163#endif /* SUPPORT_UTF8 */ 8164 8165 /* Code for the non-UTF-8 case for minimum matching of operators other 8166 than OP_PROP and OP_NOTPROP. */ 8167 8168 switch(ctype) 8169 { 8170 case OP_ANY: 8171 if ((ims & PCRE_DOTALL) == 0) 8172 { 8173 for (i = 1; i <= min; i++) 8174 if (*eptr++ == NEWLINE) RRETURN(MATCH_NOMATCH); 8175 } 8176 else eptr += min; 8177 break; 8178 8179 case OP_ANYBYTE: 8180 eptr += min; 8181 break; 8182 8183 case OP_NOT_DIGIT: 8184 for (i = 1; i <= min; i++) 8185 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); 8186 break; 8187 8188 case OP_DIGIT: 8189 for (i = 1; i <= min; i++) 8190 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); 8191 break; 8192 8193 case OP_NOT_WHITESPACE: 8194 for (i = 1; i <= min; i++) 8195 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); 8196 break; 8197 8198 case OP_WHITESPACE: 8199 for (i = 1; i <= min; i++) 8200 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); 8201 break; 8202 8203 case OP_NOT_WORDCHAR: 8204 for (i = 1; i <= min; i++) 8205 if ((md->ctypes[*eptr++] & ctype_word) != 0) 8206 RRETURN(MATCH_NOMATCH); 8207 break; 8208 8209 case OP_WORDCHAR: 8210 for (i = 1; i <= min; i++) 8211 if ((md->ctypes[*eptr++] & ctype_word) == 0) 8212 RRETURN(MATCH_NOMATCH); 8213 break; 8214 8215 default: 8216 RRETURN(PCRE_ERROR_INTERNAL); 8217 } 8218 } 8219 8220 /* If min = max, continue at the same level without recursing */ 8221 8222 if (min == max) continue; 8223 8224 /* If minimizing, we have to test the rest of the pattern before each 8225 subsequent match. Again, separate the UTF-8 case for speed, and also 8226 separate the UCP cases. */ 8227 8228 if (minimize) 8229 { 8230#ifdef SUPPORT_UCP 8231 if (prop_type > 0) 8232 { 8233 for (fi = min;; fi++) 8234 { 8235 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 8236 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 8237 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); 8238 GETCHARINC(c, eptr); 8239 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase); 8240 if ((*prop_test_variable == prop_test_against) == prop_fail_result) 8241 RRETURN(MATCH_NOMATCH); 8242 } 8243 } 8244 8245 /* Match extended Unicode sequences. We will get here only if the 8246 support is in the binary; otherwise a compile-time error occurs. */ 8247 8248 else if (ctype == OP_EXTUNI) 8249 { 8250 for (fi = min;; fi++) 8251 { 8252 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 8253 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 8254 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); 8255 GETCHARINCTEST(c, eptr); 8256 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase); 8257 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH); 8258 while (eptr < md->end_subject) 8259 { 8260 int len = 1; 8261 if (!md->utf8) c = *eptr; else 8262 { 8263 GETCHARLEN(c, eptr, len); 8264 } 8265 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase); 8266 if (prop_category != ucp_M) break; 8267 eptr += len; 8268 } 8269 } 8270 } 8271 8272 else 8273#endif /* SUPPORT_UCP */ 8274 8275#ifdef SUPPORT_UTF8 8276 /* UTF-8 mode */ 8277 if (md->utf8) 8278 { 8279 for (fi = min;; fi++) 8280 { 8281 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 8282 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 8283 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); 8284 8285 GETCHARINC(c, eptr); 8286 switch(ctype) 8287 { 8288 case OP_ANY: 8289 if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH); 8290 break; 8291 8292 case OP_ANYBYTE: 8293 break; 8294 8295 case OP_NOT_DIGIT: 8296 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) 8297 RRETURN(MATCH_NOMATCH); 8298 break; 8299 8300 case OP_DIGIT: 8301 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0) 8302 RRETURN(MATCH_NOMATCH); 8303 break; 8304 8305 case OP_NOT_WHITESPACE: 8306 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) 8307 RRETURN(MATCH_NOMATCH); 8308 break; 8309 8310 case OP_WHITESPACE: 8311 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0) 8312 RRETURN(MATCH_NOMATCH); 8313 break; 8314 8315 case OP_NOT_WORDCHAR: 8316 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) 8317 RRETURN(MATCH_NOMATCH); 8318 break; 8319 8320 case OP_WORDCHAR: 8321 if (c >= 256 && (md->ctypes[c] & ctype_word) == 0) 8322 RRETURN(MATCH_NOMATCH); 8323 break; 8324 8325 default: 8326 RRETURN(PCRE_ERROR_INTERNAL); 8327 } 8328 } 8329 } 8330 else 8331#endif 8332 /* Not UTF-8 mode */ 8333 { 8334 for (fi = min;; fi++) 8335 { 8336 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 8337 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 8338 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); 8339 c = *eptr++; 8340 switch(ctype) 8341 { 8342 case OP_ANY: 8343 if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH); 8344 break; 8345 8346 case OP_ANYBYTE: 8347 break; 8348 8349 case OP_NOT_DIGIT: 8350 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); 8351 break; 8352 8353 case OP_DIGIT: 8354 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); 8355 break; 8356 8357 case OP_NOT_WHITESPACE: 8358 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); 8359 break; 8360 8361 case OP_WHITESPACE: 8362 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); 8363 break; 8364 8365 case OP_NOT_WORDCHAR: 8366 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH); 8367 break; 8368 8369 case OP_WORDCHAR: 8370 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH); 8371 break; 8372 8373 default: 8374 RRETURN(PCRE_ERROR_INTERNAL); 8375 } 8376 } 8377 } 8378 /* Control never gets here */ 8379 } 8380 8381 /* If maximizing it is worth using inline code for speed, doing the type 8382 test once at the start (i.e. keep it out of the loop). Again, keep the 8383 UTF-8 and UCP stuff separate. */ 8384 8385 else 8386 { 8387 pp = eptr; /* Remember where we started */ 8388 8389#ifdef SUPPORT_UCP 8390 if (prop_type > 0) 8391 { 8392 for (i = min; i < max; i++) 8393 { 8394 int len = 1; 8395 if (eptr >= md->end_subject) break; 8396 GETCHARLEN(c, eptr, len); 8397 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase); 8398 if ((*prop_test_variable == prop_test_against) == prop_fail_result) 8399 break; 8400 eptr+= len; 8401 } 8402 8403 /* eptr is now past the end of the maximum run */ 8404 8405 for(;;) 8406 { 8407 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 8408 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 8409 if (eptr-- == pp) break; /* Stop if tried at original pos */ 8410 BACKCHAR(eptr); 8411 } 8412 } 8413 8414 /* Match extended Unicode sequences. We will get here only if the 8415 support is in the binary; otherwise a compile-time error occurs. */ 8416 8417 else if (ctype == OP_EXTUNI) 8418 { 8419 for (i = min; i < max; i++) 8420 { 8421 if (eptr >= md->end_subject) break; 8422 GETCHARINCTEST(c, eptr); 8423 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase); 8424 if (prop_category == ucp_M) break; 8425 while (eptr < md->end_subject) 8426 { 8427 int len = 1; 8428 if (!md->utf8) c = *eptr; else 8429 { 8430 GETCHARLEN(c, eptr, len); 8431 } 8432 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase); 8433 if (prop_category != ucp_M) break; 8434 eptr += len; 8435 } 8436 } 8437 8438 /* eptr is now past the end of the maximum run */ 8439 8440 for(;;) 8441 { 8442 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 8443 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 8444 if (eptr-- == pp) break; /* Stop if tried at original pos */ 8445 for (;;) /* Move back over one extended */ 8446 { 8447 int len = 1; 8448 BACKCHAR(eptr); 8449 if (!md->utf8) c = *eptr; else 8450 { 8451 GETCHARLEN(c, eptr, len); 8452 } 8453 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase); 8454 if (prop_category != ucp_M) break; 8455 eptr--; 8456 } 8457 } 8458 } 8459 8460 else 8461#endif /* SUPPORT_UCP */ 8462 8463#ifdef SUPPORT_UTF8 8464 /* UTF-8 mode */ 8465 8466 if (md->utf8) 8467 { 8468 switch(ctype) 8469 { 8470 case OP_ANY: 8471 8472 /* Special code is required for UTF8, but when the maximum is unlimited 8473 we don't need it, so we repeat the non-UTF8 code. This is probably 8474 worth it, because .* is quite a common idiom. */ 8475 8476 if (max < INT_MAX) 8477 { 8478 if ((ims & PCRE_DOTALL) == 0) 8479 { 8480 for (i = min; i < max; i++) 8481 { 8482 if (eptr >= md->end_subject || *eptr == NEWLINE) break; 8483 eptr++; 8484 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; 8485 } 8486 } 8487 else 8488 { 8489 for (i = min; i < max; i++) 8490 { 8491 eptr++; 8492 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; 8493 } 8494 } 8495 } 8496 8497 /* Handle unlimited UTF-8 repeat */ 8498 8499 else 8500 { 8501 if ((ims & PCRE_DOTALL) == 0) 8502 { 8503 for (i = min; i < max; i++) 8504 { 8505 if (eptr >= md->end_subject || *eptr == NEWLINE) break; 8506 eptr++; 8507 } 8508 break; 8509 } 8510 else 8511 { 8512 c = max - min; 8513 if (c > md->end_subject - eptr) c = md->end_subject - eptr; 8514 eptr += c; 8515 } 8516 } 8517 break; 8518 8519 /* The byte case is the same as non-UTF8 */ 8520 8521 case OP_ANYBYTE: 8522 c = max - min; 8523 if (c > md->end_subject - eptr) c = md->end_subject - eptr; 8524 eptr += c; 8525 break; 8526 8527 case OP_NOT_DIGIT: 8528 for (i = min; i < max; i++) 8529 { 8530 int len = 1; 8531 if (eptr >= md->end_subject) break; 8532 GETCHARLEN(c, eptr, len); 8533 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break; 8534 eptr+= len; 8535 } 8536 break; 8537 8538 case OP_DIGIT: 8539 for (i = min; i < max; i++) 8540 { 8541 int len = 1; 8542 if (eptr >= md->end_subject) break; 8543 GETCHARLEN(c, eptr, len); 8544 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break; 8545 eptr+= len; 8546 } 8547 break; 8548 8549 case OP_NOT_WHITESPACE: 8550 for (i = min; i < max; i++) 8551 { 8552 int len = 1; 8553 if (eptr >= md->end_subject) break; 8554 GETCHARLEN(c, eptr, len); 8555 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break; 8556 eptr+= len; 8557 } 8558 break; 8559 8560 case OP_WHITESPACE: 8561 for (i = min; i < max; i++) 8562 { 8563 int len = 1; 8564 if (eptr >= md->end_subject) break; 8565 GETCHARLEN(c, eptr, len); 8566 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break; 8567 eptr+= len; 8568 } 8569 break; 8570 8571 case OP_NOT_WORDCHAR: 8572 for (i = min; i < max; i++) 8573 { 8574 int len = 1; 8575 if (eptr >= md->end_subject) break; 8576 GETCHARLEN(c, eptr, len); 8577 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break; 8578 eptr+= len; 8579 } 8580 break; 8581 8582 case OP_WORDCHAR: 8583 for (i = min; i < max; i++) 8584 { 8585 int len = 1; 8586 if (eptr >= md->end_subject) break; 8587 GETCHARLEN(c, eptr, len); 8588 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break; 8589 eptr+= len; 8590 } 8591 break; 8592 8593 default: 8594 RRETURN(PCRE_ERROR_INTERNAL); 8595 } 8596 8597 /* eptr is now past the end of the maximum run */ 8598 8599 for(;;) 8600 { 8601 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 8602 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 8603 if (eptr-- == pp) break; /* Stop if tried at original pos */ 8604 BACKCHAR(eptr); 8605 } 8606 } 8607 else 8608#endif 8609 8610 /* Not UTF-8 mode */ 8611 { 8612 switch(ctype) 8613 { 8614 case OP_ANY: 8615 if ((ims & PCRE_DOTALL) == 0) 8616 { 8617 for (i = min; i < max; i++) 8618 { 8619 if (eptr >= md->end_subject || *eptr == NEWLINE) break; 8620 eptr++; 8621 } 8622 break; 8623 } 8624 /* For DOTALL case, fall through and treat as \C */ 8625 8626 case OP_ANYBYTE: 8627 c = max - min; 8628 if (c > md->end_subject - eptr) c = md->end_subject - eptr; 8629 eptr += c; 8630 break; 8631 8632 case OP_NOT_DIGIT: 8633 for (i = min; i < max; i++) 8634 { 8635 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0) 8636 break; 8637 eptr++; 8638 } 8639 break; 8640 8641 case OP_DIGIT: 8642 for (i = min; i < max; i++) 8643 { 8644 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0) 8645 break; 8646 eptr++; 8647 } 8648 break; 8649 8650 case OP_NOT_WHITESPACE: 8651 for (i = min; i < max; i++) 8652 { 8653 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0) 8654 break; 8655 eptr++; 8656 } 8657 break; 8658 8659 case OP_WHITESPACE: 8660 for (i = min; i < max; i++) 8661 { 8662 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0) 8663 break; 8664 eptr++; 8665 } 8666 break; 8667 8668 case OP_NOT_WORDCHAR: 8669 for (i = min; i < max; i++) 8670 { 8671 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0) 8672 break; 8673 eptr++; 8674 } 8675 break; 8676 8677 case OP_WORDCHAR: 8678 for (i = min; i < max; i++) 8679 { 8680 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0) 8681 break; 8682 eptr++; 8683 } 8684 break; 8685 8686 default: 8687 RRETURN(PCRE_ERROR_INTERNAL); 8688 } 8689 8690 /* eptr is now past the end of the maximum run */ 8691 8692 while (eptr >= pp) 8693 { 8694 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0); 8695 eptr--; 8696 if (rrc != MATCH_NOMATCH) RRETURN(rrc); 8697 } 8698 } 8699 8700 /* Get here if we can't make it match with any permitted repetitions */ 8701 8702 RRETURN(MATCH_NOMATCH); 8703 } 8704 /* Control never gets here */ 8705 8706 /* There's been some horrible disaster. Since all codes > OP_BRA are 8707 for capturing brackets, and there shouldn't be any gaps between 0 and 8708 OP_BRA, arrival here can only mean there is something seriously wrong 8709 in the code above or the OP_xxx definitions. */ 8710 8711 default: 8712 DPRINTF(("Unknown opcode %d\n", *ecode)); 8713 RRETURN(PCRE_ERROR_UNKNOWN_NODE); 8714 } 8715 8716 /* Do not stick any code in here without much thought; it is assumed 8717 that "continue" in the code above comes out to here to repeat the main 8718 loop. */ 8719 8720 } /* End of main loop */ 8721/* Control never reaches here */ 8722} 8723 8724 8725/*************************************************************************** 8726**************************************************************************** 8727 RECURSION IN THE match() FUNCTION 8728 8729Undefine all the macros that were defined above to handle this. */ 8730 8731#ifdef NO_RECURSE 8732#undef eptr 8733#undef ecode 8734#undef offset_top 8735#undef ims 8736#undef eptrb 8737#undef flags 8738 8739#undef callpat 8740#undef charptr 8741#undef data 8742#undef next 8743#undef pp 8744#undef prev 8745#undef saved_eptr 8746 8747#undef new_recursive 8748 8749#undef cur_is_word 8750#undef condition 8751#undef minimize 8752#undef prev_is_word 8753 8754#undef original_ims 8755 8756#undef ctype 8757#undef length 8758#undef max 8759#undef min 8760#undef number 8761#undef offset 8762#undef op 8763#undef save_capture_last 8764#undef save_offset1 8765#undef save_offset2 8766#undef save_offset3 8767#undef stacksave 8768 8769#undef newptrb 8770 8771#endif 8772 8773/* These two are defined as macros in both cases */ 8774 8775#undef fc 8776#undef fi 8777 8778/*************************************************************************** 8779***************************************************************************/ 8780 8781 8782 8783/************************************************* 8784* Execute a Regular Expression * 8785*************************************************/ 8786 8787/* This function applies a compiled re to a subject string and picks out 8788portions of the string if it matches. Two elements in the vector are set for 8789each substring: the offsets to the start and end of the substring. 8790 8791Arguments: 8792 argument_re points to the compiled expression 8793 extra_data points to extra data or is NULL 8794 subject points to the subject string 8795 length length of subject string (may contain binary zeros) 8796 start_offset where to start in the subject string 8797 options option bits 8798 offsets points to a vector of ints to be filled in with offsets 8799 offsetcount the number of elements in the vector 8800 8801Returns: > 0 => success; value is the number of elements filled in 8802 = 0 => success, but offsets is not big enough 8803 -1 => failed to match 8804 < -1 => some kind of unexpected problem 8805*/ 8806 8807EXPORT int 8808pcre_exec(const pcre *argument_re, const pcre_extra *extra_data, 8809 const char *subject, int length, int start_offset, int options, int *offsets, 8810 int offsetcount) 8811{ 8812int rc, resetcount, ocount; 8813int first_byte = -1; 8814int req_byte = -1; 8815int req_byte2 = -1; 8816unsigned long int ims = 0; 8817BOOL using_temporary_offsets = FALSE; 8818BOOL anchored; 8819BOOL startline; 8820BOOL first_byte_caseless = FALSE; 8821BOOL req_byte_caseless = FALSE; 8822match_data match_block; 8823const uschar *tables; 8824const uschar *start_bits = NULL; 8825const uschar *start_match = (const uschar *)subject + start_offset; 8826const uschar *end_subject; 8827const uschar *req_byte_ptr = start_match - 1; 8828 8829pcre_study_data internal_study; 8830const pcre_study_data *study; 8831 8832real_pcre internal_re; 8833const real_pcre *external_re = (const real_pcre *)argument_re; 8834const real_pcre *re = external_re; 8835 8836/* Plausibility checks */ 8837 8838if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION; 8839if (re == NULL || subject == NULL || 8840 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL; 8841if (offsetcount < 0) return PCRE_ERROR_BADCOUNT; 8842 8843/* Fish out the optional data from the extra_data structure, first setting 8844the default values. */ 8845 8846study = NULL; 8847match_block.match_limit = MATCH_LIMIT; 8848match_block.callout_data = NULL; 8849 8850/* The table pointer is always in native byte order. */ 8851 8852tables = external_re->tables; 8853 8854if (extra_data != NULL) 8855 { 8856 register unsigned int flags = extra_data->flags; 8857 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0) 8858 study = (const pcre_study_data *)extra_data->study_data; 8859 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) 8860 match_block.match_limit = extra_data->match_limit; 8861 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0) 8862 match_block.callout_data = extra_data->callout_data; 8863 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables; 8864 } 8865 8866/* If the exec call supplied NULL for tables, use the inbuilt ones. This 8867is a feature that makes it possible to save compiled regex and re-use them 8868in other programs later. */ 8869 8870if (tables == NULL) tables = pcre_default_tables; 8871 8872/* Check that the first field in the block is the magic number. If it is not, 8873test for a regex that was compiled on a host of opposite endianness. If this is 8874the case, flipped values are put in internal_re and internal_study if there was 8875study data too. */ 8876 8877if (re->magic_number != MAGIC_NUMBER) 8878 { 8879 re = try_flipped(re, &internal_re, study, &internal_study); 8880 if (re == NULL) return PCRE_ERROR_BADMAGIC; 8881 if (study != NULL) study = &internal_study; 8882 } 8883 8884/* Set up other data */ 8885 8886anchored = ((re->options | options) & PCRE_ANCHORED) != 0; 8887startline = (re->options & PCRE_STARTLINE) != 0; 8888 8889/* The code starts after the real_pcre block and the capture name table. */ 8890 8891match_block.start_code = (const uschar *)external_re + re->name_table_offset + 8892 re->name_count * re->name_entry_size; 8893 8894match_block.start_subject = (const uschar *)subject; 8895match_block.start_offset = start_offset; 8896match_block.end_subject = match_block.start_subject + length; 8897end_subject = match_block.end_subject; 8898 8899match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0; 8900match_block.utf8 = (re->options & PCRE_UTF8) != 0; 8901 8902match_block.notbol = (options & PCRE_NOTBOL) != 0; 8903match_block.noteol = (options & PCRE_NOTEOL) != 0; 8904match_block.notempty = (options & PCRE_NOTEMPTY) != 0; 8905match_block.partial = (options & PCRE_PARTIAL) != 0; 8906match_block.hitend = FALSE; 8907 8908match_block.recursive = NULL; /* No recursion at top level */ 8909 8910match_block.lcc = tables + lcc_offset; 8911match_block.ctypes = tables + ctypes_offset; 8912 8913/* Partial matching is supported only for a restricted set of regexes at the 8914moment. */ 8915 8916if (match_block.partial && (re->options & PCRE_NOPARTIAL) != 0) 8917 return PCRE_ERROR_BADPARTIAL; 8918 8919/* Check a UTF-8 string if required. Unfortunately there's no way of passing 8920back the character offset. */ 8921 8922#ifdef SUPPORT_UTF8 8923if (match_block.utf8 && (options & PCRE_NO_UTF8_CHECK) == 0) 8924 { 8925 if (valid_utf8((uschar *)subject, length) >= 0) 8926 return PCRE_ERROR_BADUTF8; 8927 if (start_offset > 0 && start_offset < length) 8928 { 8929 int tb = ((uschar *)subject)[start_offset]; 8930 if (tb > 127) 8931 { 8932 tb &= 0xc0; 8933 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET; 8934 } 8935 } 8936 } 8937#endif 8938 8939/* The ims options can vary during the matching as a result of the presence 8940of (?ims) items in the pattern. They are kept in a local variable so that 8941restoring at the exit of a group is easy. */ 8942 8943ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL); 8944 8945/* If the expression has got more back references than the offsets supplied can 8946hold, we get a temporary chunk of working store to use during the matching. 8947Otherwise, we can use the vector supplied, rounding down its size to a multiple 8948of 3. */ 8949 8950ocount = offsetcount - (offsetcount % 3); 8951 8952if (re->top_backref > 0 && re->top_backref >= ocount/3) 8953 { 8954 ocount = re->top_backref * 3 + 3; 8955 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int)); 8956 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY; 8957 using_temporary_offsets = TRUE; 8958 DPRINTF(("Got memory to hold back references\n")); 8959 } 8960else match_block.offset_vector = offsets; 8961 8962match_block.offset_end = ocount; 8963match_block.offset_max = (2*ocount)/3; 8964match_block.offset_overflow = FALSE; 8965match_block.capture_last = -1; 8966 8967/* Compute the minimum number of offsets that we need to reset each time. Doing 8968this makes a huge difference to execution time when there aren't many brackets 8969in the pattern. */ 8970 8971resetcount = 2 + re->top_bracket * 2; 8972if (resetcount > offsetcount) resetcount = ocount; 8973 8974/* Reset the working variable associated with each extraction. These should 8975never be used unless previously set, but they get saved and restored, and so we 8976initialize them to avoid reading uninitialized locations. */ 8977 8978if (match_block.offset_vector != NULL) 8979 { 8980 register int *iptr = match_block.offset_vector + ocount; 8981 register int *iend = iptr - resetcount/2 + 1; 8982 while (--iptr >= iend) *iptr = -1; 8983 } 8984 8985/* Set up the first character to match, if available. The first_byte value is 8986never set for an anchored regular expression, but the anchoring may be forced 8987at run time, so we have to test for anchoring. The first char may be unset for 8988an unanchored pattern, of course. If there's no first char and the pattern was 8989studied, there may be a bitmap of possible first characters. */ 8990 8991if (!anchored) 8992 { 8993 if ((re->options & PCRE_FIRSTSET) != 0) 8994 { 8995 first_byte = re->first_byte & 255; 8996 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE) 8997 first_byte = match_block.lcc[first_byte]; 8998 } 8999 else 9000 if (!startline && study != NULL && 9001 (study->options & PCRE_STUDY_MAPPED) != 0) 9002 start_bits = study->start_bits; 9003 } 9004 9005/* For anchored or unanchored matches, there may be a "last known required 9006character" set. */ 9007 9008if ((re->options & PCRE_REQCHSET) != 0) 9009 { 9010 req_byte = re->req_byte & 255; 9011 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0; 9012 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */ 9013 } 9014 9015/* Loop for handling unanchored repeated matching attempts; for anchored regexs 9016the loop runs just once. */ 9017 9018do 9019 { 9020 /* Reset the maximum number of extractions we might see. */ 9021 9022 if (match_block.offset_vector != NULL) 9023 { 9024 register int *iptr = match_block.offset_vector; 9025 register int *iend = iptr + resetcount; 9026 while (iptr < iend) *iptr++ = -1; 9027 } 9028 9029 /* Advance to a unique first char if possible */ 9030 9031 if (first_byte >= 0) 9032 { 9033 if (first_byte_caseless) 9034 while (start_match < end_subject && 9035 match_block.lcc[*start_match] != first_byte) 9036 start_match++; 9037 else 9038 while (start_match < end_subject && *start_match != first_byte) 9039 start_match++; 9040 } 9041 9042 /* Or to just after \n for a multiline match if possible */ 9043 9044 else if (startline) 9045 { 9046 if (start_match > match_block.start_subject + start_offset) 9047 { 9048 while (start_match < end_subject && start_match[-1] != NEWLINE) 9049 start_match++; 9050 } 9051 } 9052 9053 /* Or to a non-unique first char after study */ 9054 9055 else if (start_bits != NULL) 9056 { 9057 while (start_match < end_subject) 9058 { 9059 register unsigned int c = *start_match; 9060 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break; 9061 } 9062 } 9063 9064#ifdef DEBUG /* Sigh. Some compilers never learn. */ 9065 printf(">>>> Match against: "); 9066 pchars(start_match, end_subject - start_match, TRUE, &match_block); 9067 printf("\n"); 9068#endif 9069 9070 /* If req_byte is set, we know that that character must appear in the subject 9071 for the match to succeed. If the first character is set, req_byte must be 9072 later in the subject; otherwise the test starts at the match point. This 9073 optimization can save a huge amount of backtracking in patterns with nested 9074 unlimited repeats that aren't going to match. Writing separate code for 9075 cased/caseless versions makes it go faster, as does using an autoincrement 9076 and backing off on a match. 9077 9078 HOWEVER: when the subject string is very, very long, searching to its end can 9079 take a long time, and give bad performance on quite ordinary patterns. This 9080 showed up when somebody was matching /^C/ on a 32-megabyte string... so we 9081 don't do this when the string is sufficiently long. 9082 9083 ALSO: this processing is disabled when partial matching is requested. 9084 */ 9085 9086 if (req_byte >= 0 && 9087 end_subject - start_match < REQ_BYTE_MAX && 9088 !match_block.partial) 9089 { 9090 register const uschar *p = start_match + ((first_byte >= 0)? 1 : 0); 9091 9092 /* We don't need to repeat the search if we haven't yet reached the 9093 place we found it at last time. */ 9094 9095 if (p > req_byte_ptr) 9096 { 9097 if (req_byte_caseless) 9098 { 9099 while (p < end_subject) 9100 { 9101 register int pp = *p++; 9102 if (pp == req_byte || pp == req_byte2) { p--; break; } 9103 } 9104 } 9105 else 9106 { 9107 while (p < end_subject) 9108 { 9109 if (*p++ == req_byte) { p--; break; } 9110 } 9111 } 9112 9113 /* If we can't find the required character, break the matching loop */ 9114 9115 if (p >= end_subject) break; 9116 9117 /* If we have found the required character, save the point where we 9118 found it, so that we don't search again next time round the loop if 9119 the start hasn't passed this character yet. */ 9120 9121 req_byte_ptr = p; 9122 } 9123 } 9124 9125 /* When a match occurs, substrings will be set for all internal extractions; 9126 we just need to set up the whole thing as substring 0 before returning. If 9127 there were too many extractions, set the return code to zero. In the case 9128 where we had to get some local store to hold offsets for backreferences, copy 9129 those back references that we can. In this case there need not be overflow 9130 if certain parts of the pattern were not used. */ 9131 9132 match_block.start_match = start_match; 9133 match_block.match_call_count = 0; 9134 9135 rc = match(start_match, match_block.start_code, 2, &match_block, ims, NULL, 9136 match_isgroup); 9137 9138 if (rc == MATCH_NOMATCH) 9139 { 9140 start_match++; 9141#ifdef SUPPORT_UTF8 9142 if (match_block.utf8) 9143 while(start_match < end_subject && (*start_match & 0xc0) == 0x80) 9144 start_match++; 9145#endif 9146 continue; 9147 } 9148 9149 if (rc != MATCH_MATCH) 9150 { 9151 DPRINTF((">>>> error: returning %d\n", rc)); 9152 return rc; 9153 } 9154 9155 /* We have a match! Copy the offset information from temporary store if 9156 necessary */ 9157 9158 if (using_temporary_offsets) 9159 { 9160 if (offsetcount >= 4) 9161 { 9162 memcpy(offsets + 2, match_block.offset_vector + 2, 9163 (offsetcount - 2) * sizeof(int)); 9164 DPRINTF(("Copied offsets from temporary memory\n")); 9165 } 9166 if (match_block.end_offset_top > offsetcount) 9167 match_block.offset_overflow = TRUE; 9168 9169 DPRINTF(("Freeing temporary memory\n")); 9170 (pcre_free)(match_block.offset_vector); 9171 } 9172 9173 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2; 9174 9175 if (offsetcount < 2) rc = 0; else 9176 { 9177 offsets[0] = start_match - match_block.start_subject; 9178 offsets[1] = match_block.end_match_ptr - match_block.start_subject; 9179 } 9180 9181 DPRINTF((">>>> returning %d\n", rc)); 9182 return rc; 9183 } 9184 9185/* This "while" is the end of the "do" above */ 9186 9187while (!anchored && start_match <= end_subject); 9188 9189if (using_temporary_offsets) 9190 { 9191 DPRINTF(("Freeing temporary memory\n")); 9192 (pcre_free)(match_block.offset_vector); 9193 } 9194 9195if (match_block.partial && match_block.hitend) 9196 { 9197 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n")); 9198 return PCRE_ERROR_PARTIAL; 9199 } 9200else 9201 { 9202 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n")); 9203 return PCRE_ERROR_NOMATCH; 9204 } 9205} 9206 9207/* End of pcre.c */ 9208