1/*************************************************
2*      Perl-Compatible Regular Expressions       *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8                       Written by Philip Hazel
9           Copyright (c) 1997-2010 University of Cambridge
10
11-----------------------------------------------------------------------------
12Redistribution and use in source and binary forms, with or without
13modification, are permitted provided that the following conditions are met:
14
15    * Redistributions of source code must retain the above copyright notice,
16      this list of conditions and the following disclaimer.
17
18    * Redistributions in binary form must reproduce the above copyright
19      notice, this list of conditions and the following disclaimer in the
20      documentation and/or other materials provided with the distribution.
21
22    * Neither the name of the University of Cambridge nor the names of its
23      contributors may be used to endorse or promote products derived from
24      this software without specific prior written permission.
25
26THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36POSSIBILITY OF SUCH DAMAGE.
37-----------------------------------------------------------------------------
38*/
39
40
41/* This module contains the external function pcre_compile(), along with
42supporting internal functions that are not used by other modules. */
43
44
45#ifdef HAVE_CONFIG_H
46#include "config.h"
47#endif
48
/* These three macros are defined before pcre_internal.h is included, so they
are already in effect for anything in that header that refers to them.
NOTE(review): how the header uses them is not visible in this file - confirm
against pcre_internal.h. */

#define NLBLOCK cd             /* Block containing newline information */
#define PSSTART start_pattern  /* Field containing processed string start */
#define PSEND   end_pattern    /* Field containing processed string end */
52
53#include "pcre_internal.h"
54
55
56/* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
57also used by pcretest. PCRE_DEBUG is not defined when building a production
58library. */
59
60#ifdef PCRE_DEBUG
61#include "pcre_printint.src"
62#endif
63
64
/* Macro for setting individual bits in class bitmaps: it sets bit number b in
the byte array a, using eight bits per byte with bit (b%8) of byte (b/8). Both
arguments and the whole expansion are parenthesized so that expression
arguments such as SETBIT(map, c + 1) index and shift correctly, and so that the
result can be used safely inside a larger expression. Note that b is evaluated
twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
68
/* Maximum length value to check against when making sure that the integer that
holds the compiled pattern length does not overflow. We make it a bit less than
INT_MAX to allow for adding in group terminating bytes, so that we don't have
to check them every time. The headroom is the literal 20 below. INT_MAX is the
<limits.h> constant; no direct include of that header is visible in this part
of the file, so it presumably arrives via pcre_internal.h - TODO confirm. */

#define OFLOW_MAX (INT_MAX - 20)
75
76
77/*************************************************
78*      Code parameters and static tables         *
79*************************************************/
80
81/* This value specifies the size of stack workspace that is used during the
82first pre-compile phase that determines how much memory is required. The regex
83is partly compiled into this space, but the compiled parts are discarded as
84soon as they can be, so that hopefully there will never be an overrun. The code
85does, however, check for an overrun. The largest amount I've seen used is 218,
86so this number is very generous.
87
88The same workspace is used during the second, actual compile phase for
89remembering forward references to groups so that they can be filled in at the
90end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
91is 4 there is plenty of room. */
92
#define COMPILE_WORK_SIZE (4096)

/* The overrun tests check for a slightly smaller size so that they detect the
overrun before it actually does run off the end of the data block. The margin
is 100, which gives a threshold of 3996 with the workspace size above. */

#define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)
99
100
101/* Table for handling escaped characters in the range '0'-'z'. Positive returns
102are simple data values; negative values are for special things like \d and so
103on. Zero means further processing is needed (for things like \x), or the escape
104is invalid. */
105
106#ifndef EBCDIC
107
108/* This is the "normal" table for ASCII systems or for EBCDIC systems running
109in UTF-8 mode. */
110
/* The table is indexed by (character - CHAR_0). Each row holds the entries for
two consecutive characters, which are named in the comment on the right. */

static const short int escapes[] = {
     0,                       0,                          /* 0 1 */
     0,                       0,                          /* 2 3 */
     0,                       0,                          /* 4 5 */
     0,                       0,                          /* 6 7 */
     0,                       0,                          /* 8 9 */
     CHAR_COLON,              CHAR_SEMICOLON,             /* : ; */
     CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,           /* < = */
     CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,         /* > ? */
     CHAR_COMMERCIAL_AT,      -ESC_A,                     /* @ A */
     -ESC_B,                  -ESC_C,                     /* B C */
     -ESC_D,                  -ESC_E,                     /* D E */
     0,                       -ESC_G,                     /* F G */
     -ESC_H,                  0,                          /* H I */
     0,                       -ESC_K,                     /* J K */
     0,                       0,                          /* L M */
     0,                       0,                          /* N O */
     -ESC_P,                  -ESC_Q,                     /* P Q */
     -ESC_R,                  -ESC_S,                     /* R S */
     0,                       0,                          /* T U */
     -ESC_V,                  -ESC_W,                     /* V W */
     -ESC_X,                  0,                          /* X Y */
     -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,   /* Z [ */
     CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,  /* backslash ] */
     CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,            /* ^ _ */
     CHAR_GRAVE_ACCENT,       7,                          /* ` a */
     -ESC_b,                  0,                          /* b c */
     -ESC_d,                  ESC_e,                      /* d e */
     ESC_f,                   0,                          /* f g */
     -ESC_h,                  0,                          /* h i */
     0,                       -ESC_k,                     /* j k */
     0,                       0,                          /* l m */
     ESC_n,                   0,                          /* n o */
     -ESC_p,                  0,                          /* p q */
     ESC_r,                   -ESC_s,                     /* r s */
     ESC_tee,                 0,                          /* t u */
     -ESC_v,                  -ESC_w,                     /* v w */
     0,                       0,                          /* x y */
     -ESC_z                                               /* z */
};
151
152#else
153
154/* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
155
/* The table is indexed by (character - 0x48), which is how check_escape()
looks it up in the EBCDIC build. Each row holds eight entries; the comment at
the start of a row gives the hexadecimal code of the row's first character. */

static const short int escapes[] = {
/*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
/*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
/*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
/*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
/*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
/*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
/*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
/*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
/*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
/*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
/*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
/*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
/*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
/*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
/*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
/*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
/*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
/*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
/*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
/*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
};
181#endif
182
183
184/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
185searched linearly. Put all the names into a single string, in order to reduce
186the number of relocations when a shared library is dynamically linked. The
187string is built from string macros so that it works in UTF-8 mode on EBCDIC
188platforms. */
189
190typedef struct verbitem {
191  int   len;
192  int   op;
193} verbitem;
194
195static const char verbnames[] =
196  STRING_ACCEPT0
197  STRING_COMMIT0
198  STRING_F0
199  STRING_FAIL0
200  STRING_PRUNE0
201  STRING_SKIP0
202  STRING_THEN;
203
204static const verbitem verbs[] = {
205  { 6, OP_ACCEPT },
206  { 6, OP_COMMIT },
207  { 1, OP_FAIL },
208  { 4, OP_FAIL },
209  { 5, OP_PRUNE },
210  { 4, OP_SKIP  },
211  { 4, OP_THEN  }
212};
213
214static const int verbcount = sizeof(verbs)/sizeof(verbitem);
215
216
217/* Tables of names of POSIX character classes and their lengths. The names are
218now all in a single string, to reduce the number of relocations when a shared
219library is dynamically loaded. The list of lengths is terminated by a zero
220length entry. The first three must be alpha, lower, upper, as this is assumed
221for handling case independence. */
222
223static const char posix_names[] =
224  STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
225  STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
226  STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
227  STRING_word0  STRING_xdigit;
228
229static const uschar posix_name_lengths[] = {
230  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
231
232/* Table of class bit maps for each POSIX class. Each class is formed from a
233base map, with an optional addition or removal of another map. Then, for some
234classes, there is some additional tweaking: for [:blank:] the vertical space
235characters are removed, and for [:alpha:] and [:alnum:] the underscore
236character is removed. The triples in the table consist of the base map offset,
237second map offset or -1 if no second map, and a non-negative value for map
238addition or a negative value for map subtraction (if there are two maps). The
239absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
240remove vertical space characters, 2 => remove underscore. */
241
/* Three consecutive ints per class, in the same order as the class names:
base map offset, second map offset (or -1 for none), and the tweak value whose
sign selects addition or subtraction of the second map. */

static const int posix_class_maps[] = {
  cbit_word,  cbit_digit, -2,             /* alpha - word minus digit, then remove underscore */
  cbit_lower, -1,          0,             /* lower */
  cbit_upper, -1,          0,             /* upper */
  cbit_word,  -1,          2,             /* alnum - word without underscore */
  cbit_print, cbit_cntrl,  0,             /* ascii - print plus cntrl */
  cbit_space, -1,          1,             /* blank - a GNU extension; space without vertical space */
  cbit_cntrl, -1,          0,             /* cntrl */
  cbit_digit, -1,          0,             /* digit */
  cbit_graph, -1,          0,             /* graph */
  cbit_print, -1,          0,             /* print */
  cbit_punct, -1,          0,             /* punct */
  cbit_space, -1,          0,             /* space */
  cbit_word,  -1,          0,             /* word - a Perl extension */
  cbit_xdigit,-1,          0              /* xdigit */
};
258
259
/* Two-level stringification. STRING(a) turns its argument into a string
literal exactly as written. XSTRING(s) passes its argument through one more
level of macro call, so s is macro-expanded first; XSTRING(MAX_NAME_SIZE)
therefore yields the value of MAX_NAME_SIZE as text rather than the text
"MAX_NAME_SIZE". */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)
262
263/* The texts of compile-time error messages. These are "char *" because they
264are passed to the outside world. Do not ever re-use any error number, because
265they are documented. Always add a new error instead. Messages marked DEAD below
266are no longer used. This used to be a table of strings, but in order to reduce
267the number of relocations needed when a shared library is loaded dynamically,
268it is now one long string. We cannot use a table of offsets, because the
269lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
270simply count through to the one we want - this isn't a performance issue
271because these strings are used only when there is a compilation error.
272
273Each substring ends with \0 to insert a null character. This includes the final
274substring, so that the whole string ends with \0\0, which can be detected when
275counting through. */
276
/* Message number n is the n-th zero-terminated substring, counting from 0. The
numbered comments mark the position of every fifth message. */

static const char error_texts[] =
  "no error\0"
  "\\ at end of pattern\0"
  "\\c at end of pattern\0"
  "unrecognized character follows \\\0"
  "numbers out of order in {} quantifier\0"
  /* 5 */
  "number too big in {} quantifier\0"
  "missing terminating ] for character class\0"
  "invalid escape sequence in character class\0"
  "range out of order in character class\0"
  "nothing to repeat\0"
  /* 10 */
  "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
  "internal error: unexpected repeat\0"
  "unrecognized character after (? or (?-\0"
  "POSIX named classes are supported only within a class\0"
  "missing )\0"
  /* 15 */
  "reference to non-existent subpattern\0"
  "erroffset passed as NULL\0"
  "unknown option bit(s) set\0"
  "missing ) after comment\0"
  "parentheses nested too deeply\0"  /** DEAD **/
  /* 20 */
  "regular expression is too large\0"
  "failed to get memory\0"
  "unmatched parentheses\0"
  "internal error: code overflow\0"
  "unrecognized character after (?<\0"
  /* 25 */
  "lookbehind assertion is not fixed length\0"
  "malformed number or name after (?(\0"
  "conditional group contains more than two branches\0"
  "assertion expected after (?(\0"
  "(?R or (?[+-]digits must be followed by )\0"
  /* 30 */
  "unknown POSIX class name\0"
  "POSIX collating elements are not supported\0"
  "this version of PCRE is not compiled with PCRE_UTF8 support\0"
  "spare error\0"  /** DEAD **/
  "character value in \\x{...} sequence is too large\0"
  /* 35 */
  "invalid condition (?(0)\0"
  "\\C not allowed in lookbehind assertion\0"
  "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
  "number after (?C is > 255\0"
  "closing ) for (?C expected\0"
  /* 40 */
  "recursive call could loop indefinitely\0"
  "unrecognized character after (?P\0"
  "syntax error in subpattern name (missing terminator)\0"
  "two named subpatterns have the same name\0"
  "invalid UTF-8 string\0"
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled\0"
  "malformed \\P or \\p sequence\0"
  "unknown property name after \\P or \\p\0"
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
  /* 50 */
  "repeated subpattern is too long\0"    /** DEAD **/
  "octal value is greater than \\377 (not in UTF-8 mode)\0"
  "internal error: overran compiling workspace\0"
  "internal error: previously-checked referenced subpattern not found\0"
  "DEFINE group contains more than one branch\0"
  /* 55 */
  "repeating a DEFINE group is not allowed\0"
  "inconsistent NEWLINE options\0"
  "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
  "a numbered reference must not be zero\0"
  "(*VERB) with an argument is not supported\0"
  /* 60 */
  "(*VERB) not recognized\0"
  "number is too big\0"
  "subpattern name expected\0"
  "digit expected after (?+\0"
  "] is an invalid data character in JavaScript compatibility mode\0"
  /* 65 */
  "different names for subpatterns of the same number are not allowed\0";
357
358/* Table to identify digits and hex digits. This is used when compiling
359patterns. Note that the tables in chartables are dependent on the locale, and
360may mark arbitrary characters as digits - but the PCRE compiling code expects
361to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
362a private table here. It costs 256 bytes, but it is a lot faster than doing
363character value tests (at least in some simple cases I timed), and in some
364applications one wants PCRE to compile efficiently as well as match
365efficiently.
366
367For convenience, we use the same bit definitions as in chartables:
368
369  0x04   decimal digit
370  0x08   hexadecimal digit
371
372Then we can use ctype_digit and ctype_xdigit in the code. */
373
374#ifndef EBCDIC
375
376/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
377UTF-8 mode. */
378
/* One entry per character code, 0 to 255. An entry of 0x0c (0x04 | 0x08) marks
a decimal digit, which is also a hexadecimal digit; 0x08 marks a character that
is only a hexadecimal digit (a-f, A-F); 0x00 marks everything else. */

static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
413
414#else
415
416/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
417
/* One entry per character code, 0 to 255, with the same values as the ASCII
table: 0x0c for a decimal digit, 0x08 for a character that is only a
hexadecimal digit, 0x00 otherwise. The last column of each row comment gives
the hexadecimal code of the start of every second row. */

static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
452
/* One entry per EBCDIC character code, 0 to 255. In this file check_escape()
masks an entry with 0x0E and treats a zero result as "not alphanumeric".
NOTE(review): the meaning of the individual bits is taken from the main
character tables, which are not visible here - confirm before relying on any
bit other than through that mask. */

static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
486#endif
487
488
/* Forward declaration (a prototype, not a definition) of compile_regex(). It
is declared here, ahead of its definition, so that functions that come earlier
in the file can call it; this is what allows the mutual recursion. The
parameters are unnamed in this prototype. */

static BOOL
  compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
    int *, int *, branch_chain *, compile_data *, int *);
494
495
496
497/*************************************************
498*            Find an error text                  *
499*************************************************/
500
501/* The error texts are now all in one long string, to save on relocations. As
502some of the text is of unknown length, we can't use a table of offsets.
503Instead, just count through the strings. This is not a performance issue
504because it happens only when there has been a compilation error.
505
506Argument:   the error number
507Returns:    pointer to the error string
508*/
509
510static const char *
511find_error_text(int n)
512{
513const char *s = error_texts;
514for (; n > 0; n--)
515  {
516  while (*s++ != 0) {};
517  if (*s == 0) return "Error text not found (please report)";
518  }
519return s;
520}
521
522
523/*************************************************
524*            Handle escapes                      *
525*************************************************/
526
527/* This function is called when a \ has been encountered. It either returns a
528positive value for a simple escape such as \n, or a negative value which
529encodes one of the more complicated things such as \d. A backreference to group
530n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
531UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
532ptr is pointing at the \. On exit, it is on the final character of the escape
533sequence.
534
535Arguments:
536  ptrptr         points to the pattern position pointer
537  errorcodeptr   points to the errorcode variable
538  bracount       number of previous extracting brackets
539  options        the options bits
540  isclass        TRUE if inside a character class
541
542Returns:         zero or positive => a data character
543                 negative => a special escape sequence
544                 on error, errorcodeptr is set
545*/
546
547static int
548check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
549  int options, BOOL isclass)
550{
551BOOL utf8 = (options & PCRE_UTF8) != 0;
552const uschar *ptr = *ptrptr + 1;
553int c, i;
554
555GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
556ptr--;                            /* Set pointer back to the last byte */
557
558/* If backslash is at the end of the pattern, it's an error. */
559
560if (c == 0) *errorcodeptr = ERR1;
561
562/* Non-alphanumerics are literals. For digits or letters, do an initial lookup
563in a table. A non-zero result is something that can be returned immediately.
564Otherwise further processing may be required. */
565
566#ifndef EBCDIC  /* ASCII/UTF-8 coding */
567else if (c < CHAR_0 || c > CHAR_z) {}                     /* Not alphanumeric */
568else if ((i = escapes[c - CHAR_0]) != 0) c = i;
569
570#else           /* EBCDIC coding */
571else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */
572else if ((i = escapes[c - 0x48]) != 0)  c = i;
573#endif
574
575/* Escapes that need further processing, or are illegal. */
576
577else
578  {
579  const uschar *oldptr;
580  BOOL braced, negated;
581
582  switch (c)
583    {
584    /* A number of Perl escapes are not handled by PCRE. We give an explicit
585    error. */
586
587    case CHAR_l:
588    case CHAR_L:
589    case CHAR_N:
590    case CHAR_u:
591    case CHAR_U:
592    *errorcodeptr = ERR37;
593    break;
594
595    /* \g must be followed by one of a number of specific things:
596
597    (1) A number, either plain or braced. If positive, it is an absolute
598    backreference. If negative, it is a relative backreference. This is a Perl
599    5.10 feature.
600
601    (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
602    is part of Perl's movement towards a unified syntax for back references. As
603    this is synonymous with \k{name}, we fudge it up by pretending it really
604    was \k.
605
606    (3) For Oniguruma compatibility we also support \g followed by a name or a
607    number either in angle brackets or in single quotes. However, these are
608    (possibly recursive) subroutine calls, _not_ backreferences. Just return
609    the -ESC_g code (cf \k). */
610
611    case CHAR_g:
612    if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
613      {
614      c = -ESC_g;
615      break;
616      }
617
618    /* Handle the Perl-compatible cases */
619
620    if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
621      {
622      const uschar *p;
623      for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
624        if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
625      if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
626        {
627        c = -ESC_k;
628        break;
629        }
630      braced = TRUE;
631      ptr++;
632      }
633    else braced = FALSE;
634
635    if (ptr[1] == CHAR_MINUS)
636      {
637      negated = TRUE;
638      ptr++;
639      }
640    else negated = FALSE;
641
642    c = 0;
643    while ((digitab[ptr[1]] & ctype_digit) != 0)
644      c = c * 10 + *(++ptr) - CHAR_0;
645
646    if (c < 0)   /* Integer overflow */
647      {
648      *errorcodeptr = ERR61;
649      break;
650      }
651
652    if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
653      {
654      *errorcodeptr = ERR57;
655      break;
656      }
657
658    if (c == 0)
659      {
660      *errorcodeptr = ERR58;
661      break;
662      }
663
664    if (negated)
665      {
666      if (c > bracount)
667        {
668        *errorcodeptr = ERR15;
669        break;
670        }
671      c = bracount - (c - 1);
672      }
673
674    c = -(ESC_REF + c);
675    break;
676
677    /* The handling of escape sequences consisting of a string of digits
678    starting with one that is not zero is not straightforward. By experiment,
679    the way Perl works seems to be as follows:
680
681    Outside a character class, the digits are read as a decimal number. If the
682    number is less than 10, or if there are that many previous extracting
683    left brackets, then it is a back reference. Otherwise, up to three octal
684    digits are read to form an escaped byte. Thus \123 is likely to be octal
685    123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
686    value is greater than 377, the least significant 8 bits are taken. Inside a
687    character class, \ followed by a digit is always an octal number. */
688
689    case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
690    case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
691
692    if (!isclass)
693      {
694      oldptr = ptr;
695      c -= CHAR_0;
696      while ((digitab[ptr[1]] & ctype_digit) != 0)
697        c = c * 10 + *(++ptr) - CHAR_0;
698      if (c < 0)    /* Integer overflow */
699        {
700        *errorcodeptr = ERR61;
701        break;
702        }
703      if (c < 10 || c <= bracount)
704        {
705        c = -(ESC_REF + c);
706        break;
707        }
708      ptr = oldptr;      /* Put the pointer back and fall through */
709      }
710
711    /* Handle an octal number following \. If the first digit is 8 or 9, Perl
712    generates a binary zero byte and treats the digit as a following literal.
713    Thus we have to pull back the pointer by one. */
714
715    if ((c = *ptr) >= CHAR_8)
716      {
717      ptr--;
718      c = 0;
719      break;
720      }
721
722    /* \0 always starts an octal number, but we may drop through to here with a
723    larger first octal digit. The original code used just to take the least
724    significant 8 bits of octal numbers (I think this is what early Perls used
725    to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
726    than 3 octal digits. */
727
728    case CHAR_0:
729    c -= CHAR_0;
730    while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
731        c = c * 8 + *(++ptr) - CHAR_0;
732    if (!utf8 && c > 255) *errorcodeptr = ERR51;
733    break;
734
735    /* \x is complicated. \x{ddd} is a character number which can be greater
736    than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
737    treated as a data character. */
738
739    case CHAR_x:
740    if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
741      {
742      const uschar *pt = ptr + 2;
743      int count = 0;
744
745      c = 0;
746      while ((digitab[*pt] & ctype_xdigit) != 0)
747        {
748        register int cc = *pt++;
749        if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
750        count++;
751
752#ifndef EBCDIC  /* ASCII/UTF-8 coding */
753        if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
754        c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
755#else           /* EBCDIC coding */
756        if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
757        c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
758#endif
759        }
760
761      if (*pt == CHAR_RIGHT_CURLY_BRACKET)
762        {
763        if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
764        ptr = pt;
765        break;
766        }
767
768      /* If the sequence of hex digits does not end with '}', then we don't
769      recognize this construct; fall through to the normal \x handling. */
770      }
771
772    /* Read just a single-byte hex-defined char */
773
774    c = 0;
775    while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
776      {
777      int cc;                                  /* Some compilers don't like */
778      cc = *(++ptr);                           /* ++ in initializers */
779#ifndef EBCDIC  /* ASCII/UTF-8 coding */
780      if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
781      c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
782#else           /* EBCDIC coding */
783      if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
784      c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
785#endif
786      }
787    break;
788
789    /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
790    This coding is ASCII-specific, but then the whole concept of \cx is
791    ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
792
793    case CHAR_c:
794    c = *(++ptr);
795    if (c == 0)
796      {
797      *errorcodeptr = ERR2;
798      break;
799      }
800
801#ifndef EBCDIC  /* ASCII/UTF-8 coding */
802    if (c >= CHAR_a && c <= CHAR_z) c -= 32;
803    c ^= 0x40;
804#else           /* EBCDIC coding */
805    if (c >= CHAR_a && c <= CHAR_z) c += 64;
806    c ^= 0xC0;
807#endif
808    break;
809
810    /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
811    other alphanumeric following \ is an error if PCRE_EXTRA was set;
812    otherwise, for Perl compatibility, it is a literal. This code looks a bit
813    odd, but there used to be some cases other than the default, and there may
814    be again in future, so I haven't "optimized" it. */
815
816    default:
817    if ((options & PCRE_EXTRA) != 0) switch(c)
818      {
819      default:
820      *errorcodeptr = ERR3;
821      break;
822      }
823    break;
824    }
825  }
826
827*ptrptr = ptr;
828return c;
829}
830
831
832
833#ifdef SUPPORT_UCP
834/*************************************************
835*               Handle \P and \p                 *
836*************************************************/
837
838/* This function is called after \P or \p has been encountered, provided that
839PCRE is compiled with support for Unicode properties. On entry, ptrptr is
840pointing at the P or p. On exit, it is pointing at the final character of the
841escape sequence.
842
843Argument:
844  ptrptr         points to the pattern position pointer
845  negptr         points to a boolean that is set TRUE for negation else FALSE
846  dptr           points to an int that is set to the detailed property value
847  errorcodeptr   points to the error code variable
848
849Returns:         type value from ucp_type_table, or -1 for an invalid type
850*/
851
852static int
853get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
854{
855int c, i, bot, top;
856const uschar *ptr = *ptrptr;
857char name[32];
858
859c = *(++ptr);
860if (c == 0) goto ERROR_RETURN;
861
862*negptr = FALSE;
863
864/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
865negation. */
866
867if (c == CHAR_LEFT_CURLY_BRACKET)
868  {
869  if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
870    {
871    *negptr = TRUE;
872    ptr++;
873    }
874  for (i = 0; i < (int)sizeof(name) - 1; i++)
875    {
876    c = *(++ptr);
877    if (c == 0) goto ERROR_RETURN;
878    if (c == CHAR_RIGHT_CURLY_BRACKET) break;
879    name[i] = c;
880    }
881  if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
882  name[i] = 0;
883  }
884
885/* Otherwise there is just one following character */
886
887else
888  {
889  name[0] = c;
890  name[1] = 0;
891  }
892
893*ptrptr = ptr;
894
895/* Search for a recognized property name using binary chop */
896
897bot = 0;
898top = _pcre_utt_size;
899
900while (bot < top)
901  {
902  i = (bot + top) >> 1;
903  c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
904  if (c == 0)
905    {
906    *dptr = _pcre_utt[i].value;
907    return _pcre_utt[i].type;
908    }
909  if (c > 0) bot = i + 1; else top = i;
910  }
911
912*errorcodeptr = ERR47;
913*ptrptr = ptr;
914return -1;
915
916ERROR_RETURN:
917*errorcodeptr = ERR46;
918*ptrptr = ptr;
919return -1;
920}
921#endif
922
923
924
925
926/*************************************************
927*            Check for counted repeat            *
928*************************************************/
929
930/* This function is called when a '{' is encountered in a place where it might
931start a quantifier. It looks ahead to see if it really is a quantifier or not.
932It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
933where the ddds are digits.
934
935Arguments:
936  p         pointer to the first char after '{'
937
938Returns:    TRUE or FALSE
939*/
940
941static BOOL
942is_counted_repeat(const uschar *p)
943{
944if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
945while ((digitab[*p] & ctype_digit) != 0) p++;
946if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
947
948if (*p++ != CHAR_COMMA) return FALSE;
949if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
950
951if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
952while ((digitab[*p] & ctype_digit) != 0) p++;
953
954return (*p == CHAR_RIGHT_CURLY_BRACKET);
955}
956
957
958
959/*************************************************
960*         Read repeat counts                     *
961*************************************************/
962
963/* Read an item of the form {n,m} and return the values. This is called only
964after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
965so the syntax is guaranteed to be correct, but we need to check the values.
966
967Arguments:
968  p              pointer to first char after '{'
969  minp           pointer to int for min
970  maxp           pointer to int for max
971                 returned as -1 if no max
972  errorcodeptr   points to error code variable
973
974Returns:         pointer to '}' on success;
975                 current ptr on error, with errorcodeptr set non-zero
976*/
977
978static const uschar *
979read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
980{
981int min = 0;
982int max = -1;
983
984/* Read the minimum value and do a paranoid check: a negative value indicates
985an integer overflow. */
986
987while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
988if (min < 0 || min > 65535)
989  {
990  *errorcodeptr = ERR5;
991  return p;
992  }
993
994/* Read the maximum value if there is one, and again do a paranoid on its size.
995Also, max must not be less than min. */
996
997if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
998  {
999  if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1000    {
1001    max = 0;
1002    while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
1003    if (max < 0 || max > 65535)
1004      {
1005      *errorcodeptr = ERR5;
1006      return p;
1007      }
1008    if (max < min)
1009      {
1010      *errorcodeptr = ERR4;
1011      return p;
1012      }
1013    }
1014  }
1015
1016/* Fill in the required variables, and pass back the pointer to the terminating
1017'}'. */
1018
1019*minp = min;
1020*maxp = max;
1021return p;
1022}
1023
1024
1025
1026/*************************************************
1027*  Subroutine for finding forward reference      *
1028*************************************************/
1029
1030/* This recursive function is called only from find_parens() below. The
1031top-level call starts at the beginning of the pattern. All other calls must
1032start at a parenthesis. It scans along a pattern's text looking for capturing
1033subpatterns, and counting them. If it finds a named pattern that matches the
1034name it is given, it returns its number. Alternatively, if the name is NULL, it
1035returns when it reaches a given numbered subpattern. We know that if (?P< is
1036encountered, the name will be terminated by '>' because that is checked in the
1037first pass. Recursion is used to keep track of subpatterns that reset the
1038capturing group numbers - the (?| feature.
1039
1040Arguments:
1041  ptrptr       address of the current character pointer (updated)
1042  cd           compile background data
1043  name         name to seek, or NULL if seeking a numbered subpattern
1044  lorn         name length, or subpattern number if name is NULL
1045  xmode        TRUE if we are in /x mode
1046  count        pointer to the current capturing subpattern number (updated)
1047
1048Returns:       the number of the named subpattern, or -1 if not found
1049*/
1050
1051static int
1052find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1053  BOOL xmode, int *count)
1054{
1055uschar *ptr = *ptrptr;
1056int start_count = *count;
1057int hwm_count = start_count;
1058BOOL dup_parens = FALSE;
1059
1060/* If the first character is a parenthesis, check on the type of group we are
1061dealing with. The very first call may not start with a parenthesis. */
1062
1063if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1064  {
1065  if (ptr[1] == CHAR_QUESTION_MARK &&
1066      ptr[2] == CHAR_VERTICAL_LINE)
1067    {
1068    ptr += 3;
1069    dup_parens = TRUE;
1070    }
1071
1072  /* Handle a normal, unnamed capturing parenthesis */
1073
1074  else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
1075    {
1076    *count += 1;
1077    if (name == NULL && *count == lorn) return *count;
1078    ptr++;
1079    }
1080
1081  /* Handle a condition. If it is an assertion, just carry on so that it
1082  is processed as normal. If not, skip to the closing parenthesis of the
1083  condition (there can't be any nested parens. */
1084
1085  else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1086    {
1087    ptr += 2;
1088    if (ptr[1] != CHAR_QUESTION_MARK)
1089      {
1090      while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1091      if (*ptr != 0) ptr++;
1092      }
1093    }
1094
1095  /* We have either (? or (* and not a condition */
1096
1097  else
1098    {
1099    ptr += 2;
1100    if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */
1101
1102    /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1103
1104    if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1105        ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1106      {
1107      int term;
1108      const uschar *thisname;
1109      *count += 1;
1110      if (name == NULL && *count == lorn) return *count;
1111      term = *ptr++;
1112      if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1113      thisname = ptr;
1114      while (*ptr != term) ptr++;
1115      if (name != NULL && lorn == ptr - thisname &&
1116          strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1117        return *count;
1118      term++;
1119      }
1120    }
1121  }
1122
1123/* Past any initial parenthesis handling, scan for parentheses or vertical
1124bars. */
1125
1126for (; *ptr != 0; ptr++)
1127  {
1128  /* Skip over backslashed characters and also entire \Q...\E */
1129
1130  if (*ptr == CHAR_BACKSLASH)
1131    {
1132    if (*(++ptr) == 0) goto FAIL_EXIT;
1133    if (*ptr == CHAR_Q) for (;;)
1134      {
1135      while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1136      if (*ptr == 0) goto FAIL_EXIT;
1137      if (*(++ptr) == CHAR_E) break;
1138      }
1139    continue;
1140    }
1141
1142  /* Skip over character classes; this logic must be similar to the way they
1143  are handled for real. If the first character is '^', skip it. Also, if the
1144  first few characters (either before or after ^) are \Q\E or \E we skip them
1145  too. This makes for compatibility with Perl. Note the use of STR macros to
1146  encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1147
1148  if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1149    {
1150    BOOL negate_class = FALSE;
1151    for (;;)
1152      {
1153      if (ptr[1] == CHAR_BACKSLASH)
1154        {
1155        if (ptr[2] == CHAR_E)
1156          ptr+= 2;
1157        else if (strncmp((const char *)ptr+2,
1158                 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1159          ptr += 4;
1160        else
1161          break;
1162        }
1163      else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1164        {
1165        negate_class = TRUE;
1166        ptr++;
1167        }
1168      else break;
1169      }
1170
1171    /* If the next character is ']', it is a data character that must be
1172    skipped, except in JavaScript compatibility mode. */
1173
1174    if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1175        (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1176      ptr++;
1177
1178    while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1179      {
1180      if (*ptr == 0) return -1;
1181      if (*ptr == CHAR_BACKSLASH)
1182        {
1183        if (*(++ptr) == 0) goto FAIL_EXIT;
1184        if (*ptr == CHAR_Q) for (;;)
1185          {
1186          while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1187          if (*ptr == 0) goto FAIL_EXIT;
1188          if (*(++ptr) == CHAR_E) break;
1189          }
1190        continue;
1191        }
1192      }
1193    continue;
1194    }
1195
1196  /* Skip comments in /x mode */
1197
1198  if (xmode && *ptr == CHAR_NUMBER_SIGN)
1199    {
1200    while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1201    if (*ptr == 0) goto FAIL_EXIT;
1202    continue;
1203    }
1204
1205  /* Check for the special metacharacters */
1206
1207  if (*ptr == CHAR_LEFT_PARENTHESIS)
1208    {
1209    int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
1210    if (rc > 0) return rc;
1211    if (*ptr == 0) goto FAIL_EXIT;
1212    }
1213
1214  else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1215    {
1216    if (dup_parens && *count < hwm_count) *count = hwm_count;
1217    *ptrptr = ptr;
1218    return -1;
1219    }
1220
1221  else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1222    {
1223    if (*count > hwm_count) hwm_count = *count;
1224    *count = start_count;
1225    }
1226  }
1227
1228FAIL_EXIT:
1229*ptrptr = ptr;
1230return -1;
1231}
1232
1233
1234
1235
1236/*************************************************
1237*       Find forward referenced subpattern       *
1238*************************************************/
1239
1240/* This function scans along a pattern's text looking for capturing
1241subpatterns, and counting them. If it finds a named pattern that matches the
1242name it is given, it returns its number. Alternatively, if the name is NULL, it
1243returns when it reaches a given numbered subpattern. This is used for forward
1244references to subpatterns. We used to be able to start this scan from the
1245current compiling point, using the current count value from cd->bracount, and
1246do it all in a single loop, but the addition of the possibility of duplicate
1247subpattern numbers means that we have to scan from the very start, in order to
1248take account of such duplicates, and to use a recursive function to keep track
1249of the different types of group.
1250
1251Arguments:
1252  cd           compile background data
1253  name         name to seek, or NULL if seeking a numbered subpattern
1254  lorn         name length, or subpattern number if name is NULL
1255  xmode        TRUE if we are in /x mode
1256
1257Returns:       the number of the found subpattern, or -1 if not found
1258*/
1259
1260static int
1261find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
1262{
1263uschar *ptr = (uschar *)cd->start_pattern;
1264int count = 0;
1265int rc;
1266
1267/* If the pattern does not start with an opening parenthesis, the first call
1268to find_parens_sub() will scan right to the end (if necessary). However, if it
1269does start with a parenthesis, find_parens_sub() will return when it hits the
1270matching closing parens. That is why we have to have a loop. */
1271
1272for (;;)
1273  {
1274  rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
1275  if (rc > 0 || *ptr++ == 0) break;
1276  }
1277
1278return rc;
1279}
1280
1281
1282
1283
1284/*************************************************
1285*      Find first significant op code            *
1286*************************************************/
1287
1288/* This is called by several functions that scan a compiled expression looking
1289for a fixed first character, or an anchoring op code etc. It skips over things
1290that do not influence this. For some calls, a change of option is important.
1291For some calls, it makes sense to skip negative forward and all backward
1292assertions, and also the \b assertion; for others it does not.
1293
1294Arguments:
1295  code         pointer to the start of the group
1296  options      pointer to external options
1297  optbit       the option bit whose changing is significant, or
1298                 zero if none are
1299  skipassert   TRUE if certain assertions are to be skipped
1300
1301Returns:       pointer to the first significant opcode
1302*/
1303
1304static const uschar*
1305first_significant_code(const uschar *code, int *options, int optbit,
1306  BOOL skipassert)
1307{
1308for (;;)
1309  {
1310  switch ((int)*code)
1311    {
1312    case OP_OPT:
1313    if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1314      *options = (int)code[1];
1315    code += 2;
1316    break;
1317
1318    case OP_ASSERT_NOT:
1319    case OP_ASSERTBACK:
1320    case OP_ASSERTBACK_NOT:
1321    if (!skipassert) return code;
1322    do code += GET(code, 1); while (*code == OP_ALT);
1323    code += _pcre_OP_lengths[*code];
1324    break;
1325
1326    case OP_WORD_BOUNDARY:
1327    case OP_NOT_WORD_BOUNDARY:
1328    if (!skipassert) return code;
1329    /* Fall through */
1330
1331    case OP_CALLOUT:
1332    case OP_CREF:
1333    case OP_NCREF:
1334    case OP_RREF:
1335    case OP_NRREF:
1336    case OP_DEF:
1337    code += _pcre_OP_lengths[*code];
1338    break;
1339
1340    default:
1341    return code;
1342    }
1343  }
1344/* Control never reaches here */
1345}
1346
1347
1348
1349
1350/*************************************************
1351*        Find the fixed length of a branch       *
1352*************************************************/
1353
1354/* Scan a branch and compute the fixed length of subject that will match it,
1355if the length is fixed. This is needed for dealing with backward assertions.
1356In UTF8 mode, the result is in characters rather than bytes. The branch is
1357temporarily terminated with OP_END when this function is called.
1358
1359This function is called when a backward assertion is encountered, so that if it
1360fails, the error message can point to the correct place in the pattern.
1361However, we cannot do this when the assertion contains subroutine calls,
1362because they can be forward references. We solve this by remembering this case
1363and doing the check at the end; a flag specifies which mode we are running in.
1364
1365Arguments:
1366  code     points to the start of the pattern (the bracket)
1367  options  the compiling options
1368  atend    TRUE if called when the pattern is complete
1369  cd       the "compile data" structure
1370
1371Returns:   the fixed length,
1372             or -1 if there is no fixed length,
1373             or -2 if \C was encountered
1374             or -3 if an OP_RECURSE item was encountered and atend is FALSE
1375*/
1376
static int
find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
{
/* Length established by the branches completed so far; -1 until the first
branch of this group has been measured. */

int length = -1;

/* Running length of the branch being scanned, and the scan pointer, which
starts just after the group's opcode and link field. */

register int branchlength = 0;
register uschar *cc = code + 1 + LINK_SIZE;

/* Scan along the opcodes for this branch. If we get to the end of the
branch, check the length against that of the other branches. */

for (;;)
  {
  int d;
  uschar *ce, *cs;
  register int op = *cc;
  switch (op)
    {
    /* Nested groups: measure the group by a recursive call, give up if it has
    no fixed length, then step over all its alternatives and its closing item.
    For OP_CBRA the pointer passed down is advanced by 2 so that the nested
    call's "code + 1 + LINK_SIZE" starts after the two-byte group number that
    follows the link field. */

    case OP_CBRA:
    case OP_BRA:
    case OP_ONCE:
    case OP_COND:
    d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
    if (d < 0) return d;
    branchlength += d;
    do cc += GET(cc, 1); while (*cc == OP_ALT);
    cc += 1 + LINK_SIZE;
    break;

    /* Reached end of a branch; if it's a ket it is the end of a nested
    call. If it's ALT it is an alternation in a nested call. If it is
    END it's the end of the outer call. All can be handled by the same code.
    The first branch sets the reference length; every later branch must have
    exactly the same length. */

    case OP_ALT:
    case OP_KET:
    case OP_KETRMAX:
    case OP_KETRMIN:
    case OP_END:
    if (length < 0) length = branchlength;
      else if (length != branchlength) return -1;
    if (*cc != OP_ALT) return length;
    cc += 1 + LINK_SIZE;
    branchlength = 0;
    break;

    /* A true recursion implies not fixed length, but a subroutine call may
    be OK. If the subroutine is a forward reference, we can't deal with
    it until the end of the pattern, so return -3. The call is a true
    recursion when the OP_RECURSE item lies between the start and the end of
    the group it refers to. NOTE(review): the "cs + 2" passed to the nested
    call presumably assumes that the target is always a capturing bracket (see
    the OP_CBRA case above) - confirm against the code that emits
    OP_RECURSE. */

    case OP_RECURSE:
    if (!atend) return -3;
    cs = ce = (uschar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
    do ce += GET(ce, 1); while (*ce == OP_ALT);       /* End subpattern */
    if (cc > cs && cc < ce) return -1;                /* Recursion */
    d = find_fixedlength(cs + 2, options, atend, cd);
    if (d < 0) return d;
    branchlength += d;
    cc += 1 + LINK_SIZE;
    break;

    /* Skip over assertive subpatterns. After the loop cc is at the closing
    item of the assertion, which is then stepped over by the table-driven code
    below; assertions add nothing to the branch length. */

    case OP_ASSERT:
    case OP_ASSERT_NOT:
    case OP_ASSERTBACK:
    case OP_ASSERTBACK_NOT:
    do cc += GET(cc, 1); while (*cc == OP_ALT);
    /* Fall through */

    /* Skip over things that don't match chars */

    case OP_REVERSE:
    case OP_CREF:
    case OP_NCREF:
    case OP_RREF:
    case OP_NRREF:
    case OP_DEF:
    case OP_OPT:
    case OP_CALLOUT:
    case OP_SOD:
    case OP_SOM:
    case OP_SET_SOM:
    case OP_EOD:
    case OP_EODN:
    case OP_CIRC:
    case OP_DOLL:
    case OP_NOT_WORD_BOUNDARY:
    case OP_WORD_BOUNDARY:
    cc += _pcre_OP_lengths[*cc];
    break;

    /* Handle literal characters. Each counts as one character. The opcode and
    the first byte of the character are stepped over; in UTF-8 mode a lead
    byte of 0xc0 or above means additional bytes follow, and the number to
    skip is taken from _pcre_utf8_table4. */

    case OP_CHAR:
    case OP_CHARNC:
    case OP_NOT:
    branchlength++;
    cc += 2;
#ifdef SUPPORT_UTF8
    if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
      cc += _pcre_utf8_table4[cc[-1] & 0x3f];
#endif
    break;

    /* Handle exact repetitions. The count is already in characters, but we
    need to skip over a multibyte character in UTF8 mode. The 4 bytes stepped
    over are the opcode, the two-byte count read by GET2, and the first byte
    of the character. */

    case OP_EXACT:
    branchlength += GET2(cc,1);
    cc += 4;
#ifdef SUPPORT_UTF8
    if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
      cc += _pcre_utf8_table4[cc[-1] & 0x3f];
#endif
    break;

    /* Exact repetition of a character type. The type opcode is at offset 3,
    after the opcode and two-byte count; \p and \P carry two extra bytes of
    parameters that must also be skipped. */

    case OP_TYPEEXACT:
    branchlength += GET2(cc,1);
    if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
    cc += 4;
    break;

    /* Handle single-char matchers. \p and \P have two extra bytes, which are
    skipped here before falling into the common one-byte case. */

    case OP_PROP:
    case OP_NOTPROP:
    cc += 2;
    /* Fall through */

    case OP_NOT_DIGIT:
    case OP_DIGIT:
    case OP_NOT_WHITESPACE:
    case OP_WHITESPACE:
    case OP_NOT_WORDCHAR:
    case OP_WORDCHAR:
    case OP_ANY:
    case OP_ALLANY:
    branchlength++;
    cc++;
    break;

    /* The single-byte matcher isn't allowed */

    case OP_ANYBYTE:
    return -2;

    /* Check a class for variable quantification. An extended class stores its
    own total length after the opcode; subtracting 33 here and falling through
    to the common "cc += 33" leaves cc just past the whole XCLASS item. */

#ifdef SUPPORT_UTF8
    case OP_XCLASS:
    cc += GET(cc, 1) - 33;
    /* Fall through */
#endif

    /* NOTE(review): 33 is presumably the opcode byte plus a 32-byte bit map -
    confirm against the class opcode layout. */

    case OP_CLASS:
    case OP_NCLASS:
    cc += 33;

    /* Look at what follows the class to see whether it is repeated */

    switch (*cc)
      {
      case OP_CRSTAR:
      case OP_CRMINSTAR:
      case OP_CRQUERY:
      case OP_CRMINQUERY:
      return -1;

      /* A counted repeat has a fixed length only if min and max are equal */

      case OP_CRRANGE:
      case OP_CRMINRANGE:
      if (GET2(cc,1) != GET2(cc,3)) return -1;
      branchlength += GET2(cc,1);
      cc += 5;
      break;

      /* Anything else: count the class as one character and leave cc where it
      is, so that the outer loop examines this opcode next. Any repeat opcode
      not listed above is not handled by the outer switch either, so it ends
      up in the outer default and yields -1. */

      default:
      branchlength++;
      }
    break;

    /* Anything else is variable length */

    default:
    return -1;
    }
  }
/* Control never gets here */
}
1563
1564
1565
1566
1567/*************************************************
1568*    Scan compiled regex for specific bracket    *
1569*************************************************/
1570
1571/* This little function scans through a compiled pattern until it finds a
1572capturing bracket with the given number, or, if the number is negative, an
1573instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1574so that it can be called from pcre_study() when finding the minimum matching
1575length.
1576
1577Arguments:
1578  code        points to start of expression
1579  utf8        TRUE in UTF-8 mode
1580  number      the required bracket number or negative to find a lookbehind
1581
1582Returns:      pointer to the opcode for the bracket, or NULL if not found
1583*/
1584
1585const uschar *
1586_pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1587{
1588for (;;)
1589  {
1590  register int c = *code;
1591  if (c == OP_END) return NULL;
1592
1593  /* XCLASS is used for classes that cannot be represented just by a bit
1594  map. This includes negated single high-valued characters. The length in
1595  the table is zero; the actual length is stored in the compiled code. */
1596
1597  if (c == OP_XCLASS) code += GET(code, 1);
1598
1599  /* Handle recursion */
1600
1601  else if (c == OP_REVERSE)
1602    {
1603    if (number < 0) return (uschar *)code;
1604    code += _pcre_OP_lengths[c];
1605    }
1606
1607  /* Handle capturing bracket */
1608
1609  else if (c == OP_CBRA)
1610    {
1611    int n = GET2(code, 1+LINK_SIZE);
1612    if (n == number) return (uschar *)code;
1613    code += _pcre_OP_lengths[c];
1614    }
1615
1616  /* Otherwise, we can get the item's length from the table, except that for
1617  repeated character types, we have to test for \p and \P, which have an extra
1618  two bytes of parameters. */
1619
1620  else
1621    {
1622    switch(c)
1623      {
1624      case OP_TYPESTAR:
1625      case OP_TYPEMINSTAR:
1626      case OP_TYPEPLUS:
1627      case OP_TYPEMINPLUS:
1628      case OP_TYPEQUERY:
1629      case OP_TYPEMINQUERY:
1630      case OP_TYPEPOSSTAR:
1631      case OP_TYPEPOSPLUS:
1632      case OP_TYPEPOSQUERY:
1633      if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1634      break;
1635
1636      case OP_TYPEUPTO:
1637      case OP_TYPEMINUPTO:
1638      case OP_TYPEEXACT:
1639      case OP_TYPEPOSUPTO:
1640      if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1641      break;
1642      }
1643
1644    /* Add in the fixed length from the table */
1645
1646    code += _pcre_OP_lengths[c];
1647
1648  /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1649  a multi-byte character. The length in the table is a minimum, so we have to
1650  arrange to skip the extra bytes. */
1651
1652#ifdef SUPPORT_UTF8
1653    if (utf8) switch(c)
1654      {
1655      case OP_CHAR:
1656      case OP_CHARNC:
1657      case OP_EXACT:
1658      case OP_UPTO:
1659      case OP_MINUPTO:
1660      case OP_POSUPTO:
1661      case OP_STAR:
1662      case OP_MINSTAR:
1663      case OP_POSSTAR:
1664      case OP_PLUS:
1665      case OP_MINPLUS:
1666      case OP_POSPLUS:
1667      case OP_QUERY:
1668      case OP_MINQUERY:
1669      case OP_POSQUERY:
1670      if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1671      break;
1672      }
1673#else
1674    (void)(utf8);  /* Keep compiler happy by referencing function argument */
1675#endif
1676    }
1677  }
1678}
1679
1680
1681
1682/*************************************************
1683*   Scan compiled regex for recursion reference  *
1684*************************************************/
1685
1686/* This little function scans through a compiled pattern until it finds an
1687instance of OP_RECURSE.
1688
1689Arguments:
1690  code        points to start of expression
1691  utf8        TRUE in UTF-8 mode
1692
1693Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
1694*/
1695
1696static const uschar *
1697find_recurse(const uschar *code, BOOL utf8)
1698{
1699for (;;)
1700  {
1701  register int c = *code;
1702  if (c == OP_END) return NULL;
1703  if (c == OP_RECURSE) return code;
1704
1705  /* XCLASS is used for classes that cannot be represented just by a bit
1706  map. This includes negated single high-valued characters. The length in
1707  the table is zero; the actual length is stored in the compiled code. */
1708
1709  if (c == OP_XCLASS) code += GET(code, 1);
1710
1711  /* Otherwise, we can get the item's length from the table, except that for
1712  repeated character types, we have to test for \p and \P, which have an extra
1713  two bytes of parameters. */
1714
1715  else
1716    {
1717    switch(c)
1718      {
1719      case OP_TYPESTAR:
1720      case OP_TYPEMINSTAR:
1721      case OP_TYPEPLUS:
1722      case OP_TYPEMINPLUS:
1723      case OP_TYPEQUERY:
1724      case OP_TYPEMINQUERY:
1725      case OP_TYPEPOSSTAR:
1726      case OP_TYPEPOSPLUS:
1727      case OP_TYPEPOSQUERY:
1728      if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1729      break;
1730
1731      case OP_TYPEPOSUPTO:
1732      case OP_TYPEUPTO:
1733      case OP_TYPEMINUPTO:
1734      case OP_TYPEEXACT:
1735      if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1736      break;
1737      }
1738
1739    /* Add in the fixed length from the table */
1740
1741    code += _pcre_OP_lengths[c];
1742
1743    /* In UTF-8 mode, opcodes that are followed by a character may be followed
1744    by a multi-byte character. The length in the table is a minimum, so we have
1745    to arrange to skip the extra bytes. */
1746
1747#ifdef SUPPORT_UTF8
1748    if (utf8) switch(c)
1749      {
1750      case OP_CHAR:
1751      case OP_CHARNC:
1752      case OP_EXACT:
1753      case OP_UPTO:
1754      case OP_MINUPTO:
1755      case OP_POSUPTO:
1756      case OP_STAR:
1757      case OP_MINSTAR:
1758      case OP_POSSTAR:
1759      case OP_PLUS:
1760      case OP_MINPLUS:
1761      case OP_POSPLUS:
1762      case OP_QUERY:
1763      case OP_MINQUERY:
1764      case OP_POSQUERY:
1765      if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1766      break;
1767      }
1768#else
1769    (void)(utf8);  /* Keep compiler happy by referencing function argument */
1770#endif
1771    }
1772  }
1773}
1774
1775
1776
1777/*************************************************
1778*    Scan compiled branch for non-emptiness      *
1779*************************************************/
1780
1781/* This function scans through a branch of a compiled pattern to see whether it
1782can match the empty string or not. It is called from could_be_empty()
1783below and from compile_branch() when checking for an unlimited repeat of a
1784group that can match nothing. Note that first_significant_code() skips over
1785backward and negative forward assertions when its final argument is TRUE. If we
1786hit an unclosed bracket, we return "empty" - this means we've struck an inner
1787bracket whose current branch will already have been scanned.
1788
1789Arguments:
1790  code        points to start of search
1791  endcode     points to where to stop
1792  utf8        TRUE if in UTF8 mode
1793  cd          contains pointers to tables etc.
1794
1795Returns:      TRUE if what is matched could be empty
1796*/
1797
1798static BOOL
1799could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,
1800  compile_data *cd)
1801{
1802register int c;
1803for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1804     code < endcode;
1805     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1806  {
1807  const uschar *ccode;
1808
1809  c = *code;
1810
1811  /* Skip over forward assertions; the other assertions are skipped by
1812  first_significant_code() with a TRUE final argument. */
1813
1814  if (c == OP_ASSERT)
1815    {
1816    do code += GET(code, 1); while (*code == OP_ALT);
1817    c = *code;
1818    continue;
1819    }
1820
1821  /* Groups with zero repeats can of course be empty; skip them. */
1822
1823  if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1824    {
1825    code += _pcre_OP_lengths[c];
1826    do code += GET(code, 1); while (*code == OP_ALT);
1827    c = *code;
1828    continue;
1829    }
1830
1831  /* For a recursion/subroutine call, if its end has been reached, which
1832  implies a subroutine call, we can scan it. */
1833
1834  if (c == OP_RECURSE)
1835    {
1836    BOOL empty_branch = FALSE;
1837    const uschar *scode = cd->start_code + GET(code, 1);
1838    if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
1839    do
1840      {
1841      if (could_be_empty_branch(scode, endcode, utf8, cd))
1842        {
1843        empty_branch = TRUE;
1844        break;
1845        }
1846      scode += GET(scode, 1);
1847      }
1848    while (*scode == OP_ALT);
1849    if (!empty_branch) return FALSE;  /* All branches are non-empty */
1850    continue;
1851    }
1852
1853  /* For other groups, scan the branches. */
1854
1855  if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1856    {
1857    BOOL empty_branch;
1858    if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
1859
1860    /* If a conditional group has only one branch, there is a second, implied,
1861    empty branch, so just skip over the conditional, because it could be empty.
1862    Otherwise, scan the individual branches of the group. */
1863
1864    if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
1865      code += GET(code, 1);
1866    else
1867      {
1868      empty_branch = FALSE;
1869      do
1870        {
1871        if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
1872          empty_branch = TRUE;
1873        code += GET(code, 1);
1874        }
1875      while (*code == OP_ALT);
1876      if (!empty_branch) return FALSE;   /* All branches are non-empty */
1877      }
1878
1879    c = *code;
1880    continue;
1881    }
1882
1883  /* Handle the other opcodes */
1884
1885  switch (c)
1886    {
1887    /* Check for quantifiers after a class. XCLASS is used for classes that
1888    cannot be represented just by a bit map. This includes negated single
1889    high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1890    actual length is stored in the compiled code, so we must update "code"
1891    here. */
1892
1893#ifdef SUPPORT_UTF8
1894    case OP_XCLASS:
1895    ccode = code += GET(code, 1);
1896    goto CHECK_CLASS_REPEAT;
1897#endif
1898
1899    case OP_CLASS:
1900    case OP_NCLASS:
1901    ccode = code + 33;
1902
1903#ifdef SUPPORT_UTF8
1904    CHECK_CLASS_REPEAT:
1905#endif
1906
1907    switch (*ccode)
1908      {
1909      case OP_CRSTAR:            /* These could be empty; continue */
1910      case OP_CRMINSTAR:
1911      case OP_CRQUERY:
1912      case OP_CRMINQUERY:
1913      break;
1914
1915      default:                   /* Non-repeat => class must match */
1916      case OP_CRPLUS:            /* These repeats aren't empty */
1917      case OP_CRMINPLUS:
1918      return FALSE;
1919
1920      case OP_CRRANGE:
1921      case OP_CRMINRANGE:
1922      if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
1923      break;
1924      }
1925    break;
1926
1927    /* Opcodes that must match a character */
1928
1929    case OP_PROP:
1930    case OP_NOTPROP:
1931    case OP_EXTUNI:
1932    case OP_NOT_DIGIT:
1933    case OP_DIGIT:
1934    case OP_NOT_WHITESPACE:
1935    case OP_WHITESPACE:
1936    case OP_NOT_WORDCHAR:
1937    case OP_WORDCHAR:
1938    case OP_ANY:
1939    case OP_ALLANY:
1940    case OP_ANYBYTE:
1941    case OP_CHAR:
1942    case OP_CHARNC:
1943    case OP_NOT:
1944    case OP_PLUS:
1945    case OP_MINPLUS:
1946    case OP_POSPLUS:
1947    case OP_EXACT:
1948    case OP_NOTPLUS:
1949    case OP_NOTMINPLUS:
1950    case OP_NOTPOSPLUS:
1951    case OP_NOTEXACT:
1952    case OP_TYPEPLUS:
1953    case OP_TYPEMINPLUS:
1954    case OP_TYPEPOSPLUS:
1955    case OP_TYPEEXACT:
1956    return FALSE;
1957
1958    /* These are going to continue, as they may be empty, but we have to
1959    fudge the length for the \p and \P cases. */
1960
1961    case OP_TYPESTAR:
1962    case OP_TYPEMINSTAR:
1963    case OP_TYPEPOSSTAR:
1964    case OP_TYPEQUERY:
1965    case OP_TYPEMINQUERY:
1966    case OP_TYPEPOSQUERY:
1967    if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1968    break;
1969
1970    /* Same for these */
1971
1972    case OP_TYPEUPTO:
1973    case OP_TYPEMINUPTO:
1974    case OP_TYPEPOSUPTO:
1975    if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1976    break;
1977
1978    /* End of branch */
1979
1980    case OP_KET:
1981    case OP_KETRMAX:
1982    case OP_KETRMIN:
1983    case OP_ALT:
1984    return TRUE;
1985
1986    /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1987    MINUPTO, and POSUPTO may be followed by a multibyte character */
1988
1989#ifdef SUPPORT_UTF8
1990    case OP_STAR:
1991    case OP_MINSTAR:
1992    case OP_POSSTAR:
1993    case OP_QUERY:
1994    case OP_MINQUERY:
1995    case OP_POSQUERY:
1996    if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
1997    break;
1998
1999    case OP_UPTO:
2000    case OP_MINUPTO:
2001    case OP_POSUPTO:
2002    if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
2003    break;
2004#endif
2005
2006    /* None of the remaining opcodes are required to match a character. */
2007
2008    default:
2009    break;
2010    }
2011  }
2012
2013return TRUE;
2014}
2015
2016
2017
2018/*************************************************
2019*    Scan compiled regex for non-emptiness       *
2020*************************************************/
2021
2022/* This function is called to check for left recursive calls. We want to check
2023the current branch of the current pattern to see if it could match the empty
2024string. If it could, we must look outwards for branches at other levels,
2025stopping when we pass beyond the bracket which is the subject of the recursion.
2026
2027Arguments:
2028  code        points to start of the recursion
2029  endcode     points to where to stop (current RECURSE item)
2030  bcptr       points to the chain of current (unclosed) branch starts
2031  utf8        TRUE if in UTF-8 mode
2032  cd          pointers to tables etc
2033
2034Returns:      TRUE if what is matched could be empty
2035*/
2036
2037static BOOL
2038could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
2039  BOOL utf8, compile_data *cd)
2040{
2041while (bcptr != NULL && bcptr->current_branch >= code)
2042  {
2043  if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
2044    return FALSE;
2045  bcptr = bcptr->outer;
2046  }
2047return TRUE;
2048}
2049
2050
2051
2052/*************************************************
2053*           Check for POSIX class syntax         *
2054*************************************************/
2055
2056/* This function is called when the sequence "[:" or "[." or "[=" is
2057encountered in a character class. It checks whether this is followed by a
2058sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2059reach an unescaped ']' without the special preceding character, return FALSE.
2060
2061Originally, this function only recognized a sequence of letters between the
2062terminators, but it seems that Perl recognizes any sequence of characters,
2063though of course unknown POSIX names are subsequently rejected. Perl gives an
2064"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2065didn't consider this to be a POSIX class. Likewise for [:1234:].
2066
2067The problem in trying to be exactly like Perl is in the handling of escapes. We
2068have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2069class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2070below handles the special case of \], but does not try to do any other escape
2071processing. This makes it different from Perl for cases such as [:l\ower:]
where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
"l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
I think.
2075
2076Arguments:
2077  ptr      pointer to the initial [
2078  endptr   where to return the end pointer
2079
2080Returns:   TRUE or FALSE
2081*/
2082
2083static BOOL
2084check_posix_syntax(const uschar *ptr, const uschar **endptr)
2085{
2086int terminator;          /* Don't combine these lines; the Solaris cc */
2087terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
2088for (++ptr; *ptr != 0; ptr++)
2089  {
2090  if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
2091    {
2092    if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2093    if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2094      {
2095      *endptr = ptr;
2096      return TRUE;
2097      }
2098    }
2099  }
2100return FALSE;
2101}
2102
2103
2104
2105
2106/*************************************************
2107*          Check POSIX class name                *
2108*************************************************/
2109
2110/* This function is called to check the name given in a POSIX-style class entry
2111such as [:alnum:].
2112
2113Arguments:
2114  ptr        points to the first letter
2115  len        the length of the name
2116
2117Returns:     a value representing the name, or -1 if unknown
2118*/
2119
2120static int
2121check_posix_name(const uschar *ptr, int len)
2122{
2123const char *pn = posix_names;
2124register int yield = 0;
2125while (posix_name_lengths[yield] != 0)
2126  {
2127  if (len == posix_name_lengths[yield] &&
2128    strncmp((const char *)ptr, pn, len) == 0) return yield;
2129  pn += posix_name_lengths[yield] + 1;
2130  yield++;
2131  }
2132return -1;
2133}
2134
2135
2136/*************************************************
2137*    Adjust OP_RECURSE items in repeated group   *
2138*************************************************/
2139
2140/* OP_RECURSE items contain an offset from the start of the regex to the group
2141that is referenced. This means that groups can be replicated for fixed
2142repetition simply by copying (because the recursion is allowed to refer to
2143earlier groups that are outside the current group). However, when a group is
2144optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2145inserted before it, after it has been compiled. This means that any OP_RECURSE
2146items within it that refer to the group itself or any contained groups have to
have their offsets adjusted. That is one of the jobs of this function. Before
it is called, the partially compiled regex must be temporarily terminated with
OP_END.
2150
2151This function has been extended with the possibility of forward references for
2152recursions and subroutine calls. It must also check the list of such references
2153for the group we are dealing with. If it finds that one of the recursions in
2154the current group is on this list, it adjusts the offset in the list, not the
2155value in the reference (which is a group number).
2156
2157Arguments:
2158  group      points to the start of the group
2159  adjust     the amount by which the group is to be moved
2160  utf8       TRUE in UTF-8 mode
2161  cd         contains pointers to tables etc.
2162  save_hwm   the hwm forward reference pointer at the start of the group
2163
2164Returns:     nothing
2165*/
2166
2167static void
2168adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2169  uschar *save_hwm)
2170{
2171uschar *ptr = group;
2172
2173while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2174  {
2175  int offset;
2176  uschar *hc;
2177
2178  /* See if this recursion is on the forward reference list. If so, adjust the
2179  reference. */
2180
2181  for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2182    {
2183    offset = GET(hc, 0);
2184    if (cd->start_code + offset == ptr + 1)
2185      {
2186      PUT(hc, 0, offset + adjust);
2187      break;
2188      }
2189    }
2190
2191  /* Otherwise, adjust the recursion offset if it's after the start of this
2192  group. */
2193
2194  if (hc >= cd->hwm)
2195    {
2196    offset = GET(ptr, 1);
2197    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2198    }
2199
2200  ptr += 1 + LINK_SIZE;
2201  }
2202}
2203
2204
2205
2206/*************************************************
2207*        Insert an automatic callout point       *
2208*************************************************/
2209
2210/* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2211callout points before each pattern item.
2212
2213Arguments:
2214  code           current code pointer
2215  ptr            current pattern pointer
2216  cd             pointers to tables etc
2217
2218Returns:         new code pointer
2219*/
2220
2221static uschar *
2222auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2223{
2224*code++ = OP_CALLOUT;
2225*code++ = 255;
2226PUT(code, 0, ptr - cd->start_pattern);  /* Pattern offset */
2227PUT(code, LINK_SIZE, 0);                /* Default length */
2228return code + 2*LINK_SIZE;
2229}
2230
2231
2232
2233/*************************************************
2234*         Complete a callout item                *
2235*************************************************/
2236
2237/* A callout item contains the length of the next item in the pattern, which
2238we can't fill in till after we have reached the relevant point. This is used
2239for both automatic and manual callouts.
2240
2241Arguments:
2242  previous_callout   points to previous callout item
2243  ptr                current pattern pointer
2244  cd                 pointers to tables etc
2245
2246Returns:             nothing
2247*/
2248
2249static void
2250complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2251{
2252int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2253PUT(previous_callout, 2 + LINK_SIZE, length);
2254}
2255
2256
2257
2258#ifdef SUPPORT_UCP
2259/*************************************************
2260*           Get othercase range                  *
2261*************************************************/
2262
2263/* This function is passed the start and end of a class range, in UTF-8 mode
2264with UCP support. It searches up the characters, looking for internal ranges of
2265characters in the "other" case. Each call returns the next one, updating the
2266start address.
2267
2268Arguments:
2269  cptr        points to starting character value; updated
2270  d           end value
2271  ocptr       where to put start of othercase range
2272  odptr       where to put end of othercase range
2273
2274Yield:        TRUE when range returned; FALSE when no more
2275*/
2276
2277static BOOL
2278get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2279  unsigned int *odptr)
2280{
2281unsigned int c, othercase, next;
2282
2283for (c = *cptr; c <= d; c++)
2284  { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2285
2286if (c > d) return FALSE;
2287
2288*ocptr = othercase;
2289next = othercase + 1;
2290
2291for (++c; c <= d; c++)
2292  {
2293  if (UCD_OTHERCASE(c) != next) break;
2294  next++;
2295  }
2296
2297*odptr = next - 1;
2298*cptr = c;
2299
2300return TRUE;
2301}
2302#endif  /* SUPPORT_UCP */
2303
2304
2305
2306/*************************************************
2307*     Check if auto-possessifying is possible    *
2308*************************************************/
2309
2310/* This function is called for unlimited repeats of certain items, to see
2311whether the next thing could possibly match the repeated item. If not, it makes
2312sense to automatically possessify the repeated item.
2313
2314Arguments:
2315  op_code       the repeated op code
  item          data for this item, depends on the opcode
2317  utf8          TRUE in UTF-8 mode
2318  utf8_char     used for utf8 character bytes, NULL if not relevant
2319  ptr           next character in pattern
2320  options       options bits
2321  cd            contains pointers to tables etc.
2322
2323Returns:        TRUE if possessifying is wanted
2324*/
2325
2326static BOOL
2327check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2328  const uschar *ptr, int options, compile_data *cd)
2329{
2330int next;
2331
2332/* Skip whitespace and comments in extended mode */
2333
2334if ((options & PCRE_EXTENDED) != 0)
2335  {
2336  for (;;)
2337    {
2338    while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2339    if (*ptr == CHAR_NUMBER_SIGN)
2340      {
2341      while (*(++ptr) != 0)
2342        if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2343      }
2344    else break;
2345    }
2346  }
2347
2348/* If the next item is one that we can handle, get its value. A non-negative
2349value is a character, a negative value is an escape value. */
2350
2351if (*ptr == CHAR_BACKSLASH)
2352  {
2353  int temperrorcode = 0;
2354  next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2355  if (temperrorcode != 0) return FALSE;
2356  ptr++;    /* Point after the escape sequence */
2357  }
2358
2359else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2360  {
2361#ifdef SUPPORT_UTF8
2362  if (utf8) { GETCHARINC(next, ptr); } else
2363#endif
2364  next = *ptr++;
2365  }
2366
2367else return FALSE;
2368
2369/* Skip whitespace and comments in extended mode */
2370
2371if ((options & PCRE_EXTENDED) != 0)
2372  {
2373  for (;;)
2374    {
2375    while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2376    if (*ptr == CHAR_NUMBER_SIGN)
2377      {
2378      while (*(++ptr) != 0)
2379        if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2380      }
2381    else break;
2382    }
2383  }
2384
2385/* If the next thing is itself optional, we have to give up. */
2386
2387if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2388  strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2389    return FALSE;
2390
2391/* Now compare the next item with the previous opcode. If the previous is a
2392positive single character match, "item" either contains the character or, if
2393"item" is greater than 127 in utf8 mode, the character's bytes are in
2394utf8_char. */
2395
2396
2397/* Handle cases when the next item is a character. */
2398
2399if (next >= 0) switch(op_code)
2400  {
2401  case OP_CHAR:
2402#ifdef SUPPORT_UTF8
2403  if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2404#else
2405  (void)(utf8_char);  /* Keep compiler happy by referencing function argument */
2406#endif
2407  return item != next;
2408
2409  /* For CHARNC (caseless character) we must check the other case. If we have
2410  Unicode property support, we can use it to test the other case of
2411  high-valued characters. */
2412
2413  case OP_CHARNC:
2414#ifdef SUPPORT_UTF8
2415  if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2416#endif
2417  if (item == next) return FALSE;
2418#ifdef SUPPORT_UTF8
2419  if (utf8)
2420    {
2421    unsigned int othercase;
2422    if (next < 128) othercase = cd->fcc[next]; else
2423#ifdef SUPPORT_UCP
2424    othercase = UCD_OTHERCASE((unsigned int)next);
2425#else
2426    othercase = NOTACHAR;
2427#endif
2428    return (unsigned int)item != othercase;
2429    }
2430  else
2431#endif  /* SUPPORT_UTF8 */
2432  return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
2433
2434  /* For OP_NOT, "item" must be a single-byte character. */
2435
2436  case OP_NOT:
2437  if (item == next) return TRUE;
2438  if ((options & PCRE_CASELESS) == 0) return FALSE;
2439#ifdef SUPPORT_UTF8
2440  if (utf8)
2441    {
2442    unsigned int othercase;
2443    if (next < 128) othercase = cd->fcc[next]; else
2444#ifdef SUPPORT_UCP
2445    othercase = UCD_OTHERCASE(next);
2446#else
2447    othercase = NOTACHAR;
2448#endif
2449    return (unsigned int)item == othercase;
2450    }
2451  else
2452#endif  /* SUPPORT_UTF8 */
2453  return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
2454
2455  case OP_DIGIT:
2456  return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2457
2458  case OP_NOT_DIGIT:
2459  return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2460
2461  case OP_WHITESPACE:
2462  return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2463
2464  case OP_NOT_WHITESPACE:
2465  return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2466
2467  case OP_WORDCHAR:
2468  return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2469
2470  case OP_NOT_WORDCHAR:
2471  return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2472
2473  case OP_HSPACE:
2474  case OP_NOT_HSPACE:
2475  switch(next)
2476    {
2477    case 0x09:
2478    case 0x20:
2479    case 0xa0:
2480    case 0x1680:
2481    case 0x180e:
2482    case 0x2000:
2483    case 0x2001:
2484    case 0x2002:
2485    case 0x2003:
2486    case 0x2004:
2487    case 0x2005:
2488    case 0x2006:
2489    case 0x2007:
2490    case 0x2008:
2491    case 0x2009:
2492    case 0x200A:
2493    case 0x202f:
2494    case 0x205f:
2495    case 0x3000:
2496    return op_code != OP_HSPACE;
2497    default:
2498    return op_code == OP_HSPACE;
2499    }
2500
2501  case OP_VSPACE:
2502  case OP_NOT_VSPACE:
2503  switch(next)
2504    {
2505    case 0x0a:
2506    case 0x0b:
2507    case 0x0c:
2508    case 0x0d:
2509    case 0x85:
2510    case 0x2028:
2511    case 0x2029:
2512    return op_code != OP_VSPACE;
2513    default:
2514    return op_code == OP_VSPACE;
2515    }
2516
2517  default:
2518  return FALSE;
2519  }
2520
2521
2522/* Handle the case when the next item is \d, \s, etc. */
2523
2524switch(op_code)
2525  {
2526  case OP_CHAR:
2527  case OP_CHARNC:
2528#ifdef SUPPORT_UTF8
2529  if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2530#endif
2531  switch(-next)
2532    {
2533    case ESC_d:
2534    return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2535
2536    case ESC_D:
2537    return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2538
2539    case ESC_s:
2540    return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2541
2542    case ESC_S:
2543    return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2544
2545    case ESC_w:
2546    return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2547
2548    case ESC_W:
2549    return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2550
2551    case ESC_h:
2552    case ESC_H:
2553    switch(item)
2554      {
2555      case 0x09:
2556      case 0x20:
2557      case 0xa0:
2558      case 0x1680:
2559      case 0x180e:
2560      case 0x2000:
2561      case 0x2001:
2562      case 0x2002:
2563      case 0x2003:
2564      case 0x2004:
2565      case 0x2005:
2566      case 0x2006:
2567      case 0x2007:
2568      case 0x2008:
2569      case 0x2009:
2570      case 0x200A:
2571      case 0x202f:
2572      case 0x205f:
2573      case 0x3000:
2574      return -next != ESC_h;
2575      default:
2576      return -next == ESC_h;
2577      }
2578
2579    case ESC_v:
2580    case ESC_V:
2581    switch(item)
2582      {
2583      case 0x0a:
2584      case 0x0b:
2585      case 0x0c:
2586      case 0x0d:
2587      case 0x85:
2588      case 0x2028:
2589      case 0x2029:
2590      return -next != ESC_v;
2591      default:
2592      return -next == ESC_v;
2593      }
2594
2595    default:
2596    return FALSE;
2597    }
2598
2599  case OP_DIGIT:
2600  return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2601         next == -ESC_h || next == -ESC_v;
2602
2603  case OP_NOT_DIGIT:
2604  return next == -ESC_d;
2605
2606  case OP_WHITESPACE:
2607  return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2608
2609  case OP_NOT_WHITESPACE:
2610  return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2611
2612  case OP_HSPACE:
2613  return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2614
2615  case OP_NOT_HSPACE:
2616  return next == -ESC_h;
2617
2618  /* Can't have \S in here because VT matches \S (Perl anomaly) */
2619  case OP_VSPACE:
2620  return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2621
2622  case OP_NOT_VSPACE:
2623  return next == -ESC_v;
2624
2625  case OP_WORDCHAR:
2626  return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2627
2628  case OP_NOT_WORDCHAR:
2629  return next == -ESC_w || next == -ESC_d;
2630
2631  default:
2632  return FALSE;
2633  }
2634
2635/* Control does not reach here */
2636}
2637
2638
2639
2640/*************************************************
2641*           Compile one branch                   *
2642*************************************************/
2643
/* Scan the pattern, compiling it into a vector. If the options are
2645changed during the branch, the pointer is used to change the external options
2646bits. This function is used during the pre-compile phase when we are trying
2647to find out the amount of memory needed, as well as during the real compile
2648phase. The value of lengthptr distinguishes the two phases.
2649
2650Arguments:
2651  optionsptr     pointer to the option bits
2652  codeptr        points to the pointer to the current code point
2653  ptrptr         points to the current pattern pointer
2654  errorcodeptr   points to error code variable
2655  firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2656  reqbyteptr     set to the last literal character required, else < 0
2657  bcptr          points to current branch chain
2658  cd             contains pointers to tables etc.
2659  lengthptr      NULL during the real compile phase
2660                 points to length accumulator during pre-compile phase
2661
2662Returns:         TRUE on success
2663                 FALSE, with *errorcodeptr set non-zero on error
2664*/
2665
2666static BOOL
2667compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2668  int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2669  compile_data *cd, int *lengthptr)
2670{
2671int repeat_type, op_type;
2672int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
2673int bravalue = 0;
2674int greedy_default, greedy_non_default;
2675int firstbyte, reqbyte;
2676int zeroreqbyte, zerofirstbyte;
2677int req_caseopt, reqvary, tempreqvary;
2678int options = *optionsptr;
2679int after_manual_callout = 0;
2680int length_prevgroup = 0;
2681register int c;
2682register uschar *code = *codeptr;
2683uschar *last_code = code;
2684uschar *orig_code = code;
2685uschar *tempcode;
2686BOOL inescq = FALSE;
2687BOOL groupsetfirstbyte = FALSE;
2688const uschar *ptr = *ptrptr;
2689const uschar *tempptr;
2690uschar *previous = NULL;
2691uschar *previous_callout = NULL;
2692uschar *save_hwm = NULL;
2693uschar classbits[32];
2694
2695#ifdef SUPPORT_UTF8
2696BOOL class_utf8;
2697BOOL utf8 = (options & PCRE_UTF8) != 0;
2698uschar *class_utf8data;
2699uschar *class_utf8data_base;
2700uschar utf8_char[6];
2701#else
2702BOOL utf8 = FALSE;
2703uschar *utf8_char = NULL;
2704#endif
2705
2706#ifdef PCRE_DEBUG
2707if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2708#endif
2709
2710/* Set up the default and non-default settings for greediness */
2711
2712greedy_default = ((options & PCRE_UNGREEDY) != 0);
2713greedy_non_default = greedy_default ^ 1;
2714
2715/* Initialize no first byte, no required byte. REQ_UNSET means "no char
2716matching encountered yet". It gets changed to REQ_NONE if we hit something that
2717matches a non-fixed char first char; reqbyte just remains unset if we never
2718find one.
2719
2720When we hit a repeat whose minimum is zero, we may have to adjust these values
2721to take the zero repeat into account. This is implemented by setting them to
2722zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2723item types that can be repeated set these backoff variables appropriately. */
2724
2725firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2726
2727/* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2728according to the current setting of the caseless flag. REQ_CASELESS is a bit
2729value > 255. It is added into the firstbyte or reqbyte variables to record the
2730case status of the value. This is used only for ASCII characters. */
2731
2732req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2733
2734/* Switch on next character until the end of the branch */
2735
2736for (;; ptr++)
2737  {
2738  BOOL negate_class;
2739  BOOL should_flip_negation;
2740  BOOL possessive_quantifier;
2741  BOOL is_quantifier;
2742  BOOL is_recurse;
2743  BOOL reset_bracount;
2744  int class_charcount;
2745  int class_lastchar;
2746  int newoptions;
2747  int recno;
2748  int refsign;
2749  int skipbytes;
2750  int subreqbyte;
2751  int subfirstbyte;
2752  int terminator;
2753  int mclength;
2754  uschar mcbuffer[8];
2755
2756  /* Get next byte in the pattern */
2757
2758  c = *ptr;
2759
2760  /* If we are in the pre-compile phase, accumulate the length used for the
2761  previous cycle of this loop. */
2762
2763  if (lengthptr != NULL)
2764    {
2765#ifdef PCRE_DEBUG
2766    if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2767#endif
2768    if (code > cd->start_workspace + WORK_SIZE_CHECK)   /* Check for overrun */
2769      {
2770      *errorcodeptr = ERR52;
2771      goto FAILED;
2772      }
2773
2774    /* There is at least one situation where code goes backwards: this is the
2775    case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2776    the class is simply eliminated. However, it is created first, so we have to
2777    allow memory for it. Therefore, don't ever reduce the length at this point.
2778    */
2779
2780    if (code < last_code) code = last_code;
2781
2782    /* Paranoid check for integer overflow */
2783
2784    if (OFLOW_MAX - *lengthptr < code - last_code)
2785      {
2786      *errorcodeptr = ERR20;
2787      goto FAILED;
2788      }
2789
2790    *lengthptr += code - last_code;
2791    DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2792
2793    /* If "previous" is set and it is not at the start of the work space, move
2794    it back to there, in order to avoid filling up the work space. Otherwise,
2795    if "previous" is NULL, reset the current code pointer to the start. */
2796
2797    if (previous != NULL)
2798      {
2799      if (previous > orig_code)
2800        {
2801        memmove(orig_code, previous, code - previous);
2802        code -= previous - orig_code;
2803        previous = orig_code;
2804        }
2805      }
2806    else code = orig_code;
2807
2808    /* Remember where this code item starts so we can pick up the length
2809    next time round. */
2810
2811    last_code = code;
2812    }
2813
2814  /* In the real compile phase, just check the workspace used by the forward
2815  reference list. */
2816
2817  else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)
2818    {
2819    *errorcodeptr = ERR52;
2820    goto FAILED;
2821    }
2822
2823  /* If in \Q...\E, check for the end; if not, we have a literal */
2824
2825  if (inescq && c != 0)
2826    {
2827    if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
2828      {
2829      inescq = FALSE;
2830      ptr++;
2831      continue;
2832      }
2833    else
2834      {
2835      if (previous_callout != NULL)
2836        {
2837        if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2838          complete_callout(previous_callout, ptr, cd);
2839        previous_callout = NULL;
2840        }
2841      if ((options & PCRE_AUTO_CALLOUT) != 0)
2842        {
2843        previous_callout = code;
2844        code = auto_callout(code, ptr, cd);
2845        }
2846      goto NORMAL_CHAR;
2847      }
2848    }
2849
2850  /* Fill in length of a previous callout, except when the next thing is
2851  a quantifier. */
2852
2853  is_quantifier =
2854    c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
2855    (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
2856
2857  if (!is_quantifier && previous_callout != NULL &&
2858       after_manual_callout-- <= 0)
2859    {
2860    if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2861      complete_callout(previous_callout, ptr, cd);
2862    previous_callout = NULL;
2863    }
2864
2865  /* In extended mode, skip white space and comments */
2866
2867  if ((options & PCRE_EXTENDED) != 0)
2868    {
2869    if ((cd->ctypes[c] & ctype_space) != 0) continue;
2870    if (c == CHAR_NUMBER_SIGN)
2871      {
2872      while (*(++ptr) != 0)
2873        {
2874        if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2875        }
2876      if (*ptr != 0) continue;
2877
2878      /* Else fall through to handle end of string */
2879      c = 0;
2880      }
2881    }
2882
2883  /* No auto callout for quantifiers. */
2884
2885  if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2886    {
2887    previous_callout = code;
2888    code = auto_callout(code, ptr, cd);
2889    }
2890
2891  switch(c)
2892    {
2893    /* ===================================================================*/
2894    case 0:                        /* The branch terminates at string end */
2895    case CHAR_VERTICAL_LINE:       /* or | or ) */
2896    case CHAR_RIGHT_PARENTHESIS:
2897    *firstbyteptr = firstbyte;
2898    *reqbyteptr = reqbyte;
2899    *codeptr = code;
2900    *ptrptr = ptr;
2901    if (lengthptr != NULL)
2902      {
2903      if (OFLOW_MAX - *lengthptr < code - last_code)
2904        {
2905        *errorcodeptr = ERR20;
2906        goto FAILED;
2907        }
2908      *lengthptr += code - last_code;   /* To include callout length */
2909      DPRINTF((">> end branch\n"));
2910      }
2911    return TRUE;
2912
2913
2914    /* ===================================================================*/
2915    /* Handle single-character metacharacters. In multiline mode, ^ disables
2916    the setting of any following char as a first character. */
2917
2918    case CHAR_CIRCUMFLEX_ACCENT:
2919    if ((options & PCRE_MULTILINE) != 0)
2920      {
2921      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2922      }
2923    previous = NULL;
2924    *code++ = OP_CIRC;
2925    break;
2926
2927    case CHAR_DOLLAR_SIGN:
2928    previous = NULL;
2929    *code++ = OP_DOLL;
2930    break;
2931
2932    /* There can never be a first char if '.' is first, whatever happens about
2933    repeats. The value of reqbyte doesn't change either. */
2934
2935    case CHAR_DOT:
2936    if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2937    zerofirstbyte = firstbyte;
2938    zeroreqbyte = reqbyte;
2939    previous = code;
2940    *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
2941    break;
2942
2943
2944    /* ===================================================================*/
2945    /* Character classes. If the included characters are all < 256, we build a
2946    32-byte bitmap of the permitted characters, except in the special case
2947    where there is only one such character. For negated classes, we build the
2948    map as usual, then invert it at the end. However, we use a different opcode
2949    so that data characters > 255 can be handled correctly.
2950
2951    If the class contains characters outside the 0-255 range, a different
2952    opcode is compiled. It may optionally have a bit map for characters < 256,
2953    but those above are explicitly listed afterwards. A flag byte tells
2954    whether the bitmap is present, and whether this is a negated class or not.
2955
2956    In JavaScript compatibility mode, an isolated ']' causes an error. In
2957    default (Perl) mode, it is treated as a data character. */
2958
2959    case CHAR_RIGHT_SQUARE_BRACKET:
2960    if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2961      {
2962      *errorcodeptr = ERR64;
2963      goto FAILED;
2964      }
2965    goto NORMAL_CHAR;
2966
2967    case CHAR_LEFT_SQUARE_BRACKET:
2968    previous = code;
2969
2970    /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2971    they are encountered at the top level, so we'll do that too. */
2972
2973    if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2974         ptr[1] == CHAR_EQUALS_SIGN) &&
2975        check_posix_syntax(ptr, &tempptr))
2976      {
2977      *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
2978      goto FAILED;
2979      }
2980
2981    /* If the first character is '^', set the negation flag and skip it. Also,
2982    if the first few characters (either before or after ^) are \Q\E or \E we
2983    skip them too. This makes for compatibility with Perl. */
2984
2985    negate_class = FALSE;
2986    for (;;)
2987      {
2988      c = *(++ptr);
2989      if (c == CHAR_BACKSLASH)
2990        {
2991        if (ptr[1] == CHAR_E)
2992          ptr++;
2993        else if (strncmp((const char *)ptr+1,
2994                          STR_Q STR_BACKSLASH STR_E, 3) == 0)
2995          ptr += 3;
2996        else
2997          break;
2998        }
2999      else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3000        negate_class = TRUE;
3001      else break;
3002      }
3003
3004    /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
3005    an initial ']' is taken as a data character -- the code below handles
3006    that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
3007    [^] must match any character, so generate OP_ALLANY. */
3008
3009    if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3010        (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3011      {
3012      *code++ = negate_class? OP_ALLANY : OP_FAIL;
3013      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3014      zerofirstbyte = firstbyte;
3015      break;
3016      }
3017
3018    /* If a class contains a negative special such as \S, we need to flip the
3019    negation flag at the end, so that support for characters > 255 works
3020    correctly (they are all included in the class). */
3021
3022    should_flip_negation = FALSE;
3023
3024    /* Keep a count of chars with values < 256 so that we can optimize the case
3025    of just a single character (as long as it's < 256). However, for higher
3026    valued UTF-8 characters, we don't yet do any optimization. */
3027
3028    class_charcount = 0;
3029    class_lastchar = -1;
3030
3031    /* Initialize the 32-char bit map to all zeros. We build the map in a
3032    temporary bit of memory, in case the class contains only 1 character (less
3033    than 256), because in that case the compiled code doesn't use the bit map.
3034    */
3035
3036    memset(classbits, 0, 32 * sizeof(uschar));
3037
3038#ifdef SUPPORT_UTF8
3039    class_utf8 = FALSE;                       /* No chars >= 256 */
3040    class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
3041    class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */
3042#endif
3043
3044    /* Process characters until ] is reached. By writing this as a "do" it
3045    means that an initial ] is taken as a data character. At the start of the
3046    loop, c contains the first byte of the character. */
3047
3048    if (c != 0) do
3049      {
3050      const uschar *oldptr;
3051
3052#ifdef SUPPORT_UTF8
3053      if (utf8 && c > 127)
3054        {                           /* Braces are required because the */
3055        GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
3056        }
3057
3058      /* In the pre-compile phase, accumulate the length of any UTF-8 extra
3059      data and reset the pointer. This is so that very large classes that
3060      contain a zillion UTF-8 characters no longer overwrite the work space
3061      (which is on the stack). */
3062
3063      if (lengthptr != NULL)
3064        {
3065        *lengthptr += class_utf8data - class_utf8data_base;
3066        class_utf8data = class_utf8data_base;
3067        }
3068
3069#endif
3070
3071      /* Inside \Q...\E everything is literal except \E */
3072
3073      if (inescq)
3074        {
3075        if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
3076          {
3077          inescq = FALSE;                   /* Reset literal state */
3078          ptr++;                            /* Skip the 'E' */
3079          continue;                         /* Carry on with next */
3080          }
3081        goto CHECK_RANGE;                   /* Could be range if \E follows */
3082        }
3083
3084      /* Handle POSIX class names. Perl allows a negation extension of the
3085      form [:^name:]. A square bracket that doesn't match the syntax is
3086      treated as a literal. We also recognize the POSIX constructions
3087      [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3088      5.6 and 5.8 do. */
3089
3090      if (c == CHAR_LEFT_SQUARE_BRACKET &&
3091          (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3092           ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3093        {
3094        BOOL local_negate = FALSE;
3095        int posix_class, taboffset, tabopt;
3096        register const uschar *cbits = cd->cbits;
3097        uschar pbits[32];
3098
3099        if (ptr[1] != CHAR_COLON)
3100          {
3101          *errorcodeptr = ERR31;
3102          goto FAILED;
3103          }
3104
3105        ptr += 2;
3106        if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3107          {
3108          local_negate = TRUE;
3109          should_flip_negation = TRUE;  /* Note negative special */
3110          ptr++;
3111          }
3112
3113        posix_class = check_posix_name(ptr, tempptr - ptr);
3114        if (posix_class < 0)
3115          {
3116          *errorcodeptr = ERR30;
3117          goto FAILED;
3118          }
3119
3120        /* If matching is caseless, upper and lower are converted to
3121        alpha. This relies on the fact that the class table starts with
3122        alpha, lower, upper as the first 3 entries. */
3123
3124        if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3125          posix_class = 0;
3126
3127        /* We build the bit map for the POSIX class in a chunk of local store
3128        because we may be adding and subtracting from it, and we don't want to
3129        subtract bits that may be in the main map already. At the end we or the
3130        result into the bit map that is being built. */
3131
3132        posix_class *= 3;
3133
3134        /* Copy in the first table (always present) */
3135
3136        memcpy(pbits, cbits + posix_class_maps[posix_class],
3137          32 * sizeof(uschar));
3138
3139        /* If there is a second table, add or remove it as required. */
3140
3141        taboffset = posix_class_maps[posix_class + 1];
3142        tabopt = posix_class_maps[posix_class + 2];
3143
3144        if (taboffset >= 0)
3145          {
3146          if (tabopt >= 0)
3147            for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3148          else
3149            for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3150          }
3151
3152        /* Now see if we need to remove any special characters. An option
3153        value of 1 removes vertical space and 2 removes underscore. */
3154
3155        if (tabopt < 0) tabopt = -tabopt;
3156        if (tabopt == 1) pbits[1] &= ~0x3c;
3157          else if (tabopt == 2) pbits[11] &= 0x7f;
3158
3159        /* Add the POSIX table or its complement into the main table that is
3160        being built and we are done. */
3161
3162        if (local_negate)
3163          for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3164        else
3165          for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3166
3167        ptr = tempptr + 1;
3168        class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
3169        continue;    /* End of POSIX syntax handling */
3170        }
3171
3172      /* Backslash may introduce a single character, or it may introduce one
3173      of the specials, which just set a flag. The sequence \b is a special
3174      case. Inside a class (and only there) it is treated as backspace.
3175      Elsewhere it marks a word boundary. Other escapes have preset maps ready
3176      to 'or' into the one we are building. We assume they have more than one
3177      character in them, so set class_charcount bigger than one. */
3178
3179      if (c == CHAR_BACKSLASH)
3180        {
3181        c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3182        if (*errorcodeptr != 0) goto FAILED;
3183
3184        if (-c == ESC_b) c = CHAR_BS;       /* \b is backspace in a class */
3185        else if (-c == ESC_X) c = CHAR_X;   /* \X is literal X in a class */
3186        else if (-c == ESC_R) c = CHAR_R;   /* \R is literal R in a class */
3187        else if (-c == ESC_Q)            /* Handle start of quoted string */
3188          {
3189          if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3190            {
3191            ptr += 2; /* avoid empty string */
3192            }
3193          else inescq = TRUE;
3194          continue;
3195          }
3196        else if (-c == ESC_E) continue;  /* Ignore orphan \E */
3197
3198        if (c < 0)
3199          {
3200          register const uschar *cbits = cd->cbits;
3201          class_charcount += 2;     /* Greater than 1 is what matters */
3202
3203          /* Save time by not doing this in the pre-compile phase. */
3204
3205          if (lengthptr == NULL) switch (-c)
3206            {
3207            case ESC_d:
3208            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3209            continue;
3210
3211            case ESC_D:
3212            should_flip_negation = TRUE;
3213            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3214            continue;
3215
3216            case ESC_w:
3217            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3218            continue;
3219
3220            case ESC_W:
3221            should_flip_negation = TRUE;
3222            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3223            continue;
3224
3225            case ESC_s:
3226            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3227            classbits[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */
3228            continue;
3229
3230            case ESC_S:
3231            should_flip_negation = TRUE;
3232            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3233            classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
3234            continue;
3235
3236            default:    /* Not recognized; fall through */
3237            break;      /* Need "default" setting to stop compiler warning. */
3238            }
3239
3240          /* In the pre-compile phase, just do the recognition. */
3241
3242          else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
3243                   c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
3244
3245          /* We need to deal with \H, \h, \V, and \v in both phases because
3246          they use extra memory. */
3247
3248          if (-c == ESC_h)
3249            {
3250            SETBIT(classbits, 0x09); /* VT */
3251            SETBIT(classbits, 0x20); /* SPACE */
3252            SETBIT(classbits, 0xa0); /* NSBP */
3253#ifdef SUPPORT_UTF8
3254            if (utf8)
3255              {
3256              class_utf8 = TRUE;
3257              *class_utf8data++ = XCL_SINGLE;
3258              class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3259              *class_utf8data++ = XCL_SINGLE;
3260              class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3261              *class_utf8data++ = XCL_RANGE;
3262              class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3263              class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3264              *class_utf8data++ = XCL_SINGLE;
3265              class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3266              *class_utf8data++ = XCL_SINGLE;
3267              class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3268              *class_utf8data++ = XCL_SINGLE;
3269              class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3270              }
3271#endif
3272            continue;
3273            }
3274
3275          if (-c == ESC_H)
3276            {
3277            for (c = 0; c < 32; c++)
3278              {
3279              int x = 0xff;
3280              switch (c)
3281                {
3282                case 0x09/8: x ^= 1 << (0x09%8); break;
3283                case 0x20/8: x ^= 1 << (0x20%8); break;
3284                case 0xa0/8: x ^= 1 << (0xa0%8); break;
3285                default: break;
3286                }
3287              classbits[c] |= x;
3288              }
3289
3290#ifdef SUPPORT_UTF8
3291            if (utf8)
3292              {
3293              class_utf8 = TRUE;
3294              *class_utf8data++ = XCL_RANGE;
3295              class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3296              class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3297              *class_utf8data++ = XCL_RANGE;
3298              class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3299              class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3300              *class_utf8data++ = XCL_RANGE;
3301              class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3302              class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3303              *class_utf8data++ = XCL_RANGE;
3304              class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3305              class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3306              *class_utf8data++ = XCL_RANGE;
3307              class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3308              class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3309              *class_utf8data++ = XCL_RANGE;
3310              class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3311              class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3312              *class_utf8data++ = XCL_RANGE;
3313              class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3314              class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3315              }
3316#endif
3317            continue;
3318            }
3319
3320          if (-c == ESC_v)
3321            {
3322            SETBIT(classbits, 0x0a); /* LF */
3323            SETBIT(classbits, 0x0b); /* VT */
3324            SETBIT(classbits, 0x0c); /* FF */
3325            SETBIT(classbits, 0x0d); /* CR */
3326            SETBIT(classbits, 0x85); /* NEL */
3327#ifdef SUPPORT_UTF8
3328            if (utf8)
3329              {
3330              class_utf8 = TRUE;
3331              *class_utf8data++ = XCL_RANGE;
3332              class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3333              class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3334              }
3335#endif
3336            continue;
3337            }
3338
3339          if (-c == ESC_V)
3340            {
3341            for (c = 0; c < 32; c++)
3342              {
3343              int x = 0xff;
3344              switch (c)
3345                {
3346                case 0x0a/8: x ^= 1 << (0x0a%8);
3347                             x ^= 1 << (0x0b%8);
3348                             x ^= 1 << (0x0c%8);
3349                             x ^= 1 << (0x0d%8);
3350                             break;
3351                case 0x85/8: x ^= 1 << (0x85%8); break;
3352                default: break;
3353                }
3354              classbits[c] |= x;
3355              }
3356
3357#ifdef SUPPORT_UTF8
3358            if (utf8)
3359              {
3360              class_utf8 = TRUE;
3361              *class_utf8data++ = XCL_RANGE;
3362              class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3363              class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3364              *class_utf8data++ = XCL_RANGE;
3365              class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3366              class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3367              }
3368#endif
3369            continue;
3370            }
3371
3372          /* We need to deal with \P and \p in both phases. */
3373
3374#ifdef SUPPORT_UCP
3375          if (-c == ESC_p || -c == ESC_P)
3376            {
3377            BOOL negated;
3378            int pdata;
3379            int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3380            if (ptype < 0) goto FAILED;
3381            class_utf8 = TRUE;
3382            *class_utf8data++ = ((-c == ESC_p) != negated)?
3383              XCL_PROP : XCL_NOTPROP;
3384            *class_utf8data++ = ptype;
3385            *class_utf8data++ = pdata;
3386            class_charcount -= 2;   /* Not a < 256 character */
3387            continue;
3388            }
3389#endif
3390          /* Unrecognized escapes are faulted if PCRE is running in its
3391          strict mode. By default, for compatibility with Perl, they are
3392          treated as literals. */
3393
3394          if ((options & PCRE_EXTRA) != 0)
3395            {
3396            *errorcodeptr = ERR7;
3397            goto FAILED;
3398            }
3399
3400          class_charcount -= 2;  /* Undo the default count from above */
3401          c = *ptr;              /* Get the final character and fall through */
3402          }
3403
3404        /* Fall through if we have a single character (c >= 0). This may be
3405        greater than 256 in UTF-8 mode. */
3406
3407        }   /* End of backslash handling */
3408
3409      /* A single character may be followed by '-' to form a range. However,
3410      Perl does not permit ']' to be the end of the range. A '-' character
3411      at the end is treated as a literal. Perl ignores orphaned \E sequences
3412      entirely. The code for handling \Q and \E is messy. */
3413
3414      CHECK_RANGE:
3415      while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3416        {
3417        inescq = FALSE;
3418        ptr += 2;
3419        }
3420
3421      oldptr = ptr;
3422
3423      /* Remember \r or \n */
3424
3425      if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3426
3427      /* Check for range */
3428
3429      if (!inescq && ptr[1] == CHAR_MINUS)
3430        {
3431        int d;
3432        ptr += 2;
3433        while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3434
3435        /* If we hit \Q (not followed by \E) at this point, go into escaped
3436        mode. */
3437
3438        while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3439          {
3440          ptr += 2;
3441          if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3442            { ptr += 2; continue; }
3443          inescq = TRUE;
3444          break;
3445          }
3446
3447        if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3448          {
3449          ptr = oldptr;
3450          goto LONE_SINGLE_CHARACTER;
3451          }
3452
3453#ifdef SUPPORT_UTF8
3454        if (utf8)
3455          {                           /* Braces are required because the */
3456          GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
3457          }
3458        else
3459#endif
3460        d = *ptr;  /* Not UTF-8 mode */
3461
3462        /* The second part of a range can be a single-character escape, but
3463        not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3464        in such circumstances. */
3465
3466        if (!inescq && d == CHAR_BACKSLASH)
3467          {
3468          d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3469          if (*errorcodeptr != 0) goto FAILED;
3470
3471          /* \b is backspace; \X is literal X; \R is literal R; any other
3472          special means the '-' was literal */
3473
3474          if (d < 0)
3475            {
3476            if (d == -ESC_b) d = CHAR_BS;
3477            else if (d == -ESC_X) d = CHAR_X;
3478            else if (d == -ESC_R) d = CHAR_R; else
3479              {
3480              ptr = oldptr;
3481              goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3482              }
3483            }
3484          }
3485
3486        /* Check that the two values are in the correct order. Optimize
3487        one-character ranges */
3488
3489        if (d < c)
3490          {
3491          *errorcodeptr = ERR8;
3492          goto FAILED;
3493          }
3494
3495        if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3496
3497        /* Remember \r or \n */
3498
3499        if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3500
3501        /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3502        matching, we have to use an XCLASS with extra data items. Caseless
3503        matching for characters > 127 is available only if UCP support is
3504        available. */
3505
3506#ifdef SUPPORT_UTF8
3507        if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3508          {
3509          class_utf8 = TRUE;
3510
3511          /* With UCP support, we can find the other case equivalents of
3512          the relevant characters. There may be several ranges. Optimize how
3513          they fit with the basic range. */
3514
3515#ifdef SUPPORT_UCP
3516          if ((options & PCRE_CASELESS) != 0)
3517            {
3518            unsigned int occ, ocd;
3519            unsigned int cc = c;
3520            unsigned int origd = d;
3521            while (get_othercase_range(&cc, origd, &occ, &ocd))
3522              {
3523              if (occ >= (unsigned int)c &&
3524                  ocd <= (unsigned int)d)
3525                continue;                          /* Skip embedded ranges */
3526
3527              if (occ < (unsigned int)c  &&
3528                  ocd >= (unsigned int)c - 1)      /* Extend the basic range */
3529                {                                  /* if there is overlap,   */
3530                c = occ;                           /* noting that if occ < c */
3531                continue;                          /* we can't have ocd > d  */
3532                }                                  /* because a subrange is  */
3533              if (ocd > (unsigned int)d &&
3534                  occ <= (unsigned int)d + 1)      /* always shorter than    */
3535                {                                  /* the basic range.       */
3536                d = ocd;
3537                continue;
3538                }
3539
3540              if (occ == ocd)
3541                {
3542                *class_utf8data++ = XCL_SINGLE;
3543                }
3544              else
3545                {
3546                *class_utf8data++ = XCL_RANGE;
3547                class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3548                }
3549              class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3550              }
3551            }
3552#endif  /* SUPPORT_UCP */
3553
3554          /* Now record the original range, possibly modified for UCP caseless
3555          overlapping ranges. */
3556
3557          *class_utf8data++ = XCL_RANGE;
3558          class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3559          class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3560
3561          /* With UCP support, we are done. Without UCP support, there is no
3562          caseless matching for UTF-8 characters > 127; we can use the bit map
3563          for the smaller ones. */
3564
3565#ifdef SUPPORT_UCP
3566          continue;    /* With next character in the class */
3567#else
3568          if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3569
3570          /* Adjust upper limit and fall through to set up the map */
3571
3572          d = 127;
3573
3574#endif  /* SUPPORT_UCP */
3575          }
3576#endif  /* SUPPORT_UTF8 */
3577
3578        /* We use the bit map for all cases when not in UTF-8 mode; else
3579        ranges that lie entirely within 0-127 when there is UCP support; else
3580        for partial ranges without UCP support. */
3581
3582        class_charcount += d - c + 1;
3583        class_lastchar = d;
3584
3585        /* We can save a bit of time by skipping this in the pre-compile. */
3586
3587        if (lengthptr == NULL) for (; c <= d; c++)
3588          {
3589          classbits[c/8] |= (1 << (c&7));
3590          if ((options & PCRE_CASELESS) != 0)
3591            {
3592            int uc = cd->fcc[c];           /* flip case */
3593            classbits[uc/8] |= (1 << (uc&7));
3594            }
3595          }
3596
3597        continue;   /* Go get the next char in the class */
3598        }
3599
3600      /* Handle a lone single character - we can get here for a normal
3601      non-escape char, or after \ that introduces a single character or for an
3602      apparent range that isn't. */
3603
3604      LONE_SINGLE_CHARACTER:
3605
3606      /* Handle a character that cannot go in the bit map */
3607
3608#ifdef SUPPORT_UTF8
3609      if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3610        {
3611        class_utf8 = TRUE;
3612        *class_utf8data++ = XCL_SINGLE;
3613        class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3614
3615#ifdef SUPPORT_UCP
3616        if ((options & PCRE_CASELESS) != 0)
3617          {
3618          unsigned int othercase;
3619          if ((othercase = UCD_OTHERCASE(c)) != c)
3620            {
3621            *class_utf8data++ = XCL_SINGLE;
3622            class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3623            }
3624          }
3625#endif  /* SUPPORT_UCP */
3626
3627        }
3628      else
3629#endif  /* SUPPORT_UTF8 */
3630
3631      /* Handle a single-byte character */
3632        {
3633        classbits[c/8] |= (1 << (c&7));
3634        if ((options & PCRE_CASELESS) != 0)
3635          {
3636          c = cd->fcc[c];   /* flip case */
3637          classbits[c/8] |= (1 << (c&7));
3638          }
3639        class_charcount++;
3640        class_lastchar = c;
3641        }
3642      }
3643
3644    /* Loop until ']' reached. This "while" is the end of the "do" above. */
3645
3646    while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3647
3648    if (c == 0)                          /* Missing terminating ']' */
3649      {
3650      *errorcodeptr = ERR6;
3651      goto FAILED;
3652      }
3653
3654
3655/* This code has been disabled because it would mean that \s counts as
3656an explicit \r or \n reference, and that's not really what is wanted. Now
3657we set the flag only if there is a literal "\r" or "\n" in the class. */
3658
3659#if 0
3660    /* Remember whether \r or \n are in this class */
3661
3662    if (negate_class)
3663      {
3664      if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3665      }
3666    else
3667      {
3668      if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3669      }
3670#endif
3671
3672
3673    /* If class_charcount is 1, we saw precisely one character whose value is
3674    less than 256. As long as there were no characters >= 128 and there was no
3675    use of \p or \P, in other words, no use of any XCLASS features, we can
3676    optimize.
3677
3678    In UTF-8 mode, we can optimize the negative case only if there were no
3679    characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3680    operate on single-bytes only. This is an historical hangover. Maybe one day
3681    we can tidy these opcodes to handle multi-byte characters.
3682
3683    The optimization throws away the bit map. We turn the item into a
3684    1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3685    that OP_NOT does not support multibyte characters. In the positive case, it
3686    can cause firstbyte to be set. Otherwise, there can be no first char if
3687    this item is first, whatever repeat count may follow. In the case of
3688    reqbyte, save the previous value for reinstating. */
3689
3690#ifdef SUPPORT_UTF8
3691    if (class_charcount == 1 && !class_utf8 &&
3692      (!utf8 || !negate_class || class_lastchar < 128))
3693#else
3694    if (class_charcount == 1)
3695#endif
3696      {
3697      zeroreqbyte = reqbyte;
3698
3699      /* The OP_NOT opcode works on one-byte characters only. */
3700
3701      if (negate_class)
3702        {
3703        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3704        zerofirstbyte = firstbyte;
3705        *code++ = OP_NOT;
3706        *code++ = class_lastchar;
3707        break;
3708        }
3709
3710      /* For a single, positive character, get the value into mcbuffer, and
3711      then we can handle this with the normal one-character code. */
3712
3713#ifdef SUPPORT_UTF8
3714      if (utf8 && class_lastchar > 127)
3715        mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3716      else
3717#endif
3718        {
3719        mcbuffer[0] = class_lastchar;
3720        mclength = 1;
3721        }
3722      goto ONE_CHAR;
3723      }       /* End of 1-char optimization */
3724
3725    /* The general case - not the one-char optimization. If this is the first
3726    thing in the branch, there can be no first char setting, whatever the
3727    repeat count. Any reqbyte setting must remain unchanged after any kind of
3728    repeat. */
3729
3730    if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3731    zerofirstbyte = firstbyte;
3732    zeroreqbyte = reqbyte;
3733
3734    /* If there are characters with values > 255, we have to compile an
3735    extended class, with its own opcode, unless there was a negated special
3736    such as \S in the class, because in that case all characters > 255 are in
3737    the class, so any that were explicitly given as well can be ignored. If
3738    (when there are explicit characters > 255 that must be listed) there are no
3739    characters < 256, we can omit the bitmap in the actual compiled code. */
3740
3741#ifdef SUPPORT_UTF8
3742    if (class_utf8 && !should_flip_negation)
3743      {
3744      *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
3745      *code++ = OP_XCLASS;
3746      code += LINK_SIZE;
3747      *code = negate_class? XCL_NOT : 0;
3748
3749      /* If the map is required, move up the extra data to make room for it;
3750      otherwise just move the code pointer to the end of the extra data. */
3751
3752      if (class_charcount > 0)
3753        {
3754        *code++ |= XCL_MAP;
3755        memmove(code + 32, code, class_utf8data - code);
3756        memcpy(code, classbits, 32);
3757        code = class_utf8data + 32;
3758        }
3759      else code = class_utf8data;
3760
3761      /* Now fill in the complete length of the item */
3762
3763      PUT(previous, 1, code - previous);
3764      break;   /* End of class handling */
3765      }
3766#endif
3767
3768    /* If there are no characters > 255, set the opcode to OP_CLASS or
3769    OP_NCLASS, depending on whether the whole class was negated and whether
3770    there were negative specials such as \S in the class. Then copy the 32-byte
3771    map into the code vector, negating it if necessary. */
3772
3773    *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3774    if (negate_class)
3775      {
3776      if (lengthptr == NULL)    /* Save time in the pre-compile phase */
3777        for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3778      }
3779    else
3780      {
3781      memcpy(code, classbits, 32);
3782      }
3783    code += 32;
3784    break;
3785
3786
3787    /* ===================================================================*/
3788    /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3789    has been tested above. */
3790
3791    case CHAR_LEFT_CURLY_BRACKET:
3792    if (!is_quantifier) goto NORMAL_CHAR;
3793    ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3794    if (*errorcodeptr != 0) goto FAILED;
3795    goto REPEAT;
3796
3797    case CHAR_ASTERISK:
3798    repeat_min = 0;
3799    repeat_max = -1;
3800    goto REPEAT;
3801
3802    case CHAR_PLUS:
3803    repeat_min = 1;
3804    repeat_max = -1;
3805    goto REPEAT;
3806
3807    case CHAR_QUESTION_MARK:
3808    repeat_min = 0;
3809    repeat_max = 1;
3810
3811    REPEAT:
3812    if (previous == NULL)
3813      {
3814      *errorcodeptr = ERR9;
3815      goto FAILED;
3816      }
3817
3818    if (repeat_min == 0)
3819      {
3820      firstbyte = zerofirstbyte;    /* Adjust for zero repeat */
3821      reqbyte = zeroreqbyte;        /* Ditto */
3822      }
3823
3824    /* Remember whether this is a variable length repeat */
3825
3826    reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3827
3828    op_type = 0;                    /* Default single-char op codes */
3829    possessive_quantifier = FALSE;  /* Default not possessive quantifier */
3830
3831    /* Save start of previous item, in case we have to move it up to make space
3832    for an inserted OP_ONCE for the additional '+' extension. */
3833
3834    tempcode = previous;
3835
3836    /* If the next character is '+', we have a possessive quantifier. This
3837    implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3838    If the next character is '?' this is a minimizing repeat, by default,
3839    but if PCRE_UNGREEDY is set, it works the other way round. We change the
3840    repeat type to the non-default. */
3841
3842    if (ptr[1] == CHAR_PLUS)
3843      {
3844      repeat_type = 0;                  /* Force greedy */
3845      possessive_quantifier = TRUE;
3846      ptr++;
3847      }
3848    else if (ptr[1] == CHAR_QUESTION_MARK)
3849      {
3850      repeat_type = greedy_non_default;
3851      ptr++;
3852      }
3853    else repeat_type = greedy_default;
3854
3855    /* If previous was a character match, abolish the item and generate a
3856    repeat item instead. If a char item has a minimum of more than one, ensure
3857    that it is set in reqbyte - it might not be if a sequence such as x{3} is
3858    the first thing in a branch because the x will have gone into firstbyte
3859    instead.  */
3860
3861    if (*previous == OP_CHAR || *previous == OP_CHARNC)
3862      {
3863      /* Deal with UTF-8 characters that take up more than one byte. It's
3864      easier to write this out separately than try to macrify it. Use c to
3865      hold the length of the character in bytes, plus 0x80 to flag that it's a
3866      length rather than a small character. */
3867
3868#ifdef SUPPORT_UTF8
3869      if (utf8 && (code[-1] & 0x80) != 0)
3870        {
3871        uschar *lastchar = code - 1;
3872        while((*lastchar & 0xc0) == 0x80) lastchar--;
3873        c = code - lastchar;            /* Length of UTF-8 character */
3874        memcpy(utf8_char, lastchar, c); /* Save the char */
3875        c |= 0x80;                      /* Flag c as a length */
3876        }
3877      else
3878#endif
3879
3880      /* Handle the case of a single byte - either with no UTF8 support, or
3881      with UTF-8 disabled, or for a UTF-8 character < 128. */
3882
3883        {
3884        c = code[-1];
3885        if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3886        }
3887
3888      /* If the repetition is unlimited, it pays to see if the next thing on
3889      the line is something that cannot possibly match this character. If so,
3890      automatically possessifying this item gains some performance in the case
3891      where the match fails. */
3892
3893      if (!possessive_quantifier &&
3894          repeat_max < 0 &&
3895          check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3896            options, cd))
3897        {
3898        repeat_type = 0;    /* Force greedy */
3899        possessive_quantifier = TRUE;
3900        }
3901
3902      goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3903      }
3904
3905    /* If previous was a single negated character ([^a] or similar), we use
3906    one of the special opcodes, replacing it. The code is shared with single-
3907    character repeats by setting op_type to add a suitable offset into
3908    repeat_type. We can also test for auto-possessification. OP_NOT is
3909    currently used only for single-byte chars. */
3910
3911    else if (*previous == OP_NOT)
3912      {
3913      op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3914      c = previous[1];
3915      if (!possessive_quantifier &&
3916          repeat_max < 0 &&
3917          check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3918        {
3919        repeat_type = 0;    /* Force greedy */
3920        possessive_quantifier = TRUE;
3921        }
3922      goto OUTPUT_SINGLE_REPEAT;
3923      }
3924
3925    /* If previous was a character type match (\d or similar), abolish it and
3926    create a suitable repeat item. The code is shared with single-character
3927    repeats by setting op_type to add a suitable offset into repeat_type. Note
3928    that the Unicode property types will be present only when SUPPORT_UCP is
3929    defined, but we don't wrap the little bits of code here because it just
3930    makes it horribly messy. */
3931
3932    else if (*previous < OP_EODN)
3933      {
3934      uschar *oldcode;
3935      int prop_type, prop_value;
3936      op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3937      c = *previous;
3938
3939      if (!possessive_quantifier &&
3940          repeat_max < 0 &&
3941          check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3942        {
3943        repeat_type = 0;    /* Force greedy */
3944        possessive_quantifier = TRUE;
3945        }
3946
3947      OUTPUT_SINGLE_REPEAT:
3948      if (*previous == OP_PROP || *previous == OP_NOTPROP)
3949        {
3950        prop_type = previous[1];
3951        prop_value = previous[2];
3952        }
3953      else prop_type = prop_value = -1;
3954
3955      oldcode = code;
3956      code = previous;                  /* Usually overwrite previous item */
3957
3958      /* If the maximum is zero then the minimum must also be zero; Perl allows
3959      this case, so we do too - by simply omitting the item altogether. */
3960
3961      if (repeat_max == 0) goto END_REPEAT;
3962
3963      /*--------------------------------------------------------------------*/
3964      /* This code is obsolete from release 8.00; the restriction was finally
3965      removed: */
3966
3967      /* All real repeats make it impossible to handle partial matching (maybe
3968      one day we will be able to remove this restriction). */
3969
3970      /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
3971      /*--------------------------------------------------------------------*/
3972
3973      /* Combine the op_type with the repeat_type */
3974
3975      repeat_type += op_type;
3976
3977      /* A minimum of zero is handled either as the special case * or ?, or as
3978      an UPTO, with the maximum given. */
3979
3980      if (repeat_min == 0)
3981        {
3982        if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3983          else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3984        else
3985          {
3986          *code++ = OP_UPTO + repeat_type;
3987          PUT2INC(code, 0, repeat_max);
3988          }
3989        }
3990
3991      /* A repeat minimum of 1 is optimized into some special cases. If the
3992      maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3993      left in place and, if the maximum is greater than 1, we use OP_UPTO with
3994      one less than the maximum. */
3995
3996      else if (repeat_min == 1)
3997        {
3998        if (repeat_max == -1)
3999          *code++ = OP_PLUS + repeat_type;
4000        else
4001          {
4002          code = oldcode;                 /* leave previous item in place */
4003          if (repeat_max == 1) goto END_REPEAT;
4004          *code++ = OP_UPTO + repeat_type;
4005          PUT2INC(code, 0, repeat_max - 1);
4006          }
4007        }
4008
4009      /* The case {n,n} is just an EXACT, while the general case {n,m} is
4010      handled as an EXACT followed by an UPTO. */
4011
4012      else
4013        {
4014        *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
4015        PUT2INC(code, 0, repeat_min);
4016
4017        /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
4018        we have to insert the character for the previous code. For a repeated
4019        Unicode property match, there are two extra bytes that define the
4020        required property. In UTF-8 mode, long characters have their length in
4021        c, with the 0x80 bit as a flag. */
4022
4023        if (repeat_max < 0)
4024          {
4025#ifdef SUPPORT_UTF8
4026          if (utf8 && c >= 128)
4027            {
4028            memcpy(code, utf8_char, c & 7);
4029            code += c & 7;
4030            }
4031          else
4032#endif
4033            {
4034            *code++ = c;
4035            if (prop_type >= 0)
4036              {
4037              *code++ = prop_type;
4038              *code++ = prop_value;
4039              }
4040            }
4041          *code++ = OP_STAR + repeat_type;
4042          }
4043
4044        /* Else insert an UPTO if the max is greater than the min, again
4045        preceded by the character, for the previously inserted code. If the
4046        UPTO is just for 1 instance, we can use QUERY instead. */
4047
4048        else if (repeat_max != repeat_min)
4049          {
4050#ifdef SUPPORT_UTF8
4051          if (utf8 && c >= 128)
4052            {
4053            memcpy(code, utf8_char, c & 7);
4054            code += c & 7;
4055            }
4056          else
4057#endif
4058          *code++ = c;
4059          if (prop_type >= 0)
4060            {
4061            *code++ = prop_type;
4062            *code++ = prop_value;
4063            }
4064          repeat_max -= repeat_min;
4065
4066          if (repeat_max == 1)
4067            {
4068            *code++ = OP_QUERY + repeat_type;
4069            }
4070          else
4071            {
4072            *code++ = OP_UPTO + repeat_type;
4073            PUT2INC(code, 0, repeat_max);
4074            }
4075          }
4076        }
4077
4078      /* The character or character type itself comes last in all cases. */
4079
4080#ifdef SUPPORT_UTF8
4081      if (utf8 && c >= 128)
4082        {
4083        memcpy(code, utf8_char, c & 7);
4084        code += c & 7;
4085        }
4086      else
4087#endif
4088      *code++ = c;
4089
4090      /* For a repeated Unicode property match, there are two extra bytes that
4091      define the required property. */
4092
4093#ifdef SUPPORT_UCP
4094      if (prop_type >= 0)
4095        {
4096        *code++ = prop_type;
4097        *code++ = prop_value;
4098        }
4099#endif
4100      }
4101
4102    /* If previous was a character class or a back reference, we put the repeat
4103    stuff after it, but just skip the item if the repeat was {0,0}. */
4104
4105    else if (*previous == OP_CLASS ||
4106             *previous == OP_NCLASS ||
4107#ifdef SUPPORT_UTF8
4108             *previous == OP_XCLASS ||
4109#endif
4110             *previous == OP_REF)
4111      {
4112      if (repeat_max == 0)
4113        {
4114        code = previous;
4115        goto END_REPEAT;
4116        }
4117
4118      /*--------------------------------------------------------------------*/
4119      /* This code is obsolete from release 8.00; the restriction was finally
4120      removed: */
4121
4122      /* All real repeats make it impossible to handle partial matching (maybe
4123      one day we will be able to remove this restriction). */
4124
4125      /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
4126      /*--------------------------------------------------------------------*/
4127
4128      if (repeat_min == 0 && repeat_max == -1)
4129        *code++ = OP_CRSTAR + repeat_type;
4130      else if (repeat_min == 1 && repeat_max == -1)
4131        *code++ = OP_CRPLUS + repeat_type;
4132      else if (repeat_min == 0 && repeat_max == 1)
4133        *code++ = OP_CRQUERY + repeat_type;
4134      else
4135        {
4136        *code++ = OP_CRRANGE + repeat_type;
4137        PUT2INC(code, 0, repeat_min);
4138        if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
4139        PUT2INC(code, 0, repeat_max);
4140        }
4141      }
4142
4143    /* If previous was a bracket group, we may have to replicate it in certain
4144    cases. */
4145
4146    else if (*previous == OP_BRA  || *previous == OP_CBRA ||
4147             *previous == OP_ONCE || *previous == OP_COND)
4148      {
4149      register int i;
4150      int ketoffset = 0;
4151      int len = code - previous;
4152      uschar *bralink = NULL;
4153
4154      /* Repeating a DEFINE group is pointless */
4155
4156      if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
4157        {
4158        *errorcodeptr = ERR55;
4159        goto FAILED;
4160        }
4161
4162      /* If the maximum repeat count is unlimited, find the end of the bracket
4163      by scanning through from the start, and compute the offset back to it
4164      from the current code pointer. There may be an OP_OPT setting following
4165      the final KET, so we can't find the end just by going back from the code
4166      pointer. */
4167
4168      if (repeat_max == -1)
4169        {
4170        register uschar *ket = previous;
4171        do ket += GET(ket, 1); while (*ket != OP_KET);
4172        ketoffset = code - ket;
4173        }
4174
4175      /* The case of a zero minimum is special because of the need to stick
4176      OP_BRAZERO in front of it, and because the group appears once in the
4177      data, whereas in other cases it appears the minimum number of times. For
4178      this reason, it is simplest to treat this case separately, as otherwise
4179      the code gets far too messy. There are several special subcases when the
4180      minimum is zero. */
4181
4182      if (repeat_min == 0)
4183        {
4184        /* If the maximum is also zero, we used to just omit the group from the
4185        output altogether, like this:
4186
4187        ** if (repeat_max == 0)
4188        **   {
4189        **   code = previous;
4190        **   goto END_REPEAT;
4191        **   }
4192
4193        However, that fails when a group is referenced as a subroutine from
4194        elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
4195        so that it is skipped on execution. As we don't have a list of which
4196        groups are referenced, we cannot do this selectively.
4197
4198        If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
4199        and do no more at this point. However, we do need to adjust any
4200        OP_RECURSE calls inside the group that refer to the group itself or any
4201        internal or forward referenced group, because the offset is from the
4202        start of the whole regex. Temporarily terminate the pattern while doing
4203        this. */
4204
4205        if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
4206          {
4207          *code = OP_END;
4208          adjust_recurse(previous, 1, utf8, cd, save_hwm);
4209          memmove(previous+1, previous, len);
4210          code++;
4211          if (repeat_max == 0)
4212            {
4213            *previous++ = OP_SKIPZERO;
4214            goto END_REPEAT;
4215            }
4216          *previous++ = OP_BRAZERO + repeat_type;
4217          }
4218
4219        /* If the maximum is greater than 1 and limited, we have to replicate
4220        in a nested fashion, sticking OP_BRAZERO before each set of brackets.
4221        The first one has to be handled carefully because it's the original
4222        copy, which has to be moved up. The remainder can be handled by code
4223        that is common with the non-zero minimum case below. We have to
4224        adjust the value of repeat_max, since one less copy is required. Once
4225        again, we may have to adjust any OP_RECURSE calls inside the group. */
4226
4227        else
4228          {
4229          int offset;
4230          *code = OP_END;
4231          adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
4232          memmove(previous + 2 + LINK_SIZE, previous, len);
4233          code += 2 + LINK_SIZE;
4234          *previous++ = OP_BRAZERO + repeat_type;
4235          *previous++ = OP_BRA;
4236
4237          /* We chain together the bracket offset fields that have to be
4238          filled in later when the ends of the brackets are reached. */
4239
4240          offset = (bralink == NULL)? 0 : previous - bralink;
4241          bralink = previous;
4242          PUTINC(previous, 0, offset);
4243          }
4244
4245        repeat_max--;
4246        }
4247
4248      /* If the minimum is greater than zero, replicate the group as many
4249      times as necessary, and adjust the maximum to the number of subsequent
4250      copies that we need. If we set a first char from the group, and didn't
4251      set a required char, copy the latter from the former. If there are any
4252      forward reference subroutine calls in the group, there will be entries on
4253      the workspace list; replicate these with an appropriate increment. */
4254
4255      else
4256        {
4257        if (repeat_min > 1)
4258          {
4259          /* In the pre-compile phase, we don't actually do the replication. We
4260          just adjust the length as if we had. Do some paranoid checks for
4261          potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
4262          integer type when available, otherwise double. */
4263
4264          if (lengthptr != NULL)
4265            {
4266            int delta = (repeat_min - 1)*length_prevgroup;
4267            if ((INT64_OR_DOUBLE)(repeat_min - 1)*
4268                  (INT64_OR_DOUBLE)length_prevgroup >
4269                    (INT64_OR_DOUBLE)INT_MAX ||
4270                OFLOW_MAX - *lengthptr < delta)
4271              {
4272              *errorcodeptr = ERR20;
4273              goto FAILED;
4274              }
4275            *lengthptr += delta;
4276            }
4277
4278          /* This is compiling for real */
4279
4280          else
4281            {
4282            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
4283            for (i = 1; i < repeat_min; i++)
4284              {
4285              uschar *hc;
4286              uschar *this_hwm = cd->hwm;
4287              memcpy(code, previous, len);
4288              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4289                {
4290                PUT(cd->hwm, 0, GET(hc, 0) + len);
4291                cd->hwm += LINK_SIZE;
4292                }
4293              save_hwm = this_hwm;
4294              code += len;
4295              }
4296            }
4297          }
4298
4299        if (repeat_max > 0) repeat_max -= repeat_min;
4300        }
4301
4302      /* This code is common to both the zero and non-zero minimum cases. If
4303      the maximum is limited, it replicates the group in a nested fashion,
4304      remembering the bracket starts on a stack. In the case of a zero minimum,
4305      the first one was set up above. In all cases the repeat_max now specifies
4306      the number of additional copies needed. Again, we must remember to
4307      replicate entries on the forward reference list. */
4308
4309      if (repeat_max >= 0)
4310        {
4311        /* In the pre-compile phase, we don't actually do the replication. We
4312        just adjust the length as if we had. For each repetition we must add 1
4313        to the length for BRAZERO and for all but the last repetition we must
4314        add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
4315        paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
4316        a 64-bit integer type when available, otherwise double. */
4317
4318        if (lengthptr != NULL && repeat_max > 0)
4319          {
4320          int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
4321                      2 - 2*LINK_SIZE;   /* Last one doesn't nest */
4322          if ((INT64_OR_DOUBLE)repeat_max *
4323                (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
4324                  > (INT64_OR_DOUBLE)INT_MAX ||
4325              OFLOW_MAX - *lengthptr < delta)
4326            {
4327            *errorcodeptr = ERR20;
4328            goto FAILED;
4329            }
4330          *lengthptr += delta;
4331          }
4332
4333        /* This is compiling for real */
4334
4335        else for (i = repeat_max - 1; i >= 0; i--)
4336          {
4337          uschar *hc;
4338          uschar *this_hwm = cd->hwm;
4339
4340          *code++ = OP_BRAZERO + repeat_type;
4341
4342          /* All but the final copy start a new nesting, maintaining the
4343          chain of brackets outstanding. */
4344
4345          if (i != 0)
4346            {
4347            int offset;
4348            *code++ = OP_BRA;
4349            offset = (bralink == NULL)? 0 : code - bralink;
4350            bralink = code;
4351            PUTINC(code, 0, offset);
4352            }
4353
4354          memcpy(code, previous, len);
4355          for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4356            {
4357            PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4358            cd->hwm += LINK_SIZE;
4359            }
4360          save_hwm = this_hwm;
4361          code += len;
4362          }
4363
4364        /* Now chain through the pending brackets, and fill in their length
4365        fields (which are holding the chain links pro tem). */
4366
4367        while (bralink != NULL)
4368          {
4369          int oldlinkoffset;
4370          int offset = code - bralink + 1;
4371          uschar *bra = code - offset;
4372          oldlinkoffset = GET(bra, 1);
4373          bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
4374          *code++ = OP_KET;
4375          PUTINC(code, 0, offset);
4376          PUT(bra, 1, offset);
4377          }
4378        }
4379
4380      /* If the maximum is unlimited, set a repeater in the final copy. We
4381      can't just offset backwards from the current code point, because we
4382      don't know if there's been an options resetting after the ket. The
4383      correct offset was computed above.
4384
4385      Then, when we are doing the actual compile phase, check to see whether
4386      this group is a non-atomic one that could match an empty string. If so,
4387      convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4388      that runtime checking can be done. [This check is also applied to
4389      atomic groups at runtime, but in a different way.] */
4390
4391      else
4392        {
4393        uschar *ketcode = code - ketoffset;
4394        uschar *bracode = ketcode - GET(ketcode, 1);
4395        *ketcode = OP_KETRMAX + repeat_type;
4396        if (lengthptr == NULL && *bracode != OP_ONCE)
4397          {
4398          uschar *scode = bracode;
4399          do
4400            {
4401            if (could_be_empty_branch(scode, ketcode, utf8, cd))
4402              {
4403              *bracode += OP_SBRA - OP_BRA;
4404              break;
4405              }
4406            scode += GET(scode, 1);
4407            }
4408          while (*scode == OP_ALT);
4409          }
4410        }
4411      }
4412
4413    /* If previous is OP_FAIL, it was generated by an empty class [] in
4414    JavaScript mode. The other ways in which OP_FAIL can be generated, that is
4415    by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
4416    error above. We can just ignore the repeat in JS case. */
4417
4418    else if (*previous == OP_FAIL) goto END_REPEAT;
4419
4420    /* Else there's some kind of shambles */
4421
4422    else
4423      {
4424      *errorcodeptr = ERR11;
4425      goto FAILED;
4426      }
4427
4428    /* If the character following a repeat is '+', or if certain optimization
4429    tests above succeeded, possessive_quantifier is TRUE. For some of the
4430    simpler opcodes, there is a special alternative opcode for this. For
4431    anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4432    The '+' notation is just syntactic sugar, taken from Sun's Java package,
4433    but the special opcodes can optimize it a bit. The repeated item starts at
4434    tempcode, not at previous, which might be the first part of a string whose
4435    (former) last char we repeated.
4436
4437    Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4438    an 'upto' may follow. We skip over an 'exact' item, and then test the
4439    length of what remains before proceeding. */
4440
4441    if (possessive_quantifier)
4442      {
4443      int len;
4444
4445      if (*tempcode == OP_TYPEEXACT)
4446        tempcode += _pcre_OP_lengths[*tempcode] +
4447          ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 2 : 0);
4448
4449      else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
4450        {
4451        tempcode += _pcre_OP_lengths[*tempcode];
4452#ifdef SUPPORT_UTF8
4453        if (utf8 && tempcode[-1] >= 0xc0)
4454          tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f];
4455#endif
4456        }
4457
4458      len = code - tempcode;
4459      if (len > 0) switch (*tempcode)
4460        {
4461        case OP_STAR:  *tempcode = OP_POSSTAR; break;
4462        case OP_PLUS:  *tempcode = OP_POSPLUS; break;
4463        case OP_QUERY: *tempcode = OP_POSQUERY; break;
4464        case OP_UPTO:  *tempcode = OP_POSUPTO; break;
4465
4466        case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
4467        case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
4468        case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4469        case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
4470
4471        case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
4472        case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
4473        case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4474        case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
4475
4476        /* Because we are moving code along, we must ensure that any
4477        pending recursive references are updated. */
4478
4479        default:
4480        *code = OP_END;
4481        adjust_recurse(tempcode, 1 + LINK_SIZE, utf8, cd, save_hwm);
4482        memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4483        code += 1 + LINK_SIZE;
4484        len += 1 + LINK_SIZE;
4485        tempcode[0] = OP_ONCE;
4486        *code++ = OP_KET;
4487        PUTINC(code, 0, len);
4488        PUT(tempcode, 1, len);
4489        break;
4490        }
4491      }
4492
4493    /* In all cases we no longer have a previous item. We also set the
4494    "follows varying string" flag for subsequently encountered reqbytes if
4495    it isn't already set and we have just passed a varying length item. */
4496
4497    END_REPEAT:
4498    previous = NULL;
4499    cd->req_varyopt |= reqvary;
4500    break;
4501
4502
4503    /* ===================================================================*/
4504    /* Start of nested parenthesized sub-expression, or comment or lookahead or
4505    lookbehind or option setting or condition or all the other extended
4506    parenthesis forms.  */
4507
4508    case CHAR_LEFT_PARENTHESIS:
4509    newoptions = options;
4510    skipbytes = 0;
4511    bravalue = OP_CBRA;
4512    save_hwm = cd->hwm;
4513    reset_bracount = FALSE;
4514
4515    /* First deal with various "verbs" that can be introduced by '*'. */
4516
4517    if (*(++ptr) == CHAR_ASTERISK && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4518      {
4519      int i, namelen;
4520      const char *vn = verbnames;
4521      const uschar *name = ++ptr;
4522      previous = NULL;
4523      while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
4524      if (*ptr == CHAR_COLON)
4525        {
4526        *errorcodeptr = ERR59;   /* Not supported */
4527        goto FAILED;
4528        }
4529      if (*ptr != CHAR_RIGHT_PARENTHESIS)
4530        {
4531        *errorcodeptr = ERR60;
4532        goto FAILED;
4533        }
4534      namelen = ptr - name;
4535      for (i = 0; i < verbcount; i++)
4536        {
4537        if (namelen == verbs[i].len &&
4538            strncmp((char *)name, vn, namelen) == 0)
4539          {
4540          /* Check for open captures before ACCEPT */
4541
4542          if (verbs[i].op == OP_ACCEPT)
4543            {
4544            open_capitem *oc;
4545            cd->had_accept = TRUE;
4546            for (oc = cd->open_caps; oc != NULL; oc = oc->next)
4547              {
4548              *code++ = OP_CLOSE;
4549              PUT2INC(code, 0, oc->number);
4550              }
4551            }
4552          *code++ = verbs[i].op;
4553          break;
4554          }
4555        vn += verbs[i].len + 1;
4556        }
4557      if (i < verbcount) continue;
4558      *errorcodeptr = ERR60;
4559      goto FAILED;
4560      }
4561
4562    /* Deal with the extended parentheses; all are introduced by '?', and the
4563    appearance of any of them means that this is not a capturing group. */
4564
4565    else if (*ptr == CHAR_QUESTION_MARK)
4566      {
4567      int i, set, unset, namelen;
4568      int *optset;
4569      const uschar *name;
4570      uschar *slot;
4571
4572      switch (*(++ptr))
4573        {
4574        case CHAR_NUMBER_SIGN:                 /* Comment; skip to ket */
4575        ptr++;
4576        while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
4577        if (*ptr == 0)
4578          {
4579          *errorcodeptr = ERR18;
4580          goto FAILED;
4581          }
4582        continue;
4583
4584
4585        /* ------------------------------------------------------------ */
4586        case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */
4587        reset_bracount = TRUE;
4588        /* Fall through */
4589
4590        /* ------------------------------------------------------------ */
4591        case CHAR_COLON:          /* Non-capturing bracket */
4592        bravalue = OP_BRA;
4593        ptr++;
4594        break;
4595
4596
4597        /* ------------------------------------------------------------ */
4598        case CHAR_LEFT_PARENTHESIS:
4599        bravalue = OP_COND;       /* Conditional group */
4600
4601        /* A condition can be an assertion, a number (referring to a numbered
4602        group), a name (referring to a named group), or 'R', referring to
4603        recursion. R<digits> and R&name are also permitted for recursion tests.
4604
4605        There are several syntaxes for testing a named group: (?(name)) is used
4606        by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4607
4608        There are two unfortunate ambiguities, caused by history. (a) 'R' can
4609        be the recursive thing or the name 'R' (and similarly for 'R' followed
4610        by digits), and (b) a number could be a name that consists of digits.
4611        In both cases, we look for a name first; if not found, we try the other
4612        cases. */
4613
4614        /* For conditions that are assertions, check the syntax, and then exit
4615        the switch. This will take control down to where bracketed groups,
4616        including assertions, are processed. */
4617
4618        if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN ||
4619            ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN))
4620          break;
4621
4622        /* Most other conditions use OP_CREF (a couple change to OP_RREF
4623        below), and all need to skip 3 bytes at the start of the group. */
4624
4625        code[1+LINK_SIZE] = OP_CREF;
4626        skipbytes = 3;
4627        refsign = -1;
4628
4629        /* Check for a test for recursion in a named group. */
4630
4631        if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
4632          {
4633          terminator = -1;
4634          ptr += 2;
4635          code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
4636          }
4637
4638        /* Check for a test for a named group's having been set, using the Perl
4639        syntax (?(<name>) or (?('name') */
4640
4641        else if (ptr[1] == CHAR_LESS_THAN_SIGN)
4642          {
4643          terminator = CHAR_GREATER_THAN_SIGN;
4644          ptr++;
4645          }
4646        else if (ptr[1] == CHAR_APOSTROPHE)
4647          {
4648          terminator = CHAR_APOSTROPHE;
4649          ptr++;
4650          }
4651        else
4652          {
4653          terminator = 0;
4654          if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
4655          }
4656
4657        /* We now expect to read a name; anything else is an error */
4658
4659        if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4660          {
4661          ptr += 1;  /* To get the right offset */
4662          *errorcodeptr = ERR28;
4663          goto FAILED;
4664          }
4665
4666        /* Read the name, but also get it as a number if it's all digits */
4667
4668        recno = 0;
4669        name = ++ptr;
4670        while ((cd->ctypes[*ptr] & ctype_word) != 0)
4671          {
4672          if (recno >= 0)
4673            recno = ((digitab[*ptr] & ctype_digit) != 0)?
4674              recno * 10 + *ptr - CHAR_0 : -1;
4675          ptr++;
4676          }
4677        namelen = ptr - name;
4678
4679        if ((terminator > 0 && *ptr++ != terminator) ||
4680            *ptr++ != CHAR_RIGHT_PARENTHESIS)
4681          {
4682          ptr--;      /* Error offset */
4683          *errorcodeptr = ERR26;
4684          goto FAILED;
4685          }
4686
4687        /* Do no further checking in the pre-compile phase. */
4688
4689        if (lengthptr != NULL) break;
4690
4691        /* In the real compile we do the work of looking for the actual
4692        reference. If the string started with "+" or "-" we require the rest to
4693        be digits, in which case recno will be set. */
4694
4695        if (refsign > 0)
4696          {
4697          if (recno <= 0)
4698            {
4699            *errorcodeptr = ERR58;
4700            goto FAILED;
4701            }
4702          recno = (refsign == CHAR_MINUS)?
4703            cd->bracount - recno + 1 : recno +cd->bracount;
4704          if (recno <= 0 || recno > cd->final_bracount)
4705            {
4706            *errorcodeptr = ERR15;
4707            goto FAILED;
4708            }
4709          PUT2(code, 2+LINK_SIZE, recno);
4710          break;
4711          }
4712
4713        /* Otherwise (did not start with "+" or "-"), start by looking for the
4714        name. If we find a name, add one to the opcode to change OP_CREF or
4715        OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,
4716        except they record that the reference was originally to a name. The
4717        information is used to check duplicate names. */
4718
4719        slot = cd->name_table;
4720        for (i = 0; i < cd->names_found; i++)
4721          {
4722          if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4723          slot += cd->name_entry_size;
4724          }
4725
4726        /* Found a previous named subpattern */
4727
4728        if (i < cd->names_found)
4729          {
4730          recno = GET2(slot, 0);
4731          PUT2(code, 2+LINK_SIZE, recno);
4732          code[1+LINK_SIZE]++;
4733          }
4734
4735        /* Search the pattern for a forward reference */
4736
4737        else if ((i = find_parens(cd, name, namelen,
4738                        (options & PCRE_EXTENDED) != 0)) > 0)
4739          {
4740          PUT2(code, 2+LINK_SIZE, i);
4741          code[1+LINK_SIZE]++;
4742          }
4743
4744        /* If terminator == 0 it means that the name followed directly after
4745        the opening parenthesis [e.g. (?(abc)...] and in this case there are
4746        some further alternatives to try. For the cases where terminator != 0
4747        [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4748        now checked all the possibilities, so give an error. */
4749
4750        else if (terminator != 0)
4751          {
4752          *errorcodeptr = ERR15;
4753          goto FAILED;
4754          }
4755
4756        /* Check for (?(R) for recursion. Allow digits after R to specify a
4757        specific group number. */
4758
4759        else if (*name == CHAR_R)
4760          {
4761          recno = 0;
4762          for (i = 1; i < namelen; i++)
4763            {
4764            if ((digitab[name[i]] & ctype_digit) == 0)
4765              {
4766              *errorcodeptr = ERR15;
4767              goto FAILED;
4768              }
4769            recno = recno * 10 + name[i] - CHAR_0;
4770            }
4771          if (recno == 0) recno = RREF_ANY;
4772          code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
4773          PUT2(code, 2+LINK_SIZE, recno);
4774          }
4775
4776        /* Similarly, check for the (?(DEFINE) "condition", which is always
4777        false. */
4778
4779        else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0)
4780          {
4781          code[1+LINK_SIZE] = OP_DEF;
4782          skipbytes = 1;
4783          }
4784
4785        /* Check for the "name" actually being a subpattern number. We are
4786        in the second pass here, so final_bracount is set. */
4787
4788        else if (recno > 0 && recno <= cd->final_bracount)
4789          {
4790          PUT2(code, 2+LINK_SIZE, recno);
4791          }
4792
4793        /* Either an unidentified subpattern, or a reference to (?(0) */
4794
4795        else
4796          {
4797          *errorcodeptr = (recno == 0)? ERR35: ERR15;
4798          goto FAILED;
4799          }
4800        break;
4801
4802
4803        /* ------------------------------------------------------------ */
4804        case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
4805        bravalue = OP_ASSERT;
4806        ptr++;
4807        break;
4808
4809
4810        /* ------------------------------------------------------------ */
4811        case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
4812        ptr++;
4813        if (*ptr == CHAR_RIGHT_PARENTHESIS)    /* Optimize (?!) */
4814          {
4815          *code++ = OP_FAIL;
4816          previous = NULL;
4817          continue;
4818          }
4819        bravalue = OP_ASSERT_NOT;
4820        break;
4821
4822
4823        /* ------------------------------------------------------------ */
4824        case CHAR_LESS_THAN_SIGN:              /* Lookbehind or named define */
4825        switch (ptr[1])
4826          {
4827          case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
4828          bravalue = OP_ASSERTBACK;
4829          ptr += 2;
4830          break;
4831
4832          case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
4833          bravalue = OP_ASSERTBACK_NOT;
4834          ptr += 2;
4835          break;
4836
4837          default:                /* Could be name define, else bad */
4838          if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4839          ptr++;                  /* Correct offset for error */
4840          *errorcodeptr = ERR24;
4841          goto FAILED;
4842          }
4843        break;
4844
4845
4846        /* ------------------------------------------------------------ */
4847        case CHAR_GREATER_THAN_SIGN:           /* One-time brackets */
4848        bravalue = OP_ONCE;
4849        ptr++;
4850        break;
4851
4852
4853        /* ------------------------------------------------------------ */
4854        case CHAR_C:                 /* Callout - may be followed by digits; */
4855        previous_callout = code;  /* Save for later completion */
4856        after_manual_callout = 1; /* Skip one item before completing */
4857        *code++ = OP_CALLOUT;
4858          {
4859          int n = 0;
4860          while ((digitab[*(++ptr)] & ctype_digit) != 0)
4861            n = n * 10 + *ptr - CHAR_0;
4862          if (*ptr != CHAR_RIGHT_PARENTHESIS)
4863            {
4864            *errorcodeptr = ERR39;
4865            goto FAILED;
4866            }
4867          if (n > 255)
4868            {
4869            *errorcodeptr = ERR38;
4870            goto FAILED;
4871            }
4872          *code++ = n;
4873          PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */
4874          PUT(code, LINK_SIZE, 0);                    /* Default length */
4875          code += 2 * LINK_SIZE;
4876          }
4877        previous = NULL;
4878        continue;
4879
4880
4881        /* ------------------------------------------------------------ */
4882        case CHAR_P:              /* Python-style named subpattern handling */
4883        if (*(++ptr) == CHAR_EQUALS_SIGN ||
4884            *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */
4885          {
4886          is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
4887          terminator = CHAR_RIGHT_PARENTHESIS;
4888          goto NAMED_REF_OR_RECURSE;
4889          }
4890        else if (*ptr != CHAR_LESS_THAN_SIGN)  /* Test for Python-style defn */
4891          {
4892          *errorcodeptr = ERR41;
4893          goto FAILED;
4894          }
4895        /* Fall through to handle (?P< as (?< is handled */
4896
4897
4898        /* ------------------------------------------------------------ */
4899        DEFINE_NAME:    /* Come here from (?< handling */
4900        case CHAR_APOSTROPHE:
4901          {
4902          terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
4903            CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
4904          name = ++ptr;
4905
4906          while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4907          namelen = ptr - name;
4908
4909          /* In the pre-compile phase, just do a syntax check. */
4910
4911          if (lengthptr != NULL)
4912            {
4913            if (*ptr != terminator)
4914              {
4915              *errorcodeptr = ERR42;
4916              goto FAILED;
4917              }
4918            if (cd->names_found >= MAX_NAME_COUNT)
4919              {
4920              *errorcodeptr = ERR49;
4921              goto FAILED;
4922              }
4923            if (namelen + 3 > cd->name_entry_size)
4924              {
4925              cd->name_entry_size = namelen + 3;
4926              if (namelen > MAX_NAME_SIZE)
4927                {
4928                *errorcodeptr = ERR48;
4929                goto FAILED;
4930                }
4931              }
4932            }
4933
4934          /* In the real compile, create the entry in the table, maintaining
4935          alphabetical order. Duplicate names for different numbers are
4936          permitted only if PCRE_DUPNAMES is set. Duplicate names for the same
4937          number are always OK. (An existing number can be re-used if (?|
4938          appears in the pattern.) In either event, a duplicate name results in
4939          a duplicate entry in the table, even if the number is the same. This
4940          is because the number of names, and hence the table size, is computed
4941          in the pre-compile, and it affects various numbers and pointers which
4942          would all have to be modified, and the compiled code moved down, if
4943          duplicates with the same number were omitted from the table. This
4944          doesn't seem worth the hassle. However, *different* names for the
4945          same number are not permitted. */
4946
4947          else
4948            {
4949            BOOL dupname = FALSE;
4950            slot = cd->name_table;
4951
4952            for (i = 0; i < cd->names_found; i++)
4953              {
4954              int crc = memcmp(name, slot+2, namelen);
4955              if (crc == 0)
4956                {
4957                if (slot[2+namelen] == 0)
4958                  {
4959                  if (GET2(slot, 0) != cd->bracount + 1 &&
4960                      (options & PCRE_DUPNAMES) == 0)
4961                    {
4962                    *errorcodeptr = ERR43;
4963                    goto FAILED;
4964                    }
4965                  else dupname = TRUE;
4966                  }
4967                else crc = -1;      /* Current name is a substring */
4968                }
4969
4970              /* Make space in the table and break the loop for an earlier
4971              name. For a duplicate or later name, carry on. We do this for
4972              duplicates so that in the simple case (when (?| is not used) they
4973              are in order of their numbers. */
4974
4975              if (crc < 0)
4976                {
4977                memmove(slot + cd->name_entry_size, slot,
4978                  (cd->names_found - i) * cd->name_entry_size);
4979                break;
4980                }
4981
4982              /* Continue the loop for a later or duplicate name */
4983
4984              slot += cd->name_entry_size;
4985              }
4986
4987            /* For non-duplicate names, check for a duplicate number before
4988            adding the new name. */
4989
4990            if (!dupname)
4991              {
4992              uschar *cslot = cd->name_table;
4993              for (i = 0; i < cd->names_found; i++)
4994                {
4995                if (cslot != slot)
4996                  {
4997                  if (GET2(cslot, 0) == cd->bracount + 1)
4998                    {
4999                    *errorcodeptr = ERR65;
5000                    goto FAILED;
5001                    }
5002                  }
5003                else i--;
5004                cslot += cd->name_entry_size;
5005                }
5006              }
5007
5008            PUT2(slot, 0, cd->bracount + 1);
5009            memcpy(slot + 2, name, namelen);
5010            slot[2+namelen] = 0;
5011            }
5012          }
5013
5014        /* In both pre-compile and compile, count the number of names we've
5015        encountered. */
5016
5017        cd->names_found++;
5018        ptr++;                    /* Move past > or ' */
5019        goto NUMBERED_GROUP;
5020
5021
5022        /* ------------------------------------------------------------ */
5023        case CHAR_AMPERSAND:            /* Perl recursion/subroutine syntax */
5024        terminator = CHAR_RIGHT_PARENTHESIS;
5025        is_recurse = TRUE;
5026        /* Fall through */
5027
5028        /* We come here from the Python syntax above that handles both
5029        references (?P=name) and recursion (?P>name), as well as falling
5030        through from the Perl recursion syntax (?&name). We also come here from
5031        the Perl \k<name> or \k'name' back reference syntax and the \k{name}
5032        .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
5033
5034        NAMED_REF_OR_RECURSE:
5035        name = ++ptr;
5036        while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
5037        namelen = ptr - name;
5038
5039        /* In the pre-compile phase, do a syntax check and set a dummy
5040        reference number. */
5041
5042        if (lengthptr != NULL)
5043          {
5044          if (namelen == 0)
5045            {
5046            *errorcodeptr = ERR62;
5047            goto FAILED;
5048            }
5049          if (*ptr != terminator)
5050            {
5051            *errorcodeptr = ERR42;
5052            goto FAILED;
5053            }
5054          if (namelen > MAX_NAME_SIZE)
5055            {
5056            *errorcodeptr = ERR48;
5057            goto FAILED;
5058            }
5059          recno = 0;
5060          }
5061
5062        /* In the real compile, seek the name in the table. We check the name
5063        first, and then check that we have reached the end of the name in the
5064        table. That way, if the name is longer than any in the table,
5065        the comparison will fail without reading beyond the table entry. */
5066
5067        else
5068          {
5069          slot = cd->name_table;
5070          for (i = 0; i < cd->names_found; i++)
5071            {
5072            if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
5073                slot[2+namelen] == 0)
5074              break;
5075            slot += cd->name_entry_size;
5076            }
5077
5078          if (i < cd->names_found)         /* Back reference */
5079            {
5080            recno = GET2(slot, 0);
5081            }
5082          else if ((recno =                /* Forward back reference */
5083                    find_parens(cd, name, namelen,
5084                      (options & PCRE_EXTENDED) != 0)) <= 0)
5085            {
5086            *errorcodeptr = ERR15;
5087            goto FAILED;
5088            }
5089          }
5090
5091        /* In both phases, we can now go to the code that handles numerical
5092        recursion or backreferences. */
5093
5094        if (is_recurse) goto HANDLE_RECURSION;
5095          else goto HANDLE_REFERENCE;
5096
5097
5098        /* ------------------------------------------------------------ */
5099        case CHAR_R:              /* Recursion */
5100        ptr++;                    /* Same as (?0)      */
5101        /* Fall through */
5102
5103
5104        /* ------------------------------------------------------------ */
5105        case CHAR_MINUS: case CHAR_PLUS:  /* Recursion or subroutine */
5106        case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
5107        case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
5108          {
5109          const uschar *called;
5110          terminator = CHAR_RIGHT_PARENTHESIS;
5111
5112          /* Come here from the \g<...> and \g'...' code (Oniguruma
5113          compatibility). However, the syntax has been checked to ensure that
5114          the ... are a (signed) number, so that neither ERR63 nor ERR29 will
5115          be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
5116          ever be taken. */
5117
5118          HANDLE_NUMERICAL_RECURSION:
5119
5120          if ((refsign = *ptr) == CHAR_PLUS)
5121            {
5122            ptr++;
5123            if ((digitab[*ptr] & ctype_digit) == 0)
5124              {
5125              *errorcodeptr = ERR63;
5126              goto FAILED;
5127              }
5128            }
5129          else if (refsign == CHAR_MINUS)
5130            {
5131            if ((digitab[ptr[1]] & ctype_digit) == 0)
5132              goto OTHER_CHAR_AFTER_QUERY;
5133            ptr++;
5134            }
5135
5136          recno = 0;
5137          while((digitab[*ptr] & ctype_digit) != 0)
5138            recno = recno * 10 + *ptr++ - CHAR_0;
5139
5140          if (*ptr != terminator)
5141            {
5142            *errorcodeptr = ERR29;
5143            goto FAILED;
5144            }
5145
5146          if (refsign == CHAR_MINUS)
5147            {
5148            if (recno == 0)
5149              {
5150              *errorcodeptr = ERR58;
5151              goto FAILED;
5152              }
5153            recno = cd->bracount - recno + 1;
5154            if (recno <= 0)
5155              {
5156              *errorcodeptr = ERR15;
5157              goto FAILED;
5158              }
5159            }
5160          else if (refsign == CHAR_PLUS)
5161            {
5162            if (recno == 0)
5163              {
5164              *errorcodeptr = ERR58;
5165              goto FAILED;
5166              }
5167            recno += cd->bracount;
5168            }
5169
5170          /* Come here from code above that handles a named recursion */
5171
5172          HANDLE_RECURSION:
5173
5174          previous = code;
5175          called = cd->start_code;
5176
5177          /* When we are actually compiling, find the bracket that is being
5178          referenced. Temporarily end the regex in case it doesn't exist before
5179          this point. If we end up with a forward reference, first check that
5180          the bracket does occur later so we can give the error (and position)
5181          now. Then remember this forward reference in the workspace so it can
5182          be filled in at the end. */
5183
5184          if (lengthptr == NULL)
5185            {
5186            *code = OP_END;
5187            if (recno != 0)
5188              called = _pcre_find_bracket(cd->start_code, utf8, recno);
5189
5190            /* Forward reference */
5191
5192            if (called == NULL)
5193              {
5194              if (find_parens(cd, NULL, recno,
5195                    (options & PCRE_EXTENDED) != 0) < 0)
5196                {
5197                *errorcodeptr = ERR15;
5198                goto FAILED;
5199                }
5200
5201              /* Fudge the value of "called" so that when it is inserted as an
5202              offset below, what is actually inserted is the reference number
5203              of the group. */
5204
5205              called = cd->start_code + recno;
5206              PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
5207              }
5208
5209            /* If not a forward reference, and the subpattern is still open,
5210            this is a recursive call. We check to see if this is a left
5211            recursion that could loop for ever, and diagnose that case. */
5212
5213            else if (GET(called, 1) == 0 &&
5214                     could_be_empty(called, code, bcptr, utf8, cd))
5215              {
5216              *errorcodeptr = ERR40;
5217              goto FAILED;
5218              }
5219            }
5220
5221          /* Insert the recursion/subroutine item, automatically wrapped inside
5222          "once" brackets. Set up a "previous group" length so that a
5223          subsequent quantifier will work. */
5224
5225          *code = OP_ONCE;
5226          PUT(code, 1, 2 + 2*LINK_SIZE);
5227          code += 1 + LINK_SIZE;
5228
5229          *code = OP_RECURSE;
5230          PUT(code, 1, called - cd->start_code);
5231          code += 1 + LINK_SIZE;
5232
5233          *code = OP_KET;
5234          PUT(code, 1, 2 + 2*LINK_SIZE);
5235          code += 1 + LINK_SIZE;
5236
5237          length_prevgroup = 3 + 3*LINK_SIZE;
5238          }
5239
5240        /* Can't determine a first byte now */
5241
5242        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5243        continue;
5244
5245
5246        /* ------------------------------------------------------------ */
5247        default:              /* Other characters: check option setting */
5248        OTHER_CHAR_AFTER_QUERY:
5249        set = unset = 0;
5250        optset = &set;
5251
5252        while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
5253          {
5254          switch (*ptr++)
5255            {
5256            case CHAR_MINUS: optset = &unset; break;
5257
5258            case CHAR_J:    /* Record that it changed in the external options */
5259            *optset |= PCRE_DUPNAMES;
5260            cd->external_flags |= PCRE_JCHANGED;
5261            break;
5262
5263            case CHAR_i: *optset |= PCRE_CASELESS; break;
5264            case CHAR_m: *optset |= PCRE_MULTILINE; break;
5265            case CHAR_s: *optset |= PCRE_DOTALL; break;
5266            case CHAR_x: *optset |= PCRE_EXTENDED; break;
5267            case CHAR_U: *optset |= PCRE_UNGREEDY; break;
5268            case CHAR_X: *optset |= PCRE_EXTRA; break;
5269
5270            default:  *errorcodeptr = ERR12;
5271                      ptr--;    /* Correct the offset */
5272                      goto FAILED;
5273            }
5274          }
5275
5276        /* Set up the changed option bits, but don't change anything yet. */
5277
5278        newoptions = (options | set) & (~unset);
5279
5280        /* If the options ended with ')' this is not the start of a nested
5281        group with option changes, so the options change at this level. If this
5282        item is right at the start of the pattern, the options can be
5283        abstracted and made external in the pre-compile phase, and ignored in
5284        the compile phase. This can be helpful when matching -- for instance in
5285        caseless checking of required bytes.
5286
5287        If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
5288        definitely *not* at the start of the pattern because something has been
5289        compiled. In the pre-compile phase, however, the code pointer can have
5290        that value after the start, because it gets reset as code is discarded
5291        during the pre-compile. However, this can happen only at top level - if
5292        we are within parentheses, the starting BRA will still be present. At
5293        any parenthesis level, the length value can be used to test if anything
5294        has been compiled at that level. Thus, a test for both these conditions
5295        is necessary to ensure we correctly detect the start of the pattern in
5296        both phases.
5297
5298        If we are not at the pattern start, compile code to change the ims
5299        options if this setting actually changes any of them, and reset the
5300        greedy defaults and the case value for firstbyte and reqbyte. */
5301
5302        if (*ptr == CHAR_RIGHT_PARENTHESIS)
5303          {
5304          if (code == cd->start_code + 1 + LINK_SIZE &&
5305               (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
5306            {
5307            cd->external_options = newoptions;
5308            }
5309          else
5310            {
5311            if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
5312              {
5313              *code++ = OP_OPT;
5314              *code++ = newoptions & PCRE_IMS;
5315              }
5316            greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
5317            greedy_non_default = greedy_default ^ 1;
5318            req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
5319            }
5320
5321          /* Change options at this level, and pass them back for use
5322          in subsequent branches. When not at the start of the pattern, this
5323          information is also necessary so that a resetting item can be
5324          compiled at the end of a group (if we are in a group). */
5325
5326          *optionsptr = options = newoptions;
5327          previous = NULL;       /* This item can't be repeated */
5328          continue;              /* It is complete */
5329          }
5330
5331        /* If the options ended with ':' we are heading into a nested group
5332        with possible change of options. Such groups are non-capturing and are
5333        not assertions of any kind. All we need to do is skip over the ':';
5334        the newoptions value is handled below. */
5335
5336        bravalue = OP_BRA;
5337        ptr++;
5338        }     /* End of switch for character following (? */
5339      }       /* End of (? handling */
5340
5341    /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
5342    all unadorned brackets become non-capturing and behave like (?:...)
5343    brackets. */
5344
5345    else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
5346      {
5347      bravalue = OP_BRA;
5348      }
5349
5350    /* Else we have a capturing group. */
5351
5352    else
5353      {
5354      NUMBERED_GROUP:
5355      cd->bracount += 1;
5356      PUT2(code, 1+LINK_SIZE, cd->bracount);
5357      skipbytes = 2;
5358      }
5359
5360    /* Process nested bracketed regex. Assertions may not be repeated, but
5361    other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
5362    non-register variable in order to be able to pass its address because some
5363    compilers complain otherwise. Pass in a new setting for the ims options if
5364    they have changed. */
5365
5366    previous = (bravalue >= OP_ONCE)? code : NULL;
5367    *code = bravalue;
5368    tempcode = code;
5369    tempreqvary = cd->req_varyopt;     /* Save value before bracket */
5370    length_prevgroup = 0;              /* Initialize for pre-compile phase */
5371
5372    if (!compile_regex(
5373         newoptions,                   /* The complete new option state */
5374         options & PCRE_IMS,           /* The previous ims option state */
5375         &tempcode,                    /* Where to put code (updated) */
5376         &ptr,                         /* Input pointer (updated) */
5377         errorcodeptr,                 /* Where to put an error message */
5378         (bravalue == OP_ASSERTBACK ||
5379          bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
5380         reset_bracount,               /* True if (?| group */
5381         skipbytes,                    /* Skip over bracket number */
5382         &subfirstbyte,                /* For possible first char */
5383         &subreqbyte,                  /* For possible last char */
5384         bcptr,                        /* Current branch chain */
5385         cd,                           /* Tables block */
5386         (lengthptr == NULL)? NULL :   /* Actual compile phase */
5387           &length_prevgroup           /* Pre-compile phase */
5388         ))
5389      goto FAILED;
5390
5391    /* At the end of compiling, code is still pointing to the start of the
5392    group, while tempcode has been updated to point past the end of the group
5393    and any option resetting that may follow it. The pattern pointer (ptr)
5394    is on the bracket. */
5395
5396    /* If this is a conditional bracket, check that there are no more than
5397    two branches in the group, or just one if it's a DEFINE group. We do this
5398    in the real compile phase, not in the pre-pass, where the whole group may
5399    not be available. */
5400
5401    if (bravalue == OP_COND && lengthptr == NULL)
5402      {
5403      uschar *tc = code;
5404      int condcount = 0;
5405
5406      do {
5407         condcount++;
5408         tc += GET(tc,1);
5409         }
5410      while (*tc != OP_KET);
5411
5412      /* A DEFINE group is never obeyed inline (the "condition" is always
5413      false). It must have only one branch. */
5414
5415      if (code[LINK_SIZE+1] == OP_DEF)
5416        {
5417        if (condcount > 1)
5418          {
5419          *errorcodeptr = ERR54;
5420          goto FAILED;
5421          }
5422        bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
5423        }
5424
5425      /* A "normal" conditional group. If there is just one branch, we must not
5426      make use of its firstbyte or reqbyte, because this is equivalent to an
5427      empty second branch. */
5428
5429      else
5430        {
5431        if (condcount > 2)
5432          {
5433          *errorcodeptr = ERR27;
5434          goto FAILED;
5435          }
5436        if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
5437        }
5438      }
5439
5440    /* Error if hit end of pattern */
5441
5442    if (*ptr != CHAR_RIGHT_PARENTHESIS)
5443      {
5444      *errorcodeptr = ERR14;
5445      goto FAILED;
5446      }
5447
5448    /* In the pre-compile phase, update the length by the length of the group,
5449    less the brackets at either end. Then reduce the compiled code to just a
5450    set of non-capturing brackets so that it doesn't use much memory if it is
5451    duplicated by a quantifier.*/
5452
5453    if (lengthptr != NULL)
5454      {
5455      if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
5456        {
5457        *errorcodeptr = ERR20;
5458        goto FAILED;
5459        }
5460      *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
5461      *code++ = OP_BRA;
5462      PUTINC(code, 0, 1 + LINK_SIZE);
5463      *code++ = OP_KET;
5464      PUTINC(code, 0, 1 + LINK_SIZE);
5465      break;    /* No need to waste time with special character handling */
5466      }
5467
5468    /* Otherwise update the main code pointer to the end of the group. */
5469
5470    code = tempcode;
5471
5472    /* For a DEFINE group, required and first character settings are not
5473    relevant. */
5474
5475    if (bravalue == OP_DEF) break;
5476
5477    /* Handle updating of the required and first characters for other types of
5478    group. Update for normal brackets of all kinds, and conditions with two
5479    branches (see code above). If the bracket is followed by a quantifier with
5480    zero repeat, we have to back off. Hence the definition of zeroreqbyte and
5481    zerofirstbyte outside the main loop so that they can be accessed for the
5482    back off. */
5483
5484    zeroreqbyte = reqbyte;
5485    zerofirstbyte = firstbyte;
5486    groupsetfirstbyte = FALSE;
5487
5488    if (bravalue >= OP_ONCE)
5489      {
5490      /* If we have not yet set a firstbyte in this branch, take it from the
5491      subpattern, remembering that it was set here so that a repeat of more
5492      than one can replicate it as reqbyte if necessary. If the subpattern has
5493      no firstbyte, set "none" for the whole branch. In both cases, a zero
5494      repeat forces firstbyte to "none". */
5495
5496      if (firstbyte == REQ_UNSET)
5497        {
5498        if (subfirstbyte >= 0)
5499          {
5500          firstbyte = subfirstbyte;
5501          groupsetfirstbyte = TRUE;
5502          }
5503        else firstbyte = REQ_NONE;
5504        zerofirstbyte = REQ_NONE;
5505        }
5506
5507      /* If firstbyte was previously set, convert the subpattern's firstbyte
5508      into reqbyte if there wasn't one, using the vary flag that was in
5509      existence beforehand. */
5510
5511      else if (subfirstbyte >= 0 && subreqbyte < 0)
5512        subreqbyte = subfirstbyte | tempreqvary;
5513
5514      /* If the subpattern set a required byte (or set a first byte that isn't
5515      really the first byte - see above), set it. */
5516
5517      if (subreqbyte >= 0) reqbyte = subreqbyte;
5518      }
5519
5520    /* For a forward assertion, we take the reqbyte, if set. This can be
5521    helpful if the pattern that follows the assertion doesn't set a different
5522    char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
5523    for an assertion, however because it leads to incorrect effect for patterns
5524    such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
5525    of a firstbyte. This is overcome by a scan at the end if there's no
5526    firstbyte, looking for an asserted first char. */
5527
5528    else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
5529    break;     /* End of processing '(' */
5530
5531
5532    /* ===================================================================*/
5533    /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5534    are arranged to be the negation of the corresponding OP_values. For the
5535    back references, the values are ESC_REF plus the reference number. Only
5536    back references and those types that consume a character may be repeated.
5537    We can test for values between ESC_b and ESC_Z for the latter; this may
5538    have to change if any new ones are ever created. */
5539
5540    case CHAR_BACKSLASH:
5541    tempptr = ptr;
5542    c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5543    if (*errorcodeptr != 0) goto FAILED;
5544
5545    if (c < 0)
5546      {
5547      if (-c == ESC_Q)            /* Handle start of quoted string */
5548        {
5549        if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5550          ptr += 2;               /* avoid empty string */
5551            else inescq = TRUE;
5552        continue;
5553        }
5554
5555      if (-c == ESC_E) continue;  /* Perl ignores an orphan \E */
5556
5557      /* For metasequences that actually match a character, we disable the
5558      setting of a first character if it hasn't already been set. */
5559
5560      if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5561        firstbyte = REQ_NONE;
5562
5563      /* Set values to reset to if this is followed by a zero repeat. */
5564
5565      zerofirstbyte = firstbyte;
5566      zeroreqbyte = reqbyte;
5567
5568      /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
5569      is a subroutine call by number (Oniguruma syntax). In fact, the value
5570      -ESC_g is returned only for these cases. So we don't need to check for <
5571      or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
5572      -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
5573      that is a synonym for a named back reference). */
5574
5575      if (-c == ESC_g)
5576        {
5577        const uschar *p;
5578        save_hwm = cd->hwm;   /* Normally this is set when '(' is read */
5579        terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5580          CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
5581
5582        /* These two statements stop the compiler from warning about possibly
5583        unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
5584        fact, because we actually check for a number below, the paths that
5585        would actually be in error are never taken. */
5586
5587        skipbytes = 0;
5588        reset_bracount = FALSE;
5589
5590        /* Test for a name */
5591
5592        if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)
5593          {
5594          BOOL isnumber = TRUE;
5595          for (p = ptr + 1; *p != 0 && *p != terminator; p++)
5596            {
5597            if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
5598            if ((cd->ctypes[*p] & ctype_word) == 0) break;
5599            }
5600          if (*p != terminator)
5601            {
5602            *errorcodeptr = ERR57;
5603            break;
5604            }
5605          if (isnumber)
5606            {
5607            ptr++;
5608            goto HANDLE_NUMERICAL_RECURSION;
5609            }
5610          is_recurse = TRUE;
5611          goto NAMED_REF_OR_RECURSE;
5612          }
5613
5614        /* Test a signed number in angle brackets or quotes. */
5615
5616        p = ptr + 2;
5617        while ((digitab[*p] & ctype_digit) != 0) p++;
5618        if (*p != terminator)
5619          {
5620          *errorcodeptr = ERR57;
5621          break;
5622          }
5623        ptr++;
5624        goto HANDLE_NUMERICAL_RECURSION;
5625        }
5626
5627      /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5628      We also support \k{name} (.NET syntax) */
5629
5630      if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN ||
5631          ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET))
5632        {
5633        is_recurse = FALSE;
5634        terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5635          CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
5636          CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
5637        goto NAMED_REF_OR_RECURSE;
5638        }
5639
5640      /* Back references are handled specially; must disable firstbyte if
5641      not set to cope with cases like (?=(\w+))\1: which would otherwise set
5642      ':' later. */
5643
5644      if (-c >= ESC_REF)
5645        {
5646        open_capitem *oc;
5647        recno = -c - ESC_REF;
5648
5649        HANDLE_REFERENCE:    /* Come here from named backref handling */
5650        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5651        previous = code;
5652        *code++ = OP_REF;
5653        PUT2INC(code, 0, recno);
5654        cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5655        if (recno > cd->top_backref) cd->top_backref = recno;
5656
5657        /* Check to see if this back reference is recursive, that is, it
5658        is inside the group that it references. A flag is set so that the
5659        group can be made atomic. */
5660
5661        for (oc = cd->open_caps; oc != NULL; oc = oc->next)
5662          {
5663          if (oc->number == recno)
5664            {
5665            oc->flag = TRUE;
5666            break;
5667            }
5668          }
5669        }
5670
5671      /* So are Unicode property matches, if supported. */
5672
5673#ifdef SUPPORT_UCP
5674      else if (-c == ESC_P || -c == ESC_p)
5675        {
5676        BOOL negated;
5677        int pdata;
5678        int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5679        if (ptype < 0) goto FAILED;
5680        previous = code;
5681        *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5682        *code++ = ptype;
5683        *code++ = pdata;
5684        }
5685#else
5686
5687      /* If Unicode properties are not supported, \X, \P, and \p are not
5688      allowed. */
5689
5690      else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5691        {
5692        *errorcodeptr = ERR45;
5693        goto FAILED;
5694        }
5695#endif
5696
5697      /* For the rest (including \X when Unicode properties are supported), we
5698      can obtain the OP value by negating the escape value. */
5699
5700      else
5701        {
5702        previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5703        *code++ = -c;
5704        }
5705      continue;
5706      }
5707
5708    /* We have a data character whose value is in c. In UTF-8 mode it may have
5709    a value > 127. We set its representation in the length/buffer, and then
5710    handle it as a data character. */
5711
5712#ifdef SUPPORT_UTF8
5713    if (utf8 && c > 127)
5714      mclength = _pcre_ord2utf8(c, mcbuffer);
5715    else
5716#endif
5717
5718     {
5719     mcbuffer[0] = c;
5720     mclength = 1;
5721     }
5722    goto ONE_CHAR;
5723
5724
5725    /* ===================================================================*/
5726    /* Handle a literal character. It is guaranteed not to be whitespace or #
5727    when the extended flag is set. If we are in UTF-8 mode, it may be a
5728    multi-byte literal character. */
5729
5730    default:
5731    NORMAL_CHAR:
5732    mclength = 1;
5733    mcbuffer[0] = c;
5734
5735#ifdef SUPPORT_UTF8
5736    if (utf8 && c >= 0xc0)
5737      {
5738      while ((ptr[1] & 0xc0) == 0x80)
5739        mcbuffer[mclength++] = *(++ptr);
5740      }
5741#endif
5742
5743    /* At this point we have the character's bytes in mcbuffer, and the length
5744    in mclength. When not in UTF-8 mode, the length is always 1. */
5745
5746    ONE_CHAR:
5747    previous = code;
5748    *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5749    for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5750
5751    /* Remember if \r or \n were seen */
5752
5753    if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
5754      cd->external_flags |= PCRE_HASCRORLF;
5755
5756    /* Set the first and required bytes appropriately. If no previous first
5757    byte, set it from this character, but revert to none on a zero repeat.
5758    Otherwise, leave the firstbyte value alone, and don't change it on a zero
5759    repeat. */
5760
5761    if (firstbyte == REQ_UNSET)
5762      {
5763      zerofirstbyte = REQ_NONE;
5764      zeroreqbyte = reqbyte;
5765
5766      /* If the character is more than one byte long, we can set firstbyte
5767      only if it is not to be matched caselessly. */
5768
5769      if (mclength == 1 || req_caseopt == 0)
5770        {
5771        firstbyte = mcbuffer[0] | req_caseopt;
5772        if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5773        }
5774      else firstbyte = reqbyte = REQ_NONE;
5775      }
5776
5777    /* firstbyte was previously set; we can set reqbyte only if the length is
5778    1 or the matching is caseful. */
5779
5780    else
5781      {
5782      zerofirstbyte = firstbyte;
5783      zeroreqbyte = reqbyte;
5784      if (mclength == 1 || req_caseopt == 0)
5785        reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5786      }
5787
5788    break;            /* End of literal character handling */
5789    }
5790  }                   /* end of big loop */
5791
5792
5793/* Control never reaches here by falling through, only by a goto for all the
5794error states. Pass back the position in the pattern so that it can be displayed
5795to the user for diagnosing the error. */
5796
5797FAILED:
5798*ptrptr = ptr;
5799return FALSE;
5800}
5801
5802
5803
5804
5805/*************************************************
5806*     Compile sequence of alternatives           *
5807*************************************************/
5808
5809/* On entry, ptr is pointing past the bracket character, but on return it
5810points to the closing bracket, or vertical bar, or end of string. The code
5811variable is pointing at the byte into which the BRA operator has been stored.
5812If the ims options are changed at the start (for a (?ims: group) or during any
5813branch, we need to insert an OP_OPT item at the start of every following branch
5814to ensure they get set correctly at run time, and also pass the new options
5815into every subsequent branch compile.
5816
5817This function is used during the pre-compile phase when we are trying to find
5818out the amount of memory needed, as well as during the real compile phase. The
5819value of lengthptr distinguishes the two phases.
5820
5821Arguments:
5822  options        option bits, including any changes for this subpattern
5823  oldims         previous settings of ims option bits
5824  codeptr        -> the address of the current code pointer
5825  ptrptr         -> the address of the current pattern pointer
5826  errorcodeptr   -> pointer to error code variable
5827  lookbehind     TRUE if this is a lookbehind assertion
5828  reset_bracount TRUE to reset the count for each branch
5829  skipbytes      skip this many bytes at start (for brackets and OP_COND)
5830  firstbyteptr   place to put the first required character, or a negative number
5831  reqbyteptr     place to put the last required character, or a negative number
5832  bcptr          pointer to the chain of currently open branches
5833  cd             points to the data block with tables pointers etc.
5834  lengthptr      NULL during the real compile phase
5835                 points to length accumulator during pre-compile phase
5836
5837Returns:         TRUE on success
5838*/
5839
5840static BOOL
5841compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5842  int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5843  int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5844  int *lengthptr)
5845{
5846const uschar *ptr = *ptrptr;
5847uschar *code = *codeptr;
5848uschar *last_branch = code;
5849uschar *start_bracket = code;
5850uschar *reverse_count = NULL;
5851open_capitem capitem;
5852int capnumber = 0;
5853int firstbyte, reqbyte;
5854int branchfirstbyte, branchreqbyte;
5855int length;
5856int orig_bracount;
5857int max_bracount;
5858int old_external_options = cd->external_options;
5859branch_chain bc;
5860
5861bc.outer = bcptr;
5862bc.current_branch = code;
5863
5864firstbyte = reqbyte = REQ_UNSET;
5865
5866/* Accumulate the length for use in the pre-compile phase. Start with the
5867length of the BRA and KET and any extra bytes that are required at the
5868beginning. We accumulate in a local variable to save frequent testing of
5869lenthptr for NULL. We cannot do this by looking at the value of code at the
5870start and end of each alternative, because compiled items are discarded during
5871the pre-compile phase so that the work space is not exceeded. */
5872
5873length = 2 + 2*LINK_SIZE + skipbytes;
5874
5875/* WARNING: If the above line is changed for any reason, you must also change
5876the code that abstracts option settings at the start of the pattern and makes
5877them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5878pre-compile phase to find out whether anything has yet been compiled or not. */
5879
5880/* If this is a capturing subpattern, add to the chain of open capturing items
5881so that we can detect them if (*ACCEPT) is encountered. This is also used to
5882detect groups that contain recursive back references to themselves. */
5883
5884if (*code == OP_CBRA)
5885  {
5886  capnumber = GET2(code, 1 + LINK_SIZE);
5887  capitem.number = capnumber;
5888  capitem.next = cd->open_caps;
5889  capitem.flag = FALSE;
5890  cd->open_caps = &capitem;
5891  }
5892
5893/* Offset is set zero to mark that this bracket is still open */
5894
5895PUT(code, 1, 0);
5896code += 1 + LINK_SIZE + skipbytes;
5897
5898/* Loop for each alternative branch */
5899
5900orig_bracount = max_bracount = cd->bracount;
5901for (;;)
5902  {
5903  /* For a (?| group, reset the capturing bracket count so that each branch
5904  uses the same numbers. */
5905
5906  if (reset_bracount) cd->bracount = orig_bracount;
5907
5908  /* Handle a change of ims options at the start of the branch */
5909
5910  if ((options & PCRE_IMS) != oldims)
5911    {
5912    *code++ = OP_OPT;
5913    *code++ = options & PCRE_IMS;
5914    length += 2;
5915    }
5916
5917  /* Set up dummy OP_REVERSE if lookbehind assertion */
5918
5919  if (lookbehind)
5920    {
5921    *code++ = OP_REVERSE;
5922    reverse_count = code;
5923    PUTINC(code, 0, 0);
5924    length += 1 + LINK_SIZE;
5925    }
5926
5927  /* Now compile the branch; in the pre-compile phase its length gets added
5928  into the length. */
5929
5930  if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5931        &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5932    {
5933    *ptrptr = ptr;
5934    return FALSE;
5935    }
5936
5937  /* If the external options have changed during this branch, it means that we
5938  are at the top level, and a leading option setting has been encountered. We
5939  need to re-set the original option values to take account of this so that,
5940  during the pre-compile phase, we know to allow for a re-set at the start of
5941  subsequent branches. */
5942
5943  if (old_external_options != cd->external_options)
5944    oldims = cd->external_options & PCRE_IMS;
5945
5946  /* Keep the highest bracket count in case (?| was used and some branch
5947  has fewer than the rest. */
5948
5949  if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5950
5951  /* In the real compile phase, there is some post-processing to be done. */
5952
5953  if (lengthptr == NULL)
5954    {
5955    /* If this is the first branch, the firstbyte and reqbyte values for the
5956    branch become the values for the regex. */
5957
5958    if (*last_branch != OP_ALT)
5959      {
5960      firstbyte = branchfirstbyte;
5961      reqbyte = branchreqbyte;
5962      }
5963
5964    /* If this is not the first branch, the first char and reqbyte have to
5965    match the values from all the previous branches, except that if the
5966    previous value for reqbyte didn't have REQ_VARY set, it can still match,
5967    and we set REQ_VARY for the regex. */
5968
5969    else
5970      {
5971      /* If we previously had a firstbyte, but it doesn't match the new branch,
5972      we have to abandon the firstbyte for the regex, but if there was
5973      previously no reqbyte, it takes on the value of the old firstbyte. */
5974
5975      if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5976        {
5977        if (reqbyte < 0) reqbyte = firstbyte;
5978        firstbyte = REQ_NONE;
5979        }
5980
5981      /* If we (now or from before) have no firstbyte, a firstbyte from the
5982      branch becomes a reqbyte if there isn't a branch reqbyte. */
5983
5984      if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5985          branchreqbyte = branchfirstbyte;
5986
5987      /* Now ensure that the reqbytes match */
5988
5989      if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5990        reqbyte = REQ_NONE;
5991      else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
5992      }
5993
5994    /* If lookbehind, check that this branch matches a fixed-length string, and
5995    put the length into the OP_REVERSE item. Temporarily mark the end of the
5996    branch with OP_END. If the branch contains OP_RECURSE, the result is -3
5997    because there may be forward references that we can't check here. Set a
5998    flag to cause another lookbehind check at the end. Why not do it all at the
5999    end? Because common, erroneous checks are picked up here and the offset of
6000    the problem can be shown. */
6001
6002    if (lookbehind)
6003      {
6004      int fixed_length;
6005      *code = OP_END;
6006      fixed_length = find_fixedlength(last_branch, options, FALSE, cd);
6007      DPRINTF(("fixed length = %d\n", fixed_length));
6008      if (fixed_length == -3)
6009        {
6010        cd->check_lookbehind = TRUE;
6011        }
6012      else if (fixed_length < 0)
6013        {
6014        *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
6015        *ptrptr = ptr;
6016        return FALSE;
6017        }
6018      else { PUT(reverse_count, 0, fixed_length); }
6019      }
6020    }
6021
6022  /* Reached end of expression, either ')' or end of pattern. In the real
6023  compile phase, go back through the alternative branches and reverse the chain
6024  of offsets, with the field in the BRA item now becoming an offset to the
6025  first alternative. If there are no alternatives, it points to the end of the
6026  group. The length in the terminating ket is always the length of the whole
6027  bracketed item. If any of the ims options were changed inside the group,
6028  compile a resetting op-code following, except at the very end of the pattern.
6029  Return leaving the pointer at the terminating char. */
6030
6031  if (*ptr != CHAR_VERTICAL_LINE)
6032    {
6033    if (lengthptr == NULL)
6034      {
6035      int branch_length = code - last_branch;
6036      do
6037        {
6038        int prev_length = GET(last_branch, 1);
6039        PUT(last_branch, 1, branch_length);
6040        branch_length = prev_length;
6041        last_branch -= branch_length;
6042        }
6043      while (branch_length > 0);
6044      }
6045
6046    /* Fill in the ket */
6047
6048    *code = OP_KET;
6049    PUT(code, 1, code - start_bracket);
6050    code += 1 + LINK_SIZE;
6051
6052    /* If it was a capturing subpattern, check to see if it contained any
6053    recursive back references. If so, we must wrap it in atomic brackets.
6054    In any event, remove the block from the chain. */
6055
6056    if (capnumber > 0)
6057      {
6058      if (cd->open_caps->flag)
6059        {
6060        memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
6061          code - start_bracket);
6062        *start_bracket = OP_ONCE;
6063        code += 1 + LINK_SIZE;
6064        PUT(start_bracket, 1, code - start_bracket);
6065        *code = OP_KET;
6066        PUT(code, 1, code - start_bracket);
6067        code += 1 + LINK_SIZE;
6068        length += 2 + 2*LINK_SIZE;
6069        }
6070      cd->open_caps = cd->open_caps->next;
6071      }
6072
6073    /* Reset options if needed. */
6074
6075    if ((options & PCRE_IMS) != oldims && *ptr == CHAR_RIGHT_PARENTHESIS)
6076      {
6077      *code++ = OP_OPT;
6078      *code++ = oldims;
6079      length += 2;
6080      }
6081
6082    /* Retain the highest bracket number, in case resetting was used. */
6083
6084    cd->bracount = max_bracount;
6085
6086    /* Set values to pass back */
6087
6088    *codeptr = code;
6089    *ptrptr = ptr;
6090    *firstbyteptr = firstbyte;
6091    *reqbyteptr = reqbyte;
6092    if (lengthptr != NULL)
6093      {
6094      if (OFLOW_MAX - *lengthptr < length)
6095        {
6096        *errorcodeptr = ERR20;
6097        return FALSE;
6098        }
6099      *lengthptr += length;
6100      }
6101    return TRUE;
6102    }
6103
6104  /* Another branch follows. In the pre-compile phase, we can move the code
6105  pointer back to where it was for the start of the first branch. (That is,
6106  pretend that each branch is the only one.)
6107
6108  In the real compile phase, insert an ALT node. Its length field points back
6109  to the previous branch while the bracket remains open. At the end the chain
6110  is reversed. It's done like this so that the start of the bracket has a
6111  zero offset until it is closed, making it possible to detect recursion. */
6112
6113  if (lengthptr != NULL)
6114    {
6115    code = *codeptr + 1 + LINK_SIZE + skipbytes;
6116    length += 1 + LINK_SIZE;
6117    }
6118  else
6119    {
6120    *code = OP_ALT;
6121    PUT(code, 1, code - last_branch);
6122    bc.current_branch = last_branch = code;
6123    code += 1 + LINK_SIZE;
6124    }
6125
6126  ptr++;
6127  }
6128/* Control never reaches here */
6129}
6130
6131
6132
6133
6134/*************************************************
6135*          Check for anchored expression         *
6136*************************************************/
6137
6138/* Try to find out if this is an anchored regular expression. Consider each
6139alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
6140all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
6141it's anchored. However, if this is a multiline pattern, then only OP_SOD
6142counts, since OP_CIRC can match in the middle.
6143
6144We can also consider a regex to be anchored if OP_SOM starts all its branches.
6145This is the code for \G, which means "match at start of match position, taking
6146into account the match offset".
6147
6148A branch is also implicitly anchored if it starts with .* and DOTALL is set,
6149because that will try the rest of the pattern at all possible matching points,
6150so there is no point trying again.... er ....
6151
6152.... except when the .* appears inside capturing parentheses, and there is a
6153subsequent back reference to those parentheses. We haven't enough information
6154to catch that case precisely.
6155
6156At first, the best we could do was to detect when .* was in capturing brackets
6157and the highest back reference was greater than or equal to that level.
6158However, by keeping a bitmap of the first 31 back references, we can catch some
6159of the more common cases more precisely.
6160
6161Arguments:
6162  code           points to start of expression (the bracket)
6163  options        points to the options setting
6164  bracket_map    a bitmap of which brackets we are inside while testing; this
6165                  handles up to substring 31; after that we just have to take
6166                  the less precise approach
6167  backref_map    the back reference bitmap
6168
6169Returns:     TRUE or FALSE
6170*/
6171
6172static BOOL
6173is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
6174  unsigned int backref_map)
6175{
6176do {
6177   const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
6178     options, PCRE_MULTILINE, FALSE);
6179   register int op = *scode;
6180
6181   /* Non-capturing brackets */
6182
6183   if (op == OP_BRA)
6184     {
6185     if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
6186     }
6187
6188   /* Capturing brackets */
6189
6190   else if (op == OP_CBRA)
6191     {
6192     int n = GET2(scode, 1+LINK_SIZE);
6193     int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
6194     if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
6195     }
6196
6197   /* Other brackets */
6198
6199   else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
6200     {
6201     if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
6202     }
6203
6204   /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
6205   it isn't in brackets that are or may be referenced. */
6206
6207   else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
6208             op == OP_TYPEPOSSTAR))
6209     {
6210     if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
6211       return FALSE;
6212     }
6213
6214   /* Check for explicit anchoring */
6215
6216   else if (op != OP_SOD && op != OP_SOM &&
6217           ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
6218     return FALSE;
6219   code += GET(code, 1);
6220   }
6221while (*code == OP_ALT);   /* Loop for each alternative */
6222return TRUE;
6223}
6224
6225
6226
6227/*************************************************
6228*         Check for starting with ^ or .*        *
6229*************************************************/
6230
6231/* This is called to find out if every branch starts with ^ or .* so that
6232"first char" processing can be done to speed things up in multiline
6233matching and for non-DOTALL patterns that start with .* (which must start at
6234the beginning or after \n). As in the case of is_anchored() (see above), we
6235have to take account of back references to capturing brackets that contain .*
6236because in that case we can't make the assumption.
6237
6238Arguments:
6239  code           points to start of expression (the bracket)
6240  bracket_map    a bitmap of which brackets we are inside while testing; this
6241                  handles up to substring 31; after that we just have to take
6242                  the less precise approach
6243  backref_map    the back reference bitmap
6244
6245Returns:         TRUE or FALSE
6246*/
6247
6248static BOOL
6249is_startline(const uschar *code, unsigned int bracket_map,
6250  unsigned int backref_map)
6251{
6252do {
6253   const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
6254     NULL, 0, FALSE);
6255   register int op = *scode;
6256
6257   /* If we are at the start of a conditional assertion group, *both* the
6258   conditional assertion *and* what follows the condition must satisfy the test
6259   for start of line. Other kinds of condition fail. Note that there may be an
6260   auto-callout at the start of a condition. */
6261
6262   if (op == OP_COND)
6263     {
6264     scode += 1 + LINK_SIZE;
6265     if (*scode == OP_CALLOUT) scode += _pcre_OP_lengths[OP_CALLOUT];
6266     switch (*scode)
6267       {
6268       case OP_CREF:
6269       case OP_NCREF:
6270       case OP_RREF:
6271       case OP_NRREF:
6272       case OP_DEF:
6273       return FALSE;
6274
6275       default:     /* Assertion */
6276       if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6277       do scode += GET(scode, 1); while (*scode == OP_ALT);
6278       scode += 1 + LINK_SIZE;
6279       break;
6280       }
6281     scode = first_significant_code(scode, NULL, 0, FALSE);
6282     op = *scode;
6283     }
6284
6285   /* Non-capturing brackets */
6286
6287   if (op == OP_BRA)
6288     {
6289     if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6290     }
6291
6292   /* Capturing brackets */
6293
6294   else if (op == OP_CBRA)
6295     {
6296     int n = GET2(scode, 1+LINK_SIZE);
6297     int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
6298     if (!is_startline(scode, new_map, backref_map)) return FALSE;
6299     }
6300
6301   /* Other brackets */
6302
6303   else if (op == OP_ASSERT || op == OP_ONCE)
6304     {
6305     if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6306     }
6307
6308   /* .* means "start at start or after \n" if it isn't in brackets that
6309   may be referenced. */
6310
6311   else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
6312     {
6313     if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
6314     }
6315
6316   /* Check for explicit circumflex */
6317
6318   else if (op != OP_CIRC) return FALSE;
6319
6320   /* Move on to the next alternative */
6321
6322   code += GET(code, 1);
6323   }
6324while (*code == OP_ALT);  /* Loop for each alternative */
6325return TRUE;
6326}
6327
6328
6329
6330/*************************************************
6331*       Check for asserted fixed first char      *
6332*************************************************/
6333
6334/* During compilation, the "first char" settings from forward assertions are
6335discarded, because they can cause conflicts with actual literals that follow.
6336However, if we end up without a first char setting for an unanchored pattern,
6337it is worth scanning the regex to see if there is an initial asserted first
6338char. If all branches start with the same asserted char, or with a bracket all
6339of whose alternatives start with the same asserted char (recurse ad lib), then
6340we return that char, otherwise -1.
6341
6342Arguments:
6343  code       points to start of expression (the bracket)
6344  options    pointer to the options (used to check casing changes)
6345  inassert   TRUE if in an assertion
6346
6347Returns:     -1 or the fixed first char
6348*/
6349
6350static int
6351find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
6352{
6353register int c = -1;
6354do {
6355   int d;
6356   const uschar *scode =
6357     first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
6358   register int op = *scode;
6359
6360   switch(op)
6361     {
6362     default:
6363     return -1;
6364
6365     case OP_BRA:
6366     case OP_CBRA:
6367     case OP_ASSERT:
6368     case OP_ONCE:
6369     case OP_COND:
6370     if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
6371       return -1;
6372     if (c < 0) c = d; else if (c != d) return -1;
6373     break;
6374
6375     case OP_EXACT:       /* Fall through */
6376     scode += 2;
6377
6378     case OP_CHAR:
6379     case OP_CHARNC:
6380     case OP_PLUS:
6381     case OP_MINPLUS:
6382     case OP_POSPLUS:
6383     if (!inassert) return -1;
6384     if (c < 0)
6385       {
6386       c = scode[1];
6387       if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
6388       }
6389     else if (c != scode[1]) return -1;
6390     break;
6391     }
6392
6393   code += GET(code, 1);
6394   }
6395while (*code == OP_ALT);
6396return c;
6397}
6398
6399
6400
6401/*************************************************
6402*        Compile a Regular Expression            *
6403*************************************************/
6404
6405/* This function takes a string and returns a pointer to a block of store
6406holding a compiled version of the expression. The original API for this
6407function had no error code return variable; it is retained for backwards
6408compatibility. The new function is given a new name.
6409
6410Arguments:
6411  pattern       the regular expression
6412  options       various option bits
6413  errorcodeptr  pointer to error code variable (pcre_compile2() only)
6414                  can be NULL if you don't want a code value
6415  errorptr      pointer to pointer to error text
6416  erroroffset   ptr offset in pattern where error was detected
6417  tables        pointer to character tables or NULL
6418
6419Returns:        pointer to compiled data block, or NULL on error,
6420                with errorptr and erroroffset set
6421*/
6422
6423PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
6424pcre_compile(const char *pattern, int options, const char **errorptr,
6425  int *erroroffset, const unsigned char *tables)
6426{
6427return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
6428}
6429
6430
6431PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
6432pcre_compile2(const char *pattern, int options, int *errorcodeptr,
6433  const char **errorptr, int *erroroffset, const unsigned char *tables)
6434{
6435real_pcre *re;
6436int length = 1;  /* For final END opcode */
6437int firstbyte, reqbyte, newline;
6438int errorcode = 0;
6439int skipatstart = 0;
6440BOOL utf8 = (options & PCRE_UTF8) != 0;
6441size_t size;
6442uschar *code;
6443const uschar *codestart;
6444const uschar *ptr;
6445compile_data compile_block;
6446compile_data *cd = &compile_block;
6447
6448/* This space is used for "compiling" into during the first phase, when we are
6449computing the amount of memory that is needed. Compiled items are thrown away
6450as soon as possible, so that a fairly large buffer should be sufficient for
6451this purpose. The same space is used in the second phase for remembering where
6452to fill in forward references to subpatterns. */
6453
6454uschar cworkspace[COMPILE_WORK_SIZE];
6455
6456/* Set this early so that early errors get offset 0. */
6457
6458ptr = (const uschar *)pattern;
6459
6460/* We can't pass back an error message if errorptr is NULL; I guess the best we
6461can do is just return NULL, but we can set a code value if there is a code
6462pointer. */
6463
6464if (errorptr == NULL)
6465  {
6466  if (errorcodeptr != NULL) *errorcodeptr = 99;
6467  return NULL;
6468  }
6469
6470*errorptr = NULL;
6471if (errorcodeptr != NULL) *errorcodeptr = ERR0;
6472
6473/* However, we can give a message for this error */
6474
6475if (erroroffset == NULL)
6476  {
6477  errorcode = ERR16;
6478  goto PCRE_EARLY_ERROR_RETURN2;
6479  }
6480
6481*erroroffset = 0;
6482
6483/* Set up pointers to the individual character tables */
6484
6485if (tables == NULL) tables = _pcre_default_tables;
6486cd->lcc = tables + lcc_offset;
6487cd->fcc = tables + fcc_offset;
6488cd->cbits = tables + cbits_offset;
6489cd->ctypes = tables + ctypes_offset;
6490
6491/* Check that all undefined public option bits are zero */
6492
6493if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
6494  {
6495  errorcode = ERR17;
6496  goto PCRE_EARLY_ERROR_RETURN;
6497  }
6498
6499/* Check for global one-time settings at the start of the pattern, and remember
6500the offset for later. */
6501
6502while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
6503       ptr[skipatstart+1] == CHAR_ASTERISK)
6504  {
6505  int newnl = 0;
6506  int newbsr = 0;
6507
6508  if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0)
6509    { skipatstart += 7; options |= PCRE_UTF8; continue; }
6510
6511  if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)
6512    { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
6513  else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3)  == 0)
6514    { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
6515  else if (strncmp((char *)(ptr+skipatstart+2), STRING_CRLF_RIGHTPAR, 5)  == 0)
6516    { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
6517  else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANY_RIGHTPAR, 4) == 0)
6518    { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
6519  else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANYCRLF_RIGHTPAR, 8) == 0)
6520    { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
6521
6522  else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
6523    { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
6524  else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
6525    { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
6526
6527  if (newnl != 0)
6528    options = (options & ~PCRE_NEWLINE_BITS) | newnl;
6529  else if (newbsr != 0)
6530    options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
6531  else break;
6532  }
6533
6534/* Can't support UTF8 unless PCRE has been compiled to include the code. */
6535
6536#ifdef SUPPORT_UTF8
6537if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
6538     (*erroroffset = _pcre_valid_utf8((USPTR)pattern, -1)) >= 0)
6539  {
6540  errorcode = ERR44;
6541  goto PCRE_EARLY_ERROR_RETURN2;
6542  }
6543#else
6544if (utf8)
6545  {
6546  errorcode = ERR32;
6547  goto PCRE_EARLY_ERROR_RETURN;
6548  }
6549#endif
6550
6551/* Check validity of \R options. */
6552
6553switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6554  {
6555  case 0:
6556  case PCRE_BSR_ANYCRLF:
6557  case PCRE_BSR_UNICODE:
6558  break;
6559  default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6560  }
6561
6562/* Handle different types of newline. The three bits give seven cases. The
6563current code allows for fixed one- or two-byte sequences, plus "any" and
6564"anycrlf". */
6565
6566switch (options & PCRE_NEWLINE_BITS)
6567  {
6568  case 0: newline = NEWLINE; break;   /* Build-time default */
6569  case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6570  case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6571  case PCRE_NEWLINE_CR+
6572       PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6573  case PCRE_NEWLINE_ANY: newline = -1; break;
6574  case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6575  default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6576  }
6577
6578if (newline == -2)
6579  {
6580  cd->nltype = NLTYPE_ANYCRLF;
6581  }
6582else if (newline < 0)
6583  {
6584  cd->nltype = NLTYPE_ANY;
6585  }
6586else
6587  {
6588  cd->nltype = NLTYPE_FIXED;
6589  if (newline > 255)
6590    {
6591    cd->nllen = 2;
6592    cd->nl[0] = (newline >> 8) & 255;
6593    cd->nl[1] = newline & 255;
6594    }
6595  else
6596    {
6597    cd->nllen = 1;
6598    cd->nl[0] = newline;
6599    }
6600  }
6601
6602/* Maximum back reference and backref bitmap. The bitmap records up to 31 back
6603references to help in deciding whether (.*) can be treated as anchored or not.
6604*/
6605
6606cd->top_backref = 0;
6607cd->backref_map = 0;
6608
6609/* Reflect pattern for debugging output */
6610
6611DPRINTF(("------------------------------------------------------------------\n"));
6612DPRINTF(("%s\n", pattern));
6613
6614/* Pretend to compile the pattern while actually just accumulating the length
6615of memory required. This behaviour is triggered by passing a non-NULL final
6616argument to compile_regex(). We pass a block of workspace (cworkspace) for it
6617to compile parts of the pattern into; the compiled code is discarded when it is
6618no longer needed, so hopefully this workspace will never overflow, though there
6619is a test for its doing so. */
6620
6621cd->bracount = cd->final_bracount = 0;
6622cd->names_found = 0;
6623cd->name_entry_size = 0;
6624cd->name_table = NULL;
6625cd->start_workspace = cworkspace;
6626cd->start_code = cworkspace;
6627cd->hwm = cworkspace;
6628cd->start_pattern = (const uschar *)pattern;
6629cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
6630cd->req_varyopt = 0;
6631cd->external_options = options;
6632cd->external_flags = 0;
6633cd->open_caps = NULL;
6634
6635/* Now do the pre-compile. On error, errorcode will be set non-zero, so we
6636don't need to look at the result of the function here. The initial options have
6637been put into the cd block so that they can be changed if an option setting is
6638found within the regex right at the beginning. Bringing initial option settings
6639outside can help speed up starting point checks. */
6640
6641ptr += skipatstart;
6642code = cworkspace;
6643*code = OP_BRA;
6644(void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
6645  &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
6646  &length);
6647if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
6648
6649DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
6650  cd->hwm - cworkspace));
6651
6652if (length > MAX_PATTERN_SIZE)
6653  {
6654  errorcode = ERR20;
6655  goto PCRE_EARLY_ERROR_RETURN;
6656  }
6657
6658/* Compute the size of data block needed and get it, either from malloc or
6659externally provided function. Integer overflow should no longer be possible
6660because nowadays we limit the maximum value of cd->names_found and
6661cd->name_entry_size. */
6662
6663size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
6664re = (real_pcre *)(pcre_malloc)(size);
6665
6666if (re == NULL)
6667  {
6668  errorcode = ERR21;
6669  goto PCRE_EARLY_ERROR_RETURN;
6670  }
6671
6672/* Put in the magic number, and save the sizes, initial options, internal
6673flags, and character table pointer. NULL is used for the default character
6674tables. The nullpad field is at the end; it's there to help in the case when a
6675regex compiled on a system with 4-byte pointers is run on another with 8-byte
6676pointers. */
6677
6678re->magic_number = MAGIC_NUMBER;
6679re->size = size;
6680re->options = cd->external_options;
6681re->flags = cd->external_flags;
6682re->dummy1 = 0;
6683re->first_byte = 0;
6684re->req_byte = 0;
6685re->name_table_offset = sizeof(real_pcre);
6686re->name_entry_size = cd->name_entry_size;
6687re->name_count = cd->names_found;
6688re->ref_count = 0;
6689re->tables = (tables == _pcre_default_tables)? NULL : tables;
6690re->nullpad = NULL;
6691
6692/* The starting points of the name/number translation table and of the code are
6693passed around in the compile data block. The start/end pattern and initial
6694options are already set from the pre-compile phase, as is the name_entry_size
6695field. Reset the bracket count and the names_found field. Also reset the hwm
6696field; this time it's used for remembering forward references to subpatterns.
6697*/
6698
6699cd->final_bracount = cd->bracount;  /* Save for checking forward references */
6700cd->bracount = 0;
6701cd->names_found = 0;
6702cd->name_table = (uschar *)re + re->name_table_offset;
6703codestart = cd->name_table + re->name_entry_size * re->name_count;
6704cd->start_code = codestart;
6705cd->hwm = cworkspace;
6706cd->req_varyopt = 0;
6707cd->had_accept = FALSE;
6708cd->check_lookbehind = FALSE;
6709cd->open_caps = NULL;
6710
6711/* Set up a starting, non-extracting bracket, then compile the expression. On
6712error, errorcode will be set non-zero, so we don't need to look at the result
6713of the function here. */
6714
6715ptr = (const uschar *)pattern + skipatstart;
6716code = (uschar *)codestart;
6717*code = OP_BRA;
6718(void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
6719  &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
6720re->top_bracket = cd->bracount;
6721re->top_backref = cd->top_backref;
6722re->flags = cd->external_flags;
6723
6724if (cd->had_accept) reqbyte = -1;   /* Must disable after (*ACCEPT) */
6725
6726/* If not reached end of pattern on success, there's an excess bracket. */
6727
6728if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
6729
6730/* Fill in the terminating state and check for disastrous overflow, but
6731if debugging, leave the test till after things are printed out. */
6732
6733*code++ = OP_END;
6734
6735#ifndef PCRE_DEBUG
6736if (code - codestart > length) errorcode = ERR23;
6737#endif
6738
6739/* Fill in any forward references that are required. */
6740
6741while (errorcode == 0 && cd->hwm > cworkspace)
6742  {
6743  int offset, recno;
6744  const uschar *groupptr;
6745  cd->hwm -= LINK_SIZE;
6746  offset = GET(cd->hwm, 0);
6747  recno = GET(codestart, offset);
6748  groupptr = _pcre_find_bracket(codestart, utf8, recno);
6749  if (groupptr == NULL) errorcode = ERR53;
6750    else PUT(((uschar *)codestart), offset, groupptr - codestart);
6751  }
6752
6753/* Give an error if there's back reference to a non-existent capturing
6754subpattern. */
6755
6756if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6757
6758/* If there were any lookbehind assertions that contained OP_RECURSE
6759(recursions or subroutine calls), a flag is set for them to be checked here,
6760because they may contain forward references. Actual recursions can't be fixed
6761length, but subroutine calls can. It is done like this so that those without
6762OP_RECURSE that are not fixed length get a diagnosic with a useful offset. The
6763exceptional ones forgo this. We scan the pattern to check that they are fixed
6764length, and set their lengths. */
6765
6766if (cd->check_lookbehind)
6767  {
6768  uschar *cc = (uschar *)codestart;
6769
6770  /* Loop, searching for OP_REVERSE items, and process those that do not have
6771  their length set. (Actually, it will also re-process any that have a length
6772  of zero, but that is a pathological case, and it does no harm.) When we find
6773  one, we temporarily terminate the branch it is in while we scan it. */
6774
6775  for (cc = (uschar *)_pcre_find_bracket(codestart, utf8, -1);
6776       cc != NULL;
6777       cc = (uschar *)_pcre_find_bracket(cc, utf8, -1))
6778    {
6779    if (GET(cc, 1) == 0)
6780      {
6781      int fixed_length;
6782      uschar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
6783      int end_op = *be;
6784      *be = OP_END;
6785      fixed_length = find_fixedlength(cc, re->options, TRUE, cd);
6786      *be = end_op;
6787      DPRINTF(("fixed length = %d\n", fixed_length));
6788      if (fixed_length < 0)
6789        {
6790        errorcode = (fixed_length == -2)? ERR36 : ERR25;
6791        break;
6792        }
6793      PUT(cc, 1, fixed_length);
6794      }
6795    cc += 1 + LINK_SIZE;
6796    }
6797  }
6798
6799/* Failed to compile, or error while post-processing */
6800
6801if (errorcode != 0)
6802  {
6803  (pcre_free)(re);
6804  PCRE_EARLY_ERROR_RETURN:
6805  *erroroffset = ptr - (const uschar *)pattern;
6806  PCRE_EARLY_ERROR_RETURN2:
6807  *errorptr = find_error_text(errorcode);
6808  if (errorcodeptr != NULL) *errorcodeptr = errorcode;
6809  return NULL;
6810  }
6811
6812/* If the anchored option was not passed, set the flag if we can determine that
6813the pattern is anchored by virtue of ^ characters or \A or anything else (such
6814as starting with .* when DOTALL is set).
6815
6816Otherwise, if we know what the first byte has to be, save it, because that
6817speeds up unanchored matches no end. If not, see if we can set the
6818PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
6819start with ^. and also when all branches start with .* for non-DOTALL matches.
6820*/
6821
6822if ((re->options & PCRE_ANCHORED) == 0)
6823  {
6824  int temp_options = re->options;   /* May get changed during these scans */
6825  if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
6826    re->options |= PCRE_ANCHORED;
6827  else
6828    {
6829    if (firstbyte < 0)
6830      firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
6831    if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */
6832      {
6833      int ch = firstbyte & 255;
6834      re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
6835         cd->fcc[ch] == ch)? ch : firstbyte;
6836      re->flags |= PCRE_FIRSTSET;
6837      }
6838    else if (is_startline(codestart, 0, cd->backref_map))
6839      re->flags |= PCRE_STARTLINE;
6840    }
6841  }
6842
6843/* For an anchored pattern, we use the "required byte" only if it follows a
6844variable length item in the regex. Remove the caseless flag for non-caseable
6845bytes. */
6846
6847if (reqbyte >= 0 &&
6848     ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
6849  {
6850  int ch = reqbyte & 255;
6851  re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
6852    cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
6853  re->flags |= PCRE_REQCHSET;
6854  }
6855
6856/* Print out the compiled data if debugging is enabled. This is never the
6857case when building a production library. */
6858
6859#ifdef PCRE_DEBUG
6860printf("Length = %d top_bracket = %d top_backref = %d\n",
6861  length, re->top_bracket, re->top_backref);
6862
6863printf("Options=%08x\n", re->options);
6864
6865if ((re->flags & PCRE_FIRSTSET) != 0)
6866  {
6867  int ch = re->first_byte & 255;
6868  const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
6869    "" : " (caseless)";
6870  if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
6871    else printf("First char = \\x%02x%s\n", ch, caseless);
6872  }
6873
6874if ((re->flags & PCRE_REQCHSET) != 0)
6875  {
6876  int ch = re->req_byte & 255;
6877  const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
6878    "" : " (caseless)";
6879  if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
6880    else printf("Req char = \\x%02x%s\n", ch, caseless);
6881  }
6882
6883pcre_printint(re, stdout, TRUE);
6884
6885/* This check is done here in the debugging case so that the code that
6886was compiled can be seen. */
6887
6888if (code - codestart > length)
6889  {
6890  (pcre_free)(re);
6891  *errorptr = find_error_text(ERR23);
6892  *erroroffset = ptr - (uschar *)pattern;
6893  if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6894  return NULL;
6895  }
6896#endif   /* PCRE_DEBUG */
6897
6898return (pcre *)re;
6899}
6900
6901/* End of pcre_compile.c */
6902