1/*************************************************
2*      Perl-Compatible Regular Expressions       *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8                       Written by Philip Hazel
9           Copyright (c) 1997-2012 University of Cambridge
10
11-----------------------------------------------------------------------------
12Redistribution and use in source and binary forms, with or without
13modification, are permitted provided that the following conditions are met:
14
15    * Redistributions of source code must retain the above copyright notice,
16      this list of conditions and the following disclaimer.
17
18    * Redistributions in binary form must reproduce the above copyright
19      notice, this list of conditions and the following disclaimer in the
20      documentation and/or other materials provided with the distribution.
21
22    * Neither the name of the University of Cambridge nor the names of its
23      contributors may be used to endorse or promote products derived from
24      this software without specific prior written permission.
25
26THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36POSSIBILITY OF SUCH DAMAGE.
37-----------------------------------------------------------------------------
38*/
39
40
41/* This module contains the external function pcre_compile(), along with
42supporting internal functions that are not used by other modules. */
43
44
45#ifdef HAVE_CONFIG_H
46#include "config.h"
47#endif
48
/* These names are defined before pcre_internal.h is included; presumably the
shared macros in that header expand them to reach this module's newline data
and pattern bounds - confirm against pcre_internal.h. */

#define NLBLOCK cd             /* Block containing newline information */
#define PSSTART start_pattern  /* Field containing processed string start */
#define PSEND   end_pattern    /* Field containing processed string end */
52
53#include "pcre_internal.h"
54
55
56/* When PCRE_DEBUG is defined, we need the pcre(16)_printint() function, which
57is also used by pcretest. PCRE_DEBUG is not defined when building a production
58library. We do not need to select pcre16_printint.c specially, because the
59COMPILE_PCREx macro will already be appropriately set. */
60
61#ifdef PCRE_DEBUG
62/* pcre_printint.c should not include any headers */
63#define PCRE_INCLUDED
64#include "pcre_printint.c"
65#undef PCRE_INCLUDED
66#endif
67
68
/* Macro for setting individual bits in class bitmaps: it sets bit number b in
the byte array a, where bit b lives in byte b/8 at position b%8. Both arguments
and the whole expansion are parenthesized so that expression arguments such as
SETBIT(map, c + 1) address the intended bit. The argument b is evaluated twice,
so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
72
/* Maximum length value to check against when making sure that the integer that
holds the compiled pattern length does not overflow. It is set 20 below INT_MAX
to leave headroom for adding in group terminating bytes, so that those
additions do not have to be checked every time. */

#define OFLOW_MAX (INT_MAX - 20)
79
80
81/*************************************************
82*      Code parameters and static tables         *
83*************************************************/
84
85/* This value specifies the size of stack workspace that is used during the
86first pre-compile phase that determines how much memory is required. The regex
87is partly compiled into this space, but the compiled parts are discarded as
88soon as they can be, so that hopefully there will never be an overrun. The code
89does, however, check for an overrun. The largest amount I've seen used is 218,
90so this number is very generous.
91
92The same workspace is used during the second, actual compile phase for
93remembering forward references to groups so that they can be filled in at the
94end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
95is 4 there is plenty of room for most patterns. However, the memory can get
96filled up by repetitions of forward references, for example patterns like
97/(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
98that the workspace is expanded using malloc() in this situation. The value
99below is therefore a minimum, and we put a maximum on it for safety. The
100minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
101kicks in at the same number of forward references in all cases. */
102
/* Minimum workspace size, and the ceiling (100 times the minimum) beyond
which the workspace is never grown. */

#define COMPILE_WORK_SIZE (2048*LINK_SIZE)
#define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)

/* The overrun tests check for a slightly smaller size so that they detect the
overrun before it actually does run off the end of the data block. */

#define WORK_SIZE_SAFETY_MARGIN (100)
110
/* Private flags added to firstchar and reqchar. The constants carry an
upper-case L suffix so that it cannot be misread as the digit 1. */

#define REQ_CASELESS   0x10000000L      /* Indicates caselessness */
#define REQ_VARY       0x20000000L      /* Reqchar followed non-literal item */

/* Repeated character flags. */

#define UTF_LENGTH     0x10000000L      /* The char contains its length. */
119
120/* Table for handling escaped characters in the range '0'-'z'. Positive returns
121are simple data values; negative values are for special things like \d and so
122on. Zero means further processing is needed (for things like \x), or the escape
123is invalid. */
124
125#ifndef EBCDIC
126
127/* This is the "normal" table for ASCII systems or for EBCDIC systems running
128in UTF-8 mode. */
129
/* Two entries per row; the trailing comment on each row names the characters
that the two entries stand for, running from '0' to 'z' in ASCII order. */

static const short int escapes[] = {
     0,                       0,                          /* 0  1 */
     0,                       0,                          /* 2  3 */
     0,                       0,                          /* 4  5 */
     0,                       0,                          /* 6  7 */
     0,                       0,                          /* 8  9 */
     CHAR_COLON,              CHAR_SEMICOLON,             /* :  ; */
     CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,           /* <  = */
     CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,         /* >  ? */
     CHAR_COMMERCIAL_AT,      -ESC_A,                     /* @  A */
     -ESC_B,                  -ESC_C,                     /* B  C */
     -ESC_D,                  -ESC_E,                     /* D  E */
     0,                       -ESC_G,                     /* F  G */
     -ESC_H,                  0,                          /* H  I */
     0,                       -ESC_K,                     /* J  K */
     0,                       0,                          /* L  M */
     -ESC_N,                  0,                          /* N  O */
     -ESC_P,                  -ESC_Q,                     /* P  Q */
     -ESC_R,                  -ESC_S,                     /* R  S */
     0,                       0,                          /* T  U */
     -ESC_V,                  -ESC_W,                     /* V  W */
     -ESC_X,                  0,                          /* X  Y */
     -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,   /* Z  [ */
     CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,  /* backslash  ] */
     CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,            /* ^  _ */
     CHAR_GRAVE_ACCENT,       7,                          /* `  a (value 7) */
     -ESC_b,                  0,                          /* b  c */
     -ESC_d,                  ESC_e,                      /* d  e */
     ESC_f,                   0,                          /* f  g */
     -ESC_h,                  0,                          /* h  i */
     0,                       -ESC_k,                     /* j  k */
     0,                       0,                          /* l  m */
     ESC_n,                   0,                          /* n  o */
     -ESC_p,                  0,                          /* p  q */
     ESC_r,                   -ESC_s,                     /* r  s */
     ESC_tee,                 0,                          /* t  u */
     -ESC_v,                  -ESC_w,                     /* v  w */
     0,                       0,                          /* x  y */
     -ESC_z                                               /* z */
};
170
171#else
172
173/* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
174
/* Each row holds eight entries; the comment that starts a row is the
hexadecimal character code of the row's first entry, so the table begins at
code 0x48 and its last row begins at code 0xF8. */

static const short int escapes[] = {
/*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
/*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
/*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
/*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
/*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
/*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
/*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
/*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
/*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
/*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
/*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
/*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
/*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
/*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
/*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
/*  D0 */   '}',     0, -ESC_K,       0,      0,-ESC_N,      0, -ESC_P,
/*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
/*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
/*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
/*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
};
200#endif
201
202
203/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
204searched linearly. Put all the names into a single string, in order to reduce
205the number of relocations when a shared library is dynamically linked. The
206string is built from string macros so that it works in UTF-8 mode on EBCDIC
207platforms. */
208
/* One entry of the verbs[] table. Entries are positional: the n-th entry
describes the n-th name packed into verbnames[]. */

typedef struct verbitem {
  int   len;                 /* Length of verb name */
  int   op;                  /* Op when no arg, or -1 if arg mandatory */
  int   op_arg;              /* Op when arg present, or -1 if not allowed */
} verbitem;
214
/* All verb names concatenated in the order of the verbs[] entries.
NOTE(review): the STRING_xxx0 macros presumably append a NUL after each name,
with the final STRING_THEN relying on the literal's own terminator - confirm
against pcre_internal.h. */

static const char verbnames[] =
  "\0"                       /* Empty name is a shorthand for MARK */
  STRING_MARK0
  STRING_ACCEPT0
  STRING_COMMIT0
  STRING_F0
  STRING_FAIL0
  STRING_PRUNE0
  STRING_SKIP0
  STRING_THEN;
225
/* One row per name in verbnames[], in the same order. Fields are: name
length, opcode without an argument (-1 if an argument is mandatory), opcode
with an argument (-1 if an argument is not allowed). */

static const verbitem verbs[] = {
  { 0, -1,        OP_MARK },        /* (empty name): argument mandatory */
  { 4, -1,        OP_MARK },        /* MARK: argument mandatory */
  { 6, OP_ACCEPT, -1 },             /* ACCEPT: argument not allowed */
  { 6, OP_COMMIT, -1 },             /* COMMIT: argument not allowed */
  { 1, OP_FAIL,   -1 },             /* F: argument not allowed */
  { 4, OP_FAIL,   -1 },             /* FAIL: argument not allowed */
  { 5, OP_PRUNE,  OP_PRUNE_ARG },   /* PRUNE: argument optional */
  { 4, OP_SKIP,   OP_SKIP_ARG  },   /* SKIP: argument optional */
  { 4, OP_THEN,   OP_THEN_ARG  }    /* THEN: argument optional */
};
237
238static const int verbcount = sizeof(verbs)/sizeof(verbitem);
239
240
241/* Tables of names of POSIX character classes and their lengths. The names are
242now all in a single string, to reduce the number of relocations when a shared
243library is dynamically loaded. The list of lengths is terminated by a zero
244length entry. The first three must be alpha, lower, upper, as this is assumed
245for handling case independence. */
246
/* The fourteen class names, concatenated in the order that the other POSIX
tables in this file follow. NOTE(review): the STRING_xxx0 macros presumably
append a NUL after each name - confirm against pcre_internal.h. */

static const char posix_names[] =
  STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
  STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
  STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
  STRING_word0  STRING_xdigit;
252
/* Name lengths in posix_names order: twelve five-letter names (alpha through
space), then word (4) and xdigit (6), then the zero that terminates the list. */

static const pcre_uint8 posix_name_lengths[] = {
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
255
256/* Table of class bit maps for each POSIX class. Each class is formed from a
257base map, with an optional addition or removal of another map. Then, for some
258classes, there is some additional tweaking: for [:blank:] the vertical space
259characters are removed, and for [:alpha:] and [:alnum:] the underscore
260character is removed. The triples in the table consist of the base map offset,
261second map offset or -1 if no second map, and a non-negative value for map
262addition or a negative value for map subtraction (if there are two maps). The
263absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
264remove vertical space characters, 2 => remove underscore. */
265
/* Three values per class, in posix_names order: base map offset, second map
offset (-1 if none), and the combine/tweak code (sign selects addition or
subtraction of the second map; absolute value 1 removes vertical space, 2
removes underscore). */

static const int posix_class_maps[] = {
  cbit_word,  cbit_digit, -2,             /* alpha - word minus digit, minus underscore */
  cbit_lower, -1,          0,             /* lower */
  cbit_upper, -1,          0,             /* upper */
  cbit_word,  -1,          2,             /* alnum - word without underscore */
  cbit_print, cbit_cntrl,  0,             /* ascii - print plus cntrl */
  cbit_space, -1,          1,             /* blank - a GNU extension; space minus vertical space */
  cbit_cntrl, -1,          0,             /* cntrl */
  cbit_digit, -1,          0,             /* digit */
  cbit_graph, -1,          0,             /* graph */
  cbit_print, -1,          0,             /* print */
  cbit_punct, -1,          0,             /* punct */
  cbit_space, -1,          0,             /* space */
  cbit_word,  -1,          0,             /* word - a Perl extension */
  cbit_xdigit,-1,          0              /* xdigit */
};
282
283/* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
284substitutes must be in the order of the names, defined above, and there are
285both positive and negative cases. NULL means no substitute. */
286
287#ifdef SUPPORT_UCP
/* Zero-terminated pattern fragments used as replacements; the comment on each
array shows the text that its character constants spell out. */

static const pcre_uchar string_PNd[]  = {      /* \P{Nd} */
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pNd[]  = {      /* \p{Nd} */
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PXsp[] = {      /* \P{Xsp} */
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pXsp[] = {      /* \p{Xsp} */
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PXwd[] = {      /* \P{Xwd} */
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pXwd[] = {      /* \p{Xwd} */
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
306
/* Replacement text for the six escapes named in the row comments. Each
negated escape comes immediately before its positive counterpart.
NOTE(review): how this table is indexed is not visible here - confirm at the
point of use. */

static const pcre_uchar *substitutes[] = {
  string_PNd,           /* \D */
  string_pNd,           /* \d */
  string_PXsp,          /* \S */       /* NOTE: Xsp is Perl space */
  string_pXsp,          /* \s */
  string_PXwd,          /* \W */
  string_pXwd           /* \w */
};
315
/* Zero-terminated replacement fragments for POSIX classes: first the positive
forms, then the matching negated forms. The comment on each array shows the
text that its character constants spell out. */

static const pcre_uchar string_pL[] =   {      /* \p{L} */
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pLl[] =  {      /* \p{Ll} */
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pLu[] =  {      /* \p{Lu} */
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pXan[] = {      /* \p{Xan} */
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_h[] =    {      /* \h */
  CHAR_BACKSLASH, CHAR_h, '\0' };
static const pcre_uchar string_pXps[] = {      /* \p{Xps} */
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PL[] =   {      /* \P{L} */
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PLl[] =  {      /* \P{Ll} */
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PLu[] =  {      /* \P{Lu} */
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PXan[] = {      /* \P{Xan} */
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_H[] =    {      /* \H */
  CHAR_BACKSLASH, CHAR_H, '\0' };
static const pcre_uchar string_PXps[] = {      /* \P{Xps} */
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
350
/* Twenty-eight entries: the fourteen classes in posix_names order, followed
by the negated form of each class in the same order. A NULL entry means the
class has no substitute. */

static const pcre_uchar *posix_substitutes[] = {
  string_pL,            /* alpha */
  string_pLl,           /* lower */
  string_pLu,           /* upper */
  string_pXan,          /* alnum */
  NULL,                 /* ascii */
  string_h,             /* blank */
  NULL,                 /* cntrl */
  string_pNd,           /* digit */
  NULL,                 /* graph */
  NULL,                 /* print */
  NULL,                 /* punct */
  string_pXps,          /* space */    /* NOTE: Xps is POSIX space */
  string_pXwd,          /* word */
  NULL,                 /* xdigit */
  /* Negated cases */
  string_PL,            /* ^alpha */
  string_PLl,           /* ^lower */
  string_PLu,           /* ^upper */
  string_PXan,          /* ^alnum */
  NULL,                 /* ^ascii */
  string_H,             /* ^blank */
  NULL,                 /* ^cntrl */
  string_PNd,           /* ^digit */
  NULL,                 /* ^graph */
  NULL,                 /* ^print */
  NULL,                 /* ^punct */
  string_PXps,          /* ^space */   /* NOTE: Xps is POSIX space */
  string_PXwd,          /* ^word */
  NULL                  /* ^xdigit */
};
/* Number of entries in posix_substitutes[], derived from the array itself. */
#define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(posix_substitutes[0]))
383#endif
384
/* Stringification helpers: STRING() quotes its argument exactly as written,
whereas XSTRING() lets the argument be macro-expanded first and then quotes the
result. */

#define STRING(a)  #a
#define XSTRING(s) STRING(s)
387
388/* The texts of compile-time error messages. These are "char *" because they
389are passed to the outside world. Do not ever re-use any error number, because
390they are documented. Always add a new error instead. Messages marked DEAD below
391are no longer used. This used to be a table of strings, but in order to reduce
392the number of relocations needed when a shared library is loaded dynamically,
393it is now one long string. We cannot use a table of offsets, because the
394lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
395simply count through to the one we want - this isn't a performance issue
396because these strings are used only when there is a compilation error.
397
398Each substring ends with \0 to insert a null character. This includes the final
399substring, so that the whole string ends with \0\0, which can be detected when
400counting through. */
401
/* The position of a message in this string is its error number; the numbered
comments mark every fifth entry. */

static const char error_texts[] =
  "no error\0"
  "\\ at end of pattern\0"
  "\\c at end of pattern\0"
  "unrecognized character follows \\\0"
  "numbers out of order in {} quantifier\0"
  /* 5 */
  "number too big in {} quantifier\0"
  "missing terminating ] for character class\0"
  "invalid escape sequence in character class\0"
  "range out of order in character class\0"
  "nothing to repeat\0"
  /* 10 */
  "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
  "internal error: unexpected repeat\0"
  "unrecognized character after (? or (?-\0"
  "POSIX named classes are supported only within a class\0"
  "missing )\0"
  /* 15 */
  "reference to non-existent subpattern\0"
  "erroffset passed as NULL\0"
  "unknown option bit(s) set\0"
  "missing ) after comment\0"
  "parentheses nested too deeply\0"  /** DEAD **/
  /* 20 */
  "regular expression is too large\0"
  "failed to get memory\0"
  "unmatched parentheses\0"
  "internal error: code overflow\0"
  "unrecognized character after (?<\0"
  /* 25 */
  "lookbehind assertion is not fixed length\0"
  "malformed number or name after (?(\0"
  "conditional group contains more than two branches\0"
  "assertion expected after (?(\0"
  "(?R or (?[+-]digits must be followed by )\0"
  /* 30 */
  "unknown POSIX class name\0"
  "POSIX collating elements are not supported\0"
  "this version of PCRE is compiled without UTF support\0"
  "spare error\0"  /** DEAD **/
  "character value in \\x{...} sequence is too large\0"
  /* 35 */
  "invalid condition (?(0)\0"
  "\\C not allowed in lookbehind assertion\0"
  "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
  "number after (?C is > 255\0"
  "closing ) for (?C expected\0"
  /* 40 */
  "recursive call could loop indefinitely\0"
  "unrecognized character after (?P\0"
  "syntax error in subpattern name (missing terminator)\0"
  "two named subpatterns have the same name\0"
  "invalid UTF-8 string\0"
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled\0"
  "malformed \\P or \\p sequence\0"
  "unknown property name after \\P or \\p\0"
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
  /* 50 */
  "repeated subpattern is too long\0"    /** DEAD **/
  "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
  "internal error: overran compiling workspace\0"
  "internal error: previously-checked referenced subpattern not found\0"
  "DEFINE group contains more than one branch\0"
  /* 55 */
  "repeating a DEFINE group is not allowed\0"  /** DEAD **/
  "inconsistent NEWLINE options\0"
  "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
  "a numbered reference must not be zero\0"
  "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
  /* 60 */
  "(*VERB) not recognized\0"
  "number is too big\0"
  "subpattern name expected\0"
  "digit expected after (?+\0"
  "] is an invalid data character in JavaScript compatibility mode\0"
  /* 65 */
  "different names for subpatterns of the same number are not allowed\0"
  "(*MARK) must have an argument\0"
  "this version of PCRE is not compiled with Unicode property support\0"
  "\\c must be followed by an ASCII character\0"
  "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
  /* 70 */
  "internal error: unknown opcode in find_fixedlength()\0"
  "\\N is not supported in a class\0"
  "too many forward references\0"
  "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
  "invalid UTF-16 string\0"
  /* 75 */
  "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
  "character value in \\u.... sequence is too large\0"
  /* 76 is the last message. The string literal's own terminator follows its
  explicit trailing null, giving the double null that marks the end. */
  ;
496
/* Table to identify digits and hex digits. This is used when compiling
patterns. Note that the tables in chartables are dependent on the locale, and
may mark arbitrary characters as digits - but the PCRE compiling code expects
to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
a private table here. It costs 256 bytes, but it is a lot faster than doing
character value tests (at least in some simple cases I timed), and in some
applications one wants PCRE to compile efficiently as well as match
efficiently.
505
506For convenience, we use the same bit definitions as in chartables:
507
508  0x04   decimal digit
509  0x08   hexadecimal digit
510
511Then we can use ctype_digit and ctype_xdigit in the code. */
512
/* Decimal digits are recognized with a direct range comparison rather than a
memory read: it is much faster and the resulting code is simpler (the compiler
turns it into a subtraction and unsigned comparison). The argument may be
evaluated twice, so it must not have side effects. */

#define IS_DIGIT(x) (CHAR_0 <= (x) && (x) <= CHAR_9)
518
519#ifndef EBCDIC
520
521/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
522UTF-8 mode. */
523
/* Indexed by character code 0-255. The decimal digits carry 0x0c (0x04
decimal digit plus 0x08 hexadecimal digit), the letters a-f and A-F carry 0x08
only, and every other entry is zero. */

static const pcre_uint8 digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
558
559#else
560
561/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
562
/* Indexed by character code 0-255; the right-hand column of the row comments
gives the row's starting code in hexadecimal. Entries 0x81-0x86 and 0xC1-0xC6
carry 0x08 (hexadecimal digit), entries 0xF0-0xF9 carry 0x0c (decimal and
hexadecimal digit), and every other entry is zero. */

static const pcre_uint8 digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
597
/* Indexed by character code 0-255, eight entries per row.
NOTE(review): the bit values presumably follow the ctype_xxx flags of the
character tables that this partially duplicates (the 0x04 and 0x08 bits on the
digit rows match the digit bits documented for digitab) - confirm against the
chartables source before relying on any other bit. */

static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
631#endif
632
633
/* Forward declaration (prototype only) to allow mutual recursion. */

static BOOL
  compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
    int *, int *, branch_chain *, compile_data *, int *);
639
640
641
642/*************************************************
643*            Find an error text                  *
644*************************************************/
645
646/* The error texts are now all in one long string, to save on relocations. As
647some of the text is of unknown length, we can't use a table of offsets.
648Instead, just count through the strings. This is not a performance issue
649because it happens only when there has been a compilation error.
650
651Argument:   the error number
652Returns:    pointer to the error string
653*/
654
655static const char *
656find_error_text(int n)
657{
658const char *s = error_texts;
659for (; n > 0; n--)
660  {
661  while (*s++ != 0) {};
662  if (*s == 0) return "Error text not found (please report)";
663  }
664return s;
665}
666
667
668/*************************************************
669*           Expand the workspace                 *
670*************************************************/
671
672/* This function is called during the second compiling phase, if the number of
673forward references fills the existing workspace, which is originally a block on
674the stack. A larger block is obtained from malloc() unless the ultimate limit
675has been reached or the increase will be rather small.
676
677Argument: pointer to the compile data block
678Returns:  0 if all went well, else an error number
679*/
680
681static int
682expand_workspace(compile_data *cd)
683{
684pcre_uchar *newspace;
685int newsize = cd->workspace_size * 2;
686
687if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
688if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
689    newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
690 return ERR72;
691
692newspace = (PUBL(malloc))(IN_UCHARS(newsize));
693if (newspace == NULL) return ERR21;
694memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
695cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
696if (cd->workspace_size > COMPILE_WORK_SIZE)
697  (PUBL(free))((void *)cd->start_workspace);
698cd->start_workspace = newspace;
699cd->workspace_size = newsize;
700return 0;
701}
702
703
704
705/*************************************************
706*            Check for counted repeat            *
707*************************************************/
708
709/* This function is called when a '{' is encountered in a place where it might
710start a quantifier. It looks ahead to see if it really is a quantifier or not.
711It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
712where the ddds are digits.
713
714Arguments:
715  p         pointer to the first char after '{'
716
717Returns:    TRUE or FALSE
718*/
719
720static BOOL
721is_counted_repeat(const pcre_uchar *p)
722{
723if (!IS_DIGIT(*p)) return FALSE;
724p++;
725while (IS_DIGIT(*p)) p++;
726if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
727
728if (*p++ != CHAR_COMMA) return FALSE;
729if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
730
731if (!IS_DIGIT(*p)) return FALSE;
732p++;
733while (IS_DIGIT(*p)) p++;
734
735return (*p == CHAR_RIGHT_CURLY_BRACKET);
736}
737
738
739
740/*************************************************
741*            Handle escapes                      *
742*************************************************/
743
744/* This function is called when a \ has been encountered. It either returns a
745positive value for a simple escape such as \n, or a negative value which
746encodes one of the more complicated things such as \d. A backreference to group
747n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
748UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
749ptr is pointing at the \. On exit, it is on the final character of the escape
750sequence.
751
752Arguments:
753  ptrptr         points to the pattern position pointer
754  errorcodeptr   points to the errorcode variable
755  bracount       number of previous extracting brackets
756  options        the options bits
757  isclass        TRUE if inside a character class
758
759Returns:         zero or positive => a data character
760                 negative => a special escape sequence
761                 on error, errorcodeptr is set
762*/
763
764static int
765check_escape(const pcre_uchar **ptrptr, int *errorcodeptr, int bracount,
766  int options, BOOL isclass)
767{
768/* PCRE_UTF16 has the same value as PCRE_UTF8. */
769BOOL utf = (options & PCRE_UTF8) != 0;
770const pcre_uchar *ptr = *ptrptr + 1;
771pcre_int32 c;
772int i;
773
774GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
775ptr--;                            /* Set pointer back to the last byte */
776
777/* If backslash is at the end of the pattern, it's an error. */
778
779if (c == 0) *errorcodeptr = ERR1;
780
781/* Non-alphanumerics are literals. For digits or letters, do an initial lookup
782in a table. A non-zero result is something that can be returned immediately.
783Otherwise further processing may be required. */
784
785#ifndef EBCDIC  /* ASCII/UTF-8 coding */
786/* Not alphanumeric */
787else if (c < CHAR_0 || c > CHAR_z) {}
788else if ((i = escapes[c - CHAR_0]) != 0) c = i;
789
790#else           /* EBCDIC coding */
791/* Not alphanumeric */
792else if (c < 'a' || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
793else if ((i = escapes[c - 0x48]) != 0)  c = i;
794#endif
795
796/* Escapes that need further processing, or are illegal. */
797
798else
799  {
800  const pcre_uchar *oldptr;
801  BOOL braced, negated;
802
803  switch (c)
804    {
805    /* A number of Perl escapes are not handled by PCRE. We give an explicit
806    error. */
807
808    case CHAR_l:
809    case CHAR_L:
810    *errorcodeptr = ERR37;
811    break;
812
813    case CHAR_u:
814    if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
815      {
816      /* In JavaScript, \u must be followed by four hexadecimal numbers.
817      Otherwise it is a lowercase u letter. */
818      if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
819        && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
820        && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
821        && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
822        {
823        c = 0;
824        for (i = 0; i < 4; ++i)
825          {
826          register int cc = *(++ptr);
827#ifndef EBCDIC  /* ASCII/UTF-8 coding */
828          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
829          c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
830#else           /* EBCDIC coding */
831          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
832          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
833#endif
834          }
835
836#ifdef COMPILE_PCRE8
837        if (c > (utf ? 0x10ffff : 0xff))
838#else
839#ifdef COMPILE_PCRE16
840        if (c > (utf ? 0x10ffff : 0xffff))
841#endif
842#endif
843          {
844          *errorcodeptr = ERR76;
845          }
846        else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
847        }
848      }
849    else
850      *errorcodeptr = ERR37;
851    break;
852
853    case CHAR_U:
854    /* In JavaScript, \U is an uppercase U letter. */
855    if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
856    break;
857
858    /* In a character class, \g is just a literal "g". Outside a character
859    class, \g must be followed by one of a number of specific things:
860
861    (1) A number, either plain or braced. If positive, it is an absolute
862    backreference. If negative, it is a relative backreference. This is a Perl
863    5.10 feature.
864
865    (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
866    is part of Perl's movement towards a unified syntax for back references. As
867    this is synonymous with \k{name}, we fudge it up by pretending it really
868    was \k.
869
870    (3) For Oniguruma compatibility we also support \g followed by a name or a
871    number either in angle brackets or in single quotes. However, these are
872    (possibly recursive) subroutine calls, _not_ backreferences. Just return
873    the -ESC_g code (cf \k). */
874
875    case CHAR_g:
876    if (isclass) break;
877    if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
878      {
879      c = -ESC_g;
880      break;
881      }
882
883    /* Handle the Perl-compatible cases */
884
885    if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
886      {
887      const pcre_uchar *p;
888      for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
889        if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
890      if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
891        {
892        c = -ESC_k;
893        break;
894        }
895      braced = TRUE;
896      ptr++;
897      }
898    else braced = FALSE;
899
900    if (ptr[1] == CHAR_MINUS)
901      {
902      negated = TRUE;
903      ptr++;
904      }
905    else negated = FALSE;
906
907    /* The integer range is limited by the machine's int representation. */
908    c = 0;
909    while (IS_DIGIT(ptr[1]))
910      {
911      if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */
912        {
913        c = -1;
914        break;
915        }
916      c = c * 10 + *(++ptr) - CHAR_0;
917      }
918    if (((unsigned int)c) > INT_MAX) /* Integer overflow */
919      {
920      while (IS_DIGIT(ptr[1]))
921        ptr++;
922      *errorcodeptr = ERR61;
923      break;
924      }
925
926    if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
927      {
928      *errorcodeptr = ERR57;
929      break;
930      }
931
932    if (c == 0)
933      {
934      *errorcodeptr = ERR58;
935      break;
936      }
937
938    if (negated)
939      {
940      if (c > bracount)
941        {
942        *errorcodeptr = ERR15;
943        break;
944        }
945      c = bracount - (c - 1);
946      }
947
948    c = -(ESC_REF + c);
949    break;
950
951    /* The handling of escape sequences consisting of a string of digits
952    starting with one that is not zero is not straightforward. By experiment,
953    the way Perl works seems to be as follows:
954
955    Outside a character class, the digits are read as a decimal number. If the
956    number is less than 10, or if there are that many previous extracting
957    left brackets, then it is a back reference. Otherwise, up to three octal
958    digits are read to form an escaped byte. Thus \123 is likely to be octal
959    123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
960    value is greater than 377, the least significant 8 bits are taken. Inside a
961    character class, \ followed by a digit is always an octal number. */
962
963    case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
964    case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
965
966    if (!isclass)
967      {
968      oldptr = ptr;
969      /* The integer range is limited by the machine's int representation. */
970      c -= CHAR_0;
971      while (IS_DIGIT(ptr[1]))
972        {
973        if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */
974          {
975          c = -1;
976          break;
977          }
978        c = c * 10 + *(++ptr) - CHAR_0;
979        }
980      if (((unsigned int)c) > INT_MAX) /* Integer overflow */
981        {
982        while (IS_DIGIT(ptr[1]))
983          ptr++;
984        *errorcodeptr = ERR61;
985        break;
986        }
987      if (c < 10 || c <= bracount)
988        {
989        c = -(ESC_REF + c);
990        break;
991        }
992      ptr = oldptr;      /* Put the pointer back and fall through */
993      }
994
995    /* Handle an octal number following \. If the first digit is 8 or 9, Perl
996    generates a binary zero byte and treats the digit as a following literal.
997    Thus we have to pull back the pointer by one. */
998
999    if ((c = *ptr) >= CHAR_8)
1000      {
1001      ptr--;
1002      c = 0;
1003      break;
1004      }
1005
1006    /* \0 always starts an octal number, but we may drop through to here with a
1007    larger first octal digit. The original code used just to take the least
1008    significant 8 bits of octal numbers (I think this is what early Perls used
1009    to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1010    but no more than 3 octal digits. */
1011
1012    case CHAR_0:
1013    c -= CHAR_0;
1014    while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1015        c = c * 8 + *(++ptr) - CHAR_0;
1016#ifdef COMPILE_PCRE8
1017    if (!utf && c > 0xff) *errorcodeptr = ERR51;
1018#endif
1019    break;
1020
1021    /* \x is complicated. \x{ddd} is a character number which can be greater
1022    than 0xff in utf or non-8bit mode, but only if the ddd are hex digits.
1023    If not, { is treated as a data character. */
1024
1025    case CHAR_x:
1026    if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1027      {
1028      /* In JavaScript, \x must be followed by two hexadecimal numbers.
1029      Otherwise it is a lowercase x letter. */
1030      if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
1031        && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
1032        {
1033        c = 0;
1034        for (i = 0; i < 2; ++i)
1035          {
1036          register int cc = *(++ptr);
1037#ifndef EBCDIC  /* ASCII/UTF-8 coding */
1038          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1039          c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1040#else           /* EBCDIC coding */
1041          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1042          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1043#endif
1044          }
1045        }
1046      break;
1047      }
1048
1049    if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1050      {
1051      const pcre_uchar *pt = ptr + 2;
1052
1053      c = 0;
1054      while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0)
1055        {
1056        register int cc = *pt++;
1057        if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1058
1059#ifndef EBCDIC  /* ASCII/UTF-8 coding */
1060        if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1061        c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1062#else           /* EBCDIC coding */
1063        if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1064        c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1065#endif
1066
1067#ifdef COMPILE_PCRE8
1068        if (c > (utf ? 0x10ffff : 0xff)) { c = -1; break; }
1069#else
1070#ifdef COMPILE_PCRE16
1071        if (c > (utf ? 0x10ffff : 0xffff)) { c = -1; break; }
1072#endif
1073#endif
1074        }
1075
1076      if (c < 0)
1077        {
1078        while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) pt++;
1079        *errorcodeptr = ERR34;
1080        }
1081
1082      if (*pt == CHAR_RIGHT_CURLY_BRACKET)
1083        {
1084        if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1085        ptr = pt;
1086        break;
1087        }
1088
1089      /* If the sequence of hex digits does not end with '}', then we don't
1090      recognize this construct; fall through to the normal \x handling. */
1091      }
1092
1093    /* Read just a single-byte hex-defined char */
1094
1095    c = 0;
1096    while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
1097      {
1098      int cc;                                  /* Some compilers don't like */
1099      cc = *(++ptr);                           /* ++ in initializers */
1100#ifndef EBCDIC  /* ASCII/UTF-8 coding */
1101      if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
1102      c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1103#else           /* EBCDIC coding */
1104      if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
1105      c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1106#endif
1107      }
1108    break;
1109
1110    /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
1111    An error is given if the byte following \c is not an ASCII character. This
1112    coding is ASCII-specific, but then the whole concept of \cx is
1113    ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
1114
1115    case CHAR_c:
1116    c = *(++ptr);
1117    if (c == 0)
1118      {
1119      *errorcodeptr = ERR2;
1120      break;
1121      }
1122#ifndef EBCDIC    /* ASCII/UTF-8 coding */
1123    if (c > 127)  /* Excludes all non-ASCII in either mode */
1124      {
1125      *errorcodeptr = ERR68;
1126      break;
1127      }
1128    if (c >= CHAR_a && c <= CHAR_z) c -= 32;
1129    c ^= 0x40;
1130#else             /* EBCDIC coding */
1131    if (c >= CHAR_a && c <= CHAR_z) c += 64;
1132    c ^= 0xC0;
1133#endif
1134    break;
1135
1136    /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1137    other alphanumeric following \ is an error if PCRE_EXTRA was set;
1138    otherwise, for Perl compatibility, it is a literal. This code looks a bit
1139    odd, but there used to be some cases other than the default, and there may
1140    be again in future, so I haven't "optimized" it. */
1141
1142    default:
1143    if ((options & PCRE_EXTRA) != 0) switch(c)
1144      {
1145      default:
1146      *errorcodeptr = ERR3;
1147      break;
1148      }
1149    break;
1150    }
1151  }
1152
1153/* Perl supports \N{name} for character names, as well as plain \N for "not
1154newline". PCRE does not support \N{name}. However, it does support
1155quantification such as \N{2,3}. */
1156
1157if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1158     !is_counted_repeat(ptr+2))
1159  *errorcodeptr = ERR37;
1160
1161/* If PCRE_UCP is set, we change the values for \d etc. */
1162
1163if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
1164  c -= (ESC_DU - ESC_D);
1165
1166/* Set the pointer to the final character before returning. */
1167
1168*ptrptr = ptr;
1169return c;
1170}
1171
1172
1173
1174#ifdef SUPPORT_UCP
1175/*************************************************
1176*               Handle \P and \p                 *
1177*************************************************/
1178
1179/* This function is called after \P or \p has been encountered, provided that
1180PCRE is compiled with support for Unicode properties. On entry, ptrptr is
1181pointing at the P or p. On exit, it is pointing at the final character of the
1182escape sequence.
1183
1184Argument:
1185  ptrptr         points to the pattern position pointer
1186  negptr         points to a boolean that is set TRUE for negation else FALSE
1187  dptr           points to an int that is set to the detailed property value
1188  errorcodeptr   points to the error code variable
1189
1190Returns:         type value from ucp_type_table, or -1 for an invalid type
1191*/
1192
1193static int
1194get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
1195{
1196int c, i, bot, top;
1197const pcre_uchar *ptr = *ptrptr;
1198pcre_uchar name[32];
1199
1200c = *(++ptr);
1201if (c == 0) goto ERROR_RETURN;
1202
1203*negptr = FALSE;
1204
1205/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
1206negation. */
1207
1208if (c == CHAR_LEFT_CURLY_BRACKET)
1209  {
1210  if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1211    {
1212    *negptr = TRUE;
1213    ptr++;
1214    }
1215  for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
1216    {
1217    c = *(++ptr);
1218    if (c == 0) goto ERROR_RETURN;
1219    if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1220    name[i] = c;
1221    }
1222  if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
1223  name[i] = 0;
1224  }
1225
1226/* Otherwise there is just one following character */
1227
1228else
1229  {
1230  name[0] = c;
1231  name[1] = 0;
1232  }
1233
1234*ptrptr = ptr;
1235
1236/* Search for a recognized property name using binary chop */
1237
1238bot = 0;
1239top = PRIV(utt_size);
1240
1241while (bot < top)
1242  {
1243  i = (bot + top) >> 1;
1244  c = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1245  if (c == 0)
1246    {
1247    *dptr = PRIV(utt)[i].value;
1248    return PRIV(utt)[i].type;
1249    }
1250  if (c > 0) bot = i + 1; else top = i;
1251  }
1252
1253*errorcodeptr = ERR47;
1254*ptrptr = ptr;
1255return -1;
1256
1257ERROR_RETURN:
1258*errorcodeptr = ERR46;
1259*ptrptr = ptr;
1260return -1;
1261}
1262#endif
1263
1264
1265
1266
1267/*************************************************
1268*         Read repeat counts                     *
1269*************************************************/
1270
1271/* Read an item of the form {n,m} and return the values. This is called only
1272after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1273so the syntax is guaranteed to be correct, but we need to check the values.
1274
1275Arguments:
1276  p              pointer to first char after '{'
1277  minp           pointer to int for min
1278  maxp           pointer to int for max
1279                 returned as -1 if no max
1280  errorcodeptr   points to error code variable
1281
1282Returns:         pointer to '}' on success;
1283                 current ptr on error, with errorcodeptr set non-zero
1284*/
1285
1286static const pcre_uchar *
1287read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1288{
1289int min = 0;
1290int max = -1;
1291
1292/* Read the minimum value and do a paranoid check: a negative value indicates
1293an integer overflow. */
1294
1295while (IS_DIGIT(*p)) min = min * 10 + *p++ - CHAR_0;
1296if (min < 0 || min > 65535)
1297  {
1298  *errorcodeptr = ERR5;
1299  return p;
1300  }
1301
1302/* Read the maximum value if there is one, and again do a paranoid on its size.
1303Also, max must not be less than min. */
1304
1305if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1306  {
1307  if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1308    {
1309    max = 0;
1310    while(IS_DIGIT(*p)) max = max * 10 + *p++ - CHAR_0;
1311    if (max < 0 || max > 65535)
1312      {
1313      *errorcodeptr = ERR5;
1314      return p;
1315      }
1316    if (max < min)
1317      {
1318      *errorcodeptr = ERR4;
1319      return p;
1320      }
1321    }
1322  }
1323
1324/* Fill in the required variables, and pass back the pointer to the terminating
1325'}'. */
1326
1327*minp = min;
1328*maxp = max;
1329return p;
1330}
1331
1332
1333
1334/*************************************************
1335*  Subroutine for finding forward reference      *
1336*************************************************/
1337
1338/* This recursive function is called only from find_parens() below. The
1339top-level call starts at the beginning of the pattern. All other calls must
1340start at a parenthesis. It scans along a pattern's text looking for capturing
1341subpatterns, and counting them. If it finds a named pattern that matches the
1342name it is given, it returns its number. Alternatively, if the name is NULL, it
1343returns when it reaches a given numbered subpattern. Recursion is used to keep
1344track of subpatterns that reset the capturing group numbers - the (?| feature.
1345
1346This function was originally called only from the second pass, in which we know
1347that if (?< or (?' or (?P< is encountered, the name will be correctly
1348terminated because that is checked in the first pass. There is now one call to
1349this function in the first pass, to check for a recursive back reference by
1350name (so that we can make the whole group atomic). In this case, we need check
1351only up to the current position in the pattern, and that is still OK because
any previous occurrences will have been checked. To make this work, the test
1353for "end of pattern" is a check against cd->end_pattern in the main loop,
1354instead of looking for a binary zero. This means that the special first-pass
1355call can adjust cd->end_pattern temporarily. (Checks for binary zero while
1356processing items within the loop are OK, because afterwards the main loop will
1357terminate.)
1358
1359Arguments:
1360  ptrptr       address of the current character pointer (updated)
1361  cd           compile background data
1362  name         name to seek, or NULL if seeking a numbered subpattern
1363  lorn         name length, or subpattern number if name is NULL
1364  xmode        TRUE if we are in /x mode
1365  utf          TRUE if we are in UTF-8 / UTF-16 mode
1366  count        pointer to the current capturing subpattern number (updated)
1367
1368Returns:       the number of the named subpattern, or -1 if not found
1369*/
1370
1371static int
1372find_parens_sub(pcre_uchar **ptrptr, compile_data *cd, const pcre_uchar *name, int lorn,
1373  BOOL xmode, BOOL utf, int *count)
1374{
1375pcre_uchar *ptr = *ptrptr;
1376int start_count = *count;
1377int hwm_count = start_count;
1378BOOL dup_parens = FALSE;
1379
1380/* If the first character is a parenthesis, check on the type of group we are
1381dealing with. The very first call may not start with a parenthesis. */
1382
1383if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1384  {
1385  /* Handle specials such as (*SKIP) or (*UTF8) etc. */
1386
1387  if (ptr[1] == CHAR_ASTERISK) ptr += 2;
1388
1389  /* Handle a normal, unnamed capturing parenthesis. */
1390
1391  else if (ptr[1] != CHAR_QUESTION_MARK)
1392    {
1393    *count += 1;
1394    if (name == NULL && *count == lorn) return *count;
1395    ptr++;
1396    }
1397
1398  /* All cases now have (? at the start. Remember when we are in a group
1399  where the parenthesis numbers are duplicated. */
1400
1401  else if (ptr[2] == CHAR_VERTICAL_LINE)
1402    {
1403    ptr += 3;
1404    dup_parens = TRUE;
1405    }
1406
1407  /* Handle comments; all characters are allowed until a ket is reached. */
1408
1409  else if (ptr[2] == CHAR_NUMBER_SIGN)
1410    {
1411    for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
1412    goto FAIL_EXIT;
1413    }
1414
1415  /* Handle a condition. If it is an assertion, just carry on so that it
1416  is processed as normal. If not, skip to the closing parenthesis of the
1417  condition (there can't be any nested parens). */
1418
1419  else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1420    {
1421    ptr += 2;
1422    if (ptr[1] != CHAR_QUESTION_MARK)
1423      {
1424      while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1425      if (*ptr != 0) ptr++;
1426      }
1427    }
1428
1429  /* Start with (? but not a condition. */
1430
1431  else
1432    {
1433    ptr += 2;
1434    if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */
1435
1436    /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1437
1438    if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1439        ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1440      {
1441      int term;
1442      const pcre_uchar *thisname;
1443      *count += 1;
1444      if (name == NULL && *count == lorn) return *count;
1445      term = *ptr++;
1446      if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1447      thisname = ptr;
1448      while (*ptr != term) ptr++;
1449      if (name != NULL && lorn == ptr - thisname &&
1450          STRNCMP_UC_UC(name, thisname, lorn) == 0)
1451        return *count;
1452      term++;
1453      }
1454    }
1455  }
1456
1457/* Past any initial parenthesis handling, scan for parentheses or vertical
1458bars. Stop if we get to cd->end_pattern. Note that this is important for the
1459first-pass call when this value is temporarily adjusted to stop at the current
1460position. So DO NOT change this to a test for binary zero. */
1461
1462for (; ptr < cd->end_pattern; ptr++)
1463  {
1464  /* Skip over backslashed characters and also entire \Q...\E */
1465
1466  if (*ptr == CHAR_BACKSLASH)
1467    {
1468    if (*(++ptr) == 0) goto FAIL_EXIT;
1469    if (*ptr == CHAR_Q) for (;;)
1470      {
1471      while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1472      if (*ptr == 0) goto FAIL_EXIT;
1473      if (*(++ptr) == CHAR_E) break;
1474      }
1475    continue;
1476    }
1477
1478  /* Skip over character classes; this logic must be similar to the way they
1479  are handled for real. If the first character is '^', skip it. Also, if the
1480  first few characters (either before or after ^) are \Q\E or \E we skip them
1481  too. This makes for compatibility with Perl. Note the use of STR macros to
1482  encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1483
1484  if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1485    {
1486    BOOL negate_class = FALSE;
1487    for (;;)
1488      {
1489      if (ptr[1] == CHAR_BACKSLASH)
1490        {
1491        if (ptr[2] == CHAR_E)
1492          ptr+= 2;
1493        else if (STRNCMP_UC_C8(ptr + 2,
1494                 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1495          ptr += 4;
1496        else
1497          break;
1498        }
1499      else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1500        {
1501        negate_class = TRUE;
1502        ptr++;
1503        }
1504      else break;
1505      }
1506
1507    /* If the next character is ']', it is a data character that must be
1508    skipped, except in JavaScript compatibility mode. */
1509
1510    if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1511        (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1512      ptr++;
1513
1514    while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1515      {
1516      if (*ptr == 0) return -1;
1517      if (*ptr == CHAR_BACKSLASH)
1518        {
1519        if (*(++ptr) == 0) goto FAIL_EXIT;
1520        if (*ptr == CHAR_Q) for (;;)
1521          {
1522          while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1523          if (*ptr == 0) goto FAIL_EXIT;
1524          if (*(++ptr) == CHAR_E) break;
1525          }
1526        continue;
1527        }
1528      }
1529    continue;
1530    }
1531
1532  /* Skip comments in /x mode */
1533
1534  if (xmode && *ptr == CHAR_NUMBER_SIGN)
1535    {
1536    ptr++;
1537    while (*ptr != 0)
1538      {
1539      if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
1540      ptr++;
1541#ifdef SUPPORT_UTF
1542      if (utf) FORWARDCHAR(ptr);
1543#endif
1544      }
1545    if (*ptr == 0) goto FAIL_EXIT;
1546    continue;
1547    }
1548
1549  /* Check for the special metacharacters */
1550
1551  if (*ptr == CHAR_LEFT_PARENTHESIS)
1552    {
1553    int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, count);
1554    if (rc > 0) return rc;
1555    if (*ptr == 0) goto FAIL_EXIT;
1556    }
1557
1558  else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1559    {
1560    if (dup_parens && *count < hwm_count) *count = hwm_count;
1561    goto FAIL_EXIT;
1562    }
1563
1564  else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1565    {
1566    if (*count > hwm_count) hwm_count = *count;
1567    *count = start_count;
1568    }
1569  }
1570
1571FAIL_EXIT:
1572*ptrptr = ptr;
1573return -1;
1574}
1575
1576
1577
1578
1579/*************************************************
1580*       Find forward referenced subpattern       *
1581*************************************************/
1582
1583/* This function scans along a pattern's text looking for capturing
1584subpatterns, and counting them. If it finds a named pattern that matches the
1585name it is given, it returns its number. Alternatively, if the name is NULL, it
1586returns when it reaches a given numbered subpattern. This is used for forward
1587references to subpatterns. We used to be able to start this scan from the
1588current compiling point, using the current count value from cd->bracount, and
1589do it all in a single loop, but the addition of the possibility of duplicate
1590subpattern numbers means that we have to scan from the very start, in order to
1591take account of such duplicates, and to use a recursive function to keep track
1592of the different types of group.
1593
1594Arguments:
1595  cd           compile background data
1596  name         name to seek, or NULL if seeking a numbered subpattern
1597  lorn         name length, or subpattern number if name is NULL
1598  xmode        TRUE if we are in /x mode
1599  utf          TRUE if we are in UTF-8 / UTF-16 mode
1600
1601Returns:       the number of the found subpattern, or -1 if not found
1602*/
1603
1604static int
1605find_parens(compile_data *cd, const pcre_uchar *name, int lorn, BOOL xmode,
1606  BOOL utf)
1607{
1608pcre_uchar *ptr = (pcre_uchar *)cd->start_pattern;
1609int count = 0;
1610int rc;
1611
1612/* If the pattern does not start with an opening parenthesis, the first call
1613to find_parens_sub() will scan right to the end (if necessary). However, if it
1614does start with a parenthesis, find_parens_sub() will return when it hits the
1615matching closing parens. That is why we have to have a loop. */
1616
1617for (;;)
1618  {
1619  rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, &count);
1620  if (rc > 0 || *ptr++ == 0) break;
1621  }
1622
1623return rc;
1624}
1625
1626
1627
1628
1629/*************************************************
1630*      Find first significant op code            *
1631*************************************************/
1632
1633/* This is called by several functions that scan a compiled expression looking
1634for a fixed first character, or an anchoring op code etc. It skips over things
1635that do not influence this. For some calls, it makes sense to skip negative
1636forward and all backward assertions, and also the \b assertion; for others it
1637does not.
1638
1639Arguments:
1640  code         pointer to the start of the group
1641  skipassert   TRUE if certain assertions are to be skipped
1642
1643Returns:       pointer to the first significant opcode
1644*/
1645
1646static const pcre_uchar*
1647first_significant_code(const pcre_uchar *code, BOOL skipassert)
1648{
1649for (;;)
1650  {
1651  switch ((int)*code)
1652    {
1653    case OP_ASSERT_NOT:
1654    case OP_ASSERTBACK:
1655    case OP_ASSERTBACK_NOT:
1656    if (!skipassert) return code;
1657    do code += GET(code, 1); while (*code == OP_ALT);
1658    code += PRIV(OP_lengths)[*code];
1659    break;
1660
1661    case OP_WORD_BOUNDARY:
1662    case OP_NOT_WORD_BOUNDARY:
1663    if (!skipassert) return code;
1664    /* Fall through */
1665
1666    case OP_CALLOUT:
1667    case OP_CREF:
1668    case OP_NCREF:
1669    case OP_RREF:
1670    case OP_NRREF:
1671    case OP_DEF:
1672    code += PRIV(OP_lengths)[*code];
1673    break;
1674
1675    default:
1676    return code;
1677    }
1678  }
1679/* Control never reaches here */
1680}
1681
1682
1683
1684
1685/*************************************************
1686*        Find the fixed length of a branch       *
1687*************************************************/
1688
1689/* Scan a branch and compute the fixed length of subject that will match it,
1690if the length is fixed. This is needed for dealing with backward assertions.
1691In UTF8 mode, the result is in characters rather than bytes. The branch is
1692temporarily terminated with OP_END when this function is called.
1693
1694This function is called when a backward assertion is encountered, so that if it
1695fails, the error message can point to the correct place in the pattern.
1696However, we cannot do this when the assertion contains subroutine calls,
1697because they can be forward references. We solve this by remembering this case
1698and doing the check at the end; a flag specifies which mode we are running in.
1699
1700Arguments:
1701  code     points to the start of the pattern (the bracket)
1702  utf      TRUE in UTF-8 / UTF-16 mode
1703  atend    TRUE if called when the pattern is complete
1704  cd       the "compile data" structure
1705
1706Returns:   the fixed length,
1707             or -1 if there is no fixed length,
1708             or -2 if \C was encountered (in UTF-8 mode only)
1709             or -3 if an OP_RECURSE item was encountered and atend is FALSE
1710             or -4 if an unknown opcode was encountered (internal error)
1711*/
1712
1713static int
1714find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
1715{
1716int length = -1;
1717
1718register int branchlength = 0;
1719register pcre_uchar *cc = code + 1 + LINK_SIZE;
1720
1721/* Scan along the opcodes for this branch. If we get to the end of the
1722branch, check the length against that of the other branches. */
1723
1724for (;;)
1725  {
1726  int d;
1727  pcre_uchar *ce, *cs;
1728  register int op = *cc;
1729
1730  switch (op)
1731    {
1732    /* We only need to continue for OP_CBRA (normal capturing bracket) and
1733    OP_BRA (normal non-capturing bracket) because the other variants of these
1734    opcodes are all concerned with unlimited repeated groups, which of course
1735    are not of fixed length. */
1736
1737    case OP_CBRA:
1738    case OP_BRA:
1739    case OP_ONCE:
1740    case OP_ONCE_NC:
1741    case OP_COND:
1742    d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
1743    if (d < 0) return d;
1744    branchlength += d;
1745    do cc += GET(cc, 1); while (*cc == OP_ALT);
1746    cc += 1 + LINK_SIZE;
1747    break;
1748
1749    /* Reached end of a branch; if it's a ket it is the end of a nested call.
1750    If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1751    an ALT. If it is END it's the end of the outer call. All can be handled by
1752    the same code. Note that we must not include the OP_KETRxxx opcodes here,
1753    because they all imply an unlimited repeat. */
1754
1755    case OP_ALT:
1756    case OP_KET:
1757    case OP_END:
1758    case OP_ACCEPT:
1759    case OP_ASSERT_ACCEPT:
1760    if (length < 0) length = branchlength;
1761      else if (length != branchlength) return -1;
1762    if (*cc != OP_ALT) return length;
1763    cc += 1 + LINK_SIZE;
1764    branchlength = 0;
1765    break;
1766
1767    /* A true recursion implies not fixed length, but a subroutine call may
1768    be OK. If the subroutine is a forward reference, we can't deal with
1769    it until the end of the pattern, so return -3. */
1770
1771    case OP_RECURSE:
1772    if (!atend) return -3;
1773    cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1774    do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
1775    if (cc > cs && cc < ce) return -1;                    /* Recursion */
1776    d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd);
1777    if (d < 0) return d;
1778    branchlength += d;
1779    cc += 1 + LINK_SIZE;
1780    break;
1781
1782    /* Skip over assertive subpatterns */
1783
1784    case OP_ASSERT:
1785    case OP_ASSERT_NOT:
1786    case OP_ASSERTBACK:
1787    case OP_ASSERTBACK_NOT:
1788    do cc += GET(cc, 1); while (*cc == OP_ALT);
1789    cc += PRIV(OP_lengths)[*cc];
1790    break;
1791
1792    /* Skip over things that don't match chars */
1793
1794    case OP_MARK:
1795    case OP_PRUNE_ARG:
1796    case OP_SKIP_ARG:
1797    case OP_THEN_ARG:
1798    cc += cc[1] + PRIV(OP_lengths)[*cc];
1799    break;
1800
1801    case OP_CALLOUT:
1802    case OP_CIRC:
1803    case OP_CIRCM:
1804    case OP_CLOSE:
1805    case OP_COMMIT:
1806    case OP_CREF:
1807    case OP_DEF:
1808    case OP_DOLL:
1809    case OP_DOLLM:
1810    case OP_EOD:
1811    case OP_EODN:
1812    case OP_FAIL:
1813    case OP_NCREF:
1814    case OP_NRREF:
1815    case OP_NOT_WORD_BOUNDARY:
1816    case OP_PRUNE:
1817    case OP_REVERSE:
1818    case OP_RREF:
1819    case OP_SET_SOM:
1820    case OP_SKIP:
1821    case OP_SOD:
1822    case OP_SOM:
1823    case OP_THEN:
1824    case OP_WORD_BOUNDARY:
1825    cc += PRIV(OP_lengths)[*cc];
1826    break;
1827
1828    /* Handle literal characters */
1829
1830    case OP_CHAR:
1831    case OP_CHARI:
1832    case OP_NOT:
1833    case OP_NOTI:
1834    branchlength++;
1835    cc += 2;
1836#ifdef SUPPORT_UTF
1837    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1838#endif
1839    break;
1840
1841    /* Handle exact repetitions. The count is already in characters, but we
1842    need to skip over a multibyte character in UTF8 mode.  */
1843
1844    case OP_EXACT:
1845    case OP_EXACTI:
1846    case OP_NOTEXACT:
1847    case OP_NOTEXACTI:
1848    branchlength += GET2(cc,1);
1849    cc += 2 + IMM2_SIZE;
1850#ifdef SUPPORT_UTF
1851    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1852#endif
1853    break;
1854
1855    case OP_TYPEEXACT:
1856    branchlength += GET2(cc,1);
1857    if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
1858    cc += 1 + IMM2_SIZE + 1;
1859    break;
1860
1861    /* Handle single-char matchers */
1862
1863    case OP_PROP:
1864    case OP_NOTPROP:
1865    cc += 2;
1866    /* Fall through */
1867
1868    case OP_HSPACE:
1869    case OP_VSPACE:
1870    case OP_NOT_HSPACE:
1871    case OP_NOT_VSPACE:
1872    case OP_NOT_DIGIT:
1873    case OP_DIGIT:
1874    case OP_NOT_WHITESPACE:
1875    case OP_WHITESPACE:
1876    case OP_NOT_WORDCHAR:
1877    case OP_WORDCHAR:
1878    case OP_ANY:
1879    case OP_ALLANY:
1880    branchlength++;
1881    cc++;
1882    break;
1883
1884    /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1885    otherwise \C is coded as OP_ALLANY. */
1886
1887    case OP_ANYBYTE:
1888    return -2;
1889
1890    /* Check a class for variable quantification */
1891
1892#if defined SUPPORT_UTF || defined COMPILE_PCRE16
1893    case OP_XCLASS:
1894    cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS];
1895    /* Fall through */
1896#endif
1897
1898    case OP_CLASS:
1899    case OP_NCLASS:
1900    cc += PRIV(OP_lengths)[OP_CLASS];
1901
1902    switch (*cc)
1903      {
1904      case OP_CRPLUS:
1905      case OP_CRMINPLUS:
1906      case OP_CRSTAR:
1907      case OP_CRMINSTAR:
1908      case OP_CRQUERY:
1909      case OP_CRMINQUERY:
1910      return -1;
1911
1912      case OP_CRRANGE:
1913      case OP_CRMINRANGE:
1914      if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1915      branchlength += GET2(cc,1);
1916      cc += 1 + 2 * IMM2_SIZE;
1917      break;
1918
1919      default:
1920      branchlength++;
1921      }
1922    break;
1923
1924    /* Anything else is variable length */
1925
1926    case OP_ANYNL:
1927    case OP_BRAMINZERO:
1928    case OP_BRAPOS:
1929    case OP_BRAPOSZERO:
1930    case OP_BRAZERO:
1931    case OP_CBRAPOS:
1932    case OP_EXTUNI:
1933    case OP_KETRMAX:
1934    case OP_KETRMIN:
1935    case OP_KETRPOS:
1936    case OP_MINPLUS:
1937    case OP_MINPLUSI:
1938    case OP_MINQUERY:
1939    case OP_MINQUERYI:
1940    case OP_MINSTAR:
1941    case OP_MINSTARI:
1942    case OP_MINUPTO:
1943    case OP_MINUPTOI:
1944    case OP_NOTMINPLUS:
1945    case OP_NOTMINPLUSI:
1946    case OP_NOTMINQUERY:
1947    case OP_NOTMINQUERYI:
1948    case OP_NOTMINSTAR:
1949    case OP_NOTMINSTARI:
1950    case OP_NOTMINUPTO:
1951    case OP_NOTMINUPTOI:
1952    case OP_NOTPLUS:
1953    case OP_NOTPLUSI:
1954    case OP_NOTPOSPLUS:
1955    case OP_NOTPOSPLUSI:
1956    case OP_NOTPOSQUERY:
1957    case OP_NOTPOSQUERYI:
1958    case OP_NOTPOSSTAR:
1959    case OP_NOTPOSSTARI:
1960    case OP_NOTPOSUPTO:
1961    case OP_NOTPOSUPTOI:
1962    case OP_NOTQUERY:
1963    case OP_NOTQUERYI:
1964    case OP_NOTSTAR:
1965    case OP_NOTSTARI:
1966    case OP_NOTUPTO:
1967    case OP_NOTUPTOI:
1968    case OP_PLUS:
1969    case OP_PLUSI:
1970    case OP_POSPLUS:
1971    case OP_POSPLUSI:
1972    case OP_POSQUERY:
1973    case OP_POSQUERYI:
1974    case OP_POSSTAR:
1975    case OP_POSSTARI:
1976    case OP_POSUPTO:
1977    case OP_POSUPTOI:
1978    case OP_QUERY:
1979    case OP_QUERYI:
1980    case OP_REF:
1981    case OP_REFI:
1982    case OP_SBRA:
1983    case OP_SBRAPOS:
1984    case OP_SCBRA:
1985    case OP_SCBRAPOS:
1986    case OP_SCOND:
1987    case OP_SKIPZERO:
1988    case OP_STAR:
1989    case OP_STARI:
1990    case OP_TYPEMINPLUS:
1991    case OP_TYPEMINQUERY:
1992    case OP_TYPEMINSTAR:
1993    case OP_TYPEMINUPTO:
1994    case OP_TYPEPLUS:
1995    case OP_TYPEPOSPLUS:
1996    case OP_TYPEPOSQUERY:
1997    case OP_TYPEPOSSTAR:
1998    case OP_TYPEPOSUPTO:
1999    case OP_TYPEQUERY:
2000    case OP_TYPESTAR:
2001    case OP_TYPEUPTO:
2002    case OP_UPTO:
2003    case OP_UPTOI:
2004    return -1;
2005
2006    /* Catch unrecognized opcodes so that when new ones are added they
2007    are not forgotten, as has happened in the past. */
2008
2009    default:
2010    return -4;
2011    }
2012  }
2013/* Control never gets here */
2014}
2015
2016
2017
2018
2019/*************************************************
2020*    Scan compiled regex for specific bracket    *
2021*************************************************/
2022
2023/* This little function scans through a compiled pattern until it finds a
2024capturing bracket with the given number, or, if the number is negative, an
2025instance of OP_REVERSE for a lookbehind. The function is global in the C sense
2026so that it can be called from pcre_study() when finding the minimum matching
2027length.
2028
2029Arguments:
2030  code        points to start of expression
2031  utf         TRUE in UTF-8 / UTF-16 mode
2032  number      the required bracket number or negative to find a lookbehind
2033
2034Returns:      pointer to the opcode for the bracket, or NULL if not found
2035*/
2036
2037const pcre_uchar *
2038PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
2039{
2040for (;;)
2041  {
2042  register int c = *code;
2043
2044  if (c == OP_END) return NULL;
2045
2046  /* XCLASS is used for classes that cannot be represented just by a bit
2047  map. This includes negated single high-valued characters. The length in
2048  the table is zero; the actual length is stored in the compiled code. */
2049
2050  if (c == OP_XCLASS) code += GET(code, 1);
2051
2052  /* Handle recursion */
2053
2054  else if (c == OP_REVERSE)
2055    {
2056    if (number < 0) return (pcre_uchar *)code;
2057    code += PRIV(OP_lengths)[c];
2058    }
2059
2060  /* Handle capturing bracket */
2061
2062  else if (c == OP_CBRA || c == OP_SCBRA ||
2063           c == OP_CBRAPOS || c == OP_SCBRAPOS)
2064    {
2065    int n = GET2(code, 1+LINK_SIZE);
2066    if (n == number) return (pcre_uchar *)code;
2067    code += PRIV(OP_lengths)[c];
2068    }
2069
2070  /* Otherwise, we can get the item's length from the table, except that for
2071  repeated character types, we have to test for \p and \P, which have an extra
2072  two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2073  must add in its length. */
2074
2075  else
2076    {
2077    switch(c)
2078      {
2079      case OP_TYPESTAR:
2080      case OP_TYPEMINSTAR:
2081      case OP_TYPEPLUS:
2082      case OP_TYPEMINPLUS:
2083      case OP_TYPEQUERY:
2084      case OP_TYPEMINQUERY:
2085      case OP_TYPEPOSSTAR:
2086      case OP_TYPEPOSPLUS:
2087      case OP_TYPEPOSQUERY:
2088      if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2089      break;
2090
2091      case OP_TYPEUPTO:
2092      case OP_TYPEMINUPTO:
2093      case OP_TYPEEXACT:
2094      case OP_TYPEPOSUPTO:
2095      if (code[1 + IMM2_SIZE] == OP_PROP
2096        || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2097      break;
2098
2099      case OP_MARK:
2100      case OP_PRUNE_ARG:
2101      case OP_SKIP_ARG:
2102      code += code[1];
2103      break;
2104
2105      case OP_THEN_ARG:
2106      code += code[1];
2107      break;
2108      }
2109
2110    /* Add in the fixed length from the table */
2111
2112    code += PRIV(OP_lengths)[c];
2113
2114  /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2115  a multi-byte character. The length in the table is a minimum, so we have to
2116  arrange to skip the extra bytes. */
2117
2118#ifdef SUPPORT_UTF
2119    if (utf) switch(c)
2120      {
2121      case OP_CHAR:
2122      case OP_CHARI:
2123      case OP_EXACT:
2124      case OP_EXACTI:
2125      case OP_UPTO:
2126      case OP_UPTOI:
2127      case OP_MINUPTO:
2128      case OP_MINUPTOI:
2129      case OP_POSUPTO:
2130      case OP_POSUPTOI:
2131      case OP_STAR:
2132      case OP_STARI:
2133      case OP_MINSTAR:
2134      case OP_MINSTARI:
2135      case OP_POSSTAR:
2136      case OP_POSSTARI:
2137      case OP_PLUS:
2138      case OP_PLUSI:
2139      case OP_MINPLUS:
2140      case OP_MINPLUSI:
2141      case OP_POSPLUS:
2142      case OP_POSPLUSI:
2143      case OP_QUERY:
2144      case OP_QUERYI:
2145      case OP_MINQUERY:
2146      case OP_MINQUERYI:
2147      case OP_POSQUERY:
2148      case OP_POSQUERYI:
2149      if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2150      break;
2151      }
2152#else
2153    (void)(utf);  /* Keep compiler happy by referencing function argument */
2154#endif
2155    }
2156  }
2157}
2158
2159
2160
2161/*************************************************
2162*   Scan compiled regex for recursion reference  *
2163*************************************************/
2164
2165/* This little function scans through a compiled pattern until it finds an
2166instance of OP_RECURSE.
2167
2168Arguments:
2169  code        points to start of expression
2170  utf         TRUE in UTF-8 / UTF-16 mode
2171
2172Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
2173*/
2174
2175static const pcre_uchar *
2176find_recurse(const pcre_uchar *code, BOOL utf)
2177{
2178for (;;)
2179  {
2180  register int c = *code;
2181  if (c == OP_END) return NULL;
2182  if (c == OP_RECURSE) return code;
2183
2184  /* XCLASS is used for classes that cannot be represented just by a bit
2185  map. This includes negated single high-valued characters. The length in
2186  the table is zero; the actual length is stored in the compiled code. */
2187
2188  if (c == OP_XCLASS) code += GET(code, 1);
2189
2190  /* Otherwise, we can get the item's length from the table, except that for
2191  repeated character types, we have to test for \p and \P, which have an extra
2192  two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2193  must add in its length. */
2194
2195  else
2196    {
2197    switch(c)
2198      {
2199      case OP_TYPESTAR:
2200      case OP_TYPEMINSTAR:
2201      case OP_TYPEPLUS:
2202      case OP_TYPEMINPLUS:
2203      case OP_TYPEQUERY:
2204      case OP_TYPEMINQUERY:
2205      case OP_TYPEPOSSTAR:
2206      case OP_TYPEPOSPLUS:
2207      case OP_TYPEPOSQUERY:
2208      if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2209      break;
2210
2211      case OP_TYPEPOSUPTO:
2212      case OP_TYPEUPTO:
2213      case OP_TYPEMINUPTO:
2214      case OP_TYPEEXACT:
2215      if (code[1 + IMM2_SIZE] == OP_PROP
2216        || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2217      break;
2218
2219      case OP_MARK:
2220      case OP_PRUNE_ARG:
2221      case OP_SKIP_ARG:
2222      code += code[1];
2223      break;
2224
2225      case OP_THEN_ARG:
2226      code += code[1];
2227      break;
2228      }
2229
2230    /* Add in the fixed length from the table */
2231
2232    code += PRIV(OP_lengths)[c];
2233
2234    /* In UTF-8 mode, opcodes that are followed by a character may be followed
2235    by a multi-byte character. The length in the table is a minimum, so we have
2236    to arrange to skip the extra bytes. */
2237
2238#ifdef SUPPORT_UTF
2239    if (utf) switch(c)
2240      {
2241      case OP_CHAR:
2242      case OP_CHARI:
2243      case OP_NOT:
2244      case OP_NOTI:
2245      case OP_EXACT:
2246      case OP_EXACTI:
2247      case OP_NOTEXACT:
2248      case OP_NOTEXACTI:
2249      case OP_UPTO:
2250      case OP_UPTOI:
2251      case OP_NOTUPTO:
2252      case OP_NOTUPTOI:
2253      case OP_MINUPTO:
2254      case OP_MINUPTOI:
2255      case OP_NOTMINUPTO:
2256      case OP_NOTMINUPTOI:
2257      case OP_POSUPTO:
2258      case OP_POSUPTOI:
2259      case OP_NOTPOSUPTO:
2260      case OP_NOTPOSUPTOI:
2261      case OP_STAR:
2262      case OP_STARI:
2263      case OP_NOTSTAR:
2264      case OP_NOTSTARI:
2265      case OP_MINSTAR:
2266      case OP_MINSTARI:
2267      case OP_NOTMINSTAR:
2268      case OP_NOTMINSTARI:
2269      case OP_POSSTAR:
2270      case OP_POSSTARI:
2271      case OP_NOTPOSSTAR:
2272      case OP_NOTPOSSTARI:
2273      case OP_PLUS:
2274      case OP_PLUSI:
2275      case OP_NOTPLUS:
2276      case OP_NOTPLUSI:
2277      case OP_MINPLUS:
2278      case OP_MINPLUSI:
2279      case OP_NOTMINPLUS:
2280      case OP_NOTMINPLUSI:
2281      case OP_POSPLUS:
2282      case OP_POSPLUSI:
2283      case OP_NOTPOSPLUS:
2284      case OP_NOTPOSPLUSI:
2285      case OP_QUERY:
2286      case OP_QUERYI:
2287      case OP_NOTQUERY:
2288      case OP_NOTQUERYI:
2289      case OP_MINQUERY:
2290      case OP_MINQUERYI:
2291      case OP_NOTMINQUERY:
2292      case OP_NOTMINQUERYI:
2293      case OP_POSQUERY:
2294      case OP_POSQUERYI:
2295      case OP_NOTPOSQUERY:
2296      case OP_NOTPOSQUERYI:
2297      if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2298      break;
2299      }
2300#else
2301    (void)(utf);  /* Keep compiler happy by referencing function argument */
2302#endif
2303    }
2304  }
2305}
2306
2307
2308
2309/*************************************************
2310*    Scan compiled branch for non-emptiness      *
2311*************************************************/
2312
2313/* This function scans through a branch of a compiled pattern to see whether it
2314can match the empty string or not. It is called from could_be_empty()
2315below and from compile_branch() when checking for an unlimited repeat of a
2316group that can match nothing. Note that first_significant_code() skips over
2317backward and negative forward assertions when its final argument is TRUE. If we
2318hit an unclosed bracket, we return "empty" - this means we've struck an inner
2319bracket whose current branch will already have been scanned.
2320
2321Arguments:
2322  code        points to start of search
2323  endcode     points to where to stop
2324  utf         TRUE if in UTF-8 / UTF-16 mode
2325  cd          contains pointers to tables etc.
2326
2327Returns:      TRUE if what is matched could be empty
2328*/
2329
2330static BOOL
2331could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2332  BOOL utf, compile_data *cd)
2333{
2334register int c;
2335for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2336     code < endcode;
2337     code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2338  {
2339  const pcre_uchar *ccode;
2340
2341  c = *code;
2342
2343  /* Skip over forward assertions; the other assertions are skipped by
2344  first_significant_code() with a TRUE final argument. */
2345
2346  if (c == OP_ASSERT)
2347    {
2348    do code += GET(code, 1); while (*code == OP_ALT);
2349    c = *code;
2350    continue;
2351    }
2352
2353  /* For a recursion/subroutine call, if its end has been reached, which
2354  implies a backward reference subroutine call, we can scan it. If it's a
2355  forward reference subroutine call, we can't. To detect forward reference
2356  we have to scan up the list that is kept in the workspace. This function is
2357  called only when doing the real compile, not during the pre-compile that
2358  measures the size of the compiled pattern. */
2359
2360  if (c == OP_RECURSE)
2361    {
2362    const pcre_uchar *scode;
2363    BOOL empty_branch;
2364
2365    /* Test for forward reference */
2366
2367    for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE)
2368      if (GET(scode, 0) == code + 1 - cd->start_code) return TRUE;
2369
2370    /* Not a forward reference, test for completed backward reference */
2371
2372    empty_branch = FALSE;
2373    scode = cd->start_code + GET(code, 1);
2374    if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2375
2376    /* Completed backwards reference */
2377
2378    do
2379      {
2380      if (could_be_empty_branch(scode, endcode, utf, cd))
2381        {
2382        empty_branch = TRUE;
2383        break;
2384        }
2385      scode += GET(scode, 1);
2386      }
2387    while (*scode == OP_ALT);
2388
2389    if (!empty_branch) return FALSE;  /* All branches are non-empty */
2390    continue;
2391    }
2392
2393  /* Groups with zero repeats can of course be empty; skip them. */
2394
2395  if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2396      c == OP_BRAPOSZERO)
2397    {
2398    code += PRIV(OP_lengths)[c];
2399    do code += GET(code, 1); while (*code == OP_ALT);
2400    c = *code;
2401    continue;
2402    }
2403
2404  /* A nested group that is already marked as "could be empty" can just be
2405  skipped. */
2406
2407  if (c == OP_SBRA  || c == OP_SBRAPOS ||
2408      c == OP_SCBRA || c == OP_SCBRAPOS)
2409    {
2410    do code += GET(code, 1); while (*code == OP_ALT);
2411    c = *code;
2412    continue;
2413    }
2414
2415  /* For other groups, scan the branches. */
2416
2417  if (c == OP_BRA  || c == OP_BRAPOS ||
2418      c == OP_CBRA || c == OP_CBRAPOS ||
2419      c == OP_ONCE || c == OP_ONCE_NC ||
2420      c == OP_COND)
2421    {
2422    BOOL empty_branch;
2423    if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
2424
2425    /* If a conditional group has only one branch, there is a second, implied,
2426    empty branch, so just skip over the conditional, because it could be empty.
2427    Otherwise, scan the individual branches of the group. */
2428
2429    if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2430      code += GET(code, 1);
2431    else
2432      {
2433      empty_branch = FALSE;
2434      do
2435        {
2436        if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd))
2437          empty_branch = TRUE;
2438        code += GET(code, 1);
2439        }
2440      while (*code == OP_ALT);
2441      if (!empty_branch) return FALSE;   /* All branches are non-empty */
2442      }
2443
2444    c = *code;
2445    continue;
2446    }
2447
2448  /* Handle the other opcodes */
2449
2450  switch (c)
2451    {
2452    /* Check for quantifiers after a class. XCLASS is used for classes that
2453    cannot be represented just by a bit map. This includes negated single
2454    high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2455    actual length is stored in the compiled code, so we must update "code"
2456    here. */
2457
2458#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2459    case OP_XCLASS:
2460    ccode = code += GET(code, 1);
2461    goto CHECK_CLASS_REPEAT;
2462#endif
2463
2464    case OP_CLASS:
2465    case OP_NCLASS:
2466    ccode = code + PRIV(OP_lengths)[OP_CLASS];
2467
2468#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2469    CHECK_CLASS_REPEAT:
2470#endif
2471
2472    switch (*ccode)
2473      {
2474      case OP_CRSTAR:            /* These could be empty; continue */
2475      case OP_CRMINSTAR:
2476      case OP_CRQUERY:
2477      case OP_CRMINQUERY:
2478      break;
2479
2480      default:                   /* Non-repeat => class must match */
2481      case OP_CRPLUS:            /* These repeats aren't empty */
2482      case OP_CRMINPLUS:
2483      return FALSE;
2484
2485      case OP_CRRANGE:
2486      case OP_CRMINRANGE:
2487      if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
2488      break;
2489      }
2490    break;
2491
2492    /* Opcodes that must match a character */
2493
2494    case OP_PROP:
2495    case OP_NOTPROP:
2496    case OP_EXTUNI:
2497    case OP_NOT_DIGIT:
2498    case OP_DIGIT:
2499    case OP_NOT_WHITESPACE:
2500    case OP_WHITESPACE:
2501    case OP_NOT_WORDCHAR:
2502    case OP_WORDCHAR:
2503    case OP_ANY:
2504    case OP_ALLANY:
2505    case OP_ANYBYTE:
2506    case OP_CHAR:
2507    case OP_CHARI:
2508    case OP_NOT:
2509    case OP_NOTI:
2510    case OP_PLUS:
2511    case OP_MINPLUS:
2512    case OP_POSPLUS:
2513    case OP_EXACT:
2514    case OP_NOTPLUS:
2515    case OP_NOTMINPLUS:
2516    case OP_NOTPOSPLUS:
2517    case OP_NOTEXACT:
2518    case OP_TYPEPLUS:
2519    case OP_TYPEMINPLUS:
2520    case OP_TYPEPOSPLUS:
2521    case OP_TYPEEXACT:
2522    return FALSE;
2523
2524    /* These are going to continue, as they may be empty, but we have to
2525    fudge the length for the \p and \P cases. */
2526
2527    case OP_TYPESTAR:
2528    case OP_TYPEMINSTAR:
2529    case OP_TYPEPOSSTAR:
2530    case OP_TYPEQUERY:
2531    case OP_TYPEMINQUERY:
2532    case OP_TYPEPOSQUERY:
2533    if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2534    break;
2535
2536    /* Same for these */
2537
2538    case OP_TYPEUPTO:
2539    case OP_TYPEMINUPTO:
2540    case OP_TYPEPOSUPTO:
2541    if (code[1 + IMM2_SIZE] == OP_PROP
2542      || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2543    break;
2544
2545    /* End of branch */
2546
2547    case OP_KET:
2548    case OP_KETRMAX:
2549    case OP_KETRMIN:
2550    case OP_KETRPOS:
2551    case OP_ALT:
2552    return TRUE;
2553
2554    /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2555    MINUPTO, and POSUPTO may be followed by a multibyte character */
2556
2557#ifdef SUPPORT_UTF
2558    case OP_STAR:
2559    case OP_STARI:
2560    case OP_MINSTAR:
2561    case OP_MINSTARI:
2562    case OP_POSSTAR:
2563    case OP_POSSTARI:
2564    case OP_QUERY:
2565    case OP_QUERYI:
2566    case OP_MINQUERY:
2567    case OP_MINQUERYI:
2568    case OP_POSQUERY:
2569    case OP_POSQUERYI:
2570    if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2571    break;
2572
2573    case OP_UPTO:
2574    case OP_UPTOI:
2575    case OP_MINUPTO:
2576    case OP_MINUPTOI:
2577    case OP_POSUPTO:
2578    case OP_POSUPTOI:
2579    if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2580    break;
2581#endif
2582
2583    /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2584    string. */
2585
2586    case OP_MARK:
2587    case OP_PRUNE_ARG:
2588    case OP_SKIP_ARG:
2589    code += code[1];
2590    break;
2591
2592    case OP_THEN_ARG:
2593    code += code[1];
2594    break;
2595
2596    /* None of the remaining opcodes are required to match a character. */
2597
2598    default:
2599    break;
2600    }
2601  }
2602
2603return TRUE;
2604}
2605
2606
2607
2608/*************************************************
2609*    Scan compiled regex for non-emptiness       *
2610*************************************************/
2611
2612/* This function is called to check for left recursive calls. We want to check
2613the current branch of the current pattern to see if it could match the empty
2614string. If it could, we must look outwards for branches at other levels,
2615stopping when we pass beyond the bracket which is the subject of the recursion.
2616This function is called only during the real compile, not during the
2617pre-compile.
2618
2619Arguments:
2620  code        points to start of the recursion
2621  endcode     points to where to stop (current RECURSE item)
2622  bcptr       points to the chain of current (unclosed) branch starts
2623  utf         TRUE if in UTF-8 / UTF-16 mode
2624  cd          pointers to tables etc
2625
2626Returns:      TRUE if what is matched could be empty
2627*/
2628
2629static BOOL
2630could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2631  branch_chain *bcptr, BOOL utf, compile_data *cd)
2632{
2633while (bcptr != NULL && bcptr->current_branch >= code)
2634  {
2635  if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd))
2636    return FALSE;
2637  bcptr = bcptr->outer;
2638  }
2639return TRUE;
2640}
2641
2642
2643
2644/*************************************************
2645*           Check for POSIX class syntax         *
2646*************************************************/
2647
2648/* This function is called when the sequence "[:" or "[." or "[=" is
2649encountered in a character class. It checks whether this is followed by a
2650sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2651reach an unescaped ']' without the special preceding character, return FALSE.
2652
2653Originally, this function only recognized a sequence of letters between the
2654terminators, but it seems that Perl recognizes any sequence of characters,
2655though of course unknown POSIX names are subsequently rejected. Perl gives an
2656"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2657didn't consider this to be a POSIX class. Likewise for [:1234:].
2658
2659The problem in trying to be exactly like Perl is in the handling of escapes. We
2660have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2661class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2662below handles the special case of \], but does not try to do any other escape
2663processing. This makes it different from Perl for cases such as [:l\ower:]
where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
"l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
I think.
2667
2668A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2669It seems that the appearance of a nested POSIX class supersedes an apparent
2670external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2671a digit.
2672
2673In Perl, unescaped square brackets may also appear as part of class names. For
2674example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2675[:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2676seem right at all. PCRE does not allow closing square brackets in POSIX class
2677names.
2678
2679Arguments:
2680  ptr      pointer to the initial [
2681  endptr   where to return the end pointer
2682
2683Returns:   TRUE or FALSE
2684*/
2685
2686static BOOL
2687check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
2688{
2689int terminator;          /* Don't combine these lines; the Solaris cc */
2690terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
2691for (++ptr; *ptr != 0; ptr++)
2692  {
2693  if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2694    ptr++;
2695  else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2696  else
2697    {
2698    if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2699      {
2700      *endptr = ptr;
2701      return TRUE;
2702      }
2703    if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
2704         (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2705          ptr[1] == CHAR_EQUALS_SIGN) &&
2706        check_posix_syntax(ptr, endptr))
2707      return FALSE;
2708    }
2709  }
2710return FALSE;
2711}
2712
2713
2714
2715
2716/*************************************************
2717*          Check POSIX class name                *
2718*************************************************/
2719
2720/* This function is called to check the name given in a POSIX-style class entry
2721such as [:alnum:].
2722
2723Arguments:
2724  ptr        points to the first letter
2725  len        the length of the name
2726
2727Returns:     a value representing the name, or -1 if unknown
2728*/
2729
2730static int
2731check_posix_name(const pcre_uchar *ptr, int len)
2732{
2733const char *pn = posix_names;
2734register int yield = 0;
2735while (posix_name_lengths[yield] != 0)
2736  {
2737  if (len == posix_name_lengths[yield] &&
2738    STRNCMP_UC_C8(ptr, pn, len) == 0) return yield;
2739  pn += posix_name_lengths[yield] + 1;
2740  yield++;
2741  }
2742return -1;
2743}
2744
2745
2746/*************************************************
2747*    Adjust OP_RECURSE items in repeated group   *
2748*************************************************/
2749
2750/* OP_RECURSE items contain an offset from the start of the regex to the group
2751that is referenced. This means that groups can be replicated for fixed
2752repetition simply by copying (because the recursion is allowed to refer to
2753earlier groups that are outside the current group). However, when a group is
2754optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2755inserted before it, after it has been compiled. This means that any OP_RECURSE
2756items within it that refer to the group itself or any contained groups have to
2757have their offsets adjusted. That one of the jobs of this function. Before it
2758is called, the partially compiled regex must be temporarily terminated with
2759OP_END.
2760
2761This function has been extended with the possibility of forward references for
2762recursions and subroutine calls. It must also check the list of such references
2763for the group we are dealing with. If it finds that one of the recursions in
2764the current group is on this list, it adjusts the offset in the list, not the
2765value in the reference (which is a group number).
2766
2767Arguments:
2768  group      points to the start of the group
2769  adjust     the amount by which the group is to be moved
2770  utf        TRUE in UTF-8 / UTF-16 mode
2771  cd         contains pointers to tables etc.
2772  save_hwm   the hwm forward reference pointer at the start of the group
2773
2774Returns:     nothing
2775*/
2776
2777static void
2778adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
2779  pcre_uchar *save_hwm)
2780{
2781pcre_uchar *ptr = group;
2782
2783while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
2784  {
2785  int offset;
2786  pcre_uchar *hc;
2787
2788  /* See if this recursion is on the forward reference list. If so, adjust the
2789  reference. */
2790
2791  for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2792    {
2793    offset = GET(hc, 0);
2794    if (cd->start_code + offset == ptr + 1)
2795      {
2796      PUT(hc, 0, offset + adjust);
2797      break;
2798      }
2799    }
2800
2801  /* Otherwise, adjust the recursion offset if it's after the start of this
2802  group. */
2803
2804  if (hc >= cd->hwm)
2805    {
2806    offset = GET(ptr, 1);
2807    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2808    }
2809
2810  ptr += 1 + LINK_SIZE;
2811  }
2812}
2813
2814
2815
2816/*************************************************
2817*        Insert an automatic callout point       *
2818*************************************************/
2819
2820/* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2821callout points before each pattern item.
2822
2823Arguments:
2824  code           current code pointer
2825  ptr            current pattern pointer
2826  cd             pointers to tables etc
2827
2828Returns:         new code pointer
2829*/
2830
2831static pcre_uchar *
2832auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
2833{
2834*code++ = OP_CALLOUT;
2835*code++ = 255;
2836PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
2837PUT(code, LINK_SIZE, 0);                       /* Default length */
2838return code + 2 * LINK_SIZE;
2839}
2840
2841
2842
2843/*************************************************
2844*         Complete a callout item                *
2845*************************************************/
2846
2847/* A callout item contains the length of the next item in the pattern, which
2848we can't fill in till after we have reached the relevant point. This is used
2849for both automatic and manual callouts.
2850
2851Arguments:
2852  previous_callout   points to previous callout item
2853  ptr                current pattern pointer
2854  cd                 pointers to tables etc
2855
2856Returns:             nothing
2857*/
2858
2859static void
2860complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
2861{
2862int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2863PUT(previous_callout, 2 + LINK_SIZE, length);
2864}
2865
2866
2867
2868#ifdef SUPPORT_UCP
2869/*************************************************
2870*           Get othercase range                  *
2871*************************************************/
2872
2873/* This function is passed the start and end of a class range, in UTF-8 mode
2874with UCP support. It searches up the characters, looking for internal ranges of
2875characters in the "other" case. Each call returns the next one, updating the
2876start address.
2877
2878Arguments:
2879  cptr        points to starting character value; updated
2880  d           end value
2881  ocptr       where to put start of othercase range
2882  odptr       where to put end of othercase range
2883
2884Yield:        TRUE when range returned; FALSE when no more
2885*/
2886
2887static BOOL
2888get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2889  unsigned int *odptr)
2890{
2891unsigned int c, othercase, next;
2892
2893for (c = *cptr; c <= d; c++)
2894  { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2895
2896if (c > d) return FALSE;
2897
2898*ocptr = othercase;
2899next = othercase + 1;
2900
2901for (++c; c <= d; c++)
2902  {
2903  if (UCD_OTHERCASE(c) != next) break;
2904  next++;
2905  }
2906
2907*odptr = next - 1;
2908*cptr = c;
2909
2910return TRUE;
2911}
2912
2913
2914
2915/*************************************************
2916*        Check a character and a property        *
2917*************************************************/
2918
2919/* This function is called by check_auto_possessive() when a property item
2920is adjacent to a fixed character.
2921
2922Arguments:
2923  c            the character
2924  ptype        the property type
2925  pdata        the data for the type
2926  negated      TRUE if it's a negated property (\P or \p{^)
2927
2928Returns:       TRUE if auto-possessifying is OK
2929*/
2930
2931static BOOL
2932check_char_prop(int c, int ptype, int pdata, BOOL negated)
2933{
2934const ucd_record *prop = GET_UCD(c);
2935switch(ptype)
2936  {
2937  case PT_LAMP:
2938  return (prop->chartype == ucp_Lu ||
2939          prop->chartype == ucp_Ll ||
2940          prop->chartype == ucp_Lt) == negated;
2941
2942  case PT_GC:
2943  return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
2944
2945  case PT_PC:
2946  return (pdata == prop->chartype) == negated;
2947
2948  case PT_SC:
2949  return (pdata == prop->script) == negated;
2950
2951  /* These are specials */
2952
2953  case PT_ALNUM:
2954  return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2955          PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2956
2957  case PT_SPACE:    /* Perl space */
2958  return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2959          c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2960          == negated;
2961
2962  case PT_PXSPACE:  /* POSIX space */
2963  return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2964          c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2965          c == CHAR_FF || c == CHAR_CR)
2966          == negated;
2967
2968  case PT_WORD:
2969  return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2970          PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2971          c == CHAR_UNDERSCORE) == negated;
2972  }
2973return FALSE;
2974}
2975#endif  /* SUPPORT_UCP */
2976
2977
2978
2979/*************************************************
2980*     Check if auto-possessifying is possible    *
2981*************************************************/
2982
2983/* This function is called for unlimited repeats of certain items, to see
2984whether the next thing could possibly match the repeated item. If not, it makes
2985sense to automatically possessify the repeated item.
2986
2987Arguments:
2988  previous      pointer to the repeated opcode
2989  utf           TRUE in UTF-8 / UTF-16 mode
2990  ptr           next character in pattern
2991  options       options bits
2992  cd            contains pointers to tables etc.
2993
2994Returns:        TRUE if possessifying is wanted
2995*/
2996
2997static BOOL
2998check_auto_possessive(const pcre_uchar *previous, BOOL utf,
2999  const pcre_uchar *ptr, int options, compile_data *cd)
3000{
3001pcre_int32 c, next;
3002int op_code = *previous++;
3003
3004/* Skip whitespace and comments in extended mode */
3005
3006if ((options & PCRE_EXTENDED) != 0)
3007  {
3008  for (;;)
3009    {
3010    while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
3011    if (*ptr == CHAR_NUMBER_SIGN)
3012      {
3013      ptr++;
3014      while (*ptr != 0)
3015        {
3016        if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
3017        ptr++;
3018#ifdef SUPPORT_UTF
3019        if (utf) FORWARDCHAR(ptr);
3020#endif
3021        }
3022      }
3023    else break;
3024    }
3025  }
3026
3027/* If the next item is one that we can handle, get its value. A non-negative
3028value is a character, a negative value is an escape value. */
3029
3030if (*ptr == CHAR_BACKSLASH)
3031  {
3032  int temperrorcode = 0;
3033  next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
3034  if (temperrorcode != 0) return FALSE;
3035  ptr++;    /* Point after the escape sequence */
3036  }
3037else if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_meta) == 0)
3038  {
3039#ifdef SUPPORT_UTF
3040  if (utf) { GETCHARINC(next, ptr); } else
3041#endif
3042  next = *ptr++;
3043  }
3044else return FALSE;
3045
3046/* Skip whitespace and comments in extended mode */
3047
3048if ((options & PCRE_EXTENDED) != 0)
3049  {
3050  for (;;)
3051    {
3052    while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
3053    if (*ptr == CHAR_NUMBER_SIGN)
3054      {
3055      ptr++;
3056      while (*ptr != 0)
3057        {
3058        if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
3059        ptr++;
3060#ifdef SUPPORT_UTF
3061        if (utf) FORWARDCHAR(ptr);
3062#endif
3063        }
3064      }
3065    else break;
3066    }
3067  }
3068
3069/* If the next thing is itself optional, we have to give up. */
3070
3071if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3072  STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3073    return FALSE;
3074
3075/* Now compare the next item with the previous opcode. First, handle cases when
3076the next item is a character. */
3077
3078if (next >= 0) switch(op_code)
3079  {
3080  case OP_CHAR:
3081#ifdef SUPPORT_UTF
3082  GETCHARTEST(c, previous);
3083#else
3084  c = *previous;
3085#endif
3086  return c != next;
3087
3088  /* For CHARI (caseless character) we must check the other case. If we have
3089  Unicode property support, we can use it to test the other case of
3090  high-valued characters. */
3091
3092  case OP_CHARI:
3093#ifdef SUPPORT_UTF
3094  GETCHARTEST(c, previous);
3095#else
3096  c = *previous;
3097#endif
3098  if (c == next) return FALSE;
3099#ifdef SUPPORT_UTF
3100  if (utf)
3101    {
3102    unsigned int othercase;
3103    if (next < 128) othercase = cd->fcc[next]; else
3104#ifdef SUPPORT_UCP
3105    othercase = UCD_OTHERCASE((unsigned int)next);
3106#else
3107    othercase = NOTACHAR;
3108#endif
3109    return (unsigned int)c != othercase;
3110    }
3111  else
3112#endif  /* SUPPORT_UTF */
3113  return (c != TABLE_GET((unsigned int)next, cd->fcc, next));  /* Non-UTF-8 mode */
3114
3115  case OP_NOT:
3116#ifdef SUPPORT_UTF
3117  GETCHARTEST(c, previous);
3118#else
3119  c = *previous;
3120#endif
3121  return c == next;
3122
3123  case OP_NOTI:
3124#ifdef SUPPORT_UTF
3125  GETCHARTEST(c, previous);
3126#else
3127  c = *previous;
3128#endif
3129  if (c == next) return TRUE;
3130#ifdef SUPPORT_UTF
3131  if (utf)
3132    {
3133    unsigned int othercase;
3134    if (next < 128) othercase = cd->fcc[next]; else
3135#ifdef SUPPORT_UCP
3136    othercase = UCD_OTHERCASE((unsigned int)next);
3137#else
3138    othercase = NOTACHAR;
3139#endif
3140    return (unsigned int)c == othercase;
3141    }
3142  else
3143#endif  /* SUPPORT_UTF */
3144  return (c == TABLE_GET((unsigned int)next, cd->fcc, next));  /* Non-UTF-8 mode */
3145
3146  /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
3147  When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3148
3149  case OP_DIGIT:
3150  return next > 255 || (cd->ctypes[next] & ctype_digit) == 0;
3151
3152  case OP_NOT_DIGIT:
3153  return next <= 255 && (cd->ctypes[next] & ctype_digit) != 0;
3154
3155  case OP_WHITESPACE:
3156  return next > 255 || (cd->ctypes[next] & ctype_space) == 0;
3157
3158  case OP_NOT_WHITESPACE:
3159  return next <= 255 && (cd->ctypes[next] & ctype_space) != 0;
3160
3161  case OP_WORDCHAR:
3162  return next > 255 || (cd->ctypes[next] & ctype_word) == 0;
3163
3164  case OP_NOT_WORDCHAR:
3165  return next <= 255 && (cd->ctypes[next] & ctype_word) != 0;
3166
3167  case OP_HSPACE:
3168  case OP_NOT_HSPACE:
3169  switch(next)
3170    {
3171    case 0x09:
3172    case 0x20:
3173    case 0xa0:
3174    case 0x1680:
3175    case 0x180e:
3176    case 0x2000:
3177    case 0x2001:
3178    case 0x2002:
3179    case 0x2003:
3180    case 0x2004:
3181    case 0x2005:
3182    case 0x2006:
3183    case 0x2007:
3184    case 0x2008:
3185    case 0x2009:
3186    case 0x200A:
3187    case 0x202f:
3188    case 0x205f:
3189    case 0x3000:
3190    return op_code == OP_NOT_HSPACE;
3191    default:
3192    return op_code != OP_NOT_HSPACE;
3193    }
3194
3195  case OP_ANYNL:
3196  case OP_VSPACE:
3197  case OP_NOT_VSPACE:
3198  switch(next)
3199    {
3200    case 0x0a:
3201    case 0x0b:
3202    case 0x0c:
3203    case 0x0d:
3204    case 0x85:
3205    case 0x2028:
3206    case 0x2029:
3207    return op_code == OP_NOT_VSPACE;
3208    default:
3209    return op_code != OP_NOT_VSPACE;
3210    }
3211
3212#ifdef SUPPORT_UCP
3213  case OP_PROP:
3214  return check_char_prop(next, previous[0], previous[1], FALSE);
3215
3216  case OP_NOTPROP:
3217  return check_char_prop(next, previous[0], previous[1], TRUE);
3218#endif
3219
3220  default:
3221  return FALSE;
3222  }
3223
3224
3225/* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
3226is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
3227generated only when PCRE_UCP is *not* set, that is, when only ASCII
3228characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
3229replaced by OP_PROP codes when PCRE_UCP is set. */
3230
3231switch(op_code)
3232  {
3233  case OP_CHAR:
3234  case OP_CHARI:
3235#ifdef SUPPORT_UTF
3236  GETCHARTEST(c, previous);
3237#else
3238  c = *previous;
3239#endif
3240  switch(-next)
3241    {
3242    case ESC_d:
3243    return c > 255 || (cd->ctypes[c] & ctype_digit) == 0;
3244
3245    case ESC_D:
3246    return c <= 255 && (cd->ctypes[c] & ctype_digit) != 0;
3247
3248    case ESC_s:
3249    return c > 255 || (cd->ctypes[c] & ctype_space) == 0;
3250
3251    case ESC_S:
3252    return c <= 255 && (cd->ctypes[c] & ctype_space) != 0;
3253
3254    case ESC_w:
3255    return c > 255 || (cd->ctypes[c] & ctype_word) == 0;
3256
3257    case ESC_W:
3258    return c <= 255 && (cd->ctypes[c] & ctype_word) != 0;
3259
3260    case ESC_h:
3261    case ESC_H:
3262    switch(c)
3263      {
3264      case 0x09:
3265      case 0x20:
3266      case 0xa0:
3267      case 0x1680:
3268      case 0x180e:
3269      case 0x2000:
3270      case 0x2001:
3271      case 0x2002:
3272      case 0x2003:
3273      case 0x2004:
3274      case 0x2005:
3275      case 0x2006:
3276      case 0x2007:
3277      case 0x2008:
3278      case 0x2009:
3279      case 0x200A:
3280      case 0x202f:
3281      case 0x205f:
3282      case 0x3000:
3283      return -next != ESC_h;
3284      default:
3285      return -next == ESC_h;
3286      }
3287
3288    case ESC_v:
3289    case ESC_V:
3290    switch(c)
3291      {
3292      case 0x0a:
3293      case 0x0b:
3294      case 0x0c:
3295      case 0x0d:
3296      case 0x85:
3297      case 0x2028:
3298      case 0x2029:
3299      return -next != ESC_v;
3300      default:
3301      return -next == ESC_v;
3302      }
3303
3304    /* When PCRE_UCP is set, these values get generated for \d etc. Find
3305    their substitutions and process them. The result will always be either
3306    -ESC_p or -ESC_P. Then fall through to process those values. */
3307
3308#ifdef SUPPORT_UCP
3309    case ESC_du:
3310    case ESC_DU:
3311    case ESC_wu:
3312    case ESC_WU:
3313    case ESC_su:
3314    case ESC_SU:
3315      {
3316      int temperrorcode = 0;
3317      ptr = substitutes[-next - ESC_DU];
3318      next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
3319      if (temperrorcode != 0) return FALSE;
3320      ptr++;    /* For compatibility */
3321      }
3322    /* Fall through */
3323
3324    case ESC_p:
3325    case ESC_P:
3326      {
3327      int ptype, pdata, errorcodeptr;
3328      BOOL negated;
3329
3330      ptr--;      /* Make ptr point at the p or P */
3331      ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
3332      if (ptype < 0) return FALSE;
3333      ptr++;      /* Point past the final curly ket */
3334
3335      /* If the property item is optional, we have to give up. (When generated
3336      from \d etc by PCRE_UCP, this test will have been applied much earlier,
3337      to the original \d etc. At this point, ptr will point to a zero byte. */
3338
3339      if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3340        STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3341          return FALSE;
3342
3343      /* Do the property check. */
3344
3345      return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
3346      }
3347#endif
3348
3349    default:
3350    return FALSE;
3351    }
3352
3353  /* In principle, support for Unicode properties should be integrated here as
3354  well. It means re-organizing the above code so as to get hold of the property
3355  values before switching on the op-code. However, I wonder how many patterns
3356  combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
3357  these op-codes are never generated.) */
3358
3359  case OP_DIGIT:
3360  return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
3361         next == -ESC_h || next == -ESC_v || next == -ESC_R;
3362
3363  case OP_NOT_DIGIT:
3364  return next == -ESC_d;
3365
3366  case OP_WHITESPACE:
3367  return next == -ESC_S || next == -ESC_d || next == -ESC_w;
3368
3369  case OP_NOT_WHITESPACE:
3370  return next == -ESC_s || next == -ESC_h || next == -ESC_v || next == -ESC_R;
3371
3372  case OP_HSPACE:
3373  return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
3374         next == -ESC_w || next == -ESC_v || next == -ESC_R;
3375
3376  case OP_NOT_HSPACE:
3377  return next == -ESC_h;
3378
3379  /* Can't have \S in here because VT matches \S (Perl anomaly) */
3380  case OP_ANYNL:
3381  case OP_VSPACE:
3382  return next == -ESC_V || next == -ESC_d || next == -ESC_w;
3383
3384  case OP_NOT_VSPACE:
3385  return next == -ESC_v || next == -ESC_R;
3386
3387  case OP_WORDCHAR:
3388  return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
3389         next == -ESC_v || next == -ESC_R;
3390
3391  case OP_NOT_WORDCHAR:
3392  return next == -ESC_w || next == -ESC_d;
3393
3394  default:
3395  return FALSE;
3396  }
3397
3398/* Control does not reach here */
3399}
3400
3401
3402
3403/*************************************************
3404*           Compile one branch                   *
3405*************************************************/
3406
/* Scan the pattern, compiling it into a vector. If the options are
3408changed during the branch, the pointer is used to change the external options
3409bits. This function is used during the pre-compile phase when we are trying
3410to find out the amount of memory needed, as well as during the real compile
3411phase. The value of lengthptr distinguishes the two phases.
3412
3413Arguments:
3414  optionsptr     pointer to the option bits
3415  codeptr        points to the pointer to the current code point
3416  ptrptr         points to the current pattern pointer
3417  errorcodeptr   points to error code variable
3418  firstcharptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
3419  reqcharptr     set to the last literal character required, else < 0
3420  bcptr          points to current branch chain
3421  cond_depth     conditional nesting depth
3422  cd             contains pointers to tables etc.
3423  lengthptr      NULL during the real compile phase
3424                 points to length accumulator during pre-compile phase
3425
3426Returns:         TRUE on success
3427                 FALSE, with *errorcodeptr set non-zero on error
3428*/
3429
3430static BOOL
3431compile_branch(int *optionsptr, pcre_uchar **codeptr,
3432  const pcre_uchar **ptrptr, int *errorcodeptr, pcre_int32 *firstcharptr,
3433  pcre_int32 *reqcharptr, branch_chain *bcptr, int cond_depth,
3434  compile_data *cd, int *lengthptr)
3435{
3436int repeat_type, op_type;
3437int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
3438int bravalue = 0;
3439int greedy_default, greedy_non_default;
3440pcre_int32 firstchar, reqchar;
3441pcre_int32 zeroreqchar, zerofirstchar;
3442pcre_int32 req_caseopt, reqvary, tempreqvary;
3443int options = *optionsptr;               /* May change dynamically */
3444int after_manual_callout = 0;
3445int length_prevgroup = 0;
3446register int c;
3447register pcre_uchar *code = *codeptr;
3448pcre_uchar *last_code = code;
3449pcre_uchar *orig_code = code;
3450pcre_uchar *tempcode;
3451BOOL inescq = FALSE;
3452BOOL groupsetfirstchar = FALSE;
3453const pcre_uchar *ptr = *ptrptr;
3454const pcre_uchar *tempptr;
3455const pcre_uchar *nestptr = NULL;
3456pcre_uchar *previous = NULL;
3457pcre_uchar *previous_callout = NULL;
3458pcre_uchar *save_hwm = NULL;
3459pcre_uint8 classbits[32];
3460
3461/* We can fish out the UTF-8 setting once and for all into a BOOL, but we
3462must not do this for other options (e.g. PCRE_EXTENDED) because they may change
3463dynamically as we process the pattern. */
3464
3465#ifdef SUPPORT_UTF
3466/* PCRE_UTF16 has the same value as PCRE_UTF8. */
3467BOOL utf = (options & PCRE_UTF8) != 0;
3468pcre_uchar utf_chars[6];
3469#else
3470BOOL utf = FALSE;
3471#endif
3472
3473/* Helper variables for OP_XCLASS opcode (for characters > 255). */
3474
3475#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3476BOOL xclass;
3477pcre_uchar *class_uchardata;
3478pcre_uchar *class_uchardata_base;
3479#endif
3480
3481#ifdef PCRE_DEBUG
3482if (lengthptr != NULL) DPRINTF((">> start branch\n"));
3483#endif
3484
3485/* Set up the default and non-default settings for greediness */
3486
3487greedy_default = ((options & PCRE_UNGREEDY) != 0);
3488greedy_non_default = greedy_default ^ 1;
3489
3490/* Initialize no first byte, no required byte. REQ_UNSET means "no char
3491matching encountered yet". It gets changed to REQ_NONE if we hit something that
3492matches a non-fixed char first char; reqchar just remains unset if we never
3493find one.
3494
3495When we hit a repeat whose minimum is zero, we may have to adjust these values
3496to take the zero repeat into account. This is implemented by setting them to
3497zerofirstbyte and zeroreqchar when such a repeat is encountered. The individual
3498item types that can be repeated set these backoff variables appropriately. */
3499
3500firstchar = reqchar = zerofirstchar = zeroreqchar = REQ_UNSET;
3501
3502/* The variable req_caseopt contains either the REQ_CASELESS value
3503or zero, according to the current setting of the caseless flag. The
3504REQ_CASELESS leaves the lower 28 bit empty. It is added into the
3505firstchar or reqchar variables to record the case status of the
3506value. This is used only for ASCII characters. */
3507
3508req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
3509
3510/* Switch on next character until the end of the branch */
3511
3512for (;; ptr++)
3513  {
3514  BOOL negate_class;
3515  BOOL should_flip_negation;
3516  BOOL possessive_quantifier;
3517  BOOL is_quantifier;
3518  BOOL is_recurse;
3519  BOOL reset_bracount;
3520  int class_has_8bitchar;
3521  int class_single_char;
3522  int newoptions;
3523  int recno;
3524  int refsign;
3525  int skipbytes;
3526  int subreqchar;
3527  int subfirstchar;
3528  int terminator;
3529  int mclength;
3530  int tempbracount;
3531  pcre_uchar mcbuffer[8];
3532
3533  /* Get next character in the pattern */
3534
3535  c = *ptr;
3536
3537  /* If we are at the end of a nested substitution, revert to the outer level
3538  string. Nesting only happens one level deep. */
3539
3540  if (c == 0 && nestptr != NULL)
3541    {
3542    ptr = nestptr;
3543    nestptr = NULL;
3544    c = *ptr;
3545    }
3546
3547  /* If we are in the pre-compile phase, accumulate the length used for the
3548  previous cycle of this loop. */
3549
3550  if (lengthptr != NULL)
3551    {
3552#ifdef PCRE_DEBUG
3553    if (code > cd->hwm) cd->hwm = code;                 /* High water info */
3554#endif
3555    if (code > cd->start_workspace + cd->workspace_size -
3556        WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
3557      {
3558      *errorcodeptr = ERR52;
3559      goto FAILED;
3560      }
3561
3562    /* There is at least one situation where code goes backwards: this is the
3563    case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
3564    the class is simply eliminated. However, it is created first, so we have to
3565    allow memory for it. Therefore, don't ever reduce the length at this point.
3566    */
3567
3568    if (code < last_code) code = last_code;
3569
3570    /* Paranoid check for integer overflow */
3571
3572    if (OFLOW_MAX - *lengthptr < code - last_code)
3573      {
3574      *errorcodeptr = ERR20;
3575      goto FAILED;
3576      }
3577
3578    *lengthptr += (int)(code - last_code);
3579    DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
3580      (int)(code - last_code), c, c));
3581
3582    /* If "previous" is set and it is not at the start of the work space, move
3583    it back to there, in order to avoid filling up the work space. Otherwise,
3584    if "previous" is NULL, reset the current code pointer to the start. */
3585
3586    if (previous != NULL)
3587      {
3588      if (previous > orig_code)
3589        {
3590        memmove(orig_code, previous, IN_UCHARS(code - previous));
3591        code -= previous - orig_code;
3592        previous = orig_code;
3593        }
3594      }
3595    else code = orig_code;
3596
3597    /* Remember where this code item starts so we can pick up the length
3598    next time round. */
3599
3600    last_code = code;
3601    }
3602
3603  /* In the real compile phase, just check the workspace used by the forward
3604  reference list. */
3605
3606  else if (cd->hwm > cd->start_workspace + cd->workspace_size -
3607           WORK_SIZE_SAFETY_MARGIN)
3608    {
3609    *errorcodeptr = ERR52;
3610    goto FAILED;
3611    }
3612
3613  /* If in \Q...\E, check for the end; if not, we have a literal */
3614
3615  if (inescq && c != 0)
3616    {
3617    if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3618      {
3619      inescq = FALSE;
3620      ptr++;
3621      continue;
3622      }
3623    else
3624      {
3625      if (previous_callout != NULL)
3626        {
3627        if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
3628          complete_callout(previous_callout, ptr, cd);
3629        previous_callout = NULL;
3630        }
3631      if ((options & PCRE_AUTO_CALLOUT) != 0)
3632        {
3633        previous_callout = code;
3634        code = auto_callout(code, ptr, cd);
3635        }
3636      goto NORMAL_CHAR;
3637      }
3638    }
3639
3640  /* Fill in length of a previous callout, except when the next thing is
3641  a quantifier. */
3642
3643  is_quantifier =
3644    c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
3645    (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
3646
3647  if (!is_quantifier && previous_callout != NULL &&
3648       after_manual_callout-- <= 0)
3649    {
3650    if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
3651      complete_callout(previous_callout, ptr, cd);
3652    previous_callout = NULL;
3653    }
3654
3655  /* In extended mode, skip white space and comments. */
3656
3657  if ((options & PCRE_EXTENDED) != 0)
3658    {
3659    if (MAX_255(*ptr) && (cd->ctypes[c] & ctype_space) != 0) continue;
3660    if (c == CHAR_NUMBER_SIGN)
3661      {
3662      ptr++;
3663      while (*ptr != 0)
3664        {
3665        if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3666        ptr++;
3667#ifdef SUPPORT_UTF
3668        if (utf) FORWARDCHAR(ptr);
3669#endif
3670        }
3671      if (*ptr != 0) continue;
3672
3673      /* Else fall through to handle end of string */
3674      c = 0;
3675      }
3676    }
3677
3678  /* No auto callout for quantifiers. */
3679
3680  if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
3681    {
3682    previous_callout = code;
3683    code = auto_callout(code, ptr, cd);
3684    }
3685
3686  switch(c)
3687    {
3688    /* ===================================================================*/
3689    case 0:                        /* The branch terminates at string end */
3690    case CHAR_VERTICAL_LINE:       /* or | or ) */
3691    case CHAR_RIGHT_PARENTHESIS:
3692    *firstcharptr = firstchar;
3693    *reqcharptr = reqchar;
3694    *codeptr = code;
3695    *ptrptr = ptr;
3696    if (lengthptr != NULL)
3697      {
3698      if (OFLOW_MAX - *lengthptr < code - last_code)
3699        {
3700        *errorcodeptr = ERR20;
3701        goto FAILED;
3702        }
3703      *lengthptr += (int)(code - last_code);   /* To include callout length */
3704      DPRINTF((">> end branch\n"));
3705      }
3706    return TRUE;
3707
3708
3709    /* ===================================================================*/
3710    /* Handle single-character metacharacters. In multiline mode, ^ disables
3711    the setting of any following char as a first character. */
3712
3713    case CHAR_CIRCUMFLEX_ACCENT:
3714    previous = NULL;
3715    if ((options & PCRE_MULTILINE) != 0)
3716      {
3717      if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3718      *code++ = OP_CIRCM;
3719      }
3720    else *code++ = OP_CIRC;
3721    break;
3722
3723    case CHAR_DOLLAR_SIGN:
3724    previous = NULL;
3725    *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
3726    break;
3727
3728    /* There can never be a first char if '.' is first, whatever happens about
3729    repeats. The value of reqchar doesn't change either. */
3730
3731    case CHAR_DOT:
3732    if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3733    zerofirstchar = firstchar;
3734    zeroreqchar = reqchar;
3735    previous = code;
3736    *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3737    break;
3738
3739
3740    /* ===================================================================*/
3741    /* Character classes. If the included characters are all < 256, we build a
3742    32-byte bitmap of the permitted characters, except in the special case
3743    where there is only one such character. For negated classes, we build the
3744    map as usual, then invert it at the end. However, we use a different opcode
3745    so that data characters > 255 can be handled correctly.
3746
3747    If the class contains characters outside the 0-255 range, a different
3748    opcode is compiled. It may optionally have a bit map for characters < 256,
    but those above are explicitly listed afterwards. A flag byte tells
3750    whether the bitmap is present, and whether this is a negated class or not.
3751
3752    In JavaScript compatibility mode, an isolated ']' causes an error. In
3753    default (Perl) mode, it is treated as a data character. */
3754
3755    case CHAR_RIGHT_SQUARE_BRACKET:
3756    if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3757      {
3758      *errorcodeptr = ERR64;
3759      goto FAILED;
3760      }
3761    goto NORMAL_CHAR;
3762
3763    case CHAR_LEFT_SQUARE_BRACKET:
3764    previous = code;
3765
3766    /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3767    they are encountered at the top level, so we'll do that too. */
3768
3769    if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3770         ptr[1] == CHAR_EQUALS_SIGN) &&
3771        check_posix_syntax(ptr, &tempptr))
3772      {
3773      *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
3774      goto FAILED;
3775      }
3776
3777    /* If the first character is '^', set the negation flag and skip it. Also,
3778    if the first few characters (either before or after ^) are \Q\E or \E we
3779    skip them too. This makes for compatibility with Perl. */
3780
3781    negate_class = FALSE;
3782    for (;;)
3783      {
3784      c = *(++ptr);
3785      if (c == CHAR_BACKSLASH)
3786        {
3787        if (ptr[1] == CHAR_E)
3788          ptr++;
3789        else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
3790          ptr += 3;
3791        else
3792          break;
3793        }
3794      else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3795        negate_class = TRUE;
3796      else break;
3797      }
3798
3799    /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
3800    an initial ']' is taken as a data character -- the code below handles
3801    that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
3802    [^] must match any character, so generate OP_ALLANY. */
3803
3804    if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3805        (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3806      {
3807      *code++ = negate_class? OP_ALLANY : OP_FAIL;
3808      if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3809      zerofirstchar = firstchar;
3810      break;
3811      }
3812
3813    /* If a class contains a negative special such as \S, we need to flip the
3814    negation flag at the end, so that support for characters > 255 works
3815    correctly (they are all included in the class). */
3816
3817    should_flip_negation = FALSE;
3818
3819    /* For optimization purposes, we track some properties of the class.
3820    class_has_8bitchar will be non-zero, if the class contains at least one
3821    < 256 character. class_single_char will be 1 if the class contains only
3822    a single character. */
3823
3824    class_has_8bitchar = 0;
3825    class_single_char = 0;
3826
3827    /* Initialize the 32-char bit map to all zeros. We build the map in a
3828    temporary bit of memory, in case the class contains only 1 character (less
3829    than 256), because in that case the compiled code doesn't use the bit map.
3830    */
3831
3832    memset(classbits, 0, 32 * sizeof(pcre_uint8));
3833
3834#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3835    xclass = FALSE;                           /* No chars >= 256 */
3836    class_uchardata = code + LINK_SIZE + 2;   /* For UTF-8 items */
3837    class_uchardata_base = class_uchardata;   /* For resetting in pass 1 */
3838#endif
3839
3840    /* Process characters until ] is reached. By writing this as a "do" it
3841    means that an initial ] is taken as a data character. At the start of the
3842    loop, c contains the first byte of the character. */
3843
3844    if (c != 0) do
3845      {
3846      const pcre_uchar *oldptr;
3847
3848#ifdef SUPPORT_UTF
3849      if (utf && HAS_EXTRALEN(c))
3850        {                           /* Braces are required because the */
3851        GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
3852        }
3853#endif
3854
3855#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3856      /* In the pre-compile phase, accumulate the length of any extra
3857      data and reset the pointer. This is so that very large classes that
3858      contain a zillion > 255 characters no longer overwrite the work space
3859      (which is on the stack). */
3860
3861      if (lengthptr != NULL)
3862        {
3863        *lengthptr += class_uchardata - class_uchardata_base;
3864        class_uchardata = class_uchardata_base;
3865        }
3866#endif
3867
3868      /* Inside \Q...\E everything is literal except \E */
3869
3870      if (inescq)
3871        {
3872        if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
3873          {
3874          inescq = FALSE;                   /* Reset literal state */
3875          ptr++;                            /* Skip the 'E' */
3876          continue;                         /* Carry on with next */
3877          }
3878        goto CHECK_RANGE;                   /* Could be range if \E follows */
3879        }
3880
3881      /* Handle POSIX class names. Perl allows a negation extension of the
3882      form [:^name:]. A square bracket that doesn't match the syntax is
3883      treated as a literal. We also recognize the POSIX constructions
3884      [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3885      5.6 and 5.8 do. */
3886
3887      if (c == CHAR_LEFT_SQUARE_BRACKET &&
3888          (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3889           ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3890        {
3891        BOOL local_negate = FALSE;
3892        int posix_class, taboffset, tabopt;
3893        register const pcre_uint8 *cbits = cd->cbits;
3894        pcre_uint8 pbits[32];
3895
3896        if (ptr[1] != CHAR_COLON)
3897          {
3898          *errorcodeptr = ERR31;
3899          goto FAILED;
3900          }
3901
3902        ptr += 2;
3903        if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3904          {
3905          local_negate = TRUE;
3906          should_flip_negation = TRUE;  /* Note negative special */
3907          ptr++;
3908          }
3909
3910        posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3911        if (posix_class < 0)
3912          {
3913          *errorcodeptr = ERR30;
3914          goto FAILED;
3915          }
3916
3917        /* If matching is caseless, upper and lower are converted to
3918        alpha. This relies on the fact that the class table starts with
3919        alpha, lower, upper as the first 3 entries. */
3920
3921        if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3922          posix_class = 0;
3923
3924        /* When PCRE_UCP is set, some of the POSIX classes are converted to
3925        different escape sequences that use Unicode properties. */
3926
3927#ifdef SUPPORT_UCP
3928        if ((options & PCRE_UCP) != 0)
3929          {
3930          int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
3931          if (posix_substitutes[pc] != NULL)
3932            {
3933            nestptr = tempptr + 1;
3934            ptr = posix_substitutes[pc] - 1;
3935            continue;
3936            }
3937          }
3938#endif
3939        /* In the non-UCP case, we build the bit map for the POSIX class in a
3940        chunk of local store because we may be adding and subtracting from it,
3941        and we don't want to subtract bits that may be in the main map already.
3942        At the end we or the result into the bit map that is being built. */
3943
3944        posix_class *= 3;
3945
3946        /* Copy in the first table (always present) */
3947
3948        memcpy(pbits, cbits + posix_class_maps[posix_class],
3949          32 * sizeof(pcre_uint8));
3950
3951        /* If there is a second table, add or remove it as required. */
3952
3953        taboffset = posix_class_maps[posix_class + 1];
3954        tabopt = posix_class_maps[posix_class + 2];
3955
3956        if (taboffset >= 0)
3957          {
3958          if (tabopt >= 0)
3959            for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3960          else
3961            for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3962          }
3963
        /* Now see if we need to remove any special characters. An option
3965        value of 1 removes vertical space and 2 removes underscore. */
3966
3967        if (tabopt < 0) tabopt = -tabopt;
3968        if (tabopt == 1) pbits[1] &= ~0x3c;
3969          else if (tabopt == 2) pbits[11] &= 0x7f;
3970
3971        /* Add the POSIX table or its complement into the main table that is
3972        being built and we are done. */
3973
3974        if (local_negate)
3975          for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3976        else
3977          for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3978
3979        ptr = tempptr + 1;
        /* Every POSIX class contains at least one character < 256. */
3981        class_has_8bitchar = 1;
3982        /* Every class contains at least two characters. */
3983        class_single_char = 2;
3984        continue;    /* End of POSIX syntax handling */
3985        }
3986
3987      /* Backslash may introduce a single character, or it may introduce one
3988      of the specials, which just set a flag. The sequence \b is a special
3989      case. Inside a class (and only there) it is treated as backspace. We
3990      assume that other escapes have more than one character in them, so
3991      speculatively set both class_has_8bitchar and class_single_char bigger
3992      than one. Unrecognized escapes fall through and are either treated
3993      as literal characters (by default), or are faulted if
3994      PCRE_EXTRA is set. */
3995
3996      if (c == CHAR_BACKSLASH)
3997        {
3998        c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3999        if (*errorcodeptr != 0) goto FAILED;
4000
4001        if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */
4002        else if (-c == ESC_N)            /* \N is not supported in a class */
4003          {
4004          *errorcodeptr = ERR71;
4005          goto FAILED;
4006          }
4007        else if (-c == ESC_Q)            /* Handle start of quoted string */
4008          {
4009          if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
4010            {
4011            ptr += 2; /* avoid empty string */
4012            }
4013          else inescq = TRUE;
4014          continue;
4015          }
4016        else if (-c == ESC_E) continue;  /* Ignore orphan \E */
4017
4018        if (c < 0)
4019          {
4020          register const pcre_uint8 *cbits = cd->cbits;
4021          /* Every class contains at least two < 256 characters. */
4022          class_has_8bitchar++;
4023          /* Every class contains at least two characters. */
4024          class_single_char += 2;
4025
4026          switch (-c)
4027            {
4028#ifdef SUPPORT_UCP
4029            case ESC_du:     /* These are the values given for \d etc */
4030            case ESC_DU:     /* when PCRE_UCP is set. We replace the */
4031            case ESC_wu:     /* escape sequence with an appropriate \p */
4032            case ESC_WU:     /* or \P to test Unicode properties instead */
4033            case ESC_su:     /* of the default ASCII testing. */
4034            case ESC_SU:
4035            nestptr = ptr;
4036            ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
4037            class_has_8bitchar--;                /* Undo! */
4038            continue;
4039#endif
4040            case ESC_d:
4041            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
4042            continue;
4043
4044            case ESC_D:
4045            should_flip_negation = TRUE;
4046            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
4047            continue;
4048
4049            case ESC_w:
4050            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
4051            continue;
4052
4053            case ESC_W:
4054            should_flip_negation = TRUE;
4055            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
4056            continue;
4057
4058            /* Perl 5.004 onwards omits VT from \s, but we must preserve it
4059            if it was previously set by something earlier in the character
4060            class. */
4061
4062            case ESC_s:
4063            classbits[0] |= cbits[cbit_space];
4064            classbits[1] |= cbits[cbit_space+1] & ~0x08;
4065            for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
4066            continue;
4067
4068            case ESC_S:
4069            should_flip_negation = TRUE;
4070            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
4071            classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
4072            continue;
4073
4074            case ESC_h:
4075            SETBIT(classbits, 0x09); /* VT */
4076            SETBIT(classbits, 0x20); /* SPACE */
4077            SETBIT(classbits, 0xa0); /* NSBP */
4078#ifndef COMPILE_PCRE8
4079            xclass = TRUE;
4080            *class_uchardata++ = XCL_SINGLE;
4081            *class_uchardata++ = 0x1680;
4082            *class_uchardata++ = XCL_SINGLE;
4083            *class_uchardata++ = 0x180e;
4084            *class_uchardata++ = XCL_RANGE;
4085            *class_uchardata++ = 0x2000;
4086            *class_uchardata++ = 0x200a;
4087            *class_uchardata++ = XCL_SINGLE;
4088            *class_uchardata++ = 0x202f;
4089            *class_uchardata++ = XCL_SINGLE;
4090            *class_uchardata++ = 0x205f;
4091            *class_uchardata++ = XCL_SINGLE;
4092            *class_uchardata++ = 0x3000;
4093#elif defined SUPPORT_UTF
4094            if (utf)
4095              {
4096              xclass = TRUE;
4097              *class_uchardata++ = XCL_SINGLE;
4098              class_uchardata += PRIV(ord2utf)(0x1680, class_uchardata);
4099              *class_uchardata++ = XCL_SINGLE;
4100              class_uchardata += PRIV(ord2utf)(0x180e, class_uchardata);
4101              *class_uchardata++ = XCL_RANGE;
4102              class_uchardata += PRIV(ord2utf)(0x2000, class_uchardata);
4103              class_uchardata += PRIV(ord2utf)(0x200a, class_uchardata);
4104              *class_uchardata++ = XCL_SINGLE;
4105              class_uchardata += PRIV(ord2utf)(0x202f, class_uchardata);
4106              *class_uchardata++ = XCL_SINGLE;
4107              class_uchardata += PRIV(ord2utf)(0x205f, class_uchardata);
4108              *class_uchardata++ = XCL_SINGLE;
4109              class_uchardata += PRIV(ord2utf)(0x3000, class_uchardata);
4110              }
4111#endif
4112            continue;
4113
4114            case ESC_H:
4115            for (c = 0; c < 32; c++)
4116              {
4117              int x = 0xff;
4118              switch (c)
4119                {
4120                case 0x09/8: x ^= 1 << (0x09%8); break;
4121                case 0x20/8: x ^= 1 << (0x20%8); break;
4122                case 0xa0/8: x ^= 1 << (0xa0%8); break;
4123                default: break;
4124                }
4125              classbits[c] |= x;
4126              }
4127#ifndef COMPILE_PCRE8
4128            xclass = TRUE;
4129            *class_uchardata++ = XCL_RANGE;
4130            *class_uchardata++ = 0x0100;
4131            *class_uchardata++ = 0x167f;
4132            *class_uchardata++ = XCL_RANGE;
4133            *class_uchardata++ = 0x1681;
4134            *class_uchardata++ = 0x180d;
4135            *class_uchardata++ = XCL_RANGE;
4136            *class_uchardata++ = 0x180f;
4137            *class_uchardata++ = 0x1fff;
4138            *class_uchardata++ = XCL_RANGE;
4139            *class_uchardata++ = 0x200b;
4140            *class_uchardata++ = 0x202e;
4141            *class_uchardata++ = XCL_RANGE;
4142            *class_uchardata++ = 0x2030;
4143            *class_uchardata++ = 0x205e;
4144            *class_uchardata++ = XCL_RANGE;
4145            *class_uchardata++ = 0x2060;
4146            *class_uchardata++ = 0x2fff;
4147            *class_uchardata++ = XCL_RANGE;
4148            *class_uchardata++ = 0x3001;
4149#ifdef SUPPORT_UTF
4150            if (utf)
4151              class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4152            else
4153#endif
4154              *class_uchardata++ = 0xffff;
4155#elif defined SUPPORT_UTF
4156            if (utf)
4157              {
4158              xclass = TRUE;
4159              *class_uchardata++ = XCL_RANGE;
4160              class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);
4161              class_uchardata += PRIV(ord2utf)(0x167f, class_uchardata);
4162              *class_uchardata++ = XCL_RANGE;
4163              class_uchardata += PRIV(ord2utf)(0x1681, class_uchardata);
4164              class_uchardata += PRIV(ord2utf)(0x180d, class_uchardata);
4165              *class_uchardata++ = XCL_RANGE;
4166              class_uchardata += PRIV(ord2utf)(0x180f, class_uchardata);
4167              class_uchardata += PRIV(ord2utf)(0x1fff, class_uchardata);
4168              *class_uchardata++ = XCL_RANGE;
4169              class_uchardata += PRIV(ord2utf)(0x200b, class_uchardata);
4170              class_uchardata += PRIV(ord2utf)(0x202e, class_uchardata);
4171              *class_uchardata++ = XCL_RANGE;
4172              class_uchardata += PRIV(ord2utf)(0x2030, class_uchardata);
4173              class_uchardata += PRIV(ord2utf)(0x205e, class_uchardata);
4174              *class_uchardata++ = XCL_RANGE;
4175              class_uchardata += PRIV(ord2utf)(0x2060, class_uchardata);
4176              class_uchardata += PRIV(ord2utf)(0x2fff, class_uchardata);
4177              *class_uchardata++ = XCL_RANGE;
4178              class_uchardata += PRIV(ord2utf)(0x3001, class_uchardata);
4179              class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4180              }
4181#endif
4182            continue;
4183
4184            case ESC_v:
4185            SETBIT(classbits, 0x0a); /* LF */
4186            SETBIT(classbits, 0x0b); /* VT */
4187            SETBIT(classbits, 0x0c); /* FF */
4188            SETBIT(classbits, 0x0d); /* CR */
4189            SETBIT(classbits, 0x85); /* NEL */
4190#ifndef COMPILE_PCRE8
4191            xclass = TRUE;
4192            *class_uchardata++ = XCL_RANGE;
4193            *class_uchardata++ = 0x2028;
4194            *class_uchardata++ = 0x2029;
4195#elif defined SUPPORT_UTF
4196            if (utf)
4197              {
4198              xclass = TRUE;
4199              *class_uchardata++ = XCL_RANGE;
4200              class_uchardata += PRIV(ord2utf)(0x2028, class_uchardata);
4201              class_uchardata += PRIV(ord2utf)(0x2029, class_uchardata);
4202              }
4203#endif
4204            continue;
4205
4206            case ESC_V:
4207            for (c = 0; c < 32; c++)
4208              {
4209              int x = 0xff;
4210              switch (c)
4211                {
4212                case 0x0a/8: x ^= 1 << (0x0a%8);
4213                             x ^= 1 << (0x0b%8);
4214                             x ^= 1 << (0x0c%8);
4215                             x ^= 1 << (0x0d%8);
4216                             break;
4217                case 0x85/8: x ^= 1 << (0x85%8); break;
4218                default: break;
4219                }
4220              classbits[c] |= x;
4221              }
4222
4223#ifndef COMPILE_PCRE8
4224            xclass = TRUE;
4225            *class_uchardata++ = XCL_RANGE;
4226            *class_uchardata++ = 0x0100;
4227            *class_uchardata++ = 0x2027;
4228            *class_uchardata++ = XCL_RANGE;
4229            *class_uchardata++ = 0x202a;
4230#ifdef SUPPORT_UTF
4231            if (utf)
4232              class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4233            else
4234#endif
4235              *class_uchardata++ = 0xffff;
4236#elif defined SUPPORT_UTF
4237            if (utf)
4238              {
4239              xclass = TRUE;
4240              *class_uchardata++ = XCL_RANGE;
4241              class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);
4242              class_uchardata += PRIV(ord2utf)(0x2027, class_uchardata);
4243              *class_uchardata++ = XCL_RANGE;
4244              class_uchardata += PRIV(ord2utf)(0x202a, class_uchardata);
4245              class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4246              }
4247#endif
4248            continue;
4249
4250#ifdef SUPPORT_UCP
4251            case ESC_p:
4252            case ESC_P:
4253              {
4254              BOOL negated;
4255              int pdata;
4256              int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4257              if (ptype < 0) goto FAILED;
4258              xclass = TRUE;
4259              *class_uchardata++ = ((-c == ESC_p) != negated)?
4260                XCL_PROP : XCL_NOTPROP;
4261              *class_uchardata++ = ptype;
4262              *class_uchardata++ = pdata;
4263              class_has_8bitchar--;                /* Undo! */
4264              continue;
4265              }
4266#endif
4267            /* Unrecognized escapes are faulted if PCRE is running in its
4268            strict mode. By default, for compatibility with Perl, they are
4269            treated as literals. */
4270
4271            default:
4272            if ((options & PCRE_EXTRA) != 0)
4273              {
4274              *errorcodeptr = ERR7;
4275              goto FAILED;
4276              }
4277            class_has_8bitchar--;    /* Undo the speculative increase. */
4278            class_single_char -= 2;  /* Undo the speculative increase. */
4279            c = *ptr;                /* Get the final character and fall through */
4280            break;
4281            }
4282          }
4283
4284        /* Fall through if we have a single character (c >= 0). This may be
4285        greater than 256. */
4286
4287        }   /* End of backslash handling */
4288
4289      /* A single character may be followed by '-' to form a range. However,
4290      Perl does not permit ']' to be the end of the range. A '-' character
4291      at the end is treated as a literal. Perl ignores orphaned \E sequences
4292      entirely. The code for handling \Q and \E is messy. */
4293
4294      CHECK_RANGE:
4295      while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
4296        {
4297        inescq = FALSE;
4298        ptr += 2;
4299        }
4300
4301      oldptr = ptr;
4302
4303      /* Remember \r or \n */
4304
4305      if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4306
4307      /* Check for range */
4308
4309      if (!inescq && ptr[1] == CHAR_MINUS)
4310        {
4311        int d;
4312        ptr += 2;
4313        while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
4314
4315        /* If we hit \Q (not followed by \E) at this point, go into escaped
4316        mode. */
4317
4318        while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
4319          {
4320          ptr += 2;
4321          if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4322            { ptr += 2; continue; }
4323          inescq = TRUE;
4324          break;
4325          }
4326
4327        if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
4328          {
4329          ptr = oldptr;
4330          goto LONE_SINGLE_CHARACTER;
4331          }
4332
4333#ifdef SUPPORT_UTF
4334        if (utf)
4335          {                           /* Braces are required because the */
4336          GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
4337          }
4338        else
4339#endif
4340        d = *ptr;  /* Not UTF-8 mode */
4341
4342        /* The second part of a range can be a single-character escape, but
4343        not any of the other escapes. Perl 5.6 treats a hyphen as a literal
4344        in such circumstances. */
4345
4346        if (!inescq && d == CHAR_BACKSLASH)
4347          {
4348          d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
4349          if (*errorcodeptr != 0) goto FAILED;
4350
4351          /* \b is backspace; any other special means the '-' was literal */
4352
4353          if (d < 0)
4354            {
4355            if (d == -ESC_b) d = CHAR_BS; else
4356              {
4357              ptr = oldptr;
4358              goto LONE_SINGLE_CHARACTER;  /* A few lines below */
4359              }
4360            }
4361          }
4362
4363        /* Check that the two values are in the correct order. Optimize
4364        one-character ranges */
4365
4366        if (d < c)
4367          {
4368          *errorcodeptr = ERR8;
4369          goto FAILED;
4370          }
4371
4372        if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
4373
4374        /* Remember \r or \n */
4375
4376        if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4377
4378        /* Since we found a character range, single character optimizations
4379        cannot be done anymore. */
4380        class_single_char = 2;
4381
4382        /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
4383        matching, we have to use an XCLASS with extra data items. Caseless
4384        matching for characters > 127 is available only if UCP support is
4385        available. */
4386
4387#if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
4388        if ((d > 255) || (utf && ((options & PCRE_CASELESS) != 0 && d > 127)))
4389#elif defined  SUPPORT_UTF
4390        if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4391#elif !(defined COMPILE_PCRE8)
4392        if (d > 255)
4393#endif
4394#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
4395          {
4396          xclass = TRUE;
4397
4398          /* With UCP support, we can find the other case equivalents of
4399          the relevant characters. There may be several ranges. Optimize how
4400          they fit with the basic range. */
4401
4402#ifdef SUPPORT_UCP
4403#ifndef COMPILE_PCRE8
4404          if (utf && (options & PCRE_CASELESS) != 0)
4405#else
4406          if ((options & PCRE_CASELESS) != 0)
4407#endif
4408            {
4409            unsigned int occ, ocd;
4410            unsigned int cc = c;
4411            unsigned int origd = d;
4412            while (get_othercase_range(&cc, origd, &occ, &ocd))
4413              {
4414              if (occ >= (unsigned int)c &&
4415                  ocd <= (unsigned int)d)
4416                continue;                          /* Skip embedded ranges */
4417
4418              if (occ < (unsigned int)c  &&
4419                  ocd >= (unsigned int)c - 1)      /* Extend the basic range */
4420                {                                  /* if there is overlap,   */
4421                c = occ;                           /* noting that if occ < c */
4422                continue;                          /* we can't have ocd > d  */
4423                }                                  /* because a subrange is  */
4424              if (ocd > (unsigned int)d &&
4425                  occ <= (unsigned int)d + 1)      /* always shorter than    */
4426                {                                  /* the basic range.       */
4427                d = ocd;
4428                continue;
4429                }
4430
4431              if (occ == ocd)
4432                {
4433                *class_uchardata++ = XCL_SINGLE;
4434                }
4435              else
4436                {
4437                *class_uchardata++ = XCL_RANGE;
4438                class_uchardata += PRIV(ord2utf)(occ, class_uchardata);
4439                }
4440              class_uchardata += PRIV(ord2utf)(ocd, class_uchardata);
4441              }
4442            }
4443#endif  /* SUPPORT_UCP */
4444
4445          /* Now record the original range, possibly modified for UCP caseless
4446          overlapping ranges. */
4447
4448          *class_uchardata++ = XCL_RANGE;
4449#ifdef SUPPORT_UTF
4450#ifndef COMPILE_PCRE8
4451          if (utf)
4452            {
4453            class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4454            class_uchardata += PRIV(ord2utf)(d, class_uchardata);
4455            }
4456          else
4457            {
4458            *class_uchardata++ = c;
4459            *class_uchardata++ = d;
4460            }
4461#else
4462          class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4463          class_uchardata += PRIV(ord2utf)(d, class_uchardata);
4464#endif
4465#else /* SUPPORT_UTF */
4466          *class_uchardata++ = c;
4467          *class_uchardata++ = d;
4468#endif /* SUPPORT_UTF */
4469
          /* With UCP support, we are done. Without UCP support, there is no
          caseless matching for UTF characters > 127; we can use the bit map
          for the smaller ones. As for 16 bit characters without UTF, we
          can still use the bit map for the part of the range below 256. */
4474
4475#ifdef SUPPORT_UCP
4476#ifndef COMPILE_PCRE8
4477          if (utf)
4478#endif
4479            continue;    /* With next character in the class */
4480#endif  /* SUPPORT_UCP */
4481
4482#if defined SUPPORT_UTF && !defined(SUPPORT_UCP) && !(defined COMPILE_PCRE8)
4483          if (utf)
4484            {
4485            if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
4486            /* Adjust upper limit and fall through to set up the map */
4487            d = 127;
4488            }
4489          else
4490            {
4491            if (c > 255) continue;
4492            /* Adjust upper limit and fall through to set up the map */
4493            d = 255;
4494            }
4495#elif defined SUPPORT_UTF && !defined(SUPPORT_UCP)
4496          if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
4497          /* Adjust upper limit and fall through to set up the map */
4498          d = 127;
4499#else
4500          if (c > 255) continue;
4501          /* Adjust upper limit and fall through to set up the map */
4502          d = 255;
4503#endif  /* SUPPORT_UTF && !SUPPORT_UCP && !COMPILE_PCRE8 */
4504          }
4505#endif  /* SUPPORT_UTF || !COMPILE_PCRE8 */
4506
4507        /* We use the bit map for 8 bit mode, or when the characters fall
4508        partially or entirely to [0-255] ([0-127] for UCP) ranges. */
4509
4510        class_has_8bitchar = 1;
4511
4512        /* We can save a bit of time by skipping this in the pre-compile. */
4513
4514        if (lengthptr == NULL) for (; c <= d; c++)
4515          {
4516          classbits[c/8] |= (1 << (c&7));
4517          if ((options & PCRE_CASELESS) != 0)
4518            {
4519            int uc = cd->fcc[c]; /* flip case */
4520            classbits[uc/8] |= (1 << (uc&7));
4521            }
4522          }
4523
4524        continue;   /* Go get the next char in the class */
4525        }
4526
4527      /* Handle a lone single character - we can get here for a normal
4528      non-escape char, or after \ that introduces a single character or for an
4529      apparent range that isn't. */
4530
4531      LONE_SINGLE_CHARACTER:
4532
4533      /* Only the value of 1 matters for class_single_char. */
4534
4535      if (class_single_char < 2) class_single_char++;
4536
4537      /* If class_single_char is 1, we saw precisely one character. As long as
4538      there was no use of \p or \P, in other words, no use of any XCLASS
4539      features, we can optimize.
4540
4541      The optimization throws away the bit map. We turn the item into a
4542      1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.
4543      In the positive case, it can cause firstchar to be set. Otherwise, there
4544      can be no first char if this item is first, whatever repeat count may
4545      follow. In the case of reqchar, save the previous value for reinstating. */
4546
4547      if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
4548        {
4549        ptr++;
4550        zeroreqchar = reqchar;
4551
4552        if (negate_class)
4553          {
4554          if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4555          zerofirstchar = firstchar;
4556          *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
4557#ifdef SUPPORT_UTF
4558          if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
4559            code += PRIV(ord2utf)(c, code);
4560          else
4561#endif
4562            *code++ = c;
4563          goto NOT_CHAR;
4564          }
4565
4566        /* For a single, positive character, get the value into mcbuffer, and
4567        then we can handle this with the normal one-character code. */
4568
4569#ifdef SUPPORT_UTF
4570        if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
4571          mclength = PRIV(ord2utf)(c, mcbuffer);
4572        else
4573#endif
4574          {
4575          mcbuffer[0] = c;
4576          mclength = 1;
4577          }
4578        goto ONE_CHAR;
4579        }       /* End of 1-char optimization */
4580
4581      /* Handle a character that cannot go in the bit map. */
4582
4583#if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
4584      if ((c > 255) || (utf && ((options & PCRE_CASELESS) != 0 && c > 127)))
4585#elif defined SUPPORT_UTF
4586      if (utf && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4587#elif !(defined COMPILE_PCRE8)
4588      if (c > 255)
4589#endif
4590
4591#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
4592        {
4593        xclass = TRUE;
4594        *class_uchardata++ = XCL_SINGLE;
4595#ifdef SUPPORT_UTF
4596#ifndef COMPILE_PCRE8
4597        /* In non 8 bit mode, we can get here even if we are not in UTF mode. */
4598        if (!utf)
4599          *class_uchardata++ = c;
4600        else
4601#endif
4602          class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4603#else /* SUPPORT_UTF */
4604        *class_uchardata++ = c;
4605#endif /* SUPPORT_UTF */
4606
4607#ifdef SUPPORT_UCP
4608#ifdef COMPILE_PCRE8
4609        if ((options & PCRE_CASELESS) != 0)
4610#else
4611        /* In non 8 bit mode, we can get here even if we are not in UTF mode. */
4612        if (utf && (options & PCRE_CASELESS) != 0)
4613#endif
4614          {
4615          unsigned int othercase;
4616          if ((int)(othercase = UCD_OTHERCASE(c)) != c)
4617            {
4618            *class_uchardata++ = XCL_SINGLE;
4619            class_uchardata += PRIV(ord2utf)(othercase, class_uchardata);
4620            }
4621          }
4622#endif  /* SUPPORT_UCP */
4623
4624        }
4625      else
4626#endif  /* SUPPORT_UTF || !COMPILE_PCRE8 */
4627
4628      /* Handle a single-byte character */
4629        {
4630        class_has_8bitchar = 1;
4631        classbits[c/8] |= (1 << (c&7));
4632        if ((options & PCRE_CASELESS) != 0)
4633          {
4634          c = cd->fcc[c]; /* flip case */
4635          classbits[c/8] |= (1 << (c&7));
4636          }
4637        }
4638      }
4639
4640    /* Loop until ']' reached. This "while" is the end of the "do" far above.
4641    If we are at the end of an internal nested string, revert to the outer
4642    string. */
4643
4644    while (((c = *(++ptr)) != 0 ||
4645           (nestptr != NULL &&
4646             (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != 0)) &&
4647           (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
4648
4649    /* Check for missing terminating ']' */
4650
4651    if (c == 0)
4652      {
4653      *errorcodeptr = ERR6;
4654      goto FAILED;
4655      }
4656
4657    /* If this is the first thing in the branch, there can be no first char
4658    setting, whatever the repeat count. Any reqchar setting must remain
4659    unchanged after any kind of repeat. */
4660
4661    if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4662    zerofirstchar = firstchar;
4663    zeroreqchar = reqchar;
4664
4665    /* If there are characters with values > 255, we have to compile an
4666    extended class, with its own opcode, unless there was a negated special
4667    such as \S in the class, and PCRE_UCP is not set, because in that case all
4668    characters > 255 are in the class, so any that were explicitly given as
4669    well can be ignored. If (when there are explicit characters > 255 that must
4670    be listed) there are no characters < 256, we can omit the bitmap in the
4671    actual compiled code. */
4672
4673#ifdef SUPPORT_UTF
4674    if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))
4675#elif !defined COMPILE_PCRE8
4676    if (xclass && !should_flip_negation)
4677#endif
4678#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4679      {
4680      *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
4681      *code++ = OP_XCLASS;
4682      code += LINK_SIZE;
4683      *code = negate_class? XCL_NOT:0;
4684
4685      /* If the map is required, move up the extra data to make room for it;
4686      otherwise just move the code pointer to the end of the extra data. */
4687
4688      if (class_has_8bitchar > 0)
4689        {
4690        *code++ |= XCL_MAP;
4691        memmove(code + (32 / sizeof(pcre_uchar)), code,
4692          IN_UCHARS(class_uchardata - code));
4693        memcpy(code, classbits, 32);
4694        code = class_uchardata + (32 / sizeof(pcre_uchar));
4695        }
4696      else code = class_uchardata;
4697
4698      /* Now fill in the complete length of the item */
4699
4700      PUT(previous, 1, (int)(code - previous));
4701      break;   /* End of class handling */
4702      }
4703#endif
4704
4705    /* If there are no characters > 255, or they are all to be included or
4706    excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
4707    whole class was negated and whether there were negative specials such as \S
4708    (non-UCP) in the class. Then copy the 32-byte map into the code vector,
4709    negating it if necessary. */
4710
4711    *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
4712    if (lengthptr == NULL)    /* Save time in the pre-compile phase */
4713      {
4714      if (negate_class)
4715        for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
4716      memcpy(code, classbits, 32);
4717      }
4718    code += 32 / sizeof(pcre_uchar);
4719    NOT_CHAR:
4720    break;
4721
4722
4723    /* ===================================================================*/
4724    /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
4725    has been tested above. */
4726
4727    case CHAR_LEFT_CURLY_BRACKET:
4728    if (!is_quantifier) goto NORMAL_CHAR;
4729    ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
4730    if (*errorcodeptr != 0) goto FAILED;
4731    goto REPEAT;
4732
4733    case CHAR_ASTERISK:
4734    repeat_min = 0;
4735    repeat_max = -1;
4736    goto REPEAT;
4737
4738    case CHAR_PLUS:
4739    repeat_min = 1;
4740    repeat_max = -1;
4741    goto REPEAT;
4742
4743    case CHAR_QUESTION_MARK:
4744    repeat_min = 0;
4745    repeat_max = 1;
4746
4747    REPEAT:
4748    if (previous == NULL)
4749      {
4750      *errorcodeptr = ERR9;
4751      goto FAILED;
4752      }
4753
4754    if (repeat_min == 0)
4755      {
4756      firstchar = zerofirstchar;    /* Adjust for zero repeat */
4757      reqchar = zeroreqchar;        /* Ditto */
4758      }
4759
4760    /* Remember whether this is a variable length repeat */
4761
4762    reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
4763
4764    op_type = 0;                    /* Default single-char op codes */
4765    possessive_quantifier = FALSE;  /* Default not possessive quantifier */
4766
4767    /* Save start of previous item, in case we have to move it up in order to
4768    insert something before it. */
4769
4770    tempcode = previous;
4771
4772    /* If the next character is '+', we have a possessive quantifier. This
4773    implies greediness, whatever the setting of the PCRE_UNGREEDY option.
4774    If the next character is '?' this is a minimizing repeat, by default,
4775    but if PCRE_UNGREEDY is set, it works the other way round. We change the
4776    repeat type to the non-default. */
4777
4778    if (ptr[1] == CHAR_PLUS)
4779      {
4780      repeat_type = 0;                  /* Force greedy */
4781      possessive_quantifier = TRUE;
4782      ptr++;
4783      }
4784    else if (ptr[1] == CHAR_QUESTION_MARK)
4785      {
4786      repeat_type = greedy_non_default;
4787      ptr++;
4788      }
4789    else repeat_type = greedy_default;
4790
4791    /* If previous was a recursion call, wrap it in atomic brackets so that
4792    previous becomes the atomic group. All recursions were so wrapped in the
4793    past, but it no longer happens for non-repeated recursions. In fact, the
4794    repeated ones could be re-implemented independently so as not to need this,
4795    but for the moment we rely on the code for repeating groups. */
4796
4797    if (*previous == OP_RECURSE)
4798      {
4799      memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
4800      *previous = OP_ONCE;
4801      PUT(previous, 1, 2 + 2*LINK_SIZE);
4802      previous[2 + 2*LINK_SIZE] = OP_KET;
4803      PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
4804      code += 2 + 2 * LINK_SIZE;
4805      length_prevgroup = 3 + 3*LINK_SIZE;
4806
4807      /* When actually compiling, we need to check whether this was a forward
4808      reference, and if so, adjust the offset. */
4809
4810      if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
4811        {
4812        int offset = GET(cd->hwm, -LINK_SIZE);
4813        if (offset == previous + 1 - cd->start_code)
4814          PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
4815        }
4816      }
4817
4818    /* Now handle repetition for the different types of item. */
4819
4820    /* If previous was a character or negated character match, abolish the item
4821    and generate a repeat item instead. If a char item has a minimum of more
4822    than one, ensure that it is set in reqchar - it might not be if a sequence
4823    such as x{3} is the first thing in a branch because the x will have gone
4824    into firstchar instead.  */
4825
4826    if (*previous == OP_CHAR || *previous == OP_CHARI
4827        || *previous == OP_NOT || *previous == OP_NOTI)
4828      {
4829      switch (*previous)
4830        {
4831        default: /* Make compiler happy. */
4832        case OP_CHAR:  op_type = OP_STAR - OP_STAR; break;
4833        case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
4834        case OP_NOT:   op_type = OP_NOTSTAR - OP_STAR; break;
4835        case OP_NOTI:  op_type = OP_NOTSTARI - OP_STAR; break;
4836        }
4837
4838      /* Deal with UTF characters that take up more than one character. It's
4839      easier to write this out separately than try to macrify it. Use c to
4840      hold the length of the character in bytes, plus UTF_LENGTH to flag that
4841      it's a length rather than a small character. */
4842
4843#ifdef SUPPORT_UTF
4844      if (utf && NOT_FIRSTCHAR(code[-1]))
4845        {
4846        pcre_uchar *lastchar = code - 1;
4847        BACKCHAR(lastchar);
4848        c = (int)(code - lastchar);     /* Length of UTF-8 character */
4849        memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
4850        c |= UTF_LENGTH;                /* Flag c as a length */
4851        }
4852      else
4853#endif /* SUPPORT_UTF */
4854
4855      /* Handle the case of a single character - either with no UTF support, or
4856      with UTF disabled, or for a single character UTF character. */
4857        {
4858        c = code[-1];
4859        if (*previous <= OP_CHARI && repeat_min > 1)
4860          reqchar = c | req_caseopt | cd->req_varyopt;
4861        }
4862
4863      /* If the repetition is unlimited, it pays to see if the next thing on
4864      the line is something that cannot possibly match this character. If so,
4865      automatically possessifying this item gains some performance in the case
4866      where the match fails. */
4867
4868      if (!possessive_quantifier &&
4869          repeat_max < 0 &&
4870          check_auto_possessive(previous, utf, ptr + 1, options, cd))
4871        {
4872        repeat_type = 0;    /* Force greedy */
4873        possessive_quantifier = TRUE;
4874        }
4875
4876      goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
4877      }
4878
4879    /* If previous was a character type match (\d or similar), abolish it and
4880    create a suitable repeat item. The code is shared with single-character
4881    repeats by setting op_type to add a suitable offset into repeat_type. Note
4882    that the Unicode property types will be present only when SUPPORT_UCP is
4883    defined, but we don't wrap the little bits of code here because it just
4884    makes it horribly messy. */
4885
4886    else if (*previous < OP_EODN)
4887      {
4888      pcre_uchar *oldcode;
4889      int prop_type, prop_value;
4890      op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
4891      c = *previous;
4892
4893      if (!possessive_quantifier &&
4894          repeat_max < 0 &&
4895          check_auto_possessive(previous, utf, ptr + 1, options, cd))
4896        {
4897        repeat_type = 0;    /* Force greedy */
4898        possessive_quantifier = TRUE;
4899        }
4900
4901      OUTPUT_SINGLE_REPEAT:
4902      if (*previous == OP_PROP || *previous == OP_NOTPROP)
4903        {
4904        prop_type = previous[1];
4905        prop_value = previous[2];
4906        }
4907      else prop_type = prop_value = -1;
4908
4909      oldcode = code;
4910      code = previous;                  /* Usually overwrite previous item */
4911
4912      /* If the maximum is zero then the minimum must also be zero; Perl allows
4913      this case, so we do too - by simply omitting the item altogether. */
4914
4915      if (repeat_max == 0) goto END_REPEAT;
4916
4917      /*--------------------------------------------------------------------*/
4918      /* This code is obsolete from release 8.00; the restriction was finally
4919      removed: */
4920
4921      /* All real repeats make it impossible to handle partial matching (maybe
4922      one day we will be able to remove this restriction). */
4923
4924      /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
4925      /*--------------------------------------------------------------------*/
4926
4927      /* Combine the op_type with the repeat_type */
4928
4929      repeat_type += op_type;
4930
4931      /* A minimum of zero is handled either as the special case * or ?, or as
4932      an UPTO, with the maximum given. */
4933
4934      if (repeat_min == 0)
4935        {
4936        if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
4937          else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
4938        else
4939          {
4940          *code++ = OP_UPTO + repeat_type;
4941          PUT2INC(code, 0, repeat_max);
4942          }
4943        }
4944
4945      /* A repeat minimum of 1 is optimized into some special cases. If the
4946      maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
4947      left in place and, if the maximum is greater than 1, we use OP_UPTO with
4948      one less than the maximum. */
4949
4950      else if (repeat_min == 1)
4951        {
4952        if (repeat_max == -1)
4953          *code++ = OP_PLUS + repeat_type;
4954        else
4955          {
4956          code = oldcode;                 /* leave previous item in place */
4957          if (repeat_max == 1) goto END_REPEAT;
4958          *code++ = OP_UPTO + repeat_type;
4959          PUT2INC(code, 0, repeat_max - 1);
4960          }
4961        }
4962
4963      /* The case {n,n} is just an EXACT, while the general case {n,m} is
4964      handled as an EXACT followed by an UPTO. */
4965
4966      else
4967        {
4968        *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
4969        PUT2INC(code, 0, repeat_min);
4970
4971        /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
4972        we have to insert the character for the previous code. For a repeated
4973        Unicode property match, there are two extra bytes that define the
4974        required property. In UTF-8 mode, long characters have their length in
4975        c, with the UTF_LENGTH bit as a flag. */
4976
4977        if (repeat_max < 0)
4978          {
4979#ifdef SUPPORT_UTF
4980          if (utf && (c & UTF_LENGTH) != 0)
4981            {
4982            memcpy(code, utf_chars, IN_UCHARS(c & 7));
4983            code += c & 7;
4984            }
4985          else
4986#endif
4987            {
4988            *code++ = c;
4989            if (prop_type >= 0)
4990              {
4991              *code++ = prop_type;
4992              *code++ = prop_value;
4993              }
4994            }
4995          *code++ = OP_STAR + repeat_type;
4996          }
4997
4998        /* Else insert an UPTO if the max is greater than the min, again
4999        preceded by the character, for the previously inserted code. If the
5000        UPTO is just for 1 instance, we can use QUERY instead. */
5001
5002        else if (repeat_max != repeat_min)
5003          {
5004#ifdef SUPPORT_UTF
5005          if (utf && (c & UTF_LENGTH) != 0)
5006            {
5007            memcpy(code, utf_chars, IN_UCHARS(c & 7));
5008            code += c & 7;
5009            }
5010          else
5011#endif
5012          *code++ = c;
5013          if (prop_type >= 0)
5014            {
5015            *code++ = prop_type;
5016            *code++ = prop_value;
5017            }
5018          repeat_max -= repeat_min;
5019
5020          if (repeat_max == 1)
5021            {
5022            *code++ = OP_QUERY + repeat_type;
5023            }
5024          else
5025            {
5026            *code++ = OP_UPTO + repeat_type;
5027            PUT2INC(code, 0, repeat_max);
5028            }
5029          }
5030        }
5031
5032      /* The character or character type itself comes last in all cases. */
5033
5034#ifdef SUPPORT_UTF
5035      if (utf && (c & UTF_LENGTH) != 0)
5036        {
5037        memcpy(code, utf_chars, IN_UCHARS(c & 7));
5038        code += c & 7;
5039        }
5040      else
5041#endif
5042      *code++ = c;
5043
5044      /* For a repeated Unicode property match, there are two extra bytes that
5045      define the required property. */
5046
5047#ifdef SUPPORT_UCP
5048      if (prop_type >= 0)
5049        {
5050        *code++ = prop_type;
5051        *code++ = prop_value;
5052        }
5053#endif
5054      }
5055
5056    /* If previous was a character class or a back reference, we put the repeat
5057    stuff after it, but just skip the item if the repeat was {0,0}. */
5058
5059    else if (*previous == OP_CLASS ||
5060             *previous == OP_NCLASS ||
5061#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5062             *previous == OP_XCLASS ||
5063#endif
5064             *previous == OP_REF ||
5065             *previous == OP_REFI)
5066      {
5067      if (repeat_max == 0)
5068        {
5069        code = previous;
5070        goto END_REPEAT;
5071        }
5072
5073      /*--------------------------------------------------------------------*/
5074      /* This code is obsolete from release 8.00; the restriction was finally
5075      removed: */
5076
5077      /* All real repeats make it impossible to handle partial matching (maybe
5078      one day we will be able to remove this restriction). */
5079
5080      /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
5081      /*--------------------------------------------------------------------*/
5082
5083      if (repeat_min == 0 && repeat_max == -1)
5084        *code++ = OP_CRSTAR + repeat_type;
5085      else if (repeat_min == 1 && repeat_max == -1)
5086        *code++ = OP_CRPLUS + repeat_type;
5087      else if (repeat_min == 0 && repeat_max == 1)
5088        *code++ = OP_CRQUERY + repeat_type;
5089      else
5090        {
5091        *code++ = OP_CRRANGE + repeat_type;
5092        PUT2INC(code, 0, repeat_min);
5093        if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
5094        PUT2INC(code, 0, repeat_max);
5095        }
5096      }
5097
5098    /* If previous was a bracket group, we may have to replicate it in certain
5099    cases. Note that at this point we can encounter only the "basic" bracket
5100    opcodes such as BRA and CBRA, as this is the place where they get converted
5101    into the more special varieties such as BRAPOS and SBRA. A test for >=
5102    OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
5103    ASSERTBACK_NOT, ONCE, BRA, CBRA, and COND. Originally, PCRE did not allow
5104    repetition of assertions, but now it does, for Perl compatibility. */
5105
5106    else if (*previous >= OP_ASSERT && *previous <= OP_COND)
5107      {
5108      register int i;
5109      int len = (int)(code - previous);
5110      pcre_uchar *bralink = NULL;
5111      pcre_uchar *brazeroptr = NULL;
5112
5113      /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
5114      we just ignore the repeat. */
5115
5116      if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
5117        goto END_REPEAT;
5118
5119      /* There is no sense in actually repeating assertions. The only potential
5120      use of repetition is in cases when the assertion is optional. Therefore,
5121      if the minimum is greater than zero, just ignore the repeat. If the
5122      maximum is not zero or one, set it to 1. */
5123
5124      if (*previous < OP_ONCE)    /* Assertion */
5125        {
5126        if (repeat_min > 0) goto END_REPEAT;
5127        if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
5128        }
5129
5130      /* The case of a zero minimum is special because of the need to stick
5131      OP_BRAZERO in front of it, and because the group appears once in the
5132      data, whereas in other cases it appears the minimum number of times. For
5133      this reason, it is simplest to treat this case separately, as otherwise
5134      the code gets far too messy. There are several special subcases when the
5135      minimum is zero. */
5136
5137      if (repeat_min == 0)
5138        {
5139        /* If the maximum is also zero, we used to just omit the group from the
5140        output altogether, like this:
5141
5142        ** if (repeat_max == 0)
5143        **   {
5144        **   code = previous;
5145        **   goto END_REPEAT;
5146        **   }
5147
5148        However, that fails when a group or a subgroup within it is referenced
5149        as a subroutine from elsewhere in the pattern, so now we stick in
5150        OP_SKIPZERO in front of it so that it is skipped on execution. As we
5151        don't have a list of which groups are referenced, we cannot do this
5152        selectively.
5153
5154        If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
5155        and do no more at this point. However, we do need to adjust any
5156        OP_RECURSE calls inside the group that refer to the group itself or any
5157        internal or forward referenced group, because the offset is from the
5158        start of the whole regex. Temporarily terminate the pattern while doing
5159        this. */
5160
5161        if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
5162          {
5163          *code = OP_END;
5164          adjust_recurse(previous, 1, utf, cd, save_hwm);
5165          memmove(previous + 1, previous, IN_UCHARS(len));
5166          code++;
5167          if (repeat_max == 0)
5168            {
5169            *previous++ = OP_SKIPZERO;
5170            goto END_REPEAT;
5171            }
5172          brazeroptr = previous;    /* Save for possessive optimizing */
5173          *previous++ = OP_BRAZERO + repeat_type;
5174          }
5175
5176        /* If the maximum is greater than 1 and limited, we have to replicate
5177        in a nested fashion, sticking OP_BRAZERO before each set of brackets.
5178        The first one has to be handled carefully because it's the original
5179        copy, which has to be moved up. The remainder can be handled by code
5180        that is common with the non-zero minimum case below. We have to
5181          adjust the value of repeat_max, since one less copy is required. Once
5182        again, we may have to adjust any OP_RECURSE calls inside the group. */
5183
5184        else
5185          {
5186          int offset;
5187          *code = OP_END;
5188          adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm);
5189          memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
5190          code += 2 + LINK_SIZE;
5191          *previous++ = OP_BRAZERO + repeat_type;
5192          *previous++ = OP_BRA;
5193
5194          /* We chain together the bracket offset fields that have to be
5195          filled in later when the ends of the brackets are reached. */
5196
5197          offset = (bralink == NULL)? 0 : (int)(previous - bralink);
5198          bralink = previous;
5199          PUTINC(previous, 0, offset);
5200          }
5201
5202        repeat_max--;
5203        }
5204
5205      /* If the minimum is greater than zero, replicate the group as many
5206      times as necessary, and adjust the maximum to the number of subsequent
5207      copies that we need. If we set a first char from the group, and didn't
5208      set a required char, copy the latter from the former. If there are any
5209      forward reference subroutine calls in the group, there will be entries on
5210      the workspace list; replicate these with an appropriate increment. */
5211
5212      else
5213        {
5214        if (repeat_min > 1)
5215          {
5216          /* In the pre-compile phase, we don't actually do the replication. We
5217          just adjust the length as if we had. Do some paranoid checks for
5218          potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
5219          integer type when available, otherwise double. */
5220
5221          if (lengthptr != NULL)
5222            {
5223            int delta = (repeat_min - 1)*length_prevgroup;
5224            if ((INT64_OR_DOUBLE)(repeat_min - 1)*
5225                  (INT64_OR_DOUBLE)length_prevgroup >
5226                    (INT64_OR_DOUBLE)INT_MAX ||
5227                OFLOW_MAX - *lengthptr < delta)
5228              {
5229              *errorcodeptr = ERR20;
5230              goto FAILED;
5231              }
5232            *lengthptr += delta;
5233            }
5234
5235          /* This is compiling for real. If there is a set first byte for
5236          the group, and we have not yet set a "required byte", set it. Make
5237          sure there is enough workspace for copying forward references before
5238          doing the copy. */
5239
5240          else
5241            {
5242            if (groupsetfirstchar && reqchar < 0) reqchar = firstchar;
5243
5244            for (i = 1; i < repeat_min; i++)
5245              {
5246              pcre_uchar *hc;
5247              pcre_uchar *this_hwm = cd->hwm;
5248              memcpy(code, previous, IN_UCHARS(len));
5249
5250              while (cd->hwm > cd->start_workspace + cd->workspace_size -
5251                     WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5252                {
5253                int save_offset = save_hwm - cd->start_workspace;
5254                int this_offset = this_hwm - cd->start_workspace;
5255                *errorcodeptr = expand_workspace(cd);
5256                if (*errorcodeptr != 0) goto FAILED;
5257                save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5258                this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5259                }
5260
5261              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5262                {
5263                PUT(cd->hwm, 0, GET(hc, 0) + len);
5264                cd->hwm += LINK_SIZE;
5265                }
5266              save_hwm = this_hwm;
5267              code += len;
5268              }
5269            }
5270          }
5271
5272        if (repeat_max > 0) repeat_max -= repeat_min;
5273        }
5274
5275      /* This code is common to both the zero and non-zero minimum cases. If
5276      the maximum is limited, it replicates the group in a nested fashion,
5277      remembering the bracket starts on a stack. In the case of a zero minimum,
5278      the first one was set up above. In all cases the repeat_max now specifies
5279      the number of additional copies needed. Again, we must remember to
5280      replicate entries on the forward reference list. */
5281
5282      if (repeat_max >= 0)
5283        {
5284        /* In the pre-compile phase, we don't actually do the replication. We
5285        just adjust the length as if we had. For each repetition we must add 1
5286        to the length for BRAZERO and for all but the last repetition we must
5287        add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some
5288        paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
5289        a 64-bit integer type when available, otherwise double. */
5290
5291        if (lengthptr != NULL && repeat_max > 0)
5292          {
5293          int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
5294                      2 - 2*LINK_SIZE;   /* Last one doesn't nest */
5295          if ((INT64_OR_DOUBLE)repeat_max *
5296                (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
5297                  > (INT64_OR_DOUBLE)INT_MAX ||
5298              OFLOW_MAX - *lengthptr < delta)
5299            {
5300            *errorcodeptr = ERR20;
5301            goto FAILED;
5302            }
5303          *lengthptr += delta;
5304          }
5305
5306        /* This is compiling for real */
5307
5308        else for (i = repeat_max - 1; i >= 0; i--)
5309          {
5310          pcre_uchar *hc;
5311          pcre_uchar *this_hwm = cd->hwm;
5312
5313          *code++ = OP_BRAZERO + repeat_type;
5314
5315          /* All but the final copy start a new nesting, maintaining the
5316          chain of brackets outstanding. */
5317
5318          if (i != 0)
5319            {
5320            int offset;
5321            *code++ = OP_BRA;
5322            offset = (bralink == NULL)? 0 : (int)(code - bralink);
5323            bralink = code;
5324            PUTINC(code, 0, offset);
5325            }
5326
5327          memcpy(code, previous, IN_UCHARS(len));
5328
5329          /* Ensure there is enough workspace for forward references before
5330          copying them. */
5331
5332          while (cd->hwm > cd->start_workspace + cd->workspace_size -
5333                 WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5334            {
5335            int save_offset = save_hwm - cd->start_workspace;
5336            int this_offset = this_hwm - cd->start_workspace;
5337            *errorcodeptr = expand_workspace(cd);
5338            if (*errorcodeptr != 0) goto FAILED;
5339            save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5340            this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5341            }
5342
5343          for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5344            {
5345            PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
5346            cd->hwm += LINK_SIZE;
5347            }
5348          save_hwm = this_hwm;
5349          code += len;
5350          }
5351
5352        /* Now chain through the pending brackets, and fill in their length
5353        fields (which are holding the chain links pro tem). */
5354
5355        while (bralink != NULL)
5356          {
5357          int oldlinkoffset;
5358          int offset = (int)(code - bralink + 1);
5359          pcre_uchar *bra = code - offset;
5360          oldlinkoffset = GET(bra, 1);
5361          bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
5362          *code++ = OP_KET;
5363          PUTINC(code, 0, offset);
5364          PUT(bra, 1, offset);
5365          }
5366        }
5367
5368      /* If the maximum is unlimited, set a repeater in the final copy. For
5369      ONCE brackets, that's all we need to do. However, possessively repeated
5370      ONCE brackets can be converted into non-capturing brackets, as the
5371      behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
5372      deal with possessive ONCEs specially.
5373
5374      Otherwise, when we are doing the actual compile phase, check to see
5375      whether this group is one that could match an empty string. If so,
5376      convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
5377      that runtime checking can be done. [This check is also applied to ONCE
5378      groups at runtime, but in a different way.]
5379
5380      Then, if the quantifier was possessive and the bracket is not a
5381      conditional, we convert the BRA code to the POS form, and the KET code to
5382      KETRPOS. (It turns out to be convenient at runtime to detect this kind of
5383      subpattern at both the start and at the end.) The use of special opcodes
5384      makes it possible to reduce greatly the stack usage in pcre_exec(). If
5385      the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
5386
5387      Then, if the minimum number of matches is 1 or 0, cancel the possessive
5388      flag so that the default action below, of wrapping everything inside
5389      atomic brackets, does not happen. When the minimum is greater than 1,
5390      there will be earlier copies of the group, and so we still have to wrap
5391      the whole thing. */
5392
5393      else
5394        {
5395        pcre_uchar *ketcode = code - 1 - LINK_SIZE;
5396        pcre_uchar *bracode = ketcode - GET(ketcode, 1);
5397
5398        /* Convert possessive ONCE brackets to non-capturing */
5399
5400        if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
5401            possessive_quantifier) *bracode = OP_BRA;
5402
5403        /* For non-possessive ONCE brackets, all we need to do is to
5404        set the KET. */
5405
5406        if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
5407          *ketcode = OP_KETRMAX + repeat_type;
5408
5409        /* Handle non-ONCE brackets and possessive ONCEs (which have been
5410        converted to non-capturing above). */
5411
5412        else
5413          {
5414          /* In the compile phase, check for empty string matching. */
5415
5416          if (lengthptr == NULL)
5417            {
5418            pcre_uchar *scode = bracode;
5419            do
5420              {
5421              if (could_be_empty_branch(scode, ketcode, utf, cd))
5422                {
5423                *bracode += OP_SBRA - OP_BRA;
5424                break;
5425                }
5426              scode += GET(scode, 1);
5427              }
5428            while (*scode == OP_ALT);
5429            }
5430
5431          /* Handle possessive quantifiers. */
5432
5433          if (possessive_quantifier)
5434            {
5435            /* For COND brackets, we wrap the whole thing in a possessively
5436            repeated non-capturing bracket, because we have not invented POS
5437            versions of the COND opcodes. Because we are moving code along, we
5438            must ensure that any pending recursive references are updated. */
5439
5440            if (*bracode == OP_COND || *bracode == OP_SCOND)
5441              {
5442              int nlen = (int)(code - bracode);
5443              *code = OP_END;
5444              adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm);
5445              memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
5446              code += 1 + LINK_SIZE;
5447              nlen += 1 + LINK_SIZE;
5448              *bracode = OP_BRAPOS;
5449              *code++ = OP_KETRPOS;
5450              PUTINC(code, 0, nlen);
5451              PUT(bracode, 1, nlen);
5452              }
5453
5454            /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
5455
5456            else
5457              {
5458              *bracode += 1;              /* Switch to xxxPOS opcodes */
5459              *ketcode = OP_KETRPOS;
5460              }
5461
5462            /* If the minimum is zero, mark it as possessive, then unset the
5463            possessive flag when the minimum is 0 or 1. */
5464
5465            if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
5466            if (repeat_min < 2) possessive_quantifier = FALSE;
5467            }
5468
5469          /* Non-possessive quantifier */
5470
5471          else *ketcode = OP_KETRMAX + repeat_type;
5472          }
5473        }
5474      }
5475
5476    /* If previous is OP_FAIL, it was generated by an empty class [] in
5477    JavaScript mode. The other ways in which OP_FAIL can be generated, that is
5478    by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
5479    error above. We can just ignore the repeat in JS case. */
5480
5481    else if (*previous == OP_FAIL) goto END_REPEAT;
5482
5483    /* Else there's some kind of shambles */
5484
5485    else
5486      {
5487      *errorcodeptr = ERR11;
5488      goto FAILED;
5489      }
5490
5491    /* If the character following a repeat is '+', or if certain optimization
5492    tests above succeeded, possessive_quantifier is TRUE. For some opcodes,
5493    there are special alternative opcodes for this case. For anything else, we
5494    wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'
5495    notation is just syntactic sugar, taken from Sun's Java package, but the
5496    special opcodes can optimize it.
5497
5498    Some (but not all) possessively repeated subpatterns have already been
5499    completely handled in the code just above. For them, possessive_quantifier
5500    is always FALSE at this stage.
5501
5502    Note that the repeated item starts at tempcode, not at previous, which
5503    might be the first part of a string whose (former) last char we repeated.
5504
5505    Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
5506    an 'upto' may follow. We skip over an 'exact' item, and then test the
5507    length of what remains before proceeding. */
5508
5509    if (possessive_quantifier)
5510      {
5511      int len;
5512
5513      if (*tempcode == OP_TYPEEXACT)
5514        tempcode += PRIV(OP_lengths)[*tempcode] +
5515          ((tempcode[1 + IMM2_SIZE] == OP_PROP
5516          || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
5517
5518      else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
5519        {
5520        tempcode += PRIV(OP_lengths)[*tempcode];
5521#ifdef SUPPORT_UTF
5522        if (utf && HAS_EXTRALEN(tempcode[-1]))
5523          tempcode += GET_EXTRALEN(tempcode[-1]);
5524#endif
5525        }
5526
5527      len = (int)(code - tempcode);
5528      if (len > 0) switch (*tempcode)
5529        {
5530        case OP_STAR:  *tempcode = OP_POSSTAR; break;
5531        case OP_PLUS:  *tempcode = OP_POSPLUS; break;
5532        case OP_QUERY: *tempcode = OP_POSQUERY; break;
5533        case OP_UPTO:  *tempcode = OP_POSUPTO; break;
5534
5535        case OP_STARI:  *tempcode = OP_POSSTARI; break;
5536        case OP_PLUSI:  *tempcode = OP_POSPLUSI; break;
5537        case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
5538        case OP_UPTOI:  *tempcode = OP_POSUPTOI; break;
5539
5540        case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
5541        case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
5542        case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
5543        case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
5544
5545        case OP_NOTSTARI:  *tempcode = OP_NOTPOSSTARI; break;
5546        case OP_NOTPLUSI:  *tempcode = OP_NOTPOSPLUSI; break;
5547        case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
5548        case OP_NOTUPTOI:  *tempcode = OP_NOTPOSUPTOI; break;
5549
5550        case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
5551        case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
5552        case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
5553        case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
5554
5555        /* Because we are moving code along, we must ensure that any
5556        pending recursive references are updated. */
5557
5558        default:
5559        *code = OP_END;
5560        adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
5561        memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
5562        code += 1 + LINK_SIZE;
5563        len += 1 + LINK_SIZE;
5564        tempcode[0] = OP_ONCE;
5565        *code++ = OP_KET;
5566        PUTINC(code, 0, len);
5567        PUT(tempcode, 1, len);
5568        break;
5569        }
5570      }
5571
5572    /* In all cases we no longer have a previous item. We also set the
5573    "follows varying string" flag for subsequently encountered reqchars if
5574    it isn't already set and we have just passed a varying length item. */
5575
5576    END_REPEAT:
5577    previous = NULL;
5578    cd->req_varyopt |= reqvary;
5579    break;
5580
5581
5582    /* ===================================================================*/
5583    /* Start of nested parenthesized sub-expression, or comment or lookahead or
5584    lookbehind or option setting or condition or all the other extended
5585    parenthesis forms.  */
5586
5587    case CHAR_LEFT_PARENTHESIS:
5588    newoptions = options;
5589    skipbytes = 0;
5590    bravalue = OP_CBRA;
5591    save_hwm = cd->hwm;
5592    reset_bracount = FALSE;
5593
5594    /* First deal with various "verbs" that can be introduced by '*'. */
5595
5596    ptr++;
5597    if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
5598         || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
5599      {
5600      int i, namelen;
5601      int arglen = 0;
5602      const char *vn = verbnames;
5603      const pcre_uchar *name = ptr + 1;
5604      const pcre_uchar *arg = NULL;
5605      previous = NULL;
5606      ptr++;
5607      while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
5608      namelen = (int)(ptr - name);
5609
5610      /* It appears that Perl allows any characters whatsoever, other than
5611      a closing parenthesis, to appear in arguments, so we no longer insist on
5612      letters, digits, and underscores. */
5613
5614      if (*ptr == CHAR_COLON)
5615        {
5616        arg = ++ptr;
5617        while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
5618        arglen = (int)(ptr - arg);
5619        if (arglen > (int)MAX_MARK)
5620          {
5621          *errorcodeptr = ERR75;
5622          goto FAILED;
5623          }
5624        }
5625
5626      if (*ptr != CHAR_RIGHT_PARENTHESIS)
5627        {
5628        *errorcodeptr = ERR60;
5629        goto FAILED;
5630        }
5631
5632      /* Scan the table of verb names */
5633
5634      for (i = 0; i < verbcount; i++)
5635        {
5636        if (namelen == verbs[i].len &&
5637            STRNCMP_UC_C8(name, vn, namelen) == 0)
5638          {
5639          /* Check for open captures before ACCEPT and convert it to
5640          ASSERT_ACCEPT if in an assertion. */
5641
5642          if (verbs[i].op == OP_ACCEPT)
5643            {
5644            open_capitem *oc;
5645            if (arglen != 0)
5646              {
5647              *errorcodeptr = ERR59;
5648              goto FAILED;
5649              }
5650            cd->had_accept = TRUE;
5651            for (oc = cd->open_caps; oc != NULL; oc = oc->next)
5652              {
5653              *code++ = OP_CLOSE;
5654              PUT2INC(code, 0, oc->number);
5655              }
5656            *code++ = (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
5657
5658            /* Do not set firstchar after *ACCEPT */
5659            if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
5660            }
5661
5662          /* Handle other cases with/without an argument */
5663
5664          else if (arglen == 0)
5665            {
5666            if (verbs[i].op < 0)   /* Argument is mandatory */
5667              {
5668              *errorcodeptr = ERR66;
5669              goto FAILED;
5670              }
5671            *code = verbs[i].op;
5672            if (*code++ == OP_THEN) cd->external_flags |= PCRE_HASTHEN;
5673            }
5674
5675          else
5676            {
5677            if (verbs[i].op_arg < 0)   /* Argument is forbidden */
5678              {
5679              *errorcodeptr = ERR59;
5680              goto FAILED;
5681              }
5682            *code = verbs[i].op_arg;
5683            if (*code++ == OP_THEN_ARG) cd->external_flags |= PCRE_HASTHEN;
5684            *code++ = arglen;
5685            memcpy(code, arg, IN_UCHARS(arglen));
5686            code += arglen;
5687            *code++ = 0;
5688            }
5689
5690          break;  /* Found verb, exit loop */
5691          }
5692
5693        vn += verbs[i].len + 1;
5694        }
5695
5696      if (i < verbcount) continue;    /* Successfully handled a verb */
5697      *errorcodeptr = ERR60;          /* Verb not recognized */
5698      goto FAILED;
5699      }
5700
5701    /* Deal with the extended parentheses; all are introduced by '?', and the
5702    appearance of any of them means that this is not a capturing group. */
5703
5704    else if (*ptr == CHAR_QUESTION_MARK)
5705      {
5706      int i, set, unset, namelen;
5707      int *optset;
5708      const pcre_uchar *name;
5709      pcre_uchar *slot;
5710
5711      switch (*(++ptr))
5712        {
5713        case CHAR_NUMBER_SIGN:                 /* Comment; skip to ket */
5714        ptr++;
5715        while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
5716        if (*ptr == 0)
5717          {
5718          *errorcodeptr = ERR18;
5719          goto FAILED;
5720          }
5721        continue;
5722
5723
5724        /* ------------------------------------------------------------ */
5725        case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */
5726        reset_bracount = TRUE;
5727        /* Fall through */
5728
5729        /* ------------------------------------------------------------ */
5730        case CHAR_COLON:          /* Non-capturing bracket */
5731        bravalue = OP_BRA;
5732        ptr++;
5733        break;
5734
5735
5736        /* ------------------------------------------------------------ */
5737        case CHAR_LEFT_PARENTHESIS:
5738        bravalue = OP_COND;       /* Conditional group */
5739
5740        /* A condition can be an assertion, a number (referring to a numbered
5741        group), a name (referring to a named group), or 'R', referring to
5742        recursion. R<digits> and R&name are also permitted for recursion tests.
5743
5744        There are several syntaxes for testing a named group: (?(name)) is used
5745        by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
5746
5747        There are two unfortunate ambiguities, caused by history. (a) 'R' can
5748        be the recursive thing or the name 'R' (and similarly for 'R' followed
5749        by digits), and (b) a number could be a name that consists of digits.
5750        In both cases, we look for a name first; if not found, we try the other
5751        cases. */
5752
5753        /* For conditions that are assertions, check the syntax, and then exit
5754        the switch. This will take control down to where bracketed groups,
5755        including assertions, are processed. */
5756
5757        if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN ||
5758            ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN))
5759          break;
5760
5761        /* Most other conditions use OP_CREF (a couple change to OP_RREF
5762        below), and all need to skip 1+IMM2_SIZE bytes at the start of the group. */
5763
5764        code[1+LINK_SIZE] = OP_CREF;
5765        skipbytes = 1+IMM2_SIZE;
5766        refsign = -1;
5767
5768        /* Check for a test for recursion in a named group. */
5769
5770        if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
5771          {
5772          terminator = -1;
5773          ptr += 2;
5774          code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
5775          }
5776
5777        /* Check for a test for a named group's having been set, using the Perl
5778        syntax (?(<name>) or (?('name') */
5779
5780        else if (ptr[1] == CHAR_LESS_THAN_SIGN)
5781          {
5782          terminator = CHAR_GREATER_THAN_SIGN;
5783          ptr++;
5784          }
5785        else if (ptr[1] == CHAR_APOSTROPHE)
5786          {
5787          terminator = CHAR_APOSTROPHE;
5788          ptr++;
5789          }
5790        else
5791          {
5792          terminator = 0;
5793          if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
5794          }
5795
5796        /* We now expect to read a name; anything else is an error */
5797
5798        if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)
5799          {
5800          ptr += 1;  /* To get the right offset */
5801          *errorcodeptr = ERR28;
5802          goto FAILED;
5803          }
5804
5805        /* Read the name, but also get it as a number if it's all digits */
5806
5807        recno = 0;
5808        name = ++ptr;
5809        while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
5810          {
5811          if (recno >= 0)
5812            recno = (IS_DIGIT(*ptr))? recno * 10 + *ptr - CHAR_0 : -1;
5813          ptr++;
5814          }
5815        namelen = (int)(ptr - name);
5816
5817        if ((terminator > 0 && *ptr++ != terminator) ||
5818            *ptr++ != CHAR_RIGHT_PARENTHESIS)
5819          {
5820          ptr--;      /* Error offset */
5821          *errorcodeptr = ERR26;
5822          goto FAILED;
5823          }
5824
5825        /* Do no further checking in the pre-compile phase. */
5826
5827        if (lengthptr != NULL) break;
5828
5829        /* In the real compile we do the work of looking for the actual
5830        reference. If the string started with "+" or "-" we require the rest to
5831        be digits, in which case recno will be set. */
5832
5833        if (refsign > 0)
5834          {
5835          if (recno <= 0)
5836            {
5837            *errorcodeptr = ERR58;
5838            goto FAILED;
5839            }
5840          recno = (refsign == CHAR_MINUS)?
5841            cd->bracount - recno + 1 : recno +cd->bracount;
5842          if (recno <= 0 || recno > cd->final_bracount)
5843            {
5844            *errorcodeptr = ERR15;
5845            goto FAILED;
5846            }
5847          PUT2(code, 2+LINK_SIZE, recno);
5848          break;
5849          }
5850
5851        /* Otherwise (did not start with "+" or "-"), start by looking for the
5852        name. If we find a name, add one to the opcode to change OP_CREF or
5853        OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,
5854        except they record that the reference was originally to a name. The
5855        information is used to check duplicate names. */
5856
5857        slot = cd->name_table;
5858        for (i = 0; i < cd->names_found; i++)
5859          {
5860          if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0) break;
5861          slot += cd->name_entry_size;
5862          }
5863
5864        /* Found a previous named subpattern */
5865
5866        if (i < cd->names_found)
5867          {
5868          recno = GET2(slot, 0);
5869          PUT2(code, 2+LINK_SIZE, recno);
5870          code[1+LINK_SIZE]++;
5871          }
5872
5873        /* Search the pattern for a forward reference */
5874
5875        else if ((i = find_parens(cd, name, namelen,
5876                        (options & PCRE_EXTENDED) != 0, utf)) > 0)
5877          {
5878          PUT2(code, 2+LINK_SIZE, i);
5879          code[1+LINK_SIZE]++;
5880          }
5881
5882        /* If terminator == 0 it means that the name followed directly after
5883        the opening parenthesis [e.g. (?(abc)...] and in this case there are
5884        some further alternatives to try. For the cases where terminator != 0
5885        [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
5886        now checked all the possibilities, so give an error. */
5887
5888        else if (terminator != 0)
5889          {
5890          *errorcodeptr = ERR15;
5891          goto FAILED;
5892          }
5893
5894        /* Check for (?(R) for recursion. Allow digits after R to specify a
5895        specific group number. */
5896
5897        else if (*name == CHAR_R)
5898          {
5899          recno = 0;
5900          for (i = 1; i < namelen; i++)
5901            {
5902            if (!IS_DIGIT(name[i]))
5903              {
5904              *errorcodeptr = ERR15;
5905              goto FAILED;
5906              }
5907            recno = recno * 10 + name[i] - CHAR_0;
5908            }
5909          if (recno == 0) recno = RREF_ANY;
5910          code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
5911          PUT2(code, 2+LINK_SIZE, recno);
5912          }
5913
5914        /* Similarly, check for the (?(DEFINE) "condition", which is always
5915        false. */
5916
5917        else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0)
5918          {
5919          code[1+LINK_SIZE] = OP_DEF;
5920          skipbytes = 1;
5921          }
5922
5923        /* Check for the "name" actually being a subpattern number. We are
5924        in the second pass here, so final_bracount is set. */
5925
5926        else if (recno > 0 && recno <= cd->final_bracount)
5927          {
5928          PUT2(code, 2+LINK_SIZE, recno);
5929          }
5930
5931        /* Either an unidentified subpattern, or a reference to (?(0) */
5932
5933        else
5934          {
5935          *errorcodeptr = (recno == 0)? ERR35: ERR15;
5936          goto FAILED;
5937          }
5938        break;
5939
5940
5941        /* ------------------------------------------------------------ */
5942        case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
5943        bravalue = OP_ASSERT;
5944        cd->assert_depth += 1;
5945        ptr++;
5946        break;
5947
5948
5949        /* ------------------------------------------------------------ */
5950        case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
5951        ptr++;
5952        if (*ptr == CHAR_RIGHT_PARENTHESIS)    /* Optimize (?!) */
5953          {
5954          *code++ = OP_FAIL;
5955          previous = NULL;
5956          continue;
5957          }
5958        bravalue = OP_ASSERT_NOT;
5959        cd->assert_depth += 1;
5960        break;
5961
5962
5963        /* ------------------------------------------------------------ */
5964        case CHAR_LESS_THAN_SIGN:              /* Lookbehind or named define */
5965        switch (ptr[1])
5966          {
5967          case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
5968          bravalue = OP_ASSERTBACK;
5969          cd->assert_depth += 1;
5970          ptr += 2;
5971          break;
5972
5973          case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
5974          bravalue = OP_ASSERTBACK_NOT;
5975          cd->assert_depth += 1;
5976          ptr += 2;
5977          break;
5978
5979          default:                /* Could be name define, else bad */
5980          if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
5981            goto DEFINE_NAME;
5982          ptr++;                  /* Correct offset for error */
5983          *errorcodeptr = ERR24;
5984          goto FAILED;
5985          }
5986        break;
5987
5988
5989        /* ------------------------------------------------------------ */
5990        case CHAR_GREATER_THAN_SIGN:           /* One-time brackets */
5991        bravalue = OP_ONCE;
5992        ptr++;
5993        break;
5994
5995
5996        /* ------------------------------------------------------------ */
5997        case CHAR_C:                 /* Callout - may be followed by digits; */
5998        previous_callout = code;     /* Save for later completion */
5999        after_manual_callout = 1;    /* Skip one item before completing */
6000        *code++ = OP_CALLOUT;
6001          {
6002          int n = 0;
6003          ptr++;
6004          while(IS_DIGIT(*ptr))
6005            n = n * 10 + *ptr++ - CHAR_0;
6006          if (*ptr != CHAR_RIGHT_PARENTHESIS)
6007            {
6008            *errorcodeptr = ERR39;
6009            goto FAILED;
6010            }
6011          if (n > 255)
6012            {
6013            *errorcodeptr = ERR38;
6014            goto FAILED;
6015            }
6016          *code++ = n;
6017          PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
6018          PUT(code, LINK_SIZE, 0);                          /* Default length */
6019          code += 2 * LINK_SIZE;
6020          }
6021        previous = NULL;
6022        continue;
6023
6024
6025        /* ------------------------------------------------------------ */
6026        case CHAR_P:              /* Python-style named subpattern handling */
6027        if (*(++ptr) == CHAR_EQUALS_SIGN ||
6028            *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */
6029          {
6030          is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
6031          terminator = CHAR_RIGHT_PARENTHESIS;
6032          goto NAMED_REF_OR_RECURSE;
6033          }
6034        else if (*ptr != CHAR_LESS_THAN_SIGN)  /* Test for Python-style defn */
6035          {
6036          *errorcodeptr = ERR41;
6037          goto FAILED;
6038          }
6039        /* Fall through to handle (?P< as (?< is handled */
6040
6041
6042        /* ------------------------------------------------------------ */
6043        DEFINE_NAME:    /* Come here from (?< handling */
6044        case CHAR_APOSTROPHE:
6045          {
6046          terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
6047            CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
6048          name = ++ptr;
6049
6050          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6051          namelen = (int)(ptr - name);
6052
6053          /* In the pre-compile phase, just do a syntax check. */
6054
6055          if (lengthptr != NULL)
6056            {
6057            if (*ptr != terminator)
6058              {
6059              *errorcodeptr = ERR42;
6060              goto FAILED;
6061              }
6062            if (cd->names_found >= MAX_NAME_COUNT)
6063              {
6064              *errorcodeptr = ERR49;
6065              goto FAILED;
6066              }
6067            if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
6068              {
6069              cd->name_entry_size = namelen + IMM2_SIZE + 1;
6070              if (namelen > MAX_NAME_SIZE)
6071                {
6072                *errorcodeptr = ERR48;
6073                goto FAILED;
6074                }
6075              }
6076            }
6077
6078          /* In the real compile, create the entry in the table, maintaining
6079          alphabetical order. Duplicate names for different numbers are
6080          permitted only if PCRE_DUPNAMES is set. Duplicate names for the same
6081          number are always OK. (An existing number can be re-used if (?|
6082          appears in the pattern.) In either event, a duplicate name results in
6083          a duplicate entry in the table, even if the number is the same. This
6084          is because the number of names, and hence the table size, is computed
6085          in the pre-compile, and it affects various numbers and pointers which
6086          would all have to be modified, and the compiled code moved down, if
6087          duplicates with the same number were omitted from the table. This
6088          doesn't seem worth the hassle. However, *different* names for the
6089          same number are not permitted. */
6090
6091          else
6092            {
6093            BOOL dupname = FALSE;
6094            slot = cd->name_table;
6095
6096            for (i = 0; i < cd->names_found; i++)
6097              {
6098              int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(namelen));
6099              if (crc == 0)
6100                {
6101                if (slot[IMM2_SIZE+namelen] == 0)
6102                  {
6103                  if (GET2(slot, 0) != cd->bracount + 1 &&
6104                      (options & PCRE_DUPNAMES) == 0)
6105                    {
6106                    *errorcodeptr = ERR43;
6107                    goto FAILED;
6108                    }
6109                  else dupname = TRUE;
6110                  }
6111                else crc = -1;      /* Current name is a substring */
6112                }
6113
6114              /* Make space in the table and break the loop for an earlier
6115              name. For a duplicate or later name, carry on. We do this for
6116              duplicates so that in the simple case (when (?| is not used) they
6117              are in order of their numbers. */
6118
6119              if (crc < 0)
6120                {
6121                memmove(slot + cd->name_entry_size, slot,
6122                  IN_UCHARS((cd->names_found - i) * cd->name_entry_size));
6123                break;
6124                }
6125
6126              /* Continue the loop for a later or duplicate name */
6127
6128              slot += cd->name_entry_size;
6129              }
6130
6131            /* For non-duplicate names, check for a duplicate number before
6132            adding the new name. */
6133
6134            if (!dupname)
6135              {
6136              pcre_uchar *cslot = cd->name_table;
6137              for (i = 0; i < cd->names_found; i++)
6138                {
6139                if (cslot != slot)
6140                  {
6141                  if (GET2(cslot, 0) == cd->bracount + 1)
6142                    {
6143                    *errorcodeptr = ERR65;
6144                    goto FAILED;
6145                    }
6146                  }
6147                else i--;
6148                cslot += cd->name_entry_size;
6149                }
6150              }
6151
6152            PUT2(slot, 0, cd->bracount + 1);
6153            memcpy(slot + IMM2_SIZE, name, IN_UCHARS(namelen));
6154            slot[IMM2_SIZE + namelen] = 0;
6155            }
6156          }
6157
6158        /* In both pre-compile and compile, count the number of names we've
6159        encountered. */
6160
6161        cd->names_found++;
6162        ptr++;                    /* Move past > or ' */
6163        goto NUMBERED_GROUP;
6164
6165
6166        /* ------------------------------------------------------------ */
6167        case CHAR_AMPERSAND:            /* Perl recursion/subroutine syntax */
6168        terminator = CHAR_RIGHT_PARENTHESIS;
6169        is_recurse = TRUE;
6170        /* Fall through */
6171
6172        /* We come here from the Python syntax above that handles both
6173        references (?P=name) and recursion (?P>name), as well as falling
6174        through from the Perl recursion syntax (?&name). We also come here from
6175        the Perl \k<name> or \k'name' back reference syntax and the \k{name}
6176        .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
6177
6178        NAMED_REF_OR_RECURSE:
6179        name = ++ptr;
6180        while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6181        namelen = (int)(ptr - name);
6182
6183        /* In the pre-compile phase, do a syntax check. We used to just set
6184        a dummy reference number, because it was not used in the first pass.
6185        However, with the change of recursive back references to be atomic,
6186        we have to look for the number so that this state can be identified, as
6187        otherwise the incorrect length is computed. If it's not a backwards
6188        reference, the dummy number will do. */
6189
6190        if (lengthptr != NULL)
6191          {
6192          const pcre_uchar *temp;
6193
6194          if (namelen == 0)
6195            {
6196            *errorcodeptr = ERR62;
6197            goto FAILED;
6198            }
6199          if (*ptr != terminator)
6200            {
6201            *errorcodeptr = ERR42;
6202            goto FAILED;
6203            }
6204          if (namelen > MAX_NAME_SIZE)
6205            {
6206            *errorcodeptr = ERR48;
6207            goto FAILED;
6208            }
6209
6210          /* The name table does not exist in the first pass, so we cannot
6211          do a simple search as in the code below. Instead, we have to scan the
6212          pattern to find the number. It is important that we scan it only as
6213          far as we have got because the syntax of named subpatterns has not
6214          been checked for the rest of the pattern, and find_parens() assumes
6215          correct syntax. In any case, it's a waste of resources to scan
6216          further. We stop the scan at the current point by temporarily
6217          adjusting the value of cd->end_pattern. */
6218
6219          temp = cd->end_pattern;
6220          cd->end_pattern = ptr;
6221          recno = find_parens(cd, name, namelen,
6222            (options & PCRE_EXTENDED) != 0, utf);
6223          cd->end_pattern = temp;
6224          if (recno < 0) recno = 0;    /* Forward ref; set dummy number */
6225          }
6226
6227        /* In the real compile, seek the name in the table. We check the name
6228        first, and then check that we have reached the end of the name in the
6229        table. That way, if the name is longer than any in the table,
6230        the comparison will fail without reading beyond the table entry. */
6231
6232        else
6233          {
6234          slot = cd->name_table;
6235          for (i = 0; i < cd->names_found; i++)
6236            {
6237            if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
6238                slot[IMM2_SIZE+namelen] == 0)
6239              break;
6240            slot += cd->name_entry_size;
6241            }
6242
6243          if (i < cd->names_found)         /* Back reference */
6244            {
6245            recno = GET2(slot, 0);
6246            }
6247          else if ((recno =                /* Forward back reference */
6248                    find_parens(cd, name, namelen,
6249                      (options & PCRE_EXTENDED) != 0, utf)) <= 0)
6250            {
6251            *errorcodeptr = ERR15;
6252            goto FAILED;
6253            }
6254          }
6255
6256        /* In both phases, we can now go to the code that handles numerical
6257        recursion or backreferences. */
6258
6259        if (is_recurse) goto HANDLE_RECURSION;
6260          else goto HANDLE_REFERENCE;
6261
6262
6263        /* ------------------------------------------------------------ */
6264        case CHAR_R:              /* Recursion */
6265        ptr++;                    /* Same as (?0)      */
6266        /* Fall through */
6267
6268
6269        /* ------------------------------------------------------------ */
6270        case CHAR_MINUS: case CHAR_PLUS:  /* Recursion or subroutine */
6271        case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
6272        case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
6273          {
6274          const pcre_uchar *called;
6275          terminator = CHAR_RIGHT_PARENTHESIS;
6276
6277          /* Come here from the \g<...> and \g'...' code (Oniguruma
6278          compatibility). However, the syntax has been checked to ensure that
6279          the ... are a (signed) number, so that neither ERR63 nor ERR29 will
6280          be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
6281          ever be taken. */
6282
6283          HANDLE_NUMERICAL_RECURSION:
6284
6285          if ((refsign = *ptr) == CHAR_PLUS)
6286            {
6287            ptr++;
6288            if (!IS_DIGIT(*ptr))
6289              {
6290              *errorcodeptr = ERR63;
6291              goto FAILED;
6292              }
6293            }
6294          else if (refsign == CHAR_MINUS)
6295            {
6296            if (!IS_DIGIT(ptr[1]))
6297              goto OTHER_CHAR_AFTER_QUERY;
6298            ptr++;
6299            }
6300
6301          recno = 0;
6302          while(IS_DIGIT(*ptr))
6303            recno = recno * 10 + *ptr++ - CHAR_0;
6304
6305          if (*ptr != terminator)
6306            {
6307            *errorcodeptr = ERR29;
6308            goto FAILED;
6309            }
6310
6311          if (refsign == CHAR_MINUS)
6312            {
6313            if (recno == 0)
6314              {
6315              *errorcodeptr = ERR58;
6316              goto FAILED;
6317              }
6318            recno = cd->bracount - recno + 1;
6319            if (recno <= 0)
6320              {
6321              *errorcodeptr = ERR15;
6322              goto FAILED;
6323              }
6324            }
6325          else if (refsign == CHAR_PLUS)
6326            {
6327            if (recno == 0)
6328              {
6329              *errorcodeptr = ERR58;
6330              goto FAILED;
6331              }
6332            recno += cd->bracount;
6333            }
6334
6335          /* Come here from code above that handles a named recursion */
6336
6337          HANDLE_RECURSION:
6338
6339          previous = code;
6340          called = cd->start_code;
6341
6342          /* When we are actually compiling, find the bracket that is being
6343          referenced. Temporarily end the regex in case it doesn't exist before
6344          this point. If we end up with a forward reference, first check that
6345          the bracket does occur later so we can give the error (and position)
6346          now. Then remember this forward reference in the workspace so it can
6347          be filled in at the end. */
6348
6349          if (lengthptr == NULL)
6350            {
6351            *code = OP_END;
6352            if (recno != 0)
6353              called = PRIV(find_bracket)(cd->start_code, utf, recno);
6354
6355            /* Forward reference */
6356
6357            if (called == NULL)
6358              {
6359              if (find_parens(cd, NULL, recno,
6360                    (options & PCRE_EXTENDED) != 0, utf) < 0)
6361                {
6362                *errorcodeptr = ERR15;
6363                goto FAILED;
6364                }
6365
6366              /* Fudge the value of "called" so that when it is inserted as an
6367              offset below, what is actually inserted is the reference number
6368              of the group. Then remember the forward reference. */
6369
6370              called = cd->start_code + recno;
6371              if (cd->hwm >= cd->start_workspace + cd->workspace_size -
6372                  WORK_SIZE_SAFETY_MARGIN)
6373                {
6374                *errorcodeptr = expand_workspace(cd);
6375                if (*errorcodeptr != 0) goto FAILED;
6376                }
6377              PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
6378              }
6379
6380            /* If not a forward reference, and the subpattern is still open,
6381            this is a recursive call. We check to see if this is a left
6382            recursion that could loop for ever, and diagnose that case. We
6383            must not, however, do this check if we are in a conditional
6384            subpattern because the condition might be testing for recursion in
6385            a pattern such as /(?(R)a+|(?R)b)/, which is perfectly valid.
6386            Forever loops are also detected at runtime, so those that occur in
6387            conditional subpatterns will be picked up then. */
6388
6389            else if (GET(called, 1) == 0 && cond_depth <= 0 &&
6390                     could_be_empty(called, code, bcptr, utf, cd))
6391              {
6392              *errorcodeptr = ERR40;
6393              goto FAILED;
6394              }
6395            }
6396
6397          /* Insert the recursion/subroutine item. It does not have a set first
6398          character (relevant if it is repeated, because it will then be
6399          wrapped with ONCE brackets). */
6400
6401          *code = OP_RECURSE;
6402          PUT(code, 1, (int)(called - cd->start_code));
6403          code += 1 + LINK_SIZE;
6404          groupsetfirstchar = FALSE;
6405          }
6406
6407        /* Can't determine a first byte now */
6408
6409        if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
6410        continue;
6411
6412
6413        /* ------------------------------------------------------------ */
6414        default:              /* Other characters: check option setting */
6415        OTHER_CHAR_AFTER_QUERY:
6416        set = unset = 0;
6417        optset = &set;
6418
6419        while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
6420          {
6421          switch (*ptr++)
6422            {
6423            case CHAR_MINUS: optset = &unset; break;
6424
6425            case CHAR_J:    /* Record that it changed in the external options */
6426            *optset |= PCRE_DUPNAMES;
6427            cd->external_flags |= PCRE_JCHANGED;
6428            break;
6429
6430            case CHAR_i: *optset |= PCRE_CASELESS; break;
6431            case CHAR_m: *optset |= PCRE_MULTILINE; break;
6432            case CHAR_s: *optset |= PCRE_DOTALL; break;
6433            case CHAR_x: *optset |= PCRE_EXTENDED; break;
6434            case CHAR_U: *optset |= PCRE_UNGREEDY; break;
6435            case CHAR_X: *optset |= PCRE_EXTRA; break;
6436
6437            default:  *errorcodeptr = ERR12;
6438                      ptr--;    /* Correct the offset */
6439                      goto FAILED;
6440            }
6441          }
6442
6443        /* Set up the changed option bits, but don't change anything yet. */
6444
6445        newoptions = (options | set) & (~unset);
6446
6447        /* If the options ended with ')' this is not the start of a nested
6448        group with option changes, so the options change at this level. If this
6449        item is right at the start of the pattern, the options can be
6450        abstracted and made external in the pre-compile phase, and ignored in
6451        the compile phase. This can be helpful when matching -- for instance in
6452        caseless checking of required bytes.
6453
6454        If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
6455        definitely *not* at the start of the pattern because something has been
6456        compiled. In the pre-compile phase, however, the code pointer can have
6457        that value after the start, because it gets reset as code is discarded
6458        during the pre-compile. However, this can happen only at top level - if
6459        we are within parentheses, the starting BRA will still be present. At
6460        any parenthesis level, the length value can be used to test if anything
6461        has been compiled at that level. Thus, a test for both these conditions
6462        is necessary to ensure we correctly detect the start of the pattern in
6463        both phases.
6464
6465        If we are not at the pattern start, reset the greedy defaults and the
6466        case value for firstchar and reqchar. */
6467
6468        if (*ptr == CHAR_RIGHT_PARENTHESIS)
6469          {
6470          if (code == cd->start_code + 1 + LINK_SIZE &&
6471               (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
6472            {
6473            cd->external_options = newoptions;
6474            }
6475          else
6476            {
6477            greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
6478            greedy_non_default = greedy_default ^ 1;
6479            req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
6480            }
6481
6482          /* Change options at this level, and pass them back for use
6483          in subsequent branches. */
6484
6485          *optionsptr = options = newoptions;
6486          previous = NULL;       /* This item can't be repeated */
6487          continue;              /* It is complete */
6488          }
6489
6490        /* If the options ended with ':' we are heading into a nested group
6491        with possible change of options. Such groups are non-capturing and are
6492        not assertions of any kind. All we need to do is skip over the ':';
6493        the newoptions value is handled below. */
6494
6495        bravalue = OP_BRA;
6496        ptr++;
6497        }     /* End of switch for character following (? */
6498      }       /* End of (? handling */
6499
6500    /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
6501    is set, all unadorned brackets become non-capturing and behave like (?:...)
6502    brackets. */
6503
6504    else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
6505      {
6506      bravalue = OP_BRA;
6507      }
6508
6509    /* Else we have a capturing group. */
6510
6511    else
6512      {
6513      NUMBERED_GROUP:
6514      cd->bracount += 1;
6515      PUT2(code, 1+LINK_SIZE, cd->bracount);
6516      skipbytes = IMM2_SIZE;
6517      }
6518
6519    /* Process nested bracketed regex. Assertions used not to be repeatable,
6520    but this was changed for Perl compatibility, so all kinds can now be
6521    repeated. We copy code into a non-register variable (tempcode) in order to
6522    be able to pass its address because some compilers complain otherwise. */
6523
6524    previous = code;                      /* For handling repetition */
6525    *code = bravalue;
6526    tempcode = code;
6527    tempreqvary = cd->req_varyopt;        /* Save value before bracket */
6528    tempbracount = cd->bracount;          /* Save value before bracket */
6529    length_prevgroup = 0;                 /* Initialize for pre-compile phase */
6530
6531    if (!compile_regex(
6532         newoptions,                      /* The complete new option state */
6533         &tempcode,                       /* Where to put code (updated) */
6534         &ptr,                            /* Input pointer (updated) */
6535         errorcodeptr,                    /* Where to put an error message */
6536         (bravalue == OP_ASSERTBACK ||
6537          bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
6538         reset_bracount,                  /* True if (?| group */
6539         skipbytes,                       /* Skip over bracket number */
6540         cond_depth +
6541           ((bravalue == OP_COND)?1:0),   /* Depth of condition subpatterns */
6542         &subfirstchar,                   /* For possible first char */
6543         &subreqchar,                     /* For possible last char */
6544         bcptr,                           /* Current branch chain */
6545         cd,                              /* Tables block */
6546         (lengthptr == NULL)? NULL :      /* Actual compile phase */
6547           &length_prevgroup              /* Pre-compile phase */
6548         ))
6549      goto FAILED;
6550
6551    /* If this was an atomic group and there are no capturing groups within it,
6552    generate OP_ONCE_NC instead of OP_ONCE. */
6553
6554    if (bravalue == OP_ONCE && cd->bracount <= tempbracount)
6555      *code = OP_ONCE_NC;
6556
6557    if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
6558      cd->assert_depth -= 1;
6559
6560    /* At the end of compiling, code is still pointing to the start of the
6561    group, while tempcode has been updated to point past the end of the group.
6562    The pattern pointer (ptr) is on the bracket.
6563
6564    If this is a conditional bracket, check that there are no more than
6565    two branches in the group, or just one if it's a DEFINE group. We do this
6566    in the real compile phase, not in the pre-pass, where the whole group may
6567    not be available. */
6568
6569    if (bravalue == OP_COND && lengthptr == NULL)
6570      {
6571      pcre_uchar *tc = code;
6572      int condcount = 0;
6573
6574      do {
6575         condcount++;
6576         tc += GET(tc,1);
6577         }
6578      while (*tc != OP_KET);
6579
6580      /* A DEFINE group is never obeyed inline (the "condition" is always
6581      false). It must have only one branch. */
6582
6583      if (code[LINK_SIZE+1] == OP_DEF)
6584        {
6585        if (condcount > 1)
6586          {
6587          *errorcodeptr = ERR54;
6588          goto FAILED;
6589          }
6590        bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
6591        }
6592
6593      /* A "normal" conditional group. If there is just one branch, we must not
6594      make use of its firstchar or reqchar, because this is equivalent to an
6595      empty second branch. */
6596
6597      else
6598        {
6599        if (condcount > 2)
6600          {
6601          *errorcodeptr = ERR27;
6602          goto FAILED;
6603          }
6604        if (condcount == 1) subfirstchar = subreqchar = REQ_NONE;
6605        }
6606      }
6607
6608    /* Error if hit end of pattern */
6609
6610    if (*ptr != CHAR_RIGHT_PARENTHESIS)
6611      {
6612      *errorcodeptr = ERR14;
6613      goto FAILED;
6614      }
6615
6616    /* In the pre-compile phase, update the length by the length of the group,
6617    less the brackets at either end. Then reduce the compiled code to just a
6618    set of non-capturing brackets so that it doesn't use much memory if it is
6619    duplicated by a quantifier.*/
6620
6621    if (lengthptr != NULL)
6622      {
6623      if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
6624        {
6625        *errorcodeptr = ERR20;
6626        goto FAILED;
6627        }
6628      *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
6629      code++;   /* This already contains bravalue */
6630      PUTINC(code, 0, 1 + LINK_SIZE);
6631      *code++ = OP_KET;
6632      PUTINC(code, 0, 1 + LINK_SIZE);
6633      break;    /* No need to waste time with special character handling */
6634      }
6635
6636    /* Otherwise update the main code pointer to the end of the group. */
6637
6638    code = tempcode;
6639
6640    /* For a DEFINE group, required and first character settings are not
6641    relevant. */
6642
6643    if (bravalue == OP_DEF) break;
6644
6645    /* Handle updating of the required and first characters for other types of
6646    group. Update for normal brackets of all kinds, and conditions with two
6647    branches (see code above). If the bracket is followed by a quantifier with
6648    zero repeat, we have to back off. Hence the definition of zeroreqchar and
6649    zerofirstchar outside the main loop so that they can be accessed for the
6650    back off. */
6651
6652    zeroreqchar = reqchar;
6653    zerofirstchar = firstchar;
6654    groupsetfirstchar = FALSE;
6655
6656    if (bravalue >= OP_ONCE)
6657      {
6658      /* If we have not yet set a firstchar in this branch, take it from the
6659      subpattern, remembering that it was set here so that a repeat of more
6660      than one can replicate it as reqchar if necessary. If the subpattern has
6661      no firstchar, set "none" for the whole branch. In both cases, a zero
6662      repeat forces firstchar to "none". */
6663
6664      if (firstchar == REQ_UNSET)
6665        {
6666        if (subfirstchar >= 0)
6667          {
6668          firstchar = subfirstchar;
6669          groupsetfirstchar = TRUE;
6670          }
6671        else firstchar = REQ_NONE;
6672        zerofirstchar = REQ_NONE;
6673        }
6674
6675      /* If firstchar was previously set, convert the subpattern's firstchar
6676      into reqchar if there wasn't one, using the vary flag that was in
6677      existence beforehand. */
6678
6679      else if (subfirstchar >= 0 && subreqchar < 0)
6680        subreqchar = subfirstchar | tempreqvary;
6681
6682      /* If the subpattern set a required byte (or set a first byte that isn't
6683      really the first byte - see above), set it. */
6684
6685      if (subreqchar >= 0) reqchar = subreqchar;
6686      }
6687
6688    /* For a forward assertion, we take the reqchar, if set. This can be
6689    helpful if the pattern that follows the assertion doesn't set a different
6690    char. For example, it's useful for /(?=abcde).+/. We can't set firstchar
6691    for an assertion, however because it leads to incorrect effect for patterns
6692    such as /(?=a)a.+/ when the "real" "a" would then become a reqchar instead
6693    of a firstchar. This is overcome by a scan at the end if there's no
6694    firstchar, looking for an asserted first char. */
6695
6696    else if (bravalue == OP_ASSERT && subreqchar >= 0) reqchar = subreqchar;
6697    break;     /* End of processing '(' */
6698
6699
6700    /* ===================================================================*/
6701    /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
6702    are arranged to be the negation of the corresponding OP_values in the
6703    default case when PCRE_UCP is not set. For the back references, the values
6704    are ESC_REF plus the reference number. Only back references and those types
6705    that consume a character may be repeated. We can test for values between
6706    ESC_b and ESC_Z for the latter; this may have to change if any new ones are
6707    ever created. */
6708
6709    case CHAR_BACKSLASH:
6710    tempptr = ptr;
6711    c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
6712    if (*errorcodeptr != 0) goto FAILED;
6713
6714    if (c < 0)
6715      {
6716      if (-c == ESC_Q)            /* Handle start of quoted string */
6717        {
6718        if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
6719          ptr += 2;               /* avoid empty string */
6720            else inescq = TRUE;
6721        continue;
6722        }
6723
6724      if (-c == ESC_E) continue;  /* Perl ignores an orphan \E */
6725
6726      /* For metasequences that actually match a character, we disable the
6727      setting of a first character if it hasn't already been set. */
6728
6729      if (firstchar == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
6730        firstchar = REQ_NONE;
6731
6732      /* Set values to reset to if this is followed by a zero repeat. */
6733
6734      zerofirstchar = firstchar;
6735      zeroreqchar = reqchar;
6736
6737      /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
6738      is a subroutine call by number (Oniguruma syntax). In fact, the value
6739      -ESC_g is returned only for these cases. So we don't need to check for <
6740      or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
6741      -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
6742      that is a synonym for a named back reference). */
6743
6744      if (-c == ESC_g)
6745        {
6746        const pcre_uchar *p;
6747        save_hwm = cd->hwm;   /* Normally this is set when '(' is read */
6748        terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
6749          CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
6750
6751        /* These two statements stop the compiler from warning about possibly
6752        unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
6753        fact, because we actually check for a number below, the paths that
6754        would actually be in error are never taken. */
6755
6756        skipbytes = 0;
6757        reset_bracount = FALSE;
6758
6759        /* Test for a name */
6760
6761        if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)
6762          {
6763          BOOL is_a_number = TRUE;
6764          for (p = ptr + 1; *p != 0 && *p != terminator; p++)
6765            {
6766            if (!MAX_255(*p)) { is_a_number = FALSE; break; }
6767            if ((cd->ctypes[*p] & ctype_digit) == 0) is_a_number = FALSE;
6768            if ((cd->ctypes[*p] & ctype_word) == 0) break;
6769            }
6770          if (*p != terminator)
6771            {
6772            *errorcodeptr = ERR57;
6773            break;
6774            }
6775          if (is_a_number)
6776            {
6777            ptr++;
6778            goto HANDLE_NUMERICAL_RECURSION;
6779            }
6780          is_recurse = TRUE;
6781          goto NAMED_REF_OR_RECURSE;
6782          }
6783
6784        /* Test a signed number in angle brackets or quotes. */
6785
6786        p = ptr + 2;
6787        while (IS_DIGIT(*p)) p++;
6788        if (*p != terminator)
6789          {
6790          *errorcodeptr = ERR57;
6791          break;
6792          }
6793        ptr++;
6794        goto HANDLE_NUMERICAL_RECURSION;
6795        }
6796
6797      /* \k<name> or \k'name' is a back reference by name (Perl syntax).
6798      We also support \k{name} (.NET syntax).  */
6799
6800      if (-c == ESC_k)
6801        {
6802        if ((ptr[1] != CHAR_LESS_THAN_SIGN &&
6803          ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
6804          {
6805          *errorcodeptr = ERR69;
6806          break;
6807          }
6808        is_recurse = FALSE;
6809        terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
6810          CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
6811          CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
6812        goto NAMED_REF_OR_RECURSE;
6813        }
6814
6815      /* Back references are handled specially; must disable firstchar if
6816      not set to cope with cases like (?=(\w+))\1: which would otherwise set
6817      ':' later. */
6818
6819      if (-c >= ESC_REF)
6820        {
6821        open_capitem *oc;
6822        recno = -c - ESC_REF;
6823
6824        HANDLE_REFERENCE:    /* Come here from named backref handling */
6825        if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
6826        previous = code;
6827        *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;
6828        PUT2INC(code, 0, recno);
6829        cd->backref_map |= (recno < 32)? (1 << recno) : 1;
6830        if (recno > cd->top_backref) cd->top_backref = recno;
6831
6832        /* Check to see if this back reference is recursive, that is, it
6833        is inside the group that it references. A flag is set so that the
6834        group can be made atomic. */
6835
6836        for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6837          {
6838          if (oc->number == recno)
6839            {
6840            oc->flag = TRUE;
6841            break;
6842            }
6843          }
6844        }
6845
6846      /* So are Unicode property matches, if supported. */
6847
6848#ifdef SUPPORT_UCP
6849      else if (-c == ESC_P || -c == ESC_p)
6850        {
6851        BOOL negated;
6852        int pdata;
6853        int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
6854        if (ptype < 0) goto FAILED;
6855        previous = code;
6856        *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
6857        *code++ = ptype;
6858        *code++ = pdata;
6859        }
6860#else
6861
6862      /* If Unicode properties are not supported, \X, \P, and \p are not
6863      allowed. */
6864
6865      else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
6866        {
6867        *errorcodeptr = ERR45;
6868        goto FAILED;
6869        }
6870#endif
6871
6872      /* For the rest (including \X when Unicode properties are supported), we
6873      can obtain the OP value by negating the escape value in the default
6874      situation when PCRE_UCP is not set. When it *is* set, we substitute
6875      Unicode property tests. Note that \b and \B do a one-character
6876      lookbehind. */
6877
6878      else
6879        {
6880        if ((-c == ESC_b || -c == ESC_B) && cd->max_lookbehind == 0)
6881          cd->max_lookbehind = 1;
6882#ifdef SUPPORT_UCP
6883        if (-c >= ESC_DU && -c <= ESC_wu)
6884          {
6885          nestptr = ptr + 1;                   /* Where to resume */
6886          ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
6887          }
6888        else
6889#endif
6890        /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE
6891        so that it works in DFA mode and in lookbehinds. */
6892
6893          {
6894          previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
6895          *code++ = (!utf && c == -ESC_C)? OP_ALLANY : -c;
6896          }
6897        }
6898      continue;
6899      }
6900
6901    /* We have a data character whose value is in c. In UTF-8 mode it may have
6902    a value > 127. We set its representation in the length/buffer, and then
6903    handle it as a data character. */
6904
6905#ifdef SUPPORT_UTF
6906    if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
6907      mclength = PRIV(ord2utf)(c, mcbuffer);
6908    else
6909#endif
6910
6911     {
6912     mcbuffer[0] = c;
6913     mclength = 1;
6914     }
6915    goto ONE_CHAR;
6916
6917
6918    /* ===================================================================*/
6919    /* Handle a literal character. It is guaranteed not to be whitespace or #
6920    when the extended flag is set. If we are in UTF-8 mode, it may be a
6921    multi-byte literal character. */
6922
6923    default:
6924    NORMAL_CHAR:
6925    mclength = 1;
6926    mcbuffer[0] = c;
6927
6928#ifdef SUPPORT_UTF
6929    if (utf && HAS_EXTRALEN(c))
6930      ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
6931#endif
6932
6933    /* At this point we have the character's bytes in mcbuffer, and the length
6934    in mclength. When not in UTF-8 mode, the length is always 1. */
6935
6936    ONE_CHAR:
6937    previous = code;
6938    *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARI : OP_CHAR;
6939    for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
6940
6941    /* Remember if \r or \n were seen */
6942
6943    if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
6944      cd->external_flags |= PCRE_HASCRORLF;
6945
6946    /* Set the first and required bytes appropriately. If no previous first
6947    byte, set it from this character, but revert to none on a zero repeat.
6948    Otherwise, leave the firstchar value alone, and don't change it on a zero
6949    repeat. */
6950
6951    if (firstchar == REQ_UNSET)
6952      {
6953      zerofirstchar = REQ_NONE;
6954      zeroreqchar = reqchar;
6955
6956      /* If the character is more than one byte long, we can set firstchar
6957      only if it is not to be matched caselessly. */
6958
6959      if (mclength == 1 || req_caseopt == 0)
6960        {
6961        firstchar = mcbuffer[0] | req_caseopt;
6962        if (mclength != 1) reqchar = code[-1] | cd->req_varyopt;
6963        }
6964      else firstchar = reqchar = REQ_NONE;
6965      }
6966
6967    /* firstchar was previously set; we can set reqchar only if the length is
6968    1 or the matching is caseful. */
6969
6970    else
6971      {
6972      zerofirstchar = firstchar;
6973      zeroreqchar = reqchar;
6974      if (mclength == 1 || req_caseopt == 0)
6975        reqchar = code[-1] | req_caseopt | cd->req_varyopt;
6976      }
6977
6978    break;            /* End of literal character handling */
6979    }
6980  }                   /* end of big loop */
6981
6982
6983/* Control never reaches here by falling through, only by a goto for all the
6984error states. Pass back the position in the pattern so that it can be displayed
6985to the user for diagnosing the error. */
6986
6987FAILED:
6988*ptrptr = ptr;
6989return FALSE;
6990}
6991
6992
6993
6994
6995/*************************************************
6996*     Compile sequence of alternatives           *
6997*************************************************/
6998
6999/* On entry, ptr is pointing past the bracket character, but on return it
7000points to the closing bracket, or vertical bar, or end of string. The code
7001variable is pointing at the byte into which the BRA operator has been stored.
7002This function is used during the pre-compile phase when we are trying to find
7003out the amount of memory needed, as well as during the real compile phase. The
7004value of lengthptr distinguishes the two phases.
7005
7006Arguments:
7007  options        option bits, including any changes for this subpattern
7008  codeptr        -> the address of the current code pointer
7009  ptrptr         -> the address of the current pattern pointer
7010  errorcodeptr   -> pointer to error code variable
7011  lookbehind     TRUE if this is a lookbehind assertion
7012  reset_bracount TRUE to reset the count for each branch
7013  skipbytes      skip this many bytes at start (for brackets and OP_COND)
7014  cond_depth     depth of nesting for conditional subpatterns
7015  firstcharptr   place to put the first required character, or a negative number
7016  reqcharptr     place to put the last required character, or a negative number
7017  bcptr          pointer to the chain of currently open branches
7018  cd             points to the data block with tables pointers etc.
7019  lengthptr      NULL during the real compile phase
7020                 points to length accumulator during pre-compile phase
7021
7022Returns:         TRUE on success
7023*/
7024
7025static BOOL
7026compile_regex(int options, pcre_uchar **codeptr, const pcre_uchar **ptrptr,
7027  int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
7028  int cond_depth, pcre_int32 *firstcharptr, pcre_int32 *reqcharptr,
7029  branch_chain *bcptr, compile_data *cd, int *lengthptr)
7030{
7031const pcre_uchar *ptr = *ptrptr;
7032pcre_uchar *code = *codeptr;
7033pcre_uchar *last_branch = code;
7034pcre_uchar *start_bracket = code;
7035pcre_uchar *reverse_count = NULL;
7036open_capitem capitem;
7037int capnumber = 0;
7038pcre_int32 firstchar, reqchar;
7039pcre_int32 branchfirstchar, branchreqchar;
7040int length;
7041int orig_bracount;
7042int max_bracount;
7043branch_chain bc;
7044
7045bc.outer = bcptr;
7046bc.current_branch = code;
7047
7048firstchar = reqchar = REQ_UNSET;
7049
7050/* Accumulate the length for use in the pre-compile phase. Start with the
7051length of the BRA and KET and any extra bytes that are required at the
7052beginning. We accumulate in a local variable to save frequent testing of
7053lenthptr for NULL. We cannot do this by looking at the value of code at the
7054start and end of each alternative, because compiled items are discarded during
7055the pre-compile phase so that the work space is not exceeded. */
7056
7057length = 2 + 2*LINK_SIZE + skipbytes;
7058
7059/* WARNING: If the above line is changed for any reason, you must also change
7060the code that abstracts option settings at the start of the pattern and makes
7061them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
7062pre-compile phase to find out whether anything has yet been compiled or not. */
7063
7064/* If this is a capturing subpattern, add to the chain of open capturing items
7065so that we can detect them if (*ACCEPT) is encountered. This is also used to
7066detect groups that contain recursive back references to themselves. Note that
7067only OP_CBRA need be tested here; changing this opcode to one of its variants,
7068e.g. OP_SCBRAPOS, happens later, after the group has been compiled. */
7069
7070if (*code == OP_CBRA)
7071  {
7072  capnumber = GET2(code, 1 + LINK_SIZE);
7073  capitem.number = capnumber;
7074  capitem.next = cd->open_caps;
7075  capitem.flag = FALSE;
7076  cd->open_caps = &capitem;
7077  }
7078
7079/* Offset is set zero to mark that this bracket is still open */
7080
7081PUT(code, 1, 0);
7082code += 1 + LINK_SIZE + skipbytes;
7083
7084/* Loop for each alternative branch */
7085
7086orig_bracount = max_bracount = cd->bracount;
7087for (;;)
7088  {
7089  /* For a (?| group, reset the capturing bracket count so that each branch
7090  uses the same numbers. */
7091
7092  if (reset_bracount) cd->bracount = orig_bracount;
7093
7094  /* Set up dummy OP_REVERSE if lookbehind assertion */
7095
7096  if (lookbehind)
7097    {
7098    *code++ = OP_REVERSE;
7099    reverse_count = code;
7100    PUTINC(code, 0, 0);
7101    length += 1 + LINK_SIZE;
7102    }
7103
7104  /* Now compile the branch; in the pre-compile phase its length gets added
7105  into the length. */
7106
7107  if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstchar,
7108        &branchreqchar, &bc, cond_depth, cd,
7109        (lengthptr == NULL)? NULL : &length))
7110    {
7111    *ptrptr = ptr;
7112    return FALSE;
7113    }
7114
7115  /* Keep the highest bracket count in case (?| was used and some branch
7116  has fewer than the rest. */
7117
7118  if (cd->bracount > max_bracount) max_bracount = cd->bracount;
7119
7120  /* In the real compile phase, there is some post-processing to be done. */
7121
7122  if (lengthptr == NULL)
7123    {
7124    /* If this is the first branch, the firstchar and reqchar values for the
7125    branch become the values for the regex. */
7126
7127    if (*last_branch != OP_ALT)
7128      {
7129      firstchar = branchfirstchar;
7130      reqchar = branchreqchar;
7131      }
7132
7133    /* If this is not the first branch, the first char and reqchar have to
7134    match the values from all the previous branches, except that if the
7135    previous value for reqchar didn't have REQ_VARY set, it can still match,
7136    and we set REQ_VARY for the regex. */
7137
7138    else
7139      {
7140      /* If we previously had a firstchar, but it doesn't match the new branch,
7141      we have to abandon the firstchar for the regex, but if there was
7142      previously no reqchar, it takes on the value of the old firstchar. */
7143
7144      if (firstchar >= 0 && firstchar != branchfirstchar)
7145        {
7146        if (reqchar < 0) reqchar = firstchar;
7147        firstchar = REQ_NONE;
7148        }
7149
7150      /* If we (now or from before) have no firstchar, a firstchar from the
7151      branch becomes a reqchar if there isn't a branch reqchar. */
7152
7153      if (firstchar < 0 && branchfirstchar >= 0 && branchreqchar < 0)
7154          branchreqchar = branchfirstchar;
7155
7156      /* Now ensure that the reqchars match */
7157
7158      if ((reqchar & ~REQ_VARY) != (branchreqchar & ~REQ_VARY))
7159        reqchar = REQ_NONE;
7160      else reqchar |= branchreqchar;   /* To "or" REQ_VARY */
7161      }
7162
7163    /* If lookbehind, check that this branch matches a fixed-length string, and
7164    put the length into the OP_REVERSE item. Temporarily mark the end of the
7165    branch with OP_END. If the branch contains OP_RECURSE, the result is -3
7166    because there may be forward references that we can't check here. Set a
7167    flag to cause another lookbehind check at the end. Why not do it all at the
7168    end? Because common, erroneous checks are picked up here and the offset of
7169    the problem can be shown. */
7170
7171    if (lookbehind)
7172      {
7173      int fixed_length;
7174      *code = OP_END;
7175      fixed_length = find_fixedlength(last_branch,  (options & PCRE_UTF8) != 0,
7176        FALSE, cd);
7177      DPRINTF(("fixed length = %d\n", fixed_length));
7178      if (fixed_length == -3)
7179        {
7180        cd->check_lookbehind = TRUE;
7181        }
7182      else if (fixed_length < 0)
7183        {
7184        *errorcodeptr = (fixed_length == -2)? ERR36 :
7185                        (fixed_length == -4)? ERR70: ERR25;
7186        *ptrptr = ptr;
7187        return FALSE;
7188        }
7189      else
7190        {
7191        if (fixed_length > cd->max_lookbehind)
7192          cd->max_lookbehind = fixed_length;
7193        PUT(reverse_count, 0, fixed_length);
7194        }
7195      }
7196    }
7197
7198  /* Reached end of expression, either ')' or end of pattern. In the real
7199  compile phase, go back through the alternative branches and reverse the chain
7200  of offsets, with the field in the BRA item now becoming an offset to the
7201  first alternative. If there are no alternatives, it points to the end of the
7202  group. The length in the terminating ket is always the length of the whole
7203  bracketed item. Return leaving the pointer at the terminating char. */
7204
7205  if (*ptr != CHAR_VERTICAL_LINE)
7206    {
7207    if (lengthptr == NULL)
7208      {
7209      int branch_length = (int)(code - last_branch);
7210      do
7211        {
7212        int prev_length = GET(last_branch, 1);
7213        PUT(last_branch, 1, branch_length);
7214        branch_length = prev_length;
7215        last_branch -= branch_length;
7216        }
7217      while (branch_length > 0);
7218      }
7219
7220    /* Fill in the ket */
7221
7222    *code = OP_KET;
7223    PUT(code, 1, (int)(code - start_bracket));
7224    code += 1 + LINK_SIZE;
7225
7226    /* If it was a capturing subpattern, check to see if it contained any
7227    recursive back references. If so, we must wrap it in atomic brackets.
7228    In any event, remove the block from the chain. */
7229
7230    if (capnumber > 0)
7231      {
7232      if (cd->open_caps->flag)
7233        {
7234        memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
7235          IN_UCHARS(code - start_bracket));
7236        *start_bracket = OP_ONCE;
7237        code += 1 + LINK_SIZE;
7238        PUT(start_bracket, 1, (int)(code - start_bracket));
7239        *code = OP_KET;
7240        PUT(code, 1, (int)(code - start_bracket));
7241        code += 1 + LINK_SIZE;
7242        length += 2 + 2*LINK_SIZE;
7243        }
7244      cd->open_caps = cd->open_caps->next;
7245      }
7246
7247    /* Retain the highest bracket number, in case resetting was used. */
7248
7249    cd->bracount = max_bracount;
7250
7251    /* Set values to pass back */
7252
7253    *codeptr = code;
7254    *ptrptr = ptr;
7255    *firstcharptr = firstchar;
7256    *reqcharptr = reqchar;
7257    if (lengthptr != NULL)
7258      {
7259      if (OFLOW_MAX - *lengthptr < length)
7260        {
7261        *errorcodeptr = ERR20;
7262        return FALSE;
7263        }
7264      *lengthptr += length;
7265      }
7266    return TRUE;
7267    }
7268
7269  /* Another branch follows. In the pre-compile phase, we can move the code
7270  pointer back to where it was for the start of the first branch. (That is,
7271  pretend that each branch is the only one.)
7272
7273  In the real compile phase, insert an ALT node. Its length field points back
7274  to the previous branch while the bracket remains open. At the end the chain
7275  is reversed. It's done like this so that the start of the bracket has a
7276  zero offset until it is closed, making it possible to detect recursion. */
7277
7278  if (lengthptr != NULL)
7279    {
7280    code = *codeptr + 1 + LINK_SIZE + skipbytes;
7281    length += 1 + LINK_SIZE;
7282    }
7283  else
7284    {
7285    *code = OP_ALT;
7286    PUT(code, 1, (int)(code - last_branch));
7287    bc.current_branch = last_branch = code;
7288    code += 1 + LINK_SIZE;
7289    }
7290
7291  ptr++;
7292  }
7293/* Control never reaches here */
7294}
7295
7296
7297
7298
7299/*************************************************
7300*          Check for anchored expression         *
7301*************************************************/
7302
7303/* Try to find out if this is an anchored regular expression. Consider each
7304alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
7305all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
7306it's anchored. However, if this is a multiline pattern, then only OP_SOD will
7307be found, because ^ generates OP_CIRCM in that mode.
7308
7309We can also consider a regex to be anchored if OP_SOM starts all its branches.
7310This is the code for \G, which means "match at start of match position, taking
7311into account the match offset".
7312
7313A branch is also implicitly anchored if it starts with .* and DOTALL is set,
7314because that will try the rest of the pattern at all possible matching points,
7315so there is no point trying again.... er ....
7316
7317.... except when the .* appears inside capturing parentheses, and there is a
7318subsequent back reference to those parentheses. We haven't enough information
7319to catch that case precisely.
7320
7321At first, the best we could do was to detect when .* was in capturing brackets
7322and the highest back reference was greater than or equal to that level.
7323However, by keeping a bitmap of the first 31 back references, we can catch some
7324of the more common cases more precisely.
7325
7326Arguments:
7327  code           points to start of expression (the bracket)
7328  bracket_map    a bitmap of which brackets we are inside while testing; this
7329                  handles up to substring 31; after that we just have to take
7330                  the less precise approach
7331  backref_map    the back reference bitmap
7332
7333Returns:     TRUE or FALSE
7334*/
7335
7336static BOOL
7337is_anchored(register const pcre_uchar *code, unsigned int bracket_map,
7338  unsigned int backref_map)
7339{
7340do {
7341   const pcre_uchar *scode = first_significant_code(
7342     code + PRIV(OP_lengths)[*code], FALSE);
7343   register int op = *scode;
7344
7345   /* Non-capturing brackets */
7346
7347   if (op == OP_BRA  || op == OP_BRAPOS ||
7348       op == OP_SBRA || op == OP_SBRAPOS)
7349     {
7350     if (!is_anchored(scode, bracket_map, backref_map)) return FALSE;
7351     }
7352
7353   /* Capturing brackets */
7354
7355   else if (op == OP_CBRA  || op == OP_CBRAPOS ||
7356            op == OP_SCBRA || op == OP_SCBRAPOS)
7357     {
7358     int n = GET2(scode, 1+LINK_SIZE);
7359     int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
7360     if (!is_anchored(scode, new_map, backref_map)) return FALSE;
7361     }
7362
7363   /* Other brackets */
7364
7365   else if (op == OP_ASSERT || op == OP_ONCE || op == OP_ONCE_NC ||
7366            op == OP_COND)
7367     {
7368     if (!is_anchored(scode, bracket_map, backref_map)) return FALSE;
7369     }
7370
7371   /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
7372   it isn't in brackets that are or may be referenced. */
7373
7374   else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
7375             op == OP_TYPEPOSSTAR))
7376     {
7377     if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
7378       return FALSE;
7379     }
7380
7381   /* Check for explicit anchoring */
7382
7383   else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
7384   code += GET(code, 1);
7385   }
7386while (*code == OP_ALT);   /* Loop for each alternative */
7387return TRUE;
7388}
7389
7390
7391
7392/*************************************************
7393*         Check for starting with ^ or .*        *
7394*************************************************/
7395
7396/* This is called to find out if every branch starts with ^ or .* so that
7397"first char" processing can be done to speed things up in multiline
7398matching and for non-DOTALL patterns that start with .* (which must start at
7399the beginning or after \n). As in the case of is_anchored() (see above), we
7400have to take account of back references to capturing brackets that contain .*
7401because in that case we can't make the assumption.
7402
7403Arguments:
7404  code           points to start of expression (the bracket)
7405  bracket_map    a bitmap of which brackets we are inside while testing; this
7406                  handles up to substring 31; after that we just have to take
7407                  the less precise approach
7408  backref_map    the back reference bitmap
7409
7410Returns:         TRUE or FALSE
7411*/
7412
7413static BOOL
7414is_startline(const pcre_uchar *code, unsigned int bracket_map,
7415  unsigned int backref_map)
7416{
7417do {
7418   const pcre_uchar *scode = first_significant_code(
7419     code + PRIV(OP_lengths)[*code], FALSE);
7420   register int op = *scode;
7421
7422   /* If we are at the start of a conditional assertion group, *both* the
7423   conditional assertion *and* what follows the condition must satisfy the test
7424   for start of line. Other kinds of condition fail. Note that there may be an
7425   auto-callout at the start of a condition. */
7426
7427   if (op == OP_COND)
7428     {
7429     scode += 1 + LINK_SIZE;
7430     if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
7431     switch (*scode)
7432       {
7433       case OP_CREF:
7434       case OP_NCREF:
7435       case OP_RREF:
7436       case OP_NRREF:
7437       case OP_DEF:
7438       return FALSE;
7439
7440       default:     /* Assertion */
7441       if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
7442       do scode += GET(scode, 1); while (*scode == OP_ALT);
7443       scode += 1 + LINK_SIZE;
7444       break;
7445       }
7446     scode = first_significant_code(scode, FALSE);
7447     op = *scode;
7448     }
7449
7450   /* Non-capturing brackets */
7451
7452   if (op == OP_BRA  || op == OP_BRAPOS ||
7453       op == OP_SBRA || op == OP_SBRAPOS)
7454     {
7455     if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
7456     }
7457
7458   /* Capturing brackets */
7459
7460   else if (op == OP_CBRA  || op == OP_CBRAPOS ||
7461            op == OP_SCBRA || op == OP_SCBRAPOS)
7462     {
7463     int n = GET2(scode, 1+LINK_SIZE);
7464     int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
7465     if (!is_startline(scode, new_map, backref_map)) return FALSE;
7466     }
7467
7468   /* Other brackets */
7469
7470   else if (op == OP_ASSERT || op == OP_ONCE || op == OP_ONCE_NC)
7471     {
7472     if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
7473     }
7474
7475   /* .* means "start at start or after \n" if it isn't in brackets that
7476   may be referenced. */
7477
7478   else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
7479     {
7480     if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
7481     }
7482
7483   /* Check for explicit circumflex */
7484
7485   else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
7486
7487   /* Move on to the next alternative */
7488
7489   code += GET(code, 1);
7490   }
7491while (*code == OP_ALT);  /* Loop for each alternative */
7492return TRUE;
7493}
7494
7495
7496
7497/*************************************************
7498*       Check for asserted fixed first char      *
7499*************************************************/
7500
7501/* During compilation, the "first char" settings from forward assertions are
7502discarded, because they can cause conflicts with actual literals that follow.
7503However, if we end up without a first char setting for an unanchored pattern,
7504it is worth scanning the regex to see if there is an initial asserted first
7505char. If all branches start with the same asserted char, or with a bracket all
7506of whose alternatives start with the same asserted char (recurse ad lib), then
7507we return that char, otherwise -1.
7508
7509Arguments:
7510  code       points to start of expression (the bracket)
7511  inassert   TRUE if in an assertion
7512
7513Returns:     -1 or the fixed first char
7514*/
7515
7516static int
7517find_firstassertedchar(const pcre_uchar *code, BOOL inassert)
7518{
7519register int c = -1;
7520do {
7521   int d;
7522   int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
7523             *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
7524   const pcre_uchar *scode = first_significant_code(code + 1+LINK_SIZE + xl,
7525     TRUE);
7526   register int op = *scode;
7527
7528   switch(op)
7529     {
7530     default:
7531     return -1;
7532
7533     case OP_BRA:
7534     case OP_BRAPOS:
7535     case OP_CBRA:
7536     case OP_SCBRA:
7537     case OP_CBRAPOS:
7538     case OP_SCBRAPOS:
7539     case OP_ASSERT:
7540     case OP_ONCE:
7541     case OP_ONCE_NC:
7542     case OP_COND:
7543     if ((d = find_firstassertedchar(scode, op == OP_ASSERT)) < 0)
7544       return -1;
7545     if (c < 0) c = d; else if (c != d) return -1;
7546     break;
7547
7548     case OP_EXACT:
7549     scode += IMM2_SIZE;
7550     /* Fall through */
7551
7552     case OP_CHAR:
7553     case OP_PLUS:
7554     case OP_MINPLUS:
7555     case OP_POSPLUS:
7556     if (!inassert) return -1;
7557     if (c < 0) c = scode[1];
7558       else if (c != scode[1]) return -1;
7559     break;
7560
7561     case OP_EXACTI:
7562     scode += IMM2_SIZE;
7563     /* Fall through */
7564
7565     case OP_CHARI:
7566     case OP_PLUSI:
7567     case OP_MINPLUSI:
7568     case OP_POSPLUSI:
7569     if (!inassert) return -1;
7570     if (c < 0) c = scode[1] | REQ_CASELESS;
7571       else if (c != scode[1]) return -1;
7572     break;
7573     }
7574
7575   code += GET(code, 1);
7576   }
7577while (*code == OP_ALT);
7578return c;
7579}
7580
7581
7582
7583/*************************************************
7584*        Compile a Regular Expression            *
7585*************************************************/
7586
7587/* This function takes a string and returns a pointer to a block of store
7588holding a compiled version of the expression. The original API for this
7589function had no error code return variable; it is retained for backwards
7590compatibility. The new function is given a new name.
7591
7592Arguments:
7593  pattern       the regular expression
7594  options       various option bits
7595  errorcodeptr  pointer to error code variable (pcre_compile2() only)
7596                  can be NULL if you don't want a code value
7597  errorptr      pointer to pointer to error text
7598  erroroffset   ptr offset in pattern where error was detected
7599  tables        pointer to character tables or NULL
7600
7601Returns:        pointer to compiled data block, or NULL on error,
7602                with errorptr and erroroffset set
7603*/
7604
7605#ifdef COMPILE_PCRE8
7606PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
7607pcre_compile(const char *pattern, int options, const char **errorptr,
7608  int *erroroffset, const unsigned char *tables)
7609#else
7610PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
7611pcre16_compile(PCRE_SPTR16 pattern, int options, const char **errorptr,
7612  int *erroroffset, const unsigned char *tables)
7613#endif
7614{
7615#ifdef COMPILE_PCRE8
7616return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
7617#else
7618return pcre16_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
7619#endif
7620}
7621
7622
7623#ifdef COMPILE_PCRE8
7624PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
7625pcre_compile2(const char *pattern, int options, int *errorcodeptr,
7626  const char **errorptr, int *erroroffset, const unsigned char *tables)
7627#else
7628PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
7629pcre16_compile2(PCRE_SPTR16 pattern, int options, int *errorcodeptr,
7630  const char **errorptr, int *erroroffset, const unsigned char *tables)
7631#endif
7632{
7633REAL_PCRE *re;
7634int length = 1;  /* For final END opcode */
7635pcre_int32 firstchar, reqchar;
7636int newline;
7637int errorcode = 0;
7638int skipatstart = 0;
7639BOOL utf;
7640size_t size;
7641pcre_uchar *code;
7642const pcre_uchar *codestart;
7643const pcre_uchar *ptr;
7644compile_data compile_block;
7645compile_data *cd = &compile_block;
7646
7647/* This space is used for "compiling" into during the first phase, when we are
7648computing the amount of memory that is needed. Compiled items are thrown away
7649as soon as possible, so that a fairly large buffer should be sufficient for
7650this purpose. The same space is used in the second phase for remembering where
7651to fill in forward references to subpatterns. That may overflow, in which case
7652new memory is obtained from malloc(). */
7653
7654pcre_uchar cworkspace[COMPILE_WORK_SIZE];
7655
7656/* Set this early so that early errors get offset 0. */
7657
7658ptr = (const pcre_uchar *)pattern;
7659
7660/* We can't pass back an error message if errorptr is NULL; I guess the best we
7661can do is just return NULL, but we can set a code value if there is a code
7662pointer. */
7663
7664if (errorptr == NULL)
7665  {
7666  if (errorcodeptr != NULL) *errorcodeptr = 99;
7667  return NULL;
7668  }
7669
7670*errorptr = NULL;
7671if (errorcodeptr != NULL) *errorcodeptr = ERR0;
7672
7673/* However, we can give a message for this error */
7674
7675if (erroroffset == NULL)
7676  {
7677  errorcode = ERR16;
7678  goto PCRE_EARLY_ERROR_RETURN2;
7679  }
7680
7681*erroroffset = 0;
7682
7683/* Set up pointers to the individual character tables */
7684
7685if (tables == NULL) tables = PRIV(default_tables);
7686cd->lcc = tables + lcc_offset;
7687cd->fcc = tables + fcc_offset;
7688cd->cbits = tables + cbits_offset;
7689cd->ctypes = tables + ctypes_offset;
7690
7691/* Check that all undefined public option bits are zero */
7692
7693if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
7694  {
7695  errorcode = ERR17;
7696  goto PCRE_EARLY_ERROR_RETURN;
7697  }
7698
7699/* Check for global one-time settings at the start of the pattern, and remember
7700the offset for later. */
7701
7702while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
7703       ptr[skipatstart+1] == CHAR_ASTERISK)
7704  {
7705  int newnl = 0;
7706  int newbsr = 0;
7707
7708#ifdef COMPILE_PCRE8
7709  if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 5) == 0)
7710    { skipatstart += 7; options |= PCRE_UTF8; continue; }
7711#endif
7712#ifdef COMPILE_PCRE16
7713  if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 6) == 0)
7714    { skipatstart += 8; options |= PCRE_UTF16; continue; }
7715#endif
7716  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)
7717    { skipatstart += 6; options |= PCRE_UCP; continue; }
7718  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
7719    { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }
7720
7721  if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CR_RIGHTPAR, 3) == 0)
7722    { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
7723  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LF_RIGHTPAR, 3)  == 0)
7724    { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
7725  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CRLF_RIGHTPAR, 5)  == 0)
7726    { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
7727  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANY_RIGHTPAR, 4) == 0)
7728    { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
7729  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANYCRLF_RIGHTPAR, 8) == 0)
7730    { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
7731
7732  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
7733    { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
7734  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
7735    { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
7736
7737  if (newnl != 0)
7738    options = (options & ~PCRE_NEWLINE_BITS) | newnl;
7739  else if (newbsr != 0)
7740    options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
7741  else break;
7742  }
7743
7744/* PCRE_UTF16 has the same value as PCRE_UTF8. */
7745utf = (options & PCRE_UTF8) != 0;
7746
7747/* Can't support UTF unless PCRE has been compiled to include the code. The
7748return of an error code from PRIV(valid_utf)() is a new feature, introduced in
7749release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
7750not used here. */
7751
7752#ifdef SUPPORT_UTF
7753if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 &&
7754     (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)
7755  {
7756#ifdef COMPILE_PCRE8
7757  errorcode = ERR44;
7758#else
7759  errorcode = ERR74;
7760#endif
7761  goto PCRE_EARLY_ERROR_RETURN2;
7762  }
7763#else
7764if (utf)
7765  {
7766  errorcode = ERR32;
7767  goto PCRE_EARLY_ERROR_RETURN;
7768  }
7769#endif
7770
7771/* Can't support UCP unless PCRE has been compiled to include the code. */
7772
7773#ifndef SUPPORT_UCP
7774if ((options & PCRE_UCP) != 0)
7775  {
7776  errorcode = ERR67;
7777  goto PCRE_EARLY_ERROR_RETURN;
7778  }
7779#endif
7780
7781/* Check validity of \R options. */
7782
7783if ((options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) ==
7784     (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
7785  {
7786  errorcode = ERR56;
7787  goto PCRE_EARLY_ERROR_RETURN;
7788  }
7789
7790/* Handle different types of newline. The three bits give seven cases. The
7791current code allows for fixed one- or two-byte sequences, plus "any" and
7792"anycrlf". */
7793
7794switch (options & PCRE_NEWLINE_BITS)
7795  {
7796  case 0: newline = NEWLINE; break;   /* Build-time default */
7797  case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
7798  case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
7799  case PCRE_NEWLINE_CR+
7800       PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
7801  case PCRE_NEWLINE_ANY: newline = -1; break;
7802  case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
7803  default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
7804  }
7805
7806if (newline == -2)
7807  {
7808  cd->nltype = NLTYPE_ANYCRLF;
7809  }
7810else if (newline < 0)
7811  {
7812  cd->nltype = NLTYPE_ANY;
7813  }
7814else
7815  {
7816  cd->nltype = NLTYPE_FIXED;
7817  if (newline > 255)
7818    {
7819    cd->nllen = 2;
7820    cd->nl[0] = (newline >> 8) & 255;
7821    cd->nl[1] = newline & 255;
7822    }
7823  else
7824    {
7825    cd->nllen = 1;
7826    cd->nl[0] = newline;
7827    }
7828  }
7829
7830/* Maximum back reference and backref bitmap. The bitmap records up to 31 back
7831references to help in deciding whether (.*) can be treated as anchored or not.
7832*/
7833
7834cd->top_backref = 0;
7835cd->backref_map = 0;
7836
7837/* Reflect pattern for debugging output */
7838
7839DPRINTF(("------------------------------------------------------------------\n"));
7840#ifdef PCRE_DEBUG
7841print_puchar(stdout, (PCRE_PUCHAR)pattern);
7842#endif
7843DPRINTF(("\n"));
7844
7845/* Pretend to compile the pattern while actually just accumulating the length
7846of memory required. This behaviour is triggered by passing a non-NULL final
7847argument to compile_regex(). We pass a block of workspace (cworkspace) for it
7848to compile parts of the pattern into; the compiled code is discarded when it is
7849no longer needed, so hopefully this workspace will never overflow, though there
7850is a test for its doing so. */
7851
7852cd->bracount = cd->final_bracount = 0;
7853cd->names_found = 0;
7854cd->name_entry_size = 0;
7855cd->name_table = NULL;
7856cd->start_code = cworkspace;
7857cd->hwm = cworkspace;
7858cd->start_workspace = cworkspace;
7859cd->workspace_size = COMPILE_WORK_SIZE;
7860cd->start_pattern = (const pcre_uchar *)pattern;
7861cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
7862cd->req_varyopt = 0;
7863cd->assert_depth = 0;
7864cd->max_lookbehind = 0;
7865cd->external_options = options;
7866cd->external_flags = 0;
7867cd->open_caps = NULL;
7868
7869/* Now do the pre-compile. On error, errorcode will be set non-zero, so we
7870don't need to look at the result of the function here. The initial options have
7871been put into the cd block so that they can be changed if an option setting is
7872found within the regex right at the beginning. Bringing initial option settings
7873outside can help speed up starting point checks. */
7874
7875ptr += skipatstart;
7876code = cworkspace;
7877*code = OP_BRA;
7878(void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,
7879  FALSE, 0, 0, &firstchar, &reqchar, NULL, cd, &length);
7880if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
7881
7882DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
7883  (int)(cd->hwm - cworkspace)));
7884
7885if (length > MAX_PATTERN_SIZE)
7886  {
7887  errorcode = ERR20;
7888  goto PCRE_EARLY_ERROR_RETURN;
7889  }
7890
7891/* Compute the size of data block needed and get it, either from malloc or
7892externally provided function. Integer overflow should no longer be possible
7893because nowadays we limit the maximum value of cd->names_found and
7894cd->name_entry_size. */
7895
7896size = sizeof(REAL_PCRE) + (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);
7897re = (REAL_PCRE *)(PUBL(malloc))(size);
7898
7899if (re == NULL)
7900  {
7901  errorcode = ERR21;
7902  goto PCRE_EARLY_ERROR_RETURN;
7903  }
7904
7905/* Put in the magic number, and save the sizes, initial options, internal
7906flags, and character table pointer. NULL is used for the default character
7907tables. The nullpad field is at the end; it's there to help in the case when a
7908regex compiled on a system with 4-byte pointers is run on another with 8-byte
7909pointers. */
7910
7911re->magic_number = MAGIC_NUMBER;
7912re->size = (int)size;
7913re->options = cd->external_options;
7914re->flags = cd->external_flags;
7915re->first_char = 0;
7916re->req_char = 0;
7917re->name_table_offset = sizeof(REAL_PCRE) / sizeof(pcre_uchar);
7918re->name_entry_size = cd->name_entry_size;
7919re->name_count = cd->names_found;
7920re->ref_count = 0;
7921re->tables = (tables == PRIV(default_tables))? NULL : tables;
7922re->nullpad = NULL;
7923
7924/* The starting points of the name/number translation table and of the code are
7925passed around in the compile data block. The start/end pattern and initial
7926options are already set from the pre-compile phase, as is the name_entry_size
7927field. Reset the bracket count and the names_found field. Also reset the hwm
7928field; this time it's used for remembering forward references to subpatterns.
7929*/
7930
7931cd->final_bracount = cd->bracount;  /* Save for checking forward references */
7932cd->assert_depth = 0;
7933cd->bracount = 0;
7934cd->max_lookbehind = 0;
7935cd->names_found = 0;
7936cd->name_table = (pcre_uchar *)re + re->name_table_offset;
7937codestart = cd->name_table + re->name_entry_size * re->name_count;
7938cd->start_code = codestart;
7939cd->hwm = (pcre_uchar *)(cd->start_workspace);
7940cd->req_varyopt = 0;
7941cd->had_accept = FALSE;
7942cd->check_lookbehind = FALSE;
7943cd->open_caps = NULL;
7944
7945/* Set up a starting, non-extracting bracket, then compile the expression. On
7946error, errorcode will be set non-zero, so we don't need to look at the result
7947of the function here. */
7948
7949ptr = (const pcre_uchar *)pattern + skipatstart;
7950code = (pcre_uchar *)codestart;
7951*code = OP_BRA;
7952(void)compile_regex(re->options, &code, &ptr, &errorcode, FALSE, FALSE, 0, 0,
7953  &firstchar, &reqchar, NULL, cd, NULL);
7954re->top_bracket = cd->bracount;
7955re->top_backref = cd->top_backref;
7956re->max_lookbehind = cd->max_lookbehind;
7957re->flags = cd->external_flags | PCRE_MODE;
7958
7959if (cd->had_accept) reqchar = REQ_NONE;   /* Must disable after (*ACCEPT) */
7960
7961/* If not reached end of pattern on success, there's an excess bracket. */
7962
7963if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
7964
7965/* Fill in the terminating state and check for disastrous overflow, but
7966if debugging, leave the test till after things are printed out. */
7967
7968*code++ = OP_END;
7969
7970#ifndef PCRE_DEBUG
7971if (code - codestart > length) errorcode = ERR23;
7972#endif
7973
7974/* Fill in any forward references that are required. There may be repeated
7975references; optimize for them, as searching a large regex takes time. */
7976
7977if (cd->hwm > cd->start_workspace)
7978  {
7979  int prev_recno = -1;
7980  const pcre_uchar *groupptr = NULL;
7981  while (errorcode == 0 && cd->hwm > cd->start_workspace)
7982    {
7983    int offset, recno;
7984    cd->hwm -= LINK_SIZE;
7985    offset = GET(cd->hwm, 0);
7986    recno = GET(codestart, offset);
7987    if (recno != prev_recno)
7988      {
7989      groupptr = PRIV(find_bracket)(codestart, utf, recno);
7990      prev_recno = recno;
7991      }
7992    if (groupptr == NULL) errorcode = ERR53;
7993      else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart));
7994    }
7995  }
7996
7997/* If the workspace had to be expanded, free the new memory. */
7998
7999if (cd->workspace_size > COMPILE_WORK_SIZE)
8000  (PUBL(free))((void *)cd->start_workspace);
8001
8002/* Give an error if there's back reference to a non-existent capturing
8003subpattern. */
8004
8005if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
8006
8007/* If there were any lookbehind assertions that contained OP_RECURSE
8008(recursions or subroutine calls), a flag is set for them to be checked here,
8009because they may contain forward references. Actual recursions can't be fixed
8010length, but subroutine calls can. It is done like this so that those without
8011OP_RECURSE that are not fixed length get a diagnostic with a useful offset. The
8012exceptional ones forgo this. We scan the pattern to check that they are fixed
8013length, and set their lengths. */
8014
8015if (cd->check_lookbehind)
8016  {
8017  pcre_uchar *cc = (pcre_uchar *)codestart;
8018
8019  /* Loop, searching for OP_REVERSE items, and process those that do not have
8020  their length set. (Actually, it will also re-process any that have a length
8021  of zero, but that is a pathological case, and it does no harm.) When we find
8022  one, we temporarily terminate the branch it is in while we scan it. */
8023
8024  for (cc = (pcre_uchar *)PRIV(find_bracket)(codestart, utf, -1);
8025       cc != NULL;
8026       cc = (pcre_uchar *)PRIV(find_bracket)(cc, utf, -1))
8027    {
8028    if (GET(cc, 1) == 0)
8029      {
8030      int fixed_length;
8031      pcre_uchar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
8032      int end_op = *be;
8033      *be = OP_END;
8034      fixed_length = find_fixedlength(cc, (re->options & PCRE_UTF8) != 0, TRUE,
8035        cd);
8036      *be = end_op;
8037      DPRINTF(("fixed length = %d\n", fixed_length));
8038      if (fixed_length < 0)
8039        {
8040        errorcode = (fixed_length == -2)? ERR36 :
8041                    (fixed_length == -4)? ERR70 : ERR25;
8042        break;
8043        }
8044      if (fixed_length > cd->max_lookbehind) cd->max_lookbehind = fixed_length;
8045      PUT(cc, 1, fixed_length);
8046      }
8047    cc += 1 + LINK_SIZE;
8048    }
8049  }
8050
8051/* Failed to compile, or error while post-processing */
8052
8053if (errorcode != 0)
8054  {
8055  (PUBL(free))(re);
8056  PCRE_EARLY_ERROR_RETURN:
8057  *erroroffset = (int)(ptr - (const pcre_uchar *)pattern);
8058  PCRE_EARLY_ERROR_RETURN2:
8059  *errorptr = find_error_text(errorcode);
8060  if (errorcodeptr != NULL) *errorcodeptr = errorcode;
8061  return NULL;
8062  }
8063
8064/* If the anchored option was not passed, set the flag if we can determine that
8065the pattern is anchored by virtue of ^ characters or \A or anything else (such
8066as starting with .* when DOTALL is set).
8067
8068Otherwise, if we know what the first byte has to be, save it, because that
8069speeds up unanchored matches no end. If not, see if we can set the
8070PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
8071start with ^, and also when all branches start with .* for non-DOTALL matches.
8072*/
8073
8074if ((re->options & PCRE_ANCHORED) == 0)
8075  {
8076  if (is_anchored(codestart, 0, cd->backref_map))
8077    re->options |= PCRE_ANCHORED;
8078  else
8079    {
8080    if (firstchar < 0)
8081      firstchar = find_firstassertedchar(codestart, FALSE);
8082    if (firstchar >= 0)   /* Remove caseless flag for non-caseable chars */
8083      {
8084#ifdef COMPILE_PCRE8
8085      re->first_char = firstchar & 0xff;
8086#else
8087#ifdef COMPILE_PCRE16
8088      re->first_char = firstchar & 0xffff;
8089#endif
8090#endif
8091      if ((firstchar & REQ_CASELESS) != 0)
8092        {
8093#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
8094        /* We ignore non-ASCII first chars in 8 bit mode. */
8095        if (utf)
8096          {
8097          if (re->first_char < 128)
8098            {
8099            if (cd->fcc[re->first_char] != re->first_char)
8100              re->flags |= PCRE_FCH_CASELESS;
8101            }
8102          else if (UCD_OTHERCASE(re->first_char) != re->first_char)
8103            re->flags |= PCRE_FCH_CASELESS;
8104          }
8105        else
8106#endif
8107        if (MAX_255(re->first_char)
8108            && cd->fcc[re->first_char] != re->first_char)
8109          re->flags |= PCRE_FCH_CASELESS;
8110        }
8111
8112      re->flags |= PCRE_FIRSTSET;
8113      }
8114    else if (is_startline(codestart, 0, cd->backref_map))
8115      re->flags |= PCRE_STARTLINE;
8116    }
8117  }
8118
8119/* For an anchored pattern, we use the "required byte" only if it follows a
8120variable length item in the regex. Remove the caseless flag for non-caseable
8121bytes. */
8122
8123if (reqchar >= 0 &&
8124     ((re->options & PCRE_ANCHORED) == 0 || (reqchar & REQ_VARY) != 0))
8125  {
8126#ifdef COMPILE_PCRE8
8127  re->req_char = reqchar & 0xff;
8128#else
8129#ifdef COMPILE_PCRE16
8130  re->req_char = reqchar & 0xffff;
8131#endif
8132#endif
8133  if ((reqchar & REQ_CASELESS) != 0)
8134    {
8135#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
8136    /* We ignore non-ASCII required chars in 8 bit mode. */
8137    if (utf)
8138      {
8139      if (re->req_char < 128)
8140        {
8141        if (cd->fcc[re->req_char] != re->req_char)
8142          re->flags |= PCRE_RCH_CASELESS;
8143        }
8144      else if (UCD_OTHERCASE(re->req_char) != re->req_char)
8145        re->flags |= PCRE_RCH_CASELESS;
8146      }
8147    else
8148#endif
8149    if (MAX_255(re->req_char) && cd->fcc[re->req_char] != re->req_char)
8150      re->flags |= PCRE_RCH_CASELESS;
8151    }
8152
8153  re->flags |= PCRE_REQCHSET;
8154  }
8155
8156/* Print out the compiled data if debugging is enabled. This is never the
8157case when building a production library. */
8158
8159#ifdef PCRE_DEBUG
8160printf("Length = %d top_bracket = %d top_backref = %d\n",
8161  length, re->top_bracket, re->top_backref);
8162
8163printf("Options=%08x\n", re->options);
8164
8165if ((re->flags & PCRE_FIRSTSET) != 0)
8166  {
8167  pcre_uchar ch = re->first_char;
8168  const char *caseless =
8169    ((re->flags & PCRE_FCH_CASELESS) == 0)? "" : " (caseless)";
8170  if (PRINTABLE(ch)) printf("First char = %c%s\n", ch, caseless);
8171    else printf("First char = \\x%02x%s\n", ch, caseless);
8172  }
8173
8174if ((re->flags & PCRE_REQCHSET) != 0)
8175  {
8176  pcre_uchar ch = re->req_char;
8177  const char *caseless =
8178    ((re->flags & PCRE_RCH_CASELESS) == 0)? "" : " (caseless)";
8179  if (PRINTABLE(ch)) printf("Req char = %c%s\n", ch, caseless);
8180    else printf("Req char = \\x%02x%s\n", ch, caseless);
8181  }
8182
8183#ifdef COMPILE_PCRE8
8184pcre_printint((pcre *)re, stdout, TRUE);
8185#else
8186pcre16_printint((pcre *)re, stdout, TRUE);
8187#endif
8188
8189/* This check is done here in the debugging case so that the code that
8190was compiled can be seen. */
8191
8192if (code - codestart > length)
8193  {
8194  (PUBL(free))(re);
8195  *errorptr = find_error_text(ERR23);
8196  *erroroffset = ptr - (pcre_uchar *)pattern;
8197  if (errorcodeptr != NULL) *errorcodeptr = ERR23;
8198  return NULL;
8199  }
8200#endif   /* PCRE_DEBUG */
8201
8202#ifdef COMPILE_PCRE8
8203return (pcre *)re;
8204#else
8205return (pcre16 *)re;
8206#endif
8207}
8208
8209/* End of pcre_compile.c */
8210