1/*************************************************
2*      Perl-Compatible Regular Expressions       *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8                       Written by Philip Hazel
9           Copyright (c) 1997-2016 University of Cambridge
10
11-----------------------------------------------------------------------------
12Redistribution and use in source and binary forms, with or without
13modification, are permitted provided that the following conditions are met:
14
15    * Redistributions of source code must retain the above copyright notice,
16      this list of conditions and the following disclaimer.
17
18    * Redistributions in binary form must reproduce the above copyright
19      notice, this list of conditions and the following disclaimer in the
20      documentation and/or other materials provided with the distribution.
21
22    * Neither the name of the University of Cambridge nor the names of its
23      contributors may be used to endorse or promote products derived from
24      this software without specific prior written permission.
25
26THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36POSSIBILITY OF SUCH DAMAGE.
37-----------------------------------------------------------------------------
38*/
39
40
41/* This module contains the external function pcre_compile(), along with
42supporting internal functions that are not used by other modules. */
43
44
45#ifdef HAVE_CONFIG_H
46#include "config.h"
47#endif
48
49#define NLBLOCK cd             /* Block containing newline information */
50#define PSSTART start_pattern  /* Field containing pattern start */
51#define PSEND   end_pattern    /* Field containing pattern end */
52
53#include "pcre_internal.h"
54
55
56/* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which
57is also used by pcretest. PCRE_DEBUG is not defined when building a production
58library. We do not need to select pcre16_printint.c specially, because the
59COMPILE_PCREx macro will already be appropriately set. */
60
61#ifdef PCRE_DEBUG
62/* pcre_printint.c should not include any headers */
63#define PCRE_INCLUDED
64#include "pcre_printint.c"
65#undef PCRE_INCLUDED
66#endif
67
68
/* Macro for setting individual bits in class bitmaps: it sets bit number b in
the bitmap a, which is addressed in 8-bit units. The bitmap argument and the
whole expansion are parenthesized so that the macro behaves as one expression
whatever is passed to it (for example, a pointer expression such as p + 1).
Note that b is evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)&7)))
72
/* Upper limit used when checking that the integer holding the compiled
pattern length has not overflowed. It is set slightly below INT_MAX so that
the bytes that terminate groups can be added in without a separate check
every time. */

#define OFLOW_MAX ((INT_MAX) - 20)
79
80/* Definitions to allow mutual recursion */
81
82static int
83  add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
84    const pcre_uint32 *, unsigned int);
85
86static BOOL
87  compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
88    pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *,
89    compile_data *, int *);
90
91
92
93/*************************************************
94*      Code parameters and static tables         *
95*************************************************/
96
97/* This value specifies the size of stack workspace that is used during the
98first pre-compile phase that determines how much memory is required. The regex
99is partly compiled into this space, but the compiled parts are discarded as
100soon as they can be, so that hopefully there will never be an overrun. The code
101does, however, check for an overrun. The largest amount I've seen used is 218,
102so this number is very generous.
103
104The same workspace is used during the second, actual compile phase for
105remembering forward references to groups so that they can be filled in at the
106end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
107is 4 there is plenty of room for most patterns. However, the memory can get
108filled up by repetitions of forward references, for example patterns like
109/(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
110that the workspace is expanded using malloc() in this situation. The value
111below is therefore a minimum, and we put a maximum on it for safety. The
112minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
113kicks in at the same number of forward references in all cases. */
114
#define COMPILE_WORK_SIZE      (2048 * LINK_SIZE)          /* Minimum size */
#define COMPILE_WORK_SIZE_MAX  (100 * COMPILE_WORK_SIZE)   /* Safety limit */

/* Number of slots in the initial vector that records named groups during the
pre-compile. The vector starts out on the stack; when it proves too small it
is enlarged with malloc(), in the same way as the workspace. */

#define NAMED_GROUP_LIST_SIZE  20

/* The overrun tests compare against a size reduced by this margin, so that an
overrun is noticed before anything actually runs off the end of the data
block. */

#define WORK_SIZE_SAFETY_MARGIN (100)
129
/* Private flag bits used together with the firstchar and reqchar values. */

#define REQ_CASELESS    0x01            /* Indicates caselessness */
#define REQ_VARY        0x02            /* Reqchar followed non-literal item */

/* Negative values for the firstchar and reqchar flags */

#define REQ_NONE        (-1)
#define REQ_UNSET       (-2)
137
/* Repeated character flags. The constant has type long; the suffix is written
as an upper-case L because a lower-case l is easily misread as the digit 1. */

#define UTF_LENGTH     0x10000000L      /* The char contains its length. */
141
142/* Table for handling escaped characters in the range '0'-'z'. Positive returns
143are simple data values; negative values are for special things like \d and so
144on. Zero means further processing is needed (for things like \x), or the escape
145is invalid. */
146
147#ifndef EBCDIC
148
149/* This is the "normal" table for ASCII systems or for EBCDIC systems running
150in UTF-8 mode. */
151
/* One entry per character from '0' to 'z', two entries per row. The comment
at the end of each row names the two characters that the row covers. */

static const short int escapes[] = {
     0,                       0,                          /* 0  1 */
     0,                       0,                          /* 2  3 */
     0,                       0,                          /* 4  5 */
     0,                       0,                          /* 6  7 */
     0,                       0,                          /* 8  9 */
     CHAR_COLON,              CHAR_SEMICOLON,             /* :  ; */
     CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,           /* <  = */
     CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,         /* >  ? */
     CHAR_COMMERCIAL_AT,      -ESC_A,                     /* @  A */
     -ESC_B,                  -ESC_C,                     /* B  C */
     -ESC_D,                  -ESC_E,                     /* D  E */
     0,                       -ESC_G,                     /* F  G */
     -ESC_H,                  0,                          /* H  I */
     0,                       -ESC_K,                     /* J  K */
     0,                       0,                          /* L  M */
     -ESC_N,                  0,                          /* N  O */
     -ESC_P,                  -ESC_Q,                     /* P  Q */
     -ESC_R,                  -ESC_S,                     /* R  S */
     0,                       0,                          /* T  U */
     -ESC_V,                  -ESC_W,                     /* V  W */
     -ESC_X,                  0,                          /* X  Y */
     -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,   /* Z  [ */
     CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,  /* backslash  ] */
     CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,            /* ^  _ */
     CHAR_GRAVE_ACCENT,       ESC_a,                      /* `  a */
     -ESC_b,                  0,                          /* b  c */
     -ESC_d,                  ESC_e,                      /* d  e */
     ESC_f,                   0,                          /* f  g */
     -ESC_h,                  0,                          /* h  i */
     0,                       -ESC_k,                     /* j  k */
     0,                       0,                          /* l  m */
     ESC_n,                   0,                          /* n  o */
     -ESC_p,                  0,                          /* p  q */
     ESC_r,                   -ESC_s,                     /* r  s */
     ESC_tee,                 0,                          /* t  u */
     -ESC_v,                  -ESC_w,                     /* v  w */
     0,                       0,                          /* x  y */
     -ESC_z                                               /* z */
};
192
193#else
194
195/* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
196
/* Each row holds eight entries. The number in the comment at the start of a
row is hexadecimal and advances by 8 from row to row.
NOTE(review): presumably it is the EBCDIC code of the row's first entry, which
would make 0x48 the code for table index 0 - confirm against the code that
indexes this table. */

static const short int escapes[] = {
/*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
/*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
/*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
/*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
/*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
/*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
/*  80 */     0, ESC_a, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
/*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
/*  90 */     0,     0, -ESC_k,       0,      0, ESC_n,      0, -ESC_p,
/*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
/*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
/*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
/*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
/*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
/*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
/*  D0 */   '}',     0, -ESC_K,       0,      0,-ESC_N,      0, -ESC_P,
/*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
/*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
/*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
/*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
};
222
/* We also need a table of characters that may follow \c in an EBCDIC
environment for characters 0-31. The string holds 32 characters, one for each
of those values, followed by the usual terminating zero. It is fixed lookup
data, so it is declared const to keep it from being modified by accident. */

static const unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
227
228#endif
229
230
231/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
232searched linearly. Put all the names into a single string, in order to reduce
233the number of relocations when a shared library is dynamically linked. The
234string is built from string macros so that it works in UTF-8 mode on EBCDIC
235platforms. */
236
/* Description of one entry in the table of (*VERB) names. */

typedef struct verbitem {
  int len;       /* Number of characters in the verb name */
  int op;        /* Opcode when there is no argument; -1 if one is mandatory */
  int op_arg;    /* Opcode when an argument is given; -1 if none is allowed */
} verbitem;
242
/* The verb names, stored one after another in a single string. The order must
stay in step with the verbs[] table, whose len fields give the length of each
name in turn (0, 4, 6, 6, 1, 4, 5, 4, 4).
NOTE(review): the STRING_xxx0 macros presumably append a terminating zero to
each name, with the final name relying on the string literal's own terminator
- confirm against the macro definitions. */

static const char verbnames[] =
  "\0"                       /* Empty name is a shorthand for MARK */
  STRING_MARK0
  STRING_ACCEPT0
  STRING_COMMIT0
  STRING_F0
  STRING_FAIL0
  STRING_PRUNE0
  STRING_SKIP0
  STRING_THEN;
253
/* One entry per verb, in the same order as the names in verbnames. The fields
are: name length, opcode without an argument (-1 if an argument is mandatory),
opcode with an argument (-1 if an argument is not allowed). */

static const verbitem verbs[] = {
  { 0, -1,        OP_MARK },           /* empty name, shorthand for MARK */
  { 4, -1,        OP_MARK },           /* MARK */
  { 6, OP_ACCEPT, -1 },                /* ACCEPT */
  { 6, OP_COMMIT, -1 },                /* COMMIT */
  { 1, OP_FAIL,   -1 },                /* F */
  { 4, OP_FAIL,   -1 },                /* FAIL */
  { 5, OP_PRUNE,  OP_PRUNE_ARG },      /* PRUNE */
  { 4, OP_SKIP,   OP_SKIP_ARG  },      /* SKIP */
  { 4, OP_THEN,   OP_THEN_ARG  }       /* THEN */
};
265
266static const int verbcount = sizeof(verbs)/sizeof(verbitem);
267
268
269/* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in
270another regex library. */
271
/* Replacement for [[:<:]]. Spelled out, the pattern text is: \b(?=\w) */

static const pcre_uchar sub_start_of_word[] = {
  CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
  CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' };
275
/* Replacement for [[:>:]]. Spelled out, the pattern text is: \b(?<=\w) */

static const pcre_uchar sub_end_of_word[] = {
  CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
  CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w,
  CHAR_RIGHT_PARENTHESIS, '\0' };
280
281
282/* Tables of names of POSIX character classes and their lengths. The names are
283now all in a single string, to reduce the number of relocations when a shared
284library is dynamically loaded. The list of lengths is terminated by a zero
285length entry. The first three must be alpha, lower, upper, as this is assumed
286for handling case independence. The indices for graph, print, and punct are
287needed, so identify them. */
288
/* The POSIX class names in index order, four to a row. The position of a name
here is the class index used by the other POSIX tables, so the order must not
be changed independently of them. */

static const char posix_names[] =
  STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0    /* 0 - 3 */
  STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0    /* 4 - 7 */
  STRING_graph0 STRING_print0 STRING_punct0 STRING_space0    /* 8 - 11 */
  STRING_word0  STRING_xdigit;                               /* 12 - 13 */
294
295static const pcre_uint8 posix_name_lengths[] = {
296  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
297
298#define PC_GRAPH  8
299#define PC_PRINT  9
300#define PC_PUNCT 10
301
302
303/* Table of class bit maps for each POSIX class. Each class is formed from a
304base map, with an optional addition or removal of another map. Then, for some
305classes, there is some additional tweaking: for [:blank:] the vertical space
306characters are removed, and for [:alpha:] and [:alnum:] the underscore
307character is removed. The triples in the table consist of the base map offset,
308second map offset or -1 if no second map, and a non-negative value for map
309addition or a negative value for map subtraction (if there are two maps). The
310absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
311remove vertical space characters, 2 => remove underscore. */
312
/* Three values per POSIX class, in the same order as the class names. */

static const int posix_class_maps[] = {
  /* base map, second map (-1 if none), tweak code (negative: subtract map) */
  cbit_word,  cbit_digit, -2,             /* alpha */
  cbit_lower, -1,          0,             /* lower */
  cbit_upper, -1,          0,             /* upper */
  cbit_word,  -1,          2,             /* alnum - word without underscore */
  cbit_print, cbit_cntrl,  0,             /* ascii */
  cbit_space, -1,          1,             /* blank - a GNU extension */
  cbit_cntrl, -1,          0,             /* cntrl */
  cbit_digit, -1,          0,             /* digit */
  cbit_graph, -1,          0,             /* graph */
  cbit_print, -1,          0,             /* print */
  cbit_punct, -1,          0,             /* punct */
  cbit_space, -1,          0,             /* space */
  cbit_word,  -1,          0,             /* word - a Perl extension */
  cbit_xdigit,-1,          0              /* xdigit */
};
329
330/* Table of substitutes for \d etc when PCRE_UCP is set. They are replaced by
331Unicode property escapes. */
332
333#ifdef SUPPORT_UCP
/* Each array below holds the text of one property escape as a zero-terminated
string; the comment in front of it shows that text spelled out. */

/* \P{Nd} */
static const pcre_uchar string_PNd[]  = {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \p{Nd} */
static const pcre_uchar string_pNd[]  = {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \P{Xsp} */
static const pcre_uchar string_PXsp[] = {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \p{Xsp} */
static const pcre_uchar string_pXsp[] = {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \P{Xwd} */
static const pcre_uchar string_PXwd[] = {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \p{Xwd} */
static const pcre_uchar string_pXwd[] = {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
352
353static const pcre_uchar *substitutes[] = {
354  string_PNd,           /* \D */
355  string_pNd,           /* \d */
356  string_PXsp,          /* \S */   /* Xsp is Perl space, but from 8.34, Perl */
357  string_pXsp,          /* \s */   /* space and POSIX space are the same. */
358  string_PXwd,          /* \W */
359  string_pXwd           /* \w */
360};
361
362/* The POSIX class substitutes must be in the order of the POSIX class names,
363defined above, and there are both positive and negative cases. NULL means no
364general substitute of a Unicode property escape (\p or \P). However, for some
365POSIX classes (e.g. graph, print, punct) a special property code is compiled
366directly. */
367
/* Each array below holds the text of one escape sequence as a zero-terminated
string; the comment in front of it shows that text spelled out. */

/* \p{L} */
static const pcre_uchar string_pL[] =   {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \p{Ll} */
static const pcre_uchar string_pLl[] =  {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \p{Lu} */
static const pcre_uchar string_pLu[] =  {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \p{Xan} */
static const pcre_uchar string_pXan[] = {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \h */
static const pcre_uchar string_h[] =    {
  CHAR_BACKSLASH, CHAR_h, '\0' };
/* \p{Xps} */
static const pcre_uchar string_pXps[] = {
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \P{L} */
static const pcre_uchar string_PL[] =   {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \P{Ll} */
static const pcre_uchar string_PLl[] =  {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \P{Lu} */
static const pcre_uchar string_PLu[] =  {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \P{Xan} */
static const pcre_uchar string_PXan[] = {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
/* \H */
static const pcre_uchar string_H[] =    {
  CHAR_BACKSLASH, CHAR_H, '\0' };
/* \P{Xps} */
static const pcre_uchar string_PXps[] = {
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
402
403static const pcre_uchar *posix_substitutes[] = {
404  string_pL,            /* alpha */
405  string_pLl,           /* lower */
406  string_pLu,           /* upper */
407  string_pXan,          /* alnum */
408  NULL,                 /* ascii */
409  string_h,             /* blank */
410  NULL,                 /* cntrl */
411  string_pNd,           /* digit */
412  NULL,                 /* graph */
413  NULL,                 /* print */
414  NULL,                 /* punct */
415  string_pXps,          /* space */   /* Xps is POSIX space, but from 8.34 */
416  string_pXwd,          /* word  */   /* Perl and POSIX space are the same */
417  NULL,                 /* xdigit */
418  /* Negated cases */
419  string_PL,            /* ^alpha */
420  string_PLl,           /* ^lower */
421  string_PLu,           /* ^upper */
422  string_PXan,          /* ^alnum */
423  NULL,                 /* ^ascii */
424  string_H,             /* ^blank */
425  NULL,                 /* ^cntrl */
426  string_PNd,           /* ^digit */
427  NULL,                 /* ^graph */
428  NULL,                 /* ^print */
429  NULL,                 /* ^punct */
430  string_PXps,          /* ^space */  /* Xps is POSIX space, but from 8.34 */
431  string_PXwd,          /* ^word */   /* Perl and POSIX space are the same */
432  NULL                  /* ^xdigit */
433};
434#define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
435#endif
436
/* Two-step stringification. STRING() turns its argument into a string literal
exactly as written; XSTRING() lets the argument be macro-expanded first and
then hands the result to STRING(). */

#define STRING(a)   #a
#define XSTRING(s)  STRING(s)
439
440/* The texts of compile-time error messages. These are "char *" because they
441are passed to the outside world. Do not ever re-use any error number, because
442they are documented. Always add a new error instead. Messages marked DEAD below
443are no longer used. This used to be a table of strings, but in order to reduce
444the number of relocations needed when a shared library is loaded dynamically,
445it is now one long string. We cannot use a table of offsets, because the
446lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
447simply count through to the one we want - this isn't a performance issue
448because these strings are used only when there is a compilation error.
449
450Each substring ends with \0 to insert a null character. This includes the final
451substring, so that the whole string ends with \0\0, which can be detected when
452counting through. */
453
/* All the compile-time error messages, stored as consecutive zero-terminated
substrings. The number of a message is its zero-based position in the
sequence; the numbered comments mark every fifth message. Message 68 has two
alternative texts, selected by whether EBCDIC is defined. */

static const char error_texts[] =
  "no error\0"
  "\\ at end of pattern\0"
  "\\c at end of pattern\0"
  "unrecognized character follows \\\0"
  "numbers out of order in {} quantifier\0"
  /* 5 */
  "number too big in {} quantifier\0"
  "missing terminating ] for character class\0"
  "invalid escape sequence in character class\0"
  "range out of order in character class\0"
  "nothing to repeat\0"
  /* 10 */
  "internal error: invalid forward reference offset\0"
  "internal error: unexpected repeat\0"
  "unrecognized character after (? or (?-\0"
  "POSIX named classes are supported only within a class\0"
  "missing )\0"
  /* 15 */
  "reference to non-existent subpattern\0"
  "erroffset passed as NULL\0"
  "unknown option bit(s) set\0"
  "missing ) after comment\0"
  "parentheses nested too deeply\0"  /** DEAD **/
  /* 20 */
  "regular expression is too large\0"
  "failed to get memory\0"
  "unmatched parentheses\0"
  "internal error: code overflow\0"
  "unrecognized character after (?<\0"
  /* 25 */
  "lookbehind assertion is not fixed length\0"
  "malformed number or name after (?(\0"
  "conditional group contains more than two branches\0"
  "assertion expected after (?( or (?(?C)\0"
  "(?R or (?[+-]digits must be followed by )\0"
  /* 30 */
  "unknown POSIX class name\0"
  "POSIX collating elements are not supported\0"
  "this version of PCRE is compiled without UTF support\0"
  "spare error\0"  /** DEAD **/
  "character value in \\x{} or \\o{} is too large\0"
  /* 35 */
  "invalid condition (?(0)\0"
  "\\C not allowed in lookbehind assertion\0"
  "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
  "number after (?C is > 255\0"
  "closing ) for (?C expected\0"
  /* 40 */
  "recursive call could loop indefinitely\0"
  "unrecognized character after (?P\0"
  "syntax error in subpattern name (missing terminator)\0"
  "two named subpatterns have the same name\0"
  "invalid UTF-8 string\0"
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled\0"
  "malformed \\P or \\p sequence\0"
  "unknown property name after \\P or \\p\0"
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
  /* 50 */
  "repeated subpattern is too long\0"    /** DEAD **/
  "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
  "internal error: overran compiling workspace\0"
  "internal error: previously-checked referenced subpattern not found\0"
  "DEFINE group contains more than one branch\0"
  /* 55 */
  "repeating a DEFINE group is not allowed\0"  /** DEAD **/
  "inconsistent NEWLINE options\0"
  "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
  "a numbered reference must not be zero\0"
  "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
  /* 60 */
  "(*VERB) not recognized or malformed\0"
  "number is too big\0"
  "subpattern name expected\0"
  "digit expected after (?+\0"
  "] is an invalid data character in JavaScript compatibility mode\0"
  /* 65 */
  "different names for subpatterns of the same number are not allowed\0"
  "(*MARK) must have an argument\0"
  "this version of PCRE is not compiled with Unicode property support\0"
#ifndef EBCDIC
  "\\c must be followed by an ASCII character\0"
#else
  "\\c must be followed by a letter or one of [\\]^_?\0"
#endif
  "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
  /* 70 */
  "internal error: unknown opcode in find_fixedlength()\0"
  "\\N is not supported in a class\0"
  "too many forward references\0"
  "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
  "invalid UTF-16 string\0"
  /* 75 */
  "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
  "character value in \\u.... sequence is too large\0"
  "invalid UTF-32 string\0"
  "setting UTF is disabled by the application\0"
  "non-hex character in \\x{} (closing brace missing?)\0"
  /* 80 */
  "non-octal character in \\o{} (closing brace missing?)\0"
  "missing opening brace after \\o\0"
  "parentheses are too deeply nested\0"
  "invalid range in character class\0"
  "group name must start with a non-digit\0"
  /* 85 */
  "parentheses are too deeply nested (stack check)\0"
  "digits missing in \\x{} or \\o{}\0"
  "regular expression is too complicated\0"
  ;
565
566/* Table to identify digits and hex digits. This is used when compiling
567patterns. Note that the tables in chartables are dependent on the locale, and
568may mark arbitrary characters as digits - but the PCRE compiling code expects
569to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
570a private table here. It costs 256 bytes, but it is a lot faster than doing
571character value tests (at least in some simple cases I timed), and in some
572applications one wants PCRE to compile efficiently as well as match
573efficiently.
574
575For convenience, we use the same bit definitions as in chartables:
576
577  0x04   decimal digit
578  0x08   hexadecimal digit
579
580Then we can use ctype_digit and ctype_xdigit in the code. */
581
/* Testing for a decimal digit with arithmetic rather than a memory read is
much faster, and the resulting code is simpler. The test is written directly
as a subtraction followed by a single unsigned comparison. This gives the same
answer as checking CHAR_0 <= x <= CHAR_9 for every character value the library
handles (values of at most 32 bits, including negative int values, which wrap
to a large unsigned number and so fail the test), but it evaluates its
argument only once, so an argument with side effects, such as *p++, is
safe. */

#define IS_DIGIT(x) \
  ((pcre_uint32)(x) - (pcre_uint32)CHAR_0 <= (pcre_uint32)(CHAR_9 - CHAR_0))
587
588#ifndef EBCDIC
589
590/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
591UTF-8 mode. */
592
/* One entry per character value, 0 to 255, eight entries per row. An entry of
0x0c (0x04 | 0x08) marks a character that is both a decimal and a hexadecimal
digit ('0' to '9'); 0x08 marks a character that is a hexadecimal digit only
('A' to 'F' and 'a' to 'f'). All other entries are zero. */

static const pcre_uint8 digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
627
628#else
629
630/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
631
/* One entry per character value, 0 to 255, eight entries per row, using the
same bit values as the ASCII version of the table: 0x0c for a character that
is both a decimal and a hexadecimal digit, 0x08 for a hexadecimal digit only.
The last column of every second row comment is the hexadecimal value of the
first entry in that row. */

static const pcre_uint8 digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
666
/* One entry per character value, 0 to 255, eight entries per row. It is
described as a partial duplicate of the character table.
NOTE(review): the meaning of the individual bits is not defined in this part
of the file - presumably they are the character-type bits used by chartables;
confirm against the header that defines them. */

static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
700#endif
701
702
703/* This table is used to check whether auto-possessification is possible
704between adjacent character-type opcodes. The left-hand (repeated) opcode is
used to select the row, and the right-hand opcode is used to select the column.
706A value of 1 means that auto-possessification is OK. For example, the second
707value in the first row means that \D+\d can be turned into \D++\d.
708
709The Unicode property types (\P and \p) have to be present to fill out the table
710because of what their opcode values are, but the table values should always be
711zero because property types are handled separately in the code. The last four
712columns apply to items that cannot be repeated, so there is no need to have
713rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
714*not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
715
/* Table dimensions. Both are counted from FIRST_AUTOTAB_OP: there is one row
for each opcode up to LAST_AUTOTAB_LEFT_OP (the items that can be repeated) and
one column for each opcode up to LAST_AUTOTAB_RIGHT_OP (which additionally
covers the trailing non-repeatable items). */

#define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
#define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)

/* 17 rows by 21 columns as laid out below. The entries are positional, so the
row and column order must stay in step with the opcode numbering. */

static const pcre_uint8 autoposstab[APTROWS][APTCOLS] = {
/* \D \d \S \s \W \w  . .+ \C \P \p \R \H \h \V \v \X \Z \z  $ $M */
  { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \D */
  { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \d */
  { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \S */
  { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \s */
  { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \W */
  { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \w */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .  */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .+ */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \C */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \P */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \p */
  { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \R */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \H */
  { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \h */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \V */
  { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 },  /* \v */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }   /* \X */
};
739
740#ifdef SUPPORT_UCP
741
742/* This table is used to check whether auto-possessification is possible
743between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
744left-hand (repeated) opcode is used to select the row, and the right-hand
745opcode is used to select the column. The values are as follows:
746
747  0   Always return FALSE (never auto-possessify)
748  1   Character groups are distinct (possessify if both are OP_PROP)
749  2   Check character categories in the same group (general or particular)
750  3   TRUE if the two opcodes are not the same (PROP vs NOTPROP)
751
752  4   Check left general category vs right particular category
753  5   Check right general category vs left particular category
754
755  6   Left alphanum vs right general category
756  7   Left space vs right general category
757  8   Left word vs right general category
758
759  9   Right alphanum vs left general category
760 10   Right space vs left general category
761 11   Right word vs left general category
762
763 12   Left alphanum vs right particular category
764 13   Left space vs right particular category
765 14   Left word vs right particular category
766
767 15   Right alphanum vs left particular category
768 16   Right space vs left particular category
769 17   Right word vs left particular category
770*/
771
/* Square table, PT_TABSIZE on each side. The row is the property type of the
left-hand (repeated) item and the column is the property type of the right-hand
item, both in the order given by the heading and row comments below. Each entry
is one of the action codes 0-17 described above. */

static const pcre_uint8 propposstab[PT_TABSIZE][PT_TABSIZE] = {
/* ANY LAMP GC  PC  SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
  { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_ANY */
  { 0,  3,  0,  0,  0,    3,    1,      1,   0,    0,   0 },  /* PT_LAMP */
  { 0,  0,  2,  4,  0,    9,   10,     10,  11,    0,   0 },  /* PT_GC */
  { 0,  0,  5,  2,  0,   15,   16,     16,  17,    0,   0 },  /* PT_PC */
  { 0,  0,  0,  0,  2,    0,    0,      0,   0,    0,   0 },  /* PT_SC */
  { 0,  3,  6, 12,  0,    3,    1,      1,   0,    0,   0 },  /* PT_ALNUM */
  { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_SPACE */
  { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_PXSPACE */
  { 0,  0,  8, 14,  0,    0,    1,      1,   3,    0,   0 },  /* PT_WORD */
  { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_CLIST */
  { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   3 }   /* PT_UCNC */
};
786
787/* This table is used to check whether auto-possessification is possible
788between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
789specifies a general category and the other specifies a particular category. The
790row is selected by the general category and the column by the particular
791category. The value is 1 if the particular category is not part of the general
792category. */
793
/* 7 rows (the general categories C, L, M, N, P, S, Z) by 30 columns (the
particular categories, in the order shown in the heading comment). In each row
the zero entries form one contiguous run covering the particular categories
whose names start with that row's letter.
NOTE(review): the row and column order presumably matches the numeric values of
the ucp_xx category codes - confirm against their definitions. */

static const pcre_uint8 catposstab[7][30] = {
/* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
  { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* C */
  { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* L */
  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* M */
  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* N */
  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },  /* P */
  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 },  /* S */
  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 }   /* Z */
};
804
805/* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
806a general or particular category. The properties in each row are those
807that apply to the character set in question. Duplication means that a little
808unnecessary work is done when checking, but this keeps things much simpler
809because they can all use the same code. For more details see the comment where
810this table is used.
811
812Note: SPACE and PXSPACE used to be different because Perl excluded VT from
813"space", but from Perl 5.18 it's included, so both categories are treated the
814same here. */
815
/* One row per character set: ALNUM, then SPACE/PXSPACE (shared), then WORD.
Each row holds four ucp_xx property codes; the row comments mark the entries
that are only padding so that every row has the same length.
NOTE(review): how the four positions are interpreted is defined where the table
is used, which is not visible here - confirm there before reordering. */

static const pcre_uint8 posspropstab[3][4] = {
  { ucp_L, ucp_N, ucp_N, ucp_Nl },  /* ALNUM, 3rd and 4th values redundant */
  { ucp_Z, ucp_Z, ucp_C, ucp_Cc },  /* SPACE and PXSPACE, 2nd value redundant */
  { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */
};
821#endif
822
823/* This table is used when converting repeating opcodes into possessified
824versions as a result of an explicit possessive quantifier such as ++. A zero
825value means there is no possessified version - in those cases the item in
826question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
827because all relevant opcodes are less than that. */
828
/* Indexed by opcode number. Each entry is either the possessive opcode that
replaces the indexing opcode, or zero. The entries are positional: the first 32
are all zero, and the remainder follow the opcode order named in the comments
on each line, ending at CALLOUT. Within each repeat group only the greedy
forms have a non-zero entry; the minimizing, exact and already-possessive
forms are zero. */

static const pcre_uint8 opcode_possessify[] = {
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */

  0,                       /* NOTI */
  OP_POSSTAR, 0,           /* STAR, MINSTAR */
  OP_POSPLUS, 0,           /* PLUS, MINPLUS */
  OP_POSQUERY, 0,          /* QUERY, MINQUERY */
  OP_POSUPTO, 0,           /* UPTO, MINUPTO */
  0,                       /* EXACT */
  0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */

  OP_POSSTARI, 0,          /* STARI, MINSTARI */
  OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
  OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
  OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
  0,                       /* EXACTI */
  0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */

  OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
  OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
  OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
  OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
  0,                       /* NOTEXACT */
  0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */

  OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
  OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
  OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
  OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
  0,                       /* NOTEXACTI */
  0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */

  OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
  OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
  OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
  OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
  0,                       /* TYPEEXACT */
  0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */

  OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
  OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
  OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
  OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
  0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */

  0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
  0, 0,                    /* REF, REFI */
  0, 0,                    /* DNREF, DNREFI */
  0, 0                     /* RECURSE, CALLOUT */
};
880
881
882
883/*************************************************
884*            Find an error text                  *
885*************************************************/
886
887/* The error texts are now all in one long string, to save on relocations. As
888some of the text is of unknown length, we can't use a table of offsets.
889Instead, just count through the strings. This is not a performance issue
890because it happens only when there has been a compilation error.
891
892Argument:   the error number
893Returns:    pointer to the error string
894*/
895
896static const char *
897find_error_text(int n)
898{
899const char *s = error_texts;
900for (; n > 0; n--)
901  {
902  while (*s++ != CHAR_NULL) {};
903  if (*s == CHAR_NULL) return "Error text not found (please report)";
904  }
905return s;
906}
907
908
909
910/*************************************************
911*           Expand the workspace                 *
912*************************************************/
913
914/* This function is called during the second compiling phase, if the number of
915forward references fills the existing workspace, which is originally a block on
916the stack. A larger block is obtained from malloc() unless the ultimate limit
917has been reached or the increase will be rather small.
918
919Argument: pointer to the compile data block
920Returns:  0 if all went well, else an error number
921*/
922
923static int
924expand_workspace(compile_data *cd)
925{
926pcre_uchar *newspace;
927int newsize = cd->workspace_size * 2;
928
929if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
930if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
931    newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
932 return ERR72;
933
934newspace = (PUBL(malloc))(IN_UCHARS(newsize));
935if (newspace == NULL) return ERR21;
936memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
937cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
938if (cd->workspace_size > COMPILE_WORK_SIZE)
939  (PUBL(free))((void *)cd->start_workspace);
940cd->start_workspace = newspace;
941cd->workspace_size = newsize;
942return 0;
943}
944
945
946
947/*************************************************
948*            Check for counted repeat            *
949*************************************************/
950
951/* This function is called when a '{' is encountered in a place where it might
952start a quantifier. It looks ahead to see if it really is a quantifier or not.
953It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
954where the ddds are digits.
955
956Arguments:
957  p         pointer to the first char after '{'
958
959Returns:    TRUE or FALSE
960*/
961
962static BOOL
963is_counted_repeat(const pcre_uchar *p)
964{
965if (!IS_DIGIT(*p)) return FALSE;
966p++;
967while (IS_DIGIT(*p)) p++;
968if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
969
970if (*p++ != CHAR_COMMA) return FALSE;
971if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
972
973if (!IS_DIGIT(*p)) return FALSE;
974p++;
975while (IS_DIGIT(*p)) p++;
976
977return (*p == CHAR_RIGHT_CURLY_BRACKET);
978}
979
980
981
982/*************************************************
983*            Handle escapes                      *
984*************************************************/
985
/* This function is called when a \ has been encountered. It returns 0 for a
data character, whose value is placed in chptr (this includes escapes that
resolve to a single character value, such as \x41 or an octal number), or a
positive value for a special escape sequence such as \d. A backreference to
group n is returned as negative n. When UTF-8 is enabled, a value greater than
255 may be returned in chptr. On entry, ptr is pointing at the \. On exit, it
is on the final character of the escape sequence.
992
993Arguments:
994  ptrptr         points to the pattern position pointer
995  chptr          points to a returned data character
996  errorcodeptr   points to the errorcode variable
997  bracount       number of previous extracting brackets
998  options        the options bits
999  isclass        TRUE if inside a character class
1000
1001Returns:         zero => a data character
1002                 positive => a special escape sequence
1003                 negative => a back reference
1004                 on error, errorcodeptr is set
1005*/
1006
static int
check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,
  int bracount, int options, BOOL isclass)
{
/* PCRE_UTF16 has the same value as PCRE_UTF8. */
BOOL utf = (options & PCRE_UTF8) != 0;
const pcre_uchar *ptr = *ptrptr + 1;
pcre_uint32 c;
int escape = 0;

/* Note that i is used for two purposes. It first receives the result of the
escapes[] table lookup, and the "further processing" branch below is reached
only when that result is zero. The \0 (octal) and \xhh loops then use i as a
digit counter without resetting it, so they depend on it still being zero at
that point. */

int i;

GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
ptr--;                            /* Set pointer back to the last byte */

/* If backslash is at the end of the pattern, it's an error. */

if (c == CHAR_NULL) *errorcodeptr = ERR1;

/* Non-alphanumerics are literals. For digits or letters, do an initial lookup
in a table. A non-zero result is something that can be returned immediately.
Otherwise further processing may be required. A positive table entry is a data
character value, which replaces c; a negative entry is the negation of an
escape code, which is stored in escape. */

#ifndef EBCDIC  /* ASCII/UTF-8 coding */
/* Not alphanumeric */
else if (c < CHAR_0 || c > CHAR_z) {}
else if ((i = escapes[c - CHAR_0]) != 0)
  { if (i > 0) c = (pcre_uint32)i; else escape = -i; }

#else           /* EBCDIC coding */
/* Not alphanumeric */
else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
else if ((i = escapes[c - 0x48]) != 0)  { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
#endif

/* Escapes that need further processing, or are illegal. */

else
  {
  const pcre_uchar *oldptr;
  BOOL braced, negated, overflow;
  int s;

  switch (c)
    {
    /* A number of Perl escapes are not handled by PCRE. We give an explicit
    error. */

    case CHAR_l:
    case CHAR_L:
    *errorcodeptr = ERR37;
    break;

    case CHAR_u:
    if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
      {
      /* In JavaScript, \u must be followed by four hexadecimal numbers.
      Otherwise it is a lowercase u letter. In the latter case nothing is done
      here: c still holds the letter and escape is still zero. */
      if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
        && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
        && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
        && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
        {
        c = 0;
        for (i = 0; i < 4; ++i)
          {
          register pcre_uint32 cc = *(++ptr);
#ifndef EBCDIC  /* ASCII/UTF-8 coding */
          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
          c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
#else           /* EBCDIC coding */
          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
#endif
          }

        /* Check that the value fits the code unit width and mode in use. */

#if defined COMPILE_PCRE8
        if (c > (utf ? 0x10ffffU : 0xffU))
#elif defined COMPILE_PCRE16
        if (c > (utf ? 0x10ffffU : 0xffffU))
#elif defined COMPILE_PCRE32
        if (utf && c > 0x10ffffU)
#endif
          {
          *errorcodeptr = ERR76;
          }
        /* In UTF mode the surrogate range is rejected. */
        else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
        }
      }
    else
      *errorcodeptr = ERR37;
    break;

    case CHAR_U:
    /* In JavaScript, \U is an uppercase U letter. */
    if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
    break;

    /* In a character class, \g is just a literal "g". Outside a character
    class, \g must be followed by one of a number of specific things:

    (1) A number, either plain or braced. If positive, it is an absolute
    backreference. If negative, it is a relative backreference. This is a Perl
    5.10 feature.

    (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
    is part of Perl's movement towards a unified syntax for back references. As
    this is synonymous with \k{name}, we fudge it up by pretending it really
    was \k.

    (3) For Oniguruma compatibility we also support \g followed by a name or a
    number either in angle brackets or in single quotes. However, these are
    (possibly recursive) subroutine calls, _not_ backreferences. Just return
    the ESC_g code (cf \k). */

    case CHAR_g:
    if (isclass) break;
    if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
      {
      escape = ESC_g;
      break;
      }

    /* Handle the Perl-compatible cases */

    if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
      {
      /* Scan the braced text. If anything other than a digit or a minus sign
      is found before the closing bracket or the end of the pattern, it is
      treated as a name. */
      const pcre_uchar *p;
      for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
        if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
      if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
        {
        escape = ESC_k;
        break;
        }
      braced = TRUE;
      ptr++;
      }
    else braced = FALSE;

    if (ptr[1] == CHAR_MINUS)
      {
      negated = TRUE;
      ptr++;
      }
    else negated = FALSE;

    /* The integer range is limited by the machine's int representation. */
    s = 0;
    overflow = FALSE;
    while (IS_DIGIT(ptr[1]))
      {
      if (s > INT_MAX / 10 - 1) /* Integer overflow */
        {
        overflow = TRUE;
        break;
        }
      s = s * 10 + (int)(*(++ptr) - CHAR_0);
      }
    if (overflow) /* Integer overflow */
      {
      /* Skip the remaining digits so that ptr is left at the end of the
      number when the error is reported. */
      while (IS_DIGIT(ptr[1]))
        ptr++;
      *errorcodeptr = ERR61;
      break;
      }

    if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
      {
      *errorcodeptr = ERR57;
      break;
      }

    /* A value of zero (which includes the case of no digits at all) is not a
    valid reference. */

    if (s == 0)
      {
      *errorcodeptr = ERR58;
      break;
      }

    /* Convert a relative reference into an absolute group number by counting
    back from the most recent group: a relative value of 1 gives bracount. */

    if (negated)
      {
      if (s > bracount)
        {
        *errorcodeptr = ERR15;
        break;
        }
      s = bracount - (s - 1);
      }

    escape = -s;
    break;

    /* The handling of escape sequences consisting of a string of digits
    starting with one that is not zero is not straightforward. Perl has changed
    over the years. Nowadays \g{} for backreferences and \o{} for octal are
    recommended to avoid the ambiguities in the old syntax.

    Outside a character class, the digits are read as a decimal number. If the
    number is less than 8 (used to be 10), or if there are that many previous
    extracting left brackets, then it is a back reference. Otherwise, up to
    three octal digits are read to form an escaped byte. Thus \123 is likely to
    be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
    the octal value is greater than 377, the least significant 8 bits are
    taken. \8 and \9 are treated as the literal characters 8 and 9.

    Inside a character class, \ followed by a digit is always either a literal
    8 or 9 or an octal number. */

    case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
    case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:

    if (!isclass)
      {
      /* Remember where the first digit is, so that the scan can be undone if
      this turns out not to be a back reference. */
      oldptr = ptr;
      /* The integer range is limited by the machine's int representation. */
      s = (int)(c -CHAR_0);
      overflow = FALSE;
      while (IS_DIGIT(ptr[1]))
        {
        if (s > INT_MAX / 10 - 1) /* Integer overflow */
          {
          overflow = TRUE;
          break;
          }
        s = s * 10 + (int)(*(++ptr) - CHAR_0);
        }
      if (overflow) /* Integer overflow */
        {
        while (IS_DIGIT(ptr[1]))
          ptr++;
        *errorcodeptr = ERR61;
        break;
        }
      if (s < 8 || s <= bracount)  /* Check for back reference */
        {
        escape = -s;
        break;
        }
      ptr = oldptr;      /* Put the pointer back and fall through */
      }

    /* Handle a digit following \ when the number is not a back reference. If
    the first digit is 8 or 9, Perl used to generate a binary zero byte and
    then treat the digit as a following literal. At least by Perl 5.18 this
    changed so as not to insert the binary zero. */

    if ((c = *ptr) >= CHAR_8) break;

    /* Fall through with a digit less than 8 */

    /* \0 always starts an octal number, but we may drop through to here with a
    larger first octal digit. The original code used just to take the least
    significant 8 bits of octal numbers (I think this is what early Perls used
    to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
    but no more than 3 octal digits. */

    case CHAR_0:
    c -= CHAR_0;
    /* At most two more octal digits are read. The counter i is not set here;
    it is zero on entry because this branch is reached only when the table
    lookup above gave zero. */
    while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
        c = c * 8 + *(++ptr) - CHAR_0;
#ifdef COMPILE_PCRE8
    if (!utf && c > 0xff) *errorcodeptr = ERR51;
#endif
    break;

    /* \o is a relatively new Perl feature, supporting a more general way of
    specifying character codes in octal. The only supported form is \o{ddd}. */

    case CHAR_o:
    if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
    if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR86; else
      {
      ptr += 2;
      c = 0;
      overflow = FALSE;
      while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
        {
        register pcre_uint32 cc = *ptr++;
        if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
        /* In the 32-bit library, check before shifting so that the top bits
        cannot be lost. */
#ifdef COMPILE_PCRE32
        if (c >= 0x20000000l) { overflow = TRUE; break; }
#endif
        c = (c << 3) + cc - CHAR_0 ;
#if defined COMPILE_PCRE8
        if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
#elif defined COMPILE_PCRE16
        if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
#elif defined COMPILE_PCRE32
        if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
#endif
        }
      if (overflow)
        {
        /* Skip any remaining octal digits before giving the error. */
        while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
        *errorcodeptr = ERR34;
        }
      else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
        {
        if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
        }
      /* The digits were not followed by a closing bracket. */
      else *errorcodeptr = ERR80;
      }
    break;

    /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
    numbers. Otherwise it is a lowercase x letter. */

    case CHAR_x:
    if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
      {
      if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
        && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
        {
        c = 0;
        for (i = 0; i < 2; ++i)
          {
          register pcre_uint32 cc = *(++ptr);
#ifndef EBCDIC  /* ASCII/UTF-8 coding */
          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
          c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
#else           /* EBCDIC coding */
          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
#endif
          }
        }
      }    /* End JavaScript handling */

    /* Handle \x in Perl's style. \x{ddd} is a character number which can be
    greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
    digits. If not, { used to be treated as a data character. However, Perl
    seems to read hex digits up to the first non-such, and ignore the rest, so
    that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
    now gives an error. */

    else
      {
      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
        {
        ptr += 2;
        if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
          {
          *errorcodeptr = ERR86;
          break;
          }
        c = 0;
        overflow = FALSE;
        while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
          {
          register pcre_uint32 cc = *ptr++;
          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */

          /* In the 32-bit library, check before shifting so that the top bits
          cannot be lost. */

#ifdef COMPILE_PCRE32
          if (c >= 0x10000000l) { overflow = TRUE; break; }
#endif

#ifndef EBCDIC  /* ASCII/UTF-8 coding */
          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
          c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
#else           /* EBCDIC coding */
          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
#endif

#if defined COMPILE_PCRE8
          if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
#elif defined COMPILE_PCRE16
          if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
#elif defined COMPILE_PCRE32
          if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
#endif
          }

        if (overflow)
          {
          /* Skip any remaining hex digits before giving the error. */
          while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
          *errorcodeptr = ERR34;
          }

        else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
          {
          if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
          }

        /* If the sequence of hex digits does not end with '}', give an error.
        We used just to recognize this construct and fall through to the normal
        \x handling, but nowadays Perl gives an error, which seems much more
        sensible, so we do too. */

        else *errorcodeptr = ERR79;
        }   /* End of \x{} processing */

      /* Read a single-byte hex-defined char (up to two hex digits after \x).
      If no hex digit follows, the result is a zero value. As for octal, the
      counter i is not set here; it is zero on entry to this branch. */

      else
        {
        c = 0;
        while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
          {
          pcre_uint32 cc;                          /* Some compilers don't like */
          cc = *(++ptr);                           /* ++ in initializers */
#ifndef EBCDIC  /* ASCII/UTF-8 coding */
          if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
          c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
#else           /* EBCDIC coding */
          if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
          c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
#endif
          }
        }     /* End of \xdd handling */
      }       /* End of Perl-style \x handling */
    break;

    /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
    An error is given if the byte following \c is not an ASCII character. This
    coding is ASCII-specific, but then the whole concept of \cx is
    ASCII-specific. (However, an EBCDIC equivalent has now been added.) */

    case CHAR_c:
    c = *(++ptr);
    if (c == CHAR_NULL)
      {
      *errorcodeptr = ERR2;
      break;
      }
#ifndef EBCDIC    /* ASCII/UTF-8 coding */
    if (c > 127)  /* Excludes all non-ASCII in either mode */
      {
      *errorcodeptr = ERR68;
      break;
      }
    if (c >= CHAR_a && c <= CHAR_z) c -= 32;
    c ^= 0x40;
#else             /* EBCDIC coding */
    if (c >= CHAR_a && c <= CHAR_z) c += 64;
    /* \c? is handled specially, with a value that depends on the code values
    of two characters in the compiler's character set. Any other character is
    looked up in ebcdic_escape_c[], and its index in that table becomes the
    result; a character that is not in the table is an error. */
    if (c == CHAR_QUESTION_MARK)
      c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
    else
      {
      for (i = 0; i < 32; i++)
        {
        if (c == ebcdic_escape_c[i]) break;
        }
      if (i < 32) c = i; else *errorcodeptr = ERR68;
      }
#endif
    break;

    /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
    other alphanumeric following \ is an error if PCRE_EXTRA was set;
    otherwise, for Perl compatibility, it is a literal. This code looks a bit
    odd, but there used to be some cases other than the default, and there may
    be again in future, so I haven't "optimized" it. */

    default:
    if ((options & PCRE_EXTRA) != 0) switch(c)
      {
      default:
      *errorcodeptr = ERR3;
      break;
      }
    break;
    }
  }

/* Perl supports \N{name} for character names, as well as plain \N for "not
newline". PCRE does not support \N{name}. However, it does support
quantification such as \N{2,3}. */

if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
     !is_counted_repeat(ptr+2))
  *errorcodeptr = ERR37;

/* If PCRE_UCP is set, we change the values for \d etc. Escape codes in the
range ESC_D to ESC_w are shifted by a fixed amount so that they start at
ESC_DU instead. */

if ((options & PCRE_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
  escape += (ESC_DU - ESC_D);

/* Set the pointer to the final character before returning. The character
value is always passed back through chptr, whatever the return value. */

*ptrptr = ptr;
*chptr = c;
return escape;
}
1490
1491
1492
1493#ifdef SUPPORT_UCP
1494/*************************************************
1495*               Handle \P and \p                 *
1496*************************************************/
1497
1498/* This function is called after \P or \p has been encountered, provided that
1499PCRE is compiled with support for Unicode properties. On entry, ptrptr is
1500pointing at the P or p. On exit, it is pointing at the final character of the
1501escape sequence.
1502
1503Argument:
1504  ptrptr         points to the pattern position pointer
1505  negptr         points to a boolean that is set TRUE for negation else FALSE
1506  ptypeptr       points to an unsigned int that is set to the type value
1507  pdataptr       points to an unsigned int that is set to the detailed property value
1508  errorcodeptr   points to the error code variable
1509
1510Returns:         TRUE if the type value was found, or FALSE for an invalid type
1511*/
1512
1513static BOOL
1514get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr,
1515  unsigned int *pdataptr, int *errorcodeptr)
1516{
1517pcre_uchar c;
1518int i, bot, top;
1519const pcre_uchar *ptr = *ptrptr;
1520pcre_uchar name[32];
1521
1522c = *(++ptr);
1523if (c == CHAR_NULL) goto ERROR_RETURN;
1524
1525*negptr = FALSE;
1526
1527/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
1528negation. */
1529
1530if (c == CHAR_LEFT_CURLY_BRACKET)
1531  {
1532  if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1533    {
1534    *negptr = TRUE;
1535    ptr++;
1536    }
1537  for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
1538    {
1539    c = *(++ptr);
1540    if (c == CHAR_NULL) goto ERROR_RETURN;
1541    if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1542    name[i] = c;
1543    }
1544  if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
1545  name[i] = 0;
1546  }
1547
1548/* Otherwise there is just one following character */
1549
1550else
1551  {
1552  name[0] = c;
1553  name[1] = 0;
1554  }
1555
1556*ptrptr = ptr;
1557
1558/* Search for a recognized property name using binary chop */
1559
1560bot = 0;
1561top = PRIV(utt_size);
1562
1563while (bot < top)
1564  {
1565  int r;
1566  i = (bot + top) >> 1;
1567  r = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1568  if (r == 0)
1569    {
1570    *ptypeptr = PRIV(utt)[i].type;
1571    *pdataptr = PRIV(utt)[i].value;
1572    return TRUE;
1573    }
1574  if (r > 0) bot = i + 1; else top = i;
1575  }
1576
1577*errorcodeptr = ERR47;
1578*ptrptr = ptr;
1579return FALSE;
1580
1581ERROR_RETURN:
1582*errorcodeptr = ERR46;
1583*ptrptr = ptr;
1584return FALSE;
1585}
1586#endif
1587
1588
1589
1590/*************************************************
1591*         Read repeat counts                     *
1592*************************************************/
1593
1594/* Read an item of the form {n,m} and return the values. This is called only
1595after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1596so the syntax is guaranteed to be correct, but we need to check the values.
1597
1598Arguments:
1599  p              pointer to first char after '{'
1600  minp           pointer to int for min
1601  maxp           pointer to int for max
1602                 returned as -1 if no max
1603  errorcodeptr   points to error code variable
1604
1605Returns:         pointer to '}' on success;
1606                 current ptr on error, with errorcodeptr set non-zero
1607*/
1608
1609static const pcre_uchar *
1610read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1611{
1612int min = 0;
1613int max = -1;
1614
1615while (IS_DIGIT(*p))
1616  {
1617  min = min * 10 + (int)(*p++ - CHAR_0);
1618  if (min > 65535)
1619    {
1620    *errorcodeptr = ERR5;
1621    return p;
1622    }
1623  }
1624
1625if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1626  {
1627  if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1628    {
1629    max = 0;
1630    while(IS_DIGIT(*p))
1631      {
1632      max = max * 10 + (int)(*p++ - CHAR_0);
1633      if (max > 65535)
1634        {
1635        *errorcodeptr = ERR5;
1636        return p;
1637        }
1638      }
1639    if (max < min)
1640      {
1641      *errorcodeptr = ERR4;
1642      return p;
1643      }
1644    }
1645  }
1646
1647*minp = min;
1648*maxp = max;
1649return p;
1650}
1651
1652
1653
1654/*************************************************
1655*      Find first significant op code            *
1656*************************************************/
1657
1658/* This is called by several functions that scan a compiled expression looking
1659for a fixed first character, or an anchoring op code etc. It skips over things
1660that do not influence this. For some calls, it makes sense to skip negative
1661forward and all backward assertions, and also the \b assertion; for others it
1662does not.
1663
1664Arguments:
1665  code         pointer to the start of the group
1666  skipassert   TRUE if certain assertions are to be skipped
1667
1668Returns:       pointer to the first significant opcode
1669*/
1670
1671static const pcre_uchar*
1672first_significant_code(const pcre_uchar *code, BOOL skipassert)
1673{
1674for (;;)
1675  {
1676  switch ((int)*code)
1677    {
1678    case OP_ASSERT_NOT:
1679    case OP_ASSERTBACK:
1680    case OP_ASSERTBACK_NOT:
1681    if (!skipassert) return code;
1682    do code += GET(code, 1); while (*code == OP_ALT);
1683    code += PRIV(OP_lengths)[*code];
1684    break;
1685
1686    case OP_WORD_BOUNDARY:
1687    case OP_NOT_WORD_BOUNDARY:
1688    if (!skipassert) return code;
1689    /* Fall through */
1690
1691    case OP_CALLOUT:
1692    case OP_CREF:
1693    case OP_DNCREF:
1694    case OP_RREF:
1695    case OP_DNRREF:
1696    case OP_DEF:
1697    code += PRIV(OP_lengths)[*code];
1698    break;
1699
1700    default:
1701    return code;
1702    }
1703  }
1704/* Control never reaches here */
1705}
1706
1707
1708
1709/*************************************************
1710*        Find the fixed length of a branch       *
1711*************************************************/
1712
1713/* Scan a branch and compute the fixed length of subject that will match it,
1714if the length is fixed. This is needed for dealing with backward assertions.
1715In UTF8 mode, the result is in characters rather than bytes. The branch is
1716temporarily terminated with OP_END when this function is called.
1717
1718This function is called when a backward assertion is encountered, so that if it
1719fails, the error message can point to the correct place in the pattern.
1720However, we cannot do this when the assertion contains subroutine calls,
1721because they can be forward references. We solve this by remembering this case
1722and doing the check at the end; a flag specifies which mode we are running in.
1723
1724Arguments:
1725  code     points to the start of the pattern (the bracket)
1726  utf      TRUE in UTF-8 / UTF-16 / UTF-32 mode
1727  atend    TRUE if called when the pattern is complete
1728  cd       the "compile data" structure
1729  recurses    chain of recurse_check to catch mutual recursion
1730
1731Returns:   the fixed length,
1732             or -1 if there is no fixed length,
1733             or -2 if \C was encountered (in UTF-8 mode only)
1734             or -3 if an OP_RECURSE item was encountered and atend is FALSE
1735             or -4 if an unknown opcode was encountered (internal error)
1736*/
1737
1738static int
1739find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd,
1740  recurse_check *recurses)
1741{
1742int length = -1;
1743recurse_check this_recurse;
1744register int branchlength = 0;
1745register pcre_uchar *cc = code + 1 + LINK_SIZE;
1746
1747/* Scan along the opcodes for this branch. If we get to the end of the
1748branch, check the length against that of the other branches. */
1749
1750for (;;)
1751  {
1752  int d;
1753  pcre_uchar *ce, *cs;
1754  register pcre_uchar op = *cc;
1755
1756  switch (op)
1757    {
1758    /* We only need to continue for OP_CBRA (normal capturing bracket) and
1759    OP_BRA (normal non-capturing bracket) because the other variants of these
1760    opcodes are all concerned with unlimited repeated groups, which of course
1761    are not of fixed length. */
1762
1763    case OP_CBRA:
1764    case OP_BRA:
1765    case OP_ONCE:
1766    case OP_ONCE_NC:
1767    case OP_COND:
1768    d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd,
1769      recurses);
1770    if (d < 0) return d;
1771    branchlength += d;
1772    do cc += GET(cc, 1); while (*cc == OP_ALT);
1773    cc += 1 + LINK_SIZE;
1774    break;
1775
1776    /* Reached end of a branch; if it's a ket it is the end of a nested call.
1777    If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1778    an ALT. If it is END it's the end of the outer call. All can be handled by
1779    the same code. Note that we must not include the OP_KETRxxx opcodes here,
1780    because they all imply an unlimited repeat. */
1781
1782    case OP_ALT:
1783    case OP_KET:
1784    case OP_END:
1785    case OP_ACCEPT:
1786    case OP_ASSERT_ACCEPT:
1787    if (length < 0) length = branchlength;
1788      else if (length != branchlength) return -1;
1789    if (*cc != OP_ALT) return length;
1790    cc += 1 + LINK_SIZE;
1791    branchlength = 0;
1792    break;
1793
1794    /* A true recursion implies not fixed length, but a subroutine call may
1795    be OK. If the subroutine is a forward reference, we can't deal with
1796    it until the end of the pattern, so return -3. */
1797
1798    case OP_RECURSE:
1799    if (!atend) return -3;
1800    cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1801    do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
1802    if (cc > cs && cc < ce) return -1;                    /* Recursion */
1803    else   /* Check for mutual recursion */
1804      {
1805      recurse_check *r = recurses;
1806      for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
1807      if (r != NULL) return -1;   /* Mutual recursion */
1808      }
1809    this_recurse.prev = recurses;
1810    this_recurse.group = cs;
1811    d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd, &this_recurse);
1812    if (d < 0) return d;
1813    branchlength += d;
1814    cc += 1 + LINK_SIZE;
1815    break;
1816
1817    /* Skip over assertive subpatterns */
1818
1819    case OP_ASSERT:
1820    case OP_ASSERT_NOT:
1821    case OP_ASSERTBACK:
1822    case OP_ASSERTBACK_NOT:
1823    do cc += GET(cc, 1); while (*cc == OP_ALT);
1824    cc += 1 + LINK_SIZE;
1825    break;
1826
1827    /* Skip over things that don't match chars */
1828
1829    case OP_MARK:
1830    case OP_PRUNE_ARG:
1831    case OP_SKIP_ARG:
1832    case OP_THEN_ARG:
1833    cc += cc[1] + PRIV(OP_lengths)[*cc];
1834    break;
1835
1836    case OP_CALLOUT:
1837    case OP_CIRC:
1838    case OP_CIRCM:
1839    case OP_CLOSE:
1840    case OP_COMMIT:
1841    case OP_CREF:
1842    case OP_DEF:
1843    case OP_DNCREF:
1844    case OP_DNRREF:
1845    case OP_DOLL:
1846    case OP_DOLLM:
1847    case OP_EOD:
1848    case OP_EODN:
1849    case OP_FAIL:
1850    case OP_NOT_WORD_BOUNDARY:
1851    case OP_PRUNE:
1852    case OP_REVERSE:
1853    case OP_RREF:
1854    case OP_SET_SOM:
1855    case OP_SKIP:
1856    case OP_SOD:
1857    case OP_SOM:
1858    case OP_THEN:
1859    case OP_WORD_BOUNDARY:
1860    cc += PRIV(OP_lengths)[*cc];
1861    break;
1862
1863    /* Handle literal characters */
1864
1865    case OP_CHAR:
1866    case OP_CHARI:
1867    case OP_NOT:
1868    case OP_NOTI:
1869    branchlength++;
1870    cc += 2;
1871#ifdef SUPPORT_UTF
1872    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1873#endif
1874    break;
1875
1876    /* Handle exact repetitions. The count is already in characters, but we
1877    need to skip over a multibyte character in UTF8 mode.  */
1878
1879    case OP_EXACT:
1880    case OP_EXACTI:
1881    case OP_NOTEXACT:
1882    case OP_NOTEXACTI:
1883    branchlength += (int)GET2(cc,1);
1884    cc += 2 + IMM2_SIZE;
1885#ifdef SUPPORT_UTF
1886    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1887#endif
1888    break;
1889
1890    case OP_TYPEEXACT:
1891    branchlength += GET2(cc,1);
1892    if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
1893      cc += 2;
1894    cc += 1 + IMM2_SIZE + 1;
1895    break;
1896
1897    /* Handle single-char matchers */
1898
1899    case OP_PROP:
1900    case OP_NOTPROP:
1901    cc += 2;
1902    /* Fall through */
1903
1904    case OP_HSPACE:
1905    case OP_VSPACE:
1906    case OP_NOT_HSPACE:
1907    case OP_NOT_VSPACE:
1908    case OP_NOT_DIGIT:
1909    case OP_DIGIT:
1910    case OP_NOT_WHITESPACE:
1911    case OP_WHITESPACE:
1912    case OP_NOT_WORDCHAR:
1913    case OP_WORDCHAR:
1914    case OP_ANY:
1915    case OP_ALLANY:
1916    branchlength++;
1917    cc++;
1918    break;
1919
1920    /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1921    otherwise \C is coded as OP_ALLANY. */
1922
1923    case OP_ANYBYTE:
1924    return -2;
1925
1926    /* Check a class for variable quantification */
1927
1928    case OP_CLASS:
1929    case OP_NCLASS:
1930#if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1931    case OP_XCLASS:
1932    /* The original code caused an unsigned overflow in 64 bit systems,
1933    so now we use a conditional statement. */
1934    if (op == OP_XCLASS)
1935      cc += GET(cc, 1);
1936    else
1937      cc += PRIV(OP_lengths)[OP_CLASS];
1938#else
1939    cc += PRIV(OP_lengths)[OP_CLASS];
1940#endif
1941
1942    switch (*cc)
1943      {
1944      case OP_CRSTAR:
1945      case OP_CRMINSTAR:
1946      case OP_CRPLUS:
1947      case OP_CRMINPLUS:
1948      case OP_CRQUERY:
1949      case OP_CRMINQUERY:
1950      case OP_CRPOSSTAR:
1951      case OP_CRPOSPLUS:
1952      case OP_CRPOSQUERY:
1953      return -1;
1954
1955      case OP_CRRANGE:
1956      case OP_CRMINRANGE:
1957      case OP_CRPOSRANGE:
1958      if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1959      branchlength += (int)GET2(cc,1);
1960      cc += 1 + 2 * IMM2_SIZE;
1961      break;
1962
1963      default:
1964      branchlength++;
1965      }
1966    break;
1967
1968    /* Anything else is variable length */
1969
1970    case OP_ANYNL:
1971    case OP_BRAMINZERO:
1972    case OP_BRAPOS:
1973    case OP_BRAPOSZERO:
1974    case OP_BRAZERO:
1975    case OP_CBRAPOS:
1976    case OP_EXTUNI:
1977    case OP_KETRMAX:
1978    case OP_KETRMIN:
1979    case OP_KETRPOS:
1980    case OP_MINPLUS:
1981    case OP_MINPLUSI:
1982    case OP_MINQUERY:
1983    case OP_MINQUERYI:
1984    case OP_MINSTAR:
1985    case OP_MINSTARI:
1986    case OP_MINUPTO:
1987    case OP_MINUPTOI:
1988    case OP_NOTMINPLUS:
1989    case OP_NOTMINPLUSI:
1990    case OP_NOTMINQUERY:
1991    case OP_NOTMINQUERYI:
1992    case OP_NOTMINSTAR:
1993    case OP_NOTMINSTARI:
1994    case OP_NOTMINUPTO:
1995    case OP_NOTMINUPTOI:
1996    case OP_NOTPLUS:
1997    case OP_NOTPLUSI:
1998    case OP_NOTPOSPLUS:
1999    case OP_NOTPOSPLUSI:
2000    case OP_NOTPOSQUERY:
2001    case OP_NOTPOSQUERYI:
2002    case OP_NOTPOSSTAR:
2003    case OP_NOTPOSSTARI:
2004    case OP_NOTPOSUPTO:
2005    case OP_NOTPOSUPTOI:
2006    case OP_NOTQUERY:
2007    case OP_NOTQUERYI:
2008    case OP_NOTSTAR:
2009    case OP_NOTSTARI:
2010    case OP_NOTUPTO:
2011    case OP_NOTUPTOI:
2012    case OP_PLUS:
2013    case OP_PLUSI:
2014    case OP_POSPLUS:
2015    case OP_POSPLUSI:
2016    case OP_POSQUERY:
2017    case OP_POSQUERYI:
2018    case OP_POSSTAR:
2019    case OP_POSSTARI:
2020    case OP_POSUPTO:
2021    case OP_POSUPTOI:
2022    case OP_QUERY:
2023    case OP_QUERYI:
2024    case OP_REF:
2025    case OP_REFI:
2026    case OP_DNREF:
2027    case OP_DNREFI:
2028    case OP_SBRA:
2029    case OP_SBRAPOS:
2030    case OP_SCBRA:
2031    case OP_SCBRAPOS:
2032    case OP_SCOND:
2033    case OP_SKIPZERO:
2034    case OP_STAR:
2035    case OP_STARI:
2036    case OP_TYPEMINPLUS:
2037    case OP_TYPEMINQUERY:
2038    case OP_TYPEMINSTAR:
2039    case OP_TYPEMINUPTO:
2040    case OP_TYPEPLUS:
2041    case OP_TYPEPOSPLUS:
2042    case OP_TYPEPOSQUERY:
2043    case OP_TYPEPOSSTAR:
2044    case OP_TYPEPOSUPTO:
2045    case OP_TYPEQUERY:
2046    case OP_TYPESTAR:
2047    case OP_TYPEUPTO:
2048    case OP_UPTO:
2049    case OP_UPTOI:
2050    return -1;
2051
2052    /* Catch unrecognized opcodes so that when new ones are added they
2053    are not forgotten, as has happened in the past. */
2054
2055    default:
2056    return -4;
2057    }
2058  }
2059/* Control never gets here */
2060}
2061
2062
2063
2064/*************************************************
2065*    Scan compiled regex for specific bracket    *
2066*************************************************/
2067
2068/* This little function scans through a compiled pattern until it finds a
2069capturing bracket with the given number, or, if the number is negative, an
2070instance of OP_REVERSE for a lookbehind. The function is global in the C sense
2071so that it can be called from pcre_study() when finding the minimum matching
2072length.
2073
2074Arguments:
2075  code        points to start of expression
2076  utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
2077  number      the required bracket number or negative to find a lookbehind
2078
2079Returns:      pointer to the opcode for the bracket, or NULL if not found
2080*/
2081
2082const pcre_uchar *
2083PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
2084{
2085for (;;)
2086  {
2087  register pcre_uchar c = *code;
2088
2089  if (c == OP_END) return NULL;
2090
2091  /* XCLASS is used for classes that cannot be represented just by a bit
2092  map. This includes negated single high-valued characters. The length in
2093  the table is zero; the actual length is stored in the compiled code. */
2094
2095  if (c == OP_XCLASS) code += GET(code, 1);
2096
2097  /* Handle recursion */
2098
2099  else if (c == OP_REVERSE)
2100    {
2101    if (number < 0) return (pcre_uchar *)code;
2102    code += PRIV(OP_lengths)[c];
2103    }
2104
2105  /* Handle capturing bracket */
2106
2107  else if (c == OP_CBRA || c == OP_SCBRA ||
2108           c == OP_CBRAPOS || c == OP_SCBRAPOS)
2109    {
2110    int n = (int)GET2(code, 1+LINK_SIZE);
2111    if (n == number) return (pcre_uchar *)code;
2112    code += PRIV(OP_lengths)[c];
2113    }
2114
2115  /* Otherwise, we can get the item's length from the table, except that for
2116  repeated character types, we have to test for \p and \P, which have an extra
2117  two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2118  must add in its length. */
2119
2120  else
2121    {
2122    switch(c)
2123      {
2124      case OP_TYPESTAR:
2125      case OP_TYPEMINSTAR:
2126      case OP_TYPEPLUS:
2127      case OP_TYPEMINPLUS:
2128      case OP_TYPEQUERY:
2129      case OP_TYPEMINQUERY:
2130      case OP_TYPEPOSSTAR:
2131      case OP_TYPEPOSPLUS:
2132      case OP_TYPEPOSQUERY:
2133      if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2134      break;
2135
2136      case OP_TYPEUPTO:
2137      case OP_TYPEMINUPTO:
2138      case OP_TYPEEXACT:
2139      case OP_TYPEPOSUPTO:
2140      if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2141        code += 2;
2142      break;
2143
2144      case OP_MARK:
2145      case OP_PRUNE_ARG:
2146      case OP_SKIP_ARG:
2147      case OP_THEN_ARG:
2148      code += code[1];
2149      break;
2150      }
2151
2152    /* Add in the fixed length from the table */
2153
2154    code += PRIV(OP_lengths)[c];
2155
2156  /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2157  a multi-byte character. The length in the table is a minimum, so we have to
2158  arrange to skip the extra bytes. */
2159
2160#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2161    if (utf) switch(c)
2162      {
2163      case OP_CHAR:
2164      case OP_CHARI:
2165      case OP_NOT:
2166      case OP_NOTI:
2167      case OP_EXACT:
2168      case OP_EXACTI:
2169      case OP_NOTEXACT:
2170      case OP_NOTEXACTI:
2171      case OP_UPTO:
2172      case OP_UPTOI:
2173      case OP_NOTUPTO:
2174      case OP_NOTUPTOI:
2175      case OP_MINUPTO:
2176      case OP_MINUPTOI:
2177      case OP_NOTMINUPTO:
2178      case OP_NOTMINUPTOI:
2179      case OP_POSUPTO:
2180      case OP_POSUPTOI:
2181      case OP_NOTPOSUPTO:
2182      case OP_NOTPOSUPTOI:
2183      case OP_STAR:
2184      case OP_STARI:
2185      case OP_NOTSTAR:
2186      case OP_NOTSTARI:
2187      case OP_MINSTAR:
2188      case OP_MINSTARI:
2189      case OP_NOTMINSTAR:
2190      case OP_NOTMINSTARI:
2191      case OP_POSSTAR:
2192      case OP_POSSTARI:
2193      case OP_NOTPOSSTAR:
2194      case OP_NOTPOSSTARI:
2195      case OP_PLUS:
2196      case OP_PLUSI:
2197      case OP_NOTPLUS:
2198      case OP_NOTPLUSI:
2199      case OP_MINPLUS:
2200      case OP_MINPLUSI:
2201      case OP_NOTMINPLUS:
2202      case OP_NOTMINPLUSI:
2203      case OP_POSPLUS:
2204      case OP_POSPLUSI:
2205      case OP_NOTPOSPLUS:
2206      case OP_NOTPOSPLUSI:
2207      case OP_QUERY:
2208      case OP_QUERYI:
2209      case OP_NOTQUERY:
2210      case OP_NOTQUERYI:
2211      case OP_MINQUERY:
2212      case OP_MINQUERYI:
2213      case OP_NOTMINQUERY:
2214      case OP_NOTMINQUERYI:
2215      case OP_POSQUERY:
2216      case OP_POSQUERYI:
2217      case OP_NOTPOSQUERY:
2218      case OP_NOTPOSQUERYI:
2219      if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2220      break;
2221      }
2222#else
2223    (void)(utf);  /* Keep compiler happy by referencing function argument */
2224#endif
2225    }
2226  }
2227}
2228
2229
2230
2231/*************************************************
2232*   Scan compiled regex for recursion reference  *
2233*************************************************/
2234
2235/* This little function scans through a compiled pattern until it finds an
2236instance of OP_RECURSE.
2237
2238Arguments:
2239  code        points to start of expression
2240  utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
2241
2242Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
2243*/
2244
2245static const pcre_uchar *
2246find_recurse(const pcre_uchar *code, BOOL utf)
2247{
2248for (;;)
2249  {
2250  register pcre_uchar c = *code;
2251  if (c == OP_END) return NULL;
2252  if (c == OP_RECURSE) return code;
2253
2254  /* XCLASS is used for classes that cannot be represented just by a bit
2255  map. This includes negated single high-valued characters. The length in
2256  the table is zero; the actual length is stored in the compiled code. */
2257
2258  if (c == OP_XCLASS) code += GET(code, 1);
2259
2260  /* Otherwise, we can get the item's length from the table, except that for
2261  repeated character types, we have to test for \p and \P, which have an extra
2262  two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2263  must add in its length. */
2264
2265  else
2266    {
2267    switch(c)
2268      {
2269      case OP_TYPESTAR:
2270      case OP_TYPEMINSTAR:
2271      case OP_TYPEPLUS:
2272      case OP_TYPEMINPLUS:
2273      case OP_TYPEQUERY:
2274      case OP_TYPEMINQUERY:
2275      case OP_TYPEPOSSTAR:
2276      case OP_TYPEPOSPLUS:
2277      case OP_TYPEPOSQUERY:
2278      if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2279      break;
2280
2281      case OP_TYPEPOSUPTO:
2282      case OP_TYPEUPTO:
2283      case OP_TYPEMINUPTO:
2284      case OP_TYPEEXACT:
2285      if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2286        code += 2;
2287      break;
2288
2289      case OP_MARK:
2290      case OP_PRUNE_ARG:
2291      case OP_SKIP_ARG:
2292      case OP_THEN_ARG:
2293      code += code[1];
2294      break;
2295      }
2296
2297    /* Add in the fixed length from the table */
2298
2299    code += PRIV(OP_lengths)[c];
2300
2301    /* In UTF-8 mode, opcodes that are followed by a character may be followed
2302    by a multi-byte character. The length in the table is a minimum, so we have
2303    to arrange to skip the extra bytes. */
2304
2305#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2306    if (utf) switch(c)
2307      {
2308      case OP_CHAR:
2309      case OP_CHARI:
2310      case OP_NOT:
2311      case OP_NOTI:
2312      case OP_EXACT:
2313      case OP_EXACTI:
2314      case OP_NOTEXACT:
2315      case OP_NOTEXACTI:
2316      case OP_UPTO:
2317      case OP_UPTOI:
2318      case OP_NOTUPTO:
2319      case OP_NOTUPTOI:
2320      case OP_MINUPTO:
2321      case OP_MINUPTOI:
2322      case OP_NOTMINUPTO:
2323      case OP_NOTMINUPTOI:
2324      case OP_POSUPTO:
2325      case OP_POSUPTOI:
2326      case OP_NOTPOSUPTO:
2327      case OP_NOTPOSUPTOI:
2328      case OP_STAR:
2329      case OP_STARI:
2330      case OP_NOTSTAR:
2331      case OP_NOTSTARI:
2332      case OP_MINSTAR:
2333      case OP_MINSTARI:
2334      case OP_NOTMINSTAR:
2335      case OP_NOTMINSTARI:
2336      case OP_POSSTAR:
2337      case OP_POSSTARI:
2338      case OP_NOTPOSSTAR:
2339      case OP_NOTPOSSTARI:
2340      case OP_PLUS:
2341      case OP_PLUSI:
2342      case OP_NOTPLUS:
2343      case OP_NOTPLUSI:
2344      case OP_MINPLUS:
2345      case OP_MINPLUSI:
2346      case OP_NOTMINPLUS:
2347      case OP_NOTMINPLUSI:
2348      case OP_POSPLUS:
2349      case OP_POSPLUSI:
2350      case OP_NOTPOSPLUS:
2351      case OP_NOTPOSPLUSI:
2352      case OP_QUERY:
2353      case OP_QUERYI:
2354      case OP_NOTQUERY:
2355      case OP_NOTQUERYI:
2356      case OP_MINQUERY:
2357      case OP_MINQUERYI:
2358      case OP_NOTMINQUERY:
2359      case OP_NOTMINQUERYI:
2360      case OP_POSQUERY:
2361      case OP_POSQUERYI:
2362      case OP_NOTPOSQUERY:
2363      case OP_NOTPOSQUERYI:
2364      if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2365      break;
2366      }
2367#else
2368    (void)(utf);  /* Keep compiler happy by referencing function argument */
2369#endif
2370    }
2371  }
2372}
2373
2374
2375
2376/*************************************************
2377*    Scan compiled branch for non-emptiness      *
2378*************************************************/
2379
2380/* This function scans through a branch of a compiled pattern to see whether it
2381can match the empty string or not. It is called from could_be_empty()
2382below and from compile_branch() when checking for an unlimited repeat of a
2383group that can match nothing. Note that first_significant_code() skips over
2384backward and negative forward assertions when its final argument is TRUE. If we
2385hit an unclosed bracket, we return "empty" - this means we've struck an inner
2386bracket whose current branch will already have been scanned.
2387
2388Arguments:
2389  code        points to start of search
2390  endcode     points to where to stop
2391  utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2392  cd          contains pointers to tables etc.
2393  recurses    chain of recurse_check to catch mutual recursion
2394
2395Returns:      TRUE if what is matched could be empty
2396*/
2397
2398static BOOL
2399could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2400  BOOL utf, compile_data *cd, recurse_check *recurses)
2401{
2402register pcre_uchar c;
2403recurse_check this_recurse;
2404
2405for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2406     code < endcode;
2407     code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2408  {
2409  const pcre_uchar *ccode;
2410
2411  c = *code;
2412
2413  /* Skip over forward assertions; the other assertions are skipped by
2414  first_significant_code() with a TRUE final argument. */
2415
2416  if (c == OP_ASSERT)
2417    {
2418    do code += GET(code, 1); while (*code == OP_ALT);
2419    c = *code;
2420    continue;
2421    }
2422
2423  /* For a recursion/subroutine call, if its end has been reached, which
2424  implies a backward reference subroutine call, we can scan it. If it's a
2425  forward reference subroutine call, we can't. To detect forward reference
2426  we have to scan up the list that is kept in the workspace. This function is
2427  called only when doing the real compile, not during the pre-compile that
2428  measures the size of the compiled pattern. */
2429
2430  if (c == OP_RECURSE)
2431    {
2432    const pcre_uchar *scode = cd->start_code + GET(code, 1);
2433    const pcre_uchar *endgroup = scode;
2434    BOOL empty_branch;
2435
2436    /* Test for forward reference or uncompleted reference. This is disabled
2437    when called to scan a completed pattern by setting cd->start_workspace to
2438    NULL. */
2439
2440    if (cd->start_workspace != NULL)
2441      {
2442      const pcre_uchar *tcode;
2443      for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
2444        if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
2445      if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2446      }
2447
2448    /* If the reference is to a completed group, we need to detect whether this
2449    is a recursive call, as otherwise there will be an infinite loop. If it is
2450    a recursion, just skip over it. Simple recursions are easily detected. For
2451    mutual recursions we keep a chain on the stack. */
2452
2453    do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
2454    if (code >= scode && code <= endgroup) continue;  /* Simple recursion */
2455    else
2456      {
2457      recurse_check *r = recurses;
2458      for (r = recurses; r != NULL; r = r->prev)
2459        if (r->group == scode) break;
2460      if (r != NULL) continue;   /* Mutual recursion */
2461      }
2462
2463    /* Completed reference; scan the referenced group, remembering it on the
2464    stack chain to detect mutual recursions. */
2465
2466    empty_branch = FALSE;
2467    this_recurse.prev = recurses;
2468    this_recurse.group = scode;
2469
2470    do
2471      {
2472      if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
2473        {
2474        empty_branch = TRUE;
2475        break;
2476        }
2477      scode += GET(scode, 1);
2478      }
2479    while (*scode == OP_ALT);
2480
2481    if (!empty_branch) return FALSE;  /* All branches are non-empty */
2482    continue;
2483    }
2484
2485  /* Groups with zero repeats can of course be empty; skip them. */
2486
2487  if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2488      c == OP_BRAPOSZERO)
2489    {
2490    code += PRIV(OP_lengths)[c];
2491    do code += GET(code, 1); while (*code == OP_ALT);
2492    c = *code;
2493    continue;
2494    }
2495
2496  /* A nested group that is already marked as "could be empty" can just be
2497  skipped. */
2498
2499  if (c == OP_SBRA  || c == OP_SBRAPOS ||
2500      c == OP_SCBRA || c == OP_SCBRAPOS)
2501    {
2502    do code += GET(code, 1); while (*code == OP_ALT);
2503    c = *code;
2504    continue;
2505    }
2506
2507  /* For other groups, scan the branches. */
2508
2509  if (c == OP_BRA  || c == OP_BRAPOS ||
2510      c == OP_CBRA || c == OP_CBRAPOS ||
2511      c == OP_ONCE || c == OP_ONCE_NC ||
2512      c == OP_COND || c == OP_SCOND)
2513    {
2514    BOOL empty_branch;
2515    if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
2516
2517    /* If a conditional group has only one branch, there is a second, implied,
2518    empty branch, so just skip over the conditional, because it could be empty.
2519    Otherwise, scan the individual branches of the group. */
2520
2521    if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2522      code += GET(code, 1);
2523    else
2524      {
2525      empty_branch = FALSE;
2526      do
2527        {
2528        if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd,
2529          recurses)) empty_branch = TRUE;
2530        code += GET(code, 1);
2531        }
2532      while (*code == OP_ALT);
2533      if (!empty_branch) return FALSE;   /* All branches are non-empty */
2534      }
2535
2536    c = *code;
2537    continue;
2538    }
2539
2540  /* Handle the other opcodes */
2541
2542  switch (c)
2543    {
2544    /* Check for quantifiers after a class. XCLASS is used for classes that
2545    cannot be represented just by a bit map. This includes negated single
2546    high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2547    actual length is stored in the compiled code, so we must update "code"
2548    here. */
2549
2550#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2551    case OP_XCLASS:
2552    ccode = code += GET(code, 1);
2553    goto CHECK_CLASS_REPEAT;
2554#endif
2555
2556    case OP_CLASS:
2557    case OP_NCLASS:
2558    ccode = code + PRIV(OP_lengths)[OP_CLASS];
2559
2560#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2561    CHECK_CLASS_REPEAT:
2562#endif
2563
2564    switch (*ccode)
2565      {
2566      case OP_CRSTAR:            /* These could be empty; continue */
2567      case OP_CRMINSTAR:
2568      case OP_CRQUERY:
2569      case OP_CRMINQUERY:
2570      case OP_CRPOSSTAR:
2571      case OP_CRPOSQUERY:
2572      break;
2573
2574      default:                   /* Non-repeat => class must match */
2575      case OP_CRPLUS:            /* These repeats aren't empty */
2576      case OP_CRMINPLUS:
2577      case OP_CRPOSPLUS:
2578      return FALSE;
2579
2580      case OP_CRRANGE:
2581      case OP_CRMINRANGE:
2582      case OP_CRPOSRANGE:
2583      if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
2584      break;
2585      }
2586    break;
2587
2588    /* Opcodes that must match a character */
2589
2590    case OP_ANY:
2591    case OP_ALLANY:
2592    case OP_ANYBYTE:
2593
2594    case OP_PROP:
2595    case OP_NOTPROP:
2596    case OP_ANYNL:
2597
2598    case OP_NOT_HSPACE:
2599    case OP_HSPACE:
2600    case OP_NOT_VSPACE:
2601    case OP_VSPACE:
2602    case OP_EXTUNI:
2603
2604    case OP_NOT_DIGIT:
2605    case OP_DIGIT:
2606    case OP_NOT_WHITESPACE:
2607    case OP_WHITESPACE:
2608    case OP_NOT_WORDCHAR:
2609    case OP_WORDCHAR:
2610
2611    case OP_CHAR:
2612    case OP_CHARI:
2613    case OP_NOT:
2614    case OP_NOTI:
2615
2616    case OP_PLUS:
2617    case OP_PLUSI:
2618    case OP_MINPLUS:
2619    case OP_MINPLUSI:
2620
2621    case OP_NOTPLUS:
2622    case OP_NOTPLUSI:
2623    case OP_NOTMINPLUS:
2624    case OP_NOTMINPLUSI:
2625
2626    case OP_POSPLUS:
2627    case OP_POSPLUSI:
2628    case OP_NOTPOSPLUS:
2629    case OP_NOTPOSPLUSI:
2630
2631    case OP_EXACT:
2632    case OP_EXACTI:
2633    case OP_NOTEXACT:
2634    case OP_NOTEXACTI:
2635
2636    case OP_TYPEPLUS:
2637    case OP_TYPEMINPLUS:
2638    case OP_TYPEPOSPLUS:
2639    case OP_TYPEEXACT:
2640
2641    return FALSE;
2642
2643    /* These are going to continue, as they may be empty, but we have to
2644    fudge the length for the \p and \P cases. */
2645
2646    case OP_TYPESTAR:
2647    case OP_TYPEMINSTAR:
2648    case OP_TYPEPOSSTAR:
2649    case OP_TYPEQUERY:
2650    case OP_TYPEMINQUERY:
2651    case OP_TYPEPOSQUERY:
2652    if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2653    break;
2654
2655    /* Same for these */
2656
2657    case OP_TYPEUPTO:
2658    case OP_TYPEMINUPTO:
2659    case OP_TYPEPOSUPTO:
2660    if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2661      code += 2;
2662    break;
2663
2664    /* End of branch */
2665
2666    case OP_KET:
2667    case OP_KETRMAX:
2668    case OP_KETRMIN:
2669    case OP_KETRPOS:
2670    case OP_ALT:
2671    return TRUE;
2672
2673    /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2674    MINUPTO, and POSUPTO and their caseless and negative versions may be
2675    followed by a multibyte character. */
2676
2677#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2678    case OP_STAR:
2679    case OP_STARI:
2680    case OP_NOTSTAR:
2681    case OP_NOTSTARI:
2682
2683    case OP_MINSTAR:
2684    case OP_MINSTARI:
2685    case OP_NOTMINSTAR:
2686    case OP_NOTMINSTARI:
2687
2688    case OP_POSSTAR:
2689    case OP_POSSTARI:
2690    case OP_NOTPOSSTAR:
2691    case OP_NOTPOSSTARI:
2692
2693    case OP_QUERY:
2694    case OP_QUERYI:
2695    case OP_NOTQUERY:
2696    case OP_NOTQUERYI:
2697
2698    case OP_MINQUERY:
2699    case OP_MINQUERYI:
2700    case OP_NOTMINQUERY:
2701    case OP_NOTMINQUERYI:
2702
2703    case OP_POSQUERY:
2704    case OP_POSQUERYI:
2705    case OP_NOTPOSQUERY:
2706    case OP_NOTPOSQUERYI:
2707
2708    if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2709    break;
2710
2711    case OP_UPTO:
2712    case OP_UPTOI:
2713    case OP_NOTUPTO:
2714    case OP_NOTUPTOI:
2715
2716    case OP_MINUPTO:
2717    case OP_MINUPTOI:
2718    case OP_NOTMINUPTO:
2719    case OP_NOTMINUPTOI:
2720
2721    case OP_POSUPTO:
2722    case OP_POSUPTOI:
2723    case OP_NOTPOSUPTO:
2724    case OP_NOTPOSUPTOI:
2725
2726    if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2727    break;
2728#endif
2729
2730    /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2731    string. */
2732
2733    case OP_MARK:
2734    case OP_PRUNE_ARG:
2735    case OP_SKIP_ARG:
2736    case OP_THEN_ARG:
2737    code += code[1];
2738    break;
2739
2740    /* None of the remaining opcodes are required to match a character. */
2741
2742    default:
2743    break;
2744    }
2745  }
2746
2747return TRUE;
2748}
2749
2750
2751
2752/*************************************************
2753*    Scan compiled regex for non-emptiness       *
2754*************************************************/
2755
2756/* This function is called to check for left recursive calls. We want to check
2757the current branch of the current pattern to see if it could match the empty
2758string. If it could, we must look outwards for branches at other levels,
2759stopping when we pass beyond the bracket which is the subject of the recursion.
2760This function is called only during the real compile, not during the
2761pre-compile.
2762
2763Arguments:
2764  code        points to start of the recursion
2765  endcode     points to where to stop (current RECURSE item)
2766  bcptr       points to the chain of current (unclosed) branch starts
2767  utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2768  cd          pointers to tables etc
2769
2770Returns:      TRUE if what is matched could be empty
2771*/
2772
2773static BOOL
2774could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2775  branch_chain *bcptr, BOOL utf, compile_data *cd)
2776{
2777while (bcptr != NULL && bcptr->current_branch >= code)
2778  {
2779  if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd, NULL))
2780    return FALSE;
2781  bcptr = bcptr->outer;
2782  }
2783return TRUE;
2784}
2785
2786
2787
2788/*************************************************
2789*        Base opcode of repeated opcodes         *
2790*************************************************/
2791
2792/* Returns the base opcode for repeated single character type opcodes. If the
2793opcode is not a repeated character type, it returns with the original value.
2794
2795Arguments:  c opcode
2796Returns:    base opcode for the type
2797*/
2798
2799static pcre_uchar
2800get_repeat_base(pcre_uchar c)
2801{
2802return (c > OP_TYPEPOSUPTO)? c :
2803       (c >= OP_TYPESTAR)?   OP_TYPESTAR :
2804       (c >= OP_NOTSTARI)?   OP_NOTSTARI :
2805       (c >= OP_NOTSTAR)?    OP_NOTSTAR :
2806       (c >= OP_STARI)?      OP_STARI :
2807                             OP_STAR;
2808}
2809
2810
2811
2812#ifdef SUPPORT_UCP
2813/*************************************************
2814*        Check a character and a property        *
2815*************************************************/
2816
2817/* This function is called by check_auto_possessive() when a property item
2818is adjacent to a fixed character.
2819
2820Arguments:
2821  c            the character
2822  ptype        the property type
2823  pdata        the data for the type
2824  negated      TRUE if it's a negated property (\P or \p{^)
2825
2826Returns:       TRUE if auto-possessifying is OK
2827*/
2828
2829static BOOL
2830check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata,
2831  BOOL negated)
2832{
2833const pcre_uint32 *p;
2834const ucd_record *prop = GET_UCD(c);
2835
2836switch(ptype)
2837  {
2838  case PT_LAMP:
2839  return (prop->chartype == ucp_Lu ||
2840          prop->chartype == ucp_Ll ||
2841          prop->chartype == ucp_Lt) == negated;
2842
2843  case PT_GC:
2844  return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
2845
2846  case PT_PC:
2847  return (pdata == prop->chartype) == negated;
2848
2849  case PT_SC:
2850  return (pdata == prop->script) == negated;
2851
2852  /* These are specials */
2853
2854  case PT_ALNUM:
2855  return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2856          PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
2857
2858  /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
2859  means that Perl space and POSIX space are now identical. PCRE was changed
2860  at release 8.34. */
2861
2862  case PT_SPACE:    /* Perl space */
2863  case PT_PXSPACE:  /* POSIX space */
2864  switch(c)
2865    {
2866    HSPACE_CASES:
2867    VSPACE_CASES:
2868    return negated;
2869
2870    default:
2871    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
2872    }
2873  break;  /* Control never reaches here */
2874
2875  case PT_WORD:
2876  return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2877          PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2878          c == CHAR_UNDERSCORE) == negated;
2879
2880  case PT_CLIST:
2881  p = PRIV(ucd_caseless_sets) + prop->caseset;
2882  for (;;)
2883    {
2884    if (c < *p) return !negated;
2885    if (c == *p++) return negated;
2886    }
2887  break;  /* Control never reaches here */
2888  }
2889
2890return FALSE;
2891}
2892#endif  /* SUPPORT_UCP */
2893
2894
2895
2896/*************************************************
2897*        Fill the character property list        *
2898*************************************************/
2899
2900/* Checks whether the code points to an opcode that can take part in auto-
2901possessification, and if so, fills a list with its properties.
2902
2903Arguments:
2904  code        points to start of expression
2905  utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2906  fcc         points to case-flipping table
2907  list        points to output list
2908              list[0] will be filled with the opcode
2909              list[1] will be non-zero if this opcode
2910                can match an empty character string
2911              list[2..7] depends on the opcode
2912
2913Returns:      points to the start of the next opcode if *code is accepted
2914              NULL if *code is not accepted
2915*/
2916
2917static const pcre_uchar *
2918get_chr_property_list(const pcre_uchar *code, BOOL utf,
2919  const pcre_uint8 *fcc, pcre_uint32 *list)
2920{
2921pcre_uchar c = *code;
2922pcre_uchar base;
2923const pcre_uchar *end;
2924pcre_uint32 chr;
2925
2926#ifdef SUPPORT_UCP
2927pcre_uint32 *clist_dest;
2928const pcre_uint32 *clist_src;
2929#else
2930utf = utf;  /* Suppress "unused parameter" compiler warning */
2931#endif
2932
2933list[0] = c;
2934list[1] = FALSE;
2935code++;
2936
2937if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
2938  {
2939  base = get_repeat_base(c);
2940  c -= (base - OP_STAR);
2941
2942  if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
2943    code += IMM2_SIZE;
2944
2945  list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && c != OP_POSPLUS);
2946
2947  switch(base)
2948    {
2949    case OP_STAR:
2950    list[0] = OP_CHAR;
2951    break;
2952
2953    case OP_STARI:
2954    list[0] = OP_CHARI;
2955    break;
2956
2957    case OP_NOTSTAR:
2958    list[0] = OP_NOT;
2959    break;
2960
2961    case OP_NOTSTARI:
2962    list[0] = OP_NOTI;
2963    break;
2964
2965    case OP_TYPESTAR:
2966    list[0] = *code;
2967    code++;
2968    break;
2969    }
2970  c = list[0];
2971  }
2972
2973switch(c)
2974  {
2975  case OP_NOT_DIGIT:
2976  case OP_DIGIT:
2977  case OP_NOT_WHITESPACE:
2978  case OP_WHITESPACE:
2979  case OP_NOT_WORDCHAR:
2980  case OP_WORDCHAR:
2981  case OP_ANY:
2982  case OP_ALLANY:
2983  case OP_ANYNL:
2984  case OP_NOT_HSPACE:
2985  case OP_HSPACE:
2986  case OP_NOT_VSPACE:
2987  case OP_VSPACE:
2988  case OP_EXTUNI:
2989  case OP_EODN:
2990  case OP_EOD:
2991  case OP_DOLL:
2992  case OP_DOLLM:
2993  return code;
2994
2995  case OP_CHAR:
2996  case OP_NOT:
2997  GETCHARINCTEST(chr, code);
2998  list[2] = chr;
2999  list[3] = NOTACHAR;
3000  return code;
3001
3002  case OP_CHARI:
3003  case OP_NOTI:
3004  list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
3005  GETCHARINCTEST(chr, code);
3006  list[2] = chr;
3007
3008#ifdef SUPPORT_UCP
3009  if (chr < 128 || (chr < 256 && !utf))
3010    list[3] = fcc[chr];
3011  else
3012    list[3] = UCD_OTHERCASE(chr);
3013#elif defined SUPPORT_UTF || !defined COMPILE_PCRE8
3014  list[3] = (chr < 256) ? fcc[chr] : chr;
3015#else
3016  list[3] = fcc[chr];
3017#endif
3018
3019  /* The othercase might be the same value. */
3020
3021  if (chr == list[3])
3022    list[3] = NOTACHAR;
3023  else
3024    list[4] = NOTACHAR;
3025  return code;
3026
3027#ifdef SUPPORT_UCP
3028  case OP_PROP:
3029  case OP_NOTPROP:
3030  if (code[0] != PT_CLIST)
3031    {
3032    list[2] = code[0];
3033    list[3] = code[1];
3034    return code + 2;
3035    }
3036
3037  /* Convert only if we have enough space. */
3038
3039  clist_src = PRIV(ucd_caseless_sets) + code[1];
3040  clist_dest = list + 2;
3041  code += 2;
3042
3043  do {
3044     if (clist_dest >= list + 8)
3045       {
3046       /* Early return if there is not enough space. This should never
3047       happen, since all clists are shorter than 5 character now. */
3048       list[2] = code[0];
3049       list[3] = code[1];
3050       return code;
3051       }
3052     *clist_dest++ = *clist_src;
3053     }
3054  while(*clist_src++ != NOTACHAR);
3055
3056  /* All characters are stored. The terminating NOTACHAR
3057  is copied form the clist itself. */
3058
3059  list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
3060  return code;
3061#endif
3062
3063  case OP_NCLASS:
3064  case OP_CLASS:
3065#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3066  case OP_XCLASS:
3067  if (c == OP_XCLASS)
3068    end = code + GET(code, 0) - 1;
3069  else
3070#endif
3071    end = code + 32 / sizeof(pcre_uchar);
3072
3073  switch(*end)
3074    {
3075    case OP_CRSTAR:
3076    case OP_CRMINSTAR:
3077    case OP_CRQUERY:
3078    case OP_CRMINQUERY:
3079    case OP_CRPOSSTAR:
3080    case OP_CRPOSQUERY:
3081    list[1] = TRUE;
3082    end++;
3083    break;
3084
3085    case OP_CRPLUS:
3086    case OP_CRMINPLUS:
3087    case OP_CRPOSPLUS:
3088    end++;
3089    break;
3090
3091    case OP_CRRANGE:
3092    case OP_CRMINRANGE:
3093    case OP_CRPOSRANGE:
3094    list[1] = (GET2(end, 1) == 0);
3095    end += 1 + 2 * IMM2_SIZE;
3096    break;
3097    }
3098  list[2] = (pcre_uint32)(end - code);
3099  return end;
3100  }
3101return NULL;    /* Opcode not accepted */
3102}
3103
3104
3105
3106/*************************************************
3107*    Scan further character sets for match       *
3108*************************************************/
3109
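/* Checks whether the base and the current opcode have a common character, in
which case the base cannot be possessified.

Arguments:
  code        points to the byte code
  utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
  cd          static compile data
  base_list   the data list of the base opcode
  base_end    points to the end of the base opcode; when the base is a
                character class, its data starts at base_end - base_list[2]
  rec_limit   points to a counter that is decremented on every call; when it
                has reached zero the function gives up and returns FALSE

Returns:      TRUE if the auto-possessification is possible
*/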
3121
3122static BOOL
3123compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
3124  const pcre_uint32 *base_list, const pcre_uchar *base_end, int *rec_limit)
3125{
3126pcre_uchar c;
3127pcre_uint32 list[8];
3128const pcre_uint32 *chr_ptr;
3129const pcre_uint32 *ochr_ptr;
3130const pcre_uint32 *list_ptr;
3131const pcre_uchar *next_code;
3132#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3133const pcre_uchar *xclass_flags;
3134#endif
3135const pcre_uint8 *class_bitset;
3136const pcre_uint8 *set1, *set2, *set_end;
3137pcre_uint32 chr;
3138BOOL accepted, invert_bits;
3139BOOL entered_a_group = FALSE;
3140
3141if (*rec_limit == 0) return FALSE;
3142--(*rec_limit);
3143
3144/* Note: the base_list[1] contains whether the current opcode has greedy
3145(represented by a non-zero value) quantifier. This is a different from
3146other character type lists, which stores here that the character iterator
3147matches to an empty string (also represented by a non-zero value). */
3148
3149for(;;)
3150  {
3151  /* All operations move the code pointer forward.
3152  Therefore infinite recursions are not possible. */
3153
3154  c = *code;
3155
3156  /* Skip over callouts */
3157
3158  if (c == OP_CALLOUT)
3159    {
3160    code += PRIV(OP_lengths)[c];
3161    continue;
3162    }
3163
3164  if (c == OP_ALT)
3165    {
3166    do code += GET(code, 1); while (*code == OP_ALT);
3167    c = *code;
3168    }
3169
3170  switch(c)
3171    {
3172    case OP_END:
3173    case OP_KETRPOS:
3174    /* TRUE only in greedy case. The non-greedy case could be replaced by
3175    an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
3176    uses more memory, which we cannot get at this stage.) */
3177
3178    return base_list[1] != 0;
3179
3180    case OP_KET:
3181    /* If the bracket is capturing, and referenced by an OP_RECURSE, or
3182    it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
3183    cannot be converted to a possessive form. */
3184
3185    if (base_list[1] == 0) return FALSE;
3186
3187    switch(*(code - GET(code, 1)))
3188      {
3189      case OP_ASSERT:
3190      case OP_ASSERT_NOT:
3191      case OP_ASSERTBACK:
3192      case OP_ASSERTBACK_NOT:
3193      case OP_ONCE:
3194      case OP_ONCE_NC:
3195      /* Atomic sub-patterns and assertions can always auto-possessify their
3196      last iterator. However, if the group was entered as a result of checking
3197      a previous iterator, this is not possible. */
3198
3199      return !entered_a_group;
3200      }
3201
3202    code += PRIV(OP_lengths)[c];
3203    continue;
3204
3205    case OP_ONCE:
3206    case OP_ONCE_NC:
3207    case OP_BRA:
3208    case OP_CBRA:
3209    next_code = code + GET(code, 1);
3210    code += PRIV(OP_lengths)[c];
3211
3212    while (*next_code == OP_ALT)
3213      {
3214      if (!compare_opcodes(code, utf, cd, base_list, base_end, rec_limit))
3215        return FALSE;
3216      code = next_code + 1 + LINK_SIZE;
3217      next_code += GET(next_code, 1);
3218      }
3219
3220    entered_a_group = TRUE;
3221    continue;
3222
3223    case OP_BRAZERO:
3224    case OP_BRAMINZERO:
3225
3226    next_code = code + 1;
3227    if (*next_code != OP_BRA && *next_code != OP_CBRA
3228        && *next_code != OP_ONCE && *next_code != OP_ONCE_NC) return FALSE;
3229
3230    do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3231
3232    /* The bracket content will be checked by the
3233    OP_BRA/OP_CBRA case above. */
3234    next_code += 1 + LINK_SIZE;
3235    if (!compare_opcodes(next_code, utf, cd, base_list, base_end, rec_limit))
3236      return FALSE;
3237
3238    code += PRIV(OP_lengths)[c];
3239    continue;
3240
3241    default:
3242    break;
3243    }
3244
3245  /* Check for a supported opcode, and load its properties. */
3246
3247  code = get_chr_property_list(code, utf, cd->fcc, list);
3248  if (code == NULL) return FALSE;    /* Unsupported */
3249
3250  /* If either opcode is a small character list, set pointers for comparing
3251  characters from that list with another list, or with a property. */
3252
3253  if (base_list[0] == OP_CHAR)
3254    {
3255    chr_ptr = base_list + 2;
3256    list_ptr = list;
3257    }
3258  else if (list[0] == OP_CHAR)
3259    {
3260    chr_ptr = list + 2;
3261    list_ptr = base_list;
3262    }
3263
3264  /* Character bitsets can also be compared to certain opcodes. */
3265
3266  else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
3267#ifdef COMPILE_PCRE8
3268      /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
3269      || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
3270#endif
3271      )
3272    {
3273#ifdef COMPILE_PCRE8
3274    if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
3275#else
3276    if (base_list[0] == OP_CLASS)
3277#endif
3278      {
3279      set1 = (pcre_uint8 *)(base_end - base_list[2]);
3280      list_ptr = list;
3281      }
3282    else
3283      {
3284      set1 = (pcre_uint8 *)(code - list[2]);
3285      list_ptr = base_list;
3286      }
3287
3288    invert_bits = FALSE;
3289    switch(list_ptr[0])
3290      {
3291      case OP_CLASS:
3292      case OP_NCLASS:
3293      set2 = (pcre_uint8 *)
3294        ((list_ptr == list ? code : base_end) - list_ptr[2]);
3295      break;
3296
3297#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3298      case OP_XCLASS:
3299      xclass_flags = (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE;
3300      if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE;
3301      if ((*xclass_flags & XCL_MAP) == 0)
3302        {
3303        /* No bits are set for characters < 256. */
3304        if (list[1] == 0) return TRUE;
3305        /* Might be an empty repeat. */
3306        continue;
3307        }
3308      set2 = (pcre_uint8 *)(xclass_flags + 1);
3309      break;
3310#endif
3311
3312      case OP_NOT_DIGIT:
3313      invert_bits = TRUE;
3314      /* Fall through */
3315      case OP_DIGIT:
3316      set2 = (pcre_uint8 *)(cd->cbits + cbit_digit);
3317      break;
3318
3319      case OP_NOT_WHITESPACE:
3320      invert_bits = TRUE;
3321      /* Fall through */
3322      case OP_WHITESPACE:
3323      set2 = (pcre_uint8 *)(cd->cbits + cbit_space);
3324      break;
3325
3326      case OP_NOT_WORDCHAR:
3327      invert_bits = TRUE;
3328      /* Fall through */
3329      case OP_WORDCHAR:
3330      set2 = (pcre_uint8 *)(cd->cbits + cbit_word);
3331      break;
3332
3333      default:
3334      return FALSE;
3335      }
3336
3337    /* Because the sets are unaligned, we need
3338    to perform byte comparison here. */
3339    set_end = set1 + 32;
3340    if (invert_bits)
3341      {
3342      do
3343        {
3344        if ((*set1++ & ~(*set2++)) != 0) return FALSE;
3345        }
3346      while (set1 < set_end);
3347      }
3348    else
3349      {
3350      do
3351        {
3352        if ((*set1++ & *set2++) != 0) return FALSE;
3353        }
3354      while (set1 < set_end);
3355      }
3356
3357    if (list[1] == 0) return TRUE;
3358    /* Might be an empty repeat. */
3359    continue;
3360    }
3361
3362  /* Some property combinations also acceptable. Unicode property opcodes are
3363  processed specially; the rest can be handled with a lookup table. */
3364
3365  else
3366    {
3367    pcre_uint32 leftop, rightop;
3368
3369    leftop = base_list[0];
3370    rightop = list[0];
3371
3372#ifdef SUPPORT_UCP
3373    accepted = FALSE; /* Always set in non-unicode case. */
3374    if (leftop == OP_PROP || leftop == OP_NOTPROP)
3375      {
3376      if (rightop == OP_EOD)
3377        accepted = TRUE;
3378      else if (rightop == OP_PROP || rightop == OP_NOTPROP)
3379        {
3380        int n;
3381        const pcre_uint8 *p;
3382        BOOL same = leftop == rightop;
3383        BOOL lisprop = leftop == OP_PROP;
3384        BOOL risprop = rightop == OP_PROP;
3385        BOOL bothprop = lisprop && risprop;
3386
3387        /* There's a table that specifies how each combination is to be
3388        processed:
3389          0   Always return FALSE (never auto-possessify)
3390          1   Character groups are distinct (possessify if both are OP_PROP)
3391          2   Check character categories in the same group (general or particular)
3392          3   Return TRUE if the two opcodes are not the same
3393          ... see comments below
3394        */
3395
3396        n = propposstab[base_list[2]][list[2]];
3397        switch(n)
3398          {
3399          case 0: break;
3400          case 1: accepted = bothprop; break;
3401          case 2: accepted = (base_list[3] == list[3]) != same; break;
3402          case 3: accepted = !same; break;
3403
3404          case 4:  /* Left general category, right particular category */
3405          accepted = risprop && catposstab[base_list[3]][list[3]] == same;
3406          break;
3407
3408          case 5:  /* Right general category, left particular category */
3409          accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
3410          break;
3411
3412          /* This code is logically tricky. Think hard before fiddling with it.
3413          The posspropstab table has four entries per row. Each row relates to
3414          one of PCRE's special properties such as ALNUM or SPACE or WORD.
3415          Only WORD actually needs all four entries, but using repeats for the
3416          others means they can all use the same code below.
3417
3418          The first two entries in each row are Unicode general categories, and
3419          apply always, because all the characters they include are part of the
3420          PCRE character set. The third and fourth entries are a general and a
3421          particular category, respectively, that include one or more relevant
3422          characters. One or the other is used, depending on whether the check
3423          is for a general or a particular category. However, in both cases the
3424          category contains more characters than the specials that are defined
3425          for the property being tested against. Therefore, it cannot be used
3426          in a NOTPROP case.
3427
3428          Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3429          Underscore is covered by ucp_P or ucp_Po. */
3430
3431          case 6:  /* Left alphanum vs right general category */
3432          case 7:  /* Left space vs right general category */
3433          case 8:  /* Left word vs right general category */
3434          p = posspropstab[n-6];
3435          accepted = risprop && lisprop ==
3436            (list[3] != p[0] &&
3437             list[3] != p[1] &&
3438            (list[3] != p[2] || !lisprop));
3439          break;
3440
3441          case 9:   /* Right alphanum vs left general category */
3442          case 10:  /* Right space vs left general category */
3443          case 11:  /* Right word vs left general category */
3444          p = posspropstab[n-9];
3445          accepted = lisprop && risprop ==
3446            (base_list[3] != p[0] &&
3447             base_list[3] != p[1] &&
3448            (base_list[3] != p[2] || !risprop));
3449          break;
3450
3451          case 12:  /* Left alphanum vs right particular category */
3452          case 13:  /* Left space vs right particular category */
3453          case 14:  /* Left word vs right particular category */
3454          p = posspropstab[n-12];
3455          accepted = risprop && lisprop ==
3456            (catposstab[p[0]][list[3]] &&
3457             catposstab[p[1]][list[3]] &&
3458            (list[3] != p[3] || !lisprop));
3459          break;
3460
3461          case 15:  /* Right alphanum vs left particular category */
3462          case 16:  /* Right space vs left particular category */
3463          case 17:  /* Right word vs left particular category */
3464          p = posspropstab[n-15];
3465          accepted = lisprop && risprop ==
3466            (catposstab[p[0]][base_list[3]] &&
3467             catposstab[p[1]][base_list[3]] &&
3468            (base_list[3] != p[3] || !risprop));
3469          break;
3470          }
3471        }
3472      }
3473
3474    else
3475#endif  /* SUPPORT_UCP */
3476
3477    accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
3478           rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
3479           autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
3480
3481    if (!accepted) return FALSE;
3482
3483    if (list[1] == 0) return TRUE;
3484    /* Might be an empty repeat. */
3485    continue;
3486    }
3487
3488  /* Control reaches here only if one of the items is a small character list.
3489  All characters are checked against the other side. */
3490
3491  do
3492    {
3493    chr = *chr_ptr;
3494
3495    switch(list_ptr[0])
3496      {
3497      case OP_CHAR:
3498      ochr_ptr = list_ptr + 2;
3499      do
3500        {
3501        if (chr == *ochr_ptr) return FALSE;
3502        ochr_ptr++;
3503        }
3504      while(*ochr_ptr != NOTACHAR);
3505      break;
3506
3507      case OP_NOT:
3508      ochr_ptr = list_ptr + 2;
3509      do
3510        {
3511        if (chr == *ochr_ptr)
3512          break;
3513        ochr_ptr++;
3514        }
3515      while(*ochr_ptr != NOTACHAR);
3516      if (*ochr_ptr == NOTACHAR) return FALSE;   /* Not found */
3517      break;
3518
3519      /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not*
3520      set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3521
3522      case OP_DIGIT:
3523      if (chr < 256 && (cd->ctypes[chr] & ctype_digit) != 0) return FALSE;
3524      break;
3525
3526      case OP_NOT_DIGIT:
3527      if (chr > 255 || (cd->ctypes[chr] & ctype_digit) == 0) return FALSE;
3528      break;
3529
3530      case OP_WHITESPACE:
3531      if (chr < 256 && (cd->ctypes[chr] & ctype_space) != 0) return FALSE;
3532      break;
3533
3534      case OP_NOT_WHITESPACE:
3535      if (chr > 255 || (cd->ctypes[chr] & ctype_space) == 0) return FALSE;
3536      break;
3537
3538      case OP_WORDCHAR:
3539      if (chr < 255 && (cd->ctypes[chr] & ctype_word) != 0) return FALSE;
3540      break;
3541
3542      case OP_NOT_WORDCHAR:
3543      if (chr > 255 || (cd->ctypes[chr] & ctype_word) == 0) return FALSE;
3544      break;
3545
3546      case OP_HSPACE:
3547      switch(chr)
3548        {
3549        HSPACE_CASES: return FALSE;
3550        default: break;
3551        }
3552      break;
3553
3554      case OP_NOT_HSPACE:
3555      switch(chr)
3556        {
3557        HSPACE_CASES: break;
3558        default: return FALSE;
3559        }
3560      break;
3561
3562      case OP_ANYNL:
3563      case OP_VSPACE:
3564      switch(chr)
3565        {
3566        VSPACE_CASES: return FALSE;
3567        default: break;
3568        }
3569      break;
3570
3571      case OP_NOT_VSPACE:
3572      switch(chr)
3573        {
3574        VSPACE_CASES: break;
3575        default: return FALSE;
3576        }
3577      break;
3578
3579      case OP_DOLL:
3580      case OP_EODN:
3581      switch (chr)
3582        {
3583        case CHAR_CR:
3584        case CHAR_LF:
3585        case CHAR_VT:
3586        case CHAR_FF:
3587        case CHAR_NEL:
3588#ifndef EBCDIC
3589        case 0x2028:
3590        case 0x2029:
3591#endif  /* Not EBCDIC */
3592        return FALSE;
3593        }
3594      break;
3595
3596      case OP_EOD:    /* Can always possessify before \z */
3597      break;
3598
3599#ifdef SUPPORT_UCP
3600      case OP_PROP:
3601      case OP_NOTPROP:
3602      if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
3603            list_ptr[0] == OP_NOTPROP))
3604        return FALSE;
3605      break;
3606#endif
3607
3608      case OP_NCLASS:
3609      if (chr > 255) return FALSE;
3610      /* Fall through */
3611
3612      case OP_CLASS:
3613      if (chr > 255) break;
3614      class_bitset = (pcre_uint8 *)
3615        ((list_ptr == list ? code : base_end) - list_ptr[2]);
3616      if ((class_bitset[chr >> 3] & (1 << (chr & 7))) != 0) return FALSE;
3617      break;
3618
3619#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3620      case OP_XCLASS:
3621      if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
3622          list_ptr[2] + LINK_SIZE, utf)) return FALSE;
3623      break;
3624#endif
3625
3626      default:
3627      return FALSE;
3628      }
3629
3630    chr_ptr++;
3631    }
3632  while(*chr_ptr != NOTACHAR);
3633
3634  /* At least one character must be matched from this opcode. */
3635
3636  if (list[1] == 0) return TRUE;
3637  }
3638
/* Control never reaches here. There used to be a fail-safe return FALSE; here,
but some compilers complain about an unreachable statement. */
3641
3642}
3643
3644
3645
3646/*************************************************
3647*    Scan compiled regex for auto-possession     *
3648*************************************************/
3649
3650/* Replaces single character iterations with their possessive alternatives
3651if appropriate. This function modifies the compiled opcode!
3652
3653Arguments:
3654  code        points to start of the byte code
3655  utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
3656  cd          static compile data
3657
3658Returns:      nothing
3659*/
3660
3661static void
3662auto_possessify(pcre_uchar *code, BOOL utf, const compile_data *cd)
3663{
3664register pcre_uchar c;
3665const pcre_uchar *end;
3666pcre_uchar *repeat_opcode;
3667pcre_uint32 list[8];
3668int rec_limit;
3669
3670for (;;)
3671  {
3672  c = *code;
3673
3674  /* When a pattern with bad UTF-8 encoding is compiled with NO_UTF_CHECK,
3675  it may compile without complaining, but may get into a loop here if the code
3676  pointer points to a bad value. This is, of course a documentated possibility,
3677  when NO_UTF_CHECK is set, so it isn't a bug, but we can detect this case and
3678  just give up on this optimization. */
3679
3680  if (c >= OP_TABLE_LENGTH) return;
3681
3682  if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
3683    {
3684    c -= get_repeat_base(c) - OP_STAR;
3685    end = (c <= OP_MINUPTO) ?
3686      get_chr_property_list(code, utf, cd->fcc, list) : NULL;
3687    list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
3688
3689    rec_limit = 1000;
3690    if (end != NULL && compare_opcodes(end, utf, cd, list, end, &rec_limit))
3691      {
3692      switch(c)
3693        {
3694        case OP_STAR:
3695        *code += OP_POSSTAR - OP_STAR;
3696        break;
3697
3698        case OP_MINSTAR:
3699        *code += OP_POSSTAR - OP_MINSTAR;
3700        break;
3701
3702        case OP_PLUS:
3703        *code += OP_POSPLUS - OP_PLUS;
3704        break;
3705
3706        case OP_MINPLUS:
3707        *code += OP_POSPLUS - OP_MINPLUS;
3708        break;
3709
3710        case OP_QUERY:
3711        *code += OP_POSQUERY - OP_QUERY;
3712        break;
3713
3714        case OP_MINQUERY:
3715        *code += OP_POSQUERY - OP_MINQUERY;
3716        break;
3717
3718        case OP_UPTO:
3719        *code += OP_POSUPTO - OP_UPTO;
3720        break;
3721
3722        case OP_MINUPTO:
3723        *code += OP_POSUPTO - OP_MINUPTO;
3724        break;
3725        }
3726      }
3727    c = *code;
3728    }
3729  else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
3730    {
3731#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3732    if (c == OP_XCLASS)
3733      repeat_opcode = code + GET(code, 1);
3734    else
3735#endif
3736      repeat_opcode = code + 1 + (32 / sizeof(pcre_uchar));
3737
3738    c = *repeat_opcode;
3739    if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
3740      {
3741      /* end must not be NULL. */
3742      end = get_chr_property_list(code, utf, cd->fcc, list);
3743
3744      list[1] = (c & 1) == 0;
3745
3746      rec_limit = 1000;
3747      if (compare_opcodes(end, utf, cd, list, end, &rec_limit))
3748        {
3749        switch (c)
3750          {
3751          case OP_CRSTAR:
3752          case OP_CRMINSTAR:
3753          *repeat_opcode = OP_CRPOSSTAR;
3754          break;
3755
3756          case OP_CRPLUS:
3757          case OP_CRMINPLUS:
3758          *repeat_opcode = OP_CRPOSPLUS;
3759          break;
3760
3761          case OP_CRQUERY:
3762          case OP_CRMINQUERY:
3763          *repeat_opcode = OP_CRPOSQUERY;
3764          break;
3765
3766          case OP_CRRANGE:
3767          case OP_CRMINRANGE:
3768          *repeat_opcode = OP_CRPOSRANGE;
3769          break;
3770          }
3771        }
3772      }
3773    c = *code;
3774    }
3775
3776  switch(c)
3777    {
3778    case OP_END:
3779    return;
3780
3781    case OP_TYPESTAR:
3782    case OP_TYPEMINSTAR:
3783    case OP_TYPEPLUS:
3784    case OP_TYPEMINPLUS:
3785    case OP_TYPEQUERY:
3786    case OP_TYPEMINQUERY:
3787    case OP_TYPEPOSSTAR:
3788    case OP_TYPEPOSPLUS:
3789    case OP_TYPEPOSQUERY:
3790    if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
3791    break;
3792
3793    case OP_TYPEUPTO:
3794    case OP_TYPEMINUPTO:
3795    case OP_TYPEEXACT:
3796    case OP_TYPEPOSUPTO:
3797    if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
3798      code += 2;
3799    break;
3800
3801#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3802    case OP_XCLASS:
3803    code += GET(code, 1);
3804    break;
3805#endif
3806
3807    case OP_MARK:
3808    case OP_PRUNE_ARG:
3809    case OP_SKIP_ARG:
3810    case OP_THEN_ARG:
3811    code += code[1];
3812    break;
3813    }
3814
3815  /* Add in the fixed length from the table */
3816
3817  code += PRIV(OP_lengths)[c];
3818
3819  /* In UTF-8 mode, opcodes that are followed by a character may be followed by
3820  a multi-byte character. The length in the table is a minimum, so we have to
3821  arrange to skip the extra bytes. */
3822
3823#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
3824  if (utf) switch(c)
3825    {
3826    case OP_CHAR:
3827    case OP_CHARI:
3828    case OP_NOT:
3829    case OP_NOTI:
3830    case OP_STAR:
3831    case OP_MINSTAR:
3832    case OP_PLUS:
3833    case OP_MINPLUS:
3834    case OP_QUERY:
3835    case OP_MINQUERY:
3836    case OP_UPTO:
3837    case OP_MINUPTO:
3838    case OP_EXACT:
3839    case OP_POSSTAR:
3840    case OP_POSPLUS:
3841    case OP_POSQUERY:
3842    case OP_POSUPTO:
3843    case OP_STARI:
3844    case OP_MINSTARI:
3845    case OP_PLUSI:
3846    case OP_MINPLUSI:
3847    case OP_QUERYI:
3848    case OP_MINQUERYI:
3849    case OP_UPTOI:
3850    case OP_MINUPTOI:
3851    case OP_EXACTI:
3852    case OP_POSSTARI:
3853    case OP_POSPLUSI:
3854    case OP_POSQUERYI:
3855    case OP_POSUPTOI:
3856    case OP_NOTSTAR:
3857    case OP_NOTMINSTAR:
3858    case OP_NOTPLUS:
3859    case OP_NOTMINPLUS:
3860    case OP_NOTQUERY:
3861    case OP_NOTMINQUERY:
3862    case OP_NOTUPTO:
3863    case OP_NOTMINUPTO:
3864    case OP_NOTEXACT:
3865    case OP_NOTPOSSTAR:
3866    case OP_NOTPOSPLUS:
3867    case OP_NOTPOSQUERY:
3868    case OP_NOTPOSUPTO:
3869    case OP_NOTSTARI:
3870    case OP_NOTMINSTARI:
3871    case OP_NOTPLUSI:
3872    case OP_NOTMINPLUSI:
3873    case OP_NOTQUERYI:
3874    case OP_NOTMINQUERYI:
3875    case OP_NOTUPTOI:
3876    case OP_NOTMINUPTOI:
3877    case OP_NOTEXACTI:
3878    case OP_NOTPOSSTARI:
3879    case OP_NOTPOSPLUSI:
3880    case OP_NOTPOSQUERYI:
3881    case OP_NOTPOSUPTOI:
3882    if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
3883    break;
3884    }
3885#else
3886  (void)(utf);  /* Keep compiler happy by referencing function argument */
3887#endif
3888  }
3889}
3890
3891
3892
3893/*************************************************
3894*           Check for POSIX class syntax         *
3895*************************************************/
3896
3897/* This function is called when the sequence "[:" or "[." or "[=" is
3898encountered in a character class. It checks whether this is followed by a
3899sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
3900reach an unescaped ']' without the special preceding character, return FALSE.
3901
3902Originally, this function only recognized a sequence of letters between the
3903terminators, but it seems that Perl recognizes any sequence of characters,
3904though of course unknown POSIX names are subsequently rejected. Perl gives an
3905"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
3906didn't consider this to be a POSIX class. Likewise for [:1234:].
3907
3908The problem in trying to be exactly like Perl is in the handling of escapes. We
3909have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
3910class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
3911below handles the special cases \\ and \], but does not try to do any other
3912escape processing. This makes it different from Perl for cases such as
3913[:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
3914not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
3915when Perl does, I think.
3916
3917A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
3918It seems that the appearance of a nested POSIX class supersedes an apparent
3919external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
3920a digit.
3921
3922In Perl, unescaped square brackets may also appear as part of class names. For
3923example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
3924[:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
3925seem right at all. PCRE does not allow closing square brackets in POSIX class
3926names.
3927
3928Arguments:
3929  ptr      pointer to the initial [
3930  endptr   where to return the end pointer
3931
3932Returns:   TRUE or FALSE
3933*/
3934
3935static BOOL
3936check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
3937{
3938pcre_uchar terminator;          /* Don't combine these lines; the Solaris cc */
3939terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
3940for (++ptr; *ptr != CHAR_NULL; ptr++)
3941  {
3942  if (*ptr == CHAR_BACKSLASH &&
3943      (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET ||
3944       ptr[1] == CHAR_BACKSLASH))
3945    ptr++;
3946  else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
3947            *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
3948  else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3949    {
3950    *endptr = ptr;
3951    return TRUE;
3952    }
3953  }
3954return FALSE;
3955}
3956
3957
3958
3959
3960/*************************************************
3961*          Check POSIX class name                *
3962*************************************************/
3963
3964/* This function is called to check the name given in a POSIX-style class entry
3965such as [:alnum:].
3966
3967Arguments:
3968  ptr        points to the first letter
3969  len        the length of the name
3970
3971Returns:     a value representing the name, or -1 if unknown
3972*/
3973
3974static int
3975check_posix_name(const pcre_uchar *ptr, int len)
3976{
3977const char *pn = posix_names;
3978register int yield = 0;
3979while (posix_name_lengths[yield] != 0)
3980  {
3981  if (len == posix_name_lengths[yield] &&
3982    STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;
3983  pn += posix_name_lengths[yield] + 1;
3984  yield++;
3985  }
3986return -1;
3987}
3988
3989
3990/*************************************************
3991*    Adjust OP_RECURSE items in repeated group   *
3992*************************************************/
3993
3994/* OP_RECURSE items contain an offset from the start of the regex to the group
3995that is referenced. This means that groups can be replicated for fixed
3996repetition simply by copying (because the recursion is allowed to refer to
3997earlier groups that are outside the current group). However, when a group is
3998optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
3999inserted before it, after it has been compiled. This means that any OP_RECURSE
4000items within it that refer to the group itself or any contained groups have to
have their offsets adjusted. That is one of the jobs of this function. Before it
4002is called, the partially compiled regex must be temporarily terminated with
4003OP_END.
4004
4005This function has been extended to cope with forward references for recursions
4006and subroutine calls. It must check the list of such references for the
4007group we are dealing with. If it finds that one of the recursions in the
4008current group is on this list, it does not adjust the value in the reference
4009(which is a group number). After the group has been scanned, all the offsets in
4010the forward reference list for the group are adjusted.
4011
4012Arguments:
4013  group      points to the start of the group
4014  adjust     the amount by which the group is to be moved
4015  utf        TRUE in UTF-8 / UTF-16 / UTF-32 mode
4016  cd         contains pointers to tables etc.
4017  save_hwm_offset   the hwm forward reference offset at the start of the group
4018
4019Returns:     nothing
4020*/
4021
4022static void
4023adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
4024  size_t save_hwm_offset)
4025{
4026int offset;
4027pcre_uchar *hc;
4028pcre_uchar *ptr = group;
4029
4030while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
4031  {
4032  for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
4033       hc += LINK_SIZE)
4034    {
4035    offset = (int)GET(hc, 0);
4036    if (cd->start_code + offset == ptr + 1) break;
4037    }
4038
4039  /* If we have not found this recursion on the forward reference list, adjust
4040  the recursion's offset if it's after the start of this group. */
4041
4042  if (hc >= cd->hwm)
4043    {
4044    offset = (int)GET(ptr, 1);
4045    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
4046    }
4047
4048  ptr += 1 + LINK_SIZE;
4049  }
4050
4051/* Now adjust all forward reference offsets for the group. */
4052
4053for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
4054     hc += LINK_SIZE)
4055  {
4056  offset = (int)GET(hc, 0);
4057  PUT(hc, 0, offset + adjust);
4058  }
4059}
4060
4061
4062
4063/*************************************************
4064*        Insert an automatic callout point       *
4065*************************************************/
4066
4067/* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
4068callout points before each pattern item.
4069
4070Arguments:
4071  code           current code pointer
4072  ptr            current pattern pointer
4073  cd             pointers to tables etc
4074
4075Returns:         new code pointer
4076*/
4077
4078static pcre_uchar *
4079auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
4080{
4081*code++ = OP_CALLOUT;
4082*code++ = 255;
4083PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
4084PUT(code, LINK_SIZE, 0);                       /* Default length */
4085return code + 2 * LINK_SIZE;
4086}
4087
4088
4089
4090/*************************************************
4091*         Complete a callout item                *
4092*************************************************/
4093
4094/* A callout item contains the length of the next item in the pattern, which
4095we can't fill in till after we have reached the relevant point. This is used
4096for both automatic and manual callouts.
4097
4098Arguments:
4099  previous_callout   points to previous callout item
4100  ptr                current pattern pointer
4101  cd                 pointers to tables etc
4102
4103Returns:             nothing
4104*/
4105
4106static void
4107complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
4108{
4109int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
4110PUT(previous_callout, 2 + LINK_SIZE, length);
4111}
4112
4113
4114
4115#ifdef SUPPORT_UCP
4116/*************************************************
4117*           Get othercase range                  *
4118*************************************************/
4119
4120/* This function is passed the start and end of a class range, in UTF-8 mode
4121with UCP support. It searches up the characters, looking for ranges of
4122characters in the "other" case. Each call returns the next one, updating the
4123start address. A character with multiple other cases is returned on its own
4124with a special return value.
4125
4126Arguments:
4127  cptr        points to starting character value; updated
4128  d           end value
4129  ocptr       where to put start of othercase range
4130  odptr       where to put end of othercase range
4131
4132Yield:        -1 when no more
4133               0 when a range is returned
4134              >0 the CASESET offset for char with multiple other cases
4135                in this case, ocptr contains the original
4136*/
4137
4138static int
4139get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,
4140  pcre_uint32 *odptr)
4141{
4142pcre_uint32 c, othercase, next;
4143unsigned int co;
4144
4145/* Find the first character that has an other case. If it has multiple other
4146cases, return its case offset value. */
4147
4148for (c = *cptr; c <= d; c++)
4149  {
4150  if ((co = UCD_CASESET(c)) != 0)
4151    {
4152    *ocptr = c++;   /* Character that has the set */
4153    *cptr = c;      /* Rest of input range */
4154    return (int)co;
4155    }
4156  if ((othercase = UCD_OTHERCASE(c)) != c) break;
4157  }
4158
4159if (c > d) return -1;  /* Reached end of range */
4160
4161/* Found a character that has a single other case. Search for the end of the
4162range, which is either the end of the input range, or a character that has zero
4163or more than one other cases. */
4164
4165*ocptr = othercase;
4166next = othercase + 1;
4167
4168for (++c; c <= d; c++)
4169  {
4170  if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break;
4171  next++;
4172  }
4173
4174*odptr = next - 1;     /* End of othercase range */
4175*cptr = c;             /* Rest of input range */
4176return 0;
4177}
4178#endif  /* SUPPORT_UCP */
4179
4180
4181
4182/*************************************************
4183*        Add a character or range to a class     *
4184*************************************************/
4185
4186/* This function packages up the logic of adding a character or range of
4187characters to a class. The character values in the arguments will be within the
4188valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
4189mutually recursive with the function immediately below.
4190
4191Arguments:
4192  classbits     the bit map for characters < 256
4193  uchardptr     points to the pointer for extra data
4194  options       the options word
4195  cd            contains pointers to tables etc.
4196  start         start of range character
4197  end           end of range character
4198
4199Returns:        the number of < 256 characters added
4200                the pointer to extra data is updated
4201*/
4202
static int
add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
  compile_data *cd, pcre_uint32 start, pcre_uint32 end)
{
/* Adds the range start..end to a class: values up to 0xff go into the bit map,
larger values (when the build supports them) are appended at *uchardptr. The
return value counts one for every SETBIT call made, including those made by
nested calls. */

pcre_uint32 c;
pcre_uint32 classbits_end = (end <= 0xff ? end : 0xff);  /* end, capped at 0xff */
int n8 = 0;

/* If caseless matching is required, scan the range and process alternate
cases. In Unicode, there are 8-bit characters that have alternate cases that
are greater than 255 and vice-versa. Sometimes we can just extend the original
range. */

if ((options & PCRE_CASELESS) != 0)
  {
#ifdef SUPPORT_UCP
  if ((options & PCRE_UTF8) != 0)
    {
    int rc;
    pcre_uint32 oc, od;

    options &= ~PCRE_CASELESS;   /* Remove for recursive calls */
    c = start;

    /* Each call hands back the next piece of other-case data for the part of
    the range from c onwards and advances c: a negative result means there is
    no more, zero means oc..od is a range of other-case characters, and a
    positive result is a case set offset for the single character left in oc. */

    while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
      {
      /* Handle a single character that has more than one other case. */

      if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
        PRIV(ucd_caseless_sets) + rc, oc);

      /* Do nothing if the other case range is within the original range. */

      else if (oc >= start && od <= end) continue;

      /* Extend the original range if there is overlap, noting that if oc < c, we
      can't have od > end because a subrange is always shorter than the basic
      range. Otherwise, use a recursive call to add the additional range. */

      else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
      else if (od > end && oc <= end + 1)
        {
        end = od;       /* Extend upwards */
        /* Keep the bit map limit in step with the new end, still capped at 0xff */
        if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
        }
      else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
      }
    }
  else
#endif  /* SUPPORT_UCP */

  /* Not UTF-mode, or no UCP */

  /* When SUPPORT_UCP is defined this loop is the body of the "else" above;
  otherwise it is simply the only statement in the caseless block. For each
  character up to classbits_end it sets the bit for the cd->fcc entry
  (presumably the other case of c - confirm against the table's definition). */

  for (c = start; c <= classbits_end; c++)
    {
    SETBIT(classbits, cd->fcc[c]);
    n8++;
    }
  }

/* Now handle the original range. Adjust the final value according to the bit
length - this means that the same lists of (e.g.) horizontal spaces can be used
in all cases. */

/* Note that when SUPPORT_UTF is defined, the options test below guards the
clamp on the line after it, so the clamp applies only when the UTF option is
not set. */

#if defined COMPILE_PCRE8
#ifdef SUPPORT_UTF
  if ((options & PCRE_UTF8) == 0)
#endif
  if (end > 0xff) end = 0xff;

#elif defined COMPILE_PCRE16
#ifdef SUPPORT_UTF
  if ((options & PCRE_UTF16) == 0)
#endif
  if (end > 0xffff) end = 0xffff;

#endif /* COMPILE_PCRE[8|16] */

/* Use the bitmap for characters < 256. Otherwise use extra data.*/

for (c = start; c <= classbits_end; c++)
  {
  /* Regardless of start, c will always be <= 255. */
  SETBIT(classbits, c);
  n8++;
  }

#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
/* Whatever was at or below 0xff has been dealt with by the loop above, so the
part that may need extra data begins at 0x100. */

if (start <= 0xff) start = 0xff + 1;

if (end >= start)
  {
  pcre_uchar *uchardata = *uchardptr;
#ifdef SUPPORT_UTF
  if ((options & PCRE_UTF8) != 0)  /* All UTFs use the same flag bit */
    {
    /* A marker unit, then each value written by PRIV(ord2utf), whose return
    value is used as the number of units by which to advance the pointer. */

    if (start < end)
      {
      *uchardata++ = XCL_RANGE;
      uchardata += PRIV(ord2utf)(start, uchardata);
      uchardata += PRIV(ord2utf)(end, uchardata);
      }
    else if (start == end)
      {
      *uchardata++ = XCL_SINGLE;
      uchardata += PRIV(ord2utf)(start, uchardata);
      }
    }
  else
#endif  /* SUPPORT_UTF */

  /* Without UTF support, character values are constrained by the bit length,
  and can only be > 256 for 16-bit and 32-bit libraries. */

  /* In the 8-bit library the empty block below is all there is to do (and it
  gives the "else" above a body); in the other libraries each value is stored
  directly in one unit. */

#ifdef COMPILE_PCRE8
    {}
#else
  if (start < end)
    {
    *uchardata++ = XCL_RANGE;
    *uchardata++ = start;
    *uchardata++ = end;
    }
  else if (start == end)
    {
    *uchardata++ = XCL_SINGLE;
    *uchardata++ = start;
    }
#endif

  *uchardptr = uchardata;   /* Update extra data pointer */
  }
#endif /* SUPPORT_UTF || !COMPILE_PCRE8 */

return n8;    /* Number of 8-bit characters */
}
4339
4340
4341
4342
4343/*************************************************
4344*        Add a list of characters to a class     *
4345*************************************************/
4346
4347/* This function is used for adding a list of case-equivalent characters to a
4348class, and also for adding a list of horizontal or vertical whitespace. If the
4349list is in order (which it should be), ranges of characters are detected and
4350handled appropriately. This function is mutually recursive with the function
4351above.
4352
4353Arguments:
4354  classbits     the bit map for characters < 256
4355  uchardptr     points to the pointer for extra data
4356  options       the options word
4357  cd            contains pointers to tables etc.
4358  p             points to row of 32-bit values, terminated by NOTACHAR
4359  except        character to omit; this is used when adding lists of
4360                  case-equivalent characters to avoid including the one we
4361                  already know about
4362
4363Returns:        the number of < 256 characters added
4364                the pointer to extra data is updated
4365*/
4366
4367static int
4368add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4369  compile_data *cd, const pcre_uint32 *p, unsigned int except)
4370{
4371int n8 = 0;
4372while (p[0] < NOTACHAR)
4373  {
4374  int n = 0;
4375  if (p[0] != except)
4376    {
4377    while(p[n+1] == p[0] + n + 1) n++;
4378    n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);
4379    }
4380  p += n + 1;
4381  }
4382return n8;
4383}
4384
4385
4386
4387/*************************************************
4388*    Add characters not in a list to a class     *
4389*************************************************/
4390
4391/* This function is used for adding the complement of a list of horizontal or
4392vertical whitespace to a class. The list must be in order.
4393
4394Arguments:
4395  classbits     the bit map for characters < 256
4396  uchardptr     points to the pointer for extra data
4397  options       the options word
4398  cd            contains pointers to tables etc.
4399  p             points to row of 32-bit values, terminated by NOTACHAR
4400
4401Returns:        the number of < 256 characters added
4402                the pointer to extra data is updated
4403*/
4404
4405static int
4406add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,
4407  int options, compile_data *cd, const pcre_uint32 *p)
4408{
4409BOOL utf = (options & PCRE_UTF8) != 0;
4410int n8 = 0;
4411if (p[0] > 0)
4412  n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1);
4413while (p[0] < NOTACHAR)
4414  {
4415  while (p[1] == p[0] + 1) p++;
4416  n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1,
4417    (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
4418  p++;
4419  }
4420return n8;
4421}
4422
4423
4424
4425/*************************************************
4426*           Compile one branch                   *
4427*************************************************/
4428
/* Scan the pattern, compiling it into a vector. If the options are
4430changed during the branch, the pointer is used to change the external options
4431bits. This function is used during the pre-compile phase when we are trying
4432to find out the amount of memory needed, as well as during the real compile
4433phase. The value of lengthptr distinguishes the two phases.
4434
4435Arguments:
4436  optionsptr        pointer to the option bits
4437  codeptr           points to the pointer to the current code point
4438  ptrptr            points to the current pattern pointer
4439  errorcodeptr      points to error code variable
4440  firstcharptr      place to put the first required character
4441  firstcharflagsptr place to put the first character flags, or a negative number
4442  reqcharptr        place to put the last required character
4443  reqcharflagsptr   place to put the last required character flags, or a negative number
4444  bcptr             points to current branch chain
4445  cond_depth        conditional nesting depth
4446  cd                contains pointers to tables etc.
4447  lengthptr         NULL during the real compile phase
4448                    points to length accumulator during pre-compile phase
4449
4450Returns:            TRUE on success
4451                    FALSE, with *errorcodeptr set non-zero on error
4452*/
4453
4454static BOOL
4455compile_branch(int *optionsptr, pcre_uchar **codeptr,
4456  const pcre_uchar **ptrptr, int *errorcodeptr,
4457  pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
4458  pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
4459  branch_chain *bcptr, int cond_depth,
4460  compile_data *cd, int *lengthptr)
4461{
4462int repeat_type, op_type;
4463int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
4464int bravalue = 0;
4465int greedy_default, greedy_non_default;
4466pcre_uint32 firstchar, reqchar;
4467pcre_int32 firstcharflags, reqcharflags;
4468pcre_uint32 zeroreqchar, zerofirstchar;
4469pcre_int32 zeroreqcharflags, zerofirstcharflags;
4470pcre_int32 req_caseopt, reqvary, tempreqvary;
4471int options = *optionsptr;               /* May change dynamically */
4472int after_manual_callout = 0;
4473int length_prevgroup = 0;
4474register pcre_uint32 c;
4475int escape;
4476register pcre_uchar *code = *codeptr;
4477pcre_uchar *last_code = code;
4478pcre_uchar *orig_code = code;
4479pcre_uchar *tempcode;
4480BOOL inescq = FALSE;
4481BOOL groupsetfirstchar = FALSE;
4482const pcre_uchar *ptr = *ptrptr;
4483const pcre_uchar *tempptr;
4484const pcre_uchar *nestptr = NULL;
4485pcre_uchar *previous = NULL;
4486pcre_uchar *previous_callout = NULL;
4487size_t item_hwm_offset = 0;
4488pcre_uint8 classbits[32];
4489
4490/* We can fish out the UTF-8 setting once and for all into a BOOL, but we
4491must not do this for other options (e.g. PCRE_EXTENDED) because they may change
4492dynamically as we process the pattern. */
4493
4494#ifdef SUPPORT_UTF
4495/* PCRE_UTF[16|32] have the same value as PCRE_UTF8. */
4496BOOL utf = (options & PCRE_UTF8) != 0;
4497#ifndef COMPILE_PCRE32
4498pcre_uchar utf_chars[6];
4499#endif
4500#else
4501BOOL utf = FALSE;
4502#endif
4503
4504/* Helper variables for OP_XCLASS opcode (for characters > 255). We define
4505class_uchardata always so that it can be passed to add_to_class() always,
4506though it will not be used in non-UTF 8-bit cases. This avoids having to supply
4507alternative calls for the different cases. */
4508
4509pcre_uchar *class_uchardata;
4510#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4511BOOL xclass;
4512pcre_uchar *class_uchardata_base;
4513#endif
4514
4515#ifdef PCRE_DEBUG
4516if (lengthptr != NULL) DPRINTF((">> start branch\n"));
4517#endif
4518
4519/* Set up the default and non-default settings for greediness */
4520
4521greedy_default = ((options & PCRE_UNGREEDY) != 0);
4522greedy_non_default = greedy_default ^ 1;
4523
4524/* Initialize no first byte, no required byte. REQ_UNSET means "no char
4525matching encountered yet". It gets changed to REQ_NONE if we hit something that
4526matches a non-fixed char first char; reqchar just remains unset if we never
4527find one.
4528
4529When we hit a repeat whose minimum is zero, we may have to adjust these values
4530to take the zero repeat into account. This is implemented by setting them to
zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
4532item types that can be repeated set these backoff variables appropriately. */
4533
4534firstchar = reqchar = zerofirstchar = zeroreqchar = 0;
4535firstcharflags = reqcharflags = zerofirstcharflags = zeroreqcharflags = REQ_UNSET;
4536
4537/* The variable req_caseopt contains either the REQ_CASELESS value
4538or zero, according to the current setting of the caseless flag. The
4539REQ_CASELESS leaves the lower 28 bit empty. It is added into the
4540firstchar or reqchar variables to record the case status of the
4541value. This is used only for ASCII characters. */
4542
4543req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
4544
4545/* Switch on next character until the end of the branch */
4546
4547for (;; ptr++)
4548  {
4549  BOOL negate_class;
4550  BOOL should_flip_negation;
4551  BOOL possessive_quantifier;
4552  BOOL is_quantifier;
4553  BOOL is_recurse;
4554  BOOL reset_bracount;
4555  int class_has_8bitchar;
4556  int class_one_char;
4557#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4558  BOOL xclass_has_prop;
4559#endif
4560  int newoptions;
4561  int recno;
4562  int refsign;
4563  int skipbytes;
4564  pcre_uint32 subreqchar, subfirstchar;
4565  pcre_int32 subreqcharflags, subfirstcharflags;
4566  int terminator;
4567  unsigned int mclength;
4568  unsigned int tempbracount;
4569  pcre_uint32 ec;
4570  pcre_uchar mcbuffer[8];
4571
4572  /* Come here to restart the loop without advancing the pointer. */
4573
4574  REDO_LOOP:
4575
4576  /* Get next character in the pattern */
4577
4578  c = *ptr;
4579
4580  /* If we are at the end of a nested substitution, revert to the outer level
4581  string. Nesting only happens one level deep. */
4582
4583  if (c == CHAR_NULL && nestptr != NULL)
4584    {
4585    ptr = nestptr;
4586    nestptr = NULL;
4587    c = *ptr;
4588    }
4589
4590  /* If we are in the pre-compile phase, accumulate the length used for the
4591  previous cycle of this loop. */
4592
4593  if (lengthptr != NULL)
4594    {
4595#ifdef PCRE_DEBUG
4596    if (code > cd->hwm) cd->hwm = code;                 /* High water info */
4597#endif
4598    if (code > cd->start_workspace + cd->workspace_size -
4599        WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
4600      {
4601      *errorcodeptr = (code >= cd->start_workspace + cd->workspace_size)?
4602        ERR52 : ERR87;
4603      goto FAILED;
4604      }
4605
4606    /* There is at least one situation where code goes backwards: this is the
4607    case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
4608    the class is simply eliminated. However, it is created first, so we have to
4609    allow memory for it. Therefore, don't ever reduce the length at this point.
4610    */
4611
4612    if (code < last_code) code = last_code;
4613
4614    /* Paranoid check for integer overflow */
4615
4616    if (OFLOW_MAX - *lengthptr < code - last_code)
4617      {
4618      *errorcodeptr = ERR20;
4619      goto FAILED;
4620      }
4621
4622    *lengthptr += (int)(code - last_code);
4623    DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
4624      (int)(code - last_code), c, c));
4625
4626    /* If "previous" is set and it is not at the start of the work space, move
4627    it back to there, in order to avoid filling up the work space. Otherwise,
4628    if "previous" is NULL, reset the current code pointer to the start. */
4629
4630    if (previous != NULL)
4631      {
4632      if (previous > orig_code)
4633        {
4634        memmove(orig_code, previous, IN_UCHARS(code - previous));
4635        code -= previous - orig_code;
4636        previous = orig_code;
4637        }
4638      }
4639    else code = orig_code;
4640
4641    /* Remember where this code item starts so we can pick up the length
4642    next time round. */
4643
4644    last_code = code;
4645    }
4646
4647  /* In the real compile phase, just check the workspace used by the forward
4648  reference list. */
4649
4650  else if (cd->hwm > cd->start_workspace + cd->workspace_size)
4651    {
4652    *errorcodeptr = ERR52;
4653    goto FAILED;
4654    }
4655
4656  /* If in \Q...\E, check for the end; if not, we have a literal. Otherwise an
4657  isolated \E is ignored. */
4658
4659  if (c != CHAR_NULL)
4660    {
4661    if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4662      {
4663      inescq = FALSE;
4664      ptr++;
4665      continue;
4666      }
4667    else if (inescq)
4668      {
4669      if (previous_callout != NULL)
4670        {
4671        if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
4672          complete_callout(previous_callout, ptr, cd);
4673        previous_callout = NULL;
4674        }
4675      if ((options & PCRE_AUTO_CALLOUT) != 0)
4676        {
4677        previous_callout = code;
4678        code = auto_callout(code, ptr, cd);
4679        }
4680      goto NORMAL_CHAR;
4681      }
4682
4683    /* Check for the start of a \Q...\E sequence. We must do this here rather
4684    than later in case it is immediately followed by \E, which turns it into a
4685    "do nothing" sequence. */
4686
4687    if (c == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
4688      {
4689      inescq = TRUE;
4690      ptr++;
4691      continue;
4692      }
4693    }
4694
4695  /* In extended mode, skip white space and comments. */
4696
4697  if ((options & PCRE_EXTENDED) != 0)
4698    {
4699    const pcre_uchar *wscptr = ptr;
4700    while (MAX_255(c) && (cd->ctypes[c] & ctype_space) != 0) c = *(++ptr);
4701    if (c == CHAR_NUMBER_SIGN)
4702      {
4703      ptr++;
4704      while (*ptr != CHAR_NULL)
4705        {
4706        if (IS_NEWLINE(ptr))         /* For non-fixed-length newline cases, */
4707          {                          /* IS_NEWLINE sets cd->nllen. */
4708          ptr += cd->nllen;
4709          break;
4710          }
4711        ptr++;
4712#ifdef SUPPORT_UTF
4713        if (utf) FORWARDCHAR(ptr);
4714#endif
4715        }
4716      }
4717
4718    /* If we skipped any characters, restart the loop. Otherwise, we didn't see
4719    a comment. */
4720
4721    if (ptr > wscptr) goto REDO_LOOP;
4722    }
4723
4724  /* Skip over (?# comments. We need to do this here because we want to know if
4725  the next thing is a quantifier, and these comments may come between an item
4726  and its quantifier. */
4727
4728  if (c == CHAR_LEFT_PARENTHESIS && ptr[1] == CHAR_QUESTION_MARK &&
4729      ptr[2] == CHAR_NUMBER_SIGN)
4730    {
4731    ptr += 3;
4732    while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
4733    if (*ptr == CHAR_NULL)
4734      {
4735      *errorcodeptr = ERR18;
4736      goto FAILED;
4737      }
4738    continue;
4739    }
4740
4741  /* See if the next thing is a quantifier. */
4742
4743  is_quantifier =
4744    c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
4745    (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
4746
4747  /* Fill in length of a previous callout, except when the next thing is a
4748  quantifier or when processing a property substitution string in UCP mode. */
4749
4750  if (!is_quantifier && previous_callout != NULL && nestptr == NULL &&
4751       after_manual_callout-- <= 0)
4752    {
4753    if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
4754      complete_callout(previous_callout, ptr, cd);
4755    previous_callout = NULL;
4756    }
4757
4758  /* Create auto callout, except for quantifiers, or while processing property
4759  strings that are substituted for \w etc in UCP mode. */
4760
4761  if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL)
4762    {
4763    previous_callout = code;
4764    code = auto_callout(code, ptr, cd);
4765    }
4766
4767  /* Process the next pattern item. */
4768
4769  switch(c)
4770    {
4771    /* ===================================================================*/
4772    case CHAR_NULL:                /* The branch terminates at string end */
4773    case CHAR_VERTICAL_LINE:       /* or | or ) */
4774    case CHAR_RIGHT_PARENTHESIS:
4775    *firstcharptr = firstchar;
4776    *firstcharflagsptr = firstcharflags;
4777    *reqcharptr = reqchar;
4778    *reqcharflagsptr = reqcharflags;
4779    *codeptr = code;
4780    *ptrptr = ptr;
4781    if (lengthptr != NULL)
4782      {
4783      if (OFLOW_MAX - *lengthptr < code - last_code)
4784        {
4785        *errorcodeptr = ERR20;
4786        goto FAILED;
4787        }
4788      *lengthptr += (int)(code - last_code);   /* To include callout length */
4789      DPRINTF((">> end branch\n"));
4790      }
4791    return TRUE;
4792
4793
4794    /* ===================================================================*/
4795    /* Handle single-character metacharacters. In multiline mode, ^ disables
4796    the setting of any following char as a first character. */
4797
4798    case CHAR_CIRCUMFLEX_ACCENT:
4799    previous = NULL;
4800    if ((options & PCRE_MULTILINE) != 0)
4801      {
4802      if (firstcharflags == REQ_UNSET)
4803        zerofirstcharflags = firstcharflags = REQ_NONE;
4804      *code++ = OP_CIRCM;
4805      }
4806    else *code++ = OP_CIRC;
4807    break;
4808
4809    case CHAR_DOLLAR_SIGN:
4810    previous = NULL;
4811    *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
4812    break;
4813
4814    /* There can never be a first char if '.' is first, whatever happens about
4815    repeats. The value of reqchar doesn't change either. */
4816
4817    case CHAR_DOT:
4818    if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4819    zerofirstchar = firstchar;
4820    zerofirstcharflags = firstcharflags;
4821    zeroreqchar = reqchar;
4822    zeroreqcharflags = reqcharflags;
4823    previous = code;
4824    item_hwm_offset = cd->hwm - cd->start_workspace;
4825    *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
4826    break;
4827
4828
4829    /* ===================================================================*/
4830    /* Character classes. If the included characters are all < 256, we build a
4831    32-byte bitmap of the permitted characters, except in the special case
4832    where there is only one such character. For negated classes, we build the
4833    map as usual, then invert it at the end. However, we use a different opcode
4834    so that data characters > 255 can be handled correctly.
4835
4836    If the class contains characters outside the 0-255 range, a different
4837    opcode is compiled. It may optionally have a bit map for characters < 256,
4838    but those above are explicitly listed afterwards. A flag byte tells
4839    whether the bitmap is present, and whether this is a negated class or not.
4840
4841    In JavaScript compatibility mode, an isolated ']' causes an error. In
4842    default (Perl) mode, it is treated as a data character. */
4843
4844    case CHAR_RIGHT_SQUARE_BRACKET:
4845    if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4846      {
4847      *errorcodeptr = ERR64;
4848      goto FAILED;
4849      }
4850    goto NORMAL_CHAR;
4851
4852    /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
4853    used for "start of word" and "end of word". As these are otherwise illegal
4854    sequences, we don't break anything by recognizing them. They are replaced
4855    by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
4856    erroneous and are handled by the normal code below. */
4857
4858    case CHAR_LEFT_SQUARE_BRACKET:
4859    if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
4860      {
4861      nestptr = ptr + 7;
4862      ptr = sub_start_of_word;
4863      goto REDO_LOOP;
4864      }
4865
4866    if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
4867      {
4868      nestptr = ptr + 7;
4869      ptr = sub_end_of_word;
4870      goto REDO_LOOP;
4871      }
4872
4873    /* Handle a real character class. */
4874
4875    previous = code;
4876    item_hwm_offset = cd->hwm - cd->start_workspace;
4877
4878    /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
4879    they are encountered at the top level, so we'll do that too. */
4880
4881    if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4882         ptr[1] == CHAR_EQUALS_SIGN) &&
4883        check_posix_syntax(ptr, &tempptr))
4884      {
4885      *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
4886      goto FAILED;
4887      }
4888
4889    /* If the first character is '^', set the negation flag and skip it. Also,
4890    if the first few characters (either before or after ^) are \Q\E or \E we
4891    skip them too. This makes for compatibility with Perl. */
4892
4893    negate_class = FALSE;
4894    for (;;)
4895      {
4896      c = *(++ptr);
4897      if (c == CHAR_BACKSLASH)
4898        {
4899        if (ptr[1] == CHAR_E)
4900          ptr++;
4901        else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
4902          ptr += 3;
4903        else
4904          break;
4905        }
4906      else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
4907        negate_class = TRUE;
4908      else break;
4909      }
4910
4911    /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
4912    an initial ']' is taken as a data character -- the code below handles
4913    that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
4914    [^] must match any character, so generate OP_ALLANY. */
4915
4916    if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4917        (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4918      {
4919      *code++ = negate_class? OP_ALLANY : OP_FAIL;
4920      if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4921      zerofirstchar = firstchar;
4922      zerofirstcharflags = firstcharflags;
4923      break;
4924      }
4925
4926    /* If a class contains a negative special such as \S, we need to flip the
4927    negation flag at the end, so that support for characters > 255 works
4928    correctly (they are all included in the class). */
4929
4930    should_flip_negation = FALSE;
4931
4932    /* Extended class (xclass) will be used when characters > 255
4933    might match. */
4934
4935#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4936    xclass = FALSE;
4937    class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
4938    class_uchardata_base = class_uchardata;   /* Save the start */
4939#endif
4940
4941    /* For optimization purposes, we track some properties of the class:
4942    class_has_8bitchar will be non-zero if the class contains at least one <
4943    256 character; class_one_char will be 1 if the class contains just one
4944    character; xclass_has_prop will be TRUE if unicode property checks
4945    are present in the class. */
4946
4947    class_has_8bitchar = 0;
4948    class_one_char = 0;
4949#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4950    xclass_has_prop = FALSE;
4951#endif
4952
4953    /* Initialize the 32-char bit map to all zeros. We build the map in a
4954    temporary bit of memory, in case the class contains fewer than two
4955    8-bit characters because in that case the compiled code doesn't use the bit
4956    map. */
4957
4958    memset(classbits, 0, 32 * sizeof(pcre_uint8));
4959
4960    /* Process characters until ] is reached. By writing this as a "do" it
4961    means that an initial ] is taken as a data character. At the start of the
4962    loop, c contains the first byte of the character. */
4963
4964    if (c != CHAR_NULL) do
4965      {
4966      const pcre_uchar *oldptr;
4967
4968#ifdef SUPPORT_UTF
4969      if (utf && HAS_EXTRALEN(c))
4970        {                           /* Braces are required because the */
4971        GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
4972        }
4973#endif
4974
4975#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4976      /* In the pre-compile phase, accumulate the length of any extra
4977      data and reset the pointer. This is so that very large classes that
4978      contain a zillion > 255 characters no longer overwrite the work space
4979      (which is on the stack). We have to remember that there was XCLASS data,
4980      however. */
4981
4982      if (class_uchardata > class_uchardata_base) xclass = TRUE;
4983
4984      if (lengthptr != NULL && class_uchardata > class_uchardata_base)
4985        {
4986        *lengthptr += (int)(class_uchardata - class_uchardata_base);
4987        class_uchardata = class_uchardata_base;
4988        }
4989#endif
4990
4991      /* Inside \Q...\E everything is literal except \E */
4992
4993      if (inescq)
4994        {
4995        if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
4996          {
4997          inescq = FALSE;                   /* Reset literal state */
4998          ptr++;                            /* Skip the 'E' */
4999          continue;                         /* Carry on with next */
5000          }
5001        goto CHECK_RANGE;                   /* Could be range if \E follows */
5002        }
5003
5004      /* Handle POSIX class names. Perl allows a negation extension of the
5005      form [:^name:]. A square bracket that doesn't match the syntax is
5006      treated as a literal. We also recognize the POSIX constructions
5007      [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
5008      5.6 and 5.8 do. */
5009
5010      if (c == CHAR_LEFT_SQUARE_BRACKET &&
5011          (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
5012           ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
5013        {
5014        BOOL local_negate = FALSE;
5015        int posix_class, taboffset, tabopt;
5016        register const pcre_uint8 *cbits = cd->cbits;
5017        pcre_uint8 pbits[32];
5018
5019        if (ptr[1] != CHAR_COLON)
5020          {
5021          *errorcodeptr = ERR31;
5022          goto FAILED;
5023          }
5024
5025        ptr += 2;
5026        if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
5027          {
5028          local_negate = TRUE;
5029          should_flip_negation = TRUE;  /* Note negative special */
5030          ptr++;
5031          }
5032
5033        posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
5034        if (posix_class < 0)
5035          {
5036          *errorcodeptr = ERR30;
5037          goto FAILED;
5038          }
5039
5040        /* If matching is caseless, upper and lower are converted to
5041        alpha. This relies on the fact that the class table starts with
5042        alpha, lower, upper as the first 3 entries. */
5043
5044        if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
5045          posix_class = 0;
5046
5047        /* When PCRE_UCP is set, some of the POSIX classes are converted to
5048        different escape sequences that use Unicode properties \p or \P. Others
5049        that are not available via \p or \P generate XCL_PROP/XCL_NOTPROP
5050        directly. */
5051
5052#ifdef SUPPORT_UCP
5053        if ((options & PCRE_UCP) != 0)
5054          {
5055          unsigned int ptype = 0;
5056          int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
5057
5058          /* The posix_substitutes table specifies which POSIX classes can be
5059          converted to \p or \P items. */
5060
5061          if (posix_substitutes[pc] != NULL)
5062            {
5063            nestptr = tempptr + 1;
5064            ptr = posix_substitutes[pc] - 1;
5065            continue;
5066            }
5067
5068          /* There are three other classes that generate special property calls
5069          that are recognized only in an XCLASS. */
5070
5071          else switch(posix_class)
5072            {
5073            case PC_GRAPH:
5074            ptype = PT_PXGRAPH;
5075            /* Fall through */
5076            case PC_PRINT:
5077            if (ptype == 0) ptype = PT_PXPRINT;
5078            /* Fall through */
5079            case PC_PUNCT:
5080            if (ptype == 0) ptype = PT_PXPUNCT;
5081            *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
5082            *class_uchardata++ = ptype;
5083            *class_uchardata++ = 0;
5084            xclass_has_prop = TRUE;
5085            ptr = tempptr + 1;
5086            continue;
5087
5088            /* For the other POSIX classes (ascii, cntrl, xdigit) we are going
5089            to fall through to the non-UCP case and build a bit map for
5090            characters with code points less than 256. If we are in a negated
5091            POSIX class, characters with code points greater than 255 must
5092            either all match or all not match. In the special case where we
5093            have not yet generated any xclass data, and this is the final item
5094            in the overall class, we need do nothing: later on, the opcode
5095            OP_NCLASS will be used to indicate that characters greater than 255
5096            are acceptable. If we have already seen an xclass item or one may
5097            follow (we have to assume that it might if this is not the end of
5098            the class), explicitly list all wide codepoints, which will then
5099            either not match or match, depending on whether the class is or is
5100            not negated. */
5101
5102            default:
5103            if (local_negate &&
5104                (xclass || tempptr[2] != CHAR_RIGHT_SQUARE_BRACKET))
5105              {
5106              *class_uchardata++ = XCL_RANGE;
5107              class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
5108              class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
5109              }
5110            break;
5111            }
5112          }
5113#endif
5114        /* In the non-UCP case, or when UCP makes no difference, we build the
5115        bit map for the POSIX class in a chunk of local store because we may be
5116        adding and subtracting from it, and we don't want to subtract bits that
5117        may be in the main map already. At the end we or the result into the
5118        bit map that is being built. */
5119
5120        posix_class *= 3;
5121
5122        /* Copy in the first table (always present) */
5123
5124        memcpy(pbits, cbits + posix_class_maps[posix_class],
5125          32 * sizeof(pcre_uint8));
5126
5127        /* If there is a second table, add or remove it as required. */
5128
5129        taboffset = posix_class_maps[posix_class + 1];
5130        tabopt = posix_class_maps[posix_class + 2];
5131
5132        if (taboffset >= 0)
5133          {
5134          if (tabopt >= 0)
5135            for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
5136          else
5137            for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
5138          }
5139
5140        /* Now see if we need to remove any special characters. An option
5141        value of 1 removes vertical space and 2 removes underscore. */
5142
5143        if (tabopt < 0) tabopt = -tabopt;
5144        if (tabopt == 1) pbits[1] &= ~0x3c;
5145          else if (tabopt == 2) pbits[11] &= 0x7f;
5146
5147        /* Add the POSIX table or its complement into the main table that is
5148        being built and we are done. */
5149
5150        if (local_negate)
5151          for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
5152        else
5153          for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
5154
5155        ptr = tempptr + 1;
5156        /* Every class contains at least one < 256 character. */
5157        class_has_8bitchar = 1;
5158        /* Every class contains at least two characters. */
5159        class_one_char = 2;
5160        continue;    /* End of POSIX syntax handling */
5161        }
5162
5163      /* Backslash may introduce a single character, or it may introduce one
5164      of the specials, which just set a flag. The sequence \b is a special
5165      case. Inside a class (and only there) it is treated as backspace. We
5166      assume that other escapes have more than one character in them, so
5167      speculatively set both class_has_8bitchar and class_one_char bigger
5168      than one. Unrecognized escapes fall through and are either treated
5169      as literal characters (by default), or are faulted if
5170      PCRE_EXTRA is set. */
5171
5172      if (c == CHAR_BACKSLASH)
5173        {
5174        escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options,
5175          TRUE);
5176        if (*errorcodeptr != 0) goto FAILED;
5177        if (escape == 0) c = ec;
5178        else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
5179        else if (escape == ESC_N)          /* \N is not supported in a class */
5180          {
5181          *errorcodeptr = ERR71;
5182          goto FAILED;
5183          }
5184        else if (escape == ESC_Q)            /* Handle start of quoted string */
5185          {
5186          if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5187            {
5188            ptr += 2; /* avoid empty string */
5189            }
5190          else inescq = TRUE;
5191          continue;
5192          }
5193        else if (escape == ESC_E) continue;  /* Ignore orphan \E */
5194
5195        else
5196          {
5197          register const pcre_uint8 *cbits = cd->cbits;
5198          /* Every class contains at least two < 256 characters. */
5199          class_has_8bitchar++;
5200          /* Every class contains at least two characters. */
5201          class_one_char += 2;
5202
5203          switch (escape)
5204            {
5205#ifdef SUPPORT_UCP
5206            case ESC_du:     /* These are the values given for \d etc */
5207            case ESC_DU:     /* when PCRE_UCP is set. We replace the */
5208            case ESC_wu:     /* escape sequence with an appropriate \p */
5209            case ESC_WU:     /* or \P to test Unicode properties instead */
5210            case ESC_su:     /* of the default ASCII testing. */
5211            case ESC_SU:
5212            nestptr = ptr;
5213            ptr = substitutes[escape - ESC_DU] - 1;  /* Just before substitute */
5214            class_has_8bitchar--;                /* Undo! */
5215            continue;
5216#endif
5217            case ESC_d:
5218            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
5219            continue;
5220
5221            case ESC_D:
5222            should_flip_negation = TRUE;
5223            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
5224            continue;
5225
5226            case ESC_w:
5227            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
5228            continue;
5229
5230            case ESC_W:
5231            should_flip_negation = TRUE;
5232            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
5233            continue;
5234
5235            /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
5236            5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5237            previously set by something earlier in the character class.
5238            Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5239            we could just adjust the appropriate bit. From PCRE 8.34 we no
5240            longer treat \s and \S specially. */
5241
5242            case ESC_s:
5243            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
5244            continue;
5245
5246            case ESC_S:
5247            should_flip_negation = TRUE;
5248            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
5249            continue;
5250
5251            /* The rest apply in both UCP and non-UCP cases. */
5252
5253            case ESC_h:
5254            (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5255              PRIV(hspace_list), NOTACHAR);
5256            continue;
5257
5258            case ESC_H:
5259            (void)add_not_list_to_class(classbits, &class_uchardata, options,
5260              cd, PRIV(hspace_list));
5261            continue;
5262
5263            case ESC_v:
5264            (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5265              PRIV(vspace_list), NOTACHAR);
5266            continue;
5267
5268            case ESC_V:
5269            (void)add_not_list_to_class(classbits, &class_uchardata, options,
5270              cd, PRIV(vspace_list));
5271            continue;
5272
5273            case ESC_p:
5274            case ESC_P:
5275#ifdef SUPPORT_UCP
5276              {
5277              BOOL negated;
5278              unsigned int ptype = 0, pdata = 0;
5279              if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
5280                goto FAILED;
5281              *class_uchardata++ = ((escape == ESC_p) != negated)?
5282                XCL_PROP : XCL_NOTPROP;
5283              *class_uchardata++ = ptype;
5284              *class_uchardata++ = pdata;
5285              xclass_has_prop = TRUE;
5286              class_has_8bitchar--;                /* Undo! */
5287              continue;
5288              }
5289#else
5290            *errorcodeptr = ERR45;
5291            goto FAILED;
5292#endif
5293            /* Unrecognized escapes are faulted if PCRE is running in its
5294            strict mode. By default, for compatibility with Perl, they are
5295            treated as literals. */
5296
5297            default:
5298            if ((options & PCRE_EXTRA) != 0)
5299              {
5300              *errorcodeptr = ERR7;
5301              goto FAILED;
5302              }
5303            class_has_8bitchar--;    /* Undo the speculative increase. */
5304            class_one_char -= 2;     /* Undo the speculative increase. */
5305            c = *ptr;                /* Get the final character and fall through */
5306            break;
5307            }
5308          }
5309
5310        /* Fall through if the escape just defined a single character (c >= 0).
5311        This may be greater than 256. */
5312
5313        escape = 0;
5314
5315        }   /* End of backslash handling */
5316
5317      /* A character may be followed by '-' to form a range. However, Perl does
5318      not permit ']' to be the end of the range. A '-' character at the end is
5319      treated as a literal. Perl ignores orphaned \E sequences entirely. The
5320      code for handling \Q and \E is messy. */
5321
5322      CHECK_RANGE:
5323      while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5324        {
5325        inescq = FALSE;
5326        ptr += 2;
5327        }
5328      oldptr = ptr;
5329
5330      /* Remember if \r or \n were explicitly used */
5331
5332      if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
5333
5334      /* Check for range */
5335
5336      if (!inescq && ptr[1] == CHAR_MINUS)
5337        {
5338        pcre_uint32 d;
5339        ptr += 2;
5340        while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
5341
5342        /* If we hit \Q (not followed by \E) at this point, go into escaped
5343        mode. */
5344
5345        while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
5346          {
5347          ptr += 2;
5348          if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
5349            { ptr += 2; continue; }
5350          inescq = TRUE;
5351          break;
5352          }
5353
5354        /* Minus (hyphen) at the end of a class is treated as a literal, so put
5355        back the pointer and jump to handle the character that preceded it. */
5356
5357        if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
5358          {
5359          ptr = oldptr;
5360          goto CLASS_SINGLE_CHARACTER;
5361          }
5362
5363        /* Otherwise, we have a potential range; pick up the next character */
5364
5365#ifdef SUPPORT_UTF
5366        if (utf)
5367          {                           /* Braces are required because the */
5368          GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
5369          }
5370        else
5371#endif
5372        d = *ptr;  /* Not UTF-8 mode */
5373
5374        /* The second part of a range can be a single-character escape
5375        sequence, but not any of the other escapes. Perl treats a hyphen as a
5376        literal in such circumstances. However, in Perl's warning mode, a
5377        warning is given, so PCRE now faults it as it is almost certainly a
5378        mistake on the user's part. */
5379
5380        if (!inescq)
5381          {
5382          if (d == CHAR_BACKSLASH)
5383            {
5384            int descape;
5385            descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
5386            if (*errorcodeptr != 0) goto FAILED;
5387
5388            /* 0 means a character was put into d; \b is backspace; any other
5389            special causes an error. */
5390
5391            if (descape != 0)
5392              {
5393              if (descape == ESC_b) d = CHAR_BS; else
5394                {
5395                *errorcodeptr = ERR83;
5396                goto FAILED;
5397                }
5398              }
5399            }
5400
5401          /* A hyphen followed by a POSIX class is treated in the same way. */
5402
5403          else if (d == CHAR_LEFT_SQUARE_BRACKET &&
5404                   (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
5405                    ptr[1] == CHAR_EQUALS_SIGN) &&
5406                   check_posix_syntax(ptr, &tempptr))
5407            {
5408            *errorcodeptr = ERR83;
5409            goto FAILED;
5410            }
5411          }
5412
5413        /* Check that the two values are in the correct order. Optimize
5414        one-character ranges. */
5415
5416        if (d < c)
5417          {
5418          *errorcodeptr = ERR8;
5419          goto FAILED;
5420          }
5421        if (d == c) goto CLASS_SINGLE_CHARACTER;  /* A few lines below */
5422
5423        /* We have found a character range, so single character optimizations
5424        cannot be done anymore. Any value greater than 1 indicates that there
5425        is more than one character. */
5426
5427        class_one_char = 2;
5428
5429        /* Remember an explicit \r or \n, and add the range to the class. */
5430
5431        if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
5432
5433        class_has_8bitchar +=
5434          add_to_class(classbits, &class_uchardata, options, cd, c, d);
5435
5436        continue;   /* Go get the next char in the class */
5437        }
5438
5439      /* Handle a single character - we can get here for a normal non-escape
5440      char, or after \ that introduces a single character or for an apparent
5441      range that isn't. Only the value 1 matters for class_one_char, so don't
5442      increase it if it is already 2 or more ... just in case there's a class
5443      with a zillion characters in it. */
5444
5445      CLASS_SINGLE_CHARACTER:
5446      if (class_one_char < 2) class_one_char++;
5447
5448      /* If xclass_has_prop is false and class_one_char is 1, we have the first
5449      single character in the class, and there have been no prior ranges, or
5450      XCLASS items generated by escapes. If this is the final character in the
5451      class, we can optimize by turning the item into a 1-character OP_CHAR[I]
5452      if it's positive, or OP_NOT[I] if it's negative. In the positive case, it
5453      can cause firstchar to be set. Otherwise, there can be no first char if
5454      this item is first, whatever repeat count may follow. In the case of
5455      reqchar, save the previous value for reinstating. */
5456
5457      if (!inescq &&
5458#ifdef SUPPORT_UCP
5459          !xclass_has_prop &&
5460#endif
5461          class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
5462        {
5463        ptr++;
5464        zeroreqchar = reqchar;
5465        zeroreqcharflags = reqcharflags;
5466
5467        if (negate_class)
5468          {
5469#ifdef SUPPORT_UCP
5470          int d;
5471#endif
5472          if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5473          zerofirstchar = firstchar;
5474          zerofirstcharflags = firstcharflags;
5475
5476          /* For caseless UTF-8 mode when UCP support is available, check
5477          whether this character has more than one other case. If so, generate
5478          a special OP_NOTPROP item instead of OP_NOTI. */
5479
5480#ifdef SUPPORT_UCP
5481          if (utf && (options & PCRE_CASELESS) != 0 &&
5482              (d = UCD_CASESET(c)) != 0)
5483            {
5484            *code++ = OP_NOTPROP;
5485            *code++ = PT_CLIST;
5486            *code++ = d;
5487            }
5488          else
5489#endif
5490          /* Char has only one other case, or UCP not available */
5491
5492            {
5493            *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
5494#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5495            if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5496              code += PRIV(ord2utf)(c, code);
5497            else
5498#endif
5499              *code++ = c;
5500            }
5501
5502          /* We are finished with this character class */
5503
5504          goto END_CLASS;
5505          }
5506
5507        /* For a single, positive character, get the value into mcbuffer, and
5508        then we can handle this with the normal one-character code. */
5509
5510#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5511        if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5512          mclength = PRIV(ord2utf)(c, mcbuffer);
5513        else
5514#endif
5515          {
5516          mcbuffer[0] = c;
5517          mclength = 1;
5518          }
5519        goto ONE_CHAR;
5520        }       /* End of 1-char optimization */
5521
5522      /* There is more than one character in the class, or an XCLASS item
5523      has been generated. Add this character to the class. */
5524
5525      class_has_8bitchar +=
5526        add_to_class(classbits, &class_uchardata, options, cd, c, c);
5527      }
5528
5529    /* Loop until ']' reached. This "while" is the end of the "do" far above.
5530    If we are at the end of an internal nested string, revert to the outer
5531    string. */
5532
5533    while (((c = *(++ptr)) != CHAR_NULL ||
5534           (nestptr != NULL &&
5535             (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != CHAR_NULL)) &&
5536           (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
5537
5538    /* Check for missing terminating ']' */
5539
5540    if (c == CHAR_NULL)
5541      {
5542      *errorcodeptr = ERR6;
5543      goto FAILED;
5544      }
5545
5546    /* We will need an XCLASS if data has been placed in class_uchardata. In
5547    the second phase this is a sufficient test. However, in the pre-compile
5548    phase, class_uchardata gets emptied to prevent workspace overflow, so
5549    only if the very last character in the class needs XCLASS will it contain
5550    anything at this point. For this reason, xclass gets set TRUE above when
5551    class_uchardata is emptied, and that's why this code is the way it is here
5552    instead of just doing a test on class_uchardata below. */
5553
5554#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5555    if (class_uchardata > class_uchardata_base) xclass = TRUE;
5556#endif
5557
5558    /* If this is the first thing in the branch, there can be no first char
5559    setting, whatever the repeat count. Any reqchar setting must remain
5560    unchanged after any kind of repeat. */
5561
5562    if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5563    zerofirstchar = firstchar;
5564    zerofirstcharflags = firstcharflags;
5565    zeroreqchar = reqchar;
5566    zeroreqcharflags = reqcharflags;
5567
5568    /* If there are characters with values > 255, we have to compile an
5569    extended class, with its own opcode, unless there was a negated special
5570    such as \S in the class, and PCRE_UCP is not set, because in that case all
5571    characters > 255 are in the class, so any that were explicitly given as
5572    well can be ignored. If (when there are explicit characters > 255 that must
5573    be listed) there are no characters < 256, we can omit the bitmap in the
5574    actual compiled code. */
5575
5576#ifdef SUPPORT_UTF
5577    if (xclass && (xclass_has_prop || !should_flip_negation ||
5578        (options & PCRE_UCP) != 0))
5579#elif !defined COMPILE_PCRE8
5580    if (xclass && (xclass_has_prop || !should_flip_negation))
5581#endif
5582#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5583      {
5584      *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
5585      *code++ = OP_XCLASS;
5586      code += LINK_SIZE;
5587      *code = negate_class? XCL_NOT:0;
5588      if (xclass_has_prop) *code |= XCL_HASPROP;
5589
5590      /* If the map is required, move up the extra data to make room for it;
5591      otherwise just move the code pointer to the end of the extra data. */
5592
5593      if (class_has_8bitchar > 0)
5594        {
5595        *code++ |= XCL_MAP;
5596        memmove(code + (32 / sizeof(pcre_uchar)), code,
5597          IN_UCHARS(class_uchardata - code));
5598        if (negate_class && !xclass_has_prop)
5599          for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5600        memcpy(code, classbits, 32);
5601        code = class_uchardata + (32 / sizeof(pcre_uchar));
5602        }
5603      else code = class_uchardata;
5604
5605      /* Now fill in the complete length of the item */
5606
5607      PUT(previous, 1, (int)(code - previous));
5608      break;   /* End of class handling */
5609      }
5610
5611    /* Even though any XCLASS list is now discarded, we must allow for
5612    its memory. */
5613
5614    if (lengthptr != NULL)
5615      *lengthptr += (int)(class_uchardata - class_uchardata_base);
5616#endif
5617
5618    /* If there are no characters > 255, or they are all to be included or
5619    excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
5620    whole class was negated and whether there were negative specials such as \S
5621    (non-UCP) in the class. Then copy the 32-byte map into the code vector,
5622    negating it if necessary. */
5623
5624    *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
5625    if (lengthptr == NULL)    /* Save time in the pre-compile phase */
5626      {
5627      if (negate_class)
5628        for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5629      memcpy(code, classbits, 32);
5630      }
5631    code += 32 / sizeof(pcre_uchar);
5632
5633    END_CLASS:
5634    break;
5635
5636
5637    /* ===================================================================*/
5638    /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
5639    has been tested above. */
5640
5641    case CHAR_LEFT_CURLY_BRACKET:
5642    if (!is_quantifier) goto NORMAL_CHAR;
5643    ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
5644    if (*errorcodeptr != 0) goto FAILED;
5645    goto REPEAT;
5646
5647    case CHAR_ASTERISK:
5648    repeat_min = 0;
5649    repeat_max = -1;
5650    goto REPEAT;
5651
5652    case CHAR_PLUS:
5653    repeat_min = 1;
5654    repeat_max = -1;
5655    goto REPEAT;
5656
5657    case CHAR_QUESTION_MARK:
5658    repeat_min = 0;
5659    repeat_max = 1;
5660
5661    REPEAT:
5662    if (previous == NULL)
5663      {
5664      *errorcodeptr = ERR9;
5665      goto FAILED;
5666      }
5667
5668    if (repeat_min == 0)
5669      {
5670      firstchar = zerofirstchar;    /* Adjust for zero repeat */
5671      firstcharflags = zerofirstcharflags;
5672      reqchar = zeroreqchar;        /* Ditto */
5673      reqcharflags = zeroreqcharflags;
5674      }
5675
5676    /* Remember whether this is a variable length repeat */
5677
5678    reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
5679
5680    op_type = 0;                    /* Default single-char op codes */
5681    possessive_quantifier = FALSE;  /* Default not possessive quantifier */
5682
5683    /* Save start of previous item, in case we have to move it up in order to
5684    insert something before it. */
5685
5686    tempcode = previous;
5687
5688    /* Before checking for a possessive quantifier, we must skip over
5689    whitespace and comments in extended mode because Perl allows white space at
5690    this point. */
5691
5692    if ((options & PCRE_EXTENDED) != 0)
5693      {
5694      const pcre_uchar *p = ptr + 1;
5695      for (;;)
5696        {
5697        while (MAX_255(*p) && (cd->ctypes[*p] & ctype_space) != 0) p++;
5698        if (*p != CHAR_NUMBER_SIGN) break;
5699        p++;
5700        while (*p != CHAR_NULL)
5701          {
5702          if (IS_NEWLINE(p))         /* For non-fixed-length newline cases, */
5703            {                        /* IS_NEWLINE sets cd->nllen. */
5704            p += cd->nllen;
5705            break;
5706            }
5707          p++;
5708#ifdef SUPPORT_UTF
5709          if (utf) FORWARDCHAR(p);
5710#endif
5711          }           /* Loop for comment characters */
5712        }             /* Loop for multiple comments */
5713      ptr = p - 1;    /* Character before the next significant one. */
5714      }
5715
5716    /* If the next character is '+', we have a possessive quantifier. This
5717    implies greediness, whatever the setting of the PCRE_UNGREEDY option.
5718    If the next character is '?' this is a minimizing repeat, by default,
5719    but if PCRE_UNGREEDY is set, it works the other way round. We change the
5720    repeat type to the non-default. */
5721
5722    if (ptr[1] == CHAR_PLUS)
5723      {
5724      repeat_type = 0;                  /* Force greedy */
5725      possessive_quantifier = TRUE;
5726      ptr++;
5727      }
5728    else if (ptr[1] == CHAR_QUESTION_MARK)
5729      {
5730      repeat_type = greedy_non_default;
5731      ptr++;
5732      }
5733    else repeat_type = greedy_default;
5734
5735    /* If previous was a recursion call, wrap it in atomic brackets so that
5736    previous becomes the atomic group. All recursions were so wrapped in the
5737    past, but it no longer happens for non-repeated recursions. In fact, the
5738    repeated ones could be re-implemented independently so as not to need this,
5739    but for the moment we rely on the code for repeating groups. */
5740
5741    if (*previous == OP_RECURSE)
5742      {
5743      memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
5744      *previous = OP_ONCE;
5745      PUT(previous, 1, 2 + 2*LINK_SIZE);
5746      previous[2 + 2*LINK_SIZE] = OP_KET;
5747      PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
5748      code += 2 + 2 * LINK_SIZE;
5749      length_prevgroup = 3 + 3*LINK_SIZE;
5750
5751      /* When actually compiling, we need to check whether this was a forward
5752      reference, and if so, adjust the offset. */
5753
5754      if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
5755        {
5756        int offset = GET(cd->hwm, -LINK_SIZE);
5757        if (offset == previous + 1 - cd->start_code)
5758          PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
5759        }
5760      }
5761
5762    /* Now handle repetition for the different types of item. */
5763
5764    /* If previous was a character or negated character match, abolish the item
5765    and generate a repeat item instead. If a char item has a minimum of more
5766    than one, ensure that it is set in reqchar - it might not be if a sequence
5767    such as x{3} is the first thing in a branch because the x will have gone
5768    into firstchar instead.  */
5769
5770    if (*previous == OP_CHAR || *previous == OP_CHARI
5771        || *previous == OP_NOT || *previous == OP_NOTI)
5772      {
5773      switch (*previous)
5774        {
5775        default: /* Make compiler happy. */
5776        case OP_CHAR:  op_type = OP_STAR - OP_STAR; break;
5777        case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
5778        case OP_NOT:   op_type = OP_NOTSTAR - OP_STAR; break;
5779        case OP_NOTI:  op_type = OP_NOTSTARI - OP_STAR; break;
5780        }
5781
5782      /* Deal with UTF characters that take up more than one character. It's
5783      easier to write this out separately than try to macrify it. Use c to
5784      hold the length of the character in bytes, plus UTF_LENGTH to flag that
5785      it's a length rather than a small character. */
5786
5787#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5788      if (utf && NOT_FIRSTCHAR(code[-1]))
5789        {
5790        pcre_uchar *lastchar = code - 1;
5791        BACKCHAR(lastchar);
5792        c = (int)(code - lastchar);     /* Length of UTF-8 character */
5793        memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
5794        c |= UTF_LENGTH;                /* Flag c as a length */
5795        }
5796      else
5797#endif /* SUPPORT_UTF */
5798
5799      /* Handle the case of a single character - either with no UTF support, or
5800      with UTF disabled, or for a single character UTF character. */
5801        {
5802        c = code[-1];
5803        if (*previous <= OP_CHARI && repeat_min > 1)
5804          {
5805          reqchar = c;
5806          reqcharflags = req_caseopt | cd->req_varyopt;
5807          }
5808        }
5809
5810      goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
5811      }
5812
5813    /* If previous was a character type match (\d or similar), abolish it and
5814    create a suitable repeat item. The code is shared with single-character
5815    repeats by setting op_type to add a suitable offset into repeat_type. Note
5816    that the Unicode property types will be present only when SUPPORT_UCP is
5817    defined, but we don't wrap the little bits of code here because it just
5818    makes it horribly messy. */
5819
5820    else if (*previous < OP_EODN)
5821      {
5822      pcre_uchar *oldcode;
5823      int prop_type, prop_value;
5824      op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
5825      c = *previous;
5826
5827      OUTPUT_SINGLE_REPEAT:
5828      if (*previous == OP_PROP || *previous == OP_NOTPROP)
5829        {
5830        prop_type = previous[1];
5831        prop_value = previous[2];
5832        }
5833      else prop_type = prop_value = -1;
5834
5835      oldcode = code;
5836      code = previous;                  /* Usually overwrite previous item */
5837
5838      /* If the maximum is zero then the minimum must also be zero; Perl allows
5839      this case, so we do too - by simply omitting the item altogether. */
5840
5841      if (repeat_max == 0) goto END_REPEAT;
5842
5843      /* Combine the op_type with the repeat_type */
5844
5845      repeat_type += op_type;
5846
5847      /* A minimum of zero is handled either as the special case * or ?, or as
5848      an UPTO, with the maximum given. */
5849
5850      if (repeat_min == 0)
5851        {
5852        if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
5853          else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
5854        else
5855          {
5856          *code++ = OP_UPTO + repeat_type;
5857          PUT2INC(code, 0, repeat_max);
5858          }
5859        }
5860
5861      /* A repeat minimum of 1 is optimized into some special cases. If the
5862      maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
5863      left in place and, if the maximum is greater than 1, we use OP_UPTO with
5864      one less than the maximum. */
5865
5866      else if (repeat_min == 1)
5867        {
5868        if (repeat_max == -1)
5869          *code++ = OP_PLUS + repeat_type;
5870        else
5871          {
5872          code = oldcode;                 /* leave previous item in place */
5873          if (repeat_max == 1) goto END_REPEAT;
5874          *code++ = OP_UPTO + repeat_type;
5875          PUT2INC(code, 0, repeat_max - 1);
5876          }
5877        }
5878
5879      /* The case {n,n} is just an EXACT, while the general case {n,m} is
5880      handled as an EXACT followed by an UPTO. */
5881
5882      else
5883        {
5884        *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
5885        PUT2INC(code, 0, repeat_min);
5886
5887        /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
5888        we have to insert the character for the previous code. For a repeated
5889        Unicode property match, there are two extra bytes that define the
5890        required property. In UTF-8 mode, long characters have their length in
5891        c, with the UTF_LENGTH bit as a flag. */
5892
5893        if (repeat_max < 0)
5894          {
5895#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5896          if (utf && (c & UTF_LENGTH) != 0)
5897            {
5898            memcpy(code, utf_chars, IN_UCHARS(c & 7));
5899            code += c & 7;
5900            }
5901          else
5902#endif
5903            {
5904            *code++ = c;
5905            if (prop_type >= 0)
5906              {
5907              *code++ = prop_type;
5908              *code++ = prop_value;
5909              }
5910            }
5911          *code++ = OP_STAR + repeat_type;
5912          }
5913
5914        /* Else insert an UPTO if the max is greater than the min, again
5915        preceded by the character, for the previously inserted code. If the
5916        UPTO is just for 1 instance, we can use QUERY instead. */
5917
5918        else if (repeat_max != repeat_min)
5919          {
5920#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5921          if (utf && (c & UTF_LENGTH) != 0)
5922            {
5923            memcpy(code, utf_chars, IN_UCHARS(c & 7));
5924            code += c & 7;
5925            }
5926          else
5927#endif
5928          *code++ = c;
5929          if (prop_type >= 0)
5930            {
5931            *code++ = prop_type;
5932            *code++ = prop_value;
5933            }
5934          repeat_max -= repeat_min;
5935
5936          if (repeat_max == 1)
5937            {
5938            *code++ = OP_QUERY + repeat_type;
5939            }
5940          else
5941            {
5942            *code++ = OP_UPTO + repeat_type;
5943            PUT2INC(code, 0, repeat_max);
5944            }
5945          }
5946        }
5947
5948      /* The character or character type itself comes last in all cases. */
5949
5950#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5951      if (utf && (c & UTF_LENGTH) != 0)
5952        {
5953        memcpy(code, utf_chars, IN_UCHARS(c & 7));
5954        code += c & 7;
5955        }
5956      else
5957#endif
5958      *code++ = c;
5959
5960      /* For a repeated Unicode property match, there are two extra bytes that
5961      define the required property. */
5962
5963#ifdef SUPPORT_UCP
5964      if (prop_type >= 0)
5965        {
5966        *code++ = prop_type;
5967        *code++ = prop_value;
5968        }
5969#endif
5970      }
5971
5972    /* If previous was a character class or a back reference, we put the repeat
5973    stuff after it, but just skip the item if the repeat was {0,0}. */
5974
5975    else if (*previous == OP_CLASS || *previous == OP_NCLASS ||
5976#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5977             *previous == OP_XCLASS ||
5978#endif
5979             *previous == OP_REF   || *previous == OP_REFI ||
5980             *previous == OP_DNREF || *previous == OP_DNREFI)
5981      {
5982      if (repeat_max == 0)
5983        {
5984        code = previous;
5985        goto END_REPEAT;
5986        }
5987
5988      if (repeat_min == 0 && repeat_max == -1)
5989        *code++ = OP_CRSTAR + repeat_type;
5990      else if (repeat_min == 1 && repeat_max == -1)
5991        *code++ = OP_CRPLUS + repeat_type;
5992      else if (repeat_min == 0 && repeat_max == 1)
5993        *code++ = OP_CRQUERY + repeat_type;
5994      else
5995        {
5996        *code++ = OP_CRRANGE + repeat_type;
5997        PUT2INC(code, 0, repeat_min);
5998        if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
5999        PUT2INC(code, 0, repeat_max);
6000        }
6001      }
6002
6003    /* If previous was a bracket group, we may have to replicate it in certain
6004    cases. Note that at this point we can encounter only the "basic" bracket
6005    opcodes such as BRA and CBRA, as this is the place where they get converted
6006    into the more special varieties such as BRAPOS and SBRA. A test for >=
6007    OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
6008    ASSERTBACK_NOT, ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND.
6009    Originally, PCRE did not allow repetition of assertions, but now it does,
6010    for Perl compatibility. */
6011
6012    else if (*previous >= OP_ASSERT && *previous <= OP_COND)
6013      {
6014      register int i;
6015      int len = (int)(code - previous);
6016      size_t base_hwm_offset = item_hwm_offset;
6017      pcre_uchar *bralink = NULL;
6018      pcre_uchar *brazeroptr = NULL;
6019
6020      /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
6021      we just ignore the repeat. */
6022
6023      if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
6024        goto END_REPEAT;
6025
6026      /* There is no sense in actually repeating assertions. The only potential
6027      use of repetition is in cases when the assertion is optional. Therefore,
6028      if the minimum is greater than zero, just ignore the repeat. If the
6029      maximum is not zero or one, set it to 1. */
6030
6031      if (*previous < OP_ONCE)    /* Assertion */
6032        {
6033        if (repeat_min > 0) goto END_REPEAT;
6034        if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
6035        }
6036
6037      /* The case of a zero minimum is special because of the need to stick
6038      OP_BRAZERO in front of it, and because the group appears once in the
6039      data, whereas in other cases it appears the minimum number of times. For
6040      this reason, it is simplest to treat this case separately, as otherwise
6041      the code gets far too messy. There are several special subcases when the
6042      minimum is zero. */
6043
6044      if (repeat_min == 0)
6045        {
6046        /* If the maximum is also zero, we used to just omit the group from the
6047        output altogether, like this:
6048
6049        ** if (repeat_max == 0)
6050        **   {
6051        **   code = previous;
6052        **   goto END_REPEAT;
6053        **   }
6054
6055        However, that fails when a group or a subgroup within it is referenced
6056        as a subroutine from elsewhere in the pattern, so now we stick in
6057        OP_SKIPZERO in front of it so that it is skipped on execution. As we
6058        don't have a list of which groups are referenced, we cannot do this
6059        selectively.
6060
6061        If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
6062        and do no more at this point. However, we do need to adjust any
6063        OP_RECURSE calls inside the group that refer to the group itself or any
6064        internal or forward referenced group, because the offset is from the
6065        start of the whole regex. Temporarily terminate the pattern while doing
6066        this. */
6067
6068        if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
6069          {
6070          *code = OP_END;
6071          adjust_recurse(previous, 1, utf, cd, item_hwm_offset);
6072          memmove(previous + 1, previous, IN_UCHARS(len));
6073          code++;
6074          if (repeat_max == 0)
6075            {
6076            *previous++ = OP_SKIPZERO;
6077            goto END_REPEAT;
6078            }
6079          brazeroptr = previous;    /* Save for possessive optimizing */
6080          *previous++ = OP_BRAZERO + repeat_type;
6081          }
6082
6083        /* If the maximum is greater than 1 and limited, we have to replicate
6084        in a nested fashion, sticking OP_BRAZERO before each set of brackets.
6085        The first one has to be handled carefully because it's the original
6086        copy, which has to be moved up. The remainder can be handled by code
6087        that is common with the non-zero minimum case below. We have to
6088          adjust the value of repeat_max, since one less copy is required. Once
6089        again, we may have to adjust any OP_RECURSE calls inside the group. */
6090
6091        else
6092          {
6093          int offset;
6094          *code = OP_END;
6095          adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, item_hwm_offset);
6096          memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
6097          code += 2 + LINK_SIZE;
6098          *previous++ = OP_BRAZERO + repeat_type;
6099          *previous++ = OP_BRA;
6100
6101          /* We chain together the bracket offset fields that have to be
6102          filled in later when the ends of the brackets are reached. */
6103
6104          offset = (bralink == NULL)? 0 : (int)(previous - bralink);
6105          bralink = previous;
6106          PUTINC(previous, 0, offset);
6107          }
6108
6109        repeat_max--;
6110        }
6111
6112      /* If the minimum is greater than zero, replicate the group as many
6113      times as necessary, and adjust the maximum to the number of subsequent
6114      copies that we need. If we set a first char from the group, and didn't
6115      set a required char, copy the latter from the former. If there are any
6116      forward reference subroutine calls in the group, there will be entries on
6117      the workspace list; replicate these with an appropriate increment. */
6118
6119      else
6120        {
6121        if (repeat_min > 1)
6122          {
6123          /* In the pre-compile phase, we don't actually do the replication. We
6124          just adjust the length as if we had. Do some paranoid checks for
6125          potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
6126          integer type when available, otherwise double. */
6127
6128          if (lengthptr != NULL)
6129            {
6130            int delta = (repeat_min - 1)*length_prevgroup;
6131            if ((INT64_OR_DOUBLE)(repeat_min - 1)*
6132                  (INT64_OR_DOUBLE)length_prevgroup >
6133                    (INT64_OR_DOUBLE)INT_MAX ||
6134                OFLOW_MAX - *lengthptr < delta)
6135              {
6136              *errorcodeptr = ERR20;
6137              goto FAILED;
6138              }
6139            *lengthptr += delta;
6140            }
6141
6142          /* This is compiling for real. If there is a set first byte for
6143          the group, and we have not yet set a "required byte", set it. Make
6144          sure there is enough workspace for copying forward references before
6145          doing the copy. */
6146
6147          else
6148            {
6149            if (groupsetfirstchar && reqcharflags < 0)
6150              {
6151              reqchar = firstchar;
6152              reqcharflags = firstcharflags;
6153              }
6154
6155            for (i = 1; i < repeat_min; i++)
6156              {
6157              pcre_uchar *hc;
6158              size_t this_hwm_offset = cd->hwm - cd->start_workspace;
6159              memcpy(code, previous, IN_UCHARS(len));
6160
6161              while (cd->hwm > cd->start_workspace + cd->workspace_size -
6162                     WORK_SIZE_SAFETY_MARGIN -
6163                     (this_hwm_offset - base_hwm_offset))
6164                {
6165                *errorcodeptr = expand_workspace(cd);
6166                if (*errorcodeptr != 0) goto FAILED;
6167                }
6168
6169              for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset;
6170                   hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset;
6171                   hc += LINK_SIZE)
6172                {
6173                PUT(cd->hwm, 0, GET(hc, 0) + len);
6174                cd->hwm += LINK_SIZE;
6175                }
6176              base_hwm_offset = this_hwm_offset;
6177              code += len;
6178              }
6179            }
6180          }
6181
6182        if (repeat_max > 0) repeat_max -= repeat_min;
6183        }
6184
6185      /* This code is common to both the zero and non-zero minimum cases. If
6186      the maximum is limited, it replicates the group in a nested fashion,
6187      remembering the bracket starts on a stack. In the case of a zero minimum,
6188      the first one was set up above. In all cases the repeat_max now specifies
6189      the number of additional copies needed. Again, we must remember to
6190      replicate entries on the forward reference list. */
6191
6192      if (repeat_max >= 0)
6193        {
6194        /* In the pre-compile phase, we don't actually do the replication. We
6195        just adjust the length as if we had. For each repetition we must add 1
6196        to the length for BRAZERO and for all but the last repetition we must
6197        add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
6198        paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
6199        a 64-bit integer type when available, otherwise double. */
6200
6201        if (lengthptr != NULL && repeat_max > 0)
6202          {
6203          int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
6204                      2 - 2*LINK_SIZE;   /* Last one doesn't nest */
6205          if ((INT64_OR_DOUBLE)repeat_max *
6206                (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
6207                  > (INT64_OR_DOUBLE)INT_MAX ||
6208              OFLOW_MAX - *lengthptr < delta)
6209            {
6210            *errorcodeptr = ERR20;
6211            goto FAILED;
6212            }
6213          *lengthptr += delta;
6214          }
6215
6216        /* This is compiling for real */
6217
6218        else for (i = repeat_max - 1; i >= 0; i--)
6219          {
6220          pcre_uchar *hc;
6221          size_t this_hwm_offset = cd->hwm - cd->start_workspace;
6222
6223          *code++ = OP_BRAZERO + repeat_type;
6224
6225          /* All but the final copy start a new nesting, maintaining the
6226          chain of brackets outstanding. */
6227
6228          if (i != 0)
6229            {
6230            int offset;
6231            *code++ = OP_BRA;
6232            offset = (bralink == NULL)? 0 : (int)(code - bralink);
6233            bralink = code;
6234            PUTINC(code, 0, offset);
6235            }
6236
6237          memcpy(code, previous, IN_UCHARS(len));
6238
6239          /* Ensure there is enough workspace for forward references before
6240          copying them. */
6241
6242          while (cd->hwm > cd->start_workspace + cd->workspace_size -
6243                 WORK_SIZE_SAFETY_MARGIN -
6244                 (this_hwm_offset - base_hwm_offset))
6245            {
6246            *errorcodeptr = expand_workspace(cd);
6247            if (*errorcodeptr != 0) goto FAILED;
6248            }
6249
6250          for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset;
6251               hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset;
6252               hc += LINK_SIZE)
6253            {
6254            PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
6255            cd->hwm += LINK_SIZE;
6256            }
6257          base_hwm_offset = this_hwm_offset;
6258          code += len;
6259          }
6260
6261        /* Now chain through the pending brackets, and fill in their length
6262        fields (which are holding the chain links pro tem). */
6263
6264        while (bralink != NULL)
6265          {
6266          int oldlinkoffset;
6267          int offset = (int)(code - bralink + 1);
6268          pcre_uchar *bra = code - offset;
6269          oldlinkoffset = GET(bra, 1);
6270          bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
6271          *code++ = OP_KET;
6272          PUTINC(code, 0, offset);
6273          PUT(bra, 1, offset);
6274          }
6275        }
6276
6277      /* If the maximum is unlimited, set a repeater in the final copy. For
6278      ONCE brackets, that's all we need to do. However, possessively repeated
6279      ONCE brackets can be converted into non-capturing brackets, as the
6280      behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
6281      deal with possessive ONCEs specially.
6282
6283      Otherwise, when we are doing the actual compile phase, check to see
6284      whether this group is one that could match an empty string. If so,
6285      convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
6286      that runtime checking can be done. [This check is also applied to ONCE
6287      groups at runtime, but in a different way.]
6288
6289      Then, if the quantifier was possessive and the bracket is not a
6290      conditional, we convert the BRA code to the POS form, and the KET code to
6291      KETRPOS. (It turns out to be convenient at runtime to detect this kind of
6292      subpattern at both the start and at the end.) The use of special opcodes
6293      makes it possible to reduce greatly the stack usage in pcre_exec(). If
6294      the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
6295
6296      Then, if the minimum number of matches is 1 or 0, cancel the possessive
6297      flag so that the default action below, of wrapping everything inside
6298      atomic brackets, does not happen. When the minimum is greater than 1,
6299      there will be earlier copies of the group, and so we still have to wrap
6300      the whole thing. */
6301
6302      else
6303        {
6304        pcre_uchar *ketcode = code - 1 - LINK_SIZE;
6305        pcre_uchar *bracode = ketcode - GET(ketcode, 1);
6306
6307        /* Convert possessive ONCE brackets to non-capturing */
6308
6309        if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
6310            possessive_quantifier) *bracode = OP_BRA;
6311
6312        /* For non-possessive ONCE brackets, all we need to do is to
6313        set the KET. */
6314
6315        if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
6316          *ketcode = OP_KETRMAX + repeat_type;
6317
6318        /* Handle non-ONCE brackets and possessive ONCEs (which have been
6319        converted to non-capturing above). */
6320
6321        else
6322          {
6323          /* In the compile phase, check for empty string matching. */
6324
6325          if (lengthptr == NULL)
6326            {
6327            pcre_uchar *scode = bracode;
6328            do
6329              {
6330              if (could_be_empty_branch(scode, ketcode, utf, cd, NULL))
6331                {
6332                *bracode += OP_SBRA - OP_BRA;
6333                break;
6334                }
6335              scode += GET(scode, 1);
6336              }
6337            while (*scode == OP_ALT);
6338            }
6339
6340          /* A conditional group with only one branch has an implicit empty
6341          alternative branch. */
6342
6343          if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
6344            *bracode = OP_SCOND;
6345
6346          /* Handle possessive quantifiers. */
6347
6348          if (possessive_quantifier)
6349            {
6350            /* For COND brackets, we wrap the whole thing in a possessively
6351            repeated non-capturing bracket, because we have not invented POS
6352            versions of the COND opcodes. Because we are moving code along, we
6353            must ensure that any pending recursive references are updated. */
6354
6355            if (*bracode == OP_COND || *bracode == OP_SCOND)
6356              {
6357              int nlen = (int)(code - bracode);
6358              *code = OP_END;
6359              adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6360              memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
6361              code += 1 + LINK_SIZE;
6362              nlen += 1 + LINK_SIZE;
6363              *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
6364              *code++ = OP_KETRPOS;
6365              PUTINC(code, 0, nlen);
6366              PUT(bracode, 1, nlen);
6367              }
6368
6369            /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
6370
6371            else
6372              {
6373              *bracode += 1;              /* Switch to xxxPOS opcodes */
6374              *ketcode = OP_KETRPOS;
6375              }
6376
6377            /* If the minimum is zero, mark it as possessive, then unset the
6378            possessive flag when the minimum is 0 or 1. */
6379
6380            if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
6381            if (repeat_min < 2) possessive_quantifier = FALSE;
6382            }
6383
6384          /* Non-possessive quantifier */
6385
6386          else *ketcode = OP_KETRMAX + repeat_type;
6387          }
6388        }
6389      }
6390
6391    /* If previous is OP_FAIL, it was generated by an empty class [] in
6392    JavaScript mode. The other ways in which OP_FAIL can be generated, that is
6393    by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
6394    error above. We can just ignore the repeat in the JS case. */
6395
6396    else if (*previous == OP_FAIL) goto END_REPEAT;
6397
6398    /* Else there's some kind of shambles */
6399
6400    else
6401      {
6402      *errorcodeptr = ERR11;
6403      goto FAILED;
6404      }
6405
6406    /* If the character following a repeat is '+', possessive_quantifier is
6407    TRUE. For some opcodes, there are special alternative opcodes for this
6408    case. For anything else, we wrap the entire repeated item inside OP_ONCE
6409    brackets. Logically, the '+' notation is just syntactic sugar, taken from
6410    Sun's Java package, but the special opcodes can optimize it.
6411
6412    Some (but not all) possessively repeated subpatterns have already been
6413    completely handled in the code just above. For them, possessive_quantifier
6414    is always FALSE at this stage. Note that the repeated item starts at
6415    tempcode, not at previous, which might be the first part of a string whose
6416    (former) last char we repeated. */
6417
6418    if (possessive_quantifier)
6419      {
6420      int len;
6421
6422      /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
6423      However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
6424      {5,}, or {5,10}). We skip over an EXACT item; if the length of what
6425      remains is greater than zero, there's a further opcode that can be
6426      handled. If not, do nothing, leaving the EXACT alone. */
6427
6428      switch(*tempcode)
6429        {
6430        case OP_TYPEEXACT:
6431        tempcode += PRIV(OP_lengths)[*tempcode] +
6432          ((tempcode[1 + IMM2_SIZE] == OP_PROP
6433          || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
6434        break;
6435
6436        /* CHAR opcodes are used for exacts whose count is 1. */
6437
6438        case OP_CHAR:
6439        case OP_CHARI:
6440        case OP_NOT:
6441        case OP_NOTI:
6442        case OP_EXACT:
6443        case OP_EXACTI:
6444        case OP_NOTEXACT:
6445        case OP_NOTEXACTI:
6446        tempcode += PRIV(OP_lengths)[*tempcode];
6447#ifdef SUPPORT_UTF
6448        if (utf && HAS_EXTRALEN(tempcode[-1]))
6449          tempcode += GET_EXTRALEN(tempcode[-1]);
6450#endif
6451        break;
6452
6453        /* For the class opcodes, the repeat operator appears at the end;
6454        adjust tempcode to point to it. */
6455
6456        case OP_CLASS:
6457        case OP_NCLASS:
6458        tempcode += 1 + 32/sizeof(pcre_uchar);
6459        break;
6460
6461#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6462        case OP_XCLASS:
6463        tempcode += GET(tempcode, 1);
6464        break;
6465#endif
6466        }
6467
6468      /* If tempcode is equal to code (which points to the end of the repeated
6469      item), it means we have skipped an EXACT item but there is no following
6470      QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
6471      all other cases, tempcode will be pointing to the repeat opcode, and will
6472      be less than code, so the value of len will be greater than 0. */
6473
6474      len = (int)(code - tempcode);
6475      if (len > 0)
6476        {
6477        unsigned int repcode = *tempcode;
6478
6479        /* There is a table for possessifying opcodes, all of which are less
6480        than OP_CALLOUT. A zero entry means there is no possessified version.
6481        */
6482
6483        if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
6484          *tempcode = opcode_possessify[repcode];
6485
6486        /* For opcodes without a special possessified version, wrap the item in
6487        ONCE brackets. Because we are moving code along, we must ensure that any
6488        pending recursive references are updated. */
6489
6490        else
6491          {
6492          *code = OP_END;
6493          adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6494          memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6495          code += 1 + LINK_SIZE;
6496          len += 1 + LINK_SIZE;
6497          tempcode[0] = OP_ONCE;
6498          *code++ = OP_KET;
6499          PUTINC(code, 0, len);
6500          PUT(tempcode, 1, len);
6501          }
6502        }
6503
6504#ifdef NEVER
6505      if (len > 0) switch (*tempcode)
6506        {
6507        case OP_STAR:  *tempcode = OP_POSSTAR; break;
6508        case OP_PLUS:  *tempcode = OP_POSPLUS; break;
6509        case OP_QUERY: *tempcode = OP_POSQUERY; break;
6510        case OP_UPTO:  *tempcode = OP_POSUPTO; break;
6511
6512        case OP_STARI:  *tempcode = OP_POSSTARI; break;
6513        case OP_PLUSI:  *tempcode = OP_POSPLUSI; break;
6514        case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
6515        case OP_UPTOI:  *tempcode = OP_POSUPTOI; break;
6516
6517        case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
6518        case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
6519        case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
6520        case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
6521
6522        case OP_NOTSTARI:  *tempcode = OP_NOTPOSSTARI; break;
6523        case OP_NOTPLUSI:  *tempcode = OP_NOTPOSPLUSI; break;
6524        case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
6525        case OP_NOTUPTOI:  *tempcode = OP_NOTPOSUPTOI; break;
6526
6527        case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
6528        case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
6529        case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
6530        case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
6531
6532        case OP_CRSTAR:   *tempcode = OP_CRPOSSTAR; break;
6533        case OP_CRPLUS:   *tempcode = OP_CRPOSPLUS; break;
6534        case OP_CRQUERY:  *tempcode = OP_CRPOSQUERY; break;
6535        case OP_CRRANGE:  *tempcode = OP_CRPOSRANGE; break;
6536
6537        /* Because we are moving code along, we must ensure that any
6538        pending recursive references are updated. */
6539
6540        default:
6541        *code = OP_END;
6542        adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6543        memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6544        code += 1 + LINK_SIZE;
6545        len += 1 + LINK_SIZE;
6546        tempcode[0] = OP_ONCE;
6547        *code++ = OP_KET;
6548        PUTINC(code, 0, len);
6549        PUT(tempcode, 1, len);
6550        break;
6551        }
6552#endif
6553      }
6554
6555    /* In all cases we no longer have a previous item. We also set the
6556    "follows varying string" flag for subsequently encountered reqchars if
6557    it isn't already set and we have just passed a varying length item. */
6558
6559    END_REPEAT:
6560    previous = NULL;
6561    cd->req_varyopt |= reqvary;
6562    break;
6563
6564
6565    /* ===================================================================*/
6566    /* Start of nested parenthesized sub-expression, or comment or lookahead or
6567    lookbehind or option setting or condition or all the other extended
6568    parenthesis forms.  */
6569
6570    case CHAR_LEFT_PARENTHESIS:
6571    ptr++;
6572
6573    /* Now deal with various "verbs" that can be introduced by '*'. */
6574
6575    if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
6576         || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
6577      {
6578      int i, namelen;
6579      int arglen = 0;
6580      const char *vn = verbnames;
6581      const pcre_uchar *name = ptr + 1;
6582      const pcre_uchar *arg = NULL;
6583      previous = NULL;
6584      ptr++;
6585      while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
6586      namelen = (int)(ptr - name);
6587
6588      /* It appears that Perl allows any characters whatsoever, other than
6589      a closing parenthesis, to appear in arguments, so we no longer insist on
6590      letters, digits, and underscores. */
6591
6592      if (*ptr == CHAR_COLON)
6593        {
6594        arg = ++ptr;
6595        while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
6596        arglen = (int)(ptr - arg);
6597        if ((unsigned int)arglen > MAX_MARK)
6598          {
6599          *errorcodeptr = ERR75;
6600          goto FAILED;
6601          }
6602        }
6603
6604      if (*ptr != CHAR_RIGHT_PARENTHESIS)
6605        {
6606        *errorcodeptr = ERR60;
6607        goto FAILED;
6608        }
6609
6610      /* Scan the table of verb names */
6611
6612      for (i = 0; i < verbcount; i++)
6613        {
6614        if (namelen == verbs[i].len &&
6615            STRNCMP_UC_C8(name, vn, namelen) == 0)
6616          {
6617          int setverb;
6618
6619          /* Check for open captures before ACCEPT and convert it to
6620          ASSERT_ACCEPT if in an assertion. */
6621
6622          if (verbs[i].op == OP_ACCEPT)
6623            {
6624            open_capitem *oc;
6625            if (arglen != 0)
6626              {
6627              *errorcodeptr = ERR59;
6628              goto FAILED;
6629              }
6630            cd->had_accept = TRUE;
6631            for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6632              {
6633              if (lengthptr != NULL)
6634                {
6635#ifdef COMPILE_PCRE8
6636                *lengthptr += 1 + IMM2_SIZE;
6637#elif defined COMPILE_PCRE16
6638                *lengthptr += 2 + IMM2_SIZE;
6639#elif defined COMPILE_PCRE32
6640                *lengthptr += 4 + IMM2_SIZE;
6641#endif
6642                }
6643              else
6644                {
6645                *code++ = OP_CLOSE;
6646                PUT2INC(code, 0, oc->number);
6647                }
6648              }
6649            setverb = *code++ =
6650              (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6651
6652            /* Do not set firstchar after *ACCEPT */
6653            if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
6654            }
6655
6656          /* Handle other cases with/without an argument */
6657
6658          else if (arglen == 0)
6659            {
6660            if (verbs[i].op < 0)   /* Argument is mandatory */
6661              {
6662              *errorcodeptr = ERR66;
6663              goto FAILED;
6664              }
6665            setverb = *code++ = verbs[i].op;
6666            }
6667
6668          else
6669            {
6670            if (verbs[i].op_arg < 0)   /* Argument is forbidden */
6671              {
6672              *errorcodeptr = ERR59;
6673              goto FAILED;
6674              }
6675            setverb = *code++ = verbs[i].op_arg;
6676            if (lengthptr != NULL)    /* In pass 1 just add in the length */
6677              {                       /* to avoid potential workspace */
6678              *lengthptr += arglen;   /* overflow. */
6679              *code++ = 0;
6680              }
6681            else
6682              {
6683              *code++ = arglen;
6684              memcpy(code, arg, IN_UCHARS(arglen));
6685              code += arglen;
6686              }
6687            *code++ = 0;
6688            }
6689
6690          switch (setverb)
6691            {
6692            case OP_THEN:
6693            case OP_THEN_ARG:
6694            cd->external_flags |= PCRE_HASTHEN;
6695            break;
6696
6697            case OP_PRUNE:
6698            case OP_PRUNE_ARG:
6699            case OP_SKIP:
6700            case OP_SKIP_ARG:
6701            cd->had_pruneorskip = TRUE;
6702            break;
6703            }
6704
6705          break;  /* Found verb, exit loop */
6706          }
6707
6708        vn += verbs[i].len + 1;
6709        }
6710
6711      if (i < verbcount) continue;    /* Successfully handled a verb */
6712      *errorcodeptr = ERR60;          /* Verb not recognized */
6713      goto FAILED;
6714      }
6715
6716    /* Initialize for "real" parentheses */
6717
6718    newoptions = options;
6719    skipbytes = 0;
6720    bravalue = OP_CBRA;
6721    item_hwm_offset = cd->hwm - cd->start_workspace;
6722    reset_bracount = FALSE;
6723
6724    /* Deal with the extended parentheses; all are introduced by '?', and the
6725    appearance of any of them means that this is not a capturing group. */
6726
6727    if (*ptr == CHAR_QUESTION_MARK)
6728      {
6729      int i, set, unset, namelen;
6730      int *optset;
6731      const pcre_uchar *name;
6732      pcre_uchar *slot;
6733
6734      switch (*(++ptr))
6735        {
6736        /* ------------------------------------------------------------ */
6737        case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */
6738        reset_bracount = TRUE;
6739        cd->dupgroups = TRUE;     /* Record (?| encountered */
6740        /* Fall through */
6741
6742        /* ------------------------------------------------------------ */
6743        case CHAR_COLON:          /* Non-capturing bracket */
6744        bravalue = OP_BRA;
6745        ptr++;
6746        break;
6747
6748
6749        /* ------------------------------------------------------------ */
6750        case CHAR_LEFT_PARENTHESIS:
6751        bravalue = OP_COND;       /* Conditional group */
6752        tempptr = ptr;
6753
6754        /* A condition can be an assertion, a number (referring to a numbered
6755        group's having been set), a name (referring to a named group), or 'R',
6756        referring to recursion. R<digits> and R&name are also permitted for
6757        recursion tests.
6758
6759        There are several ways of testing a named group: (?(name)) is used by Python;
6760        Perl 5.10 onwards uses (?(<name>) or (?('name')).
6761
6762        There is one unfortunate ambiguity, caused by history. 'R' can be the
6763        recursive thing or the name 'R' (and similarly for 'R' followed by
6764        digits). We look for a name first; if not found, we try the other case.
6765
6766        For compatibility with auto-callouts, we allow a callout to be
6767        specified before a condition that is an assertion. First, check for the
6768        syntax of a callout; if found, adjust the temporary pointer that is
6769        used to check for an assertion condition. That's all that is needed! */
6770
6771        if (ptr[1] == CHAR_QUESTION_MARK && ptr[2] == CHAR_C)
6772          {
6773          for (i = 3;; i++) if (!IS_DIGIT(ptr[i])) break;
6774          if (ptr[i] == CHAR_RIGHT_PARENTHESIS)
6775            tempptr += i + 1;
6776
6777          /* tempptr should now be pointing to the opening parenthesis of the
6778          assertion condition. */
6779
6780          if (*tempptr != CHAR_LEFT_PARENTHESIS)
6781            {
6782            *errorcodeptr = ERR28;
6783            goto FAILED;
6784            }
6785          }
6786
6787        /* For conditions that are assertions, check the syntax, and then exit
6788        the switch. This will take control down to where bracketed groups,
6789        including assertions, are processed. */
6790
6791        if (tempptr[1] == CHAR_QUESTION_MARK &&
6792              (tempptr[2] == CHAR_EQUALS_SIGN ||
6793               tempptr[2] == CHAR_EXCLAMATION_MARK ||
6794                 (tempptr[2] == CHAR_LESS_THAN_SIGN &&
6795                   (tempptr[3] == CHAR_EQUALS_SIGN ||
6796                    tempptr[3] == CHAR_EXCLAMATION_MARK))))
6797          {
6798          cd->iscondassert = TRUE;
6799          break;
6800          }
6801
6802        /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
6803        need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
6804
6805        code[1+LINK_SIZE] = OP_CREF;
6806        skipbytes = 1+IMM2_SIZE;
6807        refsign = -1;     /* => not a number */
6808        namelen = -1;     /* => not a name; must set to avoid warning */
6809        name = NULL;      /* Always set to avoid warning */
6810        recno = 0;        /* Always set to avoid warning */
6811
6812        /* Check for a test for recursion in a named group. */
6813
6814        ptr++;
6815        if (*ptr == CHAR_R && ptr[1] == CHAR_AMPERSAND)
6816          {
6817          terminator = -1;
6818          ptr += 2;
6819          code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
6820          }
6821
6822        /* Check for a test for a named group's having been set, using the Perl
6823        syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6824        syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). */
6825
6826        else if (*ptr == CHAR_LESS_THAN_SIGN)
6827          {
6828          terminator = CHAR_GREATER_THAN_SIGN;
6829          ptr++;
6830          }
6831        else if (*ptr == CHAR_APOSTROPHE)
6832          {
6833          terminator = CHAR_APOSTROPHE;
6834          ptr++;
6835          }
6836        else
6837          {
6838          terminator = CHAR_NULL;
6839          if (*ptr == CHAR_MINUS || *ptr == CHAR_PLUS) refsign = *ptr++;
6840            else if (IS_DIGIT(*ptr)) refsign = 0;
6841          }
6842
6843        /* Handle a number */
6844
6845        if (refsign >= 0)
6846          {
6847          while (IS_DIGIT(*ptr))
6848            {
6849            if (recno > INT_MAX / 10 - 1)  /* Integer overflow */
6850              {
6851              while (IS_DIGIT(*ptr)) ptr++;
6852              *errorcodeptr = ERR61;
6853              goto FAILED;
6854              }
6855            recno = recno * 10 + (int)(*ptr - CHAR_0);
6856            ptr++;
6857            }
6858          }
6859
6860        /* Otherwise we expect to read a name; anything else is an error. When
6861        a name is one of a number of duplicates, a different opcode is used and
6862        it needs more memory. Unfortunately we cannot tell whether a name is a
6863        duplicate in the first pass, so we have to allow for more memory. */
6864
6865        else
6866          {
6867          if (IS_DIGIT(*ptr))
6868            {
6869            *errorcodeptr = ERR84;
6870            goto FAILED;
6871            }
6872          if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_word) == 0)
6873            {
6874            *errorcodeptr = ERR28;   /* Assertion expected */
6875            goto FAILED;
6876            }
6877          name = ptr++;
6878          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
6879            {
6880            ptr++;
6881            }
6882          namelen = (int)(ptr - name);
6883          if (lengthptr != NULL) skipbytes += IMM2_SIZE;
6884          }
6885
6886        /* Check the terminator */
6887
6888        if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
6889            *ptr++ != CHAR_RIGHT_PARENTHESIS)
6890          {
6891          ptr--;                  /* Error offset */
6892          *errorcodeptr = ERR26;  /* Malformed number or name */
6893          goto FAILED;
6894          }
6895
6896        /* Do no further checking in the pre-compile phase. */
6897
6898        if (lengthptr != NULL) break;
6899
6900        /* In the real compile we do the work of looking for the actual
6901        reference. If refsign is not negative, it means we have a number in
6902        recno. */
6903
6904        if (refsign >= 0)
6905          {
6906          if (recno <= 0)
6907            {
6908            *errorcodeptr = ERR35;
6909            goto FAILED;
6910            }
6911          if (refsign != 0) recno = (refsign == CHAR_MINUS)?
6912            cd->bracount - recno + 1 : recno + cd->bracount;
6913          if (recno <= 0 || recno > cd->final_bracount)
6914            {
6915            *errorcodeptr = ERR15;
6916            goto FAILED;
6917            }
6918          PUT2(code, 2+LINK_SIZE, recno);
6919          if (recno > cd->top_backref) cd->top_backref = recno;
6920          break;
6921          }
6922
6923        /* Otherwise look for the name. */
6924
6925        slot = cd->name_table;
6926        for (i = 0; i < cd->names_found; i++)
6927          {
6928          if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0) break;
6929          slot += cd->name_entry_size;
6930          }
6931
6932        /* Found the named subpattern. If the name is duplicated, add one to
6933        the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6934        appropriate data values. Otherwise, just insert the unique subpattern
6935        number. */
6936
6937        if (i < cd->names_found)
6938          {
6939          int offset = i++;
6940          int count = 1;
6941          recno = GET2(slot, 0);   /* Number from first found */
6942          if (recno > cd->top_backref) cd->top_backref = recno;
6943          for (; i < cd->names_found; i++)
6944            {
6945            slot += cd->name_entry_size;
6946            if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0 ||
6947              (slot+IMM2_SIZE)[namelen] != 0) break;
6948            count++;
6949            }
6950
6951          if (count > 1)
6952            {
6953            PUT2(code, 2+LINK_SIZE, offset);
6954            PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6955            skipbytes += IMM2_SIZE;
6956            code[1+LINK_SIZE]++;
6957            }
6958          else  /* Not a duplicated name */
6959            {
6960            PUT2(code, 2+LINK_SIZE, recno);
6961            }
6962          }
6963
6964        /* If terminator == CHAR_NULL it means that the name followed directly
6965        after the opening parenthesis [e.g. (?(abc)...] and in this case there
6966        are some further alternatives to try. For the cases where terminator !=
6967        CHAR_NULL [things like (?(<name>... or (?('name')... or (?(R&name)... ]
6968        we have now checked all the possibilities, so give an error. */
6969
6970        else if (terminator != CHAR_NULL)
6971          {
6972          *errorcodeptr = ERR15;
6973          goto FAILED;
6974          }
6975
6976        /* Check for (?(R) for recursion. Allow digits after R to specify a
6977        specific group number. */
6978
6979        else if (*name == CHAR_R)
6980          {
6981          recno = 0;
6982          for (i = 1; i < namelen; i++)
6983            {
6984            if (!IS_DIGIT(name[i]))
6985              {
6986              *errorcodeptr = ERR15;
6987              goto FAILED;
6988              }
6989            if (recno > INT_MAX / 10 - 1)   /* Integer overflow */
6990              {
6991              *errorcodeptr = ERR61;
6992              goto FAILED;
6993              }
6994            recno = recno * 10 + name[i] - CHAR_0;
6995            }
6996          if (recno == 0) recno = RREF_ANY;
6997          code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
6998          PUT2(code, 2+LINK_SIZE, recno);
6999          }
7000
7001        /* Similarly, check for the (?(DEFINE) "condition", which is always
7002        false. */
7003
7004        else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0)
7005          {
7006          code[1+LINK_SIZE] = OP_DEF;
7007          skipbytes = 1;
7008          }
7009
7010        /* Reference to an unidentified subpattern. */
7011
7012        else
7013          {
7014          *errorcodeptr = ERR15;
7015          goto FAILED;
7016          }
7017        break;
7018
7019
7020        /* ------------------------------------------------------------ */
7021        case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
7022        bravalue = OP_ASSERT;
7023        cd->assert_depth += 1;
7024        ptr++;
7025        break;
7026
7027        /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
7028        thing to do, but Perl allows all assertions to be quantified, and when
7029        they contain capturing parentheses there may be a potential use for
7030        this feature. Not that that applies to a quantified (?!) but we allow
7031        it for uniformity. */
7032
7033        /* ------------------------------------------------------------ */
7034        case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
7035        ptr++;
7036        if (*ptr == CHAR_RIGHT_PARENTHESIS && ptr[1] != CHAR_ASTERISK &&
7037             ptr[1] != CHAR_PLUS && ptr[1] != CHAR_QUESTION_MARK &&
7038            (ptr[1] != CHAR_LEFT_CURLY_BRACKET || !is_counted_repeat(ptr+2)))
7039          {
7040          *code++ = OP_FAIL;
7041          previous = NULL;
7042          continue;
7043          }
7044        bravalue = OP_ASSERT_NOT;
7045        cd->assert_depth += 1;
7046        break;
7047
7048
7049        /* ------------------------------------------------------------ */
7050        case CHAR_LESS_THAN_SIGN:              /* Lookbehind or named define */
7051        switch (ptr[1])
7052          {
7053          case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
7054          bravalue = OP_ASSERTBACK;
7055          cd->assert_depth += 1;
7056          ptr += 2;
7057          break;
7058
7059          case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
7060          bravalue = OP_ASSERTBACK_NOT;
7061          cd->assert_depth += 1;
7062          ptr += 2;
7063          break;
7064
7065          default:                /* Could be name define, else bad */
7066          if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
7067            goto DEFINE_NAME;
7068          ptr++;                  /* Correct offset for error */
7069          *errorcodeptr = ERR24;
7070          goto FAILED;
7071          }
7072        break;
7073
7074
7075        /* ------------------------------------------------------------ */
7076        case CHAR_GREATER_THAN_SIGN:           /* One-time brackets */
7077        bravalue = OP_ONCE;
7078        ptr++;
7079        break;
7080
7081
7082        /* ------------------------------------------------------------ */
7083        case CHAR_C:                 /* Callout - may be followed by digits; */
7084        previous_callout = code;     /* Save for later completion */
7085        after_manual_callout = 1;    /* Skip one item before completing */
7086        *code++ = OP_CALLOUT;
7087          {
7088          int n = 0;
7089          ptr++;
7090          while(IS_DIGIT(*ptr))
7091            n = n * 10 + *ptr++ - CHAR_0;
7092          if (*ptr != CHAR_RIGHT_PARENTHESIS)
7093            {
7094            *errorcodeptr = ERR39;
7095            goto FAILED;
7096            }
7097          if (n > 255)
7098            {
7099            *errorcodeptr = ERR38;
7100            goto FAILED;
7101            }
7102          *code++ = n;
7103          PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
7104          PUT(code, LINK_SIZE, 0);                          /* Default length */
7105          code += 2 * LINK_SIZE;
7106          }
7107        previous = NULL;
7108        continue;
7109
7110
7111        /* ------------------------------------------------------------ */
7112        case CHAR_P:              /* Python-style named subpattern handling */
7113        if (*(++ptr) == CHAR_EQUALS_SIGN ||
7114            *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */
7115          {
7116          is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
7117          terminator = CHAR_RIGHT_PARENTHESIS;
7118          goto NAMED_REF_OR_RECURSE;
7119          }
7120        else if (*ptr != CHAR_LESS_THAN_SIGN)  /* Test for Python-style defn */
7121          {
7122          *errorcodeptr = ERR41;
7123          goto FAILED;
7124          }
7125        /* Fall through to handle (?P< as (?< is handled */
7126
7127
7128        /* ------------------------------------------------------------ */
7129        DEFINE_NAME:    /* Come here from (?< handling */
7130        case CHAR_APOSTROPHE:
7131        terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
7132          CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
7133        name = ++ptr;
7134        if (IS_DIGIT(*ptr))
7135          {
7136          *errorcodeptr = ERR84;   /* Group name must start with non-digit */
7137          goto FAILED;
7138          }
7139        while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
7140        namelen = (int)(ptr - name);
7141
7142        /* In the pre-compile phase, do a syntax check, remember the longest
7143        name, and then remember the group in a vector, expanding it if
7144        necessary. Duplicates for the same number are skipped; other duplicates
7145        are checked for validity. In the actual compile, there is nothing to
7146        do. */
7147
7148        if (lengthptr != NULL)
7149          {
7150          named_group *ng;
7151          pcre_uint32 number = cd->bracount + 1;
7152
7153          if (*ptr != (pcre_uchar)terminator)
7154            {
7155            *errorcodeptr = ERR42;
7156            goto FAILED;
7157            }
7158
7159          if (cd->names_found >= MAX_NAME_COUNT)
7160            {
7161            *errorcodeptr = ERR49;
7162            goto FAILED;
7163            }
7164
7165          if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
7166            {
7167            cd->name_entry_size = namelen + IMM2_SIZE + 1;
7168            if (namelen > MAX_NAME_SIZE)
7169              {
7170              *errorcodeptr = ERR48;
7171              goto FAILED;
7172              }
7173            }
7174
7175          /* Scan the list to check for duplicates. For duplicate names, if the
7176          number is the same, break the loop, which causes the name to be
7177          discarded; otherwise, if DUPNAMES is not set, give an error.
7178          If it is set, allow the name with a different number, but continue
7179          scanning in case this is a duplicate with the same number. For
7180          non-duplicate names, give an error if the number is duplicated. */
7181
7182          ng = cd->named_groups;
7183          for (i = 0; i < cd->names_found; i++, ng++)
7184            {
7185            if (namelen == ng->length &&
7186                STRNCMP_UC_UC(name, ng->name, namelen) == 0)
7187              {
7188              if (ng->number == number) break;
7189              if ((options & PCRE_DUPNAMES) == 0)
7190                {
7191                *errorcodeptr = ERR43;
7192                goto FAILED;
7193                }
7194              cd->dupnames = TRUE;  /* Duplicate names exist */
7195              }
7196            else if (ng->number == number)
7197              {
7198              *errorcodeptr = ERR65;
7199              goto FAILED;
7200              }
7201            }
7202
7203          if (i >= cd->names_found)     /* Not a duplicate with same number */
7204            {
7205            /* Increase the list size if necessary */
7206
7207            if (cd->names_found >= cd->named_group_list_size)
7208              {
7209              int newsize = cd->named_group_list_size * 2;
7210              named_group *newspace = (PUBL(malloc))
7211                (newsize * sizeof(named_group));
7212
7213              if (newspace == NULL)
7214                {
7215                *errorcodeptr = ERR21;
7216                goto FAILED;
7217                }
7218
7219              memcpy(newspace, cd->named_groups,
7220                cd->named_group_list_size * sizeof(named_group));
7221              if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
7222                (PUBL(free))((void *)cd->named_groups);
7223              cd->named_groups = newspace;
7224              cd->named_group_list_size = newsize;
7225              }
7226
7227            cd->named_groups[cd->names_found].name = name;
7228            cd->named_groups[cd->names_found].length = namelen;
7229            cd->named_groups[cd->names_found].number = number;
7230            cd->names_found++;
7231            }
7232          }
7233
7234        ptr++;                    /* Move past > or ' in both passes. */
7235        goto NUMBERED_GROUP;
7236
7237
7238        /* ------------------------------------------------------------ */
7239        case CHAR_AMPERSAND:            /* Perl recursion/subroutine syntax */
7240        terminator = CHAR_RIGHT_PARENTHESIS;
7241        is_recurse = TRUE;
7242        /* Fall through */
7243
7244        /* We come here from the Python syntax above that handles both
7245        references (?P=name) and recursion (?P>name), as well as falling
7246        through from the Perl recursion syntax (?&name). We also come here from
7247        the Perl \k<name> or \k'name' back reference syntax and the \k{name}
7248        .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
7249
7250        NAMED_REF_OR_RECURSE:
7251        name = ++ptr;
7252        if (IS_DIGIT(*ptr))
7253          {
7254          *errorcodeptr = ERR84;   /* Group name must start with non-digit */
7255          goto FAILED;
7256          }
7257        while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
7258        namelen = (int)(ptr - name);
7259
7260        /* In the pre-compile phase, do a syntax check. We used to just set
7261        a dummy reference number, because it was not used in the first pass.
7262        However, with the change of recursive back references to be atomic,
7263        we have to look for the number so that this state can be identified, as
7264        otherwise the incorrect length is computed. If it's not a backwards
7265        reference, the dummy number will do. */
7266
7267        if (lengthptr != NULL)
7268          {
7269          named_group *ng;
7270          recno = 0;
7271
7272          if (namelen == 0)
7273            {
7274            *errorcodeptr = ERR62;
7275            goto FAILED;
7276            }
7277          if (*ptr != (pcre_uchar)terminator)
7278            {
7279            *errorcodeptr = ERR42;
7280            goto FAILED;
7281            }
7282          if (namelen > MAX_NAME_SIZE)
7283            {
7284            *errorcodeptr = ERR48;
7285            goto FAILED;
7286            }
7287
7288          /* Count named back references. */
7289
7290          if (!is_recurse) cd->namedrefcount++;
7291
7292          /* We have to allow for a named reference to a duplicated name (this
7293          cannot be determined until the second pass). This needs an extra
7294          16-bit data item. */
7295
7296          *lengthptr += IMM2_SIZE;
7297
7298          /* If this is a forward reference and we are within a (?|...) group,
7299          the reference may end up as the number of a group which we are
7300          currently inside, that is, it could be a recursive reference. In the
7301          real compile this will be picked up and the reference wrapped with
7302          OP_ONCE to make it atomic, so we must allow space in case this occurs. */
7303
7304          /* In fact, this can happen for a non-forward reference because
7305          another group with the same number might be created later. This
7306          issue is fixed "properly" in PCRE2. As PCRE1 is now in maintenance
7307          only mode, we finesse the bug by allowing more memory always. */
7308
7309          *lengthptr += 4 + 4*LINK_SIZE;
7310
7311          /* It is even worse than that. The current reference may be to an
7312          existing named group with a different number (so apparently not
7313          recursive) but which later on is also attached to a group with the
7314          current number. This can only happen if (?| has been previously
7315          encountered. In that case, we allow yet more memory, just in case.
7316          (Again, this is fixed "properly" in PCRE2.) */
7317
7318          if (cd->dupgroups) *lengthptr += 4 + 4*LINK_SIZE;
7319
7320          /* Otherwise, check for recursion here. The name table does not exist
7321          in the first pass; instead we must scan the list of names encountered
7322          so far in order to get the number. If the name is not found, leave
7323          the value of recno as 0 for a forward reference. */
7324
7325          /* This patch (removing "else") fixes a problem when a reference is
7326          to multiple identically named nested groups from within the nest.
7327          Once again, it is not the "proper" fix, and it results in an
7328          over-allocation of memory. */
7329
7330          /* else */
7331            {
7332            ng = cd->named_groups;
7333            for (i = 0; i < cd->names_found; i++, ng++)
7334              {
7335              if (namelen == ng->length &&
7336                  STRNCMP_UC_UC(name, ng->name, namelen) == 0)
7337                {
7338                open_capitem *oc;
7339                recno = ng->number;
7340                if (is_recurse) break;
7341                for (oc = cd->open_caps; oc != NULL; oc = oc->next)
7342                  {
7343                  if (oc->number == recno)
7344                    {
7345                    oc->flag = TRUE;
7346                    break;
7347                    }
7348                  }
7349                }
7350              }
7351            }
7352          }
7353
7354        /* In the real compile, search the name table. We check the name
7355        first, and then check that we have reached the end of the name in the
7356        table. That way, if the name is longer than any in the table, the
7357        comparison will fail without reading beyond the table entry. */
7358
7359        else
7360          {
7361          slot = cd->name_table;
7362          for (i = 0; i < cd->names_found; i++)
7363            {
7364            if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
7365                slot[IMM2_SIZE+namelen] == 0)
7366              break;
7367            slot += cd->name_entry_size;
7368            }
7369
7370          if (i < cd->names_found)
7371            {
7372            recno = GET2(slot, 0);
7373            }
7374          else
7375            {
7376            *errorcodeptr = ERR15;
7377            goto FAILED;
7378            }
7379          }
7380
7381        /* In both phases, for recursions, we can now go to the code that
7382        handles numerical recursion. */
7383
7384        if (is_recurse) goto HANDLE_RECURSION;
7385
7386        /* In the second pass we must see if the name is duplicated. If so, we
7387        generate a different opcode. */
7388
7389        if (lengthptr == NULL && cd->dupnames)
7390          {
7391          int count = 1;
7392          unsigned int index = i;
7393          pcre_uchar *cslot = slot + cd->name_entry_size;
7394
7395          for (i++; i < cd->names_found; i++)
7396            {
7397            if (STRCMP_UC_UC(slot + IMM2_SIZE, cslot + IMM2_SIZE) != 0) break;
7398            count++;
7399            cslot += cd->name_entry_size;
7400            }
7401
7402          if (count > 1)
7403            {
7404            if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
7405            previous = code;
7406            item_hwm_offset = cd->hwm - cd->start_workspace;
7407            *code++ = ((options & PCRE_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
7408            PUT2INC(code, 0, index);
7409            PUT2INC(code, 0, count);
7410
7411            /* Process each potentially referenced group. */
7412
7413            for (; slot < cslot; slot += cd->name_entry_size)
7414              {
7415              open_capitem *oc;
7416              recno = GET2(slot, 0);
7417              cd->backref_map |= (recno < 32)? (1 << recno) : 1;
7418              if (recno > cd->top_backref) cd->top_backref = recno;
7419
7420              /* Check to see if this back reference is recursive, that is, it
7421              is inside the group that it references. A flag is set so that the
7422              group can be made atomic. */
7423
7424              for (oc = cd->open_caps; oc != NULL; oc = oc->next)
7425                {
7426                if (oc->number == recno)
7427                  {
7428                  oc->flag = TRUE;
7429                  break;
7430                  }
7431                }
7432              }
7433
7434            continue;  /* End of back ref handling */
7435            }
7436          }
7437
7438        /* First pass, or a non-duplicated name. */
7439
7440        goto HANDLE_REFERENCE;
7441
7442
7443        /* ------------------------------------------------------------ */
7444        case CHAR_R:              /* Recursion, same as (?0) */
7445        recno = 0;
7446        if (*(++ptr) != CHAR_RIGHT_PARENTHESIS)
7447          {
7448          *errorcodeptr = ERR29;
7449          goto FAILED;
7450          }
7451        goto HANDLE_RECURSION;
7452
7453
7454        /* ------------------------------------------------------------ */
7455        case CHAR_MINUS: case CHAR_PLUS:  /* Recursion or subroutine */
7456        case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
7457        case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
7458          {
7459          const pcre_uchar *called;
7460          terminator = CHAR_RIGHT_PARENTHESIS;
7461
7462          /* Come here from the \g<...> and \g'...' code (Oniguruma
7463          compatibility). However, the syntax has been checked to ensure that
7464          the ... are a (signed) number, so that neither ERR63 nor ERR29 will
7465          be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
7466          ever be taken. */
7467
7468          HANDLE_NUMERICAL_RECURSION:
7469
7470          if ((refsign = *ptr) == CHAR_PLUS)
7471            {
7472            ptr++;
7473            if (!IS_DIGIT(*ptr))
7474              {
7475              *errorcodeptr = ERR63;
7476              goto FAILED;
7477              }
7478            }
7479          else if (refsign == CHAR_MINUS)
7480            {
7481            if (!IS_DIGIT(ptr[1]))
7482              goto OTHER_CHAR_AFTER_QUERY;
7483            ptr++;
7484            }
7485
7486          recno = 0;
7487          while(IS_DIGIT(*ptr))
7488            {
7489            if (recno > INT_MAX / 10 - 1) /* Integer overflow */
7490              {
7491              while (IS_DIGIT(*ptr)) ptr++;
7492              *errorcodeptr = ERR61;
7493              goto FAILED;
7494              }
7495            recno = recno * 10 + *ptr++ - CHAR_0;
7496            }
7497
7498          if (*ptr != (pcre_uchar)terminator)
7499            {
7500            *errorcodeptr = ERR29;
7501            goto FAILED;
7502            }
7503
7504          if (refsign == CHAR_MINUS)
7505            {
7506            if (recno == 0)
7507              {
7508              *errorcodeptr = ERR58;
7509              goto FAILED;
7510              }
7511            recno = cd->bracount - recno + 1;
7512            if (recno <= 0)
7513              {
7514              *errorcodeptr = ERR15;
7515              goto FAILED;
7516              }
7517            }
7518          else if (refsign == CHAR_PLUS)
7519            {
7520            if (recno == 0)
7521              {
7522              *errorcodeptr = ERR58;
7523              goto FAILED;
7524              }
7525            recno += cd->bracount;
7526            }
7527
7528          /* Come here from code above that handles a named recursion */
7529
7530          HANDLE_RECURSION:
7531
7532          previous = code;
7533          item_hwm_offset = cd->hwm - cd->start_workspace;
7534          called = cd->start_code;
7535
7536          /* When we are actually compiling, find the bracket that is being
7537          referenced. Temporarily end the regex in case it doesn't exist before
7538          this point. If we end up with a forward reference, first check that
7539          the bracket does occur later so we can give the error (and position)
7540          now. Then remember this forward reference in the workspace so it can
7541          be filled in at the end. */
7542
7543          if (lengthptr == NULL)
7544            {
7545            *code = OP_END;
7546            if (recno != 0)
7547              called = PRIV(find_bracket)(cd->start_code, utf, recno);
7548
7549            /* Forward reference */
7550
7551            if (called == NULL)
7552              {
7553              if (recno > cd->final_bracount)
7554                {
7555                *errorcodeptr = ERR15;
7556                goto FAILED;
7557                }
7558
7559              /* Fudge the value of "called" so that when it is inserted as an
7560              offset below, what is actually inserted is the reference number
7561              of the group. Then remember the forward reference. */
7562
7563              called = cd->start_code + recno;
7564              if (cd->hwm >= cd->start_workspace + cd->workspace_size -
7565                  WORK_SIZE_SAFETY_MARGIN)
7566                {
7567                *errorcodeptr = expand_workspace(cd);
7568                if (*errorcodeptr != 0) goto FAILED;
7569                }
7570              PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
7571              }
7572
7573            /* If not a forward reference, and the subpattern is still open,
7574            this is a recursive call. We check to see if this is a left
7575            recursion that could loop for ever, and diagnose that case. We
7576            must not, however, do this check if we are in a conditional
7577            subpattern because the condition might be testing for recursion in
7578            a pattern such as /(?(R)a+|(?R)b)/, which is perfectly valid.
7579            Forever loops are also detected at runtime, so those that occur in
7580            conditional subpatterns will be picked up then. */
7581
7582            else if (GET(called, 1) == 0 && cond_depth <= 0 &&
7583                     could_be_empty(called, code, bcptr, utf, cd))
7584              {
7585              *errorcodeptr = ERR40;
7586              goto FAILED;
7587              }
7588            }
7589
7590          /* Insert the recursion/subroutine item. It does not have a set first
7591          character (relevant if it is repeated, because it will then be
7592          wrapped with ONCE brackets). */
7593
7594          *code = OP_RECURSE;
7595          PUT(code, 1, (int)(called - cd->start_code));
7596          code += 1 + LINK_SIZE;
7597          groupsetfirstchar = FALSE;
7598          }
7599
7600        /* Can't determine a first byte now */
7601
7602        if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
7603        continue;
7604
7605
7606        /* ------------------------------------------------------------ */
7607        default:              /* Other characters: check option setting */
7608        OTHER_CHAR_AFTER_QUERY:
7609        set = unset = 0;
7610        optset = &set;
7611
7612        while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
7613          {
7614          switch (*ptr++)
7615            {
7616            case CHAR_MINUS: optset = &unset; break;
7617
7618            case CHAR_J:    /* Record that it changed in the external options */
7619            *optset |= PCRE_DUPNAMES;
7620            cd->external_flags |= PCRE_JCHANGED;
7621            break;
7622
7623            case CHAR_i: *optset |= PCRE_CASELESS; break;
7624            case CHAR_m: *optset |= PCRE_MULTILINE; break;
7625            case CHAR_s: *optset |= PCRE_DOTALL; break;
7626            case CHAR_x: *optset |= PCRE_EXTENDED; break;
7627            case CHAR_U: *optset |= PCRE_UNGREEDY; break;
7628            case CHAR_X: *optset |= PCRE_EXTRA; break;
7629
7630            default:  *errorcodeptr = ERR12;
7631                      ptr--;    /* Correct the offset */
7632                      goto FAILED;
7633            }
7634          }
7635
7636        /* Set up the changed option bits, but don't change anything yet. */
7637
7638        newoptions = (options | set) & (~unset);
7639
7640        /* If the options ended with ')' this is not the start of a nested
7641        group with option changes, so the options change at this level.
7642        If we are not at the pattern start, reset the greedy defaults and the
7643        case value for firstchar and reqchar. */
7644
7645        if (*ptr == CHAR_RIGHT_PARENTHESIS)
7646          {
7647          greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
7648          greedy_non_default = greedy_default ^ 1;
7649          req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
7650
7651          /* Change options at this level, and pass them back for use
7652          in subsequent branches. */
7653
7654          *optionsptr = options = newoptions;
7655          previous = NULL;       /* This item can't be repeated */
7656          continue;              /* It is complete */
7657          }
7658
7659        /* If the options ended with ':' we are heading into a nested group
7660        with possible change of options. Such groups are non-capturing and are
7661        not assertions of any kind. All we need to do is skip over the ':';
7662        the newoptions value is handled below. */
7663
7664        bravalue = OP_BRA;
7665        ptr++;
7666        }     /* End of switch for character following (? */
7667      }       /* End of (? handling */
7668
7669    /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
7670    is set, all unadorned brackets become non-capturing and behave like (?:...)
7671    brackets. */
7672
7673    else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
7674      {
7675      bravalue = OP_BRA;
7676      }
7677
7678    /* Else we have a capturing group. */
7679
7680    else
7681      {
7682      NUMBERED_GROUP:
7683      cd->bracount += 1;
7684      PUT2(code, 1+LINK_SIZE, cd->bracount);
7685      skipbytes = IMM2_SIZE;
7686      }
7687
7688    /* Process nested bracketed regex. First check for parentheses nested too
7689    deeply. */
7690
7691    if ((cd->parens_depth += 1) > PARENS_NEST_LIMIT)
7692      {
7693      *errorcodeptr = ERR82;
7694      goto FAILED;
7695      }
7696
7697    /* All assertions used not to be repeatable, but this was changed for Perl
7698    compatibility. All kinds can now be repeated except for assertions that are
7699    conditions (Perl also forbids these to be repeated). We copy code into a
7700    non-register variable (tempcode) in order to be able to pass its address
7701    because some compilers complain otherwise. At the start of a conditional
7702    group whose condition is an assertion, cd->iscondassert is set. We unset it
7703    here so as to allow assertions later in the group to be quantified. */
7704
7705    if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT &&
7706        cd->iscondassert)
7707      {
7708      previous = NULL;
7709      cd->iscondassert = FALSE;
7710      }
7711    else
7712      {
7713      previous = code;
7714      item_hwm_offset = cd->hwm - cd->start_workspace;
7715      }
7716
7717    *code = bravalue;
7718    tempcode = code;
7719    tempreqvary = cd->req_varyopt;        /* Save value before bracket */
7720    tempbracount = cd->bracount;          /* Save value before bracket */
7721    length_prevgroup = 0;                 /* Initialize for pre-compile phase */
7722
7723    if (!compile_regex(
7724         newoptions,                      /* The complete new option state */
7725         &tempcode,                       /* Where to put code (updated) */
7726         &ptr,                            /* Input pointer (updated) */
7727         errorcodeptr,                    /* Where to put an error message */
7728         (bravalue == OP_ASSERTBACK ||
7729          bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
7730         reset_bracount,                  /* True if (?| group */
7731         skipbytes,                       /* Skip over bracket number */
7732         cond_depth +
7733           ((bravalue == OP_COND)?1:0),   /* Depth of condition subpatterns */
7734         &subfirstchar,                   /* For possible first char */
7735         &subfirstcharflags,
7736         &subreqchar,                     /* For possible last char */
7737         &subreqcharflags,
7738         bcptr,                           /* Current branch chain */
7739         cd,                              /* Tables block */
7740         (lengthptr == NULL)? NULL :      /* Actual compile phase */
7741           &length_prevgroup              /* Pre-compile phase */
7742         ))
7743      goto FAILED;
7744
7745    cd->parens_depth -= 1;
7746
7747    /* If this was an atomic group and there are no capturing groups within it,
7748    generate OP_ONCE_NC instead of OP_ONCE. */
7749
7750    if (bravalue == OP_ONCE && cd->bracount <= tempbracount)
7751      *code = OP_ONCE_NC;
7752
7753    if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
7754      cd->assert_depth -= 1;
7755
7756    /* At the end of compiling, code is still pointing to the start of the
7757    group, while tempcode has been updated to point past the end of the group.
7758    The pattern pointer (ptr) is on the bracket.
7759
7760    If this is a conditional bracket, check that there are no more than
7761    two branches in the group, or just one if it's a DEFINE group. We do this
7762    in the real compile phase, not in the pre-pass, where the whole group may
7763    not be available. */
7764
7765    if (bravalue == OP_COND && lengthptr == NULL)
7766      {
7767      pcre_uchar *tc = code;
7768      int condcount = 0;
7769
7770      do {
7771         condcount++;
7772         tc += GET(tc,1);
7773         }
7774      while (*tc != OP_KET);
7775
7776      /* A DEFINE group is never obeyed inline (the "condition" is always
7777      false). It must have only one branch. */
7778
7779      if (code[LINK_SIZE+1] == OP_DEF)
7780        {
7781        if (condcount > 1)
7782          {
7783          *errorcodeptr = ERR54;
7784          goto FAILED;
7785          }
7786        bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
7787        }
7788
7789      /* A "normal" conditional group. If there is just one branch, we must not
7790      make use of its firstchar or reqchar, because this is equivalent to an
7791      empty second branch. */
7792
7793      else
7794        {
7795        if (condcount > 2)
7796          {
7797          *errorcodeptr = ERR27;
7798          goto FAILED;
7799          }
7800        if (condcount == 1) subfirstcharflags = subreqcharflags = REQ_NONE;
7801        }
7802      }
7803
7804    /* Error if hit end of pattern */
7805
7806    if (*ptr != CHAR_RIGHT_PARENTHESIS)
7807      {
7808      *errorcodeptr = ERR14;
7809      goto FAILED;
7810      }
7811
7812    /* In the pre-compile phase, update the length by the length of the group,
7813    less the brackets at either end. Then reduce the compiled code to just a
7814    set of non-capturing brackets so that it doesn't use much memory if it is
7815    duplicated by a quantifier.*/
7816
7817    if (lengthptr != NULL)
7818      {
7819      if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
7820        {
7821        *errorcodeptr = ERR20;
7822        goto FAILED;
7823        }
7824      *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
7825      code++;   /* This already contains bravalue */
7826      PUTINC(code, 0, 1 + LINK_SIZE);
7827      *code++ = OP_KET;
7828      PUTINC(code, 0, 1 + LINK_SIZE);
7829      break;    /* No need to waste time with special character handling */
7830      }
7831
7832    /* Otherwise update the main code pointer to the end of the group. */
7833
7834    code = tempcode;
7835
7836    /* For a DEFINE group, required and first character settings are not
7837    relevant. */
7838
7839    if (bravalue == OP_DEF) break;
7840
7841    /* Handle updating of the required and first characters for other types of
7842    group. Update for normal brackets of all kinds, and conditions with two
7843    branches (see code above). If the bracket is followed by a quantifier with
7844    zero repeat, we have to back off. Hence the definition of zeroreqchar and
7845    zerofirstchar outside the main loop so that they can be accessed for the
7846    back off. */
7847
7848    zeroreqchar = reqchar;
7849    zeroreqcharflags = reqcharflags;
7850    zerofirstchar = firstchar;
7851    zerofirstcharflags = firstcharflags;
7852    groupsetfirstchar = FALSE;
7853
7854    if (bravalue >= OP_ONCE)
7855      {
7856      /* If we have not yet set a firstchar in this branch, take it from the
7857      subpattern, remembering that it was set here so that a repeat of more
7858      than one can replicate it as reqchar if necessary. If the subpattern has
7859      no firstchar, set "none" for the whole branch. In both cases, a zero
7860      repeat forces firstchar to "none". */
7861
7862      if (firstcharflags == REQ_UNSET)
7863        {
7864        if (subfirstcharflags >= 0)
7865          {
7866          firstchar = subfirstchar;
7867          firstcharflags = subfirstcharflags;
7868          groupsetfirstchar = TRUE;
7869          }
7870        else firstcharflags = REQ_NONE;
7871        zerofirstcharflags = REQ_NONE;
7872        }
7873
7874      /* If firstchar was previously set, convert the subpattern's firstchar
7875      into reqchar if there wasn't one, using the vary flag that was in
7876      existence beforehand. */
7877
7878      else if (subfirstcharflags >= 0 && subreqcharflags < 0)
7879        {
7880        subreqchar = subfirstchar;
7881        subreqcharflags = subfirstcharflags | tempreqvary;
7882        }
7883
7884      /* If the subpattern set a required byte (or set a first byte that isn't
7885      really the first byte - see above), set it. */
7886
7887      if (subreqcharflags >= 0)
7888        {
7889        reqchar = subreqchar;
7890        reqcharflags = subreqcharflags;
7891        }
7892      }
7893
7894    /* For a forward assertion, we take the reqchar, if set. This can be
7895    helpful if the pattern that follows the assertion doesn't set a different
7896    char. For example, it's useful for /(?=abcde).+/. We can't set firstchar
7897    for an assertion, however because it leads to incorrect effect for patterns
7898    such as /(?=a)a.+/ when the "real" "a" would then become a reqchar instead
7899    of a firstchar. This is overcome by a scan at the end if there's no
7900    firstchar, looking for an asserted first char. */
7901
7902    else if (bravalue == OP_ASSERT && subreqcharflags >= 0)
7903      {
7904      reqchar = subreqchar;
7905      reqcharflags = subreqcharflags;
7906      }
7907    break;     /* End of processing '(' */
7908
7909
7910    /* ===================================================================*/
7911    /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
7912    are arranged to be the negation of the corresponding OP_values in the
7913    default case when PCRE_UCP is not set. For the back references, the values
7914    are negative the reference number. Only back references and those types
7915    that consume a character may be repeated. We can test for values between
7916    ESC_b and ESC_Z for the latter; this may have to change if any new ones are
7917    ever created. */
7918
7919    case CHAR_BACKSLASH:
7920    tempptr = ptr;
7921    escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options, FALSE);
7922    if (*errorcodeptr != 0) goto FAILED;
7923
7924    if (escape == 0)                  /* The escape coded a single character */
7925      c = ec;
7926    else
7927      {
7928      /* For metasequences that actually match a character, we disable the
7929      setting of a first character if it hasn't already been set. */
7930
7931      if (firstcharflags == REQ_UNSET && escape > ESC_b && escape < ESC_Z)
7932        firstcharflags = REQ_NONE;
7933
7934      /* Set values to reset to if this is followed by a zero repeat. */
7935
7936      zerofirstchar = firstchar;
7937      zerofirstcharflags = firstcharflags;
7938      zeroreqchar = reqchar;
7939      zeroreqcharflags = reqcharflags;
7940
7941      /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
7942      is a subroutine call by number (Oniguruma syntax). In fact, the value
7943      ESC_g is returned only for these cases. So we don't need to check for <
7944      or ' if the value is ESC_g. For the Perl syntax \g{n} the value is
7945      -n, and for the Perl syntax \g{name} the result is ESC_k (as
7946      that is a synonym for a named back reference). */
7947
7948      if (escape == ESC_g)
7949        {
7950        const pcre_uchar *p;
7951        pcre_uint32 cf;
7952
7953        item_hwm_offset = cd->hwm - cd->start_workspace;   /* Normally this is set when '(' is read */
7954        terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
7955          CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
7956
        /* These two statements stop the compiler from warning about possibly
        unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
        fact, because we do the check for a number below, the paths that
        would actually be in error are never taken. */
7961
7962        skipbytes = 0;
7963        reset_bracount = FALSE;
7964
7965        /* If it's not a signed or unsigned number, treat it as a name. */
7966
7967        cf = ptr[1];
7968        if (cf != CHAR_PLUS && cf != CHAR_MINUS && !IS_DIGIT(cf))
7969          {
7970          is_recurse = TRUE;
7971          goto NAMED_REF_OR_RECURSE;
7972          }
7973
7974        /* Signed or unsigned number (cf = ptr[1]) is known to be plus or minus
7975        or a digit. */
7976
7977        p = ptr + 2;
7978        while (IS_DIGIT(*p)) p++;
7979        if (*p != (pcre_uchar)terminator)
7980          {
7981          *errorcodeptr = ERR57;
7982          goto FAILED;
7983          }
7984        ptr++;
7985        goto HANDLE_NUMERICAL_RECURSION;
7986        }
7987
7988      /* \k<name> or \k'name' is a back reference by name (Perl syntax).
7989      We also support \k{name} (.NET syntax).  */
7990
7991      if (escape == ESC_k)
7992        {
7993        if ((ptr[1] != CHAR_LESS_THAN_SIGN &&
7994          ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
7995          {
7996          *errorcodeptr = ERR69;
7997          goto FAILED;
7998          }
7999        is_recurse = FALSE;
8000        terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
8001          CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
8002          CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
8003        goto NAMED_REF_OR_RECURSE;
8004        }
8005
8006      /* Back references are handled specially; must disable firstchar if
8007      not set to cope with cases like (?=(\w+))\1: which would otherwise set
8008      ':' later. */
8009
8010      if (escape < 0)
8011        {
8012        open_capitem *oc;
8013        recno = -escape;
8014
        /* Come here from named backref handling when the reference is to a
        single group (i.e. not to a duplicated name). */
8017
8018        HANDLE_REFERENCE:
8019        if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
8020        previous = code;
8021        item_hwm_offset = cd->hwm - cd->start_workspace;
8022        *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;
8023        PUT2INC(code, 0, recno);
8024        cd->backref_map |= (recno < 32)? (1 << recno) : 1;
8025        if (recno > cd->top_backref) cd->top_backref = recno;
8026
        /* Check to see if this back reference is recursive, that is, it
        is inside the group that it references. A flag is set so that the
        group can be made atomic. */
8030
8031        for (oc = cd->open_caps; oc != NULL; oc = oc->next)
8032          {
8033          if (oc->number == recno)
8034            {
8035            oc->flag = TRUE;
8036            break;
8037            }
8038          }
8039        }
8040
8041      /* So are Unicode property matches, if supported. */
8042
8043#ifdef SUPPORT_UCP
8044      else if (escape == ESC_P || escape == ESC_p)
8045        {
8046        BOOL negated;
8047        unsigned int ptype = 0, pdata = 0;
8048        if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
8049          goto FAILED;
8050        previous = code;
8051        item_hwm_offset = cd->hwm - cd->start_workspace;
8052        *code++ = ((escape == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
8053        *code++ = ptype;
8054        *code++ = pdata;
8055        }
8056#else
8057
8058      /* If Unicode properties are not supported, \X, \P, and \p are not
8059      allowed. */
8060
8061      else if (escape == ESC_X || escape == ESC_P || escape == ESC_p)
8062        {
8063        *errorcodeptr = ERR45;
8064        goto FAILED;
8065        }
8066#endif
8067
8068      /* For the rest (including \X when Unicode properties are supported), we
8069      can obtain the OP value by negating the escape value in the default
8070      situation when PCRE_UCP is not set. When it *is* set, we substitute
8071      Unicode property tests. Note that \b and \B do a one-character
8072      lookbehind, and \A also behaves as if it does. */
8073
8074      else
8075        {
8076        if ((escape == ESC_b || escape == ESC_B || escape == ESC_A) &&
8077             cd->max_lookbehind == 0)
8078          cd->max_lookbehind = 1;
8079#ifdef SUPPORT_UCP
8080        if (escape >= ESC_DU && escape <= ESC_wu)
8081          {
8082          nestptr = ptr + 1;                   /* Where to resume */
8083          ptr = substitutes[escape - ESC_DU] - 1;  /* Just before substitute */
8084          }
8085        else
8086#endif
8087        /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE
8088        so that it works in DFA mode and in lookbehinds. */
8089
8090          {
8091          previous = (escape > ESC_b && escape < ESC_Z)? code : NULL;
8092          item_hwm_offset = cd->hwm - cd->start_workspace;
8093          *code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape;
8094          }
8095        }
8096      continue;
8097      }
8098
8099    /* We have a data character whose value is in c. In UTF-8 mode it may have
8100    a value > 127. We set its representation in the length/buffer, and then
8101    handle it as a data character. */
8102
8103#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
8104    if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
8105      mclength = PRIV(ord2utf)(c, mcbuffer);
8106    else
8107#endif
8108
8109     {
8110     mcbuffer[0] = c;
8111     mclength = 1;
8112     }
8113    goto ONE_CHAR;
8114
8115
8116    /* ===================================================================*/
8117    /* Handle a literal character. It is guaranteed not to be whitespace or #
8118    when the extended flag is set. If we are in a UTF mode, it may be a
8119    multi-unit literal character. */
8120
8121    default:
8122    NORMAL_CHAR:
8123    mclength = 1;
8124    mcbuffer[0] = c;
8125
8126#ifdef SUPPORT_UTF
8127    if (utf && HAS_EXTRALEN(c))
8128      ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
8129#endif
8130
8131    /* At this point we have the character's bytes in mcbuffer, and the length
8132    in mclength. When not in UTF-8 mode, the length is always 1. */
8133
8134    ONE_CHAR:
8135    previous = code;
8136    item_hwm_offset = cd->hwm - cd->start_workspace;
8137
8138    /* For caseless UTF-8 mode when UCP support is available, check whether
8139    this character has more than one other case. If so, generate a special
8140    OP_PROP item instead of OP_CHARI. */
8141
8142#ifdef SUPPORT_UCP
8143    if (utf && (options & PCRE_CASELESS) != 0)
8144      {
8145      GETCHAR(c, mcbuffer);
8146      if ((c = UCD_CASESET(c)) != 0)
8147        {
8148        *code++ = OP_PROP;
8149        *code++ = PT_CLIST;
8150        *code++ = c;
8151        if (firstcharflags == REQ_UNSET)
8152          firstcharflags = zerofirstcharflags = REQ_NONE;
8153        break;
8154        }
8155      }
8156#endif
8157
8158    /* Caseful matches, or not one of the multicase characters. */
8159
8160    *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARI : OP_CHAR;
8161    for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
8162
8163    /* Remember if \r or \n were seen */
8164
8165    if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
8166      cd->external_flags |= PCRE_HASCRORLF;
8167
8168    /* Set the first and required bytes appropriately. If no previous first
8169    byte, set it from this character, but revert to none on a zero repeat.
8170    Otherwise, leave the firstchar value alone, and don't change it on a zero
8171    repeat. */
8172
8173    if (firstcharflags == REQ_UNSET)
8174      {
8175      zerofirstcharflags = REQ_NONE;
8176      zeroreqchar = reqchar;
8177      zeroreqcharflags = reqcharflags;
8178
8179      /* If the character is more than one byte long, we can set firstchar
8180      only if it is not to be matched caselessly. */
8181
8182      if (mclength == 1 || req_caseopt == 0)
8183        {
8184        firstchar = mcbuffer[0] | req_caseopt;
8185        firstchar = mcbuffer[0];
8186        firstcharflags = req_caseopt;
8187
8188        if (mclength != 1)
8189          {
8190          reqchar = code[-1];
8191          reqcharflags = cd->req_varyopt;
8192          }
8193        }
8194      else firstcharflags = reqcharflags = REQ_NONE;
8195      }
8196
8197    /* firstchar was previously set; we can set reqchar only if the length is
8198    1 or the matching is caseful. */
8199
8200    else
8201      {
8202      zerofirstchar = firstchar;
8203      zerofirstcharflags = firstcharflags;
8204      zeroreqchar = reqchar;
8205      zeroreqcharflags = reqcharflags;
8206      if (mclength == 1 || req_caseopt == 0)
8207        {
8208        reqchar = code[-1];
8209        reqcharflags = req_caseopt | cd->req_varyopt;
8210        }
8211      }
8212
8213    break;            /* End of literal character handling */
8214    }
8215  }                   /* end of big loop */
8216
8217
8218/* Control never reaches here by falling through, only by a goto for all the
8219error states. Pass back the position in the pattern so that it can be displayed
8220to the user for diagnosing the error. */
8221
8222FAILED:
8223*ptrptr = ptr;
8224return FALSE;
8225}
8226
8227
8228
8229/*************************************************
8230*     Compile sequence of alternatives           *
8231*************************************************/
8232
8233/* On entry, ptr is pointing past the bracket character, but on return it
8234points to the closing bracket, or vertical bar, or end of string. The code
8235variable is pointing at the byte into which the BRA operator has been stored.
8236This function is used during the pre-compile phase when we are trying to find
8237out the amount of memory needed, as well as during the real compile phase. The
8238value of lengthptr distinguishes the two phases.
8239
8240Arguments:
8241  options           option bits, including any changes for this subpattern
8242  codeptr           -> the address of the current code pointer
8243  ptrptr            -> the address of the current pattern pointer
8244  errorcodeptr      -> pointer to error code variable
8245  lookbehind        TRUE if this is a lookbehind assertion
8246  reset_bracount    TRUE to reset the count for each branch
8247  skipbytes         skip this many bytes at start (for brackets and OP_COND)
8248  cond_depth        depth of nesting for conditional subpatterns
8249  firstcharptr      place to put the first required character
8250  firstcharflagsptr place to put the first character flags, or a negative number
8251  reqcharptr        place to put the last required character
8252  reqcharflagsptr   place to put the last required character flags, or a negative number
8253  bcptr             pointer to the chain of currently open branches
8254  cd                points to the data block with tables pointers etc.
8255  lengthptr         NULL during the real compile phase
8256                    points to length accumulator during pre-compile phase
8257
8258Returns:            TRUE on success
8259*/
8260
8261static BOOL
8262compile_regex(int options, pcre_uchar **codeptr, const pcre_uchar **ptrptr,
8263  int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
8264  int cond_depth,
8265  pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
8266  pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
8267  branch_chain *bcptr, compile_data *cd, int *lengthptr)
8268{
8269const pcre_uchar *ptr = *ptrptr;
8270pcre_uchar *code = *codeptr;
8271pcre_uchar *last_branch = code;
8272pcre_uchar *start_bracket = code;
8273pcre_uchar *reverse_count = NULL;
8274open_capitem capitem;
8275int capnumber = 0;
8276pcre_uint32 firstchar, reqchar;
8277pcre_int32 firstcharflags, reqcharflags;
8278pcre_uint32 branchfirstchar, branchreqchar;
8279pcre_int32 branchfirstcharflags, branchreqcharflags;
8280int length;
8281unsigned int orig_bracount;
8282unsigned int max_bracount;
8283branch_chain bc;
8284size_t save_hwm_offset;
8285
8286/* If set, call the external function that checks for stack availability. */
8287
8288if (PUBL(stack_guard) != NULL && PUBL(stack_guard)())
8289  {
8290  *errorcodeptr= ERR85;
8291  return FALSE;
8292  }
8293
8294/* Miscellaneous initialization */
8295
8296bc.outer = bcptr;
8297bc.current_branch = code;
8298
8299firstchar = reqchar = 0;
8300firstcharflags = reqcharflags = REQ_UNSET;
8301
8302save_hwm_offset = cd->hwm - cd->start_workspace;
8303
8304/* Accumulate the length for use in the pre-compile phase. Start with the
8305length of the BRA and KET and any extra bytes that are required at the
8306beginning. We accumulate in a local variable to save frequent testing of
8307lenthptr for NULL. We cannot do this by looking at the value of code at the
8308start and end of each alternative, because compiled items are discarded during
8309the pre-compile phase so that the work space is not exceeded. */
8310
8311length = 2 + 2*LINK_SIZE + skipbytes;
8312
8313/* WARNING: If the above line is changed for any reason, you must also change
8314the code that abstracts option settings at the start of the pattern and makes
8315them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
8316pre-compile phase to find out whether anything has yet been compiled or not. */
8317
8318/* If this is a capturing subpattern, add to the chain of open capturing items
8319so that we can detect them if (*ACCEPT) is encountered. This is also used to
8320detect groups that contain recursive back references to themselves. Note that
8321only OP_CBRA need be tested here; changing this opcode to one of its variants,
8322e.g. OP_SCBRAPOS, happens later, after the group has been compiled. */
8323
8324if (*code == OP_CBRA)
8325  {
8326  capnumber = GET2(code, 1 + LINK_SIZE);
8327  capitem.number = capnumber;
8328  capitem.next = cd->open_caps;
8329  capitem.flag = FALSE;
8330  cd->open_caps = &capitem;
8331  }
8332
8333/* Offset is set zero to mark that this bracket is still open */
8334
8335PUT(code, 1, 0);
8336code += 1 + LINK_SIZE + skipbytes;
8337
8338/* Loop for each alternative branch */
8339
8340orig_bracount = max_bracount = cd->bracount;
8341for (;;)
8342  {
8343  /* For a (?| group, reset the capturing bracket count so that each branch
8344  uses the same numbers. */
8345
8346  if (reset_bracount) cd->bracount = orig_bracount;
8347
8348  /* Set up dummy OP_REVERSE if lookbehind assertion */
8349
8350  if (lookbehind)
8351    {
8352    *code++ = OP_REVERSE;
8353    reverse_count = code;
8354    PUTINC(code, 0, 0);
8355    length += 1 + LINK_SIZE;
8356    }
8357
8358  /* Now compile the branch; in the pre-compile phase its length gets added
8359  into the length. */
8360
8361  if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstchar,
8362        &branchfirstcharflags, &branchreqchar, &branchreqcharflags, &bc,
8363        cond_depth, cd, (lengthptr == NULL)? NULL : &length))
8364    {
8365    *ptrptr = ptr;
8366    return FALSE;
8367    }
8368
8369  /* Keep the highest bracket count in case (?| was used and some branch
8370  has fewer than the rest. */
8371
8372  if (cd->bracount > max_bracount) max_bracount = cd->bracount;
8373
8374  /* In the real compile phase, there is some post-processing to be done. */
8375
8376  if (lengthptr == NULL)
8377    {
8378    /* If this is the first branch, the firstchar and reqchar values for the
8379    branch become the values for the regex. */
8380
8381    if (*last_branch != OP_ALT)
8382      {
8383      firstchar = branchfirstchar;
8384      firstcharflags = branchfirstcharflags;
8385      reqchar = branchreqchar;
8386      reqcharflags = branchreqcharflags;
8387      }
8388
8389    /* If this is not the first branch, the first char and reqchar have to
8390    match the values from all the previous branches, except that if the
8391    previous value for reqchar didn't have REQ_VARY set, it can still match,
8392    and we set REQ_VARY for the regex. */
8393
8394    else
8395      {
8396      /* If we previously had a firstchar, but it doesn't match the new branch,
8397      we have to abandon the firstchar for the regex, but if there was
8398      previously no reqchar, it takes on the value of the old firstchar. */
8399
8400      if (firstcharflags >= 0 &&
8401          (firstcharflags != branchfirstcharflags || firstchar != branchfirstchar))
8402        {
8403        if (reqcharflags < 0)
8404          {
8405          reqchar = firstchar;
8406          reqcharflags = firstcharflags;
8407          }
8408        firstcharflags = REQ_NONE;
8409        }
8410
8411      /* If we (now or from before) have no firstchar, a firstchar from the
8412      branch becomes a reqchar if there isn't a branch reqchar. */
8413
8414      if (firstcharflags < 0 && branchfirstcharflags >= 0 && branchreqcharflags < 0)
8415        {
8416        branchreqchar = branchfirstchar;
8417        branchreqcharflags = branchfirstcharflags;
8418        }
8419
8420      /* Now ensure that the reqchars match */
8421
8422      if (((reqcharflags & ~REQ_VARY) != (branchreqcharflags & ~REQ_VARY)) ||
8423          reqchar != branchreqchar)
8424        reqcharflags = REQ_NONE;
8425      else
8426        {
8427        reqchar = branchreqchar;
8428        reqcharflags |= branchreqcharflags; /* To "or" REQ_VARY */
8429        }
8430      }
8431
8432    /* If lookbehind, check that this branch matches a fixed-length string, and
8433    put the length into the OP_REVERSE item. Temporarily mark the end of the
8434    branch with OP_END. If the branch contains OP_RECURSE, the result is -3
8435    because there may be forward references that we can't check here. Set a
8436    flag to cause another lookbehind check at the end. Why not do it all at the
8437    end? Because common, erroneous checks are picked up here and the offset of
8438    the problem can be shown. */
8439
8440    if (lookbehind)
8441      {
8442      int fixed_length;
8443      *code = OP_END;
8444      fixed_length = find_fixedlength(last_branch,  (options & PCRE_UTF8) != 0,
8445        FALSE, cd, NULL);
8446      DPRINTF(("fixed length = %d\n", fixed_length));
8447      if (fixed_length == -3)
8448        {
8449        cd->check_lookbehind = TRUE;
8450        }
8451      else if (fixed_length < 0)
8452        {
8453        *errorcodeptr = (fixed_length == -2)? ERR36 :
8454                        (fixed_length == -4)? ERR70: ERR25;
8455        *ptrptr = ptr;
8456        return FALSE;
8457        }
8458      else
8459        {
8460        if (fixed_length > cd->max_lookbehind)
8461          cd->max_lookbehind = fixed_length;
8462        PUT(reverse_count, 0, fixed_length);
8463        }
8464      }
8465    }
8466
8467  /* Reached end of expression, either ')' or end of pattern. In the real
8468  compile phase, go back through the alternative branches and reverse the chain
8469  of offsets, with the field in the BRA item now becoming an offset to the
8470  first alternative. If there are no alternatives, it points to the end of the
8471  group. The length in the terminating ket is always the length of the whole
8472  bracketed item. Return leaving the pointer at the terminating char. */
8473
8474  if (*ptr != CHAR_VERTICAL_LINE)
8475    {
8476    if (lengthptr == NULL)
8477      {
8478      int branch_length = (int)(code - last_branch);
8479      do
8480        {
8481        int prev_length = GET(last_branch, 1);
8482        PUT(last_branch, 1, branch_length);
8483        branch_length = prev_length;
8484        last_branch -= branch_length;
8485        }
8486      while (branch_length > 0);
8487      }
8488
8489    /* Fill in the ket */
8490
8491    *code = OP_KET;
8492    PUT(code, 1, (int)(code - start_bracket));
8493    code += 1 + LINK_SIZE;
8494
8495    /* If it was a capturing subpattern, check to see if it contained any
8496    recursive back references. If so, we must wrap it in atomic brackets.
8497    Because we are moving code along, we must ensure that any pending recursive
8498    references are updated. In any event, remove the block from the chain. */
8499
8500    if (capnumber > 0)
8501      {
8502      if (cd->open_caps->flag)
8503        {
8504        *code = OP_END;
8505        adjust_recurse(start_bracket, 1 + LINK_SIZE,
8506          (options & PCRE_UTF8) != 0, cd, save_hwm_offset);
8507        memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
8508          IN_UCHARS(code - start_bracket));
8509        *start_bracket = OP_ONCE;
8510        code += 1 + LINK_SIZE;
8511        PUT(start_bracket, 1, (int)(code - start_bracket));
8512        *code = OP_KET;
8513        PUT(code, 1, (int)(code - start_bracket));
8514        code += 1 + LINK_SIZE;
8515        length += 2 + 2*LINK_SIZE;
8516        }
8517      cd->open_caps = cd->open_caps->next;
8518      }
8519
8520    /* Retain the highest bracket number, in case resetting was used. */
8521
8522    cd->bracount = max_bracount;
8523
8524    /* Set values to pass back */
8525
8526    *codeptr = code;
8527    *ptrptr = ptr;
8528    *firstcharptr = firstchar;
8529    *firstcharflagsptr = firstcharflags;
8530    *reqcharptr = reqchar;
8531    *reqcharflagsptr = reqcharflags;
8532    if (lengthptr != NULL)
8533      {
8534      if (OFLOW_MAX - *lengthptr < length)
8535        {
8536        *errorcodeptr = ERR20;
8537        return FALSE;
8538        }
8539      *lengthptr += length;
8540      }
8541    return TRUE;
8542    }
8543
8544  /* Another branch follows. In the pre-compile phase, we can move the code
8545  pointer back to where it was for the start of the first branch. (That is,
8546  pretend that each branch is the only one.)
8547
8548  In the real compile phase, insert an ALT node. Its length field points back
8549  to the previous branch while the bracket remains open. At the end the chain
8550  is reversed. It's done like this so that the start of the bracket has a
8551  zero offset until it is closed, making it possible to detect recursion. */
8552
8553  if (lengthptr != NULL)
8554    {
8555    code = *codeptr + 1 + LINK_SIZE + skipbytes;
8556    length += 1 + LINK_SIZE;
8557    }
8558  else
8559    {
8560    *code = OP_ALT;
8561    PUT(code, 1, (int)(code - last_branch));
8562    bc.current_branch = last_branch = code;
8563    code += 1 + LINK_SIZE;
8564    }
8565
8566  ptr++;
8567  }
8568/* Control never reaches here */
8569}
8570
8571
8572
8573
8574/*************************************************
8575*          Check for anchored expression         *
8576*************************************************/
8577
8578/* Try to find out if this is an anchored regular expression. Consider each
8579alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8580all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8581it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8582be found, because ^ generates OP_CIRCM in that mode.
8583
8584We can also consider a regex to be anchored if OP_SOM starts all its branches.
8585This is the code for \G, which means "match at start of match position, taking
8586into account the match offset".
8587
8588A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8589because that will try the rest of the pattern at all possible matching points,
8590so there is no point trying again.... er ....
8591
8592.... except when the .* appears inside capturing parentheses, and there is a
8593subsequent back reference to those parentheses. We haven't enough information
8594to catch that case precisely.
8595
8596At first, the best we could do was to detect when .* was in capturing brackets
8597and the highest back reference was greater than or equal to that level.
8598However, by keeping a bitmap of the first 31 back references, we can catch some
8599of the more common cases more precisely.
8600
8601... A second exception is when the .* appears inside an atomic group, because
8602this prevents the number of characters it matches from being adjusted.
8603
8604Arguments:
8605  code           points to start of expression (the bracket)
8606  bracket_map    a bitmap of which brackets we are inside while testing; this
8607                  handles up to substring 31; after that we just have to take
8608                  the less precise approach
8609  cd             points to the compile data block
8610  atomcount      atomic group level
8611
8612Returns:     TRUE or FALSE
8613*/
8614
8615static BOOL
8616is_anchored(register const pcre_uchar *code, unsigned int bracket_map,
8617  compile_data *cd, int atomcount)
8618{
8619do {
8620   const pcre_uchar *scode = first_significant_code(
8621     code + PRIV(OP_lengths)[*code], FALSE);
8622   register int op = *scode;
8623
8624   /* Non-capturing brackets */
8625
8626   if (op == OP_BRA  || op == OP_BRAPOS ||
8627       op == OP_SBRA || op == OP_SBRAPOS)
8628     {
8629     if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
8630     }
8631
8632   /* Capturing brackets */
8633
8634   else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8635            op == OP_SCBRA || op == OP_SCBRAPOS)
8636     {
8637     int n = GET2(scode, 1+LINK_SIZE);
8638     int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
8639     if (!is_anchored(scode, new_map, cd, atomcount)) return FALSE;
8640     }
8641
8642   /* Positive forward assertions and conditions */
8643
8644   else if (op == OP_ASSERT || op == OP_COND)
8645     {
8646     if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
8647     }
8648
8649   /* Atomic groups */
8650
8651   else if (op == OP_ONCE || op == OP_ONCE_NC)
8652     {
8653     if (!is_anchored(scode, bracket_map, cd, atomcount + 1))
8654       return FALSE;
8655     }
8656
8657   /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8658   it isn't in brackets that are or may be referenced or inside an atomic
8659   group. */
8660
8661   else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8662             op == OP_TYPEPOSSTAR))
8663     {
8664     if (scode[1] != OP_ALLANY || (bracket_map & cd->backref_map) != 0 ||
8665         atomcount > 0 || cd->had_pruneorskip)
8666       return FALSE;
8667     }
8668
8669   /* Check for explicit anchoring */
8670
8671   else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8672
8673   code += GET(code, 1);
8674   }
8675while (*code == OP_ALT);   /* Loop for each alternative */
8676return TRUE;
8677}
8678
8679
8680
8681/*************************************************
8682*         Check for starting with ^ or .*        *
8683*************************************************/
8684
8685/* This is called to find out if every branch starts with ^ or .* so that
8686"first char" processing can be done to speed things up in multiline
8687matching and for non-DOTALL patterns that start with .* (which must start at
8688the beginning or after \n). As in the case of is_anchored() (see above), we
8689have to take account of back references to capturing brackets that contain .*
8690because in that case we can't make the assumption. Also, the appearance of .*
8691inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not
8692count, because once again the assumption no longer holds.
8693
8694Arguments:
8695  code           points to start of expression (the bracket)
8696  bracket_map    a bitmap of which brackets we are inside while testing; this
8697                  handles up to substring 31; after that we just have to take
8698                  the less precise approach
8699  cd             points to the compile data
8700  atomcount      atomic group level
8701
8702Returns:         TRUE or FALSE
8703*/
8704
8705static BOOL
8706is_startline(const pcre_uchar *code, unsigned int bracket_map,
8707  compile_data *cd, int atomcount)
8708{
8709do {
8710   const pcre_uchar *scode = first_significant_code(
8711     code + PRIV(OP_lengths)[*code], FALSE);
8712   register int op = *scode;
8713
8714   /* If we are at the start of a conditional assertion group, *both* the
8715   conditional assertion *and* what follows the condition must satisfy the test
8716   for start of line. Other kinds of condition fail. Note that there may be an
8717   auto-callout at the start of a condition. */
8718
8719   if (op == OP_COND)
8720     {
8721     scode += 1 + LINK_SIZE;
8722     if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8723     switch (*scode)
8724       {
8725       case OP_CREF:
8726       case OP_DNCREF:
8727       case OP_RREF:
8728       case OP_DNRREF:
8729       case OP_DEF:
8730       case OP_FAIL:
8731       return FALSE;
8732
8733       default:     /* Assertion */
8734       if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
8735       do scode += GET(scode, 1); while (*scode == OP_ALT);
8736       scode += 1 + LINK_SIZE;
8737       break;
8738       }
8739     scode = first_significant_code(scode, FALSE);
8740     op = *scode;
8741     }
8742
8743   /* Non-capturing brackets */
8744
8745   if (op == OP_BRA  || op == OP_BRAPOS ||
8746       op == OP_SBRA || op == OP_SBRAPOS)
8747     {
8748     if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
8749     }
8750
8751   /* Capturing brackets */
8752
8753   else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8754            op == OP_SCBRA || op == OP_SCBRAPOS)
8755     {
8756     int n = GET2(scode, 1+LINK_SIZE);
8757     int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
8758     if (!is_startline(scode, new_map, cd, atomcount)) return FALSE;
8759     }
8760
8761   /* Positive forward assertions */
8762
8763   else if (op == OP_ASSERT)
8764     {
8765     if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
8766     }
8767
8768   /* Atomic brackets */
8769
8770   else if (op == OP_ONCE || op == OP_ONCE_NC)
8771     {
8772     if (!is_startline(scode, bracket_map, cd, atomcount + 1)) return FALSE;
8773     }
8774
8775   /* .* means "start at start or after \n" if it isn't in atomic brackets or
8776   brackets that may be referenced, as long as the pattern does not contain
8777   *PRUNE or *SKIP, because these break the feature. Consider, for example,
8778   /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the
8779   start of a line. */
8780
8781   else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8782     {
8783     if (scode[1] != OP_ANY || (bracket_map & cd->backref_map) != 0 ||
8784         atomcount > 0 || cd->had_pruneorskip)
8785       return FALSE;
8786     }
8787
8788   /* Check for explicit circumflex; anything else gives a FALSE result. Note
8789   in particular that this includes atomic brackets OP_ONCE and OP_ONCE_NC
8790   because the number of characters matched by .* cannot be adjusted inside
8791   them. */
8792
8793   else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8794
8795   /* Move on to the next alternative */
8796
8797   code += GET(code, 1);
8798   }
8799while (*code == OP_ALT);  /* Loop for each alternative */
8800return TRUE;
8801}
8802
8803
8804
8805/*************************************************
8806*       Check for asserted fixed first char      *
8807*************************************************/
8808
8809/* During compilation, the "first char" settings from forward assertions are
8810discarded, because they can cause conflicts with actual literals that follow.
8811However, if we end up without a first char setting for an unanchored pattern,
8812it is worth scanning the regex to see if there is an initial asserted first
8813char. If all branches start with the same asserted char, or with a
8814non-conditional bracket all of whose alternatives start with the same asserted
8815char (recurse ad lib), then we return that char, with the flags set to zero or
8816REQ_CASELESS; otherwise return zero with REQ_NONE in the flags.
8817
8818Arguments:
8819  code       points to start of expression (the bracket)
8820  flags      points to the first char flags, or to REQ_NONE
8821  inassert   TRUE if in an assertion
8822
8823Returns:     the fixed first char, or 0 with REQ_NONE in flags
8824*/
8825
8826static pcre_uint32
8827find_firstassertedchar(const pcre_uchar *code, pcre_int32 *flags,
8828  BOOL inassert)
8829{
8830register pcre_uint32 c = 0;
8831int cflags = REQ_NONE;
8832
8833*flags = REQ_NONE;
8834do {
8835   pcre_uint32 d;
8836   int dflags;
8837   int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
8838             *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
8839   const pcre_uchar *scode = first_significant_code(code + 1+LINK_SIZE + xl,
8840     TRUE);
8841   register pcre_uchar op = *scode;
8842
8843   switch(op)
8844     {
8845     default:
8846     return 0;
8847
8848     case OP_BRA:
8849     case OP_BRAPOS:
8850     case OP_CBRA:
8851     case OP_SCBRA:
8852     case OP_CBRAPOS:
8853     case OP_SCBRAPOS:
8854     case OP_ASSERT:
8855     case OP_ONCE:
8856     case OP_ONCE_NC:
8857     d = find_firstassertedchar(scode, &dflags, op == OP_ASSERT);
8858     if (dflags < 0)
8859       return 0;
8860     if (cflags < 0) { c = d; cflags = dflags; } else if (c != d || cflags != dflags) return 0;
8861     break;
8862
8863     case OP_EXACT:
8864     scode += IMM2_SIZE;
8865     /* Fall through */
8866
8867     case OP_CHAR:
8868     case OP_PLUS:
8869     case OP_MINPLUS:
8870     case OP_POSPLUS:
8871     if (!inassert) return 0;
8872     if (cflags < 0) { c = scode[1]; cflags = 0; }
8873       else if (c != scode[1]) return 0;
8874     break;
8875
8876     case OP_EXACTI:
8877     scode += IMM2_SIZE;
8878     /* Fall through */
8879
8880     case OP_CHARI:
8881     case OP_PLUSI:
8882     case OP_MINPLUSI:
8883     case OP_POSPLUSI:
8884     if (!inassert) return 0;
8885     if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; }
8886       else if (c != scode[1]) return 0;
8887     break;
8888     }
8889
8890   code += GET(code, 1);
8891   }
8892while (*code == OP_ALT);
8893
8894*flags = cflags;
8895return c;
8896}
8897
8898
8899
8900/*************************************************
8901*     Add an entry to the name/number table      *
8902*************************************************/
8903
8904/* This function is called between compiling passes to add an entry to the
8905name/number table, maintaining alphabetical order. Checking for permitted
8906and forbidden duplicates has already been done.
8907
8908Arguments:
8909  cd           the compile data block
8910  name         the name to add
8911  length       the length of the name
8912  groupno      the group number
8913
8914Returns:       nothing
8915*/
8916
8917static void
8918add_name(compile_data *cd, const pcre_uchar *name, int length,
8919  unsigned int groupno)
8920{
8921int i;
8922pcre_uchar *slot = cd->name_table;
8923
8924for (i = 0; i < cd->names_found; i++)
8925  {
8926  int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(length));
8927  if (crc == 0 && slot[IMM2_SIZE+length] != 0)
8928    crc = -1; /* Current name is a substring */
8929
8930  /* Make space in the table and break the loop for an earlier name. For a
8931  duplicate or later name, carry on. We do this for duplicates so that in the
8932  simple case (when ?(| is not used) they are in order of their numbers. In all
8933  cases they are in the order in which they appear in the pattern. */
8934
8935  if (crc < 0)
8936    {
8937    memmove(slot + cd->name_entry_size, slot,
8938      IN_UCHARS((cd->names_found - i) * cd->name_entry_size));
8939    break;
8940    }
8941
8942  /* Continue the loop for a later or duplicate name */
8943
8944  slot += cd->name_entry_size;
8945  }
8946
8947PUT2(slot, 0, groupno);
8948memcpy(slot + IMM2_SIZE, name, IN_UCHARS(length));
8949slot[IMM2_SIZE + length] = 0;
8950cd->names_found++;
8951}
8952
8953
8954
/*************************************************
*        Compile a Regular Expression            *
*************************************************/

/* This is the original compile entry point, kept for backwards compatibility.
It has no way of returning a numerical error code, so it just hands everything
on to the newer function (pcre_compile2() or its 16-bit or 32-bit equivalent)
with NULL as the error code pointer, and returns whatever that returns. One
definition is given for each library width; the build selects which of them is
compiled.

Arguments:
  pattern       the regular expression
  options       various option bits
  errorptr      pointer to pointer to error text
  erroroffset   ptr offset in pattern where error was detected
  tables        pointer to character tables or NULL

Returns:        pointer to compiled data block, or NULL on error,
                with errorptr and erroroffset set
*/

#if defined COMPILE_PCRE8

PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
pcre_compile(const char *pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}

#elif defined COMPILE_PCRE16

PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
pcre16_compile(PCRE_SPTR16 pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
return pcre16_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}

#elif defined COMPILE_PCRE32

PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
pcre32_compile(PCRE_SPTR32 pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
return pcre32_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}

#endif
8999
9000
9001#if defined COMPILE_PCRE8
9002PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
9003pcre_compile2(const char *pattern, int options, int *errorcodeptr,
9004  const char **errorptr, int *erroroffset, const unsigned char *tables)
9005#elif defined COMPILE_PCRE16
9006PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
9007pcre16_compile2(PCRE_SPTR16 pattern, int options, int *errorcodeptr,
9008  const char **errorptr, int *erroroffset, const unsigned char *tables)
9009#elif defined COMPILE_PCRE32
9010PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
9011pcre32_compile2(PCRE_SPTR32 pattern, int options, int *errorcodeptr,
9012  const char **errorptr, int *erroroffset, const unsigned char *tables)
9013#endif
9014{
9015REAL_PCRE *re;
9016int length = 1;  /* For final END opcode */
9017pcre_int32 firstcharflags, reqcharflags;
9018pcre_uint32 firstchar, reqchar;
9019pcre_uint32 limit_match = PCRE_UINT32_MAX;
9020pcre_uint32 limit_recursion = PCRE_UINT32_MAX;
9021int newline;
9022int errorcode = 0;
9023int skipatstart = 0;
9024BOOL utf;
9025BOOL never_utf = FALSE;
9026size_t size;
9027pcre_uchar *code;
9028const pcre_uchar *codestart;
9029const pcre_uchar *ptr;
9030compile_data compile_block;
9031compile_data *cd = &compile_block;
9032
9033/* This space is used for "compiling" into during the first phase, when we are
9034computing the amount of memory that is needed. Compiled items are thrown away
9035as soon as possible, so that a fairly large buffer should be sufficient for
9036this purpose. The same space is used in the second phase for remembering where
9037to fill in forward references to subpatterns. That may overflow, in which case
9038new memory is obtained from malloc(). */
9039
9040pcre_uchar cworkspace[COMPILE_WORK_SIZE];
9041
9042/* This vector is used for remembering name groups during the pre-compile. In a
9043similar way to cworkspace, it can be expanded using malloc() if necessary. */
9044
9045named_group named_groups[NAMED_GROUP_LIST_SIZE];
9046
9047/* Set this early so that early errors get offset 0. */
9048
9049ptr = (const pcre_uchar *)pattern;
9050
9051/* We can't pass back an error message if errorptr is NULL; I guess the best we
9052can do is just return NULL, but we can set a code value if there is a code
9053pointer. */
9054
9055if (errorptr == NULL)
9056  {
9057  if (errorcodeptr != NULL) *errorcodeptr = 99;
9058  return NULL;
9059  }
9060
9061*errorptr = NULL;
9062if (errorcodeptr != NULL) *errorcodeptr = ERR0;
9063
9064/* However, we can give a message for this error */
9065
9066if (erroroffset == NULL)
9067  {
9068  errorcode = ERR16;
9069  goto PCRE_EARLY_ERROR_RETURN2;
9070  }
9071
9072*erroroffset = 0;
9073
9074/* Set up pointers to the individual character tables */
9075
9076if (tables == NULL) tables = PRIV(default_tables);
9077cd->lcc = tables + lcc_offset;
9078cd->fcc = tables + fcc_offset;
9079cd->cbits = tables + cbits_offset;
9080cd->ctypes = tables + ctypes_offset;
9081
9082/* Check that all undefined public option bits are zero */
9083
9084if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
9085  {
9086  errorcode = ERR17;
9087  goto PCRE_EARLY_ERROR_RETURN;
9088  }
9089
9090/* If PCRE_NEVER_UTF is set, remember it. */
9091
9092if ((options & PCRE_NEVER_UTF) != 0) never_utf = TRUE;
9093
9094/* Check for global one-time settings at the start of the pattern, and remember
9095the offset for later. */
9096
9097cd->external_flags = 0;   /* Initialize here for LIMIT_MATCH/RECURSION */
9098
9099while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
9100       ptr[skipatstart+1] == CHAR_ASTERISK)
9101  {
9102  int newnl = 0;
9103  int newbsr = 0;
9104
9105/* For completeness and backward compatibility, (*UTFn) is supported in the
9106relevant libraries, but (*UTF) is generic and always supported. Note that
9107PCRE_UTF8 == PCRE_UTF16 == PCRE_UTF32. */
9108
9109#ifdef COMPILE_PCRE8
9110  if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF8_RIGHTPAR, 5) == 0)
9111    { skipatstart += 7; options |= PCRE_UTF8; continue; }
9112#endif
9113#ifdef COMPILE_PCRE16
9114  if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF16_RIGHTPAR, 6) == 0)
9115    { skipatstart += 8; options |= PCRE_UTF16; continue; }
9116#endif
9117#ifdef COMPILE_PCRE32
9118  if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF32_RIGHTPAR, 6) == 0)
9119    { skipatstart += 8; options |= PCRE_UTF32; continue; }
9120#endif
9121
9122  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 4) == 0)
9123    { skipatstart += 6; options |= PCRE_UTF8; continue; }
9124  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)
9125    { skipatstart += 6; options |= PCRE_UCP; continue; }
9126  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_AUTO_POSSESS_RIGHTPAR, 16) == 0)
9127    { skipatstart += 18; options |= PCRE_NO_AUTO_POSSESS; continue; }
9128  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
9129    { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }
9130
9131  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LIMIT_MATCH_EQ, 12) == 0)
9132    {
9133    pcre_uint32 c = 0;
9134    int p = skipatstart + 14;
9135    while (isdigit(ptr[p]))
9136      {
9137      if (c > PCRE_UINT32_MAX / 10 - 1) break;   /* Integer overflow */
9138      c = c*10 + ptr[p++] - CHAR_0;
9139      }
9140    if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break;
9141    if (c < limit_match)
9142      {
9143      limit_match = c;
9144      cd->external_flags |= PCRE_MLSET;
9145      }
9146    skipatstart = p;
9147    continue;
9148    }
9149
9150  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LIMIT_RECURSION_EQ, 16) == 0)
9151    {
9152    pcre_uint32 c = 0;
9153    int p = skipatstart + 18;
9154    while (isdigit(ptr[p]))
9155      {
9156      if (c > PCRE_UINT32_MAX / 10 - 1) break;   /* Integer overflow check */
9157      c = c*10 + ptr[p++] - CHAR_0;
9158      }
9159    if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break;
9160    if (c < limit_recursion)
9161      {
9162      limit_recursion = c;
9163      cd->external_flags |= PCRE_RLSET;
9164      }
9165    skipatstart = p;
9166    continue;
9167    }
9168
9169  if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CR_RIGHTPAR, 3) == 0)
9170    { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
9171  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LF_RIGHTPAR, 3)  == 0)
9172    { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
9173  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CRLF_RIGHTPAR, 5)  == 0)
9174    { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
9175  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANY_RIGHTPAR, 4) == 0)
9176    { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
9177  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANYCRLF_RIGHTPAR, 8) == 0)
9178    { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
9179
9180  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
9181    { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
9182  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
9183    { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
9184
9185  if (newnl != 0)
9186    options = (options & ~PCRE_NEWLINE_BITS) | newnl;
9187  else if (newbsr != 0)
9188    options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
9189  else break;
9190  }
9191
9192/* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
9193utf = (options & PCRE_UTF8) != 0;
9194if (utf && never_utf)
9195  {
9196  errorcode = ERR78;
9197  goto PCRE_EARLY_ERROR_RETURN2;
9198  }
9199
9200/* Can't support UTF unless PCRE has been compiled to include the code. The
9201return of an error code from PRIV(valid_utf)() is a new feature, introduced in
9202release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
9203not used here. */
9204
9205#ifdef SUPPORT_UTF
9206if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 &&
9207     (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)
9208  {
9209#if defined COMPILE_PCRE8
9210  errorcode = ERR44;
9211#elif defined COMPILE_PCRE16
9212  errorcode = ERR74;
9213#elif defined COMPILE_PCRE32
9214  errorcode = ERR77;
9215#endif
9216  goto PCRE_EARLY_ERROR_RETURN2;
9217  }
9218#else
9219if (utf)
9220  {
9221  errorcode = ERR32;
9222  goto PCRE_EARLY_ERROR_RETURN;
9223  }
9224#endif
9225
9226/* Can't support UCP unless PCRE has been compiled to include the code. */
9227
9228#ifndef SUPPORT_UCP
9229if ((options & PCRE_UCP) != 0)
9230  {
9231  errorcode = ERR67;
9232  goto PCRE_EARLY_ERROR_RETURN;
9233  }
9234#endif
9235
9236/* Check validity of \R options. */
9237
9238if ((options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) ==
9239     (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
9240  {
9241  errorcode = ERR56;
9242  goto PCRE_EARLY_ERROR_RETURN;
9243  }
9244
9245/* Handle different types of newline. The three bits give seven cases. The
9246current code allows for fixed one- or two-byte sequences, plus "any" and
9247"anycrlf". */
9248
9249switch (options & PCRE_NEWLINE_BITS)
9250  {
9251  case 0: newline = NEWLINE; break;   /* Build-time default */
9252  case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
9253  case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
9254  case PCRE_NEWLINE_CR+
9255       PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
9256  case PCRE_NEWLINE_ANY: newline = -1; break;
9257  case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
9258  default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
9259  }
9260
9261if (newline == -2)
9262  {
9263  cd->nltype = NLTYPE_ANYCRLF;
9264  }
9265else if (newline < 0)
9266  {
9267  cd->nltype = NLTYPE_ANY;
9268  }
9269else
9270  {
9271  cd->nltype = NLTYPE_FIXED;
9272  if (newline > 255)
9273    {
9274    cd->nllen = 2;
9275    cd->nl[0] = (newline >> 8) & 255;
9276    cd->nl[1] = newline & 255;
9277    }
9278  else
9279    {
9280    cd->nllen = 1;
9281    cd->nl[0] = newline;
9282    }
9283  }
9284
9285/* Maximum back reference and backref bitmap. The bitmap records up to 31 back
9286references to help in deciding whether (.*) can be treated as anchored or not.
9287*/
9288
9289cd->top_backref = 0;
9290cd->backref_map = 0;
9291
9292/* Reflect pattern for debugging output */
9293
9294DPRINTF(("------------------------------------------------------------------\n"));
9295#ifdef PCRE_DEBUG
9296print_puchar(stdout, (PCRE_PUCHAR)pattern);
9297#endif
9298DPRINTF(("\n"));
9299
9300/* Pretend to compile the pattern while actually just accumulating the length
9301of memory required. This behaviour is triggered by passing a non-NULL final
9302argument to compile_regex(). We pass a block of workspace (cworkspace) for it
9303to compile parts of the pattern into; the compiled code is discarded when it is
9304no longer needed, so hopefully this workspace will never overflow, though there
9305is a test for its doing so. */
9306
9307cd->bracount = cd->final_bracount = 0;
9308cd->names_found = 0;
9309cd->name_entry_size = 0;
9310cd->name_table = NULL;
9311cd->dupnames = FALSE;
9312cd->dupgroups = FALSE;
9313cd->namedrefcount = 0;
9314cd->start_code = cworkspace;
9315cd->hwm = cworkspace;
9316cd->iscondassert = FALSE;
9317cd->start_workspace = cworkspace;
9318cd->workspace_size = COMPILE_WORK_SIZE;
9319cd->named_groups = named_groups;
9320cd->named_group_list_size = NAMED_GROUP_LIST_SIZE;
9321cd->start_pattern = (const pcre_uchar *)pattern;
9322cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
9323cd->req_varyopt = 0;
9324cd->parens_depth = 0;
9325cd->assert_depth = 0;
9326cd->max_lookbehind = 0;
9327cd->external_options = options;
9328cd->open_caps = NULL;
9329
9330/* Now do the pre-compile. On error, errorcode will be set non-zero, so we
9331don't need to look at the result of the function here. The initial options have
9332been put into the cd block so that they can be changed if an option setting is
9333found within the regex right at the beginning. Bringing initial option settings
9334outside can help speed up starting point checks. */
9335
9336ptr += skipatstart;
9337code = cworkspace;
9338*code = OP_BRA;
9339
9340(void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,
9341  FALSE, 0, 0, &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL,
9342  cd, &length);
9343if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
9344
9345DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
9346  (int)(cd->hwm - cworkspace)));
9347
9348if (length > MAX_PATTERN_SIZE)
9349  {
9350  errorcode = ERR20;
9351  goto PCRE_EARLY_ERROR_RETURN;
9352  }
9353
9354/* Compute the size of the data block for storing the compiled pattern. Integer
9355overflow should no longer be possible because nowadays we limit the maximum
9356value of cd->names_found and cd->name_entry_size. */
9357
9358size = sizeof(REAL_PCRE) +
9359  (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);
9360
9361/* Get the memory. */
9362
9363re = (REAL_PCRE *)(PUBL(malloc))(size);
9364if (re == NULL)
9365  {
9366  errorcode = ERR21;
9367  goto PCRE_EARLY_ERROR_RETURN;
9368  }
9369
9370/* Put in the magic number, and save the sizes, initial options, internal
9371flags, and character table pointer. NULL is used for the default character
9372tables. The nullpad field is at the end; it's there to help in the case when a
9373regex compiled on a system with 4-byte pointers is run on another with 8-byte
9374pointers. */
9375
9376re->magic_number = MAGIC_NUMBER;
9377re->size = (int)size;
9378re->options = cd->external_options;
9379re->flags = cd->external_flags;
9380re->limit_match = limit_match;
9381re->limit_recursion = limit_recursion;
9382re->first_char = 0;
9383re->req_char = 0;
9384re->name_table_offset = sizeof(REAL_PCRE) / sizeof(pcre_uchar);
9385re->name_entry_size = cd->name_entry_size;
9386re->name_count = cd->names_found;
9387re->ref_count = 0;
9388re->tables = (tables == PRIV(default_tables))? NULL : tables;
9389re->nullpad = NULL;
9390#ifdef COMPILE_PCRE32
9391re->dummy = 0;
9392#else
9393re->dummy1 = re->dummy2 = re->dummy3 = 0;
9394#endif
9395
9396/* The starting points of the name/number translation table and of the code are
9397passed around in the compile data block. The start/end pattern and initial
9398options are already set from the pre-compile phase, as is the name_entry_size
9399field. Reset the bracket count and the names_found field. Also reset the hwm
9400field; this time it's used for remembering forward references to subpatterns.
9401*/
9402
9403cd->final_bracount = cd->bracount;  /* Save for checking forward references */
9404cd->parens_depth = 0;
9405cd->assert_depth = 0;
9406cd->bracount = 0;
9407cd->max_lookbehind = 0;
9408cd->name_table = (pcre_uchar *)re + re->name_table_offset;
9409codestart = cd->name_table + re->name_entry_size * re->name_count;
9410cd->start_code = codestart;
9411cd->hwm = (pcre_uchar *)(cd->start_workspace);
9412cd->iscondassert = FALSE;
9413cd->req_varyopt = 0;
9414cd->had_accept = FALSE;
9415cd->had_pruneorskip = FALSE;
9416cd->check_lookbehind = FALSE;
9417cd->open_caps = NULL;
9418
9419/* If any named groups were found, create the name/number table from the list
9420created in the first pass. */
9421
9422if (cd->names_found > 0)
9423  {
9424  int i = cd->names_found;
9425  named_group *ng = cd->named_groups;
9426  cd->names_found = 0;
9427  for (; i > 0; i--, ng++)
9428    add_name(cd, ng->name, ng->length, ng->number);
9429  if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
9430    (PUBL(free))((void *)cd->named_groups);
9431  }
9432
9433/* Set up a starting, non-extracting bracket, then compile the expression. On
9434error, errorcode will be set non-zero, so we don't need to look at the result
9435of the function here. */
9436
9437ptr = (const pcre_uchar *)pattern + skipatstart;
9438code = (pcre_uchar *)codestart;
9439*code = OP_BRA;
9440(void)compile_regex(re->options, &code, &ptr, &errorcode, FALSE, FALSE, 0, 0,
9441  &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL, cd, NULL);
9442re->top_bracket = cd->bracount;
9443re->top_backref = cd->top_backref;
9444re->max_lookbehind = cd->max_lookbehind;
9445re->flags = cd->external_flags | PCRE_MODE;
9446
9447if (cd->had_accept)
9448  {
9449  reqchar = 0;              /* Must disable after (*ACCEPT) */
9450  reqcharflags = REQ_NONE;
9451  }
9452
9453/* If not reached end of pattern on success, there's an excess bracket. */
9454
9455if (errorcode == 0 && *ptr != CHAR_NULL) errorcode = ERR22;
9456
9457/* Fill in the terminating state and check for disastrous overflow, but
9458if debugging, leave the test till after things are printed out. */
9459
9460*code++ = OP_END;
9461
9462#ifndef PCRE_DEBUG
9463if (code - codestart > length) errorcode = ERR23;
9464#endif
9465
9466#ifdef SUPPORT_VALGRIND
9467/* If the estimated length exceeds the really used length, mark the extra
9468allocated memory as unaddressable, so that any out-of-bound reads can be
9469detected. */
9470VALGRIND_MAKE_MEM_NOACCESS(code, (length - (code - codestart)) * sizeof(pcre_uchar));
9471#endif
9472
9473/* Fill in any forward references that are required. There may be repeated
9474references; optimize for them, as searching a large regex takes time. */
9475
9476if (cd->hwm > cd->start_workspace)
9477  {
9478  int prev_recno = -1;
9479  const pcre_uchar *groupptr = NULL;
9480  while (errorcode == 0 && cd->hwm > cd->start_workspace)
9481    {
9482    int offset, recno;
9483    cd->hwm -= LINK_SIZE;
9484    offset = GET(cd->hwm, 0);
9485
9486    /* Check that the hwm handling hasn't gone wrong. This whole area is
9487    rewritten in PCRE2 because there are some obscure cases. */
9488
9489    if (offset == 0 || codestart[offset-1] != OP_RECURSE)
9490      {
9491      errorcode = ERR10;
9492      break;
9493      }
9494
9495    recno = GET(codestart, offset);
9496    if (recno != prev_recno)
9497      {
9498      groupptr = PRIV(find_bracket)(codestart, utf, recno);
9499      prev_recno = recno;
9500      }
9501    if (groupptr == NULL) errorcode = ERR53;
9502      else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart));
9503    }
9504  }
9505
9506/* If the workspace had to be expanded, free the new memory. Set the pointer to
9507NULL to indicate that forward references have been filled in. */
9508
9509if (cd->workspace_size > COMPILE_WORK_SIZE)
9510  (PUBL(free))((void *)cd->start_workspace);
9511cd->start_workspace = NULL;
9512
9513/* Give an error if there's back reference to a non-existent capturing
9514subpattern. */
9515
9516if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
9517
9518/* Unless disabled, check whether any single character iterators can be
9519auto-possessified. The function overwrites the appropriate opcode values, so
9520the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
9521used in this code because at least one compiler gives a warning about loss of
9522"const" attribute if the cast (pcre_uchar *)codestart is used directly in the
9523function call. */
9524
9525if (errorcode == 0 && (options & PCRE_NO_AUTO_POSSESS) == 0)
9526  {
9527  pcre_uchar *temp = (pcre_uchar *)codestart;
9528  auto_possessify(temp, utf, cd);
9529  }
9530
9531/* If there were any lookbehind assertions that contained OP_RECURSE
9532(recursions or subroutine calls), a flag is set for them to be checked here,
9533because they may contain forward references. Actual recursions cannot be fixed
9534length, but subroutine calls can. It is done like this so that those without
OP_RECURSE that are not fixed length get a diagnostic with a useful offset. The
9536exceptional ones forgo this. We scan the pattern to check that they are fixed
9537length, and set their lengths. */
9538
9539if (errorcode == 0 && cd->check_lookbehind)
9540  {
9541  pcre_uchar *cc = (pcre_uchar *)codestart;
9542
9543  /* Loop, searching for OP_REVERSE items, and process those that do not have
9544  their length set. (Actually, it will also re-process any that have a length
9545  of zero, but that is a pathological case, and it does no harm.) When we find
9546  one, we temporarily terminate the branch it is in while we scan it. */
9547
9548  for (cc = (pcre_uchar *)PRIV(find_bracket)(codestart, utf, -1);
9549       cc != NULL;
9550       cc = (pcre_uchar *)PRIV(find_bracket)(cc, utf, -1))
9551    {
9552    if (GET(cc, 1) == 0)
9553      {
9554      int fixed_length;
9555      pcre_uchar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
9556      int end_op = *be;
9557      *be = OP_END;
9558      fixed_length = find_fixedlength(cc, (re->options & PCRE_UTF8) != 0, TRUE,
9559        cd, NULL);
9560      *be = end_op;
9561      DPRINTF(("fixed length = %d\n", fixed_length));
9562      if (fixed_length < 0)
9563        {
9564        errorcode = (fixed_length == -2)? ERR36 :
9565                    (fixed_length == -4)? ERR70 : ERR25;
9566        break;
9567        }
9568      if (fixed_length > cd->max_lookbehind) cd->max_lookbehind = fixed_length;
9569      PUT(cc, 1, fixed_length);
9570      }
9571    cc += 1 + LINK_SIZE;
9572    }
9573  }
9574
9575/* Failed to compile, or error while post-processing */
9576
9577if (errorcode != 0)
9578  {
9579  (PUBL(free))(re);
9580  PCRE_EARLY_ERROR_RETURN:
9581  *erroroffset = (int)(ptr - (const pcre_uchar *)pattern);
9582  PCRE_EARLY_ERROR_RETURN2:
9583  *errorptr = find_error_text(errorcode);
9584  if (errorcodeptr != NULL) *errorcodeptr = errorcode;
9585  return NULL;
9586  }
9587
9588/* If the anchored option was not passed, set the flag if we can determine that
9589the pattern is anchored by virtue of ^ characters or \A or anything else, such
9590as starting with non-atomic .* when DOTALL is set and there are no occurrences
9591of *PRUNE or *SKIP.
9592
9593Otherwise, if we know what the first byte has to be, save it, because that
9594speeds up unanchored matches no end. If not, see if we can set the
9595PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
9596start with ^, and also when all branches start with non-atomic .* for
9597non-DOTALL matches when *PRUNE and *SKIP are not present. */
9598
9599if ((re->options & PCRE_ANCHORED) == 0)
9600  {
9601  if (is_anchored(codestart, 0, cd, 0)) re->options |= PCRE_ANCHORED;
9602  else
9603    {
9604    if (firstcharflags < 0)
9605      firstchar = find_firstassertedchar(codestart, &firstcharflags, FALSE);
9606    if (firstcharflags >= 0)   /* Remove caseless flag for non-caseable chars */
9607      {
9608#if defined COMPILE_PCRE8
9609      re->first_char = firstchar & 0xff;
9610#elif defined COMPILE_PCRE16
9611      re->first_char = firstchar & 0xffff;
9612#elif defined COMPILE_PCRE32
9613      re->first_char = firstchar;
9614#endif
9615      if ((firstcharflags & REQ_CASELESS) != 0)
9616        {
9617#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
9618        /* We ignore non-ASCII first chars in 8 bit mode. */
9619        if (utf)
9620          {
9621          if (re->first_char < 128)
9622            {
9623            if (cd->fcc[re->first_char] != re->first_char)
9624              re->flags |= PCRE_FCH_CASELESS;
9625            }
9626          else if (UCD_OTHERCASE(re->first_char) != re->first_char)
9627            re->flags |= PCRE_FCH_CASELESS;
9628          }
9629        else
9630#endif
9631        if (MAX_255(re->first_char)
9632            && cd->fcc[re->first_char] != re->first_char)
9633          re->flags |= PCRE_FCH_CASELESS;
9634        }
9635
9636      re->flags |= PCRE_FIRSTSET;
9637      }
9638
9639    else if (is_startline(codestart, 0, cd, 0)) re->flags |= PCRE_STARTLINE;
9640    }
9641  }
9642
9643/* For an anchored pattern, we use the "required byte" only if it follows a
9644variable length item in the regex. Remove the caseless flag for non-caseable
9645bytes. */
9646
9647if (reqcharflags >= 0 &&
9648     ((re->options & PCRE_ANCHORED) == 0 || (reqcharflags & REQ_VARY) != 0))
9649  {
9650#if defined COMPILE_PCRE8
9651  re->req_char = reqchar & 0xff;
9652#elif defined COMPILE_PCRE16
9653  re->req_char = reqchar & 0xffff;
9654#elif defined COMPILE_PCRE32
9655  re->req_char = reqchar;
9656#endif
9657  if ((reqcharflags & REQ_CASELESS) != 0)
9658    {
9659#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
9660    /* We ignore non-ASCII required chars in 8 bit mode. */
9661    if (utf)
9662      {
9663      if (re->req_char < 128)
9664        {
9665        if (cd->fcc[re->req_char] != re->req_char)
9666          re->flags |= PCRE_RCH_CASELESS;
9667        }
9668      else if (UCD_OTHERCASE(re->req_char) != re->req_char)
9669        re->flags |= PCRE_RCH_CASELESS;
9670      }
9671    else
9672#endif
9673    if (MAX_255(re->req_char) && cd->fcc[re->req_char] != re->req_char)
9674      re->flags |= PCRE_RCH_CASELESS;
9675    }
9676
9677  re->flags |= PCRE_REQCHSET;
9678  }
9679
9680/* Print out the compiled data if debugging is enabled. This is never the
9681case when building a production library. */
9682
9683#ifdef PCRE_DEBUG
9684printf("Length = %d top_bracket = %d top_backref = %d\n",
9685  length, re->top_bracket, re->top_backref);
9686
9687printf("Options=%08x\n", re->options);
9688
9689if ((re->flags & PCRE_FIRSTSET) != 0)
9690  {
9691  pcre_uchar ch = re->first_char;
9692  const char *caseless =
9693    ((re->flags & PCRE_FCH_CASELESS) == 0)? "" : " (caseless)";
9694  if (PRINTABLE(ch)) printf("First char = %c%s\n", ch, caseless);
9695    else printf("First char = \\x%02x%s\n", ch, caseless);
9696  }
9697
9698if ((re->flags & PCRE_REQCHSET) != 0)
9699  {
9700  pcre_uchar ch = re->req_char;
9701  const char *caseless =
9702    ((re->flags & PCRE_RCH_CASELESS) == 0)? "" : " (caseless)";
9703  if (PRINTABLE(ch)) printf("Req char = %c%s\n", ch, caseless);
9704    else printf("Req char = \\x%02x%s\n", ch, caseless);
9705  }
9706
9707#if defined COMPILE_PCRE8
9708pcre_printint((pcre *)re, stdout, TRUE);
9709#elif defined COMPILE_PCRE16
9710pcre16_printint((pcre *)re, stdout, TRUE);
9711#elif defined COMPILE_PCRE32
9712pcre32_printint((pcre *)re, stdout, TRUE);
9713#endif
9714
9715/* This check is done here in the debugging case so that the code that
9716was compiled can be seen. */
9717
9718if (code - codestart > length)
9719  {
9720  (PUBL(free))(re);
9721  *errorptr = find_error_text(ERR23);
9722  *erroroffset = ptr - (pcre_uchar *)pattern;
9723  if (errorcodeptr != NULL) *errorcodeptr = ERR23;
9724  return NULL;
9725  }
9726#endif   /* PCRE_DEBUG */
9727
9728/* Check for a pattern that can match an empty string, so that this information
9729can be provided to applications. */
9730
9731do
9732  {
9733  if (could_be_empty_branch(codestart, code, utf, cd, NULL))
9734    {
9735    re->flags |= PCRE_MATCH_EMPTY;
9736    break;
9737    }
9738  codestart += GET(codestart, 1);
9739  }
9740while (*codestart == OP_ALT);
9741
9742#if defined COMPILE_PCRE8
9743return (pcre *)re;
9744#elif defined COMPILE_PCRE16
9745return (pcre16 *)re;
9746#elif defined COMPILE_PCRE32
9747return (pcre32 *)re;
9748#endif
9749}
9750
9751/* End of pcre_compile.c */
9752