1/*************************************************
2*      Perl-Compatible Regular Expressions       *
3*************************************************/
4
5/*
6This is a library of functions to support regular expressions whose syntax
7and semantics are as close as possible to those of the Perl 5 language. See
8the file Tech.Notes for some information on the internals.
9
10Written by: Philip Hazel <ph10@cam.ac.uk>
11
12           Copyright (c) 1997-2004 University of Cambridge
13
14-----------------------------------------------------------------------------
15Redistribution and use in source and binary forms, with or without
16modification, are permitted provided that the following conditions are met:
17
18    * Redistributions of source code must retain the above copyright notice,
19      this list of conditions and the following disclaimer.
20
21    * Redistributions in binary form must reproduce the above copyright
22      notice, this list of conditions and the following disclaimer in the
23      documentation and/or other materials provided with the distribution.
24
25    * Neither the name of the University of Cambridge nor the names of its
26      contributors may be used to endorse or promote products derived from
27      this software without specific prior written permission.
28
29THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
30AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
33LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
34CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
35SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
36INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
37CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
38ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
39POSSIBILITY OF SUCH DAMAGE.
40-----------------------------------------------------------------------------
41*/
42
43
44/* Define DEBUG to get debugging output on stdout. */
45/* #define DEBUG */
46
47/* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
48inline, and there are *still* stupid compilers about that don't like indented
49pre-processor statements. I suppose it's only been 10 years... */
50
#ifdef DEBUG
#define DPRINTF(p) printf p      /* p supplies its own parentheses: DPRINTF(("n=%d\n", n)) */
#else
#define DPRINTF(p) /*nothing*/   /* expands to nothing, so the arguments are never evaluated */
#endif
56
57/* Include the internals header, which itself includes "config.h", the Standard
58C headers, and the external pcre header. */
59
60#include "internal.h"
61
62/* If Unicode Property support is wanted, include a private copy of the
63function that does it, and the table that translates names to numbers. */
64
65#ifdef SUPPORT_UCP
66#include "ucp.c"
67#include "ucptypetable.c"
68#endif
69
70/* Maximum number of items on the nested bracket stacks at compile time. This
71applies to the nesting of all kinds of parentheses. It does not limit
72un-nested, non-capturing parentheses. This number can be made bigger if
73necessary - it is used to dimension one int and one unsigned char vector at
74compile time. */
75
#define BRASTACK_SIZE 200   /* number of entries in each nested-bracket stack */
77
78
79/* Maximum number of ints of offset to save on the stack for recursive calls.
80If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81because the offset vector is always a multiple of 3 long. */
82
#define REC_STACK_SAVE_MAX 30   /* counted in ints; 30 = 10 * 3, keep it a multiple of 3 */
84
85
86/* The maximum remaining length of subject we are prepared to search for a
87req_byte match. */
88
#define REQ_BYTE_MAX 1000   /* limit on the remaining subject length that is searched */
90
91
92/* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
93the definition is next to the definition of the opcodes in internal.h. */
94
/* The whole initializer list comes from the OP_LENGTHS macro, which is not
defined in this file; the element count is therefore fixed by that macro. */
static const uschar OP_lengths[] = { OP_LENGTHS };
96
97/* Min and max values for the common repeats; for the maxima, 0 => infinity */
98
/* The two tables are read in parallel: entry i of rep_min and entry i of
rep_max describe the same repeat. With 0 in rep_max meaning "no upper limit",
the six (min, max) pairs are (0,inf) (0,inf) (1,inf) (1,inf) (0,1) (0,1).
NOTE(review): presumably these pair up as greedy/minimizing forms of '*', '+'
and '?' - confirm against the code that indexes the tables. */
static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
101
102/* Table for handling escaped characters in the range '0'-'z'. Positive returns
103are simple data values; negative values are for special things like \d and so
104on. Zero means further processing is needed (for things like \x), or the escape
105is invalid. */
106
107#if !EBCDIC   /* This is the "normal" table for ASCII systems */
/* 75 entries, one for each character from '0' to 'z' inclusive, eight per row
as labelled on the right. NOTE(review): presumably indexed by (c - '0') -
confirm in the escape-handling code. */
static const short int escapes[] = {
     0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
     0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
   '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
     0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
-ESC_P, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
-ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
   '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
     0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */
-ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */
     0,      0, -ESC_z                                            /* x - z */
};
120
121#else         /* This is the "abnormal" table for EBCDIC systems */
/* EBCDIC variant of the table. The label at the start of each row is the
hexadecimal code point of that row's first entry; the table starts at 0x48 and
each row holds eight entries, so the last entry corresponds to 0xFF. */
static const short int escapes[] = {
/*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
/*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
/*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
/*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
/*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
/*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
/*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
/*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,
/*  90 */     0,     0,      0,     'l',      0, ESC_n,      0, -ESC_p,
/*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
/*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,
/*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
/*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
/*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
/*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,
/*  D8 */-ESC_Q,     0,      0,       0,      0,     0,      0,      0,
/*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,
/*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
/*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
};
147#endif
148
149
/* Tables of names of POSIX character classes and their lengths. The list is
terminated by a zero length entry. The first three must be alpha, lower, upper,
as this is assumed for handling case independence. */
153
/* Fourteen names. This array has no terminating entry of its own; the end of
the list is marked by the zero in posix_name_lengths. */
static const char *const posix_names[] = {
  "alpha", "lower", "upper",
  "alnum", "ascii", "blank", "cntrl", "digit", "graph",
  "print", "punct", "space", "word",  "xdigit" };
158
/* Entry i is the length of posix_names[i]; the fifteenth entry (0) is the
end-of-list marker. */
static const uschar posix_name_lengths[] = {
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
161
162/* Table of class bit maps for each POSIX class; up to three may be combined
163to form the class. The table for [:blank:] is dynamically modified to remove
164the vertical space characters. */
165
/* Three ints per class, with the rows in the same order as posix_names. Each
value is a cbit_xxx identifier; -1 fills the slots of a row that uses fewer
than three maps. */
static const int posix_class_maps[] = {
  cbit_lower, cbit_upper, -1,             /* alpha */
  cbit_lower, -1,         -1,             /* lower */
  cbit_upper, -1,         -1,             /* upper */
  cbit_digit, cbit_lower, cbit_upper,     /* alnum */
  cbit_print, cbit_cntrl, -1,             /* ascii */
  cbit_space, -1,         -1,             /* blank - a GNU extension */
  cbit_cntrl, -1,         -1,             /* cntrl */
  cbit_digit, -1,         -1,             /* digit */
  cbit_graph, -1,         -1,             /* graph */
  cbit_print, -1,         -1,             /* print */
  cbit_punct, -1,         -1,             /* punct */
  cbit_space, -1,         -1,             /* space */
  cbit_word,  -1,         -1,             /* word - a Perl extension */
  cbit_xdigit,-1,         -1              /* xdigit */
};
182
183/* Table to identify digits and hex digits. This is used when compiling
184patterns. Note that the tables in chartables are dependent on the locale, and
185may mark arbitrary characters as digits - but the PCRE compiling code expects
186to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
187a private table here. It costs 256 bytes, but it is a lot faster than doing
188character value tests (at least in some simple cases I timed), and in some
189applications one wants PCRE to compile efficiently as well as match
190efficiently.
191
192For convenience, we use the same bit definitions as in chartables:
193
194  0x04   decimal digit
195  0x08   hexadecimal digit
196
197Then we can use ctype_digit and ctype_xdigit in the code. */
198
199#if !EBCDIC    /* This is the "normal" case, for ASCII systems */
/* 256 entries, one per byte value. '0'-'9' carry 0x0c (decimal digit and hex
digit), 'A'-'F' and 'a'-'f' carry 0x08 (hex digit only), everything else 0. */
static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
234
235#else          /* This is the "abnormal" case, for EBCDIC systems */
/* EBCDIC variant: 256 entries, one per byte value. The hex-letter rows start
at 0x80 (lower case) and 0xC0 (upper case) and the digits at 0xF0; the column
on the far right of some rows is the hexadecimal value of that row's first
entry. */
static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
270
/* 256 entries, one per EBCDIC byte value, compiled only when EBCDIC is set.
NOTE(review): the flag values (0x01, 0x02, 0x04, 0x08, 0x10, 0x80) presumably
use the same ctype bit assignments as the chartables - confirm against
internal.h. */
static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
304#endif
305
306
/* Forward declaration to allow mutual recursion */
308
/* Prototype only: the parameters are unnamed here, so their meaning has to be
read from the function's own header where it is defined. The function has
internal (static) linkage. */
static BOOL
  compile_regex(int, int, int *, uschar **, const uschar **, const char **,
    BOOL, int, int *, int *, branch_chain *, compile_data *);
312
313/* Structure for building a chain of data that actually lives on the
314stack, for holding the values of the subject pointer at the start of each
315subpattern, so as to detect when an empty string has been matched by a
316subpattern - to break infinite loops. When NO_RECURSE is set, these blocks
317are on the heap, not on the stack. */
318
typedef struct eptrblock {
  struct eptrblock *epb_prev;      /* link to the previous block in the chain */
  const uschar *epb_saved_eptr;    /* subject pointer value saved in this block */
} eptrblock;
323
324/* Flag bits for the match() function */
325
/* The two values are distinct single bits, so they can be OR-ed together in
one flags argument. */
#define match_condassert   0x01    /* Called to check a condition assertion */
#define match_isgroup      0x02    /* Set if start of bracketed group */
328
329/* Non-error returns from the match() function. Error returns are externally
330defined PCRE_ERROR_xxx codes, which are all negative. */
331
/* Both values are non-negative, so they cannot collide with the negative
PCRE_ERROR_xxx codes. */
#define MATCH_MATCH        1
#define MATCH_NOMATCH      0
334
335
336
337/*************************************************
338*               Global variables                 *
339*************************************************/
340
341/* PCRE is thread-clean and doesn't use any global variables in the normal
342sense. However, it calls memory allocation and free functions via the four
343indirections below, and it can optionally do callouts. These values can be
344changed by the caller, but are shared between all threads. However, when
345compiling for Virtual Pascal, things are done differently (see pcre.in). */
346
/* Definitions of the five public hook pointers, compiled only when VPCOMPAT
is not defined. The two allocators default to malloc, the two releasers to
free, and pcre_callout to NULL. The C++ branch is identical except that each
definition is given C linkage. */
#ifndef VPCOMPAT
#ifdef __cplusplus
extern "C" void *(*pcre_malloc)(size_t) = malloc;
extern "C" void  (*pcre_free)(void *) = free;
extern "C" void *(*pcre_stack_malloc)(size_t) = malloc;
extern "C" void  (*pcre_stack_free)(void *) = free;
extern "C" int   (*pcre_callout)(pcre_callout_block *) = NULL;
#else
void *(*pcre_malloc)(size_t) = malloc;
void  (*pcre_free)(void *) = free;
void *(*pcre_stack_malloc)(size_t) = malloc;
void  (*pcre_stack_free)(void *) = free;
int   (*pcre_callout)(pcre_callout_block *) = NULL;
#endif
#endif
362
363
364/*************************************************
365*    Macros and tables for character handling    *
366*************************************************/
367
368/* When UTF-8 encoding is being used, a character is no longer just a single
369byte. The macros for character handling generate simple sequences when used in
370byte-mode, and more complicated ones for UTF-8 characters. */
371
372#ifndef SUPPORT_UTF8
/* Byte-mode versions, used when SUPPORT_UTF8 is not defined. Each expansion is
a single assignment that already ends in ';'. GETCHARLEN leaves len untouched
and BACKCHAR expands to nothing. */
#define GETCHAR(c, eptr) c = *eptr;
#define GETCHARINC(c, eptr) c = *eptr++;
#define GETCHARINCTEST(c, eptr) c = *eptr++;
#define GETCHARLEN(c, eptr, len) c = *eptr;
#define BACKCHAR(eptr)
378
379#else   /* SUPPORT_UTF8 */
380
381/* Get the next UTF-8 character, not advancing the pointer. This is called when
382we know we are in UTF-8 mode. */
383
/* Expands to an assignment followed by an if-block, not to a single
expression or statement. c is assigned several times and eptr is read more
than once, so neither argument should have side effects. Uses utf8_table3 and
utf8_table4. eptr itself is not modified. */
#define GETCHAR(c, eptr) \
  c = *eptr; \
  if ((c & 0xc0) == 0xc0) \
    { \
    int gcii; \
    int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int gcss = 6*gcaa; \
    c = (c & utf8_table3[gcaa]) << gcss; \
    for (gcii = 1; gcii <= gcaa; gcii++) \
      { \
      gcss -= 6; \
      c |= (eptr[gcii] & 0x3f) << gcss; \
      } \
    }
398
399/* Get the next UTF-8 character, advancing the pointer. This is called when we
400know we are in UTF-8 mode. */
401
/* Expands to an assignment followed by an if-block. eptr is incremented once
for the lead byte and once more for each additional byte, so on exit it points
just past the character. Uses utf8_table3 and utf8_table4. */
#define GETCHARINC(c, eptr) \
  c = *eptr++; \
  if ((c & 0xc0) == 0xc0) \
    { \
    int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int gcss = 6*gcaa; \
    c = (c & utf8_table3[gcaa]) << gcss; \
    while (gcaa-- > 0) \
      { \
      gcss -= 6; \
      c |= (*eptr++ & 0x3f) << gcss; \
      } \
    }
415
416/* Get the next character, testing for UTF-8 mode, and advancing the pointer */
417
/* Same as GETCHARINC, except that multi-byte decoding happens only when
md->utf8 is non-zero. The expansion refers to md by name, so a variable called
md must be in scope wherever this macro is used. */
#define GETCHARINCTEST(c, eptr) \
  c = *eptr++; \
  if (md->utf8 && (c & 0xc0) == 0xc0) \
    { \
    int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int gcss = 6*gcaa; \
    c = (c & utf8_table3[gcaa]) << gcss; \
    while (gcaa-- > 0) \
      { \
      gcss -= 6; \
      c |= (*eptr++ & 0x3f) << gcss; \
      } \
    }
431
432/* Get the next UTF-8 character, not advancing the pointer, incrementing length
433if there are extra bytes. This is called when we know we are in UTF-8 mode. */
434
/* Same decoding as GETCHAR (eptr is not advanced). In addition, when the lead
byte announces extra bytes, their count is added to len; len is left alone for
a single-byte character, so the caller has to initialize it. */
#define GETCHARLEN(c, eptr, len) \
  c = *eptr; \
  if ((c & 0xc0) == 0xc0) \
    { \
    int gcii; \
    int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int gcss = 6*gcaa; \
    c = (c & utf8_table3[gcaa]) << gcss; \
    for (gcii = 1; gcii <= gcaa; gcii++) \
      { \
      gcss -= 6; \
      c |= (eptr[gcii] & 0x3f) << gcss; \
      } \
    len += gcaa; \
    }
450
451/* If the pointer is not at the start of a character, move it back until
452it is. Called only in UTF-8 mode. */
453
/* Steps eptr back while it points at a byte of the form 10xxxxxx. The loop has
no lower bound of its own, and the expansion already ends in ';'. */
#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
455
456#endif
457
458
459
460/*************************************************
461*             Default character tables           *
462*************************************************/
463
464/* A default set of character tables is included in the PCRE binary. Its source
465is built by the maketables auxiliary program, which uses the default C ctypes
466functions, and put in the file chartables.c. These tables are used by PCRE
467whenever the caller of pcre_compile() does not provide an alternate set of
468tables. */
469
470#include "chartables.c"
471
472
473
474#ifdef SUPPORT_UTF8
475/*************************************************
476*           Tables for UTF-8 support             *
477*************************************************/
478
479/* These are the breakpoints for different numbers of bytes in a UTF-8
480character. */
481
/* Entry i is the largest value that is encoded in i+1 bytes. */
static const int utf8_table1[] =
  { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
484
485/* These are the indicator bits and the mask for the data bits to set in the
486first byte of a character, indexed by the number of additional bytes. */
487
/* Six entries each, for 0 to 5 additional bytes. utf8_table2 holds the bits
OR-ed into the first byte when encoding; utf8_table3 holds the mask applied to
the first byte when decoding. */
static const int utf8_table2[] = { 0,    0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
static const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
490
/* Table of the number of extra bytes, indexed by the first byte masked with
0x3f. The highest number for a valid UTF-8 character is in fact 0x3d. */
494
/* 64 entries: indices 0x00-0x1f give 1, 0x20-0x2f give 2, 0x30-0x37 give 3,
0x38-0x3b give 4 and 0x3c-0x3f give 5. */
static const uschar utf8_table4[] = {
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
500
501
502/*************************************************
503*       Convert character value to UTF-8         *
504*************************************************/
505
/* This function takes an integer value in the range 0 - 0x7fffffff
and encodes it as a UTF-8 character in 1 to 6 bytes.

Arguments:
  cvalue     the character value
  buffer     pointer to buffer for result - at least 6 bytes long

Returns:     number of bytes placed in the buffer
*/
515
516static int
517ord2utf8(int cvalue, uschar *buffer)
518{
519register int i, j;
520for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
521  if (cvalue <= utf8_table1[i]) break;
522buffer += i;
523for (j = i; j > 0; j--)
524 {
525 *buffer-- = 0x80 | (cvalue & 0x3f);
526 cvalue >>= 6;
527 }
528*buffer = utf8_table2[i] | cvalue;
529return i + 1;
530}
531#endif
532
533
534
535/*************************************************
536*         Print compiled regex                   *
537*************************************************/
538
539/* The code for doing this is held in a separate file that is also included in
540pcretest.c. It defines a function called print_internals(). */
541
542#ifdef DEBUG
543#include "printint.c"
544#endif
545
546
547
548/*************************************************
549*          Return version string                 *
550*************************************************/
551
552#define STRING(a)  # a
553#define XSTRING(s) STRING(s)
554
555EXPORT const char *
556pcre_version(void)
557{
558return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
559}
560
561
562
563
564/*************************************************
565*         Flip bytes in an integer               *
566*************************************************/
567
/* These functions are called when the magic number in a regex doesn't match,
in order to flip its bytes to see if we are dealing with a pattern that was
compiled on a host of different endianness. If so, they are also used to flip
the other integer fields. byteflip2() reverses the two bytes of a 16-bit
value; byteflip4() reverses the four bytes of a 32-bit value.

Argument:
  value        the number to flip

Returns:       the flipped value
*/
579
580static pcre_uint16
581byteflip2(pcre_uint16 value)
582{
583return ((value & 0x00ff) << 8) |
584       ((value & 0xff00) >> 8);
585}
586
587static pcre_uint32
588byteflip4(pcre_uint32 value)
589{
590return ((value & 0x000000ff) << 24) |
591       ((value & 0x0000ff00) <<  8) |
592       ((value & 0x00ff0000) >>  8) |
593       ((value & 0xff000000) >> 24);
594}
595
596/*************************************************
597*       Test for a byte-flipped compiled regex   *
598*************************************************/
599
/* This function is called from pcre_exec() and also from pcre_fullinfo(). Its
job is to test whether the regex is byte-flipped - that is, it was compiled on
a system of opposite endianness. The function is called only when the native
MAGIC_NUMBER test fails. If the regex is indeed flipped, we flip all the
relevant values into a different data block, and return it.

Arguments:
  re               points to the regex
  internal_re      points to a new regex block
  study            points to study data, or NULL
  internal_study   points to a new study block

Returns:           the new block if it is indeed a byte-flipped regex
                   NULL if it is not
*/
615
616static real_pcre *
617try_flipped(const real_pcre *re, real_pcre *internal_re,
618  const pcre_study_data *study, pcre_study_data *internal_study)
619{
620if (byteflip4(re->magic_number) != MAGIC_NUMBER)
621  return NULL;
622
623*internal_re = *re;           /* To copy other fields */
624internal_re->size = byteflip4(re->size);
625internal_re->options = byteflip4(re->options);
626internal_re->top_bracket = byteflip2(re->top_bracket);
627internal_re->top_backref = byteflip2(re->top_backref);
628internal_re->first_byte = byteflip2(re->first_byte);
629internal_re->req_byte = byteflip2(re->req_byte);
630internal_re->name_table_offset = byteflip2(re->name_table_offset);
631internal_re->name_entry_size = byteflip2(re->name_entry_size);
632internal_re->name_count = byteflip2(re->name_count);
633
634if (study != NULL)
635  {
636  *internal_study = *study;   /* To copy other fields */
637  internal_study->size = byteflip4(study->size);
638  internal_study->options = byteflip4(study->options);
639  }
640
641return internal_re;
642}
643
644
645
646/*************************************************
647* (Obsolete) Return info about compiled pattern  *
648*************************************************/
649
/* This is the original "info" function. It picks potentially useful data out
of the private structure, but its interface was too rigid. It remains for
backwards compatibility. The public options are passed back in an int - though
the re->options field has been expanded to a long int, all the public options
are at the low end of it, and so even on 16-bit systems this will still be OK.
Therefore, I haven't changed the API for pcre_info().
656
657Arguments:
658  argument_re   points to compiled code
659  optptr        where to pass back the options
660  first_byte    where to pass back the first character,
661                or -1 if multiline and all branches start ^,
662                or -2 otherwise
663
664Returns:        number of capturing subpatterns
665                or negative values on error
666*/
667
668EXPORT int
669pcre_info(const pcre *argument_re, int *optptr, int *first_byte)
670{
671real_pcre internal_re;
672const real_pcre *re = (const real_pcre *)argument_re;
673if (re == NULL) return PCRE_ERROR_NULL;
674if (re->magic_number != MAGIC_NUMBER)
675  {
676  re = try_flipped(re, &internal_re, NULL, NULL);
677  if (re == NULL) return PCRE_ERROR_BADMAGIC;
678  }
679if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
680if (first_byte != NULL)
681  *first_byte = ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
682     ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
683return re->top_bracket;
684}
685
686
687
688/*************************************************
689*        Return info about compiled pattern      *
690*************************************************/
691
692/* This is a newer "info" function which has an extensible interface so
693that additional items can be added compatibly.
694
695Arguments:
696  argument_re      points to compiled code
  extra_data       points to extra data, or NULL
698  what             what information is required
699  where            where to put the information
700
701Returns:           0 if data returned, negative on error
702*/
703
704EXPORT int
705pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what,
706  void *where)
707{
708real_pcre internal_re;
709pcre_study_data internal_study;
710const real_pcre *re = (const real_pcre *)argument_re;
711const pcre_study_data *study = NULL;
712
713if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
714
715if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)
716  study = (const pcre_study_data *)extra_data->study_data;
717
718if (re->magic_number != MAGIC_NUMBER)
719  {
720  re = try_flipped(re, &internal_re, study, &internal_study);
721  if (re == NULL) return PCRE_ERROR_BADMAGIC;
722  if (study != NULL) study = &internal_study;
723  }
724
725switch (what)
726  {
727  case PCRE_INFO_OPTIONS:
728  *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
729  break;
730
731  case PCRE_INFO_SIZE:
732  *((size_t *)where) = re->size;
733  break;
734
735  case PCRE_INFO_STUDYSIZE:
736  *((size_t *)where) = (study == NULL)? 0 : study->size;
737  break;
738
739  case PCRE_INFO_CAPTURECOUNT:
740  *((int *)where) = re->top_bracket;
741  break;
742
743  case PCRE_INFO_BACKREFMAX:
744  *((int *)where) = re->top_backref;
745  break;
746
747  case PCRE_INFO_FIRSTBYTE:
748  *((int *)where) =
749    ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
750    ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
751  break;
752
753  /* Make sure we pass back the pointer to the bit vector in the external
754  block, not the internal copy (with flipped integer fields). */
755
756  case PCRE_INFO_FIRSTTABLE:
757  *((const uschar **)where) =
758    (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
759      ((const pcre_study_data *)extra_data->study_data)->start_bits : NULL;
760  break;
761
762  case PCRE_INFO_LASTLITERAL:
763  *((int *)where) =
764    ((re->options & PCRE_REQCHSET) != 0)? re->req_byte : -1;
765  break;
766
767  case PCRE_INFO_NAMEENTRYSIZE:
768  *((int *)where) = re->name_entry_size;
769  break;
770
771  case PCRE_INFO_NAMECOUNT:
772  *((int *)where) = re->name_count;
773  break;
774
775  case PCRE_INFO_NAMETABLE:
776  *((const uschar **)where) = (const uschar *)re + re->name_table_offset;
777  break;
778
779  case PCRE_INFO_DEFAULT_TABLES:
780  *((const uschar **)where) = (const uschar *)pcre_default_tables;
781  break;
782
783  default: return PCRE_ERROR_BADOPTION;
784  }
785
786return 0;
787}
788
789
790
791/*************************************************
792* Return info about what features are configured *
793*************************************************/
794
795/* This is function which has an extensible interface so that additional items
796can be added compatibly.
797
798Arguments:
799  what             what information is required
800  where            where to put the information
801
802Returns:           0 if data returned, negative on error
803*/
804
805EXPORT int
806pcre_config(int what, void *where)
807{
808switch (what)
809  {
810  case PCRE_CONFIG_UTF8:
811#ifdef SUPPORT_UTF8
812  *((int *)where) = 1;
813#else
814  *((int *)where) = 0;
815#endif
816  break;
817
818  case PCRE_CONFIG_UNICODE_PROPERTIES:
819#ifdef SUPPORT_UCP
820  *((int *)where) = 1;
821#else
822  *((int *)where) = 0;
823#endif
824  break;
825
826  case PCRE_CONFIG_NEWLINE:
827  *((int *)where) = NEWLINE;
828  break;
829
830  case PCRE_CONFIG_LINK_SIZE:
831  *((int *)where) = LINK_SIZE;
832  break;
833
834  case PCRE_CONFIG_POSIX_MALLOC_THRESHOLD:
835  *((int *)where) = POSIX_MALLOC_THRESHOLD;
836  break;
837
838  case PCRE_CONFIG_MATCH_LIMIT:
839  *((unsigned int *)where) = MATCH_LIMIT;
840  break;
841
842  case PCRE_CONFIG_STACKRECURSE:
843#ifdef NO_RECURSE
844  *((int *)where) = 0;
845#else
846  *((int *)where) = 1;
847#endif
848  break;
849
850  default: return PCRE_ERROR_BADOPTION;
851  }
852
853return 0;
854}
855
856
857
#ifdef DEBUG
/*************************************************
*        Debugging function to print chars       *
*************************************************/

/* Print a sequence of chars in printable format, stopping at the end of the
subject if that is requested. Characters for which isprint() is true are
written as themselves; all others are written as \x followed by two hex digits.

Arguments:
  p           points to characters
  length      number to print
  is_subject  TRUE if printing from within md->start_subject
  md          pointer to matching data block; it is read only when is_subject
                is TRUE

Returns:     nothing
*/

static void
pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
{
int ch;

/* When printing from the subject, never run past md->end_subject. */

if (is_subject && md->end_subject - p < length) length = md->end_subject - p;

for (; length > 0; length--)
  {
  ch = *p++;
  if (isprint(ch))
    printf("%c", ch);
  else
    printf("\\x%02x", ch);
  }
}
#endif
884
885
886
887
888/*************************************************
889*            Handle escapes                      *
890*************************************************/
891
892/* This function is called when a \ has been encountered. It either returns a
893positive value for a simple escape such as \n, or a negative value which
894encodes one of the more complicated things such as \d. When UTF-8 is enabled,
895a positive value greater than 255 may be returned. On entry, ptr is pointing at
896the \. On exit, it is on the final character of the escape sequence.
897
898Arguments:
899  ptrptr     points to the pattern position pointer
900  errorptr   points to the pointer to the error message
901  bracount   number of previous extracting brackets
902  options    the options bits
903  isclass    TRUE if inside a character class
904
905Returns:     zero or positive => a data character
906             negative => a special escape sequence
907             on error, errorptr is set
908*/
909
910static int
911check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
912  int options, BOOL isclass)
913{
914const uschar *ptr = *ptrptr;
915int c, i;
916
917/* If backslash is at the end of the pattern, it's an error. */
918
919c = *(++ptr);
920if (c == 0) *errorptr = ERR1;
921
922/* Non-alphamerics are literals. For digits or letters, do an initial lookup in
923a table. A non-zero result is something that can be returned immediately.
924Otherwise further processing may be required. */
925
926#if !EBCDIC    /* ASCII coding */
927else if (c < '0' || c > 'z') {}                           /* Not alphameric */
928else if ((i = escapes[c - '0']) != 0) c = i;
929
930#else          /* EBCDIC coding */
931else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */
932else if ((i = escapes[c - 0x48]) != 0)  c = i;
933#endif
934
935/* Escapes that need further processing, or are illegal. */
936
937else
938  {
939  const uschar *oldptr;
940  switch (c)
941    {
942    /* A number of Perl escapes are not handled by PCRE. We give an explicit
943    error. */
944
945    case 'l':
946    case 'L':
947    case 'N':
948    case 'u':
949    case 'U':
950    *errorptr = ERR37;
951    break;
952
953    /* The handling of escape sequences consisting of a string of digits
954    starting with one that is not zero is not straightforward. By experiment,
955    the way Perl works seems to be as follows:
956
957    Outside a character class, the digits are read as a decimal number. If the
958    number is less than 10, or if there are that many previous extracting
959    left brackets, then it is a back reference. Otherwise, up to three octal
960    digits are read to form an escaped byte. Thus \123 is likely to be octal
961    123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
962    value is greater than 377, the least significant 8 bits are taken. Inside a
963    character class, \ followed by a digit is always an octal number. */
964
965    case '1': case '2': case '3': case '4': case '5':
966    case '6': case '7': case '8': case '9':
967
968    if (!isclass)
969      {
970      oldptr = ptr;
971      c -= '0';
972      while ((digitab[ptr[1]] & ctype_digit) != 0)
973        c = c * 10 + *(++ptr) - '0';
974      if (c < 10 || c <= bracount)
975        {
976        c = -(ESC_REF + c);
977        break;
978        }
979      ptr = oldptr;      /* Put the pointer back and fall through */
980      }
981
982    /* Handle an octal number following \. If the first digit is 8 or 9, Perl
983    generates a binary zero byte and treats the digit as a following literal.
984    Thus we have to pull back the pointer by one. */
985
986    if ((c = *ptr) >= '8')
987      {
988      ptr--;
989      c = 0;
990      break;
991      }
992
993    /* \0 always starts an octal number, but we may drop through to here with a
994    larger first octal digit. */
995
996    case '0':
997    c -= '0';
998    while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
999        c = c * 8 + *(++ptr) - '0';
1000    c &= 255;     /* Take least significant 8 bits */
1001    break;
1002
1003    /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
1004    which can be greater than 0xff, but only if the ddd are hex digits. */
1005
1006    case 'x':
1007#ifdef SUPPORT_UTF8
1008    if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
1009      {
1010      const uschar *pt = ptr + 2;
1011      register int count = 0;
1012      c = 0;
1013      while ((digitab[*pt] & ctype_xdigit) != 0)
1014        {
1015        int cc = *pt++;
1016        count++;
1017#if !EBCDIC    /* ASCII coding */
1018        if (cc >= 'a') cc -= 32;               /* Convert to upper case */
1019        c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
1020#else          /* EBCDIC coding */
1021        if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
1022        c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
1023#endif
1024        }
1025      if (*pt == '}')
1026        {
1027        if (c < 0 || count > 8) *errorptr = ERR34;
1028        ptr = pt;
1029        break;
1030        }
1031      /* If the sequence of hex digits does not end with '}', then we don't
1032      recognize this construct; fall through to the normal \x handling. */
1033      }
1034#endif
1035
1036    /* Read just a single hex char */
1037
1038    c = 0;
1039    while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
1040      {
1041      int cc;                               /* Some compilers don't like ++ */
1042      cc = *(++ptr);                        /* in initializers */
1043#if !EBCDIC    /* ASCII coding */
1044      if (cc >= 'a') cc -= 32;              /* Convert to upper case */
1045      c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
1046#else          /* EBCDIC coding */
1047      if (cc <= 'z') cc += 64;              /* Convert to upper case */
1048      c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
1049#endif
1050      }
1051    break;
1052
1053    /* Other special escapes not starting with a digit are straightforward */
1054
1055    case 'c':
1056    c = *(++ptr);
1057    if (c == 0)
1058      {
1059      *errorptr = ERR2;
1060      return 0;
1061      }
1062
1063    /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
1064    is ASCII-specific, but then the whole concept of \cx is ASCII-specific.
1065    (However, an EBCDIC equivalent has now been added.) */
1066
1067#if !EBCDIC    /* ASCII coding */
1068    if (c >= 'a' && c <= 'z') c -= 32;
1069    c ^= 0x40;
1070#else          /* EBCDIC coding */
1071    if (c >= 'a' && c <= 'z') c += 64;
1072    c ^= 0xC0;
1073#endif
1074    break;
1075
1076    /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1077    other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
1078    for Perl compatibility, it is a literal. This code looks a bit odd, but
1079    there used to be some cases other than the default, and there may be again
1080    in future, so I haven't "optimized" it. */
1081
1082    default:
1083    if ((options & PCRE_EXTRA) != 0) switch(c)
1084      {
1085      default:
1086      *errorptr = ERR3;
1087      break;
1088      }
1089    break;
1090    }
1091  }
1092
1093*ptrptr = ptr;
1094return c;
1095}
1096
1097
1098
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* This function is called after \P or \p has been encountered, provided that
PCRE is compiled with support for Unicode properties. On entry, ptrptr is
pointing at the P or p. On exit, it is pointing at the final character of the
escape sequence.

The property name is either the single character that follows, or one or two
characters enclosed in {}, optionally preceded by ^ inside the braces. The name
is looked up in the utt table by binary chop, so that table must be in strcmp()
order of its name fields.

Argument:
  ptrptr     points to the pattern position pointer
  negptr     points to a boolean that is set TRUE for negation else FALSE;
               it is always written, including on the error returns
  errorptr   points to the pointer to the error message

Returns:     value from ucp_type_table, or -1 for an invalid type
               (ERR46 for a malformed sequence, ERR47 for an unknown name)
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, const char **errorptr)
{
int c, i, bot, top;
const uschar *ptr = *ptrptr;
char name[4];

/* Give the negation flag a defined value before any return can happen, so
that it is never left unset when \P or \p is the last thing in the pattern. */

*negptr = FALSE;

c = *(++ptr);
if (c == 0) goto ERROR_RETURN;

/* \P or \p can be followed by a one- or two-character name in {}, optionally
preceded by ^ for negation. */

if (c == '{')
  {
  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }

  /* Read up to three characters; a valid name has met its '}' by then. At
  most name[0] to name[2] are written here, so name[3] is never needed. */

  for (i = 0; i <= 2; i++)
    {
    c = *(++ptr);
    if (c == 0) goto ERROR_RETURN;
    if (c == '}') break;
    name[i] = c;
    }
  if (c !='}')   /* Try to distinguish error cases */
    {
    while (*(++ptr) != 0 && *ptr != '}');
    if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;
    }
  name[i] = 0;
  }

/* Otherwise there is just one following character */

else
  {
  name[0] = c;
  name[1] = 0;
  }

*ptrptr = ptr;

/* Search for a recognized property name using binary chop */

bot = 0;
top = sizeof(utt)/sizeof(ucp_type_table);

while (bot < top)
  {
  i = (bot + top)/2;
  c = strcmp(name, utt[i].name);
  if (c == 0) return utt[i].value;
  if (c > 0) bot = i + 1; else top = i;
  }

/* The sequence was well formed, but the name is not in the table */

UNKNOWN_RETURN:
*errorptr = ERR47;
*ptrptr = ptr;
return -1;

/* The sequence itself is malformed */

ERROR_RETURN:
*errorptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
1188
1189
1190
1191
1192/*************************************************
1193*            Check for counted repeat            *
1194*************************************************/
1195
1196/* This function is called when a '{' is encountered in a place where it might
1197start a quantifier. It looks ahead to see if it really is a quantifier or not.
1198It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1199where the ddds are digits.
1200
1201Arguments:
1202  p         pointer to the first char after '{'
1203
1204Returns:    TRUE or FALSE
1205*/
1206
1207static BOOL
1208is_counted_repeat(const uschar *p)
1209{
1210if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1211while ((digitab[*p] & ctype_digit) != 0) p++;
1212if (*p == '}') return TRUE;
1213
1214if (*p++ != ',') return FALSE;
1215if (*p == '}') return TRUE;
1216
1217if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1218while ((digitab[*p] & ctype_digit) != 0) p++;
1219
1220return (*p == '}');
1221}
1222
1223
1224
1225/*************************************************
1226*         Read repeat counts                     *
1227*************************************************/
1228
1229/* Read an item of the form {n,m} and return the values. This is called only
1230after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1231so the syntax is guaranteed to be correct, but we need to check the values.
1232
1233Arguments:
1234  p          pointer to first char after '{'
1235  minp       pointer to int for min
1236  maxp       pointer to int for max
1237             returned as -1 if no max
1238  errorptr   points to pointer to error message
1239
1240Returns:     pointer to '}' on success;
1241             current ptr on error, with errorptr set
1242*/
1243
1244static const uschar *
1245read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
1246{
1247int min = 0;
1248int max = -1;
1249
1250/* Read the minimum value and do a paranoid check: a negative value indicates
1251an integer overflow. */
1252
1253while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
1254if (min < 0 || min > 65535)
1255  {
1256  *errorptr = ERR5;
1257  return p;
1258  }
1259
1260/* Read the maximum value if there is one, and again do a paranoid on its size.
1261Also, max must not be less than min. */
1262
1263if (*p == '}') max = min; else
1264  {
1265  if (*(++p) != '}')
1266    {
1267    max = 0;
1268    while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
1269    if (max < 0 || max > 65535)
1270      {
1271      *errorptr = ERR5;
1272      return p;
1273      }
1274    if (max < min)
1275      {
1276      *errorptr = ERR4;
1277      return p;
1278      }
1279    }
1280  }
1281
1282/* Fill in the required variables, and pass back the pointer to the terminating
1283'}'. */
1284
1285*minp = min;
1286*maxp = max;
1287return p;
1288}
1289
1290
1291
1292/*************************************************
1293*      Find first significant op code            *
1294*************************************************/
1295
1296/* This is called by several functions that scan a compiled expression looking
1297for a fixed first character, or an anchoring op code etc. It skips over things
1298that do not influence this. For some calls, a change of option is important.
1299For some calls, it makes sense to skip negative forward and all backward
1300assertions, and also the \b assertion; for others it does not.
1301
1302Arguments:
1303  code         pointer to the start of the group
1304  options      pointer to external options
1305  optbit       the option bit whose changing is significant, or
1306                 zero if none are
1307  skipassert   TRUE if certain assertions are to be skipped
1308
1309Returns:       pointer to the first significant opcode
1310*/
1311
1312static const uschar*
1313first_significant_code(const uschar *code, int *options, int optbit,
1314  BOOL skipassert)
1315{
1316for (;;)
1317  {
1318  switch ((int)*code)
1319    {
1320    case OP_OPT:
1321    if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1322      *options = (int)code[1];
1323    code += 2;
1324    break;
1325
1326    case OP_ASSERT_NOT:
1327    case OP_ASSERTBACK:
1328    case OP_ASSERTBACK_NOT:
1329    if (!skipassert) return code;
1330    do code += GET(code, 1); while (*code == OP_ALT);
1331    code += OP_lengths[*code];
1332    break;
1333
1334    case OP_WORD_BOUNDARY:
1335    case OP_NOT_WORD_BOUNDARY:
1336    if (!skipassert) return code;
1337    /* Fall through */
1338
1339    case OP_CALLOUT:
1340    case OP_CREF:
1341    case OP_BRANUMBER:
1342    code += OP_lengths[*code];
1343    break;
1344
1345    default:
1346    return code;
1347    }
1348  }
1349/* Control never reaches here */
1350}
1351
1352
1353
1354
1355/*************************************************
1356*        Find the fixed length of a pattern      *
1357*************************************************/
1358
1359/* Scan a pattern and compute the fixed length of subject that will match it,
1360if the length is fixed. This is needed for dealing with backward assertions.
1361In UTF8 mode, the result is in characters rather than bytes.
1362
1363Arguments:
1364  code     points to the start of the pattern (the bracket)
1365  options  the compiling options
1366
1367Returns:   the fixed length, or -1 if there is no fixed length,
1368             or -2 if \C was encountered
1369*/
1370
1371static int
1372find_fixedlength(uschar *code, int options)
1373{
1374int length = -1;
1375
1376register int branchlength = 0;
1377register uschar *cc = code + 1 + LINK_SIZE;
1378
1379/* Scan along the opcodes for this branch. If we get to the end of the
1380branch, check the length against that of the other branches. */
1381
1382for (;;)
1383  {
1384  int d;
1385  register int op = *cc;
1386  if (op >= OP_BRA) op = OP_BRA;
1387
1388  switch (op)
1389    {
1390    case OP_BRA:
1391    case OP_ONCE:
1392    case OP_COND:
1393    d = find_fixedlength(cc, options);
1394    if (d < 0) return d;
1395    branchlength += d;
1396    do cc += GET(cc, 1); while (*cc == OP_ALT);
1397    cc += 1 + LINK_SIZE;
1398    break;
1399
1400    /* Reached end of a branch; if it's a ket it is the end of a nested
1401    call. If it's ALT it is an alternation in a nested call. If it is
1402    END it's the end of the outer call. All can be handled by the same code. */
1403
1404    case OP_ALT:
1405    case OP_KET:
1406    case OP_KETRMAX:
1407    case OP_KETRMIN:
1408    case OP_END:
1409    if (length < 0) length = branchlength;
1410      else if (length != branchlength) return -1;
1411    if (*cc != OP_ALT) return length;
1412    cc += 1 + LINK_SIZE;
1413    branchlength = 0;
1414    break;
1415
1416    /* Skip over assertive subpatterns */
1417
1418    case OP_ASSERT:
1419    case OP_ASSERT_NOT:
1420    case OP_ASSERTBACK:
1421    case OP_ASSERTBACK_NOT:
1422    do cc += GET(cc, 1); while (*cc == OP_ALT);
1423    /* Fall through */
1424
1425    /* Skip over things that don't match chars */
1426
1427    case OP_REVERSE:
1428    case OP_BRANUMBER:
1429    case OP_CREF:
1430    case OP_OPT:
1431    case OP_CALLOUT:
1432    case OP_SOD:
1433    case OP_SOM:
1434    case OP_EOD:
1435    case OP_EODN:
1436    case OP_CIRC:
1437    case OP_DOLL:
1438    case OP_NOT_WORD_BOUNDARY:
1439    case OP_WORD_BOUNDARY:
1440    cc += OP_lengths[*cc];
1441    break;
1442
1443    /* Handle literal characters */
1444
1445    case OP_CHAR:
1446    case OP_CHARNC:
1447    branchlength++;
1448    cc += 2;
1449#ifdef SUPPORT_UTF8
1450    if ((options & PCRE_UTF8) != 0)
1451      {
1452      while ((*cc & 0xc0) == 0x80) cc++;
1453      }
1454#endif
1455    break;
1456
1457    /* Handle exact repetitions. The count is already in characters, but we
1458    need to skip over a multibyte character in UTF8 mode.  */
1459
1460    case OP_EXACT:
1461    branchlength += GET2(cc,1);
1462    cc += 4;
1463#ifdef SUPPORT_UTF8
1464    if ((options & PCRE_UTF8) != 0)
1465      {
1466      while((*cc & 0x80) == 0x80) cc++;
1467      }
1468#endif
1469    break;
1470
1471    case OP_TYPEEXACT:
1472    branchlength += GET2(cc,1);
1473    cc += 4;
1474    break;
1475
1476    /* Handle single-char matchers */
1477
1478    case OP_PROP:
1479    case OP_NOTPROP:
1480    cc++;
1481    /* Fall through */
1482
1483    case OP_NOT_DIGIT:
1484    case OP_DIGIT:
1485    case OP_NOT_WHITESPACE:
1486    case OP_WHITESPACE:
1487    case OP_NOT_WORDCHAR:
1488    case OP_WORDCHAR:
1489    case OP_ANY:
1490    branchlength++;
1491    cc++;
1492    break;
1493
1494    /* The single-byte matcher isn't allowed */
1495
1496    case OP_ANYBYTE:
1497    return -2;
1498
1499    /* Check a class for variable quantification */
1500
1501#ifdef SUPPORT_UTF8
1502    case OP_XCLASS:
1503    cc += GET(cc, 1) - 33;
1504    /* Fall through */
1505#endif
1506
1507    case OP_CLASS:
1508    case OP_NCLASS:
1509    cc += 33;
1510
1511    switch (*cc)
1512      {
1513      case OP_CRSTAR:
1514      case OP_CRMINSTAR:
1515      case OP_CRQUERY:
1516      case OP_CRMINQUERY:
1517      return -1;
1518
1519      case OP_CRRANGE:
1520      case OP_CRMINRANGE:
1521      if (GET2(cc,1) != GET2(cc,3)) return -1;
1522      branchlength += GET2(cc,1);
1523      cc += 5;
1524      break;
1525
1526      default:
1527      branchlength++;
1528      }
1529    break;
1530
1531    /* Anything else is variable length */
1532
1533    default:
1534    return -1;
1535    }
1536  }
1537/* Control never gets here */
1538}
1539
1540
1541
1542
1543/*************************************************
1544*    Scan compiled regex for numbered bracket    *
1545*************************************************/
1546
1547/* This little function scans through a compiled pattern until it finds a
1548capturing bracket with the given number.
1549
1550Arguments:
1551  code        points to start of expression
1552  utf8        TRUE in UTF-8 mode
1553  number      the required bracket number
1554
1555Returns:      pointer to the opcode for the bracket, or NULL if not found
1556*/
1557
1558static const uschar *
1559find_bracket(const uschar *code, BOOL utf8, int number)
1560{
1561#ifndef SUPPORT_UTF8
1562utf8 = utf8;               /* Stop pedantic compilers complaining */
1563#endif
1564
1565for (;;)
1566  {
1567  register int c = *code;
1568  if (c == OP_END) return NULL;
1569  else if (c > OP_BRA)
1570    {
1571    int n = c - OP_BRA;
1572    if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1573    if (n == number) return (uschar *)code;
1574    code += OP_lengths[OP_BRA];
1575    }
1576  else
1577    {
1578    code += OP_lengths[c];
1579
1580#ifdef SUPPORT_UTF8
1581
1582    /* In UTF-8 mode, opcodes that are followed by a character may be followed
1583    by a multi-byte character. The length in the table is a minimum, so we have
1584    to scan along to skip the extra bytes. All opcodes are less than 128, so we
1585    can use relatively efficient code. */
1586
1587    if (utf8) switch(c)
1588      {
1589      case OP_CHAR:
1590      case OP_CHARNC:
1591      case OP_EXACT:
1592      case OP_UPTO:
1593      case OP_MINUPTO:
1594      case OP_STAR:
1595      case OP_MINSTAR:
1596      case OP_PLUS:
1597      case OP_MINPLUS:
1598      case OP_QUERY:
1599      case OP_MINQUERY:
1600      while ((*code & 0xc0) == 0x80) code++;
1601      break;
1602
1603      /* XCLASS is used for classes that cannot be represented just by a bit
1604      map. This includes negated single high-valued characters. The length in
1605      the table is zero; the actual length is stored in the compiled code. */
1606
1607      case OP_XCLASS:
1608      code += GET(code, 1) + 1;
1609      break;
1610      }
1611#endif
1612    }
1613  }
1614}
1615
1616
1617
1618/*************************************************
1619*   Scan compiled regex for recursion reference  *
1620*************************************************/
1621
1622/* This little function scans through a compiled pattern until it finds an
1623instance of OP_RECURSE.
1624
1625Arguments:
1626  code        points to start of expression
1627  utf8        TRUE in UTF-8 mode
1628
1629Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
1630*/
1631
1632static const uschar *
1633find_recurse(const uschar *code, BOOL utf8)
1634{
1635#ifndef SUPPORT_UTF8
1636utf8 = utf8;               /* Stop pedantic compilers complaining */
1637#endif
1638
1639for (;;)
1640  {
1641  register int c = *code;
1642  if (c == OP_END) return NULL;
1643  else if (c == OP_RECURSE) return code;
1644  else if (c > OP_BRA)
1645    {
1646    code += OP_lengths[OP_BRA];
1647    }
1648  else
1649    {
1650    code += OP_lengths[c];
1651
1652#ifdef SUPPORT_UTF8
1653
1654    /* In UTF-8 mode, opcodes that are followed by a character may be followed
1655    by a multi-byte character. The length in the table is a minimum, so we have
1656    to scan along to skip the extra bytes. All opcodes are less than 128, so we
1657    can use relatively efficient code. */
1658
1659    if (utf8) switch(c)
1660      {
1661      case OP_CHAR:
1662      case OP_CHARNC:
1663      case OP_EXACT:
1664      case OP_UPTO:
1665      case OP_MINUPTO:
1666      case OP_STAR:
1667      case OP_MINSTAR:
1668      case OP_PLUS:
1669      case OP_MINPLUS:
1670      case OP_QUERY:
1671      case OP_MINQUERY:
1672      while ((*code & 0xc0) == 0x80) code++;
1673      break;
1674
1675      /* XCLASS is used for classes that cannot be represented just by a bit
1676      map. This includes negated single high-valued characters. The length in
1677      the table is zero; the actual length is stored in the compiled code. */
1678
1679      case OP_XCLASS:
1680      code += GET(code, 1) + 1;
1681      break;
1682      }
1683#endif
1684    }
1685  }
1686}
1687
1688
1689
1690/*************************************************
1691*    Scan compiled branch for non-emptiness      *
1692*************************************************/
1693
1694/* This function scans through a branch of a compiled pattern to see whether it
1695can match the empty string or not. It is called only from could_be_empty()
1696below. Note that first_significant_code() skips over assertions. If we hit an
1697unclosed bracket, we return "empty" - this means we've struck an inner bracket
1698whose current branch will already have been scanned.
1699
1700Arguments:
1701  code        points to start of search
1702  endcode     points to where to stop
1703  utf8        TRUE if in UTF8 mode
1704
1705Returns:      TRUE if what is matched could be empty
1706*/
1707
1708static BOOL
1709could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1710{
1711register int c;
1712for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);
1713     code < endcode;
1714     code = first_significant_code(code + OP_lengths[c], NULL, 0, TRUE))
1715  {
1716  const uschar *ccode;
1717
1718  c = *code;
1719
1720  if (c >= OP_BRA)
1721    {
1722    BOOL empty_branch;
1723    if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
1724
1725    /* Scan a closed bracket */
1726
1727    empty_branch = FALSE;
1728    do
1729      {
1730      if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1731        empty_branch = TRUE;
1732      code += GET(code, 1);
1733      }
1734    while (*code == OP_ALT);
1735    if (!empty_branch) return FALSE;   /* All branches are non-empty */
1736    code += 1 + LINK_SIZE;
1737    c = *code;
1738    }
1739
1740  else switch (c)
1741    {
1742    /* Check for quantifiers after a class */
1743
1744#ifdef SUPPORT_UTF8
1745    case OP_XCLASS:
1746    ccode = code + GET(code, 1);
1747    goto CHECK_CLASS_REPEAT;
1748#endif
1749
1750    case OP_CLASS:
1751    case OP_NCLASS:
1752    ccode = code + 33;
1753
1754#ifdef SUPPORT_UTF8
1755    CHECK_CLASS_REPEAT:
1756#endif
1757
1758    switch (*ccode)
1759      {
1760      case OP_CRSTAR:            /* These could be empty; continue */
1761      case OP_CRMINSTAR:
1762      case OP_CRQUERY:
1763      case OP_CRMINQUERY:
1764      break;
1765
1766      default:                   /* Non-repeat => class must match */
1767      case OP_CRPLUS:            /* These repeats aren't empty */
1768      case OP_CRMINPLUS:
1769      return FALSE;
1770
1771      case OP_CRRANGE:
1772      case OP_CRMINRANGE:
1773      if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
1774      break;
1775      }
1776    break;
1777
1778    /* Opcodes that must match a character */
1779
1780    case OP_PROP:
1781    case OP_NOTPROP:
1782    case OP_EXTUNI:
1783    case OP_NOT_DIGIT:
1784    case OP_DIGIT:
1785    case OP_NOT_WHITESPACE:
1786    case OP_WHITESPACE:
1787    case OP_NOT_WORDCHAR:
1788    case OP_WORDCHAR:
1789    case OP_ANY:
1790    case OP_ANYBYTE:
1791    case OP_CHAR:
1792    case OP_CHARNC:
1793    case OP_NOT:
1794    case OP_PLUS:
1795    case OP_MINPLUS:
1796    case OP_EXACT:
1797    case OP_NOTPLUS:
1798    case OP_NOTMINPLUS:
1799    case OP_NOTEXACT:
1800    case OP_TYPEPLUS:
1801    case OP_TYPEMINPLUS:
1802    case OP_TYPEEXACT:
1803    return FALSE;
1804
1805    /* End of branch */
1806
1807    case OP_KET:
1808    case OP_KETRMAX:
1809    case OP_KETRMIN:
1810    case OP_ALT:
1811    return TRUE;
1812
1813    /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be
1814    followed by a multibyte character */
1815
1816#ifdef SUPPORT_UTF8
1817    case OP_STAR:
1818    case OP_MINSTAR:
1819    case OP_QUERY:
1820    case OP_MINQUERY:
1821    case OP_UPTO:
1822    case OP_MINUPTO:
1823    if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1824    break;
1825#endif
1826    }
1827  }
1828
1829return TRUE;
1830}
1831
1832
1833
1834/*************************************************
1835*    Scan compiled regex for non-emptiness       *
1836*************************************************/
1837
1838/* This function is called to check for left recursive calls. We want to check
1839the current branch of the current pattern to see if it could match the empty
1840string. If it could, we must look outwards for branches at other levels,
1841stopping when we pass beyond the bracket which is the subject of the recursion.
1842
1843Arguments:
1844  code        points to start of the recursion
1845  endcode     points to where to stop (current RECURSE item)
1846  bcptr       points to the chain of current (unclosed) branch starts
1847  utf8        TRUE if in UTF-8 mode
1848
1849Returns:      TRUE if what is matched could be empty
1850*/
1851
1852static BOOL
1853could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1854  BOOL utf8)
1855{
1856while (bcptr != NULL && bcptr->current >= code)
1857  {
1858  if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1859  bcptr = bcptr->outer;
1860  }
1861return TRUE;
1862}
1863
1864
1865
1866/*************************************************
1867*           Check for POSIX class syntax         *
1868*************************************************/
1869
1870/* This function is called when the sequence "[:" or "[." or "[=" is
1871encountered in a character class. It checks whether this is followed by an
1872optional ^ and then a sequence of letters, terminated by a matching ":]" or
1873".]" or "=]".
1874
1875Argument:
1876  ptr      pointer to the initial [
1877  endptr   where to return the end pointer
1878  cd       pointer to compile data
1879
1880Returns:   TRUE or FALSE
1881*/
1882
1883static BOOL
1884check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1885{
1886int terminator;          /* Don't combine these lines; the Solaris cc */
1887terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
1888if (*(++ptr) == '^') ptr++;
1889while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1890if (*ptr == terminator && ptr[1] == ']')
1891  {
1892  *endptr = ptr;
1893  return TRUE;
1894  }
1895return FALSE;
1896}
1897
1898
1899
1900
1901/*************************************************
1902*          Check POSIX class name                *
1903*************************************************/
1904
1905/* This function is called to check the name given in a POSIX-style class entry
1906such as [:alnum:].
1907
1908Arguments:
1909  ptr        points to the first letter
1910  len        the length of the name
1911
1912Returns:     a value representing the name, or -1 if unknown
1913*/
1914
1915static int
1916check_posix_name(const uschar *ptr, int len)
1917{
1918register int yield = 0;
1919while (posix_name_lengths[yield] != 0)
1920  {
1921  if (len == posix_name_lengths[yield] &&
1922    strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1923  yield++;
1924  }
1925return -1;
1926}
1927
1928
1929/*************************************************
1930*    Adjust OP_RECURSE items in repeated group   *
1931*************************************************/
1932
1933/* OP_RECURSE items contain an offset from the start of the regex to the group
1934that is referenced. This means that groups can be replicated for fixed
1935repetition simply by copying (because the recursion is allowed to refer to
1936earlier groups that are outside the current group). However, when a group is
1937optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1938it, after it has been compiled. This means that any OP_RECURSE items within it
1939that refer to the group itself or any contained groups have to have their
1940offsets adjusted. That is the job of this function. Before it is called, the
1941partially compiled regex must be temporarily terminated with OP_END.
1942
1943Arguments:
1944  group      points to the start of the group
1945  adjust     the amount by which the group is to be moved
1946  utf8       TRUE in UTF-8 mode
1947  cd         contains pointers to tables etc.
1948
1949Returns:     nothing
1950*/
1951
1952static void
1953adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)
1954{
1955uschar *ptr = group;
1956while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1957  {
1958  int offset = GET(ptr, 1);
1959  if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1960  ptr += 1 + LINK_SIZE;
1961  }
1962}
1963
1964
1965
1966/*************************************************
1967*        Insert an automatic callout point       *
1968*************************************************/
1969
1970/* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1971callout points before each pattern item.
1972
1973Arguments:
1974  code           current code pointer
1975  ptr            current pattern pointer
1976  cd             pointers to tables etc
1977
1978Returns:         new code pointer
1979*/
1980
1981static uschar *
1982auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1983{
1984*code++ = OP_CALLOUT;
1985*code++ = 255;
1986PUT(code, 0, ptr - cd->start_pattern);  /* Pattern offset */
1987PUT(code, LINK_SIZE, 0);                /* Default length */
1988return code + 2*LINK_SIZE;
1989}
1990
1991
1992
1993/*************************************************
1994*         Complete a callout item                *
1995*************************************************/
1996
1997/* A callout item contains the length of the next item in the pattern, which
1998we can't fill in till after we have reached the relevant point. This is used
1999for both automatic and manual callouts.
2000
2001Arguments:
2002  previous_callout   points to previous callout item
2003  ptr                current pattern pointer
2004  cd                 pointers to tables etc
2005
2006Returns:             nothing
2007*/
2008
2009static void
2010complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2011{
2012int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2013PUT(previous_callout, 2 + LINK_SIZE, length);
2014}
2015
2016
2017
2018#ifdef SUPPORT_UCP
2019/*************************************************
2020*           Get othercase range                  *
2021*************************************************/
2022
2023/* This function is passed the start and end of a class range, in UTF-8 mode
2024with UCP support. It searches up the characters, looking for internal ranges of
2025characters in the "other" case. Each call returns the next one, updating the
2026start address.
2027
2028Arguments:
2029  cptr        points to starting character value; updated
2030  d           end value
2031  ocptr       where to put start of othercase range
2032  odptr       where to put end of othercase range
2033
2034Yield:        TRUE when range returned; FALSE when no more
2035*/
2036
2037static BOOL
2038get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
2039{
2040int c, chartype, othercase, next;
2041
2042for (c = *cptr; c <= d; c++)
2043  {
2044  if (ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0) break;
2045  }
2046
2047if (c > d) return FALSE;
2048
2049*ocptr = othercase;
2050next = othercase + 1;
2051
2052for (++c; c <= d; c++)
2053  {
2054  if (ucp_findchar(c, &chartype, &othercase) != ucp_L || othercase != next)
2055    break;
2056  next++;
2057  }
2058
2059*odptr = next - 1;
2060*cptr = c;
2061
2062return TRUE;
2063}
2064#endif  /* SUPPORT_UCP */
2065
2066
2067/*************************************************
2068*           Compile one branch                   *
2069*************************************************/
2070
2071/* Scan the pattern, compiling it into the code vector. If the options are
2072changed during the branch, the pointer is used to change the external options
2073bits.
2074
2075Arguments:
2076  optionsptr     pointer to the option bits
2077  brackets       points to number of extracting brackets used
2078  codeptr        points to the pointer to the current code point
2079  ptrptr         points to the current pattern pointer
2080  errorptr       points to pointer to error message
2081  firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2082  reqbyteptr     set to the last literal character required, else < 0
2083  bcptr          points to current branch chain
2084  cd             contains pointers to tables etc.
2085
2086Returns:         TRUE on success
2087                 FALSE, with *errorptr set on error
2088*/
2089
2090static BOOL
2091compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
2092  const uschar **ptrptr, const char **errorptr, int *firstbyteptr,
2093  int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
2094{
2095int repeat_type, op_type;
2096int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
2097int bravalue = 0;
2098int greedy_default, greedy_non_default;
2099int firstbyte, reqbyte;
2100int zeroreqbyte, zerofirstbyte;
2101int req_caseopt, reqvary, tempreqvary;
2102int condcount = 0;
2103int options = *optionsptr;
2104int after_manual_callout = 0;
2105register int c;
2106register uschar *code = *codeptr;
2107uschar *tempcode;
2108BOOL inescq = FALSE;
2109BOOL groupsetfirstbyte = FALSE;
2110const uschar *ptr = *ptrptr;
2111const uschar *tempptr;
2112uschar *previous = NULL;
2113uschar *previous_callout = NULL;
2114uschar classbits[32];
2115
2116#ifdef SUPPORT_UTF8
2117BOOL class_utf8;
2118BOOL utf8 = (options & PCRE_UTF8) != 0;
2119uschar *class_utf8data;
2120uschar utf8_char[6];
2121#else
2122BOOL utf8 = FALSE;
2123#endif
2124
2125/* Set up the default and non-default settings for greediness */
2126
2127greedy_default = ((options & PCRE_UNGREEDY) != 0);
2128greedy_non_default = greedy_default ^ 1;
2129
2130/* Initialize no first byte, no required byte. REQ_UNSET means "no char
2131matching encountered yet". It gets changed to REQ_NONE if we hit something that
2132matches a non-fixed char first char; reqbyte just remains unset if we never
2133find one.
2134
2135When we hit a repeat whose minimum is zero, we may have to adjust these values
2136to take the zero repeat into account. This is implemented by setting them to
2137zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2138item types that can be repeated set these backoff variables appropriately. */
2139
2140firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2141
2142/* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2143according to the current setting of the caseless flag. REQ_CASELESS is a bit
2144value > 255. It is added into the firstbyte or reqbyte variables to record the
2145case status of the value. This is used only for ASCII characters. */
2146
2147req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2148
2149/* Switch on next character until the end of the branch */
2150
2151for (;; ptr++)
2152  {
2153  BOOL negate_class;
2154  BOOL possessive_quantifier;
2155  BOOL is_quantifier;
2156  int class_charcount;
2157  int class_lastchar;
2158  int newoptions;
2159  int recno;
2160  int skipbytes;
2161  int subreqbyte;
2162  int subfirstbyte;
2163  int mclength;
2164  uschar mcbuffer[8];
2165
2166  /* Next byte in the pattern */
2167
2168  c = *ptr;
2169
2170  /* If in \Q...\E, check for the end; if not, we have a literal */
2171
2172  if (inescq && c != 0)
2173    {
2174    if (c == '\\' && ptr[1] == 'E')
2175      {
2176      inescq = FALSE;
2177      ptr++;
2178      continue;
2179      }
2180    else
2181      {
2182      if (previous_callout != NULL)
2183        {
2184        complete_callout(previous_callout, ptr, cd);
2185        previous_callout = NULL;
2186        }
2187      if ((options & PCRE_AUTO_CALLOUT) != 0)
2188        {
2189        previous_callout = code;
2190        code = auto_callout(code, ptr, cd);
2191        }
2192      goto NORMAL_CHAR;
2193      }
2194    }
2195
2196  /* Fill in length of a previous callout, except when the next thing is
2197  a quantifier. */
2198
2199  is_quantifier = c == '*' || c == '+' || c == '?' ||
2200    (c == '{' && is_counted_repeat(ptr+1));
2201
2202  if (!is_quantifier && previous_callout != NULL &&
2203       after_manual_callout-- <= 0)
2204    {
2205    complete_callout(previous_callout, ptr, cd);
2206    previous_callout = NULL;
2207    }
2208
2209  /* In extended mode, skip white space and comments */
2210
2211  if ((options & PCRE_EXTENDED) != 0)
2212    {
2213    if ((cd->ctypes[c] & ctype_space) != 0) continue;
2214    if (c == '#')
2215      {
2216      /* The space before the ; is to avoid a warning on a silly compiler
2217      on the Macintosh. */
2218      while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2219      if (c != 0) continue;   /* Else fall through to handle end of string */
2220      }
2221    }
2222
2223  /* No auto callout for quantifiers. */
2224
2225  if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2226    {
2227    previous_callout = code;
2228    code = auto_callout(code, ptr, cd);
2229    }
2230
2231  switch(c)
2232    {
2233    /* The branch terminates at end of string, |, or ). */
2234
2235    case 0:
2236    case '|':
2237    case ')':
2238    *firstbyteptr = firstbyte;
2239    *reqbyteptr = reqbyte;
2240    *codeptr = code;
2241    *ptrptr = ptr;
2242    return TRUE;
2243
2244    /* Handle single-character metacharacters. In multiline mode, ^ disables
2245    the setting of any following char as a first character. */
2246
2247    case '^':
2248    if ((options & PCRE_MULTILINE) != 0)
2249      {
2250      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2251      }
2252    previous = NULL;
2253    *code++ = OP_CIRC;
2254    break;
2255
2256    case '$':
2257    previous = NULL;
2258    *code++ = OP_DOLL;
2259    break;
2260
2261    /* There can never be a first char if '.' is first, whatever happens about
2262    repeats. The value of reqbyte doesn't change either. */
2263
2264    case '.':
2265    if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2266    zerofirstbyte = firstbyte;
2267    zeroreqbyte = reqbyte;
2268    previous = code;
2269    *code++ = OP_ANY;
2270    break;
2271
    /* Character classes. If the included characters are all < 256 in value, we
2273    build a 32-byte bitmap of the permitted characters, except in the special
2274    case where there is only one such character. For negated classes, we build
2275    the map as usual, then invert it at the end. However, we use a different
2276    opcode so that data characters > 255 can be handled correctly.
2277
2278    If the class contains characters outside the 0-255 range, a different
2279    opcode is compiled. It may optionally have a bit map for characters < 256,
    but those above are explicitly listed afterwards. A flag byte tells
2281    whether the bitmap is present, and whether this is a negated class or not.
2282    */
2283
2284    case '[':
2285    previous = code;
2286
2287    /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2288    they are encountered at the top level, so we'll do that too. */
2289
2290    if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2291        check_posix_syntax(ptr, &tempptr, cd))
2292      {
2293      *errorptr = (ptr[1] == ':')? ERR13 : ERR31;
2294      goto FAILED;
2295      }
2296
2297    /* If the first character is '^', set the negation flag and skip it. */
2298
2299    if ((c = *(++ptr)) == '^')
2300      {
2301      negate_class = TRUE;
2302      c = *(++ptr);
2303      }
2304    else
2305      {
2306      negate_class = FALSE;
2307      }
2308
2309    /* Keep a count of chars with values < 256 so that we can optimize the case
2310    of just a single character (as long as it's < 256). For higher valued UTF-8
2311    characters, we don't yet do any optimization. */
2312
2313    class_charcount = 0;
2314    class_lastchar = -1;
2315
2316#ifdef SUPPORT_UTF8
2317    class_utf8 = FALSE;                       /* No chars >= 256 */
2318    class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */
2319#endif
2320
2321    /* Initialize the 32-char bit map to all zeros. We have to build the
2322    map in a temporary bit of store, in case the class contains only 1
2323    character (< 256), because in that case the compiled code doesn't use the
2324    bit map. */
2325
2326    memset(classbits, 0, 32 * sizeof(uschar));
2327
2328    /* Process characters until ] is reached. By writing this as a "do" it
2329    means that an initial ] is taken as a data character. The first pass
2330    through the regex checked the overall syntax, so we don't need to be very
2331    strict here. At the start of the loop, c contains the first byte of the
2332    character. */
2333
2334    do
2335      {
2336#ifdef SUPPORT_UTF8
2337      if (utf8 && c > 127)
2338        {                           /* Braces are required because the */
2339        GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
2340        }
2341#endif
2342
2343      /* Inside \Q...\E everything is literal except \E */
2344
2345      if (inescq)
2346        {
2347        if (c == '\\' && ptr[1] == 'E')
2348          {
2349          inescq = FALSE;
2350          ptr++;
2351          continue;
2352          }
2353        else goto LONE_SINGLE_CHARACTER;
2354        }
2355
2356      /* Handle POSIX class names. Perl allows a negation extension of the
2357      form [:^name:]. A square bracket that doesn't match the syntax is
2358      treated as a literal. We also recognize the POSIX constructions
2359      [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2360      5.6 and 5.8 do. */
2361
2362      if (c == '[' &&
2363          (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2364          check_posix_syntax(ptr, &tempptr, cd))
2365        {
2366        BOOL local_negate = FALSE;
2367        int posix_class, i;
2368        register const uschar *cbits = cd->cbits;
2369
2370        if (ptr[1] != ':')
2371          {
2372          *errorptr = ERR31;
2373          goto FAILED;
2374          }
2375
2376        ptr += 2;
2377        if (*ptr == '^')
2378          {
2379          local_negate = TRUE;
2380          ptr++;
2381          }
2382
2383        posix_class = check_posix_name(ptr, tempptr - ptr);
2384        if (posix_class < 0)
2385          {
2386          *errorptr = ERR30;
2387          goto FAILED;
2388          }
2389
2390        /* If matching is caseless, upper and lower are converted to
2391        alpha. This relies on the fact that the class table starts with
2392        alpha, lower, upper as the first 3 entries. */
2393
2394        if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2395          posix_class = 0;
2396
2397        /* Or into the map we are building up to 3 of the static class
2398        tables, or their negations. The [:blank:] class sets up the same
2399        chars as the [:space:] class (all white space). We remove the vertical
2400        white space chars afterwards. */
2401
2402        posix_class *= 3;
2403        for (i = 0; i < 3; i++)
2404          {
2405          BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;
2406          int taboffset = posix_class_maps[posix_class + i];
2407          if (taboffset < 0) break;
2408          if (local_negate)
2409            {
2410            if (i == 0)
2411              for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];
2412            else
2413              for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];
2414            if (blankclass) classbits[1] |= 0x3c;
2415            }
2416          else
2417            {
2418            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];
2419            if (blankclass) classbits[1] &= ~0x3c;
2420            }
2421          }
2422
2423        ptr = tempptr + 1;
2424        class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
2425        continue;    /* End of POSIX syntax handling */
2426        }
2427
2428      /* Backslash may introduce a single character, or it may introduce one
2429      of the specials, which just set a flag. Escaped items are checked for
2430      validity in the pre-compiling pass. The sequence \b is a special case.
2431      Inside a class (and only there) it is treated as backspace. Elsewhere
2432      it marks a word boundary. Other escapes have preset maps ready to
2433      or into the one we are building. We assume they have more than one
2434      character in them, so set class_charcount bigger than one. */
2435
2436      if (c == '\\')
2437        {
2438        c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
2439
2440        if (-c == ESC_b) c = '\b';       /* \b is backslash in a class */
2441        else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2442        else if (-c == ESC_Q)            /* Handle start of quoted string */
2443          {
2444          if (ptr[1] == '\\' && ptr[2] == 'E')
2445            {
2446            ptr += 2; /* avoid empty string */
2447            }
2448          else inescq = TRUE;
2449          continue;
2450          }
2451
2452        if (c < 0)
2453          {
2454          register const uschar *cbits = cd->cbits;
2455          class_charcount += 2;     /* Greater than 1 is what matters */
2456          switch (-c)
2457            {
2458            case ESC_d:
2459            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2460            continue;
2461
2462            case ESC_D:
2463            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2464            continue;
2465
2466            case ESC_w:
2467            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2468            continue;
2469
2470            case ESC_W:
2471            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2472            continue;
2473
2474            case ESC_s:
2475            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2476            classbits[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */
2477            continue;
2478
2479            case ESC_S:
2480            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2481            classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2482            continue;
2483
2484#ifdef SUPPORT_UCP
2485            case ESC_p:
2486            case ESC_P:
2487              {
2488              BOOL negated;
2489              int property = get_ucp(&ptr, &negated, errorptr);
2490              if (property < 0) goto FAILED;
2491              class_utf8 = TRUE;
2492              *class_utf8data++ = ((-c == ESC_p) != negated)?
2493                XCL_PROP : XCL_NOTPROP;
2494              *class_utf8data++ = property;
2495              class_charcount -= 2;   /* Not a < 256 character */
2496              }
2497            continue;
2498#endif
2499
2500            /* Unrecognized escapes are faulted if PCRE is running in its
2501            strict mode. By default, for compatibility with Perl, they are
2502            treated as literals. */
2503
2504            default:
2505            if ((options & PCRE_EXTRA) != 0)
2506              {
2507              *errorptr = ERR7;
2508              goto FAILED;
2509              }
2510            c = *ptr;              /* The final character */
2511            class_charcount -= 2;  /* Undo the default count from above */
2512            }
2513          }
2514
2515        /* Fall through if we have a single character (c >= 0). This may be
2516        > 256 in UTF-8 mode. */
2517
2518        }   /* End of backslash handling */
2519
2520      /* A single character may be followed by '-' to form a range. However,
2521      Perl does not permit ']' to be the end of the range. A '-' character
2522      here is treated as a literal. */
2523
2524      if (ptr[1] == '-' && ptr[2] != ']')
2525        {
2526        int d;
2527        ptr += 2;
2528
2529#ifdef SUPPORT_UTF8
2530        if (utf8)
2531          {                           /* Braces are required because the */
2532          GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
2533          }
2534        else
2535#endif
2536        d = *ptr;  /* Not UTF-8 mode */
2537
2538        /* The second part of a range can be a single-character escape, but
2539        not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2540        in such circumstances. */
2541
2542        if (d == '\\')
2543          {
2544          const uschar *oldptr = ptr;
2545          d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
2546
          /* \b is backspace; \X is literal X; any other special means the '-'
          was literal */
2549
2550          if (d < 0)
2551            {
2552            if (d == -ESC_b) d = '\b';
2553            else if (d == -ESC_X) d = 'X'; else
2554              {
2555              ptr = oldptr - 2;
2556              goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2557              }
2558            }
2559          }
2560
2561        /* The check that the two values are in the correct order happens in
2562        the pre-pass. Optimize one-character ranges */
2563
2564        if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2565
2566        /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2567        matching, we have to use an XCLASS with extra data items. Caseless
2568        matching for characters > 127 is available only if UCP support is
2569        available. */
2570
2571#ifdef SUPPORT_UTF8
2572        if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2573          {
2574          class_utf8 = TRUE;
2575
2576          /* With UCP support, we can find the other case equivalents of
2577          the relevant characters. There may be several ranges. Optimize how
2578          they fit with the basic range. */
2579
2580#ifdef SUPPORT_UCP
2581          if ((options & PCRE_CASELESS) != 0)
2582            {
2583            int occ, ocd;
2584            int cc = c;
2585            int origd = d;
2586            while (get_othercase_range(&cc, origd, &occ, &ocd))
2587              {
2588              if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */
2589
2590              if (occ < c  && ocd >= c - 1)        /* Extend the basic range */
2591                {                                  /* if there is overlap,   */
2592                c = occ;                           /* noting that if occ < c */
2593                continue;                          /* we can't have ocd > d  */
2594                }                                  /* because a subrange is  */
2595              if (ocd > d && occ <= d + 1)         /* always shorter than    */
2596                {                                  /* the basic range.       */
2597                d = ocd;
2598                continue;
2599                }
2600
2601              if (occ == ocd)
2602                {
2603                *class_utf8data++ = XCL_SINGLE;
2604                }
2605              else
2606                {
2607                *class_utf8data++ = XCL_RANGE;
2608                class_utf8data += ord2utf8(occ, class_utf8data);
2609                }
2610              class_utf8data += ord2utf8(ocd, class_utf8data);
2611              }
2612            }
2613#endif  /* SUPPORT_UCP */
2614
2615          /* Now record the original range, possibly modified for UCP caseless
2616          overlapping ranges. */
2617
2618          *class_utf8data++ = XCL_RANGE;
2619          class_utf8data += ord2utf8(c, class_utf8data);
2620          class_utf8data += ord2utf8(d, class_utf8data);
2621
2622          /* With UCP support, we are done. Without UCP support, there is no
2623          caseless matching for UTF-8 characters > 127; we can use the bit map
2624          for the smaller ones. */
2625
2626#ifdef SUPPORT_UCP
2627          continue;    /* With next character in the class */
2628#else
2629          if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2630
2631          /* Adjust upper limit and fall through to set up the map */
2632
2633          d = 127;
2634
2635#endif  /* SUPPORT_UCP */
2636          }
2637#endif  /* SUPPORT_UTF8 */
2638
2639        /* We use the bit map for all cases when not in UTF-8 mode; else
2640        ranges that lie entirely within 0-127 when there is UCP support; else
2641        for partial ranges without UCP support. */
2642
2643        for (; c <= d; c++)
2644          {
2645          classbits[c/8] |= (1 << (c&7));
2646          if ((options & PCRE_CASELESS) != 0)
2647            {
2648            int uc = cd->fcc[c];           /* flip case */
2649            classbits[uc/8] |= (1 << (uc&7));
2650            }
2651          class_charcount++;                /* in case a one-char range */
2652          class_lastchar = c;
2653          }
2654
2655        continue;   /* Go get the next char in the class */
2656        }
2657
2658      /* Handle a lone single character - we can get here for a normal
2659      non-escape char, or after \ that introduces a single character or for an
2660      apparent range that isn't. */
2661
2662      LONE_SINGLE_CHARACTER:
2663
2664      /* Handle a character that cannot go in the bit map */
2665
2666#ifdef SUPPORT_UTF8
2667      if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2668        {
2669        class_utf8 = TRUE;
2670        *class_utf8data++ = XCL_SINGLE;
2671        class_utf8data += ord2utf8(c, class_utf8data);
2672
2673#ifdef SUPPORT_UCP
2674        if ((options & PCRE_CASELESS) != 0)
2675          {
2676          int chartype;
2677          int othercase;
2678          if (ucp_findchar(c, &chartype, &othercase) >= 0 && othercase > 0)
2679            {
2680            *class_utf8data++ = XCL_SINGLE;
2681            class_utf8data += ord2utf8(othercase, class_utf8data);
2682            }
2683          }
2684#endif  /* SUPPORT_UCP */
2685
2686        }
2687      else
2688#endif  /* SUPPORT_UTF8 */
2689
2690      /* Handle a single-byte character */
2691        {
2692        classbits[c/8] |= (1 << (c&7));
2693        if ((options & PCRE_CASELESS) != 0)
2694          {
2695          c = cd->fcc[c];   /* flip case */
2696          classbits[c/8] |= (1 << (c&7));
2697          }
2698        class_charcount++;
2699        class_lastchar = c;
2700        }
2701      }
2702
2703    /* Loop until ']' reached; the check for end of string happens inside the
2704    loop. This "while" is the end of the "do" above. */
2705
2706    while ((c = *(++ptr)) != ']' || inescq);
2707
2708    /* If class_charcount is 1, we saw precisely one character whose value is
2709    less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2710    can optimize the negative case only if there were no characters >= 128
2711    because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2712    single-bytes only. This is an historical hangover. Maybe one day we can
2713    tidy these opcodes to handle multi-byte characters.
2714
2715    The optimization throws away the bit map. We turn the item into a
2716    1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2717    that OP_NOT does not support multibyte characters. In the positive case, it
2718    can cause firstbyte to be set. Otherwise, there can be no first char if
2719    this item is first, whatever repeat count may follow. In the case of
2720    reqbyte, save the previous value for reinstating. */
2721
2722#ifdef SUPPORT_UTF8
2723    if (class_charcount == 1 &&
2724          (!utf8 ||
2725          (!class_utf8 && (!negate_class || class_lastchar < 128))))
2726
2727#else
2728    if (class_charcount == 1)
2729#endif
2730      {
2731      zeroreqbyte = reqbyte;
2732
2733      /* The OP_NOT opcode works on one-byte characters only. */
2734
2735      if (negate_class)
2736        {
2737        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2738        zerofirstbyte = firstbyte;
2739        *code++ = OP_NOT;
2740        *code++ = class_lastchar;
2741        break;
2742        }
2743
2744      /* For a single, positive character, get the value into mcbuffer, and
2745      then we can handle this with the normal one-character code. */
2746
2747#ifdef SUPPORT_UTF8
2748      if (utf8 && class_lastchar > 127)
2749        mclength = ord2utf8(class_lastchar, mcbuffer);
2750      else
2751#endif
2752        {
2753        mcbuffer[0] = class_lastchar;
2754        mclength = 1;
2755        }
2756      goto ONE_CHAR;
2757      }       /* End of 1-char optimization */
2758
2759    /* The general case - not the one-char optimization. If this is the first
2760    thing in the branch, there can be no first char setting, whatever the
2761    repeat count. Any reqbyte setting must remain unchanged after any kind of
2762    repeat. */
2763
2764    if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2765    zerofirstbyte = firstbyte;
2766    zeroreqbyte = reqbyte;
2767
2768    /* If there are characters with values > 255, we have to compile an
2769    extended class, with its own opcode. If there are no characters < 256,
2770    we can omit the bitmap. */
2771
2772#ifdef SUPPORT_UTF8
2773    if (class_utf8)
2774      {
2775      *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
2776      *code++ = OP_XCLASS;
2777      code += LINK_SIZE;
2778      *code = negate_class? XCL_NOT : 0;
2779
2780      /* If the map is required, install it, and move on to the end of
2781      the extra data */
2782
2783      if (class_charcount > 0)
2784        {
2785        *code++ |= XCL_MAP;
2786        memcpy(code, classbits, 32);
2787        code = class_utf8data;
2788        }
2789
2790      /* If the map is not required, slide down the extra data. */
2791
2792      else
2793        {
2794        int len = class_utf8data - (code + 33);
2795        memmove(code + 1, code + 33, len);
2796        code += len + 1;
2797        }
2798
2799      /* Now fill in the complete length of the item */
2800
2801      PUT(previous, 1, code - previous);
2802      break;   /* End of class handling */
2803      }
2804#endif
2805
2806    /* If there are no characters > 255, negate the 32-byte map if necessary,
2807    and copy it into the code vector. If this is the first thing in the branch,
2808    there can be no first char setting, whatever the repeat count. Any reqbyte
2809    setting must remain unchanged after any kind of repeat. */
2810
2811    if (negate_class)
2812      {
2813      *code++ = OP_NCLASS;
2814      for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2815      }
2816    else
2817      {
2818      *code++ = OP_CLASS;
2819      memcpy(code, classbits, 32);
2820      }
2821    code += 32;
2822    break;
2823
2824    /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2825    has been tested above. */
2826
2827    case '{':
2828    if (!is_quantifier) goto NORMAL_CHAR;
2829    ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
2830    if (*errorptr != NULL) goto FAILED;
2831    goto REPEAT;
2832
2833    case '*':
2834    repeat_min = 0;
2835    repeat_max = -1;
2836    goto REPEAT;
2837
2838    case '+':
2839    repeat_min = 1;
2840    repeat_max = -1;
2841    goto REPEAT;
2842
2843    case '?':
2844    repeat_min = 0;
2845    repeat_max = 1;
2846
2847    REPEAT:
2848    if (previous == NULL)
2849      {
2850      *errorptr = ERR9;
2851      goto FAILED;
2852      }
2853
2854    if (repeat_min == 0)
2855      {
2856      firstbyte = zerofirstbyte;    /* Adjust for zero repeat */
2857      reqbyte = zeroreqbyte;        /* Ditto */
2858      }
2859
2860    /* Remember whether this is a variable length repeat */
2861
2862    reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2863
2864    op_type = 0;                    /* Default single-char op codes */
2865    possessive_quantifier = FALSE;  /* Default not possessive quantifier */
2866
2867    /* Save start of previous item, in case we have to move it up to make space
2868    for an inserted OP_ONCE for the additional '+' extension. */
2869
2870    tempcode = previous;
2871
2872    /* If the next character is '+', we have a possessive quantifier. This
2873    implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2874    If the next character is '?' this is a minimizing repeat, by default,
2875    but if PCRE_UNGREEDY is set, it works the other way round. We change the
2876    repeat type to the non-default. */
2877
2878    if (ptr[1] == '+')
2879      {
2880      repeat_type = 0;                  /* Force greedy */
2881      possessive_quantifier = TRUE;
2882      ptr++;
2883      }
2884    else if (ptr[1] == '?')
2885      {
2886      repeat_type = greedy_non_default;
2887      ptr++;
2888      }
2889    else repeat_type = greedy_default;
2890
2891    /* If previous was a recursion, we need to wrap it inside brackets so that
2892    it can be replicated if necessary. */
2893
2894    if (*previous == OP_RECURSE)
2895      {
2896      memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2897      code += 1 + LINK_SIZE;
2898      *previous = OP_BRA;
2899      PUT(previous, 1, code - previous);
2900      *code = OP_KET;
2901      PUT(code, 1, code - previous);
2902      code += 1 + LINK_SIZE;
2903      }
2904
2905    /* If previous was a character match, abolish the item and generate a
2906    repeat item instead. If a char item has a minimum of more than one, ensure
2907    that it is set in reqbyte - it might not be if a sequence such as x{3} is
2908    the first thing in a branch because the x will have gone into firstbyte
2909    instead.  */
2910
2911    if (*previous == OP_CHAR || *previous == OP_CHARNC)
2912      {
2913      /* Deal with UTF-8 characters that take up more than one byte. It's
2914      easier to write this out separately than try to macrify it. Use c to
2915      hold the length of the character in bytes, plus 0x80 to flag that it's a
2916      length rather than a small character. */
2917
2918#ifdef SUPPORT_UTF8
2919      if (utf8 && (code[-1] & 0x80) != 0)
2920        {
2921        uschar *lastchar = code - 1;
2922        while((*lastchar & 0xc0) == 0x80) lastchar--;
2923        c = code - lastchar;            /* Length of UTF-8 character */
2924        memcpy(utf8_char, lastchar, c); /* Save the char */
2925        c |= 0x80;                      /* Flag c as a length */
2926        }
2927      else
2928#endif
2929
2930      /* Handle the case of a single byte - either with no UTF8 support, or
2931      with UTF-8 disabled, or for a UTF-8 character < 128. */
2932
2933        {
2934        c = code[-1];
2935        if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2936        }
2937
2938      goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
2939      }
2940
2941    /* If previous was a single negated character ([^a] or similar), we use
2942    one of the special opcodes, replacing it. The code is shared with single-
2943    character repeats by setting op_type to add a suitable offset into
2944    repeat_type. OP_NOT is currently used only for single-byte chars. */
2945
2946    else if (*previous == OP_NOT)
2947      {
2948      op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
2949      c = previous[1];
2950      goto OUTPUT_SINGLE_REPEAT;
2951      }
2952
2953    /* If previous was a character type match (\d or similar), abolish it and
2954    create a suitable repeat item. The code is shared with single-character
2955    repeats by setting op_type to add a suitable offset into repeat_type. Note
2956    that the Unicode property types will be present only when SUPPORT_UCP is
2957    defined, but we don't wrap the little bits of code here because it just
2958    makes it horribly messy. */
2959
2960    else if (*previous < OP_EODN)
2961      {
2962      uschar *oldcode;
2963      int prop_type;
2964      op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
2965      c = *previous;
2966
2967      OUTPUT_SINGLE_REPEAT:
2968      prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?
2969        previous[1] : -1;
2970
2971      oldcode = code;
2972      code = previous;                  /* Usually overwrite previous item */
2973
2974      /* If the maximum is zero then the minimum must also be zero; Perl allows
2975      this case, so we do too - by simply omitting the item altogether. */
2976
2977      if (repeat_max == 0) goto END_REPEAT;
2978
2979      /* All real repeats make it impossible to handle partial matching (maybe
2980      one day we will be able to remove this restriction). */
2981
2982      if (repeat_max != 1) cd->nopartial = TRUE;
2983
2984      /* Combine the op_type with the repeat_type */
2985
2986      repeat_type += op_type;
2987
2988      /* A minimum of zero is handled either as the special case * or ?, or as
2989      an UPTO, with the maximum given. */
2990
2991      if (repeat_min == 0)
2992        {
2993        if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2994          else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2995        else
2996          {
2997          *code++ = OP_UPTO + repeat_type;
2998          PUT2INC(code, 0, repeat_max);
2999          }
3000        }
3001
3002      /* A repeat minimum of 1 is optimized into some special cases. If the
3003      maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3004      left in place and, if the maximum is greater than 1, we use OP_UPTO with
3005      one less than the maximum. */
3006
3007      else if (repeat_min == 1)
3008        {
3009        if (repeat_max == -1)
3010          *code++ = OP_PLUS + repeat_type;
3011        else
3012          {
3013          code = oldcode;                 /* leave previous item in place */
3014          if (repeat_max == 1) goto END_REPEAT;
3015          *code++ = OP_UPTO + repeat_type;
3016          PUT2INC(code, 0, repeat_max - 1);
3017          }
3018        }
3019
3020      /* The case {n,n} is just an EXACT, while the general case {n,m} is
3021      handled as an EXACT followed by an UPTO. */
3022
3023      else
3024        {
3025        *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
3026        PUT2INC(code, 0, repeat_min);
3027
3028        /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3029        we have to insert the character for the previous code. For a repeated
3030        Unicode property match, there is an extra byte that defines the
3031        required property. In UTF-8 mode, long characters have their length in
3032        c, with the 0x80 bit as a flag. */
3033
3034        if (repeat_max < 0)
3035          {
3036#ifdef SUPPORT_UTF8
3037          if (utf8 && c >= 128)
3038            {
3039            memcpy(code, utf8_char, c & 7);
3040            code += c & 7;
3041            }
3042          else
3043#endif
3044            {
3045            *code++ = c;
3046            if (prop_type >= 0) *code++ = prop_type;
3047            }
3048          *code++ = OP_STAR + repeat_type;
3049          }
3050
3051        /* Else insert an UPTO if the max is greater than the min, again
3052        preceded by the character, for the previously inserted code. */
3053
3054        else if (repeat_max != repeat_min)
3055          {
3056#ifdef SUPPORT_UTF8
3057          if (utf8 && c >= 128)
3058            {
3059            memcpy(code, utf8_char, c & 7);
3060            code += c & 7;
3061            }
3062          else
3063#endif
3064          *code++ = c;
3065          if (prop_type >= 0) *code++ = prop_type;
3066          repeat_max -= repeat_min;
3067          *code++ = OP_UPTO + repeat_type;
3068          PUT2INC(code, 0, repeat_max);
3069          }
3070        }
3071
3072      /* The character or character type itself comes last in all cases. */
3073
3074#ifdef SUPPORT_UTF8
3075      if (utf8 && c >= 128)
3076        {
3077        memcpy(code, utf8_char, c & 7);
3078        code += c & 7;
3079        }
3080      else
3081#endif
3082      *code++ = c;
3083
3084      /* For a repeated Unicode property match, there is an extra byte that
3085      defines the required property. */
3086
3087#ifdef SUPPORT_UCP
3088      if (prop_type >= 0) *code++ = prop_type;
3089#endif
3090      }
3091
3092    /* If previous was a character class or a back reference, we put the repeat
3093    stuff after it, but just skip the item if the repeat was {0,0}. */
3094
3095    else if (*previous == OP_CLASS ||
3096             *previous == OP_NCLASS ||
3097#ifdef SUPPORT_UTF8
3098             *previous == OP_XCLASS ||
3099#endif
3100             *previous == OP_REF)
3101      {
3102      if (repeat_max == 0)
3103        {
3104        code = previous;
3105        goto END_REPEAT;
3106        }
3107
3108      /* All real repeats make it impossible to handle partial matching (maybe
3109      one day we will be able to remove this restriction). */
3110
3111      if (repeat_max != 1) cd->nopartial = TRUE;
3112
3113      if (repeat_min == 0 && repeat_max == -1)
3114        *code++ = OP_CRSTAR + repeat_type;
3115      else if (repeat_min == 1 && repeat_max == -1)
3116        *code++ = OP_CRPLUS + repeat_type;
3117      else if (repeat_min == 0 && repeat_max == 1)
3118        *code++ = OP_CRQUERY + repeat_type;
3119      else
3120        {
3121        *code++ = OP_CRRANGE + repeat_type;
3122        PUT2INC(code, 0, repeat_min);
3123        if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
3124        PUT2INC(code, 0, repeat_max);
3125        }
3126      }
3127
3128    /* If previous was a bracket group, we may have to replicate it in certain
3129    cases. */
3130
3131    else if (*previous >= OP_BRA || *previous == OP_ONCE ||
3132             *previous == OP_COND)
3133      {
3134      register int i;
3135      int ketoffset = 0;
3136      int len = code - previous;
3137      uschar *bralink = NULL;
3138
3139      /* If the maximum repeat count is unlimited, find the end of the bracket
3140      by scanning through from the start, and compute the offset back to it
3141      from the current code pointer. There may be an OP_OPT setting following
3142      the final KET, so we can't find the end just by going back from the code
3143      pointer. */
3144
3145      if (repeat_max == -1)
3146        {
3147        register uschar *ket = previous;
3148        do ket += GET(ket, 1); while (*ket != OP_KET);
3149        ketoffset = code - ket;
3150        }
3151
3152      /* The case of a zero minimum is special because of the need to stick
3153      OP_BRAZERO in front of it, and because the group appears once in the
3154      data, whereas in other cases it appears the minimum number of times. For
3155      this reason, it is simplest to treat this case separately, as otherwise
3156      the code gets far too messy. There are several special subcases when the
3157      minimum is zero. */
3158
3159      if (repeat_min == 0)
3160        {
3161        /* If the maximum is also zero, we just omit the group from the output
3162        altogether. */
3163
3164        if (repeat_max == 0)
3165          {
3166          code = previous;
3167          goto END_REPEAT;
3168          }
3169
3170        /* If the maximum is 1 or unlimited, we just have to stick in the
3171        BRAZERO and do no more at this point. However, we do need to adjust
3172        any OP_RECURSE calls inside the group that refer to the group itself or
3173        any internal group, because the offset is from the start of the whole
3174        regex. Temporarily terminate the pattern while doing this. */
3175
3176        if (repeat_max <= 1)
3177          {
3178          *code = OP_END;
3179          adjust_recurse(previous, 1, utf8, cd);
3180          memmove(previous+1, previous, len);
3181          code++;
3182          *previous++ = OP_BRAZERO + repeat_type;
3183          }
3184
3185        /* If the maximum is greater than 1 and limited, we have to replicate
3186        in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3187        The first one has to be handled carefully because it's the original
3188        copy, which has to be moved up. The remainder can be handled by code
3189        that is common with the non-zero minimum case below. We have to
3190        adjust the value of repeat_max, since one less copy is required. Once
3191        again, we may have to adjust any OP_RECURSE calls inside the group. */
3192
3193        else
3194          {
3195          int offset;
3196          *code = OP_END;
3197          adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
3198          memmove(previous + 2 + LINK_SIZE, previous, len);
3199          code += 2 + LINK_SIZE;
3200          *previous++ = OP_BRAZERO + repeat_type;
3201          *previous++ = OP_BRA;
3202
3203          /* We chain together the bracket offset fields that have to be
3204          filled in later when the ends of the brackets are reached. */
3205
3206          offset = (bralink == NULL)? 0 : previous - bralink;
3207          bralink = previous;
3208          PUTINC(previous, 0, offset);
3209          }
3210
3211        repeat_max--;
3212        }
3213
3214      /* If the minimum is greater than zero, replicate the group as many
3215      times as necessary, and adjust the maximum to the number of subsequent
3216      copies that we need. If we set a first char from the group, and didn't
3217      set a required char, copy the latter from the former. */
3218
3219      else
3220        {
3221        if (repeat_min > 1)
3222          {
3223          if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3224          for (i = 1; i < repeat_min; i++)
3225            {
3226            memcpy(code, previous, len);
3227            code += len;
3228            }
3229          }
3230        if (repeat_max > 0) repeat_max -= repeat_min;
3231        }
3232
3233      /* This code is common to both the zero and non-zero minimum cases. If
3234      the maximum is limited, it replicates the group in a nested fashion,
3235      remembering the bracket starts on a stack. In the case of a zero minimum,
3236      the first one was set up above. In all cases the repeat_max now specifies
3237      the number of additional copies needed. */
3238
3239      if (repeat_max >= 0)
3240        {
3241        for (i = repeat_max - 1; i >= 0; i--)
3242          {
3243          *code++ = OP_BRAZERO + repeat_type;
3244
3245          /* All but the final copy start a new nesting, maintaining the
3246          chain of brackets outstanding. */
3247
3248          if (i != 0)
3249            {
3250            int offset;
3251            *code++ = OP_BRA;
3252            offset = (bralink == NULL)? 0 : code - bralink;
3253            bralink = code;
3254            PUTINC(code, 0, offset);
3255            }
3256
3257          memcpy(code, previous, len);
3258          code += len;
3259          }
3260
3261        /* Now chain through the pending brackets, and fill in their length
3262        fields (which are holding the chain links pro tem). */
3263
3264        while (bralink != NULL)
3265          {
3266          int oldlinkoffset;
3267          int offset = code - bralink + 1;
3268          uschar *bra = code - offset;
3269          oldlinkoffset = GET(bra, 1);
3270          bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3271          *code++ = OP_KET;
3272          PUTINC(code, 0, offset);
3273          PUT(bra, 1, offset);
3274          }
3275        }
3276
3277      /* If the maximum is unlimited, set a repeater in the final copy. We
3278      can't just offset backwards from the current code point, because we
3279      don't know if there's been an options resetting after the ket. The
3280      correct offset was computed above. */
3281
3282      else code[-ketoffset] = OP_KETRMAX + repeat_type;
3283      }
3284
3285    /* Else there's some kind of shambles */
3286
3287    else
3288      {
3289      *errorptr = ERR11;
3290      goto FAILED;
3291      }
3292
3293    /* If the character following a repeat is '+', we wrap the entire repeated
3294    item inside OP_ONCE brackets. This is just syntactic sugar, taken from
3295    Sun's Java package. The repeated item starts at tempcode, not at previous,
3296    which might be the first part of a string whose (former) last char we
3297    repeated. However, we don't support '+' after a greediness '?'. */
3298
3299    if (possessive_quantifier)
3300      {
3301      int len = code - tempcode;
3302      memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3303      code += 1 + LINK_SIZE;
3304      len += 1 + LINK_SIZE;
3305      tempcode[0] = OP_ONCE;
3306      *code++ = OP_KET;
3307      PUTINC(code, 0, len);
3308      PUT(tempcode, 1, len);
3309      }
3310
3311    /* In all cases we no longer have a previous item. We also set the
3312    "follows varying string" flag for subsequently encountered reqbytes if
3313    it isn't already set and we have just passed a varying length item. */
3314
3315    END_REPEAT:
3316    previous = NULL;
3317    cd->req_varyopt |= reqvary;
3318    break;
3319
3320
3321    /* Start of nested bracket sub-expression, or comment or lookahead or
3322    lookbehind or option setting or condition. First deal with special things
3323    that can come after a bracket; all are introduced by ?, and the appearance
3324    of any of them means that this is not a referencing group. They were
3325    checked for validity in the first pass over the string, so we don't have to
3326    check for syntax errors here.  */
3327
3328    case '(':
3329    newoptions = options;
3330    skipbytes = 0;
3331
3332    if (*(++ptr) == '?')
3333      {
3334      int set, unset;
3335      int *optset;
3336
3337      switch (*(++ptr))
3338        {
3339        case '#':                 /* Comment; skip to ket */
3340        ptr++;
3341        while (*ptr != ')') ptr++;
3342        continue;
3343
3344        case ':':                 /* Non-extracting bracket */
3345        bravalue = OP_BRA;
3346        ptr++;
3347        break;
3348
3349        case '(':
3350        bravalue = OP_COND;       /* Conditional group */
3351
3352        /* Condition to test for recursion */
3353
3354        if (ptr[1] == 'R')
3355          {
3356          code[1+LINK_SIZE] = OP_CREF;
3357          PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
3358          skipbytes = 3;
3359          ptr += 3;
3360          }
3361
3362        /* Condition to test for a numbered subpattern match. We know that
3363        if a digit follows ( then there will just be digits until ) because
3364        the syntax was checked in the first pass. */
3365
3366        else if ((digitab[ptr[1]] && ctype_digit) != 0)
3367          {
3368          int condref;                 /* Don't amalgamate; some compilers */
3369          condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */
3370          while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
3371          if (condref == 0)
3372            {
3373            *errorptr = ERR35;
3374            goto FAILED;
3375            }
3376          ptr++;
3377          code[1+LINK_SIZE] = OP_CREF;
3378          PUT2(code, 2+LINK_SIZE, condref);
3379          skipbytes = 3;
3380          }
3381        /* For conditions that are assertions, we just fall through, having
3382        set bravalue above. */
3383        break;
3384
3385        case '=':                 /* Positive lookahead */
3386        bravalue = OP_ASSERT;
3387        ptr++;
3388        break;
3389
3390        case '!':                 /* Negative lookahead */
3391        bravalue = OP_ASSERT_NOT;
3392        ptr++;
3393        break;
3394
3395        case '<':                 /* Lookbehinds */
3396        switch (*(++ptr))
3397          {
3398          case '=':               /* Positive lookbehind */
3399          bravalue = OP_ASSERTBACK;
3400          ptr++;
3401          break;
3402
3403          case '!':               /* Negative lookbehind */
3404          bravalue = OP_ASSERTBACK_NOT;
3405          ptr++;
3406          break;
3407          }
3408        break;
3409
3410        case '>':                 /* One-time brackets */
3411        bravalue = OP_ONCE;
3412        ptr++;
3413        break;
3414
3415        case 'C':                 /* Callout - may be followed by digits; */
3416        previous_callout = code;  /* Save for later completion */
3417        after_manual_callout = 1; /* Skip one item before completing */
3418        *code++ = OP_CALLOUT;     /* Already checked that the terminating */
3419          {                       /* closing parenthesis is present. */
3420          int n = 0;
3421          while ((digitab[*(++ptr)] & ctype_digit) != 0)
3422            n = n * 10 + *ptr - '0';
3423          if (n > 255)
3424            {
3425            *errorptr = ERR38;
3426            goto FAILED;
3427            }
3428          *code++ = n;
3429          PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */
3430          PUT(code, LINK_SIZE, 0);                    /* Default length */
3431          code += 2 * LINK_SIZE;
3432          }
3433        previous = NULL;
3434        continue;
3435
3436        case 'P':                 /* Named subpattern handling */
3437        if (*(++ptr) == '<')      /* Definition */
3438          {
3439          int i, namelen;
3440          uschar *slot = cd->name_table;
3441          const uschar *name;     /* Don't amalgamate; some compilers */
3442          name = ++ptr;           /* grumble at autoincrement in declaration */
3443
3444          while (*ptr++ != '>');
3445          namelen = ptr - name - 1;
3446
3447          for (i = 0; i < cd->names_found; i++)
3448            {
3449            int crc = memcmp(name, slot+2, namelen);
3450            if (crc == 0)
3451              {
3452              if (slot[2+namelen] == 0)
3453                {
3454                *errorptr = ERR43;
3455                goto FAILED;
3456                }
3457              crc = -1;             /* Current name is substring */
3458              }
3459            if (crc < 0)
3460              {
3461              memmove(slot + cd->name_entry_size, slot,
3462                (cd->names_found - i) * cd->name_entry_size);
3463              break;
3464              }
3465            slot += cd->name_entry_size;
3466            }
3467
3468          PUT2(slot, 0, *brackets + 1);
3469          memcpy(slot + 2, name, namelen);
3470          slot[2+namelen] = 0;
3471          cd->names_found++;
3472          goto NUMBERED_GROUP;
3473          }
3474
3475        if (*ptr == '=' || *ptr == '>')  /* Reference or recursion */
3476          {
3477          int i, namelen;
3478          int type = *ptr++;
3479          const uschar *name = ptr;
3480          uschar *slot = cd->name_table;
3481
3482          while (*ptr != ')') ptr++;
3483          namelen = ptr - name;
3484
3485          for (i = 0; i < cd->names_found; i++)
3486            {
3487            if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3488            slot += cd->name_entry_size;
3489            }
3490          if (i >= cd->names_found)
3491            {
3492            *errorptr = ERR15;
3493            goto FAILED;
3494            }
3495
3496          recno = GET2(slot, 0);
3497
3498          if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */
3499
3500          /* Back reference */
3501
3502          previous = code;
3503          *code++ = OP_REF;
3504          PUT2INC(code, 0, recno);
3505          cd->backref_map |= (recno < 32)? (1 << recno) : 1;
3506          if (recno > cd->top_backref) cd->top_backref = recno;
3507          continue;
3508          }
3509
3510        /* Should never happen */
3511        break;
3512
3513        case 'R':                 /* Pattern recursion */
3514        ptr++;                    /* Same as (?0)      */
3515        /* Fall through */
3516
3517        /* Recursion or "subroutine" call */
3518
3519        case '0': case '1': case '2': case '3': case '4':
3520        case '5': case '6': case '7': case '8': case '9':
3521          {
3522          const uschar *called;
3523          recno = 0;
3524          while((digitab[*ptr] & ctype_digit) != 0)
3525            recno = recno * 10 + *ptr++ - '0';
3526
3527          /* Come here from code above that handles a named recursion */
3528
3529          HANDLE_RECURSION:
3530
3531          previous = code;
3532
3533          /* Find the bracket that is being referenced. Temporarily end the
3534          regex in case it doesn't exist. */
3535
3536          *code = OP_END;
3537          called = (recno == 0)?
3538            cd->start_code : find_bracket(cd->start_code, utf8, recno);
3539
3540          if (called == NULL)
3541            {
3542            *errorptr = ERR15;
3543            goto FAILED;
3544            }
3545
3546          /* If the subpattern is still open, this is a recursive call. We
3547          check to see if this is a left recursion that could loop for ever,
3548          and diagnose that case. */
3549
3550          if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
3551            {
3552            *errorptr = ERR40;
3553            goto FAILED;
3554            }
3555
3556          /* Insert the recursion/subroutine item */
3557
3558          *code = OP_RECURSE;
3559          PUT(code, 1, called - cd->start_code);
3560          code += 1 + LINK_SIZE;
3561          }
3562        continue;
3563
3564        /* Character after (? not specially recognized */
3565
3566        default:                  /* Option setting */
3567        set = unset = 0;
3568        optset = &set;
3569
3570        while (*ptr != ')' && *ptr != ':')
3571          {
3572          switch (*ptr++)
3573            {
3574            case '-': optset = &unset; break;
3575
3576            case 'i': *optset |= PCRE_CASELESS; break;
3577            case 'm': *optset |= PCRE_MULTILINE; break;
3578            case 's': *optset |= PCRE_DOTALL; break;
3579            case 'x': *optset |= PCRE_EXTENDED; break;
3580            case 'U': *optset |= PCRE_UNGREEDY; break;
3581            case 'X': *optset |= PCRE_EXTRA; break;
3582            }
3583          }
3584
3585        /* Set up the changed option bits, but don't change anything yet. */
3586
3587        newoptions = (options | set) & (~unset);
3588
3589        /* If the options ended with ')' this is not the start of a nested
3590        group with option changes, so the options change at this level. Compile
3591        code to change the ims options if this setting actually changes any of
3592        them. We also pass the new setting back so that it can be put at the
3593        start of any following branches, and when this group ends (if we are in
3594        a group), a resetting item can be compiled.
3595
3596        Note that if this item is right at the start of the pattern, the
3597        options will have been abstracted and made global, so there will be no
3598        change to compile. */
3599
3600        if (*ptr == ')')
3601          {
3602          if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
3603            {
3604            *code++ = OP_OPT;
3605            *code++ = newoptions & PCRE_IMS;
3606            }
3607
3608          /* Change options at this level, and pass them back for use
3609          in subsequent branches. Reset the greedy defaults and the case
3610          value for firstbyte and reqbyte. */
3611
3612          *optionsptr = options = newoptions;
3613          greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
3614          greedy_non_default = greedy_default ^ 1;
3615          req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3616
3617          previous = NULL;       /* This item can't be repeated */
3618          continue;              /* It is complete */
3619          }
3620
3621        /* If the options ended with ':' we are heading into a nested group
3622        with possible change of options. Such groups are non-capturing and are
3623        not assertions of any kind. All we need to do is skip over the ':';
3624        the newoptions value is handled below. */
3625
3626        bravalue = OP_BRA;
3627        ptr++;
3628        }
3629      }
3630
3631    /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
3632    non-capturing and behave like (?:...) brackets */
3633
3634    else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
3635      {
3636      bravalue = OP_BRA;
3637      }
3638
3639    /* Else we have a referencing group; adjust the opcode. If the bracket
3640    number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
3641    arrange for the true number to follow later, in an OP_BRANUMBER item. */
3642
3643    else
3644      {
3645      NUMBERED_GROUP:
3646      if (++(*brackets) > EXTRACT_BASIC_MAX)
3647        {
3648        bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
3649        code[1+LINK_SIZE] = OP_BRANUMBER;
3650        PUT2(code, 2+LINK_SIZE, *brackets);
3651        skipbytes = 3;
3652        }
3653      else bravalue = OP_BRA + *brackets;
3654      }
3655
3656    /* Process nested bracketed re. Assertions may not be repeated, but other
3657    kinds can be. We copy code into a non-register variable in order to be able
3658    to pass its address because some compilers complain otherwise. Pass in a
3659    new setting for the ims options if they have changed. */
3660
3661    previous = (bravalue >= OP_ONCE)? code : NULL;
3662    *code = bravalue;
3663    tempcode = code;
3664    tempreqvary = cd->req_varyopt;     /* Save value before bracket */
3665
3666    if (!compile_regex(
3667         newoptions,                   /* The complete new option state */
3668         options & PCRE_IMS,           /* The previous ims option state */
3669         brackets,                     /* Extracting bracket count */
3670         &tempcode,                    /* Where to put code (updated) */
3671         &ptr,                         /* Input pointer (updated) */
3672         errorptr,                     /* Where to put an error message */
3673         (bravalue == OP_ASSERTBACK ||
3674          bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
3675         skipbytes,                    /* Skip over OP_COND/OP_BRANUMBER */
3676         &subfirstbyte,                /* For possible first char */
3677         &subreqbyte,                  /* For possible last char */
3678         bcptr,                        /* Current branch chain */
3679         cd))                          /* Tables block */
3680      goto FAILED;
3681
3682    /* At the end of compiling, code is still pointing to the start of the
3683    group, while tempcode has been updated to point past the end of the group
3684    and any option resetting that may follow it. The pattern pointer (ptr)
3685    is on the bracket. */
3686
3687    /* If this is a conditional bracket, check that there are no more than
3688    two branches in the group. */
3689
3690    else if (bravalue == OP_COND)
3691      {
3692      uschar *tc = code;
3693      condcount = 0;
3694
3695      do {
3696         condcount++;
3697         tc += GET(tc,1);
3698         }
3699      while (*tc != OP_KET);
3700
3701      if (condcount > 2)
3702        {
3703        *errorptr = ERR27;
3704        goto FAILED;
3705        }
3706
3707      /* If there is just one branch, we must not make use of its firstbyte or
3708      reqbyte, because this is equivalent to an empty second branch. */
3709
3710      if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
3711      }
3712
3713    /* Handle updating of the required and first characters. Update for normal
3714    brackets of all kinds, and conditions with two branches (see code above).
3715    If the bracket is followed by a quantifier with zero repeat, we have to
3716    back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
3717    main loop so that they can be accessed for the back off. */
3718
3719    zeroreqbyte = reqbyte;
3720    zerofirstbyte = firstbyte;
3721    groupsetfirstbyte = FALSE;
3722
3723    if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
3724      {
3725      /* If we have not yet set a firstbyte in this branch, take it from the
3726      subpattern, remembering that it was set here so that a repeat of more
3727      than one can replicate it as reqbyte if necessary. If the subpattern has
3728      no firstbyte, set "none" for the whole branch. In both cases, a zero
3729      repeat forces firstbyte to "none". */
3730
3731      if (firstbyte == REQ_UNSET)
3732        {
3733        if (subfirstbyte >= 0)
3734          {
3735          firstbyte = subfirstbyte;
3736          groupsetfirstbyte = TRUE;
3737          }
3738        else firstbyte = REQ_NONE;
3739        zerofirstbyte = REQ_NONE;
3740        }
3741
3742      /* If firstbyte was previously set, convert the subpattern's firstbyte
3743      into reqbyte if there wasn't one, using the vary flag that was in
3744      existence beforehand. */
3745
3746      else if (subfirstbyte >= 0 && subreqbyte < 0)
3747        subreqbyte = subfirstbyte | tempreqvary;
3748
3749      /* If the subpattern set a required byte (or set a first byte that isn't
3750      really the first byte - see above), set it. */
3751
3752      if (subreqbyte >= 0) reqbyte = subreqbyte;
3753      }
3754
3755    /* For a forward assertion, we take the reqbyte, if set. This can be
3756    helpful if the pattern that follows the assertion doesn't set a different
3757    char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3758    for an assertion, however because it leads to incorrect effect for patterns
3759    such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3760    of a firstbyte. This is overcome by a scan at the end if there's no
3761    firstbyte, looking for an asserted first char. */
3762
3763    else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3764
3765    /* Now update the main code pointer to the end of the group. */
3766
3767    code = tempcode;
3768
3769    /* Error if hit end of pattern */
3770
3771    if (*ptr != ')')
3772      {
3773      *errorptr = ERR14;
3774      goto FAILED;
3775      }
3776    break;
3777
3778    /* Check \ for being a real metacharacter; if not, fall through and handle
3779    it as a data character at the start of a string. Escape items are checked
3780    for validity in the pre-compiling pass. */
3781
3782    case '\\':
3783    tempptr = ptr;
3784    c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
3785
3786    /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3787    are arranged to be the negation of the corresponding OP_values. For the
3788    back references, the values are ESC_REF plus the reference number. Only
3789    back references and those types that consume a character may be repeated.
3790    We can test for values between ESC_b and ESC_Z for the latter; this may
3791    have to change if any new ones are ever created. */
3792
3793    if (c < 0)
3794      {
3795      if (-c == ESC_Q)            /* Handle start of quoted string */
3796        {
3797        if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3798          else inescq = TRUE;
3799        continue;
3800        }
3801
3802      /* For metasequences that actually match a character, we disable the
3803      setting of a first character if it hasn't already been set. */
3804
3805      if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3806        firstbyte = REQ_NONE;
3807
3808      /* Set values to reset to if this is followed by a zero repeat. */
3809
3810      zerofirstbyte = firstbyte;
3811      zeroreqbyte = reqbyte;
3812
3813      /* Back references are handled specially */
3814
3815      if (-c >= ESC_REF)
3816        {
3817        int number = -c - ESC_REF;
3818        previous = code;
3819        *code++ = OP_REF;
3820        PUT2INC(code, 0, number);
3821        }
3822
3823      /* So are Unicode property matches, if supported. We know that get_ucp
3824      won't fail because it was tested in the pre-pass. */
3825
3826#ifdef SUPPORT_UCP
3827      else if (-c == ESC_P || -c == ESC_p)
3828        {
3829        BOOL negated;
3830        int value = get_ucp(&ptr, &negated, errorptr);
3831        previous = code;
3832        *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
3833        *code++ = value;
3834        }
3835#endif
3836
3837      /* For the rest, we can obtain the OP value by negating the escape
3838      value */
3839
3840      else
3841        {
3842        previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3843        *code++ = -c;
3844        }
3845      continue;
3846      }
3847
3848    /* We have a data character whose value is in c. In UTF-8 mode it may have
3849    a value > 127. We set its representation in the length/buffer, and then
3850    handle it as a data character. */
3851
3852#ifdef SUPPORT_UTF8
3853    if (utf8 && c > 127)
3854      mclength = ord2utf8(c, mcbuffer);
3855    else
3856#endif
3857
3858     {
3859     mcbuffer[0] = c;
3860     mclength = 1;
3861     }
3862
3863    goto ONE_CHAR;
3864
3865    /* Handle a literal character. It is guaranteed not to be whitespace or #
3866    when the extended flag is set. If we are in UTF-8 mode, it may be a
3867    multi-byte literal character. */
3868
3869    default:
3870    NORMAL_CHAR:
3871    mclength = 1;
3872    mcbuffer[0] = c;
3873
3874#ifdef SUPPORT_UTF8
3875    if (utf8 && (c & 0xc0) == 0xc0)
3876      {
3877      while ((ptr[1] & 0xc0) == 0x80)
3878        mcbuffer[mclength++] = *(++ptr);
3879      }
3880#endif
3881
3882    /* At this point we have the character's bytes in mcbuffer, and the length
3883    in mclength. When not in UTF-8 mode, the length is always 1. */
3884
3885    ONE_CHAR:
3886    previous = code;
3887    *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
3888    for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
3889
3890    /* Set the first and required bytes appropriately. If no previous first
3891    byte, set it from this character, but revert to none on a zero repeat.
3892    Otherwise, leave the firstbyte value alone, and don't change it on a zero
3893    repeat. */
3894
3895    if (firstbyte == REQ_UNSET)
3896      {
3897      zerofirstbyte = REQ_NONE;
3898      zeroreqbyte = reqbyte;
3899
3900      /* If the character is more than one byte long, we can set firstbyte
3901      only if it is not to be matched caselessly. */
3902
3903      if (mclength == 1 || req_caseopt == 0)
3904        {
3905        firstbyte = mcbuffer[0] | req_caseopt;
3906        if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
3907        }
3908      else firstbyte = reqbyte = REQ_NONE;
3909      }
3910
3911    /* firstbyte was previously set; we can set reqbyte only the length is
3912    1 or the matching is caseful. */
3913
3914    else
3915      {
3916      zerofirstbyte = firstbyte;
3917      zeroreqbyte = reqbyte;
3918      if (mclength == 1 || req_caseopt == 0)
3919        reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3920      }
3921
3922    break;            /* End of literal character handling */
3923    }
3924  }                   /* end of big loop */
3925
3926/* Control never reaches here by falling through, only by a goto for all the
3927error states. Pass back the position in the pattern so that it can be displayed
3928to the user for diagnosing the error. */
3929
3930FAILED:
3931*ptrptr = ptr;
3932return FALSE;
3933}
3934
3935
3936
3937
3938/*************************************************
3939*     Compile sequence of alternatives           *
3940*************************************************/
3941
3942/* On entry, ptr is pointing past the bracket character, but on return
3943it points to the closing bracket, or vertical bar, or end of string.
3944The code variable is pointing at the byte into which the BRA operator has been
3945stored. If the ims options are changed at the start (for a (?ims: group) or
3946during any branch, we need to insert an OP_OPT item at the start of every
3947following branch to ensure they get set correctly at run time, and also pass
3948the new options into every subsequent branch compile.
3949
3950Argument:
3951  options        option bits, including any changes for this subpattern
3952  oldims         previous settings of ims option bits
3953  brackets       -> int containing the number of extracting brackets used
3954  codeptr        -> the address of the current code pointer
3955  ptrptr         -> the address of the current pattern pointer
3956  errorptr       -> pointer to error message
3957  lookbehind     TRUE if this is a lookbehind assertion
3958  skipbytes      skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3959  firstbyteptr   place to put the first required character, or a negative number
3960  reqbyteptr     place to put the last required character, or a negative number
3961  bcptr          pointer to the chain of currently open branches
3962  cd             points to the data block with tables pointers etc.
3963
3964Returns:      TRUE on success
3965*/
3966
3967static BOOL
3968compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3969  const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
3970  int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3971{
3972const uschar *ptr = *ptrptr;
3973uschar *code = *codeptr;
3974uschar *last_branch = code;
3975uschar *start_bracket = code;
3976uschar *reverse_count = NULL;
3977int firstbyte, reqbyte;
3978int branchfirstbyte, branchreqbyte;
3979branch_chain bc;
3980
3981bc.outer = bcptr;
3982bc.current = code;
3983
3984firstbyte = reqbyte = REQ_UNSET;
3985
3986/* Offset is set zero to mark that this bracket is still open */
3987
3988PUT(code, 1, 0);
3989code += 1 + LINK_SIZE + skipbytes;
3990
3991/* Loop for each alternative branch */
3992
3993for (;;)
3994  {
3995  /* Handle a change of ims options at the start of the branch */
3996
3997  if ((options & PCRE_IMS) != oldims)
3998    {
3999    *code++ = OP_OPT;
4000    *code++ = options & PCRE_IMS;
4001    }
4002
4003  /* Set up dummy OP_REVERSE if lookbehind assertion */
4004
4005  if (lookbehind)
4006    {
4007    *code++ = OP_REVERSE;
4008    reverse_count = code;
4009    PUTINC(code, 0, 0);
4010    }
4011
4012  /* Now compile the branch */
4013
4014  if (!compile_branch(&options, brackets, &code, &ptr, errorptr,
4015        &branchfirstbyte, &branchreqbyte, &bc, cd))
4016    {
4017    *ptrptr = ptr;
4018    return FALSE;
4019    }
4020
4021  /* If this is the first branch, the firstbyte and reqbyte values for the
4022  branch become the values for the regex. */
4023
4024  if (*last_branch != OP_ALT)
4025    {
4026    firstbyte = branchfirstbyte;
4027    reqbyte = branchreqbyte;
4028    }
4029
4030  /* If this is not the first branch, the first char and reqbyte have to
4031  match the values from all the previous branches, except that if the previous
4032  value for reqbyte didn't have REQ_VARY set, it can still match, and we set
4033  REQ_VARY for the regex. */
4034
4035  else
4036    {
4037    /* If we previously had a firstbyte, but it doesn't match the new branch,
4038    we have to abandon the firstbyte for the regex, but if there was previously
4039    no reqbyte, it takes on the value of the old firstbyte. */
4040
4041    if (firstbyte >= 0 && firstbyte != branchfirstbyte)
4042      {
4043      if (reqbyte < 0) reqbyte = firstbyte;
4044      firstbyte = REQ_NONE;
4045      }
4046
4047    /* If we (now or from before) have no firstbyte, a firstbyte from the
4048    branch becomes a reqbyte if there isn't a branch reqbyte. */
4049
4050    if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
4051        branchreqbyte = branchfirstbyte;
4052
4053    /* Now ensure that the reqbytes match */
4054
4055    if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
4056      reqbyte = REQ_NONE;
4057    else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
4058    }
4059
4060  /* If lookbehind, check that this branch matches a fixed-length string,
4061  and put the length into the OP_REVERSE item. Temporarily mark the end of
4062  the branch with OP_END. */
4063
4064  if (lookbehind)
4065    {
4066    int length;
4067    *code = OP_END;
4068    length = find_fixedlength(last_branch, options);
4069    DPRINTF(("fixed length = %d\n", length));
4070    if (length < 0)
4071      {
4072      *errorptr = (length == -2)? ERR36 : ERR25;
4073      *ptrptr = ptr;
4074      return FALSE;
4075      }
4076    PUT(reverse_count, 0, length);
4077    }
4078
4079  /* Reached end of expression, either ')' or end of pattern. Go back through
4080  the alternative branches and reverse the chain of offsets, with the field in
4081  the BRA item now becoming an offset to the first alternative. If there are
4082  no alternatives, it points to the end of the group. The length in the
4083  terminating ket is always the length of the whole bracketed item. If any of
4084  the ims options were changed inside the group, compile a resetting op-code
4085  following, except at the very end of the pattern. Return leaving the pointer
4086  at the terminating char. */
4087
4088  if (*ptr != '|')
4089    {
4090    int length = code - last_branch;
4091    do
4092      {
4093      int prev_length = GET(last_branch, 1);
4094      PUT(last_branch, 1, length);
4095      length = prev_length;
4096      last_branch -= length;
4097      }
4098    while (length > 0);
4099
4100    /* Fill in the ket */
4101
4102    *code = OP_KET;
4103    PUT(code, 1, code - start_bracket);
4104    code += 1 + LINK_SIZE;
4105
4106    /* Resetting option if needed */
4107
4108    if ((options & PCRE_IMS) != oldims && *ptr == ')')
4109      {
4110      *code++ = OP_OPT;
4111      *code++ = oldims;
4112      }
4113
4114    /* Set values to pass back */
4115
4116    *codeptr = code;
4117    *ptrptr = ptr;
4118    *firstbyteptr = firstbyte;
4119    *reqbyteptr = reqbyte;
4120    return TRUE;
4121    }
4122
4123  /* Another branch follows; insert an "or" node. Its length field points back
4124  to the previous branch while the bracket remains open. At the end the chain
4125  is reversed. It's done like this so that the start of the bracket has a
4126  zero offset until it is closed, making it possible to detect recursion. */
4127
4128  *code = OP_ALT;
4129  PUT(code, 1, code - last_branch);
4130  bc.current = last_branch = code;
4131  code += 1 + LINK_SIZE;
4132  ptr++;
4133  }
4134/* Control never reaches here */
4135}
4136
4137
4138
4139
4140/*************************************************
4141*          Check for anchored expression         *
4142*************************************************/
4143
4144/* Try to find out if this is an anchored regular expression. Consider each
4145alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
4146all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
4147it's anchored. However, if this is a multiline pattern, then only OP_SOD
4148counts, since OP_CIRC can match in the middle.
4149
4150We can also consider a regex to be anchored if OP_SOM starts all its branches.
4151This is the code for \G, which means "match at start of match position, taking
4152into account the match offset".
4153
4154A branch is also implicitly anchored if it starts with .* and DOTALL is set,
4155because that will try the rest of the pattern at all possible matching points,
4156so there is no point trying again.... er ....
4157
4158.... except when the .* appears inside capturing parentheses, and there is a
4159subsequent back reference to those parentheses. We haven't enough information
4160to catch that case precisely.
4161
4162At first, the best we could do was to detect when .* was in capturing brackets
4163and the highest back reference was greater than or equal to that level.
4164However, by keeping a bitmap of the first 31 back references, we can catch some
4165of the more common cases more precisely.
4166
4167Arguments:
4168  code           points to start of expression (the bracket)
4169  options        points to the options setting
4170  bracket_map    a bitmap of which brackets we are inside while testing; this
4171                  handles up to substring 31; after that we just have to take
4172                  the less precise approach
4173  backref_map    the back reference bitmap
4174
4175Returns:     TRUE or FALSE
4176*/
4177
4178static BOOL
4179is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
4180  unsigned int backref_map)
4181{
4182do {
4183   const uschar *scode =
4184     first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);
4185   register int op = *scode;
4186
4187   /* Capturing brackets */
4188
4189   if (op > OP_BRA)
4190     {
4191     int new_map;
4192     op -= OP_BRA;
4193     if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
4194     new_map = bracket_map | ((op < 32)? (1 << op) : 1);
4195     if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
4196     }
4197
4198   /* Other brackets */
4199
4200   else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4201     {
4202     if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4203     }
4204
4205   /* .* is not anchored unless DOTALL is set and it isn't in brackets that
4206   are or may be referenced. */
4207
4208   else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
4209            (*options & PCRE_DOTALL) != 0)
4210     {
4211     if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4212     }
4213
4214   /* Check for explicit anchoring */
4215
4216   else if (op != OP_SOD && op != OP_SOM &&
4217           ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
4218     return FALSE;
4219   code += GET(code, 1);
4220   }
4221while (*code == OP_ALT);   /* Loop for each alternative */
4222return TRUE;
4223}
4224
4225
4226
4227/*************************************************
4228*         Check for starting with ^ or .*        *
4229*************************************************/
4230
4231/* This is called to find out if every branch starts with ^ or .* so that
4232"first char" processing can be done to speed things up in multiline
4233matching and for non-DOTALL patterns that start with .* (which must start at
4234the beginning or after \n). As in the case of is_anchored() (see above), we
4235have to take account of back references to capturing brackets that contain .*
4236because in that case we can't make the assumption.
4237
4238Arguments:
4239  code           points to start of expression (the bracket)
4240  bracket_map    a bitmap of which brackets we are inside while testing; this
4241                  handles up to substring 31; after that we just have to take
4242                  the less precise approach
4243  backref_map    the back reference bitmap
4244
4245Returns:         TRUE or FALSE
4246*/
4247
4248static BOOL
4249is_startline(const uschar *code, unsigned int bracket_map,
4250  unsigned int backref_map)
4251{
4252do {
4253   const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,
4254     FALSE);
4255   register int op = *scode;
4256
4257   /* Capturing brackets */
4258
4259   if (op > OP_BRA)
4260     {
4261     int new_map;
4262     op -= OP_BRA;
4263     if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
4264     new_map = bracket_map | ((op < 32)? (1 << op) : 1);
4265     if (!is_startline(scode, new_map, backref_map)) return FALSE;
4266     }
4267
4268   /* Other brackets */
4269
4270   else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4271     { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
4272
4273   /* .* means "start at start or after \n" if it isn't in brackets that
4274   may be referenced. */
4275
4276   else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
4277     {
4278     if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4279     }
4280
4281   /* Check for explicit circumflex */
4282
4283   else if (op != OP_CIRC) return FALSE;
4284
4285   /* Move on to the next alternative */
4286
4287   code += GET(code, 1);
4288   }
4289while (*code == OP_ALT);  /* Loop for each alternative */
4290return TRUE;
4291}
4292
4293
4294
4295/*************************************************
4296*       Check for asserted fixed first char      *
4297*************************************************/
4298
4299/* During compilation, the "first char" settings from forward assertions are
4300discarded, because they can cause conflicts with actual literals that follow.
4301However, if we end up without a first char setting for an unanchored pattern,
4302it is worth scanning the regex to see if there is an initial asserted first
4303char. If all branches start with the same asserted char, or with a bracket all
4304of whose alternatives start with the same asserted char (recurse ad lib), then
4305we return that char, otherwise -1.
4306
4307Arguments:
4308  code       points to start of expression (the bracket)
4309  options    pointer to the options (used to check casing changes)
4310  inassert   TRUE if in an assertion
4311
4312Returns:     -1 or the fixed first char
4313*/
4314
4315static int
4316find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
4317{
4318register int c = -1;
4319do {
4320   int d;
4321   const uschar *scode =
4322     first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
4323   register int op = *scode;
4324
4325   if (op >= OP_BRA) op = OP_BRA;
4326
4327   switch(op)
4328     {
4329     default:
4330     return -1;
4331
4332     case OP_BRA:
4333     case OP_ASSERT:
4334     case OP_ONCE:
4335     case OP_COND:
4336     if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
4337       return -1;
4338     if (c < 0) c = d; else if (c != d) return -1;
4339     break;
4340
4341     case OP_EXACT:       /* Fall through */
4342     scode += 2;
4343
4344     case OP_CHAR:
4345     case OP_CHARNC:
4346     case OP_PLUS:
4347     case OP_MINPLUS:
4348     if (!inassert) return -1;
4349     if (c < 0)
4350       {
4351       c = scode[1];
4352       if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
4353       }
4354     else if (c != scode[1]) return -1;
4355     break;
4356     }
4357
4358   code += GET(code, 1);
4359   }
4360while (*code == OP_ALT);
4361return c;
4362}
4363
4364
4365
4366
#ifdef SUPPORT_UTF8
/*************************************************
*         Validate a UTF-8 string                *
*************************************************/

/* This function is called (optionally) at the start of compile or match, to
check that a supposed UTF-8 string really is one, so that later code can
assume it is dealing with a valid string. The check can be turned off for
maximum performance, but the consequences of supplying an invalid string are
then undefined.

Each byte below 128 is accepted as it stands. Any other byte must be a lead
byte with its top two bits set; utf8_table4, indexed by the low six bits of
the lead byte, gives the number of additional bytes that must follow. The
character is rejected if the string ends before those bytes, if any of them
does not have the form 10xx xxxx, or if the lead byte and second byte together
show that the character has been encoded in more bytes than it needs (an
"overlong" sequence). The lead bytes 0xfe and 0xff are always rejected.

Arguments:
  string       points to the string
  length       length of string, or -1 if the string is zero-terminated

Returns:       < 0    if the string is a valid UTF-8 string
               >= 0   otherwise; the value is the offset of the byte at which
                      the problem was detected, which for a bad multi-byte
                      character may be a byte after its lead byte
*/

static int
valid_utf8(const uschar *string, int length)
{
register const uschar *p;

/* Measure a zero-terminated string */

if (length < 0)
  {
  p = string;
  while (*p != 0) p++;
  length = p - string;
  }

for (p = string; length-- > 0; p++)
  {
  register int extra;        /* Number of additional bytes */
  register int lead = *p;
  register int second;
  int overlong;

  if (lead < 128) continue;                        /* One-byte character */
  if ((lead & 0xc0) != 0xc0) return p - string;    /* Not a lead byte */

  /* The whole character must lie within the string */

  extra = utf8_table4[lead & 0x3f];
  if (length < extra) return p - string;
  length -= extra;

  /* The second byte must start 10 */

  second = *(++p);
  if ((second & 0xc0) != 0x80) return p - string;

  /* Overlong sequences show up as a run of zero bits at the top of the
  encoded value; which bits to look at depends on the length. */

  switch (extra)
    {
    case 1:     /* xx00 000x */
    overlong = (lead & 0x3e) == 0;
    break;

    case 2:     /* 1110 0000, xx0x xxxx */
    overlong = (lead == 0xe0 && (second & 0x20) == 0);
    break;

    case 3:     /* 1111 0000, xx00 xxxx */
    overlong = (lead == 0xf0 && (second & 0x30) == 0);
    break;

    case 4:     /* 1111 1000, xx00 0xxx */
    overlong = (lead == 0xf8 && (second & 0x38) == 0);
    break;

    case 5:     /* 0xfe and 0xff are never valid; 1111 1100, xx00 00xx */
    overlong = (lead == 0xfe || lead == 0xff ||
               (lead == 0xfc && (second & 0x3c) == 0));
    break;

    default:
    overlong = 0;
    break;
    }

  if (overlong) return p - string;

  /* Any bytes after the second must also start 10. For a two-byte character
  there are none, so the loop body does not run. */

  for (extra--; extra > 0; extra--)
    {
    if ((*(++p) & 0xc0) != 0x80) return p - string;
    }
  }

return -1;
}
#endif
4450
4451
4452
4453/*************************************************
4454*        Compile a Regular Expression            *
4455*************************************************/
4456
4457/* This function takes a string and returns a pointer to a block of store
4458holding a compiled version of the expression.
4459
4460Arguments:
4461  pattern      the regular expression
4462  options      various option bits
4463  errorptr     pointer to pointer to error text
4464  erroroffset  ptr offset in pattern where error was detected
4465  tables       pointer to character tables or NULL
4466
4467Returns:       pointer to compiled data block, or NULL on error,
4468               with errorptr and erroroffset set
4469*/
4470
4471EXPORT pcre *
4472pcre_compile(const char *pattern, int options, const char **errorptr,
4473  int *erroroffset, const unsigned char *tables)
4474{
4475real_pcre *re;
4476int length = 1 + LINK_SIZE;      /* For initial BRA plus length */
4477int c, firstbyte, reqbyte;
4478int bracount = 0;
4479int branch_extra = 0;
4480int branch_newextra;
4481int item_count = -1;
4482int name_count = 0;
4483int max_name_size = 0;
4484int lastitemlength = 0;
4485#ifdef SUPPORT_UTF8
4486BOOL utf8;
4487BOOL class_utf8;
4488#endif
4489BOOL inescq = FALSE;
4490unsigned int brastackptr = 0;
4491size_t size;
4492uschar *code;
4493const uschar *codestart;
4494const uschar *ptr;
4495compile_data compile_block;
4496int brastack[BRASTACK_SIZE];
4497uschar bralenstack[BRASTACK_SIZE];
4498
4499/* We can't pass back an error message if errorptr is NULL; I guess the best we
4500can do is just return NULL. */
4501
4502if (errorptr == NULL) return NULL;
4503*errorptr = NULL;
4504
4505/* However, we can give a message for this error */
4506
4507if (erroroffset == NULL)
4508  {
4509  *errorptr = ERR16;
4510  return NULL;
4511  }
4512*erroroffset = 0;
4513
4514/* Can't support UTF8 unless PCRE has been compiled to include the code. */
4515
4516#ifdef SUPPORT_UTF8
4517utf8 = (options & PCRE_UTF8) != 0;
4518if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
4519     (*erroroffset = valid_utf8((uschar *)pattern, -1)) >= 0)
4520  {
4521  *errorptr = ERR44;
4522  return NULL;
4523  }
4524#else
4525if ((options & PCRE_UTF8) != 0)
4526  {
4527  *errorptr = ERR32;
4528  return NULL;
4529  }
4530#endif
4531
4532if ((options & ~PUBLIC_OPTIONS) != 0)
4533  {
4534  *errorptr = ERR17;
4535  return NULL;
4536  }
4537
4538/* Set up pointers to the individual character tables */
4539
4540if (tables == NULL) tables = pcre_default_tables;
4541compile_block.lcc = tables + lcc_offset;
4542compile_block.fcc = tables + fcc_offset;
4543compile_block.cbits = tables + cbits_offset;
4544compile_block.ctypes = tables + ctypes_offset;
4545
4546/* Maximum back reference and backref bitmap. This is updated for numeric
4547references during the first pass, but for named references during the actual
4548compile pass. The bitmap records up to 31 back references to help in deciding
4549whether (.*) can be treated as anchored or not. */
4550
4551compile_block.top_backref = 0;
4552compile_block.backref_map = 0;
4553
4554/* Reflect pattern for debugging output */
4555
4556DPRINTF(("------------------------------------------------------------------\n"));
4557DPRINTF(("%s\n", pattern));
4558
4559/* The first thing to do is to make a pass over the pattern to compute the
4560amount of store required to hold the compiled code. This does not have to be
4561perfect as long as errors are overestimates. At the same time we can detect any
4562flag settings right at the start, and extract them. Make an attempt to correct
4563for any counted white space if an "extended" flag setting appears late in the
4564pattern. We can't be so clever for #-comments. */
4565
4566ptr = (const uschar *)(pattern - 1);
4567while ((c = *(++ptr)) != 0)
4568  {
4569  int min, max;
4570  int class_optcount;
4571  int bracket_length;
4572  int duplength;
4573
4574  /* If we are inside a \Q...\E sequence, all chars are literal */
4575
4576  if (inescq)
4577    {
4578    if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;
4579    goto NORMAL_CHAR;
4580    }
4581
4582  /* Otherwise, first check for ignored whitespace and comments */
4583
4584  if ((options & PCRE_EXTENDED) != 0)
4585    {
4586    if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
4587    if (c == '#')
4588      {
4589      /* The space before the ; is to avoid a warning on a silly compiler
4590      on the Macintosh. */
4591      while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
4592      if (c == 0) break;
4593      continue;
4594      }
4595    }
4596
4597  item_count++;    /* Is zero for the first non-comment item */
4598
4599  /* Allow space for auto callout before every item except quantifiers. */
4600
4601  if ((options & PCRE_AUTO_CALLOUT) != 0 &&
4602       c != '*' && c != '+' && c != '?' &&
4603       (c != '{' || !is_counted_repeat(ptr + 1)))
4604    length += 2 + 2*LINK_SIZE;
4605
4606  switch(c)
4607    {
4608    /* A backslashed item may be an escaped data character or it may be a
4609    character type. */
4610
4611    case '\\':
4612    c = check_escape(&ptr, errorptr, bracount, options, FALSE);
4613    if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4614
4615    lastitemlength = 1;     /* Default length of last item for repeats */
4616
4617    if (c >= 0)             /* Data character */
4618      {
4619      length += 2;          /* For a one-byte character */
4620
4621#ifdef SUPPORT_UTF8
4622      if (utf8 && c > 127)
4623        {
4624        int i;
4625        for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
4626          if (c <= utf8_table1[i]) break;
4627        length += i;
4628        lastitemlength += i;
4629        }
4630#endif
4631
4632      continue;
4633      }
4634
4635    /* If \Q, enter "literal" mode */
4636
4637    if (-c == ESC_Q)
4638      {
4639      inescq = TRUE;
4640      continue;
4641      }
4642
4643    /* \X is supported only if Unicode property support is compiled */
4644
4645#ifndef SUPPORT_UCP
4646    if (-c == ESC_X)
4647      {
4648      *errorptr = ERR45;
4649      goto PCRE_ERROR_RETURN;
4650      }
4651#endif
4652
4653    /* \P and \p are for Unicode properties, but only when the support has
4654    been compiled. Each item needs 2 bytes. */
4655
4656    else if (-c == ESC_P || -c == ESC_p)
4657      {
4658#ifdef SUPPORT_UCP
4659      BOOL negated;
4660      length += 2;
4661      lastitemlength = 2;
4662      if (get_ucp(&ptr, &negated, errorptr) < 0) goto PCRE_ERROR_RETURN;
4663      continue;
4664#else
4665      *errorptr = ERR45;
4666      goto PCRE_ERROR_RETURN;
4667#endif
4668      }
4669
4670    /* Other escapes need one byte */
4671
4672    length++;
4673
4674    /* A back reference needs an additional 2 bytes, plus either one or 5
4675    bytes for a repeat. We also need to keep the value of the highest
4676    back reference. */
4677
4678    if (c <= -ESC_REF)
4679      {
4680      int refnum = -c - ESC_REF;
4681      compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
4682      if (refnum > compile_block.top_backref)
4683        compile_block.top_backref = refnum;
4684      length += 2;   /* For single back reference */
4685      if (ptr[1] == '{' && is_counted_repeat(ptr+2))
4686        {
4687        ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
4688        if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4689        if ((min == 0 && (max == 1 || max == -1)) ||
4690          (min == 1 && max == -1))
4691            length++;
4692        else length += 5;
4693        if (ptr[1] == '?') ptr++;
4694        }
4695      }
4696    continue;
4697
4698    case '^':     /* Single-byte metacharacters */
4699    case '.':
4700    case '$':
4701    length++;
4702    lastitemlength = 1;
4703    continue;
4704
4705    case '*':            /* These repeats won't be after brackets; */
4706    case '+':            /* those are handled separately */
4707    case '?':
4708    length++;
4709    goto POSESSIVE;      /* A few lines below */
4710
4711    /* This covers the cases of braced repeats after a single char, metachar,
4712    class, or back reference. */
4713
4714    case '{':
4715    if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
4716    ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
4717    if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4718
4719    /* These special cases just insert one extra opcode */
4720
4721    if ((min == 0 && (max == 1 || max == -1)) ||
4722      (min == 1 && max == -1))
4723        length++;
4724
4725    /* These cases might insert additional copies of a preceding character. */
4726
4727    else
4728      {
4729      if (min != 1)
4730        {
4731        length -= lastitemlength;   /* Uncount the original char or metachar */
4732        if (min > 0) length += 3 + lastitemlength;
4733        }
4734      length += lastitemlength + ((max > 0)? 3 : 1);
4735      }
4736
4737    if (ptr[1] == '?') ptr++;      /* Needs no extra length */
4738
4739    POSESSIVE:                     /* Test for possessive quantifier */
4740    if (ptr[1] == '+')
4741      {
4742      ptr++;
4743      length += 2 + 2*LINK_SIZE;   /* Allow for atomic brackets */
4744      }
4745    continue;
4746
4747    /* An alternation contains an offset to the next branch or ket. If any ims
4748    options changed in the previous branch(es), and/or if we are in a
4749    lookbehind assertion, extra space will be needed at the start of the
4750    branch. This is handled by branch_extra. */
4751
4752    case '|':
4753    length += 1 + LINK_SIZE + branch_extra;
4754    continue;
4755
4756    /* A character class uses 33 characters provided that all the character
4757    values are less than 256. Otherwise, it uses a bit map for low valued
4758    characters, and individual items for others. Don't worry about character
4759    types that aren't allowed in classes - they'll get picked up during the
4760    compile. A character class that contains only one single-byte character
4761    uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
4762    where we can. (In UTF-8 mode we can do this only for chars < 128.) */
4763
4764    case '[':
4765    if (*(++ptr) == '^')
4766      {
4767      class_optcount = 10;  /* Greater than one */
4768      ptr++;
4769      }
4770    else class_optcount = 0;
4771
4772#ifdef SUPPORT_UTF8
4773    class_utf8 = FALSE;
4774#endif
4775
4776    /* Written as a "do" so that an initial ']' is taken as data */
4777
4778    if (*ptr != 0) do
4779      {
4780      /* Inside \Q...\E everything is literal except \E */
4781
4782      if (inescq)
4783        {
4784        if (*ptr != '\\' || ptr[1] != 'E') goto GET_ONE_CHARACTER;
4785        inescq = FALSE;
4786        ptr += 1;
4787        continue;
4788        }
4789
4790      /* Outside \Q...\E, check for escapes */
4791
4792      if (*ptr == '\\')
4793        {
4794        c = check_escape(&ptr, errorptr, bracount, options, TRUE);
4795        if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4796
4797        /* \b is backspace inside a class; \X is literal */
4798
4799        if (-c == ESC_b) c = '\b';
4800        else if (-c == ESC_X) c = 'X';
4801
4802        /* \Q enters quoting mode */
4803
4804        else if (-c == ESC_Q)
4805          {
4806          inescq = TRUE;
4807          continue;
4808          }
4809
4810        /* Handle escapes that turn into characters */
4811
4812        if (c >= 0) goto NON_SPECIAL_CHARACTER;
4813
4814        /* Escapes that are meta-things. The normal ones just affect the
4815        bit map, but Unicode properties require an XCLASS extended item. */
4816
4817        else
4818          {
4819          class_optcount = 10;         /* \d, \s etc; make sure > 1 */
4820#ifdef SUPPORT_UTF8
4821          if (-c == ESC_p || -c == ESC_P)
4822            {
4823            if (!class_utf8)
4824              {
4825              class_utf8 = TRUE;
4826              length += LINK_SIZE + 2;
4827              }
4828            length += 2;
4829            }
4830#endif
4831          }
4832        }
4833
4834      /* Check the syntax for POSIX stuff. The bits we actually handle are
4835      checked during the real compile phase. */
4836
4837      else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
4838        {
4839        ptr++;
4840        class_optcount = 10;    /* Make sure > 1 */
4841        }
4842
4843      /* Anything else increments the possible optimization count. We have to
4844      detect ranges here so that we can compute the number of extra ranges for
4845      caseless wide characters when UCP support is available. If there are wide
4846      characters, we are going to have to use an XCLASS, even for single
4847      characters. */
4848
4849      else
4850        {
4851        int d;
4852
4853        GET_ONE_CHARACTER:
4854
4855#ifdef SUPPORT_UTF8
4856        if (utf8)
4857          {
4858          int extra = 0;
4859          GETCHARLEN(c, ptr, extra);
4860          ptr += extra;
4861          }
4862        else c = *ptr;
4863#else
4864        c = *ptr;
4865#endif
4866
4867        /* Come here from handling \ above when it escapes to a char value */
4868
4869        NON_SPECIAL_CHARACTER:
4870        class_optcount++;
4871
4872        d = -1;
4873        if (ptr[1] == '-')
4874          {
4875          uschar const *hyptr = ptr++;
4876          if (ptr[1] == '\\')
4877            {
4878            ptr++;
4879            d = check_escape(&ptr, errorptr, bracount, options, TRUE);
4880            if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4881            if (-d == ESC_b) d = '\b';        /* backspace */
4882            else if (-d == ESC_X) d = 'X';    /* literal X in a class */
4883            }
4884          else if (ptr[1] != 0 && ptr[1] != ']')
4885            {
4886            ptr++;
4887#ifdef SUPPORT_UTF8
4888            if (utf8)
4889              {
4890              int extra = 0;
4891              GETCHARLEN(d, ptr, extra);
4892              ptr += extra;
4893              }
4894            else
4895#endif
4896            d = *ptr;
4897            }
4898          if (d < 0) ptr = hyptr;      /* go back to hyphen as data */
4899          }
4900
4901        /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
4902        127 for caseless matching, we will need to use an XCLASS. */
4903
4904        if (d >= 0)
4905          {
4906          class_optcount = 10;     /* Ensure > 1 */
4907          if (d < c)
4908            {
4909            *errorptr = ERR8;
4910            goto PCRE_ERROR_RETURN;
4911            }
4912
4913#ifdef SUPPORT_UTF8
4914          if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4915            {
4916            uschar buffer[6];
4917            if (!class_utf8)         /* Allow for XCLASS overhead */
4918              {
4919              class_utf8 = TRUE;
4920              length += LINK_SIZE + 2;
4921              }
4922
4923#ifdef SUPPORT_UCP
4924            /* If we have UCP support, find out how many extra ranges are
4925            needed to map the other case of characters within this range. We
4926            have to mimic the range optimization here, because extending the
4927            range upwards might push d over a boundary that makes it use
4928            another byte in the UTF-8 representation. */
4929
4930            if ((options & PCRE_CASELESS) != 0)
4931              {
4932              int occ, ocd;
4933              int cc = c;
4934              int origd = d;
4935              while (get_othercase_range(&cc, origd, &occ, &ocd))
4936                {
4937                if (occ >= c && ocd <= d) continue;   /* Skip embedded */
4938
4939                if (occ < c  && ocd >= c - 1)  /* Extend the basic range */
4940                  {                            /* if there is overlap,   */
4941                  c = occ;                     /* noting that if occ < c */
4942                  continue;                    /* we can't have ocd > d  */
4943                  }                            /* because a subrange is  */
4944                if (ocd > d && occ <= d + 1)   /* always shorter than    */
4945                  {                            /* the basic range.       */
4946                  d = ocd;
4947                  continue;
4948                  }
4949
4950                /* An extra item is needed */
4951
4952                length += 1 + ord2utf8(occ, buffer) +
4953                  ((occ == ocd)? 0 : ord2utf8(ocd, buffer));
4954                }
4955              }
4956#endif  /* SUPPORT_UCP */
4957
4958            /* The length of the (possibly extended) range */
4959
4960            length += 1 + ord2utf8(c, buffer) + ord2utf8(d, buffer);
4961            }
4962#endif  /* SUPPORT_UTF8 */
4963
4964          }
4965
4966        /* We have a single character. There is nothing to be done unless we
4967        are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
4968        allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
4969        support. */
4970
4971        else
4972          {
4973#ifdef SUPPORT_UTF8
4974          if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4975            {
4976            uschar buffer[6];
4977            class_optcount = 10;     /* Ensure > 1 */
4978            if (!class_utf8)         /* Allow for XCLASS overhead */
4979              {
4980              class_utf8 = TRUE;
4981              length += LINK_SIZE + 2;
4982              }
4983#ifdef SUPPORT_UCP
4984            length += (((options & PCRE_CASELESS) != 0)? 2 : 1) *
4985              (1 + ord2utf8(c, buffer));
4986#else   /* SUPPORT_UCP */
4987            length += 1 + ord2utf8(c, buffer);
4988#endif  /* SUPPORT_UCP */
4989            }
4990#endif  /* SUPPORT_UTF8 */
4991          }
4992        }
4993      }
4994    while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
4995
4996    if (*ptr == 0)                          /* Missing terminating ']' */
4997      {
4998      *errorptr = ERR6;
4999      goto PCRE_ERROR_RETURN;
5000      }
5001
5002    /* We can optimize when there was only one optimizable character. Repeats
5003    for positive and negated single one-byte chars are handled by the general
5004    code. Here, we handle repeats for the class opcodes. */
5005
5006    if (class_optcount == 1) length += 3; else
5007      {
5008      length += 33;
5009
5010      /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
5011      we also need extra for wrapping the whole thing in a sub-pattern. */
5012
5013      if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
5014        {
5015        ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
5016        if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
5017        if ((min == 0 && (max == 1 || max == -1)) ||
5018          (min == 1 && max == -1))
5019            length++;
5020        else length += 5;
5021        if (ptr[1] == '+')
5022          {
5023          ptr++;
5024          length += 2 + 2*LINK_SIZE;
5025          }
5026        else if (ptr[1] == '?') ptr++;
5027        }
5028      }
5029    continue;
5030
5031    /* Brackets may be genuine groups or special things */
5032
5033    case '(':
5034    branch_newextra = 0;
5035    bracket_length = 1 + LINK_SIZE;
5036
5037    /* Handle special forms of bracket, which all start (? */
5038
5039    if (ptr[1] == '?')
5040      {
5041      int set, unset;
5042      int *optset;
5043
5044      switch (c = ptr[2])
5045        {
5046        /* Skip over comments entirely */
5047        case '#':
5048        ptr += 3;
5049        while (*ptr != 0 && *ptr != ')') ptr++;
5050        if (*ptr == 0)
5051          {
5052          *errorptr = ERR18;
5053          goto PCRE_ERROR_RETURN;
5054          }
5055        continue;
5056
5057        /* Non-referencing groups and lookaheads just move the pointer on, and
5058        then behave like a non-special bracket, except that they don't increment
5059        the count of extracting brackets. Ditto for the "once only" bracket,
5060        which is in Perl from version 5.005. */
5061
5062        case ':':
5063        case '=':
5064        case '!':
5065        case '>':
5066        ptr += 2;
5067        break;
5068
5069        /* (?R) specifies a recursive call to the regex, which is an extension
5070        to provide the facility which can be obtained by (?p{perl-code}) in
5071        Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
5072
5073        From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
5074        the appropriate numbered brackets. This includes both recursive and
5075        non-recursive calls. (?R) is now synonymous with (?0). */
5076
5077        case 'R':
5078        ptr++;
5079
5080        case '0': case '1': case '2': case '3': case '4':
5081        case '5': case '6': case '7': case '8': case '9':
5082        ptr += 2;
5083        if (c != 'R')
5084          while ((digitab[*(++ptr)] & ctype_digit) != 0);
5085        if (*ptr != ')')
5086          {
5087          *errorptr = ERR29;
5088          goto PCRE_ERROR_RETURN;
5089          }
5090        length += 1 + LINK_SIZE;
5091
5092        /* If this item is quantified, it will get wrapped inside brackets so
5093        as to use the code for quantified brackets. We jump down and use the
5094        code that handles this for real brackets. */
5095
5096        if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
5097          {
5098          length += 2 + 2 * LINK_SIZE;       /* to make bracketed */
5099          duplength = 5 + 3 * LINK_SIZE;
5100          goto HANDLE_QUANTIFIED_BRACKETS;
5101          }
5102        continue;
5103
5104        /* (?C) is an extension which provides "callout" - to provide a bit of
5105        the functionality of the Perl (?{...}) feature. An optional number may
5106        follow (default is zero). */
5107
5108        case 'C':
5109        ptr += 2;
5110        while ((digitab[*(++ptr)] & ctype_digit) != 0);
5111        if (*ptr != ')')
5112          {
5113          *errorptr = ERR39;
5114          goto PCRE_ERROR_RETURN;
5115          }
5116        length += 2 + 2*LINK_SIZE;
5117        continue;
5118
5119        /* Named subpatterns are an extension copied from Python */
5120
5121        case 'P':
5122        ptr += 3;
5123        if (*ptr == '<')
5124          {
5125          const uschar *p;    /* Don't amalgamate; some compilers */
5126          p = ++ptr;          /* grumble at autoincrement in declaration */
5127          while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
5128          if (*ptr != '>')
5129            {
5130            *errorptr = ERR42;
5131            goto PCRE_ERROR_RETURN;
5132            }
5133          name_count++;
5134          if (ptr - p > max_name_size) max_name_size = (ptr - p);
5135          break;
5136          }
5137
5138        if (*ptr == '=' || *ptr == '>')
5139          {
5140          while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
5141          if (*ptr != ')')
5142            {
5143            *errorptr = ERR42;
5144            goto PCRE_ERROR_RETURN;
5145            }
5146          break;
5147          }
5148
5149        /* Unknown character after (?P */
5150
5151        *errorptr = ERR41;
5152        goto PCRE_ERROR_RETURN;
5153
5154        /* Lookbehinds are in Perl from version 5.005 */
5155
5156        case '<':
5157        ptr += 3;
5158        if (*ptr == '=' || *ptr == '!')
5159          {
5160          branch_newextra = 1 + LINK_SIZE;
5161          length += 1 + LINK_SIZE;         /* For the first branch */
5162          break;
5163          }
5164        *errorptr = ERR24;
5165        goto PCRE_ERROR_RETURN;
5166
5167        /* Conditionals are in Perl from version 5.005. The bracket must either
5168        be followed by a number (for bracket reference) or by an assertion
5169        group, or (a PCRE extension) by 'R' for a recursion test. */
5170
5171        case '(':
5172        if (ptr[3] == 'R' && ptr[4] == ')')
5173          {
5174          ptr += 4;
5175          length += 3;
5176          }
5177        else if ((digitab[ptr[3]] & ctype_digit) != 0)
5178          {
5179          ptr += 4;
5180          length += 3;
5181          while ((digitab[*ptr] & ctype_digit) != 0) ptr++;
5182          if (*ptr != ')')
5183            {
5184            *errorptr = ERR26;
5185            goto PCRE_ERROR_RETURN;
5186            }
5187          }
5188        else   /* An assertion must follow */
5189          {
5190          ptr++;   /* Can treat like ':' as far as spacing is concerned */
5191          if (ptr[2] != '?' ||
5192             (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
5193            {
5194            ptr += 2;    /* To get right offset in message */
5195            *errorptr = ERR28;
5196            goto PCRE_ERROR_RETURN;
5197            }
5198          }
5199        break;
5200
5201        /* Else loop checking valid options until ) is met. Anything else is an
5202        error. If we are without any brackets, i.e. at top level, the settings
5203        act as if specified in the options, so massage the options immediately.
5204        This is for backward compatibility with Perl 5.004. */
5205
5206        default:
5207        set = unset = 0;
5208        optset = &set;
5209        ptr += 2;
5210
5211        for (;; ptr++)
5212          {
5213          c = *ptr;
5214          switch (c)
5215            {
5216            case 'i':
5217            *optset |= PCRE_CASELESS;
5218            continue;
5219
5220            case 'm':
5221            *optset |= PCRE_MULTILINE;
5222            continue;
5223
5224            case 's':
5225            *optset |= PCRE_DOTALL;
5226            continue;
5227
5228            case 'x':
5229            *optset |= PCRE_EXTENDED;
5230            continue;
5231
5232            case 'X':
5233            *optset |= PCRE_EXTRA;
5234            continue;
5235
5236            case 'U':
5237            *optset |= PCRE_UNGREEDY;
5238            continue;
5239
5240            case '-':
5241            optset = &unset;
5242            continue;
5243
5244            /* A termination by ')' indicates an options-setting-only item; if
5245            this is at the very start of the pattern (indicated by item_count
5246            being zero), we use it to set the global options. This is helpful
5247            when analyzing the pattern for first characters, etc. Otherwise
5248            nothing is done here and it is handled during the compiling
5249            process.
5250
5251            [Historical note: Up to Perl 5.8, options settings at top level
5252            were always global settings, wherever they appeared in the pattern.
5253            That is, they were equivalent to an external setting. From 5.8
5254            onwards, they apply only to what follows (which is what you might
5255            expect).] */
5256
5257            case ')':
5258            if (item_count == 0)
5259              {
5260              options = (options | set) & (~unset);
5261              set = unset = 0;     /* To save length */
5262              item_count--;        /* To allow for several */
5263              }
5264
5265            /* Fall through */
5266
5267            /* A termination by ':' indicates the start of a nested group with
5268            the given options set. This is again handled at compile time, but
5269            we must allow for compiled space if any of the ims options are
5270            set. We also have to allow for resetting space at the end of
5271            the group, which is why 4 is added to the length and not just 2.
5272            If there are several changes of options within the same group, this
5273            will lead to an over-estimate on the length, but this shouldn't
5274            matter very much. We also have to allow for resetting options at
5275            the start of any alternations, which we do by setting
5276            branch_newextra to 2. Finally, we record whether the case-dependent
5277            flag ever changes within the regex. This is used by the "required
5278            character" code. */
5279
5280            case ':':
5281            if (((set|unset) & PCRE_IMS) != 0)
5282              {
5283              length += 4;
5284              branch_newextra = 2;
5285              if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
5286              }
5287            goto END_OPTIONS;
5288
5289            /* Unrecognized option character */
5290
5291            default:
5292            *errorptr = ERR12;
5293            goto PCRE_ERROR_RETURN;
5294            }
5295          }
5296
5297        /* If we hit a closing bracket, that's it - this is a freestanding
5298        option-setting. We need to ensure that branch_extra is updated if
5299        necessary. The only values branch_newextra can have here are 0 or 2.
5300        If the value is 2, then branch_extra must end up as 2 or 3+LINK_SIZE,
5301        depending on whether this is a lookbehind group or not. */
5302
5303        END_OPTIONS:
5304        if (c == ')')
5305          {
5306          if (branch_newextra == 2 &&
5307              (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
5308            branch_extra += branch_newextra;
5309          continue;
5310          }
5311
5312        /* If options were terminated by ':' control comes here. Fall through
5313        to handle the group below. */
5314        }
5315      }
5316
5317    /* Extracting brackets must be counted so we can process escapes in a
5318    Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
5319    need an additional 3 bytes of store per extracting bracket. However, if
5320    PCRE_NO_AUTO_CAPTURE is set, unadorned brackets become non-capturing, so we
5321    must leave the count alone (it will always be zero). */
5322
5323    else if ((options & PCRE_NO_AUTO_CAPTURE) == 0)
5324      {
5325      bracount++;
5326      if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
5327      }
5328
5329    /* Save length for computing whole length at end if there's a repeat that
5330    requires duplication of the group. Also save the current value of
5331    branch_extra, and start the new group with the new value. If non-zero, this
5332    will either be 2 for a (?imsx: group, or 1+LINK_SIZE for a lookbehind. */
5333
5334    if (brastackptr >= sizeof(brastack)/sizeof(int))
5335      {
5336      *errorptr = ERR19;
5337      goto PCRE_ERROR_RETURN;
5338      }
5339
5340    bralenstack[brastackptr] = branch_extra;
5341    branch_extra = branch_newextra;
5342
5343    brastack[brastackptr++] = length;
5344    length += bracket_length;
5345    continue;
5346
5347    /* Handle ket. Look for subsequent max/min; for certain sets of values we
5348    have to replicate this bracket up to that many times. If brastackptr is
5349    0 this is an unmatched bracket which will generate an error, but take care
5350    not to try to access brastack[-1] when computing the length and restoring
5351    the branch_extra value. */
5352
5353    case ')':
5354    length += 1 + LINK_SIZE;
5355    if (brastackptr > 0)
5356      {
5357      duplength = length - brastack[--brastackptr];
5358      branch_extra = bralenstack[brastackptr];
5359      }
5360    else duplength = 0;
5361
5362    /* The following code is also used when a recursion such as (?3) is
5363    followed by a quantifier, because in that case, it has to be wrapped inside
5364    brackets so that the quantifier works. The value of duplength must be
5365    set before arrival. */
5366
5367    HANDLE_QUANTIFIED_BRACKETS:
5368
5369    /* Leave ptr at the final char; for read_repeat_counts this happens
5370    automatically; for the others we need an increment. */
5371
5372    if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
5373      {
5374      ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
5375      if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
5376      }
5377    else if (c == '*') { min = 0; max = -1; ptr++; }
5378    else if (c == '+') { min = 1; max = -1; ptr++; }
5379    else if (c == '?') { min = 0; max = 1;  ptr++; }
5380    else { min = 1; max = 1; }
5381
5382    /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
5383    group, and if the maximum is greater than zero, we have to replicate
5384    maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
5385    bracket set. */
5386
5387    if (min == 0)
5388      {
5389      length++;
5390      if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
5391      }
5392
5393    /* When the minimum is greater than zero, we have to replicate up to
5394    minval-1 times, with no additions required in the copies. Then, if there
5395    is a limited maximum we have to replicate up to maxval-1 times allowing
5396    for a BRAZERO item before each optional copy and nesting brackets for all
5397    but one of the optional copies. */
5398
5399    else
5400      {
5401      length += (min - 1) * duplength;
5402      if (max > min)   /* Need this test as max=-1 means no limit */
5403        length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
5404          - (2 + 2*LINK_SIZE);
5405      }
5406
5407    /* Allow space for once brackets for "possessive quantifier" */
5408
5409    if (ptr[1] == '+')
5410      {
5411      ptr++;
5412      length += 2 + 2*LINK_SIZE;
5413      }
5414    continue;
5415
5416    /* Non-special character. It won't be space or # in extended mode, so it is
5417    always a genuine character. If we are in a \Q...\E sequence, check for the
5418    end; if not, we have a literal. */
5419
5420    default:
5421    NORMAL_CHAR:
5422
5423    if (inescq && c == '\\' && ptr[1] == 'E')
5424      {
5425      inescq = FALSE;
5426      ptr++;
5427      continue;
5428      }
5429
5430    length += 2;          /* For a one-byte character */
5431    lastitemlength = 1;   /* Default length of last item for repeats */
5432
5433    /* In UTF-8 mode, check for additional bytes. */
5434
5435#ifdef SUPPORT_UTF8
5436    if (utf8 && (c & 0xc0) == 0xc0)
5437      {
5438      while ((ptr[1] & 0xc0) == 0x80)         /* Can't flow over the end */
5439        {                                     /* because the end is marked */
5440        lastitemlength++;                     /* by a zero byte. */
5441        length++;
5442        ptr++;
5443        }
5444      }
5445#endif
5446
5447    continue;
5448    }
5449  }
5450
5451length += 2 + LINK_SIZE;    /* For final KET and END */
5452
5453if ((options & PCRE_AUTO_CALLOUT) != 0)
5454  length += 2 + 2*LINK_SIZE;  /* For final callout */
5455
5456if (length > MAX_PATTERN_SIZE)
5457  {
5458  *errorptr = ERR20;
5459  return NULL;
5460  }
5461
5462/* Compute the size of data block needed and get it, either from malloc or
5463externally provided function. */
5464
5465size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
5466re = (real_pcre *)(pcre_malloc)(size);
5467
5468if (re == NULL)
5469  {
5470  *errorptr = ERR21;
5471  return NULL;
5472  }
5473
5474/* Put in the magic number, and save the sizes, options, and character table
5475pointer. NULL is used for the default character tables. The nullpad field is at
5476the end; it's there to help in the case when a regex compiled on a system with
54774-byte pointers is run on another with 8-byte pointers. */
5478
5479re->magic_number = MAGIC_NUMBER;
5480re->size = size;
5481re->options = options;
5482re->dummy1 = re->dummy2 = 0;
5483re->name_table_offset = sizeof(real_pcre);
5484re->name_entry_size = max_name_size + 3;
5485re->name_count = name_count;
5486re->tables = (tables == pcre_default_tables)? NULL : tables;
5487re->nullpad = NULL;
5488
5489/* The starting points of the name/number translation table and of the code are
5490passed around in the compile data block. */
5491
5492compile_block.names_found = 0;
5493compile_block.name_entry_size = max_name_size + 3;
5494compile_block.name_table = (uschar *)re + re->name_table_offset;
5495codestart = compile_block.name_table + re->name_entry_size * re->name_count;
5496compile_block.start_code = codestart;
5497compile_block.start_pattern = (const uschar *)pattern;
5498compile_block.req_varyopt = 0;
5499compile_block.nopartial = FALSE;
5500
5501/* Set up a starting, non-extracting bracket, then compile the expression. On
5502error, *errorptr will be set non-NULL, so we don't need to look at the result
5503of the function here. */
5504
5505ptr = (const uschar *)pattern;
5506code = (uschar *)codestart;
5507*code = OP_BRA;
5508bracount = 0;
5509(void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
5510  errorptr, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block);
5511re->top_bracket = bracount;
5512re->top_backref = compile_block.top_backref;
5513
5514if (compile_block.nopartial) re->options |= PCRE_NOPARTIAL;
5515
5516/* If not reached end of pattern on success, there's an excess bracket. */
5517
5518if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
5519
5520/* Fill in the terminating state and check for disastrous overflow, but
5521if debugging, leave the test till after things are printed out. */
5522
5523*code++ = OP_END;
5524
5525#ifndef DEBUG
5526if (code - codestart > length) *errorptr = ERR23;
5527#endif
5528
5529/* Give an error if there's back reference to a non-existent capturing
5530subpattern. */
5531
5532if (re->top_backref > re->top_bracket) *errorptr = ERR15;
5533
5534/* Failed to compile, or error while post-processing */
5535
5536if (*errorptr != NULL)
5537  {
5538  (pcre_free)(re);
5539  PCRE_ERROR_RETURN:
5540  *erroroffset = ptr - (const uschar *)pattern;
5541  return NULL;
5542  }
5543
5544/* If the anchored option was not passed, set the flag if we can determine that
5545the pattern is anchored by virtue of ^ characters or \A or anything else (such
5546as starting with .* when DOTALL is set).
5547
5548Otherwise, if we know what the first character has to be, save it, because that
5549speeds up unanchored matches no end. If not, see if we can set the
5550PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5551start with ^. and also when all branches start with .* for non-DOTALL matches.
5552*/
5553
5554if ((options & PCRE_ANCHORED) == 0)
5555  {
5556  int temp_options = options;
5557  if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))
5558    re->options |= PCRE_ANCHORED;
5559  else
5560    {
5561    if (firstbyte < 0)
5562      firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5563    if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */
5564      {
5565      int ch = firstbyte & 255;
5566      re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5567         compile_block.fcc[ch] == ch)? ch : firstbyte;
5568      re->options |= PCRE_FIRSTSET;
5569      }
5570    else if (is_startline(codestart, 0, compile_block.backref_map))
5571      re->options |= PCRE_STARTLINE;
5572    }
5573  }
5574
5575/* For an anchored pattern, we use the "required byte" only if it follows a
5576variable length item in the regex. Remove the caseless flag for non-caseable
5577bytes. */
5578
5579if (reqbyte >= 0 &&
5580     ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5581  {
5582  int ch = reqbyte & 255;
5583  re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5584    compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5585  re->options |= PCRE_REQCHSET;
5586  }
5587
5588/* Print out the compiled data for debugging */
5589
5590#ifdef DEBUG
5591
5592printf("Length = %d top_bracket = %d top_backref = %d\n",
5593  length, re->top_bracket, re->top_backref);
5594
5595if (re->options != 0)
5596  {
5597  printf("%s%s%s%s%s%s%s%s%s%s\n",
5598    ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5599    ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5600    ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5601    ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
5602    ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5603    ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5604    ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5605    ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5606    ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5607    ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5608  }
5609
5610if ((re->options & PCRE_FIRSTSET) != 0)
5611  {
5612  int ch = re->first_byte & 255;
5613  const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5614  if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5615    else printf("First char = \\x%02x%s\n", ch, caseless);
5616  }
5617
5618if ((re->options & PCRE_REQCHSET) != 0)
5619  {
5620  int ch = re->req_byte & 255;
5621  const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5622  if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5623    else printf("Req char = \\x%02x%s\n", ch, caseless);
5624  }
5625
5626print_internals(re, stdout);
5627
5628/* This check is done here in the debugging case so that the code that
5629was compiled can be seen. */
5630
5631if (code - codestart > length)
5632  {
5633  *errorptr = ERR23;
5634  (pcre_free)(re);
5635  *erroroffset = ptr - (uschar *)pattern;
5636  return NULL;
5637  }
5638#endif
5639
5640return (pcre *)re;
5641}
5642
5643
5644
5645/*************************************************
5646*          Match a back-reference                *
5647*************************************************/
5648
5649/* If a back reference hasn't been set, the length that is passed is greater
5650than the number of characters left in the string, so the match fails.
5651
5652Arguments:
5653  offset      index into the offset vector
5654  eptr        points into the subject
5655  length      length to be matched
5656  md          points to match data block
5657  ims         the ims flags
5658
5659Returns:      TRUE if matched
5660*/
5661
5662static BOOL
5663match_ref(int offset, register const uschar *eptr, int length, match_data *md,
5664  unsigned long int ims)
5665{
5666const uschar *p = md->start_subject + md->offset_vector[offset];
5667
5668#ifdef DEBUG
5669if (eptr >= md->end_subject)
5670  printf("matching subject <null>");
5671else
5672  {
5673  printf("matching subject ");
5674  pchars(eptr, length, TRUE, md);
5675  }
5676printf(" against backref ");
5677pchars(p, length, FALSE, md);
5678printf("\n");
5679#endif
5680
5681/* Always fail if not enough characters left */
5682
5683if (length > md->end_subject - eptr) return FALSE;
5684
5685/* Separate the caselesss case for speed */
5686
5687if ((ims & PCRE_CASELESS) != 0)
5688  {
5689  while (length-- > 0)
5690    if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
5691  }
5692else
5693  { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
5694
5695return TRUE;
5696}
5697
5698
#ifdef SUPPORT_UTF8
/*************************************************
*       Match character against an XCLASS        *
*************************************************/

/* This function is called from within the XCLASS code below, to decide
whether a character belongs to an extended class, that is, one which might
match values > 255. The class data starts with a flag byte, optionally followed
by a 32-byte bit map for values < 256, and then a list of items terminated by
XCL_END.

Arguments:
  c           the character
  data        points to the flag byte of the XCLASS data

Returns:      TRUE if character matches, else FALSE
*/

static BOOL
match_xclass(int c, const uschar *data)
{
const int flags = *data;
const BOOL negated = (flags & XCL_NOT) != 0;
const BOOL has_map = (flags & XCL_MAP) != 0;
int item;

/* Character values < 256 are looked up in the bit map, if one is present. A
miss here is not final, because there may be ranges that start below 256 in
the additional data, so we carry on in that case. */

if (has_map && c < 256 && (data[1 + c/8] & (1 << (c&7))) != 0)
  return !negated;   /* char found */

/* Step over the flag byte, and over the bit map if present. Then match
against the list of Unicode properties or large chars or ranges that end with
a large char. We won't ever encounter XCL_PROP or XCL_NOTPROP when UCP support
is not compiled. */

data += has_map? 33 : 1;

for (;;)
  {
  int lo, hi;

  item = *data++;
  if (item == XCL_END) break;

  if (item == XCL_SINGLE)
    {
    GETCHARINC(lo, data);
    if (c == lo) return !negated;
    }
  else if (item == XCL_RANGE)
    {
    GETCHARINC(lo, data);
    GETCHARINC(hi, data);
    if (c >= lo && c <= hi) return !negated;
    }

#ifdef SUPPORT_UCP
  else  /* XCL_PROP & XCL_NOTPROP */
    {
    int chartype, othercase;
    const int wanted = *data++;
    const int category = ucp_findchar(c, &chartype, &othercase);

    /* Values of 128 or more ask for a general category (after subtracting
    128); smaller values ask for a specific character type. */

    const BOOL has_property = (wanted >= 128)?
      (wanted - 128 == category) : (wanted == chartype);

    if (has_property == (item == XCL_PROP)) return !negated;
    }
#endif  /* SUPPORT_UCP */
  }

return negated;   /* char did not match */
}
#endif
5772
5773
/***************************************************************************
****************************************************************************
                   RECURSION IN THE match() FUNCTION

The match() function is highly recursive. Some regular expressions can cause
it to recurse thousands of times. I was writing for Unix, so I just let it
call itself recursively. This uses the stack for saving everything that has
to be saved for a recursive call. On Unix, the stack can be large, and this
works fine.

It turns out that on non-Unix systems there are problems with programs that
use a lot of stack. (This despite the fact that every last chip has oodles
of memory these days, and techniques for extending the stack have been known
for decades.) So....

There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
calls by keeping local variables that need to be preserved in blocks of memory
obtained from malloc instead of on the stack. Macros are used to achieve this
so that the actual code doesn't look very different to what it always used to.
****************************************************************************
***************************************************************************/
5796
5797
/* These versions of the macros use the stack, as normal: RMATCH is a plain
recursive call of match() and RRETURN is a plain return. */

#ifndef NO_RECURSE
#define REGISTER register
#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) rx = match(ra,rb,rc,rd,re,rf,rg)
#define RRETURN(ra) return ra
#else


/* These versions of the macros manage a private stack on the heap. Note
that the rd argument of RMATCH isn't actually used. It's the md argument of
match(), which never changes.

Both macros are wrapped in do { ... } while (0) so that, followed by a
semicolon, each one is a single statement and can safely be used as the body
of an "if" that has an "else".

RMATCH gets a new frame, remembers in the current frame where to resume (using
setjmp), copies the "arguments" into the new frame, makes it the current frame,
and jumps to HEAP_RECURSE in match(). When control comes back via longjmp, the
value of the local "frame" cannot be relied on, so it is reloaded from
md->thisframe before the result is fetched from it. If the new frame cannot be
obtained, the current incarnation gives up with PCRE_ERROR_NOMEMORY instead of
writing through a NULL pointer; like any negative result, this is passed up
through the outer incarnations. */

#define REGISTER

#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg)\
  do {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  if (setjmp(frame->Xwhere) == 0)\
    {\
    newframe->Xeptr = ra;\
    newframe->Xecode = rb;\
    newframe->Xoffset_top = rc;\
    newframe->Xims = re;\
    newframe->Xeptrb = rf;\
    newframe->Xflags = rg;\
    newframe->Xprevframe = frame;\
    frame = newframe;\
    DPRINTF(("restarting from line %d\n", __LINE__));\
    goto HEAP_RECURSE;\
    }\
  else\
    {\
    DPRINTF(("longjumped back to line %d\n", __LINE__));\
    frame = md->thisframe;\
    rx = frame->Xresult;\
    }\
  } while (0)

/* RRETURN releases the current frame. If there is a previous frame, the result
is stored in it and control goes back, via longjmp, to the RMATCH that created
the released frame; at the top level (no previous frame) it is a real return
from match(). The argument is evaluated exactly once, before "frame" is changed
and the old frame is freed, so it may safely refer to values that live in the
current frame. */

#define RRETURN(ra)\
  do {\
  int rreturn_value = (ra);\
  heapframe *oldframe = frame;\
  frame = oldframe->Xprevframe;\
  (pcre_stack_free)(oldframe);\
  if (frame != NULL)\
    {\
    frame->Xresult = rreturn_value;\
    md->thisframe = frame;\
    longjmp(frame->Xwhere, 1);\
    }\
  return rreturn_value;\
  } while (0)


/* Structure for remembering the local variables in a private frame. There is
one of these for each incarnation of match() that is in progress; the frames
are chained through Xprevframe. Inside match() each field is accessed through
a macro with the same name minus the leading X. */

typedef struct heapframe {
  struct heapframe *Xprevframe;      /* NULL marks the top-level frame */

  /* Function arguments that may change */

  const uschar *Xeptr;
  const uschar *Xecode;
  int Xoffset_top;
  unsigned long int Xims;            /* Same type as the ims argument */
  eptrblock *Xeptrb;
  int Xflags;

  /* Function local variables */

  const uschar *Xcallpat;
  const uschar *Xcharptr;
  const uschar *Xdata;
  const uschar *Xnext;
  const uschar *Xpp;
  const uschar *Xprev;
  const uschar *Xsaved_eptr;

  recursion_info Xnew_recursive;

  BOOL Xcur_is_word;
  BOOL Xcondition;
  BOOL Xminimize;
  BOOL Xprev_is_word;

  unsigned long int Xoriginal_ims;

#ifdef SUPPORT_UCP
  int Xprop_type;
  int Xprop_fail_result;
  int Xprop_category;
  int Xprop_chartype;
  int Xprop_othercase;
  int Xprop_test_against;
  int *Xprop_test_variable;
#endif

  int Xctype;
  int Xfc;
  int Xfi;
  int Xlength;
  int Xmax;
  int Xmin;
  int Xnumber;
  int Xoffset;
  int Xop;
  int Xsave_capture_last;
  int Xsave_offset1, Xsave_offset2, Xsave_offset3;
  int Xstacksave[REC_STACK_SAVE_MAX];

  eptrblock Xnewptrb;

  /* Place to pass back result, and where to jump back to */

  int  Xresult;
  jmp_buf Xwhere;

} heapframe;

#endif
5918
5919
5920/***************************************************************************
5921***************************************************************************/
5922
5923
5924
5925/*************************************************
5926*         Match from current position            *
5927*************************************************/
5928
5929/* On entry ecode points to the first opcode, and eptr to the first character
5930in the subject string, while eptrb holds the value of eptr at the start of the
5931last bracketed group - used for breaking infinite loops matching zero-length
5932strings. This function is called recursively in many circumstances. Whenever it
5933returns a negative (error) response, the outer incarnation must also return the
5934same response.
5935
5936Performance note: It might be tempting to extract commonly used fields from the
5937md structure (e.g. utf8, end_subject) into individual variables to improve
5938performance. Tests using gcc on a SPARC disproved this; in the first case, it
5939made performance worse.
5940
5941Arguments:
5942   eptr        pointer in subject
5943   ecode       position in code
5944   offset_top  current top pointer
5945   md          pointer to "static" info for the match
5946   ims         current /i, /m, and /s options
5947   eptrb       pointer to chain of blocks containing eptr at start of
5948                 brackets - for testing for empty matches
5949   flags       can contain
5950                 match_condassert - this is an assertion condition
5951                 match_isgroup - this is the start of a bracketed group
5952
5953Returns:       MATCH_MATCH if matched            )  these values are >= 0
5954               MATCH_NOMATCH if failed to match  )
5955               a negative PCRE_ERROR_xxx value if aborted by an error condition
5956                 (e.g. stopped by recursion limit)
5957*/
5958
5959static int
5960match(REGISTER const uschar *eptr, REGISTER const uschar *ecode,
5961  int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
5962  int flags)
5963{
5964/* These variables do not need to be preserved over recursion in this function,
5965so they can be ordinary variables in all cases. Mark them with "register"
5966because they are used a lot in loops. */
5967
5968register int rrc;    /* Returns from recursive calls */
5969register int i;      /* Used for loops not involving calls to RMATCH() */
5970register int c;      /* Character values not kept over RMATCH() calls */
5971
5972/* When recursion is not being used, all "local" variables that have to be
5973preserved over calls to RMATCH() are part of a "frame" which is obtained from
5974heap storage. Set up the top-level frame here; others are obtained from the
5975heap whenever RMATCH() does a "recursion". See the macro definitions above. */
5976
5977#ifdef NO_RECURSE
5978heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
5979frame->Xprevframe = NULL;            /* Marks the top level */
5980
5981/* Copy in the original argument variables */
5982
5983frame->Xeptr = eptr;
5984frame->Xecode = ecode;
5985frame->Xoffset_top = offset_top;
5986frame->Xims = ims;
5987frame->Xeptrb = eptrb;
5988frame->Xflags = flags;
5989
5990/* This is where control jumps back to to effect "recursion" */
5991
5992HEAP_RECURSE:
5993
5994/* Macros make the argument variables come from the current frame */
5995
5996#define eptr               frame->Xeptr
5997#define ecode              frame->Xecode
5998#define offset_top         frame->Xoffset_top
5999#define ims                frame->Xims
6000#define eptrb              frame->Xeptrb
6001#define flags              frame->Xflags
6002
6003/* Ditto for the local variables */
6004
6005#ifdef SUPPORT_UTF8
6006#define charptr            frame->Xcharptr
6007#endif
6008#define callpat            frame->Xcallpat
6009#define data               frame->Xdata
6010#define next               frame->Xnext
6011#define pp                 frame->Xpp
6012#define prev               frame->Xprev
6013#define saved_eptr         frame->Xsaved_eptr
6014
6015#define new_recursive      frame->Xnew_recursive
6016
6017#define cur_is_word        frame->Xcur_is_word
6018#define condition          frame->Xcondition
6019#define minimize           frame->Xminimize
6020#define prev_is_word       frame->Xprev_is_word
6021
6022#define original_ims       frame->Xoriginal_ims
6023
6024#ifdef SUPPORT_UCP
6025#define prop_type          frame->Xprop_type
6026#define prop_fail_result   frame->Xprop_fail_result
6027#define prop_category      frame->Xprop_category
6028#define prop_chartype      frame->Xprop_chartype
6029#define prop_othercase     frame->Xprop_othercase
6030#define prop_test_against  frame->Xprop_test_against
6031#define prop_test_variable frame->Xprop_test_variable
6032#endif
6033
6034#define ctype              frame->Xctype
6035#define fc                 frame->Xfc
6036#define fi                 frame->Xfi
6037#define length             frame->Xlength
6038#define max                frame->Xmax
6039#define min                frame->Xmin
6040#define number             frame->Xnumber
6041#define offset             frame->Xoffset
6042#define op                 frame->Xop
6043#define save_capture_last  frame->Xsave_capture_last
6044#define save_offset1       frame->Xsave_offset1
6045#define save_offset2       frame->Xsave_offset2
6046#define save_offset3       frame->Xsave_offset3
6047#define stacksave          frame->Xstacksave
6048
6049#define newptrb            frame->Xnewptrb
6050
6051/* When recursion is being used, local variables are allocated on the stack and
6052get preserved during recursion in the normal way. In this environment, fi and
6053i, and fc and c, can be the same variables. */
6054
6055#else
6056#define fi i
6057#define fc c
6058
6059
6060#ifdef SUPPORT_UTF8                /* Many of these variables are used ony */
6061const uschar *charptr;             /* small blocks of the code. My normal  */
6062#endif                             /* style of coding would have declared  */
6063const uschar *callpat;             /* them within each of those blocks.    */
6064const uschar *data;                /* However, in order to accommodate the */
6065const uschar *next;                /* version of this code that uses an    */
6066const uschar *pp;                  /* external "stack" implemented on the  */
6067const uschar *prev;                /* heap, it is easier to declare them   */
6068const uschar *saved_eptr;          /* all here, so the declarations can    */
6069                                   /* be cut out in a block. The only      */
6070recursion_info new_recursive;      /* declarations within blocks below are */
6071                                   /* for variables that do not have to    */
6072BOOL cur_is_word;                  /* be preserved over a recursive call   */
6073BOOL condition;                    /* to RMATCH().                         */
6074BOOL minimize;
6075BOOL prev_is_word;
6076
6077unsigned long int original_ims;
6078
6079#ifdef SUPPORT_UCP
6080int prop_type;
6081int prop_fail_result;
6082int prop_category;
6083int prop_chartype;
6084int prop_othercase;
6085int prop_test_against;
6086int *prop_test_variable;
6087#endif
6088
6089int ctype;
6090int length;
6091int max;
6092int min;
6093int number;
6094int offset;
6095int op;
6096int save_capture_last;
6097int save_offset1, save_offset2, save_offset3;
6098int stacksave[REC_STACK_SAVE_MAX];
6099
6100eptrblock newptrb;
6101#endif
6102
6103/* These statements are here to stop the compiler complaining about unitialized
6104variables. */
6105
6106#ifdef SUPPORT_UCP
6107prop_fail_result = 0;
6108prop_test_against = 0;
6109prop_test_variable = NULL;
6110#endif
6111
6112/* OK, now we can get on with the real code of the function. Recursion is
6113specified by the macros RMATCH and RRETURN. When NO_RECURSE is *not* defined,
6114these just turn into a recursive call to match() and a "return", respectively.
6115However, RMATCH isn't like a function call because it's quite a complicated
6116macro. It has to be used in one particular way. This shouldn't, however, impact
6117performance when true recursion is being used. */
6118
6119if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
6120
6121original_ims = ims;    /* Save for resetting on ')' */
6122
6123/* At the start of a bracketed group, add the current subject pointer to the
6124stack of such pointers, to be re-instated at the end of the group when we hit
6125the closing ket. When match() is called in other circumstances, we don't add to
6126this stack. */
6127
6128if ((flags & match_isgroup) != 0)
6129  {
6130  newptrb.epb_prev = eptrb;
6131  newptrb.epb_saved_eptr = eptr;
6132  eptrb = &newptrb;
6133  }
6134
6135/* Now start processing the operations. */
6136
6137for (;;)
6138  {
6139  op = *ecode;
6140  minimize = FALSE;
6141
6142  /* For partial matching, remember if we ever hit the end of the subject after
6143  matching at least one subject character. */
6144
6145  if (md->partial &&
6146      eptr >= md->end_subject &&
6147      eptr > md->start_match)
6148    md->hitend = TRUE;
6149
6150  /* Opening capturing bracket. If there is space in the offset vector, save
6151  the current subject position in the working slot at the top of the vector. We
6152  mustn't change the current values of the data slot, because they may be set
6153  from a previous iteration of this group, and be referred to by a reference
6154  inside the group.
6155
6156  If the bracket fails to match, we need to restore this value and also the
6157  values of the final offsets, in case they were set by a previous iteration of
6158  the same bracket.
6159
6160  If there isn't enough space in the offset vector, treat this as if it were a
6161  non-capturing bracket. Don't worry about setting the flag for the error case
6162  here; that is handled in the code for KET. */
6163
6164  if (op > OP_BRA)
6165    {
6166    number = op - OP_BRA;
6167
6168    /* For extended extraction brackets (large number), we have to fish out the
6169    number from a dummy opcode at the start. */
6170
6171    if (number > EXTRACT_BASIC_MAX)
6172      number = GET2(ecode, 2+LINK_SIZE);
6173    offset = number << 1;
6174
6175#ifdef DEBUG
6176    printf("start bracket %d subject=", number);
6177    pchars(eptr, 16, TRUE, md);
6178    printf("\n");
6179#endif
6180
6181    if (offset < md->offset_max)
6182      {
6183      save_offset1 = md->offset_vector[offset];
6184      save_offset2 = md->offset_vector[offset+1];
6185      save_offset3 = md->offset_vector[md->offset_end - number];
6186      save_capture_last = md->capture_last;
6187
6188      DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
6189      md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
6190
6191      do
6192        {
6193        RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
6194          match_isgroup);
6195        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6196        md->capture_last = save_capture_last;
6197        ecode += GET(ecode, 1);
6198        }
6199      while (*ecode == OP_ALT);
6200
6201      DPRINTF(("bracket %d failed\n", number));
6202
6203      md->offset_vector[offset] = save_offset1;
6204      md->offset_vector[offset+1] = save_offset2;
6205      md->offset_vector[md->offset_end - number] = save_offset3;
6206
6207      RRETURN(MATCH_NOMATCH);
6208      }
6209
6210    /* Insufficient room for saving captured contents */
6211
6212    else op = OP_BRA;
6213    }
6214
6215  /* Other types of node can be handled by a switch */
6216
6217  switch(op)
6218    {
6219    case OP_BRA:     /* Non-capturing bracket: optimized */
6220    DPRINTF(("start bracket 0\n"));
6221    do
6222      {
6223      RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
6224        match_isgroup);
6225      if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6226      ecode += GET(ecode, 1);
6227      }
6228    while (*ecode == OP_ALT);
6229    DPRINTF(("bracket 0 failed\n"));
6230    RRETURN(MATCH_NOMATCH);
6231
6232    /* Conditional group: compilation checked that there are no more than
6233    two branches. If the condition is false, skipping the first branch takes us
6234    past the end if there is only one branch, but that's OK because that is
6235    exactly what going to the ket would do. */
6236
6237    case OP_COND:
6238    if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
6239      {
6240      offset = GET2(ecode, LINK_SIZE+2) << 1;  /* Doubled ref number */
6241      condition = (offset == CREF_RECURSE * 2)?
6242        (md->recursive != NULL) :
6243        (offset < offset_top && md->offset_vector[offset] >= 0);
6244      RMATCH(rrc, eptr, ecode + (condition?
6245        (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),
6246        offset_top, md, ims, eptrb, match_isgroup);
6247      RRETURN(rrc);
6248      }
6249
6250    /* The condition is an assertion. Call match() to evaluate it - setting
6251    the final argument TRUE causes it to stop at the end of an assertion. */
6252
6253    else
6254      {
6255      RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
6256          match_condassert | match_isgroup);
6257      if (rrc == MATCH_MATCH)
6258        {
6259        ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
6260        while (*ecode == OP_ALT) ecode += GET(ecode, 1);
6261        }
6262      else if (rrc != MATCH_NOMATCH)
6263        {
6264        RRETURN(rrc);         /* Need braces because of following else */
6265        }
6266      else ecode += GET(ecode, 1);
6267      RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
6268        match_isgroup);
6269      RRETURN(rrc);
6270      }
6271    /* Control never reaches here */
6272
6273    /* Skip over conditional reference or large extraction number data if
6274    encountered. */
6275
6276    case OP_CREF:
6277    case OP_BRANUMBER:
6278    ecode += 3;
6279    break;
6280
6281    /* End of the pattern. If we are in a recursion, we should restore the
6282    offsets appropriately and continue from after the call. */
6283
6284    case OP_END:
6285    if (md->recursive != NULL && md->recursive->group_num == 0)
6286      {
6287      recursion_info *rec = md->recursive;
6288      DPRINTF(("Hit the end in a (?0) recursion\n"));
6289      md->recursive = rec->prevrec;
6290      memmove(md->offset_vector, rec->offset_save,
6291        rec->saved_max * sizeof(int));
6292      md->start_match = rec->save_start;
6293      ims = original_ims;
6294      ecode = rec->after_call;
6295      break;
6296      }
6297
6298    /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
6299    string - backtracking will then try other alternatives, if any. */
6300
6301    if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
6302    md->end_match_ptr = eptr;          /* Record where we ended */
6303    md->end_offset_top = offset_top;   /* and how many extracts were taken */
6304    RRETURN(MATCH_MATCH);
6305
6306    /* Change option settings */
6307
6308    case OP_OPT:
6309    ims = ecode[1];
6310    ecode += 2;
6311    DPRINTF(("ims set to %02lx\n", ims));
6312    break;
6313
6314    /* Assertion brackets. Check the alternative branches in turn - the
6315    matching won't pass the KET for an assertion. If any one branch matches,
6316    the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
6317    start of each branch to move the current point backwards, so the code at
6318    this level is identical to the lookahead case. */
6319
6320    case OP_ASSERT:
6321    case OP_ASSERTBACK:
6322    do
6323      {
6324      RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
6325        match_isgroup);
6326      if (rrc == MATCH_MATCH) break;
6327      if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6328      ecode += GET(ecode, 1);
6329      }
6330    while (*ecode == OP_ALT);
6331    if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
6332
6333    /* If checking an assertion for a condition, return MATCH_MATCH. */
6334
6335    if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
6336
6337    /* Continue from after the assertion, updating the offsets high water
6338    mark, since extracts may have been taken during the assertion. */
6339
6340    do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6341    ecode += 1 + LINK_SIZE;
6342    offset_top = md->end_offset_top;
6343    continue;
6344
6345    /* Negative assertion: all branches must fail to match */
6346
6347    case OP_ASSERT_NOT:
6348    case OP_ASSERTBACK_NOT:
6349    do
6350      {
6351      RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
6352        match_isgroup);
6353      if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
6354      if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6355      ecode += GET(ecode,1);
6356      }
6357    while (*ecode == OP_ALT);
6358
6359    if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
6360
6361    ecode += 1 + LINK_SIZE;
6362    continue;
6363
6364    /* Move the subject pointer back. This occurs only at the start of
6365    each branch of a lookbehind assertion. If we are too close to the start to
6366    move back, this match function fails. When working with UTF-8 we move
6367    back a number of characters, not bytes. */
6368
6369    case OP_REVERSE:
6370#ifdef SUPPORT_UTF8
6371    if (md->utf8)
6372      {
6373      c = GET(ecode,1);
6374      for (i = 0; i < c; i++)
6375        {
6376        eptr--;
6377        if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
6378        BACKCHAR(eptr)
6379        }
6380      }
6381    else
6382#endif
6383
6384    /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
6385
6386      {
6387      eptr -= GET(ecode,1);
6388      if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
6389      }
6390
6391    /* Skip to next op code */
6392
6393    ecode += 1 + LINK_SIZE;
6394    break;
6395
6396    /* The callout item calls an external function, if one is provided, passing
6397    details of the match so far. This is mainly for debugging, though the
6398    function is able to force a failure. */
6399
6400    case OP_CALLOUT:
6401    if (pcre_callout != NULL)
6402      {
6403      pcre_callout_block cb;
6404      cb.version          = 1;   /* Version 1 of the callout block */
6405      cb.callout_number   = ecode[1];
6406      cb.offset_vector    = md->offset_vector;
6407      cb.subject          = (const char *)md->start_subject;
6408      cb.subject_length   = md->end_subject - md->start_subject;
6409      cb.start_match      = md->start_match - md->start_subject;
6410      cb.current_position = eptr - md->start_subject;
6411      cb.pattern_position = GET(ecode, 2);
6412      cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
6413      cb.capture_top      = offset_top/2;
6414      cb.capture_last     = md->capture_last;
6415      cb.callout_data     = md->callout_data;
6416      if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
6417      if (rrc < 0) RRETURN(rrc);
6418      }
6419    ecode += 2 + 2*LINK_SIZE;
6420    break;
6421
6422    /* Recursion either matches the current regex, or some subexpression. The
6423    offset data is the offset to the starting bracket from the start of the
6424    whole pattern. (This is so that it works from duplicated subpatterns.)
6425
6426    If there are any capturing brackets started but not finished, we have to
6427    save their starting points and reinstate them after the recursion. However,
6428    we don't know how many such there are (offset_top records the completed
6429    total) so we just have to save all the potential data. There may be up to
6430    65535 such values, which is too large to put on the stack, but using malloc
6431    for small numbers seems expensive. As a compromise, the stack is used when
6432    there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
6433    is used. If the malloc fails, the offsets cannot be saved and so could not
6434    be reinstated after the recursion; in that case the recursion is not
6435    attempted and the error code PCRE_ERROR_NOMEMORY is returned instead.
6436
6437    There are also other values that have to be saved. We use a chained
6438    sequence of blocks that actually live on the stack. Thanks to Robin Houston
6439    for the original version of this logic. */
6440
6441    case OP_RECURSE:
6442      {
6443      callpat = md->start_code + GET(ecode, 1);
6444      new_recursive.group_num = *callpat - OP_BRA;
6445
6446      /* For extended extraction brackets (large number), we have to fish out
6447      the number from a dummy opcode at the start. */
6448
6449      if (new_recursive.group_num > EXTRACT_BASIC_MAX)
6450        new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
6451
6452      /* Add to "recursing stack" */
6453
6454      new_recursive.prevrec = md->recursive;
6455      md->recursive = &new_recursive;
6456
6457      /* Find where to continue from afterwards */
6458
6459      ecode += 1 + LINK_SIZE;
6460      new_recursive.after_call = ecode;
6461
6462      /* Now save the offset data. */
6463
6464      new_recursive.saved_max = md->offset_end;
6465      if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
6466        new_recursive.offset_save = stacksave;
6467      else
6468        {
6469        new_recursive.offset_save =
6470          (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
6471        if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
6472        }
6473
6474      memcpy(new_recursive.offset_save, md->offset_vector,
6475            new_recursive.saved_max * sizeof(int));
6476      new_recursive.save_start = md->start_match;
6477      md->start_match = eptr;
6478
6479      /* OK, now we can do the recursion. For each top-level alternative we
6480      restore the offset and recursion data. */
6481
6482      DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
6483      do
6484        {
6485        RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
6486            eptrb, match_isgroup);
6487        if (rrc == MATCH_MATCH)
6488          {
6489          md->recursive = new_recursive.prevrec;
6490          if (new_recursive.offset_save != stacksave)
6491            (pcre_free)(new_recursive.offset_save);
6492          RRETURN(MATCH_MATCH);
6493          }
6494        else if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6495
6496        md->recursive = &new_recursive;
6497        memcpy(md->offset_vector, new_recursive.offset_save,
6498            new_recursive.saved_max * sizeof(int));
6499        callpat += GET(callpat, 1);
6500        }
6501      while (*callpat == OP_ALT);
6502
6503      DPRINTF(("Recursion didn't match\n"));
6504      md->recursive = new_recursive.prevrec;
6505      if (new_recursive.offset_save != stacksave)
6506        (pcre_free)(new_recursive.offset_save);
6507      RRETURN(MATCH_NOMATCH);
6508      }
6509    /* Control never reaches here */
6510
6511    /* "Once" brackets are like assertion brackets except that after a match,
6512    the point in the subject string is not moved back. Thus there can never be
6513    a move back into the brackets. Friedl calls these "atomic" subpatterns.
6514    Check the alternative branches in turn - the matching won't pass the KET
6515    for this kind of subpattern. If any one branch matches, we carry on as at
6516    the end of a normal bracket, leaving the subject pointer. */
6517
6518    case OP_ONCE:
6519      {
6520      prev = ecode;
6521      saved_eptr = eptr;
6522
6523      do
6524        {
6525        RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
6526          eptrb, match_isgroup);
6527        if (rrc == MATCH_MATCH) break;
6528        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6529        ecode += GET(ecode,1);
6530        }
6531      while (*ecode == OP_ALT);
6532
6533      /* If hit the end of the group (which could be repeated), fail */
6534
6535      if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
6536
6537      /* Continue as from after the assertion, updating the offsets high water
6538      mark, since extracts may have been taken. */
6539
6540      do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6541
6542      offset_top = md->end_offset_top;
6543      eptr = md->end_match_ptr;
6544
6545      /* For a non-repeating ket, just continue at this level. This also
6546      happens for a repeating ket if no characters were matched in the group.
6547      This is the forcible breaking of infinite loops as implemented in Perl
6548      5.005. If there is an options reset, it will get obeyed in the normal
6549      course of events. */
6550
6551      if (*ecode == OP_KET || eptr == saved_eptr)
6552        {
6553        ecode += 1+LINK_SIZE;
6554        break;
6555        }
6556
6557      /* The repeating kets try the rest of the pattern or restart from the
6558      preceding bracket, in the appropriate order. We need to reset any options
6559      that changed within the bracket before re-running it, so check the next
6560      opcode. */
6561
6562      if (ecode[1+LINK_SIZE] == OP_OPT)
6563        {
6564        ims = (ims & ~PCRE_IMS) | ecode[4];
6565        DPRINTF(("ims set to %02lx at group repeat\n", ims));
6566        }
6567
6568      if (*ecode == OP_KETRMIN)
6569        {
6570        RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
6571        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6572        RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6573        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6574        }
6575      else  /* OP_KETRMAX */
6576        {
6577        RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6578        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6579        RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
6580        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6581        }
6582      }
6583    RRETURN(MATCH_NOMATCH);
6584
6585    /* An alternation is the end of a branch; scan along to find the end of the
6586    bracketed group and go to there. */
6587
6588    case OP_ALT:
6589    do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6590    break;
6591
6592    /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
6593    that it may occur zero times. It may repeat infinitely, or not at all -
6594    i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
6595    repeat limits are compiled as a number of copies, with the optional ones
6596    preceded by BRAZERO or BRAMINZERO. */
6597
6598    case OP_BRAZERO:
6599      {
6600      next = ecode+1;
6601      RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup);
6602      if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6603      do next += GET(next,1); while (*next == OP_ALT);
6604      ecode = next + 1+LINK_SIZE;
6605      }
6606    break;
6607
6608    case OP_BRAMINZERO:
6609      {
6610      next = ecode+1;
6611      do next += GET(next,1); while (*next == OP_ALT);
6612      RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
6613        match_isgroup);
6614      if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6615      ecode++;
6616      }
6617    break;
6618
6619    /* End of a group, repeated or non-repeating. If we are at the end of
6620    an assertion "group", stop matching and return MATCH_MATCH, but record the
6621    current high water mark for use by positive assertions. Do this also
6622    for the "once" (non-backing-up) groups. */
6623
6624    case OP_KET:
6625    case OP_KETRMIN:
6626    case OP_KETRMAX:
6627      {
6628      prev = ecode - GET(ecode, 1);
6629      saved_eptr = eptrb->epb_saved_eptr;
6630
6631      /* Back up the stack of bracket start pointers. */
6632
6633      eptrb = eptrb->epb_prev;
6634
6635      if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
6636          *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
6637          *prev == OP_ONCE)
6638        {
6639        md->end_match_ptr = eptr;      /* For ONCE */
6640        md->end_offset_top = offset_top;
6641        RRETURN(MATCH_MATCH);
6642        }
6643
6644      /* In all other cases except a conditional group we have to check the
6645      group number back at the start and if necessary complete handling an
6646      extraction by setting the offsets and bumping the high water mark. */
6647
6648      if (*prev != OP_COND)
6649        {
6650        number = *prev - OP_BRA;
6651
6652        /* For extended extraction brackets (large number), we have to fish out
6653        the number from a dummy opcode at the start. */
6654
6655        if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
6656        offset = number << 1;
6657
6658#ifdef DEBUG
6659        printf("end bracket %d", number);
6660        printf("\n");
6661#endif
6662
6663        /* Test for a numbered group. This includes groups called as a result
6664        of recursion. Note that whole-pattern recursion is coded as a recurse
6665        into group 0, so it won't be picked up here. Instead, we catch it when
6666        the OP_END is reached. */
6667
6668        if (number > 0)
6669          {
6670          md->capture_last = number;
6671          if (offset >= md->offset_max) md->offset_overflow = TRUE; else
6672            {
6673            md->offset_vector[offset] =
6674              md->offset_vector[md->offset_end - number];
6675            md->offset_vector[offset+1] = eptr - md->start_subject;
6676            if (offset_top <= offset) offset_top = offset + 2;
6677            }
6678
6679          /* Handle a recursively called group. Restore the offsets
6680          appropriately and continue from after the call. */
6681
6682          if (md->recursive != NULL && md->recursive->group_num == number)
6683            {
6684            recursion_info *rec = md->recursive;
6685            DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
6686            md->recursive = rec->prevrec;
6687            md->start_match = rec->save_start;
6688            memcpy(md->offset_vector, rec->offset_save,
6689              rec->saved_max * sizeof(int));
6690            ecode = rec->after_call;
6691            ims = original_ims;
6692            break;
6693            }
6694          }
6695        }
6696
6697      /* Reset the value of the ims flags, in case they got changed during
6698      the group. */
6699
6700      ims = original_ims;
6701      DPRINTF(("ims reset to %02lx\n", ims));
6702
6703      /* For a non-repeating ket, just continue at this level. This also
6704      happens for a repeating ket if no characters were matched in the group.
6705      This is the forcible breaking of infinite loops as implemented in Perl
6706      5.005. If there is an options reset, it will get obeyed in the normal
6707      course of events. */
6708
6709      if (*ecode == OP_KET || eptr == saved_eptr)
6710        {
6711        ecode += 1 + LINK_SIZE;
6712        break;
6713        }
6714
6715      /* The repeating kets try the rest of the pattern or restart from the
6716      preceding bracket, in the appropriate order. */
6717
6718      if (*ecode == OP_KETRMIN)
6719        {
6720        RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
6721        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6722        RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6723        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6724        }
6725      else  /* OP_KETRMAX */
6726        {
6727        RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6728        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6729        RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
6730        if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6731        }
6732      }
6733
6734    RRETURN(MATCH_NOMATCH);
6735
6736    /* Start of subject unless notbol, or after internal newline if multiline */
6737
6738    case OP_CIRC:
6739    if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
6740    if ((ims & PCRE_MULTILINE) != 0)
6741      {
6742      if (eptr != md->start_subject && eptr[-1] != NEWLINE)
6743        RRETURN(MATCH_NOMATCH);
6744      ecode++;
6745      break;
6746      }
6747    /* ... else fall through */
6748
6749    /* Start of subject assertion */
6750
6751    case OP_SOD:
6752    if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
6753    ecode++;
6754    break;
6755
6756    /* Start of match assertion */
6757
6758    case OP_SOM:
6759    if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
6760    ecode++;
6761    break;
6762
6763    /* Assert before internal newline if multiline, or before a terminating
6764    newline unless endonly is set, else end of subject unless noteol is set. */
6765
6766    case OP_DOLL:
6767    if ((ims & PCRE_MULTILINE) != 0)
6768      {
6769      if (eptr < md->end_subject)
6770        { if (*eptr != NEWLINE) RRETURN(MATCH_NOMATCH); }
6771      else
6772        { if (md->noteol) RRETURN(MATCH_NOMATCH); }
6773      ecode++;
6774      break;
6775      }
6776    else
6777      {
6778      if (md->noteol) RRETURN(MATCH_NOMATCH);
6779      if (!md->endonly)
6780        {
6781        if (eptr < md->end_subject - 1 ||
6782           (eptr == md->end_subject - 1 && *eptr != NEWLINE))
6783          RRETURN(MATCH_NOMATCH);
6784        ecode++;
6785        break;
6786        }
6787      }
6788    /* ... else fall through */
6789
6790    /* End of subject assertion (\z) */
6791
6792    case OP_EOD:
6793    if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
6794    ecode++;
6795    break;
6796
6797    /* End of subject or ending \n assertion (\Z) */
6798
6799    case OP_EODN:
6800    if (eptr < md->end_subject - 1 ||
6801       (eptr == md->end_subject - 1 && *eptr != NEWLINE)) RRETURN(MATCH_NOMATCH);
6802    ecode++;
6803    break;
6804
6805    /* Word boundary assertions */
6806
6807    case OP_NOT_WORD_BOUNDARY:
6808    case OP_WORD_BOUNDARY:
6809      {
6810
6811      /* Find out if the previous and current characters are "word" characters.
6812      It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
6813      be "non-word" characters. */
6814
6815#ifdef SUPPORT_UTF8
6816      if (md->utf8)
6817        {
6818        if (eptr == md->start_subject) prev_is_word = FALSE; else
6819          {
6820          const uschar *lastptr = eptr - 1;
6821          while((*lastptr & 0xc0) == 0x80) lastptr--;
6822          GETCHAR(c, lastptr);
6823          prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
6824          }
6825        if (eptr >= md->end_subject) cur_is_word = FALSE; else
6826          {
6827          GETCHAR(c, eptr);
6828          cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
6829          }
6830        }
6831      else
6832#endif
6833
6834      /* More streamlined when not in UTF-8 mode */
6835
6836        {
6837        prev_is_word = (eptr != md->start_subject) &&
6838          ((md->ctypes[eptr[-1]] & ctype_word) != 0);
6839        cur_is_word = (eptr < md->end_subject) &&
6840          ((md->ctypes[*eptr] & ctype_word) != 0);
6841        }
6842
6843      /* Now see if the situation is what we want */
6844
6845      if ((*ecode++ == OP_WORD_BOUNDARY)?
6846           cur_is_word == prev_is_word : cur_is_word != prev_is_word)
6847        RRETURN(MATCH_NOMATCH);
6848      }
6849    break;
6850
6851    /* Match a single character type; inline for speed */
6852
6853    case OP_ANY:
6854    if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
6855      RRETURN(MATCH_NOMATCH);
6856    if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
6857#ifdef SUPPORT_UTF8
6858    if (md->utf8)
6859      while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
6860#endif
6861    ecode++;
6862    break;
6863
6864    /* Match a single byte, even in UTF-8 mode. This opcode really does match
6865    any byte, even newline, independent of the setting of PCRE_DOTALL. */
6866
6867    case OP_ANYBYTE:
6868    if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
6869    ecode++;
6870    break;
6871
6872    case OP_NOT_DIGIT:
6873    if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6874    GETCHARINCTEST(c, eptr);
6875    if (
6876#ifdef SUPPORT_UTF8
6877       c < 256 &&
6878#endif
6879       (md->ctypes[c] & ctype_digit) != 0
6880       )
6881      RRETURN(MATCH_NOMATCH);
6882    ecode++;
6883    break;
6884
6885    case OP_DIGIT:
6886    if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6887    GETCHARINCTEST(c, eptr);
6888    if (
6889#ifdef SUPPORT_UTF8
6890       c >= 256 ||
6891#endif
6892       (md->ctypes[c] & ctype_digit) == 0
6893       )
6894      RRETURN(MATCH_NOMATCH);
6895    ecode++;
6896    break;
6897
6898    case OP_NOT_WHITESPACE:
6899    if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6900    GETCHARINCTEST(c, eptr);
6901    if (
6902#ifdef SUPPORT_UTF8
6903       c < 256 &&
6904#endif
6905       (md->ctypes[c] & ctype_space) != 0
6906       )
6907      RRETURN(MATCH_NOMATCH);
6908    ecode++;
6909    break;
6910
6911    case OP_WHITESPACE:
6912    if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6913    GETCHARINCTEST(c, eptr);
6914    if (
6915#ifdef SUPPORT_UTF8
6916       c >= 256 ||
6917#endif
6918       (md->ctypes[c] & ctype_space) == 0
6919       )
6920      RRETURN(MATCH_NOMATCH);
6921    ecode++;
6922    break;
6923
6924    case OP_NOT_WORDCHAR:
6925    if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6926    GETCHARINCTEST(c, eptr);
6927    if (
6928#ifdef SUPPORT_UTF8
6929       c < 256 &&
6930#endif
6931       (md->ctypes[c] & ctype_word) != 0
6932       )
6933      RRETURN(MATCH_NOMATCH);
6934    ecode++;
6935    break;
6936
6937    case OP_WORDCHAR:
6938    if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6939    GETCHARINCTEST(c, eptr);
6940    if (
6941#ifdef SUPPORT_UTF8
6942       c >= 256 ||
6943#endif
6944       (md->ctypes[c] & ctype_word) == 0
6945       )
6946      RRETURN(MATCH_NOMATCH);
6947    ecode++;
6948    break;
6949
6950#ifdef SUPPORT_UCP
6951    /* Check the next character by Unicode property. We will get here only
6952    if the support is in the binary; otherwise a compile-time error occurs. */
6953
6954    case OP_PROP:
6955    case OP_NOTPROP:
6956    if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6957    GETCHARINCTEST(c, eptr);
6958      {
6959      int chartype, rqdtype;
6960      int othercase;
6961      int category = ucp_findchar(c, &chartype, &othercase);
6962
6963      rqdtype = *(++ecode);
6964      ecode++;
6965
6966      if (rqdtype >= 128)
6967        {
6968        if ((rqdtype - 128 != category) == (op == OP_PROP))
6969          RRETURN(MATCH_NOMATCH);
6970        }
6971      else
6972        {
6973        if ((rqdtype != chartype) == (op == OP_PROP))
6974          RRETURN(MATCH_NOMATCH);
6975        }
6976      }
6977    break;
6978
6979    /* Match an extended Unicode sequence. We will get here only if the support
6980    is in the binary; otherwise a compile-time error occurs. */
6981
6982    case OP_EXTUNI:
6983    if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6984    GETCHARINCTEST(c, eptr);
6985      {
6986      int chartype;
6987      int othercase;
6988      int category = ucp_findchar(c, &chartype, &othercase);
6989      if (category == ucp_M) RRETURN(MATCH_NOMATCH);
6990      while (eptr < md->end_subject)
6991        {
6992        int len = 1;
6993        if (!md->utf8) c = *eptr; else
6994          {
6995          GETCHARLEN(c, eptr, len);
6996          }
6997        category = ucp_findchar(c, &chartype, &othercase);
6998        if (category != ucp_M) break;
6999        eptr += len;
7000        }
7001      }
7002    ecode++;
7003    break;
7004#endif
7005
7006
7007    /* Match a back reference, possibly repeatedly. Look past the end of the
7008    item to see if there is repeat information following. The code is similar
7009    to that for character classes, but repeated for efficiency. Then obey
7010    similar code to character type repeats - written out again for speed.
7011    However, if the referenced string is the empty string, always treat
7012    it as matched, any number of times (otherwise there could be infinite
7013    loops). */
7014
7015    case OP_REF:
7016      {
7017      offset = GET2(ecode, 1) << 1;               /* Doubled ref number */
7018      ecode += 3;                                 /* Advance past item */
7019
7020      /* If the reference is unset, set the length to be longer than the amount
7021      of subject left; this ensures that every attempt at a match fails. We
7022      can't just fail here, because of the possibility of quantifiers with zero
7023      minima. */
7024
7025      length = (offset >= offset_top || md->offset_vector[offset] < 0)?
7026        md->end_subject - eptr + 1 :
7027        md->offset_vector[offset+1] - md->offset_vector[offset];
7028
7029      /* Set up for repetition, or handle the non-repeated case */
7030
7031      switch (*ecode)
7032        {
7033        case OP_CRSTAR:
7034        case OP_CRMINSTAR:
7035        case OP_CRPLUS:
7036        case OP_CRMINPLUS:
7037        case OP_CRQUERY:
7038        case OP_CRMINQUERY:
7039        c = *ecode++ - OP_CRSTAR;
7040        minimize = (c & 1) != 0;
7041        min = rep_min[c];                 /* Pick up values from tables; */
7042        max = rep_max[c];                 /* zero for max => infinity */
7043        if (max == 0) max = INT_MAX;
7044        break;
7045
7046        case OP_CRRANGE:
7047        case OP_CRMINRANGE:
7048        minimize = (*ecode == OP_CRMINRANGE);
7049        min = GET2(ecode, 1);
7050        max = GET2(ecode, 3);
7051        if (max == 0) max = INT_MAX;
7052        ecode += 5;
7053        break;
7054
7055        default:               /* No repeat follows */
7056        if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
7057        eptr += length;
7058        continue;              /* With the main loop */
7059        }
7060
7061      /* If the length of the reference is zero, just continue with the
7062      main loop. */
7063
7064      if (length == 0) continue;
7065
7066      /* First, ensure the minimum number of matches are present. We get back
7067      the length of the reference string explicitly rather than passing the
7068      address of eptr, so that eptr can be a register variable. */
7069
7070      for (i = 1; i <= min; i++)
7071        {
7072        if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
7073        eptr += length;
7074        }
7075
7076      /* If min = max, continue at the same level without recursion.
7077      They are not both allowed to be zero. */
7078
7079      if (min == max) continue;
7080
7081      /* If minimizing, keep trying and advancing the pointer */
7082
7083      if (minimize)
7084        {
7085        for (fi = min;; fi++)
7086          {
7087          RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7088          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7089          if (fi >= max || !match_ref(offset, eptr, length, md, ims))
7090            RRETURN(MATCH_NOMATCH);
7091          eptr += length;
7092          }
7093        /* Control never gets here */
7094        }
7095
7096      /* If maximizing, find the longest string and work backwards */
7097
7098      else
7099        {
7100        pp = eptr;
7101        for (i = min; i < max; i++)
7102          {
7103          if (!match_ref(offset, eptr, length, md, ims)) break;
7104          eptr += length;
7105          }
7106        while (eptr >= pp)
7107          {
7108          RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7109          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7110          eptr -= length;
7111          }
7112        RRETURN(MATCH_NOMATCH);
7113        }
7114      }
7115    /* Control never gets here */
7116
7117
7118
7119    /* Match a bit-mapped character class, possibly repeatedly. This op code is
7120    used when all the characters in the class have values in the range 0-255,
7121    and either the matching is caseful, or the characters are in the range
7122    0-127 when UTF-8 processing is enabled. The only difference between
7123    OP_CLASS and OP_NCLASS occurs when a data character outside the range is
7124    encountered.
7125
7126    First, look past the end of the item to see if there is repeat information
7127    following. Then obey similar code to character type repeats - written out
7128    again for speed. */
7129
7130    case OP_NCLASS:
7131    case OP_CLASS:
7132      {
7133      data = ecode + 1;                /* Save for matching */
7134      ecode += 33;                     /* Advance past the item */
7135
7136      switch (*ecode)
7137        {
7138        case OP_CRSTAR:
7139        case OP_CRMINSTAR:
7140        case OP_CRPLUS:
7141        case OP_CRMINPLUS:
7142        case OP_CRQUERY:
7143        case OP_CRMINQUERY:
7144        c = *ecode++ - OP_CRSTAR;
7145        minimize = (c & 1) != 0;
7146        min = rep_min[c];                 /* Pick up values from tables; */
7147        max = rep_max[c];                 /* zero for max => infinity */
7148        if (max == 0) max = INT_MAX;
7149        break;
7150
7151        case OP_CRRANGE:
7152        case OP_CRMINRANGE:
7153        minimize = (*ecode == OP_CRMINRANGE);
7154        min = GET2(ecode, 1);
7155        max = GET2(ecode, 3);
7156        if (max == 0) max = INT_MAX;
7157        ecode += 5;
7158        break;
7159
7160        default:               /* No repeat follows */
7161        min = max = 1;
7162        break;
7163        }
7164
7165      /* First, ensure the minimum number of matches are present. */
7166
7167#ifdef SUPPORT_UTF8
7168      /* UTF-8 mode */
7169      if (md->utf8)
7170        {
7171        for (i = 1; i <= min; i++)
7172          {
7173          if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7174          GETCHARINC(c, eptr);
7175          if (c > 255)
7176            {
7177            if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
7178            }
7179          else
7180            {
7181            if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7182            }
7183          }
7184        }
7185      else
7186#endif
7187      /* Not UTF-8 mode */
7188        {
7189        for (i = 1; i <= min; i++)
7190          {
7191          if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7192          c = *eptr++;
7193          if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7194          }
7195        }
7196
7197      /* If max == min we can continue with the main loop without the
7198      need to recurse. */
7199
7200      if (min == max) continue;
7201
7202      /* If minimizing, keep testing the rest of the expression and advancing
7203      the pointer while it matches the class. */
7204
7205      if (minimize)
7206        {
7207#ifdef SUPPORT_UTF8
7208        /* UTF-8 mode */
7209        if (md->utf8)
7210          {
7211          for (fi = min;; fi++)
7212            {
7213            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7214            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7215            if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7216            GETCHARINC(c, eptr);
7217            if (c > 255)
7218              {
7219              if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
7220              }
7221            else
7222              {
7223              if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7224              }
7225            }
7226          }
7227        else
7228#endif
7229        /* Not UTF-8 mode */
7230          {
7231          for (fi = min;; fi++)
7232            {
7233            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7234            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7235            if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7236            c = *eptr++;
7237            if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7238            }
7239          }
7240        /* Control never gets here */
7241        }
7242
7243      /* If maximizing, find the longest possible run, then work backwards. */
7244
7245      else
7246        {
7247        pp = eptr;
7248
7249#ifdef SUPPORT_UTF8
7250        /* UTF-8 mode */
7251        if (md->utf8)
7252          {
7253          for (i = min; i < max; i++)
7254            {
7255            int len = 1;
7256            if (eptr >= md->end_subject) break;
7257            GETCHARLEN(c, eptr, len);
7258            if (c > 255)
7259              {
7260              if (op == OP_CLASS) break;
7261              }
7262            else
7263              {
7264              if ((data[c/8] & (1 << (c&7))) == 0) break;
7265              }
7266            eptr += len;
7267            }
7268          for (;;)
7269            {
7270            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7271            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7272            if (eptr-- == pp) break;        /* Stop if tried at original pos */
7273            BACKCHAR(eptr);
7274            }
7275          }
7276        else
7277#endif
7278          /* Not UTF-8 mode */
7279          {
7280          for (i = min; i < max; i++)
7281            {
7282            if (eptr >= md->end_subject) break;
7283            c = *eptr;
7284            if ((data[c/8] & (1 << (c&7))) == 0) break;
7285            eptr++;
7286            }
7287          while (eptr >= pp)
7288            {
7289            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7290            eptr--;
7291            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7292            }
7293          }
7294
7295        RRETURN(MATCH_NOMATCH);
7296        }
7297      }
7298    /* Control never gets here */
7299
7300
7301    /* Match an extended character class. This opcode is encountered only
7302    in UTF-8 mode, because that's the only time it is compiled. */
7303
7304#ifdef SUPPORT_UTF8
7305    case OP_XCLASS:
7306      {
          /* Match one extended class item, optionally followed by a repeat
          operator. data is set to the bytes just after the opcode and its link
          field (they are what match_xclass() is given below), and ecode is
          moved on by the length stored at offset 1 so that it addresses
          whatever follows the class. */

7307      data = ecode + 1 + LINK_SIZE;                /* Save for matching */
7308      ecode += GET(ecode, 1);                      /* Advance past the item */
7309
          /* Decode the repeat operator, if any, into min/max/minimize. */

7310      switch (*ecode)
7311        {
7312        case OP_CRSTAR:
7313        case OP_CRMINSTAR:
7314        case OP_CRPLUS:
7315        case OP_CRMINPLUS:
7316        case OP_CRQUERY:
7317        case OP_CRMINQUERY:
            /* c is the opcode's offset from OP_CRSTAR. An odd offset selects the
            minimizing variant, and the same offset indexes the rep_min and
            rep_max tables. */
7318        c = *ecode++ - OP_CRSTAR;
7319        minimize = (c & 1) != 0;
7320        min = rep_min[c];                 /* Pick up values from tables; */
7321        max = rep_max[c];                 /* zero for max => infinity */
7322        if (max == 0) max = INT_MAX;
7323        break;
7324
7325        case OP_CRRANGE:
7326        case OP_CRMINRANGE:
            /* Explicit range: min and max are read with GET2 at offsets 1 and 3,
            and the operator occupies 5 bytes in all. A max of zero again means
            "no upper limit". */
7327        minimize = (*ecode == OP_CRMINRANGE);
7328        min = GET2(ecode, 1);
7329        max = GET2(ecode, 3);
7330        if (max == 0) max = INT_MAX;
7331        ecode += 5;
7332        break;
7333
7334        default:               /* No repeat follows */
7335        min = max = 1;
7336        break;
7337        }
7338
7339      /* First, ensure the minimum number of matches are present. */
7340
7341      for (i = 1; i <= min; i++)
7342        {
7343        if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7344        GETCHARINC(c, eptr);
7345        if (!match_xclass(c, data)) RRETURN(MATCH_NOMATCH);
7346        }
7347
7348      /* If max == min we can continue with the main loop without the
7349      need to recurse. */
7350
7351      if (min == max) continue;
7352
7353      /* If minimizing, keep testing the rest of the expression and advancing
7354      the pointer while it matches the class. */
7355
7356      if (minimize)
7357        {
7358        for (fi = min;; fi++)
7359          {
              /* Try the remainder of the pattern at the current position before
              taking another character. Anything other than MATCH_NOMATCH is
              passed straight back to the caller. */
7360          RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7361          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
              /* The limits are tested before the subject is read. */
7362          if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7363          GETCHARINC(c, eptr);
7364          if (!match_xclass(c, data)) RRETURN(MATCH_NOMATCH);
7365          }
7366        /* Control never gets here */
7367        }
7368
7369      /* If maximizing, find the longest possible run, then work backwards. */
7370
7371      else
7372        {
            /* pp records where the optional part of the run starts; the
            backtracking loop below stops once it has tried that position. */
7373        pp = eptr;
7374        for (i = min; i < max; i++)
7375          {
              /* len is passed to GETCHARLEN, presumably to receive the byte
              length of the character (macro defined elsewhere); eptr is moved
              on only after the character has been accepted. */
7376          int len = 1;
7377          if (eptr >= md->end_subject) break;
7378          GETCHARLEN(c, eptr, len);
7379          if (!match_xclass(c, data)) break;
7380          eptr += len;
7381          }
7382        for(;;)
7383          {
7384          RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7385          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7386          if (eptr-- == pp) break;        /* Stop if tried at original pos */
              /* NOTE(review): BACKCHAR presumably steps eptr back to the first
              byte of the character it now points into. There is no semicolon
              after it here, unlike the other uses in this file, so this relies
              on the macro expansion ending its own statement - confirm against
              the macro definition. */
7387          BACKCHAR(eptr)
7388          }
7389        RRETURN(MATCH_NOMATCH);
7390        }
7391
7392      /* Control never gets here */
7393      }
7394#endif    /* End of XCLASS */
7395
7396    /* Match a single character, casefully */
7397
7398    case OP_CHAR:
        /* Caseful match of one literal character. When UTF-8 support is compiled
        in, the "else" inside the #ifdef attaches to the braced block that
        follows the #endif; without it, only that block is compiled. */
7399#ifdef SUPPORT_UTF8
7400    if (md->utf8)
7401      {
          /* Step over the opcode, then fetch the pattern character. length
          starts at 1 and is passed to GETCHARLEN, presumably to be updated to
          the character's byte length (macro defined elsewhere). */
7402      length = 1;
7403      ecode++;
7404      GETCHARLEN(fc, ecode, length);
          /* Fail if fewer than length bytes remain, otherwise compare byte by
          byte, advancing both the pattern and the subject pointers. */
7405      if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7406      while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
7407      }
7408    else
7409#endif
7410
7411    /* Non-UTF-8 mode */
7412      {
          /* One subject byte is needed; the literal is the byte after the
          opcode, and the whole item is two bytes long. */
7413      if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
7414      if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
7415      ecode += 2;
7416      }
7417    break;
7418
7419    /* Match a single character, caselessly */
7420
7421    case OP_CHARNC:
        /* Caseless match of one literal character. As for the caseful case,
        the "else" inside the #ifdef attaches to the braced non-UTF-8 block
        after the #endif. */
7422#ifdef SUPPORT_UTF8
7423    if (md->utf8)
7424      {
          /* Step over the opcode and fetch the pattern character into fc.
          length starts at 1 and is passed to GETCHARLEN, presumably to be
          updated to the character's byte length (macro defined elsewhere). */
7425      length = 1;
7426      ecode++;
7427      GETCHARLEN(fc, ecode, length);
7428
          /* The subject must hold at least as many bytes as the pattern
          character occupies. */
7429      if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7430
7431      /* If the pattern character's value is < 128, we have only one byte, and
7432      can use the fast lookup table. */
7433
7434      if (fc < 128)
7435        {
            /* Both bytes are mapped through md->lcc before being compared, and
            both pointers advance by one. */
7436        if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7437        }
7438
7439      /* Otherwise we must pick up the subject character */
7440
7441      else
7442        {
7443        int dc;
7444        GETCHARINC(dc, eptr);
7445        ecode += length;
7446
7447        /* If we have Unicode property support, we can use it to test the other
7448        case of the character, if there is one. The result of ucp_findchar() is
7449        < 0 if the char isn't found, and othercase is returned as zero if there
7450        isn't one. */
7451
7452        if (fc != dc)
7453          {
              /* Without SUPPORT_UCP the test below is compiled out, so any
              difference between fc and dc is a failure. */
7454#ifdef SUPPORT_UCP
7455          int chartype;
7456          int othercase;
7457          if (ucp_findchar(fc, &chartype, &othercase) < 0 || dc != othercase)
7458#endif
7459            RRETURN(MATCH_NOMATCH);
7460          }
7461        }
7462      }
7463    else
7464#endif   /* SUPPORT_UTF8 */
7465
7466    /* Non-UTF-8 mode */
7467      {
          /* One subject byte is needed; the pattern byte after the opcode and
          the subject byte are compared after mapping both through md->lcc. The
          item is two bytes long. */
7468      if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
7469      if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7470      ecode += 2;
7471      }
7472    break;
7473
7474    /* Match a single character repeatedly; different opcodes share code. */
7475
7476    case OP_EXACT:
        /* Repeated single literal character. OP_EXACT, OP_UPTO/OP_MINUPTO and
        the STAR/PLUS/QUERY family only differ in how they set min, max and
        minimize; all of them then share the code at REPEATCHAR. */
7477    min = max = GET2(ecode, 1);
7478    ecode += 3;
7479    goto REPEATCHAR;
7480
7481    case OP_UPTO:
7482    case OP_MINUPTO:
7483    min = 0;
7484    max = GET2(ecode, 1);
7485    minimize = *ecode == OP_MINUPTO;
7486    ecode += 3;
7487    goto REPEATCHAR;
7488
7489    case OP_STAR:
7490    case OP_MINSTAR:
7491    case OP_PLUS:
7492    case OP_MINPLUS:
7493    case OP_QUERY:
7494    case OP_MINQUERY:
        /* c is the opcode's offset from OP_STAR; odd offsets are the minimizing
        variants and the same offset indexes the rep_min/rep_max tables. */
7495    c = *ecode++ - OP_STAR;
7496    minimize = (c & 1) != 0;
7497    min = rep_min[c];                 /* Pick up values from tables; */
7498    max = rep_max[c];                 /* zero for max => infinity */
7499    if (max == 0) max = INT_MAX;
7500
7501    /* Common code for all repeated single-character matches. We can give
7502    up quickly if there are fewer than the minimum number of characters left in
7503    the subject. */
7504
7505    REPEATCHAR:
7506#ifdef SUPPORT_UTF8
7507    if (md->utf8)
7508      {
          /* charptr remembers the first byte of the pattern character so that
          the multibyte code below can compare whole byte sequences. */
7509      length = 1;
7510      charptr = ecode;
7511      GETCHARLEN(fc, ecode, length);
7512      if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7513      ecode += length;
7514
7515      /* Handle multibyte character matching specially here. There is
7516      support for caseless matching if UCP support is present. */
7517
7518      if (length > 1)
7519        {
            /* oclength stays zero unless an other-case form of the character is
            found; occhars then holds its UTF-8 bytes, whose count need not
            equal length. */
7520        int oclength = 0;
7521        uschar occhars[8];
7522
7523#ifdef SUPPORT_UCP
7524        int othercase;
7525        int chartype;
7526        if ((ims & PCRE_CASELESS) != 0 &&
7527             ucp_findchar(fc, &chartype, &othercase) >= 0 &&
7528             othercase > 0)
7529          oclength = ord2utf8(othercase, occhars);
7530#endif  /* SUPPORT_UCP */
7531
            /* Every memcmp() below is preceded by a check that the subject
            still holds that many bytes, so that the comparison never reads
            beyond md->end_subject, whichever of the two encodings is being
            tried. */

7532        for (i = 1; i <= min; i++)
7533          {
7534          if (length <= md->end_subject - eptr &&
                  memcmp(eptr, charptr, length) == 0) eptr += length;
7535          /* Need braces because of following else */
7536          else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
7537          else
7538            {
7539            if (oclength > md->end_subject - eptr ||
                    memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
7540            eptr += oclength;
7541            }
7542          }
7543
7544        if (min == max) continue;
7545
7546        if (minimize)
7547          {
7548          for (fi = min;; fi++)
7549            {
                /* Try the rest of the pattern first; take one more character
                only if that fails and the limits allow it. */
7550            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7551            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7552            if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7553            if (length <= md->end_subject - eptr &&
                    memcmp(eptr, charptr, length) == 0) eptr += length;
7554            /* Need braces because of following else */
7555            else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
7556            else
7557              {
7558              if (oclength > md->end_subject - eptr ||
                      memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
7559              eptr += oclength;
7560              }
7561            }
7562          /* Control never gets here */
7563          }
7564        else
7565          {
              /* Maximizing: pp marks the start of the optional part of the
              run. Take as many characters as possible, in either case. */
7566          pp = eptr;
7567          for (i = min; i < max; i++)
7568            {
7570            if (length <= md->end_subject - eptr &&
                    memcmp(eptr, charptr, length) == 0) eptr += length;
7571            else if (oclength == 0) break;
7572            else
7573              {
7574              if (oclength > md->end_subject - eptr ||
                      memcmp(eptr, occhars, oclength) != 0) break;
7575              eptr += oclength;
7576              }
7577            }

              /* Work backwards one character at a time. The run may be a mix
              of the two encodings, which can differ in byte length, so the
              previous character is found from the subject itself (step back one
              byte, then BACKCHAR, as the class code in this file does) instead
              of subtracting a fixed length. The loop stops after trying at pp,
              so eptr is never moved in front of it. */

7578          for (;;)
7579           {
7580           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7581           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
               if (eptr == pp) break;
7582           eptr--;
               BACKCHAR(eptr);
7583           }
7584          RRETURN(MATCH_NOMATCH);
7585          }
7586        /* Control never gets here */
7587        }
7588
7589      /* If the length of a UTF-8 character is 1, we fall through here, and
7590      obey the code as for non-UTF-8 characters below, though in this case the
7591      value of fc will always be < 128. */
7592      }
7593    else
7594#endif  /* SUPPORT_UTF8 */
7595
7596    /* When not in UTF-8 mode, load a single-byte character. */
7597      {
7598      if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7599      fc = *ecode++;
7600      }
7601
7602    /* The value of fc at this point is always less than 256, though we may or
7603    may not be in UTF-8 mode. The code is duplicated for the caseless and
7604    caseful cases, for speed, since matching characters is likely to be quite
7605    common. First, ensure the minimum number of matches are present. If min =
7606    max, continue at the same level without recursing. Otherwise, if
7607    minimizing, keep trying the rest of the expression and advancing one
7608    matching character if failing, up to the maximum. Alternatively, if
7609    maximizing, find the maximum number of characters and work backwards. */
7610
7611    DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
7612      max, eptr));
7613
7614    if ((ims & PCRE_CASELESS) != 0)
7615      {
          /* fc is mapped through md->lcc once, so each subject byte only needs
          mapping before the comparison. The earlier length tests guarantee that
          at least min bytes are available for the first loop. */
7616      fc = md->lcc[fc];
7617      for (i = 1; i <= min; i++)
7618        if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7619      if (min == max) continue;
7620      if (minimize)
7621        {
7622        for (fi = min;; fi++)
7623          {
7624          RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7625          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7626          if (fi >= max || eptr >= md->end_subject ||
7627              fc != md->lcc[*eptr++])
7628            RRETURN(MATCH_NOMATCH);
7629          }
7630        /* Control never gets here */
7631        }
7632      else
7633        {
7634        pp = eptr;
7635        for (i = min; i < max; i++)
7636          {
7637          if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
7638          eptr++;
7639          }
7640        while (eptr >= pp)
7641          {
7642          RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7643          eptr--;
7644          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7645          }
7646        RRETURN(MATCH_NOMATCH);
7647        }
7648      /* Control never gets here */
7649      }
7650
7651    /* Caseful comparisons (includes all multi-byte characters) */
7652
7653    else
7654      {
7655      for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
7656      if (min == max) continue;
7657      if (minimize)
7658        {
7659        for (fi = min;; fi++)
7660          {
7661          RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7662          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7663          if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
7664            RRETURN(MATCH_NOMATCH);
7665          }
7666        /* Control never gets here */
7667        }
7668      else
7669        {
7670        pp = eptr;
7671        for (i = min; i < max; i++)
7672          {
7673          if (eptr >= md->end_subject || fc != *eptr) break;
7674          eptr++;
7675          }
7676        while (eptr >= pp)
7677          {
7678          RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7679          eptr--;
7680          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7681          }
7682        RRETURN(MATCH_NOMATCH);
7683        }
7684      }
7685    /* Control never gets here */
7686
7687    /* Match a negated single one-byte character. The character we are
7688    checking can be multibyte. */
7689
7690    case OP_NOT:
        /* Succeeds when the next subject character differs from the single
        pattern byte that follows the opcode. There must be a character to
        test, so the end of the subject is a failure. */
7691    if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7692    ecode++;
7693    GETCHARINCTEST(c, eptr);
7694    if ((ims & PCRE_CASELESS) != 0)
7695      {
          /* With UTF-8 support compiled in, the subject character is mapped
          through md->lcc only when it is below 256; without it the guard is
          compiled out and the mapping is unconditional. The pattern byte is
          always mapped. */
7696#ifdef SUPPORT_UTF8
7697      if (c < 256)
7698#endif
7699      c = md->lcc[c];
7700      if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
7701      }
7702    else
7703      {
7704      if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
7705      }
7706    break;
7707
7708    /* Match a negated single one-byte character repeatedly. This is almost a
7709    repeat of the code for a repeated single character, but I haven't found a
7710    nice way of commoning these up that doesn't require a test of the
7711    positive/negative option for each character match. Maybe that wouldn't add
7712    very much to the time taken, but character matching *is* what this is all
7713    about... */
7714
7715    case OP_NOTEXACT:
        /* Repeated negated single character. The three opcode groups only
        differ in how they set min, max and minimize; all of them then share
        the code at REPEATNOTCHAR. */
7716    min = max = GET2(ecode, 1);
7717    ecode += 3;
7718    goto REPEATNOTCHAR;
7719
7720    case OP_NOTUPTO:
7721    case OP_NOTMINUPTO:
7722    min = 0;
7723    max = GET2(ecode, 1);
7724    minimize = *ecode == OP_NOTMINUPTO;
7725    ecode += 3;
7726    goto REPEATNOTCHAR;
7727
7728    case OP_NOTSTAR:
7729    case OP_NOTMINSTAR:
7730    case OP_NOTPLUS:
7731    case OP_NOTMINPLUS:
7732    case OP_NOTQUERY:
7733    case OP_NOTMINQUERY:
        /* c is the opcode's offset from OP_NOTSTAR; odd offsets are the
        minimizing variants and the same offset indexes rep_min/rep_max. */
7734    c = *ecode++ - OP_NOTSTAR;
7735    minimize = (c & 1) != 0;
7736    min = rep_min[c];                 /* Pick up values from tables; */
7737    max = rep_max[c];                 /* zero for max => infinity */
7738    if (max == 0) max = INT_MAX;
7739
7740    /* Common code for all repeated single-byte matches. We can give up quickly
7741    if there are fewer than the minimum number of bytes left in the
7742    subject. */
7743
7744    REPEATNOTCHAR:
7745    if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7746    fc = *ecode++;
7747
7748    /* The code is duplicated for the caseless and caseful cases, for speed,
7749    since matching characters is likely to be quite common. First, ensure the
7750    minimum number of matches are present. If min = max, continue at the same
7751    level without recursing. Otherwise, if minimizing, keep trying the rest of
7752    the expression and advancing one matching character if failing, up to the
7753    maximum. Alternatively, if maximizing, find the maximum number of
7754    characters and work backwards. */
7755
7756    DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
7757      max, eptr));
7758
7759    if ((ims & PCRE_CASELESS) != 0)
7760      {
7761      fc = md->lcc[fc];
7762
7763#ifdef SUPPORT_UTF8
7764      /* UTF-8 mode */
7765      if (md->utf8)
7766        {
7767        register int d;
7768        for (i = 1; i <= min; i++)
7769          {
              /* The test at REPEATNOTCHAR counts bytes, but each iteration here
              consumes a whole character, which may be several bytes, so the
              end of the subject has to be checked before every read. */
              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7770          GETCHARINC(d, eptr);
7771          if (d < 256) d = md->lcc[d];
7772          if (fc == d) RRETURN(MATCH_NOMATCH);
7773          }
7774        }
7775      else
7776#endif
7777
7778      /* Not UTF-8 mode */
7779        {
7780        for (i = 1; i <= min; i++)
7781          if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7782        }
7783
7784      if (min == max) continue;
7785
7786      if (minimize)
7787        {
7788#ifdef SUPPORT_UTF8
7789        /* UTF-8 mode */
7790        if (md->utf8)
7791          {
7792          register int d;
7793          for (fi = min;; fi++)
7794            {
7795            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7796            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
                /* Test the limits BEFORE fetching, as the non-UTF-8 loop below
                does. Testing after the fetch would read beyond the subject
                when eptr is already at the end, and would reject a repeat
                whose extra character is the last one in the subject. */
7797            if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7798            GETCHARINC(d, eptr);
7799            if (d < 256) d = md->lcc[d];
7800            if (fc == d) RRETURN(MATCH_NOMATCH);
7801            }
7802          }
7803        else
7804#endif
7805        /* Not UTF-8 mode */
7806          {
7807          for (fi = min;; fi++)
7808            {
7809            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7810            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7811            if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
7812              RRETURN(MATCH_NOMATCH);
7813            }
7814          }
7815        /* Control never gets here */
7816        }
7817
7818      /* Maximize case */
7819
7820      else
7821        {
            /* pp marks the start of the optional part of the run; the
            backtracking loops stop once they have tried that position. */
7822        pp = eptr;
7823
7824#ifdef SUPPORT_UTF8
7825        /* UTF-8 mode */
7826        if (md->utf8)
7827          {
7828          register int d;
7829          for (i = min; i < max; i++)
7830            {
7831            int len = 1;
7832            if (eptr >= md->end_subject) break;
7833            GETCHARLEN(d, eptr, len);
7834            if (d < 256) d = md->lcc[d];
7835            if (fc == d) break;
7836            eptr += len;
7837            }
7838          for(;;)
7839            {
7840            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7841            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7842            if (eptr-- == pp) break;        /* Stop if tried at original pos */
7843            BACKCHAR(eptr);
7844            }
7845          }
7846        else
7847#endif
7848        /* Not UTF-8 mode */
7849          {
7850          for (i = min; i < max; i++)
7851            {
7852            if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
7853            eptr++;
7854            }
7855          while (eptr >= pp)
7856            {
7857            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7858            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7859            eptr--;
7860            }
7861          }
7862
7863        RRETURN(MATCH_NOMATCH);
7864        }
7865      /* Control never gets here */
7866      }
7867
7868    /* Caseful comparisons */
7869
7870    else
7871      {
7872#ifdef SUPPORT_UTF8
7873      /* UTF-8 mode */
7874      if (md->utf8)
7875        {
7876        register int d;
7877        for (i = 1; i <= min; i++)
7878          {
              /* As in the caseless loop: the earlier test counted bytes, not
              characters, so check for the end before each read. */
              if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7879          GETCHARINC(d, eptr);
7880          if (fc == d) RRETURN(MATCH_NOMATCH);
7881          }
7882        }
7883      else
7884#endif
7885      /* Not UTF-8 mode */
7886        {
7887        for (i = 1; i <= min; i++)
7888          if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
7889        }
7890
7891      if (min == max) continue;
7892
7893      if (minimize)
7894        {
7895#ifdef SUPPORT_UTF8
7896        /* UTF-8 mode */
7897        if (md->utf8)
7898          {
7899          register int d;
7900          for (fi = min;; fi++)
7901            {
7902            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7903            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
                /* Limits are tested before the fetch, for the same reasons as
                in the caseless minimizing loop. */
7904            if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7905            GETCHARINC(d, eptr);
7906            if (fc == d) RRETURN(MATCH_NOMATCH);
7907            }
7908          }
7909        else
7910#endif
7911        /* Not UTF-8 mode */
7912          {
7913          for (fi = min;; fi++)
7914            {
7915            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7916            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7917            if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
7918              RRETURN(MATCH_NOMATCH);
7919            }
7920          }
7921        /* Control never gets here */
7922        }
7923
7924      /* Maximize case */
7925
7926      else
7927        {
7928        pp = eptr;
7929
7930#ifdef SUPPORT_UTF8
7931        /* UTF-8 mode */
7932        if (md->utf8)
7933          {
7934          register int d;
7935          for (i = min; i < max; i++)
7936            {
7937            int len = 1;
7938            if (eptr >= md->end_subject) break;
7939            GETCHARLEN(d, eptr, len);
7940            if (fc == d) break;
7941            eptr += len;
7942            }
7943          for(;;)
7944            {
7945            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7946            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7947            if (eptr-- == pp) break;        /* Stop if tried at original pos */
7948            BACKCHAR(eptr);
7949            }
7950          }
7951        else
7952#endif
7953        /* Not UTF-8 mode */
7954          {
7955          for (i = min; i < max; i++)
7956            {
7957            if (eptr >= md->end_subject || fc == *eptr) break;
7958            eptr++;
7959            }
7960          while (eptr >= pp)
7961            {
7962            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7963            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7964            eptr--;
7965            }
7966          }
7967
7968        RRETURN(MATCH_NOMATCH);
7969        }
7970      }
7971    /* Control never gets here */
7972
7973    /* Match a single character type repeatedly; several different opcodes
7974    share code. This is very similar to the code for single characters, but we
7975    repeat it in the interests of efficiency. */
7976
7977    case OP_TYPEEXACT:
7978    min = max = GET2(ecode, 1);
7979    minimize = TRUE;
7980    ecode += 3;
7981    goto REPEATTYPE;
7982
7983    case OP_TYPEUPTO:
7984    case OP_TYPEMINUPTO:
7985    min = 0;
7986    max = GET2(ecode, 1);
7987    minimize = *ecode == OP_TYPEMINUPTO;
7988    ecode += 3;
7989    goto REPEATTYPE;
7990
7991    case OP_TYPESTAR:
7992    case OP_TYPEMINSTAR:
7993    case OP_TYPEPLUS:
7994    case OP_TYPEMINPLUS:
7995    case OP_TYPEQUERY:
7996    case OP_TYPEMINQUERY:
7997    c = *ecode++ - OP_TYPESTAR;
7998    minimize = (c & 1) != 0;
7999    min = rep_min[c];                 /* Pick up values from tables; */
8000    max = rep_max[c];                 /* zero for max => infinity */
8001    if (max == 0) max = INT_MAX;
8002
8003    /* Common code for all repeated single character type matches. Note that
8004    in UTF-8 mode, '.' matches a character of any length, but for the other
8005    character types, the valid characters are all one-byte long. */
8006
8007    REPEATTYPE:
8008    ctype = *ecode++;      /* Code for the character type */
8009
8010#ifdef SUPPORT_UCP
8011    if (ctype == OP_PROP || ctype == OP_NOTPROP)
8012      {
8013      prop_fail_result = ctype == OP_NOTPROP;
8014      prop_type = *ecode++;
8015      if (prop_type >= 128)
8016        {
8017        prop_test_against = prop_type - 128;
8018        prop_test_variable = &prop_category;
8019        }
8020      else
8021        {
8022        prop_test_against = prop_type;
8023        prop_test_variable = &prop_chartype;
8024        }
8025      }
8026    else prop_type = -1;
8027#endif
8028
8029    /* First, ensure the minimum number of matches are present. Use inline
8030    code for maximizing the speed, and do the type test once at the start
8031    (i.e. keep it out of the loop). Also we can test that there are at least
8032    the minimum number of bytes before we start. This isn't as effective in
8033    UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
8034    is tidier. Also separate the UCP code, which can be the same for both UTF-8
8035    and single-bytes. */
8036
8037    if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
8038    if (min > 0)
8039      {
8040#ifdef SUPPORT_UCP
8041      if (prop_type > 0)
8042        {
8043        for (i = 1; i <= min; i++)
8044          {
8045          GETCHARINC(c, eptr);
8046          prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8047          if ((*prop_test_variable == prop_test_against) == prop_fail_result)
8048            RRETURN(MATCH_NOMATCH);
8049          }
8050        }
8051
8052      /* Match extended Unicode sequences. We will get here only if the
8053      support is in the binary; otherwise a compile-time error occurs. */
8054
8055      else if (ctype == OP_EXTUNI)
8056        {
8057        for (i = 1; i <= min; i++)
8058          {
8059          GETCHARINCTEST(c, eptr);
8060          prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8061          if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
8062          while (eptr < md->end_subject)
8063            {
8064            int len = 1;
8065            if (!md->utf8) c = *eptr; else
8066              {
8067              GETCHARLEN(c, eptr, len);
8068              }
8069            prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8070            if (prop_category != ucp_M) break;
8071            eptr += len;
8072            }
8073          }
8074        }
8075
8076      else
8077#endif     /* SUPPORT_UCP */
8078
8079/* Handle all other cases when the coding is UTF-8 */
8080
8081#ifdef SUPPORT_UTF8
8082      if (md->utf8) switch(ctype)
8083        {
            /* Consume the minimum number of characters of the given type when
            the subject is UTF-8. The tests in the byte-oriented cases below
            only accept or reject bytes below 128; any character with a lead
            byte of 128 or more is a non-digit, non-space, non-word character
            as far as these cases are concerned. */

8084        case OP_ANY:
8085        for (i = 1; i <= min; i++)
8086          {
8087          if (eptr >= md->end_subject ||
8088             (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
8089            RRETURN(MATCH_NOMATCH);
8090          while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8091          }
8092        break;
8093
8094        case OP_ANYBYTE:
8095        eptr += min;
8096        break;
8097
8098        case OP_NOT_DIGIT:
8099        for (i = 1; i <= min; i++)
8100          {
8101          if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8102          GETCHARINC(c, eptr);
8103          if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
8104            RRETURN(MATCH_NOMATCH);
8105          }
8106        break;
8107
8108        case OP_DIGIT:
8109        for (i = 1; i <= min; i++)
8110          {
8111          if (eptr >= md->end_subject ||
8112             *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
8113            RRETURN(MATCH_NOMATCH);
8114          /* No need to skip more bytes - we know it's a 1-byte character */
8115          }
8116        break;
8117
8118        case OP_NOT_WHITESPACE:
8119        for (i = 1; i <= min; i++)
8120          {
              /* The byte is only inspected in the test; the advance is done
              unconditionally by the ++eptr in the loop below. If the increment
              were part of the short-circuited test it would be skipped for a
              lead byte >= 128, and a multibyte character would be counted
              without being consumed. */
8121          if (eptr >= md->end_subject ||
8122             (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
8123            RRETURN(MATCH_NOMATCH);
8124          while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
8125          }
8126        break;
8127
8128        case OP_WHITESPACE:
8129        for (i = 1; i <= min; i++)
8130          {
8131          if (eptr >= md->end_subject ||
8132             *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
8133            RRETURN(MATCH_NOMATCH);
8134          /* No need to skip more bytes - we know it's a 1-byte character */
8135          }
8136        break;
8137
8138        case OP_NOT_WORDCHAR:
8139        for (i = 1; i <= min; i++)
8140          {
              /* Same structure as OP_NOT_WHITESPACE: test without advancing,
              then step over the first byte and any continuation bytes. */
8141          if (eptr >= md->end_subject ||
8142             (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
8143            RRETURN(MATCH_NOMATCH);
8144          while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
8145          }
8146        break;
8147
8148        case OP_WORDCHAR:
8149        for (i = 1; i <= min; i++)
8150          {
8151          if (eptr >= md->end_subject ||
8152             *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
8153            RRETURN(MATCH_NOMATCH);
8154          /* No need to skip more bytes - we know it's a 1-byte character */
8155          }
8156        break;
8157
8158        default:
8159        RRETURN(PCRE_ERROR_INTERNAL);
8160        }  /* End switch(ctype) */
8161
8162      else
8163#endif     /* SUPPORT_UTF8 */
8164
8165      /* Code for the non-UTF-8 case for minimum matching of operators other
8166      than OP_PROP and OP_NOTPROP. */
8167
8168      switch(ctype)
8169        {
8170        case OP_ANY:
8171        if ((ims & PCRE_DOTALL) == 0)
8172          {
8173          for (i = 1; i <= min; i++)
8174            if (*eptr++ == NEWLINE) RRETURN(MATCH_NOMATCH);
8175          }
8176        else eptr += min;
8177        break;
8178
8179        case OP_ANYBYTE:
8180        eptr += min;
8181        break;
8182
8183        case OP_NOT_DIGIT:
8184        for (i = 1; i <= min; i++)
8185          if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
8186        break;
8187
8188        case OP_DIGIT:
8189        for (i = 1; i <= min; i++)
8190          if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
8191        break;
8192
8193        case OP_NOT_WHITESPACE:
8194        for (i = 1; i <= min; i++)
8195          if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
8196        break;
8197
8198        case OP_WHITESPACE:
8199        for (i = 1; i <= min; i++)
8200          if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
8201        break;
8202
8203        case OP_NOT_WORDCHAR:
8204        for (i = 1; i <= min; i++)
8205          if ((md->ctypes[*eptr++] & ctype_word) != 0)
8206            RRETURN(MATCH_NOMATCH);
8207        break;
8208
8209        case OP_WORDCHAR:
8210        for (i = 1; i <= min; i++)
8211          if ((md->ctypes[*eptr++] & ctype_word) == 0)
8212            RRETURN(MATCH_NOMATCH);
8213        break;
8214
8215        default:
8216        RRETURN(PCRE_ERROR_INTERNAL);
8217        }
8218      }
8219
8220    /* If min = max, continue at the same level without recursing */
8221
8222    if (min == max) continue;
8223
8224    /* If minimizing, we have to test the rest of the pattern before each
8225    subsequent match. Again, separate the UTF-8 case for speed, and also
8226    separate the UCP cases. */
8227
8228    if (minimize)
8229      {
8230#ifdef SUPPORT_UCP
8231      if (prop_type > 0)
8232        {
8233        for (fi = min;; fi++)
8234          {
8235          RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8236          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8237          if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8238          GETCHARINC(c, eptr);
8239          prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8240          if ((*prop_test_variable == prop_test_against) == prop_fail_result)
8241            RRETURN(MATCH_NOMATCH);
8242          }
8243        }
8244
8245      /* Match extended Unicode sequences. We will get here only if the
8246      support is in the binary; otherwise a compile-time error occurs. */
8247
8248      else if (ctype == OP_EXTUNI)
8249        {
8250        for (fi = min;; fi++)
8251          {
8252          RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8253          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8254          if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8255          GETCHARINCTEST(c, eptr);
8256          prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8257          if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
8258          while (eptr < md->end_subject)
8259            {
8260            int len = 1;
8261            if (!md->utf8) c = *eptr; else
8262              {
8263              GETCHARLEN(c, eptr, len);
8264              }
8265            prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8266            if (prop_category != ucp_M) break;
8267            eptr += len;
8268            }
8269          }
8270        }
8271
8272      else
8273#endif     /* SUPPORT_UCP */
8274
8275#ifdef SUPPORT_UTF8
8276      /* UTF-8 mode */
8277      if (md->utf8)
8278        {
            /* Minimizing repeat of a character type in UTF-8 mode: try the rest
            of the pattern first, and only if that fails (and the limits allow)
            take one more character and check it against ctype. */
8279        for (fi = min;; fi++)
8280          {
8281          RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8282          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8283          if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8284
8285          GETCHARINC(c, eptr);
8286          switch(ctype)
8287            {
8288            case OP_ANY:
8289            if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
8290            break;
8291
8292            case OP_ANYBYTE:
8293            break;
8294
                /* In the cases below md->ctypes is only consulted for c < 256.
                The negated types accept every character of 256 or more, and
                the positive types reject them. */

8295            case OP_NOT_DIGIT:
8296            if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
8297              RRETURN(MATCH_NOMATCH);
8298            break;
8299
8300            case OP_DIGIT:
8301            if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
8302              RRETURN(MATCH_NOMATCH);
8303            break;
8304
8305            case OP_NOT_WHITESPACE:
8306            if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
8307              RRETURN(MATCH_NOMATCH);
8308            break;
8309
8310            case OP_WHITESPACE:
8311            if  (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
8312              RRETURN(MATCH_NOMATCH);
8313            break;
8314
8315            case OP_NOT_WORDCHAR:
8316            if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
8317              RRETURN(MATCH_NOMATCH);
8318            break;
8319
8320            case OP_WORDCHAR:
                /* This must be "||", as for OP_DIGIT and OP_WHITESPACE. With
                "&&" a character below 256 could never fail the test, and one of
                256 or more would be used as an index into md->ctypes. */
8321            if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
8322              RRETURN(MATCH_NOMATCH);
8323            break;
8324
8325            default:
8326            RRETURN(PCRE_ERROR_INTERNAL);
8327            }
8328          }
8329        }
8330      else
8331#endif
8332      /* Not UTF-8 mode */
8333        {
8334        for (fi = min;; fi++)
8335          {
8336          RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8337          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8338          if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8339          c = *eptr++;
8340          switch(ctype)
8341            {
8342            case OP_ANY:
8343            if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
8344            break;
8345
8346            case OP_ANYBYTE:
8347            break;
8348
8349            case OP_NOT_DIGIT:
8350            if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
8351            break;
8352
8353            case OP_DIGIT:
8354            if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
8355            break;
8356
8357            case OP_NOT_WHITESPACE:
8358            if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
8359            break;
8360
8361            case OP_WHITESPACE:
8362            if  ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
8363            break;
8364
8365            case OP_NOT_WORDCHAR:
8366            if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
8367            break;
8368
8369            case OP_WORDCHAR:
8370            if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
8371            break;
8372
8373            default:
8374            RRETURN(PCRE_ERROR_INTERNAL);
8375            }
8376          }
8377        }
8378      /* Control never gets here */
8379      }
8380
8381    /* If maximizing it is worth using inline code for speed, doing the type
8382    test once at the start (i.e. keep it out of the loop). Again, keep the
8383    UTF-8 and UCP stuff separate. */
8384
8385    else
8386      {
8387      pp = eptr;  /* Remember where we started */
8388
8389#ifdef SUPPORT_UCP
8390      if (prop_type > 0)
8391        {
8392        for (i = min; i < max; i++)
8393          {
8394          int len = 1;
8395          if (eptr >= md->end_subject) break;
8396          GETCHARLEN(c, eptr, len);
8397          prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8398          if ((*prop_test_variable == prop_test_against) == prop_fail_result)
8399            break;
8400          eptr+= len;
8401          }
8402
8403        /* eptr is now past the end of the maximum run */
8404
8405        for(;;)
8406          {
8407          RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8408          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8409          if (eptr-- == pp) break;        /* Stop if tried at original pos */
8410          BACKCHAR(eptr);
8411          }
8412        }
8413
8414      /* Match extended Unicode sequences. We will get here only if the
8415      support is in the binary; otherwise a compile-time error occurs. */
8416
8417      else if (ctype == OP_EXTUNI)
8418        {
8419        for (i = min; i < max; i++)
8420          {
8421          if (eptr >= md->end_subject) break;
8422          GETCHARINCTEST(c, eptr);
8423          prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8424          if (prop_category == ucp_M) break;
8425          while (eptr < md->end_subject)
8426            {
8427            int len = 1;
8428            if (!md->utf8) c = *eptr; else
8429              {
8430              GETCHARLEN(c, eptr, len);
8431              }
8432            prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8433            if (prop_category != ucp_M) break;
8434            eptr += len;
8435            }
8436          }
8437
8438        /* eptr is now past the end of the maximum run */
8439
8440        for(;;)
8441          {
8442          RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8443          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8444          if (eptr-- == pp) break;        /* Stop if tried at original pos */
8445          for (;;)                        /* Move back over one extended */
8446            {
8447            int len = 1;
8448            BACKCHAR(eptr);
8449            if (!md->utf8) c = *eptr; else
8450              {
8451              GETCHARLEN(c, eptr, len);
8452              }
8453            prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8454            if (prop_category != ucp_M) break;
8455            eptr--;
8456            }
8457          }
8458        }
8459
8460      else
8461#endif   /* SUPPORT_UCP */
8462
8463#ifdef SUPPORT_UTF8
8464      /* UTF-8 mode */
8465
8466      if (md->utf8)
8467        {
8468        switch(ctype)
8469          {
8470          case OP_ANY:
8471
8472          /* Special code is required for UTF8, but when the maximum is unlimited
8473          we don't need it, so we repeat the non-UTF8 code. This is probably
8474          worth it, because .* is quite a common idiom. */
8475
8476          if (max < INT_MAX)
8477            {
8478            if ((ims & PCRE_DOTALL) == 0)
8479              {
8480              for (i = min; i < max; i++)
8481                {
8482                if (eptr >= md->end_subject || *eptr == NEWLINE) break;
8483                eptr++;
8484                while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8485                }
8486              }
8487            else
8488              {
8489              for (i = min; i < max; i++)
8490                {
8491                eptr++;
8492                while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8493                }
8494              }
8495            }
8496
8497          /* Handle unlimited UTF-8 repeat */
8498
8499          else
8500            {
8501            if ((ims & PCRE_DOTALL) == 0)
8502              {
8503              for (i = min; i < max; i++)
8504                {
8505                if (eptr >= md->end_subject || *eptr == NEWLINE) break;
8506                eptr++;
8507                }
8508              break;
8509              }
8510            else
8511              {
8512              c = max - min;
8513              if (c > md->end_subject - eptr) c = md->end_subject - eptr;
8514              eptr += c;
8515              }
8516            }
8517          break;
8518
8519          /* The byte case is the same as non-UTF8 */
8520
8521          case OP_ANYBYTE:
8522          c = max - min;
8523          if (c > md->end_subject - eptr) c = md->end_subject - eptr;
8524          eptr += c;
8525          break;
8526
8527          case OP_NOT_DIGIT:
8528          for (i = min; i < max; i++)
8529            {
8530            int len = 1;
8531            if (eptr >= md->end_subject) break;
8532            GETCHARLEN(c, eptr, len);
8533            if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
8534            eptr+= len;
8535            }
8536          break;
8537
8538          case OP_DIGIT:
8539          for (i = min; i < max; i++)
8540            {
8541            int len = 1;
8542            if (eptr >= md->end_subject) break;
8543            GETCHARLEN(c, eptr, len);
8544            if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
8545            eptr+= len;
8546            }
8547          break;
8548
8549          case OP_NOT_WHITESPACE:
8550          for (i = min; i < max; i++)
8551            {
8552            int len = 1;
8553            if (eptr >= md->end_subject) break;
8554            GETCHARLEN(c, eptr, len);
8555            if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
8556            eptr+= len;
8557            }
8558          break;
8559
8560          case OP_WHITESPACE:
8561          for (i = min; i < max; i++)
8562            {
8563            int len = 1;
8564            if (eptr >= md->end_subject) break;
8565            GETCHARLEN(c, eptr, len);
8566            if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
8567            eptr+= len;
8568            }
8569          break;
8570
8571          case OP_NOT_WORDCHAR:
8572          for (i = min; i < max; i++)
8573            {
8574            int len = 1;
8575            if (eptr >= md->end_subject) break;
8576            GETCHARLEN(c, eptr, len);
8577            if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
8578            eptr+= len;
8579            }
8580          break;
8581
8582          case OP_WORDCHAR:
8583          for (i = min; i < max; i++)
8584            {
8585            int len = 1;
8586            if (eptr >= md->end_subject) break;
8587            GETCHARLEN(c, eptr, len);
8588            if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
8589            eptr+= len;
8590            }
8591          break;
8592
8593          default:
8594          RRETURN(PCRE_ERROR_INTERNAL);
8595          }
8596
8597        /* eptr is now past the end of the maximum run */
8598
8599        for(;;)
8600          {
8601          RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8602          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8603          if (eptr-- == pp) break;        /* Stop if tried at original pos */
8604          BACKCHAR(eptr);
8605          }
8606        }
8607      else
8608#endif
8609
8610      /* Not UTF-8 mode */
8611        {
8612        switch(ctype)
8613          {
8614          case OP_ANY:
8615          if ((ims & PCRE_DOTALL) == 0)
8616            {
8617            for (i = min; i < max; i++)
8618              {
8619              if (eptr >= md->end_subject || *eptr == NEWLINE) break;
8620              eptr++;
8621              }
8622            break;
8623            }
8624          /* For DOTALL case, fall through and treat as \C */
8625
8626          case OP_ANYBYTE:
8627          c = max - min;
8628          if (c > md->end_subject - eptr) c = md->end_subject - eptr;
8629          eptr += c;
8630          break;
8631
8632          case OP_NOT_DIGIT:
8633          for (i = min; i < max; i++)
8634            {
8635            if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
8636              break;
8637            eptr++;
8638            }
8639          break;
8640
8641          case OP_DIGIT:
8642          for (i = min; i < max; i++)
8643            {
8644            if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
8645              break;
8646            eptr++;
8647            }
8648          break;
8649
8650          case OP_NOT_WHITESPACE:
8651          for (i = min; i < max; i++)
8652            {
8653            if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
8654              break;
8655            eptr++;
8656            }
8657          break;
8658
8659          case OP_WHITESPACE:
8660          for (i = min; i < max; i++)
8661            {
8662            if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
8663              break;
8664            eptr++;
8665            }
8666          break;
8667
8668          case OP_NOT_WORDCHAR:
8669          for (i = min; i < max; i++)
8670            {
8671            if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
8672              break;
8673            eptr++;
8674            }
8675          break;
8676
8677          case OP_WORDCHAR:
8678          for (i = min; i < max; i++)
8679            {
8680            if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
8681              break;
8682            eptr++;
8683            }
8684          break;
8685
8686          default:
8687          RRETURN(PCRE_ERROR_INTERNAL);
8688          }
8689
8690        /* eptr is now past the end of the maximum run */
8691
8692        while (eptr >= pp)
8693          {
8694          RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8695          eptr--;
8696          if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8697          }
8698        }
8699
8700      /* Get here if we can't make it match with any permitted repetitions */
8701
8702      RRETURN(MATCH_NOMATCH);
8703      }
8704    /* Control never gets here */
8705
8706    /* There's been some horrible disaster. Since all codes > OP_BRA are
8707    for capturing brackets, and there shouldn't be any gaps between 0 and
8708    OP_BRA, arrival here can only mean there is something seriously wrong
8709    in the code above or the OP_xxx definitions. */
8710
8711    default:
8712    DPRINTF(("Unknown opcode %d\n", *ecode));
8713    RRETURN(PCRE_ERROR_UNKNOWN_NODE);
8714    }
8715
8716  /* Do not stick any code in here without much thought; it is assumed
8717  that "continue" in the code above comes out to here to repeat the main
8718  loop. */
8719
8720  }             /* End of main loop */
8721/* Control never reaches here */
8722}
8723
8724
8725/***************************************************************************
8726****************************************************************************
8727                   RECURSION IN THE match() FUNCTION
8728
8729Undefine all the macros that were defined above to handle this. */
8730
#ifdef NO_RECURSE

/* NOTE(review): the names below are grouped with blank lines; presumably the
grouping mirrors the order in which the corresponding macros were #defined for
the NO_RECURSE build earlier in the file - confirm against those definitions.
Of the first group, eptr, ecode, offset_top, ims and eptrb are the names that
are passed to RMATCH() inside match(). */

#undef eptr
#undef ecode
#undef offset_top
#undef ims
#undef eptrb
#undef flags

#undef callpat
#undef charptr
#undef data
#undef next
#undef pp
#undef prev
#undef saved_eptr

#undef new_recursive

#undef cur_is_word
#undef condition
#undef minimize
#undef prev_is_word

#undef original_ims

#undef ctype
#undef length
#undef max
#undef min
#undef number
#undef offset
#undef op
#undef save_capture_last
#undef save_offset1
#undef save_offset2
#undef save_offset3
#undef stacksave

#undef newptrb

#endif  /* NO_RECURSE */

/* These two are defined as macros in both cases (that is, whether or not
NO_RECURSE is defined), so they are undefined unconditionally. */

#undef fc
#undef fi
8777
8778/***************************************************************************
8779***************************************************************************/
8780
8781
8782
8783/*************************************************
8784*         Execute a Regular Expression           *
8785*************************************************/
8786
8787/* This function applies a compiled re to a subject string and picks out
8788portions of the string if it matches. Two elements in the vector are set for
8789each substring: the offsets to the start and end of the substring.
8790
8791Arguments:
8792  argument_re     points to the compiled expression
8793  extra_data      points to extra data or is NULL
8794  subject         points to the subject string
8795  length          length of subject string (may contain binary zeros)
8796  start_offset    where to start in the subject string
8797  options         option bits
8798  offsets         points to a vector of ints to be filled in with offsets
8799  offsetcount     the number of elements in the vector
8800
8801Returns:          > 0 => success; value is the number of elements filled in
8802                  = 0 => success, but offsets is not big enough
8803                   -1 => failed to match
8804                 < -1 => some kind of unexpected problem
8805*/
8806
8807EXPORT int
8808pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
8809  const char *subject, int length, int start_offset, int options, int *offsets,
8810  int offsetcount)
8811{
8812int rc, resetcount, ocount;
8813int first_byte = -1;
8814int req_byte = -1;
8815int req_byte2 = -1;
8816unsigned long int ims = 0;
8817BOOL using_temporary_offsets = FALSE;
8818BOOL anchored;
8819BOOL startline;
8820BOOL first_byte_caseless = FALSE;
8821BOOL req_byte_caseless = FALSE;
8822match_data match_block;
8823const uschar *tables;
8824const uschar *start_bits = NULL;
8825const uschar *start_match = (const uschar *)subject + start_offset;
8826const uschar *end_subject;
8827const uschar *req_byte_ptr = start_match - 1;
8828
8829pcre_study_data internal_study;
8830const pcre_study_data *study;
8831
8832real_pcre internal_re;
8833const real_pcre *external_re = (const real_pcre *)argument_re;
8834const real_pcre *re = external_re;
8835
8836/* Plausibility checks */
8837
8838if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
8839if (re == NULL || subject == NULL ||
8840   (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
8841if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
8842
8843/* Fish out the optional data from the extra_data structure, first setting
8844the default values. */
8845
8846study = NULL;
8847match_block.match_limit = MATCH_LIMIT;
8848match_block.callout_data = NULL;
8849
8850/* The table pointer is always in native byte order. */
8851
8852tables = external_re->tables;
8853
8854if (extra_data != NULL)
8855  {
8856  register unsigned int flags = extra_data->flags;
8857  if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
8858    study = (const pcre_study_data *)extra_data->study_data;
8859  if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
8860    match_block.match_limit = extra_data->match_limit;
8861  if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
8862    match_block.callout_data = extra_data->callout_data;
8863  if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
8864  }
8865
8866/* If the exec call supplied NULL for tables, use the inbuilt ones. This
8867is a feature that makes it possible to save compiled regex and re-use them
8868in other programs later. */
8869
8870if (tables == NULL) tables = pcre_default_tables;
8871
8872/* Check that the first field in the block is the magic number. If it is not,
8873test for a regex that was compiled on a host of opposite endianness. If this is
8874the case, flipped values are put in internal_re and internal_study if there was
8875study data too. */
8876
8877if (re->magic_number != MAGIC_NUMBER)
8878  {
8879  re = try_flipped(re, &internal_re, study, &internal_study);
8880  if (re == NULL) return PCRE_ERROR_BADMAGIC;
8881  if (study != NULL) study = &internal_study;
8882  }
8883
8884/* Set up other data */
8885
8886anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
8887startline = (re->options & PCRE_STARTLINE) != 0;
8888
8889/* The code starts after the real_pcre block and the capture name table. */
8890
8891match_block.start_code = (const uschar *)external_re + re->name_table_offset +
8892  re->name_count * re->name_entry_size;
8893
8894match_block.start_subject = (const uschar *)subject;
8895match_block.start_offset = start_offset;
8896match_block.end_subject = match_block.start_subject + length;
8897end_subject = match_block.end_subject;
8898
8899match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
8900match_block.utf8 = (re->options & PCRE_UTF8) != 0;
8901
8902match_block.notbol = (options & PCRE_NOTBOL) != 0;
8903match_block.noteol = (options & PCRE_NOTEOL) != 0;
8904match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
8905match_block.partial = (options & PCRE_PARTIAL) != 0;
8906match_block.hitend = FALSE;
8907
8908match_block.recursive = NULL;                   /* No recursion at top level */
8909
8910match_block.lcc = tables + lcc_offset;
8911match_block.ctypes = tables + ctypes_offset;
8912
8913/* Partial matching is supported only for a restricted set of regexes at the
8914moment. */
8915
8916if (match_block.partial && (re->options & PCRE_NOPARTIAL) != 0)
8917  return PCRE_ERROR_BADPARTIAL;
8918
8919/* Check a UTF-8 string if required. Unfortunately there's no way of passing
8920back the character offset. */
8921
8922#ifdef SUPPORT_UTF8
8923if (match_block.utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
8924  {
8925  if (valid_utf8((uschar *)subject, length) >= 0)
8926    return PCRE_ERROR_BADUTF8;
8927  if (start_offset > 0 && start_offset < length)
8928    {
8929    int tb = ((uschar *)subject)[start_offset];
8930    if (tb > 127)
8931      {
8932      tb &= 0xc0;
8933      if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
8934      }
8935    }
8936  }
8937#endif
8938
8939/* The ims options can vary during the matching as a result of the presence
8940of (?ims) items in the pattern. They are kept in a local variable so that
8941restoring at the exit of a group is easy. */
8942
8943ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
8944
8945/* If the expression has got more back references than the offsets supplied can
8946hold, we get a temporary chunk of working store to use during the matching.
8947Otherwise, we can use the vector supplied, rounding down its size to a multiple
8948of 3. */
8949
8950ocount = offsetcount - (offsetcount % 3);
8951
8952if (re->top_backref > 0 && re->top_backref >= ocount/3)
8953  {
8954  ocount = re->top_backref * 3 + 3;
8955  match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
8956  if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
8957  using_temporary_offsets = TRUE;
8958  DPRINTF(("Got memory to hold back references\n"));
8959  }
8960else match_block.offset_vector = offsets;
8961
8962match_block.offset_end = ocount;
8963match_block.offset_max = (2*ocount)/3;
8964match_block.offset_overflow = FALSE;
8965match_block.capture_last = -1;
8966
8967/* Compute the minimum number of offsets that we need to reset each time. Doing
8968this makes a huge difference to execution time when there aren't many brackets
8969in the pattern. */
8970
8971resetcount = 2 + re->top_bracket * 2;
8972if (resetcount > offsetcount) resetcount = ocount;
8973
8974/* Reset the working variable associated with each extraction. These should
8975never be used unless previously set, but they get saved and restored, and so we
8976initialize them to avoid reading uninitialized locations. */
8977
8978if (match_block.offset_vector != NULL)
8979  {
8980  register int *iptr = match_block.offset_vector + ocount;
8981  register int *iend = iptr - resetcount/2 + 1;
8982  while (--iptr >= iend) *iptr = -1;
8983  }
8984
8985/* Set up the first character to match, if available. The first_byte value is
8986never set for an anchored regular expression, but the anchoring may be forced
8987at run time, so we have to test for anchoring. The first char may be unset for
8988an unanchored pattern, of course. If there's no first char and the pattern was
8989studied, there may be a bitmap of possible first characters. */
8990
8991if (!anchored)
8992  {
8993  if ((re->options & PCRE_FIRSTSET) != 0)
8994    {
8995    first_byte = re->first_byte & 255;
8996    if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
8997      first_byte = match_block.lcc[first_byte];
8998    }
8999  else
9000    if (!startline && study != NULL &&
9001      (study->options & PCRE_STUDY_MAPPED) != 0)
9002        start_bits = study->start_bits;
9003  }
9004
9005/* For anchored or unanchored matches, there may be a "last known required
9006character" set. */
9007
9008if ((re->options & PCRE_REQCHSET) != 0)
9009  {
9010  req_byte = re->req_byte & 255;
9011  req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
9012  req_byte2 = (tables + fcc_offset)[req_byte];  /* case flipped */
9013  }
9014
9015/* Loop for handling unanchored repeated matching attempts; for anchored regexs
9016the loop runs just once. */
9017
9018do
9019  {
9020  /* Reset the maximum number of extractions we might see. */
9021
9022  if (match_block.offset_vector != NULL)
9023    {
9024    register int *iptr = match_block.offset_vector;
9025    register int *iend = iptr + resetcount;
9026    while (iptr < iend) *iptr++ = -1;
9027    }
9028
9029  /* Advance to a unique first char if possible */
9030
9031  if (first_byte >= 0)
9032    {
9033    if (first_byte_caseless)
9034      while (start_match < end_subject &&
9035             match_block.lcc[*start_match] != first_byte)
9036        start_match++;
9037    else
9038      while (start_match < end_subject && *start_match != first_byte)
9039        start_match++;
9040    }
9041
9042  /* Or to just after \n for a multiline match if possible */
9043
9044  else if (startline)
9045    {
9046    if (start_match > match_block.start_subject + start_offset)
9047      {
9048      while (start_match < end_subject && start_match[-1] != NEWLINE)
9049        start_match++;
9050      }
9051    }
9052
9053  /* Or to a non-unique first char after study */
9054
9055  else if (start_bits != NULL)
9056    {
9057    while (start_match < end_subject)
9058      {
9059      register unsigned int c = *start_match;
9060      if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
9061      }
9062    }
9063
9064#ifdef DEBUG  /* Sigh. Some compilers never learn. */
9065  printf(">>>> Match against: ");
9066  pchars(start_match, end_subject - start_match, TRUE, &match_block);
9067  printf("\n");
9068#endif
9069
9070  /* If req_byte is set, we know that that character must appear in the subject
9071  for the match to succeed. If the first character is set, req_byte must be
9072  later in the subject; otherwise the test starts at the match point. This
9073  optimization can save a huge amount of backtracking in patterns with nested
9074  unlimited repeats that aren't going to match. Writing separate code for
9075  cased/caseless versions makes it go faster, as does using an autoincrement
9076  and backing off on a match.
9077
9078  HOWEVER: when the subject string is very, very long, searching to its end can
9079  take a long time, and give bad performance on quite ordinary patterns. This
9080  showed up when somebody was matching /^C/ on a 32-megabyte string... so we
9081  don't do this when the string is sufficiently long.
9082
9083  ALSO: this processing is disabled when partial matching is requested.
9084  */
9085
9086  if (req_byte >= 0 &&
9087      end_subject - start_match < REQ_BYTE_MAX &&
9088      !match_block.partial)
9089    {
9090    register const uschar *p = start_match + ((first_byte >= 0)? 1 : 0);
9091
9092    /* We don't need to repeat the search if we haven't yet reached the
9093    place we found it at last time. */
9094
9095    if (p > req_byte_ptr)
9096      {
9097      if (req_byte_caseless)
9098        {
9099        while (p < end_subject)
9100          {
9101          register int pp = *p++;
9102          if (pp == req_byte || pp == req_byte2) { p--; break; }
9103          }
9104        }
9105      else
9106        {
9107        while (p < end_subject)
9108          {
9109          if (*p++ == req_byte) { p--; break; }
9110          }
9111        }
9112
9113      /* If we can't find the required character, break the matching loop */
9114
9115      if (p >= end_subject) break;
9116
9117      /* If we have found the required character, save the point where we
9118      found it, so that we don't search again next time round the loop if
9119      the start hasn't passed this character yet. */
9120
9121      req_byte_ptr = p;
9122      }
9123    }
9124
9125  /* When a match occurs, substrings will be set for all internal extractions;
9126  we just need to set up the whole thing as substring 0 before returning. If
9127  there were too many extractions, set the return code to zero. In the case
9128  where we had to get some local store to hold offsets for backreferences, copy
9129  those back references that we can. In this case there need not be overflow
9130  if certain parts of the pattern were not used. */
9131
9132  match_block.start_match = start_match;
9133  match_block.match_call_count = 0;
9134
9135  rc = match(start_match, match_block.start_code, 2, &match_block, ims, NULL,
9136    match_isgroup);
9137
9138  if (rc == MATCH_NOMATCH)
9139    {
9140    start_match++;
9141#ifdef SUPPORT_UTF8
9142    if (match_block.utf8)
9143      while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
9144        start_match++;
9145#endif
9146    continue;
9147    }
9148
9149  if (rc != MATCH_MATCH)
9150    {
9151    DPRINTF((">>>> error: returning %d\n", rc));
9152    return rc;
9153    }
9154
9155  /* We have a match! Copy the offset information from temporary store if
9156  necessary */
9157
9158  if (using_temporary_offsets)
9159    {
9160    if (offsetcount >= 4)
9161      {
9162      memcpy(offsets + 2, match_block.offset_vector + 2,
9163        (offsetcount - 2) * sizeof(int));
9164      DPRINTF(("Copied offsets from temporary memory\n"));
9165      }
9166    if (match_block.end_offset_top > offsetcount)
9167      match_block.offset_overflow = TRUE;
9168
9169    DPRINTF(("Freeing temporary memory\n"));
9170    (pcre_free)(match_block.offset_vector);
9171    }
9172
9173  rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
9174
9175  if (offsetcount < 2) rc = 0; else
9176    {
9177    offsets[0] = start_match - match_block.start_subject;
9178    offsets[1] = match_block.end_match_ptr - match_block.start_subject;
9179    }
9180
9181  DPRINTF((">>>> returning %d\n", rc));
9182  return rc;
9183  }
9184
9185/* This "while" is the end of the "do" above */
9186
9187while (!anchored && start_match <= end_subject);
9188
9189if (using_temporary_offsets)
9190  {
9191  DPRINTF(("Freeing temporary memory\n"));
9192  (pcre_free)(match_block.offset_vector);
9193  }
9194
9195if (match_block.partial && match_block.hitend)
9196  {
9197  DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
9198  return PCRE_ERROR_PARTIAL;
9199  }
9200else
9201  {
9202  DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
9203  return PCRE_ERROR_NOMATCH;
9204  }
9205}
9206
9207/* End of pcre.c */
9208