1/*************************************************
2*      Perl-Compatible Regular Expressions       *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language (but see
7below for why this module is different).
8
9                       Written by Philip Hazel
10           Copyright (c) 1997-2012 University of Cambridge
11
12-----------------------------------------------------------------------------
13Redistribution and use in source and binary forms, with or without
14modification, are permitted provided that the following conditions are met:
15
16    * Redistributions of source code must retain the above copyright notice,
17      this list of conditions and the following disclaimer.
18
19    * Redistributions in binary form must reproduce the above copyright
20      notice, this list of conditions and the following disclaimer in the
21      documentation and/or other materials provided with the distribution.
22
23    * Neither the name of the University of Cambridge nor the names of its
24      contributors may be used to endorse or promote products derived from
25      this software without specific prior written permission.
26
27THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37POSSIBILITY OF SUCH DAMAGE.
38-----------------------------------------------------------------------------
39*/
40
41/* This module contains the external function pcre_dfa_exec(), which is an
42alternative matching function that uses a sort of DFA algorithm (not a true
43FSM). This is NOT Perl-compatible, but it has advantages in certain
44applications. */
45
46
47/* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
48the performance of his patterns greatly. I could not use it as it stood, as it
49was not thread safe, and made assumptions about pattern sizes. Also, it caused
50test 7 to loop, and test 9 to crash with a segfault.
51
52The issue is the check for duplicate states, which is done by a simple linear
53search up the state list. (Grep for "duplicate" below to find the code.) For
54many patterns, there will never be many states active at one time, so a simple
55linear search is fine. In patterns that have many active states, it might be a
56bottleneck. The suggested code used an indexing scheme to remember which states
57had previously been used for each character, and avoided the linear search when
58it knew there was no chance of a duplicate. This was implemented when adding
59states to the state lists.
60
61I wrote some thread-safe, not-limited code to try something similar at the time
62of checking for duplicates (instead of when adding states), using index vectors
63on the stack. It did give a 13% improvement with one specially constructed
64pattern for certain subject strings, but on other strings and on many of the
65simpler patterns in the test suite it did worse. The major problem, I think,
66was the extra time to initialize the index. This had to be done for each call
67of internal_dfa_exec(). (The supplied patch used a static vector, initialized
68only once - I suspect this was the cause of the problems with the tests.)
69
70Overall, I concluded that the gains in some cases did not outweigh the losses
71in others, so I abandoned this code. */
72
73
74
75#ifdef HAVE_CONFIG_H
76#include "config.h"
77#endif
78
79#define NLBLOCK md             /* Block containing newline information */
80#define PSSTART start_subject  /* Field containing processed string start */
81#define PSEND   end_subject    /* Field containing processed string end */
82
83#include "pcre_internal.h"
84
85
/* Padding string used to indent debugging output: the DPRINTF calls below
print a leading slice of it with "%.*s". */
87
88#define SP "                   "
89
90
91/*************************************************
92*      Code parameters and static tables         *
93*************************************************/
94
/* Offsets that are added to OP_TYPESTAR and its sibling opcodes to turn them
into distinct pseudo-opcodes under special conditions. The blocks are spaced
20 apart, which should be enough room for each family. The resulting values
are never stored in compiled code, so they do not have to fit below 256;
starting at 300 keeps them well clear of the normal opcodes. */

#define OP_EXTRA_BASE       300   /* Value of the first pseudo-opcode block */
#define OP_EXTRA_GAP        20    /* Distance between successive blocks */

#define OP_PROP_EXTRA       (OP_EXTRA_BASE + 0*OP_EXTRA_GAP)
#define OP_EXTUNI_EXTRA     (OP_EXTRA_BASE + 1*OP_EXTRA_GAP)
#define OP_ANYNL_EXTRA      (OP_EXTRA_BASE + 2*OP_EXTRA_GAP)
#define OP_HSPACE_EXTRA     (OP_EXTRA_BASE + 3*OP_EXTRA_GAP)
#define OP_VSPACE_EXTRA     (OP_EXTRA_BASE + 4*OP_EXTRA_GAP)
105
106
107/* This table identifies those opcodes that are followed immediately by a
108character that is to be tested in some way. This makes it possible to
109centralize the loading of these characters. In the case of Type * etc, the
110"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
111small value. Non-zero values in the table are the offsets from the opcode where
112the character is to be found. ***NOTE*** If the start of this table is
113modified, the three tables that follow must also be modified. */
114
115static const pcre_uint8 coptable[] = {
116  0,                             /* End                                    */
117  0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
118  0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
119  0, 0, 0,                       /* Any, AllAny, Anybyte                   */
120  0, 0,                          /* \P, \p                                 */
121  0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
122  0,                             /* \X                                     */
123  0, 0, 0, 0, 0, 0,              /* \Z, \z, ^, ^M, $, $M                   */
124  1,                             /* Char                                   */
125  1,                             /* Chari                                  */
126  1,                             /* not                                    */
127  1,                             /* noti                                   */
128  /* Positive single-char repeats                                          */
129  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
130  1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto, minupto                          */
131  1+IMM2_SIZE,                   /* exact                                  */
132  1, 1, 1, 1+IMM2_SIZE,          /* *+, ++, ?+, upto+                      */
133  1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
134  1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto I, minupto I                      */
135  1+IMM2_SIZE,                   /* exact I                                */
136  1, 1, 1, 1+IMM2_SIZE,          /* *+I, ++I, ?+I, upto+I                  */
137  /* Negative single-char repeats - only for chars < 256                   */
138  1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
139  1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto, minupto                      */
140  1+IMM2_SIZE,                   /* NOT exact                              */
141  1, 1, 1, 1+IMM2_SIZE,          /* NOT *+, ++, ?+, upto+                  */
142  1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
143  1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto I, minupto I                  */
144  1+IMM2_SIZE,                   /* NOT exact I                            */
145  1, 1, 1, 1+IMM2_SIZE,          /* NOT *+I, ++I, ?+I, upto+I              */
146  /* Positive type repeats                                                 */
147  1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
148  1+IMM2_SIZE, 1+IMM2_SIZE,      /* Type upto, minupto                     */
149  1+IMM2_SIZE,                   /* Type exact                             */
150  1, 1, 1, 1+IMM2_SIZE,          /* Type *+, ++, ?+, upto+                 */
151  /* Character class & ref repeats                                         */
152  0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
153  0, 0,                          /* CRRANGE, CRMINRANGE                    */
154  0,                             /* CLASS                                  */
155  0,                             /* NCLASS                                 */
156  0,                             /* XCLASS - variable length               */
157  0,                             /* REF                                    */
158  0,                             /* REFI                                   */
159  0,                             /* RECURSE                                */
160  0,                             /* CALLOUT                                */
161  0,                             /* Alt                                    */
162  0,                             /* Ket                                    */
163  0,                             /* KetRmax                                */
164  0,                             /* KetRmin                                */
165  0,                             /* KetRpos                                */
166  0,                             /* Reverse                                */
167  0,                             /* Assert                                 */
168  0,                             /* Assert not                             */
169  0,                             /* Assert behind                          */
170  0,                             /* Assert behind not                      */
171  0, 0,                          /* ONCE, ONCE_NC                          */
172  0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
173  0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
174  0, 0,                          /* CREF, NCREF                            */
175  0, 0,                          /* RREF, NRREF                            */
176  0,                             /* DEF                                    */
177  0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
178  0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
179  0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
180  0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
181  0, 0                           /* CLOSE, SKIPZERO  */
182};
183
184/* This table identifies those opcodes that inspect a character. It is used to
185remember the fact that a character could have been inspected when the end of
186the subject is reached. ***NOTE*** If the start of this table is modified, the
187two tables that follow must also be modified. */
188
189static const pcre_uint8 poptable[] = {
190  0,                             /* End                                    */
191  0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
192  1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
193  1, 1, 1,                       /* Any, AllAny, Anybyte                   */
194  1, 1,                          /* \P, \p                                 */
195  1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
196  1,                             /* \X                                     */
197  0, 0, 0, 0, 0, 0,              /* \Z, \z, ^, ^M, $, $M                   */
198  1,                             /* Char                                   */
199  1,                             /* Chari                                  */
200  1,                             /* not                                    */
201  1,                             /* noti                                   */
202  /* Positive single-char repeats                                          */
203  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
204  1, 1, 1,                       /* upto, minupto, exact                   */
205  1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
206  1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
207  1, 1, 1,                       /* upto I, minupto I, exact I             */
208  1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */
209  /* Negative single-char repeats - only for chars < 256                   */
210  1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
211  1, 1, 1,                       /* NOT upto, minupto, exact               */
212  1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
213  1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
214  1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
215  1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */
216  /* Positive type repeats                                                 */
217  1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
218  1, 1, 1,                       /* Type upto, minupto, exact              */
219  1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
220  /* Character class & ref repeats                                         */
221  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
222  1, 1,                          /* CRRANGE, CRMINRANGE                    */
223  1,                             /* CLASS                                  */
224  1,                             /* NCLASS                                 */
225  1,                             /* XCLASS - variable length               */
226  0,                             /* REF                                    */
227  0,                             /* REFI                                   */
228  0,                             /* RECURSE                                */
229  0,                             /* CALLOUT                                */
230  0,                             /* Alt                                    */
231  0,                             /* Ket                                    */
232  0,                             /* KetRmax                                */
233  0,                             /* KetRmin                                */
234  0,                             /* KetRpos                                */
235  0,                             /* Reverse                                */
236  0,                             /* Assert                                 */
237  0,                             /* Assert not                             */
238  0,                             /* Assert behind                          */
239  0,                             /* Assert behind not                      */
240  0, 0,                          /* ONCE, ONCE_NC                          */
241  0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
242  0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
243  0, 0,                          /* CREF, NCREF                            */
244  0, 0,                          /* RREF, NRREF                            */
245  0,                             /* DEF                                    */
246  0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
247  0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
248  0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
249  0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
250  0, 0                           /* CLOSE, SKIPZERO                        */
251};
252
253/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
254and \w */
255
256static const pcre_uint8 toptable1[] = {
257  0, 0, 0, 0, 0, 0,
258  ctype_digit, ctype_digit,
259  ctype_space, ctype_space,
260  ctype_word,  ctype_word,
261  0, 0                            /* OP_ANY, OP_ALLANY */
262};
263
264static const pcre_uint8 toptable2[] = {
265  0, 0, 0, 0, 0, 0,
266  ctype_digit, 0,
267  ctype_space, 0,
268  ctype_word,  0,
269  1, 1                            /* OP_ANY, OP_ALLANY */
270};
271
272
/* Structure for holding data about a particular state, which is in effect the
current data for an active path through the match tree. It must consist
entirely of ints because the working vector we are passed, and which we put
these structures in, is a vector of ints. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode; a negative value means
                                     "hold off entering the (negated) state" */
  int count;                      /* Count for repeats */
  int data;                       /* Some use extra data, e.g. the number of
                                     characters still to skip while holding off */
} stateblock;

/* Number of ints that one stateblock occupies in the workspace vector. The
expansion is fully parenthesized so that it behaves as a single operand inside
any larger expression. */

#define INTS_PER_STATEBLOCK  ((int)(sizeof(stateblock)/sizeof(int)))
285
286
287#ifdef PCRE_DEBUG
288/*************************************************
289*             Print character string             *
290*************************************************/
291
292/* Character string printing function for debugging.
293
294Arguments:
295  p            points to string
296  length       number of bytes
297  f            where to print
298
299Returns:       nothing
300*/
301
302static void
303pchars(const pcre_uchar *p, int length, FILE *f)
304{
305int c;
306while (length-- > 0)
307  {
308  if (isprint(c = *(p++)))
309    fprintf(f, "%c", c);
310  else
311    fprintf(f, "\\x%02x", c);
312  }
313}
314#endif
315
316
317
318/*************************************************
319*    Execute a Regular Expression - DFA engine   *
320*************************************************/
321
322/* This internal function applies a compiled pattern to a subject string,
323starting at a given point, using a DFA engine. This function is called from the
324external one, possibly multiple times if the pattern is not anchored. The
325function calls itself recursively for some kinds of subpattern.
326
327Arguments:
328  md                the match_data block with fixed information
329  this_start_code   the opening bracket of this subexpression's code
330  current_subject   where we currently are in the subject string
331  start_offset      start offset in the subject string
332  offsets           vector to contain the matching string offsets
333  offsetcount       size of same
334  workspace         vector of workspace
335  wscount           size of same
336  rlevel            function call recursion level
337
338Returns:            > 0 => number of match offset pairs placed in offsets
339                    = 0 => offsets overflowed; longest matches are present
340                     -1 => failed to match
341                   < -1 => some kind of unexpected problem
342
343The following macros are used for adding states to the two state vectors (one
344for the current character, one for the following character). */
345
/* Append a state with opcode offset x and repeat count y to the list of
states for the current character; the data field of the slot is not written.
If the list already holds wscount states, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE. The body is wrapped in do { } while (0) so that the
macro is a single statement wherever it is used. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
355
/* As ADD_ACTIVE, but also stores z in the data field of the new state. If the
list already holds wscount states, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE. The body is wrapped in do { } while (0) so that the
macro is a single statement wherever it is used. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->data   = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
366
/* Append a state with opcode offset x and repeat count y to the list of
states for the following character; the data field of the slot is not
written. If the list already holds wscount states, the enclosing function
returns PCRE_ERROR_DFA_WSSIZE. The body is wrapped in do { } while (0) so that
the macro is a single statement wherever it is used. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
376
/* As ADD_NEW, but also stores z in the data field of the new state; the
debugging output includes the source line of the call. If the list already
holds wscount states, the enclosing function returns PCRE_ERROR_DFA_WSSIZE.
The body is wrapped in do { } while (0) so that the macro is a single
statement wherever it is used. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->data   = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
      (x), (y), (z), __LINE__)); \
    } \
  while (0)
388
389/* And now, here is the code */
390
391static int
392internal_dfa_exec(
393  dfa_match_data *md,
394  const pcre_uchar *this_start_code,
395  const pcre_uchar *current_subject,
396  int start_offset,
397  int *offsets,
398  int offsetcount,
399  int *workspace,
400  int wscount,
401  int  rlevel)
402{
403stateblock *active_states, *new_states, *temp_states;
404stateblock *next_active_state, *next_new_state;
405
406const pcre_uint8 *ctypes, *lcc, *fcc;
407const pcre_uchar *ptr;
408const pcre_uchar *end_code, *first_op;
409
410dfa_recursion_info new_recursive;
411
412int active_count, new_count, match_count;
413
414/* Some fields in the md block are frequently referenced, so we load them into
415independent variables in the hope that this will perform better. */
416
417const pcre_uchar *start_subject = md->start_subject;
418const pcre_uchar *end_subject = md->end_subject;
419const pcre_uchar *start_code = md->start_code;
420
421#ifdef SUPPORT_UTF
422BOOL utf = (md->poptions & PCRE_UTF8) != 0;
423#else
424BOOL utf = FALSE;
425#endif
426
427BOOL reset_could_continue = FALSE;
428
429rlevel++;
430offsetcount &= (-2);
431
432wscount -= 2;
433wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
434          (2 * INTS_PER_STATEBLOCK);
435
436DPRINTF(("\n%.*s---------------------\n"
437  "%.*sCall to internal_dfa_exec f=%d\n",
438  rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
439
440ctypes = md->tables + ctypes_offset;
441lcc = md->tables + lcc_offset;
442fcc = md->tables + fcc_offset;
443
444match_count = PCRE_ERROR_NOMATCH;   /* A negative number */
445
446active_states = (stateblock *)(workspace + 2);
447next_new_state = new_states = active_states + wscount;
448new_count = 0;
449
450first_op = this_start_code + 1 + LINK_SIZE +
451  ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
452    *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
453    ? IMM2_SIZE:0);
454
455/* The first thing in any (sub) pattern is a bracket of some sort. Push all
456the alternative states onto the list, and find out where the end is. This
457makes is possible to use this function recursively, when we want to stop at a
458matching internal ket rather than at the end.
459
460If the first opcode in the first alternative is OP_REVERSE, we are dealing with
461a backward assertion. In that case, we have to find out the maximum amount to
462move back, and set up each alternative appropriately. */
463
464if (*first_op == OP_REVERSE)
465  {
466  int max_back = 0;
467  int gone_back;
468
469  end_code = this_start_code;
470  do
471    {
472    int back = GET(end_code, 2+LINK_SIZE);
473    if (back > max_back) max_back = back;
474    end_code += GET(end_code, 1);
475    }
476  while (*end_code == OP_ALT);
477
478  /* If we can't go back the amount required for the longest lookbehind
479  pattern, go back as far as we can; some alternatives may still be viable. */
480
481#ifdef SUPPORT_UTF
482  /* In character mode we have to step back character by character */
483
484  if (utf)
485    {
486    for (gone_back = 0; gone_back < max_back; gone_back++)
487      {
488      if (current_subject <= start_subject) break;
489      current_subject--;
490      ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
491      }
492    }
493  else
494#endif
495
496  /* In byte-mode we can do this quickly. */
497
498    {
499    gone_back = (current_subject - max_back < start_subject)?
500      (int)(current_subject - start_subject) : max_back;
501    current_subject -= gone_back;
502    }
503
504  /* Save the earliest consulted character */
505
506  if (current_subject < md->start_used_ptr)
507    md->start_used_ptr = current_subject;
508
509  /* Now we can process the individual branches. */
510
511  end_code = this_start_code;
512  do
513    {
514    int back = GET(end_code, 2+LINK_SIZE);
515    if (back <= gone_back)
516      {
517      int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
518      ADD_NEW_DATA(-bstate, 0, gone_back - back);
519      }
520    end_code += GET(end_code, 1);
521    }
522  while (*end_code == OP_ALT);
523 }
524
525/* This is the code for a "normal" subpattern (not a backward assertion). The
526start of a whole pattern is always one of these. If we are at the top level,
527we may be asked to restart matching from the same point that we reached for a
528previous partial match. We still have to scan through the top-level branches to
529find the end state. */
530
531else
532  {
533  end_code = this_start_code;
534
535  /* Restarting */
536
537  if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
538    {
539    do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
540    new_count = workspace[1];
541    if (!workspace[0])
542      memcpy(new_states, active_states, new_count * sizeof(stateblock));
543    }
544
545  /* Not restarting */
546
547  else
548    {
549    int length = 1 + LINK_SIZE +
550      ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
551        *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
552        ? IMM2_SIZE:0);
553    do
554      {
555      ADD_NEW((int)(end_code - start_code + length), 0);
556      end_code += GET(end_code, 1);
557      length = 1 + LINK_SIZE;
558      }
559    while (*end_code == OP_ALT);
560    }
561  }
562
563workspace[0] = 0;    /* Bit indicating which vector is current */
564
565DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
566
567/* Loop for scanning the subject */
568
569ptr = current_subject;
570for (;;)
571  {
572  int i, j;
573  int clen, dlen;
574  unsigned int c, d;
575  int forced_fail = 0;
576  BOOL partial_newline = FALSE;
577  BOOL could_continue = reset_could_continue;
578  reset_could_continue = FALSE;
579
580  /* Make the new state list into the active state list and empty the
581  new state list. */
582
583  temp_states = active_states;
584  active_states = new_states;
585  new_states = temp_states;
586  active_count = new_count;
587  new_count = 0;
588
589  workspace[0] ^= 1;              /* Remember for the restarting feature */
590  workspace[1] = active_count;
591
592#ifdef PCRE_DEBUG
593  printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
594  pchars(ptr, STRLEN_UC(ptr), stdout);
595  printf("\"\n");
596
597  printf("%.*sActive states: ", rlevel*2-2, SP);
598  for (i = 0; i < active_count; i++)
599    printf("%d/%d ", active_states[i].offset, active_states[i].count);
600  printf("\n");
601#endif
602
603  /* Set the pointers for adding new states */
604
605  next_active_state = active_states + active_count;
606  next_new_state = new_states;
607
608  /* Load the current character from the subject outside the loop, as many
609  different states may want to look at it, and we assume that at least one
610  will. */
611
612  if (ptr < end_subject)
613    {
614    clen = 1;        /* Number of data items in the character */
615#ifdef SUPPORT_UTF
616    if (utf) { GETCHARLEN(c, ptr, clen); } else
617#endif  /* SUPPORT_UTF */
618    c = *ptr;
619    }
620  else
621    {
622    clen = 0;        /* This indicates the end of the subject */
623    c = NOTACHAR;    /* This value should never actually be used */
624    }
625
626  /* Scan up the active states and act on each one. The result of an action
627  may be to add more states to the currently active list (e.g. on hitting a
628  parenthesis) or it may be to put states on the new list, for considering
629  when we move the character pointer on. */
630
631  for (i = 0; i < active_count; i++)
632    {
633    stateblock *current_state = active_states + i;
634    BOOL caseless = FALSE;
635    const pcre_uchar *code;
636    int state_offset = current_state->offset;
637    int count, codevalue, rrc;
638
639#ifdef PCRE_DEBUG
640    printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
641    if (clen == 0) printf("EOL\n");
642      else if (c > 32 && c < 127) printf("'%c'\n", c);
643        else printf("0x%02x\n", c);
644#endif
645
646    /* A negative offset is a special case meaning "hold off going to this
647    (negated) state until the number of characters in the data field have
648    been skipped". If the could_continue flag was passed over from a previous
649    state, arrange for it to passed on. */
650
651    if (state_offset < 0)
652      {
653      if (current_state->data > 0)
654        {
655        DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
656        ADD_NEW_DATA(state_offset, current_state->count,
657          current_state->data - 1);
658        if (could_continue) reset_could_continue = TRUE;
659        continue;
660        }
661      else
662        {
663        current_state->offset = state_offset = -state_offset;
664        }
665      }
666
667    /* Check for a duplicate state with the same count, and skip if found.
668    See the note at the head of this module about the possibility of improving
669    performance here. */
670
671    for (j = 0; j < i; j++)
672      {
673      if (active_states[j].offset == state_offset &&
674          active_states[j].count == current_state->count)
675        {
676        DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
677        goto NEXT_ACTIVE_STATE;
678        }
679      }
680
681    /* The state offset is the offset to the opcode */
682
683    code = start_code + state_offset;
684    codevalue = *code;
685
686    /* If this opcode inspects a character, but we are at the end of the
687    subject, remember the fact for use when testing for a partial match. */
688
689    if (clen == 0 && poptable[codevalue] != 0)
690      could_continue = TRUE;
691
692    /* If this opcode is followed by an inline character, load it. It is
693    tempting to test for the presence of a subject character here, but that
694    is wrong, because sometimes zero repetitions of the subject are
695    permitted.
696
697    We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
698    argument that is not a data character - but is always one byte long because
699    the values are small. We have to take special action to deal with  \P, \p,
700    \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
701    these ones to new opcodes. */
702
703    if (coptable[codevalue] > 0)
704      {
705      dlen = 1;
706#ifdef SUPPORT_UTF
707      if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
708#endif  /* SUPPORT_UTF */
709      d = code[coptable[codevalue]];
710      if (codevalue >= OP_TYPESTAR)
711        {
712        switch(d)
713          {
714          case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
715          case OP_NOTPROP:
716          case OP_PROP: codevalue += OP_PROP_EXTRA; break;
717          case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
718          case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
719          case OP_NOT_HSPACE:
720          case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
721          case OP_NOT_VSPACE:
722          case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
723          default: break;
724          }
725        }
726      }
727    else
728      {
729      dlen = 0;         /* Not strictly necessary, but compilers moan */
730      d = NOTACHAR;     /* if these variables are not set. */
731      }
732
733
734    /* Now process the individual opcodes */
735
736    switch (codevalue)
737      {
738/* ========================================================================== */
739      /* These cases are never obeyed. This is a fudge that causes a compile-
740      time error if the vectors coptable or poptable, which are indexed by
741      opcode, are not the correct length. It seems to be the only way to do
742      such a check at compile time, as the sizeof() operator does not work
743      in the C preprocessor. */
744
745      case OP_TABLE_LENGTH:
746      case OP_TABLE_LENGTH +
747        ((sizeof(coptable) == OP_TABLE_LENGTH) &&
748         (sizeof(poptable) == OP_TABLE_LENGTH)):
749      break;
750
751/* ========================================================================== */
752      /* Reached a closing bracket. If not at the end of the pattern, carry
753      on with the next opcode. For repeating opcodes, also add the repeat
754      state. Note that KETRPOS will always be encountered at the end of the
755      subpattern, because the possessive subpattern repeats are always handled
756      using recursive calls. Thus, it never adds any new states.
757
758      At the end of the (sub)pattern, unless we have an empty string and
759      PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
760      start of the subject, save the match data, shifting up all previous
761      matches so we always have the longest first. */
762
763      case OP_KET:
764      case OP_KETRMIN:
765      case OP_KETRMAX:
766      case OP_KETRPOS:
767      if (code != end_code)
768        {
769        ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
770        if (codevalue != OP_KET)
771          {
772          ADD_ACTIVE(state_offset - GET(code, 1), 0);
773          }
774        }
775      else
776        {
777        if (ptr > current_subject ||
778            ((md->moptions & PCRE_NOTEMPTY) == 0 &&
779              ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
780                current_subject > start_subject + md->start_offset)))
781          {
782          if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
783            else if (match_count > 0 && ++match_count * 2 > offsetcount)
784              match_count = 0;
785          count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
786          if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
787          if (offsetcount >= 2)
788            {
789            offsets[0] = (int)(current_subject - start_subject);
790            offsets[1] = (int)(ptr - start_subject);
791            DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
792              offsets[1] - offsets[0], (char *)current_subject));
793            }
794          if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
795            {
796            DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
797              "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
798              match_count, rlevel*2-2, SP));
799            return match_count;
800            }
801          }
802        }
803      break;
804
805/* ========================================================================== */
806      /* These opcodes add to the current list of states without looking
807      at the current character. */
808
809      /*-----------------------------------------------------------------*/
810      case OP_ALT:
811      do { code += GET(code, 1); } while (*code == OP_ALT);
812      ADD_ACTIVE((int)(code - start_code), 0);
813      break;
814
815      /*-----------------------------------------------------------------*/
816      case OP_BRA:
817      case OP_SBRA:
818      do
819        {
820        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
821        code += GET(code, 1);
822        }
823      while (*code == OP_ALT);
824      break;
825
826      /*-----------------------------------------------------------------*/
827      case OP_CBRA:
828      case OP_SCBRA:
829      ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE),  0);
830      code += GET(code, 1);
831      while (*code == OP_ALT)
832        {
833        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
834        code += GET(code, 1);
835        }
836      break;
837
838      /*-----------------------------------------------------------------*/
839      case OP_BRAZERO:
840      case OP_BRAMINZERO:
841      ADD_ACTIVE(state_offset + 1, 0);
842      code += 1 + GET(code, 2);
843      while (*code == OP_ALT) code += GET(code, 1);
844      ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
845      break;
846
847      /*-----------------------------------------------------------------*/
848      case OP_SKIPZERO:
849      code += 1 + GET(code, 2);
850      while (*code == OP_ALT) code += GET(code, 1);
851      ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
852      break;
853
854      /*-----------------------------------------------------------------*/
855      case OP_CIRC:
856      if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
857        { ADD_ACTIVE(state_offset + 1, 0); }
858      break;
859
860      /*-----------------------------------------------------------------*/
861      case OP_CIRCM:
862      if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
863          (ptr != end_subject && WAS_NEWLINE(ptr)))
864        { ADD_ACTIVE(state_offset + 1, 0); }
865      break;
866
867      /*-----------------------------------------------------------------*/
868      case OP_EOD:
869      if (ptr >= end_subject)
870        {
871        if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
872          could_continue = TRUE;
873        else { ADD_ACTIVE(state_offset + 1, 0); }
874        }
875      break;
876
877      /*-----------------------------------------------------------------*/
878      case OP_SOD:
879      if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
880      break;
881
882      /*-----------------------------------------------------------------*/
883      case OP_SOM:
884      if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
885      break;
886
887
888/* ========================================================================== */
889      /* These opcodes inspect the next subject character, and sometimes
890      the previous one as well, but do not have an argument. The variable
891      clen contains the length of the current character and is zero if we are
892      at the end of the subject. */
893
894      /*-----------------------------------------------------------------*/
895      case OP_ANY:
896      if (clen > 0 && !IS_NEWLINE(ptr))
897        {
898        if (ptr + 1 >= md->end_subject &&
899            (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
900            NLBLOCK->nltype == NLTYPE_FIXED &&
901            NLBLOCK->nllen == 2 &&
902            c == NLBLOCK->nl[0])
903          {
904          could_continue = partial_newline = TRUE;
905          }
906        else
907          {
908          ADD_NEW(state_offset + 1, 0);
909          }
910        }
911      break;
912
913      /*-----------------------------------------------------------------*/
914      case OP_ALLANY:
915      if (clen > 0)
916        { ADD_NEW(state_offset + 1, 0); }
917      break;
918
919      /*-----------------------------------------------------------------*/
920      case OP_EODN:
921      if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
922        could_continue = TRUE;
923      else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
924        { ADD_ACTIVE(state_offset + 1, 0); }
925      break;
926
927      /*-----------------------------------------------------------------*/
928      case OP_DOLL:
929      if ((md->moptions & PCRE_NOTEOL) == 0)
930        {
931        if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
932          could_continue = TRUE;
933        else if (clen == 0 ||
934            ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
935               (ptr == end_subject - md->nllen)
936            ))
937          { ADD_ACTIVE(state_offset + 1, 0); }
938        else if (ptr + 1 >= md->end_subject &&
939                 (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
940                 NLBLOCK->nltype == NLTYPE_FIXED &&
941                 NLBLOCK->nllen == 2 &&
942                 c == NLBLOCK->nl[0])
943          {
944          if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
945            {
946            reset_could_continue = TRUE;
947            ADD_NEW_DATA(-(state_offset + 1), 0, 1);
948            }
949          else could_continue = partial_newline = TRUE;
950          }
951        }
952      break;
953
954      /*-----------------------------------------------------------------*/
955      case OP_DOLLM:
956      if ((md->moptions & PCRE_NOTEOL) == 0)
957        {
958        if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
959          could_continue = TRUE;
960        else if (clen == 0 ||
961            ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
962          { ADD_ACTIVE(state_offset + 1, 0); }
963        else if (ptr + 1 >= md->end_subject &&
964                 (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
965                 NLBLOCK->nltype == NLTYPE_FIXED &&
966                 NLBLOCK->nllen == 2 &&
967                 c == NLBLOCK->nl[0])
968          {
969          if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
970            {
971            reset_could_continue = TRUE;
972            ADD_NEW_DATA(-(state_offset + 1), 0, 1);
973            }
974          else could_continue = partial_newline = TRUE;
975          }
976        }
977      else if (IS_NEWLINE(ptr))
978        { ADD_ACTIVE(state_offset + 1, 0); }
979      break;
980
981      /*-----------------------------------------------------------------*/
982
983      case OP_DIGIT:
984      case OP_WHITESPACE:
985      case OP_WORDCHAR:
986      if (clen > 0 && c < 256 &&
987            ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
988        { ADD_NEW(state_offset + 1, 0); }
989      break;
990
991      /*-----------------------------------------------------------------*/
992      case OP_NOT_DIGIT:
993      case OP_NOT_WHITESPACE:
994      case OP_NOT_WORDCHAR:
995      if (clen > 0 && (c >= 256 ||
996            ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
997        { ADD_NEW(state_offset + 1, 0); }
998      break;
999
1000      /*-----------------------------------------------------------------*/
1001      case OP_WORD_BOUNDARY:
1002      case OP_NOT_WORD_BOUNDARY:
1003        {
1004        int left_word, right_word;
1005
1006        if (ptr > start_subject)
1007          {
1008          const pcre_uchar *temp = ptr - 1;
1009          if (temp < md->start_used_ptr) md->start_used_ptr = temp;
1010#ifdef SUPPORT_UTF
1011          if (utf) { BACKCHAR(temp); }
1012#endif
1013          GETCHARTEST(d, temp);
1014#ifdef SUPPORT_UCP
1015          if ((md->poptions & PCRE_UCP) != 0)
1016            {
1017            if (d == '_') left_word = TRUE; else
1018              {
1019              int cat = UCD_CATEGORY(d);
1020              left_word = (cat == ucp_L || cat == ucp_N);
1021              }
1022            }
1023          else
1024#endif
1025          left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1026          }
1027        else left_word = FALSE;
1028
1029        if (clen > 0)
1030          {
1031#ifdef SUPPORT_UCP
1032          if ((md->poptions & PCRE_UCP) != 0)
1033            {
1034            if (c == '_') right_word = TRUE; else
1035              {
1036              int cat = UCD_CATEGORY(c);
1037              right_word = (cat == ucp_L || cat == ucp_N);
1038              }
1039            }
1040          else
1041#endif
1042          right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1043          }
1044        else right_word = FALSE;
1045
1046        if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1047          { ADD_ACTIVE(state_offset + 1, 0); }
1048        }
1049      break;
1050
1051
1052      /*-----------------------------------------------------------------*/
1053      /* Check the next character by Unicode property. We will get here only
1054      if the support is in the binary; otherwise a compile-time error occurs.
1055      */
1056
1057#ifdef SUPPORT_UCP
1058      case OP_PROP:
1059      case OP_NOTPROP:
1060      if (clen > 0)
1061        {
1062        BOOL OK;
1063        const ucd_record * prop = GET_UCD(c);
1064        switch(code[1])
1065          {
1066          case PT_ANY:
1067          OK = TRUE;
1068          break;
1069
1070          case PT_LAMP:
1071          OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1072               prop->chartype == ucp_Lt;
1073          break;
1074
1075          case PT_GC:
1076          OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1077          break;
1078
1079          case PT_PC:
1080          OK = prop->chartype == code[2];
1081          break;
1082
1083          case PT_SC:
1084          OK = prop->script == code[2];
1085          break;
1086
1087          /* These are specials for combination cases. */
1088
1089          case PT_ALNUM:
1090          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1091               PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1092          break;
1093
1094          case PT_SPACE:    /* Perl space */
1095          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1096               c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1097          break;
1098
1099          case PT_PXSPACE:  /* POSIX space */
1100          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1101               c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1102               c == CHAR_FF || c == CHAR_CR;
1103          break;
1104
1105          case PT_WORD:
1106          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1107               PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1108               c == CHAR_UNDERSCORE;
1109          break;
1110
1111          /* Should never occur, but keep compilers from grumbling. */
1112
1113          default:
1114          OK = codevalue != OP_PROP;
1115          break;
1116          }
1117
1118        if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1119        }
1120      break;
1121#endif
1122
1123
1124
1125/* ========================================================================== */
1126      /* These opcodes likewise inspect the subject character, but have an
1127      argument that is not a data character. It is one of these opcodes:
1128      OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE,
1129      OP_NOT_WHITESPACE, OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1130
1131      case OP_TYPEPLUS:
1132      case OP_TYPEMINPLUS:
1133      case OP_TYPEPOSPLUS:
1134      count = current_state->count;  /* Already matched */
1135      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1136      if (clen > 0)
1137        {
1138        if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1139            (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1140            NLBLOCK->nltype == NLTYPE_FIXED &&
1141            NLBLOCK->nllen == 2 &&
1142            c == NLBLOCK->nl[0])
1143          {
1144          could_continue = partial_newline = TRUE;
1145          }
1146        else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1147            (c < 256 &&
1148              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1149              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1150          {
1151          if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1152            {
1153            active_count--;            /* Remove non-match possibility */
1154            next_active_state--;
1155            }
1156          count++;
1157          ADD_NEW(state_offset, count);
1158          }
1159        }
1160      break;
1161
1162      /*-----------------------------------------------------------------*/
1163      case OP_TYPEQUERY:
1164      case OP_TYPEMINQUERY:
1165      case OP_TYPEPOSQUERY:
1166      ADD_ACTIVE(state_offset + 2, 0);
1167      if (clen > 0)
1168        {
1169        if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1170            (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1171            NLBLOCK->nltype == NLTYPE_FIXED &&
1172            NLBLOCK->nllen == 2 &&
1173            c == NLBLOCK->nl[0])
1174          {
1175          could_continue = partial_newline = TRUE;
1176          }
1177        else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1178            (c < 256 &&
1179              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1180              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1181          {
1182          if (codevalue == OP_TYPEPOSQUERY)
1183            {
1184            active_count--;            /* Remove non-match possibility */
1185            next_active_state--;
1186            }
1187          ADD_NEW(state_offset + 2, 0);
1188          }
1189        }
1190      break;
1191
1192      /*-----------------------------------------------------------------*/
1193      case OP_TYPESTAR:
1194      case OP_TYPEMINSTAR:
1195      case OP_TYPEPOSSTAR:
1196      ADD_ACTIVE(state_offset + 2, 0);
1197      if (clen > 0)
1198        {
1199        if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1200            (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1201            NLBLOCK->nltype == NLTYPE_FIXED &&
1202            NLBLOCK->nllen == 2 &&
1203            c == NLBLOCK->nl[0])
1204          {
1205          could_continue = partial_newline = TRUE;
1206          }
1207        else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1208            (c < 256 &&
1209              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1210              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1211          {
1212          if (codevalue == OP_TYPEPOSSTAR)
1213            {
1214            active_count--;            /* Remove non-match possibility */
1215            next_active_state--;
1216            }
1217          ADD_NEW(state_offset, 0);
1218          }
1219        }
1220      break;
1221
1222      /*-----------------------------------------------------------------*/
1223      case OP_TYPEEXACT:
1224      count = current_state->count;  /* Number already matched */
1225      if (clen > 0)
1226        {
1227        if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1228            (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1229            NLBLOCK->nltype == NLTYPE_FIXED &&
1230            NLBLOCK->nllen == 2 &&
1231            c == NLBLOCK->nl[0])
1232          {
1233          could_continue = partial_newline = TRUE;
1234          }
1235        else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1236            (c < 256 &&
1237              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1238              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1239          {
1240          if (++count >= GET2(code, 1))
1241            { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1242          else
1243            { ADD_NEW(state_offset, count); }
1244          }
1245        }
1246      break;
1247
1248      /*-----------------------------------------------------------------*/
1249      case OP_TYPEUPTO:
1250      case OP_TYPEMINUPTO:
1251      case OP_TYPEPOSUPTO:
1252      ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1253      count = current_state->count;  /* Number already matched */
1254      if (clen > 0)
1255        {
1256        if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1257            (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1258            NLBLOCK->nltype == NLTYPE_FIXED &&
1259            NLBLOCK->nllen == 2 &&
1260            c == NLBLOCK->nl[0])
1261          {
1262          could_continue = partial_newline = TRUE;
1263          }
1264        else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1265            (c < 256 &&
1266              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1267              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1268          {
1269          if (codevalue == OP_TYPEPOSUPTO)
1270            {
1271            active_count--;           /* Remove non-match possibility */
1272            next_active_state--;
1273            }
1274          if (++count >= GET2(code, 1))
1275            { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1276          else
1277            { ADD_NEW(state_offset, count); }
1278          }
1279        }
1280      break;
1281
1282/* ========================================================================== */
1283      /* These are virtual opcodes that are used when something like
1284      OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_HSPACE,
1285      OP_NOT_HSPACE, OP_VSPACE, or OP_NOT_VSPACE as its argument. This keeps the
1286      code above fast for the other cases. The argument is in the d variable. */
1287
1288#ifdef SUPPORT_UCP
1289      case OP_PROP_EXTRA + OP_TYPEPLUS:
1290      case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1291      case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1292      count = current_state->count;           /* Already matched */
1293      if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1294      if (clen > 0)
1295        {
1296        BOOL OK;
1297        const ucd_record * prop = GET_UCD(c);
1298        switch(code[2])
1299          {
1300          case PT_ANY:
1301          OK = TRUE;
1302          break;
1303
1304          case PT_LAMP:
1305          OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1306            prop->chartype == ucp_Lt;
1307          break;
1308
1309          case PT_GC:
1310          OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1311          break;
1312
1313          case PT_PC:
1314          OK = prop->chartype == code[3];
1315          break;
1316
1317          case PT_SC:
1318          OK = prop->script == code[3];
1319          break;
1320
1321          /* These are specials for combination cases. */
1322
1323          case PT_ALNUM:
1324          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1325               PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1326          break;
1327
1328          case PT_SPACE:    /* Perl space */
1329          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1330               c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1331          break;
1332
1333          case PT_PXSPACE:  /* POSIX space */
1334          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1335               c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1336               c == CHAR_FF || c == CHAR_CR;
1337          break;
1338
1339          case PT_WORD:
1340          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1341               PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1342               c == CHAR_UNDERSCORE;
1343          break;
1344
1345          /* Should never occur, but keep compilers from grumbling. */
1346
1347          default:
1348          OK = codevalue != OP_PROP;
1349          break;
1350          }
1351
1352        if (OK == (d == OP_PROP))
1353          {
1354          if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1355            {
1356            active_count--;           /* Remove non-match possibility */
1357            next_active_state--;
1358            }
1359          count++;
1360          ADD_NEW(state_offset, count);
1361          }
1362        }
1363      break;
1364
1365      /*-----------------------------------------------------------------*/
1366      case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1367      case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1368      case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1369      count = current_state->count;  /* Already matched */
1370      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1371      if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1372        {
1373        const pcre_uchar *nptr = ptr + clen;
1374        int ncount = 0;
1375        if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1376          {
1377          active_count--;           /* Remove non-match possibility */
1378          next_active_state--;
1379          }
1380        while (nptr < end_subject)
1381          {
1382          int nd;
1383          int ndlen = 1;
1384          GETCHARLEN(nd, nptr, ndlen);
1385          if (UCD_CATEGORY(nd) != ucp_M) break;
1386          ncount++;
1387          nptr += ndlen;
1388          }
1389        count++;
1390        ADD_NEW_DATA(-state_offset, count, ncount);
1391        }
1392      break;
1393#endif
1394
1395      /*-----------------------------------------------------------------*/
1396      case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1397      case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1398      case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1399      count = current_state->count;  /* Already matched */
1400      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1401      if (clen > 0)
1402        {
1403        int ncount = 0;
1404        switch (c)
1405          {
1406          case 0x000b:
1407          case 0x000c:
1408          case 0x0085:
1409          case 0x2028:
1410          case 0x2029:
1411          if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1412          goto ANYNL01;
1413
1414          case 0x000d:
1415          if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1416          /* Fall through */
1417
1418          ANYNL01:
1419          case 0x000a:
1420          if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1421            {
1422            active_count--;           /* Remove non-match possibility */
1423            next_active_state--;
1424            }
1425          count++;
1426          ADD_NEW_DATA(-state_offset, count, ncount);
1427          break;
1428
1429          default:
1430          break;
1431          }
1432        }
1433      break;
1434
1435      /*-----------------------------------------------------------------*/
1436      case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1437      case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1438      case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1439      count = current_state->count;  /* Already matched */
1440      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1441      if (clen > 0)
1442        {
1443        BOOL OK;
1444        switch (c)
1445          {
1446          case 0x000a:
1447          case 0x000b:
1448          case 0x000c:
1449          case 0x000d:
1450          case 0x0085:
1451          case 0x2028:
1452          case 0x2029:
1453          OK = TRUE;
1454          break;
1455
1456          default:
1457          OK = FALSE;
1458          break;
1459          }
1460
1461        if (OK == (d == OP_VSPACE))
1462          {
1463          if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1464            {
1465            active_count--;           /* Remove non-match possibility */
1466            next_active_state--;
1467            }
1468          count++;
1469          ADD_NEW_DATA(-state_offset, count, 0);
1470          }
1471        }
1472      break;
1473
1474      /*-----------------------------------------------------------------*/
1475      case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1476      case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1477      case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1478      count = current_state->count;  /* Already matched */
1479      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1480      if (clen > 0)
1481        {
1482        BOOL OK;
1483        switch (c)
1484          {
1485          case 0x09:      /* HT */
1486          case 0x20:      /* SPACE */
1487          case 0xa0:      /* NBSP */
1488          case 0x1680:    /* OGHAM SPACE MARK */
1489          case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1490          case 0x2000:    /* EN QUAD */
1491          case 0x2001:    /* EM QUAD */
1492          case 0x2002:    /* EN SPACE */
1493          case 0x2003:    /* EM SPACE */
1494          case 0x2004:    /* THREE-PER-EM SPACE */
1495          case 0x2005:    /* FOUR-PER-EM SPACE */
1496          case 0x2006:    /* SIX-PER-EM SPACE */
1497          case 0x2007:    /* FIGURE SPACE */
1498          case 0x2008:    /* PUNCTUATION SPACE */
1499          case 0x2009:    /* THIN SPACE */
1500          case 0x200A:    /* HAIR SPACE */
1501          case 0x202f:    /* NARROW NO-BREAK SPACE */
1502          case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1503          case 0x3000:    /* IDEOGRAPHIC SPACE */
1504          OK = TRUE;
1505          break;
1506
1507          default:
1508          OK = FALSE;
1509          break;
1510          }
1511
1512        if (OK == (d == OP_HSPACE))
1513          {
1514          if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1515            {
1516            active_count--;           /* Remove non-match possibility */
1517            next_active_state--;
1518            }
1519          count++;
1520          ADD_NEW_DATA(-state_offset, count, 0);
1521          }
1522        }
1523      break;
1524
1525      /*-----------------------------------------------------------------*/
1526#ifdef SUPPORT_UCP
1527      case OP_PROP_EXTRA + OP_TYPEQUERY:
1528      case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1529      case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1530      count = 4;
1531      goto QS1;
1532
1533      case OP_PROP_EXTRA + OP_TYPESTAR:
1534      case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1535      case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1536      count = 0;
1537
1538      QS1:
1539
1540      ADD_ACTIVE(state_offset + 4, 0);
1541      if (clen > 0)
1542        {
1543        BOOL OK;
1544        const ucd_record * prop = GET_UCD(c);
1545        switch(code[2])
1546          {
1547          case PT_ANY:
1548          OK = TRUE;
1549          break;
1550
1551          case PT_LAMP:
1552          OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1553            prop->chartype == ucp_Lt;
1554          break;
1555
1556          case PT_GC:
1557          OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1558          break;
1559
1560          case PT_PC:
1561          OK = prop->chartype == code[3];
1562          break;
1563
1564          case PT_SC:
1565          OK = prop->script == code[3];
1566          break;
1567
1568          /* These are specials for combination cases. */
1569
1570          case PT_ALNUM:
1571          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1572               PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1573          break;
1574
1575          case PT_SPACE:    /* Perl space */
1576          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1577               c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1578          break;
1579
1580          case PT_PXSPACE:  /* POSIX space */
1581          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1582               c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1583               c == CHAR_FF || c == CHAR_CR;
1584          break;
1585
1586          case PT_WORD:
1587          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1588               PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1589               c == CHAR_UNDERSCORE;
1590          break;
1591
1592          /* Should never occur, but keep compilers from grumbling. */
1593
1594          default:
1595          OK = codevalue != OP_PROP;
1596          break;
1597          }
1598
1599        if (OK == (d == OP_PROP))
1600          {
1601          if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1602              codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1603            {
1604            active_count--;           /* Remove non-match possibility */
1605            next_active_state--;
1606            }
1607          ADD_NEW(state_offset + count, 0);
1608          }
1609        }
1610      break;
1611
1612      /*-----------------------------------------------------------------*/
1613      case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1614      case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1615      case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1616      count = 2;
1617      goto QS2;
1618
1619      case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1620      case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1621      case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1622      count = 0;
1623
1624      QS2:
1625
1626      ADD_ACTIVE(state_offset + 2, 0);
1627      if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1628        {
1629        const pcre_uchar *nptr = ptr + clen;
1630        int ncount = 0;
1631        if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1632            codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1633          {
1634          active_count--;           /* Remove non-match possibility */
1635          next_active_state--;
1636          }
1637        while (nptr < end_subject)
1638          {
1639          int nd;
1640          int ndlen = 1;
1641          GETCHARLEN(nd, nptr, ndlen);
1642          if (UCD_CATEGORY(nd) != ucp_M) break;
1643          ncount++;
1644          nptr += ndlen;
1645          }
1646        ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1647        }
1648      break;
1649#endif
1650
1651      /*-----------------------------------------------------------------*/
      /* OP_ANYNL (any newline sequence) with a "?" or "*" quantifier. count is
      set to the amount that is added to state_offset when the follow-on state
      is queued: 2 for the query forms and 0 for the star forms. */

      case OP_ANYNL_EXTRA + OP_TYPEQUERY:
      case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
      case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
      count = 2;
      goto QS3;

      case OP_ANYNL_EXTRA + OP_TYPESTAR:
      case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
      case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
      count = 0;

      QS3:
      ADD_ACTIVE(state_offset + 2, 0);
      if (clen > 0)
        {
        int ncount = 0;   /* Becomes 1 when the LF of a CRLF pair follows */
        switch (c)
          {
          /* These characters are ignored when PCRE_BSR_ANYCRLF is set in
          md->moptions; otherwise they are handled exactly like LF. */

          case 0x000b:
          case 0x000c:
          case 0x0085:
          case 0x2028:
          case 0x2029:
          if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
          goto ANYNL02;

          /* CR: if LF follows, record one extra character to pass over. */

          case 0x000d:
          if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
          /* Fall through */

          ANYNL02:
          case 0x000a:
          if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
              codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
            {
            active_count--;           /* Remove non-match possibility */
            next_active_state--;
            }
          ADD_NEW_DATA(-(state_offset + count), 0, ncount);
          break;

          /* Any other character is not a newline: nothing is added. */

          default:
          break;
          }
        }
      break;
1698
1699      /*-----------------------------------------------------------------*/
1700      case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1701      case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1702      case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1703      count = 2;
1704      goto QS4;
1705
1706      case OP_VSPACE_EXTRA + OP_TYPESTAR:
1707      case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1708      case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1709      count = 0;
1710
1711      QS4:
1712      ADD_ACTIVE(state_offset + 2, 0);
1713      if (clen > 0)
1714        {
1715        BOOL OK;
1716        switch (c)
1717          {
1718          case 0x000a:
1719          case 0x000b:
1720          case 0x000c:
1721          case 0x000d:
1722          case 0x0085:
1723          case 0x2028:
1724          case 0x2029:
1725          OK = TRUE;
1726          break;
1727
1728          default:
1729          OK = FALSE;
1730          break;
1731          }
1732        if (OK == (d == OP_VSPACE))
1733          {
1734          if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1735              codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1736            {
1737            active_count--;           /* Remove non-match possibility */
1738            next_active_state--;
1739            }
1740          ADD_NEW_DATA(-(state_offset + count), 0, 0);
1741          }
1742        }
1743      break;
1744
1745      /*-----------------------------------------------------------------*/
1746      case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1747      case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1748      case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1749      count = 2;
1750      goto QS5;
1751
1752      case OP_HSPACE_EXTRA + OP_TYPESTAR:
1753      case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1754      case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1755      count = 0;
1756
1757      QS5:
1758      ADD_ACTIVE(state_offset + 2, 0);
1759      if (clen > 0)
1760        {
1761        BOOL OK;
1762        switch (c)
1763          {
1764          case 0x09:      /* HT */
1765          case 0x20:      /* SPACE */
1766          case 0xa0:      /* NBSP */
1767          case 0x1680:    /* OGHAM SPACE MARK */
1768          case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1769          case 0x2000:    /* EN QUAD */
1770          case 0x2001:    /* EM QUAD */
1771          case 0x2002:    /* EN SPACE */
1772          case 0x2003:    /* EM SPACE */
1773          case 0x2004:    /* THREE-PER-EM SPACE */
1774          case 0x2005:    /* FOUR-PER-EM SPACE */
1775          case 0x2006:    /* SIX-PER-EM SPACE */
1776          case 0x2007:    /* FIGURE SPACE */
1777          case 0x2008:    /* PUNCTUATION SPACE */
1778          case 0x2009:    /* THIN SPACE */
1779          case 0x200A:    /* HAIR SPACE */
1780          case 0x202f:    /* NARROW NO-BREAK SPACE */
1781          case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1782          case 0x3000:    /* IDEOGRAPHIC SPACE */
1783          OK = TRUE;
1784          break;
1785
1786          default:
1787          OK = FALSE;
1788          break;
1789          }
1790
1791        if (OK == (d == OP_HSPACE))
1792          {
1793          if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1794              codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1795            {
1796            active_count--;           /* Remove non-match possibility */
1797            next_active_state--;
1798            }
1799          ADD_NEW_DATA(-(state_offset + count), 0, 0);
1800          }
1801        }
1802      break;
1803
1804      /*-----------------------------------------------------------------*/
1805#ifdef SUPPORT_UCP
      /* Unicode property test with an exact or "up to" repeat count. The
      limit is read with GET2(code, 1); the property type is taken from
      code[1 + IMM2_SIZE + 1] and its value from code[1 + IMM2_SIZE + 2]. For
      everything except the exact form, the state that follows the item is
      also added to the active list before the character is examined. */

      case OP_PROP_EXTRA + OP_TYPEEXACT:
      case OP_PROP_EXTRA + OP_TYPEUPTO:
      case OP_PROP_EXTRA + OP_TYPEMINUPTO:
      case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
      if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
        { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
      count = current_state->count;  /* Number already matched */
      if (clen > 0)
        {
        BOOL OK;
        const ucd_record * prop = GET_UCD(c);
        switch(code[1 + IMM2_SIZE + 1])
          {
          case PT_ANY:
          OK = TRUE;
          break;

          /* Upper case, lower case or title case letter */

          case PT_LAMP:
          OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
            prop->chartype == ucp_Lt;
          break;

          /* General category, looked up from the character type */

          case PT_GC:
          OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
          break;

          /* Particular character type */

          case PT_PC:
          OK = prop->chartype == code[1 + IMM2_SIZE + 2];
          break;

          /* Script */

          case PT_SC:
          OK = prop->script == code[1 + IMM2_SIZE + 2];
          break;

          /* These are specials for combination cases. */

          case PT_ALNUM:
          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
               PRIV(ucp_gentype)[prop->chartype] == ucp_N;
          break;

          case PT_SPACE:    /* Perl space */
          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
               c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
          break;

          case PT_PXSPACE:  /* POSIX space */
          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
               c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
               c == CHAR_FF || c == CHAR_CR;
          break;

          case PT_WORD:
          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
               PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
               c == CHAR_UNDERSCORE;
          break;

          /* Should never occur, but keep compilers from grumbling. */

          default:
          OK = codevalue != OP_PROP;
          break;
          }

        /* The character is accepted when the test result agrees with the
        sense of the item: d is OP_PROP for the positive form. */

        if (OK == (d == OP_PROP))
          {
          if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
            {
            active_count--;           /* Remove non-match possibility */
            next_active_state--;
            }

          /* Once the limit is reached, move on to the state after the item;
          otherwise stay on this item with the updated count. */

          if (++count >= GET2(code, 1))
            { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
          else
            { ADD_NEW(state_offset, count); }
          }
        }
      break;
1885
1886      /*-----------------------------------------------------------------*/
1887      case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1888      case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1889      case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1890      case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1891      if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1892        { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1893      count = current_state->count;  /* Number already matched */
1894      if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1895        {
1896        const pcre_uchar *nptr = ptr + clen;
1897        int ncount = 0;
1898        if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1899          {
1900          active_count--;           /* Remove non-match possibility */
1901          next_active_state--;
1902          }
1903        while (nptr < end_subject)
1904          {
1905          int nd;
1906          int ndlen = 1;
1907          GETCHARLEN(nd, nptr, ndlen);
1908          if (UCD_CATEGORY(nd) != ucp_M) break;
1909          ncount++;
1910          nptr += ndlen;
1911          }
1912        if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
1913            reset_could_continue = TRUE;
1914        if (++count >= GET2(code, 1))
1915          { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1916        else
1917          { ADD_NEW_DATA(-state_offset, count, ncount); }
1918        }
1919      break;
1920#endif
1921
1922      /*-----------------------------------------------------------------*/
      /* OP_ANYNL (any newline sequence) with an exact or "up to" repeat
      count; the limit is read with GET2(code, 1). For everything except the
      exact form, the state that follows the item is added to the active list
      before the character is examined. */

      case OP_ANYNL_EXTRA + OP_TYPEEXACT:
      case OP_ANYNL_EXTRA + OP_TYPEUPTO:
      case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
      case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
      if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
        { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
      count = current_state->count;  /* Number already matched */
      if (clen > 0)
        {
        int ncount = 0;   /* Becomes 1 when the LF of a CRLF pair follows */
        switch (c)
          {
          /* These characters are ignored when PCRE_BSR_ANYCRLF is set in
          md->moptions; otherwise they are handled exactly like LF. */

          case 0x000b:
          case 0x000c:
          case 0x0085:
          case 0x2028:
          case 0x2029:
          if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
          goto ANYNL03;

          /* CR: if LF follows, record one extra character to pass over. */

          case 0x000d:
          if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
          /* Fall through */

          ANYNL03:
          case 0x000a:
          if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
            {
            active_count--;           /* Remove non-match possibility */
            next_active_state--;
            }

          /* Once the limit is reached, move on to the state after the item;
          otherwise stay on this item with the updated count. */

          if (++count >= GET2(code, 1))
            { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
          else
            { ADD_NEW_DATA(-state_offset, count, ncount); }
          break;

          /* Any other character is not a newline: nothing is added. */

          default:
          break;
          }
        }
      break;
1965
1966      /*-----------------------------------------------------------------*/
1967      case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1968      case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1969      case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1970      case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1971      if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1972        { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1973      count = current_state->count;  /* Number already matched */
1974      if (clen > 0)
1975        {
1976        BOOL OK;
1977        switch (c)
1978          {
1979          case 0x000a:
1980          case 0x000b:
1981          case 0x000c:
1982          case 0x000d:
1983          case 0x0085:
1984          case 0x2028:
1985          case 0x2029:
1986          OK = TRUE;
1987          break;
1988
1989          default:
1990          OK = FALSE;
1991          }
1992
1993        if (OK == (d == OP_VSPACE))
1994          {
1995          if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1996            {
1997            active_count--;           /* Remove non-match possibility */
1998            next_active_state--;
1999            }
2000          if (++count >= GET2(code, 1))
2001            { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2002          else
2003            { ADD_NEW_DATA(-state_offset, count, 0); }
2004          }
2005        }
2006      break;
2007
2008      /*-----------------------------------------------------------------*/
2009      case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2010      case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2011      case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2012      case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2013      if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2014        { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2015      count = current_state->count;  /* Number already matched */
2016      if (clen > 0)
2017        {
2018        BOOL OK;
2019        switch (c)
2020          {
2021          case 0x09:      /* HT */
2022          case 0x20:      /* SPACE */
2023          case 0xa0:      /* NBSP */
2024          case 0x1680:    /* OGHAM SPACE MARK */
2025          case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
2026          case 0x2000:    /* EN QUAD */
2027          case 0x2001:    /* EM QUAD */
2028          case 0x2002:    /* EN SPACE */
2029          case 0x2003:    /* EM SPACE */
2030          case 0x2004:    /* THREE-PER-EM SPACE */
2031          case 0x2005:    /* FOUR-PER-EM SPACE */
2032          case 0x2006:    /* SIX-PER-EM SPACE */
2033          case 0x2007:    /* FIGURE SPACE */
2034          case 0x2008:    /* PUNCTUATION SPACE */
2035          case 0x2009:    /* THIN SPACE */
2036          case 0x200A:    /* HAIR SPACE */
2037          case 0x202f:    /* NARROW NO-BREAK SPACE */
2038          case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
2039          case 0x3000:    /* IDEOGRAPHIC SPACE */
2040          OK = TRUE;
2041          break;
2042
2043          default:
2044          OK = FALSE;
2045          break;
2046          }
2047
2048        if (OK == (d == OP_HSPACE))
2049          {
2050          if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2051            {
2052            active_count--;           /* Remove non-match possibility */
2053            next_active_state--;
2054            }
2055          if (++count >= GET2(code, 1))
2056            { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2057          else
2058            { ADD_NEW_DATA(-state_offset, count, 0); }
2059          }
2060        }
2061      break;
2062
2063/* ========================================================================== */
2064      /* These opcodes are followed by a character that is usually compared
2065      to the current subject character; it is loaded into d. We still get
2066      here even if there is no subject character, because in some cases zero
2067      repetitions are permitted. */
2068
2069      /*-----------------------------------------------------------------*/
2070      case OP_CHAR:
2071      if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2072      break;
2073
2074      /*-----------------------------------------------------------------*/
      /* Single character, caseless. Nothing can match when there is no
      subject character. */

      case OP_CHARI:
      if (clen == 0) break;

#ifdef SUPPORT_UTF
      if (utf)
        {
        /* An exact match needs no case lookup. Otherwise the other case of
        the subject character is found and compared with d. */

        if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
          {
          unsigned int othercase;
          if (c < 128)
            othercase = fcc[c];
          else
            /* If we have Unicode property support, we can use it to test the
            other case of the character. */
#ifdef SUPPORT_UCP
            othercase = UCD_OTHERCASE(c);
#else
            othercase = NOTACHAR;
#endif

          if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
          }
        }
      else
#endif  /* SUPPORT_UTF */
      /* Not UTF mode */
        {
        /* Both characters are mapped through the lcc table (by TABLE_GET)
        before being compared; the following state is at a fixed distance of
        2 here. */

        if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
          { ADD_NEW(state_offset + 2, 0); }
        }
      break;
2106
2107
2108#ifdef SUPPORT_UCP
2109      /*-----------------------------------------------------------------*/
2110      /* This is a tricky one because it can match more than one character.
2111      Find out how many characters to skip, and then set up a negative state
2112      to wait for them to pass before continuing. */
2113
2114      case OP_EXTUNI:
2115      if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
2116        {
2117        const pcre_uchar *nptr = ptr + clen;
2118        int ncount = 0;
2119        while (nptr < end_subject)
2120          {
2121          int nclen = 1;
2122          GETCHARLEN(c, nptr, nclen);
2123          if (UCD_CATEGORY(c) != ucp_M) break;
2124          ncount++;
2125          nptr += nclen;
2126          }
2127        if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
2128            reset_could_continue = TRUE;
2129        ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2130        }
2131      break;
2132#endif
2133
2134      /*-----------------------------------------------------------------*/
      /* This is tricky like EXTUNI because it too can match more than one
      character (when CR is followed by LF). In this case, set up a negative
      state to wait for one character to pass before continuing. */
2138
2139      case OP_ANYNL:
2140      if (clen > 0) switch(c)
2141        {
2142        case 0x000b:
2143        case 0x000c:
2144        case 0x0085:
2145        case 0x2028:
2146        case 0x2029:
2147        if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2148
2149        case 0x000a:
2150        ADD_NEW(state_offset + 1, 0);
2151        break;
2152
2153        case 0x000d:
2154        if (ptr + 1 >= end_subject)
2155          {
2156          ADD_NEW(state_offset + 1, 0);
2157          if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
2158            reset_could_continue = TRUE;
2159          }
2160        else if (ptr[1] == 0x0a)
2161          {
2162          ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2163          }
2164        else
2165          {
2166          ADD_NEW(state_offset + 1, 0);
2167          }
2168        break;
2169        }
2170      break;
2171
2172      /*-----------------------------------------------------------------*/
2173      case OP_NOT_VSPACE:
2174      if (clen > 0) switch(c)
2175        {
2176        case 0x000a:
2177        case 0x000b:
2178        case 0x000c:
2179        case 0x000d:
2180        case 0x0085:
2181        case 0x2028:
2182        case 0x2029:
2183        break;
2184
2185        default:
2186        ADD_NEW(state_offset + 1, 0);
2187        break;
2188        }
2189      break;
2190
2191      /*-----------------------------------------------------------------*/
2192      case OP_VSPACE:
2193      if (clen > 0) switch(c)
2194        {
2195        case 0x000a:
2196        case 0x000b:
2197        case 0x000c:
2198        case 0x000d:
2199        case 0x0085:
2200        case 0x2028:
2201        case 0x2029:
2202        ADD_NEW(state_offset + 1, 0);
2203        break;
2204
2205        default: break;
2206        }
2207      break;
2208
2209      /*-----------------------------------------------------------------*/
2210      case OP_NOT_HSPACE:
2211      if (clen > 0) switch(c)
2212        {
2213        case 0x09:      /* HT */
2214        case 0x20:      /* SPACE */
2215        case 0xa0:      /* NBSP */
2216        case 0x1680:    /* OGHAM SPACE MARK */
2217        case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
2218        case 0x2000:    /* EN QUAD */
2219        case 0x2001:    /* EM QUAD */
2220        case 0x2002:    /* EN SPACE */
2221        case 0x2003:    /* EM SPACE */
2222        case 0x2004:    /* THREE-PER-EM SPACE */
2223        case 0x2005:    /* FOUR-PER-EM SPACE */
2224        case 0x2006:    /* SIX-PER-EM SPACE */
2225        case 0x2007:    /* FIGURE SPACE */
2226        case 0x2008:    /* PUNCTUATION SPACE */
2227        case 0x2009:    /* THIN SPACE */
2228        case 0x200A:    /* HAIR SPACE */
2229        case 0x202f:    /* NARROW NO-BREAK SPACE */
2230        case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
2231        case 0x3000:    /* IDEOGRAPHIC SPACE */
2232        break;
2233
2234        default:
2235        ADD_NEW(state_offset + 1, 0);
2236        break;
2237        }
2238      break;
2239
2240      /*-----------------------------------------------------------------*/
2241      case OP_HSPACE:
2242      if (clen > 0) switch(c)
2243        {
2244        case 0x09:      /* HT */
2245        case 0x20:      /* SPACE */
2246        case 0xa0:      /* NBSP */
2247        case 0x1680:    /* OGHAM SPACE MARK */
2248        case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
2249        case 0x2000:    /* EN QUAD */
2250        case 0x2001:    /* EM QUAD */
2251        case 0x2002:    /* EN SPACE */
2252        case 0x2003:    /* EM SPACE */
2253        case 0x2004:    /* THREE-PER-EM SPACE */
2254        case 0x2005:    /* FOUR-PER-EM SPACE */
2255        case 0x2006:    /* SIX-PER-EM SPACE */
2256        case 0x2007:    /* FIGURE SPACE */
2257        case 0x2008:    /* PUNCTUATION SPACE */
2258        case 0x2009:    /* THIN SPACE */
2259        case 0x200A:    /* HAIR SPACE */
2260        case 0x202f:    /* NARROW NO-BREAK SPACE */
2261        case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
2262        case 0x3000:    /* IDEOGRAPHIC SPACE */
2263        ADD_NEW(state_offset + 1, 0);
2264        break;
2265        }
2266      break;
2267
2268      /*-----------------------------------------------------------------*/
2269      /* Match a negated single character casefully. */
2270
2271      case OP_NOT:
2272      if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2273      break;
2274
2275      /*-----------------------------------------------------------------*/
2276      /* Match a negated single character caselessly. */
2277
2278      case OP_NOTI:
2279      if (clen > 0)
2280        {
2281        unsigned int otherd;
2282#ifdef SUPPORT_UTF
2283        if (utf && d >= 128)
2284          {
2285#ifdef SUPPORT_UCP
2286          otherd = UCD_OTHERCASE(d);
2287#endif  /* SUPPORT_UCP */
2288          }
2289        else
2290#endif  /* SUPPORT_UTF */
2291        otherd = TABLE_GET(d, fcc, d);
2292        if (c != d && c != otherd)
2293          { ADD_NEW(state_offset + dlen + 1, 0); }
2294        }
2295      break;
2296
2297      /*-----------------------------------------------------------------*/
2298      case OP_PLUSI:
2299      case OP_MINPLUSI:
2300      case OP_POSPLUSI:
2301      case OP_NOTPLUSI:
2302      case OP_NOTMINPLUSI:
2303      case OP_NOTPOSPLUSI:
2304      caseless = TRUE;
2305      codevalue -= OP_STARI - OP_STAR;
2306
2307      /* Fall through */
2308      case OP_PLUS:
2309      case OP_MINPLUS:
2310      case OP_POSPLUS:
2311      case OP_NOTPLUS:
2312      case OP_NOTMINPLUS:
2313      case OP_NOTPOSPLUS:
2314      count = current_state->count;  /* Already matched */
2315      if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2316      if (clen > 0)
2317        {
2318        unsigned int otherd = NOTACHAR;
2319        if (caseless)
2320          {
2321#ifdef SUPPORT_UTF
2322          if (utf && d >= 128)
2323            {
2324#ifdef SUPPORT_UCP
2325            otherd = UCD_OTHERCASE(d);
2326#endif  /* SUPPORT_UCP */
2327            }
2328          else
2329#endif  /* SUPPORT_UTF */
2330          otherd = TABLE_GET(d, fcc, d);
2331          }
2332        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2333          {
2334          if (count > 0 &&
2335              (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2336            {
2337            active_count--;             /* Remove non-match possibility */
2338            next_active_state--;
2339            }
2340          count++;
2341          ADD_NEW(state_offset, count);
2342          }
2343        }
2344      break;
2345
2346      /*-----------------------------------------------------------------*/
2347      case OP_QUERYI:
2348      case OP_MINQUERYI:
2349      case OP_POSQUERYI:
2350      case OP_NOTQUERYI:
2351      case OP_NOTMINQUERYI:
2352      case OP_NOTPOSQUERYI:
2353      caseless = TRUE;
2354      codevalue -= OP_STARI - OP_STAR;
2355      /* Fall through */
2356      case OP_QUERY:
2357      case OP_MINQUERY:
2358      case OP_POSQUERY:
2359      case OP_NOTQUERY:
2360      case OP_NOTMINQUERY:
2361      case OP_NOTPOSQUERY:
2362      ADD_ACTIVE(state_offset + dlen + 1, 0);
2363      if (clen > 0)
2364        {
2365        unsigned int otherd = NOTACHAR;
2366        if (caseless)
2367          {
2368#ifdef SUPPORT_UTF
2369          if (utf && d >= 128)
2370            {
2371#ifdef SUPPORT_UCP
2372            otherd = UCD_OTHERCASE(d);
2373#endif  /* SUPPORT_UCP */
2374            }
2375          else
2376#endif  /* SUPPORT_UTF */
2377          otherd = TABLE_GET(d, fcc, d);
2378          }
2379        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2380          {
2381          if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2382            {
2383            active_count--;            /* Remove non-match possibility */
2384            next_active_state--;
2385            }
2386          ADD_NEW(state_offset + dlen + 1, 0);
2387          }
2388        }
2389      break;
2390
2391      /*-----------------------------------------------------------------*/
2392      case OP_STARI:
2393      case OP_MINSTARI:
2394      case OP_POSSTARI:
2395      case OP_NOTSTARI:
2396      case OP_NOTMINSTARI:
2397      case OP_NOTPOSSTARI:
2398      caseless = TRUE;
2399      codevalue -= OP_STARI - OP_STAR;
2400      /* Fall through */
2401      case OP_STAR:
2402      case OP_MINSTAR:
2403      case OP_POSSTAR:
2404      case OP_NOTSTAR:
2405      case OP_NOTMINSTAR:
2406      case OP_NOTPOSSTAR:
2407      ADD_ACTIVE(state_offset + dlen + 1, 0);
2408      if (clen > 0)
2409        {
2410        unsigned int otherd = NOTACHAR;
2411        if (caseless)
2412          {
2413#ifdef SUPPORT_UTF
2414          if (utf && d >= 128)
2415            {
2416#ifdef SUPPORT_UCP
2417            otherd = UCD_OTHERCASE(d);
2418#endif  /* SUPPORT_UCP */
2419            }
2420          else
2421#endif  /* SUPPORT_UTF */
2422          otherd = TABLE_GET(d, fcc, d);
2423          }
2424        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2425          {
2426          if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2427            {
2428            active_count--;            /* Remove non-match possibility */
2429            next_active_state--;
2430            }
2431          ADD_NEW(state_offset, 0);
2432          }
2433        }
2434      break;
2435
2436      /*-----------------------------------------------------------------*/
2437      case OP_EXACTI:
2438      case OP_NOTEXACTI:
2439      caseless = TRUE;
2440      codevalue -= OP_STARI - OP_STAR;
2441      /* Fall through */
2442      case OP_EXACT:
2443      case OP_NOTEXACT:
2444      count = current_state->count;  /* Number already matched */
2445      if (clen > 0)
2446        {
2447        unsigned int otherd = NOTACHAR;
2448        if (caseless)
2449          {
2450#ifdef SUPPORT_UTF
2451          if (utf && d >= 128)
2452            {
2453#ifdef SUPPORT_UCP
2454            otherd = UCD_OTHERCASE(d);
2455#endif  /* SUPPORT_UCP */
2456            }
2457          else
2458#endif  /* SUPPORT_UTF */
2459          otherd = TABLE_GET(d, fcc, d);
2460          }
2461        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2462          {
2463          if (++count >= GET2(code, 1))
2464            { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2465          else
2466            { ADD_NEW(state_offset, count); }
2467          }
2468        }
2469      break;
2470
2471      /*-----------------------------------------------------------------*/
2472      case OP_UPTOI:
2473      case OP_MINUPTOI:
2474      case OP_POSUPTOI:
2475      case OP_NOTUPTOI:
2476      case OP_NOTMINUPTOI:
2477      case OP_NOTPOSUPTOI:
2478      caseless = TRUE;
2479      codevalue -= OP_STARI - OP_STAR;
2480      /* Fall through */
2481      case OP_UPTO:
2482      case OP_MINUPTO:
2483      case OP_POSUPTO:
2484      case OP_NOTUPTO:
2485      case OP_NOTMINUPTO:
2486      case OP_NOTPOSUPTO:
2487      ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2488      count = current_state->count;  /* Number already matched */
2489      if (clen > 0)
2490        {
2491        unsigned int otherd = NOTACHAR;
2492        if (caseless)
2493          {
2494#ifdef SUPPORT_UTF
2495          if (utf && d >= 128)
2496            {
2497#ifdef SUPPORT_UCP
2498            otherd = UCD_OTHERCASE(d);
2499#endif  /* SUPPORT_UCP */
2500            }
2501          else
2502#endif  /* SUPPORT_UTF */
2503          otherd = TABLE_GET(d, fcc, d);
2504          }
2505        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2506          {
2507          if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2508            {
2509            active_count--;             /* Remove non-match possibility */
2510            next_active_state--;
2511            }
2512          if (++count >= GET2(code, 1))
2513            { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2514          else
2515            { ADD_NEW(state_offset, count); }
2516          }
2517        }
2518      break;
2519
2520
2521/* ========================================================================== */
2522      /* These are the class-handling opcodes */
2523
2524      case OP_CLASS:
2525      case OP_NCLASS:
2526      case OP_XCLASS:
2527        {
2528        BOOL isinclass = FALSE;
2529        int next_state_offset;
2530        const pcre_uchar *ecode;
2531
2532        /* For a simple class, there is always just a 32-byte table, and we
2533        can set isinclass from it. */
2534
2535        if (codevalue != OP_XCLASS)
2536          {
2537          ecode = code + 1 + (32 / sizeof(pcre_uchar));
2538          if (clen > 0)
2539            {
2540            isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2541              ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2542            }
2543          }
2544
2545        /* An extended class may have a table or a list of single characters,
2546        ranges, or both, and it may be positive or negative. There's a
2547        function that sorts all this out. */
2548
2549        else
2550         {
2551         ecode = code + GET(code, 1);
2552         if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2553         }
2554
2555        /* At this point, isinclass is set for all kinds of class, and ecode
2556        points to the byte after the end of the class. If there is a
2557        quantifier, this is where it will be. */
2558
2559        next_state_offset = (int)(ecode - start_code);
2560
2561        switch (*ecode)
2562          {
2563          case OP_CRSTAR:
2564          case OP_CRMINSTAR:
2565          ADD_ACTIVE(next_state_offset + 1, 0);
2566          if (isinclass) { ADD_NEW(state_offset, 0); }
2567          break;
2568
2569          case OP_CRPLUS:
2570          case OP_CRMINPLUS:
2571          count = current_state->count;  /* Already matched */
2572          if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2573          if (isinclass) { count++; ADD_NEW(state_offset, count); }
2574          break;
2575
2576          case OP_CRQUERY:
2577          case OP_CRMINQUERY:
2578          ADD_ACTIVE(next_state_offset + 1, 0);
2579          if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2580          break;
2581
2582          case OP_CRRANGE:
2583          case OP_CRMINRANGE:
2584          count = current_state->count;  /* Already matched */
2585          if (count >= GET2(ecode, 1))
2586            { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2587          if (isinclass)
2588            {
2589            int max = GET2(ecode, 1 + IMM2_SIZE);
2590            if (++count >= max && max != 0)   /* Max 0 => no limit */
2591              { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2592            else
2593              { ADD_NEW(state_offset, count); }
2594            }
2595          break;
2596
2597          default:
2598          if (isinclass) { ADD_NEW(next_state_offset, 0); }
2599          break;
2600          }
2601        }
2602      break;
2603
2604/* ========================================================================== */
2605      /* These are the opcodes for fancy brackets of various kinds. We have
2606      to use recursion in order to handle them. The "always failing" assertion
2607      (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2608      though the other "backtracking verbs" are not supported. */
2609
2610      case OP_FAIL:
2611      forced_fail++;    /* Count FAILs for multiple states */
2612      break;
2613
2614      case OP_ASSERT:
2615      case OP_ASSERT_NOT:
2616      case OP_ASSERTBACK:
2617      case OP_ASSERTBACK_NOT:
2618        {
2619        int rc;
2620        int local_offsets[2];
2621        int local_workspace[1000];
2622        const pcre_uchar *endasscode = code + GET(code, 1);
2623
2624        while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2625
2626        rc = internal_dfa_exec(
2627          md,                                   /* static match data */
2628          code,                                 /* this subexpression's code */
2629          ptr,                                  /* where we currently are */
2630          (int)(ptr - start_subject),           /* start offset */
2631          local_offsets,                        /* offset vector */
2632          sizeof(local_offsets)/sizeof(int),    /* size of same */
2633          local_workspace,                      /* workspace vector */
2634          sizeof(local_workspace)/sizeof(int),  /* size of same */
2635          rlevel);                              /* function recursion level */
2636
2637        if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2638        if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2639            { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2640        }
2641      break;
2642
2643      /*-----------------------------------------------------------------*/
2644      case OP_COND:
2645      case OP_SCOND:
2646        {
2647        int local_offsets[1000];
2648        int local_workspace[1000];
2649        int codelink = GET(code, 1);
2650        int condcode;
2651
2652        /* Because of the way auto-callout works during compile, a callout item
2653        is inserted between OP_COND and an assertion condition. This does not
2654        happen for the other conditions. */
2655
2656        if (code[LINK_SIZE+1] == OP_CALLOUT)
2657          {
2658          rrc = 0;
2659          if (PUBL(callout) != NULL)
2660            {
2661            PUBL(callout_block) cb;
2662            cb.version          = 1;   /* Version 1 of the callout block */
2663            cb.callout_number   = code[LINK_SIZE+2];
2664            cb.offset_vector    = offsets;
2665#ifdef COMPILE_PCRE8
2666            cb.subject          = (PCRE_SPTR)start_subject;
2667#else
2668            cb.subject          = (PCRE_SPTR16)start_subject;
2669#endif
2670            cb.subject_length   = (int)(end_subject - start_subject);
2671            cb.start_match      = (int)(current_subject - start_subject);
2672            cb.current_position = (int)(ptr - start_subject);
2673            cb.pattern_position = GET(code, LINK_SIZE + 3);
2674            cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2675            cb.capture_top      = 1;
2676            cb.capture_last     = -1;
2677            cb.callout_data     = md->callout_data;
2678            cb.mark             = NULL;   /* No (*MARK) support */
2679            if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
2680            }
2681          if (rrc > 0) break;                      /* Fail this thread */
2682          code += PRIV(OP_lengths)[OP_CALLOUT];    /* Skip callout data */
2683          }
2684
2685        condcode = code[LINK_SIZE+1];
2686
2687        /* Back reference conditions are not supported */
2688
2689        if (condcode == OP_CREF || condcode == OP_NCREF)
2690          return PCRE_ERROR_DFA_UCOND;
2691
2692        /* The DEFINE condition is always false */
2693
2694        if (condcode == OP_DEF)
2695          { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2696
2697        /* The only supported version of OP_RREF is for the value RREF_ANY,
2698        which means "test if in any recursion". We can't test for specifically
2699        recursed groups. */
2700
2701        else if (condcode == OP_RREF || condcode == OP_NRREF)
2702          {
2703          int value = GET2(code, LINK_SIZE + 2);
2704          if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2705          if (md->recursive != NULL)
2706            { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2707          else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2708          }
2709
2710        /* Otherwise, the condition is an assertion */
2711
2712        else
2713          {
2714          int rc;
2715          const pcre_uchar *asscode = code + LINK_SIZE + 1;
2716          const pcre_uchar *endasscode = asscode + GET(asscode, 1);
2717
2718          while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2719
2720          rc = internal_dfa_exec(
2721            md,                                   /* fixed match data */
2722            asscode,                              /* this subexpression's code */
2723            ptr,                                  /* where we currently are */
2724            (int)(ptr - start_subject),           /* start offset */
2725            local_offsets,                        /* offset vector */
2726            sizeof(local_offsets)/sizeof(int),    /* size of same */
2727            local_workspace,                      /* workspace vector */
2728            sizeof(local_workspace)/sizeof(int),  /* size of same */
2729            rlevel);                              /* function recursion level */
2730
2731          if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2732          if ((rc >= 0) ==
2733                (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2734            { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2735          else
2736            { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2737          }
2738        }
2739      break;
2740
2741      /*-----------------------------------------------------------------*/
2742      case OP_RECURSE:
2743        {
2744        dfa_recursion_info *ri;
2745        int local_offsets[1000];
2746        int local_workspace[1000];
2747        const pcre_uchar *callpat = start_code + GET(code, 1);
2748        int recno = (callpat == md->start_code)? 0 :
2749          GET2(callpat, 1 + LINK_SIZE);
2750        int rc;
2751
2752        DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2753
2754        /* Check for repeating a recursion without advancing the subject
2755        pointer. This should catch convoluted mutual recursions. (Some simple
2756        cases are caught at compile time.) */
2757
2758        for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2759          if (recno == ri->group_num && ptr == ri->subject_position)
2760            return PCRE_ERROR_RECURSELOOP;
2761
2762        /* Remember this recursion and where we started it so as to
2763        catch infinite loops. */
2764
2765        new_recursive.group_num = recno;
2766        new_recursive.subject_position = ptr;
2767        new_recursive.prevrec = md->recursive;
2768        md->recursive = &new_recursive;
2769
2770        rc = internal_dfa_exec(
2771          md,                                   /* fixed match data */
2772          callpat,                              /* this subexpression's code */
2773          ptr,                                  /* where we currently are */
2774          (int)(ptr - start_subject),           /* start offset */
2775          local_offsets,                        /* offset vector */
2776          sizeof(local_offsets)/sizeof(int),    /* size of same */
2777          local_workspace,                      /* workspace vector */
2778          sizeof(local_workspace)/sizeof(int),  /* size of same */
2779          rlevel);                              /* function recursion level */
2780
2781        md->recursive = new_recursive.prevrec;  /* Done this recursion */
2782
2783        DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2784          rc));
2785
2786        /* Ran out of internal offsets */
2787
2788        if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2789
2790        /* For each successful matched substring, set up the next state with a
2791        count of characters to skip before trying it. Note that the count is in
2792        characters, not bytes. */
2793
2794        if (rc > 0)
2795          {
2796          for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2797            {
2798            int charcount = local_offsets[rc+1] - local_offsets[rc];
2799#ifdef SUPPORT_UTF
2800            if (utf)
2801              {
2802              const pcre_uchar *p = start_subject + local_offsets[rc];
2803              const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2804              while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2805              }
2806#endif
2807            if (charcount > 0)
2808              {
2809              ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2810              }
2811            else
2812              {
2813              ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2814              }
2815            }
2816          }
2817        else if (rc != PCRE_ERROR_NOMATCH) return rc;
2818        }
2819      break;
2820
2821      /*-----------------------------------------------------------------*/
2822      case OP_BRAPOS:
2823      case OP_SBRAPOS:
2824      case OP_CBRAPOS:
2825      case OP_SCBRAPOS:
2826      case OP_BRAPOSZERO:
2827        {
2828        int charcount, matched_count;
2829        const pcre_uchar *local_ptr = ptr;
2830        BOOL allow_zero;
2831
2832        if (codevalue == OP_BRAPOSZERO)
2833          {
2834          allow_zero = TRUE;
2835          codevalue = *(++code);  /* Codevalue will be one of above BRAs */
2836          }
2837        else allow_zero = FALSE;
2838
2839        /* Loop to match the subpattern as many times as possible as if it were
2840        a complete pattern. */
2841
2842        for (matched_count = 0;; matched_count++)
2843          {
2844          int local_offsets[2];
2845          int local_workspace[1000];
2846
2847          int rc = internal_dfa_exec(
2848            md,                                   /* fixed match data */
2849            code,                                 /* this subexpression's code */
2850            local_ptr,                            /* where we currently are */
2851            (int)(ptr - start_subject),           /* start offset */
2852            local_offsets,                        /* offset vector */
2853            sizeof(local_offsets)/sizeof(int),    /* size of same */
2854            local_workspace,                      /* workspace vector */
2855            sizeof(local_workspace)/sizeof(int),  /* size of same */
2856            rlevel);                              /* function recursion level */
2857
2858          /* Failed to match */
2859
2860          if (rc < 0)
2861            {
2862            if (rc != PCRE_ERROR_NOMATCH) return rc;
2863            break;
2864            }
2865
2866          /* Matched: break the loop if zero characters matched. */
2867
2868          charcount = local_offsets[1] - local_offsets[0];
2869          if (charcount == 0) break;
2870          local_ptr += charcount;    /* Advance temporary position ptr */
2871          }
2872
2873        /* At this point we have matched the subpattern matched_count
2874        times, and local_ptr is pointing to the character after the end of the
2875        last match. */
2876
2877        if (matched_count > 0 || allow_zero)
2878          {
2879          const pcre_uchar *end_subpattern = code;
2880          int next_state_offset;
2881
2882          do { end_subpattern += GET(end_subpattern, 1); }
2883            while (*end_subpattern == OP_ALT);
2884          next_state_offset =
2885            (int)(end_subpattern - start_code + LINK_SIZE + 1);
2886
2887          /* Optimization: if there are no more active states, and there
2888          are no new states yet set up, then skip over the subject string
2889          right here, to save looping. Otherwise, set up the new state to swing
2890          into action when the end of the matched substring is reached. */
2891
2892          if (i + 1 >= active_count && new_count == 0)
2893            {
2894            ptr = local_ptr;
2895            clen = 0;
2896            ADD_NEW(next_state_offset, 0);
2897            }
2898          else
2899            {
2900            const pcre_uchar *p = ptr;
2901            const pcre_uchar *pp = local_ptr;
2902            charcount = (int)(pp - p);
2903#ifdef SUPPORT_UTF
2904            if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2905#endif
2906            ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2907            }
2908          }
2909        }
2910      break;
2911
2912      /*-----------------------------------------------------------------*/
2913      case OP_ONCE:
2914      case OP_ONCE_NC:
2915        {
2916        int local_offsets[2];
2917        int local_workspace[1000];
2918
2919        int rc = internal_dfa_exec(
2920          md,                                   /* fixed match data */
2921          code,                                 /* this subexpression's code */
2922          ptr,                                  /* where we currently are */
2923          (int)(ptr - start_subject),           /* start offset */
2924          local_offsets,                        /* offset vector */
2925          sizeof(local_offsets)/sizeof(int),    /* size of same */
2926          local_workspace,                      /* workspace vector */
2927          sizeof(local_workspace)/sizeof(int),  /* size of same */
2928          rlevel);                              /* function recursion level */
2929
2930        if (rc >= 0)
2931          {
2932          const pcre_uchar *end_subpattern = code;
2933          int charcount = local_offsets[1] - local_offsets[0];
2934          int next_state_offset, repeat_state_offset;
2935
2936          do { end_subpattern += GET(end_subpattern, 1); }
2937            while (*end_subpattern == OP_ALT);
2938          next_state_offset =
2939            (int)(end_subpattern - start_code + LINK_SIZE + 1);
2940
2941          /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2942          arrange for the repeat state also to be added to the relevant list.
2943          Calculate the offset, or set -1 for no repeat. */
2944
2945          repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2946                                 *end_subpattern == OP_KETRMIN)?
2947            (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2948
2949          /* If we have matched an empty string, add the next state at the
2950          current character pointer. This is important so that the duplicate
2951          checking kicks in, which is what breaks infinite loops that match an
2952          empty string. */
2953
2954          if (charcount == 0)
2955            {
2956            ADD_ACTIVE(next_state_offset, 0);
2957            }
2958
2959          /* Optimization: if there are no more active states, and there
2960          are no new states yet set up, then skip over the subject string
2961          right here, to save looping. Otherwise, set up the new state to swing
2962          into action when the end of the matched substring is reached. */
2963
2964          else if (i + 1 >= active_count && new_count == 0)
2965            {
2966            ptr += charcount;
2967            clen = 0;
2968            ADD_NEW(next_state_offset, 0);
2969
2970            /* If we are adding a repeat state at the new character position,
2971            we must fudge things so that it is the only current state.
2972            Otherwise, it might be a duplicate of one we processed before, and
2973            that would cause it to be skipped. */
2974
2975            if (repeat_state_offset >= 0)
2976              {
2977              next_active_state = active_states;
2978              active_count = 0;
2979              i = -1;
2980              ADD_ACTIVE(repeat_state_offset, 0);
2981              }
2982            }
2983          else
2984            {
2985#ifdef SUPPORT_UTF
2986            if (utf)
2987              {
2988              const pcre_uchar *p = start_subject + local_offsets[0];
2989              const pcre_uchar *pp = start_subject + local_offsets[1];
2990              while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2991              }
2992#endif
2993            ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2994            if (repeat_state_offset >= 0)
2995              { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2996            }
2997          }
2998        else if (rc != PCRE_ERROR_NOMATCH) return rc;
2999        }
3000      break;
3001
3002
3003/* ========================================================================== */
3004      /* Handle callouts */
3005
3006      case OP_CALLOUT:
3007      rrc = 0;
3008      if (PUBL(callout) != NULL)
3009        {
3010        PUBL(callout_block) cb;
3011        cb.version          = 1;   /* Version 1 of the callout block */
3012        cb.callout_number   = code[1];
3013        cb.offset_vector    = offsets;
3014#ifdef COMPILE_PCRE8
3015        cb.subject          = (PCRE_SPTR)start_subject;
3016#else
3017        cb.subject          = (PCRE_SPTR16)start_subject;
3018#endif
3019        cb.subject_length   = (int)(end_subject - start_subject);
3020        cb.start_match      = (int)(current_subject - start_subject);
3021        cb.current_position = (int)(ptr - start_subject);
3022        cb.pattern_position = GET(code, 2);
3023        cb.next_item_length = GET(code, 2 + LINK_SIZE);
3024        cb.capture_top      = 1;
3025        cb.capture_last     = -1;
3026        cb.callout_data     = md->callout_data;
3027        cb.mark             = NULL;   /* No (*MARK) support */
3028        if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
3029        }
3030      if (rrc == 0)
3031        { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
3032      break;
3033
3034
3035/* ========================================================================== */
3036      default:        /* Unsupported opcode */
3037      return PCRE_ERROR_DFA_UITEM;
3038      }
3039
3040    NEXT_ACTIVE_STATE: continue;
3041
3042    }      /* End of loop scanning active states */
3043
3044  /* We have finished the processing at the current subject character. If no
3045  new states have been set for the next character, we have found all the
3046  matches that we are going to find. If we are at the top level and partial
3047  matching has been requested, check for appropriate conditions.
3048
3049  The "forced_fail" variable counts the number of (*F) encountered for the
3050  character. If it is equal to the original active_count (saved in
3051  workspace[1]) it means that (*F) was found on every active state. In this
3052  case we don't want to give a partial match.
3053
3054  The "could_continue" variable is true if a state could have continued but
3055  for the fact that the end of the subject was reached. */
3056
3057  if (new_count <= 0)
3058    {
3059    if (rlevel == 1 &&                               /* Top level, and */
3060        could_continue &&                            /* Some could go on, and */
3061        forced_fail != workspace[1] &&               /* Not all forced fail & */
3062        (                                            /* either... */
3063        (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
3064        ||                                           /* or... */
3065        ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
3066         match_count < 0)                            /* no matches */
3067        ) &&                                         /* And... */
3068        (
3069        partial_newline ||                           /* Either partial NL */
3070          (                                          /* or ... */
3071          ptr >= end_subject &&                /* End of subject and */
3072          ptr > md->start_used_ptr)            /* Inspected non-empty string */
3073          )
3074        )
3075      {
3076      if (offsetcount >= 2)
3077        {
3078        offsets[0] = (int)(md->start_used_ptr - start_subject);
3079        offsets[1] = (int)(end_subject - start_subject);
3080        }
3081      match_count = PCRE_ERROR_PARTIAL;
3082      }
3083
3084    DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
3085      "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
3086      rlevel*2-2, SP));
3087    break;        /* In effect, "return", but see the comment below */
3088    }
3089
3090  /* One or more states are active for the next character. */
3091
3092  ptr += clen;    /* Advance to next subject character */
3093  }               /* Loop to move along the subject string */
3094
3095/* Control gets here from "break" a few lines above. We do it this way because
3096if we use "return" above, we have compiler trouble. Some compilers warn if
3097there's nothing here because they think the function doesn't return a value. On
3098the other hand, if we put a dummy statement here, some more clever compilers
3099complain that it can't be reached. Sigh. */
3100
3101return match_count;
3102}
3103
3104
3105
3106
3107/*************************************************
3108*    Execute a Regular Expression - DFA engine   *
3109*************************************************/
3110
3111/* This external function applies a compiled re to a subject string using a DFA
3112engine. This function calls the internal function multiple times if the pattern
3113is not anchored.
3114
3115Arguments:
3116  argument_re     points to the compiled expression
3117  extra_data      points to extra data or is NULL
3118  subject         points to the subject string
3119  length          length of subject string (may contain binary zeros)
3120  start_offset    where to start in the subject string
3121  options         option bits
3122  offsets         vector of match offsets
3123  offsetcount     size of same
3124  workspace       workspace vector
3125  wscount         size of same
3126
3127Returns:          > 0 => number of match offset pairs placed in offsets
3128                  = 0 => offsets overflowed; longest matches are present
3129                   -1 => failed to match
3130                 < -1 => some kind of unexpected problem
3131*/
3132
3133#ifdef COMPILE_PCRE8
3134PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3135pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3136  const char *subject, int length, int start_offset, int options, int *offsets,
3137  int offsetcount, int *workspace, int wscount)
3138#else
3139PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3140pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
3141  PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
3142  int offsetcount, int *workspace, int wscount)
3143#endif
3144{
3145REAL_PCRE *re = (REAL_PCRE *)argument_re;
3146dfa_match_data match_block;
3147dfa_match_data *md = &match_block;
3148BOOL utf, anchored, startline, firstline;
3149const pcre_uchar *current_subject, *end_subject;
3150const pcre_study_data *study = NULL;
3151
3152const pcre_uchar *req_char_ptr;
3153const pcre_uint8 *start_bits = NULL;
3154BOOL has_first_char = FALSE;
3155BOOL has_req_char = FALSE;
3156pcre_uchar first_char = 0;
3157pcre_uchar first_char2 = 0;
3158pcre_uchar req_char = 0;
3159pcre_uchar req_char2 = 0;
3160int newline;
3161
3162/* Plausibility checks */
3163
3164if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3165if (re == NULL || subject == NULL || workspace == NULL ||
3166   (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3167if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3168if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3169if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3170
3171/* Check that the first field in the block is the magic number. If it is not,
3172return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
3173REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
3174means that the pattern is likely compiled with different endianness. */
3175
3176if (re->magic_number != MAGIC_NUMBER)
3177  return re->magic_number == REVERSED_MAGIC_NUMBER?
3178    PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
3179if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
3180
3181/* If restarting after a partial match, do some sanity checks on the contents
3182of the workspace. */
3183
3184if ((options & PCRE_DFA_RESTART) != 0)
3185  {
3186  if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3187    workspace[1] > (wscount - 2)/INTS_PER_STATEBLOCK)
3188      return PCRE_ERROR_DFA_BADRESTART;
3189  }
3190
3191/* Set up study, callout, and table data */
3192
3193md->tables = re->tables;
3194md->callout_data = NULL;
3195
3196if (extra_data != NULL)
3197  {
3198  unsigned int flags = extra_data->flags;
3199  if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3200    study = (const pcre_study_data *)extra_data->study_data;
3201  if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
3202  if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3203    return PCRE_ERROR_DFA_UMLIMIT;
3204  if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3205    md->callout_data = extra_data->callout_data;
3206  if ((flags & PCRE_EXTRA_TABLES) != 0)
3207    md->tables = extra_data->tables;
3208  }
3209
3210/* Set some local values */
3211
3212current_subject = (const pcre_uchar *)subject + start_offset;
3213end_subject = (const pcre_uchar *)subject + length;
3214req_char_ptr = current_subject - 1;
3215
3216#ifdef SUPPORT_UTF
3217/* PCRE_UTF16 has the same value as PCRE_UTF8. */
3218utf = (re->options & PCRE_UTF8) != 0;
3219#else
3220utf = FALSE;
3221#endif
3222
3223anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
3224  (re->options & PCRE_ANCHORED) != 0;
3225
3226/* The remaining fixed data for passing around. */
3227
3228md->start_code = (const pcre_uchar *)argument_re +
3229    re->name_table_offset + re->name_count * re->name_entry_size;
3230md->start_subject = (const pcre_uchar *)subject;
3231md->end_subject = end_subject;
3232md->start_offset = start_offset;
3233md->moptions = options;
3234md->poptions = re->options;
3235
3236/* If the BSR option is not set at match time, copy what was set
3237at compile time. */
3238
3239if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3240  {
3241  if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3242    md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3243#ifdef BSR_ANYCRLF
3244  else md->moptions |= PCRE_BSR_ANYCRLF;
3245#endif
3246  }
3247
3248/* Handle different types of newline. The three bits give eight cases. If
3249nothing is set at run time, whatever was used at compile time applies. */
3250
3251switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3252         PCRE_NEWLINE_BITS)
3253  {
3254  case 0: newline = NEWLINE; break;   /* Compile-time default */
3255  case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3256  case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3257  case PCRE_NEWLINE_CR+
3258       PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3259  case PCRE_NEWLINE_ANY: newline = -1; break;
3260  case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3261  default: return PCRE_ERROR_BADNEWLINE;
3262  }
3263
3264if (newline == -2)
3265  {
3266  md->nltype = NLTYPE_ANYCRLF;
3267  }
3268else if (newline < 0)
3269  {
3270  md->nltype = NLTYPE_ANY;
3271  }
3272else
3273  {
3274  md->nltype = NLTYPE_FIXED;
3275  if (newline > 255)
3276    {
3277    md->nllen = 2;
3278    md->nl[0] = (newline >> 8) & 255;
3279    md->nl[1] = newline & 255;
3280    }
3281  else
3282    {
3283    md->nllen = 1;
3284    md->nl[0] = newline;
3285    }
3286  }
3287
3288/* Check a UTF string if required. On failure, the error offset and the error
3289code are passed back in offsets[0] and offsets[1] when offsetcount >= 2. */
3290
3291#ifdef SUPPORT_UTF
3292if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
3293  {
3294  int erroroffset;
3295  int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
3296  if (errorcode != 0)
3297    {
3298    if (offsetcount >= 2)
3299      {
3300      offsets[0] = erroroffset;
3301      offsets[1] = errorcode;
3302      }
3303    return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
3304      PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3305    }
3306  if (start_offset > 0 && start_offset < length &&
3307        NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
3308    return PCRE_ERROR_BADUTF8_OFFSET;
3309  }
3310#endif
3311
3312/* If the exec call supplied NULL for tables, use the inbuilt ones. This
3313is a feature that makes it possible to save compiled regex and re-use them
3314in other programs later. */
3315
3316if (md->tables == NULL) md->tables = PRIV(default_tables);
3317
3318/* The "must be at the start of a line" flags are used in a loop when finding
3319where to start. */
3320
3321startline = (re->flags & PCRE_STARTLINE) != 0;
3322firstline = (re->options & PCRE_FIRSTLINE) != 0;
3323
3324/* Set up the first character to match, if available. The first_char value is
3325never set for an anchored regular expression, but the anchoring may be forced
3326at run time, so we have to test for anchoring. The first char may be unset for
3327an unanchored pattern, of course. If there's no first char and the pattern was
3328studied, there may be a bitmap of possible first characters. */
3329
3330if (!anchored)
3331  {
3332  if ((re->flags & PCRE_FIRSTSET) != 0)
3333    {
3334    has_first_char = TRUE;
3335    first_char = first_char2 = (pcre_uchar)(re->first_char);
3336    if ((re->flags & PCRE_FCH_CASELESS) != 0)
3337      {
3338      first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
3339#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3340      if (utf && first_char > 127)
3341        first_char2 = UCD_OTHERCASE(first_char);
3342#endif
3343      }
3344    }
3345  else
3346    {
3347    if (!startline && study != NULL &&
3348         (study->flags & PCRE_STUDY_MAPPED) != 0)
3349      start_bits = study->start_bits;
3350    }
3351  }
3352
3353/* For anchored or unanchored matches, there may be a "last known required
3354character" set. */
3355
3356if ((re->flags & PCRE_REQCHSET) != 0)
3357  {
3358  has_req_char = TRUE;
3359  req_char = req_char2 = (pcre_uchar)(re->req_char);
3360  if ((re->flags & PCRE_RCH_CASELESS) != 0)
3361    {
3362    req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
3363#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3364    if (utf && req_char > 127)
3365      req_char2 = UCD_OTHERCASE(req_char);
3366#endif
3367    }
3368  }
3369
3370/* Call the main matching function, looping for a non-anchored regex after a
3371failed match. If not restarting, perform certain optimizations at the start of
3372a match. */
3373
3374for (;;)
3375  {
3376  int rc;
3377
3378  if ((options & PCRE_DFA_RESTART) == 0)
3379    {
3380    const pcre_uchar *save_end_subject = end_subject;
3381
3382    /* If firstline is TRUE, the start of the match is constrained to the first
3383    line of a multiline string. Implement this by temporarily adjusting
3384    end_subject so that we stop scanning at a newline. If the match fails at
3385    the newline, later code breaks this loop. */
3386
3387    if (firstline)
3388      {
3389      PCRE_PUCHAR t = current_subject;
3390#ifdef SUPPORT_UTF
3391      if (utf)
3392        {
3393        while (t < md->end_subject && !IS_NEWLINE(t))
3394          {
3395          t++;
3396          ACROSSCHAR(t < end_subject, *t, t++);
3397          }
3398        }
3399      else
3400#endif
3401      while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3402      end_subject = t;
3403      }
3404
3405    /* There are some optimizations that avoid running the match if a known
3406    starting point is not found. However, there is an option that disables
3407    these, for testing and for ensuring that all callouts do actually occur.
3408    The option can be set in the regex by (*NO_START_OPT) or passed in
3409    match-time options. */
3410
3411    if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3412      {
3413      /* Advance to a known first char. */
3414
3415      if (has_first_char)
3416        {
3417        if (first_char != first_char2)
3418          while (current_subject < end_subject &&
3419              *current_subject != first_char && *current_subject != first_char2)
3420            current_subject++;
3421        else
3422          while (current_subject < end_subject &&
3423                 *current_subject != first_char)
3424            current_subject++;
3425        }
3426
3427      /* Or to just after a linebreak for a multiline match if possible */
3428
3429      else if (startline)
3430        {
3431        if (current_subject > md->start_subject + start_offset)
3432          {
3433#ifdef SUPPORT_UTF
3434          if (utf)
3435            {
3436            while (current_subject < end_subject &&
3437                   !WAS_NEWLINE(current_subject))
3438              {
3439              current_subject++;
3440              ACROSSCHAR(current_subject < end_subject, *current_subject,
3441                current_subject++);
3442              }
3443            }
3444          else
3445#endif
3446          while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3447            current_subject++;
3448
3449          /* If we have just passed a CR and the newline option is ANY or
3450          ANYCRLF, and we are now at a LF, advance the match position by one
3451          more character. */
3452
3453          if (current_subject[-1] == CHAR_CR &&
3454               (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3455               current_subject < end_subject &&
3456               *current_subject == CHAR_NL)
3457            current_subject++;
3458          }
3459        }
3460
3461      /* Or to a non-unique first char after study */
3462
3463      else if (start_bits != NULL)
3464        {
3465        while (current_subject < end_subject)
3466          {
3467          register unsigned int c = *current_subject;
3468#ifndef COMPILE_PCRE8
3469          if (c > 255) c = 255;
3470#endif
3471          if ((start_bits[c/8] & (1 << (c&7))) == 0)
3472            {
3473            current_subject++;
3474#if defined SUPPORT_UTF && defined COMPILE_PCRE8
3475            /* In non 8-bit mode, the iteration will stop for
3476            characters > 255 at the beginning or not stop at all. */
3477            if (utf)
3478              ACROSSCHAR(current_subject < end_subject, *current_subject,
3479                current_subject++);
3480#endif
3481            }
3482          else break;
3483          }
3484        }
3485      }
3486
3487    /* Restore fudged end_subject */
3488
3489    end_subject = save_end_subject;
3490
3491    /* The following two optimizations are disabled for partial matching or if
3492    disabling is explicitly requested (and of course, by the test above, this
3493    code is not obeyed when restarting after a partial match). */
3494
3495    if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3496        (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3497      {
3498      /* If the pattern was studied, a minimum subject length may be set. This
3499      is a lower bound; there may be no string of exactly that length that
3500      matches the pattern. Although the value is, strictly, in characters, we
3501      treat it as bytes to avoid spending too much time in this optimization. */
3502
3503      if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3504          (pcre_uint32)(end_subject - current_subject) < study->minlength)
3505        return PCRE_ERROR_NOMATCH;
3506
3507      /* If req_char is set, we know that that character must appear in the
3508      subject for the match to succeed. If the first character is set, req_char
3509      must be later in the subject; otherwise the test starts at the match
3510      point. This optimization can save a huge amount of work in patterns with
3511      nested unlimited repeats that aren't going to match. Writing separate
3512      code for cased/caseless versions makes it go faster, as does using an
3513      autoincrement and backing off on a match.
3514
3515      HOWEVER: when the subject string is very, very long, searching to its end
3516      can take a long time, and give bad performance on quite ordinary
3517      patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3518      string... so we don't do this when the string is sufficiently long. */
3519
3520      if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
3521        {
3522        register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
3523
3524        /* We don't need to repeat the search if we haven't yet reached the
3525        place we found it at last time. */
3526
3527        if (p > req_char_ptr)
3528          {
3529          if (req_char != req_char2)
3530            {
3531            while (p < end_subject)
3532              {
3533              register int pp = *p++;
3534              if (pp == req_char || pp == req_char2) { p--; break; }
3535              }
3536            }
3537          else
3538            {
3539            while (p < end_subject)
3540              {
3541              if (*p++ == req_char) { p--; break; }
3542              }
3543            }
3544
3545          /* If we can't find the required character, break the matching loop,
3546          which will cause a return of PCRE_ERROR_NOMATCH. */
3547
3548          if (p >= end_subject) break;
3549
3550          /* If we have found the required character, save the point where we
3551          found it, so that we don't search again next time round the loop if
3552          the start hasn't passed this character yet. */
3553
3554          req_char_ptr = p;
3555          }
3556        }
3557      }
3558    }   /* End of optimizations that are done when not restarting */
3559
3560  /* OK, now we can do the business */
3561
3562  md->start_used_ptr = current_subject;
3563  md->recursive = NULL;
3564
3565  rc = internal_dfa_exec(
3566    md,                                /* fixed match data */
3567    md->start_code,                    /* this subexpression's code */
3568    current_subject,                   /* where we currently are */
3569    start_offset,                      /* start offset in subject */
3570    offsets,                           /* offset vector */
3571    offsetcount,                       /* size of same */
3572    workspace,                         /* workspace vector */
3573    wscount,                           /* size of same */
3574    0);                                /* function recurse level */
3575
3576  /* Anything other than "no match" means we are done, always; otherwise, carry
3577  on only if not anchored. */
3578
3579  if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
3580
3581  /* Advance to the next subject character unless we are at the end of a line
3582  and firstline is set. */
3583
3584  if (firstline && IS_NEWLINE(current_subject)) break;
3585  current_subject++;
3586#ifdef SUPPORT_UTF
3587  if (utf)
3588    {
3589    ACROSSCHAR(current_subject < end_subject, *current_subject,
3590      current_subject++);
3591    }
3592#endif
3593  if (current_subject > end_subject) break;
3594
3595  /* If we have just passed a CR and we are now at a LF, and the pattern does
3596  not contain any explicit matches for \r or \n, and the newline option is CRLF
3597  or ANY or ANYCRLF, advance the match position by one more character. */
3598
3599  if (current_subject[-1] == CHAR_CR &&
3600      current_subject < end_subject &&
3601      *current_subject == CHAR_NL &&
3602      (re->flags & PCRE_HASCRORLF) == 0 &&
3603        (md->nltype == NLTYPE_ANY ||
3604         md->nltype == NLTYPE_ANYCRLF ||
3605         md->nllen == 2))
3606    current_subject++;
3607
3608  }   /* "Bumpalong" loop */
3609
3610return PCRE_ERROR_NOMATCH;
3611}
3612
3613/* End of pcre_dfa_exec.c */
3614