1/*************************************************
2*      Perl-Compatible Regular Expressions       *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language (but see
7below for why this module is different).
8
9                       Written by Philip Hazel
10           Copyright (c) 1997-2010 University of Cambridge
11
12-----------------------------------------------------------------------------
13Redistribution and use in source and binary forms, with or without
14modification, are permitted provided that the following conditions are met:
15
16    * Redistributions of source code must retain the above copyright notice,
17      this list of conditions and the following disclaimer.
18
19    * Redistributions in binary form must reproduce the above copyright
20      notice, this list of conditions and the following disclaimer in the
21      documentation and/or other materials provided with the distribution.
22
23    * Neither the name of the University of Cambridge nor the names of its
24      contributors may be used to endorse or promote products derived from
25      this software without specific prior written permission.
26
27THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37POSSIBILITY OF SUCH DAMAGE.
38-----------------------------------------------------------------------------
39*/
40
41
/* This module contains the external function pcre_dfa_exec(), which is an
alternative matching function that uses a sort of DFA algorithm (not a true
FSM). This is NOT Perl-compatible, but it has advantages in certain
applications. */
46
47
48/* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49the performance of his patterns greatly. I could not use it as it stood, as it
50was not thread safe, and made assumptions about pattern sizes. Also, it caused
51test 7 to loop, and test 9 to crash with a segfault.
52
53The issue is the check for duplicate states, which is done by a simple linear
54search up the state list. (Grep for "duplicate" below to find the code.) For
55many patterns, there will never be many states active at one time, so a simple
56linear search is fine. In patterns that have many active states, it might be a
57bottleneck. The suggested code used an indexing scheme to remember which states
58had previously been used for each character, and avoided the linear search when
59it knew there was no chance of a duplicate. This was implemented when adding
60states to the state lists.
61
62I wrote some thread-safe, not-limited code to try something similar at the time
63of checking for duplicates (instead of when adding states), using index vectors
64on the stack. It did give a 13% improvement with one specially constructed
65pattern for certain subject strings, but on other strings and on many of the
66simpler patterns in the test suite it did worse. The major problem, I think,
67was the extra time to initialize the index. This had to be done for each call
68of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69only once - I suspect this was the cause of the problems with the tests.)
70
71Overall, I concluded that the gains in some cases did not outweigh the losses
72in others, so I abandoned this code. */
73
74
75
76#ifdef HAVE_CONFIG_H
77#include "config.h"
78#endif
79
80#define NLBLOCK md             /* Block containing newline information */
81#define PSSTART start_subject  /* Field containing processed string start */
82#define PSEND   end_subject    /* Field containing processed string end */
83
84#include "pcre_internal.h"
85
86
/* For use to indent debugging output. The DPRINTF calls in this module print a
leading portion of this string of spaces by passing it to a "%.*s" conversion
with rlevel*2-2 as the precision. */

#define SP "                   "
90
91
92/*************************************************
93*      Code parameters and static tables         *
94*************************************************/
95
/* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
into others, under special conditions. A gap of 20 between the blocks should be
enough. The resulting opcodes don't have to be less than 256 because they are
never stored, so we push them well clear of the normal opcodes. */

#define OP_PROP_EXTRA       300
#define OP_EXTUNI_EXTRA     320
#define OP_ANYNL_EXTRA      340
#define OP_HSPACE_EXTRA     360
#define OP_VSPACE_EXTRA     380
106
107
108/* This table identifies those opcodes that are followed immediately by a
109character that is to be tested in some way. This makes is possible to
110centralize the loading of these characters. In the case of Type * etc, the
111"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112small value. Non-zero values in the table are the offsets from the opcode where
113the character is to be found. ***NOTE*** If the start of this table is
114modified, the three tables that follow must also be modified. */
115
116static const uschar coptable[] = {
117  0,                             /* End                                    */
118  0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
119  0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
120  0, 0, 0,                       /* Any, AllAny, Anybyte                   */
121  0, 0,                          /* \P, \p                                 */
122  0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
123  0,                             /* \X                                     */
124  0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */
125  1,                             /* Char                                   */
126  1,                             /* Charnc                                 */
127  1,                             /* not                                    */
128  /* Positive single-char repeats                                          */
129  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
130  3, 3, 3,                       /* upto, minupto, exact                   */
131  1, 1, 1, 3,                    /* *+, ++, ?+, upto+                      */
132  /* Negative single-char repeats - only for chars < 256                   */
133  1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
134  3, 3, 3,                       /* NOT upto, minupto, exact               */
135  1, 1, 1, 3,                    /* NOT *+, ++, ?+, updo+                  */
136  /* Positive type repeats                                                 */
137  1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
138  3, 3, 3,                       /* Type upto, minupto, exact              */
139  1, 1, 1, 3,                    /* Type *+, ++, ?+, upto+                 */
140  /* Character class & ref repeats                                         */
141  0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
142  0, 0,                          /* CRRANGE, CRMINRANGE                    */
143  0,                             /* CLASS                                  */
144  0,                             /* NCLASS                                 */
145  0,                             /* XCLASS - variable length               */
146  0,                             /* REF                                    */
147  0,                             /* RECURSE                                */
148  0,                             /* CALLOUT                                */
149  0,                             /* Alt                                    */
150  0,                             /* Ket                                    */
151  0,                             /* KetRmax                                */
152  0,                             /* KetRmin                                */
153  0,                             /* Assert                                 */
154  0,                             /* Assert not                             */
155  0,                             /* Assert behind                          */
156  0,                             /* Assert behind not                      */
157  0,                             /* Reverse                                */
158  0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */
159  0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */
160  0, 0,                          /* CREF, NCREF                            */
161  0, 0,                          /* RREF, NRREF                            */
162  0,                             /* DEF                                    */
163  0, 0,                          /* BRAZERO, BRAMINZERO                    */
164  0, 0, 0, 0,                    /* PRUNE, SKIP, THEN, COMMIT              */
165  0, 0, 0, 0                     /* FAIL, ACCEPT, CLOSE, SKIPZERO          */
166};
167
168/* This table identifies those opcodes that inspect a character. It is used to
169remember the fact that a character could have been inspected when the end of
170the subject is reached. ***NOTE*** If the start of this table is modified, the
171two tables that follow must also be modified. */
172
173static const uschar poptable[] = {
174  0,                             /* End                                    */
175  0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
176  1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
177  1, 1, 1,                       /* Any, AllAny, Anybyte                   */
178  1, 1,                          /* \P, \p                                 */
179  1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
180  1,                             /* \X                                     */
181  0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */
182  1,                             /* Char                                   */
183  1,                             /* Charnc                                 */
184  1,                             /* not                                    */
185  /* Positive single-char repeats                                          */
186  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
187  1, 1, 1,                       /* upto, minupto, exact                   */
188  1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
189  /* Negative single-char repeats - only for chars < 256                   */
190  1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
191  1, 1, 1,                       /* NOT upto, minupto, exact               */
192  1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
193  /* Positive type repeats                                                 */
194  1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
195  1, 1, 1,                       /* Type upto, minupto, exact              */
196  1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
197  /* Character class & ref repeats                                         */
198  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
199  1, 1,                          /* CRRANGE, CRMINRANGE                    */
200  1,                             /* CLASS                                  */
201  1,                             /* NCLASS                                 */
202  1,                             /* XCLASS - variable length               */
203  0,                             /* REF                                    */
204  0,                             /* RECURSE                                */
205  0,                             /* CALLOUT                                */
206  0,                             /* Alt                                    */
207  0,                             /* Ket                                    */
208  0,                             /* KetRmax                                */
209  0,                             /* KetRmin                                */
210  0,                             /* Assert                                 */
211  0,                             /* Assert not                             */
212  0,                             /* Assert behind                          */
213  0,                             /* Assert behind not                      */
214  0,                             /* Reverse                                */
215  0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */
216  0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */
217  0, 0,                          /* CREF, NCREF                            */
218  0, 0,                          /* RREF, NRREF                            */
219  0,                             /* DEF                                    */
220  0, 0,                          /* BRAZERO, BRAMINZERO                    */
221  0, 0, 0, 0,                    /* PRUNE, SKIP, THEN, COMMIT              */
222  0, 0, 0, 0                     /* FAIL, ACCEPT, CLOSE, SKIPZERO          */
223};
224
225/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
226and \w */
227
228static const uschar toptable1[] = {
229  0, 0, 0, 0, 0, 0,
230  ctype_digit, ctype_digit,
231  ctype_space, ctype_space,
232  ctype_word,  ctype_word,
233  0, 0                            /* OP_ANY, OP_ALLANY */
234};
235
236static const uschar toptable2[] = {
237  0, 0, 0, 0, 0, 0,
238  ctype_digit, 0,
239  ctype_space, 0,
240  ctype_word,  0,
241  1, 1                            /* OP_ANY, OP_ALLANY */
242};
243
244
/* Structure for holding data about a particular state, which is in effect the
current data for an active path through the match tree. It must consist
entirely of ints because the working vector we are passed, and which we put
these structures in, is a vector of ints. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode */
  int count;                      /* Count for repeats */
  int ims;                        /* ims flag bits */
  int data;                       /* Some use extra data */
} stateblock;

/* Number of ints occupied by one stateblock. The value is cast to int because
it is combined with the signed workspace count (wscount) using % and /; left as
a size_t it would force that arithmetic to be done unsigned, which gives the
wrong answer whenever the signed operand is negative. */

#define INTS_PER_STATEBLOCK  ((int)(sizeof(stateblock)/sizeof(int)))
258
259
260#ifdef PCRE_DEBUG
/*************************************************
*             Print character string             *
*************************************************/

/* Character string printing function for debugging. Each byte for which
isprint() is true is written as itself; every other byte is written as a
backslash, the letter x, and two lower-case hexadecimal digits.

Arguments:
  p            points to string
  length       number of bytes
  f            where to print

Returns:       nothing
*/

static void
pchars(unsigned char *p, int length, FILE *f)
{
while (length-- > 0)
  {
  int c = *p++;     /* unsigned char value, so always valid for isprint() */
  if (isprint(c))
    fputc(c, f);
  else
    fprintf(f, "\\x%02x", c);
  }
}
287#endif
288
289
290
291/*************************************************
292*    Execute a Regular Expression - DFA engine   *
293*************************************************/
294
295/* This internal function applies a compiled pattern to a subject string,
296starting at a given point, using a DFA engine. This function is called from the
297external one, possibly multiple times if the pattern is not anchored. The
298function calls itself recursively for some kinds of subpattern.
299
300Arguments:
301  md                the match_data block with fixed information
302  this_start_code   the opening bracket of this subexpression's code
303  current_subject   where we currently are in the subject string
304  start_offset      start offset in the subject string
305  offsets           vector to contain the matching string offsets
306  offsetcount       size of same
307  workspace         vector of workspace
308  wscount           size of same
309  ims               the current ims flags
310  rlevel            function call recursion level
311  recursing         regex recursive call level
312
313Returns:            > 0 => number of match offset pairs placed in offsets
314                    = 0 => offsets overflowed; longest matches are present
315                     -1 => failed to match
316                   < -1 => some kind of unexpected problem
317
318The following macros are used for adding states to the two state vectors (one
319for the current character, one for the following character). */
320
/* Append a state with opcode offset x and repeat count y to the active list,
stamping it with the current value of ims. The data field of the slot is not
written. If the list already holds wscount states, the enclosing function
returns PCRE_ERROR_DFA_WSSIZE instead (active_count is still incremented). */

#define ADD_ACTIVE(x,y) \
  if (active_count++ < wscount) \
    { \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->ims    = ims; \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE
331
/* As ADD_ACTIVE, but also stores z in the data field of the new state. If the
active list already holds wscount states, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE instead. */

#define ADD_ACTIVE_DATA(x,y,z) \
  if (active_count++ < wscount) \
    { \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->ims    = ims; \
    next_active_state->data   = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE
343
/* Append a state with opcode offset x and repeat count y to the new list (the
states for the following character), stamping it with the current value of
ims. The data field of the slot is not written. If the list already holds
wscount states, the enclosing function returns PCRE_ERROR_DFA_WSSIZE instead
(new_count is still incremented). */

#define ADD_NEW(x,y) \
  if (new_count++ < wscount) \
    { \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->ims    = ims; \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE
354
/* As ADD_NEW, but also stores z in the data field of the new state. If the new
list already holds wscount states, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE instead. */

#define ADD_NEW_DATA(x,y,z) \
  if (new_count++ < wscount) \
    { \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->ims    = ims; \
    next_new_state->data   = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE
366
367/* And now, here is the code */
368
369static int
370internal_dfa_exec(
371  dfa_match_data *md,
372  const uschar *this_start_code,
373  const uschar *current_subject,
374  int start_offset,
375  int *offsets,
376  int offsetcount,
377  int *workspace,
378  int wscount,
379  int ims,
380  int  rlevel,
381  int  recursing)
382{
383stateblock *active_states, *new_states, *temp_states;
384stateblock *next_active_state, *next_new_state;
385
386const uschar *ctypes, *lcc, *fcc;
387const uschar *ptr;
388const uschar *end_code, *first_op;
389
390int active_count, new_count, match_count;
391
392/* Some fields in the md block are frequently referenced, so we load them into
393independent variables in the hope that this will perform better. */
394
395const uschar *start_subject = md->start_subject;
396const uschar *end_subject = md->end_subject;
397const uschar *start_code = md->start_code;
398
399#ifdef SUPPORT_UTF8
400BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
401#else
402BOOL utf8 = FALSE;
403#endif
404
405rlevel++;
406offsetcount &= (-2);
407
408wscount -= 2;
409wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
410          (2 * INTS_PER_STATEBLOCK);
411
412DPRINTF(("\n%.*s---------------------\n"
413  "%.*sCall to internal_dfa_exec f=%d r=%d\n",
414  rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
415
416ctypes = md->tables + ctypes_offset;
417lcc = md->tables + lcc_offset;
418fcc = md->tables + fcc_offset;
419
420match_count = PCRE_ERROR_NOMATCH;   /* A negative number */
421
422active_states = (stateblock *)(workspace + 2);
423next_new_state = new_states = active_states + wscount;
424new_count = 0;
425
426first_op = this_start_code + 1 + LINK_SIZE +
427  ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
428
429/* The first thing in any (sub) pattern is a bracket of some sort. Push all
430the alternative states onto the list, and find out where the end is. This
431makes is possible to use this function recursively, when we want to stop at a
432matching internal ket rather than at the end.
433
434If the first opcode in the first alternative is OP_REVERSE, we are dealing with
435a backward assertion. In that case, we have to find out the maximum amount to
436move back, and set up each alternative appropriately. */
437
438if (*first_op == OP_REVERSE)
439  {
440  int max_back = 0;
441  int gone_back;
442
443  end_code = this_start_code;
444  do
445    {
446    int back = GET(end_code, 2+LINK_SIZE);
447    if (back > max_back) max_back = back;
448    end_code += GET(end_code, 1);
449    }
450  while (*end_code == OP_ALT);
451
452  /* If we can't go back the amount required for the longest lookbehind
453  pattern, go back as far as we can; some alternatives may still be viable. */
454
455#ifdef SUPPORT_UTF8
456  /* In character mode we have to step back character by character */
457
458  if (utf8)
459    {
460    for (gone_back = 0; gone_back < max_back; gone_back++)
461      {
462      if (current_subject <= start_subject) break;
463      current_subject--;
464      while (current_subject > start_subject &&
465             (*current_subject & 0xc0) == 0x80)
466        current_subject--;
467      }
468    }
469  else
470#endif
471
472  /* In byte-mode we can do this quickly. */
473
474    {
475    gone_back = (current_subject - max_back < start_subject)?
476      current_subject - start_subject : max_back;
477    current_subject -= gone_back;
478    }
479
480  /* Save the earliest consulted character */
481
482  if (current_subject < md->start_used_ptr)
483    md->start_used_ptr = current_subject;
484
485  /* Now we can process the individual branches. */
486
487  end_code = this_start_code;
488  do
489    {
490    int back = GET(end_code, 2+LINK_SIZE);
491    if (back <= gone_back)
492      {
493      int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
494      ADD_NEW_DATA(-bstate, 0, gone_back - back);
495      }
496    end_code += GET(end_code, 1);
497    }
498  while (*end_code == OP_ALT);
499 }
500
501/* This is the code for a "normal" subpattern (not a backward assertion). The
502start of a whole pattern is always one of these. If we are at the top level,
503we may be asked to restart matching from the same point that we reached for a
504previous partial match. We still have to scan through the top-level branches to
505find the end state. */
506
507else
508  {
509  end_code = this_start_code;
510
511  /* Restarting */
512
513  if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
514    {
515    do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
516    new_count = workspace[1];
517    if (!workspace[0])
518      memcpy(new_states, active_states, new_count * sizeof(stateblock));
519    }
520
521  /* Not restarting */
522
523  else
524    {
525    int length = 1 + LINK_SIZE +
526      ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
527    do
528      {
529      ADD_NEW(end_code - start_code + length, 0);
530      end_code += GET(end_code, 1);
531      length = 1 + LINK_SIZE;
532      }
533    while (*end_code == OP_ALT);
534    }
535  }
536
537workspace[0] = 0;    /* Bit indicating which vector is current */
538
539DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
540
541/* Loop for scanning the subject */
542
543ptr = current_subject;
544for (;;)
545  {
546  int i, j;
547  int clen, dlen;
548  unsigned int c, d;
549  int forced_fail = 0;
550  BOOL could_continue = FALSE;
551
552  /* Make the new state list into the active state list and empty the
553  new state list. */
554
555  temp_states = active_states;
556  active_states = new_states;
557  new_states = temp_states;
558  active_count = new_count;
559  new_count = 0;
560
561  workspace[0] ^= 1;              /* Remember for the restarting feature */
562  workspace[1] = active_count;
563
564#ifdef PCRE_DEBUG
565  printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
566  pchars((uschar *)ptr, strlen((char *)ptr), stdout);
567  printf("\"\n");
568
569  printf("%.*sActive states: ", rlevel*2-2, SP);
570  for (i = 0; i < active_count; i++)
571    printf("%d/%d ", active_states[i].offset, active_states[i].count);
572  printf("\n");
573#endif
574
575  /* Set the pointers for adding new states */
576
577  next_active_state = active_states + active_count;
578  next_new_state = new_states;
579
580  /* Load the current character from the subject outside the loop, as many
581  different states may want to look at it, and we assume that at least one
582  will. */
583
584  if (ptr < end_subject)
585    {
586    clen = 1;        /* Number of bytes in the character */
587#ifdef SUPPORT_UTF8
588    if (utf8) { GETCHARLEN(c, ptr, clen); } else
589#endif  /* SUPPORT_UTF8 */
590    c = *ptr;
591    }
592  else
593    {
594    clen = 0;        /* This indicates the end of the subject */
595    c = NOTACHAR;    /* This value should never actually be used */
596    }
597
598  /* Scan up the active states and act on each one. The result of an action
599  may be to add more states to the currently active list (e.g. on hitting a
600  parenthesis) or it may be to put states on the new list, for considering
601  when we move the character pointer on. */
602
603  for (i = 0; i < active_count; i++)
604    {
605    stateblock *current_state = active_states + i;
606    const uschar *code;
607    int state_offset = current_state->offset;
608    int count, codevalue, rrc;
609
610#ifdef PCRE_DEBUG
611    printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
612    if (clen == 0) printf("EOL\n");
613      else if (c > 32 && c < 127) printf("'%c'\n", c);
614        else printf("0x%02x\n", c);
615#endif
616
617    /* This variable is referred to implicity in the ADD_xxx macros. */
618
619    ims = current_state->ims;
620
621    /* A negative offset is a special case meaning "hold off going to this
622    (negated) state until the number of characters in the data field have
623    been skipped". */
624
625    if (state_offset < 0)
626      {
627      if (current_state->data > 0)
628        {
629        DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
630        ADD_NEW_DATA(state_offset, current_state->count,
631          current_state->data - 1);
632        continue;
633        }
634      else
635        {
636        current_state->offset = state_offset = -state_offset;
637        }
638      }
639
640    /* Check for a duplicate state with the same count, and skip if found.
641    See the note at the head of this module about the possibility of improving
642    performance here. */
643
644    for (j = 0; j < i; j++)
645      {
646      if (active_states[j].offset == state_offset &&
647          active_states[j].count == current_state->count)
648        {
649        DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
650        goto NEXT_ACTIVE_STATE;
651        }
652      }
653
654    /* The state offset is the offset to the opcode */
655
656    code = start_code + state_offset;
657    codevalue = *code;
658
659    /* If this opcode inspects a character, but we are at the end of the
660    subject, remember the fact for use when testing for a partial match. */
661
662    if (clen == 0 && poptable[codevalue] != 0)
663      could_continue = TRUE;
664
665    /* If this opcode is followed by an inline character, load it. It is
666    tempting to test for the presence of a subject character here, but that
667    is wrong, because sometimes zero repetitions of the subject are
668    permitted.
669
670    We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
671    argument that is not a data character - but is always one byte long. We
672    have to take special action to deal with  \P, \p, \H, \h, \V, \v and \X in
673    this case. To keep the other cases fast, convert these ones to new opcodes.
674    */
675
676    if (coptable[codevalue] > 0)
677      {
678      dlen = 1;
679#ifdef SUPPORT_UTF8
680      if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
681#endif  /* SUPPORT_UTF8 */
682      d = code[coptable[codevalue]];
683      if (codevalue >= OP_TYPESTAR)
684        {
685        switch(d)
686          {
687          case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
688          case OP_NOTPROP:
689          case OP_PROP: codevalue += OP_PROP_EXTRA; break;
690          case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
691          case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
692          case OP_NOT_HSPACE:
693          case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
694          case OP_NOT_VSPACE:
695          case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
696          default: break;
697          }
698        }
699      }
700    else
701      {
702      dlen = 0;         /* Not strictly necessary, but compilers moan */
703      d = NOTACHAR;     /* if these variables are not set. */
704      }
705
706
707    /* Now process the individual opcodes */
708
709    switch (codevalue)
710      {
711/* ========================================================================== */
712      /* These cases are never obeyed. This is a fudge that causes a compile-
713      time error if the vectors coptable or poptable, which are indexed by
714      opcode, are not the correct length. It seems to be the only way to do
715      such a check at compile time, as the sizeof() operator does not work
716      in the C preprocessor. */
717
718      case OP_TABLE_LENGTH:
719      case OP_TABLE_LENGTH +
720        ((sizeof(coptable) == OP_TABLE_LENGTH) &&
721         (sizeof(poptable) == OP_TABLE_LENGTH)):
722      break;
723
724/* ========================================================================== */
725      /* Reached a closing bracket. If not at the end of the pattern, carry
726      on with the next opcode. Otherwise, unless we have an empty string and
727      PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
728      start of the subject, save the match data, shifting up all previous
729      matches so we always have the longest first. */
730
731      case OP_KET:
732      case OP_KETRMIN:
733      case OP_KETRMAX:
734      if (code != end_code)
735        {
736        ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
737        if (codevalue != OP_KET)
738          {
739          ADD_ACTIVE(state_offset - GET(code, 1), 0);
740          }
741        }
742      else
743        {
744        if (ptr > current_subject ||
745            ((md->moptions & PCRE_NOTEMPTY) == 0 &&
746              ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
747                current_subject > start_subject + md->start_offset)))
748          {
749          if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
750            else if (match_count > 0 && ++match_count * 2 >= offsetcount)
751              match_count = 0;
752          count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
753          if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
754          if (offsetcount >= 2)
755            {
756            offsets[0] = current_subject - start_subject;
757            offsets[1] = ptr - start_subject;
758            DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
759              offsets[1] - offsets[0], current_subject));
760            }
761          if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
762            {
763            DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
764              "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
765              match_count, rlevel*2-2, SP));
766            return match_count;
767            }
768          }
769        }
770      break;
771
772/* ========================================================================== */
773      /* These opcodes add to the current list of states without looking
774      at the current character. */
775
776      /*-----------------------------------------------------------------*/
777      case OP_ALT:
778      do { code += GET(code, 1); } while (*code == OP_ALT);
779      ADD_ACTIVE(code - start_code, 0);
780      break;
781
782      /*-----------------------------------------------------------------*/
783      case OP_BRA:
784      case OP_SBRA:
785      do
786        {
787        ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
788        code += GET(code, 1);
789        }
790      while (*code == OP_ALT);
791      break;
792
793      /*-----------------------------------------------------------------*/
794      case OP_CBRA:
795      case OP_SCBRA:
796      ADD_ACTIVE(code - start_code + 3 + LINK_SIZE,  0);
797      code += GET(code, 1);
798      while (*code == OP_ALT)
799        {
800        ADD_ACTIVE(code - start_code + 1 + LINK_SIZE,  0);
801        code += GET(code, 1);
802        }
803      break;
804
805      /*-----------------------------------------------------------------*/
806      case OP_BRAZERO:
807      case OP_BRAMINZERO:
808      ADD_ACTIVE(state_offset + 1, 0);
809      code += 1 + GET(code, 2);
810      while (*code == OP_ALT) code += GET(code, 1);
811      ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
812      break;
813
814      /*-----------------------------------------------------------------*/
815      case OP_SKIPZERO:
816      code += 1 + GET(code, 2);
817      while (*code == OP_ALT) code += GET(code, 1);
818      ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
819      break;
820
821      /*-----------------------------------------------------------------*/
822      case OP_CIRC:
823      if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
824          ((ims & PCRE_MULTILINE) != 0 &&
825            ptr != end_subject &&
826            WAS_NEWLINE(ptr)))
827        { ADD_ACTIVE(state_offset + 1, 0); }
828      break;
829
830      /*-----------------------------------------------------------------*/
831      case OP_EOD:
832      if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
833      break;
834
835      /*-----------------------------------------------------------------*/
836      case OP_OPT:
837      ims = code[1];
838      ADD_ACTIVE(state_offset + 2, 0);
839      break;
840
841      /*-----------------------------------------------------------------*/
842      case OP_SOD:
843      if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
844      break;
845
846      /*-----------------------------------------------------------------*/
847      case OP_SOM:
848      if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
849      break;
850
851
852/* ========================================================================== */
853      /* These opcodes inspect the next subject character, and sometimes
854      the previous one as well, but do not have an argument. The variable
855      clen contains the length of the current character and is zero if we are
856      at the end of the subject. */
857
858      /*-----------------------------------------------------------------*/
859      case OP_ANY:
860      if (clen > 0 && !IS_NEWLINE(ptr))
861        { ADD_NEW(state_offset + 1, 0); }
862      break;
863
864      /*-----------------------------------------------------------------*/
865      case OP_ALLANY:
866      if (clen > 0)
867        { ADD_NEW(state_offset + 1, 0); }
868      break;
869
870      /*-----------------------------------------------------------------*/
871      case OP_EODN:
872      if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
873        { ADD_ACTIVE(state_offset + 1, 0); }
874      break;
875
876      /*-----------------------------------------------------------------*/
877      case OP_DOLL:
878      if ((md->moptions & PCRE_NOTEOL) == 0)
879        {
880        if (clen == 0 ||
881            ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
882               ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
883            ))
884          { ADD_ACTIVE(state_offset + 1, 0); }
885        }
886      else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
887        { ADD_ACTIVE(state_offset + 1, 0); }
888      break;
889
890      /*-----------------------------------------------------------------*/
891
892      case OP_DIGIT:
893      case OP_WHITESPACE:
894      case OP_WORDCHAR:
895      if (clen > 0 && c < 256 &&
896            ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
897        { ADD_NEW(state_offset + 1, 0); }
898      break;
899
900      /*-----------------------------------------------------------------*/
901      case OP_NOT_DIGIT:
902      case OP_NOT_WHITESPACE:
903      case OP_NOT_WORDCHAR:
904      if (clen > 0 && (c >= 256 ||
905            ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
906        { ADD_NEW(state_offset + 1, 0); }
907      break;
908
909      /*-----------------------------------------------------------------*/
910      case OP_WORD_BOUNDARY:
911      case OP_NOT_WORD_BOUNDARY:
912        {
913        int left_word, right_word;
914
915        if (ptr > start_subject)
916          {
917          const uschar *temp = ptr - 1;
918          if (temp < md->start_used_ptr) md->start_used_ptr = temp;
919#ifdef SUPPORT_UTF8
920          if (utf8) BACKCHAR(temp);
921#endif
922          GETCHARTEST(d, temp);
923          left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
924          }
925        else left_word = 0;
926
927        if (clen > 0)
928          right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
929        else right_word = 0;
930
931        if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
932          { ADD_ACTIVE(state_offset + 1, 0); }
933        }
934      break;
935
936
937      /*-----------------------------------------------------------------*/
938      /* Check the next character by Unicode property. We will get here only
939      if the support is in the binary; otherwise a compile-time error occurs.
940      */
941
942#ifdef SUPPORT_UCP
943      case OP_PROP:
944      case OP_NOTPROP:
945      if (clen > 0)
946        {
947        BOOL OK;
948        const ucd_record * prop = GET_UCD(c);
949        switch(code[1])
950          {
951          case PT_ANY:
952          OK = TRUE;
953          break;
954
955          case PT_LAMP:
956          OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
957          break;
958
959          case PT_GC:
960          OK = _pcre_ucp_gentype[prop->chartype] == code[2];
961          break;
962
963          case PT_PC:
964          OK = prop->chartype == code[2];
965          break;
966
967          case PT_SC:
968          OK = prop->script == code[2];
969          break;
970
971          /* Should never occur, but keep compilers from grumbling. */
972
973          default:
974          OK = codevalue != OP_PROP;
975          break;
976          }
977
978        if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
979        }
980      break;
981#endif
982
983
984
985/* ========================================================================== */
      /* These opcodes likewise inspect the subject character, but have an
      argument that is not a data character. It is one of these opcodes:
      OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE,
      OP_NOT_WHITESPACE, OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded
      into d. */
990
991      case OP_TYPEPLUS:
992      case OP_TYPEMINPLUS:
993      case OP_TYPEPOSPLUS:
994      count = current_state->count;  /* Already matched */
995      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
996      if (clen > 0)
997        {
998        if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
999            (c < 256 &&
1000              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1001              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1002          {
1003          if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1004            {
1005            active_count--;            /* Remove non-match possibility */
1006            next_active_state--;
1007            }
1008          count++;
1009          ADD_NEW(state_offset, count);
1010          }
1011        }
1012      break;
1013
1014      /*-----------------------------------------------------------------*/
1015      case OP_TYPEQUERY:
1016      case OP_TYPEMINQUERY:
1017      case OP_TYPEPOSQUERY:
1018      ADD_ACTIVE(state_offset + 2, 0);
1019      if (clen > 0)
1020        {
1021        if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1022            (c < 256 &&
1023              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1024              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1025          {
1026          if (codevalue == OP_TYPEPOSQUERY)
1027            {
1028            active_count--;            /* Remove non-match possibility */
1029            next_active_state--;
1030            }
1031          ADD_NEW(state_offset + 2, 0);
1032          }
1033        }
1034      break;
1035
1036      /*-----------------------------------------------------------------*/
1037      case OP_TYPESTAR:
1038      case OP_TYPEMINSTAR:
1039      case OP_TYPEPOSSTAR:
1040      ADD_ACTIVE(state_offset + 2, 0);
1041      if (clen > 0)
1042        {
1043        if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1044            (c < 256 &&
1045              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1046              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1047          {
1048          if (codevalue == OP_TYPEPOSSTAR)
1049            {
1050            active_count--;            /* Remove non-match possibility */
1051            next_active_state--;
1052            }
1053          ADD_NEW(state_offset, 0);
1054          }
1055        }
1056      break;
1057
1058      /*-----------------------------------------------------------------*/
1059      case OP_TYPEEXACT:
1060      count = current_state->count;  /* Number already matched */
1061      if (clen > 0)
1062        {
1063        if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1064            (c < 256 &&
1065              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1066              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1067          {
1068          if (++count >= GET2(code, 1))
1069            { ADD_NEW(state_offset + 4, 0); }
1070          else
1071            { ADD_NEW(state_offset, count); }
1072          }
1073        }
1074      break;
1075
1076      /*-----------------------------------------------------------------*/
1077      case OP_TYPEUPTO:
1078      case OP_TYPEMINUPTO:
1079      case OP_TYPEPOSUPTO:
1080      ADD_ACTIVE(state_offset + 4, 0);
1081      count = current_state->count;  /* Number already matched */
1082      if (clen > 0)
1083        {
1084        if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1085            (c < 256 &&
1086              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1087              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1088          {
1089          if (codevalue == OP_TYPEPOSUPTO)
1090            {
1091            active_count--;           /* Remove non-match possibility */
1092            next_active_state--;
1093            }
1094          if (++count >= GET2(code, 1))
1095            { ADD_NEW(state_offset + 4, 0); }
1096          else
1097            { ADD_NEW(state_offset, count); }
1098          }
1099        }
1100      break;
1101
1102/* ========================================================================== */
      /* These are virtual opcodes that are used when something like
      OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or one of the
      horizontal/vertical whitespace types (handled under OP_HSPACE_EXTRA and
      OP_VSPACE_EXTRA below) as its argument. It keeps the code above fast for
      the other cases. The argument is in the d variable. */
1107
1108#ifdef SUPPORT_UCP
1109      case OP_PROP_EXTRA + OP_TYPEPLUS:
1110      case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1111      case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1112      count = current_state->count;           /* Already matched */
1113      if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1114      if (clen > 0)
1115        {
1116        BOOL OK;
1117        const ucd_record * prop = GET_UCD(c);
1118        switch(code[2])
1119          {
1120          case PT_ANY:
1121          OK = TRUE;
1122          break;
1123
1124          case PT_LAMP:
1125          OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1126          break;
1127
1128          case PT_GC:
1129          OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1130          break;
1131
1132          case PT_PC:
1133          OK = prop->chartype == code[3];
1134          break;
1135
1136          case PT_SC:
1137          OK = prop->script == code[3];
1138          break;
1139
1140          /* Should never occur, but keep compilers from grumbling. */
1141
1142          default:
1143          OK = codevalue != OP_PROP;
1144          break;
1145          }
1146
1147        if (OK == (d == OP_PROP))
1148          {
1149          if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1150            {
1151            active_count--;           /* Remove non-match possibility */
1152            next_active_state--;
1153            }
1154          count++;
1155          ADD_NEW(state_offset, count);
1156          }
1157        }
1158      break;
1159
1160      /*-----------------------------------------------------------------*/
1161      case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1162      case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1163      case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1164      count = current_state->count;  /* Already matched */
1165      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1166      if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1167        {
1168        const uschar *nptr = ptr + clen;
1169        int ncount = 0;
1170        if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1171          {
1172          active_count--;           /* Remove non-match possibility */
1173          next_active_state--;
1174          }
1175        while (nptr < end_subject)
1176          {
1177          int nd;
1178          int ndlen = 1;
1179          GETCHARLEN(nd, nptr, ndlen);
1180          if (UCD_CATEGORY(nd) != ucp_M) break;
1181          ncount++;
1182          nptr += ndlen;
1183          }
1184        count++;
1185        ADD_NEW_DATA(-state_offset, count, ncount);
1186        }
1187      break;
1188#endif
1189
1190      /*-----------------------------------------------------------------*/
1191      case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1192      case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1193      case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1194      count = current_state->count;  /* Already matched */
1195      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1196      if (clen > 0)
1197        {
1198        int ncount = 0;
1199        switch (c)
1200          {
1201          case 0x000b:
1202          case 0x000c:
1203          case 0x0085:
1204          case 0x2028:
1205          case 0x2029:
1206          if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1207          goto ANYNL01;
1208
1209          case 0x000d:
1210          if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1211          /* Fall through */
1212
1213          ANYNL01:
1214          case 0x000a:
1215          if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1216            {
1217            active_count--;           /* Remove non-match possibility */
1218            next_active_state--;
1219            }
1220          count++;
1221          ADD_NEW_DATA(-state_offset, count, ncount);
1222          break;
1223
1224          default:
1225          break;
1226          }
1227        }
1228      break;
1229
1230      /*-----------------------------------------------------------------*/
1231      case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1232      case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1233      case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1234      count = current_state->count;  /* Already matched */
1235      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1236      if (clen > 0)
1237        {
1238        BOOL OK;
1239        switch (c)
1240          {
1241          case 0x000a:
1242          case 0x000b:
1243          case 0x000c:
1244          case 0x000d:
1245          case 0x0085:
1246          case 0x2028:
1247          case 0x2029:
1248          OK = TRUE;
1249          break;
1250
1251          default:
1252          OK = FALSE;
1253          break;
1254          }
1255
1256        if (OK == (d == OP_VSPACE))
1257          {
1258          if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1259            {
1260            active_count--;           /* Remove non-match possibility */
1261            next_active_state--;
1262            }
1263          count++;
1264          ADD_NEW_DATA(-state_offset, count, 0);
1265          }
1266        }
1267      break;
1268
1269      /*-----------------------------------------------------------------*/
1270      case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1271      case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1272      case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1273      count = current_state->count;  /* Already matched */
1274      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1275      if (clen > 0)
1276        {
1277        BOOL OK;
1278        switch (c)
1279          {
1280          case 0x09:      /* HT */
1281          case 0x20:      /* SPACE */
1282          case 0xa0:      /* NBSP */
1283          case 0x1680:    /* OGHAM SPACE MARK */
1284          case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1285          case 0x2000:    /* EN QUAD */
1286          case 0x2001:    /* EM QUAD */
1287          case 0x2002:    /* EN SPACE */
1288          case 0x2003:    /* EM SPACE */
1289          case 0x2004:    /* THREE-PER-EM SPACE */
1290          case 0x2005:    /* FOUR-PER-EM SPACE */
1291          case 0x2006:    /* SIX-PER-EM SPACE */
1292          case 0x2007:    /* FIGURE SPACE */
1293          case 0x2008:    /* PUNCTUATION SPACE */
1294          case 0x2009:    /* THIN SPACE */
1295          case 0x200A:    /* HAIR SPACE */
1296          case 0x202f:    /* NARROW NO-BREAK SPACE */
1297          case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1298          case 0x3000:    /* IDEOGRAPHIC SPACE */
1299          OK = TRUE;
1300          break;
1301
1302          default:
1303          OK = FALSE;
1304          break;
1305          }
1306
1307        if (OK == (d == OP_HSPACE))
1308          {
1309          if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1310            {
1311            active_count--;           /* Remove non-match possibility */
1312            next_active_state--;
1313            }
1314          count++;
1315          ADD_NEW_DATA(-state_offset, count, 0);
1316          }
1317        }
1318      break;
1319
1320      /*-----------------------------------------------------------------*/
1321#ifdef SUPPORT_UCP
1322      case OP_PROP_EXTRA + OP_TYPEQUERY:
1323      case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1324      case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1325      count = 4;
1326      goto QS1;
1327
1328      case OP_PROP_EXTRA + OP_TYPESTAR:
1329      case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1330      case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1331      count = 0;
1332
1333      QS1:
1334
1335      ADD_ACTIVE(state_offset + 4, 0);
1336      if (clen > 0)
1337        {
1338        BOOL OK;
1339        const ucd_record * prop = GET_UCD(c);
1340        switch(code[2])
1341          {
1342          case PT_ANY:
1343          OK = TRUE;
1344          break;
1345
1346          case PT_LAMP:
1347          OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1348          break;
1349
1350          case PT_GC:
1351          OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1352          break;
1353
1354          case PT_PC:
1355          OK = prop->chartype == code[3];
1356          break;
1357
1358          case PT_SC:
1359          OK = prop->script == code[3];
1360          break;
1361
1362          /* Should never occur, but keep compilers from grumbling. */
1363
1364          default:
1365          OK = codevalue != OP_PROP;
1366          break;
1367          }
1368
1369        if (OK == (d == OP_PROP))
1370          {
1371          if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1372              codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1373            {
1374            active_count--;           /* Remove non-match possibility */
1375            next_active_state--;
1376            }
1377          ADD_NEW(state_offset + count, 0);
1378          }
1379        }
1380      break;
1381
1382      /*-----------------------------------------------------------------*/
1383      case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1384      case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1385      case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1386      count = 2;
1387      goto QS2;
1388
1389      case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1390      case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1391      case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1392      count = 0;
1393
1394      QS2:
1395
1396      ADD_ACTIVE(state_offset + 2, 0);
1397      if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1398        {
1399        const uschar *nptr = ptr + clen;
1400        int ncount = 0;
1401        if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1402            codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1403          {
1404          active_count--;           /* Remove non-match possibility */
1405          next_active_state--;
1406          }
1407        while (nptr < end_subject)
1408          {
1409          int nd;
1410          int ndlen = 1;
1411          GETCHARLEN(nd, nptr, ndlen);
1412          if (UCD_CATEGORY(nd) != ucp_M) break;
1413          ncount++;
1414          nptr += ndlen;
1415          }
1416        ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1417        }
1418      break;
1419#endif
1420
1421      /*-----------------------------------------------------------------*/
1422      case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1423      case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1424      case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1425      count = 2;
1426      goto QS3;
1427
1428      case OP_ANYNL_EXTRA + OP_TYPESTAR:
1429      case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1430      case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1431      count = 0;
1432
1433      QS3:
1434      ADD_ACTIVE(state_offset + 2, 0);
1435      if (clen > 0)
1436        {
1437        int ncount = 0;
1438        switch (c)
1439          {
1440          case 0x000b:
1441          case 0x000c:
1442          case 0x0085:
1443          case 0x2028:
1444          case 0x2029:
1445          if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1446          goto ANYNL02;
1447
1448          case 0x000d:
1449          if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1450          /* Fall through */
1451
1452          ANYNL02:
1453          case 0x000a:
1454          if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1455              codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1456            {
1457            active_count--;           /* Remove non-match possibility */
1458            next_active_state--;
1459            }
1460          ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1461          break;
1462
1463          default:
1464          break;
1465          }
1466        }
1467      break;
1468
1469      /*-----------------------------------------------------------------*/
1470      case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1471      case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1472      case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1473      count = 2;
1474      goto QS4;
1475
1476      case OP_VSPACE_EXTRA + OP_TYPESTAR:
1477      case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1478      case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1479      count = 0;
1480
1481      QS4:
1482      ADD_ACTIVE(state_offset + 2, 0);
1483      if (clen > 0)
1484        {
1485        BOOL OK;
1486        switch (c)
1487          {
1488          case 0x000a:
1489          case 0x000b:
1490          case 0x000c:
1491          case 0x000d:
1492          case 0x0085:
1493          case 0x2028:
1494          case 0x2029:
1495          OK = TRUE;
1496          break;
1497
1498          default:
1499          OK = FALSE;
1500          break;
1501          }
1502        if (OK == (d == OP_VSPACE))
1503          {
1504          if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1505              codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1506            {
1507            active_count--;           /* Remove non-match possibility */
1508            next_active_state--;
1509            }
1510          ADD_NEW_DATA(-(state_offset + count), 0, 0);
1511          }
1512        }
1513      break;
1514
1515      /*-----------------------------------------------------------------*/
1516      case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1517      case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1518      case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1519      count = 2;
1520      goto QS5;
1521
1522      case OP_HSPACE_EXTRA + OP_TYPESTAR:
1523      case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1524      case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1525      count = 0;
1526
1527      QS5:
1528      ADD_ACTIVE(state_offset + 2, 0);
1529      if (clen > 0)
1530        {
1531        BOOL OK;
1532        switch (c)
1533          {
1534          case 0x09:      /* HT */
1535          case 0x20:      /* SPACE */
1536          case 0xa0:      /* NBSP */
1537          case 0x1680:    /* OGHAM SPACE MARK */
1538          case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1539          case 0x2000:    /* EN QUAD */
1540          case 0x2001:    /* EM QUAD */
1541          case 0x2002:    /* EN SPACE */
1542          case 0x2003:    /* EM SPACE */
1543          case 0x2004:    /* THREE-PER-EM SPACE */
1544          case 0x2005:    /* FOUR-PER-EM SPACE */
1545          case 0x2006:    /* SIX-PER-EM SPACE */
1546          case 0x2007:    /* FIGURE SPACE */
1547          case 0x2008:    /* PUNCTUATION SPACE */
1548          case 0x2009:    /* THIN SPACE */
1549          case 0x200A:    /* HAIR SPACE */
1550          case 0x202f:    /* NARROW NO-BREAK SPACE */
1551          case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1552          case 0x3000:    /* IDEOGRAPHIC SPACE */
1553          OK = TRUE;
1554          break;
1555
1556          default:
1557          OK = FALSE;
1558          break;
1559          }
1560
1561        if (OK == (d == OP_HSPACE))
1562          {
1563          if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1564              codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1565            {
1566            active_count--;           /* Remove non-match possibility */
1567            next_active_state--;
1568            }
1569          ADD_NEW_DATA(-(state_offset + count), 0, 0);
1570          }
1571        }
1572      break;
1573
1574      /*-----------------------------------------------------------------*/
1575#ifdef SUPPORT_UCP
1576      case OP_PROP_EXTRA + OP_TYPEEXACT:
1577      case OP_PROP_EXTRA + OP_TYPEUPTO:
1578      case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1579      case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1580      if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1581        { ADD_ACTIVE(state_offset + 6, 0); }
1582      count = current_state->count;  /* Number already matched */
1583      if (clen > 0)
1584        {
1585        BOOL OK;
1586        const ucd_record * prop = GET_UCD(c);
1587        switch(code[4])
1588          {
1589          case PT_ANY:
1590          OK = TRUE;
1591          break;
1592
1593          case PT_LAMP:
1594          OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1595          break;
1596
1597          case PT_GC:
1598          OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1599          break;
1600
1601          case PT_PC:
1602          OK = prop->chartype == code[5];
1603          break;
1604
1605          case PT_SC:
1606          OK = prop->script == code[5];
1607          break;
1608
1609          /* Should never occur, but keep compilers from grumbling. */
1610
1611          default:
1612          OK = codevalue != OP_PROP;
1613          break;
1614          }
1615
1616        if (OK == (d == OP_PROP))
1617          {
1618          if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1619            {
1620            active_count--;           /* Remove non-match possibility */
1621            next_active_state--;
1622            }
1623          if (++count >= GET2(code, 1))
1624            { ADD_NEW(state_offset + 6, 0); }
1625          else
1626            { ADD_NEW(state_offset, count); }
1627          }
1628        }
1629      break;
1630
1631      /*-----------------------------------------------------------------*/
1632      case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1633      case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1634      case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1635      case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1636      if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1637        { ADD_ACTIVE(state_offset + 4, 0); }
1638      count = current_state->count;  /* Number already matched */
1639      if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1640        {
1641        const uschar *nptr = ptr + clen;
1642        int ncount = 0;
1643        if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1644          {
1645          active_count--;           /* Remove non-match possibility */
1646          next_active_state--;
1647          }
1648        while (nptr < end_subject)
1649          {
1650          int nd;
1651          int ndlen = 1;
1652          GETCHARLEN(nd, nptr, ndlen);
1653          if (UCD_CATEGORY(nd) != ucp_M) break;
1654          ncount++;
1655          nptr += ndlen;
1656          }
1657        if (++count >= GET2(code, 1))
1658          { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1659        else
1660          { ADD_NEW_DATA(-state_offset, count, ncount); }
1661        }
1662      break;
1663#endif
1664
1665      /*-----------------------------------------------------------------*/
1666      case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1667      case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1668      case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1669      case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1670      if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1671        { ADD_ACTIVE(state_offset + 4, 0); }
1672      count = current_state->count;  /* Number already matched */
1673      if (clen > 0)
1674        {
1675        int ncount = 0;
1676        switch (c)
1677          {
1678          case 0x000b:
1679          case 0x000c:
1680          case 0x0085:
1681          case 0x2028:
1682          case 0x2029:
1683          if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1684          goto ANYNL03;
1685
1686          case 0x000d:
1687          if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1688          /* Fall through */
1689
1690          ANYNL03:
1691          case 0x000a:
1692          if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1693            {
1694            active_count--;           /* Remove non-match possibility */
1695            next_active_state--;
1696            }
1697          if (++count >= GET2(code, 1))
1698            { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1699          else
1700            { ADD_NEW_DATA(-state_offset, count, ncount); }
1701          break;
1702
1703          default:
1704          break;
1705          }
1706        }
1707      break;
1708
1709      /*-----------------------------------------------------------------*/
1710      case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1711      case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1712      case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1713      case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1714      if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1715        { ADD_ACTIVE(state_offset + 4, 0); }
1716      count = current_state->count;  /* Number already matched */
1717      if (clen > 0)
1718        {
1719        BOOL OK;
1720        switch (c)
1721          {
1722          case 0x000a:
1723          case 0x000b:
1724          case 0x000c:
1725          case 0x000d:
1726          case 0x0085:
1727          case 0x2028:
1728          case 0x2029:
1729          OK = TRUE;
1730          break;
1731
1732          default:
1733          OK = FALSE;
1734          }
1735
1736        if (OK == (d == OP_VSPACE))
1737          {
1738          if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1739            {
1740            active_count--;           /* Remove non-match possibility */
1741            next_active_state--;
1742            }
1743          if (++count >= GET2(code, 1))
1744            { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1745          else
1746            { ADD_NEW_DATA(-state_offset, count, 0); }
1747          }
1748        }
1749      break;
1750
1751      /*-----------------------------------------------------------------*/
1752      case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1753      case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1754      case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1755      case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1756      if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1757        { ADD_ACTIVE(state_offset + 4, 0); }
1758      count = current_state->count;  /* Number already matched */
1759      if (clen > 0)
1760        {
1761        BOOL OK;
1762        switch (c)
1763          {
1764          case 0x09:      /* HT */
1765          case 0x20:      /* SPACE */
1766          case 0xa0:      /* NBSP */
1767          case 0x1680:    /* OGHAM SPACE MARK */
1768          case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1769          case 0x2000:    /* EN QUAD */
1770          case 0x2001:    /* EM QUAD */
1771          case 0x2002:    /* EN SPACE */
1772          case 0x2003:    /* EM SPACE */
1773          case 0x2004:    /* THREE-PER-EM SPACE */
1774          case 0x2005:    /* FOUR-PER-EM SPACE */
1775          case 0x2006:    /* SIX-PER-EM SPACE */
1776          case 0x2007:    /* FIGURE SPACE */
1777          case 0x2008:    /* PUNCTUATION SPACE */
1778          case 0x2009:    /* THIN SPACE */
1779          case 0x200A:    /* HAIR SPACE */
1780          case 0x202f:    /* NARROW NO-BREAK SPACE */
1781          case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1782          case 0x3000:    /* IDEOGRAPHIC SPACE */
1783          OK = TRUE;
1784          break;
1785
1786          default:
1787          OK = FALSE;
1788          break;
1789          }
1790
1791        if (OK == (d == OP_HSPACE))
1792          {
1793          if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1794            {
1795            active_count--;           /* Remove non-match possibility */
1796            next_active_state--;
1797            }
1798          if (++count >= GET2(code, 1))
1799            { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1800          else
1801            { ADD_NEW_DATA(-state_offset, count, 0); }
1802          }
1803        }
1804      break;
1805
1806/* ========================================================================== */
1807      /* These opcodes are followed by a character that is usually compared
1808      to the current subject character; it is loaded into d. We still get
1809      here even if there is no subject character, because in some cases zero
1810      repetitions are permitted. */
1811
1812      /*-----------------------------------------------------------------*/
1813      case OP_CHAR:
1814      if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1815      break;
1816
1817      /*-----------------------------------------------------------------*/
1818      case OP_CHARNC:
1819      if (clen == 0) break;
1820
1821#ifdef SUPPORT_UTF8
1822      if (utf8)
1823        {
1824        if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1825          {
1826          unsigned int othercase;
1827          if (c < 128) othercase = fcc[c]; else
1828
1829          /* If we have Unicode property support, we can use it to test the
1830          other case of the character. */
1831
1832#ifdef SUPPORT_UCP
1833          othercase = UCD_OTHERCASE(c);
1834#else
1835          othercase = NOTACHAR;
1836#endif
1837
1838          if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1839          }
1840        }
1841      else
1842#endif  /* SUPPORT_UTF8 */
1843
1844      /* Non-UTF-8 mode */
1845        {
1846        if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1847        }
1848      break;
1849
1850
1851#ifdef SUPPORT_UCP
1852      /*-----------------------------------------------------------------*/
1853      /* This is a tricky one because it can match more than one character.
1854      Find out how many characters to skip, and then set up a negative state
1855      to wait for them to pass before continuing. */
1856
1857      case OP_EXTUNI:
1858      if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1859        {
1860        const uschar *nptr = ptr + clen;
1861        int ncount = 0;
1862        while (nptr < end_subject)
1863          {
1864          int nclen = 1;
1865          GETCHARLEN(c, nptr, nclen);
1866          if (UCD_CATEGORY(c) != ucp_M) break;
1867          ncount++;
1868          nptr += nclen;
1869          }
1870        ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1871        }
1872      break;
1873#endif
1874
1875      /*-----------------------------------------------------------------*/
1876      /* This is tricky, like EXTUNI, because it too can match more than one
1877      character (when CR is followed by LF). In this case, set up a negative
1878      state to wait for one character to pass before continuing. */
1879
1880      case OP_ANYNL:
1881      if (clen > 0) switch(c)
1882        {
1883        case 0x000b:
1884        case 0x000c:
1885        case 0x0085:
1886        case 0x2028:
1887        case 0x2029:
1888        if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1889
1890        case 0x000a:
1891        ADD_NEW(state_offset + 1, 0);
1892        break;
1893
1894        case 0x000d:
1895        if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1896          {
1897          ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1898          }
1899        else
1900          {
1901          ADD_NEW(state_offset + 1, 0);
1902          }
1903        break;
1904        }
1905      break;
1906
1907      /*-----------------------------------------------------------------*/
1908      case OP_NOT_VSPACE:
1909      if (clen > 0) switch(c)
1910        {
1911        case 0x000a:
1912        case 0x000b:
1913        case 0x000c:
1914        case 0x000d:
1915        case 0x0085:
1916        case 0x2028:
1917        case 0x2029:
1918        break;
1919
1920        default:
1921        ADD_NEW(state_offset + 1, 0);
1922        break;
1923        }
1924      break;
1925
1926      /*-----------------------------------------------------------------*/
1927      case OP_VSPACE:
1928      if (clen > 0) switch(c)
1929        {
1930        case 0x000a:
1931        case 0x000b:
1932        case 0x000c:
1933        case 0x000d:
1934        case 0x0085:
1935        case 0x2028:
1936        case 0x2029:
1937        ADD_NEW(state_offset + 1, 0);
1938        break;
1939
1940        default: break;
1941        }
1942      break;
1943
1944      /*-----------------------------------------------------------------*/
1945      case OP_NOT_HSPACE:
1946      if (clen > 0) switch(c)
1947        {
1948        case 0x09:      /* HT */
1949        case 0x20:      /* SPACE */
1950        case 0xa0:      /* NBSP */
1951        case 0x1680:    /* OGHAM SPACE MARK */
1952        case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1953        case 0x2000:    /* EN QUAD */
1954        case 0x2001:    /* EM QUAD */
1955        case 0x2002:    /* EN SPACE */
1956        case 0x2003:    /* EM SPACE */
1957        case 0x2004:    /* THREE-PER-EM SPACE */
1958        case 0x2005:    /* FOUR-PER-EM SPACE */
1959        case 0x2006:    /* SIX-PER-EM SPACE */
1960        case 0x2007:    /* FIGURE SPACE */
1961        case 0x2008:    /* PUNCTUATION SPACE */
1962        case 0x2009:    /* THIN SPACE */
1963        case 0x200A:    /* HAIR SPACE */
1964        case 0x202f:    /* NARROW NO-BREAK SPACE */
1965        case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1966        case 0x3000:    /* IDEOGRAPHIC SPACE */
1967        break;
1968
1969        default:
1970        ADD_NEW(state_offset + 1, 0);
1971        break;
1972        }
1973      break;
1974
1975      /*-----------------------------------------------------------------*/
1976      case OP_HSPACE:
1977      if (clen > 0) switch(c)
1978        {
1979        case 0x09:      /* HT */
1980        case 0x20:      /* SPACE */
1981        case 0xa0:      /* NBSP */
1982        case 0x1680:    /* OGHAM SPACE MARK */
1983        case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1984        case 0x2000:    /* EN QUAD */
1985        case 0x2001:    /* EM QUAD */
1986        case 0x2002:    /* EN SPACE */
1987        case 0x2003:    /* EM SPACE */
1988        case 0x2004:    /* THREE-PER-EM SPACE */
1989        case 0x2005:    /* FOUR-PER-EM SPACE */
1990        case 0x2006:    /* SIX-PER-EM SPACE */
1991        case 0x2007:    /* FIGURE SPACE */
1992        case 0x2008:    /* PUNCTUATION SPACE */
1993        case 0x2009:    /* THIN SPACE */
1994        case 0x200A:    /* HAIR SPACE */
1995        case 0x202f:    /* NARROW NO-BREAK SPACE */
1996        case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1997        case 0x3000:    /* IDEOGRAPHIC SPACE */
1998        ADD_NEW(state_offset + 1, 0);
1999        break;
2000        }
2001      break;
2002
2003      /*-----------------------------------------------------------------*/
2004      /* Match a negated single character. This is only used for one-byte
2005      characters, that is, we know that d < 256. The character we are
2006      checking (c) can be multibyte. */
2007
2008      case OP_NOT:
2009      if (clen > 0)
2010        {
2011        unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
2012        if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
2013        }
2014      break;
2015
2016      /*-----------------------------------------------------------------*/
2017      case OP_PLUS:
2018      case OP_MINPLUS:
2019      case OP_POSPLUS:
2020      case OP_NOTPLUS:
2021      case OP_NOTMINPLUS:
2022      case OP_NOTPOSPLUS:
2023      count = current_state->count;  /* Already matched */
2024      if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2025      if (clen > 0)
2026        {
2027        unsigned int otherd = NOTACHAR;
2028        if ((ims & PCRE_CASELESS) != 0)
2029          {
2030#ifdef SUPPORT_UTF8
2031          if (utf8 && d >= 128)
2032            {
2033#ifdef SUPPORT_UCP
2034            otherd = UCD_OTHERCASE(d);
2035#endif  /* SUPPORT_UCP */
2036            }
2037          else
2038#endif  /* SUPPORT_UTF8 */
2039          otherd = fcc[d];
2040          }
2041        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2042          {
2043          if (count > 0 &&
2044              (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2045            {
2046            active_count--;             /* Remove non-match possibility */
2047            next_active_state--;
2048            }
2049          count++;
2050          ADD_NEW(state_offset, count);
2051          }
2052        }
2053      break;
2054
2055      /*-----------------------------------------------------------------*/
2056      case OP_QUERY:
2057      case OP_MINQUERY:
2058      case OP_POSQUERY:
2059      case OP_NOTQUERY:
2060      case OP_NOTMINQUERY:
2061      case OP_NOTPOSQUERY:
2062      ADD_ACTIVE(state_offset + dlen + 1, 0);
2063      if (clen > 0)
2064        {
2065        unsigned int otherd = NOTACHAR;
2066        if ((ims & PCRE_CASELESS) != 0)
2067          {
2068#ifdef SUPPORT_UTF8
2069          if (utf8 && d >= 128)
2070            {
2071#ifdef SUPPORT_UCP
2072            otherd = UCD_OTHERCASE(d);
2073#endif  /* SUPPORT_UCP */
2074            }
2075          else
2076#endif  /* SUPPORT_UTF8 */
2077          otherd = fcc[d];
2078          }
2079        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2080          {
2081          if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2082            {
2083            active_count--;            /* Remove non-match possibility */
2084            next_active_state--;
2085            }
2086          ADD_NEW(state_offset + dlen + 1, 0);
2087          }
2088        }
2089      break;
2090
2091      /*-----------------------------------------------------------------*/
2092      case OP_STAR:
2093      case OP_MINSTAR:
2094      case OP_POSSTAR:
2095      case OP_NOTSTAR:
2096      case OP_NOTMINSTAR:
2097      case OP_NOTPOSSTAR:
2098      ADD_ACTIVE(state_offset + dlen + 1, 0);
2099      if (clen > 0)
2100        {
2101        unsigned int otherd = NOTACHAR;
2102        if ((ims & PCRE_CASELESS) != 0)
2103          {
2104#ifdef SUPPORT_UTF8
2105          if (utf8 && d >= 128)
2106            {
2107#ifdef SUPPORT_UCP
2108            otherd = UCD_OTHERCASE(d);
2109#endif  /* SUPPORT_UCP */
2110            }
2111          else
2112#endif  /* SUPPORT_UTF8 */
2113          otherd = fcc[d];
2114          }
2115        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2116          {
2117          if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2118            {
2119            active_count--;            /* Remove non-match possibility */
2120            next_active_state--;
2121            }
2122          ADD_NEW(state_offset, 0);
2123          }
2124        }
2125      break;
2126
2127      /*-----------------------------------------------------------------*/
2128      case OP_EXACT:
2129      case OP_NOTEXACT:
2130      count = current_state->count;  /* Number already matched */
2131      if (clen > 0)
2132        {
2133        unsigned int otherd = NOTACHAR;
2134        if ((ims & PCRE_CASELESS) != 0)
2135          {
2136#ifdef SUPPORT_UTF8
2137          if (utf8 && d >= 128)
2138            {
2139#ifdef SUPPORT_UCP
2140            otherd = UCD_OTHERCASE(d);
2141#endif  /* SUPPORT_UCP */
2142            }
2143          else
2144#endif  /* SUPPORT_UTF8 */
2145          otherd = fcc[d];
2146          }
2147        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2148          {
2149          if (++count >= GET2(code, 1))
2150            { ADD_NEW(state_offset + dlen + 3, 0); }
2151          else
2152            { ADD_NEW(state_offset, count); }
2153          }
2154        }
2155      break;
2156
2157      /*-----------------------------------------------------------------*/
2158      case OP_UPTO:
2159      case OP_MINUPTO:
2160      case OP_POSUPTO:
2161      case OP_NOTUPTO:
2162      case OP_NOTMINUPTO:
2163      case OP_NOTPOSUPTO:
2164      ADD_ACTIVE(state_offset + dlen + 3, 0);
2165      count = current_state->count;  /* Number already matched */
2166      if (clen > 0)
2167        {
2168        unsigned int otherd = NOTACHAR;
2169        if ((ims & PCRE_CASELESS) != 0)
2170          {
2171#ifdef SUPPORT_UTF8
2172          if (utf8 && d >= 128)
2173            {
2174#ifdef SUPPORT_UCP
2175            otherd = UCD_OTHERCASE(d);
2176#endif  /* SUPPORT_UCP */
2177            }
2178          else
2179#endif  /* SUPPORT_UTF8 */
2180          otherd = fcc[d];
2181          }
2182        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2183          {
2184          if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2185            {
2186            active_count--;             /* Remove non-match possibility */
2187            next_active_state--;
2188            }
2189          if (++count >= GET2(code, 1))
2190            { ADD_NEW(state_offset + dlen + 3, 0); }
2191          else
2192            { ADD_NEW(state_offset, count); }
2193          }
2194        }
2195      break;
2196
2197
2198/* ========================================================================== */
2199      /* These are the class-handling opcodes */
2200
2201      case OP_CLASS:
2202      case OP_NCLASS:
2203      case OP_XCLASS:
2204        {
2205        BOOL isinclass = FALSE;
2206        int next_state_offset;
2207        const uschar *ecode;
2208
2209        /* For a simple class, there is always just a 32-byte table, and we
2210        can set isinclass from it. */
2211
2212        if (codevalue != OP_XCLASS)
2213          {
2214          ecode = code + 33;
2215          if (clen > 0)
2216            {
2217            isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2218              ((code[1 + c/8] & (1 << (c&7))) != 0);
2219            }
2220          }
2221
2222        /* An extended class may have a table or a list of single characters,
2223        ranges, or both, and it may be positive or negative. There's a
2224        function that sorts all this out. */
2225
2226        else
2227         {
2228         ecode = code + GET(code, 1);
2229         if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2230         }
2231
2232        /* At this point, isinclass is set for all kinds of class, and ecode
2233        points to the byte after the end of the class. If there is a
2234        quantifier, this is where it will be. */
2235
2236        next_state_offset = ecode - start_code;
2237
2238        switch (*ecode)
2239          {
2240          case OP_CRSTAR:
2241          case OP_CRMINSTAR:
2242          ADD_ACTIVE(next_state_offset + 1, 0);
2243          if (isinclass) { ADD_NEW(state_offset, 0); }
2244          break;
2245
2246          case OP_CRPLUS:
2247          case OP_CRMINPLUS:
2248          count = current_state->count;  /* Already matched */
2249          if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2250          if (isinclass) { count++; ADD_NEW(state_offset, count); }
2251          break;
2252
2253          case OP_CRQUERY:
2254          case OP_CRMINQUERY:
2255          ADD_ACTIVE(next_state_offset + 1, 0);
2256          if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2257          break;
2258
2259          case OP_CRRANGE:
2260          case OP_CRMINRANGE:
2261          count = current_state->count;  /* Already matched */
2262          if (count >= GET2(ecode, 1))
2263            { ADD_ACTIVE(next_state_offset + 5, 0); }
2264          if (isinclass)
2265            {
2266            int max = GET2(ecode, 3);
2267            if (++count >= max && max != 0)   /* Max 0 => no limit */
2268              { ADD_NEW(next_state_offset + 5, 0); }
2269            else
2270              { ADD_NEW(state_offset, count); }
2271            }
2272          break;
2273
2274          default:
2275          if (isinclass) { ADD_NEW(next_state_offset, 0); }
2276          break;
2277          }
2278        }
2279      break;
2280
2281/* ========================================================================== */
2282      /* These are the opcodes for fancy brackets of various kinds. We have
2283      to use recursion in order to handle them. The "always failing" assertion
2284      (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2285      though the other "backtracking verbs" are not supported. */
2286
2287      case OP_FAIL:
2288      forced_fail++;    /* Count FAILs for multiple states */
2289      break;
2290
2291      case OP_ASSERT:
2292      case OP_ASSERT_NOT:
2293      case OP_ASSERTBACK:
2294      case OP_ASSERTBACK_NOT:
2295        {
2296        int rc;
2297        int local_offsets[2];
2298        int local_workspace[1000];
2299        const uschar *endasscode = code + GET(code, 1);
2300
2301        while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2302
2303        rc = internal_dfa_exec(
2304          md,                                   /* static match data */
2305          code,                                 /* this subexpression's code */
2306          ptr,                                  /* where we currently are */
2307          ptr - start_subject,                  /* start offset */
2308          local_offsets,                        /* offset vector */
2309          sizeof(local_offsets)/sizeof(int),    /* size of same */
2310          local_workspace,                      /* workspace vector */
2311          sizeof(local_workspace)/sizeof(int),  /* size of same */
2312          ims,                                  /* the current ims flags */
2313          rlevel,                               /* function recursion level */
2314          recursing);                           /* pass on regex recursion */
2315
2316        if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2317        if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2318            { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2319        }
2320      break;
2321
2322      /*-----------------------------------------------------------------*/
2323      case OP_COND:
2324      case OP_SCOND:
2325        {
2326        int local_offsets[1000];
2327        int local_workspace[1000];
2328        int codelink = GET(code, 1);
2329        int condcode;
2330
2331        /* Because of the way auto-callout works during compile, a callout item
2332        is inserted between OP_COND and an assertion condition. This does not
2333        happen for the other conditions. */
2334
2335        if (code[LINK_SIZE+1] == OP_CALLOUT)
2336          {
2337          rrc = 0;
2338          if (pcre_callout != NULL)
2339            {
2340            pcre_callout_block cb;
2341            cb.version          = 1;   /* Version 1 of the callout block */
2342            cb.callout_number   = code[LINK_SIZE+2];
2343            cb.offset_vector    = offsets;
2344            cb.subject          = (PCRE_SPTR)start_subject;
2345            cb.subject_length   = end_subject - start_subject;
2346            cb.start_match      = current_subject - start_subject;
2347            cb.current_position = ptr - start_subject;
2348            cb.pattern_position = GET(code, LINK_SIZE + 3);
2349            cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2350            cb.capture_top      = 1;
2351            cb.capture_last     = -1;
2352            cb.callout_data     = md->callout_data;
2353            if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
2354            }
2355          if (rrc > 0) break;                      /* Fail this thread */
2356          code += _pcre_OP_lengths[OP_CALLOUT];    /* Skip callout data */
2357          }
2358
2359        condcode = code[LINK_SIZE+1];
2360
2361        /* Back reference conditions are not supported */
2362
2363        if (condcode == OP_CREF || condcode == OP_NCREF)
2364          return PCRE_ERROR_DFA_UCOND;
2365
2366        /* The DEFINE condition is always false */
2367
2368        if (condcode == OP_DEF)
2369          { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2370
2371        /* The only supported version of OP_RREF is for the value RREF_ANY,
2372        which means "test if in any recursion". We can't test for specifically
2373        recursed groups. */
2374
2375        else if (condcode == OP_RREF || condcode == OP_NRREF)
2376          {
2377          int value = GET2(code, LINK_SIZE+2);
2378          if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2379          if (recursing > 0)
2380            { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2381          else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2382          }
2383
2384        /* Otherwise, the condition is an assertion */
2385
2386        else
2387          {
2388          int rc;
2389          const uschar *asscode = code + LINK_SIZE + 1;
2390          const uschar *endasscode = asscode + GET(asscode, 1);
2391
2392          while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2393
2394          rc = internal_dfa_exec(
2395            md,                                   /* fixed match data */
2396            asscode,                              /* this subexpression's code */
2397            ptr,                                  /* where we currently are */
2398            ptr - start_subject,                  /* start offset */
2399            local_offsets,                        /* offset vector */
2400            sizeof(local_offsets)/sizeof(int),    /* size of same */
2401            local_workspace,                      /* workspace vector */
2402            sizeof(local_workspace)/sizeof(int),  /* size of same */
2403            ims,                                  /* the current ims flags */
2404            rlevel,                               /* function recursion level */
2405            recursing);                           /* pass on regex recursion */
2406
2407          if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2408          if ((rc >= 0) ==
2409                (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2410            { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2411          else
2412            { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2413          }
2414        }
2415      break;
2416
2417      /*-----------------------------------------------------------------*/
2418      case OP_RECURSE:
2419        {
2420        int local_offsets[1000];
2421        int local_workspace[1000];
2422        int rc;
2423
2424        DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2425          recursing + 1));
2426
2427        rc = internal_dfa_exec(
2428          md,                                   /* fixed match data */
2429          start_code + GET(code, 1),            /* this subexpression's code */
2430          ptr,                                  /* where we currently are */
2431          ptr - start_subject,                  /* start offset */
2432          local_offsets,                        /* offset vector */
2433          sizeof(local_offsets)/sizeof(int),    /* size of same */
2434          local_workspace,                      /* workspace vector */
2435          sizeof(local_workspace)/sizeof(int),  /* size of same */
2436          ims,                                  /* the current ims flags */
2437          rlevel,                               /* function recursion level */
2438          recursing + 1);                       /* regex recurse level */
2439
2440        DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2441          recursing + 1, rc));
2442
2443        /* Ran out of internal offsets */
2444
2445        if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2446
2447        /* For each successful matched substring, set up the next state with a
2448        count of characters to skip before trying it. Note that the count is in
2449        characters, not bytes. */
2450
2451        if (rc > 0)
2452          {
2453          for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2454            {
2455            const uschar *p = start_subject + local_offsets[rc];
2456            const uschar *pp = start_subject + local_offsets[rc+1];
2457            int charcount = local_offsets[rc+1] - local_offsets[rc];
2458            while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2459            if (charcount > 0)
2460              {
2461              ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2462              }
2463            else
2464              {
2465              ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2466              }
2467            }
2468          }
2469        else if (rc != PCRE_ERROR_NOMATCH) return rc;
2470        }
2471      break;
2472
2473      /*-----------------------------------------------------------------*/
2474      case OP_ONCE:
2475        {
2476        int local_offsets[2];
2477        int local_workspace[1000];
2478
2479        int rc = internal_dfa_exec(
2480          md,                                   /* fixed match data */
2481          code,                                 /* this subexpression's code */
2482          ptr,                                  /* where we currently are */
2483          ptr - start_subject,                  /* start offset */
2484          local_offsets,                        /* offset vector */
2485          sizeof(local_offsets)/sizeof(int),    /* size of same */
2486          local_workspace,                      /* workspace vector */
2487          sizeof(local_workspace)/sizeof(int),  /* size of same */
2488          ims,                                  /* the current ims flags */
2489          rlevel,                               /* function recursion level */
2490          recursing);                           /* pass on regex recursion */
2491
2492        if (rc >= 0)
2493          {
2494          const uschar *end_subpattern = code;
2495          int charcount = local_offsets[1] - local_offsets[0];
2496          int next_state_offset, repeat_state_offset;
2497
2498          do { end_subpattern += GET(end_subpattern, 1); }
2499            while (*end_subpattern == OP_ALT);
2500          next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2501
2502          /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2503          arrange for the repeat state also to be added to the relevant list.
2504          Calculate the offset, or set -1 for no repeat. */
2505
2506          repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2507                                 *end_subpattern == OP_KETRMIN)?
2508            end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2509
2510          /* If we have matched an empty string, add the next state at the
2511          current character pointer. This is important so that the duplicate
2512          checking kicks in, which is what breaks infinite loops that match an
2513          empty string. */
2514
2515          if (charcount == 0)
2516            {
2517            ADD_ACTIVE(next_state_offset, 0);
2518            }
2519
2520          /* Optimization: if there are no more active states, and there
2521          are no new states yet set up, then skip over the subject string
2522          right here, to save looping. Otherwise, set up the new state to swing
2523          into action when the end of the substring is reached. */
2524
2525          else if (i + 1 >= active_count && new_count == 0)
2526            {
2527            ptr += charcount;
2528            clen = 0;
2529            ADD_NEW(next_state_offset, 0);
2530
2531            /* If we are adding a repeat state at the new character position,
2532            we must fudge things so that it is the only current state.
2533            Otherwise, it might be a duplicate of one we processed before, and
2534            that would cause it to be skipped. */
2535
2536            if (repeat_state_offset >= 0)
2537              {
2538              next_active_state = active_states;
2539              active_count = 0;
2540              i = -1;
2541              ADD_ACTIVE(repeat_state_offset, 0);
2542              }
2543            }
2544          else
2545            {
2546            const uschar *p = start_subject + local_offsets[0];
2547            const uschar *pp = start_subject + local_offsets[1];
2548            while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2549            ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2550            if (repeat_state_offset >= 0)
2551              { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2552            }
2553
2554          }
2555        else if (rc != PCRE_ERROR_NOMATCH) return rc;
2556        }
2557      break;
2558
2559
2560/* ========================================================================== */
2561      /* Handle callouts */
2562
2563      case OP_CALLOUT:
2564      rrc = 0;
2565      if (pcre_callout != NULL)
2566        {
2567        pcre_callout_block cb;
2568        cb.version          = 1;   /* Version 1 of the callout block */
2569        cb.callout_number   = code[1];
2570        cb.offset_vector    = offsets;
2571        cb.subject          = (PCRE_SPTR)start_subject;
2572        cb.subject_length   = end_subject - start_subject;
2573        cb.start_match      = current_subject - start_subject;
2574        cb.current_position = ptr - start_subject;
2575        cb.pattern_position = GET(code, 2);
2576        cb.next_item_length = GET(code, 2 + LINK_SIZE);
2577        cb.capture_top      = 1;
2578        cb.capture_last     = -1;
2579        cb.callout_data     = md->callout_data;
2580        if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
2581        }
2582      if (rrc == 0)
2583        { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2584      break;
2585
2586
2587/* ========================================================================== */
2588      default:        /* Unsupported opcode */
2589      return PCRE_ERROR_DFA_UITEM;
2590      }
2591
2592    NEXT_ACTIVE_STATE: continue;
2593
2594    }      /* End of loop scanning active states */
2595
2596  /* We have finished the processing at the current subject character. If no
2597  new states have been set for the next character, we have found all the
2598  matches that we are going to find. If we are at the top level and partial
2599  matching has been requested, check for appropriate conditions.
2600
2601  The "forced_fail" variable counts the number of (*F) encountered for the
2602  character. If it is equal to the original active_count (saved in
2603  workspace[1]) it means that (*F) was found on every active state. In this
2604  case we don't want to give a partial match.
2605
2606  The "could_continue" variable is true if a state could have continued but
2607  for the fact that the end of the subject was reached. */
2608
2609  if (new_count <= 0)
2610    {
2611    if (rlevel == 1 &&                               /* Top level, and */
2612        could_continue &&                            /* Some could go on */
2613        forced_fail != workspace[1] &&               /* Not all forced fail & */
2614        (                                            /* either... */
2615        (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
2616        ||                                           /* or... */
2617        ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
2618         match_count < 0)                            /* no matches */
2619        ) &&                                         /* And... */
2620        ptr >= end_subject &&                     /* Reached end of subject */
2621        ptr > current_subject)                    /* Matched non-empty string */
2622      {
2623      if (offsetcount >= 2)
2624        {
2625        offsets[0] = md->start_used_ptr - start_subject;
2626        offsets[1] = end_subject - start_subject;
2627        }
2628      match_count = PCRE_ERROR_PARTIAL;
2629      }
2630
2631    DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2632      "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2633      rlevel*2-2, SP));
2634    break;        /* In effect, "return", but see the comment below */
2635    }
2636
2637  /* One or more states are active for the next character. */
2638
2639  ptr += clen;    /* Advance to next subject character */
2640  }               /* Loop to move along the subject string */
2641
2642/* Control gets here from "break" a few lines above. We do it this way because
2643if we use "return" above, we have compiler trouble. Some compilers warn if
2644there's nothing here because they think the function doesn't return a value. On
2645the other hand, if we put a dummy statement here, some more clever compilers
2646complain that it can't be reached. Sigh. */
2647
2648return match_count;
2649}
2650
2651
2652
2653
2654/*************************************************
2655*    Execute a Regular Expression - DFA engine   *
2656*************************************************/
2657
2658/* This external function applies a compiled re to a subject string using a DFA
2659engine. This function calls the internal function multiple times if the pattern
2660is not anchored.
2661
In outline: the arguments are checked; study data, callout data and character
tables are picked up from extra_data; a regex whose magic number is wrong is
tested for opposite endianness; the newline convention is decoded into the
match block; the subject is optionally checked for UTF-8 validity; and then
internal_dfa_exec() is called at successive starting positions (the
"bumpalong" loop) until it returns something other than PCRE_ERROR_NOMATCH,
the match is anchored, or there are no more starting positions to try.

2662Arguments:
2663  argument_re     points to the compiled expression
2664  extra_data      points to extra data or is NULL
2665  subject         points to the subject string
2666  length          length of subject string (may contain binary zeros)
2667  start_offset    where to start in the subject string
2668  options         option bits
2669  offsets         vector of match offsets
2670  offsetcount     size of same
2671  workspace       workspace vector
2672  wscount         size of same
2673
2674Returns:          > 0 => number of match offset pairs placed in offsets
2675                  = 0 => offsets overflowed; longest matches are present
2676                   -1 => failed to match
2677                 < -1 => some kind of unexpected problem

Error codes returned directly by this function are PCRE_ERROR_BADOPTION,
PCRE_ERROR_NULL, PCRE_ERROR_BADCOUNT, PCRE_ERROR_DFA_WSSIZE,
PCRE_ERROR_DFA_UMLIMIT, PCRE_ERROR_BADMAGIC, PCRE_ERROR_BADNEWLINE,
PCRE_ERROR_BADUTF8, PCRE_ERROR_BADUTF8_OFFSET and PCRE_ERROR_NOMATCH. Anything
else is whatever internal_dfa_exec() returned.
2678*/
2679
2680PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2681pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2682  const char *subject, int length, int start_offset, int options, int *offsets,
2683  int offsetcount, int *workspace, int wscount)
2684{
2685real_pcre *re = (real_pcre *)argument_re;
2686dfa_match_data match_block;           /* Fixed data for internal_dfa_exec() */
2687dfa_match_data *md = &match_block;
2688BOOL utf8, anchored, startline, firstline;
2689const uschar *current_subject, *end_subject, *lcc;
2690
2691pcre_study_data internal_study;       /* Passed to _pcre_try_flipped() */
2692const pcre_study_data *study = NULL;
2693real_pcre internal_re;                /* Passed to _pcre_try_flipped() */
2694
2695const uschar *req_byte_ptr;           /* Where req_byte was last found */
2696const uschar *start_bits = NULL;      /* Study bitmap of possible first bytes */
2697BOOL first_byte_caseless = FALSE;
2698BOOL req_byte_caseless = FALSE;
2699int first_byte = -1;                  /* Negative means "not set" */
2700int req_byte = -1;                    /* Negative means "not set" */
2701int req_byte2 = -1;                   /* req_byte with its case flipped */
2702int newline;                          /* Encoded newline type; see switch below */
2703
2704/* Plausibility checks. Unknown option bits, a NULL regex, subject or workspace,
a NULL offsets vector with a positive offsetcount, a negative offsetcount, and
a wscount below 20 are all rejected here. */
2705
2706if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2707if (re == NULL || subject == NULL || workspace == NULL ||
2708   (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2709if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2710if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2711
2712/* We need to find the pointer to any study data before we test for byte
2713flipping, so we scan the extra_data block first. This may set two fields in the
2714match block, so we must initialize them beforehand. However, the other fields
2715in the match block must not be set until after the byte flipping. Note that
supplying either of the match limit fields in extra_data is treated as an
error and causes an immediate return. */
2716
2717md->tables = re->tables;
2718md->callout_data = NULL;
2719
2720if (extra_data != NULL)
2721  {
2722  unsigned int flags = extra_data->flags;
2723  if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2724    study = (const pcre_study_data *)extra_data->study_data;
2725  if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2726  if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2727    return PCRE_ERROR_DFA_UMLIMIT;
2728  if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2729    md->callout_data = extra_data->callout_data;
2730  if ((flags & PCRE_EXTRA_TABLES) != 0)
2731    md->tables = extra_data->tables;
2732  }
2733
2734/* Check that the first field in the block is the magic number. If it is not,
2735test for a regex that was compiled on a host of opposite endianness. If this is
2736the case, flipped values are put in internal_re and internal_study if there was
2737study data too. From here on, re is whatever _pcre_try_flipped() returned, and
study is switched to internal_study only if study data was supplied. */
2738
2739if (re->magic_number != MAGIC_NUMBER)
2740  {
2741  re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2742  if (re == NULL) return PCRE_ERROR_BADMAGIC;
2743  if (study != NULL) study = &internal_study;
2744  }
2745
2746/* Set some local values. req_byte_ptr starts one byte before the first
starting position so that the first required-byte test in the main loop (which
only searches when its start point is beyond req_byte_ptr) always searches. */
2747
2748current_subject = (const unsigned char *)subject + start_offset;
2749end_subject = (const unsigned char *)subject + length;
2750req_byte_ptr = current_subject - 1;
2751
2752#ifdef SUPPORT_UTF8
2753utf8 = (re->options & PCRE_UTF8) != 0;
2754#else
2755utf8 = FALSE;
2756#endif
2757
/* The match is treated as anchored if PCRE_ANCHORED or PCRE_DFA_RESTART is set
in the options for this call, or if PCRE_ANCHORED is set in the compiled
pattern's options. */

2758anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2759  (re->options & PCRE_ANCHORED) != 0;
2760
2761/* The remaining fixed data for passing around. The start of the compiled code
is located relative to argument_re (the block the caller passed), stepping over
the name table using the offset, count and entry size held in re. */
2762
2763md->start_code = (const uschar *)argument_re +
2764    re->name_table_offset + re->name_count * re->name_entry_size;
2765md->start_subject = (const unsigned char *)subject;
2766md->end_subject = end_subject;
2767md->start_offset = start_offset;
2768md->moptions = options;
2769md->poptions = re->options;
2770
2771/* If the BSR option is not set at match time, copy what was set
2772at compile time. If nothing was set at compile time either, PCRE_BSR_ANYCRLF is
used when BSR_ANYCRLF is defined for this build. */
2773
2774if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2775  {
2776  if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2777    md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2778#ifdef BSR_ANYCRLF
2779  else md->moptions |= PCRE_BSR_ANYCRLF;
2780#endif
2781  }
2782
2783/* Handle different types of newline. The three bits give eight cases. If
2784nothing is set at run time, whatever was used at compile time applies. The
result is encoded in "newline": a single character value, two characters packed
as (first << 8) | second, -1 for "any" or -2 for "anycrlf". An unrecognized
combination of bits gives PCRE_ERROR_BADNEWLINE. */
2785
2786switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2787         PCRE_NEWLINE_BITS)
2788  {
2789  case 0: newline = NEWLINE; break;   /* Compile-time default */
2790  case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
2791  case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
2792  case PCRE_NEWLINE_CR+
2793       PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
2794  case PCRE_NEWLINE_ANY: newline = -1; break;
2795  case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2796  default: return PCRE_ERROR_BADNEWLINE;
2797  }
2798
/* Convert the encoded value into the newline fields of the match block. For a
fixed newline, a value above 255 means a two-character sequence. */

2799if (newline == -2)
2800  {
2801  md->nltype = NLTYPE_ANYCRLF;
2802  }
2803else if (newline < 0)
2804  {
2805  md->nltype = NLTYPE_ANY;
2806  }
2807else
2808  {
2809  md->nltype = NLTYPE_FIXED;
2810  if (newline > 255)
2811    {
2812    md->nllen = 2;
2813    md->nl[0] = (newline >> 8) & 255;
2814    md->nl[1] = newline & 255;
2815    }
2816  else
2817    {
2818    md->nllen = 1;
2819    md->nl[0] = newline;
2820    }
2821  }
2822
2823/* Check a UTF-8 string if required. Unfortunately there's no way of passing
2824back the character offset. A non-negative result from _pcre_valid_utf8() is
treated as a failure. In addition, a start_offset that lies strictly inside the
subject must not point at a byte whose top two bits are 10, that is, into the
middle of a multibyte character. */
2825
2826#ifdef SUPPORT_UTF8
2827if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2828  {
2829  if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2830    return PCRE_ERROR_BADUTF8;
2831  if (start_offset > 0 && start_offset < length)
2832    {
2833    int tb = ((uschar *)subject)[start_offset];
2834    if (tb > 127)
2835      {
2836      tb &= 0xc0;
2837      if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2838      }
2839    }
2840  }
2841#endif
2842
2843/* If the exec call supplied NULL for tables, use the inbuilt ones. This
2844is a feature that makes it possible to save compiled regex and re-use them
2845in other programs later. */
2846
2847if (md->tables == NULL) md->tables = _pcre_default_tables;
2848
2849/* The lower casing table and the "must be at the start of a line" flag are
2850used in a loop when finding where to start. */
2851
2852lcc = md->tables + lcc_offset;
2853startline = (re->flags & PCRE_STARTLINE) != 0;
2854firstline = (re->options & PCRE_FIRSTLINE) != 0;
2855
2856/* Set up the first character to match, if available. The first_byte value is
2857never set for an anchored regular expression, but the anchoring may be forced
2858at run time, so we have to test for anchoring. The first char may be unset for
2859an unanchored pattern, of course. If there's no first char and the pattern was
2860studied, there may be a bitmap of possible first characters. A caseless first
byte is stored lower-cased, because the search loop below compares it with
lower-cased subject bytes. */
2861
2862if (!anchored)
2863  {
2864  if ((re->flags & PCRE_FIRSTSET) != 0)
2865    {
2866    first_byte = re->first_byte & 255;
2867    if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2868      first_byte = lcc[first_byte];
2869    }
2870  else
2871    {
2872    if (!startline && study != NULL &&
2873         (study->flags & PCRE_STUDY_MAPPED) != 0)
2874      start_bits = study->start_bits;
2875    }
2876  }
2877
2878/* For anchored or unanchored matches, there may be a "last known required
2879character" set. */
2880
2881if ((re->flags & PCRE_REQCHSET) != 0)
2882  {
2883  req_byte = re->req_byte & 255;
2884  req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2885  req_byte2 = (md->tables + fcc_offset)[req_byte];  /* case flipped */
2886  }
2887
2888/* Call the main matching function, looping for a non-anchored regex after a
2889failed match. If not restarting, perform certain optimizations at the start of
2890a match. */
2891
2892for (;;)
2893  {
2894  int rc;
2895
2896  if ((options & PCRE_DFA_RESTART) == 0)
2897    {
2898    const uschar *save_end_subject = end_subject;
2899
2900    /* If firstline is TRUE, the start of the match is constrained to the first
2901    line of a multiline string. Implement this by temporarily adjusting
2902    end_subject so that we stop scanning at a newline. If the match fails at
2903    the newline, later code breaks this loop. */
2904
2905    if (firstline)
2906      {
2907      USPTR t = current_subject;
2908#ifdef SUPPORT_UTF8
2909      if (utf8)
2910        {
2911        while (t < md->end_subject && !IS_NEWLINE(t))
2912          {
2913          t++;
2914          while (t < end_subject && (*t & 0xc0) == 0x80) t++;
2915          }
2916        }
2917      else
2918#endif
2919      while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2920      end_subject = t;
2921      }
2922
2923    /* There are some optimizations that avoid running the match if a known
2924    starting point is not found. However, there is an option that disables
2925    these, for testing and for ensuring that all callouts do actually occur. */
2926
2927    if ((options & PCRE_NO_START_OPTIMIZE) == 0)
2928      {
2929      /* Advance to a known first byte. */
2930
2931      if (first_byte >= 0)
2932        {
2933        if (first_byte_caseless)
2934          while (current_subject < end_subject &&
2935                 lcc[*current_subject] != first_byte)
2936            current_subject++;
2937        else
2938          while (current_subject < end_subject &&
2939                 *current_subject != first_byte)
2940            current_subject++;
2941        }
2942
2943      /* Or to just after a linebreak for a multiline match if possible. Nothing
      is done while current_subject is still at the original starting position
      (start_offset); that also guarantees there is a previous byte to look at
      in the CR test below. */
2944
2945      else if (startline)
2946        {
2947        if (current_subject > md->start_subject + start_offset)
2948          {
2949#ifdef SUPPORT_UTF8
2950          if (utf8)
2951            {
2952            while (current_subject < end_subject &&
2953                   !WAS_NEWLINE(current_subject))
2954              {
2955              current_subject++;
2956              while(current_subject < end_subject &&
2957                    (*current_subject & 0xc0) == 0x80)
2958                current_subject++;
2959              }
2960            }
2961          else
2962#endif
2963          while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
2964            current_subject++;
2965
2966          /* If we have just passed a CR and the newline option is ANY or
2967          ANYCRLF, and we are now at a LF, advance the match position by one
2968          more character. */
2969
2970          if (current_subject[-1] == CHAR_CR &&
2971               (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2972               current_subject < end_subject &&
2973               *current_subject == CHAR_NL)
2974            current_subject++;
2975          }
2976        }
2977
2978      /* Or to a non-unique first char after study. Each subject byte selects
      one bit in start_bits; bytes whose bit is clear are skipped. */
2979
2980      else if (start_bits != NULL)
2981        {
2982        while (current_subject < end_subject)
2983          {
2984          register unsigned int c = *current_subject;
2985          if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2986            else break;
2987          }
2988        }
2989      }
2990
2991    /* Restore fudged end_subject */
2992
2993    end_subject = save_end_subject;
2994
2995    /* The following two optimizations are disabled for partial matching or if
2996    disabling is explicitly requested (and of course, by the test above, this
2997    code is not obeyed when restarting after a partial match). */
2998
2999    if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
3000        (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3001      {
3002      /* If the pattern was studied, a minimum subject length may be set. This
3003      is a lower bound; no actual string of that length may actually match the
3004      pattern. Although the value is, strictly, in characters, we treat it as
3005      bytes to avoid spending too much time in this optimization. */
3006
3007      if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3008          (pcre_uint32)(end_subject - current_subject) < study->minlength)
3009        return PCRE_ERROR_NOMATCH;
3010
3011      /* If req_byte is set, we know that that character must appear in the
3012      subject for the match to succeed. If the first character is set, req_byte
3013      must be later in the subject; otherwise the test starts at the match
3014      point. This optimization can save a huge amount of work in patterns with
3015      nested unlimited repeats that aren't going to match. Writing separate
3016      code for cased/caseless versions makes it go faster, as does using an
3017      autoincrement and backing off on a match.
3018
3019      HOWEVER: when the subject string is very, very long, searching to its end
3020      can take a long time, and give bad performance on quite ordinary
3021      patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3022      string... so we don't do this when the string is sufficiently long. */
3023
3024      if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX)
3025        {
3026        register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
3027
3028        /* We don't need to repeat the search if we haven't yet reached the
3029        place we found it at last time. */
3030
3031        if (p > req_byte_ptr)
3032          {
3033          if (req_byte_caseless)
3034            {
3035            while (p < end_subject)
3036              {
3037              register int pp = *p++;
3038              if (pp == req_byte || pp == req_byte2) { p--; break; }
3039              }
3040            }
3041          else
3042            {
3043            while (p < end_subject)
3044              {
3045              if (*p++ == req_byte) { p--; break; }
3046              }
3047            }
3048
3049          /* If we can't find the required character, break the matching loop,
3050          which will cause a return of PCRE_ERROR_NOMATCH. */
3051
3052          if (p >= end_subject) break;
3053
3054          /* If we have found the required character, save the point where we
3055          found it, so that we don't search again next time round the loop if
3056          the start hasn't passed this character yet. */
3057
3058          req_byte_ptr = p;
3059          }
3060        }
3061      }
3062    }   /* End of optimizations that are done when not restarting */
3063
3064  /* OK, now we can do the business. The starting position is first recorded in
  start_used_ptr, which internal_dfa_exec() uses when it sets offsets[0] for a
  partial match. */
3065
3066  md->start_used_ptr = current_subject;
3067
3068  rc = internal_dfa_exec(
3069    md,                                /* fixed match data */
3070    md->start_code,                    /* this subexpression's code */
3071    current_subject,                   /* where we currently are */
3072    start_offset,                      /* start offset in subject */
3073    offsets,                           /* offset vector */
3074    offsetcount,                       /* size of same */
3075    workspace,                         /* workspace vector */
3076    wscount,                           /* size of same */
3077    re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
3078    0,                                 /* function recurse level */
3079    0);                                /* regex recurse level */
3080
3081  /* Anything other than "no match" means we are done, always; otherwise, carry
3082  on only if not anchored. */
3083
3084  if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
3085
3086  /* Advance to the next subject character unless we are at the end of a line
3087  and firstline is set. In UTF-8 mode, bytes whose top two bits are 10 are
  skipped so that the next attempt starts at the beginning of a character. The
  loop ends only when the new position is beyond end_subject, so a starting
  position equal to end_subject is still tried. */
3088
3089  if (firstline && IS_NEWLINE(current_subject)) break;
3090  current_subject++;
3091  if (utf8)
3092    {
3093    while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
3094      current_subject++;
3095    }
3096  if (current_subject > end_subject) break;
3097
3098  /* If we have just passed a CR and we are now at a LF, and the pattern does
3099  not contain any explicit matches for \r or \n, and the newline option is CRLF
3100  or ANY or ANYCRLF, advance the match position by one more character. */
3101
3102  if (current_subject[-1] == CHAR_CR &&
3103      current_subject < end_subject &&
3104      *current_subject == CHAR_NL &&
3105      (re->flags & PCRE_HASCRORLF) == 0 &&
3106        (md->nltype == NLTYPE_ANY ||
3107         md->nltype == NLTYPE_ANYCRLF ||
3108         md->nllen == 2))
3109    current_subject++;
3110
3111  }   /* "Bumpalong" loop */
3112
/* Control gets here only by breaking out of the loop above, which happens when
no starting position can lead to a match. */

3113return PCRE_ERROR_NOMATCH;
3114}
3115
3116/* End of pcre_dfa_exec.c */
3117