1/*************************************************
2*      Perl-Compatible Regular Expressions       *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8                       Written by Philip Hazel
9           Copyright (c) 1997-2012 University of Cambridge
10
11-----------------------------------------------------------------------------
12Redistribution and use in source and binary forms, with or without
13modification, are permitted provided that the following conditions are met:
14
15    * Redistributions of source code must retain the above copyright notice,
16      this list of conditions and the following disclaimer.
17
18    * Redistributions in binary form must reproduce the above copyright
19      notice, this list of conditions and the following disclaimer in the
20      documentation and/or other materials provided with the distribution.
21
22    * Neither the name of the University of Cambridge nor the names of its
23      contributors may be used to endorse or promote products derived from
24      this software without specific prior written permission.
25
26THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36POSSIBILITY OF SUCH DAMAGE.
37-----------------------------------------------------------------------------
38*/
39
40
41/* This module contains the external function pcre_study(), along with local
42supporting functions. */
43
44
45#ifdef HAVE_CONFIG_H
46#include "config.h"
47#endif
48
49#include "pcre_internal.h"
50
51#define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
52
53/* Returns from set_start_bits() */
54
55enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
56
57
58
59/*************************************************
60*   Find the minimum subject length for a group  *
61*************************************************/
62
63/* Scan a parenthesized group and compute the minimum length of subject that
64is needed to match it. This is a lower bound; it does not mean there is a
65string of that length that matches. In UTF8 mode, the result is in characters
66rather than bytes.
67
68Arguments:
69  re              compiled pattern block
70  code            pointer to start of group (the bracket)
71  startcode       pointer to start of the whole pattern's code
72  options         the compiling options
73  recurses        chain of recurse_check to catch mutual recursion
74  countptr        pointer to call count (to catch over complexity)
75
76Returns:   the minimum length
77           -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
78           -2 internal error (missing capturing bracket)
79           -3 internal error (opcode not listed)
80*/
81
82static int
83find_minlength(const REAL_PCRE *re, const pcre_uchar *code,
84  const pcre_uchar *startcode, int options, recurse_check *recurses,
85  int *countptr)
86{
87int length = -1;
88/* PCRE_UTF16 has the same value as PCRE_UTF8. */
89BOOL utf = (options & PCRE_UTF8) != 0;
90BOOL had_recurse = FALSE;
91recurse_check this_recurse;
92register int branchlength = 0;
93register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
94
95if ((*countptr)++ > 1000) return -1;   /* too complex */
96
97if (*code == OP_CBRA || *code == OP_SCBRA ||
98    *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
99
100/* Scan along the opcodes for this branch. If we get to the end of the
101branch, check the length against that of the other branches. */
102
103for (;;)
104  {
105  int d, min;
106  pcre_uchar *cs, *ce;
107  register pcre_uchar op = *cc;
108
109  switch (op)
110    {
111    case OP_COND:
112    case OP_SCOND:
113
114    /* If there is only one branch in a condition, the implied branch has zero
115    length, so we don't add anything. This covers the DEFINE "condition"
116    automatically. */
117
118    cs = cc + GET(cc, 1);
119    if (*cs != OP_ALT)
120      {
121      cc = cs + 1 + LINK_SIZE;
122      break;
123      }
124
125    /* Otherwise we can fall through and treat it the same as any other
126    subpattern. */
127
128    case OP_CBRA:
129    case OP_SCBRA:
130    case OP_BRA:
131    case OP_SBRA:
132    case OP_CBRAPOS:
133    case OP_SCBRAPOS:
134    case OP_BRAPOS:
135    case OP_SBRAPOS:
136    case OP_ONCE:
137    case OP_ONCE_NC:
138    d = find_minlength(re, cc, startcode, options, recurses, countptr);
139    if (d < 0) return d;
140    branchlength += d;
141    do cc += GET(cc, 1); while (*cc == OP_ALT);
142    cc += 1 + LINK_SIZE;
143    break;
144
145    /* ACCEPT makes things far too complicated; we have to give up. */
146
147    case OP_ACCEPT:
148    case OP_ASSERT_ACCEPT:
149    return -1;
150
151    /* Reached end of a branch; if it's a ket it is the end of a nested
152    call. If it's ALT it is an alternation in a nested call. If it is END it's
153    the end of the outer call. All can be handled by the same code. If an
154    ACCEPT was previously encountered, use the length that was in force at that
155    time, and pass back the shortest ACCEPT length. */
156
157    case OP_ALT:
158    case OP_KET:
159    case OP_KETRMAX:
160    case OP_KETRMIN:
161    case OP_KETRPOS:
162    case OP_END:
163    if (length < 0 || (!had_recurse && branchlength < length))
164      length = branchlength;
165    if (op != OP_ALT) return length;
166    cc += 1 + LINK_SIZE;
167    branchlength = 0;
168    had_recurse = FALSE;
169    break;
170
171    /* Skip over assertive subpatterns */
172
173    case OP_ASSERT:
174    case OP_ASSERT_NOT:
175    case OP_ASSERTBACK:
176    case OP_ASSERTBACK_NOT:
177    do cc += GET(cc, 1); while (*cc == OP_ALT);
178    /* Fall through */
179
180    /* Skip over things that don't match chars */
181
182    case OP_REVERSE:
183    case OP_CREF:
184    case OP_DNCREF:
185    case OP_RREF:
186    case OP_DNRREF:
187    case OP_DEF:
188    case OP_CALLOUT:
189    case OP_SOD:
190    case OP_SOM:
191    case OP_EOD:
192    case OP_EODN:
193    case OP_CIRC:
194    case OP_CIRCM:
195    case OP_DOLL:
196    case OP_DOLLM:
197    case OP_NOT_WORD_BOUNDARY:
198    case OP_WORD_BOUNDARY:
199    cc += PRIV(OP_lengths)[*cc];
200    break;
201
202    /* Skip over a subpattern that has a {0} or {0,x} quantifier */
203
204    case OP_BRAZERO:
205    case OP_BRAMINZERO:
206    case OP_BRAPOSZERO:
207    case OP_SKIPZERO:
208    cc += PRIV(OP_lengths)[*cc];
209    do cc += GET(cc, 1); while (*cc == OP_ALT);
210    cc += 1 + LINK_SIZE;
211    break;
212
213    /* Handle literal characters and + repetitions */
214
215    case OP_CHAR:
216    case OP_CHARI:
217    case OP_NOT:
218    case OP_NOTI:
219    case OP_PLUS:
220    case OP_PLUSI:
221    case OP_MINPLUS:
222    case OP_MINPLUSI:
223    case OP_POSPLUS:
224    case OP_POSPLUSI:
225    case OP_NOTPLUS:
226    case OP_NOTPLUSI:
227    case OP_NOTMINPLUS:
228    case OP_NOTMINPLUSI:
229    case OP_NOTPOSPLUS:
230    case OP_NOTPOSPLUSI:
231    branchlength++;
232    cc += 2;
233#ifdef SUPPORT_UTF
234    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
235#endif
236    break;
237
238    case OP_TYPEPLUS:
239    case OP_TYPEMINPLUS:
240    case OP_TYPEPOSPLUS:
241    branchlength++;
242    cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
243    break;
244
245    /* Handle exact repetitions. The count is already in characters, but we
246    need to skip over a multibyte character in UTF8 mode.  */
247
248    case OP_EXACT:
249    case OP_EXACTI:
250    case OP_NOTEXACT:
251    case OP_NOTEXACTI:
252    branchlength += GET2(cc,1);
253    cc += 2 + IMM2_SIZE;
254#ifdef SUPPORT_UTF
255    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
256#endif
257    break;
258
259    case OP_TYPEEXACT:
260    branchlength += GET2(cc,1);
261    cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
262      || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
263    break;
264
265    /* Handle single-char non-literal matchers */
266
267    case OP_PROP:
268    case OP_NOTPROP:
269    cc += 2;
270    /* Fall through */
271
272    case OP_NOT_DIGIT:
273    case OP_DIGIT:
274    case OP_NOT_WHITESPACE:
275    case OP_WHITESPACE:
276    case OP_NOT_WORDCHAR:
277    case OP_WORDCHAR:
278    case OP_ANY:
279    case OP_ALLANY:
280    case OP_EXTUNI:
281    case OP_HSPACE:
282    case OP_NOT_HSPACE:
283    case OP_VSPACE:
284    case OP_NOT_VSPACE:
285    branchlength++;
286    cc++;
287    break;
288
289    /* "Any newline" might match two characters, but it also might match just
290    one. */
291
292    case OP_ANYNL:
293    branchlength += 1;
294    cc++;
295    break;
296
297    /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
298    non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
299    appear, but leave the code, just in case.) */
300
301    case OP_ANYBYTE:
302#ifdef SUPPORT_UTF
303    if (utf) return -1;
304#endif
305    branchlength++;
306    cc++;
307    break;
308
309    /* For repeated character types, we have to test for \p and \P, which have
310    an extra two bytes of parameters. */
311
312    case OP_TYPESTAR:
313    case OP_TYPEMINSTAR:
314    case OP_TYPEQUERY:
315    case OP_TYPEMINQUERY:
316    case OP_TYPEPOSSTAR:
317    case OP_TYPEPOSQUERY:
318    if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
319    cc += PRIV(OP_lengths)[op];
320    break;
321
322    case OP_TYPEUPTO:
323    case OP_TYPEMINUPTO:
324    case OP_TYPEPOSUPTO:
325    if (cc[1 + IMM2_SIZE] == OP_PROP
326      || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
327    cc += PRIV(OP_lengths)[op];
328    break;
329
330    /* Check a class for variable quantification */
331
332    case OP_CLASS:
333    case OP_NCLASS:
334#if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
335    case OP_XCLASS:
336    /* The original code caused an unsigned overflow in 64 bit systems,
337    so now we use a conditional statement. */
338    if (op == OP_XCLASS)
339      cc += GET(cc, 1);
340    else
341      cc += PRIV(OP_lengths)[OP_CLASS];
342#else
343    cc += PRIV(OP_lengths)[OP_CLASS];
344#endif
345
346    switch (*cc)
347      {
348      case OP_CRPLUS:
349      case OP_CRMINPLUS:
350      case OP_CRPOSPLUS:
351      branchlength++;
352      /* Fall through */
353
354      case OP_CRSTAR:
355      case OP_CRMINSTAR:
356      case OP_CRQUERY:
357      case OP_CRMINQUERY:
358      case OP_CRPOSSTAR:
359      case OP_CRPOSQUERY:
360      cc++;
361      break;
362
363      case OP_CRRANGE:
364      case OP_CRMINRANGE:
365      case OP_CRPOSRANGE:
366      branchlength += GET2(cc,1);
367      cc += 1 + 2 * IMM2_SIZE;
368      break;
369
370      default:
371      branchlength++;
372      break;
373      }
374    break;
375
376    /* Backreferences and subroutine calls are treated in the same way: we find
377    the minimum length for the subpattern. A recursion, however, causes an
378    a flag to be set that causes the length of this branch to be ignored. The
379    logic is that a recursion can only make sense if there is another
380    alternation that stops the recursing. That will provide the minimum length
381    (when no recursion happens). A backreference within the group that it is
382    referencing behaves in the same way.
383
384    If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
385    matches an empty string (by default it causes a matching failure), so in
386    that case we must set the minimum length to zero. */
387
388    case OP_DNREF:     /* Duplicate named pattern back reference */
389    case OP_DNREFI:
390    if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
391      {
392      int count = GET2(cc, 1+IMM2_SIZE);
393      pcre_uchar *slot = (pcre_uchar *)re +
394        re->name_table_offset + GET2(cc, 1) * re->name_entry_size;
395      d = INT_MAX;
396      while (count-- > 0)
397        {
398        ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
399        if (cs == NULL) return -2;
400        do ce += GET(ce, 1); while (*ce == OP_ALT);
401        if (cc > cs && cc < ce)     /* Simple recursion */
402          {
403          d = 0;
404          had_recurse = TRUE;
405          break;
406          }
407        else
408          {
409          recurse_check *r = recurses;
410          for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
411          if (r != NULL)           /* Mutual recursion */
412            {
413            d = 0;
414            had_recurse = TRUE;
415            break;
416            }
417          else
418            {
419            int dd;
420            this_recurse.prev = recurses;
421            this_recurse.group = cs;
422            dd = find_minlength(re, cs, startcode, options, &this_recurse,
423              countptr);
424            if (dd < d) d = dd;
425            }
426          }
427        slot += re->name_entry_size;
428        }
429      }
430    else d = 0;
431    cc += 1 + 2*IMM2_SIZE;
432    goto REPEAT_BACK_REFERENCE;
433
434    case OP_REF:      /* Single back reference */
435    case OP_REFI:
436    if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
437      {
438      ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
439      if (cs == NULL) return -2;
440      do ce += GET(ce, 1); while (*ce == OP_ALT);
441      if (cc > cs && cc < ce)    /* Simple recursion */
442        {
443        d = 0;
444        had_recurse = TRUE;
445        }
446      else
447        {
448        recurse_check *r = recurses;
449        for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
450        if (r != NULL)           /* Mutual recursion */
451          {
452          d = 0;
453          had_recurse = TRUE;
454          }
455        else
456          {
457          this_recurse.prev = recurses;
458          this_recurse.group = cs;
459          d = find_minlength(re, cs, startcode, options, &this_recurse,
460            countptr);
461          }
462        }
463      }
464    else d = 0;
465    cc += 1 + IMM2_SIZE;
466
467    /* Handle repeated back references */
468
469    REPEAT_BACK_REFERENCE:
470    switch (*cc)
471      {
472      case OP_CRSTAR:
473      case OP_CRMINSTAR:
474      case OP_CRQUERY:
475      case OP_CRMINQUERY:
476      case OP_CRPOSSTAR:
477      case OP_CRPOSQUERY:
478      min = 0;
479      cc++;
480      break;
481
482      case OP_CRPLUS:
483      case OP_CRMINPLUS:
484      case OP_CRPOSPLUS:
485      min = 1;
486      cc++;
487      break;
488
489      case OP_CRRANGE:
490      case OP_CRMINRANGE:
491      case OP_CRPOSRANGE:
492      min = GET2(cc, 1);
493      cc += 1 + 2 * IMM2_SIZE;
494      break;
495
496      default:
497      min = 1;
498      break;
499      }
500
501    branchlength += min * d;
502    break;
503
504    /* We can easily detect direct recursion, but not mutual recursion. This is
505    caught by a recursion depth count. */
506
507    case OP_RECURSE:
508    cs = ce = (pcre_uchar *)startcode + GET(cc, 1);
509    do ce += GET(ce, 1); while (*ce == OP_ALT);
510    if (cc > cs && cc < ce)    /* Simple recursion */
511      had_recurse = TRUE;
512    else
513      {
514      recurse_check *r = recurses;
515      for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
516      if (r != NULL)           /* Mutual recursion */
517        had_recurse = TRUE;
518      else
519        {
520        this_recurse.prev = recurses;
521        this_recurse.group = cs;
522        branchlength += find_minlength(re, cs, startcode, options,
523          &this_recurse, countptr);
524        }
525      }
526    cc += 1 + LINK_SIZE;
527    break;
528
529    /* Anything else does not or need not match a character. We can get the
530    item's length from the table, but for those that can match zero occurrences
531    of a character, we must take special action for UTF-8 characters. As it
532    happens, the "NOT" versions of these opcodes are used at present only for
533    ASCII characters, so they could be omitted from this list. However, in
534    future that may change, so we include them here so as not to leave a
535    gotcha for a future maintainer. */
536
537    case OP_UPTO:
538    case OP_UPTOI:
539    case OP_NOTUPTO:
540    case OP_NOTUPTOI:
541    case OP_MINUPTO:
542    case OP_MINUPTOI:
543    case OP_NOTMINUPTO:
544    case OP_NOTMINUPTOI:
545    case OP_POSUPTO:
546    case OP_POSUPTOI:
547    case OP_NOTPOSUPTO:
548    case OP_NOTPOSUPTOI:
549
550    case OP_STAR:
551    case OP_STARI:
552    case OP_NOTSTAR:
553    case OP_NOTSTARI:
554    case OP_MINSTAR:
555    case OP_MINSTARI:
556    case OP_NOTMINSTAR:
557    case OP_NOTMINSTARI:
558    case OP_POSSTAR:
559    case OP_POSSTARI:
560    case OP_NOTPOSSTAR:
561    case OP_NOTPOSSTARI:
562
563    case OP_QUERY:
564    case OP_QUERYI:
565    case OP_NOTQUERY:
566    case OP_NOTQUERYI:
567    case OP_MINQUERY:
568    case OP_MINQUERYI:
569    case OP_NOTMINQUERY:
570    case OP_NOTMINQUERYI:
571    case OP_POSQUERY:
572    case OP_POSQUERYI:
573    case OP_NOTPOSQUERY:
574    case OP_NOTPOSQUERYI:
575
576    cc += PRIV(OP_lengths)[op];
577#ifdef SUPPORT_UTF
578    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
579#endif
580    break;
581
582    /* Skip these, but we need to add in the name length. */
583
584    case OP_MARK:
585    case OP_PRUNE_ARG:
586    case OP_SKIP_ARG:
587    case OP_THEN_ARG:
588    cc += PRIV(OP_lengths)[op] + cc[1];
589    break;
590
591    /* The remaining opcodes are just skipped over. */
592
593    case OP_CLOSE:
594    case OP_COMMIT:
595    case OP_FAIL:
596    case OP_PRUNE:
597    case OP_SET_SOM:
598    case OP_SKIP:
599    case OP_THEN:
600    cc += PRIV(OP_lengths)[op];
601    break;
602
603    /* This should not occur: we list all opcodes explicitly so that when
604    new ones get added they are properly considered. */
605
606    default:
607    return -3;
608    }
609  }
610/* Control never gets here */
611}
612
613
614
615/*************************************************
616*      Set a bit and maybe its alternate case    *
617*************************************************/
618
619/* Given a character, set its first byte's bit in the table, and also the
620corresponding bit for the other version of a letter if we are caseless. In
621UTF-8 mode, for characters greater than 127, we can only do the caseless thing
622when Unicode property support is available.
623
624Arguments:
625  start_bits    points to the bit map
626  p             points to the character
627  caseless      the caseless flag
628  cd            the block with char table pointers
629  utf           TRUE for UTF-8 / UTF-16 / UTF-32 mode
630
631Returns:        pointer after the character
632*/
633
634static const pcre_uchar *
635set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
636  compile_data *cd, BOOL utf)
637{
638pcre_uint32 c = *p;
639
640#ifdef COMPILE_PCRE8
641SET_BIT(c);
642
643#ifdef SUPPORT_UTF
644if (utf && c > 127)
645  {
646  GETCHARINC(c, p);
647#ifdef SUPPORT_UCP
648  if (caseless)
649    {
650    pcre_uchar buff[6];
651    c = UCD_OTHERCASE(c);
652    (void)PRIV(ord2utf)(c, buff);
653    SET_BIT(buff[0]);
654    }
655#endif  /* Not SUPPORT_UCP */
656  return p;
657  }
658#else   /* Not SUPPORT_UTF */
659(void)(utf);   /* Stops warning for unused parameter */
660#endif  /* SUPPORT_UTF */
661
662/* Not UTF-8 mode, or character is less than 127. */
663
664if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
665return p + 1;
666#endif  /* COMPILE_PCRE8 */
667
668#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
669if (c > 0xff)
670  {
671  c = 0xff;
672  caseless = FALSE;
673  }
674SET_BIT(c);
675
676#ifdef SUPPORT_UTF
677if (utf && c > 127)
678  {
679  GETCHARINC(c, p);
680#ifdef SUPPORT_UCP
681  if (caseless)
682    {
683    c = UCD_OTHERCASE(c);
684    if (c > 0xff)
685      c = 0xff;
686    SET_BIT(c);
687    }
688#endif  /* SUPPORT_UCP */
689  return p;
690  }
691#else   /* Not SUPPORT_UTF */
692(void)(utf);   /* Stops warning for unused parameter */
693#endif  /* SUPPORT_UTF */
694
695if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
696return p + 1;
697#endif
698}
699
700
701
702/*************************************************
703*     Set bits for a positive character type     *
704*************************************************/
705
706/* This function sets starting bits for a character type. In UTF-8 mode, we can
707only do a direct setting for bytes less than 128, as otherwise there can be
708confusion with bytes in the middle of UTF-8 characters. In a "traditional"
709environment, the tables will only recognize ASCII characters anyway, but in at
710least one Windows environment, some higher bytes bits were set in the tables.
711So we deal with that case by considering the UTF-8 encoding.
712
713Arguments:
714  start_bits     the starting bitmap
715  cbit type      the type of character wanted
716  table_limit    32 for non-UTF-8; 16 for UTF-8
717  cd             the block with char table pointers
718
719Returns:         nothing
720*/
721
722static void
723set_type_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
724  compile_data *cd)
725{
726register pcre_uint32 c;
727for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
728#if defined SUPPORT_UTF && defined COMPILE_PCRE8
729if (table_limit == 32) return;
730for (c = 128; c < 256; c++)
731  {
732  if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
733    {
734    pcre_uchar buff[6];
735    (void)PRIV(ord2utf)(c, buff);
736    SET_BIT(buff[0]);
737    }
738  }
739#endif
740}
741
742
743/*************************************************
744*     Set bits for a negative character type     *
745*************************************************/
746
747/* This function sets starting bits for a negative character type such as \D.
748In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
749otherwise there can be confusion with bytes in the middle of UTF-8 characters.
750Unlike in the positive case, where we can set appropriate starting bits for
751specific high-valued UTF-8 characters, in this case we have to set the bits for
752all high-valued characters. The lowest is 0xc2, but we overkill by starting at
7530xc0 (192) for simplicity.
754
755Arguments:
756  start_bits     the starting bitmap
757  cbit type      the type of character wanted
758  table_limit    32 for non-UTF-8; 16 for UTF-8
759  cd             the block with char table pointers
760
761Returns:         nothing
762*/
763
764static void
765set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
766  compile_data *cd)
767{
768register pcre_uint32 c;
769for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
770#if defined SUPPORT_UTF && defined COMPILE_PCRE8
771if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
772#endif
773}
774
775
776
777/*************************************************
778*          Create bitmap of starting bytes       *
779*************************************************/
780
781/* This function scans a compiled unanchored expression recursively and
782attempts to build a bitmap of the set of possible starting bytes. As time goes
783by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
784useful for parenthesized groups in patterns such as (a*)b where the group
785provides some optional starting bytes but scanning must continue at the outer
786level to find at least one mandatory byte. At the outermost level, this
787function fails unless the result is SSB_DONE.
788
789Arguments:
790  code         points to an expression
791  start_bits   points to a 32-byte table, initialized to 0
792  utf          TRUE if in UTF-8 / UTF-16 / UTF-32 mode
793  cd           the block with char table pointers
794
795Returns:       SSB_FAIL     => Failed to find any starting bytes
796               SSB_DONE     => Found mandatory starting bytes
797               SSB_CONTINUE => Found optional starting bytes
798               SSB_UNKNOWN  => Hit an unrecognized opcode
799*/
800
801static int
802set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
803  compile_data *cd)
804{
805register pcre_uint32 c;
806int yield = SSB_DONE;
807#if defined SUPPORT_UTF && defined COMPILE_PCRE8
808int table_limit = utf? 16:32;
809#else
810int table_limit = 32;
811#endif
812
813#if 0
814/* ========================================================================= */
815/* The following comment and code was inserted in January 1999. In May 2006,
816when it was observed to cause compiler warnings about unused values, I took it
817out again. If anybody is still using OS/2, they will have to put it back
818manually. */
819
820/* This next statement and the later reference to dummy are here in order to
821trick the optimizer of the IBM C compiler for OS/2 into generating correct
822code. Apparently IBM isn't going to fix the problem, and we would rather not
823disable optimization (in this module it actually makes a big difference, and
824the pcre module can use all the optimization it can get). */
825
826volatile int dummy;
827/* ========================================================================= */
828#endif
829
830do
831  {
832  BOOL try_next = TRUE;
833  const pcre_uchar *tcode = code + 1 + LINK_SIZE;
834
835  if (*code == OP_CBRA || *code == OP_SCBRA ||
836      *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
837
838  while (try_next)    /* Loop for items in this branch */
839    {
840    int rc;
841
842    switch(*tcode)
843      {
844      /* If we reach something we don't understand, it means a new opcode has
845      been created that hasn't been added to this code. Hopefully this problem
846      will be discovered during testing. */
847
848      default:
849      return SSB_UNKNOWN;
850
851      /* Fail for a valid opcode that implies no starting bits. */
852
853      case OP_ACCEPT:
854      case OP_ASSERT_ACCEPT:
855      case OP_ALLANY:
856      case OP_ANY:
857      case OP_ANYBYTE:
858      case OP_CIRC:
859      case OP_CIRCM:
860      case OP_CLOSE:
861      case OP_COMMIT:
862      case OP_COND:
863      case OP_CREF:
864      case OP_DEF:
865      case OP_DNCREF:
866      case OP_DNREF:
867      case OP_DNREFI:
868      case OP_DNRREF:
869      case OP_DOLL:
870      case OP_DOLLM:
871      case OP_END:
872      case OP_EOD:
873      case OP_EODN:
874      case OP_EXTUNI:
875      case OP_FAIL:
876      case OP_MARK:
877      case OP_NOT:
878      case OP_NOTEXACT:
879      case OP_NOTEXACTI:
880      case OP_NOTI:
881      case OP_NOTMINPLUS:
882      case OP_NOTMINPLUSI:
883      case OP_NOTMINQUERY:
884      case OP_NOTMINQUERYI:
885      case OP_NOTMINSTAR:
886      case OP_NOTMINSTARI:
887      case OP_NOTMINUPTO:
888      case OP_NOTMINUPTOI:
889      case OP_NOTPLUS:
890      case OP_NOTPLUSI:
891      case OP_NOTPOSPLUS:
892      case OP_NOTPOSPLUSI:
893      case OP_NOTPOSQUERY:
894      case OP_NOTPOSQUERYI:
895      case OP_NOTPOSSTAR:
896      case OP_NOTPOSSTARI:
897      case OP_NOTPOSUPTO:
898      case OP_NOTPOSUPTOI:
899      case OP_NOTPROP:
900      case OP_NOTQUERY:
901      case OP_NOTQUERYI:
902      case OP_NOTSTAR:
903      case OP_NOTSTARI:
904      case OP_NOTUPTO:
905      case OP_NOTUPTOI:
906      case OP_NOT_HSPACE:
907      case OP_NOT_VSPACE:
908      case OP_PRUNE:
909      case OP_PRUNE_ARG:
910      case OP_RECURSE:
911      case OP_REF:
912      case OP_REFI:
913      case OP_REVERSE:
914      case OP_RREF:
915      case OP_SCOND:
916      case OP_SET_SOM:
917      case OP_SKIP:
918      case OP_SKIP_ARG:
919      case OP_SOD:
920      case OP_SOM:
921      case OP_THEN:
922      case OP_THEN_ARG:
923      return SSB_FAIL;
924
925      /* A "real" property test implies no starting bits, but the fake property
926      PT_CLIST identifies a list of characters. These lists are short, as they
927      are used for characters with more than one "other case", so there is no
928      point in recognizing them for OP_NOTPROP. */
929
930      case OP_PROP:
931      if (tcode[1] != PT_CLIST) return SSB_FAIL;
932        {
933        const pcre_uint32 *p = PRIV(ucd_caseless_sets) + tcode[2];
934        while ((c = *p++) < NOTACHAR)
935          {
936#if defined SUPPORT_UTF && defined COMPILE_PCRE8
937          if (utf)
938            {
939            pcre_uchar buff[6];
940            (void)PRIV(ord2utf)(c, buff);
941            c = buff[0];
942            }
943#endif
944          if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
945          }
946        }
947      try_next = FALSE;
948      break;
949
950      /* We can ignore word boundary tests. */
951
952      case OP_WORD_BOUNDARY:
953      case OP_NOT_WORD_BOUNDARY:
954      tcode++;
955      break;
956
957      /* If we hit a bracket or a positive lookahead assertion, recurse to set
958      bits from within the subpattern. If it can't find anything, we have to
959      give up. If it finds some mandatory character(s), we are done for this
960      branch. Otherwise, carry on scanning after the subpattern. */
961
962      case OP_BRA:
963      case OP_SBRA:
964      case OP_CBRA:
965      case OP_SCBRA:
966      case OP_BRAPOS:
967      case OP_SBRAPOS:
968      case OP_CBRAPOS:
969      case OP_SCBRAPOS:
970      case OP_ONCE:
971      case OP_ONCE_NC:
972      case OP_ASSERT:
973      rc = set_start_bits(tcode, start_bits, utf, cd);
974      if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
975      if (rc == SSB_DONE) try_next = FALSE; else
976        {
977        do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
978        tcode += 1 + LINK_SIZE;
979        }
980      break;
981
982      /* If we hit ALT or KET, it means we haven't found anything mandatory in
983      this branch, though we might have found something optional. For ALT, we
984      continue with the next alternative, but we have to arrange that the final
985      result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
986      return SSB_CONTINUE: if this is the top level, that indicates failure,
987      but after a nested subpattern, it causes scanning to continue. */
988
989      case OP_ALT:
990      yield = SSB_CONTINUE;
991      try_next = FALSE;
992      break;
993
994      case OP_KET:
995      case OP_KETRMAX:
996      case OP_KETRMIN:
997      case OP_KETRPOS:
998      return SSB_CONTINUE;
999
1000      /* Skip over callout */
1001
1002      case OP_CALLOUT:
1003      tcode += 2 + 2*LINK_SIZE;
1004      break;
1005
1006      /* Skip over lookbehind and negative lookahead assertions */
1007
1008      case OP_ASSERT_NOT:
1009      case OP_ASSERTBACK:
1010      case OP_ASSERTBACK_NOT:
1011      do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1012      tcode += 1 + LINK_SIZE;
1013      break;
1014
1015      /* BRAZERO does the bracket, but carries on. */
1016
1017      case OP_BRAZERO:
1018      case OP_BRAMINZERO:
1019      case OP_BRAPOSZERO:
1020      rc = set_start_bits(++tcode, start_bits, utf, cd);
1021      if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
1022/* =========================================================================
1023      See the comment at the head of this function concerning the next line,
1024      which was an old fudge for the benefit of OS/2.
1025      dummy = 1;
1026  ========================================================================= */
1027      do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1028      tcode += 1 + LINK_SIZE;
1029      break;
1030
1031      /* SKIPZERO skips the bracket. */
1032
1033      case OP_SKIPZERO:
1034      tcode++;
1035      do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1036      tcode += 1 + LINK_SIZE;
1037      break;
1038
1039      /* Single-char * or ? sets the bit and tries the next item */
1040
1041      case OP_STAR:
1042      case OP_MINSTAR:
1043      case OP_POSSTAR:
1044      case OP_QUERY:
1045      case OP_MINQUERY:
1046      case OP_POSQUERY:
1047      tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
1048      break;
1049
1050      case OP_STARI:
1051      case OP_MINSTARI:
1052      case OP_POSSTARI:
1053      case OP_QUERYI:
1054      case OP_MINQUERYI:
1055      case OP_POSQUERYI:
1056      tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
1057      break;
1058
1059      /* Single-char upto sets the bit and tries the next */
1060
1061      case OP_UPTO:
1062      case OP_MINUPTO:
1063      case OP_POSUPTO:
1064      tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
1065      break;
1066
1067      case OP_UPTOI:
1068      case OP_MINUPTOI:
1069      case OP_POSUPTOI:
1070      tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
1071      break;
1072
1073      /* At least one single char sets the bit and stops */
1074
1075      case OP_EXACT:
1076      tcode += IMM2_SIZE;
1077      /* Fall through */
1078      case OP_CHAR:
1079      case OP_PLUS:
1080      case OP_MINPLUS:
1081      case OP_POSPLUS:
1082      (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
1083      try_next = FALSE;
1084      break;
1085
1086      case OP_EXACTI:
1087      tcode += IMM2_SIZE;
1088      /* Fall through */
1089      case OP_CHARI:
1090      case OP_PLUSI:
1091      case OP_MINPLUSI:
1092      case OP_POSPLUSI:
1093      (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
1094      try_next = FALSE;
1095      break;
1096
1097      /* Special spacing and line-terminating items. These recognize specific
1098      lists of characters. The difference between VSPACE and ANYNL is that the
1099      latter can match the two-character CRLF sequence, but that is not
1100      relevant for finding the first character, so their code here is
1101      identical. */
1102
1103      case OP_HSPACE:
1104      SET_BIT(CHAR_HT);
1105      SET_BIT(CHAR_SPACE);
1106#ifdef SUPPORT_UTF
1107      if (utf)
1108        {
1109#ifdef COMPILE_PCRE8
1110        SET_BIT(0xC2);  /* For U+00A0 */
1111        SET_BIT(0xE1);  /* For U+1680, U+180E */
1112        SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1113        SET_BIT(0xE3);  /* For U+3000 */
1114#elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1115        SET_BIT(0xA0);
1116        SET_BIT(0xFF);  /* For characters > 255 */
1117#endif  /* COMPILE_PCRE[8|16|32] */
1118        }
1119      else
1120#endif /* SUPPORT_UTF */
1121        {
1122#ifndef EBCDIC
1123        SET_BIT(0xA0);
1124#endif  /* Not EBCDIC */
1125#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1126        SET_BIT(0xFF);  /* For characters > 255 */
1127#endif  /* COMPILE_PCRE[16|32] */
1128        }
1129      try_next = FALSE;
1130      break;
1131
1132      case OP_ANYNL:
1133      case OP_VSPACE:
1134      SET_BIT(CHAR_LF);
1135      SET_BIT(CHAR_VT);
1136      SET_BIT(CHAR_FF);
1137      SET_BIT(CHAR_CR);
1138#ifdef SUPPORT_UTF
1139      if (utf)
1140        {
1141#ifdef COMPILE_PCRE8
1142        SET_BIT(0xC2);  /* For U+0085 */
1143        SET_BIT(0xE2);  /* For U+2028, U+2029 */
1144#elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1145        SET_BIT(CHAR_NEL);
1146        SET_BIT(0xFF);  /* For characters > 255 */
1147#endif  /* COMPILE_PCRE[8|16|32] */
1148        }
1149      else
1150#endif /* SUPPORT_UTF */
1151        {
1152        SET_BIT(CHAR_NEL);
1153#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1154        SET_BIT(0xFF);  /* For characters > 255 */
1155#endif
1156        }
1157      try_next = FALSE;
1158      break;
1159
1160      /* Single character types set the bits and stop. Note that if PCRE_UCP
1161      is set, we do not see these op codes because \d etc are converted to
1162      properties. Therefore, these apply in the case when only characters less
1163      than 256 are recognized to match the types. */
1164
1165      case OP_NOT_DIGIT:
1166      set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1167      try_next = FALSE;
1168      break;
1169
1170      case OP_DIGIT:
1171      set_type_bits(start_bits, cbit_digit, table_limit, cd);
1172      try_next = FALSE;
1173      break;
1174
1175      /* The cbit_space table has vertical tab as whitespace; we no longer
1176      have to play fancy tricks because Perl added VT to its whitespace at
1177      release 5.18. PCRE added it at release 8.34. */
1178
1179      case OP_NOT_WHITESPACE:
1180      set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1181      try_next = FALSE;
1182      break;
1183
1184      case OP_WHITESPACE:
1185      set_type_bits(start_bits, cbit_space, table_limit, cd);
1186      try_next = FALSE;
1187      break;
1188
1189      case OP_NOT_WORDCHAR:
1190      set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1191      try_next = FALSE;
1192      break;
1193
1194      case OP_WORDCHAR:
1195      set_type_bits(start_bits, cbit_word, table_limit, cd);
1196      try_next = FALSE;
1197      break;
1198
1199      /* One or more character type fudges the pointer and restarts, knowing
1200      it will hit a single character type and stop there. */
1201
1202      case OP_TYPEPLUS:
1203      case OP_TYPEMINPLUS:
1204      case OP_TYPEPOSPLUS:
1205      tcode++;
1206      break;
1207
1208      case OP_TYPEEXACT:
1209      tcode += 1 + IMM2_SIZE;
1210      break;
1211
1212      /* Zero or more repeats of character types set the bits and then
1213      try again. */
1214
1215      case OP_TYPEUPTO:
1216      case OP_TYPEMINUPTO:
1217      case OP_TYPEPOSUPTO:
1218      tcode += IMM2_SIZE;  /* Fall through */
1219
1220      case OP_TYPESTAR:
1221      case OP_TYPEMINSTAR:
1222      case OP_TYPEPOSSTAR:
1223      case OP_TYPEQUERY:
1224      case OP_TYPEMINQUERY:
1225      case OP_TYPEPOSQUERY:
1226      switch(tcode[1])
1227        {
1228        default:
1229        case OP_ANY:
1230        case OP_ALLANY:
1231        return SSB_FAIL;
1232
1233        case OP_HSPACE:
1234        SET_BIT(CHAR_HT);
1235        SET_BIT(CHAR_SPACE);
1236#ifdef SUPPORT_UTF
1237        if (utf)
1238          {
1239#ifdef COMPILE_PCRE8
1240          SET_BIT(0xC2);  /* For U+00A0 */
1241          SET_BIT(0xE1);  /* For U+1680, U+180E */
1242          SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1243          SET_BIT(0xE3);  /* For U+3000 */
1244#elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1245          SET_BIT(0xA0);
1246          SET_BIT(0xFF);  /* For characters > 255 */
1247#endif  /* COMPILE_PCRE[8|16|32] */
1248          }
1249        else
1250#endif /* SUPPORT_UTF */
1251#ifndef EBCDIC
1252          SET_BIT(0xA0);
1253#endif  /* Not EBCDIC */
1254        break;
1255
1256        case OP_ANYNL:
1257        case OP_VSPACE:
1258        SET_BIT(CHAR_LF);
1259        SET_BIT(CHAR_VT);
1260        SET_BIT(CHAR_FF);
1261        SET_BIT(CHAR_CR);
1262#ifdef SUPPORT_UTF
1263        if (utf)
1264          {
1265#ifdef COMPILE_PCRE8
1266          SET_BIT(0xC2);  /* For U+0085 */
1267          SET_BIT(0xE2);  /* For U+2028, U+2029 */
1268#elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1269          SET_BIT(CHAR_NEL);
1270          SET_BIT(0xFF);  /* For characters > 255 */
1271#endif  /* COMPILE_PCRE16 */
1272          }
1273        else
1274#endif /* SUPPORT_UTF */
1275          SET_BIT(CHAR_NEL);
1276        break;
1277
1278        case OP_NOT_DIGIT:
1279        set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1280        break;
1281
1282        case OP_DIGIT:
1283        set_type_bits(start_bits, cbit_digit, table_limit, cd);
1284        break;
1285
1286        /* The cbit_space table has vertical tab as whitespace; we no longer
1287        have to play fancy tricks because Perl added VT to its whitespace at
1288        release 5.18. PCRE added it at release 8.34. */
1289
1290        case OP_NOT_WHITESPACE:
1291        set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1292        break;
1293
1294        case OP_WHITESPACE:
1295        set_type_bits(start_bits, cbit_space, table_limit, cd);
1296        break;
1297
1298        case OP_NOT_WORDCHAR:
1299        set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1300        break;
1301
1302        case OP_WORDCHAR:
1303        set_type_bits(start_bits, cbit_word, table_limit, cd);
1304        break;
1305        }
1306
1307      tcode += 2;
1308      break;
1309
1310      /* Character class where all the information is in a bit map: set the
1311      bits and either carry on or not, according to the repeat count. If it was
1312      a negative class, and we are operating with UTF-8 characters, any byte
1313      with a value >= 0xc4 is a potentially valid starter because it starts a
1314      character with a value > 255. */
1315
1316#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1317      case OP_XCLASS:
1318      if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0)
1319        return SSB_FAIL;
1320      /* All bits are set. */
1321      if ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0 && (tcode[1 + LINK_SIZE] & XCL_NOT) != 0)
1322        return SSB_FAIL;
1323#endif
1324      /* Fall through */
1325
1326      case OP_NCLASS:
1327#if defined SUPPORT_UTF && defined COMPILE_PCRE8
1328      if (utf)
1329        {
1330        start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
1331        memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
1332        }
1333#endif
1334#if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1335      SET_BIT(0xFF);                         /* For characters > 255 */
1336#endif
1337      /* Fall through */
1338
1339      case OP_CLASS:
1340        {
1341        pcre_uint8 *map;
1342#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1343        map = NULL;
1344        if (*tcode == OP_XCLASS)
1345          {
1346          if ((tcode[1 + LINK_SIZE] & XCL_MAP) != 0)
1347            map = (pcre_uint8 *)(tcode + 1 + LINK_SIZE + 1);
1348          tcode += GET(tcode, 1);
1349          }
1350        else
1351#endif
1352          {
1353          tcode++;
1354          map = (pcre_uint8 *)tcode;
1355          tcode += 32 / sizeof(pcre_uchar);
1356          }
1357
1358        /* In UTF-8 mode, the bits in a bit map correspond to character
1359        values, not to byte values. However, the bit map we are constructing is
1360        for byte values. So we have to do a conversion for characters whose
1361        value is > 127. In fact, there are only two possible starting bytes for
1362        characters in the range 128 - 255. */
1363
1364#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1365        if (map != NULL)
1366#endif
1367          {
1368#if defined SUPPORT_UTF && defined COMPILE_PCRE8
1369          if (utf)
1370            {
1371            for (c = 0; c < 16; c++) start_bits[c] |= map[c];
1372            for (c = 128; c < 256; c++)
1373              {
1374              if ((map[c/8] & (1 << (c&7))) != 0)
1375                {
1376                int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
1377                start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
1378                c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
1379                }
1380              }
1381            }
1382          else
1383#endif
1384            {
1385            /* In non-UTF-8 mode, the two bit maps are completely compatible. */
1386            for (c = 0; c < 32; c++) start_bits[c] |= map[c];
1387            }
1388          }
1389
1390        /* Advance past the bit map, and act on what follows. For a zero
1391        minimum repeat, continue; otherwise stop processing. */
1392
1393        switch (*tcode)
1394          {
1395          case OP_CRSTAR:
1396          case OP_CRMINSTAR:
1397          case OP_CRQUERY:
1398          case OP_CRMINQUERY:
1399          case OP_CRPOSSTAR:
1400          case OP_CRPOSQUERY:
1401          tcode++;
1402          break;
1403
1404          case OP_CRRANGE:
1405          case OP_CRMINRANGE:
1406          case OP_CRPOSRANGE:
1407          if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1408            else try_next = FALSE;
1409          break;
1410
1411          default:
1412          try_next = FALSE;
1413          break;
1414          }
1415        }
1416      break; /* End of bitmap class handling */
1417
1418      }      /* End of switch */
1419    }        /* End of try_next loop */
1420
1421  code += GET(code, 1);   /* Advance to next branch */
1422  }
1423while (*code == OP_ALT);
1424return yield;
1425}
1426
1427
1428
1429
1430
1431/*************************************************
1432*          Study a compiled expression           *
1433*************************************************/
1434
1435/* This function is handed a compiled expression that it must study to produce
1436information that will speed up the matching. It returns a pcre[16]_extra block
1437which then gets handed back to pcre_exec().
1438
1439Arguments:
1440  re        points to the compiled expression
1441  options   contains option bits
1442  errorptr  points to where to place error messages;
1443            set NULL unless error
1444
1445Returns:    pointer to a pcre[16]_extra block, with study_data filled in and
1446              the appropriate flags set;
1447            NULL on error or if no optimization possible
1448*/
1449
1450#if defined COMPILE_PCRE8
1451PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1452pcre_study(const pcre *external_re, int options, const char **errorptr)
1453#elif defined COMPILE_PCRE16
1454PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION
1455pcre16_study(const pcre16 *external_re, int options, const char **errorptr)
1456#elif defined COMPILE_PCRE32
1457PCRE_EXP_DEFN pcre32_extra * PCRE_CALL_CONVENTION
1458pcre32_study(const pcre32 *external_re, int options, const char **errorptr)
1459#endif
1460{
1461int min;
1462int count = 0;
1463BOOL bits_set = FALSE;
1464pcre_uint8 start_bits[32];
1465PUBL(extra) *extra = NULL;
1466pcre_study_data *study;
1467const pcre_uint8 *tables;
1468pcre_uchar *code;
1469compile_data compile_block;
1470const REAL_PCRE *re = (const REAL_PCRE *)external_re;
1471
1472
1473*errorptr = NULL;
1474
1475if (re == NULL || re->magic_number != MAGIC_NUMBER)
1476  {
1477  *errorptr = "argument is not a compiled regular expression";
1478  return NULL;
1479  }
1480
1481if ((re->flags & PCRE_MODE) == 0)
1482  {
1483#if defined COMPILE_PCRE8
1484  *errorptr = "argument not compiled in 8 bit mode";
1485#elif defined COMPILE_PCRE16
1486  *errorptr = "argument not compiled in 16 bit mode";
1487#elif defined COMPILE_PCRE32
1488  *errorptr = "argument not compiled in 32 bit mode";
1489#endif
1490  return NULL;
1491  }
1492
1493if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1494  {
1495  *errorptr = "unknown or incorrect option bit(s) set";
1496  return NULL;
1497  }
1498
1499code = (pcre_uchar *)re + re->name_table_offset +
1500  (re->name_count * re->name_entry_size);
1501
1502/* For an anchored pattern, or an unanchored pattern that has a first char, or
1503a multiline pattern that matches only at "line starts", there is no point in
1504seeking a list of starting bytes. */
1505
1506if ((re->options & PCRE_ANCHORED) == 0 &&
1507    (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1508  {
1509  int rc;
1510
1511  /* Set the character tables in the block that is passed around */
1512
1513  tables = re->tables;
1514
1515#if defined COMPILE_PCRE8
1516  if (tables == NULL)
1517    (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1518    (void *)(&tables));
1519#elif defined COMPILE_PCRE16
1520  if (tables == NULL)
1521    (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1522    (void *)(&tables));
1523#elif defined COMPILE_PCRE32
1524  if (tables == NULL)
1525    (void)pcre32_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1526    (void *)(&tables));
1527#endif
1528
1529  compile_block.lcc = tables + lcc_offset;
1530  compile_block.fcc = tables + fcc_offset;
1531  compile_block.cbits = tables + cbits_offset;
1532  compile_block.ctypes = tables + ctypes_offset;
1533
1534  /* See if we can find a fixed set of initial characters for the pattern. */
1535
1536  memset(start_bits, 0, 32 * sizeof(pcre_uint8));
1537  rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
1538    &compile_block);
1539  bits_set = rc == SSB_DONE;
1540  if (rc == SSB_UNKNOWN)
1541    {
1542    *errorptr = "internal error: opcode not recognized";
1543    return NULL;
1544    }
1545  }
1546
1547/* Find the minimum length of subject string. */
1548
1549switch(min = find_minlength(re, code, code, re->options, NULL, &count))
1550  {
1551  case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
1552  case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
1553  default: break;
1554  }
1555
1556/* If a set of starting bytes has been identified, or if the minimum length is
1557greater than zero, or if JIT optimization has been requested, or if
1558PCRE_STUDY_EXTRA_NEEDED is set, get a pcre[16]_extra block and a
1559pcre_study_data block. The study data is put in the latter, which is pointed to
1560by the former, which may also get additional data set later by the calling
1561program. At the moment, the size of pcre_study_data is fixed. We nevertheless
1562save it in a field for returning via the pcre_fullinfo() function so that if it
1563becomes variable in the future, we don't have to change that code. */
1564
1565if (bits_set || min > 0 || (options & (
1566#ifdef SUPPORT_JIT
1567    PCRE_STUDY_JIT_COMPILE | PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE |
1568    PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE |
1569#endif
1570    PCRE_STUDY_EXTRA_NEEDED)) != 0)
1571  {
1572  extra = (PUBL(extra) *)(PUBL(malloc))
1573    (sizeof(PUBL(extra)) + sizeof(pcre_study_data));
1574  if (extra == NULL)
1575    {
1576    *errorptr = "failed to get memory";
1577    return NULL;
1578    }
1579
1580  study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra)));
1581  extra->flags = PCRE_EXTRA_STUDY_DATA;
1582  extra->study_data = study;
1583
1584  study->size = sizeof(pcre_study_data);
1585  study->flags = 0;
1586
1587  /* Set the start bits always, to avoid unset memory errors if the
1588  study data is written to a file, but set the flag only if any of the bits
1589  are set, to save time looking when none are. */
1590
1591  if (bits_set)
1592    {
1593    study->flags |= PCRE_STUDY_MAPPED;
1594    memcpy(study->start_bits, start_bits, sizeof(start_bits));
1595    }
1596  else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8));
1597
1598#ifdef PCRE_DEBUG
1599  if (bits_set)
1600    {
1601    pcre_uint8 *ptr = start_bits;
1602    int i;
1603
1604    printf("Start bits:\n");
1605    for (i = 0; i < 32; i++)
1606      printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
1607    }
1608#endif
1609
1610  /* Always set the minlength value in the block, because the JIT compiler
1611  makes use of it. However, don't set the bit unless the length is greater than
1612  zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
1613  checking the zero case. */
1614
1615  if (min > 0)
1616    {
1617    study->flags |= PCRE_STUDY_MINLEN;
1618    study->minlength = min;
1619    }
1620  else study->minlength = 0;
1621
1622  /* If JIT support was compiled and requested, attempt the JIT compilation.
1623  If no starting bytes were found, and the minimum length is zero, and JIT
1624  compilation fails, abandon the extra block and return NULL, unless
1625  PCRE_STUDY_EXTRA_NEEDED is set. */
1626
1627#ifdef SUPPORT_JIT
1628  extra->executable_jit = NULL;
1629  if ((options & PCRE_STUDY_JIT_COMPILE) != 0)
1630    PRIV(jit_compile)(re, extra, JIT_COMPILE);
1631  if ((options & PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE) != 0)
1632    PRIV(jit_compile)(re, extra, JIT_PARTIAL_SOFT_COMPILE);
1633  if ((options & PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE) != 0)
1634    PRIV(jit_compile)(re, extra, JIT_PARTIAL_HARD_COMPILE);
1635
1636  if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0 &&
1637      (options & PCRE_STUDY_EXTRA_NEEDED) == 0)
1638    {
1639#if defined COMPILE_PCRE8
1640    pcre_free_study(extra);
1641#elif defined COMPILE_PCRE16
1642    pcre16_free_study(extra);
1643#elif defined COMPILE_PCRE32
1644    pcre32_free_study(extra);
1645#endif
1646    extra = NULL;
1647    }
1648#endif
1649  }
1650
1651return extra;
1652}
1653
1654
1655/*************************************************
1656*          Free the study data                   *
1657*************************************************/
1658
1659/* This function frees the memory that was obtained by pcre_study().
1660
1661Argument:   a pointer to the pcre[16]_extra block
1662Returns:    nothing
1663*/
1664
1665#if defined COMPILE_PCRE8
1666PCRE_EXP_DEFN void
1667pcre_free_study(pcre_extra *extra)
1668#elif defined COMPILE_PCRE16
1669PCRE_EXP_DEFN void
1670pcre16_free_study(pcre16_extra *extra)
1671#elif defined COMPILE_PCRE32
1672PCRE_EXP_DEFN void
1673pcre32_free_study(pcre32_extra *extra)
1674#endif
1675{
1676if (extra == NULL)
1677  return;
1678#ifdef SUPPORT_JIT
1679if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
1680     extra->executable_jit != NULL)
1681  PRIV(jit_free)(extra->executable_jit);
1682#endif
1683PUBL(free)(extra);
1684}
1685
1686/* End of pcre_study.c */
1687