1/*************************************************
2*      Perl-Compatible Regular Expressions       *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8                       Written by Philip Hazel
9           Copyright (c) 1997-2010 University of Cambridge
10
11-----------------------------------------------------------------------------
12Redistribution and use in source and binary forms, with or without
13modification, are permitted provided that the following conditions are met:
14
15    * Redistributions of source code must retain the above copyright notice,
16      this list of conditions and the following disclaimer.
17
18    * Redistributions in binary form must reproduce the above copyright
19      notice, this list of conditions and the following disclaimer in the
20      documentation and/or other materials provided with the distribution.
21
22    * Neither the name of the University of Cambridge nor the names of its
23      contributors may be used to endorse or promote products derived from
24      this software without specific prior written permission.
25
26THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36POSSIBILITY OF SUCH DAMAGE.
37-----------------------------------------------------------------------------
38*/
39
40
41/* This module contains a PCRE private debugging function for printing out the
42internal form of a compiled regular expression, along with some supporting
43local functions. This source file is used in two places:
44
45(1) It is #included by pcre_compile.c when it is compiled in debugging mode
46(PCRE_DEBUG defined in pcre_internal.h). It is not included in production
47compiles.
48
49(2) It is always #included by pcretest.c, which can be asked to print out a
50compiled regex for debugging purposes. */
51
52
53/* Macro that decides whether a character should be output as a literal or in
54hexadecimal. We don't use isprint() because that can vary from system to system
55(even without the use of locales) and we want the output always to be the same,
56for testing purposes. This macro is used in pcretest as well as in this file. */
57
/* PRINTABLE(c) is non-zero when c lies in the fixed range that is output as a
literal character: 64..254 in an EBCDIC build, 32..126 otherwise. The argument
may be evaluated twice, so it must not have side effects. */

#if defined EBCDIC
#define PRINTABLE(c) (64 <= (c) && (c) <= 254)
#else
#define PRINTABLE(c) (32 <= (c) && (c) <= 126)
#endif
63
64/* The table of operator names. */
65
66static const char *OP_names[] = { OP_NAME_LIST };
67
68
69
70/*************************************************
71*       Print single- or multi-byte character    *
72*************************************************/
73
74static int
75print_char(FILE *f, uschar *ptr, BOOL utf8)
76{
77int c = *ptr;
78
79#ifndef SUPPORT_UTF8
80utf8 = utf8;  /* Avoid compiler warning */
81if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c);
82return 0;
83
84#else
85if (!utf8 || (c & 0xc0) != 0xc0)
86  {
87  if (PRINTABLE(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c);
88  return 0;
89  }
90else
91  {
92  int i;
93  int a = _pcre_utf8_table4[c & 0x3f];  /* Number of additional bytes */
94  int s = 6*a;
95  c = (c & _pcre_utf8_table3[a]) << s;
96  for (i = 1; i <= a; i++)
97    {
98    /* This is a check for malformed UTF-8; it should only occur if the sanity
99    check has been turned off. Rather than swallow random bytes, just stop if
100    we hit a bad one. Print it with \X instead of \x as an indication. */
101
102    if ((ptr[i] & 0xc0) != 0x80)
103      {
104      fprintf(f, "\\X{%x}", c);
105      return i - 1;
106      }
107
108    /* The byte is OK */
109
110    s -= 6;
111    c |= (ptr[i] & 0x3f) << s;
112    }
113  if (c < 128) fprintf(f, "\\x%02x", c); else fprintf(f, "\\x{%x}", c);
114  return a;
115  }
116#endif
117}
118
119
120
121/*************************************************
122*          Find Unicode property name            *
123*************************************************/
124
/* Return the name of a Unicode property.

Arguments:
  ptype     the property type
  pvalue    the property value

Returns:    a pointer into _pcre_utt_names for the entry of _pcre_utt whose
            type and value both match, searching from the last entry towards
            the first; "??" if there is no such entry, or always "??" when
            compiled without SUPPORT_UCP */

static const char *
get_ucpname(int ptype, int pvalue)
{
#ifdef SUPPORT_UCP
int i;
for (i = _pcre_utt_size - 1; i >= 0; i--)
  {
  if (ptype == _pcre_utt[i].type && pvalue == _pcre_utt[i].value)
    return _pcre_utt_names + _pcre_utt[i].name_offset;
  }
return "??";
#else
/* The arguments are not needed in this build. Casting them to void avoids
"unused" warnings without doing any arithmetic on them. */
(void)ptype;
(void)pvalue;
return "??";
#endif
}
141
142
143
144/*************************************************
145*         Print compiled regex                   *
146*************************************************/
147
148/* Make this function work for a regex with integers either byte order.
149However, we assume that what we are passed is a compiled regex. The
150print_lengths flag controls whether offsets and lengths of items are printed.
151They can be turned off from pcretest so that automatic tests on bytecode can be
152written that do not depend on the value of LINK_SIZE. */
153
154static void
155pcre_printint(pcre *external_re, FILE *f, BOOL print_lengths)
156{
157real_pcre *re = (real_pcre *)external_re;
158uschar *codestart, *code;
159BOOL utf8;
160
161unsigned int options = re->options;
162int offset = re->name_table_offset;
163int count = re->name_count;
164int size = re->name_entry_size;
165
166if (re->magic_number != MAGIC_NUMBER)
167  {
168  offset = ((offset << 8) & 0xff00) | ((offset >> 8) & 0xff);
169  count = ((count << 8) & 0xff00) | ((count >> 8) & 0xff);
170  size = ((size << 8) & 0xff00) | ((size >> 8) & 0xff);
171  options = ((options << 24) & 0xff000000) |
172            ((options <<  8) & 0x00ff0000) |
173            ((options >>  8) & 0x0000ff00) |
174            ((options >> 24) & 0x000000ff);
175  }
176
177code = codestart = (uschar *)re + offset + count * size;
178utf8 = (options & PCRE_UTF8) != 0;
179
180for(;;)
181  {
182  uschar *ccode;
183  int c;
184  int extra = 0;
185
186  if (print_lengths)
187    fprintf(f, "%3d ", (int)(code - codestart));
188  else
189    fprintf(f, "    ");
190
191  switch(*code)
192    {
193/* ========================================================================== */
194      /* These cases are never obeyed. This is a fudge that causes a compile-
195      time error if the vectors OP_names or _pcre_OP_lengths, which are indexed
196      by opcode, are not the correct length. It seems to be the only way to do
197      such a check at compile time, as the sizeof() operator does not work in
198      the C preprocessor. We do this while compiling pcretest, because that
199      #includes pcre_tables.c, which holds _pcre_OP_lengths. We can't do this
200      when building pcre_compile.c with PCRE_DEBUG set, because it doesn't then
201      know the size of _pcre_OP_lengths. */
202
203#ifdef COMPILING_PCRETEST
204      case OP_TABLE_LENGTH:
205      case OP_TABLE_LENGTH +
206        ((sizeof(OP_names)/sizeof(const char *) == OP_TABLE_LENGTH) &&
207        (sizeof(_pcre_OP_lengths) == OP_TABLE_LENGTH)):
208      break;
209#endif
210/* ========================================================================== */
211
212    case OP_END:
213    fprintf(f, "    %s\n", OP_names[*code]);
214    fprintf(f, "------------------------------------------------------------------\n");
215    return;
216
217    case OP_OPT:
218    fprintf(f, " %.2x %s", code[1], OP_names[*code]);
219    break;
220
221    case OP_CHAR:
222    fprintf(f, "    ");
223    do
224      {
225      code++;
226      code += 1 + print_char(f, code, utf8);
227      }
228    while (*code == OP_CHAR);
229    fprintf(f, "\n");
230    continue;
231
232    case OP_CHARNC:
233    fprintf(f, " NC ");
234    do
235      {
236      code++;
237      code += 1 + print_char(f, code, utf8);
238      }
239    while (*code == OP_CHARNC);
240    fprintf(f, "\n");
241    continue;
242
243    case OP_CBRA:
244    case OP_SCBRA:
245    if (print_lengths) fprintf(f, "%3d ", GET(code, 1));
246      else fprintf(f, "    ");
247    fprintf(f, "%s %d", OP_names[*code], GET2(code, 1+LINK_SIZE));
248    break;
249
250    case OP_BRA:
251    case OP_SBRA:
252    case OP_KETRMAX:
253    case OP_KETRMIN:
254    case OP_ALT:
255    case OP_KET:
256    case OP_ASSERT:
257    case OP_ASSERT_NOT:
258    case OP_ASSERTBACK:
259    case OP_ASSERTBACK_NOT:
260    case OP_ONCE:
261    case OP_COND:
262    case OP_SCOND:
263    case OP_REVERSE:
264    if (print_lengths) fprintf(f, "%3d ", GET(code, 1));
265      else fprintf(f, "    ");
266    fprintf(f, "%s", OP_names[*code]);
267    break;
268
269    case OP_CLOSE:
270    fprintf(f, "    %s %d", OP_names[*code], GET2(code, 1));
271    break;
272
273    case OP_CREF:
274    case OP_NCREF:
275    fprintf(f, "%3d %s", GET2(code,1), OP_names[*code]);
276    break;
277
278    case OP_RREF:
279    c = GET2(code, 1);
280    if (c == RREF_ANY)
281      fprintf(f, "    Cond recurse any");
282    else
283      fprintf(f, "    Cond recurse %d", c);
284    break;
285
286    case OP_NRREF:
287    c = GET2(code, 1);
288    if (c == RREF_ANY)
289      fprintf(f, "    Cond nrecurse any");
290    else
291      fprintf(f, "    Cond nrecurse %d", c);
292    break;
293
294    case OP_DEF:
295    fprintf(f, "    Cond def");
296    break;
297
298    case OP_STAR:
299    case OP_MINSTAR:
300    case OP_POSSTAR:
301    case OP_PLUS:
302    case OP_MINPLUS:
303    case OP_POSPLUS:
304    case OP_QUERY:
305    case OP_MINQUERY:
306    case OP_POSQUERY:
307    case OP_TYPESTAR:
308    case OP_TYPEMINSTAR:
309    case OP_TYPEPOSSTAR:
310    case OP_TYPEPLUS:
311    case OP_TYPEMINPLUS:
312    case OP_TYPEPOSPLUS:
313    case OP_TYPEQUERY:
314    case OP_TYPEMINQUERY:
315    case OP_TYPEPOSQUERY:
316    fprintf(f, "    ");
317    if (*code >= OP_TYPESTAR)
318      {
319      fprintf(f, "%s", OP_names[code[1]]);
320      if (code[1] == OP_PROP || code[1] == OP_NOTPROP)
321        {
322        fprintf(f, " %s ", get_ucpname(code[2], code[3]));
323        extra = 2;
324        }
325      }
326    else extra = print_char(f, code+1, utf8);
327    fprintf(f, "%s", OP_names[*code]);
328    break;
329
330    case OP_EXACT:
331    case OP_UPTO:
332    case OP_MINUPTO:
333    case OP_POSUPTO:
334    fprintf(f, "    ");
335    extra = print_char(f, code+3, utf8);
336    fprintf(f, "{");
337    if (*code != OP_EXACT) fprintf(f, "0,");
338    fprintf(f, "%d}", GET2(code,1));
339    if (*code == OP_MINUPTO) fprintf(f, "?");
340      else if (*code == OP_POSUPTO) fprintf(f, "+");
341    break;
342
343    case OP_TYPEEXACT:
344    case OP_TYPEUPTO:
345    case OP_TYPEMINUPTO:
346    case OP_TYPEPOSUPTO:
347    fprintf(f, "    %s", OP_names[code[3]]);
348    if (code[3] == OP_PROP || code[3] == OP_NOTPROP)
349      {
350      fprintf(f, " %s ", get_ucpname(code[4], code[5]));
351      extra = 2;
352      }
353    fprintf(f, "{");
354    if (*code != OP_TYPEEXACT) fprintf(f, "0,");
355    fprintf(f, "%d}", GET2(code,1));
356    if (*code == OP_TYPEMINUPTO) fprintf(f, "?");
357      else if (*code == OP_TYPEPOSUPTO) fprintf(f, "+");
358    break;
359
360    case OP_NOT:
361    c = code[1];
362    if (PRINTABLE(c)) fprintf(f, "    [^%c]", c);
363      else fprintf(f, "    [^\\x%02x]", c);
364    break;
365
366    case OP_NOTSTAR:
367    case OP_NOTMINSTAR:
368    case OP_NOTPOSSTAR:
369    case OP_NOTPLUS:
370    case OP_NOTMINPLUS:
371    case OP_NOTPOSPLUS:
372    case OP_NOTQUERY:
373    case OP_NOTMINQUERY:
374    case OP_NOTPOSQUERY:
375    c = code[1];
376    if (PRINTABLE(c)) fprintf(f, "    [^%c]", c);
377      else fprintf(f, "    [^\\x%02x]", c);
378    fprintf(f, "%s", OP_names[*code]);
379    break;
380
381    case OP_NOTEXACT:
382    case OP_NOTUPTO:
383    case OP_NOTMINUPTO:
384    case OP_NOTPOSUPTO:
385    c = code[3];
386    if (PRINTABLE(c)) fprintf(f, "    [^%c]{", c);
387      else fprintf(f, "    [^\\x%02x]{", c);
388    if (*code != OP_NOTEXACT) fprintf(f, "0,");
389    fprintf(f, "%d}", GET2(code,1));
390    if (*code == OP_NOTMINUPTO) fprintf(f, "?");
391      else if (*code == OP_NOTPOSUPTO) fprintf(f, "+");
392    break;
393
394    case OP_RECURSE:
395    if (print_lengths) fprintf(f, "%3d ", GET(code, 1));
396      else fprintf(f, "    ");
397    fprintf(f, "%s", OP_names[*code]);
398    break;
399
400    case OP_REF:
401    fprintf(f, "    \\%d", GET2(code,1));
402    ccode = code + _pcre_OP_lengths[*code];
403    goto CLASS_REF_REPEAT;
404
405    case OP_CALLOUT:
406    fprintf(f, "    %s %d %d %d", OP_names[*code], code[1], GET(code,2),
407      GET(code, 2 + LINK_SIZE));
408    break;
409
410    case OP_PROP:
411    case OP_NOTPROP:
412    fprintf(f, "    %s %s", OP_names[*code], get_ucpname(code[1], code[2]));
413    break;
414
415    /* OP_XCLASS can only occur in UTF-8 mode. However, there's no harm in
416    having this code always here, and it makes it less messy without all those
417    #ifdefs. */
418
419    case OP_CLASS:
420    case OP_NCLASS:
421    case OP_XCLASS:
422      {
423      int i, min, max;
424      BOOL printmap;
425
426      fprintf(f, "    [");
427
428      if (*code == OP_XCLASS)
429        {
430        extra = GET(code, 1);
431        ccode = code + LINK_SIZE + 1;
432        printmap = (*ccode & XCL_MAP) != 0;
433        if ((*ccode++ & XCL_NOT) != 0) fprintf(f, "^");
434        }
435      else
436        {
437        printmap = TRUE;
438        ccode = code + 1;
439        }
440
441      /* Print a bit map */
442
443      if (printmap)
444        {
445        for (i = 0; i < 256; i++)
446          {
447          if ((ccode[i/8] & (1 << (i&7))) != 0)
448            {
449            int j;
450            for (j = i+1; j < 256; j++)
451              if ((ccode[j/8] & (1 << (j&7))) == 0) break;
452            if (i == '-' || i == ']') fprintf(f, "\\");
453            if (PRINTABLE(i)) fprintf(f, "%c", i);
454              else fprintf(f, "\\x%02x", i);
455            if (--j > i)
456              {
457              if (j != i + 1) fprintf(f, "-");
458              if (j == '-' || j == ']') fprintf(f, "\\");
459              if (PRINTABLE(j)) fprintf(f, "%c", j);
460                else fprintf(f, "\\x%02x", j);
461              }
462            i = j;
463            }
464          }
465        ccode += 32;
466        }
467
468      /* For an XCLASS there is always some additional data */
469
470      if (*code == OP_XCLASS)
471        {
472        int ch;
473        while ((ch = *ccode++) != XCL_END)
474          {
475          if (ch == XCL_PROP)
476            {
477            int ptype = *ccode++;
478            int pvalue = *ccode++;
479            fprintf(f, "\\p{%s}", get_ucpname(ptype, pvalue));
480            }
481          else if (ch == XCL_NOTPROP)
482            {
483            int ptype = *ccode++;
484            int pvalue = *ccode++;
485            fprintf(f, "\\P{%s}", get_ucpname(ptype, pvalue));
486            }
487          else
488            {
489            ccode += 1 + print_char(f, ccode, TRUE);
490            if (ch == XCL_RANGE)
491              {
492              fprintf(f, "-");
493              ccode += 1 + print_char(f, ccode, TRUE);
494              }
495            }
496          }
497        }
498
499      /* Indicate a non-UTF8 class which was created by negation */
500
501      fprintf(f, "]%s", (*code == OP_NCLASS)? " (neg)" : "");
502
503      /* Handle repeats after a class or a back reference */
504
505      CLASS_REF_REPEAT:
506      switch(*ccode)
507        {
508        case OP_CRSTAR:
509        case OP_CRMINSTAR:
510        case OP_CRPLUS:
511        case OP_CRMINPLUS:
512        case OP_CRQUERY:
513        case OP_CRMINQUERY:
514        fprintf(f, "%s", OP_names[*ccode]);
515        extra += _pcre_OP_lengths[*ccode];
516        break;
517
518        case OP_CRRANGE:
519        case OP_CRMINRANGE:
520        min = GET2(ccode,1);
521        max = GET2(ccode,3);
522        if (max == 0) fprintf(f, "{%d,}", min);
523        else fprintf(f, "{%d,%d}", min, max);
524        if (*ccode == OP_CRMINRANGE) fprintf(f, "?");
525        extra += _pcre_OP_lengths[*ccode];
526        break;
527
528        /* Do nothing if it's not a repeat; this code stops picky compilers
529        warning about the lack of a default code path. */
530
531        default:
532        break;
533        }
534      }
535    break;
536
537    /* Anything else is just an item with no data*/
538
539    default:
540    fprintf(f, "    %s", OP_names[*code]);
541    break;
542    }
543
544  code += _pcre_OP_lengths[*code] + extra;
545  fprintf(f, "\n");
546  }
547}
548
549/* End of pcre_printint.src */
550