1/* $OpenLDAP$ */
2/* This work is part of OpenLDAP Software <http://www.openldap.org/>.
3 *
4 * Copyright 1998-2011 The OpenLDAP Foundation.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted only as authorized by the OpenLDAP
9 * Public License.
10 *
11 * A copy of this license is available in file LICENSE in the
12 * top-level directory of the distribution or, alternatively, at
13 * <http://www.OpenLDAP.org/license.html>.
14 */
15/* Copyright 1997, 1998, 1999 Computing Research Labs,
16 * New Mexico State University
17 *
18 * Permission is hereby granted, free of charge, to any person obtaining a
19 * copy of this software and associated documentation files (the "Software"),
20 * to deal in the Software without restriction, including without limitation
21 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
22 * and/or sell copies of the Software, and to permit persons to whom the
23 * Software is furnished to do so, subject to the following conditions:
24 *
25 * The above copyright notice and this permission notice shall be included in
26 * all copies or substantial portions of the Software.
27 *
28 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
29 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
30 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
31 * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
32 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
33 * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
34 * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
35 */
36/* $Id: ure.c,v 1.2 1999/09/21 15:47:43 mleisher Exp $" */
37
38#include "portable.h"
39
40#include <ac/stdlib.h>
41#include <ac/string.h>
42#include <ac/unistd.h>
43
44#include "ure.h"
45
46/*
47 * Flags used internally in the DFA.
48 */
49#define _URE_DFA_CASEFOLD  0x01
50#define _URE_DFA_BLANKLINE 0x02
51
52static unsigned long cclass_flags[] = {
53    0,
54    _URE_NONSPACING,
55    _URE_COMBINING,
56    _URE_NUMDIGIT,
57    _URE_NUMOTHER,
58    _URE_SPACESEP,
59    _URE_LINESEP,
60    _URE_PARASEP,
61    _URE_CNTRL,
62    _URE_PUA,
63    _URE_UPPER,
64    _URE_LOWER,
65    _URE_TITLE,
66    _URE_MODIFIER,
67    _URE_OTHERLETTER,
68    _URE_DASHPUNCT,
69    _URE_OPENPUNCT,
70    _URE_CLOSEPUNCT,
71    _URE_OTHERPUNCT,
72    _URE_MATHSYM,
73    _URE_CURRENCYSYM,
74    _URE_OTHERSYM,
75    _URE_LTR,
76    _URE_RTL,
77    _URE_EURONUM,
78    _URE_EURONUMSEP,
79    _URE_EURONUMTERM,
80    _URE_ARABNUM,
81    _URE_COMMONSEP,
82    _URE_BLOCKSEP,
83    _URE_SEGMENTSEP,
84    _URE_WHITESPACE,
85    _URE_OTHERNEUT,
86};
87
88/*
89 * Symbol types for the DFA.
90 */
91#define _URE_ANY_CHAR   1
92#define _URE_CHAR       2
93#define _URE_CCLASS     3
94#define _URE_NCCLASS    4
95#define _URE_BOL_ANCHOR 5
96#define _URE_EOL_ANCHOR 6
97
98/*
99 * Op codes for converting the NFA to a DFA.
100 */
101#define _URE_SYMBOL     10
102#define _URE_PAREN      11
103#define _URE_QUEST      12
104#define _URE_STAR       13
105#define _URE_PLUS       14
106#define _URE_ONE        15
107#define _URE_AND        16
108#define _URE_OR         17
109
110#define _URE_NOOP       0xffff
111
112#define _URE_REGSTART 0x8000
113#define _URE_REGEND   0x4000
114
115/*
116 * Structure used to handle a compacted range of characters.
117 */
118typedef struct {
119    ucs4_t min_code;
120    ucs4_t max_code;
121} _ure_range_t;
122
123typedef struct {
124    _ure_range_t *ranges;
125    ucs2_t ranges_used;
126    ucs2_t ranges_size;
127} _ure_ccl_t;
128
129typedef union {
130    ucs4_t chr;
131    _ure_ccl_t ccl;
132} _ure_sym_t;
133
134/*
135 * This is a general element structure used for expressions and stack
136 * elements.
137 */
138typedef struct {
139    ucs2_t reg;
140    ucs2_t onstack;
141    ucs2_t type;
142    ucs2_t lhs;
143    ucs2_t rhs;
144} _ure_elt_t;
145
146/*
147 * This is a structure used to track a list or a stack of states.
148 */
149typedef struct {
150    ucs2_t *slist;
151    ucs2_t slist_size;
152    ucs2_t slist_used;
153} _ure_stlist_t;
154
155/*
156 * Structure to track the list of unique states for a symbol
157 * during reduction.
158 */
159typedef struct {
160    ucs2_t id;
161    ucs2_t type;
162    unsigned long mods;
163    unsigned long props;
164    _ure_sym_t sym;
165    _ure_stlist_t states;
166} _ure_symtab_t;
167
168/*
169 * Structure to hold a single state.
170 */
171typedef struct {
172    ucs2_t id;
173    ucs2_t accepting;
174    ucs2_t pad;
175    _ure_stlist_t st;
176    _ure_elt_t *trans;
177    ucs2_t trans_size;
178    ucs2_t trans_used;
179} _ure_state_t;
180
181/*
182 * Structure used for keeping lists of states.
183 */
184typedef struct {
185    _ure_state_t *states;
186    ucs2_t states_size;
187    ucs2_t states_used;
188} _ure_statetable_t;
189
190/*
191 * Structure to track pairs of DFA states when equivalent states are
192 * merged.
193 */
194typedef struct {
195    ucs2_t l;
196    ucs2_t r;
197} _ure_equiv_t;
198
199/*
200 * Structure used for constructing the NFA and reducing to a minimal DFA.
201 */
202typedef struct _ure_buffer_t {
203    int reducing;
204    int error;
205    unsigned long flags;
206
207    _ure_stlist_t stack;
208
209    /*
210     * Table of unique symbols encountered.
211     */
212    _ure_symtab_t *symtab;
213    ucs2_t symtab_size;
214    ucs2_t symtab_used;
215
216    /*
217     * Tracks the unique expressions generated for the NFA and when the NFA is
218     * reduced.
219     */
220    _ure_elt_t *expr;
221    ucs2_t expr_used;
222    ucs2_t expr_size;
223
224    /*
225     * The reduced table of unique groups of NFA states.
226     */
227    _ure_statetable_t states;
228
229    /*
230     * Tracks states when equivalent states are merged.
231     */
232    _ure_equiv_t *equiv;
233    ucs2_t equiv_used;
234    ucs2_t equiv_size;
235} _ure_buffer_t;
236
237typedef struct {
238    ucs2_t symbol;
239    ucs2_t next_state;
240} _ure_trans_t;
241
242typedef struct {
243    ucs2_t accepting;
244    ucs2_t ntrans;
245    _ure_trans_t *trans;
246} _ure_dstate_t;
247
248typedef struct _ure_dfa_t {
249    unsigned long flags;
250
251    _ure_symtab_t *syms;
252    ucs2_t nsyms;
253
254    _ure_dstate_t *states;
255    ucs2_t nstates;
256
257    _ure_trans_t *trans;
258    ucs2_t ntrans;
259} _ure_dfa_t;
260
261/*************************************************************************
262 *
263 * Functions.
264 *
265 *************************************************************************/
266
267static void
268_ure_memmove(char *dest, char *src, unsigned long bytes)
269{
270    long i, j;
271
272    i = (long) bytes;
273    j = i & 7;
274    i = (i + 7) >> 3;
275
276    /*
277     * Do a memmove using Ye Olde Duff's Device for efficiency.
278     */
279    if (src < dest) {
280        src += bytes;
281        dest += bytes;
282
283        switch (j) {
284          case 0: do {
285              *--dest = *--src;
286            case 7: *--dest = *--src;
287            case 6: *--dest = *--src;
288            case 5: *--dest = *--src;
289            case 4: *--dest = *--src;
290            case 3: *--dest = *--src;
291            case 2: *--dest = *--src;
292            case 1: *--dest = *--src;
293          } while (--i > 0);
294        }
295    } else if (src > dest) {
296        switch (j) {
297          case 0: do {
298              *dest++ = *src++;
299            case 7: *dest++ = *src++;
300            case 6: *dest++ = *src++;
301            case 5: *dest++ = *src++;
302            case 4: *dest++ = *src++;
303            case 3: *dest++ = *src++;
304            case 2: *dest++ = *src++;
305            case 1: *dest++ = *src++;
306          } while (--i > 0);
307        }
308    }
309}
310
311static void
312_ure_push(ucs2_t v, _ure_buffer_t *b)
313{
314    _ure_stlist_t *s;
315
316    if (b == 0)
317      return;
318
319    /*
320     * If the `reducing' parameter is non-zero, check to see if the value
321     * passed is already on the stack.
322     */
323    if (b->reducing != 0 && b->expr[v].onstack != 0)
324      return;
325
326    s = &b->stack;
327    if (s->slist_used == s->slist_size) {
328        if (s->slist_size == 0)
329          s->slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
330        else
331          s->slist = (ucs2_t *) realloc((char *) s->slist,
332                                        sizeof(ucs2_t) * (s->slist_size + 8));
333        s->slist_size += 8;
334    }
335    s->slist[s->slist_used++] = v;
336
337    /*
338     * If the `reducing' parameter is non-zero, flag the element as being on
339     * the stack.
340     */
341    if (b->reducing != 0)
342      b->expr[v].onstack = 1;
343}
344
345static ucs2_t
346_ure_peek(_ure_buffer_t *b)
347{
348    if (b == 0 || b->stack.slist_used == 0)
349      return _URE_NOOP;
350
351    return b->stack.slist[b->stack.slist_used - 1];
352}
353
354static ucs2_t
355_ure_pop(_ure_buffer_t *b)
356{
357    ucs2_t v;
358
359    if (b == 0 || b->stack.slist_used == 0)
360      return _URE_NOOP;
361
362    v = b->stack.slist[--b->stack.slist_used];
363    if (b->reducing)
364      b->expr[v].onstack = 0;
365
366    return v;
367}
368
369/*************************************************************************
370 *
371 * Start symbol parse functions.
372 *
373 *************************************************************************/
374
375/*
376 * Parse a comma-separated list of integers that represent character
377 * properties.  Combine them into a mask that is returned in the `mask'
378 * variable, and return the number of characters consumed.
379 */
380static unsigned long
381_ure_prop_list(ucs2_t *pp, unsigned long limit, unsigned long *mask,
382               _ure_buffer_t *b)
383{
384    unsigned long n, m;
385    ucs2_t *sp, *ep;
386
387    sp = pp;
388    ep = sp + limit;
389
390    for (m = n = 0; b->error == _URE_OK && sp < ep; sp++) {
391        if (*sp == ',') {
392            /*
393             * Encountered a comma, so select the next character property flag
394             * and reset the number.
395             */
396            m |= cclass_flags[n];
397            n = 0;
398        } else if (*sp >= '0' && *sp <= '9')
399          /*
400           * Encountered a digit, so start or continue building the cardinal
401           * that represents the character property flag.
402           */
403          n = (n * 10) + (*sp - '0');
404        else
405          /*
406           * Encountered something that is not part of the property list.
407           * Indicate that we are done.
408           */
409          break;
410
411        /*
412         * If a property number greater than 32 occurs, then there is a
413         * problem.  Most likely a missing comma separator.
414         */
415        if (n > 32)
416          b->error = _URE_INVALID_PROPERTY;
417    }
418
419    if (n != 0)
420      m |= cclass_flags[n];
421
422    /*
423     * Set the mask that represents the group of character properties.
424     */
425    *mask = m;
426
427    /*
428     * Return the number of characters consumed.
429     */
430    return sp - pp;
431}
432
433/*
434 * Collect a hex number with 1 to 4 digits and return the number
435 * of characters used.
436 */
437static unsigned long
438_ure_hex(ucs2_t *np, unsigned long limit, ucs4_t *n)
439{
440    ucs2_t i;
441    ucs2_t *sp, *ep;
442    ucs4_t nn;
443
444    sp = np;
445    ep = sp + limit;
446
447    for (nn = 0, i = 0; i < 4 && sp < ep; i++, sp++) {
448        if (*sp >= '0' && *sp <= '9')
449          nn = (nn << 4) + (*sp - '0');
450        else if (*sp >= 'A' && *sp <= 'F')
451          nn = (nn << 4) + ((*sp - 'A') + 10);
452        else if (*sp >= 'a' && *sp <= 'f')
453          nn = (nn << 4) + ((*sp - 'a') + 10);
454        else
455          /*
456           * Encountered something that is not a hex digit.
457           */
458          break;
459    }
460
461    /*
462     * Assign the character code collected and return the number of
463     * characters used.
464     */
465    *n = nn;
466
467    return sp - np;
468}
469
470/*
471 * Insert a range into a character class, removing duplicates and ordering
472 * them in increasing range-start order.
473 */
474static void
475_ure_add_range(_ure_ccl_t *ccl, _ure_range_t *r, _ure_buffer_t *b)
476{
477    ucs2_t i;
478    ucs4_t tmp;
479    _ure_range_t *rp;
480
481    /*
482     * If the `casefold' flag is set, then make sure both endpoints of the
483     * range are converted to lower case.
484     */
485    if (b->flags & _URE_DFA_CASEFOLD) {
486        r->min_code = _ure_tolower(r->min_code);
487        r->max_code = _ure_tolower(r->max_code);
488    }
489
490    /*
491     * Swap the range endpoints if they are not in increasing order.
492     */
493    if (r->min_code > r->max_code) {
494        tmp = r->min_code;
495        r->min_code = r->max_code;
496        r->max_code = tmp;
497    }
498
499    for (i = 0, rp = ccl->ranges;
500         i < ccl->ranges_used && r->min_code < rp->min_code; i++, rp++) ;
501
502    /*
503     * Check for a duplicate.
504     */
505    if (i < ccl->ranges_used &&
506        r->min_code == rp->min_code && r->max_code == rp->max_code)
507      return;
508
509    if (ccl->ranges_used == ccl->ranges_size) {
510        if (ccl->ranges_size == 0)
511          ccl->ranges = (_ure_range_t *) malloc(sizeof(_ure_range_t) << 3);
512        else
513          ccl->ranges = (_ure_range_t *)
514              realloc((char *) ccl->ranges,
515                      sizeof(_ure_range_t) * (ccl->ranges_size + 8));
516        ccl->ranges_size += 8;
517    }
518
519    rp = ccl->ranges + ccl->ranges_used;
520
521    if (i < ccl->ranges_used)
522      _ure_memmove((char *) (rp + 1), (char *) rp,
523                   sizeof(_ure_range_t) * (ccl->ranges_used - i));
524
525    ccl->ranges_used++;
526    rp->min_code = r->min_code;
527    rp->max_code = r->max_code;
528}
529
530#define _URE_ALPHA_MASK  (_URE_UPPER|_URE_LOWER|_URE_OTHERLETTER|\
531_URE_MODIFIER|_URE_TITLE|_URE_NONSPACING|_URE_COMBINING)
532#define _URE_ALNUM_MASK  (_URE_ALPHA_MASK|_URE_NUMDIGIT)
533#define _URE_PUNCT_MASK  (_URE_DASHPUNCT|_URE_OPENPUNCT|_URE_CLOSEPUNCT|\
534_URE_OTHERPUNCT)
535#define _URE_GRAPH_MASK (_URE_NUMDIGIT|_URE_NUMOTHER|_URE_ALPHA_MASK|\
536_URE_MATHSYM|_URE_CURRENCYSYM|_URE_OTHERSYM)
537#define _URE_PRINT_MASK (_URE_GRAPH_MASK|_URE_SPACESEP)
538#define _URE_SPACE_MASK  (_URE_SPACESEP|_URE_LINESEP|_URE_PARASEP)
539
540typedef void (*_ure_cclsetup_t)(
541    _ure_symtab_t *sym,
542    unsigned long mask,
543    _ure_buffer_t *b
544);
545
546typedef struct {
547    ucs2_t key;
548    unsigned long len;
549    unsigned long next;
550    _ure_cclsetup_t func;
551    unsigned long mask;
552} _ure_trie_t;
553
554static void
555_ure_ccl_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
556{
557    sym->props |= mask;
558}
559
560static void
561_ure_space_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
562{
563    _ure_range_t range;
564
565    sym->props |= mask;
566
567    /*
568     * Add the additional characters needed for handling isspace().
569     */
570    range.min_code = range.max_code = '\t';
571    _ure_add_range(&sym->sym.ccl, &range, b);
572    range.min_code = range.max_code = '\r';
573    _ure_add_range(&sym->sym.ccl, &range, b);
574    range.min_code = range.max_code = '\n';
575    _ure_add_range(&sym->sym.ccl, &range, b);
576    range.min_code = range.max_code = '\f';
577    _ure_add_range(&sym->sym.ccl, &range, b);
578    range.min_code = range.max_code = 0xfeff;
579    _ure_add_range(&sym->sym.ccl, &range, b);
580}
581
582static void
583_ure_xdigit_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
584{
585    _ure_range_t range;
586
587    /*
588     * Add the additional characters needed for handling isxdigit().
589     */
590    range.min_code = '0';
591    range.max_code = '9';
592    _ure_add_range(&sym->sym.ccl, &range, b);
593    range.min_code = 'A';
594    range.max_code = 'F';
595    _ure_add_range(&sym->sym.ccl, &range, b);
596    range.min_code = 'a';
597    range.max_code = 'f';
598    _ure_add_range(&sym->sym.ccl, &range, b);
599}
600
601static _ure_trie_t cclass_trie[] = {
602    {0x003a, 1, 1, 0, 0},
603    {0x0061, 9, 10, 0, 0},
604    {0x0063, 8, 19, 0, 0},
605    {0x0064, 7, 24, 0, 0},
606    {0x0067, 6, 29, 0, 0},
607    {0x006c, 5, 34, 0, 0},
608    {0x0070, 4, 39, 0, 0},
609    {0x0073, 3, 49, 0, 0},
610    {0x0075, 2, 54, 0, 0},
611    {0x0078, 1, 59, 0, 0},
612    {0x006c, 1, 11, 0, 0},
613    {0x006e, 2, 13, 0, 0},
614    {0x0070, 1, 16, 0, 0},
615    {0x0075, 1, 14, 0, 0},
616    {0x006d, 1, 15, 0, 0},
617    {0x003a, 1, 16, _ure_ccl_setup, _URE_ALNUM_MASK},
618    {0x0068, 1, 17, 0, 0},
619    {0x0061, 1, 18, 0, 0},
620    {0x003a, 1, 19, _ure_ccl_setup, _URE_ALPHA_MASK},
621    {0x006e, 1, 20, 0, 0},
622    {0x0074, 1, 21, 0, 0},
623    {0x0072, 1, 22, 0, 0},
624    {0x006c, 1, 23, 0, 0},
625    {0x003a, 1, 24, _ure_ccl_setup, _URE_CNTRL},
626    {0x0069, 1, 25, 0, 0},
627    {0x0067, 1, 26, 0, 0},
628    {0x0069, 1, 27, 0, 0},
629    {0x0074, 1, 28, 0, 0},
630    {0x003a, 1, 29, _ure_ccl_setup, _URE_NUMDIGIT},
631    {0x0072, 1, 30, 0, 0},
632    {0x0061, 1, 31, 0, 0},
633    {0x0070, 1, 32, 0, 0},
634    {0x0068, 1, 33, 0, 0},
635    {0x003a, 1, 34, _ure_ccl_setup, _URE_GRAPH_MASK},
636    {0x006f, 1, 35, 0, 0},
637    {0x0077, 1, 36, 0, 0},
638    {0x0065, 1, 37, 0, 0},
639    {0x0072, 1, 38, 0, 0},
640    {0x003a, 1, 39, _ure_ccl_setup, _URE_LOWER},
641    {0x0072, 2, 41, 0, 0},
642    {0x0075, 1, 45, 0, 0},
643    {0x0069, 1, 42, 0, 0},
644    {0x006e, 1, 43, 0, 0},
645    {0x0074, 1, 44, 0, 0},
646    {0x003a, 1, 45, _ure_ccl_setup, _URE_PRINT_MASK},
647    {0x006e, 1, 46, 0, 0},
648    {0x0063, 1, 47, 0, 0},
649    {0x0074, 1, 48, 0, 0},
650    {0x003a, 1, 49, _ure_ccl_setup, _URE_PUNCT_MASK},
651    {0x0070, 1, 50, 0, 0},
652    {0x0061, 1, 51, 0, 0},
653    {0x0063, 1, 52, 0, 0},
654    {0x0065, 1, 53, 0, 0},
655    {0x003a, 1, 54, _ure_space_setup, _URE_SPACE_MASK},
656    {0x0070, 1, 55, 0, 0},
657    {0x0070, 1, 56, 0, 0},
658    {0x0065, 1, 57, 0, 0},
659    {0x0072, 1, 58, 0, 0},
660    {0x003a, 1, 59, _ure_ccl_setup, _URE_UPPER},
661    {0x0064, 1, 60, 0, 0},
662    {0x0069, 1, 61, 0, 0},
663    {0x0067, 1, 62, 0, 0},
664    {0x0069, 1, 63, 0, 0},
665    {0x0074, 1, 64, 0, 0},
666    {0x003a, 1, 65, _ure_xdigit_setup, 0},
667};
668
669/*
670 * Probe for one of the POSIX colon delimited character classes in the static
671 * trie.
672 */
673static unsigned long
674_ure_posix_ccl(ucs2_t *cp, unsigned long limit, _ure_symtab_t *sym,
675               _ure_buffer_t *b)
676{
677    int i;
678    unsigned long n;
679    _ure_trie_t *tp;
680    ucs2_t *sp, *ep;
681
682    /*
683     * If the number of characters left is less than 7, then this cannot be
684     * interpreted as one of the colon delimited classes.
685     */
686    if (limit < 7)
687      return 0;
688
689    sp = cp;
690    ep = sp + limit;
691    tp = cclass_trie;
692    for (i = 0; sp < ep && i < 8; i++, sp++) {
693        n = tp->len;
694
695        for (; n > 0 && tp->key != *sp; tp++, n--) ;
696
697        if (n == 0)
698          return 0;
699
700        if (*sp == ':' && (i == 6 || i == 7)) {
701            sp++;
702            break;
703        }
704        if (sp + 1 < ep)
705          tp = cclass_trie + tp->next;
706    }
707    if (tp->func == 0)
708      return 0;
709
710    (*tp->func)(sym, tp->mask, b);
711
712    return sp - cp;
713}
714
715/*
716 * Construct a list of ranges and return the number of characters consumed.
717 */
718static unsigned long
719_ure_cclass(ucs2_t *cp, unsigned long limit, _ure_symtab_t *symp,
720            _ure_buffer_t *b)
721{
722    int range_end;
723    unsigned long n;
724    ucs2_t *sp, *ep;
725    ucs4_t c, last;
726    _ure_ccl_t *cclp;
727    _ure_range_t range;
728
729    sp = cp;
730    ep = sp + limit;
731
732    if (*sp == '^') {
733      symp->type = _URE_NCCLASS;
734      sp++;
735    } else
736      symp->type = _URE_CCLASS;
737
738    for (last = 0, range_end = 0;
739         b->error == _URE_OK && sp < ep && *sp != ']'; ) {
740        c = *sp++;
741        if (c == '\\') {
742            if (sp == ep) {
743                /*
744                 * The EOS was encountered when expecting the reverse solidus
745                 * to be followed by the character it is escaping.  Set an
746                 * error code and return the number of characters consumed up
747                 * to this point.
748                 */
749                b->error = _URE_UNEXPECTED_EOS;
750                return sp - cp;
751            }
752
753            c = *sp++;
754            switch (c) {
755              case 'a':
756                c = 0x07;
757                break;
758              case 'b':
759                c = 0x08;
760                break;
761              case 'f':
762                c = 0x0c;
763                break;
764              case 'n':
765                c = 0x0a;
766                break;
767              case 'r':
768                c = 0x0d;
769                break;
770              case 't':
771                c = 0x09;
772                break;
773              case 'v':
774                c = 0x0b;
775                break;
776              case 'p':
777              case 'P':
778                sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
779                /*
780                 * Invert the bit mask of the properties if this is a negated
781                 * character class or if 'P' is used to specify a list of
782                 * character properties that should *not* match in a
783                 * character class.
784                 */
785                if (c == 'P')
786                  symp->props = ~symp->props;
787                continue;
788                break;
789              case 'x':
790              case 'X':
791              case 'u':
792              case 'U':
793                if (sp < ep &&
794                    ((*sp >= '0' && *sp <= '9') ||
795                     (*sp >= 'A' && *sp <= 'F') ||
796                     (*sp >= 'a' && *sp <= 'f')))
797                  sp += _ure_hex(sp, ep - sp, &c);
798            }
799        } else if (c == ':') {
800            /*
801             * Probe for a POSIX colon delimited character class.
802             */
803            sp--;
804            if ((n = _ure_posix_ccl(sp, ep - sp, symp, b)) == 0)
805              sp++;
806            else {
807                sp += n;
808                continue;
809            }
810        }
811
812        cclp = &symp->sym.ccl;
813
814        /*
815         * Check to see if the current character is a low surrogate that needs
816         * to be combined with a preceding high surrogate.
817         */
818        if (last != 0) {
819            if (c >= 0xdc00 && c <= 0xdfff)
820              /*
821               * Construct the UTF16 character code.
822               */
823              c = 0x10000 + (((last & 0x03ff) << 10) | (c & 0x03ff));
824            else {
825                /*
826                 * Add the isolated high surrogate to the range.
827                 */
828                if (range_end == 1)
829                  range.max_code = last & 0xffff;
830                else
831                  range.min_code = range.max_code = last & 0xffff;
832
833                _ure_add_range(cclp, &range, b);
834                range_end = 0;
835            }
836        }
837
838        /*
839         * Clear the last character code.
840         */
841        last = 0;
842
843        /*
844         * This slightly awkward code handles the different cases needed to
845         * construct a range.
846         */
847        if (c >= 0xd800 && c <= 0xdbff) {
848            /*
849             * If the high surrogate is followed by a range indicator, simply
850             * add it as the range start.  Otherwise, save it in case the next
851             * character is a low surrogate.
852             */
853            if (*sp == '-') {
854                sp++;
855                range.min_code = c;
856                range_end = 1;
857            } else
858              last = c;
859        } else if (range_end == 1) {
860            range.max_code = c;
861            _ure_add_range(cclp, &range, b);
862            range_end = 0;
863        } else {
864            range.min_code = range.max_code = c;
865            if (*sp == '-') {
866                sp++;
867                range_end = 1;
868            } else
869              _ure_add_range(cclp, &range, b);
870        }
871    }
872
873    if (sp < ep && *sp == ']')
874      sp++;
875    else
876      /*
877       * The parse was not terminated by the character class close symbol
878       * (']'), so set an error code.
879       */
880      b->error = _URE_CCLASS_OPEN;
881
882    return sp - cp;
883}
884
885/*
886 * Probe for a low surrogate hex code.
887 */
888static unsigned long
889_ure_probe_ls(ucs2_t *ls, unsigned long limit, ucs4_t *c)
890{
891    ucs4_t i, code;
892    ucs2_t *sp, *ep;
893
894    for (i = code = 0, sp = ls, ep = sp + limit; i < 4 && sp < ep; sp++) {
895        if (*sp >= '0' && *sp <= '9')
896          code = (code << 4) + (*sp - '0');
897        else if (*sp >= 'A' && *sp <= 'F')
898          code = (code << 4) + ((*sp - 'A') + 10);
899        else if (*sp >= 'a' && *sp <= 'f')
900          code = (code << 4) + ((*sp - 'a') + 10);
901        else
902          break;
903    }
904
905    *c = code;
906    return (0xdc00 <= code && code <= 0xdfff) ? sp - ls : 0;
907}
908
909static unsigned long
910_ure_compile_symbol(ucs2_t *sym, unsigned long limit, _ure_symtab_t *symp,
911                    _ure_buffer_t *b)
912{
913    ucs4_t c;
914    ucs2_t *sp, *ep;
915
916    sp = sym;
917    ep = sym + limit;
918
919    if ((c = *sp++) == '\\') {
920
921        if (sp == ep) {
922            /*
923             * The EOS was encountered when expecting the reverse solidus to
924             * be followed by the character it is escaping.  Set an error code
925             * and return the number of characters consumed up to this point.
926             */
927            b->error = _URE_UNEXPECTED_EOS;
928            return sp - sym;
929        }
930
931        c = *sp++;
932        switch (c) {
933          case 'p':
934          case 'P':
935            symp->type = (c == 'p') ? _URE_CCLASS : _URE_NCCLASS;
936            sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
937            break;
938          case 'a':
939            symp->type = _URE_CHAR;
940            symp->sym.chr = 0x07;
941            break;
942          case 'b':
943            symp->type = _URE_CHAR;
944            symp->sym.chr = 0x08;
945            break;
946          case 'f':
947            symp->type = _URE_CHAR;
948            symp->sym.chr = 0x0c;
949            break;
950          case 'n':
951            symp->type = _URE_CHAR;
952            symp->sym.chr = 0x0a;
953            break;
954          case 'r':
955            symp->type = _URE_CHAR;
956            symp->sym.chr = 0x0d;
957            break;
958          case 't':
959            symp->type = _URE_CHAR;
960            symp->sym.chr = 0x09;
961            break;
962          case 'v':
963            symp->type = _URE_CHAR;
964            symp->sym.chr = 0x0b;
965            break;
966          case 'x':
967          case 'X':
968          case 'u':
969          case 'U':
970            /*
971             * Collect between 1 and 4 digits representing a UCS2 code.  Fall
972             * through to the next case.
973             */
974            if (sp < ep &&
975                ((*sp >= '0' && *sp <= '9') ||
976                 (*sp >= 'A' && *sp <= 'F') ||
977                 (*sp >= 'a' && *sp <= 'f')))
978              sp += _ure_hex(sp, ep - sp, &c);
979            /* FALLTHROUGH */
980          default:
981            /*
982             * Simply add an escaped character here.
983             */
984            symp->type = _URE_CHAR;
985            symp->sym.chr = c;
986        }
987    } else if (c == '^' || c == '$')
988      /*
989       * Handle the BOL and EOL anchors.  This actually consists simply of
990       * setting a flag that indicates that the user supplied anchor match
991       * function should be called.  This needs to be done instead of simply
992       * matching line/paragraph separators because beginning-of-text and
993       * end-of-text tests are needed as well.
994       */
995      symp->type = (c == '^') ? _URE_BOL_ANCHOR : _URE_EOL_ANCHOR;
996    else if (c == '[')
997      /*
998       * Construct a character class.
999       */
1000      sp += _ure_cclass(sp, ep - sp, symp, b);
1001    else if (c == '.')
1002      symp->type = _URE_ANY_CHAR;
1003    else {
1004        symp->type = _URE_CHAR;
1005        symp->sym.chr = c;
1006    }
1007
1008    /*
1009     * If the symbol type happens to be a character and is a high surrogate,
1010     * then probe forward to see if it is followed by a low surrogate that
1011     * needs to be added.
1012     */
1013    if (sp < ep && symp->type == _URE_CHAR &&
1014        0xd800 <= symp->sym.chr && symp->sym.chr <= 0xdbff) {
1015
1016        if (0xdc00 <= *sp && *sp <= 0xdfff) {
1017            symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
1018                                       (*sp & 0x03ff));
1019            sp++;
1020        } else if (*sp == '\\' && (*(sp + 1) == 'x' || *(sp + 1) == 'X' ||
1021                                 *(sp + 1) == 'u' || *(sp + 1) == 'U')) {
1022            sp += _ure_probe_ls(sp + 2, ep - (sp + 2), &c);
1023            if (0xdc00 <= c && c <= 0xdfff) {
1024                /*
1025                 * Take into account the \[xu] in front of the hex code.
1026                 */
1027                sp += 2;
1028                symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
1029                                           (c & 0x03ff));
1030            }
1031        }
1032    }
1033
1034    /*
1035     * Last, make sure any _URE_CHAR type symbols are changed to lower case if
1036     * the `casefold' flag is set.
1037     */
1038    if ((b->flags & _URE_DFA_CASEFOLD) && symp->type == _URE_CHAR)
1039      symp->sym.chr = _ure_tolower(symp->sym.chr);
1040
1041    /*
1042     * If the symbol constructed is anything other than one of the anchors,
1043     * make sure the _URE_DFA_BLANKLINE flag is removed.
1044     */
1045    if (symp->type != _URE_BOL_ANCHOR && symp->type != _URE_EOL_ANCHOR)
1046      b->flags &= ~_URE_DFA_BLANKLINE;
1047
1048    /*
1049     * Return the number of characters consumed.
1050     */
1051    return sp - sym;
1052}
1053
1054static int
1055_ure_sym_neq(_ure_symtab_t *a, _ure_symtab_t *b)
1056{
1057    if (a->type != b->type || a->mods != b->mods || a->props != b->props)
1058      return 1;
1059
1060    if (a->type == _URE_CCLASS || a->type == _URE_NCCLASS) {
1061        if (a->sym.ccl.ranges_used != b->sym.ccl.ranges_used)
1062          return 1;
1063        if (a->sym.ccl.ranges_used > 0 &&
1064            memcmp((char *) a->sym.ccl.ranges, (char *) b->sym.ccl.ranges,
1065                   sizeof(_ure_range_t) * a->sym.ccl.ranges_used) != 0)
1066          return 1;
1067    } else if (a->type == _URE_CHAR && a->sym.chr != b->sym.chr)
1068      return 1;
1069    return 0;
1070}
1071
1072/*
1073 * Construct a symbol, but only keep unique symbols.
1074 */
1075static ucs2_t
1076_ure_make_symbol(ucs2_t *sym, unsigned long limit, unsigned long *consumed,
1077                 _ure_buffer_t *b)
1078{
1079    ucs2_t i;
1080    _ure_symtab_t *sp, symbol;
1081
1082    /*
1083     * Build the next symbol so we can test to see if it is already in the
1084     * symbol table.
1085     */
1086    (void) memset((char *) &symbol, '\0', sizeof(_ure_symtab_t));
1087    *consumed = _ure_compile_symbol(sym, limit, &symbol, b);
1088
1089    /*
1090     * Check to see if the symbol exists.
1091     */
1092    for (i = 0, sp = b->symtab;
1093         i < b->symtab_used && _ure_sym_neq(&symbol, sp); i++, sp++) ;
1094
1095    if (i < b->symtab_used) {
1096        /*
1097         * Free up any ranges used for the symbol.
1098         */
1099        if ((symbol.type == _URE_CCLASS || symbol.type == _URE_NCCLASS) &&
1100            symbol.sym.ccl.ranges_size > 0)
1101          free((char *) symbol.sym.ccl.ranges);
1102
1103        return b->symtab[i].id;
1104    }
1105
1106    /*
1107     * Need to add the new symbol.
1108     */
1109    if (b->symtab_used == b->symtab_size) {
1110        if (b->symtab_size == 0)
1111          b->symtab = (_ure_symtab_t *) malloc(sizeof(_ure_symtab_t) << 3);
1112        else
1113          b->symtab = (_ure_symtab_t *)
1114              realloc((char *) b->symtab,
1115                      sizeof(_ure_symtab_t) * (b->symtab_size + 8));
1116        sp = b->symtab + b->symtab_size;
1117        (void) memset((char *) sp, '\0', sizeof(_ure_symtab_t) << 3);
1118        b->symtab_size += 8;
1119    }
1120
1121    symbol.id = b->symtab_used++;
1122    (void) AC_MEMCPY((char *) &b->symtab[symbol.id], (char *) &symbol,
1123                  sizeof(_ure_symtab_t));
1124
1125    return symbol.id;
1126}
1127
1128/*************************************************************************
1129 *
1130 * End symbol parse functions.
1131 *
1132 *************************************************************************/
1133
1134static ucs2_t
1135_ure_make_expr(ucs2_t type, ucs2_t lhs, ucs2_t rhs, _ure_buffer_t *b)
1136{
1137    ucs2_t i;
1138
1139    if (b == 0)
1140      return _URE_NOOP;
1141
1142    /*
1143     * Determine if the expression already exists or not.
1144     */
1145    for (i = 0; i < b->expr_used; i++) {
1146        if (b->expr[i].type == type && b->expr[i].lhs == lhs &&
1147            b->expr[i].rhs == rhs)
1148          break;
1149    }
1150    if (i < b->expr_used)
1151      return i;
1152
1153    /*
1154     * Need to add a new expression.
1155     */
1156    if (b->expr_used == b->expr_size) {
1157        if (b->expr_size == 0)
1158          b->expr = (_ure_elt_t *) malloc(sizeof(_ure_elt_t) << 3);
1159        else
1160          b->expr = (_ure_elt_t *)
1161              realloc((char *) b->expr,
1162                      sizeof(_ure_elt_t) * (b->expr_size + 8));
1163        b->expr_size += 8;
1164    }
1165
1166    b->expr[b->expr_used].onstack = 0;
1167    b->expr[b->expr_used].type = type;
1168    b->expr[b->expr_used].lhs = lhs;
1169    b->expr[b->expr_used].rhs = rhs;
1170
1171    return b->expr_used++;
1172}
1173
1174static unsigned char spmap[] = {
1175    0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
1176    0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1177    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1178};
1179
1180#define _ure_isspecial(cc) ((cc) > 0x20 && (cc) < 0x7f && \
1181                            (spmap[(cc) >> 3] & (1 << ((cc) & 7))))
1182
1183/*
1184 * Convert the regular expression into an NFA in a form that will be easy to
1185 * reduce to a DFA.  The starting state for the reduction will be returned.
1186 */
1187static ucs2_t
1188_ure_re2nfa(ucs2_t *re, unsigned long relen, _ure_buffer_t *b)
1189{
1190    ucs2_t c, state, top, sym, *sp, *ep;
1191    unsigned long used;
1192
1193    state = _URE_NOOP;
1194
1195    sp = re;
1196    ep = sp + relen;
1197    while (b->error == _URE_OK && sp < ep) {
1198        c = *sp++;
1199        switch (c) {
1200          case '(':
1201            _ure_push(_URE_PAREN, b);
1202            break;
1203          case ')':
1204            /*
1205             * Check for the case of too many close parentheses.
1206             */
1207            if (_ure_peek(b) == _URE_NOOP) {
1208                b->error = _URE_UNBALANCED_GROUP;
1209                break;
1210            }
1211
1212            while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1213              /*
1214               * Make an expression with the AND or OR operator and its right
1215               * hand side.
1216               */
1217              state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1218
1219            /*
1220             * Remove the _URE_PAREN off the stack.
1221             */
1222            (void) _ure_pop(b);
1223            break;
1224          case '*':
1225            state = _ure_make_expr(_URE_STAR, state, _URE_NOOP, b);
1226            break;
1227          case '+':
1228            state = _ure_make_expr(_URE_PLUS, state, _URE_NOOP, b);
1229            break;
1230          case '?':
1231            state = _ure_make_expr(_URE_QUEST, state, _URE_NOOP, b);
1232            break;
1233          case '|':
1234            while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1235              /*
1236               * Make an expression with the AND or OR operator and its right
1237               * hand side.
1238               */
1239              state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1240
1241            _ure_push(state, b);
1242            _ure_push(_URE_OR, b);
1243            break;
1244          default:
1245            sp--;
1246            sym = _ure_make_symbol(sp, ep - sp, &used, b);
1247            sp += used;
1248            state = _ure_make_expr(_URE_SYMBOL, sym, _URE_NOOP, b);
1249            break;
1250        }
1251
1252        if (c != '(' && c != '|' && sp < ep &&
1253            (!_ure_isspecial(*sp) || *sp == '(')) {
1254            _ure_push(state, b);
1255            _ure_push(_URE_AND, b);
1256        }
1257    }
1258    while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1259      /*
1260       * Make an expression with the AND or OR operator and its right
1261       * hand side.
1262       */
1263      state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1264
1265    if (b->stack.slist_used > 0)
1266      b->error = _URE_UNBALANCED_GROUP;
1267
1268    return (b->error == _URE_OK) ? state : _URE_NOOP;
1269}
1270
1271static void
1272_ure_add_symstate(ucs2_t sym, ucs2_t state, _ure_buffer_t *b)
1273{
1274    ucs2_t i, *stp;
1275    _ure_symtab_t *sp;
1276
1277    /*
1278     * Locate the symbol in the symbol table so the state can be added.
1279     * If the symbol doesn't exist, then a real problem exists.
1280     */
1281    for (i = 0, sp = b->symtab; i < b->symtab_used && sym != sp->id;
1282         i++, sp++) ;
1283
1284    /*
1285     * Now find out if the state exists in the symbol's state list.
1286     */
1287    for (i = 0, stp = sp->states.slist;
1288         i < sp->states.slist_used && state > *stp; i++, stp++) ;
1289
1290    if (i == sp->states.slist_used || state < *stp) {
1291        /*
1292         * Need to add the state in order.
1293         */
1294        if (sp->states.slist_used == sp->states.slist_size) {
1295            if (sp->states.slist_size == 0)
1296              sp->states.slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
1297            else
1298              sp->states.slist = (ucs2_t *)
1299                  realloc((char *) sp->states.slist,
1300                          sizeof(ucs2_t) * (sp->states.slist_size + 8));
1301            sp->states.slist_size += 8;
1302        }
1303        if (i < sp->states.slist_used)
1304          (void) _ure_memmove((char *) (sp->states.slist + i + 1),
1305                              (char *) (sp->states.slist + i),
1306                              sizeof(ucs2_t) * (sp->states.slist_used - i));
1307        sp->states.slist[i] = state;
1308        sp->states.slist_used++;
1309    }
1310}
1311
1312static ucs2_t
1313_ure_add_state(ucs2_t nstates, ucs2_t *states, _ure_buffer_t *b)
1314{
1315    ucs2_t i;
1316    _ure_state_t *sp;
1317
1318    for (i = 0, sp = b->states.states; i < b->states.states_used; i++, sp++) {
1319        if (sp->st.slist_used == nstates &&
1320            memcmp((char *) states, (char *) sp->st.slist,
1321                   sizeof(ucs2_t) * nstates) == 0)
1322          break;
1323    }
1324
1325    if (i == b->states.states_used) {
1326        /*
1327         * Need to add a new DFA state (set of NFA states).
1328         */
1329        if (b->states.states_used == b->states.states_size) {
1330            if (b->states.states_size == 0)
1331              b->states.states = (_ure_state_t *)
1332                  malloc(sizeof(_ure_state_t) << 3);
1333            else
1334              b->states.states = (_ure_state_t *)
1335                  realloc((char *) b->states.states,
1336                          sizeof(_ure_state_t) * (b->states.states_size + 8));
1337            sp = b->states.states + b->states.states_size;
1338            (void) memset((char *) sp, '\0', sizeof(_ure_state_t) << 3);
1339            b->states.states_size += 8;
1340        }
1341
1342        sp = b->states.states + b->states.states_used++;
1343        sp->id = i;
1344
1345        if (sp->st.slist_used + nstates > sp->st.slist_size) {
1346            if (sp->st.slist_size == 0)
1347              sp->st.slist = (ucs2_t *)
1348                  malloc(sizeof(ucs2_t) * (sp->st.slist_used + nstates));
1349            else
1350              sp->st.slist = (ucs2_t *)
1351                  realloc((char *) sp->st.slist,
1352                          sizeof(ucs2_t) * (sp->st.slist_used + nstates));
1353            sp->st.slist_size = sp->st.slist_used + nstates;
1354        }
1355        sp->st.slist_used = nstates;
1356        (void) AC_MEMCPY((char *) sp->st.slist, (char *) states,
1357                      sizeof(ucs2_t) * nstates);
1358    }
1359
1360    /*
1361     * Return the ID of the DFA state representing a group of NFA states.
1362     */
1363    return i;
1364}
1365
1366static void
1367_ure_reduce(ucs2_t start, _ure_buffer_t *b)
1368{
1369    ucs2_t i, j, state, eval, syms, rhs;
1370    ucs2_t s1, s2, ns1, ns2;
1371    _ure_state_t *sp;
1372    _ure_symtab_t *smp;
1373
1374    b->reducing = 1;
1375
1376    /*
1377     * Add the starting state for the reduction.
1378     */
1379    _ure_add_state(1, &start, b);
1380
1381    /*
1382     * Process each set of NFA states that get created.
1383     */
1384    for (i = 0; i < b->states.states_used; i++) {
1385        sp = b->states.states + i;
1386
1387        /*
1388         * Push the current states on the stack.
1389         */
1390        for (j = 0; j < sp->st.slist_used; j++)
1391          _ure_push(sp->st.slist[j], b);
1392
1393        /*
1394         * Reduce the NFA states.
1395         */
1396        for (j = sp->accepting = syms = 0; j < b->stack.slist_used; j++) {
1397            state = b->stack.slist[j];
1398            eval = 1;
1399
1400            /*
1401             * This inner loop is the iterative equivalent of recursively
1402             * reducing subexpressions generated as a result of a reduction.
1403             */
1404            while (eval) {
1405                switch (b->expr[state].type) {
1406                  case _URE_SYMBOL:
1407                    ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1408                    _ure_add_symstate(b->expr[state].lhs, ns1, b);
1409                    syms++;
1410                    eval = 0;
1411                    break;
1412                  case _URE_ONE:
1413                    sp->accepting = 1;
1414                    eval = 0;
1415                    break;
1416                  case _URE_QUEST:
1417                    s1 = b->expr[state].lhs;
1418                    ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1419                    state = _ure_make_expr(_URE_OR, ns1, s1, b);
1420                    break;
1421                  case _URE_PLUS:
1422                    s1 = b->expr[state].lhs;
1423                    ns1 = _ure_make_expr(_URE_STAR, s1, _URE_NOOP, b);
1424                    state = _ure_make_expr(_URE_AND, s1, ns1, b);
1425                    break;
1426                  case _URE_STAR:
1427                    s1 = b->expr[state].lhs;
1428                    ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1429                    ns2 = _ure_make_expr(_URE_PLUS, s1, _URE_NOOP, b);
1430                    state = _ure_make_expr(_URE_OR, ns1, ns2, b);
1431                    break;
1432                  case _URE_OR:
1433                    s1 = b->expr[state].lhs;
1434                    s2 = b->expr[state].rhs;
1435                    _ure_push(s1, b);
1436                    _ure_push(s2, b);
1437                    eval = 0;
1438                    break;
1439                  case _URE_AND:
1440                    s1 = b->expr[state].lhs;
1441                    s2 = b->expr[state].rhs;
1442                    switch (b->expr[s1].type) {
1443                      case _URE_SYMBOL:
1444                        _ure_add_symstate(b->expr[s1].lhs, s2, b);
1445                        syms++;
1446                        eval = 0;
1447                        break;
1448                      case _URE_ONE:
1449                        state = s2;
1450                        break;
1451                      case _URE_QUEST:
1452                        ns1 = b->expr[s1].lhs;
1453                        ns2 = _ure_make_expr(_URE_AND, ns1, s2, b);
1454                        state = _ure_make_expr(_URE_OR, s2, ns2, b);
1455                        break;
1456                      case _URE_PLUS:
1457                        ns1 = b->expr[s1].lhs;
1458                        ns2 = _ure_make_expr(_URE_OR, s2, state, b);
1459                        state = _ure_make_expr(_URE_AND, ns1, ns2, b);
1460                        break;
1461                      case _URE_STAR:
1462                        ns1 = b->expr[s1].lhs;
1463                        ns2 = _ure_make_expr(_URE_AND, ns1, state, b);
1464                        state = _ure_make_expr(_URE_OR, s2, ns2, b);
1465                        break;
1466                      case _URE_OR:
1467                        ns1 = b->expr[s1].lhs;
1468                        ns2 = b->expr[s1].rhs;
1469                        ns1 = _ure_make_expr(_URE_AND, ns1, s2, b);
1470                        ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
1471                        state = _ure_make_expr(_URE_OR, ns1, ns2, b);
1472                        break;
1473                      case _URE_AND:
1474                        ns1 = b->expr[s1].lhs;
1475                        ns2 = b->expr[s1].rhs;
1476                        ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
1477                        state = _ure_make_expr(_URE_AND, ns1, ns2, b);
1478                        break;
1479                    }
1480                }
1481            }
1482        }
1483
1484        /*
1485         * Clear the state stack.
1486         */
1487        while (_ure_pop(b) != _URE_NOOP) ;
1488
1489        /*
1490         * Reset the state pointer because the reduction may have moved it
1491         * during a reallocation.
1492         */
1493        sp = b->states.states + i;
1494
1495        /*
1496         * Generate the DFA states for the symbols collected during the
1497         * current reduction.
1498         */
1499        if (sp->trans_used + syms > sp->trans_size) {
1500            if (sp->trans_size == 0)
1501              sp->trans = (_ure_elt_t *)
1502                  malloc(sizeof(_ure_elt_t) * (sp->trans_used + syms));
1503            else
1504              sp->trans = (_ure_elt_t *)
1505                  realloc((char *) sp->trans,
1506                          sizeof(_ure_elt_t) * (sp->trans_used + syms));
1507            sp->trans_size = sp->trans_used + syms;
1508        }
1509
1510        /*
1511         * Go through the symbol table and generate the DFA state transitions
1512         * for each symbol that has collected NFA states.
1513         */
1514        for (j = syms = 0, smp = b->symtab; j < b->symtab_used; j++, smp++) {
1515            sp = b->states.states + i;
1516
1517            if (smp->states.slist_used > 0) {
1518                sp->trans[syms].lhs = smp->id;
1519                rhs = _ure_add_state(smp->states.slist_used,
1520                                     smp->states.slist, b);
1521                /*
1522                 * Reset the state pointer in case the reallocation moves it
1523                 * in memory.
1524                 */
1525                sp = b->states.states + i;
1526                sp->trans[syms].rhs = rhs;
1527
1528                smp->states.slist_used = 0;
1529                syms++;
1530            }
1531        }
1532
1533        /*
1534         * Set the number of transitions actually used.
1535         */
1536        sp->trans_used = syms;
1537    }
1538    b->reducing = 0;
1539}
1540
1541static void
1542_ure_add_equiv(ucs2_t l, ucs2_t r, _ure_buffer_t *b)
1543{
1544    ucs2_t tmp;
1545
1546    l = b->states.states[l].id;
1547    r = b->states.states[r].id;
1548
1549    if (l == r)
1550      return;
1551
1552    if (l > r) {
1553        tmp = l;
1554        l = r;
1555        r = tmp;
1556    }
1557
1558    /*
1559     * Check to see if the equivalence pair already exists.
1560     */
1561    for (tmp = 0; tmp < b->equiv_used &&
1562             (b->equiv[tmp].l != l || b->equiv[tmp].r != r);
1563         tmp++) ;
1564
1565    if (tmp < b->equiv_used)
1566      return;
1567
1568    if (b->equiv_used == b->equiv_size) {
1569        if (b->equiv_size == 0)
1570          b->equiv = (_ure_equiv_t *) malloc(sizeof(_ure_equiv_t) << 3);
1571        else
1572          b->equiv = (_ure_equiv_t *) realloc((char *) b->equiv,
1573                                              sizeof(_ure_equiv_t) *
1574                                              (b->equiv_size + 8));
1575        b->equiv_size += 8;
1576    }
1577    b->equiv[b->equiv_used].l = l;
1578    b->equiv[b->equiv_used].r = r;
1579    b->equiv_used++;
1580}
1581
1582/*
1583 * Merge the DFA states that are equivalent.
1584 */
1585static void
1586_ure_merge_equiv(_ure_buffer_t *b)
1587{
1588    ucs2_t i, j, k, eq, done;
1589    _ure_state_t *sp1, *sp2, *ls, *rs;
1590
1591    for (i = 0; i < b->states.states_used; i++) {
1592        sp1 = b->states.states + i;
1593        if (sp1->id != i)
1594          continue;
1595        for (j = 0; j < i; j++) {
1596            sp2 = b->states.states + j;
1597            if (sp2->id != j)
1598              continue;
1599            b->equiv_used = 0;
1600            _ure_add_equiv(i, j, b);
1601            for (eq = 0, done = 0; eq < b->equiv_used; eq++) {
1602                ls = b->states.states + b->equiv[eq].l;
1603                rs = b->states.states + b->equiv[eq].r;
1604                if (ls->accepting != rs->accepting ||
1605                    ls->trans_used != rs->trans_used) {
1606                    done = 1;
1607                    break;
1608                }
1609                for (k = 0; k < ls->trans_used &&
1610                         ls->trans[k].lhs == rs->trans[k].lhs; k++) ;
1611                if (k < ls->trans_used) {
1612                    done = 1;
1613                    break;
1614                }
1615
1616                for (k = 0; k < ls->trans_used; k++)
1617                  _ure_add_equiv(ls->trans[k].rhs, rs->trans[k].rhs, b);
1618            }
1619            if (done == 0)
1620              break;
1621        }
1622        for (eq = 0; j < i && eq < b->equiv_used; eq++)
1623          b->states.states[b->equiv[eq].r].id =
1624              b->states.states[b->equiv[eq].l].id;
1625    }
1626
1627    /*
1628     * Renumber the states appropriately.
1629     */
1630    for (i = eq = 0, sp1 = b->states.states; i < b->states.states_used;
1631         sp1++, i++)
1632      sp1->id = (sp1->id == i) ? eq++ : b->states.states[sp1->id].id;
1633}
1634
1635/*************************************************************************
1636 *
1637 * API.
1638 *
1639 *************************************************************************/
1640
1641ure_buffer_t
1642ure_buffer_create(void)
1643{
1644    ure_buffer_t b;
1645
1646    b = (ure_buffer_t) calloc(1, sizeof(_ure_buffer_t));
1647
1648    return b;
1649}
1650
1651void
1652ure_buffer_free(ure_buffer_t buf)
1653{
1654    unsigned long i;
1655
1656    if (buf == 0)
1657      return;
1658
1659    if (buf->stack.slist_size > 0)
1660      free((char *) buf->stack.slist);
1661
1662    if (buf->expr_size > 0)
1663      free((char *) buf->expr);
1664
1665    for (i = 0; i < buf->symtab_size; i++) {
1666        if (buf->symtab[i].states.slist_size > 0)
1667          free((char *) buf->symtab[i].states.slist);
1668    }
1669
1670    if (buf->symtab_size > 0)
1671      free((char *) buf->symtab);
1672
1673    for (i = 0; i < buf->states.states_size; i++) {
1674        if (buf->states.states[i].trans_size > 0)
1675          free((char *) buf->states.states[i].trans);
1676        if (buf->states.states[i].st.slist_size > 0)
1677          free((char *) buf->states.states[i].st.slist);
1678    }
1679
1680    if (buf->states.states_size > 0)
1681      free((char *) buf->states.states);
1682
1683    if (buf->equiv_size > 0)
1684      free((char *) buf->equiv);
1685
1686    free((char *) buf);
1687}
1688
1689ure_dfa_t
1690ure_compile(ucs2_t *re, unsigned long relen, int casefold, ure_buffer_t buf)
1691{
1692    ucs2_t i, j, state;
1693    _ure_state_t *sp;
1694    _ure_dstate_t *dsp;
1695    _ure_trans_t *tp;
1696    ure_dfa_t dfa;
1697
1698    if (re == 0 || *re == 0 || relen == 0 || buf == 0)
1699      return 0;
1700
1701    /*
1702     * Reset the various fields of the compilation buffer.  Default the flags
1703     * to indicate the presense of the "^$" pattern.  If any other pattern
1704     * occurs, then this flag will be removed.  This is done to catch this
1705     * special pattern and handle it specially when matching.
1706     */
1707    buf->flags = _URE_DFA_BLANKLINE | ((casefold) ? _URE_DFA_CASEFOLD : 0);
1708    buf->reducing = 0;
1709    buf->stack.slist_used = 0;
1710    buf->expr_used = 0;
1711
1712    for (i = 0; i < buf->symtab_used; i++)
1713      buf->symtab[i].states.slist_used = 0;
1714    buf->symtab_used = 0;
1715
1716    for (i = 0; i < buf->states.states_used; i++) {
1717        buf->states.states[i].st.slist_used = 0;
1718        buf->states.states[i].trans_used = 0;
1719    }
1720    buf->states.states_used = 0;
1721
1722    /*
1723     * Construct the NFA.  If this stage returns a 0, then an error occured or
1724     * an empty expression was passed.
1725     */
1726    if ((state = _ure_re2nfa(re, relen, buf)) == _URE_NOOP)
1727      return 0;
1728
1729    /*
1730     * Do the expression reduction to get the initial DFA.
1731     */
1732    _ure_reduce(state, buf);
1733
1734    /*
1735     * Merge all the equivalent DFA states.
1736     */
1737    _ure_merge_equiv(buf);
1738
1739    /*
1740     * Construct the minimal DFA.
1741     */
1742    dfa = (ure_dfa_t) malloc(sizeof(_ure_dfa_t));
1743    (void) memset((char *) dfa, '\0', sizeof(_ure_dfa_t));
1744
1745    dfa->flags = buf->flags & (_URE_DFA_CASEFOLD|_URE_DFA_BLANKLINE);
1746
1747    /*
1748     * Free up the NFA state groups and transfer the symbols from the buffer
1749     * to the DFA.
1750     */
1751    for (i = 0; i < buf->symtab_size; i++) {
1752        if (buf->symtab[i].states.slist_size > 0)
1753          free((char *) buf->symtab[i].states.slist);
1754    }
1755    dfa->syms = buf->symtab;
1756    dfa->nsyms = buf->symtab_used;
1757
1758    buf->symtab_used = buf->symtab_size = 0;
1759
1760    /*
1761     * Collect the total number of states and transitions needed for the DFA.
1762     */
1763    for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
1764         i++, sp++) {
1765        if (sp->id == state) {
1766            dfa->nstates++;
1767            dfa->ntrans += sp->trans_used;
1768            state++;
1769        }
1770    }
1771
1772    /*
1773     * Allocate enough space for the states and transitions.
1774     */
1775    dfa->states = (_ure_dstate_t *) malloc(sizeof(_ure_dstate_t) *
1776                                           dfa->nstates);
1777    dfa->trans = (_ure_trans_t *) malloc(sizeof(_ure_trans_t) * dfa->ntrans);
1778
1779    /*
1780     * Actually transfer the DFA states from the buffer.
1781     */
1782    dsp = dfa->states;
1783    tp = dfa->trans;
1784    for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
1785         i++, sp++) {
1786        if (sp->id == state) {
1787            dsp->trans = tp;
1788            dsp->ntrans = sp->trans_used;
1789            dsp->accepting = sp->accepting;
1790
1791            /*
1792             * Add the transitions for the state.
1793             */
1794            for (j = 0; j < dsp->ntrans; j++, tp++) {
1795                tp->symbol = sp->trans[j].lhs;
1796                tp->next_state = buf->states.states[sp->trans[j].rhs].id;
1797            }
1798
1799            dsp++;
1800            state++;
1801        }
1802    }
1803
1804    return dfa;
1805}
1806
1807void
1808ure_dfa_free(ure_dfa_t dfa)
1809{
1810    ucs2_t i;
1811
1812    if (dfa == 0)
1813      return;
1814
1815    for (i = 0; i < dfa->nsyms; i++) {
1816        if ((dfa->syms[i].type == _URE_CCLASS ||
1817             dfa->syms[i].type == _URE_NCCLASS) &&
1818            dfa->syms[i].sym.ccl.ranges_size > 0)
1819          free((char *) dfa->syms[i].sym.ccl.ranges);
1820    }
1821    if (dfa->nsyms > 0)
1822      free((char *) dfa->syms);
1823
1824    if (dfa->nstates > 0)
1825      free((char *) dfa->states);
1826    if (dfa->ntrans > 0)
1827      free((char *) dfa->trans);
1828    free((char *) dfa);
1829}
1830
1831void
1832ure_write_dfa(ure_dfa_t dfa, FILE *out)
1833{
1834    ucs2_t i, j, k, h, l;
1835    _ure_dstate_t *sp;
1836    _ure_symtab_t *sym;
1837    _ure_range_t *rp;
1838
1839    if (dfa == 0 || out == 0)
1840      return;
1841
1842    /*
1843     * Write all the different character classes.
1844     */
1845    for (i = 0, sym = dfa->syms; i < dfa->nsyms; i++, sym++) {
1846        if (sym->type == _URE_CCLASS || sym->type == _URE_NCCLASS) {
1847            fprintf(out, "C%hd = ", sym->id);
1848            if (sym->sym.ccl.ranges_used > 0) {
1849                putc('[', out);
1850                if (sym->type == _URE_NCCLASS)
1851                  putc('^', out);
1852            }
1853            if (sym->props != 0) {
1854                if (sym->type == _URE_NCCLASS)
1855                  fprintf(out, "\\P");
1856                else
1857                  fprintf(out, "\\p");
1858                for (k = h = 0; k < 32; k++) {
1859                    if (sym->props & (1 << k)) {
1860                        if (h != 0)
1861                          putc(',', out);
1862                        fprintf(out, "%hd", k + 1);
1863                        h = 1;
1864                    }
1865                }
1866            }
1867            /*
1868             * Dump the ranges.
1869             */
1870            for (k = 0, rp = sym->sym.ccl.ranges;
1871                 k < sym->sym.ccl.ranges_used; k++, rp++) {
1872                /*
1873                 * Check for UTF16 characters.
1874                 */
1875                if (0x10000 <= rp->min_code &&
1876                    rp->min_code <= 0x10ffff) {
1877                    h = (ucs2_t) (((rp->min_code - 0x10000) >> 10) + 0xd800);
1878                    l = (ucs2_t) (((rp->min_code - 0x10000) & 1023) + 0xdc00);
1879                    fprintf(out, "\\x%04hX\\x%04hX", h, l);
1880                } else
1881                  fprintf(out, "\\x%04lX", rp->min_code & 0xffff);
1882                if (rp->max_code != rp->min_code) {
1883                    putc('-', out);
1884                    if (rp->max_code >= 0x10000 &&
1885                        rp->max_code <= 0x10ffff) {
1886                        h = (ucs2_t) (((rp->max_code - 0x10000) >> 10) + 0xd800);
1887                        l = (ucs2_t) (((rp->max_code - 0x10000) & 1023) + 0xdc00);
1888                        fprintf(out, "\\x%04hX\\x%04hX", h, l);
1889                    } else
1890                      fprintf(out, "\\x%04lX", rp->max_code & 0xffff);
1891                }
1892            }
1893            if (sym->sym.ccl.ranges_used > 0)
1894              putc(']', out);
1895            putc('\n', out);
1896        }
1897    }
1898
1899    for (i = 0, sp = dfa->states; i < dfa->nstates; i++, sp++) {
1900        fprintf(out, "S%hd = ", i);
1901        if (sp->accepting) {
1902            fprintf(out, "1 ");
1903            if (sp->ntrans)
1904              fprintf(out, "| ");
1905        }
1906        for (j = 0; j < sp->ntrans; j++) {
1907            if (j > 0)
1908              fprintf(out, "| ");
1909
1910            sym = dfa->syms + sp->trans[j].symbol;
1911            switch (sym->type) {
1912              case _URE_CHAR:
1913                if (0x10000 <= sym->sym.chr && sym->sym.chr <= 0x10ffff) {
1914                    /*
1915                     * Take care of UTF16 characters.
1916                     */
1917                    h = (ucs2_t) (((sym->sym.chr - 0x10000) >> 10) + 0xd800);
1918                    l = (ucs2_t) (((sym->sym.chr - 0x10000) & 1023) + 0xdc00);
1919                    fprintf(out, "\\x%04hX\\x%04hX ", h, l);
1920                } else
1921                  fprintf(out, "\\x%04lX ", sym->sym.chr & 0xffff);
1922                break;
1923              case _URE_ANY_CHAR:
1924                fprintf(out, "<any> ");
1925                break;
1926              case _URE_BOL_ANCHOR:
1927                fprintf(out, "<bol-anchor> ");
1928                break;
1929              case _URE_EOL_ANCHOR:
1930                fprintf(out, "<eol-anchor> ");
1931                break;
1932              case _URE_CCLASS:
1933              case _URE_NCCLASS:
1934                fprintf(out, "[C%hd] ", sym->id);
1935                break;
1936            }
1937            fprintf(out, "S%hd", sp->trans[j].next_state);
1938            if (j + 1 < sp->ntrans)
1939              putc(' ', out);
1940        }
1941        putc('\n', out);
1942    }
1943}
1944
1945#define _ure_issep(cc) ((cc) == '\n' || (cc) == '\r' || (cc) == 0x2028 ||\
1946                        (cc) == 0x2029)
1947
1948int
1949ure_exec(ure_dfa_t dfa, int flags, ucs2_t *text, unsigned long textlen,
1950         unsigned long *match_start, unsigned long *match_end)
1951{
1952    int i, j, matched, found, skip;
1953    unsigned long ms, me;
1954    ucs4_t c;
1955    ucs2_t *sp, *ep, *lp;
1956    _ure_dstate_t *stp;
1957    _ure_symtab_t *sym;
1958    _ure_range_t *rp;
1959
1960    if (dfa == 0 || text == 0)
1961      return 0;
1962
1963    /*
1964     * Handle the special case of an empty string matching the "^$" pattern.
1965     */
1966    if (textlen == 0 && (dfa->flags & _URE_DFA_BLANKLINE)) {
1967        *match_start = *match_end = 0;
1968        return 1;
1969    }
1970
1971    sp = text;
1972    ep = sp + textlen;
1973
1974    ms = me = ~0;
1975
1976    stp = dfa->states;
1977
1978    for (found = skip = 0; found == 0 && sp < ep; ) {
1979        lp = sp;
1980        c = *sp++;
1981
1982        /*
1983         * Check to see if this is a high surrogate that should be
1984         * combined with a following low surrogate.
1985         */
1986        if (sp < ep && 0xd800 <= c && c <= 0xdbff &&
1987            0xdc00 <= *sp && *sp <= 0xdfff)
1988          c = 0x10000 + (((c & 0x03ff) << 10) | (*sp++ & 0x03ff));
1989
1990        /*
1991         * Determine if the character is non-spacing and should be skipped.
1992         */
1993        if (_ure_matches_properties(_URE_NONSPACING, c) &&
1994            (flags & URE_IGNORE_NONSPACING)) {
1995            sp++;
1996            continue;
1997        }
1998
1999        if (dfa->flags & _URE_DFA_CASEFOLD)
2000          c = _ure_tolower(c);
2001
2002        /*
2003         * See if one of the transitions matches.
2004         */
2005        for (i = 0, matched = 0; matched == 0 && i < stp->ntrans; i++) {
2006            sym = dfa->syms + stp->trans[i].symbol;
2007            switch (sym->type) {
2008              case _URE_ANY_CHAR:
2009                if ((flags & URE_DOT_MATCHES_SEPARATORS) ||
2010                    !_ure_issep(c))
2011                  matched = 1;
2012                break;
2013              case _URE_CHAR:
2014                if (c == sym->sym.chr)
2015                  matched = 1;
2016                break;
2017              case _URE_BOL_ANCHOR:
2018                if (lp == text) {
2019                    sp = lp;
2020                    matched = 1;
2021                } else if (_ure_issep(c)) {
2022                    if (c == '\r' && sp < ep && *sp == '\n')
2023                      sp++;
2024                    lp = sp;
2025                    matched = 1;
2026                }
2027                break;
2028              case _URE_EOL_ANCHOR:
2029                if (_ure_issep(c)) {
2030                    /*
2031                     * Put the pointer back before the separator so the match
2032                     * end position will be correct.  This case will also
2033                     * cause the `sp' pointer to be advanced over the current
2034                     * separator once the match end point has been recorded.
2035                     */
2036                    sp = lp;
2037                    matched = 1;
2038                }
2039                break;
2040              case _URE_CCLASS:
2041              case _URE_NCCLASS:
2042                if (sym->props != 0)
2043                  matched = _ure_matches_properties(sym->props, c);
2044                for (j = 0, rp = sym->sym.ccl.ranges;
2045                     j < sym->sym.ccl.ranges_used; j++, rp++) {
2046                    if (rp->min_code <= c && c <= rp->max_code)
2047                      matched = 1;
2048                }
2049                if (sym->type == _URE_NCCLASS)
2050                  matched = !matched;
2051                break;
2052            }
2053
2054            if (matched) {
2055                if (ms == ~0UL)
2056                  ms = lp - text;
2057                else
2058                  me = sp - text;
2059                stp = dfa->states + stp->trans[i].next_state;
2060
2061                /*
2062                 * If the match was an EOL anchor, adjust the pointer past the
2063                 * separator that caused the match.  The correct match
2064                 * position has been recorded already.
2065                 */
2066                if (sym->type == _URE_EOL_ANCHOR) {
2067                    /*
2068                     * Skip the character that caused the match.
2069                     */
2070                    sp++;
2071
2072                    /*
2073                     * Handle the infamous CRLF situation.
2074                     */
2075                    if (sp < ep && c == '\r' && *sp == '\n')
2076                      sp++;
2077                }
2078            }
2079        }
2080
2081        if (matched == 0) {
2082            if (stp->accepting == 0) {
2083                /*
2084                 * If the last state was not accepting, then reset
2085                 * and start over.
2086                 */
2087                stp = dfa->states;
2088                ms = me = ~0;
2089            } else
2090              /*
2091               * The last state was accepting, so terminate the matching
2092               * loop to avoid more work.
2093               */
2094              found = 1;
2095        } else if (sp == ep) {
2096            if (!stp->accepting) {
2097                /*
2098                 * This ugly hack is to make sure the end-of-line anchors
2099                 * match when the source text hits the end.  This is only done
2100                 * if the last subexpression matches.
2101                 */
2102                for (i = 0; found == 0 && i < stp->ntrans; i++) {
2103                    sym = dfa->syms + stp->trans[i].symbol;
2104                    if (sym->type ==_URE_EOL_ANCHOR) {
2105                        stp = dfa->states + stp->trans[i].next_state;
2106                        if (stp->accepting) {
2107                            me = sp - text;
2108                            found = 1;
2109                        } else
2110                          break;
2111                    }
2112                }
2113            } else {
2114                /*
2115                 * Make sure any conditions that match all the way to the end
2116                 * of the string match.
2117                 */
2118                found = 1;
2119                me = sp - text;
2120            }
2121        }
2122    }
2123
2124    if (found == 0)
2125      ms = me = ~0;
2126
2127    *match_start = ms;
2128    *match_end = me;
2129
2130    return (ms != ~0UL) ? 1 : 0;
2131}
2132