1/*	$NetBSD$	*/
2
3/* OpenLDAP: pkg/ldap/libraries/liblunicode/ure/ure.c,v 1.17.2.5 2010/04/13 20:23:04 kurt Exp */
4/* This work is part of OpenLDAP Software <http://www.openldap.org/>.
5 *
6 * Copyright 1998-2010 The OpenLDAP Foundation.
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted only as authorized by the OpenLDAP
11 * Public License.
12 *
13 * A copy of this license is available in file LICENSE in the
14 * top-level directory of the distribution or, alternatively, at
15 * <http://www.OpenLDAP.org/license.html>.
16 */
17/* Copyright 1997, 1998, 1999 Computing Research Labs,
18 * New Mexico State University
19 *
20 * Permission is hereby granted, free of charge, to any person obtaining a
21 * copy of this software and associated documentation files (the "Software"),
22 * to deal in the Software without restriction, including without limitation
23 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
24 * and/or sell copies of the Software, and to permit persons to whom the
25 * Software is furnished to do so, subject to the following conditions:
26 *
27 * The above copyright notice and this permission notice shall be included in
28 * all copies or substantial portions of the Software.
29 *
30 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
31 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
32 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
33 * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
34 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
35 * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
36 * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
37 */
38/* Id: ure.c,v 1.2 1999/09/21 15:47:43 mleisher Exp" */
39
40#include "portable.h"
41
42#include <ac/stdlib.h>
43#include <ac/string.h>
44#include <ac/unistd.h>
45
46#include "ure.h"
47
48/*
49 * Flags used internally in the DFA.
50 */
51#define _URE_DFA_CASEFOLD  0x01
52#define _URE_DFA_BLANKLINE 0x02
53
54static unsigned long cclass_flags[] = {
55    0,
56    _URE_NONSPACING,
57    _URE_COMBINING,
58    _URE_NUMDIGIT,
59    _URE_NUMOTHER,
60    _URE_SPACESEP,
61    _URE_LINESEP,
62    _URE_PARASEP,
63    _URE_CNTRL,
64    _URE_PUA,
65    _URE_UPPER,
66    _URE_LOWER,
67    _URE_TITLE,
68    _URE_MODIFIER,
69    _URE_OTHERLETTER,
70    _URE_DASHPUNCT,
71    _URE_OPENPUNCT,
72    _URE_CLOSEPUNCT,
73    _URE_OTHERPUNCT,
74    _URE_MATHSYM,
75    _URE_CURRENCYSYM,
76    _URE_OTHERSYM,
77    _URE_LTR,
78    _URE_RTL,
79    _URE_EURONUM,
80    _URE_EURONUMSEP,
81    _URE_EURONUMTERM,
82    _URE_ARABNUM,
83    _URE_COMMONSEP,
84    _URE_BLOCKSEP,
85    _URE_SEGMENTSEP,
86    _URE_WHITESPACE,
87    _URE_OTHERNEUT,
88};
89
90/*
91 * Symbol types for the DFA.
92 */
93#define _URE_ANY_CHAR   1
94#define _URE_CHAR       2
95#define _URE_CCLASS     3
96#define _URE_NCCLASS    4
97#define _URE_BOL_ANCHOR 5
98#define _URE_EOL_ANCHOR 6
99
100/*
101 * Op codes for converting the NFA to a DFA.
102 */
103#define _URE_SYMBOL     10
104#define _URE_PAREN      11
105#define _URE_QUEST      12
106#define _URE_STAR       13
107#define _URE_PLUS       14
108#define _URE_ONE        15
109#define _URE_AND        16
110#define _URE_OR         17
111
112#define _URE_NOOP       0xffff
113
114#define _URE_REGSTART 0x8000
115#define _URE_REGEND   0x4000
116
117/*
118 * Structure used to handle a compacted range of characters.
119 */
120typedef struct {
121    ucs4_t min_code;
122    ucs4_t max_code;
123} _ure_range_t;
124
125typedef struct {
126    _ure_range_t *ranges;
127    ucs2_t ranges_used;
128    ucs2_t ranges_size;
129} _ure_ccl_t;
130
131typedef union {
132    ucs4_t chr;
133    _ure_ccl_t ccl;
134} _ure_sym_t;
135
136/*
137 * This is a general element structure used for expressions and stack
138 * elements.
139 */
140typedef struct {
141    ucs2_t reg;
142    ucs2_t onstack;
143    ucs2_t type;
144    ucs2_t lhs;
145    ucs2_t rhs;
146} _ure_elt_t;
147
148/*
149 * This is a structure used to track a list or a stack of states.
150 */
151typedef struct {
152    ucs2_t *slist;
153    ucs2_t slist_size;
154    ucs2_t slist_used;
155} _ure_stlist_t;
156
157/*
158 * Structure to track the list of unique states for a symbol
159 * during reduction.
160 */
161typedef struct {
162    ucs2_t id;
163    ucs2_t type;
164    unsigned long mods;
165    unsigned long props;
166    _ure_sym_t sym;
167    _ure_stlist_t states;
168} _ure_symtab_t;
169
170/*
171 * Structure to hold a single state.
172 */
173typedef struct {
174    ucs2_t id;
175    ucs2_t accepting;
176    ucs2_t pad;
177    _ure_stlist_t st;
178    _ure_elt_t *trans;
179    ucs2_t trans_size;
180    ucs2_t trans_used;
181} _ure_state_t;
182
183/*
184 * Structure used for keeping lists of states.
185 */
186typedef struct {
187    _ure_state_t *states;
188    ucs2_t states_size;
189    ucs2_t states_used;
190} _ure_statetable_t;
191
192/*
193 * Structure to track pairs of DFA states when equivalent states are
194 * merged.
195 */
196typedef struct {
197    ucs2_t l;
198    ucs2_t r;
199} _ure_equiv_t;
200
201/*
202 * Structure used for constructing the NFA and reducing to a minimal DFA.
203 */
204typedef struct _ure_buffer_t {
205    int reducing;
206    int error;
207    unsigned long flags;
208
209    _ure_stlist_t stack;
210
211    /*
212     * Table of unique symbols encountered.
213     */
214    _ure_symtab_t *symtab;
215    ucs2_t symtab_size;
216    ucs2_t symtab_used;
217
218    /*
219     * Tracks the unique expressions generated for the NFA and when the NFA is
220     * reduced.
221     */
222    _ure_elt_t *expr;
223    ucs2_t expr_used;
224    ucs2_t expr_size;
225
226    /*
227     * The reduced table of unique groups of NFA states.
228     */
229    _ure_statetable_t states;
230
231    /*
232     * Tracks states when equivalent states are merged.
233     */
234    _ure_equiv_t *equiv;
235    ucs2_t equiv_used;
236    ucs2_t equiv_size;
237} _ure_buffer_t;
238
239typedef struct {
240    ucs2_t symbol;
241    ucs2_t next_state;
242} _ure_trans_t;
243
244typedef struct {
245    ucs2_t accepting;
246    ucs2_t ntrans;
247    _ure_trans_t *trans;
248} _ure_dstate_t;
249
250typedef struct _ure_dfa_t {
251    unsigned long flags;
252
253    _ure_symtab_t *syms;
254    ucs2_t nsyms;
255
256    _ure_dstate_t *states;
257    ucs2_t nstates;
258
259    _ure_trans_t *trans;
260    ucs2_t ntrans;
261} _ure_dfa_t;
262
263/*************************************************************************
264 *
265 * Functions.
266 *
267 *************************************************************************/
268
269static void
270_ure_memmove(char *dest, char *src, unsigned long bytes)
271{
272    long i, j;
273
274    i = (long) bytes;
275    j = i & 7;
276    i = (i + 7) >> 3;
277
278    /*
279     * Do a memmove using Ye Olde Duff's Device for efficiency.
280     */
281    if (src < dest) {
282        src += bytes;
283        dest += bytes;
284
285        switch (j) {
286          case 0: do {
287              *--dest = *--src;
288            case 7: *--dest = *--src;
289            case 6: *--dest = *--src;
290            case 5: *--dest = *--src;
291            case 4: *--dest = *--src;
292            case 3: *--dest = *--src;
293            case 2: *--dest = *--src;
294            case 1: *--dest = *--src;
295          } while (--i > 0);
296        }
297    } else if (src > dest) {
298        switch (j) {
299          case 0: do {
300              *dest++ = *src++;
301            case 7: *dest++ = *src++;
302            case 6: *dest++ = *src++;
303            case 5: *dest++ = *src++;
304            case 4: *dest++ = *src++;
305            case 3: *dest++ = *src++;
306            case 2: *dest++ = *src++;
307            case 1: *dest++ = *src++;
308          } while (--i > 0);
309        }
310    }
311}
312
313static void
314_ure_push(ucs2_t v, _ure_buffer_t *b)
315{
316    _ure_stlist_t *s;
317
318    if (b == 0)
319      return;
320
321    /*
322     * If the `reducing' parameter is non-zero, check to see if the value
323     * passed is already on the stack.
324     */
325    if (b->reducing != 0 && b->expr[v].onstack != 0)
326      return;
327
328    s = &b->stack;
329    if (s->slist_used == s->slist_size) {
330        if (s->slist_size == 0)
331          s->slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
332        else
333          s->slist = (ucs2_t *) realloc((char *) s->slist,
334                                        sizeof(ucs2_t) * (s->slist_size + 8));
335        s->slist_size += 8;
336    }
337    s->slist[s->slist_used++] = v;
338
339    /*
340     * If the `reducing' parameter is non-zero, flag the element as being on
341     * the stack.
342     */
343    if (b->reducing != 0)
344      b->expr[v].onstack = 1;
345}
346
347static ucs2_t
348_ure_peek(_ure_buffer_t *b)
349{
350    if (b == 0 || b->stack.slist_used == 0)
351      return _URE_NOOP;
352
353    return b->stack.slist[b->stack.slist_used - 1];
354}
355
356static ucs2_t
357_ure_pop(_ure_buffer_t *b)
358{
359    ucs2_t v;
360
361    if (b == 0 || b->stack.slist_used == 0)
362      return _URE_NOOP;
363
364    v = b->stack.slist[--b->stack.slist_used];
365    if (b->reducing)
366      b->expr[v].onstack = 0;
367
368    return v;
369}
370
371/*************************************************************************
372 *
373 * Start symbol parse functions.
374 *
375 *************************************************************************/
376
377/*
378 * Parse a comma-separated list of integers that represent character
379 * properties.  Combine them into a mask that is returned in the `mask'
380 * variable, and return the number of characters consumed.
381 */
382static unsigned long
383_ure_prop_list(ucs2_t *pp, unsigned long limit, unsigned long *mask,
384               _ure_buffer_t *b)
385{
386    unsigned long n, m;
387    ucs2_t *sp, *ep;
388
389    sp = pp;
390    ep = sp + limit;
391
392    for (m = n = 0; b->error == _URE_OK && sp < ep; sp++) {
393        if (*sp == ',') {
394            /*
395             * Encountered a comma, so select the next character property flag
396             * and reset the number.
397             */
398            m |= cclass_flags[n];
399            n = 0;
400        } else if (*sp >= '0' && *sp <= '9')
401          /*
402           * Encountered a digit, so start or continue building the cardinal
403           * that represents the character property flag.
404           */
405          n = (n * 10) + (*sp - '0');
406        else
407          /*
408           * Encountered something that is not part of the property list.
409           * Indicate that we are done.
410           */
411          break;
412
413        /*
414         * If a property number greater than 32 occurs, then there is a
415         * problem.  Most likely a missing comma separator.
416         */
417        if (n > 32)
418          b->error = _URE_INVALID_PROPERTY;
419    }
420
421    if (n != 0)
422      m |= cclass_flags[n];
423
424    /*
425     * Set the mask that represents the group of character properties.
426     */
427    *mask = m;
428
429    /*
430     * Return the number of characters consumed.
431     */
432    return sp - pp;
433}
434
435/*
436 * Collect a hex number with 1 to 4 digits and return the number
437 * of characters used.
438 */
439static unsigned long
440_ure_hex(ucs2_t *np, unsigned long limit, ucs4_t *n)
441{
442    ucs2_t i;
443    ucs2_t *sp, *ep;
444    ucs4_t nn;
445
446    sp = np;
447    ep = sp + limit;
448
449    for (nn = 0, i = 0; i < 4 && sp < ep; i++, sp++) {
450        if (*sp >= '0' && *sp <= '9')
451          nn = (nn << 4) + (*sp - '0');
452        else if (*sp >= 'A' && *sp <= 'F')
453          nn = (nn << 4) + ((*sp - 'A') + 10);
454        else if (*sp >= 'a' && *sp <= 'f')
455          nn = (nn << 4) + ((*sp - 'a') + 10);
456        else
457          /*
458           * Encountered something that is not a hex digit.
459           */
460          break;
461    }
462
463    /*
464     * Assign the character code collected and return the number of
465     * characters used.
466     */
467    *n = nn;
468
469    return sp - np;
470}
471
472/*
473 * Insert a range into a character class, removing duplicates and ordering
474 * them in increasing range-start order.
475 */
476static void
477_ure_add_range(_ure_ccl_t *ccl, _ure_range_t *r, _ure_buffer_t *b)
478{
479    ucs2_t i;
480    ucs4_t tmp;
481    _ure_range_t *rp;
482
483    /*
484     * If the `casefold' flag is set, then make sure both endpoints of the
485     * range are converted to lower case.
486     */
487    if (b->flags & _URE_DFA_CASEFOLD) {
488        r->min_code = _ure_tolower(r->min_code);
489        r->max_code = _ure_tolower(r->max_code);
490    }
491
492    /*
493     * Swap the range endpoints if they are not in increasing order.
494     */
495    if (r->min_code > r->max_code) {
496        tmp = r->min_code;
497        r->min_code = r->max_code;
498        r->max_code = tmp;
499    }
500
501    for (i = 0, rp = ccl->ranges;
502         i < ccl->ranges_used && r->min_code < rp->min_code; i++, rp++) ;
503
504    /*
505     * Check for a duplicate.
506     */
507    if (i < ccl->ranges_used &&
508        r->min_code == rp->min_code && r->max_code == rp->max_code)
509      return;
510
511    if (ccl->ranges_used == ccl->ranges_size) {
512        if (ccl->ranges_size == 0)
513          ccl->ranges = (_ure_range_t *) malloc(sizeof(_ure_range_t) << 3);
514        else
515          ccl->ranges = (_ure_range_t *)
516              realloc((char *) ccl->ranges,
517                      sizeof(_ure_range_t) * (ccl->ranges_size + 8));
518        ccl->ranges_size += 8;
519    }
520
521    rp = ccl->ranges + ccl->ranges_used;
522
523    if (i < ccl->ranges_used)
524      _ure_memmove((char *) (rp + 1), (char *) rp,
525                   sizeof(_ure_range_t) * (ccl->ranges_used - i));
526
527    ccl->ranges_used++;
528    rp->min_code = r->min_code;
529    rp->max_code = r->max_code;
530}
531
532#define _URE_ALPHA_MASK  (_URE_UPPER|_URE_LOWER|_URE_OTHERLETTER|\
533_URE_MODIFIER|_URE_TITLE|_URE_NONSPACING|_URE_COMBINING)
534#define _URE_ALNUM_MASK  (_URE_ALPHA_MASK|_URE_NUMDIGIT)
535#define _URE_PUNCT_MASK  (_URE_DASHPUNCT|_URE_OPENPUNCT|_URE_CLOSEPUNCT|\
536_URE_OTHERPUNCT)
537#define _URE_GRAPH_MASK (_URE_NUMDIGIT|_URE_NUMOTHER|_URE_ALPHA_MASK|\
538_URE_MATHSYM|_URE_CURRENCYSYM|_URE_OTHERSYM)
539#define _URE_PRINT_MASK (_URE_GRAPH_MASK|_URE_SPACESEP)
540#define _URE_SPACE_MASK  (_URE_SPACESEP|_URE_LINESEP|_URE_PARASEP)
541
542typedef void (*_ure_cclsetup_t)(
543    _ure_symtab_t *sym,
544    unsigned long mask,
545    _ure_buffer_t *b
546);
547
548typedef struct {
549    ucs2_t key;
550    unsigned long len;
551    unsigned long next;
552    _ure_cclsetup_t func;
553    unsigned long mask;
554} _ure_trie_t;
555
556static void
557_ure_ccl_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
558{
559    sym->props |= mask;
560}
561
562static void
563_ure_space_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
564{
565    _ure_range_t range;
566
567    sym->props |= mask;
568
569    /*
570     * Add the additional characters needed for handling isspace().
571     */
572    range.min_code = range.max_code = '\t';
573    _ure_add_range(&sym->sym.ccl, &range, b);
574    range.min_code = range.max_code = '\r';
575    _ure_add_range(&sym->sym.ccl, &range, b);
576    range.min_code = range.max_code = '\n';
577    _ure_add_range(&sym->sym.ccl, &range, b);
578    range.min_code = range.max_code = '\f';
579    _ure_add_range(&sym->sym.ccl, &range, b);
580    range.min_code = range.max_code = 0xfeff;
581    _ure_add_range(&sym->sym.ccl, &range, b);
582}
583
584static void
585_ure_xdigit_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
586{
587    _ure_range_t range;
588
589    /*
590     * Add the additional characters needed for handling isxdigit().
591     */
592    range.min_code = '0';
593    range.max_code = '9';
594    _ure_add_range(&sym->sym.ccl, &range, b);
595    range.min_code = 'A';
596    range.max_code = 'F';
597    _ure_add_range(&sym->sym.ccl, &range, b);
598    range.min_code = 'a';
599    range.max_code = 'f';
600    _ure_add_range(&sym->sym.ccl, &range, b);
601}
602
603static _ure_trie_t cclass_trie[] = {
604    {0x003a, 1, 1, 0, 0},
605    {0x0061, 9, 10, 0, 0},
606    {0x0063, 8, 19, 0, 0},
607    {0x0064, 7, 24, 0, 0},
608    {0x0067, 6, 29, 0, 0},
609    {0x006c, 5, 34, 0, 0},
610    {0x0070, 4, 39, 0, 0},
611    {0x0073, 3, 49, 0, 0},
612    {0x0075, 2, 54, 0, 0},
613    {0x0078, 1, 59, 0, 0},
614    {0x006c, 1, 11, 0, 0},
615    {0x006e, 2, 13, 0, 0},
616    {0x0070, 1, 16, 0, 0},
617    {0x0075, 1, 14, 0, 0},
618    {0x006d, 1, 15, 0, 0},
619    {0x003a, 1, 16, _ure_ccl_setup, _URE_ALNUM_MASK},
620    {0x0068, 1, 17, 0, 0},
621    {0x0061, 1, 18, 0, 0},
622    {0x003a, 1, 19, _ure_ccl_setup, _URE_ALPHA_MASK},
623    {0x006e, 1, 20, 0, 0},
624    {0x0074, 1, 21, 0, 0},
625    {0x0072, 1, 22, 0, 0},
626    {0x006c, 1, 23, 0, 0},
627    {0x003a, 1, 24, _ure_ccl_setup, _URE_CNTRL},
628    {0x0069, 1, 25, 0, 0},
629    {0x0067, 1, 26, 0, 0},
630    {0x0069, 1, 27, 0, 0},
631    {0x0074, 1, 28, 0, 0},
632    {0x003a, 1, 29, _ure_ccl_setup, _URE_NUMDIGIT},
633    {0x0072, 1, 30, 0, 0},
634    {0x0061, 1, 31, 0, 0},
635    {0x0070, 1, 32, 0, 0},
636    {0x0068, 1, 33, 0, 0},
637    {0x003a, 1, 34, _ure_ccl_setup, _URE_GRAPH_MASK},
638    {0x006f, 1, 35, 0, 0},
639    {0x0077, 1, 36, 0, 0},
640    {0x0065, 1, 37, 0, 0},
641    {0x0072, 1, 38, 0, 0},
642    {0x003a, 1, 39, _ure_ccl_setup, _URE_LOWER},
643    {0x0072, 2, 41, 0, 0},
644    {0x0075, 1, 45, 0, 0},
645    {0x0069, 1, 42, 0, 0},
646    {0x006e, 1, 43, 0, 0},
647    {0x0074, 1, 44, 0, 0},
648    {0x003a, 1, 45, _ure_ccl_setup, _URE_PRINT_MASK},
649    {0x006e, 1, 46, 0, 0},
650    {0x0063, 1, 47, 0, 0},
651    {0x0074, 1, 48, 0, 0},
652    {0x003a, 1, 49, _ure_ccl_setup, _URE_PUNCT_MASK},
653    {0x0070, 1, 50, 0, 0},
654    {0x0061, 1, 51, 0, 0},
655    {0x0063, 1, 52, 0, 0},
656    {0x0065, 1, 53, 0, 0},
657    {0x003a, 1, 54, _ure_space_setup, _URE_SPACE_MASK},
658    {0x0070, 1, 55, 0, 0},
659    {0x0070, 1, 56, 0, 0},
660    {0x0065, 1, 57, 0, 0},
661    {0x0072, 1, 58, 0, 0},
662    {0x003a, 1, 59, _ure_ccl_setup, _URE_UPPER},
663    {0x0064, 1, 60, 0, 0},
664    {0x0069, 1, 61, 0, 0},
665    {0x0067, 1, 62, 0, 0},
666    {0x0069, 1, 63, 0, 0},
667    {0x0074, 1, 64, 0, 0},
668    {0x003a, 1, 65, _ure_xdigit_setup, 0},
669};
670
671/*
672 * Probe for one of the POSIX colon delimited character classes in the static
673 * trie.
674 */
675static unsigned long
676_ure_posix_ccl(ucs2_t *cp, unsigned long limit, _ure_symtab_t *sym,
677               _ure_buffer_t *b)
678{
679    int i;
680    unsigned long n;
681    _ure_trie_t *tp;
682    ucs2_t *sp, *ep;
683
684    /*
685     * If the number of characters left is less than 7, then this cannot be
686     * interpreted as one of the colon delimited classes.
687     */
688    if (limit < 7)
689      return 0;
690
691    sp = cp;
692    ep = sp + limit;
693    tp = cclass_trie;
694    for (i = 0; sp < ep && i < 8; i++, sp++) {
695        n = tp->len;
696
697        for (; n > 0 && tp->key != *sp; tp++, n--) ;
698
699        if (n == 0)
700          return 0;
701
702        if (*sp == ':' && (i == 6 || i == 7)) {
703            sp++;
704            break;
705        }
706        if (sp + 1 < ep)
707          tp = cclass_trie + tp->next;
708    }
709    if (tp->func == 0)
710      return 0;
711
712    (*tp->func)(sym, tp->mask, b);
713
714    return sp - cp;
715}
716
717/*
718 * Construct a list of ranges and return the number of characters consumed.
719 */
720static unsigned long
721_ure_cclass(ucs2_t *cp, unsigned long limit, _ure_symtab_t *symp,
722            _ure_buffer_t *b)
723{
724    int range_end;
725    unsigned long n;
726    ucs2_t *sp, *ep;
727    ucs4_t c, last;
728    _ure_ccl_t *cclp;
729    _ure_range_t range;
730
731    sp = cp;
732    ep = sp + limit;
733
734    if (*sp == '^') {
735      symp->type = _URE_NCCLASS;
736      sp++;
737    } else
738      symp->type = _URE_CCLASS;
739
740    for (last = 0, range_end = 0;
741         b->error == _URE_OK && sp < ep && *sp != ']'; ) {
742        c = *sp++;
743        if (c == '\\') {
744            if (sp == ep) {
745                /*
746                 * The EOS was encountered when expecting the reverse solidus
747                 * to be followed by the character it is escaping.  Set an
748                 * error code and return the number of characters consumed up
749                 * to this point.
750                 */
751                b->error = _URE_UNEXPECTED_EOS;
752                return sp - cp;
753            }
754
755            c = *sp++;
756            switch (c) {
757              case 'a':
758                c = 0x07;
759                break;
760              case 'b':
761                c = 0x08;
762                break;
763              case 'f':
764                c = 0x0c;
765                break;
766              case 'n':
767                c = 0x0a;
768                break;
769              case 'r':
770                c = 0x0d;
771                break;
772              case 't':
773                c = 0x09;
774                break;
775              case 'v':
776                c = 0x0b;
777                break;
778              case 'p':
779              case 'P':
780                sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
781                /*
782                 * Invert the bit mask of the properties if this is a negated
783                 * character class or if 'P' is used to specify a list of
784                 * character properties that should *not* match in a
785                 * character class.
786                 */
787                if (c == 'P')
788                  symp->props = ~symp->props;
789                continue;
790                break;
791              case 'x':
792              case 'X':
793              case 'u':
794              case 'U':
795                if (sp < ep &&
796                    ((*sp >= '0' && *sp <= '9') ||
797                     (*sp >= 'A' && *sp <= 'F') ||
798                     (*sp >= 'a' && *sp <= 'f')))
799                  sp += _ure_hex(sp, ep - sp, &c);
800            }
801        } else if (c == ':') {
802            /*
803             * Probe for a POSIX colon delimited character class.
804             */
805            sp--;
806            if ((n = _ure_posix_ccl(sp, ep - sp, symp, b)) == 0)
807              sp++;
808            else {
809                sp += n;
810                continue;
811            }
812        }
813
814        cclp = &symp->sym.ccl;
815
816        /*
817         * Check to see if the current character is a low surrogate that needs
818         * to be combined with a preceding high surrogate.
819         */
820        if (last != 0) {
821            if (c >= 0xdc00 && c <= 0xdfff)
822              /*
823               * Construct the UTF16 character code.
824               */
825              c = 0x10000 + (((last & 0x03ff) << 10) | (c & 0x03ff));
826            else {
827                /*
828                 * Add the isolated high surrogate to the range.
829                 */
830                if (range_end == 1)
831                  range.max_code = last & 0xffff;
832                else
833                  range.min_code = range.max_code = last & 0xffff;
834
835                _ure_add_range(cclp, &range, b);
836                range_end = 0;
837            }
838        }
839
840        /*
841         * Clear the last character code.
842         */
843        last = 0;
844
845        /*
846         * This slightly awkward code handles the different cases needed to
847         * construct a range.
848         */
849        if (c >= 0xd800 && c <= 0xdbff) {
850            /*
851             * If the high surrogate is followed by a range indicator, simply
852             * add it as the range start.  Otherwise, save it in case the next
853             * character is a low surrogate.
854             */
855            if (*sp == '-') {
856                sp++;
857                range.min_code = c;
858                range_end = 1;
859            } else
860              last = c;
861        } else if (range_end == 1) {
862            range.max_code = c;
863            _ure_add_range(cclp, &range, b);
864            range_end = 0;
865        } else {
866            range.min_code = range.max_code = c;
867            if (*sp == '-') {
868                sp++;
869                range_end = 1;
870            } else
871              _ure_add_range(cclp, &range, b);
872        }
873    }
874
875    if (sp < ep && *sp == ']')
876      sp++;
877    else
878      /*
879       * The parse was not terminated by the character class close symbol
880       * (']'), so set an error code.
881       */
882      b->error = _URE_CCLASS_OPEN;
883
884    return sp - cp;
885}
886
887/*
888 * Probe for a low surrogate hex code.
889 */
890static unsigned long
891_ure_probe_ls(ucs2_t *ls, unsigned long limit, ucs4_t *c)
892{
893    ucs4_t i, code;
894    ucs2_t *sp, *ep;
895
896    for (i = code = 0, sp = ls, ep = sp + limit; i < 4 && sp < ep; sp++) {
897        if (*sp >= '0' && *sp <= '9')
898          code = (code << 4) + (*sp - '0');
899        else if (*sp >= 'A' && *sp <= 'F')
900          code = (code << 4) + ((*sp - 'A') + 10);
901        else if (*sp >= 'a' && *sp <= 'f')
902          code = (code << 4) + ((*sp - 'a') + 10);
903        else
904          break;
905    }
906
907    *c = code;
908    return (0xdc00 <= code && code <= 0xdfff) ? sp - ls : 0;
909}
910
911static unsigned long
912_ure_compile_symbol(ucs2_t *sym, unsigned long limit, _ure_symtab_t *symp,
913                    _ure_buffer_t *b)
914{
915    ucs4_t c;
916    ucs2_t *sp, *ep;
917
918    sp = sym;
919    ep = sym + limit;
920
921    if ((c = *sp++) == '\\') {
922
923        if (sp == ep) {
924            /*
925             * The EOS was encountered when expecting the reverse solidus to
926             * be followed by the character it is escaping.  Set an error code
927             * and return the number of characters consumed up to this point.
928             */
929            b->error = _URE_UNEXPECTED_EOS;
930            return sp - sym;
931        }
932
933        c = *sp++;
934        switch (c) {
935          case 'p':
936          case 'P':
937            symp->type = (c == 'p') ? _URE_CCLASS : _URE_NCCLASS;
938            sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
939            break;
940          case 'a':
941            symp->type = _URE_CHAR;
942            symp->sym.chr = 0x07;
943            break;
944          case 'b':
945            symp->type = _URE_CHAR;
946            symp->sym.chr = 0x08;
947            break;
948          case 'f':
949            symp->type = _URE_CHAR;
950            symp->sym.chr = 0x0c;
951            break;
952          case 'n':
953            symp->type = _URE_CHAR;
954            symp->sym.chr = 0x0a;
955            break;
956          case 'r':
957            symp->type = _URE_CHAR;
958            symp->sym.chr = 0x0d;
959            break;
960          case 't':
961            symp->type = _URE_CHAR;
962            symp->sym.chr = 0x09;
963            break;
964          case 'v':
965            symp->type = _URE_CHAR;
966            symp->sym.chr = 0x0b;
967            break;
968          case 'x':
969          case 'X':
970          case 'u':
971          case 'U':
972            /*
973             * Collect between 1 and 4 digits representing a UCS2 code.  Fall
974             * through to the next case.
975             */
976            if (sp < ep &&
977                ((*sp >= '0' && *sp <= '9') ||
978                 (*sp >= 'A' && *sp <= 'F') ||
979                 (*sp >= 'a' && *sp <= 'f')))
980              sp += _ure_hex(sp, ep - sp, &c);
981            /* FALLTHROUGH */
982          default:
983            /*
984             * Simply add an escaped character here.
985             */
986            symp->type = _URE_CHAR;
987            symp->sym.chr = c;
988        }
989    } else if (c == '^' || c == '$')
990      /*
991       * Handle the BOL and EOL anchors.  This actually consists simply of
992       * setting a flag that indicates that the user supplied anchor match
993       * function should be called.  This needs to be done instead of simply
994       * matching line/paragraph separators because beginning-of-text and
995       * end-of-text tests are needed as well.
996       */
997      symp->type = (c == '^') ? _URE_BOL_ANCHOR : _URE_EOL_ANCHOR;
998    else if (c == '[')
999      /*
1000       * Construct a character class.
1001       */
1002      sp += _ure_cclass(sp, ep - sp, symp, b);
1003    else if (c == '.')
1004      symp->type = _URE_ANY_CHAR;
1005    else {
1006        symp->type = _URE_CHAR;
1007        symp->sym.chr = c;
1008    }
1009
1010    /*
1011     * If the symbol type happens to be a character and is a high surrogate,
1012     * then probe forward to see if it is followed by a low surrogate that
1013     * needs to be added.
1014     */
1015    if (sp < ep && symp->type == _URE_CHAR &&
1016        0xd800 <= symp->sym.chr && symp->sym.chr <= 0xdbff) {
1017
1018        if (0xdc00 <= *sp && *sp <= 0xdfff) {
1019            symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
1020                                       (*sp & 0x03ff));
1021            sp++;
1022        } else if (*sp == '\\' && (*(sp + 1) == 'x' || *(sp + 1) == 'X' ||
1023                                 *(sp + 1) == 'u' || *(sp + 1) == 'U')) {
1024            sp += _ure_probe_ls(sp + 2, ep - (sp + 2), &c);
1025            if (0xdc00 <= c && c <= 0xdfff) {
1026                /*
1027                 * Take into account the \[xu] in front of the hex code.
1028                 */
1029                sp += 2;
1030                symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
1031                                           (c & 0x03ff));
1032            }
1033        }
1034    }
1035
1036    /*
1037     * Last, make sure any _URE_CHAR type symbols are changed to lower case if
1038     * the `casefold' flag is set.
1039     */
1040    if ((b->flags & _URE_DFA_CASEFOLD) && symp->type == _URE_CHAR)
1041      symp->sym.chr = _ure_tolower(symp->sym.chr);
1042
1043    /*
1044     * If the symbol constructed is anything other than one of the anchors,
1045     * make sure the _URE_DFA_BLANKLINE flag is removed.
1046     */
1047    if (symp->type != _URE_BOL_ANCHOR && symp->type != _URE_EOL_ANCHOR)
1048      b->flags &= ~_URE_DFA_BLANKLINE;
1049
1050    /*
1051     * Return the number of characters consumed.
1052     */
1053    return sp - sym;
1054}
1055
1056static int
1057_ure_sym_neq(_ure_symtab_t *a, _ure_symtab_t *b)
1058{
1059    if (a->type != b->type || a->mods != b->mods || a->props != b->props)
1060      return 1;
1061
1062    if (a->type == _URE_CCLASS || a->type == _URE_NCCLASS) {
1063        if (a->sym.ccl.ranges_used != b->sym.ccl.ranges_used)
1064          return 1;
1065        if (a->sym.ccl.ranges_used > 0 &&
1066            memcmp((char *) a->sym.ccl.ranges, (char *) b->sym.ccl.ranges,
1067                   sizeof(_ure_range_t) * a->sym.ccl.ranges_used) != 0)
1068          return 1;
1069    } else if (a->type == _URE_CHAR && a->sym.chr != b->sym.chr)
1070      return 1;
1071    return 0;
1072}
1073
1074/*
1075 * Construct a symbol, but only keep unique symbols.
1076 */
1077static ucs2_t
1078_ure_make_symbol(ucs2_t *sym, unsigned long limit, unsigned long *consumed,
1079                 _ure_buffer_t *b)
1080{
1081    ucs2_t i;
1082    _ure_symtab_t *sp, symbol;
1083
1084    /*
1085     * Build the next symbol so we can test to see if it is already in the
1086     * symbol table.
1087     */
1088    (void) memset((char *) &symbol, '\0', sizeof(_ure_symtab_t));
1089    *consumed = _ure_compile_symbol(sym, limit, &symbol, b);
1090
1091    /*
1092     * Check to see if the symbol exists.
1093     */
1094    for (i = 0, sp = b->symtab;
1095         i < b->symtab_used && _ure_sym_neq(&symbol, sp); i++, sp++) ;
1096
1097    if (i < b->symtab_used) {
1098        /*
1099         * Free up any ranges used for the symbol.
1100         */
1101        if ((symbol.type == _URE_CCLASS || symbol.type == _URE_NCCLASS) &&
1102            symbol.sym.ccl.ranges_size > 0)
1103          free((char *) symbol.sym.ccl.ranges);
1104
1105        return b->symtab[i].id;
1106    }
1107
1108    /*
1109     * Need to add the new symbol.
1110     */
1111    if (b->symtab_used == b->symtab_size) {
1112        if (b->symtab_size == 0)
1113          b->symtab = (_ure_symtab_t *) malloc(sizeof(_ure_symtab_t) << 3);
1114        else
1115          b->symtab = (_ure_symtab_t *)
1116              realloc((char *) b->symtab,
1117                      sizeof(_ure_symtab_t) * (b->symtab_size + 8));
1118        sp = b->symtab + b->symtab_size;
1119        (void) memset((char *) sp, '\0', sizeof(_ure_symtab_t) << 3);
1120        b->symtab_size += 8;
1121    }
1122
1123    symbol.id = b->symtab_used++;
1124    (void) AC_MEMCPY((char *) &b->symtab[symbol.id], (char *) &symbol,
1125                  sizeof(_ure_symtab_t));
1126
1127    return symbol.id;
1128}
1129
1130/*************************************************************************
1131 *
1132 * End symbol parse functions.
1133 *
1134 *************************************************************************/
1135
1136static ucs2_t
1137_ure_make_expr(ucs2_t type, ucs2_t lhs, ucs2_t rhs, _ure_buffer_t *b)
1138{
1139    ucs2_t i;
1140
1141    if (b == 0)
1142      return _URE_NOOP;
1143
1144    /*
1145     * Determine if the expression already exists or not.
1146     */
1147    for (i = 0; i < b->expr_used; i++) {
1148        if (b->expr[i].type == type && b->expr[i].lhs == lhs &&
1149            b->expr[i].rhs == rhs)
1150          break;
1151    }
1152    if (i < b->expr_used)
1153      return i;
1154
1155    /*
1156     * Need to add a new expression.
1157     */
1158    if (b->expr_used == b->expr_size) {
1159        if (b->expr_size == 0)
1160          b->expr = (_ure_elt_t *) malloc(sizeof(_ure_elt_t) << 3);
1161        else
1162          b->expr = (_ure_elt_t *)
1163              realloc((char *) b->expr,
1164                      sizeof(_ure_elt_t) * (b->expr_size + 8));
1165        b->expr_size += 8;
1166    }
1167
1168    b->expr[b->expr_used].onstack = 0;
1169    b->expr[b->expr_used].type = type;
1170    b->expr[b->expr_used].lhs = lhs;
1171    b->expr[b->expr_used].rhs = rhs;
1172
1173    return b->expr_used++;
1174}
1175
1176static unsigned char spmap[] = {
1177    0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
1178    0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1179    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1180};
1181
1182#define _ure_isspecial(cc) ((cc) > 0x20 && (cc) < 0x7f && \
1183                            (spmap[(cc) >> 3] & (1 << ((cc) & 7))))
1184
1185/*
1186 * Convert the regular expression into an NFA in a form that will be easy to
1187 * reduce to a DFA.  The starting state for the reduction will be returned.
1188 */
1189static ucs2_t
1190_ure_re2nfa(ucs2_t *re, unsigned long relen, _ure_buffer_t *b)
1191{
1192    ucs2_t c, state, top, sym, *sp, *ep;
1193    unsigned long used;
1194
1195    state = _URE_NOOP;
1196
1197    sp = re;
1198    ep = sp + relen;
1199    while (b->error == _URE_OK && sp < ep) {
1200        c = *sp++;
1201        switch (c) {
1202          case '(':
1203            _ure_push(_URE_PAREN, b);
1204            break;
1205          case ')':
1206            /*
1207             * Check for the case of too many close parentheses.
1208             */
1209            if (_ure_peek(b) == _URE_NOOP) {
1210                b->error = _URE_UNBALANCED_GROUP;
1211                break;
1212            }
1213
1214            while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1215              /*
1216               * Make an expression with the AND or OR operator and its right
1217               * hand side.
1218               */
1219              state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1220
1221            /*
1222             * Remove the _URE_PAREN off the stack.
1223             */
1224            (void) _ure_pop(b);
1225            break;
1226          case '*':
1227            state = _ure_make_expr(_URE_STAR, state, _URE_NOOP, b);
1228            break;
1229          case '+':
1230            state = _ure_make_expr(_URE_PLUS, state, _URE_NOOP, b);
1231            break;
1232          case '?':
1233            state = _ure_make_expr(_URE_QUEST, state, _URE_NOOP, b);
1234            break;
1235          case '|':
1236            while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1237              /*
1238               * Make an expression with the AND or OR operator and its right
1239               * hand side.
1240               */
1241              state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1242
1243            _ure_push(state, b);
1244            _ure_push(_URE_OR, b);
1245            break;
1246          default:
1247            sp--;
1248            sym = _ure_make_symbol(sp, ep - sp, &used, b);
1249            sp += used;
1250            state = _ure_make_expr(_URE_SYMBOL, sym, _URE_NOOP, b);
1251            break;
1252        }
1253
1254        if (c != '(' && c != '|' && sp < ep &&
1255            (!_ure_isspecial(*sp) || *sp == '(')) {
1256            _ure_push(state, b);
1257            _ure_push(_URE_AND, b);
1258        }
1259    }
1260    while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1261      /*
1262       * Make an expression with the AND or OR operator and its right
1263       * hand side.
1264       */
1265      state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1266
1267    if (b->stack.slist_used > 0)
1268      b->error = _URE_UNBALANCED_GROUP;
1269
1270    return (b->error == _URE_OK) ? state : _URE_NOOP;
1271}
1272
1273static void
1274_ure_add_symstate(ucs2_t sym, ucs2_t state, _ure_buffer_t *b)
1275{
1276    ucs2_t i, *stp;
1277    _ure_symtab_t *sp;
1278
1279    /*
1280     * Locate the symbol in the symbol table so the state can be added.
1281     * If the symbol doesn't exist, then a real problem exists.
1282     */
1283    for (i = 0, sp = b->symtab; i < b->symtab_used && sym != sp->id;
1284         i++, sp++) ;
1285
1286    /*
1287     * Now find out if the state exists in the symbol's state list.
1288     */
1289    for (i = 0, stp = sp->states.slist;
1290         i < sp->states.slist_used && state > *stp; i++, stp++) ;
1291
1292    if (i == sp->states.slist_used || state < *stp) {
1293        /*
1294         * Need to add the state in order.
1295         */
1296        if (sp->states.slist_used == sp->states.slist_size) {
1297            if (sp->states.slist_size == 0)
1298              sp->states.slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
1299            else
1300              sp->states.slist = (ucs2_t *)
1301                  realloc((char *) sp->states.slist,
1302                          sizeof(ucs2_t) * (sp->states.slist_size + 8));
1303            sp->states.slist_size += 8;
1304        }
1305        if (i < sp->states.slist_used)
1306          (void) _ure_memmove((char *) (sp->states.slist + i + 1),
1307                              (char *) (sp->states.slist + i),
1308                              sizeof(ucs2_t) * (sp->states.slist_used - i));
1309        sp->states.slist[i] = state;
1310        sp->states.slist_used++;
1311    }
1312}
1313
1314static ucs2_t
1315_ure_add_state(ucs2_t nstates, ucs2_t *states, _ure_buffer_t *b)
1316{
1317    ucs2_t i;
1318    _ure_state_t *sp;
1319
1320    for (i = 0, sp = b->states.states; i < b->states.states_used; i++, sp++) {
1321        if (sp->st.slist_used == nstates &&
1322            memcmp((char *) states, (char *) sp->st.slist,
1323                   sizeof(ucs2_t) * nstates) == 0)
1324          break;
1325    }
1326
1327    if (i == b->states.states_used) {
1328        /*
1329         * Need to add a new DFA state (set of NFA states).
1330         */
1331        if (b->states.states_used == b->states.states_size) {
1332            if (b->states.states_size == 0)
1333              b->states.states = (_ure_state_t *)
1334                  malloc(sizeof(_ure_state_t) << 3);
1335            else
1336              b->states.states = (_ure_state_t *)
1337                  realloc((char *) b->states.states,
1338                          sizeof(_ure_state_t) * (b->states.states_size + 8));
1339            sp = b->states.states + b->states.states_size;
1340            (void) memset((char *) sp, '\0', sizeof(_ure_state_t) << 3);
1341            b->states.states_size += 8;
1342        }
1343
1344        sp = b->states.states + b->states.states_used++;
1345        sp->id = i;
1346
1347        if (sp->st.slist_used + nstates > sp->st.slist_size) {
1348            if (sp->st.slist_size == 0)
1349              sp->st.slist = (ucs2_t *)
1350                  malloc(sizeof(ucs2_t) * (sp->st.slist_used + nstates));
1351            else
1352              sp->st.slist = (ucs2_t *)
1353                  realloc((char *) sp->st.slist,
1354                          sizeof(ucs2_t) * (sp->st.slist_used + nstates));
1355            sp->st.slist_size = sp->st.slist_used + nstates;
1356        }
1357        sp->st.slist_used = nstates;
1358        (void) AC_MEMCPY((char *) sp->st.slist, (char *) states,
1359                      sizeof(ucs2_t) * nstates);
1360    }
1361
1362    /*
1363     * Return the ID of the DFA state representing a group of NFA states.
1364     */
1365    return i;
1366}
1367
1368static void
1369_ure_reduce(ucs2_t start, _ure_buffer_t *b)
1370{
1371    ucs2_t i, j, state, eval, syms, rhs;
1372    ucs2_t s1, s2, ns1, ns2;
1373    _ure_state_t *sp;
1374    _ure_symtab_t *smp;
1375
1376    b->reducing = 1;
1377
1378    /*
1379     * Add the starting state for the reduction.
1380     */
1381    _ure_add_state(1, &start, b);
1382
1383    /*
1384     * Process each set of NFA states that get created.
1385     */
1386    for (i = 0; i < b->states.states_used; i++) {
1387        sp = b->states.states + i;
1388
1389        /*
1390         * Push the current states on the stack.
1391         */
1392        for (j = 0; j < sp->st.slist_used; j++)
1393          _ure_push(sp->st.slist[j], b);
1394
1395        /*
1396         * Reduce the NFA states.
1397         */
1398        for (j = sp->accepting = syms = 0; j < b->stack.slist_used; j++) {
1399            state = b->stack.slist[j];
1400            eval = 1;
1401
1402            /*
1403             * This inner loop is the iterative equivalent of recursively
1404             * reducing subexpressions generated as a result of a reduction.
1405             */
1406            while (eval) {
1407                switch (b->expr[state].type) {
1408                  case _URE_SYMBOL:
1409                    ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1410                    _ure_add_symstate(b->expr[state].lhs, ns1, b);
1411                    syms++;
1412                    eval = 0;
1413                    break;
1414                  case _URE_ONE:
1415                    sp->accepting = 1;
1416                    eval = 0;
1417                    break;
1418                  case _URE_QUEST:
1419                    s1 = b->expr[state].lhs;
1420                    ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1421                    state = _ure_make_expr(_URE_OR, ns1, s1, b);
1422                    break;
1423                  case _URE_PLUS:
1424                    s1 = b->expr[state].lhs;
1425                    ns1 = _ure_make_expr(_URE_STAR, s1, _URE_NOOP, b);
1426                    state = _ure_make_expr(_URE_AND, s1, ns1, b);
1427                    break;
1428                  case _URE_STAR:
1429                    s1 = b->expr[state].lhs;
1430                    ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1431                    ns2 = _ure_make_expr(_URE_PLUS, s1, _URE_NOOP, b);
1432                    state = _ure_make_expr(_URE_OR, ns1, ns2, b);
1433                    break;
1434                  case _URE_OR:
1435                    s1 = b->expr[state].lhs;
1436                    s2 = b->expr[state].rhs;
1437                    _ure_push(s1, b);
1438                    _ure_push(s2, b);
1439                    eval = 0;
1440                    break;
1441                  case _URE_AND:
1442                    s1 = b->expr[state].lhs;
1443                    s2 = b->expr[state].rhs;
1444                    switch (b->expr[s1].type) {
1445                      case _URE_SYMBOL:
1446                        _ure_add_symstate(b->expr[s1].lhs, s2, b);
1447                        syms++;
1448                        eval = 0;
1449                        break;
1450                      case _URE_ONE:
1451                        state = s2;
1452                        break;
1453                      case _URE_QUEST:
1454                        ns1 = b->expr[s1].lhs;
1455                        ns2 = _ure_make_expr(_URE_AND, ns1, s2, b);
1456                        state = _ure_make_expr(_URE_OR, s2, ns2, b);
1457                        break;
1458                      case _URE_PLUS:
1459                        ns1 = b->expr[s1].lhs;
1460                        ns2 = _ure_make_expr(_URE_OR, s2, state, b);
1461                        state = _ure_make_expr(_URE_AND, ns1, ns2, b);
1462                        break;
1463                      case _URE_STAR:
1464                        ns1 = b->expr[s1].lhs;
1465                        ns2 = _ure_make_expr(_URE_AND, ns1, state, b);
1466                        state = _ure_make_expr(_URE_OR, s2, ns2, b);
1467                        break;
1468                      case _URE_OR:
1469                        ns1 = b->expr[s1].lhs;
1470                        ns2 = b->expr[s1].rhs;
1471                        ns1 = _ure_make_expr(_URE_AND, ns1, s2, b);
1472                        ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
1473                        state = _ure_make_expr(_URE_OR, ns1, ns2, b);
1474                        break;
1475                      case _URE_AND:
1476                        ns1 = b->expr[s1].lhs;
1477                        ns2 = b->expr[s1].rhs;
1478                        ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
1479                        state = _ure_make_expr(_URE_AND, ns1, ns2, b);
1480                        break;
1481                    }
1482                }
1483            }
1484        }
1485
1486        /*
1487         * Clear the state stack.
1488         */
1489        while (_ure_pop(b) != _URE_NOOP) ;
1490
1491        /*
1492         * Reset the state pointer because the reduction may have moved it
1493         * during a reallocation.
1494         */
1495        sp = b->states.states + i;
1496
1497        /*
1498         * Generate the DFA states for the symbols collected during the
1499         * current reduction.
1500         */
1501        if (sp->trans_used + syms > sp->trans_size) {
1502            if (sp->trans_size == 0)
1503              sp->trans = (_ure_elt_t *)
1504                  malloc(sizeof(_ure_elt_t) * (sp->trans_used + syms));
1505            else
1506              sp->trans = (_ure_elt_t *)
1507                  realloc((char *) sp->trans,
1508                          sizeof(_ure_elt_t) * (sp->trans_used + syms));
1509            sp->trans_size = sp->trans_used + syms;
1510        }
1511
1512        /*
1513         * Go through the symbol table and generate the DFA state transitions
1514         * for each symbol that has collected NFA states.
1515         */
1516        for (j = syms = 0, smp = b->symtab; j < b->symtab_used; j++, smp++) {
1517            sp = b->states.states + i;
1518
1519            if (smp->states.slist_used > 0) {
1520                sp->trans[syms].lhs = smp->id;
1521                rhs = _ure_add_state(smp->states.slist_used,
1522                                     smp->states.slist, b);
1523                /*
1524                 * Reset the state pointer in case the reallocation moves it
1525                 * in memory.
1526                 */
1527                sp = b->states.states + i;
1528                sp->trans[syms].rhs = rhs;
1529
1530                smp->states.slist_used = 0;
1531                syms++;
1532            }
1533        }
1534
1535        /*
1536         * Set the number of transitions actually used.
1537         */
1538        sp->trans_used = syms;
1539    }
1540    b->reducing = 0;
1541}
1542
1543static void
1544_ure_add_equiv(ucs2_t l, ucs2_t r, _ure_buffer_t *b)
1545{
1546    ucs2_t tmp;
1547
1548    l = b->states.states[l].id;
1549    r = b->states.states[r].id;
1550
1551    if (l == r)
1552      return;
1553
1554    if (l > r) {
1555        tmp = l;
1556        l = r;
1557        r = tmp;
1558    }
1559
1560    /*
1561     * Check to see if the equivalence pair already exists.
1562     */
1563    for (tmp = 0; tmp < b->equiv_used &&
1564             (b->equiv[tmp].l != l || b->equiv[tmp].r != r);
1565         tmp++) ;
1566
1567    if (tmp < b->equiv_used)
1568      return;
1569
1570    if (b->equiv_used == b->equiv_size) {
1571        if (b->equiv_size == 0)
1572          b->equiv = (_ure_equiv_t *) malloc(sizeof(_ure_equiv_t) << 3);
1573        else
1574          b->equiv = (_ure_equiv_t *) realloc((char *) b->equiv,
1575                                              sizeof(_ure_equiv_t) *
1576                                              (b->equiv_size + 8));
1577        b->equiv_size += 8;
1578    }
1579    b->equiv[b->equiv_used].l = l;
1580    b->equiv[b->equiv_used].r = r;
1581    b->equiv_used++;
1582}
1583
1584/*
1585 * Merge the DFA states that are equivalent.
1586 */
1587static void
1588_ure_merge_equiv(_ure_buffer_t *b)
1589{
1590    ucs2_t i, j, k, eq, done;
1591    _ure_state_t *sp1, *sp2, *ls, *rs;
1592
1593    for (i = 0; i < b->states.states_used; i++) {
1594        sp1 = b->states.states + i;
1595        if (sp1->id != i)
1596          continue;
1597        for (j = 0; j < i; j++) {
1598            sp2 = b->states.states + j;
1599            if (sp2->id != j)
1600              continue;
1601            b->equiv_used = 0;
1602            _ure_add_equiv(i, j, b);
1603            for (eq = 0, done = 0; eq < b->equiv_used; eq++) {
1604                ls = b->states.states + b->equiv[eq].l;
1605                rs = b->states.states + b->equiv[eq].r;
1606                if (ls->accepting != rs->accepting ||
1607                    ls->trans_used != rs->trans_used) {
1608                    done = 1;
1609                    break;
1610                }
1611                for (k = 0; k < ls->trans_used &&
1612                         ls->trans[k].lhs == rs->trans[k].lhs; k++) ;
1613                if (k < ls->trans_used) {
1614                    done = 1;
1615                    break;
1616                }
1617
1618                for (k = 0; k < ls->trans_used; k++)
1619                  _ure_add_equiv(ls->trans[k].rhs, rs->trans[k].rhs, b);
1620            }
1621            if (done == 0)
1622              break;
1623        }
1624        for (eq = 0; j < i && eq < b->equiv_used; eq++)
1625          b->states.states[b->equiv[eq].r].id =
1626              b->states.states[b->equiv[eq].l].id;
1627    }
1628
1629    /*
1630     * Renumber the states appropriately.
1631     */
1632    for (i = eq = 0, sp1 = b->states.states; i < b->states.states_used;
1633         sp1++, i++)
1634      sp1->id = (sp1->id == i) ? eq++ : b->states.states[sp1->id].id;
1635}
1636
1637/*************************************************************************
1638 *
1639 * API.
1640 *
1641 *************************************************************************/
1642
1643ure_buffer_t
1644ure_buffer_create(void)
1645{
1646    ure_buffer_t b;
1647
1648    b = (ure_buffer_t) calloc(1, sizeof(_ure_buffer_t));
1649
1650    return b;
1651}
1652
1653void
1654ure_buffer_free(ure_buffer_t buf)
1655{
1656    unsigned long i;
1657
1658    if (buf == 0)
1659      return;
1660
1661    if (buf->stack.slist_size > 0)
1662      free((char *) buf->stack.slist);
1663
1664    if (buf->expr_size > 0)
1665      free((char *) buf->expr);
1666
1667    for (i = 0; i < buf->symtab_size; i++) {
1668        if (buf->symtab[i].states.slist_size > 0)
1669          free((char *) buf->symtab[i].states.slist);
1670    }
1671
1672    if (buf->symtab_size > 0)
1673      free((char *) buf->symtab);
1674
1675    for (i = 0; i < buf->states.states_size; i++) {
1676        if (buf->states.states[i].trans_size > 0)
1677          free((char *) buf->states.states[i].trans);
1678        if (buf->states.states[i].st.slist_size > 0)
1679          free((char *) buf->states.states[i].st.slist);
1680    }
1681
1682    if (buf->states.states_size > 0)
1683      free((char *) buf->states.states);
1684
1685    if (buf->equiv_size > 0)
1686      free((char *) buf->equiv);
1687
1688    free((char *) buf);
1689}
1690
1691ure_dfa_t
1692ure_compile(ucs2_t *re, unsigned long relen, int casefold, ure_buffer_t buf)
1693{
1694    ucs2_t i, j, state;
1695    _ure_state_t *sp;
1696    _ure_dstate_t *dsp;
1697    _ure_trans_t *tp;
1698    ure_dfa_t dfa;
1699
1700    if (re == 0 || *re == 0 || relen == 0 || buf == 0)
1701      return 0;
1702
1703    /*
1704     * Reset the various fields of the compilation buffer.  Default the flags
1705     * to indicate the presense of the "^$" pattern.  If any other pattern
1706     * occurs, then this flag will be removed.  This is done to catch this
1707     * special pattern and handle it specially when matching.
1708     */
1709    buf->flags = _URE_DFA_BLANKLINE | ((casefold) ? _URE_DFA_CASEFOLD : 0);
1710    buf->reducing = 0;
1711    buf->stack.slist_used = 0;
1712    buf->expr_used = 0;
1713
1714    for (i = 0; i < buf->symtab_used; i++)
1715      buf->symtab[i].states.slist_used = 0;
1716    buf->symtab_used = 0;
1717
1718    for (i = 0; i < buf->states.states_used; i++) {
1719        buf->states.states[i].st.slist_used = 0;
1720        buf->states.states[i].trans_used = 0;
1721    }
1722    buf->states.states_used = 0;
1723
1724    /*
1725     * Construct the NFA.  If this stage returns a 0, then an error occured or
1726     * an empty expression was passed.
1727     */
1728    if ((state = _ure_re2nfa(re, relen, buf)) == _URE_NOOP)
1729      return 0;
1730
1731    /*
1732     * Do the expression reduction to get the initial DFA.
1733     */
1734    _ure_reduce(state, buf);
1735
1736    /*
1737     * Merge all the equivalent DFA states.
1738     */
1739    _ure_merge_equiv(buf);
1740
1741    /*
1742     * Construct the minimal DFA.
1743     */
1744    dfa = (ure_dfa_t) malloc(sizeof(_ure_dfa_t));
1745    (void) memset((char *) dfa, '\0', sizeof(_ure_dfa_t));
1746
1747    dfa->flags = buf->flags & (_URE_DFA_CASEFOLD|_URE_DFA_BLANKLINE);
1748
1749    /*
1750     * Free up the NFA state groups and transfer the symbols from the buffer
1751     * to the DFA.
1752     */
1753    for (i = 0; i < buf->symtab_size; i++) {
1754        if (buf->symtab[i].states.slist_size > 0)
1755          free((char *) buf->symtab[i].states.slist);
1756    }
1757    dfa->syms = buf->symtab;
1758    dfa->nsyms = buf->symtab_used;
1759
1760    buf->symtab_used = buf->symtab_size = 0;
1761
1762    /*
1763     * Collect the total number of states and transitions needed for the DFA.
1764     */
1765    for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
1766         i++, sp++) {
1767        if (sp->id == state) {
1768            dfa->nstates++;
1769            dfa->ntrans += sp->trans_used;
1770            state++;
1771        }
1772    }
1773
1774    /*
1775     * Allocate enough space for the states and transitions.
1776     */
1777    dfa->states = (_ure_dstate_t *) malloc(sizeof(_ure_dstate_t) *
1778                                           dfa->nstates);
1779    dfa->trans = (_ure_trans_t *) malloc(sizeof(_ure_trans_t) * dfa->ntrans);
1780
1781    /*
1782     * Actually transfer the DFA states from the buffer.
1783     */
1784    dsp = dfa->states;
1785    tp = dfa->trans;
1786    for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
1787         i++, sp++) {
1788        if (sp->id == state) {
1789            dsp->trans = tp;
1790            dsp->ntrans = sp->trans_used;
1791            dsp->accepting = sp->accepting;
1792
1793            /*
1794             * Add the transitions for the state.
1795             */
1796            for (j = 0; j < dsp->ntrans; j++, tp++) {
1797                tp->symbol = sp->trans[j].lhs;
1798                tp->next_state = buf->states.states[sp->trans[j].rhs].id;
1799            }
1800
1801            dsp++;
1802            state++;
1803        }
1804    }
1805
1806    return dfa;
1807}
1808
1809void
1810ure_dfa_free(ure_dfa_t dfa)
1811{
1812    ucs2_t i;
1813
1814    if (dfa == 0)
1815      return;
1816
1817    for (i = 0; i < dfa->nsyms; i++) {
1818        if ((dfa->syms[i].type == _URE_CCLASS ||
1819             dfa->syms[i].type == _URE_NCCLASS) &&
1820            dfa->syms[i].sym.ccl.ranges_size > 0)
1821          free((char *) dfa->syms[i].sym.ccl.ranges);
1822    }
1823    if (dfa->nsyms > 0)
1824      free((char *) dfa->syms);
1825
1826    if (dfa->nstates > 0)
1827      free((char *) dfa->states);
1828    if (dfa->ntrans > 0)
1829      free((char *) dfa->trans);
1830    free((char *) dfa);
1831}
1832
1833void
1834ure_write_dfa(ure_dfa_t dfa, FILE *out)
1835{
1836    ucs2_t i, j, k, h, l;
1837    _ure_dstate_t *sp;
1838    _ure_symtab_t *sym;
1839    _ure_range_t *rp;
1840
1841    if (dfa == 0 || out == 0)
1842      return;
1843
1844    /*
1845     * Write all the different character classes.
1846     */
1847    for (i = 0, sym = dfa->syms; i < dfa->nsyms; i++, sym++) {
1848        if (sym->type == _URE_CCLASS || sym->type == _URE_NCCLASS) {
1849            fprintf(out, "C%hd = ", sym->id);
1850            if (sym->sym.ccl.ranges_used > 0) {
1851                putc('[', out);
1852                if (sym->type == _URE_NCCLASS)
1853                  putc('^', out);
1854            }
1855            if (sym->props != 0) {
1856                if (sym->type == _URE_NCCLASS)
1857                  fprintf(out, "\\P");
1858                else
1859                  fprintf(out, "\\p");
1860                for (k = h = 0; k < 32; k++) {
1861                    if (sym->props & (1 << k)) {
1862                        if (h != 0)
1863                          putc(',', out);
1864                        fprintf(out, "%hd", k + 1);
1865                        h = 1;
1866                    }
1867                }
1868            }
1869            /*
1870             * Dump the ranges.
1871             */
1872            for (k = 0, rp = sym->sym.ccl.ranges;
1873                 k < sym->sym.ccl.ranges_used; k++, rp++) {
1874                /*
1875                 * Check for UTF16 characters.
1876                 */
1877                if (0x10000 <= rp->min_code &&
1878                    rp->min_code <= 0x10ffff) {
1879                    h = (ucs2_t) (((rp->min_code - 0x10000) >> 10) + 0xd800);
1880                    l = (ucs2_t) (((rp->min_code - 0x10000) & 1023) + 0xdc00);
1881                    fprintf(out, "\\x%04hX\\x%04hX", h, l);
1882                } else
1883                  fprintf(out, "\\x%04lX", rp->min_code & 0xffff);
1884                if (rp->max_code != rp->min_code) {
1885                    putc('-', out);
1886                    if (rp->max_code >= 0x10000 &&
1887                        rp->max_code <= 0x10ffff) {
1888                        h = (ucs2_t) (((rp->max_code - 0x10000) >> 10) + 0xd800);
1889                        l = (ucs2_t) (((rp->max_code - 0x10000) & 1023) + 0xdc00);
1890                        fprintf(out, "\\x%04hX\\x%04hX", h, l);
1891                    } else
1892                      fprintf(out, "\\x%04lX", rp->max_code & 0xffff);
1893                }
1894            }
1895            if (sym->sym.ccl.ranges_used > 0)
1896              putc(']', out);
1897            putc('\n', out);
1898        }
1899    }
1900
1901    for (i = 0, sp = dfa->states; i < dfa->nstates; i++, sp++) {
1902        fprintf(out, "S%hd = ", i);
1903        if (sp->accepting) {
1904            fprintf(out, "1 ");
1905            if (sp->ntrans)
1906              fprintf(out, "| ");
1907        }
1908        for (j = 0; j < sp->ntrans; j++) {
1909            if (j > 0)
1910              fprintf(out, "| ");
1911
1912            sym = dfa->syms + sp->trans[j].symbol;
1913            switch (sym->type) {
1914              case _URE_CHAR:
1915                if (0x10000 <= sym->sym.chr && sym->sym.chr <= 0x10ffff) {
1916                    /*
1917                     * Take care of UTF16 characters.
1918                     */
1919                    h = (ucs2_t) (((sym->sym.chr - 0x10000) >> 10) + 0xd800);
1920                    l = (ucs2_t) (((sym->sym.chr - 0x10000) & 1023) + 0xdc00);
1921                    fprintf(out, "\\x%04hX\\x%04hX ", h, l);
1922                } else
1923                  fprintf(out, "\\x%04lX ", sym->sym.chr & 0xffff);
1924                break;
1925              case _URE_ANY_CHAR:
1926                fprintf(out, "<any> ");
1927                break;
1928              case _URE_BOL_ANCHOR:
1929                fprintf(out, "<bol-anchor> ");
1930                break;
1931              case _URE_EOL_ANCHOR:
1932                fprintf(out, "<eol-anchor> ");
1933                break;
1934              case _URE_CCLASS:
1935              case _URE_NCCLASS:
1936                fprintf(out, "[C%hd] ", sym->id);
1937                break;
1938            }
1939            fprintf(out, "S%hd", sp->trans[j].next_state);
1940            if (j + 1 < sp->ntrans)
1941              putc(' ', out);
1942        }
1943        putc('\n', out);
1944    }
1945}
1946
1947#define _ure_issep(cc) ((cc) == '\n' || (cc) == '\r' || (cc) == 0x2028 ||\
1948                        (cc) == 0x2029)
1949
1950int
1951ure_exec(ure_dfa_t dfa, int flags, ucs2_t *text, unsigned long textlen,
1952         unsigned long *match_start, unsigned long *match_end)
1953{
1954    int i, j, matched, found, skip;
1955    unsigned long ms, me;
1956    ucs4_t c;
1957    ucs2_t *sp, *ep, *lp;
1958    _ure_dstate_t *stp;
1959    _ure_symtab_t *sym;
1960    _ure_range_t *rp;
1961
1962    if (dfa == 0 || text == 0)
1963      return 0;
1964
1965    /*
1966     * Handle the special case of an empty string matching the "^$" pattern.
1967     */
1968    if (textlen == 0 && (dfa->flags & _URE_DFA_BLANKLINE)) {
1969        *match_start = *match_end = 0;
1970        return 1;
1971    }
1972
1973    sp = text;
1974    ep = sp + textlen;
1975
1976    ms = me = ~0;
1977
1978    stp = dfa->states;
1979
1980    for (found = skip = 0; found == 0 && sp < ep; ) {
1981        lp = sp;
1982        c = *sp++;
1983
1984        /*
1985         * Check to see if this is a high surrogate that should be
1986         * combined with a following low surrogate.
1987         */
1988        if (sp < ep && 0xd800 <= c && c <= 0xdbff &&
1989            0xdc00 <= *sp && *sp <= 0xdfff)
1990          c = 0x10000 + (((c & 0x03ff) << 10) | (*sp++ & 0x03ff));
1991
1992        /*
1993         * Determine if the character is non-spacing and should be skipped.
1994         */
1995        if (_ure_matches_properties(_URE_NONSPACING, c) &&
1996            (flags & URE_IGNORE_NONSPACING)) {
1997            sp++;
1998            continue;
1999        }
2000
2001        if (dfa->flags & _URE_DFA_CASEFOLD)
2002          c = _ure_tolower(c);
2003
2004        /*
2005         * See if one of the transitions matches.
2006         */
2007        for (i = 0, matched = 0; matched == 0 && i < stp->ntrans; i++) {
2008            sym = dfa->syms + stp->trans[i].symbol;
2009            switch (sym->type) {
2010              case _URE_ANY_CHAR:
2011                if ((flags & URE_DOT_MATCHES_SEPARATORS) ||
2012                    !_ure_issep(c))
2013                  matched = 1;
2014                break;
2015              case _URE_CHAR:
2016                if (c == sym->sym.chr)
2017                  matched = 1;
2018                break;
2019              case _URE_BOL_ANCHOR:
2020                if (lp == text) {
2021                    sp = lp;
2022                    matched = 1;
2023                } else if (_ure_issep(c)) {
2024                    if (c == '\r' && sp < ep && *sp == '\n')
2025                      sp++;
2026                    lp = sp;
2027                    matched = 1;
2028                }
2029                break;
2030              case _URE_EOL_ANCHOR:
2031                if (_ure_issep(c)) {
2032                    /*
2033                     * Put the pointer back before the separator so the match
2034                     * end position will be correct.  This case will also
2035                     * cause the `sp' pointer to be advanced over the current
2036                     * separator once the match end point has been recorded.
2037                     */
2038                    sp = lp;
2039                    matched = 1;
2040                }
2041                break;
2042              case _URE_CCLASS:
2043              case _URE_NCCLASS:
2044                if (sym->props != 0)
2045                  matched = _ure_matches_properties(sym->props, c);
2046                for (j = 0, rp = sym->sym.ccl.ranges;
2047                     j < sym->sym.ccl.ranges_used; j++, rp++) {
2048                    if (rp->min_code <= c && c <= rp->max_code)
2049                      matched = 1;
2050                }
2051                if (sym->type == _URE_NCCLASS)
2052                  matched = !matched;
2053                break;
2054            }
2055
2056            if (matched) {
2057                if (ms == ~0UL)
2058                  ms = lp - text;
2059                else
2060                  me = sp - text;
2061                stp = dfa->states + stp->trans[i].next_state;
2062
2063                /*
2064                 * If the match was an EOL anchor, adjust the pointer past the
2065                 * separator that caused the match.  The correct match
2066                 * position has been recorded already.
2067                 */
2068                if (sym->type == _URE_EOL_ANCHOR) {
2069                    /*
2070                     * Skip the character that caused the match.
2071                     */
2072                    sp++;
2073
2074                    /*
2075                     * Handle the infamous CRLF situation.
2076                     */
2077                    if (sp < ep && c == '\r' && *sp == '\n')
2078                      sp++;
2079                }
2080            }
2081        }
2082
2083        if (matched == 0) {
2084            if (stp->accepting == 0) {
2085                /*
2086                 * If the last state was not accepting, then reset
2087                 * and start over.
2088                 */
2089                stp = dfa->states;
2090                ms = me = ~0;
2091            } else
2092              /*
2093               * The last state was accepting, so terminate the matching
2094               * loop to avoid more work.
2095               */
2096              found = 1;
2097        } else if (sp == ep) {
2098            if (!stp->accepting) {
2099                /*
2100                 * This ugly hack is to make sure the end-of-line anchors
2101                 * match when the source text hits the end.  This is only done
2102                 * if the last subexpression matches.
2103                 */
2104                for (i = 0; found == 0 && i < stp->ntrans; i++) {
2105                    sym = dfa->syms + stp->trans[i].symbol;
2106                    if (sym->type ==_URE_EOL_ANCHOR) {
2107                        stp = dfa->states + stp->trans[i].next_state;
2108                        if (stp->accepting) {
2109                            me = sp - text;
2110                            found = 1;
2111                        } else
2112                          break;
2113                    }
2114                }
2115            } else {
2116                /*
2117                 * Make sure any conditions that match all the way to the end
2118                 * of the string match.
2119                 */
2120                found = 1;
2121                me = sp - text;
2122            }
2123        }
2124    }
2125
2126    if (found == 0)
2127      ms = me = ~0;
2128
2129    *match_start = ms;
2130    *match_end = me;
2131
2132    return (ms != ~0UL) ? 1 : 0;
2133}
2134