1/* $OpenLDAP$ */ 2/* This work is part of OpenLDAP Software <http://www.openldap.org/>. 3 * 4 * Copyright 1998-2011 The OpenLDAP Foundation. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted only as authorized by the OpenLDAP 9 * Public License. 10 * 11 * A copy of this license is available in file LICENSE in the 12 * top-level directory of the distribution or, alternatively, at 13 * <http://www.OpenLDAP.org/license.html>. 14 */ 15/* Copyright 1997, 1998, 1999 Computing Research Labs, 16 * New Mexico State University 17 * 18 * Permission is hereby granted, free of charge, to any person obtaining a 19 * copy of this software and associated documentation files (the "Software"), 20 * to deal in the Software without restriction, including without limitation 21 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 22 * and/or sell copies of the Software, and to permit persons to whom the 23 * Software is furnished to do so, subject to the following conditions: 24 * 25 * The above copyright notice and this permission notice shall be included in 26 * all copies or substantial portions of the Software. 27 * 28 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 29 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 30 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 31 * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY 32 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT 33 * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR 34 * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 35 */ 36/* $Id: ure.c,v 1.2 1999/09/21 15:47:43 mleisher Exp $" */ 37 38#include "portable.h" 39 40#include <ac/stdlib.h> 41#include <ac/string.h> 42#include <ac/unistd.h> 43 44#include "ure.h" 45 46/* 47 * Flags used internally in the DFA. 48 */ 49#define _URE_DFA_CASEFOLD 0x01 50#define _URE_DFA_BLANKLINE 0x02 51 52static unsigned long cclass_flags[] = { 53 0, 54 _URE_NONSPACING, 55 _URE_COMBINING, 56 _URE_NUMDIGIT, 57 _URE_NUMOTHER, 58 _URE_SPACESEP, 59 _URE_LINESEP, 60 _URE_PARASEP, 61 _URE_CNTRL, 62 _URE_PUA, 63 _URE_UPPER, 64 _URE_LOWER, 65 _URE_TITLE, 66 _URE_MODIFIER, 67 _URE_OTHERLETTER, 68 _URE_DASHPUNCT, 69 _URE_OPENPUNCT, 70 _URE_CLOSEPUNCT, 71 _URE_OTHERPUNCT, 72 _URE_MATHSYM, 73 _URE_CURRENCYSYM, 74 _URE_OTHERSYM, 75 _URE_LTR, 76 _URE_RTL, 77 _URE_EURONUM, 78 _URE_EURONUMSEP, 79 _URE_EURONUMTERM, 80 _URE_ARABNUM, 81 _URE_COMMONSEP, 82 _URE_BLOCKSEP, 83 _URE_SEGMENTSEP, 84 _URE_WHITESPACE, 85 _URE_OTHERNEUT, 86}; 87 88/* 89 * Symbol types for the DFA. 90 */ 91#define _URE_ANY_CHAR 1 92#define _URE_CHAR 2 93#define _URE_CCLASS 3 94#define _URE_NCCLASS 4 95#define _URE_BOL_ANCHOR 5 96#define _URE_EOL_ANCHOR 6 97 98/* 99 * Op codes for converting the NFA to a DFA. 100 */ 101#define _URE_SYMBOL 10 102#define _URE_PAREN 11 103#define _URE_QUEST 12 104#define _URE_STAR 13 105#define _URE_PLUS 14 106#define _URE_ONE 15 107#define _URE_AND 16 108#define _URE_OR 17 109 110#define _URE_NOOP 0xffff 111 112#define _URE_REGSTART 0x8000 113#define _URE_REGEND 0x4000 114 115/* 116 * Structure used to handle a compacted range of characters. 117 */ 118typedef struct { 119 ucs4_t min_code; 120 ucs4_t max_code; 121} _ure_range_t; 122 123typedef struct { 124 _ure_range_t *ranges; 125 ucs2_t ranges_used; 126 ucs2_t ranges_size; 127} _ure_ccl_t; 128 129typedef union { 130 ucs4_t chr; 131 _ure_ccl_t ccl; 132} _ure_sym_t; 133 134/* 135 * This is a general element structure used for expressions and stack 136 * elements. 137 */ 138typedef struct { 139 ucs2_t reg; 140 ucs2_t onstack; 141 ucs2_t type; 142 ucs2_t lhs; 143 ucs2_t rhs; 144} _ure_elt_t; 145 146/* 147 * This is a structure used to track a list or a stack of states. 148 */ 149typedef struct { 150 ucs2_t *slist; 151 ucs2_t slist_size; 152 ucs2_t slist_used; 153} _ure_stlist_t; 154 155/* 156 * Structure to track the list of unique states for a symbol 157 * during reduction. 158 */ 159typedef struct { 160 ucs2_t id; 161 ucs2_t type; 162 unsigned long mods; 163 unsigned long props; 164 _ure_sym_t sym; 165 _ure_stlist_t states; 166} _ure_symtab_t; 167 168/* 169 * Structure to hold a single state. 170 */ 171typedef struct { 172 ucs2_t id; 173 ucs2_t accepting; 174 ucs2_t pad; 175 _ure_stlist_t st; 176 _ure_elt_t *trans; 177 ucs2_t trans_size; 178 ucs2_t trans_used; 179} _ure_state_t; 180 181/* 182 * Structure used for keeping lists of states. 183 */ 184typedef struct { 185 _ure_state_t *states; 186 ucs2_t states_size; 187 ucs2_t states_used; 188} _ure_statetable_t; 189 190/* 191 * Structure to track pairs of DFA states when equivalent states are 192 * merged. 193 */ 194typedef struct { 195 ucs2_t l; 196 ucs2_t r; 197} _ure_equiv_t; 198 199/* 200 * Structure used for constructing the NFA and reducing to a minimal DFA. 201 */ 202typedef struct _ure_buffer_t { 203 int reducing; 204 int error; 205 unsigned long flags; 206 207 _ure_stlist_t stack; 208 209 /* 210 * Table of unique symbols encountered. 211 */ 212 _ure_symtab_t *symtab; 213 ucs2_t symtab_size; 214 ucs2_t symtab_used; 215 216 /* 217 * Tracks the unique expressions generated for the NFA and when the NFA is 218 * reduced. 219 */ 220 _ure_elt_t *expr; 221 ucs2_t expr_used; 222 ucs2_t expr_size; 223 224 /* 225 * The reduced table of unique groups of NFA states. 226 */ 227 _ure_statetable_t states; 228 229 /* 230 * Tracks states when equivalent states are merged. 231 */ 232 _ure_equiv_t *equiv; 233 ucs2_t equiv_used; 234 ucs2_t equiv_size; 235} _ure_buffer_t; 236 237typedef struct { 238 ucs2_t symbol; 239 ucs2_t next_state; 240} _ure_trans_t; 241 242typedef struct { 243 ucs2_t accepting; 244 ucs2_t ntrans; 245 _ure_trans_t *trans; 246} _ure_dstate_t; 247 248typedef struct _ure_dfa_t { 249 unsigned long flags; 250 251 _ure_symtab_t *syms; 252 ucs2_t nsyms; 253 254 _ure_dstate_t *states; 255 ucs2_t nstates; 256 257 _ure_trans_t *trans; 258 ucs2_t ntrans; 259} _ure_dfa_t; 260 261/************************************************************************* 262 * 263 * Functions. 264 * 265 *************************************************************************/ 266 267static void 268_ure_memmove(char *dest, char *src, unsigned long bytes) 269{ 270 long i, j; 271 272 i = (long) bytes; 273 j = i & 7; 274 i = (i + 7) >> 3; 275 276 /* 277 * Do a memmove using Ye Olde Duff's Device for efficiency. 278 */ 279 if (src < dest) { 280 src += bytes; 281 dest += bytes; 282 283 switch (j) { 284 case 0: do { 285 *--dest = *--src; 286 case 7: *--dest = *--src; 287 case 6: *--dest = *--src; 288 case 5: *--dest = *--src; 289 case 4: *--dest = *--src; 290 case 3: *--dest = *--src; 291 case 2: *--dest = *--src; 292 case 1: *--dest = *--src; 293 } while (--i > 0); 294 } 295 } else if (src > dest) { 296 switch (j) { 297 case 0: do { 298 *dest++ = *src++; 299 case 7: *dest++ = *src++; 300 case 6: *dest++ = *src++; 301 case 5: *dest++ = *src++; 302 case 4: *dest++ = *src++; 303 case 3: *dest++ = *src++; 304 case 2: *dest++ = *src++; 305 case 1: *dest++ = *src++; 306 } while (--i > 0); 307 } 308 } 309} 310 311static void 312_ure_push(ucs2_t v, _ure_buffer_t *b) 313{ 314 _ure_stlist_t *s; 315 316 if (b == 0) 317 return; 318 319 /* 320 * If the `reducing' parameter is non-zero, check to see if the value 321 * passed is already on the stack. 322 */ 323 if (b->reducing != 0 && b->expr[v].onstack != 0) 324 return; 325 326 s = &b->stack; 327 if (s->slist_used == s->slist_size) { 328 if (s->slist_size == 0) 329 s->slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3); 330 else 331 s->slist = (ucs2_t *) realloc((char *) s->slist, 332 sizeof(ucs2_t) * (s->slist_size + 8)); 333 s->slist_size += 8; 334 } 335 s->slist[s->slist_used++] = v; 336 337 /* 338 * If the `reducing' parameter is non-zero, flag the element as being on 339 * the stack. 340 */ 341 if (b->reducing != 0) 342 b->expr[v].onstack = 1; 343} 344 345static ucs2_t 346_ure_peek(_ure_buffer_t *b) 347{ 348 if (b == 0 || b->stack.slist_used == 0) 349 return _URE_NOOP; 350 351 return b->stack.slist[b->stack.slist_used - 1]; 352} 353 354static ucs2_t 355_ure_pop(_ure_buffer_t *b) 356{ 357 ucs2_t v; 358 359 if (b == 0 || b->stack.slist_used == 0) 360 return _URE_NOOP; 361 362 v = b->stack.slist[--b->stack.slist_used]; 363 if (b->reducing) 364 b->expr[v].onstack = 0; 365 366 return v; 367} 368 369/************************************************************************* 370 * 371 * Start symbol parse functions. 372 * 373 *************************************************************************/ 374 375/* 376 * Parse a comma-separated list of integers that represent character 377 * properties. Combine them into a mask that is returned in the `mask' 378 * variable, and return the number of characters consumed. 379 */ 380static unsigned long 381_ure_prop_list(ucs2_t *pp, unsigned long limit, unsigned long *mask, 382 _ure_buffer_t *b) 383{ 384 unsigned long n, m; 385 ucs2_t *sp, *ep; 386 387 sp = pp; 388 ep = sp + limit; 389 390 for (m = n = 0; b->error == _URE_OK && sp < ep; sp++) { 391 if (*sp == ',') { 392 /* 393 * Encountered a comma, so select the next character property flag 394 * and reset the number. 395 */ 396 m |= cclass_flags[n]; 397 n = 0; 398 } else if (*sp >= '0' && *sp <= '9') 399 /* 400 * Encountered a digit, so start or continue building the cardinal 401 * that represents the character property flag. 402 */ 403 n = (n * 10) + (*sp - '0'); 404 else 405 /* 406 * Encountered something that is not part of the property list. 407 * Indicate that we are done. 408 */ 409 break; 410 411 /* 412 * If a property number greater than 32 occurs, then there is a 413 * problem. Most likely a missing comma separator. 414 */ 415 if (n > 32) 416 b->error = _URE_INVALID_PROPERTY; 417 } 418 419 if (n != 0) 420 m |= cclass_flags[n]; 421 422 /* 423 * Set the mask that represents the group of character properties. 424 */ 425 *mask = m; 426 427 /* 428 * Return the number of characters consumed. 429 */ 430 return sp - pp; 431} 432 433/* 434 * Collect a hex number with 1 to 4 digits and return the number 435 * of characters used. 436 */ 437static unsigned long 438_ure_hex(ucs2_t *np, unsigned long limit, ucs4_t *n) 439{ 440 ucs2_t i; 441 ucs2_t *sp, *ep; 442 ucs4_t nn; 443 444 sp = np; 445 ep = sp + limit; 446 447 for (nn = 0, i = 0; i < 4 && sp < ep; i++, sp++) { 448 if (*sp >= '0' && *sp <= '9') 449 nn = (nn << 4) + (*sp - '0'); 450 else if (*sp >= 'A' && *sp <= 'F') 451 nn = (nn << 4) + ((*sp - 'A') + 10); 452 else if (*sp >= 'a' && *sp <= 'f') 453 nn = (nn << 4) + ((*sp - 'a') + 10); 454 else 455 /* 456 * Encountered something that is not a hex digit. 457 */ 458 break; 459 } 460 461 /* 462 * Assign the character code collected and return the number of 463 * characters used. 464 */ 465 *n = nn; 466 467 return sp - np; 468} 469 470/* 471 * Insert a range into a character class, removing duplicates and ordering 472 * them in increasing range-start order. 473 */ 474static void 475_ure_add_range(_ure_ccl_t *ccl, _ure_range_t *r, _ure_buffer_t *b) 476{ 477 ucs2_t i; 478 ucs4_t tmp; 479 _ure_range_t *rp; 480 481 /* 482 * If the `casefold' flag is set, then make sure both endpoints of the 483 * range are converted to lower case. 484 */ 485 if (b->flags & _URE_DFA_CASEFOLD) { 486 r->min_code = _ure_tolower(r->min_code); 487 r->max_code = _ure_tolower(r->max_code); 488 } 489 490 /* 491 * Swap the range endpoints if they are not in increasing order. 492 */ 493 if (r->min_code > r->max_code) { 494 tmp = r->min_code; 495 r->min_code = r->max_code; 496 r->max_code = tmp; 497 } 498 499 for (i = 0, rp = ccl->ranges; 500 i < ccl->ranges_used && r->min_code < rp->min_code; i++, rp++) ; 501 502 /* 503 * Check for a duplicate. 504 */ 505 if (i < ccl->ranges_used && 506 r->min_code == rp->min_code && r->max_code == rp->max_code) 507 return; 508 509 if (ccl->ranges_used == ccl->ranges_size) { 510 if (ccl->ranges_size == 0) 511 ccl->ranges = (_ure_range_t *) malloc(sizeof(_ure_range_t) << 3); 512 else 513 ccl->ranges = (_ure_range_t *) 514 realloc((char *) ccl->ranges, 515 sizeof(_ure_range_t) * (ccl->ranges_size + 8)); 516 ccl->ranges_size += 8; 517 } 518 519 rp = ccl->ranges + ccl->ranges_used; 520 521 if (i < ccl->ranges_used) 522 _ure_memmove((char *) (rp + 1), (char *) rp, 523 sizeof(_ure_range_t) * (ccl->ranges_used - i)); 524 525 ccl->ranges_used++; 526 rp->min_code = r->min_code; 527 rp->max_code = r->max_code; 528} 529 530#define _URE_ALPHA_MASK (_URE_UPPER|_URE_LOWER|_URE_OTHERLETTER|\ 531_URE_MODIFIER|_URE_TITLE|_URE_NONSPACING|_URE_COMBINING) 532#define _URE_ALNUM_MASK (_URE_ALPHA_MASK|_URE_NUMDIGIT) 533#define _URE_PUNCT_MASK (_URE_DASHPUNCT|_URE_OPENPUNCT|_URE_CLOSEPUNCT|\ 534_URE_OTHERPUNCT) 535#define _URE_GRAPH_MASK (_URE_NUMDIGIT|_URE_NUMOTHER|_URE_ALPHA_MASK|\ 536_URE_MATHSYM|_URE_CURRENCYSYM|_URE_OTHERSYM) 537#define _URE_PRINT_MASK (_URE_GRAPH_MASK|_URE_SPACESEP) 538#define _URE_SPACE_MASK (_URE_SPACESEP|_URE_LINESEP|_URE_PARASEP) 539 540typedef void (*_ure_cclsetup_t)( 541 _ure_symtab_t *sym, 542 unsigned long mask, 543 _ure_buffer_t *b 544); 545 546typedef struct { 547 ucs2_t key; 548 unsigned long len; 549 unsigned long next; 550 _ure_cclsetup_t func; 551 unsigned long mask; 552} _ure_trie_t; 553 554static void 555_ure_ccl_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b) 556{ 557 sym->props |= mask; 558} 559 560static void 561_ure_space_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b) 562{ 563 _ure_range_t range; 564 565 sym->props |= mask; 566 567 /* 568 * Add the additional characters needed for handling isspace(). 569 */ 570 range.min_code = range.max_code = '\t'; 571 _ure_add_range(&sym->sym.ccl, &range, b); 572 range.min_code = range.max_code = '\r'; 573 _ure_add_range(&sym->sym.ccl, &range, b); 574 range.min_code = range.max_code = '\n'; 575 _ure_add_range(&sym->sym.ccl, &range, b); 576 range.min_code = range.max_code = '\f'; 577 _ure_add_range(&sym->sym.ccl, &range, b); 578 range.min_code = range.max_code = 0xfeff; 579 _ure_add_range(&sym->sym.ccl, &range, b); 580} 581 582static void 583_ure_xdigit_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b) 584{ 585 _ure_range_t range; 586 587 /* 588 * Add the additional characters needed for handling isxdigit(). 589 */ 590 range.min_code = '0'; 591 range.max_code = '9'; 592 _ure_add_range(&sym->sym.ccl, &range, b); 593 range.min_code = 'A'; 594 range.max_code = 'F'; 595 _ure_add_range(&sym->sym.ccl, &range, b); 596 range.min_code = 'a'; 597 range.max_code = 'f'; 598 _ure_add_range(&sym->sym.ccl, &range, b); 599} 600 601static _ure_trie_t cclass_trie[] = { 602 {0x003a, 1, 1, 0, 0}, 603 {0x0061, 9, 10, 0, 0}, 604 {0x0063, 8, 19, 0, 0}, 605 {0x0064, 7, 24, 0, 0}, 606 {0x0067, 6, 29, 0, 0}, 607 {0x006c, 5, 34, 0, 0}, 608 {0x0070, 4, 39, 0, 0}, 609 {0x0073, 3, 49, 0, 0}, 610 {0x0075, 2, 54, 0, 0}, 611 {0x0078, 1, 59, 0, 0}, 612 {0x006c, 1, 11, 0, 0}, 613 {0x006e, 2, 13, 0, 0}, 614 {0x0070, 1, 16, 0, 0}, 615 {0x0075, 1, 14, 0, 0}, 616 {0x006d, 1, 15, 0, 0}, 617 {0x003a, 1, 16, _ure_ccl_setup, _URE_ALNUM_MASK}, 618 {0x0068, 1, 17, 0, 0}, 619 {0x0061, 1, 18, 0, 0}, 620 {0x003a, 1, 19, _ure_ccl_setup, _URE_ALPHA_MASK}, 621 {0x006e, 1, 20, 0, 0}, 622 {0x0074, 1, 21, 0, 0}, 623 {0x0072, 1, 22, 0, 0}, 624 {0x006c, 1, 23, 0, 0}, 625 {0x003a, 1, 24, _ure_ccl_setup, _URE_CNTRL}, 626 {0x0069, 1, 25, 0, 0}, 627 {0x0067, 1, 26, 0, 0}, 628 {0x0069, 1, 27, 0, 0}, 629 {0x0074, 1, 28, 0, 0}, 630 {0x003a, 1, 29, _ure_ccl_setup, _URE_NUMDIGIT}, 631 {0x0072, 1, 30, 0, 0}, 632 {0x0061, 1, 31, 0, 0}, 633 {0x0070, 1, 32, 0, 0}, 634 {0x0068, 1, 33, 0, 0}, 635 {0x003a, 1, 34, _ure_ccl_setup, _URE_GRAPH_MASK}, 636 {0x006f, 1, 35, 0, 0}, 637 {0x0077, 1, 36, 0, 0}, 638 {0x0065, 1, 37, 0, 0}, 639 {0x0072, 1, 38, 0, 0}, 640 {0x003a, 1, 39, _ure_ccl_setup, _URE_LOWER}, 641 {0x0072, 2, 41, 0, 0}, 642 {0x0075, 1, 45, 0, 0}, 643 {0x0069, 1, 42, 0, 0}, 644 {0x006e, 1, 43, 0, 0}, 645 {0x0074, 1, 44, 0, 0}, 646 {0x003a, 1, 45, _ure_ccl_setup, _URE_PRINT_MASK}, 647 {0x006e, 1, 46, 0, 0}, 648 {0x0063, 1, 47, 0, 0}, 649 {0x0074, 1, 48, 0, 0}, 650 {0x003a, 1, 49, _ure_ccl_setup, _URE_PUNCT_MASK}, 651 {0x0070, 1, 50, 0, 0}, 652 {0x0061, 1, 51, 0, 0}, 653 {0x0063, 1, 52, 0, 0}, 654 {0x0065, 1, 53, 0, 0}, 655 {0x003a, 1, 54, _ure_space_setup, _URE_SPACE_MASK}, 656 {0x0070, 1, 55, 0, 0}, 657 {0x0070, 1, 56, 0, 0}, 658 {0x0065, 1, 57, 0, 0}, 659 {0x0072, 1, 58, 0, 0}, 660 {0x003a, 1, 59, _ure_ccl_setup, _URE_UPPER}, 661 {0x0064, 1, 60, 0, 0}, 662 {0x0069, 1, 61, 0, 0}, 663 {0x0067, 1, 62, 0, 0}, 664 {0x0069, 1, 63, 0, 0}, 665 {0x0074, 1, 64, 0, 0}, 666 {0x003a, 1, 65, _ure_xdigit_setup, 0}, 667}; 668 669/* 670 * Probe for one of the POSIX colon delimited character classes in the static 671 * trie. 672 */ 673static unsigned long 674_ure_posix_ccl(ucs2_t *cp, unsigned long limit, _ure_symtab_t *sym, 675 _ure_buffer_t *b) 676{ 677 int i; 678 unsigned long n; 679 _ure_trie_t *tp; 680 ucs2_t *sp, *ep; 681 682 /* 683 * If the number of characters left is less than 7, then this cannot be 684 * interpreted as one of the colon delimited classes. 685 */ 686 if (limit < 7) 687 return 0; 688 689 sp = cp; 690 ep = sp + limit; 691 tp = cclass_trie; 692 for (i = 0; sp < ep && i < 8; i++, sp++) { 693 n = tp->len; 694 695 for (; n > 0 && tp->key != *sp; tp++, n--) ; 696 697 if (n == 0) 698 return 0; 699 700 if (*sp == ':' && (i == 6 || i == 7)) { 701 sp++; 702 break; 703 } 704 if (sp + 1 < ep) 705 tp = cclass_trie + tp->next; 706 } 707 if (tp->func == 0) 708 return 0; 709 710 (*tp->func)(sym, tp->mask, b); 711 712 return sp - cp; 713} 714 715/* 716 * Construct a list of ranges and return the number of characters consumed. 717 */ 718static unsigned long 719_ure_cclass(ucs2_t *cp, unsigned long limit, _ure_symtab_t *symp, 720 _ure_buffer_t *b) 721{ 722 int range_end; 723 unsigned long n; 724 ucs2_t *sp, *ep; 725 ucs4_t c, last; 726 _ure_ccl_t *cclp; 727 _ure_range_t range; 728 729 sp = cp; 730 ep = sp + limit; 731 732 if (*sp == '^') { 733 symp->type = _URE_NCCLASS; 734 sp++; 735 } else 736 symp->type = _URE_CCLASS; 737 738 for (last = 0, range_end = 0; 739 b->error == _URE_OK && sp < ep && *sp != ']'; ) { 740 c = *sp++; 741 if (c == '\\') { 742 if (sp == ep) { 743 /* 744 * The EOS was encountered when expecting the reverse solidus 745 * to be followed by the character it is escaping. Set an 746 * error code and return the number of characters consumed up 747 * to this point. 748 */ 749 b->error = _URE_UNEXPECTED_EOS; 750 return sp - cp; 751 } 752 753 c = *sp++; 754 switch (c) { 755 case 'a': 756 c = 0x07; 757 break; 758 case 'b': 759 c = 0x08; 760 break; 761 case 'f': 762 c = 0x0c; 763 break; 764 case 'n': 765 c = 0x0a; 766 break; 767 case 'r': 768 c = 0x0d; 769 break; 770 case 't': 771 c = 0x09; 772 break; 773 case 'v': 774 c = 0x0b; 775 break; 776 case 'p': 777 case 'P': 778 sp += _ure_prop_list(sp, ep - sp, &symp->props, b); 779 /* 780 * Invert the bit mask of the properties if this is a negated 781 * character class or if 'P' is used to specify a list of 782 * character properties that should *not* match in a 783 * character class. 784 */ 785 if (c == 'P') 786 symp->props = ~symp->props; 787 continue; 788 break; 789 case 'x': 790 case 'X': 791 case 'u': 792 case 'U': 793 if (sp < ep && 794 ((*sp >= '0' && *sp <= '9') || 795 (*sp >= 'A' && *sp <= 'F') || 796 (*sp >= 'a' && *sp <= 'f'))) 797 sp += _ure_hex(sp, ep - sp, &c); 798 } 799 } else if (c == ':') { 800 /* 801 * Probe for a POSIX colon delimited character class. 802 */ 803 sp--; 804 if ((n = _ure_posix_ccl(sp, ep - sp, symp, b)) == 0) 805 sp++; 806 else { 807 sp += n; 808 continue; 809 } 810 } 811 812 cclp = &symp->sym.ccl; 813 814 /* 815 * Check to see if the current character is a low surrogate that needs 816 * to be combined with a preceding high surrogate. 817 */ 818 if (last != 0) { 819 if (c >= 0xdc00 && c <= 0xdfff) 820 /* 821 * Construct the UTF16 character code. 822 */ 823 c = 0x10000 + (((last & 0x03ff) << 10) | (c & 0x03ff)); 824 else { 825 /* 826 * Add the isolated high surrogate to the range. 827 */ 828 if (range_end == 1) 829 range.max_code = last & 0xffff; 830 else 831 range.min_code = range.max_code = last & 0xffff; 832 833 _ure_add_range(cclp, &range, b); 834 range_end = 0; 835 } 836 } 837 838 /* 839 * Clear the last character code. 840 */ 841 last = 0; 842 843 /* 844 * This slightly awkward code handles the different cases needed to 845 * construct a range. 846 */ 847 if (c >= 0xd800 && c <= 0xdbff) { 848 /* 849 * If the high surrogate is followed by a range indicator, simply 850 * add it as the range start. Otherwise, save it in case the next 851 * character is a low surrogate. 852 */ 853 if (*sp == '-') { 854 sp++; 855 range.min_code = c; 856 range_end = 1; 857 } else 858 last = c; 859 } else if (range_end == 1) { 860 range.max_code = c; 861 _ure_add_range(cclp, &range, b); 862 range_end = 0; 863 } else { 864 range.min_code = range.max_code = c; 865 if (*sp == '-') { 866 sp++; 867 range_end = 1; 868 } else 869 _ure_add_range(cclp, &range, b); 870 } 871 } 872 873 if (sp < ep && *sp == ']') 874 sp++; 875 else 876 /* 877 * The parse was not terminated by the character class close symbol 878 * (']'), so set an error code. 879 */ 880 b->error = _URE_CCLASS_OPEN; 881 882 return sp - cp; 883} 884 885/* 886 * Probe for a low surrogate hex code. 887 */ 888static unsigned long 889_ure_probe_ls(ucs2_t *ls, unsigned long limit, ucs4_t *c) 890{ 891 ucs4_t i, code; 892 ucs2_t *sp, *ep; 893 894 for (i = code = 0, sp = ls, ep = sp + limit; i < 4 && sp < ep; sp++) { 895 if (*sp >= '0' && *sp <= '9') 896 code = (code << 4) + (*sp - '0'); 897 else if (*sp >= 'A' && *sp <= 'F') 898 code = (code << 4) + ((*sp - 'A') + 10); 899 else if (*sp >= 'a' && *sp <= 'f') 900 code = (code << 4) + ((*sp - 'a') + 10); 901 else 902 break; 903 } 904 905 *c = code; 906 return (0xdc00 <= code && code <= 0xdfff) ? sp - ls : 0; 907} 908 909static unsigned long 910_ure_compile_symbol(ucs2_t *sym, unsigned long limit, _ure_symtab_t *symp, 911 _ure_buffer_t *b) 912{ 913 ucs4_t c; 914 ucs2_t *sp, *ep; 915 916 sp = sym; 917 ep = sym + limit; 918 919 if ((c = *sp++) == '\\') { 920 921 if (sp == ep) { 922 /* 923 * The EOS was encountered when expecting the reverse solidus to 924 * be followed by the character it is escaping. Set an error code 925 * and return the number of characters consumed up to this point. 926 */ 927 b->error = _URE_UNEXPECTED_EOS; 928 return sp - sym; 929 } 930 931 c = *sp++; 932 switch (c) { 933 case 'p': 934 case 'P': 935 symp->type = (c == 'p') ? _URE_CCLASS : _URE_NCCLASS; 936 sp += _ure_prop_list(sp, ep - sp, &symp->props, b); 937 break; 938 case 'a': 939 symp->type = _URE_CHAR; 940 symp->sym.chr = 0x07; 941 break; 942 case 'b': 943 symp->type = _URE_CHAR; 944 symp->sym.chr = 0x08; 945 break; 946 case 'f': 947 symp->type = _URE_CHAR; 948 symp->sym.chr = 0x0c; 949 break; 950 case 'n': 951 symp->type = _URE_CHAR; 952 symp->sym.chr = 0x0a; 953 break; 954 case 'r': 955 symp->type = _URE_CHAR; 956 symp->sym.chr = 0x0d; 957 break; 958 case 't': 959 symp->type = _URE_CHAR; 960 symp->sym.chr = 0x09; 961 break; 962 case 'v': 963 symp->type = _URE_CHAR; 964 symp->sym.chr = 0x0b; 965 break; 966 case 'x': 967 case 'X': 968 case 'u': 969 case 'U': 970 /* 971 * Collect between 1 and 4 digits representing a UCS2 code. Fall 972 * through to the next case. 973 */ 974 if (sp < ep && 975 ((*sp >= '0' && *sp <= '9') || 976 (*sp >= 'A' && *sp <= 'F') || 977 (*sp >= 'a' && *sp <= 'f'))) 978 sp += _ure_hex(sp, ep - sp, &c); 979 /* FALLTHROUGH */ 980 default: 981 /* 982 * Simply add an escaped character here. 983 */ 984 symp->type = _URE_CHAR; 985 symp->sym.chr = c; 986 } 987 } else if (c == '^' || c == '$') 988 /* 989 * Handle the BOL and EOL anchors. This actually consists simply of 990 * setting a flag that indicates that the user supplied anchor match 991 * function should be called. This needs to be done instead of simply 992 * matching line/paragraph separators because beginning-of-text and 993 * end-of-text tests are needed as well. 994 */ 995 symp->type = (c == '^') ? _URE_BOL_ANCHOR : _URE_EOL_ANCHOR; 996 else if (c == '[') 997 /* 998 * Construct a character class. 999 */ 1000 sp += _ure_cclass(sp, ep - sp, symp, b); 1001 else if (c == '.') 1002 symp->type = _URE_ANY_CHAR; 1003 else { 1004 symp->type = _URE_CHAR; 1005 symp->sym.chr = c; 1006 } 1007 1008 /* 1009 * If the symbol type happens to be a character and is a high surrogate, 1010 * then probe forward to see if it is followed by a low surrogate that 1011 * needs to be added. 1012 */ 1013 if (sp < ep && symp->type == _URE_CHAR && 1014 0xd800 <= symp->sym.chr && symp->sym.chr <= 0xdbff) { 1015 1016 if (0xdc00 <= *sp && *sp <= 0xdfff) { 1017 symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) | 1018 (*sp & 0x03ff)); 1019 sp++; 1020 } else if (*sp == '\\' && (*(sp + 1) == 'x' || *(sp + 1) == 'X' || 1021 *(sp + 1) == 'u' || *(sp + 1) == 'U')) { 1022 sp += _ure_probe_ls(sp + 2, ep - (sp + 2), &c); 1023 if (0xdc00 <= c && c <= 0xdfff) { 1024 /* 1025 * Take into account the \[xu] in front of the hex code. 1026 */ 1027 sp += 2; 1028 symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) | 1029 (c & 0x03ff)); 1030 } 1031 } 1032 } 1033 1034 /* 1035 * Last, make sure any _URE_CHAR type symbols are changed to lower case if 1036 * the `casefold' flag is set. 1037 */ 1038 if ((b->flags & _URE_DFA_CASEFOLD) && symp->type == _URE_CHAR) 1039 symp->sym.chr = _ure_tolower(symp->sym.chr); 1040 1041 /* 1042 * If the symbol constructed is anything other than one of the anchors, 1043 * make sure the _URE_DFA_BLANKLINE flag is removed. 1044 */ 1045 if (symp->type != _URE_BOL_ANCHOR && symp->type != _URE_EOL_ANCHOR) 1046 b->flags &= ~_URE_DFA_BLANKLINE; 1047 1048 /* 1049 * Return the number of characters consumed. 1050 */ 1051 return sp - sym; 1052} 1053 1054static int 1055_ure_sym_neq(_ure_symtab_t *a, _ure_symtab_t *b) 1056{ 1057 if (a->type != b->type || a->mods != b->mods || a->props != b->props) 1058 return 1; 1059 1060 if (a->type == _URE_CCLASS || a->type == _URE_NCCLASS) { 1061 if (a->sym.ccl.ranges_used != b->sym.ccl.ranges_used) 1062 return 1; 1063 if (a->sym.ccl.ranges_used > 0 && 1064 memcmp((char *) a->sym.ccl.ranges, (char *) b->sym.ccl.ranges, 1065 sizeof(_ure_range_t) * a->sym.ccl.ranges_used) != 0) 1066 return 1; 1067 } else if (a->type == _URE_CHAR && a->sym.chr != b->sym.chr) 1068 return 1; 1069 return 0; 1070} 1071 1072/* 1073 * Construct a symbol, but only keep unique symbols. 1074 */ 1075static ucs2_t 1076_ure_make_symbol(ucs2_t *sym, unsigned long limit, unsigned long *consumed, 1077 _ure_buffer_t *b) 1078{ 1079 ucs2_t i; 1080 _ure_symtab_t *sp, symbol; 1081 1082 /* 1083 * Build the next symbol so we can test to see if it is already in the 1084 * symbol table. 1085 */ 1086 (void) memset((char *) &symbol, '\0', sizeof(_ure_symtab_t)); 1087 *consumed = _ure_compile_symbol(sym, limit, &symbol, b); 1088 1089 /* 1090 * Check to see if the symbol exists. 1091 */ 1092 for (i = 0, sp = b->symtab; 1093 i < b->symtab_used && _ure_sym_neq(&symbol, sp); i++, sp++) ; 1094 1095 if (i < b->symtab_used) { 1096 /* 1097 * Free up any ranges used for the symbol. 1098 */ 1099 if ((symbol.type == _URE_CCLASS || symbol.type == _URE_NCCLASS) && 1100 symbol.sym.ccl.ranges_size > 0) 1101 free((char *) symbol.sym.ccl.ranges); 1102 1103 return b->symtab[i].id; 1104 } 1105 1106 /* 1107 * Need to add the new symbol. 1108 */ 1109 if (b->symtab_used == b->symtab_size) { 1110 if (b->symtab_size == 0) 1111 b->symtab = (_ure_symtab_t *) malloc(sizeof(_ure_symtab_t) << 3); 1112 else 1113 b->symtab = (_ure_symtab_t *) 1114 realloc((char *) b->symtab, 1115 sizeof(_ure_symtab_t) * (b->symtab_size + 8)); 1116 sp = b->symtab + b->symtab_size; 1117 (void) memset((char *) sp, '\0', sizeof(_ure_symtab_t) << 3); 1118 b->symtab_size += 8; 1119 } 1120 1121 symbol.id = b->symtab_used++; 1122 (void) AC_MEMCPY((char *) &b->symtab[symbol.id], (char *) &symbol, 1123 sizeof(_ure_symtab_t)); 1124 1125 return symbol.id; 1126} 1127 1128/************************************************************************* 1129 * 1130 * End symbol parse functions. 1131 * 1132 *************************************************************************/ 1133 1134static ucs2_t 1135_ure_make_expr(ucs2_t type, ucs2_t lhs, ucs2_t rhs, _ure_buffer_t *b) 1136{ 1137 ucs2_t i; 1138 1139 if (b == 0) 1140 return _URE_NOOP; 1141 1142 /* 1143 * Determine if the expression already exists or not. 1144 */ 1145 for (i = 0; i < b->expr_used; i++) { 1146 if (b->expr[i].type == type && b->expr[i].lhs == lhs && 1147 b->expr[i].rhs == rhs) 1148 break; 1149 } 1150 if (i < b->expr_used) 1151 return i; 1152 1153 /* 1154 * Need to add a new expression. 1155 */ 1156 if (b->expr_used == b->expr_size) { 1157 if (b->expr_size == 0) 1158 b->expr = (_ure_elt_t *) malloc(sizeof(_ure_elt_t) << 3); 1159 else 1160 b->expr = (_ure_elt_t *) 1161 realloc((char *) b->expr, 1162 sizeof(_ure_elt_t) * (b->expr_size + 8)); 1163 b->expr_size += 8; 1164 } 1165 1166 b->expr[b->expr_used].onstack = 0; 1167 b->expr[b->expr_used].type = type; 1168 b->expr[b->expr_used].lhs = lhs; 1169 b->expr[b->expr_used].rhs = rhs; 1170 1171 return b->expr_used++; 1172} 1173 1174static unsigned char spmap[] = { 1175 0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 1176 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 1177 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 1178}; 1179 1180#define _ure_isspecial(cc) ((cc) > 0x20 && (cc) < 0x7f && \ 1181 (spmap[(cc) >> 3] & (1 << ((cc) & 7)))) 1182 1183/* 1184 * Convert the regular expression into an NFA in a form that will be easy to 1185 * reduce to a DFA. The starting state for the reduction will be returned. 1186 */ 1187static ucs2_t 1188_ure_re2nfa(ucs2_t *re, unsigned long relen, _ure_buffer_t *b) 1189{ 1190 ucs2_t c, state, top, sym, *sp, *ep; 1191 unsigned long used; 1192 1193 state = _URE_NOOP; 1194 1195 sp = re; 1196 ep = sp + relen; 1197 while (b->error == _URE_OK && sp < ep) { 1198 c = *sp++; 1199 switch (c) { 1200 case '(': 1201 _ure_push(_URE_PAREN, b); 1202 break; 1203 case ')': 1204 /* 1205 * Check for the case of too many close parentheses. 1206 */ 1207 if (_ure_peek(b) == _URE_NOOP) { 1208 b->error = _URE_UNBALANCED_GROUP; 1209 break; 1210 } 1211 1212 while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR) 1213 /* 1214 * Make an expression with the AND or OR operator and its right 1215 * hand side. 1216 */ 1217 state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b); 1218 1219 /* 1220 * Remove the _URE_PAREN off the stack. 1221 */ 1222 (void) _ure_pop(b); 1223 break; 1224 case '*': 1225 state = _ure_make_expr(_URE_STAR, state, _URE_NOOP, b); 1226 break; 1227 case '+': 1228 state = _ure_make_expr(_URE_PLUS, state, _URE_NOOP, b); 1229 break; 1230 case '?': 1231 state = _ure_make_expr(_URE_QUEST, state, _URE_NOOP, b); 1232 break; 1233 case '|': 1234 while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR) 1235 /* 1236 * Make an expression with the AND or OR operator and its right 1237 * hand side. 1238 */ 1239 state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b); 1240 1241 _ure_push(state, b); 1242 _ure_push(_URE_OR, b); 1243 break; 1244 default: 1245 sp--; 1246 sym = _ure_make_symbol(sp, ep - sp, &used, b); 1247 sp += used; 1248 state = _ure_make_expr(_URE_SYMBOL, sym, _URE_NOOP, b); 1249 break; 1250 } 1251 1252 if (c != '(' && c != '|' && sp < ep && 1253 (!_ure_isspecial(*sp) || *sp == '(')) { 1254 _ure_push(state, b); 1255 _ure_push(_URE_AND, b); 1256 } 1257 } 1258 while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR) 1259 /* 1260 * Make an expression with the AND or OR operator and its right 1261 * hand side. 1262 */ 1263 state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b); 1264 1265 if (b->stack.slist_used > 0) 1266 b->error = _URE_UNBALANCED_GROUP; 1267 1268 return (b->error == _URE_OK) ? state : _URE_NOOP; 1269} 1270 1271static void 1272_ure_add_symstate(ucs2_t sym, ucs2_t state, _ure_buffer_t *b) 1273{ 1274 ucs2_t i, *stp; 1275 _ure_symtab_t *sp; 1276 1277 /* 1278 * Locate the symbol in the symbol table so the state can be added. 1279 * If the symbol doesn't exist, then a real problem exists. 1280 */ 1281 for (i = 0, sp = b->symtab; i < b->symtab_used && sym != sp->id; 1282 i++, sp++) ; 1283 1284 /* 1285 * Now find out if the state exists in the symbol's state list. 1286 */ 1287 for (i = 0, stp = sp->states.slist; 1288 i < sp->states.slist_used && state > *stp; i++, stp++) ; 1289 1290 if (i == sp->states.slist_used || state < *stp) { 1291 /* 1292 * Need to add the state in order. 1293 */ 1294 if (sp->states.slist_used == sp->states.slist_size) { 1295 if (sp->states.slist_size == 0) 1296 sp->states.slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3); 1297 else 1298 sp->states.slist = (ucs2_t *) 1299 realloc((char *) sp->states.slist, 1300 sizeof(ucs2_t) * (sp->states.slist_size + 8)); 1301 sp->states.slist_size += 8; 1302 } 1303 if (i < sp->states.slist_used) 1304 (void) _ure_memmove((char *) (sp->states.slist + i + 1), 1305 (char *) (sp->states.slist + i), 1306 sizeof(ucs2_t) * (sp->states.slist_used - i)); 1307 sp->states.slist[i] = state; 1308 sp->states.slist_used++; 1309 } 1310} 1311 1312static ucs2_t 1313_ure_add_state(ucs2_t nstates, ucs2_t *states, _ure_buffer_t *b) 1314{ 1315 ucs2_t i; 1316 _ure_state_t *sp; 1317 1318 for (i = 0, sp = b->states.states; i < b->states.states_used; i++, sp++) { 1319 if (sp->st.slist_used == nstates && 1320 memcmp((char *) states, (char *) sp->st.slist, 1321 sizeof(ucs2_t) * nstates) == 0) 1322 break; 1323 } 1324 1325 if (i == b->states.states_used) { 1326 /* 1327 * Need to add a new DFA state (set of NFA states). 1328 */ 1329 if (b->states.states_used == b->states.states_size) { 1330 if (b->states.states_size == 0) 1331 b->states.states = (_ure_state_t *) 1332 malloc(sizeof(_ure_state_t) << 3); 1333 else 1334 b->states.states = (_ure_state_t *) 1335 realloc((char *) b->states.states, 1336 sizeof(_ure_state_t) * (b->states.states_size + 8)); 1337 sp = b->states.states + b->states.states_size; 1338 (void) memset((char *) sp, '\0', sizeof(_ure_state_t) << 3); 1339 b->states.states_size += 8; 1340 } 1341 1342 sp = b->states.states + b->states.states_used++; 1343 sp->id = i; 1344 1345 if (sp->st.slist_used + nstates > sp->st.slist_size) { 1346 if (sp->st.slist_size == 0) 1347 sp->st.slist = (ucs2_t *) 1348 malloc(sizeof(ucs2_t) * (sp->st.slist_used + nstates)); 1349 else 1350 sp->st.slist = (ucs2_t *) 1351 realloc((char *) sp->st.slist, 1352 sizeof(ucs2_t) * (sp->st.slist_used + nstates)); 1353 sp->st.slist_size = sp->st.slist_used + nstates; 1354 } 1355 sp->st.slist_used = nstates; 1356 (void) AC_MEMCPY((char *) sp->st.slist, (char *) states, 1357 sizeof(ucs2_t) * nstates); 1358 } 1359 1360 /* 1361 * Return the ID of the DFA state representing a group of NFA states. 1362 */ 1363 return i; 1364} 1365 1366static void 1367_ure_reduce(ucs2_t start, _ure_buffer_t *b) 1368{ 1369 ucs2_t i, j, state, eval, syms, rhs; 1370 ucs2_t s1, s2, ns1, ns2; 1371 _ure_state_t *sp; 1372 _ure_symtab_t *smp; 1373 1374 b->reducing = 1; 1375 1376 /* 1377 * Add the starting state for the reduction. 1378 */ 1379 _ure_add_state(1, &start, b); 1380 1381 /* 1382 * Process each set of NFA states that get created. 1383 */ 1384 for (i = 0; i < b->states.states_used; i++) { 1385 sp = b->states.states + i; 1386 1387 /* 1388 * Push the current states on the stack. 1389 */ 1390 for (j = 0; j < sp->st.slist_used; j++) 1391 _ure_push(sp->st.slist[j], b); 1392 1393 /* 1394 * Reduce the NFA states. 1395 */ 1396 for (j = sp->accepting = syms = 0; j < b->stack.slist_used; j++) { 1397 state = b->stack.slist[j]; 1398 eval = 1; 1399 1400 /* 1401 * This inner loop is the iterative equivalent of recursively 1402 * reducing subexpressions generated as a result of a reduction. 1403 */ 1404 while (eval) { 1405 switch (b->expr[state].type) { 1406 case _URE_SYMBOL: 1407 ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b); 1408 _ure_add_symstate(b->expr[state].lhs, ns1, b); 1409 syms++; 1410 eval = 0; 1411 break; 1412 case _URE_ONE: 1413 sp->accepting = 1; 1414 eval = 0; 1415 break; 1416 case _URE_QUEST: 1417 s1 = b->expr[state].lhs; 1418 ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b); 1419 state = _ure_make_expr(_URE_OR, ns1, s1, b); 1420 break; 1421 case _URE_PLUS: 1422 s1 = b->expr[state].lhs; 1423 ns1 = _ure_make_expr(_URE_STAR, s1, _URE_NOOP, b); 1424 state = _ure_make_expr(_URE_AND, s1, ns1, b); 1425 break; 1426 case _URE_STAR: 1427 s1 = b->expr[state].lhs; 1428 ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b); 1429 ns2 = _ure_make_expr(_URE_PLUS, s1, _URE_NOOP, b); 1430 state = _ure_make_expr(_URE_OR, ns1, ns2, b); 1431 break; 1432 case _URE_OR: 1433 s1 = b->expr[state].lhs; 1434 s2 = b->expr[state].rhs; 1435 _ure_push(s1, b); 1436 _ure_push(s2, b); 1437 eval = 0; 1438 break; 1439 case _URE_AND: 1440 s1 = b->expr[state].lhs; 1441 s2 = b->expr[state].rhs; 1442 switch (b->expr[s1].type) { 1443 case _URE_SYMBOL: 1444 _ure_add_symstate(b->expr[s1].lhs, s2, b); 1445 syms++; 1446 eval = 0; 1447 break; 1448 case _URE_ONE: 1449 state = s2; 1450 break; 1451 case _URE_QUEST: 1452 ns1 = b->expr[s1].lhs; 1453 ns2 = _ure_make_expr(_URE_AND, ns1, s2, b); 1454 state = _ure_make_expr(_URE_OR, s2, ns2, b); 1455 break; 1456 case _URE_PLUS: 1457 ns1 = b->expr[s1].lhs; 1458 ns2 = _ure_make_expr(_URE_OR, s2, state, b); 1459 state = _ure_make_expr(_URE_AND, ns1, ns2, b); 1460 break; 1461 case _URE_STAR: 1462 ns1 = b->expr[s1].lhs; 1463 ns2 = _ure_make_expr(_URE_AND, ns1, state, b); 1464 state = _ure_make_expr(_URE_OR, s2, ns2, b); 1465 break; 1466 case _URE_OR: 1467 ns1 = b->expr[s1].lhs; 1468 ns2 = b->expr[s1].rhs; 1469 ns1 = _ure_make_expr(_URE_AND, ns1, s2, b); 1470 ns2 = _ure_make_expr(_URE_AND, ns2, s2, b); 1471 state = _ure_make_expr(_URE_OR, ns1, ns2, b); 1472 break; 1473 case _URE_AND: 1474 ns1 = b->expr[s1].lhs; 1475 ns2 = b->expr[s1].rhs; 1476 ns2 = _ure_make_expr(_URE_AND, ns2, s2, b); 1477 state = _ure_make_expr(_URE_AND, ns1, ns2, b); 1478 break; 1479 } 1480 } 1481 } 1482 } 1483 1484 /* 1485 * Clear the state stack. 1486 */ 1487 while (_ure_pop(b) != _URE_NOOP) ; 1488 1489 /* 1490 * Reset the state pointer because the reduction may have moved it 1491 * during a reallocation. 1492 */ 1493 sp = b->states.states + i; 1494 1495 /* 1496 * Generate the DFA states for the symbols collected during the 1497 * current reduction. 1498 */ 1499 if (sp->trans_used + syms > sp->trans_size) { 1500 if (sp->trans_size == 0) 1501 sp->trans = (_ure_elt_t *) 1502 malloc(sizeof(_ure_elt_t) * (sp->trans_used + syms)); 1503 else 1504 sp->trans = (_ure_elt_t *) 1505 realloc((char *) sp->trans, 1506 sizeof(_ure_elt_t) * (sp->trans_used + syms)); 1507 sp->trans_size = sp->trans_used + syms; 1508 } 1509 1510 /* 1511 * Go through the symbol table and generate the DFA state transitions 1512 * for each symbol that has collected NFA states. 1513 */ 1514 for (j = syms = 0, smp = b->symtab; j < b->symtab_used; j++, smp++) { 1515 sp = b->states.states + i; 1516 1517 if (smp->states.slist_used > 0) { 1518 sp->trans[syms].lhs = smp->id; 1519 rhs = _ure_add_state(smp->states.slist_used, 1520 smp->states.slist, b); 1521 /* 1522 * Reset the state pointer in case the reallocation moves it 1523 * in memory. 1524 */ 1525 sp = b->states.states + i; 1526 sp->trans[syms].rhs = rhs; 1527 1528 smp->states.slist_used = 0; 1529 syms++; 1530 } 1531 } 1532 1533 /* 1534 * Set the number of transitions actually used. 1535 */ 1536 sp->trans_used = syms; 1537 } 1538 b->reducing = 0; 1539} 1540 1541static void 1542_ure_add_equiv(ucs2_t l, ucs2_t r, _ure_buffer_t *b) 1543{ 1544 ucs2_t tmp; 1545 1546 l = b->states.states[l].id; 1547 r = b->states.states[r].id; 1548 1549 if (l == r) 1550 return; 1551 1552 if (l > r) { 1553 tmp = l; 1554 l = r; 1555 r = tmp; 1556 } 1557 1558 /* 1559 * Check to see if the equivalence pair already exists. 1560 */ 1561 for (tmp = 0; tmp < b->equiv_used && 1562 (b->equiv[tmp].l != l || b->equiv[tmp].r != r); 1563 tmp++) ; 1564 1565 if (tmp < b->equiv_used) 1566 return; 1567 1568 if (b->equiv_used == b->equiv_size) { 1569 if (b->equiv_size == 0) 1570 b->equiv = (_ure_equiv_t *) malloc(sizeof(_ure_equiv_t) << 3); 1571 else 1572 b->equiv = (_ure_equiv_t *) realloc((char *) b->equiv, 1573 sizeof(_ure_equiv_t) * 1574 (b->equiv_size + 8)); 1575 b->equiv_size += 8; 1576 } 1577 b->equiv[b->equiv_used].l = l; 1578 b->equiv[b->equiv_used].r = r; 1579 b->equiv_used++; 1580} 1581 1582/* 1583 * Merge the DFA states that are equivalent. 1584 */ 1585static void 1586_ure_merge_equiv(_ure_buffer_t *b) 1587{ 1588 ucs2_t i, j, k, eq, done; 1589 _ure_state_t *sp1, *sp2, *ls, *rs; 1590 1591 for (i = 0; i < b->states.states_used; i++) { 1592 sp1 = b->states.states + i; 1593 if (sp1->id != i) 1594 continue; 1595 for (j = 0; j < i; j++) { 1596 sp2 = b->states.states + j; 1597 if (sp2->id != j) 1598 continue; 1599 b->equiv_used = 0; 1600 _ure_add_equiv(i, j, b); 1601 for (eq = 0, done = 0; eq < b->equiv_used; eq++) { 1602 ls = b->states.states + b->equiv[eq].l; 1603 rs = b->states.states + b->equiv[eq].r; 1604 if (ls->accepting != rs->accepting || 1605 ls->trans_used != rs->trans_used) { 1606 done = 1; 1607 break; 1608 } 1609 for (k = 0; k < ls->trans_used && 1610 ls->trans[k].lhs == rs->trans[k].lhs; k++) ; 1611 if (k < ls->trans_used) { 1612 done = 1; 1613 break; 1614 } 1615 1616 for (k = 0; k < ls->trans_used; k++) 1617 _ure_add_equiv(ls->trans[k].rhs, rs->trans[k].rhs, b); 1618 } 1619 if (done == 0) 1620 break; 1621 } 1622 for (eq = 0; j < i && eq < b->equiv_used; eq++) 1623 b->states.states[b->equiv[eq].r].id = 1624 b->states.states[b->equiv[eq].l].id; 1625 } 1626 1627 /* 1628 * Renumber the states appropriately. 1629 */ 1630 for (i = eq = 0, sp1 = b->states.states; i < b->states.states_used; 1631 sp1++, i++) 1632 sp1->id = (sp1->id == i) ? eq++ : b->states.states[sp1->id].id; 1633} 1634 1635/************************************************************************* 1636 * 1637 * API. 1638 * 1639 *************************************************************************/ 1640 1641ure_buffer_t 1642ure_buffer_create(void) 1643{ 1644 ure_buffer_t b; 1645 1646 b = (ure_buffer_t) calloc(1, sizeof(_ure_buffer_t)); 1647 1648 return b; 1649} 1650 1651void 1652ure_buffer_free(ure_buffer_t buf) 1653{ 1654 unsigned long i; 1655 1656 if (buf == 0) 1657 return; 1658 1659 if (buf->stack.slist_size > 0) 1660 free((char *) buf->stack.slist); 1661 1662 if (buf->expr_size > 0) 1663 free((char *) buf->expr); 1664 1665 for (i = 0; i < buf->symtab_size; i++) { 1666 if (buf->symtab[i].states.slist_size > 0) 1667 free((char *) buf->symtab[i].states.slist); 1668 } 1669 1670 if (buf->symtab_size > 0) 1671 free((char *) buf->symtab); 1672 1673 for (i = 0; i < buf->states.states_size; i++) { 1674 if (buf->states.states[i].trans_size > 0) 1675 free((char *) buf->states.states[i].trans); 1676 if (buf->states.states[i].st.slist_size > 0) 1677 free((char *) buf->states.states[i].st.slist); 1678 } 1679 1680 if (buf->states.states_size > 0) 1681 free((char *) buf->states.states); 1682 1683 if (buf->equiv_size > 0) 1684 free((char *) buf->equiv); 1685 1686 free((char *) buf); 1687} 1688 1689ure_dfa_t 1690ure_compile(ucs2_t *re, unsigned long relen, int casefold, ure_buffer_t buf) 1691{ 1692 ucs2_t i, j, state; 1693 _ure_state_t *sp; 1694 _ure_dstate_t *dsp; 1695 _ure_trans_t *tp; 1696 ure_dfa_t dfa; 1697 1698 if (re == 0 || *re == 0 || relen == 0 || buf == 0) 1699 return 0; 1700 1701 /* 1702 * Reset the various fields of the compilation buffer. Default the flags 1703 * to indicate the presense of the "^$" pattern. If any other pattern 1704 * occurs, then this flag will be removed. This is done to catch this 1705 * special pattern and handle it specially when matching. 1706 */ 1707 buf->flags = _URE_DFA_BLANKLINE | ((casefold) ? _URE_DFA_CASEFOLD : 0); 1708 buf->reducing = 0; 1709 buf->stack.slist_used = 0; 1710 buf->expr_used = 0; 1711 1712 for (i = 0; i < buf->symtab_used; i++) 1713 buf->symtab[i].states.slist_used = 0; 1714 buf->symtab_used = 0; 1715 1716 for (i = 0; i < buf->states.states_used; i++) { 1717 buf->states.states[i].st.slist_used = 0; 1718 buf->states.states[i].trans_used = 0; 1719 } 1720 buf->states.states_used = 0; 1721 1722 /* 1723 * Construct the NFA. If this stage returns a 0, then an error occured or 1724 * an empty expression was passed. 1725 */ 1726 if ((state = _ure_re2nfa(re, relen, buf)) == _URE_NOOP) 1727 return 0; 1728 1729 /* 1730 * Do the expression reduction to get the initial DFA. 1731 */ 1732 _ure_reduce(state, buf); 1733 1734 /* 1735 * Merge all the equivalent DFA states. 1736 */ 1737 _ure_merge_equiv(buf); 1738 1739 /* 1740 * Construct the minimal DFA. 1741 */ 1742 dfa = (ure_dfa_t) malloc(sizeof(_ure_dfa_t)); 1743 (void) memset((char *) dfa, '\0', sizeof(_ure_dfa_t)); 1744 1745 dfa->flags = buf->flags & (_URE_DFA_CASEFOLD|_URE_DFA_BLANKLINE); 1746 1747 /* 1748 * Free up the NFA state groups and transfer the symbols from the buffer 1749 * to the DFA. 1750 */ 1751 for (i = 0; i < buf->symtab_size; i++) { 1752 if (buf->symtab[i].states.slist_size > 0) 1753 free((char *) buf->symtab[i].states.slist); 1754 } 1755 dfa->syms = buf->symtab; 1756 dfa->nsyms = buf->symtab_used; 1757 1758 buf->symtab_used = buf->symtab_size = 0; 1759 1760 /* 1761 * Collect the total number of states and transitions needed for the DFA. 1762 */ 1763 for (i = state = 0, sp = buf->states.states; i < buf->states.states_used; 1764 i++, sp++) { 1765 if (sp->id == state) { 1766 dfa->nstates++; 1767 dfa->ntrans += sp->trans_used; 1768 state++; 1769 } 1770 } 1771 1772 /* 1773 * Allocate enough space for the states and transitions. 1774 */ 1775 dfa->states = (_ure_dstate_t *) malloc(sizeof(_ure_dstate_t) * 1776 dfa->nstates); 1777 dfa->trans = (_ure_trans_t *) malloc(sizeof(_ure_trans_t) * dfa->ntrans); 1778 1779 /* 1780 * Actually transfer the DFA states from the buffer. 1781 */ 1782 dsp = dfa->states; 1783 tp = dfa->trans; 1784 for (i = state = 0, sp = buf->states.states; i < buf->states.states_used; 1785 i++, sp++) { 1786 if (sp->id == state) { 1787 dsp->trans = tp; 1788 dsp->ntrans = sp->trans_used; 1789 dsp->accepting = sp->accepting; 1790 1791 /* 1792 * Add the transitions for the state. 1793 */ 1794 for (j = 0; j < dsp->ntrans; j++, tp++) { 1795 tp->symbol = sp->trans[j].lhs; 1796 tp->next_state = buf->states.states[sp->trans[j].rhs].id; 1797 } 1798 1799 dsp++; 1800 state++; 1801 } 1802 } 1803 1804 return dfa; 1805} 1806 1807void 1808ure_dfa_free(ure_dfa_t dfa) 1809{ 1810 ucs2_t i; 1811 1812 if (dfa == 0) 1813 return; 1814 1815 for (i = 0; i < dfa->nsyms; i++) { 1816 if ((dfa->syms[i].type == _URE_CCLASS || 1817 dfa->syms[i].type == _URE_NCCLASS) && 1818 dfa->syms[i].sym.ccl.ranges_size > 0) 1819 free((char *) dfa->syms[i].sym.ccl.ranges); 1820 } 1821 if (dfa->nsyms > 0) 1822 free((char *) dfa->syms); 1823 1824 if (dfa->nstates > 0) 1825 free((char *) dfa->states); 1826 if (dfa->ntrans > 0) 1827 free((char *) dfa->trans); 1828 free((char *) dfa); 1829} 1830 1831void 1832ure_write_dfa(ure_dfa_t dfa, FILE *out) 1833{ 1834 ucs2_t i, j, k, h, l; 1835 _ure_dstate_t *sp; 1836 _ure_symtab_t *sym; 1837 _ure_range_t *rp; 1838 1839 if (dfa == 0 || out == 0) 1840 return; 1841 1842 /* 1843 * Write all the different character classes. 1844 */ 1845 for (i = 0, sym = dfa->syms; i < dfa->nsyms; i++, sym++) { 1846 if (sym->type == _URE_CCLASS || sym->type == _URE_NCCLASS) { 1847 fprintf(out, "C%hd = ", sym->id); 1848 if (sym->sym.ccl.ranges_used > 0) { 1849 putc('[', out); 1850 if (sym->type == _URE_NCCLASS) 1851 putc('^', out); 1852 } 1853 if (sym->props != 0) { 1854 if (sym->type == _URE_NCCLASS) 1855 fprintf(out, "\\P"); 1856 else 1857 fprintf(out, "\\p"); 1858 for (k = h = 0; k < 32; k++) { 1859 if (sym->props & (1 << k)) { 1860 if (h != 0) 1861 putc(',', out); 1862 fprintf(out, "%hd", k + 1); 1863 h = 1; 1864 } 1865 } 1866 } 1867 /* 1868 * Dump the ranges. 1869 */ 1870 for (k = 0, rp = sym->sym.ccl.ranges; 1871 k < sym->sym.ccl.ranges_used; k++, rp++) { 1872 /* 1873 * Check for UTF16 characters. 1874 */ 1875 if (0x10000 <= rp->min_code && 1876 rp->min_code <= 0x10ffff) { 1877 h = (ucs2_t) (((rp->min_code - 0x10000) >> 10) + 0xd800); 1878 l = (ucs2_t) (((rp->min_code - 0x10000) & 1023) + 0xdc00); 1879 fprintf(out, "\\x%04hX\\x%04hX", h, l); 1880 } else 1881 fprintf(out, "\\x%04lX", rp->min_code & 0xffff); 1882 if (rp->max_code != rp->min_code) { 1883 putc('-', out); 1884 if (rp->max_code >= 0x10000 && 1885 rp->max_code <= 0x10ffff) { 1886 h = (ucs2_t) (((rp->max_code - 0x10000) >> 10) + 0xd800); 1887 l = (ucs2_t) (((rp->max_code - 0x10000) & 1023) + 0xdc00); 1888 fprintf(out, "\\x%04hX\\x%04hX", h, l); 1889 } else 1890 fprintf(out, "\\x%04lX", rp->max_code & 0xffff); 1891 } 1892 } 1893 if (sym->sym.ccl.ranges_used > 0) 1894 putc(']', out); 1895 putc('\n', out); 1896 } 1897 } 1898 1899 for (i = 0, sp = dfa->states; i < dfa->nstates; i++, sp++) { 1900 fprintf(out, "S%hd = ", i); 1901 if (sp->accepting) { 1902 fprintf(out, "1 "); 1903 if (sp->ntrans) 1904 fprintf(out, "| "); 1905 } 1906 for (j = 0; j < sp->ntrans; j++) { 1907 if (j > 0) 1908 fprintf(out, "| "); 1909 1910 sym = dfa->syms + sp->trans[j].symbol; 1911 switch (sym->type) { 1912 case _URE_CHAR: 1913 if (0x10000 <= sym->sym.chr && sym->sym.chr <= 0x10ffff) { 1914 /* 1915 * Take care of UTF16 characters. 1916 */ 1917 h = (ucs2_t) (((sym->sym.chr - 0x10000) >> 10) + 0xd800); 1918 l = (ucs2_t) (((sym->sym.chr - 0x10000) & 1023) + 0xdc00); 1919 fprintf(out, "\\x%04hX\\x%04hX ", h, l); 1920 } else 1921 fprintf(out, "\\x%04lX ", sym->sym.chr & 0xffff); 1922 break; 1923 case _URE_ANY_CHAR: 1924 fprintf(out, "<any> "); 1925 break; 1926 case _URE_BOL_ANCHOR: 1927 fprintf(out, "<bol-anchor> "); 1928 break; 1929 case _URE_EOL_ANCHOR: 1930 fprintf(out, "<eol-anchor> "); 1931 break; 1932 case _URE_CCLASS: 1933 case _URE_NCCLASS: 1934 fprintf(out, "[C%hd] ", sym->id); 1935 break; 1936 } 1937 fprintf(out, "S%hd", sp->trans[j].next_state); 1938 if (j + 1 < sp->ntrans) 1939 putc(' ', out); 1940 } 1941 putc('\n', out); 1942 } 1943} 1944 1945#define _ure_issep(cc) ((cc) == '\n' || (cc) == '\r' || (cc) == 0x2028 ||\ 1946 (cc) == 0x2029) 1947 1948int 1949ure_exec(ure_dfa_t dfa, int flags, ucs2_t *text, unsigned long textlen, 1950 unsigned long *match_start, unsigned long *match_end) 1951{ 1952 int i, j, matched, found, skip; 1953 unsigned long ms, me; 1954 ucs4_t c; 1955 ucs2_t *sp, *ep, *lp; 1956 _ure_dstate_t *stp; 1957 _ure_symtab_t *sym; 1958 _ure_range_t *rp; 1959 1960 if (dfa == 0 || text == 0) 1961 return 0; 1962 1963 /* 1964 * Handle the special case of an empty string matching the "^$" pattern. 1965 */ 1966 if (textlen == 0 && (dfa->flags & _URE_DFA_BLANKLINE)) { 1967 *match_start = *match_end = 0; 1968 return 1; 1969 } 1970 1971 sp = text; 1972 ep = sp + textlen; 1973 1974 ms = me = ~0; 1975 1976 stp = dfa->states; 1977 1978 for (found = skip = 0; found == 0 && sp < ep; ) { 1979 lp = sp; 1980 c = *sp++; 1981 1982 /* 1983 * Check to see if this is a high surrogate that should be 1984 * combined with a following low surrogate. 1985 */ 1986 if (sp < ep && 0xd800 <= c && c <= 0xdbff && 1987 0xdc00 <= *sp && *sp <= 0xdfff) 1988 c = 0x10000 + (((c & 0x03ff) << 10) | (*sp++ & 0x03ff)); 1989 1990 /* 1991 * Determine if the character is non-spacing and should be skipped. 1992 */ 1993 if (_ure_matches_properties(_URE_NONSPACING, c) && 1994 (flags & URE_IGNORE_NONSPACING)) { 1995 sp++; 1996 continue; 1997 } 1998 1999 if (dfa->flags & _URE_DFA_CASEFOLD) 2000 c = _ure_tolower(c); 2001 2002 /* 2003 * See if one of the transitions matches. 2004 */ 2005 for (i = 0, matched = 0; matched == 0 && i < stp->ntrans; i++) { 2006 sym = dfa->syms + stp->trans[i].symbol; 2007 switch (sym->type) { 2008 case _URE_ANY_CHAR: 2009 if ((flags & URE_DOT_MATCHES_SEPARATORS) || 2010 !_ure_issep(c)) 2011 matched = 1; 2012 break; 2013 case _URE_CHAR: 2014 if (c == sym->sym.chr) 2015 matched = 1; 2016 break; 2017 case _URE_BOL_ANCHOR: 2018 if (lp == text) { 2019 sp = lp; 2020 matched = 1; 2021 } else if (_ure_issep(c)) { 2022 if (c == '\r' && sp < ep && *sp == '\n') 2023 sp++; 2024 lp = sp; 2025 matched = 1; 2026 } 2027 break; 2028 case _URE_EOL_ANCHOR: 2029 if (_ure_issep(c)) { 2030 /* 2031 * Put the pointer back before the separator so the match 2032 * end position will be correct. This case will also 2033 * cause the `sp' pointer to be advanced over the current 2034 * separator once the match end point has been recorded. 2035 */ 2036 sp = lp; 2037 matched = 1; 2038 } 2039 break; 2040 case _URE_CCLASS: 2041 case _URE_NCCLASS: 2042 if (sym->props != 0) 2043 matched = _ure_matches_properties(sym->props, c); 2044 for (j = 0, rp = sym->sym.ccl.ranges; 2045 j < sym->sym.ccl.ranges_used; j++, rp++) { 2046 if (rp->min_code <= c && c <= rp->max_code) 2047 matched = 1; 2048 } 2049 if (sym->type == _URE_NCCLASS) 2050 matched = !matched; 2051 break; 2052 } 2053 2054 if (matched) { 2055 if (ms == ~0UL) 2056 ms = lp - text; 2057 else 2058 me = sp - text; 2059 stp = dfa->states + stp->trans[i].next_state; 2060 2061 /* 2062 * If the match was an EOL anchor, adjust the pointer past the 2063 * separator that caused the match. The correct match 2064 * position has been recorded already. 2065 */ 2066 if (sym->type == _URE_EOL_ANCHOR) { 2067 /* 2068 * Skip the character that caused the match. 2069 */ 2070 sp++; 2071 2072 /* 2073 * Handle the infamous CRLF situation. 2074 */ 2075 if (sp < ep && c == '\r' && *sp == '\n') 2076 sp++; 2077 } 2078 } 2079 } 2080 2081 if (matched == 0) { 2082 if (stp->accepting == 0) { 2083 /* 2084 * If the last state was not accepting, then reset 2085 * and start over. 2086 */ 2087 stp = dfa->states; 2088 ms = me = ~0; 2089 } else 2090 /* 2091 * The last state was accepting, so terminate the matching 2092 * loop to avoid more work. 2093 */ 2094 found = 1; 2095 } else if (sp == ep) { 2096 if (!stp->accepting) { 2097 /* 2098 * This ugly hack is to make sure the end-of-line anchors 2099 * match when the source text hits the end. This is only done 2100 * if the last subexpression matches. 2101 */ 2102 for (i = 0; found == 0 && i < stp->ntrans; i++) { 2103 sym = dfa->syms + stp->trans[i].symbol; 2104 if (sym->type ==_URE_EOL_ANCHOR) { 2105 stp = dfa->states + stp->trans[i].next_state; 2106 if (stp->accepting) { 2107 me = sp - text; 2108 found = 1; 2109 } else 2110 break; 2111 } 2112 } 2113 } else { 2114 /* 2115 * Make sure any conditions that match all the way to the end 2116 * of the string match. 2117 */ 2118 found = 1; 2119 me = sp - text; 2120 } 2121 } 2122 } 2123 2124 if (found == 0) 2125 ms = me = ~0; 2126 2127 *match_start = ms; 2128 *match_end = me; 2129 2130 return (ms != ~0UL) ? 1 : 0; 2131} 2132