1/* 2 * regexp.c: generic and extensible Regular Expression engine 3 * 4 * Basically designed with the purpose of compiling regexps for 5 * the variety of validation/shemas mechanisms now available in 6 * XML related specifications these include: 7 * - XML-1.0 DTD validation 8 * - XML Schemas structure part 1 9 * - XML Schemas Datatypes part 2 especially Appendix F 10 * - RELAX-NG/TREX i.e. the counter proposal 11 * 12 * See Copyright for the status of this software. 13 * 14 * Daniel Veillard <veillard@redhat.com> 15 */ 16 17#define IN_LIBXML 18#include "libxml.h" 19 20#ifdef LIBXML_REGEXP_ENABLED 21 22/* #define DEBUG_ERR */ 23 24#include <stdio.h> 25#include <string.h> 26#ifdef HAVE_LIMITS_H 27#include <limits.h> 28#endif 29 30#include <libxml/tree.h> 31#include <libxml/parserInternals.h> 32#include <libxml/xmlregexp.h> 33#include <libxml/xmlautomata.h> 34#include <libxml/xmlunicode.h> 35 36#ifndef INT_MAX 37#define INT_MAX 123456789 /* easy to flag and big enough for our needs */ 38#endif 39 40/* #define DEBUG_REGEXP_GRAPH */ 41/* #define DEBUG_REGEXP_EXEC */ 42/* #define DEBUG_PUSH */ 43/* #define DEBUG_COMPACTION */ 44 45#define MAX_PUSH 10000000 46 47#define ERROR(str) \ 48 ctxt->error = XML_REGEXP_COMPILE_ERROR; \ 49 xmlRegexpErrCompile(ctxt, str); 50#define NEXT ctxt->cur++ 51#define CUR (*(ctxt->cur)) 52#define NXT(index) (ctxt->cur[index]) 53 54#define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l) 55#define NEXTL(l) ctxt->cur += l; 56#define XML_REG_STRING_SEPARATOR '|' 57 58/** 59 * TODO: 60 * 61 * macro to flag unimplemented blocks 62 */ 63#define TODO \ 64 xmlGenericError(xmlGenericErrorContext, \ 65 "Unimplemented block at %s:%d\n", \ 66 __FILE__, __LINE__); 67 68/************************************************************************ 69 * * 70 * Datatypes and structures * 71 * * 72 ************************************************************************/ 73 74/* 75 * Note: the order of the enums below is significant, do not shuffle 76 */ 77typedef enum { 78 XML_REGEXP_EPSILON = 1, 79 XML_REGEXP_CHARVAL, 80 XML_REGEXP_RANGES, 81 XML_REGEXP_SUBREG, /* used for () sub regexps */ 82 XML_REGEXP_STRING, 83 XML_REGEXP_ANYCHAR, /* . */ 84 XML_REGEXP_ANYSPACE, /* \s */ 85 XML_REGEXP_NOTSPACE, /* \S */ 86 XML_REGEXP_INITNAME, /* \l */ 87 XML_REGEXP_NOTINITNAME, /* \L */ 88 XML_REGEXP_NAMECHAR, /* \c */ 89 XML_REGEXP_NOTNAMECHAR, /* \C */ 90 XML_REGEXP_DECIMAL, /* \d */ 91 XML_REGEXP_NOTDECIMAL, /* \D */ 92 XML_REGEXP_REALCHAR, /* \w */ 93 XML_REGEXP_NOTREALCHAR, /* \W */ 94 XML_REGEXP_LETTER = 100, 95 XML_REGEXP_LETTER_UPPERCASE, 96 XML_REGEXP_LETTER_LOWERCASE, 97 XML_REGEXP_LETTER_TITLECASE, 98 XML_REGEXP_LETTER_MODIFIER, 99 XML_REGEXP_LETTER_OTHERS, 100 XML_REGEXP_MARK, 101 XML_REGEXP_MARK_NONSPACING, 102 XML_REGEXP_MARK_SPACECOMBINING, 103 XML_REGEXP_MARK_ENCLOSING, 104 XML_REGEXP_NUMBER, 105 XML_REGEXP_NUMBER_DECIMAL, 106 XML_REGEXP_NUMBER_LETTER, 107 XML_REGEXP_NUMBER_OTHERS, 108 XML_REGEXP_PUNCT, 109 XML_REGEXP_PUNCT_CONNECTOR, 110 XML_REGEXP_PUNCT_DASH, 111 XML_REGEXP_PUNCT_OPEN, 112 XML_REGEXP_PUNCT_CLOSE, 113 XML_REGEXP_PUNCT_INITQUOTE, 114 XML_REGEXP_PUNCT_FINQUOTE, 115 XML_REGEXP_PUNCT_OTHERS, 116 XML_REGEXP_SEPAR, 117 XML_REGEXP_SEPAR_SPACE, 118 XML_REGEXP_SEPAR_LINE, 119 XML_REGEXP_SEPAR_PARA, 120 XML_REGEXP_SYMBOL, 121 XML_REGEXP_SYMBOL_MATH, 122 XML_REGEXP_SYMBOL_CURRENCY, 123 XML_REGEXP_SYMBOL_MODIFIER, 124 XML_REGEXP_SYMBOL_OTHERS, 125 XML_REGEXP_OTHER, 126 XML_REGEXP_OTHER_CONTROL, 127 XML_REGEXP_OTHER_FORMAT, 128 XML_REGEXP_OTHER_PRIVATE, 129 XML_REGEXP_OTHER_NA, 130 XML_REGEXP_BLOCK_NAME 131} xmlRegAtomType; 132 133typedef enum { 134 XML_REGEXP_QUANT_EPSILON = 1, 135 XML_REGEXP_QUANT_ONCE, 136 XML_REGEXP_QUANT_OPT, 137 XML_REGEXP_QUANT_MULT, 138 XML_REGEXP_QUANT_PLUS, 139 XML_REGEXP_QUANT_ONCEONLY, 140 XML_REGEXP_QUANT_ALL, 141 XML_REGEXP_QUANT_RANGE 142} xmlRegQuantType; 143 144typedef enum { 145 XML_REGEXP_START_STATE = 1, 146 XML_REGEXP_FINAL_STATE, 147 XML_REGEXP_TRANS_STATE, 148 XML_REGEXP_SINK_STATE 149} xmlRegStateType; 150 151typedef enum { 152 XML_REGEXP_MARK_NORMAL = 0, 153 XML_REGEXP_MARK_START, 154 XML_REGEXP_MARK_VISITED 155} xmlRegMarkedType; 156 157typedef struct _xmlRegRange xmlRegRange; 158typedef xmlRegRange *xmlRegRangePtr; 159 160struct _xmlRegRange { 161 int neg; /* 0 normal, 1 not, 2 exclude */ 162 xmlRegAtomType type; 163 int start; 164 int end; 165 xmlChar *blockName; 166}; 167 168typedef struct _xmlRegAtom xmlRegAtom; 169typedef xmlRegAtom *xmlRegAtomPtr; 170 171typedef struct _xmlAutomataState xmlRegState; 172typedef xmlRegState *xmlRegStatePtr; 173 174struct _xmlRegAtom { 175 int no; 176 xmlRegAtomType type; 177 xmlRegQuantType quant; 178 int min; 179 int max; 180 181 void *valuep; 182 void *valuep2; 183 int neg; 184 int codepoint; 185 xmlRegStatePtr start; 186 xmlRegStatePtr stop; 187 int maxRanges; 188 int nbRanges; 189 xmlRegRangePtr *ranges; 190 void *data; 191}; 192 193typedef struct _xmlRegCounter xmlRegCounter; 194typedef xmlRegCounter *xmlRegCounterPtr; 195 196struct _xmlRegCounter { 197 int min; 198 int max; 199}; 200 201typedef struct _xmlRegTrans xmlRegTrans; 202typedef xmlRegTrans *xmlRegTransPtr; 203 204struct _xmlRegTrans { 205 xmlRegAtomPtr atom; 206 int to; 207 int counter; 208 int count; 209 int nd; 210}; 211 212struct _xmlAutomataState { 213 xmlRegStateType type; 214 xmlRegMarkedType mark; 215 xmlRegMarkedType reached; 216 int no; 217 int maxTrans; 218 int nbTrans; 219 xmlRegTrans *trans; 220 /* knowing states ponting to us can speed things up */ 221 int maxTransTo; 222 int nbTransTo; 223 int *transTo; 224}; 225 226typedef struct _xmlAutomata xmlRegParserCtxt; 227typedef xmlRegParserCtxt *xmlRegParserCtxtPtr; 228 229struct _xmlAutomata { 230 xmlChar *string; 231 xmlChar *cur; 232 233 int error; 234 int neg; 235 236 xmlRegStatePtr start; 237 xmlRegStatePtr end; 238 xmlRegStatePtr state; 239 240 xmlRegAtomPtr atom; 241 242 int maxAtoms; 243 int nbAtoms; 244 xmlRegAtomPtr *atoms; 245 246 int maxStates; 247 int nbStates; 248 xmlRegStatePtr *states; 249 250 int maxCounters; 251 int nbCounters; 252 xmlRegCounter *counters; 253 254 int determinist; 255 int negs; 256}; 257 258struct _xmlRegexp { 259 xmlChar *string; 260 int nbStates; 261 xmlRegStatePtr *states; 262 int nbAtoms; 263 xmlRegAtomPtr *atoms; 264 int nbCounters; 265 xmlRegCounter *counters; 266 int determinist; 267 /* 268 * That's the compact form for determinists automatas 269 */ 270 int nbstates; 271 int *compact; 272 void **transdata; 273 int nbstrings; 274 xmlChar **stringMap; 275}; 276 277typedef struct _xmlRegExecRollback xmlRegExecRollback; 278typedef xmlRegExecRollback *xmlRegExecRollbackPtr; 279 280struct _xmlRegExecRollback { 281 xmlRegStatePtr state;/* the current state */ 282 int index; /* the index in the input stack */ 283 int nextbranch; /* the next transition to explore in that state */ 284 int *counts; /* save the automata state if it has some */ 285}; 286 287typedef struct _xmlRegInputToken xmlRegInputToken; 288typedef xmlRegInputToken *xmlRegInputTokenPtr; 289 290struct _xmlRegInputToken { 291 xmlChar *value; 292 void *data; 293}; 294 295struct _xmlRegExecCtxt { 296 int status; /* execution status != 0 indicate an error */ 297 int determinist; /* did we find an indeterministic behaviour */ 298 xmlRegexpPtr comp; /* the compiled regexp */ 299 xmlRegExecCallbacks callback; 300 void *data; 301 302 xmlRegStatePtr state;/* the current state */ 303 int transno; /* the current transition on that state */ 304 int transcount; /* the number of chars in char counted transitions */ 305 306 /* 307 * A stack of rollback states 308 */ 309 int maxRollbacks; 310 int nbRollbacks; 311 xmlRegExecRollback *rollbacks; 312 313 /* 314 * The state of the automata if any 315 */ 316 int *counts; 317 318 /* 319 * The input stack 320 */ 321 int inputStackMax; 322 int inputStackNr; 323 int index; 324 int *charStack; 325 const xmlChar *inputString; /* when operating on characters */ 326 xmlRegInputTokenPtr inputStack;/* when operating on strings */ 327 328 /* 329 * error handling 330 */ 331 int errStateNo; /* the error state number */ 332 xmlRegStatePtr errState; /* the error state */ 333 xmlChar *errString; /* the string raising the error */ 334 int *errCounts; /* counters at the error state */ 335 int nbPush; 336}; 337 338#define REGEXP_ALL_COUNTER 0x123456 339#define REGEXP_ALL_LAX_COUNTER 0x123457 340 341static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top); 342static void xmlRegFreeState(xmlRegStatePtr state); 343static void xmlRegFreeAtom(xmlRegAtomPtr atom); 344static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr); 345static int xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint); 346static int xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, 347 int neg, int start, int end, const xmlChar *blockName); 348 349/************************************************************************ 350 * * 351 * Regexp memory error handler * 352 * * 353 ************************************************************************/ 354/** 355 * xmlRegexpErrMemory: 356 * @extra: extra information 357 * 358 * Handle an out of memory condition 359 */ 360static void 361xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra) 362{ 363 const char *regexp = NULL; 364 if (ctxt != NULL) { 365 regexp = (const char *) ctxt->string; 366 ctxt->error = XML_ERR_NO_MEMORY; 367 } 368 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP, 369 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra, 370 regexp, NULL, 0, 0, 371 "Memory allocation failed : %s\n", extra); 372} 373 374/** 375 * xmlRegexpErrCompile: 376 * @extra: extra information 377 * 378 * Handle a compilation failure 379 */ 380static void 381xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra) 382{ 383 const char *regexp = NULL; 384 int idx = 0; 385 386 if (ctxt != NULL) { 387 regexp = (const char *) ctxt->string; 388 idx = ctxt->cur - ctxt->string; 389 ctxt->error = XML_REGEXP_COMPILE_ERROR; 390 } 391 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP, 392 XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra, 393 regexp, NULL, idx, 0, 394 "failed to compile: %s\n", extra); 395} 396 397/************************************************************************ 398 * * 399 * Allocation/Deallocation * 400 * * 401 ************************************************************************/ 402 403static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt); 404/** 405 * xmlRegEpxFromParse: 406 * @ctxt: the parser context used to build it 407 * 408 * Allocate a new regexp and fill it with the result from the parser 409 * 410 * Returns the new regexp or NULL in case of error 411 */ 412static xmlRegexpPtr 413xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) { 414 xmlRegexpPtr ret; 415 416 ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp)); 417 if (ret == NULL) { 418 xmlRegexpErrMemory(ctxt, "compiling regexp"); 419 return(NULL); 420 } 421 memset(ret, 0, sizeof(xmlRegexp)); 422 ret->string = ctxt->string; 423 ret->nbStates = ctxt->nbStates; 424 ret->states = ctxt->states; 425 ret->nbAtoms = ctxt->nbAtoms; 426 ret->atoms = ctxt->atoms; 427 ret->nbCounters = ctxt->nbCounters; 428 ret->counters = ctxt->counters; 429 ret->determinist = ctxt->determinist; 430 if (ret->determinist == -1) { 431 xmlRegexpIsDeterminist(ret); 432 } 433 434 if ((ret->determinist != 0) && 435 (ret->nbCounters == 0) && 436 (ctxt->negs == 0) && 437 (ret->atoms != NULL) && 438 (ret->atoms[0] != NULL) && 439 (ret->atoms[0]->type == XML_REGEXP_STRING)) { 440 int i, j, nbstates = 0, nbatoms = 0; 441 int *stateRemap; 442 int *stringRemap; 443 int *transitions; 444 void **transdata; 445 xmlChar **stringMap; 446 xmlChar *value; 447 448 /* 449 * Switch to a compact representation 450 * 1/ counting the effective number of states left 451 * 2/ counting the unique number of atoms, and check that 452 * they are all of the string type 453 * 3/ build a table state x atom for the transitions 454 */ 455 456 stateRemap = xmlMalloc(ret->nbStates * sizeof(int)); 457 if (stateRemap == NULL) { 458 xmlRegexpErrMemory(ctxt, "compiling regexp"); 459 xmlFree(ret); 460 return(NULL); 461 } 462 for (i = 0;i < ret->nbStates;i++) { 463 if (ret->states[i] != NULL) { 464 stateRemap[i] = nbstates; 465 nbstates++; 466 } else { 467 stateRemap[i] = -1; 468 } 469 } 470#ifdef DEBUG_COMPACTION 471 printf("Final: %d states\n", nbstates); 472#endif 473 stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *)); 474 if (stringMap == NULL) { 475 xmlRegexpErrMemory(ctxt, "compiling regexp"); 476 xmlFree(stateRemap); 477 xmlFree(ret); 478 return(NULL); 479 } 480 stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int)); 481 if (stringRemap == NULL) { 482 xmlRegexpErrMemory(ctxt, "compiling regexp"); 483 xmlFree(stringMap); 484 xmlFree(stateRemap); 485 xmlFree(ret); 486 return(NULL); 487 } 488 for (i = 0;i < ret->nbAtoms;i++) { 489 if ((ret->atoms[i]->type == XML_REGEXP_STRING) && 490 (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) { 491 value = ret->atoms[i]->valuep; 492 for (j = 0;j < nbatoms;j++) { 493 if (xmlStrEqual(stringMap[j], value)) { 494 stringRemap[i] = j; 495 break; 496 } 497 } 498 if (j >= nbatoms) { 499 stringRemap[i] = nbatoms; 500 stringMap[nbatoms] = xmlStrdup(value); 501 if (stringMap[nbatoms] == NULL) { 502 for (i = 0;i < nbatoms;i++) 503 xmlFree(stringMap[i]); 504 xmlFree(stringRemap); 505 xmlFree(stringMap); 506 xmlFree(stateRemap); 507 xmlFree(ret); 508 return(NULL); 509 } 510 nbatoms++; 511 } 512 } else { 513 xmlFree(stateRemap); 514 xmlFree(stringRemap); 515 for (i = 0;i < nbatoms;i++) 516 xmlFree(stringMap[i]); 517 xmlFree(stringMap); 518 xmlFree(ret); 519 return(NULL); 520 } 521 } 522#ifdef DEBUG_COMPACTION 523 printf("Final: %d atoms\n", nbatoms); 524#endif 525 transitions = (int *) xmlMalloc((nbstates + 1) * 526 (nbatoms + 1) * sizeof(int)); 527 if (transitions == NULL) { 528 xmlFree(stateRemap); 529 xmlFree(stringRemap); 530 xmlFree(stringMap); 531 xmlFree(ret); 532 return(NULL); 533 } 534 memset(transitions, 0, (nbstates + 1) * (nbatoms + 1) * sizeof(int)); 535 536 /* 537 * Allocate the transition table. The first entry for each 538 * state corresponds to the state type. 539 */ 540 transdata = NULL; 541 542 for (i = 0;i < ret->nbStates;i++) { 543 int stateno, atomno, targetno, prev; 544 xmlRegStatePtr state; 545 xmlRegTransPtr trans; 546 547 stateno = stateRemap[i]; 548 if (stateno == -1) 549 continue; 550 state = ret->states[i]; 551 552 transitions[stateno * (nbatoms + 1)] = state->type; 553 554 for (j = 0;j < state->nbTrans;j++) { 555 trans = &(state->trans[j]); 556 if ((trans->to == -1) || (trans->atom == NULL)) 557 continue; 558 atomno = stringRemap[trans->atom->no]; 559 if ((trans->atom->data != NULL) && (transdata == NULL)) { 560 transdata = (void **) xmlMalloc(nbstates * nbatoms * 561 sizeof(void *)); 562 if (transdata != NULL) 563 memset(transdata, 0, 564 nbstates * nbatoms * sizeof(void *)); 565 else { 566 xmlRegexpErrMemory(ctxt, "compiling regexp"); 567 break; 568 } 569 } 570 targetno = stateRemap[trans->to]; 571 /* 572 * if the same atom can generate transitions to 2 different 573 * states then it means the automata is not determinist and 574 * the compact form can't be used ! 575 */ 576 prev = transitions[stateno * (nbatoms + 1) + atomno + 1]; 577 if (prev != 0) { 578 if (prev != targetno + 1) { 579 ret->determinist = 0; 580#ifdef DEBUG_COMPACTION 581 printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n", 582 i, j, trans->atom->no, trans->to, atomno, targetno); 583 printf(" previous to is %d\n", prev); 584#endif 585 if (transdata != NULL) 586 xmlFree(transdata); 587 xmlFree(transitions); 588 xmlFree(stateRemap); 589 xmlFree(stringRemap); 590 for (i = 0;i < nbatoms;i++) 591 xmlFree(stringMap[i]); 592 xmlFree(stringMap); 593 goto not_determ; 594 } 595 } else { 596#if 0 597 printf("State %d trans %d: atom %d to %d : %d to %d\n", 598 i, j, trans->atom->no, trans->to, atomno, targetno); 599#endif 600 transitions[stateno * (nbatoms + 1) + atomno + 1] = 601 targetno + 1; /* to avoid 0 */ 602 if (transdata != NULL) 603 transdata[stateno * nbatoms + atomno] = 604 trans->atom->data; 605 } 606 } 607 } 608 ret->determinist = 1; 609#ifdef DEBUG_COMPACTION 610 /* 611 * Debug 612 */ 613 for (i = 0;i < nbstates;i++) { 614 for (j = 0;j < nbatoms + 1;j++) { 615 printf("%02d ", transitions[i * (nbatoms + 1) + j]); 616 } 617 printf("\n"); 618 } 619 printf("\n"); 620#endif 621 /* 622 * Cleanup of the old data 623 */ 624 if (ret->states != NULL) { 625 for (i = 0;i < ret->nbStates;i++) 626 xmlRegFreeState(ret->states[i]); 627 xmlFree(ret->states); 628 } 629 ret->states = NULL; 630 ret->nbStates = 0; 631 if (ret->atoms != NULL) { 632 for (i = 0;i < ret->nbAtoms;i++) 633 xmlRegFreeAtom(ret->atoms[i]); 634 xmlFree(ret->atoms); 635 } 636 ret->atoms = NULL; 637 ret->nbAtoms = 0; 638 639 ret->compact = transitions; 640 ret->transdata = transdata; 641 ret->stringMap = stringMap; 642 ret->nbstrings = nbatoms; 643 ret->nbstates = nbstates; 644 xmlFree(stateRemap); 645 xmlFree(stringRemap); 646 } 647not_determ: 648 ctxt->string = NULL; 649 ctxt->nbStates = 0; 650 ctxt->states = NULL; 651 ctxt->nbAtoms = 0; 652 ctxt->atoms = NULL; 653 ctxt->nbCounters = 0; 654 ctxt->counters = NULL; 655 return(ret); 656} 657 658/** 659 * xmlRegNewParserCtxt: 660 * @string: the string to parse 661 * 662 * Allocate a new regexp parser context 663 * 664 * Returns the new context or NULL in case of error 665 */ 666static xmlRegParserCtxtPtr 667xmlRegNewParserCtxt(const xmlChar *string) { 668 xmlRegParserCtxtPtr ret; 669 670 ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt)); 671 if (ret == NULL) 672 return(NULL); 673 memset(ret, 0, sizeof(xmlRegParserCtxt)); 674 if (string != NULL) 675 ret->string = xmlStrdup(string); 676 ret->cur = ret->string; 677 ret->neg = 0; 678 ret->negs = 0; 679 ret->error = 0; 680 ret->determinist = -1; 681 return(ret); 682} 683 684/** 685 * xmlRegNewRange: 686 * @ctxt: the regexp parser context 687 * @neg: is that negative 688 * @type: the type of range 689 * @start: the start codepoint 690 * @end: the end codepoint 691 * 692 * Allocate a new regexp range 693 * 694 * Returns the new range or NULL in case of error 695 */ 696static xmlRegRangePtr 697xmlRegNewRange(xmlRegParserCtxtPtr ctxt, 698 int neg, xmlRegAtomType type, int start, int end) { 699 xmlRegRangePtr ret; 700 701 ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange)); 702 if (ret == NULL) { 703 xmlRegexpErrMemory(ctxt, "allocating range"); 704 return(NULL); 705 } 706 ret->neg = neg; 707 ret->type = type; 708 ret->start = start; 709 ret->end = end; 710 return(ret); 711} 712 713/** 714 * xmlRegFreeRange: 715 * @range: the regexp range 716 * 717 * Free a regexp range 718 */ 719static void 720xmlRegFreeRange(xmlRegRangePtr range) { 721 if (range == NULL) 722 return; 723 724 if (range->blockName != NULL) 725 xmlFree(range->blockName); 726 xmlFree(range); 727} 728 729/** 730 * xmlRegNewAtom: 731 * @ctxt: the regexp parser context 732 * @type: the type of atom 733 * 734 * Allocate a new regexp range 735 * 736 * Returns the new atom or NULL in case of error 737 */ 738static xmlRegAtomPtr 739xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) { 740 xmlRegAtomPtr ret; 741 742 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom)); 743 if (ret == NULL) { 744 xmlRegexpErrMemory(ctxt, "allocating atom"); 745 return(NULL); 746 } 747 memset(ret, 0, sizeof(xmlRegAtom)); 748 ret->type = type; 749 ret->quant = XML_REGEXP_QUANT_ONCE; 750 ret->min = 0; 751 ret->max = 0; 752 return(ret); 753} 754 755/** 756 * xmlRegFreeAtom: 757 * @atom: the regexp atom 758 * 759 * Free a regexp atom 760 */ 761static void 762xmlRegFreeAtom(xmlRegAtomPtr atom) { 763 int i; 764 765 if (atom == NULL) 766 return; 767 768 for (i = 0;i < atom->nbRanges;i++) 769 xmlRegFreeRange(atom->ranges[i]); 770 if (atom->ranges != NULL) 771 xmlFree(atom->ranges); 772 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep != NULL)) 773 xmlFree(atom->valuep); 774 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep2 != NULL)) 775 xmlFree(atom->valuep2); 776 if ((atom->type == XML_REGEXP_BLOCK_NAME) && (atom->valuep != NULL)) 777 xmlFree(atom->valuep); 778 xmlFree(atom); 779} 780 781static xmlRegStatePtr 782xmlRegNewState(xmlRegParserCtxtPtr ctxt) { 783 xmlRegStatePtr ret; 784 785 ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState)); 786 if (ret == NULL) { 787 xmlRegexpErrMemory(ctxt, "allocating state"); 788 return(NULL); 789 } 790 memset(ret, 0, sizeof(xmlRegState)); 791 ret->type = XML_REGEXP_TRANS_STATE; 792 ret->mark = XML_REGEXP_MARK_NORMAL; 793 return(ret); 794} 795 796/** 797 * xmlRegFreeState: 798 * @state: the regexp state 799 * 800 * Free a regexp state 801 */ 802static void 803xmlRegFreeState(xmlRegStatePtr state) { 804 if (state == NULL) 805 return; 806 807 if (state->trans != NULL) 808 xmlFree(state->trans); 809 if (state->transTo != NULL) 810 xmlFree(state->transTo); 811 xmlFree(state); 812} 813 814/** 815 * xmlRegFreeParserCtxt: 816 * @ctxt: the regexp parser context 817 * 818 * Free a regexp parser context 819 */ 820static void 821xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) { 822 int i; 823 if (ctxt == NULL) 824 return; 825 826 if (ctxt->string != NULL) 827 xmlFree(ctxt->string); 828 if (ctxt->states != NULL) { 829 for (i = 0;i < ctxt->nbStates;i++) 830 xmlRegFreeState(ctxt->states[i]); 831 xmlFree(ctxt->states); 832 } 833 if (ctxt->atoms != NULL) { 834 for (i = 0;i < ctxt->nbAtoms;i++) 835 xmlRegFreeAtom(ctxt->atoms[i]); 836 xmlFree(ctxt->atoms); 837 } 838 if (ctxt->counters != NULL) 839 xmlFree(ctxt->counters); 840 xmlFree(ctxt); 841} 842 843/************************************************************************ 844 * * 845 * Display of Data structures * 846 * * 847 ************************************************************************/ 848 849static void 850xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) { 851 switch (type) { 852 case XML_REGEXP_EPSILON: 853 fprintf(output, "epsilon "); break; 854 case XML_REGEXP_CHARVAL: 855 fprintf(output, "charval "); break; 856 case XML_REGEXP_RANGES: 857 fprintf(output, "ranges "); break; 858 case XML_REGEXP_SUBREG: 859 fprintf(output, "subexpr "); break; 860 case XML_REGEXP_STRING: 861 fprintf(output, "string "); break; 862 case XML_REGEXP_ANYCHAR: 863 fprintf(output, "anychar "); break; 864 case XML_REGEXP_ANYSPACE: 865 fprintf(output, "anyspace "); break; 866 case XML_REGEXP_NOTSPACE: 867 fprintf(output, "notspace "); break; 868 case XML_REGEXP_INITNAME: 869 fprintf(output, "initname "); break; 870 case XML_REGEXP_NOTINITNAME: 871 fprintf(output, "notinitname "); break; 872 case XML_REGEXP_NAMECHAR: 873 fprintf(output, "namechar "); break; 874 case XML_REGEXP_NOTNAMECHAR: 875 fprintf(output, "notnamechar "); break; 876 case XML_REGEXP_DECIMAL: 877 fprintf(output, "decimal "); break; 878 case XML_REGEXP_NOTDECIMAL: 879 fprintf(output, "notdecimal "); break; 880 case XML_REGEXP_REALCHAR: 881 fprintf(output, "realchar "); break; 882 case XML_REGEXP_NOTREALCHAR: 883 fprintf(output, "notrealchar "); break; 884 case XML_REGEXP_LETTER: 885 fprintf(output, "LETTER "); break; 886 case XML_REGEXP_LETTER_UPPERCASE: 887 fprintf(output, "LETTER_UPPERCASE "); break; 888 case XML_REGEXP_LETTER_LOWERCASE: 889 fprintf(output, "LETTER_LOWERCASE "); break; 890 case XML_REGEXP_LETTER_TITLECASE: 891 fprintf(output, "LETTER_TITLECASE "); break; 892 case XML_REGEXP_LETTER_MODIFIER: 893 fprintf(output, "LETTER_MODIFIER "); break; 894 case XML_REGEXP_LETTER_OTHERS: 895 fprintf(output, "LETTER_OTHERS "); break; 896 case XML_REGEXP_MARK: 897 fprintf(output, "MARK "); break; 898 case XML_REGEXP_MARK_NONSPACING: 899 fprintf(output, "MARK_NONSPACING "); break; 900 case XML_REGEXP_MARK_SPACECOMBINING: 901 fprintf(output, "MARK_SPACECOMBINING "); break; 902 case XML_REGEXP_MARK_ENCLOSING: 903 fprintf(output, "MARK_ENCLOSING "); break; 904 case XML_REGEXP_NUMBER: 905 fprintf(output, "NUMBER "); break; 906 case XML_REGEXP_NUMBER_DECIMAL: 907 fprintf(output, "NUMBER_DECIMAL "); break; 908 case XML_REGEXP_NUMBER_LETTER: 909 fprintf(output, "NUMBER_LETTER "); break; 910 case XML_REGEXP_NUMBER_OTHERS: 911 fprintf(output, "NUMBER_OTHERS "); break; 912 case XML_REGEXP_PUNCT: 913 fprintf(output, "PUNCT "); break; 914 case XML_REGEXP_PUNCT_CONNECTOR: 915 fprintf(output, "PUNCT_CONNECTOR "); break; 916 case XML_REGEXP_PUNCT_DASH: 917 fprintf(output, "PUNCT_DASH "); break; 918 case XML_REGEXP_PUNCT_OPEN: 919 fprintf(output, "PUNCT_OPEN "); break; 920 case XML_REGEXP_PUNCT_CLOSE: 921 fprintf(output, "PUNCT_CLOSE "); break; 922 case XML_REGEXP_PUNCT_INITQUOTE: 923 fprintf(output, "PUNCT_INITQUOTE "); break; 924 case XML_REGEXP_PUNCT_FINQUOTE: 925 fprintf(output, "PUNCT_FINQUOTE "); break; 926 case XML_REGEXP_PUNCT_OTHERS: 927 fprintf(output, "PUNCT_OTHERS "); break; 928 case XML_REGEXP_SEPAR: 929 fprintf(output, "SEPAR "); break; 930 case XML_REGEXP_SEPAR_SPACE: 931 fprintf(output, "SEPAR_SPACE "); break; 932 case XML_REGEXP_SEPAR_LINE: 933 fprintf(output, "SEPAR_LINE "); break; 934 case XML_REGEXP_SEPAR_PARA: 935 fprintf(output, "SEPAR_PARA "); break; 936 case XML_REGEXP_SYMBOL: 937 fprintf(output, "SYMBOL "); break; 938 case XML_REGEXP_SYMBOL_MATH: 939 fprintf(output, "SYMBOL_MATH "); break; 940 case XML_REGEXP_SYMBOL_CURRENCY: 941 fprintf(output, "SYMBOL_CURRENCY "); break; 942 case XML_REGEXP_SYMBOL_MODIFIER: 943 fprintf(output, "SYMBOL_MODIFIER "); break; 944 case XML_REGEXP_SYMBOL_OTHERS: 945 fprintf(output, "SYMBOL_OTHERS "); break; 946 case XML_REGEXP_OTHER: 947 fprintf(output, "OTHER "); break; 948 case XML_REGEXP_OTHER_CONTROL: 949 fprintf(output, "OTHER_CONTROL "); break; 950 case XML_REGEXP_OTHER_FORMAT: 951 fprintf(output, "OTHER_FORMAT "); break; 952 case XML_REGEXP_OTHER_PRIVATE: 953 fprintf(output, "OTHER_PRIVATE "); break; 954 case XML_REGEXP_OTHER_NA: 955 fprintf(output, "OTHER_NA "); break; 956 case XML_REGEXP_BLOCK_NAME: 957 fprintf(output, "BLOCK "); break; 958 } 959} 960 961static void 962xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) { 963 switch (type) { 964 case XML_REGEXP_QUANT_EPSILON: 965 fprintf(output, "epsilon "); break; 966 case XML_REGEXP_QUANT_ONCE: 967 fprintf(output, "once "); break; 968 case XML_REGEXP_QUANT_OPT: 969 fprintf(output, "? "); break; 970 case XML_REGEXP_QUANT_MULT: 971 fprintf(output, "* "); break; 972 case XML_REGEXP_QUANT_PLUS: 973 fprintf(output, "+ "); break; 974 case XML_REGEXP_QUANT_RANGE: 975 fprintf(output, "range "); break; 976 case XML_REGEXP_QUANT_ONCEONLY: 977 fprintf(output, "onceonly "); break; 978 case XML_REGEXP_QUANT_ALL: 979 fprintf(output, "all "); break; 980 } 981} 982static void 983xmlRegPrintRange(FILE *output, xmlRegRangePtr range) { 984 fprintf(output, " range: "); 985 if (range->neg) 986 fprintf(output, "negative "); 987 xmlRegPrintAtomType(output, range->type); 988 fprintf(output, "%c - %c\n", range->start, range->end); 989} 990 991static void 992xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) { 993 fprintf(output, " atom: "); 994 if (atom == NULL) { 995 fprintf(output, "NULL\n"); 996 return; 997 } 998 if (atom->neg) 999 fprintf(output, "not "); 1000 xmlRegPrintAtomType(output, atom->type); 1001 xmlRegPrintQuantType(output, atom->quant); 1002 if (atom->quant == XML_REGEXP_QUANT_RANGE) 1003 fprintf(output, "%d-%d ", atom->min, atom->max); 1004 if (atom->type == XML_REGEXP_STRING) 1005 fprintf(output, "'%s' ", (char *) atom->valuep); 1006 if (atom->type == XML_REGEXP_CHARVAL) 1007 fprintf(output, "char %c\n", atom->codepoint); 1008 else if (atom->type == XML_REGEXP_RANGES) { 1009 int i; 1010 fprintf(output, "%d entries\n", atom->nbRanges); 1011 for (i = 0; i < atom->nbRanges;i++) 1012 xmlRegPrintRange(output, atom->ranges[i]); 1013 } else if (atom->type == XML_REGEXP_SUBREG) { 1014 fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no); 1015 } else { 1016 fprintf(output, "\n"); 1017 } 1018} 1019 1020static void 1021xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) { 1022 fprintf(output, " trans: "); 1023 if (trans == NULL) { 1024 fprintf(output, "NULL\n"); 1025 return; 1026 } 1027 if (trans->to < 0) { 1028 fprintf(output, "removed\n"); 1029 return; 1030 } 1031 if (trans->nd != 0) { 1032 if (trans->nd == 2) 1033 fprintf(output, "last not determinist, "); 1034 else 1035 fprintf(output, "not determinist, "); 1036 } 1037 if (trans->counter >= 0) { 1038 fprintf(output, "counted %d, ", trans->counter); 1039 } 1040 if (trans->count == REGEXP_ALL_COUNTER) { 1041 fprintf(output, "all transition, "); 1042 } else if (trans->count >= 0) { 1043 fprintf(output, "count based %d, ", trans->count); 1044 } 1045 if (trans->atom == NULL) { 1046 fprintf(output, "epsilon to %d\n", trans->to); 1047 return; 1048 } 1049 if (trans->atom->type == XML_REGEXP_CHARVAL) 1050 fprintf(output, "char %c ", trans->atom->codepoint); 1051 fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to); 1052} 1053 1054static void 1055xmlRegPrintState(FILE *output, xmlRegStatePtr state) { 1056 int i; 1057 1058 fprintf(output, " state: "); 1059 if (state == NULL) { 1060 fprintf(output, "NULL\n"); 1061 return; 1062 } 1063 if (state->type == XML_REGEXP_START_STATE) 1064 fprintf(output, "START "); 1065 if (state->type == XML_REGEXP_FINAL_STATE) 1066 fprintf(output, "FINAL "); 1067 1068 fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans); 1069 for (i = 0;i < state->nbTrans; i++) { 1070 xmlRegPrintTrans(output, &(state->trans[i])); 1071 } 1072} 1073 1074#ifdef DEBUG_REGEXP_GRAPH 1075static void 1076xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) { 1077 int i; 1078 1079 fprintf(output, " ctxt: "); 1080 if (ctxt == NULL) { 1081 fprintf(output, "NULL\n"); 1082 return; 1083 } 1084 fprintf(output, "'%s' ", ctxt->string); 1085 if (ctxt->error) 1086 fprintf(output, "error "); 1087 if (ctxt->neg) 1088 fprintf(output, "neg "); 1089 fprintf(output, "\n"); 1090 fprintf(output, "%d atoms:\n", ctxt->nbAtoms); 1091 for (i = 0;i < ctxt->nbAtoms; i++) { 1092 fprintf(output, " %02d ", i); 1093 xmlRegPrintAtom(output, ctxt->atoms[i]); 1094 } 1095 if (ctxt->atom != NULL) { 1096 fprintf(output, "current atom:\n"); 1097 xmlRegPrintAtom(output, ctxt->atom); 1098 } 1099 fprintf(output, "%d states:", ctxt->nbStates); 1100 if (ctxt->start != NULL) 1101 fprintf(output, " start: %d", ctxt->start->no); 1102 if (ctxt->end != NULL) 1103 fprintf(output, " end: %d", ctxt->end->no); 1104 fprintf(output, "\n"); 1105 for (i = 0;i < ctxt->nbStates; i++) { 1106 xmlRegPrintState(output, ctxt->states[i]); 1107 } 1108 fprintf(output, "%d counters:\n", ctxt->nbCounters); 1109 for (i = 0;i < ctxt->nbCounters; i++) { 1110 fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min, 1111 ctxt->counters[i].max); 1112 } 1113} 1114#endif 1115 1116/************************************************************************ 1117 * * 1118 * Finite Automata structures manipulations * 1119 * * 1120 ************************************************************************/ 1121 1122static void 1123xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom, 1124 int neg, xmlRegAtomType type, int start, int end, 1125 xmlChar *blockName) { 1126 xmlRegRangePtr range; 1127 1128 if (atom == NULL) { 1129 ERROR("add range: atom is NULL"); 1130 return; 1131 } 1132 if (atom->type != XML_REGEXP_RANGES) { 1133 ERROR("add range: atom is not ranges"); 1134 return; 1135 } 1136 if (atom->maxRanges == 0) { 1137 atom->maxRanges = 4; 1138 atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges * 1139 sizeof(xmlRegRangePtr)); 1140 if (atom->ranges == NULL) { 1141 xmlRegexpErrMemory(ctxt, "adding ranges"); 1142 atom->maxRanges = 0; 1143 return; 1144 } 1145 } else if (atom->nbRanges >= atom->maxRanges) { 1146 xmlRegRangePtr *tmp; 1147 atom->maxRanges *= 2; 1148 tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges * 1149 sizeof(xmlRegRangePtr)); 1150 if (tmp == NULL) { 1151 xmlRegexpErrMemory(ctxt, "adding ranges"); 1152 atom->maxRanges /= 2; 1153 return; 1154 } 1155 atom->ranges = tmp; 1156 } 1157 range = xmlRegNewRange(ctxt, neg, type, start, end); 1158 if (range == NULL) 1159 return; 1160 range->blockName = blockName; 1161 atom->ranges[atom->nbRanges++] = range; 1162 1163} 1164 1165static int 1166xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) { 1167 if (ctxt->maxCounters == 0) { 1168 ctxt->maxCounters = 4; 1169 ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters * 1170 sizeof(xmlRegCounter)); 1171 if (ctxt->counters == NULL) { 1172 xmlRegexpErrMemory(ctxt, "allocating counter"); 1173 ctxt->maxCounters = 0; 1174 return(-1); 1175 } 1176 } else if (ctxt->nbCounters >= ctxt->maxCounters) { 1177 xmlRegCounter *tmp; 1178 ctxt->maxCounters *= 2; 1179 tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters * 1180 sizeof(xmlRegCounter)); 1181 if (tmp == NULL) { 1182 xmlRegexpErrMemory(ctxt, "allocating counter"); 1183 ctxt->maxCounters /= 2; 1184 return(-1); 1185 } 1186 ctxt->counters = tmp; 1187 } 1188 ctxt->counters[ctxt->nbCounters].min = -1; 1189 ctxt->counters[ctxt->nbCounters].max = -1; 1190 return(ctxt->nbCounters++); 1191} 1192 1193static int 1194xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) { 1195 if (atom == NULL) { 1196 ERROR("atom push: atom is NULL"); 1197 return(-1); 1198 } 1199 if (ctxt->maxAtoms == 0) { 1200 ctxt->maxAtoms = 4; 1201 ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms * 1202 sizeof(xmlRegAtomPtr)); 1203 if (ctxt->atoms == NULL) { 1204 xmlRegexpErrMemory(ctxt, "pushing atom"); 1205 ctxt->maxAtoms = 0; 1206 return(-1); 1207 } 1208 } else if (ctxt->nbAtoms >= ctxt->maxAtoms) { 1209 xmlRegAtomPtr *tmp; 1210 ctxt->maxAtoms *= 2; 1211 tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms * 1212 sizeof(xmlRegAtomPtr)); 1213 if (tmp == NULL) { 1214 xmlRegexpErrMemory(ctxt, "allocating counter"); 1215 ctxt->maxAtoms /= 2; 1216 return(-1); 1217 } 1218 ctxt->atoms = tmp; 1219 } 1220 atom->no = ctxt->nbAtoms; 1221 ctxt->atoms[ctxt->nbAtoms++] = atom; 1222 return(0); 1223} 1224 1225static void 1226xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target, 1227 int from) { 1228 if (target->maxTransTo == 0) { 1229 target->maxTransTo = 8; 1230 target->transTo = (int *) xmlMalloc(target->maxTransTo * 1231 sizeof(int)); 1232 if (target->transTo == NULL) { 1233 xmlRegexpErrMemory(ctxt, "adding transition"); 1234 target->maxTransTo = 0; 1235 return; 1236 } 1237 } else if (target->nbTransTo >= target->maxTransTo) { 1238 int *tmp; 1239 target->maxTransTo *= 2; 1240 tmp = (int *) xmlRealloc(target->transTo, target->maxTransTo * 1241 sizeof(int)); 1242 if (tmp == NULL) { 1243 xmlRegexpErrMemory(ctxt, "adding transition"); 1244 target->maxTransTo /= 2; 1245 return; 1246 } 1247 target->transTo = tmp; 1248 } 1249 target->transTo[target->nbTransTo] = from; 1250 target->nbTransTo++; 1251} 1252 1253static void 1254xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, 1255 xmlRegAtomPtr atom, xmlRegStatePtr target, 1256 int counter, int count) { 1257 1258 int nrtrans; 1259 1260 if (state == NULL) { 1261 ERROR("add state: state is NULL"); 1262 return; 1263 } 1264 if (target == NULL) { 1265 ERROR("add state: target is NULL"); 1266 return; 1267 } 1268 /* 1269 * Other routines follow the philosophy 'When in doubt, add a transition' 1270 * so we check here whether such a transition is already present and, if 1271 * so, silently ignore this request. 1272 */ 1273 1274 for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) { 1275 xmlRegTransPtr trans = &(state->trans[nrtrans]); 1276 if ((trans->atom == atom) && 1277 (trans->to == target->no) && 1278 (trans->counter == counter) && 1279 (trans->count == count)) { 1280#ifdef DEBUG_REGEXP_GRAPH 1281 printf("Ignoring duplicate transition from %d to %d\n", 1282 state->no, target->no); 1283#endif 1284 return; 1285 } 1286 } 1287 1288 if (state->maxTrans == 0) { 1289 state->maxTrans = 8; 1290 state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans * 1291 sizeof(xmlRegTrans)); 1292 if (state->trans == NULL) { 1293 xmlRegexpErrMemory(ctxt, "adding transition"); 1294 state->maxTrans = 0; 1295 return; 1296 } 1297 } else if (state->nbTrans >= state->maxTrans) { 1298 xmlRegTrans *tmp; 1299 state->maxTrans *= 2; 1300 tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans * 1301 sizeof(xmlRegTrans)); 1302 if (tmp == NULL) { 1303 xmlRegexpErrMemory(ctxt, "adding transition"); 1304 state->maxTrans /= 2; 1305 return; 1306 } 1307 state->trans = tmp; 1308 } 1309#ifdef DEBUG_REGEXP_GRAPH 1310 printf("Add trans from %d to %d ", state->no, target->no); 1311 if (count == REGEXP_ALL_COUNTER) 1312 printf("all transition\n"); 1313 else if (count >= 0) 1314 printf("count based %d\n", count); 1315 else if (counter >= 0) 1316 printf("counted %d\n", counter); 1317 else if (atom == NULL) 1318 printf("epsilon transition\n"); 1319 else if (atom != NULL) 1320 xmlRegPrintAtom(stdout, atom); 1321#endif 1322 1323 state->trans[state->nbTrans].atom = atom; 1324 state->trans[state->nbTrans].to = target->no; 1325 state->trans[state->nbTrans].counter = counter; 1326 state->trans[state->nbTrans].count = count; 1327 state->trans[state->nbTrans].nd = 0; 1328 state->nbTrans++; 1329 xmlRegStateAddTransTo(ctxt, target, state->no); 1330} 1331 1332static int 1333xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) { 1334 if (state == NULL) return(-1); 1335 if (ctxt->maxStates == 0) { 1336 ctxt->maxStates = 4; 1337 ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates * 1338 sizeof(xmlRegStatePtr)); 1339 if (ctxt->states == NULL) { 1340 xmlRegexpErrMemory(ctxt, "adding state"); 1341 ctxt->maxStates = 0; 1342 return(-1); 1343 } 1344 } else if (ctxt->nbStates >= ctxt->maxStates) { 1345 xmlRegStatePtr *tmp; 1346 ctxt->maxStates *= 2; 1347 tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates * 1348 sizeof(xmlRegStatePtr)); 1349 if (tmp == NULL) { 1350 xmlRegexpErrMemory(ctxt, "adding state"); 1351 ctxt->maxStates /= 2; 1352 return(-1); 1353 } 1354 ctxt->states = tmp; 1355 } 1356 state->no = ctxt->nbStates; 1357 ctxt->states[ctxt->nbStates++] = state; 1358 return(0); 1359} 1360 1361/** 1362 * xmlFAGenerateAllTransition: 1363 * @ctxt: a regexp parser context 1364 * @from: the from state 1365 * @to: the target state or NULL for building a new one 1366 * @lax: 1367 * 1368 */ 1369static void 1370xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt, 1371 xmlRegStatePtr from, xmlRegStatePtr to, 1372 int lax) { 1373 if (to == NULL) { 1374 to = xmlRegNewState(ctxt); 1375 xmlRegStatePush(ctxt, to); 1376 ctxt->state = to; 1377 } 1378 if (lax) 1379 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER); 1380 else 1381 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER); 1382} 1383 1384/** 1385 * xmlFAGenerateEpsilonTransition: 1386 * @ctxt: a regexp parser context 1387 * @from: the from state 1388 * @to: the target state or NULL for building a new one 1389 * 1390 */ 1391static void 1392xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt, 1393 xmlRegStatePtr from, xmlRegStatePtr to) { 1394 if (to == NULL) { 1395 to = xmlRegNewState(ctxt); 1396 xmlRegStatePush(ctxt, to); 1397 ctxt->state = to; 1398 } 1399 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1); 1400} 1401 1402/** 1403 * xmlFAGenerateCountedEpsilonTransition: 1404 * @ctxt: a regexp parser context 1405 * @from: the from state 1406 * @to: the target state or NULL for building a new one 1407 * counter: the counter for that transition 1408 * 1409 */ 1410static void 1411xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt, 1412 xmlRegStatePtr from, xmlRegStatePtr to, int counter) { 1413 if (to == NULL) { 1414 to = xmlRegNewState(ctxt); 1415 xmlRegStatePush(ctxt, to); 1416 ctxt->state = to; 1417 } 1418 xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1); 1419} 1420 1421/** 1422 * xmlFAGenerateCountedTransition: 1423 * @ctxt: a regexp parser context 1424 * @from: the from state 1425 * @to: the target state or NULL for building a new one 1426 * counter: the counter for that transition 1427 * 1428 */ 1429static void 1430xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt, 1431 xmlRegStatePtr from, xmlRegStatePtr to, int counter) { 1432 if (to == NULL) { 1433 to = xmlRegNewState(ctxt); 1434 xmlRegStatePush(ctxt, to); 1435 ctxt->state = to; 1436 } 1437 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter); 1438} 1439 1440/** 1441 * xmlFAGenerateTransitions: 1442 * @ctxt: a regexp parser context 1443 * @from: the from state 1444 * @to: the target state or NULL for building a new one 1445 * @atom: the atom generating the transition 1446 * 1447 * Returns 0 if success and -1 in case of error. 1448 */ 1449static int 1450xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from, 1451 xmlRegStatePtr to, xmlRegAtomPtr atom) { 1452 if (atom == NULL) { 1453 ERROR("genrate transition: atom == NULL"); 1454 return(-1); 1455 } 1456 if (atom->type == XML_REGEXP_SUBREG) { 1457 /* 1458 * this is a subexpression handling one should not need to 1459 * create a new node except for XML_REGEXP_QUANT_RANGE. 1460 */ 1461 if (xmlRegAtomPush(ctxt, atom) < 0) { 1462 return(-1); 1463 } 1464 if ((to != NULL) && (atom->stop != to) && 1465 (atom->quant != XML_REGEXP_QUANT_RANGE)) { 1466 /* 1467 * Generate an epsilon transition to link to the target 1468 */ 1469 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to); 1470#ifdef DV 1471 } else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) && 1472 (atom->quant != XML_REGEXP_QUANT_ONCE)) { 1473 to = xmlRegNewState(ctxt); 1474 xmlRegStatePush(ctxt, to); 1475 ctxt->state = to; 1476 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to); 1477#endif 1478 } 1479 switch (atom->quant) { 1480 case XML_REGEXP_QUANT_OPT: 1481 atom->quant = XML_REGEXP_QUANT_ONCE; 1482 /* 1483 * transition done to the state after end of atom. 1484 * 1. set transition from atom start to new state 1485 * 2. set transition from atom end to this state. 1486 */ 1487 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 0); 1488 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, ctxt->state); 1489 break; 1490 case XML_REGEXP_QUANT_MULT: 1491 atom->quant = XML_REGEXP_QUANT_ONCE; 1492 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop); 1493 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); 1494 break; 1495 case XML_REGEXP_QUANT_PLUS: 1496 atom->quant = XML_REGEXP_QUANT_ONCE; 1497 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); 1498 break; 1499 case XML_REGEXP_QUANT_RANGE: { 1500 int counter; 1501 xmlRegStatePtr newstate; 1502 1503 /* 1504 * This one is nasty: 1505 * 1/ if range has minOccurs == 0, create a new state 1506 * and create epsilon transitions from atom->start 1507 * to atom->stop, as well as atom->start to the new 1508 * state 1509 * 2/ register a new counter 1510 * 3/ register an epsilon transition associated to 1511 * this counter going from atom->stop to atom->start 1512 * 4/ create a new state 1513 * 5/ generate a counted transition from atom->stop to 1514 * that state 1515 */ 1516 if (atom->min == 0) { 1517 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 1518 atom->stop); 1519 newstate = xmlRegNewState(ctxt); 1520 xmlRegStatePush(ctxt, newstate); 1521 ctxt->state = newstate; 1522 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 1523 newstate); 1524 } 1525 counter = xmlRegGetCounter(ctxt); 1526 ctxt->counters[counter].min = atom->min - 1; 1527 ctxt->counters[counter].max = atom->max - 1; 1528 atom->min = 0; 1529 atom->max = 0; 1530 atom->quant = XML_REGEXP_QUANT_ONCE; 1531 if (to != NULL) { 1532 newstate = to; 1533 } else { 1534 newstate = xmlRegNewState(ctxt); 1535 xmlRegStatePush(ctxt, newstate); 1536 } 1537 ctxt->state = newstate; 1538 xmlFAGenerateCountedTransition(ctxt, atom->stop, 1539 newstate, counter); 1540 1541 /* 1542 * first check count and if OK jump forward; 1543 * if checking fail increment count and jump back 1544 */ 1545 xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop, 1546 atom->start, counter); 1547 } 1548 default: 1549 break; 1550 } 1551 return(0); 1552 } 1553 if ((atom->min == 0) && (atom->max == 0) && 1554 (atom->quant == XML_REGEXP_QUANT_RANGE)) { 1555 /* 1556 * we can discard the atom and generate an epsilon transition instead 1557 */ 1558 if (to == NULL) { 1559 to = xmlRegNewState(ctxt); 1560 if (to != NULL) 1561 xmlRegStatePush(ctxt, to); 1562 else { 1563 return(-1); 1564 } 1565 } 1566 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1567 ctxt->state = to; 1568 xmlRegFreeAtom(atom); 1569 return(0); 1570 } 1571 if (to == NULL) { 1572 to = xmlRegNewState(ctxt); 1573 if (to != NULL) 1574 xmlRegStatePush(ctxt, to); 1575 else { 1576 return(-1); 1577 } 1578 } 1579 if (xmlRegAtomPush(ctxt, atom) < 0) { 1580 return(-1); 1581 } 1582 xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1); 1583 ctxt->state = to; 1584 switch (atom->quant) { 1585 case XML_REGEXP_QUANT_OPT: 1586 atom->quant = XML_REGEXP_QUANT_ONCE; 1587 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1588 break; 1589 case XML_REGEXP_QUANT_MULT: 1590 atom->quant = XML_REGEXP_QUANT_ONCE; 1591 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1592 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); 1593 break; 1594 case XML_REGEXP_QUANT_PLUS: 1595 atom->quant = XML_REGEXP_QUANT_ONCE; 1596 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); 1597 break; 1598 default: 1599 break; 1600 } 1601 return(0); 1602} 1603 1604/** 1605 * xmlFAReduceEpsilonTransitions: 1606 * @ctxt: a regexp parser context 1607 * @fromnr: the from state 1608 * @tonr: the to state 1609 * @counter: should that transition be associated to a counted 1610 * 1611 */ 1612static void 1613xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr, 1614 int tonr, int counter) { 1615 int transnr; 1616 xmlRegStatePtr from; 1617 xmlRegStatePtr to; 1618 1619#ifdef DEBUG_REGEXP_GRAPH 1620 printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr); 1621#endif 1622 from = ctxt->states[fromnr]; 1623 if (from == NULL) 1624 return; 1625 to = ctxt->states[tonr]; 1626 if (to == NULL) 1627 return; 1628 if ((to->mark == XML_REGEXP_MARK_START) || 1629 (to->mark == XML_REGEXP_MARK_VISITED)) 1630 return; 1631 1632 to->mark = XML_REGEXP_MARK_VISITED; 1633 if (to->type == XML_REGEXP_FINAL_STATE) { 1634#ifdef DEBUG_REGEXP_GRAPH 1635 printf("State %d is final, so %d becomes final\n", tonr, fromnr); 1636#endif 1637 from->type = XML_REGEXP_FINAL_STATE; 1638 } 1639 for (transnr = 0;transnr < to->nbTrans;transnr++) { 1640 if (to->trans[transnr].to < 0) 1641 continue; 1642 if (to->trans[transnr].atom == NULL) { 1643 /* 1644 * Don't remove counted transitions 1645 * Don't loop either 1646 */ 1647 if (to->trans[transnr].to != fromnr) { 1648 if (to->trans[transnr].count >= 0) { 1649 int newto = to->trans[transnr].to; 1650 1651 xmlRegStateAddTrans(ctxt, from, NULL, 1652 ctxt->states[newto], 1653 -1, to->trans[transnr].count); 1654 } else { 1655#ifdef DEBUG_REGEXP_GRAPH 1656 printf("Found epsilon trans %d from %d to %d\n", 1657 transnr, tonr, to->trans[transnr].to); 1658#endif 1659 if (to->trans[transnr].counter >= 0) { 1660 xmlFAReduceEpsilonTransitions(ctxt, fromnr, 1661 to->trans[transnr].to, 1662 to->trans[transnr].counter); 1663 } else { 1664 xmlFAReduceEpsilonTransitions(ctxt, fromnr, 1665 to->trans[transnr].to, 1666 counter); 1667 } 1668 } 1669 } 1670 } else { 1671 int newto = to->trans[transnr].to; 1672 1673 if (to->trans[transnr].counter >= 0) { 1674 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 1675 ctxt->states[newto], 1676 to->trans[transnr].counter, -1); 1677 } else { 1678 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 1679 ctxt->states[newto], counter, -1); 1680 } 1681 } 1682 } 1683 to->mark = XML_REGEXP_MARK_NORMAL; 1684} 1685 1686/** 1687 * xmlFAEliminateSimpleEpsilonTransitions: 1688 * @ctxt: a regexp parser context 1689 * 1690 * Eliminating general epsilon transitions can get costly in the general 1691 * algorithm due to the large amount of generated new transitions and 1692 * associated comparisons. However for simple epsilon transition used just 1693 * to separate building blocks when generating the automata this can be 1694 * reduced to state elimination: 1695 * - if there exists an epsilon from X to Y 1696 * - if there is no other transition from X 1697 * then X and Y are semantically equivalent and X can be eliminated 1698 * If X is the start state then make Y the start state, else replace the 1699 * target of all transitions to X by transitions to Y. 1700 */ 1701static void 1702xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { 1703 int statenr, i, j, newto; 1704 xmlRegStatePtr state, tmp; 1705 1706 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1707 state = ctxt->states[statenr]; 1708 if (state == NULL) 1709 continue; 1710 if (state->nbTrans != 1) 1711 continue; 1712 /* is the only transition out a basic transition */ 1713 if ((state->trans[0].atom == NULL) && 1714 (state->trans[0].to >= 0) && 1715 (state->trans[0].to != statenr) && 1716 (state->trans[0].counter < 0) && 1717 (state->trans[0].count < 0)) { 1718 newto = state->trans[0].to; 1719 1720 if (state->type == XML_REGEXP_START_STATE) { 1721#ifdef DEBUG_REGEXP_GRAPH 1722 printf("Found simple epsilon trans from start %d to %d\n", 1723 statenr, newto); 1724#endif 1725 } else { 1726#ifdef DEBUG_REGEXP_GRAPH 1727 printf("Found simple epsilon trans from %d to %d\n", 1728 statenr, newto); 1729#endif 1730 for (i = 0;i < state->nbTransTo;i++) { 1731 tmp = ctxt->states[state->transTo[i]]; 1732 for (j = 0;j < tmp->nbTrans;j++) { 1733 if (tmp->trans[j].to == statenr) { 1734 tmp->trans[j].to = newto; 1735#ifdef DEBUG_REGEXP_GRAPH 1736 printf("Changed transition %d on %d to go to %d\n", 1737 j, tmp->no, newto); 1738#endif 1739 xmlRegStateAddTransTo(ctxt, ctxt->states[newto], 1740 tmp->no); 1741 } 1742 } 1743 } 1744#if 0 1745 for (i = 0;i < ctxt->nbStates;i++) { 1746 tmp = ctxt->states[i]; 1747 for (j = 0;j < tmp->nbTrans;j++) { 1748 if (tmp->trans[j].to == statenr) { 1749 tmp->trans[j].to = newto; 1750#ifdef DEBUG_REGEXP_GRAPH 1751 printf("Changed transition %d on %d to go to %d\n", 1752 j, tmp->no, newto); 1753#endif 1754 } 1755 } 1756 } 1757#endif 1758 if (state->type == XML_REGEXP_FINAL_STATE) 1759 ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE; 1760 /* eliminate the transition completely */ 1761 state->nbTrans = 0; 1762 1763 1764 } 1765 1766 } 1767 } 1768} 1769/** 1770 * xmlFAEliminateEpsilonTransitions: 1771 * @ctxt: a regexp parser context 1772 * 1773 */ 1774static void 1775xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { 1776 int statenr, transnr; 1777 xmlRegStatePtr state; 1778 int has_epsilon; 1779 1780 if (ctxt->states == NULL) return; 1781 1782 xmlFAEliminateSimpleEpsilonTransitions(ctxt); 1783 1784 has_epsilon = 0; 1785 1786 /* 1787 * build the completed transitions bypassing the epsilons 1788 * Use a marking algorithm to avoid loops 1789 * mark sink states too. 1790 */ 1791 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1792 state = ctxt->states[statenr]; 1793 if (state == NULL) 1794 continue; 1795 if ((state->nbTrans == 0) && 1796 (state->type != XML_REGEXP_FINAL_STATE)) { 1797 state->type = XML_REGEXP_SINK_STATE; 1798 } 1799 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1800 if ((state->trans[transnr].atom == NULL) && 1801 (state->trans[transnr].to >= 0)) { 1802 if (state->trans[transnr].to == statenr) { 1803 state->trans[transnr].to = -1; 1804#ifdef DEBUG_REGEXP_GRAPH 1805 printf("Removed loopback epsilon trans %d on %d\n", 1806 transnr, statenr); 1807#endif 1808 } else if (state->trans[transnr].count < 0) { 1809 int newto = state->trans[transnr].to; 1810 1811#ifdef DEBUG_REGEXP_GRAPH 1812 printf("Found epsilon trans %d from %d to %d\n", 1813 transnr, statenr, newto); 1814#endif 1815 state->mark = XML_REGEXP_MARK_START; 1816 has_epsilon = 1; 1817 xmlFAReduceEpsilonTransitions(ctxt, statenr, 1818 newto, state->trans[transnr].counter); 1819 state->mark = XML_REGEXP_MARK_NORMAL; 1820#ifdef DEBUG_REGEXP_GRAPH 1821 } else { 1822 printf("Found counted transition %d on %d\n", 1823 transnr, statenr); 1824#endif 1825 } 1826 } 1827 } 1828 } 1829 /* 1830 * Eliminate the epsilon transitions 1831 */ 1832 if (has_epsilon) { 1833 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1834 state = ctxt->states[statenr]; 1835 if (state == NULL) 1836 continue; 1837 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1838 xmlRegTransPtr trans = &(state->trans[transnr]); 1839 if ((trans->atom == NULL) && 1840 (trans->count < 0) && 1841 (trans->to >= 0)) { 1842 trans->to = -1; 1843 } 1844 } 1845 } 1846 } 1847 1848 /* 1849 * Use this pass to detect unreachable states too 1850 */ 1851 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1852 state = ctxt->states[statenr]; 1853 if (state != NULL) 1854 state->reached = XML_REGEXP_MARK_NORMAL; 1855 } 1856 state = ctxt->states[0]; 1857 if (state != NULL) 1858 state->reached = XML_REGEXP_MARK_START; 1859 while (state != NULL) { 1860 xmlRegStatePtr target = NULL; 1861 state->reached = XML_REGEXP_MARK_VISITED; 1862 /* 1863 * Mark all states reachable from the current reachable state 1864 */ 1865 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1866 if ((state->trans[transnr].to >= 0) && 1867 ((state->trans[transnr].atom != NULL) || 1868 (state->trans[transnr].count >= 0))) { 1869 int newto = state->trans[transnr].to; 1870 1871 if (ctxt->states[newto] == NULL) 1872 continue; 1873 if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) { 1874 ctxt->states[newto]->reached = XML_REGEXP_MARK_START; 1875 target = ctxt->states[newto]; 1876 } 1877 } 1878 } 1879 1880 /* 1881 * find the next accessible state not explored 1882 */ 1883 if (target == NULL) { 1884 for (statenr = 1;statenr < ctxt->nbStates;statenr++) { 1885 state = ctxt->states[statenr]; 1886 if ((state != NULL) && (state->reached == 1887 XML_REGEXP_MARK_START)) { 1888 target = state; 1889 break; 1890 } 1891 } 1892 } 1893 state = target; 1894 } 1895 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1896 state = ctxt->states[statenr]; 1897 if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) { 1898#ifdef DEBUG_REGEXP_GRAPH 1899 printf("Removed unreachable state %d\n", statenr); 1900#endif 1901 xmlRegFreeState(state); 1902 ctxt->states[statenr] = NULL; 1903 } 1904 } 1905 1906} 1907 1908static int 1909xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) { 1910 int ret = 0; 1911 1912 if ((range1->type == XML_REGEXP_RANGES) || 1913 (range2->type == XML_REGEXP_RANGES) || 1914 (range2->type == XML_REGEXP_SUBREG) || 1915 (range1->type == XML_REGEXP_SUBREG) || 1916 (range1->type == XML_REGEXP_STRING) || 1917 (range2->type == XML_REGEXP_STRING)) 1918 return(-1); 1919 1920 /* put them in order */ 1921 if (range1->type > range2->type) { 1922 xmlRegRangePtr tmp; 1923 1924 tmp = range1; 1925 range1 = range2; 1926 range2 = tmp; 1927 } 1928 if ((range1->type == XML_REGEXP_ANYCHAR) || 1929 (range2->type == XML_REGEXP_ANYCHAR)) { 1930 ret = 1; 1931 } else if ((range1->type == XML_REGEXP_EPSILON) || 1932 (range2->type == XML_REGEXP_EPSILON)) { 1933 return(0); 1934 } else if (range1->type == range2->type) { 1935 if ((range1->type != XML_REGEXP_CHARVAL) || 1936 (range1->end < range2->start) || 1937 (range2->end < range1->start)) 1938 ret = 1; 1939 else 1940 ret = 0; 1941 } else if (range1->type == XML_REGEXP_CHARVAL) { 1942 int codepoint; 1943 int neg = 0; 1944 1945 /* 1946 * just check all codepoints in the range for acceptance, 1947 * this is usually way cheaper since done only once at 1948 * compilation than testing over and over at runtime or 1949 * pushing too many states when evaluating. 1950 */ 1951 if (((range1->neg == 0) && (range2->neg != 0)) || 1952 ((range1->neg != 0) && (range2->neg == 0))) 1953 neg = 1; 1954 1955 for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) { 1956 ret = xmlRegCheckCharacterRange(range2->type, codepoint, 1957 0, range2->start, range2->end, 1958 range2->blockName); 1959 if (ret < 0) 1960 return(-1); 1961 if (((neg == 1) && (ret == 0)) || 1962 ((neg == 0) && (ret == 1))) 1963 return(1); 1964 } 1965 return(0); 1966 } else if ((range1->type == XML_REGEXP_BLOCK_NAME) || 1967 (range2->type == XML_REGEXP_BLOCK_NAME)) { 1968 if (range1->type == range2->type) { 1969 ret = xmlStrEqual(range1->blockName, range2->blockName); 1970 } else { 1971 /* 1972 * comparing a block range with anything else is way 1973 * too costly, and maintining the table is like too much 1974 * memory too, so let's force the automata to save state 1975 * here. 1976 */ 1977 return(1); 1978 } 1979 } else if ((range1->type < XML_REGEXP_LETTER) || 1980 (range2->type < XML_REGEXP_LETTER)) { 1981 if ((range1->type == XML_REGEXP_ANYSPACE) && 1982 (range2->type == XML_REGEXP_NOTSPACE)) 1983 ret = 0; 1984 else if ((range1->type == XML_REGEXP_INITNAME) && 1985 (range2->type == XML_REGEXP_NOTINITNAME)) 1986 ret = 0; 1987 else if ((range1->type == XML_REGEXP_NAMECHAR) && 1988 (range2->type == XML_REGEXP_NOTNAMECHAR)) 1989 ret = 0; 1990 else if ((range1->type == XML_REGEXP_DECIMAL) && 1991 (range2->type == XML_REGEXP_NOTDECIMAL)) 1992 ret = 0; 1993 else if ((range1->type == XML_REGEXP_REALCHAR) && 1994 (range2->type == XML_REGEXP_NOTREALCHAR)) 1995 ret = 0; 1996 else { 1997 /* same thing to limit complexity */ 1998 return(1); 1999 } 2000 } else { 2001 ret = 0; 2002 /* range1->type < range2->type here */ 2003 switch (range1->type) { 2004 case XML_REGEXP_LETTER: 2005 /* all disjoint except in the subgroups */ 2006 if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) || 2007 (range2->type == XML_REGEXP_LETTER_LOWERCASE) || 2008 (range2->type == XML_REGEXP_LETTER_TITLECASE) || 2009 (range2->type == XML_REGEXP_LETTER_MODIFIER) || 2010 (range2->type == XML_REGEXP_LETTER_OTHERS)) 2011 ret = 1; 2012 break; 2013 case XML_REGEXP_MARK: 2014 if ((range2->type == XML_REGEXP_MARK_NONSPACING) || 2015 (range2->type == XML_REGEXP_MARK_SPACECOMBINING) || 2016 (range2->type == XML_REGEXP_MARK_ENCLOSING)) 2017 ret = 1; 2018 break; 2019 case XML_REGEXP_NUMBER: 2020 if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) || 2021 (range2->type == XML_REGEXP_NUMBER_LETTER) || 2022 (range2->type == XML_REGEXP_NUMBER_OTHERS)) 2023 ret = 1; 2024 break; 2025 case XML_REGEXP_PUNCT: 2026 if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) || 2027 (range2->type == XML_REGEXP_PUNCT_DASH) || 2028 (range2->type == XML_REGEXP_PUNCT_OPEN) || 2029 (range2->type == XML_REGEXP_PUNCT_CLOSE) || 2030 (range2->type == XML_REGEXP_PUNCT_INITQUOTE) || 2031 (range2->type == XML_REGEXP_PUNCT_FINQUOTE) || 2032 (range2->type == XML_REGEXP_PUNCT_OTHERS)) 2033 ret = 1; 2034 break; 2035 case XML_REGEXP_SEPAR: 2036 if ((range2->type == XML_REGEXP_SEPAR_SPACE) || 2037 (range2->type == XML_REGEXP_SEPAR_LINE) || 2038 (range2->type == XML_REGEXP_SEPAR_PARA)) 2039 ret = 1; 2040 break; 2041 case XML_REGEXP_SYMBOL: 2042 if ((range2->type == XML_REGEXP_SYMBOL_MATH) || 2043 (range2->type == XML_REGEXP_SYMBOL_CURRENCY) || 2044 (range2->type == XML_REGEXP_SYMBOL_MODIFIER) || 2045 (range2->type == XML_REGEXP_SYMBOL_OTHERS)) 2046 ret = 1; 2047 break; 2048 case XML_REGEXP_OTHER: 2049 if ((range2->type == XML_REGEXP_OTHER_CONTROL) || 2050 (range2->type == XML_REGEXP_OTHER_FORMAT) || 2051 (range2->type == XML_REGEXP_OTHER_PRIVATE)) 2052 ret = 1; 2053 break; 2054 default: 2055 if ((range2->type >= XML_REGEXP_LETTER) && 2056 (range2->type < XML_REGEXP_BLOCK_NAME)) 2057 ret = 0; 2058 else { 2059 /* safety net ! */ 2060 return(1); 2061 } 2062 } 2063 } 2064 if (((range1->neg == 0) && (range2->neg != 0)) || 2065 ((range1->neg != 0) && (range2->neg == 0))) 2066 ret = !ret; 2067 return(1); 2068} 2069 2070/** 2071 * xmlFACompareAtomTypes: 2072 * @type1: an atom type 2073 * @type2: an atom type 2074 * 2075 * Compares two atoms type to check whether they intersect in some ways, 2076 * this is used by xmlFACompareAtoms only 2077 * 2078 * Returns 1 if they may intersect and 0 otherwise 2079 */ 2080static int 2081xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) { 2082 if ((type1 == XML_REGEXP_EPSILON) || 2083 (type1 == XML_REGEXP_CHARVAL) || 2084 (type1 == XML_REGEXP_RANGES) || 2085 (type1 == XML_REGEXP_SUBREG) || 2086 (type1 == XML_REGEXP_STRING) || 2087 (type1 == XML_REGEXP_ANYCHAR)) 2088 return(1); 2089 if ((type2 == XML_REGEXP_EPSILON) || 2090 (type2 == XML_REGEXP_CHARVAL) || 2091 (type2 == XML_REGEXP_RANGES) || 2092 (type2 == XML_REGEXP_SUBREG) || 2093 (type2 == XML_REGEXP_STRING) || 2094 (type2 == XML_REGEXP_ANYCHAR)) 2095 return(1); 2096 2097 if (type1 == type2) return(1); 2098 2099 /* simplify subsequent compares by making sure type1 < type2 */ 2100 if (type1 > type2) { 2101 xmlRegAtomType tmp = type1; 2102 type1 = type2; 2103 type2 = tmp; 2104 } 2105 switch (type1) { 2106 case XML_REGEXP_ANYSPACE: /* \s */ 2107 /* can't be a letter, number, mark, pontuation, symbol */ 2108 if ((type2 == XML_REGEXP_NOTSPACE) || 2109 ((type2 >= XML_REGEXP_LETTER) && 2110 (type2 <= XML_REGEXP_LETTER_OTHERS)) || 2111 ((type2 >= XML_REGEXP_NUMBER) && 2112 (type2 <= XML_REGEXP_NUMBER_OTHERS)) || 2113 ((type2 >= XML_REGEXP_MARK) && 2114 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2115 ((type2 >= XML_REGEXP_PUNCT) && 2116 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2117 ((type2 >= XML_REGEXP_SYMBOL) && 2118 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) 2119 ) return(0); 2120 break; 2121 case XML_REGEXP_NOTSPACE: /* \S */ 2122 break; 2123 case XML_REGEXP_INITNAME: /* \l */ 2124 /* can't be a number, mark, separator, pontuation, symbol or other */ 2125 if ((type2 == XML_REGEXP_NOTINITNAME) || 2126 ((type2 >= XML_REGEXP_NUMBER) && 2127 (type2 <= XML_REGEXP_NUMBER_OTHERS)) || 2128 ((type2 >= XML_REGEXP_MARK) && 2129 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2130 ((type2 >= XML_REGEXP_SEPAR) && 2131 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2132 ((type2 >= XML_REGEXP_PUNCT) && 2133 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2134 ((type2 >= XML_REGEXP_SYMBOL) && 2135 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2136 ((type2 >= XML_REGEXP_OTHER) && 2137 (type2 <= XML_REGEXP_OTHER_NA)) 2138 ) return(0); 2139 break; 2140 case XML_REGEXP_NOTINITNAME: /* \L */ 2141 break; 2142 case XML_REGEXP_NAMECHAR: /* \c */ 2143 /* can't be a mark, separator, pontuation, symbol or other */ 2144 if ((type2 == XML_REGEXP_NOTNAMECHAR) || 2145 ((type2 >= XML_REGEXP_MARK) && 2146 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2147 ((type2 >= XML_REGEXP_PUNCT) && 2148 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2149 ((type2 >= XML_REGEXP_SEPAR) && 2150 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2151 ((type2 >= XML_REGEXP_SYMBOL) && 2152 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2153 ((type2 >= XML_REGEXP_OTHER) && 2154 (type2 <= XML_REGEXP_OTHER_NA)) 2155 ) return(0); 2156 break; 2157 case XML_REGEXP_NOTNAMECHAR: /* \C */ 2158 break; 2159 case XML_REGEXP_DECIMAL: /* \d */ 2160 /* can't be a letter, mark, separator, pontuation, symbol or other */ 2161 if ((type2 == XML_REGEXP_NOTDECIMAL) || 2162 (type2 == XML_REGEXP_REALCHAR) || 2163 ((type2 >= XML_REGEXP_LETTER) && 2164 (type2 <= XML_REGEXP_LETTER_OTHERS)) || 2165 ((type2 >= XML_REGEXP_MARK) && 2166 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2167 ((type2 >= XML_REGEXP_PUNCT) && 2168 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2169 ((type2 >= XML_REGEXP_SEPAR) && 2170 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2171 ((type2 >= XML_REGEXP_SYMBOL) && 2172 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2173 ((type2 >= XML_REGEXP_OTHER) && 2174 (type2 <= XML_REGEXP_OTHER_NA)) 2175 )return(0); 2176 break; 2177 case XML_REGEXP_NOTDECIMAL: /* \D */ 2178 break; 2179 case XML_REGEXP_REALCHAR: /* \w */ 2180 /* can't be a mark, separator, pontuation, symbol or other */ 2181 if ((type2 == XML_REGEXP_NOTDECIMAL) || 2182 ((type2 >= XML_REGEXP_MARK) && 2183 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2184 ((type2 >= XML_REGEXP_PUNCT) && 2185 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2186 ((type2 >= XML_REGEXP_SEPAR) && 2187 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2188 ((type2 >= XML_REGEXP_SYMBOL) && 2189 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2190 ((type2 >= XML_REGEXP_OTHER) && 2191 (type2 <= XML_REGEXP_OTHER_NA)) 2192 )return(0); 2193 break; 2194 case XML_REGEXP_NOTREALCHAR: /* \W */ 2195 break; 2196 /* 2197 * at that point we know both type 1 and type2 are from 2198 * character categories are ordered and are different, 2199 * it becomes simple because this is a partition 2200 */ 2201 case XML_REGEXP_LETTER: 2202 if (type2 <= XML_REGEXP_LETTER_OTHERS) 2203 return(1); 2204 return(0); 2205 case XML_REGEXP_LETTER_UPPERCASE: 2206 case XML_REGEXP_LETTER_LOWERCASE: 2207 case XML_REGEXP_LETTER_TITLECASE: 2208 case XML_REGEXP_LETTER_MODIFIER: 2209 case XML_REGEXP_LETTER_OTHERS: 2210 return(0); 2211 case XML_REGEXP_MARK: 2212 if (type2 <= XML_REGEXP_MARK_ENCLOSING) 2213 return(1); 2214 return(0); 2215 case XML_REGEXP_MARK_NONSPACING: 2216 case XML_REGEXP_MARK_SPACECOMBINING: 2217 case XML_REGEXP_MARK_ENCLOSING: 2218 return(0); 2219 case XML_REGEXP_NUMBER: 2220 if (type2 <= XML_REGEXP_NUMBER_OTHERS) 2221 return(1); 2222 return(0); 2223 case XML_REGEXP_NUMBER_DECIMAL: 2224 case XML_REGEXP_NUMBER_LETTER: 2225 case XML_REGEXP_NUMBER_OTHERS: 2226 return(0); 2227 case XML_REGEXP_PUNCT: 2228 if (type2 <= XML_REGEXP_PUNCT_OTHERS) 2229 return(1); 2230 return(0); 2231 case XML_REGEXP_PUNCT_CONNECTOR: 2232 case XML_REGEXP_PUNCT_DASH: 2233 case XML_REGEXP_PUNCT_OPEN: 2234 case XML_REGEXP_PUNCT_CLOSE: 2235 case XML_REGEXP_PUNCT_INITQUOTE: 2236 case XML_REGEXP_PUNCT_FINQUOTE: 2237 case XML_REGEXP_PUNCT_OTHERS: 2238 return(0); 2239 case XML_REGEXP_SEPAR: 2240 if (type2 <= XML_REGEXP_SEPAR_PARA) 2241 return(1); 2242 return(0); 2243 case XML_REGEXP_SEPAR_SPACE: 2244 case XML_REGEXP_SEPAR_LINE: 2245 case XML_REGEXP_SEPAR_PARA: 2246 return(0); 2247 case XML_REGEXP_SYMBOL: 2248 if (type2 <= XML_REGEXP_SYMBOL_OTHERS) 2249 return(1); 2250 return(0); 2251 case XML_REGEXP_SYMBOL_MATH: 2252 case XML_REGEXP_SYMBOL_CURRENCY: 2253 case XML_REGEXP_SYMBOL_MODIFIER: 2254 case XML_REGEXP_SYMBOL_OTHERS: 2255 return(0); 2256 case XML_REGEXP_OTHER: 2257 if (type2 <= XML_REGEXP_OTHER_NA) 2258 return(1); 2259 return(0); 2260 case XML_REGEXP_OTHER_CONTROL: 2261 case XML_REGEXP_OTHER_FORMAT: 2262 case XML_REGEXP_OTHER_PRIVATE: 2263 case XML_REGEXP_OTHER_NA: 2264 return(0); 2265 default: 2266 break; 2267 } 2268 return(1); 2269} 2270 2271/** 2272 * xmlFAEqualAtoms: 2273 * @atom1: an atom 2274 * @atom2: an atom 2275 * 2276 * Compares two atoms to check whether they are the same exactly 2277 * this is used to remove equivalent transitions 2278 * 2279 * Returns 1 if same and 0 otherwise 2280 */ 2281static int 2282xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) { 2283 int ret = 0; 2284 2285 if (atom1 == atom2) 2286 return(1); 2287 if ((atom1 == NULL) || (atom2 == NULL)) 2288 return(0); 2289 2290 if (atom1->type != atom2->type) 2291 return(0); 2292 switch (atom1->type) { 2293 case XML_REGEXP_EPSILON: 2294 ret = 0; 2295 break; 2296 case XML_REGEXP_STRING: 2297 ret = xmlStrEqual((xmlChar *)atom1->valuep, 2298 (xmlChar *)atom2->valuep); 2299 break; 2300 case XML_REGEXP_CHARVAL: 2301 ret = (atom1->codepoint == atom2->codepoint); 2302 break; 2303 case XML_REGEXP_RANGES: 2304 /* too hard to do in the general case */ 2305 ret = 0; 2306 default: 2307 break; 2308 } 2309 return(ret); 2310} 2311 2312/** 2313 * xmlFACompareAtoms: 2314 * @atom1: an atom 2315 * @atom2: an atom 2316 * 2317 * Compares two atoms to check whether they intersect in some ways, 2318 * this is used by xmlFAComputesDeterminism and xmlFARecurseDeterminism only 2319 * 2320 * Returns 1 if yes and 0 otherwise 2321 */ 2322static int 2323xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) { 2324 int ret = 1; 2325 2326 if (atom1 == atom2) 2327 return(1); 2328 if ((atom1 == NULL) || (atom2 == NULL)) 2329 return(0); 2330 2331 if ((atom1->type == XML_REGEXP_ANYCHAR) || 2332 (atom2->type == XML_REGEXP_ANYCHAR)) 2333 return(1); 2334 2335 if (atom1->type > atom2->type) { 2336 xmlRegAtomPtr tmp; 2337 tmp = atom1; 2338 atom1 = atom2; 2339 atom2 = tmp; 2340 } 2341 if (atom1->type != atom2->type) { 2342 ret = xmlFACompareAtomTypes(atom1->type, atom2->type); 2343 /* if they can't intersect at the type level break now */ 2344 if (ret == 0) 2345 return(0); 2346 } 2347 switch (atom1->type) { 2348 case XML_REGEXP_STRING: 2349 ret = xmlRegStrEqualWildcard((xmlChar *)atom1->valuep, 2350 (xmlChar *)atom2->valuep); 2351 break; 2352 case XML_REGEXP_EPSILON: 2353 goto not_determinist; 2354 case XML_REGEXP_CHARVAL: 2355 if (atom2->type == XML_REGEXP_CHARVAL) { 2356 ret = (atom1->codepoint == atom2->codepoint); 2357 } else { 2358 ret = xmlRegCheckCharacter(atom2, atom1->codepoint); 2359 if (ret < 0) 2360 ret = 1; 2361 } 2362 break; 2363 case XML_REGEXP_RANGES: 2364 if (atom2->type == XML_REGEXP_RANGES) { 2365 int i, j, res; 2366 xmlRegRangePtr r1, r2; 2367 2368 /* 2369 * need to check that none of the ranges eventually matches 2370 */ 2371 for (i = 0;i < atom1->nbRanges;i++) { 2372 for (j = 0;j < atom2->nbRanges;j++) { 2373 r1 = atom1->ranges[i]; 2374 r2 = atom2->ranges[j]; 2375 res = xmlFACompareRanges(r1, r2); 2376 if (res == 1) { 2377 ret = 1; 2378 goto done; 2379 } 2380 } 2381 } 2382 ret = 0; 2383 } 2384 break; 2385 default: 2386 goto not_determinist; 2387 } 2388done: 2389 if (atom1->neg != atom2->neg) { 2390 ret = !ret; 2391 } 2392 if (ret == 0) 2393 return(0); 2394not_determinist: 2395 return(1); 2396} 2397 2398/** 2399 * xmlFARecurseDeterminism: 2400 * @ctxt: a regexp parser context 2401 * 2402 * Check whether the associated regexp is determinist, 2403 * should be called after xmlFAEliminateEpsilonTransitions() 2404 * 2405 */ 2406static int 2407xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, 2408 int to, xmlRegAtomPtr atom) { 2409 int ret = 1; 2410 int res; 2411 int transnr, nbTrans; 2412 xmlRegTransPtr t1; 2413 2414 if (state == NULL) 2415 return(ret); 2416 /* 2417 * don't recurse on transitions potentially added in the course of 2418 * the elimination. 2419 */ 2420 nbTrans = state->nbTrans; 2421 for (transnr = 0;transnr < nbTrans;transnr++) { 2422 t1 = &(state->trans[transnr]); 2423 /* 2424 * check transitions conflicting with the one looked at 2425 */ 2426 if (t1->atom == NULL) { 2427 if (t1->to == -1) 2428 continue; 2429 res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], 2430 to, atom); 2431 if (res == 0) { 2432 ret = 0; 2433 /* t1->nd = 1; */ 2434 } 2435 continue; 2436 } 2437 if (t1->to != to) 2438 continue; 2439 if (xmlFACompareAtoms(t1->atom, atom)) { 2440 ret = 0; 2441 /* mark the transition as non-deterministic */ 2442 t1->nd = 1; 2443 } 2444 } 2445 return(ret); 2446} 2447 2448/** 2449 * xmlFAComputesDeterminism: 2450 * @ctxt: a regexp parser context 2451 * 2452 * Check whether the associated regexp is determinist, 2453 * should be called after xmlFAEliminateEpsilonTransitions() 2454 * 2455 */ 2456static int 2457xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) { 2458 int statenr, transnr; 2459 xmlRegStatePtr state; 2460 xmlRegTransPtr t1, t2, last; 2461 int i; 2462 int ret = 1; 2463 2464#ifdef DEBUG_REGEXP_GRAPH 2465 printf("xmlFAComputesDeterminism\n"); 2466 xmlRegPrintCtxt(stdout, ctxt); 2467#endif 2468 if (ctxt->determinist != -1) 2469 return(ctxt->determinist); 2470 2471 /* 2472 * First cleanup the automata removing cancelled transitions 2473 */ 2474 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2475 state = ctxt->states[statenr]; 2476 if (state == NULL) 2477 continue; 2478 if (state->nbTrans < 2) 2479 continue; 2480 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2481 t1 = &(state->trans[transnr]); 2482 /* 2483 * Determinism checks in case of counted or all transitions 2484 * will have to be handled separately 2485 */ 2486 if (t1->atom == NULL) { 2487 /* t1->nd = 1; */ 2488 continue; 2489 } 2490 if (t1->to == -1) /* eliminated */ 2491 continue; 2492 for (i = 0;i < transnr;i++) { 2493 t2 = &(state->trans[i]); 2494 if (t2->to == -1) /* eliminated */ 2495 continue; 2496 if (t2->atom != NULL) { 2497 if (t1->to == t2->to) { 2498 if (xmlFAEqualAtoms(t1->atom, t2->atom)) 2499 t2->to = -1; /* eliminated */ 2500 } 2501 } 2502 } 2503 } 2504 } 2505 2506 /* 2507 * Check for all states that there aren't 2 transitions 2508 * with the same atom and a different target. 2509 */ 2510 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2511 state = ctxt->states[statenr]; 2512 if (state == NULL) 2513 continue; 2514 if (state->nbTrans < 2) 2515 continue; 2516 last = NULL; 2517 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2518 t1 = &(state->trans[transnr]); 2519 /* 2520 * Determinism checks in case of counted or all transitions 2521 * will have to be handled separately 2522 */ 2523 if (t1->atom == NULL) { 2524 continue; 2525 } 2526 if (t1->to == -1) /* eliminated */ 2527 continue; 2528 for (i = 0;i < transnr;i++) { 2529 t2 = &(state->trans[i]); 2530 if (t2->to == -1) /* eliminated */ 2531 continue; 2532 if (t2->atom != NULL) { 2533 /* not determinist ! */ 2534 if (xmlFACompareAtoms(t1->atom, t2->atom)) { 2535 ret = 0; 2536 /* mark the transitions as non-deterministic ones */ 2537 t1->nd = 1; 2538 t2->nd = 1; 2539 last = t1; 2540 } 2541 } else if (t1->to != -1) { 2542 /* 2543 * do the closure in case of remaining specific 2544 * epsilon transitions like choices or all 2545 */ 2546 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], 2547 t2->to, t2->atom); 2548 /* don't shortcut the computation so all non deterministic 2549 transition get marked down 2550 if (ret == 0) 2551 return(0); 2552 */ 2553 if (ret == 0) { 2554 t1->nd = 1; 2555 /* t2->nd = 1; */ 2556 last = t1; 2557 } 2558 } 2559 } 2560 /* don't shortcut the computation so all non deterministic 2561 transition get marked down 2562 if (ret == 0) 2563 break; */ 2564 } 2565 2566 /* 2567 * mark specifically the last non-deterministic transition 2568 * from a state since there is no need to set-up rollback 2569 * from it 2570 */ 2571 if (last != NULL) { 2572 last->nd = 2; 2573 } 2574 2575 /* don't shortcut the computation so all non deterministic 2576 transition get marked down 2577 if (ret == 0) 2578 break; */ 2579 } 2580 2581 ctxt->determinist = ret; 2582 return(ret); 2583} 2584 2585/************************************************************************ 2586 * * 2587 * Routines to check input against transition atoms * 2588 * * 2589 ************************************************************************/ 2590 2591static int 2592xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg, 2593 int start, int end, const xmlChar *blockName) { 2594 int ret = 0; 2595 2596 switch (type) { 2597 case XML_REGEXP_STRING: 2598 case XML_REGEXP_SUBREG: 2599 case XML_REGEXP_RANGES: 2600 case XML_REGEXP_EPSILON: 2601 return(-1); 2602 case XML_REGEXP_ANYCHAR: 2603 ret = ((codepoint != '\n') && (codepoint != '\r')); 2604 break; 2605 case XML_REGEXP_CHARVAL: 2606 ret = ((codepoint >= start) && (codepoint <= end)); 2607 break; 2608 case XML_REGEXP_NOTSPACE: 2609 neg = !neg; 2610 case XML_REGEXP_ANYSPACE: 2611 ret = ((codepoint == '\n') || (codepoint == '\r') || 2612 (codepoint == '\t') || (codepoint == ' ')); 2613 break; 2614 case XML_REGEXP_NOTINITNAME: 2615 neg = !neg; 2616 case XML_REGEXP_INITNAME: 2617 ret = (IS_LETTER(codepoint) || 2618 (codepoint == '_') || (codepoint == ':')); 2619 break; 2620 case XML_REGEXP_NOTNAMECHAR: 2621 neg = !neg; 2622 case XML_REGEXP_NAMECHAR: 2623 ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) || 2624 (codepoint == '.') || (codepoint == '-') || 2625 (codepoint == '_') || (codepoint == ':') || 2626 IS_COMBINING(codepoint) || IS_EXTENDER(codepoint)); 2627 break; 2628 case XML_REGEXP_NOTDECIMAL: 2629 neg = !neg; 2630 case XML_REGEXP_DECIMAL: 2631 ret = xmlUCSIsCatNd(codepoint); 2632 break; 2633 case XML_REGEXP_REALCHAR: 2634 neg = !neg; 2635 case XML_REGEXP_NOTREALCHAR: 2636 ret = xmlUCSIsCatP(codepoint); 2637 if (ret == 0) 2638 ret = xmlUCSIsCatZ(codepoint); 2639 if (ret == 0) 2640 ret = xmlUCSIsCatC(codepoint); 2641 break; 2642 case XML_REGEXP_LETTER: 2643 ret = xmlUCSIsCatL(codepoint); 2644 break; 2645 case XML_REGEXP_LETTER_UPPERCASE: 2646 ret = xmlUCSIsCatLu(codepoint); 2647 break; 2648 case XML_REGEXP_LETTER_LOWERCASE: 2649 ret = xmlUCSIsCatLl(codepoint); 2650 break; 2651 case XML_REGEXP_LETTER_TITLECASE: 2652 ret = xmlUCSIsCatLt(codepoint); 2653 break; 2654 case XML_REGEXP_LETTER_MODIFIER: 2655 ret = xmlUCSIsCatLm(codepoint); 2656 break; 2657 case XML_REGEXP_LETTER_OTHERS: 2658 ret = xmlUCSIsCatLo(codepoint); 2659 break; 2660 case XML_REGEXP_MARK: 2661 ret = xmlUCSIsCatM(codepoint); 2662 break; 2663 case XML_REGEXP_MARK_NONSPACING: 2664 ret = xmlUCSIsCatMn(codepoint); 2665 break; 2666 case XML_REGEXP_MARK_SPACECOMBINING: 2667 ret = xmlUCSIsCatMc(codepoint); 2668 break; 2669 case XML_REGEXP_MARK_ENCLOSING: 2670 ret = xmlUCSIsCatMe(codepoint); 2671 break; 2672 case XML_REGEXP_NUMBER: 2673 ret = xmlUCSIsCatN(codepoint); 2674 break; 2675 case XML_REGEXP_NUMBER_DECIMAL: 2676 ret = xmlUCSIsCatNd(codepoint); 2677 break; 2678 case XML_REGEXP_NUMBER_LETTER: 2679 ret = xmlUCSIsCatNl(codepoint); 2680 break; 2681 case XML_REGEXP_NUMBER_OTHERS: 2682 ret = xmlUCSIsCatNo(codepoint); 2683 break; 2684 case XML_REGEXP_PUNCT: 2685 ret = xmlUCSIsCatP(codepoint); 2686 break; 2687 case XML_REGEXP_PUNCT_CONNECTOR: 2688 ret = xmlUCSIsCatPc(codepoint); 2689 break; 2690 case XML_REGEXP_PUNCT_DASH: 2691 ret = xmlUCSIsCatPd(codepoint); 2692 break; 2693 case XML_REGEXP_PUNCT_OPEN: 2694 ret = xmlUCSIsCatPs(codepoint); 2695 break; 2696 case XML_REGEXP_PUNCT_CLOSE: 2697 ret = xmlUCSIsCatPe(codepoint); 2698 break; 2699 case XML_REGEXP_PUNCT_INITQUOTE: 2700 ret = xmlUCSIsCatPi(codepoint); 2701 break; 2702 case XML_REGEXP_PUNCT_FINQUOTE: 2703 ret = xmlUCSIsCatPf(codepoint); 2704 break; 2705 case XML_REGEXP_PUNCT_OTHERS: 2706 ret = xmlUCSIsCatPo(codepoint); 2707 break; 2708 case XML_REGEXP_SEPAR: 2709 ret = xmlUCSIsCatZ(codepoint); 2710 break; 2711 case XML_REGEXP_SEPAR_SPACE: 2712 ret = xmlUCSIsCatZs(codepoint); 2713 break; 2714 case XML_REGEXP_SEPAR_LINE: 2715 ret = xmlUCSIsCatZl(codepoint); 2716 break; 2717 case XML_REGEXP_SEPAR_PARA: 2718 ret = xmlUCSIsCatZp(codepoint); 2719 break; 2720 case XML_REGEXP_SYMBOL: 2721 ret = xmlUCSIsCatS(codepoint); 2722 break; 2723 case XML_REGEXP_SYMBOL_MATH: 2724 ret = xmlUCSIsCatSm(codepoint); 2725 break; 2726 case XML_REGEXP_SYMBOL_CURRENCY: 2727 ret = xmlUCSIsCatSc(codepoint); 2728 break; 2729 case XML_REGEXP_SYMBOL_MODIFIER: 2730 ret = xmlUCSIsCatSk(codepoint); 2731 break; 2732 case XML_REGEXP_SYMBOL_OTHERS: 2733 ret = xmlUCSIsCatSo(codepoint); 2734 break; 2735 case XML_REGEXP_OTHER: 2736 ret = xmlUCSIsCatC(codepoint); 2737 break; 2738 case XML_REGEXP_OTHER_CONTROL: 2739 ret = xmlUCSIsCatCc(codepoint); 2740 break; 2741 case XML_REGEXP_OTHER_FORMAT: 2742 ret = xmlUCSIsCatCf(codepoint); 2743 break; 2744 case XML_REGEXP_OTHER_PRIVATE: 2745 ret = xmlUCSIsCatCo(codepoint); 2746 break; 2747 case XML_REGEXP_OTHER_NA: 2748 /* ret = xmlUCSIsCatCn(codepoint); */ 2749 /* Seems it doesn't exist anymore in recent Unicode releases */ 2750 ret = 0; 2751 break; 2752 case XML_REGEXP_BLOCK_NAME: 2753 ret = xmlUCSIsBlock(codepoint, (const char *) blockName); 2754 break; 2755 } 2756 if (neg) 2757 return(!ret); 2758 return(ret); 2759} 2760 2761static int 2762xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) { 2763 int i, ret = 0; 2764 xmlRegRangePtr range; 2765 2766 if ((atom == NULL) || (!IS_CHAR(codepoint))) 2767 return(-1); 2768 2769 switch (atom->type) { 2770 case XML_REGEXP_SUBREG: 2771 case XML_REGEXP_EPSILON: 2772 return(-1); 2773 case XML_REGEXP_CHARVAL: 2774 return(codepoint == atom->codepoint); 2775 case XML_REGEXP_RANGES: { 2776 int accept = 0; 2777 2778 for (i = 0;i < atom->nbRanges;i++) { 2779 range = atom->ranges[i]; 2780 if (range->neg == 2) { 2781 ret = xmlRegCheckCharacterRange(range->type, codepoint, 2782 0, range->start, range->end, 2783 range->blockName); 2784 if (ret != 0) 2785 return(0); /* excluded char */ 2786 } else if (range->neg) { 2787 ret = xmlRegCheckCharacterRange(range->type, codepoint, 2788 0, range->start, range->end, 2789 range->blockName); 2790 if (ret == 0) 2791 accept = 1; 2792 else 2793 return(0); 2794 } else { 2795 ret = xmlRegCheckCharacterRange(range->type, codepoint, 2796 0, range->start, range->end, 2797 range->blockName); 2798 if (ret != 0) 2799 accept = 1; /* might still be excluded */ 2800 } 2801 } 2802 return(accept); 2803 } 2804 case XML_REGEXP_STRING: 2805 printf("TODO: XML_REGEXP_STRING\n"); 2806 return(-1); 2807 case XML_REGEXP_ANYCHAR: 2808 case XML_REGEXP_ANYSPACE: 2809 case XML_REGEXP_NOTSPACE: 2810 case XML_REGEXP_INITNAME: 2811 case XML_REGEXP_NOTINITNAME: 2812 case XML_REGEXP_NAMECHAR: 2813 case XML_REGEXP_NOTNAMECHAR: 2814 case XML_REGEXP_DECIMAL: 2815 case XML_REGEXP_NOTDECIMAL: 2816 case XML_REGEXP_REALCHAR: 2817 case XML_REGEXP_NOTREALCHAR: 2818 case XML_REGEXP_LETTER: 2819 case XML_REGEXP_LETTER_UPPERCASE: 2820 case XML_REGEXP_LETTER_LOWERCASE: 2821 case XML_REGEXP_LETTER_TITLECASE: 2822 case XML_REGEXP_LETTER_MODIFIER: 2823 case XML_REGEXP_LETTER_OTHERS: 2824 case XML_REGEXP_MARK: 2825 case XML_REGEXP_MARK_NONSPACING: 2826 case XML_REGEXP_MARK_SPACECOMBINING: 2827 case XML_REGEXP_MARK_ENCLOSING: 2828 case XML_REGEXP_NUMBER: 2829 case XML_REGEXP_NUMBER_DECIMAL: 2830 case XML_REGEXP_NUMBER_LETTER: 2831 case XML_REGEXP_NUMBER_OTHERS: 2832 case XML_REGEXP_PUNCT: 2833 case XML_REGEXP_PUNCT_CONNECTOR: 2834 case XML_REGEXP_PUNCT_DASH: 2835 case XML_REGEXP_PUNCT_OPEN: 2836 case XML_REGEXP_PUNCT_CLOSE: 2837 case XML_REGEXP_PUNCT_INITQUOTE: 2838 case XML_REGEXP_PUNCT_FINQUOTE: 2839 case XML_REGEXP_PUNCT_OTHERS: 2840 case XML_REGEXP_SEPAR: 2841 case XML_REGEXP_SEPAR_SPACE: 2842 case XML_REGEXP_SEPAR_LINE: 2843 case XML_REGEXP_SEPAR_PARA: 2844 case XML_REGEXP_SYMBOL: 2845 case XML_REGEXP_SYMBOL_MATH: 2846 case XML_REGEXP_SYMBOL_CURRENCY: 2847 case XML_REGEXP_SYMBOL_MODIFIER: 2848 case XML_REGEXP_SYMBOL_OTHERS: 2849 case XML_REGEXP_OTHER: 2850 case XML_REGEXP_OTHER_CONTROL: 2851 case XML_REGEXP_OTHER_FORMAT: 2852 case XML_REGEXP_OTHER_PRIVATE: 2853 case XML_REGEXP_OTHER_NA: 2854 case XML_REGEXP_BLOCK_NAME: 2855 ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0, 2856 (const xmlChar *)atom->valuep); 2857 if (atom->neg) 2858 ret = !ret; 2859 break; 2860 } 2861 return(ret); 2862} 2863 2864/************************************************************************ 2865 * * 2866 * Saving and restoring state of an execution context * 2867 * * 2868 ************************************************************************/ 2869 2870#ifdef DEBUG_REGEXP_EXEC 2871static void 2872xmlFARegDebugExec(xmlRegExecCtxtPtr exec) { 2873 printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index); 2874 if (exec->inputStack != NULL) { 2875 int i; 2876 printf(": "); 2877 for (i = 0;(i < 3) && (i < exec->inputStackNr);i++) 2878 printf("%s ", exec->inputStack[exec->inputStackNr - (i + 1)]); 2879 } else { 2880 printf(": %s", &(exec->inputString[exec->index])); 2881 } 2882 printf("\n"); 2883} 2884#endif 2885 2886static void 2887xmlFARegExecSave(xmlRegExecCtxtPtr exec) { 2888#ifdef DEBUG_REGEXP_EXEC 2889 printf("saving "); 2890 exec->transno++; 2891 xmlFARegDebugExec(exec); 2892 exec->transno--; 2893#endif 2894#ifdef MAX_PUSH 2895 if (exec->nbPush > MAX_PUSH) { 2896 return; 2897 } 2898 exec->nbPush++; 2899#endif 2900 2901 if (exec->maxRollbacks == 0) { 2902 exec->maxRollbacks = 4; 2903 exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks * 2904 sizeof(xmlRegExecRollback)); 2905 if (exec->rollbacks == NULL) { 2906 xmlRegexpErrMemory(NULL, "saving regexp"); 2907 exec->maxRollbacks = 0; 2908 return; 2909 } 2910 memset(exec->rollbacks, 0, 2911 exec->maxRollbacks * sizeof(xmlRegExecRollback)); 2912 } else if (exec->nbRollbacks >= exec->maxRollbacks) { 2913 xmlRegExecRollback *tmp; 2914 int len = exec->maxRollbacks; 2915 2916 exec->maxRollbacks *= 2; 2917 tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks, 2918 exec->maxRollbacks * sizeof(xmlRegExecRollback)); 2919 if (tmp == NULL) { 2920 xmlRegexpErrMemory(NULL, "saving regexp"); 2921 exec->maxRollbacks /= 2; 2922 return; 2923 } 2924 exec->rollbacks = tmp; 2925 tmp = &exec->rollbacks[len]; 2926 memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback)); 2927 } 2928 exec->rollbacks[exec->nbRollbacks].state = exec->state; 2929 exec->rollbacks[exec->nbRollbacks].index = exec->index; 2930 exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1; 2931 if (exec->comp->nbCounters > 0) { 2932 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 2933 exec->rollbacks[exec->nbRollbacks].counts = (int *) 2934 xmlMalloc(exec->comp->nbCounters * sizeof(int)); 2935 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 2936 xmlRegexpErrMemory(NULL, "saving regexp"); 2937 exec->status = -5; 2938 return; 2939 } 2940 } 2941 memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts, 2942 exec->comp->nbCounters * sizeof(int)); 2943 } 2944 exec->nbRollbacks++; 2945} 2946 2947static void 2948xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) { 2949 if (exec->nbRollbacks <= 0) { 2950 exec->status = -1; 2951#ifdef DEBUG_REGEXP_EXEC 2952 printf("rollback failed on empty stack\n"); 2953#endif 2954 return; 2955 } 2956 exec->nbRollbacks--; 2957 exec->state = exec->rollbacks[exec->nbRollbacks].state; 2958 exec->index = exec->rollbacks[exec->nbRollbacks].index; 2959 exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch; 2960 if (exec->comp->nbCounters > 0) { 2961 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 2962 fprintf(stderr, "exec save: allocation failed"); 2963 exec->status = -6; 2964 return; 2965 } 2966 memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts, 2967 exec->comp->nbCounters * sizeof(int)); 2968 } 2969 2970#ifdef DEBUG_REGEXP_EXEC 2971 printf("restored "); 2972 xmlFARegDebugExec(exec); 2973#endif 2974} 2975 2976/************************************************************************ 2977 * * 2978 * Verifier, running an input against a compiled regexp * 2979 * * 2980 ************************************************************************/ 2981 2982static int 2983xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) { 2984 xmlRegExecCtxt execval; 2985 xmlRegExecCtxtPtr exec = &execval; 2986 int ret, codepoint = 0, len, deter; 2987 2988 exec->inputString = content; 2989 exec->index = 0; 2990 exec->nbPush = 0; 2991 exec->determinist = 1; 2992 exec->maxRollbacks = 0; 2993 exec->nbRollbacks = 0; 2994 exec->rollbacks = NULL; 2995 exec->status = 0; 2996 exec->comp = comp; 2997 exec->state = comp->states[0]; 2998 exec->transno = 0; 2999 exec->transcount = 0; 3000 exec->inputStack = NULL; 3001 exec->inputStackMax = 0; 3002 if (comp->nbCounters > 0) { 3003 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)); 3004 if (exec->counts == NULL) { 3005 xmlRegexpErrMemory(NULL, "running regexp"); 3006 return(-1); 3007 } 3008 memset(exec->counts, 0, comp->nbCounters * sizeof(int)); 3009 } else 3010 exec->counts = NULL; 3011 while ((exec->status == 0) && 3012 ((exec->inputString[exec->index] != 0) || 3013 (exec->state->type != XML_REGEXP_FINAL_STATE))) { 3014 xmlRegTransPtr trans; 3015 xmlRegAtomPtr atom; 3016 3017 /* 3018 * If end of input on non-terminal state, rollback, however we may 3019 * still have epsilon like transition for counted transitions 3020 * on counters, in that case don't break too early. Additionally, 3021 * if we are working on a range like "AB{0,2}", where B is not present, 3022 * we don't want to break. 3023 */ 3024 len = 1; 3025 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) { 3026 /* 3027 * if there is a transition, we must check if 3028 * atom allows minOccurs of 0 3029 */ 3030 if (exec->transno < exec->state->nbTrans) { 3031 trans = &exec->state->trans[exec->transno]; 3032 if (trans->to >=0) { 3033 atom = trans->atom; 3034 if (!((atom->min == 0) && (atom->max > 0))) 3035 goto rollback; 3036 } 3037 } else 3038 goto rollback; 3039 } 3040 3041 exec->transcount = 0; 3042 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 3043 trans = &exec->state->trans[exec->transno]; 3044 if (trans->to < 0) 3045 continue; 3046 atom = trans->atom; 3047 ret = 0; 3048 deter = 1; 3049 if (trans->count >= 0) { 3050 int count; 3051 xmlRegCounterPtr counter; 3052 3053 if (exec->counts == NULL) { 3054 exec->status = -1; 3055 goto error; 3056 } 3057 /* 3058 * A counted transition. 3059 */ 3060 3061 count = exec->counts[trans->count]; 3062 counter = &exec->comp->counters[trans->count]; 3063#ifdef DEBUG_REGEXP_EXEC 3064 printf("testing count %d: val %d, min %d, max %d\n", 3065 trans->count, count, counter->min, counter->max); 3066#endif 3067 ret = ((count >= counter->min) && (count <= counter->max)); 3068 if ((ret) && (counter->min != counter->max)) 3069 deter = 0; 3070 } else if (atom == NULL) { 3071 fprintf(stderr, "epsilon transition left at runtime\n"); 3072 exec->status = -2; 3073 break; 3074 } else if (exec->inputString[exec->index] != 0) { 3075 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); 3076 ret = xmlRegCheckCharacter(atom, codepoint); 3077 if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) { 3078 xmlRegStatePtr to = comp->states[trans->to]; 3079 3080 /* 3081 * this is a multiple input sequence 3082 * If there is a counter associated increment it now. 3083 * before potentially saving and rollback 3084 */ 3085 if (trans->counter >= 0) { 3086 if (exec->counts == NULL) { 3087 exec->status = -1; 3088 goto error; 3089 } 3090#ifdef DEBUG_REGEXP_EXEC 3091 printf("Increasing count %d\n", trans->counter); 3092#endif 3093 exec->counts[trans->counter]++; 3094 } 3095 if (exec->state->nbTrans > exec->transno + 1) { 3096 xmlFARegExecSave(exec); 3097 } 3098 exec->transcount = 1; 3099 do { 3100 /* 3101 * Try to progress as much as possible on the input 3102 */ 3103 if (exec->transcount == atom->max) { 3104 break; 3105 } 3106 exec->index += len; 3107 /* 3108 * End of input: stop here 3109 */ 3110 if (exec->inputString[exec->index] == 0) { 3111 exec->index -= len; 3112 break; 3113 } 3114 if (exec->transcount >= atom->min) { 3115 int transno = exec->transno; 3116 xmlRegStatePtr state = exec->state; 3117 3118 /* 3119 * The transition is acceptable save it 3120 */ 3121 exec->transno = -1; /* trick */ 3122 exec->state = to; 3123 xmlFARegExecSave(exec); 3124 exec->transno = transno; 3125 exec->state = state; 3126 } 3127 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), 3128 len); 3129 ret = xmlRegCheckCharacter(atom, codepoint); 3130 exec->transcount++; 3131 } while (ret == 1); 3132 if (exec->transcount < atom->min) 3133 ret = 0; 3134 3135 /* 3136 * If the last check failed but one transition was found 3137 * possible, rollback 3138 */ 3139 if (ret < 0) 3140 ret = 0; 3141 if (ret == 0) { 3142 goto rollback; 3143 } 3144 if (trans->counter >= 0) { 3145 if (exec->counts == NULL) { 3146 exec->status = -1; 3147 goto error; 3148 } 3149#ifdef DEBUG_REGEXP_EXEC 3150 printf("Decreasing count %d\n", trans->counter); 3151#endif 3152 exec->counts[trans->counter]--; 3153 } 3154 } else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) { 3155 /* 3156 * we don't match on the codepoint, but minOccurs of 0 3157 * says that's ok. Setting len to 0 inhibits stepping 3158 * over the codepoint. 3159 */ 3160 exec->transcount = 1; 3161 len = 0; 3162 ret = 1; 3163 } 3164 } else if ((atom->min == 0) && (atom->max > 0)) { 3165 /* another spot to match when minOccurs is 0 */ 3166 exec->transcount = 1; 3167 len = 0; 3168 ret = 1; 3169 } 3170 if (ret == 1) { 3171 if ((trans->nd == 1) || 3172 ((trans->count >= 0) && (deter == 0) && 3173 (exec->state->nbTrans > exec->transno + 1))) { 3174#ifdef DEBUG_REGEXP_EXEC 3175 if (trans->nd == 1) 3176 printf("Saving on nd transition atom %d for %c at %d\n", 3177 trans->atom->no, codepoint, exec->index); 3178 else 3179 printf("Saving on counted transition count %d for %c at %d\n", 3180 trans->count, codepoint, exec->index); 3181#endif 3182 xmlFARegExecSave(exec); 3183 } 3184 if (trans->counter >= 0) { 3185 if (exec->counts == NULL) { 3186 exec->status = -1; 3187 goto error; 3188 } 3189#ifdef DEBUG_REGEXP_EXEC 3190 printf("Increasing count %d\n", trans->counter); 3191#endif 3192 exec->counts[trans->counter]++; 3193 } 3194 if ((trans->count >= 0) && 3195 (trans->count < REGEXP_ALL_COUNTER)) { 3196 if (exec->counts == NULL) { 3197 exec->status = -1; 3198 goto error; 3199 } 3200#ifdef DEBUG_REGEXP_EXEC 3201 printf("resetting count %d on transition\n", 3202 trans->count); 3203#endif 3204 exec->counts[trans->count] = 0; 3205 } 3206#ifdef DEBUG_REGEXP_EXEC 3207 printf("entering state %d\n", trans->to); 3208#endif 3209 exec->state = comp->states[trans->to]; 3210 exec->transno = 0; 3211 if (trans->atom != NULL) { 3212 exec->index += len; 3213 } 3214 goto progress; 3215 } else if (ret < 0) { 3216 exec->status = -4; 3217 break; 3218 } 3219 } 3220 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 3221rollback: 3222 /* 3223 * Failed to find a way out 3224 */ 3225 exec->determinist = 0; 3226#ifdef DEBUG_REGEXP_EXEC 3227 printf("rollback from state %d on %d:%c\n", exec->state->no, 3228 codepoint,codepoint); 3229#endif 3230 xmlFARegExecRollBack(exec); 3231 } 3232progress: 3233 continue; 3234 } 3235error: 3236 if (exec->rollbacks != NULL) { 3237 if (exec->counts != NULL) { 3238 int i; 3239 3240 for (i = 0;i < exec->maxRollbacks;i++) 3241 if (exec->rollbacks[i].counts != NULL) 3242 xmlFree(exec->rollbacks[i].counts); 3243 } 3244 xmlFree(exec->rollbacks); 3245 } 3246 if (exec->counts != NULL) 3247 xmlFree(exec->counts); 3248 if (exec->status == 0) 3249 return(1); 3250 if (exec->status == -1) { 3251 if (exec->nbPush > MAX_PUSH) 3252 return(-1); 3253 return(0); 3254 } 3255 return(exec->status); 3256} 3257 3258/************************************************************************ 3259 * * 3260 * Progressive interface to the verifier one atom at a time * 3261 * * 3262 ************************************************************************/ 3263#ifdef DEBUG_ERR 3264static void testerr(xmlRegExecCtxtPtr exec); 3265#endif 3266 3267/** 3268 * xmlRegNewExecCtxt: 3269 * @comp: a precompiled regular expression 3270 * @callback: a callback function used for handling progresses in the 3271 * automata matching phase 3272 * @data: the context data associated to the callback in this context 3273 * 3274 * Build a context used for progressive evaluation of a regexp. 3275 * 3276 * Returns the new context 3277 */ 3278xmlRegExecCtxtPtr 3279xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) { 3280 xmlRegExecCtxtPtr exec; 3281 3282 if (comp == NULL) 3283 return(NULL); 3284 if ((comp->compact == NULL) && (comp->states == NULL)) 3285 return(NULL); 3286 exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt)); 3287 if (exec == NULL) { 3288 xmlRegexpErrMemory(NULL, "creating execution context"); 3289 return(NULL); 3290 } 3291 memset(exec, 0, sizeof(xmlRegExecCtxt)); 3292 exec->inputString = NULL; 3293 exec->index = 0; 3294 exec->determinist = 1; 3295 exec->maxRollbacks = 0; 3296 exec->nbRollbacks = 0; 3297 exec->rollbacks = NULL; 3298 exec->status = 0; 3299 exec->comp = comp; 3300 if (comp->compact == NULL) 3301 exec->state = comp->states[0]; 3302 exec->transno = 0; 3303 exec->transcount = 0; 3304 exec->callback = callback; 3305 exec->data = data; 3306 if (comp->nbCounters > 0) { 3307 /* 3308 * For error handling, exec->counts is allocated twice the size 3309 * the second half is used to store the data in case of rollback 3310 */ 3311 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int) 3312 * 2); 3313 if (exec->counts == NULL) { 3314 xmlRegexpErrMemory(NULL, "creating execution context"); 3315 xmlFree(exec); 3316 return(NULL); 3317 } 3318 memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2); 3319 exec->errCounts = &exec->counts[comp->nbCounters]; 3320 } else { 3321 exec->counts = NULL; 3322 exec->errCounts = NULL; 3323 } 3324 exec->inputStackMax = 0; 3325 exec->inputStackNr = 0; 3326 exec->inputStack = NULL; 3327 exec->errStateNo = -1; 3328 exec->errString = NULL; 3329 exec->nbPush = 0; 3330 return(exec); 3331} 3332 3333/** 3334 * xmlRegFreeExecCtxt: 3335 * @exec: a regular expression evaulation context 3336 * 3337 * Free the structures associated to a regular expression evaulation context. 3338 */ 3339void 3340xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) { 3341 if (exec == NULL) 3342 return; 3343 3344 if (exec->rollbacks != NULL) { 3345 if (exec->counts != NULL) { 3346 int i; 3347 3348 for (i = 0;i < exec->maxRollbacks;i++) 3349 if (exec->rollbacks[i].counts != NULL) 3350 xmlFree(exec->rollbacks[i].counts); 3351 } 3352 xmlFree(exec->rollbacks); 3353 } 3354 if (exec->counts != NULL) 3355 xmlFree(exec->counts); 3356 if (exec->inputStack != NULL) { 3357 int i; 3358 3359 for (i = 0;i < exec->inputStackNr;i++) { 3360 if (exec->inputStack[i].value != NULL) 3361 xmlFree(exec->inputStack[i].value); 3362 } 3363 xmlFree(exec->inputStack); 3364 } 3365 if (exec->errString != NULL) 3366 xmlFree(exec->errString); 3367 xmlFree(exec); 3368} 3369 3370static void 3371xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value, 3372 void *data) { 3373#ifdef DEBUG_PUSH 3374 printf("saving value: %d:%s\n", exec->inputStackNr, value); 3375#endif 3376 if (exec->inputStackMax == 0) { 3377 exec->inputStackMax = 4; 3378 exec->inputStack = (xmlRegInputTokenPtr) 3379 xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken)); 3380 if (exec->inputStack == NULL) { 3381 xmlRegexpErrMemory(NULL, "pushing input string"); 3382 exec->inputStackMax = 0; 3383 return; 3384 } 3385 } else if (exec->inputStackNr + 1 >= exec->inputStackMax) { 3386 xmlRegInputTokenPtr tmp; 3387 3388 exec->inputStackMax *= 2; 3389 tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack, 3390 exec->inputStackMax * sizeof(xmlRegInputToken)); 3391 if (tmp == NULL) { 3392 xmlRegexpErrMemory(NULL, "pushing input string"); 3393 exec->inputStackMax /= 2; 3394 return; 3395 } 3396 exec->inputStack = tmp; 3397 } 3398 exec->inputStack[exec->inputStackNr].value = xmlStrdup(value); 3399 exec->inputStack[exec->inputStackNr].data = data; 3400 exec->inputStackNr++; 3401 exec->inputStack[exec->inputStackNr].value = NULL; 3402 exec->inputStack[exec->inputStackNr].data = NULL; 3403} 3404 3405/** 3406 * xmlRegStrEqualWildcard: 3407 * @expStr: the string to be evaluated 3408 * @valStr: the validation string 3409 * 3410 * Checks if both strings are equal or have the same content. "*" 3411 * can be used as a wildcard in @valStr; "|" is used as a seperator of 3412 * substrings in both @expStr and @valStr. 3413 * 3414 * Returns 1 if the comparison is satisfied and the number of substrings 3415 * is equal, 0 otherwise. 3416 */ 3417 3418static int 3419xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) { 3420 if (expStr == valStr) return(1); 3421 if (expStr == NULL) return(0); 3422 if (valStr == NULL) return(0); 3423 do { 3424 /* 3425 * Eval if we have a wildcard for the current item. 3426 */ 3427 if (*expStr != *valStr) { 3428 /* if one of them starts with a wildcard make valStr be it */ 3429 if (*valStr == '*') { 3430 const xmlChar *tmp; 3431 3432 tmp = valStr; 3433 valStr = expStr; 3434 expStr = tmp; 3435 } 3436 if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) { 3437 do { 3438 if (*valStr == XML_REG_STRING_SEPARATOR) 3439 break; 3440 valStr++; 3441 } while (*valStr != 0); 3442 continue; 3443 } else 3444 return(0); 3445 } 3446 expStr++; 3447 valStr++; 3448 } while (*valStr != 0); 3449 if (*expStr != 0) 3450 return (0); 3451 else 3452 return (1); 3453} 3454 3455/** 3456 * xmlRegCompactPushString: 3457 * @exec: a regexp execution context 3458 * @comp: the precompiled exec with a compact table 3459 * @value: a string token input 3460 * @data: data associated to the token to reuse in callbacks 3461 * 3462 * Push one input token in the execution context 3463 * 3464 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 3465 * a negative value in case of error. 3466 */ 3467static int 3468xmlRegCompactPushString(xmlRegExecCtxtPtr exec, 3469 xmlRegexpPtr comp, 3470 const xmlChar *value, 3471 void *data) { 3472 int state = exec->index; 3473 int i, target; 3474 3475 if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL)) 3476 return(-1); 3477 3478 if (value == NULL) { 3479 /* 3480 * are we at a final state ? 3481 */ 3482 if (comp->compact[state * (comp->nbstrings + 1)] == 3483 XML_REGEXP_FINAL_STATE) 3484 return(1); 3485 return(0); 3486 } 3487 3488#ifdef DEBUG_PUSH 3489 printf("value pushed: %s\n", value); 3490#endif 3491 3492 /* 3493 * Examine all outside transitions from current state 3494 */ 3495 for (i = 0;i < comp->nbstrings;i++) { 3496 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 3497 if ((target > 0) && (target <= comp->nbstates)) { 3498 target--; /* to avoid 0 */ 3499 if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) { 3500 exec->index = target; 3501 if ((exec->callback != NULL) && (comp->transdata != NULL)) { 3502 exec->callback(exec->data, value, 3503 comp->transdata[state * comp->nbstrings + i], data); 3504 } 3505#ifdef DEBUG_PUSH 3506 printf("entering state %d\n", target); 3507#endif 3508 if (comp->compact[target * (comp->nbstrings + 1)] == 3509 XML_REGEXP_SINK_STATE) 3510 goto error; 3511 3512 if (comp->compact[target * (comp->nbstrings + 1)] == 3513 XML_REGEXP_FINAL_STATE) 3514 return(1); 3515 return(0); 3516 } 3517 } 3518 } 3519 /* 3520 * Failed to find an exit transition out from current state for the 3521 * current token 3522 */ 3523#ifdef DEBUG_PUSH 3524 printf("failed to find a transition for %s on state %d\n", value, state); 3525#endif 3526error: 3527 if (exec->errString != NULL) 3528 xmlFree(exec->errString); 3529 exec->errString = xmlStrdup(value); 3530 exec->errStateNo = state; 3531 exec->status = -1; 3532#ifdef DEBUG_ERR 3533 testerr(exec); 3534#endif 3535 return(-1); 3536} 3537 3538/** 3539 * xmlRegExecPushStringInternal: 3540 * @exec: a regexp execution context or NULL to indicate the end 3541 * @value: a string token input 3542 * @data: data associated to the token to reuse in callbacks 3543 * @compound: value was assembled from 2 strings 3544 * 3545 * Push one input token in the execution context 3546 * 3547 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 3548 * a negative value in case of error. 3549 */ 3550static int 3551xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value, 3552 void *data, int compound) { 3553 xmlRegTransPtr trans; 3554 xmlRegAtomPtr atom; 3555 int ret; 3556 int final = 0; 3557 int progress = 1; 3558 3559 if (exec == NULL) 3560 return(-1); 3561 if (exec->comp == NULL) 3562 return(-1); 3563 if (exec->status != 0) 3564 return(exec->status); 3565 3566 if (exec->comp->compact != NULL) 3567 return(xmlRegCompactPushString(exec, exec->comp, value, data)); 3568 3569 if (value == NULL) { 3570 if (exec->state->type == XML_REGEXP_FINAL_STATE) 3571 return(1); 3572 final = 1; 3573 } 3574 3575#ifdef DEBUG_PUSH 3576 printf("value pushed: %s\n", value); 3577#endif 3578 /* 3579 * If we have an active rollback stack push the new value there 3580 * and get back to where we were left 3581 */ 3582 if ((value != NULL) && (exec->inputStackNr > 0)) { 3583 xmlFARegExecSaveInputString(exec, value, data); 3584 value = exec->inputStack[exec->index].value; 3585 data = exec->inputStack[exec->index].data; 3586#ifdef DEBUG_PUSH 3587 printf("value loaded: %s\n", value); 3588#endif 3589 } 3590 3591 while ((exec->status == 0) && 3592 ((value != NULL) || 3593 ((final == 1) && 3594 (exec->state->type != XML_REGEXP_FINAL_STATE)))) { 3595 3596 /* 3597 * End of input on non-terminal state, rollback, however we may 3598 * still have epsilon like transition for counted transitions 3599 * on counters, in that case don't break too early. 3600 */ 3601 if ((value == NULL) && (exec->counts == NULL)) 3602 goto rollback; 3603 3604 exec->transcount = 0; 3605 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 3606 trans = &exec->state->trans[exec->transno]; 3607 if (trans->to < 0) 3608 continue; 3609 atom = trans->atom; 3610 ret = 0; 3611 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 3612 int i; 3613 int count; 3614 xmlRegTransPtr t; 3615 xmlRegCounterPtr counter; 3616 3617 ret = 0; 3618 3619#ifdef DEBUG_PUSH 3620 printf("testing all lax %d\n", trans->count); 3621#endif 3622 /* 3623 * Check all counted transitions from the current state 3624 */ 3625 if ((value == NULL) && (final)) { 3626 ret = 1; 3627 } else if (value != NULL) { 3628 for (i = 0;i < exec->state->nbTrans;i++) { 3629 t = &exec->state->trans[i]; 3630 if ((t->counter < 0) || (t == trans)) 3631 continue; 3632 counter = &exec->comp->counters[t->counter]; 3633 count = exec->counts[t->counter]; 3634 if ((count < counter->max) && 3635 (t->atom != NULL) && 3636 (xmlStrEqual(value, t->atom->valuep))) { 3637 ret = 0; 3638 break; 3639 } 3640 if ((count >= counter->min) && 3641 (count < counter->max) && 3642 (t->atom != NULL) && 3643 (xmlStrEqual(value, t->atom->valuep))) { 3644 ret = 1; 3645 break; 3646 } 3647 } 3648 } 3649 } else if (trans->count == REGEXP_ALL_COUNTER) { 3650 int i; 3651 int count; 3652 xmlRegTransPtr t; 3653 xmlRegCounterPtr counter; 3654 3655 ret = 1; 3656 3657#ifdef DEBUG_PUSH 3658 printf("testing all %d\n", trans->count); 3659#endif 3660 /* 3661 * Check all counted transitions from the current state 3662 */ 3663 for (i = 0;i < exec->state->nbTrans;i++) { 3664 t = &exec->state->trans[i]; 3665 if ((t->counter < 0) || (t == trans)) 3666 continue; 3667 counter = &exec->comp->counters[t->counter]; 3668 count = exec->counts[t->counter]; 3669 if ((count < counter->min) || (count > counter->max)) { 3670 ret = 0; 3671 break; 3672 } 3673 } 3674 } else if (trans->count >= 0) { 3675 int count; 3676 xmlRegCounterPtr counter; 3677 3678 /* 3679 * A counted transition. 3680 */ 3681 3682 count = exec->counts[trans->count]; 3683 counter = &exec->comp->counters[trans->count]; 3684#ifdef DEBUG_PUSH 3685 printf("testing count %d: val %d, min %d, max %d\n", 3686 trans->count, count, counter->min, counter->max); 3687#endif 3688 ret = ((count >= counter->min) && (count <= counter->max)); 3689 } else if (atom == NULL) { 3690 fprintf(stderr, "epsilon transition left at runtime\n"); 3691 exec->status = -2; 3692 break; 3693 } else if (value != NULL) { 3694 ret = xmlRegStrEqualWildcard(atom->valuep, value); 3695 if (atom->neg) { 3696 ret = !ret; 3697 if (!compound) 3698 ret = 0; 3699 } 3700 if ((ret == 1) && (trans->counter >= 0)) { 3701 xmlRegCounterPtr counter; 3702 int count; 3703 3704 count = exec->counts[trans->counter]; 3705 counter = &exec->comp->counters[trans->counter]; 3706 if (count >= counter->max) 3707 ret = 0; 3708 } 3709 3710 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 3711 xmlRegStatePtr to = exec->comp->states[trans->to]; 3712 3713 /* 3714 * this is a multiple input sequence 3715 */ 3716 if (exec->state->nbTrans > exec->transno + 1) { 3717 if (exec->inputStackNr <= 0) { 3718 xmlFARegExecSaveInputString(exec, value, data); 3719 } 3720 xmlFARegExecSave(exec); 3721 } 3722 exec->transcount = 1; 3723 do { 3724 /* 3725 * Try to progress as much as possible on the input 3726 */ 3727 if (exec->transcount == atom->max) { 3728 break; 3729 } 3730 exec->index++; 3731 value = exec->inputStack[exec->index].value; 3732 data = exec->inputStack[exec->index].data; 3733#ifdef DEBUG_PUSH 3734 printf("value loaded: %s\n", value); 3735#endif 3736 3737 /* 3738 * End of input: stop here 3739 */ 3740 if (value == NULL) { 3741 exec->index --; 3742 break; 3743 } 3744 if (exec->transcount >= atom->min) { 3745 int transno = exec->transno; 3746 xmlRegStatePtr state = exec->state; 3747 3748 /* 3749 * The transition is acceptable save it 3750 */ 3751 exec->transno = -1; /* trick */ 3752 exec->state = to; 3753 if (exec->inputStackNr <= 0) { 3754 xmlFARegExecSaveInputString(exec, value, data); 3755 } 3756 xmlFARegExecSave(exec); 3757 exec->transno = transno; 3758 exec->state = state; 3759 } 3760 ret = xmlStrEqual(value, atom->valuep); 3761 exec->transcount++; 3762 } while (ret == 1); 3763 if (exec->transcount < atom->min) 3764 ret = 0; 3765 3766 /* 3767 * If the last check failed but one transition was found 3768 * possible, rollback 3769 */ 3770 if (ret < 0) 3771 ret = 0; 3772 if (ret == 0) { 3773 goto rollback; 3774 } 3775 } 3776 } 3777 if (ret == 1) { 3778 if ((exec->callback != NULL) && (atom != NULL) && 3779 (data != NULL)) { 3780 exec->callback(exec->data, atom->valuep, 3781 atom->data, data); 3782 } 3783 if (exec->state->nbTrans > exec->transno + 1) { 3784 if (exec->inputStackNr <= 0) { 3785 xmlFARegExecSaveInputString(exec, value, data); 3786 } 3787 xmlFARegExecSave(exec); 3788 } 3789 if (trans->counter >= 0) { 3790#ifdef DEBUG_PUSH 3791 printf("Increasing count %d\n", trans->counter); 3792#endif 3793 exec->counts[trans->counter]++; 3794 } 3795 if ((trans->count >= 0) && 3796 (trans->count < REGEXP_ALL_COUNTER)) { 3797#ifdef DEBUG_REGEXP_EXEC 3798 printf("resetting count %d on transition\n", 3799 trans->count); 3800#endif 3801 exec->counts[trans->count] = 0; 3802 } 3803#ifdef DEBUG_PUSH 3804 printf("entering state %d\n", trans->to); 3805#endif 3806 if ((exec->comp->states[trans->to] != NULL) && 3807 (exec->comp->states[trans->to]->type == 3808 XML_REGEXP_SINK_STATE)) { 3809 /* 3810 * entering a sink state, save the current state as error 3811 * state. 3812 */ 3813 if (exec->errString != NULL) 3814 xmlFree(exec->errString); 3815 exec->errString = xmlStrdup(value); 3816 exec->errState = exec->state; 3817 memcpy(exec->errCounts, exec->counts, 3818 exec->comp->nbCounters * sizeof(int)); 3819 } 3820 exec->state = exec->comp->states[trans->to]; 3821 exec->transno = 0; 3822 if (trans->atom != NULL) { 3823 if (exec->inputStack != NULL) { 3824 exec->index++; 3825 if (exec->index < exec->inputStackNr) { 3826 value = exec->inputStack[exec->index].value; 3827 data = exec->inputStack[exec->index].data; 3828#ifdef DEBUG_PUSH 3829 printf("value loaded: %s\n", value); 3830#endif 3831 } else { 3832 value = NULL; 3833 data = NULL; 3834#ifdef DEBUG_PUSH 3835 printf("end of input\n"); 3836#endif 3837 } 3838 } else { 3839 value = NULL; 3840 data = NULL; 3841#ifdef DEBUG_PUSH 3842 printf("end of input\n"); 3843#endif 3844 } 3845 } 3846 goto progress; 3847 } else if (ret < 0) { 3848 exec->status = -4; 3849 break; 3850 } 3851 } 3852 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 3853rollback: 3854 /* 3855 * if we didn't yet rollback on the current input 3856 * store the current state as the error state. 3857 */ 3858 if ((progress) && (exec->state != NULL) && 3859 (exec->state->type != XML_REGEXP_SINK_STATE)) { 3860 progress = 0; 3861 if (exec->errString != NULL) 3862 xmlFree(exec->errString); 3863 exec->errString = xmlStrdup(value); 3864 exec->errState = exec->state; 3865 memcpy(exec->errCounts, exec->counts, 3866 exec->comp->nbCounters * sizeof(int)); 3867 } 3868 3869 /* 3870 * Failed to find a way out 3871 */ 3872 exec->determinist = 0; 3873 xmlFARegExecRollBack(exec); 3874 if (exec->status == 0) { 3875 value = exec->inputStack[exec->index].value; 3876 data = exec->inputStack[exec->index].data; 3877#ifdef DEBUG_PUSH 3878 printf("value loaded: %s\n", value); 3879#endif 3880 } 3881 } 3882 continue; 3883progress: 3884 progress = 1; 3885 continue; 3886 } 3887 if (exec->status == 0) { 3888 return(exec->state->type == XML_REGEXP_FINAL_STATE); 3889 } 3890#ifdef DEBUG_ERR 3891 if (exec->status < 0) { 3892 testerr(exec); 3893 } 3894#endif 3895 return(exec->status); 3896} 3897 3898/** 3899 * xmlRegExecPushString: 3900 * @exec: a regexp execution context or NULL to indicate the end 3901 * @value: a string token input 3902 * @data: data associated to the token to reuse in callbacks 3903 * 3904 * Push one input token in the execution context 3905 * 3906 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 3907 * a negative value in case of error. 3908 */ 3909int 3910xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value, 3911 void *data) { 3912 return(xmlRegExecPushStringInternal(exec, value, data, 0)); 3913} 3914 3915/** 3916 * xmlRegExecPushString2: 3917 * @exec: a regexp execution context or NULL to indicate the end 3918 * @value: the first string token input 3919 * @value2: the second string token input 3920 * @data: data associated to the token to reuse in callbacks 3921 * 3922 * Push one input token in the execution context 3923 * 3924 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 3925 * a negative value in case of error. 3926 */ 3927int 3928xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value, 3929 const xmlChar *value2, void *data) { 3930 xmlChar buf[150]; 3931 int lenn, lenp, ret; 3932 xmlChar *str; 3933 3934 if (exec == NULL) 3935 return(-1); 3936 if (exec->comp == NULL) 3937 return(-1); 3938 if (exec->status != 0) 3939 return(exec->status); 3940 3941 if (value2 == NULL) 3942 return(xmlRegExecPushString(exec, value, data)); 3943 3944 lenn = strlen((char *) value2); 3945 lenp = strlen((char *) value); 3946 3947 if (150 < lenn + lenp + 2) { 3948 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 3949 if (str == NULL) { 3950 exec->status = -1; 3951 return(-1); 3952 } 3953 } else { 3954 str = buf; 3955 } 3956 memcpy(&str[0], value, lenp); 3957 str[lenp] = XML_REG_STRING_SEPARATOR; 3958 memcpy(&str[lenp + 1], value2, lenn); 3959 str[lenn + lenp + 1] = 0; 3960 3961 if (exec->comp->compact != NULL) 3962 ret = xmlRegCompactPushString(exec, exec->comp, str, data); 3963 else 3964 ret = xmlRegExecPushStringInternal(exec, str, data, 1); 3965 3966 if (str != buf) 3967 xmlFree(str); 3968 return(ret); 3969} 3970 3971/** 3972 * xmlRegExecGetValues: 3973 * @exec: a regexp execution context 3974 * @err: error extraction or normal one 3975 * @nbval: pointer to the number of accepted values IN/OUT 3976 * @nbneg: return number of negative transitions 3977 * @values: pointer to the array of acceptable values 3978 * @terminal: return value if this was a terminal state 3979 * 3980 * Extract informations from the regexp execution, internal routine to 3981 * implement xmlRegExecNextValues() and xmlRegExecErrInfo() 3982 * 3983 * Returns: 0 in case of success or -1 in case of error. 3984 */ 3985static int 3986xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err, 3987 int *nbval, int *nbneg, 3988 xmlChar **values, int *terminal) { 3989 int maxval; 3990 int nb = 0; 3991 3992 if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) || 3993 (values == NULL) || (*nbval <= 0)) 3994 return(-1); 3995 3996 maxval = *nbval; 3997 *nbval = 0; 3998 *nbneg = 0; 3999 if ((exec->comp != NULL) && (exec->comp->compact != NULL)) { 4000 xmlRegexpPtr comp; 4001 int target, i, state; 4002 4003 comp = exec->comp; 4004 4005 if (err) { 4006 if (exec->errStateNo == -1) return(-1); 4007 state = exec->errStateNo; 4008 } else { 4009 state = exec->index; 4010 } 4011 if (terminal != NULL) { 4012 if (comp->compact[state * (comp->nbstrings + 1)] == 4013 XML_REGEXP_FINAL_STATE) 4014 *terminal = 1; 4015 else 4016 *terminal = 0; 4017 } 4018 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) { 4019 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 4020 if ((target > 0) && (target <= comp->nbstates) && 4021 (comp->compact[(target - 1) * (comp->nbstrings + 1)] != 4022 XML_REGEXP_SINK_STATE)) { 4023 values[nb++] = comp->stringMap[i]; 4024 (*nbval)++; 4025 } 4026 } 4027 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) { 4028 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 4029 if ((target > 0) && (target <= comp->nbstates) && 4030 (comp->compact[(target - 1) * (comp->nbstrings + 1)] == 4031 XML_REGEXP_SINK_STATE)) { 4032 values[nb++] = comp->stringMap[i]; 4033 (*nbneg)++; 4034 } 4035 } 4036 } else { 4037 int transno; 4038 xmlRegTransPtr trans; 4039 xmlRegAtomPtr atom; 4040 xmlRegStatePtr state; 4041 4042 if (terminal != NULL) { 4043 if (exec->state->type == XML_REGEXP_FINAL_STATE) 4044 *terminal = 1; 4045 else 4046 *terminal = 0; 4047 } 4048 4049 if (err) { 4050 if (exec->errState == NULL) return(-1); 4051 state = exec->errState; 4052 } else { 4053 if (exec->state == NULL) return(-1); 4054 state = exec->state; 4055 } 4056 for (transno = 0; 4057 (transno < state->nbTrans) && (nb < maxval); 4058 transno++) { 4059 trans = &state->trans[transno]; 4060 if (trans->to < 0) 4061 continue; 4062 atom = trans->atom; 4063 if ((atom == NULL) || (atom->valuep == NULL)) 4064 continue; 4065 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 4066 /* this should not be reached but ... */ 4067 TODO; 4068 } else if (trans->count == REGEXP_ALL_COUNTER) { 4069 /* this should not be reached but ... */ 4070 TODO; 4071 } else if (trans->counter >= 0) { 4072 xmlRegCounterPtr counter = NULL; 4073 int count; 4074 4075 if (err) 4076 count = exec->errCounts[trans->counter]; 4077 else 4078 count = exec->counts[trans->counter]; 4079 if (exec->comp != NULL) 4080 counter = &exec->comp->counters[trans->counter]; 4081 if ((counter == NULL) || (count < counter->max)) { 4082 if (atom->neg) 4083 values[nb++] = (xmlChar *) atom->valuep2; 4084 else 4085 values[nb++] = (xmlChar *) atom->valuep; 4086 (*nbval)++; 4087 } 4088 } else { 4089 if ((exec->comp->states[trans->to] != NULL) && 4090 (exec->comp->states[trans->to]->type != 4091 XML_REGEXP_SINK_STATE)) { 4092 if (atom->neg) 4093 values[nb++] = (xmlChar *) atom->valuep2; 4094 else 4095 values[nb++] = (xmlChar *) atom->valuep; 4096 (*nbval)++; 4097 } 4098 } 4099 } 4100 for (transno = 0; 4101 (transno < state->nbTrans) && (nb < maxval); 4102 transno++) { 4103 trans = &state->trans[transno]; 4104 if (trans->to < 0) 4105 continue; 4106 atom = trans->atom; 4107 if ((atom == NULL) || (atom->valuep == NULL)) 4108 continue; 4109 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 4110 continue; 4111 } else if (trans->count == REGEXP_ALL_COUNTER) { 4112 continue; 4113 } else if (trans->counter >= 0) { 4114 continue; 4115 } else { 4116 if ((exec->comp->states[trans->to] != NULL) && 4117 (exec->comp->states[trans->to]->type == 4118 XML_REGEXP_SINK_STATE)) { 4119 if (atom->neg) 4120 values[nb++] = (xmlChar *) atom->valuep2; 4121 else 4122 values[nb++] = (xmlChar *) atom->valuep; 4123 (*nbneg)++; 4124 } 4125 } 4126 } 4127 } 4128 return(0); 4129} 4130 4131/** 4132 * xmlRegExecNextValues: 4133 * @exec: a regexp execution context 4134 * @nbval: pointer to the number of accepted values IN/OUT 4135 * @nbneg: return number of negative transitions 4136 * @values: pointer to the array of acceptable values 4137 * @terminal: return value if this was a terminal state 4138 * 4139 * Extract informations from the regexp execution, 4140 * the parameter @values must point to an array of @nbval string pointers 4141 * on return nbval will contain the number of possible strings in that 4142 * state and the @values array will be updated with them. The string values 4143 * returned will be freed with the @exec context and don't need to be 4144 * deallocated. 4145 * 4146 * Returns: 0 in case of success or -1 in case of error. 4147 */ 4148int 4149xmlRegExecNextValues(xmlRegExecCtxtPtr exec, int *nbval, int *nbneg, 4150 xmlChar **values, int *terminal) { 4151 return(xmlRegExecGetValues(exec, 0, nbval, nbneg, values, terminal)); 4152} 4153 4154/** 4155 * xmlRegExecErrInfo: 4156 * @exec: a regexp execution context generating an error 4157 * @string: return value for the error string 4158 * @nbval: pointer to the number of accepted values IN/OUT 4159 * @nbneg: return number of negative transitions 4160 * @values: pointer to the array of acceptable values 4161 * @terminal: return value if this was a terminal state 4162 * 4163 * Extract error informations from the regexp execution, the parameter 4164 * @string will be updated with the value pushed and not accepted, 4165 * the parameter @values must point to an array of @nbval string pointers 4166 * on return nbval will contain the number of possible strings in that 4167 * state and the @values array will be updated with them. The string values 4168 * returned will be freed with the @exec context and don't need to be 4169 * deallocated. 4170 * 4171 * Returns: 0 in case of success or -1 in case of error. 4172 */ 4173int 4174xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string, 4175 int *nbval, int *nbneg, xmlChar **values, int *terminal) { 4176 if (exec == NULL) 4177 return(-1); 4178 if (string != NULL) { 4179 if (exec->status != 0) 4180 *string = exec->errString; 4181 else 4182 *string = NULL; 4183 } 4184 return(xmlRegExecGetValues(exec, 1, nbval, nbneg, values, terminal)); 4185} 4186 4187#ifdef DEBUG_ERR 4188static void testerr(xmlRegExecCtxtPtr exec) { 4189 const xmlChar *string; 4190 xmlChar *values[5]; 4191 int nb = 5; 4192 int nbneg; 4193 int terminal; 4194 xmlRegExecErrInfo(exec, &string, &nb, &nbneg, &values[0], &terminal); 4195} 4196#endif 4197 4198#if 0 4199static int 4200xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) { 4201 xmlRegTransPtr trans; 4202 xmlRegAtomPtr atom; 4203 int ret; 4204 int codepoint, len; 4205 4206 if (exec == NULL) 4207 return(-1); 4208 if (exec->status != 0) 4209 return(exec->status); 4210 4211 while ((exec->status == 0) && 4212 ((exec->inputString[exec->index] != 0) || 4213 (exec->state->type != XML_REGEXP_FINAL_STATE))) { 4214 4215 /* 4216 * End of input on non-terminal state, rollback, however we may 4217 * still have epsilon like transition for counted transitions 4218 * on counters, in that case don't break too early. 4219 */ 4220 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) 4221 goto rollback; 4222 4223 exec->transcount = 0; 4224 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 4225 trans = &exec->state->trans[exec->transno]; 4226 if (trans->to < 0) 4227 continue; 4228 atom = trans->atom; 4229 ret = 0; 4230 if (trans->count >= 0) { 4231 int count; 4232 xmlRegCounterPtr counter; 4233 4234 /* 4235 * A counted transition. 4236 */ 4237 4238 count = exec->counts[trans->count]; 4239 counter = &exec->comp->counters[trans->count]; 4240#ifdef DEBUG_REGEXP_EXEC 4241 printf("testing count %d: val %d, min %d, max %d\n", 4242 trans->count, count, counter->min, counter->max); 4243#endif 4244 ret = ((count >= counter->min) && (count <= counter->max)); 4245 } else if (atom == NULL) { 4246 fprintf(stderr, "epsilon transition left at runtime\n"); 4247 exec->status = -2; 4248 break; 4249 } else if (exec->inputString[exec->index] != 0) { 4250 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); 4251 ret = xmlRegCheckCharacter(atom, codepoint); 4252 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 4253 xmlRegStatePtr to = exec->comp->states[trans->to]; 4254 4255 /* 4256 * this is a multiple input sequence 4257 */ 4258 if (exec->state->nbTrans > exec->transno + 1) { 4259 xmlFARegExecSave(exec); 4260 } 4261 exec->transcount = 1; 4262 do { 4263 /* 4264 * Try to progress as much as possible on the input 4265 */ 4266 if (exec->transcount == atom->max) { 4267 break; 4268 } 4269 exec->index += len; 4270 /* 4271 * End of input: stop here 4272 */ 4273 if (exec->inputString[exec->index] == 0) { 4274 exec->index -= len; 4275 break; 4276 } 4277 if (exec->transcount >= atom->min) { 4278 int transno = exec->transno; 4279 xmlRegStatePtr state = exec->state; 4280 4281 /* 4282 * The transition is acceptable save it 4283 */ 4284 exec->transno = -1; /* trick */ 4285 exec->state = to; 4286 xmlFARegExecSave(exec); 4287 exec->transno = transno; 4288 exec->state = state; 4289 } 4290 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), 4291 len); 4292 ret = xmlRegCheckCharacter(atom, codepoint); 4293 exec->transcount++; 4294 } while (ret == 1); 4295 if (exec->transcount < atom->min) 4296 ret = 0; 4297 4298 /* 4299 * If the last check failed but one transition was found 4300 * possible, rollback 4301 */ 4302 if (ret < 0) 4303 ret = 0; 4304 if (ret == 0) { 4305 goto rollback; 4306 } 4307 } 4308 } 4309 if (ret == 1) { 4310 if (exec->state->nbTrans > exec->transno + 1) { 4311 xmlFARegExecSave(exec); 4312 } 4313 /* 4314 * restart count for expressions like this ((abc){2})* 4315 */ 4316 if (trans->count >= 0) { 4317#ifdef DEBUG_REGEXP_EXEC 4318 printf("Reset count %d\n", trans->count); 4319#endif 4320 exec->counts[trans->count] = 0; 4321 } 4322 if (trans->counter >= 0) { 4323#ifdef DEBUG_REGEXP_EXEC 4324 printf("Increasing count %d\n", trans->counter); 4325#endif 4326 exec->counts[trans->counter]++; 4327 } 4328#ifdef DEBUG_REGEXP_EXEC 4329 printf("entering state %d\n", trans->to); 4330#endif 4331 exec->state = exec->comp->states[trans->to]; 4332 exec->transno = 0; 4333 if (trans->atom != NULL) { 4334 exec->index += len; 4335 } 4336 goto progress; 4337 } else if (ret < 0) { 4338 exec->status = -4; 4339 break; 4340 } 4341 } 4342 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 4343rollback: 4344 /* 4345 * Failed to find a way out 4346 */ 4347 exec->determinist = 0; 4348 xmlFARegExecRollBack(exec); 4349 } 4350progress: 4351 continue; 4352 } 4353} 4354#endif 4355/************************************************************************ 4356 * * 4357 * Parser for the Schemas Datatype Regular Expressions * 4358 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs * 4359 * * 4360 ************************************************************************/ 4361 4362/** 4363 * xmlFAIsChar: 4364 * @ctxt: a regexp parser context 4365 * 4366 * [10] Char ::= [^.\?*+()|#x5B#x5D] 4367 */ 4368static int 4369xmlFAIsChar(xmlRegParserCtxtPtr ctxt) { 4370 int cur; 4371 int len; 4372 4373 cur = CUR_SCHAR(ctxt->cur, len); 4374 if ((cur == '.') || (cur == '\\') || (cur == '?') || 4375 (cur == '*') || (cur == '+') || (cur == '(') || 4376 (cur == ')') || (cur == '|') || (cur == 0x5B) || 4377 (cur == 0x5D) || (cur == 0)) 4378 return(-1); 4379 return(cur); 4380} 4381 4382/** 4383 * xmlFAParseCharProp: 4384 * @ctxt: a regexp parser context 4385 * 4386 * [27] charProp ::= IsCategory | IsBlock 4387 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation | 4388 * Separators | Symbols | Others 4389 * [29] Letters ::= 'L' [ultmo]? 4390 * [30] Marks ::= 'M' [nce]? 4391 * [31] Numbers ::= 'N' [dlo]? 4392 * [32] Punctuation ::= 'P' [cdseifo]? 4393 * [33] Separators ::= 'Z' [slp]? 4394 * [34] Symbols ::= 'S' [mcko]? 4395 * [35] Others ::= 'C' [cfon]? 4396 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+ 4397 */ 4398static void 4399xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) { 4400 int cur; 4401 xmlRegAtomType type = (xmlRegAtomType) 0; 4402 xmlChar *blockName = NULL; 4403 4404 cur = CUR; 4405 if (cur == 'L') { 4406 NEXT; 4407 cur = CUR; 4408 if (cur == 'u') { 4409 NEXT; 4410 type = XML_REGEXP_LETTER_UPPERCASE; 4411 } else if (cur == 'l') { 4412 NEXT; 4413 type = XML_REGEXP_LETTER_LOWERCASE; 4414 } else if (cur == 't') { 4415 NEXT; 4416 type = XML_REGEXP_LETTER_TITLECASE; 4417 } else if (cur == 'm') { 4418 NEXT; 4419 type = XML_REGEXP_LETTER_MODIFIER; 4420 } else if (cur == 'o') { 4421 NEXT; 4422 type = XML_REGEXP_LETTER_OTHERS; 4423 } else { 4424 type = XML_REGEXP_LETTER; 4425 } 4426 } else if (cur == 'M') { 4427 NEXT; 4428 cur = CUR; 4429 if (cur == 'n') { 4430 NEXT; 4431 /* nonspacing */ 4432 type = XML_REGEXP_MARK_NONSPACING; 4433 } else if (cur == 'c') { 4434 NEXT; 4435 /* spacing combining */ 4436 type = XML_REGEXP_MARK_SPACECOMBINING; 4437 } else if (cur == 'e') { 4438 NEXT; 4439 /* enclosing */ 4440 type = XML_REGEXP_MARK_ENCLOSING; 4441 } else { 4442 /* all marks */ 4443 type = XML_REGEXP_MARK; 4444 } 4445 } else if (cur == 'N') { 4446 NEXT; 4447 cur = CUR; 4448 if (cur == 'd') { 4449 NEXT; 4450 /* digital */ 4451 type = XML_REGEXP_NUMBER_DECIMAL; 4452 } else if (cur == 'l') { 4453 NEXT; 4454 /* letter */ 4455 type = XML_REGEXP_NUMBER_LETTER; 4456 } else if (cur == 'o') { 4457 NEXT; 4458 /* other */ 4459 type = XML_REGEXP_NUMBER_OTHERS; 4460 } else { 4461 /* all numbers */ 4462 type = XML_REGEXP_NUMBER; 4463 } 4464 } else if (cur == 'P') { 4465 NEXT; 4466 cur = CUR; 4467 if (cur == 'c') { 4468 NEXT; 4469 /* connector */ 4470 type = XML_REGEXP_PUNCT_CONNECTOR; 4471 } else if (cur == 'd') { 4472 NEXT; 4473 /* dash */ 4474 type = XML_REGEXP_PUNCT_DASH; 4475 } else if (cur == 's') { 4476 NEXT; 4477 /* open */ 4478 type = XML_REGEXP_PUNCT_OPEN; 4479 } else if (cur == 'e') { 4480 NEXT; 4481 /* close */ 4482 type = XML_REGEXP_PUNCT_CLOSE; 4483 } else if (cur == 'i') { 4484 NEXT; 4485 /* initial quote */ 4486 type = XML_REGEXP_PUNCT_INITQUOTE; 4487 } else if (cur == 'f') { 4488 NEXT; 4489 /* final quote */ 4490 type = XML_REGEXP_PUNCT_FINQUOTE; 4491 } else if (cur == 'o') { 4492 NEXT; 4493 /* other */ 4494 type = XML_REGEXP_PUNCT_OTHERS; 4495 } else { 4496 /* all punctuation */ 4497 type = XML_REGEXP_PUNCT; 4498 } 4499 } else if (cur == 'Z') { 4500 NEXT; 4501 cur = CUR; 4502 if (cur == 's') { 4503 NEXT; 4504 /* space */ 4505 type = XML_REGEXP_SEPAR_SPACE; 4506 } else if (cur == 'l') { 4507 NEXT; 4508 /* line */ 4509 type = XML_REGEXP_SEPAR_LINE; 4510 } else if (cur == 'p') { 4511 NEXT; 4512 /* paragraph */ 4513 type = XML_REGEXP_SEPAR_PARA; 4514 } else { 4515 /* all separators */ 4516 type = XML_REGEXP_SEPAR; 4517 } 4518 } else if (cur == 'S') { 4519 NEXT; 4520 cur = CUR; 4521 if (cur == 'm') { 4522 NEXT; 4523 type = XML_REGEXP_SYMBOL_MATH; 4524 /* math */ 4525 } else if (cur == 'c') { 4526 NEXT; 4527 type = XML_REGEXP_SYMBOL_CURRENCY; 4528 /* currency */ 4529 } else if (cur == 'k') { 4530 NEXT; 4531 type = XML_REGEXP_SYMBOL_MODIFIER; 4532 /* modifiers */ 4533 } else if (cur == 'o') { 4534 NEXT; 4535 type = XML_REGEXP_SYMBOL_OTHERS; 4536 /* other */ 4537 } else { 4538 /* all symbols */ 4539 type = XML_REGEXP_SYMBOL; 4540 } 4541 } else if (cur == 'C') { 4542 NEXT; 4543 cur = CUR; 4544 if (cur == 'c') { 4545 NEXT; 4546 /* control */ 4547 type = XML_REGEXP_OTHER_CONTROL; 4548 } else if (cur == 'f') { 4549 NEXT; 4550 /* format */ 4551 type = XML_REGEXP_OTHER_FORMAT; 4552 } else if (cur == 'o') { 4553 NEXT; 4554 /* private use */ 4555 type = XML_REGEXP_OTHER_PRIVATE; 4556 } else if (cur == 'n') { 4557 NEXT; 4558 /* not assigned */ 4559 type = XML_REGEXP_OTHER_NA; 4560 } else { 4561 /* all others */ 4562 type = XML_REGEXP_OTHER; 4563 } 4564 } else if (cur == 'I') { 4565 const xmlChar *start; 4566 NEXT; 4567 cur = CUR; 4568 if (cur != 's') { 4569 ERROR("IsXXXX expected"); 4570 return; 4571 } 4572 NEXT; 4573 start = ctxt->cur; 4574 cur = CUR; 4575 if (((cur >= 'a') && (cur <= 'z')) || 4576 ((cur >= 'A') && (cur <= 'Z')) || 4577 ((cur >= '0') && (cur <= '9')) || 4578 (cur == 0x2D)) { 4579 NEXT; 4580 cur = CUR; 4581 while (((cur >= 'a') && (cur <= 'z')) || 4582 ((cur >= 'A') && (cur <= 'Z')) || 4583 ((cur >= '0') && (cur <= '9')) || 4584 (cur == 0x2D)) { 4585 NEXT; 4586 cur = CUR; 4587 } 4588 } 4589 type = XML_REGEXP_BLOCK_NAME; 4590 blockName = xmlStrndup(start, ctxt->cur - start); 4591 } else { 4592 ERROR("Unknown char property"); 4593 return; 4594 } 4595 if (ctxt->atom == NULL) { 4596 ctxt->atom = xmlRegNewAtom(ctxt, type); 4597 if (ctxt->atom != NULL) 4598 ctxt->atom->valuep = blockName; 4599 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4600 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4601 type, 0, 0, blockName); 4602 } 4603} 4604 4605/** 4606 * xmlFAParseCharClassEsc: 4607 * @ctxt: a regexp parser context 4608 * 4609 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc ) 4610 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E] 4611 * [25] catEsc ::= '\p{' charProp '}' 4612 * [26] complEsc ::= '\P{' charProp '}' 4613 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW]) 4614 */ 4615static void 4616xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) { 4617 int cur; 4618 4619 if (CUR == '.') { 4620 if (ctxt->atom == NULL) { 4621 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR); 4622 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4623 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4624 XML_REGEXP_ANYCHAR, 0, 0, NULL); 4625 } 4626 NEXT; 4627 return; 4628 } 4629 if (CUR != '\\') { 4630 ERROR("Escaped sequence: expecting \\"); 4631 return; 4632 } 4633 NEXT; 4634 cur = CUR; 4635 if (cur == 'p') { 4636 NEXT; 4637 if (CUR != '{') { 4638 ERROR("Expecting '{'"); 4639 return; 4640 } 4641 NEXT; 4642 xmlFAParseCharProp(ctxt); 4643 if (CUR != '}') { 4644 ERROR("Expecting '}'"); 4645 return; 4646 } 4647 NEXT; 4648 } else if (cur == 'P') { 4649 NEXT; 4650 if (CUR != '{') { 4651 ERROR("Expecting '{'"); 4652 return; 4653 } 4654 NEXT; 4655 xmlFAParseCharProp(ctxt); 4656 ctxt->atom->neg = 1; 4657 if (CUR != '}') { 4658 ERROR("Expecting '}'"); 4659 return; 4660 } 4661 NEXT; 4662 } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') || 4663 (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') || 4664 (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') || 4665 (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) || 4666 (cur == 0x5E)) { 4667 if (ctxt->atom == NULL) { 4668 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); 4669 if (ctxt->atom != NULL) { 4670 switch (cur) { 4671 case 'n': 4672 ctxt->atom->codepoint = '\n'; 4673 break; 4674 case 'r': 4675 ctxt->atom->codepoint = '\r'; 4676 break; 4677 case 't': 4678 ctxt->atom->codepoint = '\t'; 4679 break; 4680 default: 4681 ctxt->atom->codepoint = cur; 4682 } 4683 } 4684 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4685 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4686 XML_REGEXP_CHARVAL, cur, cur, NULL); 4687 } 4688 NEXT; 4689 } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') || 4690 (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') || 4691 (cur == 'w') || (cur == 'W')) { 4692 xmlRegAtomType type = XML_REGEXP_ANYSPACE; 4693 4694 switch (cur) { 4695 case 's': 4696 type = XML_REGEXP_ANYSPACE; 4697 break; 4698 case 'S': 4699 type = XML_REGEXP_NOTSPACE; 4700 break; 4701 case 'i': 4702 type = XML_REGEXP_INITNAME; 4703 break; 4704 case 'I': 4705 type = XML_REGEXP_NOTINITNAME; 4706 break; 4707 case 'c': 4708 type = XML_REGEXP_NAMECHAR; 4709 break; 4710 case 'C': 4711 type = XML_REGEXP_NOTNAMECHAR; 4712 break; 4713 case 'd': 4714 type = XML_REGEXP_DECIMAL; 4715 break; 4716 case 'D': 4717 type = XML_REGEXP_NOTDECIMAL; 4718 break; 4719 case 'w': 4720 type = XML_REGEXP_REALCHAR; 4721 break; 4722 case 'W': 4723 type = XML_REGEXP_NOTREALCHAR; 4724 break; 4725 } 4726 NEXT; 4727 if (ctxt->atom == NULL) { 4728 ctxt->atom = xmlRegNewAtom(ctxt, type); 4729 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4730 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4731 type, 0, 0, NULL); 4732 } 4733 } 4734} 4735 4736/** 4737 * xmlFAParseCharRef: 4738 * @ctxt: a regexp parser context 4739 * 4740 * [19] XmlCharRef ::= ( '&#' [0-9]+ ';' ) | (' &#x' [0-9a-fA-F]+ ';' ) 4741 */ 4742static int 4743xmlFAParseCharRef(xmlRegParserCtxtPtr ctxt) { 4744 int ret = 0, cur; 4745 4746 if ((CUR != '&') || (NXT(1) != '#')) 4747 return(-1); 4748 NEXT; 4749 NEXT; 4750 cur = CUR; 4751 if (cur == 'x') { 4752 NEXT; 4753 cur = CUR; 4754 if (((cur >= '0') && (cur <= '9')) || 4755 ((cur >= 'a') && (cur <= 'f')) || 4756 ((cur >= 'A') && (cur <= 'F'))) { 4757 while (((cur >= '0') && (cur <= '9')) || 4758 ((cur >= 'a') && (cur <= 'f')) || 4759 ((cur >= 'A') && (cur <= 'F'))) { 4760 if ((cur >= '0') && (cur <= '9')) 4761 ret = ret * 16 + cur - '0'; 4762 else if ((cur >= 'a') && (cur <= 'f')) 4763 ret = ret * 16 + 10 + (cur - 'a'); 4764 else 4765 ret = ret * 16 + 10 + (cur - 'A'); 4766 NEXT; 4767 cur = CUR; 4768 } 4769 } else { 4770 ERROR("Char ref: expecting [0-9A-F]"); 4771 return(-1); 4772 } 4773 } else { 4774 if ((cur >= '0') && (cur <= '9')) { 4775 while ((cur >= '0') && (cur <= '9')) { 4776 ret = ret * 10 + cur - '0'; 4777 NEXT; 4778 cur = CUR; 4779 } 4780 } else { 4781 ERROR("Char ref: expecting [0-9]"); 4782 return(-1); 4783 } 4784 } 4785 if (cur != ';') { 4786 ERROR("Char ref: expecting ';'"); 4787 return(-1); 4788 } else { 4789 NEXT; 4790 } 4791 return(ret); 4792} 4793 4794/** 4795 * xmlFAParseCharRange: 4796 * @ctxt: a regexp parser context 4797 * 4798 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash 4799 * [18] seRange ::= charOrEsc '-' charOrEsc 4800 * [20] charOrEsc ::= XmlChar | SingleCharEsc 4801 * [21] XmlChar ::= [^\#x2D#x5B#x5D] 4802 * [22] XmlCharIncDash ::= [^\#x5B#x5D] 4803 */ 4804static void 4805xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) { 4806 int cur, len; 4807 int start = -1; 4808 int end = -1; 4809 4810 if ((CUR == '&') && (NXT(1) == '#')) { 4811 end = start = xmlFAParseCharRef(ctxt); 4812 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4813 XML_REGEXP_CHARVAL, start, end, NULL); 4814 return; 4815 } 4816 cur = CUR; 4817 if (cur == '\\') { 4818 NEXT; 4819 cur = CUR; 4820 switch (cur) { 4821 case 'n': start = 0xA; break; 4822 case 'r': start = 0xD; break; 4823 case 't': start = 0x9; break; 4824 case '\\': case '|': case '.': case '-': case '^': case '?': 4825 case '*': case '+': case '{': case '}': case '(': case ')': 4826 case '[': case ']': 4827 start = cur; break; 4828 default: 4829 ERROR("Invalid escape value"); 4830 return; 4831 } 4832 end = start; 4833 len = 1; 4834 } else if ((cur != 0x5B) && (cur != 0x5D)) { 4835 end = start = CUR_SCHAR(ctxt->cur, len); 4836 } else { 4837 ERROR("Expecting a char range"); 4838 return; 4839 } 4840 NEXTL(len); 4841 if (start == '-') { 4842 return; 4843 } 4844 cur = CUR; 4845 if ((cur != '-') || (NXT(1) == ']')) { 4846 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4847 XML_REGEXP_CHARVAL, start, end, NULL); 4848 return; 4849 } 4850 NEXT; 4851 cur = CUR; 4852 if (cur == '\\') { 4853 NEXT; 4854 cur = CUR; 4855 switch (cur) { 4856 case 'n': end = 0xA; break; 4857 case 'r': end = 0xD; break; 4858 case 't': end = 0x9; break; 4859 case '\\': case '|': case '.': case '-': case '^': case '?': 4860 case '*': case '+': case '{': case '}': case '(': case ')': 4861 case '[': case ']': 4862 end = cur; break; 4863 default: 4864 ERROR("Invalid escape value"); 4865 return; 4866 } 4867 len = 1; 4868 } else if ((cur != 0x5B) && (cur != 0x5D)) { 4869 end = CUR_SCHAR(ctxt->cur, len); 4870 } else { 4871 ERROR("Expecting the end of a char range"); 4872 return; 4873 } 4874 NEXTL(len); 4875 /* TODO check that the values are acceptable character ranges for XML */ 4876 if (end < start) { 4877 ERROR("End of range is before start of range"); 4878 } else { 4879 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4880 XML_REGEXP_CHARVAL, start, end, NULL); 4881 } 4882 return; 4883} 4884 4885/** 4886 * xmlFAParsePosCharGroup: 4887 * @ctxt: a regexp parser context 4888 * 4889 * [14] posCharGroup ::= ( charRange | charClassEsc )+ 4890 */ 4891static void 4892xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) { 4893 do { 4894 if ((CUR == '\\') || (CUR == '.')) { 4895 xmlFAParseCharClassEsc(ctxt); 4896 } else { 4897 xmlFAParseCharRange(ctxt); 4898 } 4899 } while ((CUR != ']') && (CUR != '^') && (CUR != '-') && 4900 (ctxt->error == 0)); 4901} 4902 4903/** 4904 * xmlFAParseCharGroup: 4905 * @ctxt: a regexp parser context 4906 * 4907 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub 4908 * [15] negCharGroup ::= '^' posCharGroup 4909 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr 4910 * [12] charClassExpr ::= '[' charGroup ']' 4911 */ 4912static void 4913xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) { 4914 int n = ctxt->neg; 4915 while ((CUR != ']') && (ctxt->error == 0)) { 4916 if (CUR == '^') { 4917 int neg = ctxt->neg; 4918 4919 NEXT; 4920 ctxt->neg = !ctxt->neg; 4921 xmlFAParsePosCharGroup(ctxt); 4922 ctxt->neg = neg; 4923 } else if ((CUR == '-') && (NXT(1) == '[')) { 4924 int neg = ctxt->neg; 4925 ctxt->neg = 2; 4926 NEXT; /* eat the '-' */ 4927 NEXT; /* eat the '[' */ 4928 xmlFAParseCharGroup(ctxt); 4929 if (CUR == ']') { 4930 NEXT; 4931 } else { 4932 ERROR("charClassExpr: ']' expected"); 4933 break; 4934 } 4935 ctxt->neg = neg; 4936 break; 4937 } else if (CUR != ']') { 4938 xmlFAParsePosCharGroup(ctxt); 4939 } 4940 } 4941 ctxt->neg = n; 4942} 4943 4944/** 4945 * xmlFAParseCharClass: 4946 * @ctxt: a regexp parser context 4947 * 4948 * [11] charClass ::= charClassEsc | charClassExpr 4949 * [12] charClassExpr ::= '[' charGroup ']' 4950 */ 4951static void 4952xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) { 4953 if (CUR == '[') { 4954 NEXT; 4955 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES); 4956 if (ctxt->atom == NULL) 4957 return; 4958 xmlFAParseCharGroup(ctxt); 4959 if (CUR == ']') { 4960 NEXT; 4961 } else { 4962 ERROR("xmlFAParseCharClass: ']' expected"); 4963 } 4964 } else { 4965 xmlFAParseCharClassEsc(ctxt); 4966 } 4967} 4968 4969/** 4970 * xmlFAParseQuantExact: 4971 * @ctxt: a regexp parser context 4972 * 4973 * [8] QuantExact ::= [0-9]+ 4974 * 4975 * Returns 0 if success or -1 in case of error 4976 */ 4977static int 4978xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) { 4979 int ret = 0; 4980 int ok = 0; 4981 4982 while ((CUR >= '0') && (CUR <= '9')) { 4983 ret = ret * 10 + (CUR - '0'); 4984 ok = 1; 4985 NEXT; 4986 } 4987 if (ok != 1) { 4988 return(-1); 4989 } 4990 return(ret); 4991} 4992 4993/** 4994 * xmlFAParseQuantifier: 4995 * @ctxt: a regexp parser context 4996 * 4997 * [4] quantifier ::= [?*+] | ( '{' quantity '}' ) 4998 * [5] quantity ::= quantRange | quantMin | QuantExact 4999 * [6] quantRange ::= QuantExact ',' QuantExact 5000 * [7] quantMin ::= QuantExact ',' 5001 * [8] QuantExact ::= [0-9]+ 5002 */ 5003static int 5004xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) { 5005 int cur; 5006 5007 cur = CUR; 5008 if ((cur == '?') || (cur == '*') || (cur == '+')) { 5009 if (ctxt->atom != NULL) { 5010 if (cur == '?') 5011 ctxt->atom->quant = XML_REGEXP_QUANT_OPT; 5012 else if (cur == '*') 5013 ctxt->atom->quant = XML_REGEXP_QUANT_MULT; 5014 else if (cur == '+') 5015 ctxt->atom->quant = XML_REGEXP_QUANT_PLUS; 5016 } 5017 NEXT; 5018 return(1); 5019 } 5020 if (cur == '{') { 5021 int min = 0, max = 0; 5022 5023 NEXT; 5024 cur = xmlFAParseQuantExact(ctxt); 5025 if (cur >= 0) 5026 min = cur; 5027 if (CUR == ',') { 5028 NEXT; 5029 if (CUR == '}') 5030 max = INT_MAX; 5031 else { 5032 cur = xmlFAParseQuantExact(ctxt); 5033 if (cur >= 0) 5034 max = cur; 5035 else { 5036 ERROR("Improper quantifier"); 5037 } 5038 } 5039 } 5040 if (CUR == '}') { 5041 NEXT; 5042 } else { 5043 ERROR("Unterminated quantifier"); 5044 } 5045 if (max == 0) 5046 max = min; 5047 if (ctxt->atom != NULL) { 5048 ctxt->atom->quant = XML_REGEXP_QUANT_RANGE; 5049 ctxt->atom->min = min; 5050 ctxt->atom->max = max; 5051 } 5052 return(1); 5053 } 5054 return(0); 5055} 5056 5057/** 5058 * xmlFAParseAtom: 5059 * @ctxt: a regexp parser context 5060 * 5061 * [9] atom ::= Char | charClass | ( '(' regExp ')' ) 5062 */ 5063static int 5064xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) { 5065 int codepoint, len; 5066 5067 codepoint = xmlFAIsChar(ctxt); 5068 if (codepoint > 0) { 5069 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); 5070 if (ctxt->atom == NULL) 5071 return(-1); 5072 codepoint = CUR_SCHAR(ctxt->cur, len); 5073 ctxt->atom->codepoint = codepoint; 5074 NEXTL(len); 5075 return(1); 5076 } else if (CUR == '|') { 5077 return(0); 5078 } else if (CUR == 0) { 5079 return(0); 5080 } else if (CUR == ')') { 5081 return(0); 5082 } else if (CUR == '(') { 5083 xmlRegStatePtr start, oldend; 5084 5085 NEXT; 5086 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL); 5087 start = ctxt->state; 5088 oldend = ctxt->end; 5089 ctxt->end = NULL; 5090 ctxt->atom = NULL; 5091 xmlFAParseRegExp(ctxt, 0); 5092 if (CUR == ')') { 5093 NEXT; 5094 } else { 5095 ERROR("xmlFAParseAtom: expecting ')'"); 5096 } 5097 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG); 5098 if (ctxt->atom == NULL) 5099 return(-1); 5100 ctxt->atom->start = start; 5101 ctxt->atom->stop = ctxt->state; 5102 ctxt->end = oldend; 5103 return(1); 5104 } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) { 5105 xmlFAParseCharClass(ctxt); 5106 return(1); 5107 } 5108 return(0); 5109} 5110 5111/** 5112 * xmlFAParsePiece: 5113 * @ctxt: a regexp parser context 5114 * 5115 * [3] piece ::= atom quantifier? 5116 */ 5117static int 5118xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) { 5119 int ret; 5120 5121 ctxt->atom = NULL; 5122 ret = xmlFAParseAtom(ctxt); 5123 if (ret == 0) 5124 return(0); 5125 if (ctxt->atom == NULL) { 5126 ERROR("internal: no atom generated"); 5127 } 5128 xmlFAParseQuantifier(ctxt); 5129 return(1); 5130} 5131 5132/** 5133 * xmlFAParseBranch: 5134 * @ctxt: a regexp parser context 5135 * @to: optional target to the end of the branch 5136 * 5137 * @to is used to optimize by removing duplicate path in automata 5138 * in expressions like (a|b)(c|d) 5139 * 5140 * [2] branch ::= piece* 5141 */ 5142static int 5143xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr to) { 5144 xmlRegStatePtr previous; 5145 int ret; 5146 5147 previous = ctxt->state; 5148 ret = xmlFAParsePiece(ctxt); 5149 if (ret != 0) { 5150 if (xmlFAGenerateTransitions(ctxt, previous, 5151 (CUR=='|' || CUR==')') ? to : NULL, ctxt->atom) < 0) 5152 return(-1); 5153 previous = ctxt->state; 5154 ctxt->atom = NULL; 5155 } 5156 while ((ret != 0) && (ctxt->error == 0)) { 5157 ret = xmlFAParsePiece(ctxt); 5158 if (ret != 0) { 5159 if (xmlFAGenerateTransitions(ctxt, previous, 5160 (CUR=='|' || CUR==')') ? to : NULL, ctxt->atom) < 0) 5161 return(-1); 5162 previous = ctxt->state; 5163 ctxt->atom = NULL; 5164 } 5165 } 5166 return(0); 5167} 5168 5169/** 5170 * xmlFAParseRegExp: 5171 * @ctxt: a regexp parser context 5172 * @top: is this the top-level expression ? 5173 * 5174 * [1] regExp ::= branch ( '|' branch )* 5175 */ 5176static void 5177xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) { 5178 xmlRegStatePtr start, end; 5179 5180 /* if not top start should have been generated by an epsilon trans */ 5181 start = ctxt->state; 5182 ctxt->end = NULL; 5183 xmlFAParseBranch(ctxt, NULL); 5184 if (top) { 5185#ifdef DEBUG_REGEXP_GRAPH 5186 printf("State %d is final\n", ctxt->state->no); 5187#endif 5188 ctxt->state->type = XML_REGEXP_FINAL_STATE; 5189 } 5190 if (CUR != '|') { 5191 ctxt->end = ctxt->state; 5192 return; 5193 } 5194 end = ctxt->state; 5195 while ((CUR == '|') && (ctxt->error == 0)) { 5196 NEXT; 5197 ctxt->state = start; 5198 ctxt->end = NULL; 5199 xmlFAParseBranch(ctxt, end); 5200 } 5201 if (!top) { 5202 ctxt->state = end; 5203 ctxt->end = end; 5204 } 5205} 5206 5207/************************************************************************ 5208 * * 5209 * The basic API * 5210 * * 5211 ************************************************************************/ 5212 5213/** 5214 * xmlRegexpPrint: 5215 * @output: the file for the output debug 5216 * @regexp: the compiled regexp 5217 * 5218 * Print the content of the compiled regular expression 5219 */ 5220void 5221xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) { 5222 int i; 5223 5224 if (output == NULL) 5225 return; 5226 fprintf(output, " regexp: "); 5227 if (regexp == NULL) { 5228 fprintf(output, "NULL\n"); 5229 return; 5230 } 5231 fprintf(output, "'%s' ", regexp->string); 5232 fprintf(output, "\n"); 5233 fprintf(output, "%d atoms:\n", regexp->nbAtoms); 5234 for (i = 0;i < regexp->nbAtoms; i++) { 5235 fprintf(output, " %02d ", i); 5236 xmlRegPrintAtom(output, regexp->atoms[i]); 5237 } 5238 fprintf(output, "%d states:", regexp->nbStates); 5239 fprintf(output, "\n"); 5240 for (i = 0;i < regexp->nbStates; i++) { 5241 xmlRegPrintState(output, regexp->states[i]); 5242 } 5243 fprintf(output, "%d counters:\n", regexp->nbCounters); 5244 for (i = 0;i < regexp->nbCounters; i++) { 5245 fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min, 5246 regexp->counters[i].max); 5247 } 5248} 5249 5250/** 5251 * xmlRegexpCompile: 5252 * @regexp: a regular expression string 5253 * 5254 * Parses a regular expression conforming to XML Schemas Part 2 Datatype 5255 * Appendix F and builds an automata suitable for testing strings against 5256 * that regular expression 5257 * 5258 * Returns the compiled expression or NULL in case of error 5259 */ 5260xmlRegexpPtr 5261xmlRegexpCompile(const xmlChar *regexp) { 5262 xmlRegexpPtr ret; 5263 xmlRegParserCtxtPtr ctxt; 5264 5265 ctxt = xmlRegNewParserCtxt(regexp); 5266 if (ctxt == NULL) 5267 return(NULL); 5268 5269 /* initialize the parser */ 5270 ctxt->end = NULL; 5271 ctxt->start = ctxt->state = xmlRegNewState(ctxt); 5272 xmlRegStatePush(ctxt, ctxt->start); 5273 5274 /* parse the expression building an automata */ 5275 xmlFAParseRegExp(ctxt, 1); 5276 if (CUR != 0) { 5277 ERROR("xmlFAParseRegExp: extra characters"); 5278 } 5279 ctxt->end = ctxt->state; 5280 ctxt->start->type = XML_REGEXP_START_STATE; 5281 ctxt->end->type = XML_REGEXP_FINAL_STATE; 5282 5283 /* remove the Epsilon except for counted transitions */ 5284 xmlFAEliminateEpsilonTransitions(ctxt); 5285 5286 5287 if (ctxt->error != 0) { 5288 xmlRegFreeParserCtxt(ctxt); 5289 return(NULL); 5290 } 5291 ret = xmlRegEpxFromParse(ctxt); 5292 xmlRegFreeParserCtxt(ctxt); 5293 return(ret); 5294} 5295 5296/** 5297 * xmlRegexpExec: 5298 * @comp: the compiled regular expression 5299 * @content: the value to check against the regular expression 5300 * 5301 * Check if the regular expression generates the value 5302 * 5303 * Returns 1 if it matches, 0 if not and a negative value in case of error 5304 */ 5305int 5306xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) { 5307 if ((comp == NULL) || (content == NULL)) 5308 return(-1); 5309 return(xmlFARegExec(comp, content)); 5310} 5311 5312/** 5313 * xmlRegexpIsDeterminist: 5314 * @comp: the compiled regular expression 5315 * 5316 * Check if the regular expression is determinist 5317 * 5318 * Returns 1 if it yes, 0 if not and a negative value in case of error 5319 */ 5320int 5321xmlRegexpIsDeterminist(xmlRegexpPtr comp) { 5322 xmlAutomataPtr am; 5323 int ret; 5324 5325 if (comp == NULL) 5326 return(-1); 5327 if (comp->determinist != -1) 5328 return(comp->determinist); 5329 5330 am = xmlNewAutomata(); 5331 if (am->states != NULL) { 5332 int i; 5333 5334 for (i = 0;i < am->nbStates;i++) 5335 xmlRegFreeState(am->states[i]); 5336 xmlFree(am->states); 5337 } 5338 am->nbAtoms = comp->nbAtoms; 5339 am->atoms = comp->atoms; 5340 am->nbStates = comp->nbStates; 5341 am->states = comp->states; 5342 am->determinist = -1; 5343 ret = xmlFAComputesDeterminism(am); 5344 am->atoms = NULL; 5345 am->states = NULL; 5346 xmlFreeAutomata(am); 5347 return(ret); 5348} 5349 5350/** 5351 * xmlRegFreeRegexp: 5352 * @regexp: the regexp 5353 * 5354 * Free a regexp 5355 */ 5356void 5357xmlRegFreeRegexp(xmlRegexpPtr regexp) { 5358 int i; 5359 if (regexp == NULL) 5360 return; 5361 5362 if (regexp->string != NULL) 5363 xmlFree(regexp->string); 5364 if (regexp->states != NULL) { 5365 for (i = 0;i < regexp->nbStates;i++) 5366 xmlRegFreeState(regexp->states[i]); 5367 xmlFree(regexp->states); 5368 } 5369 if (regexp->atoms != NULL) { 5370 for (i = 0;i < regexp->nbAtoms;i++) 5371 xmlRegFreeAtom(regexp->atoms[i]); 5372 xmlFree(regexp->atoms); 5373 } 5374 if (regexp->counters != NULL) 5375 xmlFree(regexp->counters); 5376 if (regexp->compact != NULL) 5377 xmlFree(regexp->compact); 5378 if (regexp->transdata != NULL) 5379 xmlFree(regexp->transdata); 5380 if (regexp->stringMap != NULL) { 5381 for (i = 0; i < regexp->nbstrings;i++) 5382 xmlFree(regexp->stringMap[i]); 5383 xmlFree(regexp->stringMap); 5384 } 5385 5386 xmlFree(regexp); 5387} 5388 5389#ifdef LIBXML_AUTOMATA_ENABLED 5390/************************************************************************ 5391 * * 5392 * The Automata interface * 5393 * * 5394 ************************************************************************/ 5395 5396/** 5397 * xmlNewAutomata: 5398 * 5399 * Create a new automata 5400 * 5401 * Returns the new object or NULL in case of failure 5402 */ 5403xmlAutomataPtr 5404xmlNewAutomata(void) { 5405 xmlAutomataPtr ctxt; 5406 5407 ctxt = xmlRegNewParserCtxt(NULL); 5408 if (ctxt == NULL) 5409 return(NULL); 5410 5411 /* initialize the parser */ 5412 ctxt->end = NULL; 5413 ctxt->start = ctxt->state = xmlRegNewState(ctxt); 5414 if (ctxt->start == NULL) { 5415 xmlFreeAutomata(ctxt); 5416 return(NULL); 5417 } 5418 ctxt->start->type = XML_REGEXP_START_STATE; 5419 if (xmlRegStatePush(ctxt, ctxt->start) < 0) { 5420 xmlRegFreeState(ctxt->start); 5421 xmlFreeAutomata(ctxt); 5422 return(NULL); 5423 } 5424 5425 return(ctxt); 5426} 5427 5428/** 5429 * xmlFreeAutomata: 5430 * @am: an automata 5431 * 5432 * Free an automata 5433 */ 5434void 5435xmlFreeAutomata(xmlAutomataPtr am) { 5436 if (am == NULL) 5437 return; 5438 xmlRegFreeParserCtxt(am); 5439} 5440 5441/** 5442 * xmlAutomataGetInitState: 5443 * @am: an automata 5444 * 5445 * Initial state lookup 5446 * 5447 * Returns the initial state of the automata 5448 */ 5449xmlAutomataStatePtr 5450xmlAutomataGetInitState(xmlAutomataPtr am) { 5451 if (am == NULL) 5452 return(NULL); 5453 return(am->start); 5454} 5455 5456/** 5457 * xmlAutomataSetFinalState: 5458 * @am: an automata 5459 * @state: a state in this automata 5460 * 5461 * Makes that state a final state 5462 * 5463 * Returns 0 or -1 in case of error 5464 */ 5465int 5466xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) { 5467 if ((am == NULL) || (state == NULL)) 5468 return(-1); 5469 state->type = XML_REGEXP_FINAL_STATE; 5470 return(0); 5471} 5472 5473/** 5474 * xmlAutomataNewTransition: 5475 * @am: an automata 5476 * @from: the starting point of the transition 5477 * @to: the target point of the transition or NULL 5478 * @token: the input string associated to that transition 5479 * @data: data passed to the callback function if the transition is activated 5480 * 5481 * If @to is NULL, this creates first a new target state in the automata 5482 * and then adds a transition from the @from state to the target state 5483 * activated by the value of @token 5484 * 5485 * Returns the target state or NULL in case of error 5486 */ 5487xmlAutomataStatePtr 5488xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from, 5489 xmlAutomataStatePtr to, const xmlChar *token, 5490 void *data) { 5491 xmlRegAtomPtr atom; 5492 5493 if ((am == NULL) || (from == NULL) || (token == NULL)) 5494 return(NULL); 5495 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5496 if (atom == NULL) 5497 return(NULL); 5498 atom->data = data; 5499 if (atom == NULL) 5500 return(NULL); 5501 atom->valuep = xmlStrdup(token); 5502 5503 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5504 xmlRegFreeAtom(atom); 5505 return(NULL); 5506 } 5507 if (to == NULL) 5508 return(am->state); 5509 return(to); 5510} 5511 5512/** 5513 * xmlAutomataNewTransition2: 5514 * @am: an automata 5515 * @from: the starting point of the transition 5516 * @to: the target point of the transition or NULL 5517 * @token: the first input string associated to that transition 5518 * @token2: the second input string associated to that transition 5519 * @data: data passed to the callback function if the transition is activated 5520 * 5521 * If @to is NULL, this creates first a new target state in the automata 5522 * and then adds a transition from the @from state to the target state 5523 * activated by the value of @token 5524 * 5525 * Returns the target state or NULL in case of error 5526 */ 5527xmlAutomataStatePtr 5528xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from, 5529 xmlAutomataStatePtr to, const xmlChar *token, 5530 const xmlChar *token2, void *data) { 5531 xmlRegAtomPtr atom; 5532 5533 if ((am == NULL) || (from == NULL) || (token == NULL)) 5534 return(NULL); 5535 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5536 if (atom == NULL) 5537 return(NULL); 5538 atom->data = data; 5539 if ((token2 == NULL) || (*token2 == 0)) { 5540 atom->valuep = xmlStrdup(token); 5541 } else { 5542 int lenn, lenp; 5543 xmlChar *str; 5544 5545 lenn = strlen((char *) token2); 5546 lenp = strlen((char *) token); 5547 5548 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5549 if (str == NULL) { 5550 xmlRegFreeAtom(atom); 5551 return(NULL); 5552 } 5553 memcpy(&str[0], token, lenp); 5554 str[lenp] = '|'; 5555 memcpy(&str[lenp + 1], token2, lenn); 5556 str[lenn + lenp + 1] = 0; 5557 5558 atom->valuep = str; 5559 } 5560 5561 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5562 xmlRegFreeAtom(atom); 5563 return(NULL); 5564 } 5565 if (to == NULL) 5566 return(am->state); 5567 return(to); 5568} 5569 5570/** 5571 * xmlAutomataNewNegTrans: 5572 * @am: an automata 5573 * @from: the starting point of the transition 5574 * @to: the target point of the transition or NULL 5575 * @token: the first input string associated to that transition 5576 * @token2: the second input string associated to that transition 5577 * @data: data passed to the callback function if the transition is activated 5578 * 5579 * If @to is NULL, this creates first a new target state in the automata 5580 * and then adds a transition from the @from state to the target state 5581 * activated by any value except (@token,@token2) 5582 * Note that if @token2 is not NULL, then (X, NULL) won't match to follow 5583 # the semantic of XSD ##other 5584 * 5585 * Returns the target state or NULL in case of error 5586 */ 5587xmlAutomataStatePtr 5588xmlAutomataNewNegTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 5589 xmlAutomataStatePtr to, const xmlChar *token, 5590 const xmlChar *token2, void *data) { 5591 xmlRegAtomPtr atom; 5592 xmlChar err_msg[200]; 5593 5594 if ((am == NULL) || (from == NULL) || (token == NULL)) 5595 return(NULL); 5596 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5597 if (atom == NULL) 5598 return(NULL); 5599 atom->data = data; 5600 atom->neg = 1; 5601 if ((token2 == NULL) || (*token2 == 0)) { 5602 atom->valuep = xmlStrdup(token); 5603 } else { 5604 int lenn, lenp; 5605 xmlChar *str; 5606 5607 lenn = strlen((char *) token2); 5608 lenp = strlen((char *) token); 5609 5610 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5611 if (str == NULL) { 5612 xmlRegFreeAtom(atom); 5613 return(NULL); 5614 } 5615 memcpy(&str[0], token, lenp); 5616 str[lenp] = '|'; 5617 memcpy(&str[lenp + 1], token2, lenn); 5618 str[lenn + lenp + 1] = 0; 5619 5620 atom->valuep = str; 5621 } 5622 snprintf((char *) err_msg, 199, "not %s", (const char *) atom->valuep); 5623 err_msg[199] = 0; 5624 atom->valuep2 = xmlStrdup(err_msg); 5625 5626 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5627 xmlRegFreeAtom(atom); 5628 return(NULL); 5629 } 5630 am->negs++; 5631 if (to == NULL) 5632 return(am->state); 5633 return(to); 5634} 5635 5636/** 5637 * xmlAutomataNewCountTrans2: 5638 * @am: an automata 5639 * @from: the starting point of the transition 5640 * @to: the target point of the transition or NULL 5641 * @token: the input string associated to that transition 5642 * @token2: the second input string associated to that transition 5643 * @min: the minimum successive occurences of token 5644 * @max: the maximum successive occurences of token 5645 * @data: data associated to the transition 5646 * 5647 * If @to is NULL, this creates first a new target state in the automata 5648 * and then adds a transition from the @from state to the target state 5649 * activated by a succession of input of value @token and @token2 and 5650 * whose number is between @min and @max 5651 * 5652 * Returns the target state or NULL in case of error 5653 */ 5654xmlAutomataStatePtr 5655xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from, 5656 xmlAutomataStatePtr to, const xmlChar *token, 5657 const xmlChar *token2, 5658 int min, int max, void *data) { 5659 xmlRegAtomPtr atom; 5660 int counter; 5661 5662 if ((am == NULL) || (from == NULL) || (token == NULL)) 5663 return(NULL); 5664 if (min < 0) 5665 return(NULL); 5666 if ((max < min) || (max < 1)) 5667 return(NULL); 5668 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5669 if (atom == NULL) 5670 return(NULL); 5671 if ((token2 == NULL) || (*token2 == 0)) { 5672 atom->valuep = xmlStrdup(token); 5673 } else { 5674 int lenn, lenp; 5675 xmlChar *str; 5676 5677 lenn = strlen((char *) token2); 5678 lenp = strlen((char *) token); 5679 5680 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5681 if (str == NULL) { 5682 xmlRegFreeAtom(atom); 5683 return(NULL); 5684 } 5685 memcpy(&str[0], token, lenp); 5686 str[lenp] = '|'; 5687 memcpy(&str[lenp + 1], token2, lenn); 5688 str[lenn + lenp + 1] = 0; 5689 5690 atom->valuep = str; 5691 } 5692 atom->data = data; 5693 if (min == 0) 5694 atom->min = 1; 5695 else 5696 atom->min = min; 5697 atom->max = max; 5698 5699 /* 5700 * associate a counter to the transition. 5701 */ 5702 counter = xmlRegGetCounter(am); 5703 am->counters[counter].min = min; 5704 am->counters[counter].max = max; 5705 5706 /* xmlFAGenerateTransitions(am, from, to, atom); */ 5707 if (to == NULL) { 5708 to = xmlRegNewState(am); 5709 xmlRegStatePush(am, to); 5710 } 5711 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 5712 xmlRegAtomPush(am, atom); 5713 am->state = to; 5714 5715 if (to == NULL) 5716 to = am->state; 5717 if (to == NULL) 5718 return(NULL); 5719 if (min == 0) 5720 xmlFAGenerateEpsilonTransition(am, from, to); 5721 return(to); 5722} 5723 5724/** 5725 * xmlAutomataNewCountTrans: 5726 * @am: an automata 5727 * @from: the starting point of the transition 5728 * @to: the target point of the transition or NULL 5729 * @token: the input string associated to that transition 5730 * @min: the minimum successive occurences of token 5731 * @max: the maximum successive occurences of token 5732 * @data: data associated to the transition 5733 * 5734 * If @to is NULL, this creates first a new target state in the automata 5735 * and then adds a transition from the @from state to the target state 5736 * activated by a succession of input of value @token and whose number 5737 * is between @min and @max 5738 * 5739 * Returns the target state or NULL in case of error 5740 */ 5741xmlAutomataStatePtr 5742xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 5743 xmlAutomataStatePtr to, const xmlChar *token, 5744 int min, int max, void *data) { 5745 xmlRegAtomPtr atom; 5746 int counter; 5747 5748 if ((am == NULL) || (from == NULL) || (token == NULL)) 5749 return(NULL); 5750 if (min < 0) 5751 return(NULL); 5752 if ((max < min) || (max < 1)) 5753 return(NULL); 5754 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5755 if (atom == NULL) 5756 return(NULL); 5757 atom->valuep = xmlStrdup(token); 5758 atom->data = data; 5759 if (min == 0) 5760 atom->min = 1; 5761 else 5762 atom->min = min; 5763 atom->max = max; 5764 5765 /* 5766 * associate a counter to the transition. 5767 */ 5768 counter = xmlRegGetCounter(am); 5769 am->counters[counter].min = min; 5770 am->counters[counter].max = max; 5771 5772 /* xmlFAGenerateTransitions(am, from, to, atom); */ 5773 if (to == NULL) { 5774 to = xmlRegNewState(am); 5775 xmlRegStatePush(am, to); 5776 } 5777 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 5778 xmlRegAtomPush(am, atom); 5779 am->state = to; 5780 5781 if (to == NULL) 5782 to = am->state; 5783 if (to == NULL) 5784 return(NULL); 5785 if (min == 0) 5786 xmlFAGenerateEpsilonTransition(am, from, to); 5787 return(to); 5788} 5789 5790/** 5791 * xmlAutomataNewOnceTrans2: 5792 * @am: an automata 5793 * @from: the starting point of the transition 5794 * @to: the target point of the transition or NULL 5795 * @token: the input string associated to that transition 5796 * @token2: the second input string associated to that transition 5797 * @min: the minimum successive occurences of token 5798 * @max: the maximum successive occurences of token 5799 * @data: data associated to the transition 5800 * 5801 * If @to is NULL, this creates first a new target state in the automata 5802 * and then adds a transition from the @from state to the target state 5803 * activated by a succession of input of value @token and @token2 and whose 5804 * number is between @min and @max, moreover that transition can only be 5805 * crossed once. 5806 * 5807 * Returns the target state or NULL in case of error 5808 */ 5809xmlAutomataStatePtr 5810xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from, 5811 xmlAutomataStatePtr to, const xmlChar *token, 5812 const xmlChar *token2, 5813 int min, int max, void *data) { 5814 xmlRegAtomPtr atom; 5815 int counter; 5816 5817 if ((am == NULL) || (from == NULL) || (token == NULL)) 5818 return(NULL); 5819 if (min < 1) 5820 return(NULL); 5821 if ((max < min) || (max < 1)) 5822 return(NULL); 5823 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5824 if (atom == NULL) 5825 return(NULL); 5826 if ((token2 == NULL) || (*token2 == 0)) { 5827 atom->valuep = xmlStrdup(token); 5828 } else { 5829 int lenn, lenp; 5830 xmlChar *str; 5831 5832 lenn = strlen((char *) token2); 5833 lenp = strlen((char *) token); 5834 5835 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5836 if (str == NULL) { 5837 xmlRegFreeAtom(atom); 5838 return(NULL); 5839 } 5840 memcpy(&str[0], token, lenp); 5841 str[lenp] = '|'; 5842 memcpy(&str[lenp + 1], token2, lenn); 5843 str[lenn + lenp + 1] = 0; 5844 5845 atom->valuep = str; 5846 } 5847 atom->data = data; 5848 atom->quant = XML_REGEXP_QUANT_ONCEONLY; 5849 atom->min = min; 5850 atom->max = max; 5851 /* 5852 * associate a counter to the transition. 5853 */ 5854 counter = xmlRegGetCounter(am); 5855 am->counters[counter].min = 1; 5856 am->counters[counter].max = 1; 5857 5858 /* xmlFAGenerateTransitions(am, from, to, atom); */ 5859 if (to == NULL) { 5860 to = xmlRegNewState(am); 5861 xmlRegStatePush(am, to); 5862 } 5863 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 5864 xmlRegAtomPush(am, atom); 5865 am->state = to; 5866 return(to); 5867} 5868 5869 5870 5871/** 5872 * xmlAutomataNewOnceTrans: 5873 * @am: an automata 5874 * @from: the starting point of the transition 5875 * @to: the target point of the transition or NULL 5876 * @token: the input string associated to that transition 5877 * @min: the minimum successive occurences of token 5878 * @max: the maximum successive occurences of token 5879 * @data: data associated to the transition 5880 * 5881 * If @to is NULL, this creates first a new target state in the automata 5882 * and then adds a transition from the @from state to the target state 5883 * activated by a succession of input of value @token and whose number 5884 * is between @min and @max, moreover that transition can only be crossed 5885 * once. 5886 * 5887 * Returns the target state or NULL in case of error 5888 */ 5889xmlAutomataStatePtr 5890xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 5891 xmlAutomataStatePtr to, const xmlChar *token, 5892 int min, int max, void *data) { 5893 xmlRegAtomPtr atom; 5894 int counter; 5895 5896 if ((am == NULL) || (from == NULL) || (token == NULL)) 5897 return(NULL); 5898 if (min < 1) 5899 return(NULL); 5900 if ((max < min) || (max < 1)) 5901 return(NULL); 5902 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5903 if (atom == NULL) 5904 return(NULL); 5905 atom->valuep = xmlStrdup(token); 5906 atom->data = data; 5907 atom->quant = XML_REGEXP_QUANT_ONCEONLY; 5908 atom->min = min; 5909 atom->max = max; 5910 /* 5911 * associate a counter to the transition. 5912 */ 5913 counter = xmlRegGetCounter(am); 5914 am->counters[counter].min = 1; 5915 am->counters[counter].max = 1; 5916 5917 /* xmlFAGenerateTransitions(am, from, to, atom); */ 5918 if (to == NULL) { 5919 to = xmlRegNewState(am); 5920 xmlRegStatePush(am, to); 5921 } 5922 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 5923 xmlRegAtomPush(am, atom); 5924 am->state = to; 5925 return(to); 5926} 5927 5928/** 5929 * xmlAutomataNewState: 5930 * @am: an automata 5931 * 5932 * Create a new disconnected state in the automata 5933 * 5934 * Returns the new state or NULL in case of error 5935 */ 5936xmlAutomataStatePtr 5937xmlAutomataNewState(xmlAutomataPtr am) { 5938 xmlAutomataStatePtr to; 5939 5940 if (am == NULL) 5941 return(NULL); 5942 to = xmlRegNewState(am); 5943 xmlRegStatePush(am, to); 5944 return(to); 5945} 5946 5947/** 5948 * xmlAutomataNewEpsilon: 5949 * @am: an automata 5950 * @from: the starting point of the transition 5951 * @to: the target point of the transition or NULL 5952 * 5953 * If @to is NULL, this creates first a new target state in the automata 5954 * and then adds an epsilon transition from the @from state to the 5955 * target state 5956 * 5957 * Returns the target state or NULL in case of error 5958 */ 5959xmlAutomataStatePtr 5960xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from, 5961 xmlAutomataStatePtr to) { 5962 if ((am == NULL) || (from == NULL)) 5963 return(NULL); 5964 xmlFAGenerateEpsilonTransition(am, from, to); 5965 if (to == NULL) 5966 return(am->state); 5967 return(to); 5968} 5969 5970/** 5971 * xmlAutomataNewAllTrans: 5972 * @am: an automata 5973 * @from: the starting point of the transition 5974 * @to: the target point of the transition or NULL 5975 * @lax: allow to transition if not all all transitions have been activated 5976 * 5977 * If @to is NULL, this creates first a new target state in the automata 5978 * and then adds a an ALL transition from the @from state to the 5979 * target state. That transition is an epsilon transition allowed only when 5980 * all transitions from the @from node have been activated. 5981 * 5982 * Returns the target state or NULL in case of error 5983 */ 5984xmlAutomataStatePtr 5985xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 5986 xmlAutomataStatePtr to, int lax) { 5987 if ((am == NULL) || (from == NULL)) 5988 return(NULL); 5989 xmlFAGenerateAllTransition(am, from, to, lax); 5990 if (to == NULL) 5991 return(am->state); 5992 return(to); 5993} 5994 5995/** 5996 * xmlAutomataNewCounter: 5997 * @am: an automata 5998 * @min: the minimal value on the counter 5999 * @max: the maximal value on the counter 6000 * 6001 * Create a new counter 6002 * 6003 * Returns the counter number or -1 in case of error 6004 */ 6005int 6006xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) { 6007 int ret; 6008 6009 if (am == NULL) 6010 return(-1); 6011 6012 ret = xmlRegGetCounter(am); 6013 if (ret < 0) 6014 return(-1); 6015 am->counters[ret].min = min; 6016 am->counters[ret].max = max; 6017 return(ret); 6018} 6019 6020/** 6021 * xmlAutomataNewCountedTrans: 6022 * @am: an automata 6023 * @from: the starting point of the transition 6024 * @to: the target point of the transition or NULL 6025 * @counter: the counter associated to that transition 6026 * 6027 * If @to is NULL, this creates first a new target state in the automata 6028 * and then adds an epsilon transition from the @from state to the target state 6029 * which will increment the counter provided 6030 * 6031 * Returns the target state or NULL in case of error 6032 */ 6033xmlAutomataStatePtr 6034xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6035 xmlAutomataStatePtr to, int counter) { 6036 if ((am == NULL) || (from == NULL) || (counter < 0)) 6037 return(NULL); 6038 xmlFAGenerateCountedEpsilonTransition(am, from, to, counter); 6039 if (to == NULL) 6040 return(am->state); 6041 return(to); 6042} 6043 6044/** 6045 * xmlAutomataNewCounterTrans: 6046 * @am: an automata 6047 * @from: the starting point of the transition 6048 * @to: the target point of the transition or NULL 6049 * @counter: the counter associated to that transition 6050 * 6051 * If @to is NULL, this creates first a new target state in the automata 6052 * and then adds an epsilon transition from the @from state to the target state 6053 * which will be allowed only if the counter is within the right range. 6054 * 6055 * Returns the target state or NULL in case of error 6056 */ 6057xmlAutomataStatePtr 6058xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6059 xmlAutomataStatePtr to, int counter) { 6060 if ((am == NULL) || (from == NULL) || (counter < 0)) 6061 return(NULL); 6062 xmlFAGenerateCountedTransition(am, from, to, counter); 6063 if (to == NULL) 6064 return(am->state); 6065 return(to); 6066} 6067 6068/** 6069 * xmlAutomataCompile: 6070 * @am: an automata 6071 * 6072 * Compile the automata into a Reg Exp ready for being executed. 6073 * The automata should be free after this point. 6074 * 6075 * Returns the compiled regexp or NULL in case of error 6076 */ 6077xmlRegexpPtr 6078xmlAutomataCompile(xmlAutomataPtr am) { 6079 xmlRegexpPtr ret; 6080 6081 if ((am == NULL) || (am->error != 0)) return(NULL); 6082 xmlFAEliminateEpsilonTransitions(am); 6083 /* xmlFAComputesDeterminism(am); */ 6084 ret = xmlRegEpxFromParse(am); 6085 6086 return(ret); 6087} 6088 6089/** 6090 * xmlAutomataIsDeterminist: 6091 * @am: an automata 6092 * 6093 * Checks if an automata is determinist. 6094 * 6095 * Returns 1 if true, 0 if not, and -1 in case of error 6096 */ 6097int 6098xmlAutomataIsDeterminist(xmlAutomataPtr am) { 6099 int ret; 6100 6101 if (am == NULL) 6102 return(-1); 6103 6104 ret = xmlFAComputesDeterminism(am); 6105 return(ret); 6106} 6107#endif /* LIBXML_AUTOMATA_ENABLED */ 6108 6109#ifdef LIBXML_EXPR_ENABLED 6110/************************************************************************ 6111 * * 6112 * Formal Expression handling code * 6113 * * 6114 ************************************************************************/ 6115/************************************************************************ 6116 * * 6117 * Expression handling context * 6118 * * 6119 ************************************************************************/ 6120 6121struct _xmlExpCtxt { 6122 xmlDictPtr dict; 6123 xmlExpNodePtr *table; 6124 int size; 6125 int nbElems; 6126 int nb_nodes; 6127 const char *expr; 6128 const char *cur; 6129 int nb_cons; 6130 int tabSize; 6131}; 6132 6133/** 6134 * xmlExpNewCtxt: 6135 * @maxNodes: the maximum number of nodes 6136 * @dict: optional dictionnary to use internally 6137 * 6138 * Creates a new context for manipulating expressions 6139 * 6140 * Returns the context or NULL in case of error 6141 */ 6142xmlExpCtxtPtr 6143xmlExpNewCtxt(int maxNodes, xmlDictPtr dict) { 6144 xmlExpCtxtPtr ret; 6145 int size = 256; 6146 6147 if (maxNodes <= 4096) 6148 maxNodes = 4096; 6149 6150 ret = (xmlExpCtxtPtr) xmlMalloc(sizeof(xmlExpCtxt)); 6151 if (ret == NULL) 6152 return(NULL); 6153 memset(ret, 0, sizeof(xmlExpCtxt)); 6154 ret->size = size; 6155 ret->nbElems = 0; 6156 ret->table = xmlMalloc(size * sizeof(xmlExpNodePtr)); 6157 if (ret->table == NULL) { 6158 xmlFree(ret); 6159 return(NULL); 6160 } 6161 memset(ret->table, 0, size * sizeof(xmlExpNodePtr)); 6162 if (dict == NULL) { 6163 ret->dict = xmlDictCreate(); 6164 if (ret->dict == NULL) { 6165 xmlFree(ret->table); 6166 xmlFree(ret); 6167 return(NULL); 6168 } 6169 } else { 6170 ret->dict = dict; 6171 xmlDictReference(ret->dict); 6172 } 6173 return(ret); 6174} 6175 6176/** 6177 * xmlExpFreeCtxt: 6178 * @ctxt: an expression context 6179 * 6180 * Free an expression context 6181 */ 6182void 6183xmlExpFreeCtxt(xmlExpCtxtPtr ctxt) { 6184 if (ctxt == NULL) 6185 return; 6186 xmlDictFree(ctxt->dict); 6187 if (ctxt->table != NULL) 6188 xmlFree(ctxt->table); 6189 xmlFree(ctxt); 6190} 6191 6192/************************************************************************ 6193 * * 6194 * Structure associated to an expression node * 6195 * * 6196 ************************************************************************/ 6197#define MAX_NODES 10000 6198 6199/* #define DEBUG_DERIV */ 6200 6201/* 6202 * TODO: 6203 * - Wildcards 6204 * - public API for creation 6205 * 6206 * Started 6207 * - regression testing 6208 * 6209 * Done 6210 * - split into module and test tool 6211 * - memleaks 6212 */ 6213 6214typedef enum { 6215 XML_EXP_NILABLE = (1 << 0) 6216} xmlExpNodeInfo; 6217 6218#define IS_NILLABLE(node) ((node)->info & XML_EXP_NILABLE) 6219 6220struct _xmlExpNode { 6221 unsigned char type;/* xmlExpNodeType */ 6222 unsigned char info;/* OR of xmlExpNodeInfo */ 6223 unsigned short key; /* the hash key */ 6224 unsigned int ref; /* The number of references */ 6225 int c_max; /* the maximum length it can consume */ 6226 xmlExpNodePtr exp_left; 6227 xmlExpNodePtr next;/* the next node in the hash table or free list */ 6228 union { 6229 struct { 6230 int f_min; 6231 int f_max; 6232 } count; 6233 struct { 6234 xmlExpNodePtr f_right; 6235 } children; 6236 const xmlChar *f_str; 6237 } field; 6238}; 6239 6240#define exp_min field.count.f_min 6241#define exp_max field.count.f_max 6242/* #define exp_left field.children.f_left */ 6243#define exp_right field.children.f_right 6244#define exp_str field.f_str 6245 6246static xmlExpNodePtr xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type); 6247static xmlExpNode forbiddenExpNode = { 6248 XML_EXP_FORBID, 0, 0, 0, 0, NULL, NULL, {{ 0, 0}} 6249}; 6250xmlExpNodePtr forbiddenExp = &forbiddenExpNode; 6251static xmlExpNode emptyExpNode = { 6252 XML_EXP_EMPTY, 1, 0, 0, 0, NULL, NULL, {{ 0, 0}} 6253}; 6254xmlExpNodePtr emptyExp = &emptyExpNode; 6255 6256/************************************************************************ 6257 * * 6258 * The custom hash table for unicity and canonicalization * 6259 * of sub-expressions pointers * 6260 * * 6261 ************************************************************************/ 6262/* 6263 * xmlExpHashNameComputeKey: 6264 * Calculate the hash key for a token 6265 */ 6266static unsigned short 6267xmlExpHashNameComputeKey(const xmlChar *name) { 6268 unsigned short value = 0L; 6269 char ch; 6270 6271 if (name != NULL) { 6272 value += 30 * (*name); 6273 while ((ch = *name++) != 0) { 6274 value = value ^ ((value << 5) + (value >> 3) + (unsigned long)ch); 6275 } 6276 } 6277 return (value); 6278} 6279 6280/* 6281 * xmlExpHashComputeKey: 6282 * Calculate the hash key for a compound expression 6283 */ 6284static unsigned short 6285xmlExpHashComputeKey(xmlExpNodeType type, xmlExpNodePtr left, 6286 xmlExpNodePtr right) { 6287 unsigned long value; 6288 unsigned short ret; 6289 6290 switch (type) { 6291 case XML_EXP_SEQ: 6292 value = left->key; 6293 value += right->key; 6294 value *= 3; 6295 ret = (unsigned short) value; 6296 break; 6297 case XML_EXP_OR: 6298 value = left->key; 6299 value += right->key; 6300 value *= 7; 6301 ret = (unsigned short) value; 6302 break; 6303 case XML_EXP_COUNT: 6304 value = left->key; 6305 value += right->key; 6306 ret = (unsigned short) value; 6307 break; 6308 default: 6309 ret = 0; 6310 } 6311 return(ret); 6312} 6313 6314 6315static xmlExpNodePtr 6316xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type) { 6317 xmlExpNodePtr ret; 6318 6319 if (ctxt->nb_nodes >= MAX_NODES) 6320 return(NULL); 6321 ret = (xmlExpNodePtr) xmlMalloc(sizeof(xmlExpNode)); 6322 if (ret == NULL) 6323 return(NULL); 6324 memset(ret, 0, sizeof(xmlExpNode)); 6325 ret->type = type; 6326 ret->next = NULL; 6327 ctxt->nb_nodes++; 6328 ctxt->nb_cons++; 6329 return(ret); 6330} 6331 6332/** 6333 * xmlExpHashGetEntry: 6334 * @table: the hash table 6335 * 6336 * Get the unique entry from the hash table. The entry is created if 6337 * needed. @left and @right are consumed, i.e. their ref count will 6338 * be decremented by the operation. 6339 * 6340 * Returns the pointer or NULL in case of error 6341 */ 6342static xmlExpNodePtr 6343xmlExpHashGetEntry(xmlExpCtxtPtr ctxt, xmlExpNodeType type, 6344 xmlExpNodePtr left, xmlExpNodePtr right, 6345 const xmlChar *name, int min, int max) { 6346 unsigned short kbase, key; 6347 xmlExpNodePtr entry; 6348 xmlExpNodePtr insert; 6349 6350 if (ctxt == NULL) 6351 return(NULL); 6352 6353 /* 6354 * Check for duplicate and insertion location. 6355 */ 6356 if (type == XML_EXP_ATOM) { 6357 kbase = xmlExpHashNameComputeKey(name); 6358 } else if (type == XML_EXP_COUNT) { 6359 /* COUNT reduction rule 1 */ 6360 /* a{1} -> a */ 6361 if (min == max) { 6362 if (min == 1) { 6363 return(left); 6364 } 6365 if (min == 0) { 6366 xmlExpFree(ctxt, left); 6367 return(emptyExp); 6368 } 6369 } 6370 if (min < 0) { 6371 xmlExpFree(ctxt, left); 6372 return(forbiddenExp); 6373 } 6374 if (max == -1) 6375 kbase = min + 79; 6376 else 6377 kbase = max - min; 6378 kbase += left->key; 6379 } else if (type == XML_EXP_OR) { 6380 /* Forbid reduction rules */ 6381 if (left->type == XML_EXP_FORBID) { 6382 xmlExpFree(ctxt, left); 6383 return(right); 6384 } 6385 if (right->type == XML_EXP_FORBID) { 6386 xmlExpFree(ctxt, right); 6387 return(left); 6388 } 6389 6390 /* OR reduction rule 1 */ 6391 /* a | a reduced to a */ 6392 if (left == right) { 6393 left->ref--; 6394 return(left); 6395 } 6396 /* OR canonicalization rule 1 */ 6397 /* linearize (a | b) | c into a | (b | c) */ 6398 if ((left->type == XML_EXP_OR) && (right->type != XML_EXP_OR)) { 6399 xmlExpNodePtr tmp = left; 6400 left = right; 6401 right = tmp; 6402 } 6403 /* OR reduction rule 2 */ 6404 /* a | (a | b) and b | (a | b) are reduced to a | b */ 6405 if (right->type == XML_EXP_OR) { 6406 if ((left == right->exp_left) || 6407 (left == right->exp_right)) { 6408 xmlExpFree(ctxt, left); 6409 return(right); 6410 } 6411 } 6412 /* OR canonicalization rule 2 */ 6413 /* linearize (a | b) | c into a | (b | c) */ 6414 if (left->type == XML_EXP_OR) { 6415 xmlExpNodePtr tmp; 6416 6417 /* OR canonicalization rule 2 */ 6418 if ((left->exp_right->type != XML_EXP_OR) && 6419 (left->exp_right->key < left->exp_left->key)) { 6420 tmp = left->exp_right; 6421 left->exp_right = left->exp_left; 6422 left->exp_left = tmp; 6423 } 6424 left->exp_right->ref++; 6425 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_right, right, 6426 NULL, 0, 0); 6427 left->exp_left->ref++; 6428 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_left, tmp, 6429 NULL, 0, 0); 6430 6431 xmlExpFree(ctxt, left); 6432 return(tmp); 6433 } 6434 if (right->type == XML_EXP_OR) { 6435 /* Ordering in the tree */ 6436 /* C | (A | B) -> A | (B | C) */ 6437 if (left->key > right->exp_right->key) { 6438 xmlExpNodePtr tmp; 6439 right->exp_right->ref++; 6440 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_right, 6441 left, NULL, 0, 0); 6442 right->exp_left->ref++; 6443 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left, 6444 tmp, NULL, 0, 0); 6445 xmlExpFree(ctxt, right); 6446 return(tmp); 6447 } 6448 /* Ordering in the tree */ 6449 /* B | (A | C) -> A | (B | C) */ 6450 if (left->key > right->exp_left->key) { 6451 xmlExpNodePtr tmp; 6452 right->exp_right->ref++; 6453 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, 6454 right->exp_right, NULL, 0, 0); 6455 right->exp_left->ref++; 6456 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left, 6457 tmp, NULL, 0, 0); 6458 xmlExpFree(ctxt, right); 6459 return(tmp); 6460 } 6461 } 6462 /* we know both types are != XML_EXP_OR here */ 6463 else if (left->key > right->key) { 6464 xmlExpNodePtr tmp = left; 6465 left = right; 6466 right = tmp; 6467 } 6468 kbase = xmlExpHashComputeKey(type, left, right); 6469 } else if (type == XML_EXP_SEQ) { 6470 /* Forbid reduction rules */ 6471 if (left->type == XML_EXP_FORBID) { 6472 xmlExpFree(ctxt, right); 6473 return(left); 6474 } 6475 if (right->type == XML_EXP_FORBID) { 6476 xmlExpFree(ctxt, left); 6477 return(right); 6478 } 6479 /* Empty reduction rules */ 6480 if (right->type == XML_EXP_EMPTY) { 6481 return(left); 6482 } 6483 if (left->type == XML_EXP_EMPTY) { 6484 return(right); 6485 } 6486 kbase = xmlExpHashComputeKey(type, left, right); 6487 } else 6488 return(NULL); 6489 6490 key = kbase % ctxt->size; 6491 if (ctxt->table[key] != NULL) { 6492 for (insert = ctxt->table[key]; insert != NULL; 6493 insert = insert->next) { 6494 if ((insert->key == kbase) && 6495 (insert->type == type)) { 6496 if (type == XML_EXP_ATOM) { 6497 if (name == insert->exp_str) { 6498 insert->ref++; 6499 return(insert); 6500 } 6501 } else if (type == XML_EXP_COUNT) { 6502 if ((insert->exp_min == min) && (insert->exp_max == max) && 6503 (insert->exp_left == left)) { 6504 insert->ref++; 6505 left->ref--; 6506 return(insert); 6507 } 6508 } else if ((insert->exp_left == left) && 6509 (insert->exp_right == right)) { 6510 insert->ref++; 6511 left->ref--; 6512 right->ref--; 6513 return(insert); 6514 } 6515 } 6516 } 6517 } 6518 6519 entry = xmlExpNewNode(ctxt, type); 6520 if (entry == NULL) 6521 return(NULL); 6522 entry->key = kbase; 6523 if (type == XML_EXP_ATOM) { 6524 entry->exp_str = name; 6525 entry->c_max = 1; 6526 } else if (type == XML_EXP_COUNT) { 6527 entry->exp_min = min; 6528 entry->exp_max = max; 6529 entry->exp_left = left; 6530 if ((min == 0) || (IS_NILLABLE(left))) 6531 entry->info |= XML_EXP_NILABLE; 6532 if (max < 0) 6533 entry->c_max = -1; 6534 else 6535 entry->c_max = max * entry->exp_left->c_max; 6536 } else { 6537 entry->exp_left = left; 6538 entry->exp_right = right; 6539 if (type == XML_EXP_OR) { 6540 if ((IS_NILLABLE(left)) || (IS_NILLABLE(right))) 6541 entry->info |= XML_EXP_NILABLE; 6542 if ((entry->exp_left->c_max == -1) || 6543 (entry->exp_right->c_max == -1)) 6544 entry->c_max = -1; 6545 else if (entry->exp_left->c_max > entry->exp_right->c_max) 6546 entry->c_max = entry->exp_left->c_max; 6547 else 6548 entry->c_max = entry->exp_right->c_max; 6549 } else { 6550 if ((IS_NILLABLE(left)) && (IS_NILLABLE(right))) 6551 entry->info |= XML_EXP_NILABLE; 6552 if ((entry->exp_left->c_max == -1) || 6553 (entry->exp_right->c_max == -1)) 6554 entry->c_max = -1; 6555 else 6556 entry->c_max = entry->exp_left->c_max + entry->exp_right->c_max; 6557 } 6558 } 6559 entry->ref = 1; 6560 if (ctxt->table[key] != NULL) 6561 entry->next = ctxt->table[key]; 6562 6563 ctxt->table[key] = entry; 6564 ctxt->nbElems++; 6565 6566 return(entry); 6567} 6568 6569/** 6570 * xmlExpFree: 6571 * @ctxt: the expression context 6572 * @exp: the expression 6573 * 6574 * Dereference the expression 6575 */ 6576void 6577xmlExpFree(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp) { 6578 if ((exp == NULL) || (exp == forbiddenExp) || (exp == emptyExp)) 6579 return; 6580 exp->ref--; 6581 if (exp->ref == 0) { 6582 unsigned short key; 6583 6584 /* Unlink it first from the hash table */ 6585 key = exp->key % ctxt->size; 6586 if (ctxt->table[key] == exp) { 6587 ctxt->table[key] = exp->next; 6588 } else { 6589 xmlExpNodePtr tmp; 6590 6591 tmp = ctxt->table[key]; 6592 while (tmp != NULL) { 6593 if (tmp->next == exp) { 6594 tmp->next = exp->next; 6595 break; 6596 } 6597 tmp = tmp->next; 6598 } 6599 } 6600 6601 if ((exp->type == XML_EXP_SEQ) || (exp->type == XML_EXP_OR)) { 6602 xmlExpFree(ctxt, exp->exp_left); 6603 xmlExpFree(ctxt, exp->exp_right); 6604 } else if (exp->type == XML_EXP_COUNT) { 6605 xmlExpFree(ctxt, exp->exp_left); 6606 } 6607 xmlFree(exp); 6608 ctxt->nb_nodes--; 6609 } 6610} 6611 6612/** 6613 * xmlExpRef: 6614 * @exp: the expression 6615 * 6616 * Increase the reference count of the expression 6617 */ 6618void 6619xmlExpRef(xmlExpNodePtr exp) { 6620 if (exp != NULL) 6621 exp->ref++; 6622} 6623 6624/** 6625 * xmlExpNewAtom: 6626 * @ctxt: the expression context 6627 * @name: the atom name 6628 * @len: the atom name lenght in byte (or -1); 6629 * 6630 * Get the atom associated to this name from that context 6631 * 6632 * Returns the node or NULL in case of error 6633 */ 6634xmlExpNodePtr 6635xmlExpNewAtom(xmlExpCtxtPtr ctxt, const xmlChar *name, int len) { 6636 if ((ctxt == NULL) || (name == NULL)) 6637 return(NULL); 6638 name = xmlDictLookup(ctxt->dict, name, len); 6639 if (name == NULL) 6640 return(NULL); 6641 return(xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, name, 0, 0)); 6642} 6643 6644/** 6645 * xmlExpNewOr: 6646 * @ctxt: the expression context 6647 * @left: left expression 6648 * @right: right expression 6649 * 6650 * Get the atom associated to the choice @left | @right 6651 * Note that @left and @right are consumed in the operation, to keep 6652 * an handle on them use xmlExpRef() and use xmlExpFree() to release them, 6653 * this is true even in case of failure (unless ctxt == NULL). 6654 * 6655 * Returns the node or NULL in case of error 6656 */ 6657xmlExpNodePtr 6658xmlExpNewOr(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) { 6659 if (ctxt == NULL) 6660 return(NULL); 6661 if ((left == NULL) || (right == NULL)) { 6662 xmlExpFree(ctxt, left); 6663 xmlExpFree(ctxt, right); 6664 return(NULL); 6665 } 6666 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, right, NULL, 0, 0)); 6667} 6668 6669/** 6670 * xmlExpNewSeq: 6671 * @ctxt: the expression context 6672 * @left: left expression 6673 * @right: right expression 6674 * 6675 * Get the atom associated to the sequence @left , @right 6676 * Note that @left and @right are consumed in the operation, to keep 6677 * an handle on them use xmlExpRef() and use xmlExpFree() to release them, 6678 * this is true even in case of failure (unless ctxt == NULL). 6679 * 6680 * Returns the node or NULL in case of error 6681 */ 6682xmlExpNodePtr 6683xmlExpNewSeq(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) { 6684 if (ctxt == NULL) 6685 return(NULL); 6686 if ((left == NULL) || (right == NULL)) { 6687 xmlExpFree(ctxt, left); 6688 xmlExpFree(ctxt, right); 6689 return(NULL); 6690 } 6691 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, left, right, NULL, 0, 0)); 6692} 6693 6694/** 6695 * xmlExpNewRange: 6696 * @ctxt: the expression context 6697 * @subset: the expression to be repeated 6698 * @min: the lower bound for the repetition 6699 * @max: the upper bound for the repetition, -1 means infinite 6700 * 6701 * Get the atom associated to the range (@subset){@min, @max} 6702 * Note that @subset is consumed in the operation, to keep 6703 * an handle on it use xmlExpRef() and use xmlExpFree() to release it, 6704 * this is true even in case of failure (unless ctxt == NULL). 6705 * 6706 * Returns the node or NULL in case of error 6707 */ 6708xmlExpNodePtr 6709xmlExpNewRange(xmlExpCtxtPtr ctxt, xmlExpNodePtr subset, int min, int max) { 6710 if (ctxt == NULL) 6711 return(NULL); 6712 if ((subset == NULL) || (min < 0) || (max < -1) || 6713 ((max >= 0) && (min > max))) { 6714 xmlExpFree(ctxt, subset); 6715 return(NULL); 6716 } 6717 return(xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, subset, 6718 NULL, NULL, min, max)); 6719} 6720 6721/************************************************************************ 6722 * * 6723 * Public API for operations on expressions * 6724 * * 6725 ************************************************************************/ 6726 6727static int 6728xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 6729 const xmlChar**list, int len, int nb) { 6730 int tmp, tmp2; 6731tail: 6732 switch (exp->type) { 6733 case XML_EXP_EMPTY: 6734 return(0); 6735 case XML_EXP_ATOM: 6736 for (tmp = 0;tmp < nb;tmp++) 6737 if (list[tmp] == exp->exp_str) 6738 return(0); 6739 if (nb >= len) 6740 return(-2); 6741 list[nb++] = exp->exp_str; 6742 return(1); 6743 case XML_EXP_COUNT: 6744 exp = exp->exp_left; 6745 goto tail; 6746 case XML_EXP_SEQ: 6747 case XML_EXP_OR: 6748 tmp = xmlExpGetLanguageInt(ctxt, exp->exp_left, list, len, nb); 6749 if (tmp < 0) 6750 return(tmp); 6751 tmp2 = xmlExpGetLanguageInt(ctxt, exp->exp_right, list, len, 6752 nb + tmp); 6753 if (tmp2 < 0) 6754 return(tmp2); 6755 return(tmp + tmp2); 6756 } 6757 return(-1); 6758} 6759 6760/** 6761 * xmlExpGetLanguage: 6762 * @ctxt: the expression context 6763 * @exp: the expression 6764 * @langList: where to store the tokens 6765 * @len: the allocated lenght of @list 6766 * 6767 * Find all the strings used in @exp and store them in @list 6768 * 6769 * Returns the number of unique strings found, -1 in case of errors and 6770 * -2 if there is more than @len strings 6771 */ 6772int 6773xmlExpGetLanguage(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 6774 const xmlChar**langList, int len) { 6775 if ((ctxt == NULL) || (exp == NULL) || (langList == NULL) || (len <= 0)) 6776 return(-1); 6777 return(xmlExpGetLanguageInt(ctxt, exp, langList, len, 0)); 6778} 6779 6780static int 6781xmlExpGetStartInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 6782 const xmlChar**list, int len, int nb) { 6783 int tmp, tmp2; 6784tail: 6785 switch (exp->type) { 6786 case XML_EXP_FORBID: 6787 return(0); 6788 case XML_EXP_EMPTY: 6789 return(0); 6790 case XML_EXP_ATOM: 6791 for (tmp = 0;tmp < nb;tmp++) 6792 if (list[tmp] == exp->exp_str) 6793 return(0); 6794 if (nb >= len) 6795 return(-2); 6796 list[nb++] = exp->exp_str; 6797 return(1); 6798 case XML_EXP_COUNT: 6799 exp = exp->exp_left; 6800 goto tail; 6801 case XML_EXP_SEQ: 6802 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb); 6803 if (tmp < 0) 6804 return(tmp); 6805 if (IS_NILLABLE(exp->exp_left)) { 6806 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len, 6807 nb + tmp); 6808 if (tmp2 < 0) 6809 return(tmp2); 6810 tmp += tmp2; 6811 } 6812 return(tmp); 6813 case XML_EXP_OR: 6814 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb); 6815 if (tmp < 0) 6816 return(tmp); 6817 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len, 6818 nb + tmp); 6819 if (tmp2 < 0) 6820 return(tmp2); 6821 return(tmp + tmp2); 6822 } 6823 return(-1); 6824} 6825 6826/** 6827 * xmlExpGetStart: 6828 * @ctxt: the expression context 6829 * @exp: the expression 6830 * @tokList: where to store the tokens 6831 * @len: the allocated lenght of @list 6832 * 6833 * Find all the strings that appears at the start of the languages 6834 * accepted by @exp and store them in @list. E.g. for (a, b) | c 6835 * it will return the list [a, c] 6836 * 6837 * Returns the number of unique strings found, -1 in case of errors and 6838 * -2 if there is more than @len strings 6839 */ 6840int 6841xmlExpGetStart(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 6842 const xmlChar**tokList, int len) { 6843 if ((ctxt == NULL) || (exp == NULL) || (tokList == NULL) || (len <= 0)) 6844 return(-1); 6845 return(xmlExpGetStartInt(ctxt, exp, tokList, len, 0)); 6846} 6847 6848/** 6849 * xmlExpIsNillable: 6850 * @exp: the expression 6851 * 6852 * Finds if the expression is nillable, i.e. if it accepts the empty sequqnce 6853 * 6854 * Returns 1 if nillable, 0 if not and -1 in case of error 6855 */ 6856int 6857xmlExpIsNillable(xmlExpNodePtr exp) { 6858 if (exp == NULL) 6859 return(-1); 6860 return(IS_NILLABLE(exp) != 0); 6861} 6862 6863static xmlExpNodePtr 6864xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, const xmlChar *str) 6865{ 6866 xmlExpNodePtr ret; 6867 6868 switch (exp->type) { 6869 case XML_EXP_EMPTY: 6870 return(forbiddenExp); 6871 case XML_EXP_FORBID: 6872 return(forbiddenExp); 6873 case XML_EXP_ATOM: 6874 if (exp->exp_str == str) { 6875#ifdef DEBUG_DERIV 6876 printf("deriv atom: equal => Empty\n"); 6877#endif 6878 ret = emptyExp; 6879 } else { 6880#ifdef DEBUG_DERIV 6881 printf("deriv atom: mismatch => forbid\n"); 6882#endif 6883 /* TODO wildcards here */ 6884 ret = forbiddenExp; 6885 } 6886 return(ret); 6887 case XML_EXP_OR: { 6888 xmlExpNodePtr tmp; 6889 6890#ifdef DEBUG_DERIV 6891 printf("deriv or: => or(derivs)\n"); 6892#endif 6893 tmp = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); 6894 if (tmp == NULL) { 6895 return(NULL); 6896 } 6897 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str); 6898 if (ret == NULL) { 6899 xmlExpFree(ctxt, tmp); 6900 return(NULL); 6901 } 6902 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, 6903 NULL, 0, 0); 6904 return(ret); 6905 } 6906 case XML_EXP_SEQ: 6907#ifdef DEBUG_DERIV 6908 printf("deriv seq: starting with left\n"); 6909#endif 6910 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); 6911 if (ret == NULL) { 6912 return(NULL); 6913 } else if (ret == forbiddenExp) { 6914 if (IS_NILLABLE(exp->exp_left)) { 6915#ifdef DEBUG_DERIV 6916 printf("deriv seq: left failed but nillable\n"); 6917#endif 6918 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str); 6919 } 6920 } else { 6921#ifdef DEBUG_DERIV 6922 printf("deriv seq: left match => sequence\n"); 6923#endif 6924 exp->exp_right->ref++; 6925 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, exp->exp_right, 6926 NULL, 0, 0); 6927 } 6928 return(ret); 6929 case XML_EXP_COUNT: { 6930 int min, max; 6931 xmlExpNodePtr tmp; 6932 6933 if (exp->exp_max == 0) 6934 return(forbiddenExp); 6935 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); 6936 if (ret == NULL) 6937 return(NULL); 6938 if (ret == forbiddenExp) { 6939#ifdef DEBUG_DERIV 6940 printf("deriv count: pattern mismatch => forbid\n"); 6941#endif 6942 return(ret); 6943 } 6944 if (exp->exp_max == 1) 6945 return(ret); 6946 if (exp->exp_max < 0) /* unbounded */ 6947 max = -1; 6948 else 6949 max = exp->exp_max - 1; 6950 if (exp->exp_min > 0) 6951 min = exp->exp_min - 1; 6952 else 6953 min = 0; 6954 exp->exp_left->ref++; 6955 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, NULL, 6956 NULL, min, max); 6957 if (ret == emptyExp) { 6958#ifdef DEBUG_DERIV 6959 printf("deriv count: match to empty => new count\n"); 6960#endif 6961 return(tmp); 6962 } 6963#ifdef DEBUG_DERIV 6964 printf("deriv count: match => sequence with new count\n"); 6965#endif 6966 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, tmp, 6967 NULL, 0, 0)); 6968 } 6969 } 6970 return(NULL); 6971} 6972 6973/** 6974 * xmlExpStringDerive: 6975 * @ctxt: the expression context 6976 * @exp: the expression 6977 * @str: the string 6978 * @len: the string len in bytes if available 6979 * 6980 * Do one step of Brzozowski derivation of the expression @exp with 6981 * respect to the input string 6982 * 6983 * Returns the resulting expression or NULL in case of internal error 6984 */ 6985xmlExpNodePtr 6986xmlExpStringDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 6987 const xmlChar *str, int len) { 6988 const xmlChar *input; 6989 6990 if ((exp == NULL) || (ctxt == NULL) || (str == NULL)) { 6991 return(NULL); 6992 } 6993 /* 6994 * check the string is in the dictionnary, if yes use an interned 6995 * copy, otherwise we know it's not an acceptable input 6996 */ 6997 input = xmlDictExists(ctxt->dict, str, len); 6998 if (input == NULL) { 6999 return(forbiddenExp); 7000 } 7001 return(xmlExpStringDeriveInt(ctxt, exp, input)); 7002} 7003 7004static int 7005xmlExpCheckCard(xmlExpNodePtr exp, xmlExpNodePtr sub) { 7006 int ret = 1; 7007 7008 if (sub->c_max == -1) { 7009 if (exp->c_max != -1) 7010 ret = 0; 7011 } else if ((exp->c_max >= 0) && (exp->c_max < sub->c_max)) { 7012 ret = 0; 7013 } 7014#if 0 7015 if ((IS_NILLABLE(sub)) && (!IS_NILLABLE(exp))) 7016 ret = 0; 7017#endif 7018 return(ret); 7019} 7020 7021static xmlExpNodePtr xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7022 xmlExpNodePtr sub); 7023/** 7024 * xmlExpDivide: 7025 * @ctxt: the expressions context 7026 * @exp: the englobing expression 7027 * @sub: the subexpression 7028 * @mult: the multiple expression 7029 * @remain: the remain from the derivation of the multiple 7030 * 7031 * Check if exp is a multiple of sub, i.e. if there is a finite number n 7032 * so that sub{n} subsume exp 7033 * 7034 * Returns the multiple value if successful, 0 if it is not a multiple 7035 * and -1 in case of internel error. 7036 */ 7037 7038static int 7039xmlExpDivide(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub, 7040 xmlExpNodePtr *mult, xmlExpNodePtr *remain) { 7041 int i; 7042 xmlExpNodePtr tmp, tmp2; 7043 7044 if (mult != NULL) *mult = NULL; 7045 if (remain != NULL) *remain = NULL; 7046 if (exp->c_max == -1) return(0); 7047 if (IS_NILLABLE(exp) && (!IS_NILLABLE(sub))) return(0); 7048 7049 for (i = 1;i <= exp->c_max;i++) { 7050 sub->ref++; 7051 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, 7052 sub, NULL, NULL, i, i); 7053 if (tmp == NULL) { 7054 return(-1); 7055 } 7056 if (!xmlExpCheckCard(tmp, exp)) { 7057 xmlExpFree(ctxt, tmp); 7058 continue; 7059 } 7060 tmp2 = xmlExpExpDeriveInt(ctxt, tmp, exp); 7061 if (tmp2 == NULL) { 7062 xmlExpFree(ctxt, tmp); 7063 return(-1); 7064 } 7065 if ((tmp2 != forbiddenExp) && (IS_NILLABLE(tmp2))) { 7066 if (remain != NULL) 7067 *remain = tmp2; 7068 else 7069 xmlExpFree(ctxt, tmp2); 7070 if (mult != NULL) 7071 *mult = tmp; 7072 else 7073 xmlExpFree(ctxt, tmp); 7074#ifdef DEBUG_DERIV 7075 printf("Divide succeeded %d\n", i); 7076#endif 7077 return(i); 7078 } 7079 xmlExpFree(ctxt, tmp); 7080 xmlExpFree(ctxt, tmp2); 7081 } 7082#ifdef DEBUG_DERIV 7083 printf("Divide failed\n"); 7084#endif 7085 return(0); 7086} 7087 7088/** 7089 * xmlExpExpDeriveInt: 7090 * @ctxt: the expressions context 7091 * @exp: the englobing expression 7092 * @sub: the subexpression 7093 * 7094 * Try to do a step of Brzozowski derivation but at a higher level 7095 * the input being a subexpression. 7096 * 7097 * Returns the resulting expression or NULL in case of internal error 7098 */ 7099static xmlExpNodePtr 7100xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { 7101 xmlExpNodePtr ret, tmp, tmp2, tmp3; 7102 const xmlChar **tab; 7103 int len, i; 7104 7105 /* 7106 * In case of equality and if the expression can only consume a finite 7107 * amount, then the derivation is empty 7108 */ 7109 if ((exp == sub) && (exp->c_max >= 0)) { 7110#ifdef DEBUG_DERIV 7111 printf("Equal(exp, sub) and finite -> Empty\n"); 7112#endif 7113 return(emptyExp); 7114 } 7115 /* 7116 * decompose sub sequence first 7117 */ 7118 if (sub->type == XML_EXP_EMPTY) { 7119#ifdef DEBUG_DERIV 7120 printf("Empty(sub) -> Empty\n"); 7121#endif 7122 exp->ref++; 7123 return(exp); 7124 } 7125 if (sub->type == XML_EXP_SEQ) { 7126#ifdef DEBUG_DERIV 7127 printf("Seq(sub) -> decompose\n"); 7128#endif 7129 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left); 7130 if (tmp == NULL) 7131 return(NULL); 7132 if (tmp == forbiddenExp) 7133 return(tmp); 7134 ret = xmlExpExpDeriveInt(ctxt, tmp, sub->exp_right); 7135 xmlExpFree(ctxt, tmp); 7136 return(ret); 7137 } 7138 if (sub->type == XML_EXP_OR) { 7139#ifdef DEBUG_DERIV 7140 printf("Or(sub) -> decompose\n"); 7141#endif 7142 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left); 7143 if (tmp == forbiddenExp) 7144 return(tmp); 7145 if (tmp == NULL) 7146 return(NULL); 7147 ret = xmlExpExpDeriveInt(ctxt, exp, sub->exp_right); 7148 if ((ret == NULL) || (ret == forbiddenExp)) { 7149 xmlExpFree(ctxt, tmp); 7150 return(ret); 7151 } 7152 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, NULL, 0, 0)); 7153 } 7154 if (!xmlExpCheckCard(exp, sub)) { 7155#ifdef DEBUG_DERIV 7156 printf("CheckCard(exp, sub) failed -> Forbid\n"); 7157#endif 7158 return(forbiddenExp); 7159 } 7160 switch (exp->type) { 7161 case XML_EXP_EMPTY: 7162 if (sub == emptyExp) 7163 return(emptyExp); 7164#ifdef DEBUG_DERIV 7165 printf("Empty(exp) -> Forbid\n"); 7166#endif 7167 return(forbiddenExp); 7168 case XML_EXP_FORBID: 7169#ifdef DEBUG_DERIV 7170 printf("Forbid(exp) -> Forbid\n"); 7171#endif 7172 return(forbiddenExp); 7173 case XML_EXP_ATOM: 7174 if (sub->type == XML_EXP_ATOM) { 7175 /* TODO: handle wildcards */ 7176 if (exp->exp_str == sub->exp_str) { 7177#ifdef DEBUG_DERIV 7178 printf("Atom match -> Empty\n"); 7179#endif 7180 return(emptyExp); 7181 } 7182#ifdef DEBUG_DERIV 7183 printf("Atom mismatch -> Forbid\n"); 7184#endif 7185 return(forbiddenExp); 7186 } 7187 if ((sub->type == XML_EXP_COUNT) && 7188 (sub->exp_max == 1) && 7189 (sub->exp_left->type == XML_EXP_ATOM)) { 7190 /* TODO: handle wildcards */ 7191 if (exp->exp_str == sub->exp_left->exp_str) { 7192#ifdef DEBUG_DERIV 7193 printf("Atom match -> Empty\n"); 7194#endif 7195 return(emptyExp); 7196 } 7197#ifdef DEBUG_DERIV 7198 printf("Atom mismatch -> Forbid\n"); 7199#endif 7200 return(forbiddenExp); 7201 } 7202#ifdef DEBUG_DERIV 7203 printf("Compex exp vs Atom -> Forbid\n"); 7204#endif 7205 return(forbiddenExp); 7206 case XML_EXP_SEQ: 7207 /* try to get the sequence consumed only if possible */ 7208 if (xmlExpCheckCard(exp->exp_left, sub)) { 7209 /* See if the sequence can be consumed directly */ 7210#ifdef DEBUG_DERIV 7211 printf("Seq trying left only\n"); 7212#endif 7213 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); 7214 if ((ret != forbiddenExp) && (ret != NULL)) { 7215#ifdef DEBUG_DERIV 7216 printf("Seq trying left only worked\n"); 7217#endif 7218 /* 7219 * TODO: assumption here that we are determinist 7220 * i.e. we won't get to a nillable exp left 7221 * subset which could be matched by the right 7222 * part too. 7223 * e.g.: (a | b)+,(a | c) and 'a+,a' 7224 */ 7225 exp->exp_right->ref++; 7226 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, 7227 exp->exp_right, NULL, 0, 0)); 7228 } 7229#ifdef DEBUG_DERIV 7230 } else { 7231 printf("Seq: left too short\n"); 7232#endif 7233 } 7234 /* Try instead to decompose */ 7235 if (sub->type == XML_EXP_COUNT) { 7236 int min, max; 7237 7238#ifdef DEBUG_DERIV 7239 printf("Seq: sub is a count\n"); 7240#endif 7241 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left); 7242 if (ret == NULL) 7243 return(NULL); 7244 if (ret != forbiddenExp) { 7245#ifdef DEBUG_DERIV 7246 printf("Seq , Count match on left\n"); 7247#endif 7248 if (sub->exp_max < 0) 7249 max = -1; 7250 else 7251 max = sub->exp_max -1; 7252 if (sub->exp_min > 0) 7253 min = sub->exp_min -1; 7254 else 7255 min = 0; 7256 exp->exp_right->ref++; 7257 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, 7258 exp->exp_right, NULL, 0, 0); 7259 if (tmp == NULL) 7260 return(NULL); 7261 7262 sub->exp_left->ref++; 7263 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, 7264 sub->exp_left, NULL, NULL, min, max); 7265 if (tmp2 == NULL) { 7266 xmlExpFree(ctxt, tmp); 7267 return(NULL); 7268 } 7269 ret = xmlExpExpDeriveInt(ctxt, tmp, tmp2); 7270 xmlExpFree(ctxt, tmp); 7271 xmlExpFree(ctxt, tmp2); 7272 return(ret); 7273 } 7274 } 7275 /* we made no progress on structured operations */ 7276 break; 7277 case XML_EXP_OR: 7278#ifdef DEBUG_DERIV 7279 printf("Or , trying both side\n"); 7280#endif 7281 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); 7282 if (ret == NULL) 7283 return(NULL); 7284 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_right, sub); 7285 if (tmp == NULL) { 7286 xmlExpFree(ctxt, ret); 7287 return(NULL); 7288 } 7289 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp, NULL, 0, 0)); 7290 case XML_EXP_COUNT: { 7291 int min, max; 7292 7293 if (sub->type == XML_EXP_COUNT) { 7294 /* 7295 * Try to see if the loop is completely subsumed 7296 */ 7297 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left); 7298 if (tmp == NULL) 7299 return(NULL); 7300 if (tmp == forbiddenExp) { 7301 int mult; 7302 7303#ifdef DEBUG_DERIV 7304 printf("Count, Count inner don't subsume\n"); 7305#endif 7306 mult = xmlExpDivide(ctxt, sub->exp_left, exp->exp_left, 7307 NULL, &tmp); 7308 if (mult <= 0) { 7309#ifdef DEBUG_DERIV 7310 printf("Count, Count not multiple => forbidden\n"); 7311#endif 7312 return(forbiddenExp); 7313 } 7314 if (sub->exp_max == -1) { 7315 max = -1; 7316 if (exp->exp_max == -1) { 7317 if (exp->exp_min <= sub->exp_min * mult) 7318 min = 0; 7319 else 7320 min = exp->exp_min - sub->exp_min * mult; 7321 } else { 7322#ifdef DEBUG_DERIV 7323 printf("Count, Count finite can't subsume infinite\n"); 7324#endif 7325 xmlExpFree(ctxt, tmp); 7326 return(forbiddenExp); 7327 } 7328 } else { 7329 if (exp->exp_max == -1) { 7330#ifdef DEBUG_DERIV 7331 printf("Infinite loop consume mult finite loop\n"); 7332#endif 7333 if (exp->exp_min > sub->exp_min * mult) { 7334 max = -1; 7335 min = exp->exp_min - sub->exp_min * mult; 7336 } else { 7337 max = -1; 7338 min = 0; 7339 } 7340 } else { 7341 if (exp->exp_max < sub->exp_max * mult) { 7342#ifdef DEBUG_DERIV 7343 printf("loops max mult mismatch => forbidden\n"); 7344#endif 7345 xmlExpFree(ctxt, tmp); 7346 return(forbiddenExp); 7347 } 7348 if (sub->exp_max * mult > exp->exp_min) 7349 min = 0; 7350 else 7351 min = exp->exp_min - sub->exp_max * mult; 7352 max = exp->exp_max - sub->exp_max * mult; 7353 } 7354 } 7355 } else if (!IS_NILLABLE(tmp)) { 7356 /* 7357 * TODO: loop here to try to grow if working on finite 7358 * blocks. 7359 */ 7360#ifdef DEBUG_DERIV 7361 printf("Count, Count remain not nillable => forbidden\n"); 7362#endif 7363 xmlExpFree(ctxt, tmp); 7364 return(forbiddenExp); 7365 } else if (sub->exp_max == -1) { 7366 if (exp->exp_max == -1) { 7367 if (exp->exp_min <= sub->exp_min) { 7368#ifdef DEBUG_DERIV 7369 printf("Infinite loops Okay => COUNT(0,Inf)\n"); 7370#endif 7371 max = -1; 7372 min = 0; 7373 } else { 7374#ifdef DEBUG_DERIV 7375 printf("Infinite loops min => Count(X,Inf)\n"); 7376#endif 7377 max = -1; 7378 min = exp->exp_min - sub->exp_min; 7379 } 7380 } else if (exp->exp_min > sub->exp_min) { 7381#ifdef DEBUG_DERIV 7382 printf("loops min mismatch 1 => forbidden ???\n"); 7383#endif 7384 xmlExpFree(ctxt, tmp); 7385 return(forbiddenExp); 7386 } else { 7387 max = -1; 7388 min = 0; 7389 } 7390 } else { 7391 if (exp->exp_max == -1) { 7392#ifdef DEBUG_DERIV 7393 printf("Infinite loop consume finite loop\n"); 7394#endif 7395 if (exp->exp_min > sub->exp_min) { 7396 max = -1; 7397 min = exp->exp_min - sub->exp_min; 7398 } else { 7399 max = -1; 7400 min = 0; 7401 } 7402 } else { 7403 if (exp->exp_max < sub->exp_max) { 7404#ifdef DEBUG_DERIV 7405 printf("loops max mismatch => forbidden\n"); 7406#endif 7407 xmlExpFree(ctxt, tmp); 7408 return(forbiddenExp); 7409 } 7410 if (sub->exp_max > exp->exp_min) 7411 min = 0; 7412 else 7413 min = exp->exp_min - sub->exp_max; 7414 max = exp->exp_max - sub->exp_max; 7415 } 7416 } 7417#ifdef DEBUG_DERIV 7418 printf("loops match => SEQ(COUNT())\n"); 7419#endif 7420 exp->exp_left->ref++; 7421 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, 7422 NULL, NULL, min, max); 7423 if (tmp2 == NULL) { 7424 return(NULL); 7425 } 7426 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2, 7427 NULL, 0, 0); 7428 return(ret); 7429 } 7430 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); 7431 if (tmp == NULL) 7432 return(NULL); 7433 if (tmp == forbiddenExp) { 7434#ifdef DEBUG_DERIV 7435 printf("loop mismatch => forbidden\n"); 7436#endif 7437 return(forbiddenExp); 7438 } 7439 if (exp->exp_min > 0) 7440 min = exp->exp_min - 1; 7441 else 7442 min = 0; 7443 if (exp->exp_max < 0) 7444 max = -1; 7445 else 7446 max = exp->exp_max - 1; 7447 7448#ifdef DEBUG_DERIV 7449 printf("loop match => SEQ(COUNT())\n"); 7450#endif 7451 exp->exp_left->ref++; 7452 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, 7453 NULL, NULL, min, max); 7454 if (tmp2 == NULL) 7455 return(NULL); 7456 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2, 7457 NULL, 0, 0); 7458 return(ret); 7459 } 7460 } 7461 7462#ifdef DEBUG_DERIV 7463 printf("Fallback to derivative\n"); 7464#endif 7465 if (IS_NILLABLE(sub)) { 7466 if (!(IS_NILLABLE(exp))) 7467 return(forbiddenExp); 7468 else 7469 ret = emptyExp; 7470 } else 7471 ret = NULL; 7472 /* 7473 * here the structured derivation made no progress so 7474 * we use the default token based derivation to force one more step 7475 */ 7476 if (ctxt->tabSize == 0) 7477 ctxt->tabSize = 40; 7478 7479 tab = (const xmlChar **) xmlMalloc(ctxt->tabSize * 7480 sizeof(const xmlChar *)); 7481 if (tab == NULL) { 7482 return(NULL); 7483 } 7484 7485 /* 7486 * collect all the strings accepted by the subexpression on input 7487 */ 7488 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0); 7489 while (len < 0) { 7490 const xmlChar **temp; 7491 temp = (const xmlChar **) xmlRealloc((xmlChar **) tab, ctxt->tabSize * 2 * 7492 sizeof(const xmlChar *)); 7493 if (temp == NULL) { 7494 xmlFree((xmlChar **) tab); 7495 return(NULL); 7496 } 7497 tab = temp; 7498 ctxt->tabSize *= 2; 7499 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0); 7500 } 7501 for (i = 0;i < len;i++) { 7502 tmp = xmlExpStringDeriveInt(ctxt, exp, tab[i]); 7503 if ((tmp == NULL) || (tmp == forbiddenExp)) { 7504 xmlExpFree(ctxt, ret); 7505 xmlFree((xmlChar **) tab); 7506 return(tmp); 7507 } 7508 tmp2 = xmlExpStringDeriveInt(ctxt, sub, tab[i]); 7509 if ((tmp2 == NULL) || (tmp2 == forbiddenExp)) { 7510 xmlExpFree(ctxt, tmp); 7511 xmlExpFree(ctxt, ret); 7512 xmlFree((xmlChar **) tab); 7513 return(tmp); 7514 } 7515 tmp3 = xmlExpExpDeriveInt(ctxt, tmp, tmp2); 7516 xmlExpFree(ctxt, tmp); 7517 xmlExpFree(ctxt, tmp2); 7518 7519 if ((tmp3 == NULL) || (tmp3 == forbiddenExp)) { 7520 xmlExpFree(ctxt, ret); 7521 xmlFree((xmlChar **) tab); 7522 return(tmp3); 7523 } 7524 7525 if (ret == NULL) 7526 ret = tmp3; 7527 else { 7528 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp3, NULL, 0, 0); 7529 if (ret == NULL) { 7530 xmlFree((xmlChar **) tab); 7531 return(NULL); 7532 } 7533 } 7534 } 7535 xmlFree((xmlChar **) tab); 7536 return(ret); 7537} 7538 7539/** 7540 * xmlExpExpDerive: 7541 * @ctxt: the expressions context 7542 * @exp: the englobing expression 7543 * @sub: the subexpression 7544 * 7545 * Evaluates the expression resulting from @exp consuming a sub expression @sub 7546 * Based on algebraic derivation and sometimes direct Brzozowski derivation 7547 * it usually tatkes less than linear time and can handle expressions generating 7548 * infinite languages. 7549 * 7550 * Returns the resulting expression or NULL in case of internal error, the 7551 * result must be freed 7552 */ 7553xmlExpNodePtr 7554xmlExpExpDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { 7555 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL)) 7556 return(NULL); 7557 7558 /* 7559 * O(1) speedups 7560 */ 7561 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) { 7562#ifdef DEBUG_DERIV 7563 printf("Sub nillable and not exp : can't subsume\n"); 7564#endif 7565 return(forbiddenExp); 7566 } 7567 if (xmlExpCheckCard(exp, sub) == 0) { 7568#ifdef DEBUG_DERIV 7569 printf("sub generate longuer sequances than exp : can't subsume\n"); 7570#endif 7571 return(forbiddenExp); 7572 } 7573 return(xmlExpExpDeriveInt(ctxt, exp, sub)); 7574} 7575 7576/** 7577 * xmlExpSubsume: 7578 * @ctxt: the expressions context 7579 * @exp: the englobing expression 7580 * @sub: the subexpression 7581 * 7582 * Check whether @exp accepts all the languages accexpted by @sub 7583 * the input being a subexpression. 7584 * 7585 * Returns 1 if true 0 if false and -1 in case of failure. 7586 */ 7587int 7588xmlExpSubsume(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { 7589 xmlExpNodePtr tmp; 7590 7591 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL)) 7592 return(-1); 7593 7594 /* 7595 * TODO: speedup by checking the language of sub is a subset of the 7596 * language of exp 7597 */ 7598 /* 7599 * O(1) speedups 7600 */ 7601 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) { 7602#ifdef DEBUG_DERIV 7603 printf("Sub nillable and not exp : can't subsume\n"); 7604#endif 7605 return(0); 7606 } 7607 if (xmlExpCheckCard(exp, sub) == 0) { 7608#ifdef DEBUG_DERIV 7609 printf("sub generate longuer sequances than exp : can't subsume\n"); 7610#endif 7611 return(0); 7612 } 7613 tmp = xmlExpExpDeriveInt(ctxt, exp, sub); 7614#ifdef DEBUG_DERIV 7615 printf("Result derivation :\n"); 7616 PRINT_EXP(tmp); 7617#endif 7618 if (tmp == NULL) 7619 return(-1); 7620 if (tmp == forbiddenExp) 7621 return(0); 7622 if (tmp == emptyExp) 7623 return(1); 7624 if ((tmp != NULL) && (IS_NILLABLE(tmp))) { 7625 xmlExpFree(ctxt, tmp); 7626 return(1); 7627 } 7628 xmlExpFree(ctxt, tmp); 7629 return(0); 7630} 7631 7632/************************************************************************ 7633 * * 7634 * Parsing expression * 7635 * * 7636 ************************************************************************/ 7637 7638static xmlExpNodePtr xmlExpParseExpr(xmlExpCtxtPtr ctxt); 7639 7640#undef CUR 7641#define CUR (*ctxt->cur) 7642#undef NEXT 7643#define NEXT ctxt->cur++; 7644#undef IS_BLANK 7645#define IS_BLANK(c) ((c == ' ') || (c == '\n') || (c == '\r') || (c == '\t')) 7646#define SKIP_BLANKS while (IS_BLANK(*ctxt->cur)) ctxt->cur++; 7647 7648static int 7649xmlExpParseNumber(xmlExpCtxtPtr ctxt) { 7650 int ret = 0; 7651 7652 SKIP_BLANKS 7653 if (CUR == '*') { 7654 NEXT 7655 return(-1); 7656 } 7657 if ((CUR < '0') || (CUR > '9')) 7658 return(-1); 7659 while ((CUR >= '0') && (CUR <= '9')) { 7660 ret = ret * 10 + (CUR - '0'); 7661 NEXT 7662 } 7663 return(ret); 7664} 7665 7666static xmlExpNodePtr 7667xmlExpParseOr(xmlExpCtxtPtr ctxt) { 7668 const char *base; 7669 xmlExpNodePtr ret; 7670 const xmlChar *val; 7671 7672 SKIP_BLANKS 7673 base = ctxt->cur; 7674 if (*ctxt->cur == '(') { 7675 NEXT 7676 ret = xmlExpParseExpr(ctxt); 7677 SKIP_BLANKS 7678 if (*ctxt->cur != ')') { 7679 fprintf(stderr, "unbalanced '(' : %s\n", base); 7680 xmlExpFree(ctxt, ret); 7681 return(NULL); 7682 } 7683 NEXT; 7684 SKIP_BLANKS 7685 goto parse_quantifier; 7686 } 7687 while ((CUR != 0) && (!(IS_BLANK(CUR))) && (CUR != '(') && 7688 (CUR != ')') && (CUR != '|') && (CUR != ',') && (CUR != '{') && 7689 (CUR != '*') && (CUR != '+') && (CUR != '?') && (CUR != '}')) 7690 NEXT; 7691 val = xmlDictLookup(ctxt->dict, BAD_CAST base, ctxt->cur - base); 7692 if (val == NULL) 7693 return(NULL); 7694 ret = xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, val, 0, 0); 7695 if (ret == NULL) 7696 return(NULL); 7697 SKIP_BLANKS 7698parse_quantifier: 7699 if (CUR == '{') { 7700 int min, max; 7701 7702 NEXT 7703 min = xmlExpParseNumber(ctxt); 7704 if (min < 0) { 7705 xmlExpFree(ctxt, ret); 7706 return(NULL); 7707 } 7708 SKIP_BLANKS 7709 if (CUR == ',') { 7710 NEXT 7711 max = xmlExpParseNumber(ctxt); 7712 SKIP_BLANKS 7713 } else 7714 max = min; 7715 if (CUR != '}') { 7716 xmlExpFree(ctxt, ret); 7717 return(NULL); 7718 } 7719 NEXT 7720 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7721 min, max); 7722 SKIP_BLANKS 7723 } else if (CUR == '?') { 7724 NEXT 7725 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7726 0, 1); 7727 SKIP_BLANKS 7728 } else if (CUR == '+') { 7729 NEXT 7730 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7731 1, -1); 7732 SKIP_BLANKS 7733 } else if (CUR == '*') { 7734 NEXT 7735 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7736 0, -1); 7737 SKIP_BLANKS 7738 } 7739 return(ret); 7740} 7741 7742 7743static xmlExpNodePtr 7744xmlExpParseSeq(xmlExpCtxtPtr ctxt) { 7745 xmlExpNodePtr ret, right; 7746 7747 ret = xmlExpParseOr(ctxt); 7748 SKIP_BLANKS 7749 while (CUR == '|') { 7750 NEXT 7751 right = xmlExpParseOr(ctxt); 7752 if (right == NULL) { 7753 xmlExpFree(ctxt, ret); 7754 return(NULL); 7755 } 7756 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, right, NULL, 0, 0); 7757 if (ret == NULL) 7758 return(NULL); 7759 } 7760 return(ret); 7761} 7762 7763static xmlExpNodePtr 7764xmlExpParseExpr(xmlExpCtxtPtr ctxt) { 7765 xmlExpNodePtr ret, right; 7766 7767 ret = xmlExpParseSeq(ctxt); 7768 SKIP_BLANKS 7769 while (CUR == ',') { 7770 NEXT 7771 right = xmlExpParseSeq(ctxt); 7772 if (right == NULL) { 7773 xmlExpFree(ctxt, ret); 7774 return(NULL); 7775 } 7776 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, right, NULL, 0, 0); 7777 if (ret == NULL) 7778 return(NULL); 7779 } 7780 return(ret); 7781} 7782 7783/** 7784 * xmlExpParse: 7785 * @ctxt: the expressions context 7786 * @expr: the 0 terminated string 7787 * 7788 * Minimal parser for regexps, it understand the following constructs 7789 * - string terminals 7790 * - choice operator | 7791 * - sequence operator , 7792 * - subexpressions (...) 7793 * - usual cardinality operators + * and ? 7794 * - finite sequences { min, max } 7795 * - infinite sequences { min, * } 7796 * There is minimal checkings made especially no checking on strings values 7797 * 7798 * Returns a new expression or NULL in case of failure 7799 */ 7800xmlExpNodePtr 7801xmlExpParse(xmlExpCtxtPtr ctxt, const char *expr) { 7802 xmlExpNodePtr ret; 7803 7804 ctxt->expr = expr; 7805 ctxt->cur = expr; 7806 7807 ret = xmlExpParseExpr(ctxt); 7808 SKIP_BLANKS 7809 if (*ctxt->cur != 0) { 7810 xmlExpFree(ctxt, ret); 7811 return(NULL); 7812 } 7813 return(ret); 7814} 7815 7816static void 7817xmlExpDumpInt(xmlBufferPtr buf, xmlExpNodePtr expr, int glob) { 7818 xmlExpNodePtr c; 7819 7820 if (expr == NULL) return; 7821 if (glob) xmlBufferWriteChar(buf, "("); 7822 switch (expr->type) { 7823 case XML_EXP_EMPTY: 7824 xmlBufferWriteChar(buf, "empty"); 7825 break; 7826 case XML_EXP_FORBID: 7827 xmlBufferWriteChar(buf, "forbidden"); 7828 break; 7829 case XML_EXP_ATOM: 7830 xmlBufferWriteCHAR(buf, expr->exp_str); 7831 break; 7832 case XML_EXP_SEQ: 7833 c = expr->exp_left; 7834 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 7835 xmlExpDumpInt(buf, c, 1); 7836 else 7837 xmlExpDumpInt(buf, c, 0); 7838 xmlBufferWriteChar(buf, " , "); 7839 c = expr->exp_right; 7840 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 7841 xmlExpDumpInt(buf, c, 1); 7842 else 7843 xmlExpDumpInt(buf, c, 0); 7844 break; 7845 case XML_EXP_OR: 7846 c = expr->exp_left; 7847 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 7848 xmlExpDumpInt(buf, c, 1); 7849 else 7850 xmlExpDumpInt(buf, c, 0); 7851 xmlBufferWriteChar(buf, " | "); 7852 c = expr->exp_right; 7853 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 7854 xmlExpDumpInt(buf, c, 1); 7855 else 7856 xmlExpDumpInt(buf, c, 0); 7857 break; 7858 case XML_EXP_COUNT: { 7859 char rep[40]; 7860 7861 c = expr->exp_left; 7862 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 7863 xmlExpDumpInt(buf, c, 1); 7864 else 7865 xmlExpDumpInt(buf, c, 0); 7866 if ((expr->exp_min == 0) && (expr->exp_max == 1)) { 7867 rep[0] = '?'; 7868 rep[1] = 0; 7869 } else if ((expr->exp_min == 0) && (expr->exp_max == -1)) { 7870 rep[0] = '*'; 7871 rep[1] = 0; 7872 } else if ((expr->exp_min == 1) && (expr->exp_max == -1)) { 7873 rep[0] = '+'; 7874 rep[1] = 0; 7875 } else if (expr->exp_max == expr->exp_min) { 7876 snprintf(rep, 39, "{%d}", expr->exp_min); 7877 } else if (expr->exp_max < 0) { 7878 snprintf(rep, 39, "{%d,inf}", expr->exp_min); 7879 } else { 7880 snprintf(rep, 39, "{%d,%d}", expr->exp_min, expr->exp_max); 7881 } 7882 rep[39] = 0; 7883 xmlBufferWriteChar(buf, rep); 7884 break; 7885 } 7886 default: 7887 fprintf(stderr, "Error in tree\n"); 7888 } 7889 if (glob) 7890 xmlBufferWriteChar(buf, ")"); 7891} 7892/** 7893 * xmlExpDump: 7894 * @buf: a buffer to receive the output 7895 * @expr: the compiled expression 7896 * 7897 * Serialize the expression as compiled to the buffer 7898 */ 7899void 7900xmlExpDump(xmlBufferPtr buf, xmlExpNodePtr expr) { 7901 if ((buf == NULL) || (expr == NULL)) 7902 return; 7903 xmlExpDumpInt(buf, expr, 0); 7904} 7905 7906/** 7907 * xmlExpMaxToken: 7908 * @expr: a compiled expression 7909 * 7910 * Indicate the maximum number of input a expression can accept 7911 * 7912 * Returns the maximum length or -1 in case of error 7913 */ 7914int 7915xmlExpMaxToken(xmlExpNodePtr expr) { 7916 if (expr == NULL) 7917 return(-1); 7918 return(expr->c_max); 7919} 7920 7921/** 7922 * xmlExpCtxtNbNodes: 7923 * @ctxt: an expression context 7924 * 7925 * Debugging facility provides the number of allocated nodes at a that point 7926 * 7927 * Returns the number of nodes in use or -1 in case of error 7928 */ 7929int 7930xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt) { 7931 if (ctxt == NULL) 7932 return(-1); 7933 return(ctxt->nb_nodes); 7934} 7935 7936/** 7937 * xmlExpCtxtNbCons: 7938 * @ctxt: an expression context 7939 * 7940 * Debugging facility provides the number of allocated nodes over lifetime 7941 * 7942 * Returns the number of nodes ever allocated or -1 in case of error 7943 */ 7944int 7945xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt) { 7946 if (ctxt == NULL) 7947 return(-1); 7948 return(ctxt->nb_cons); 7949} 7950 7951#endif /* LIBXML_EXPR_ENABLED */ 7952#define bottom_xmlregexp 7953#include "elfgcchack.h" 7954#endif /* LIBXML_REGEXP_ENABLED */ 7955