1/* 2 * regexp.c: generic and extensible Regular Expression engine 3 * 4 * Basically designed with the purpose of compiling regexps for 5 * the variety of validation/shemas mechanisms now available in 6 * XML related specifications these include: 7 * - XML-1.0 DTD validation 8 * - XML Schemas structure part 1 9 * - XML Schemas Datatypes part 2 especially Appendix F 10 * - RELAX-NG/TREX i.e. the counter proposal 11 * 12 * See Copyright for the status of this software. 13 * 14 * Daniel Veillard <veillard@redhat.com> 15 */ 16 17#define IN_LIBXML 18#include "libxml.h" 19 20#ifdef LIBXML_REGEXP_ENABLED 21 22/* #define DEBUG_ERR */ 23 24#include <stdio.h> 25#include <string.h> 26#ifdef HAVE_LIMITS_H 27#include <limits.h> 28#endif 29 30#include <libxml/tree.h> 31#include <libxml/parserInternals.h> 32#include <libxml/xmlregexp.h> 33#include <libxml/xmlautomata.h> 34#include <libxml/xmlunicode.h> 35 36#ifndef INT_MAX 37#define INT_MAX 123456789 /* easy to flag and big enough for our needs */ 38#endif 39 40/* #define DEBUG_REGEXP_GRAPH */ 41/* #define DEBUG_REGEXP_EXEC */ 42/* #define DEBUG_PUSH */ 43/* #define DEBUG_COMPACTION */ 44 45#define MAX_PUSH 10000000 46 47#define ERROR(str) \ 48 ctxt->error = XML_REGEXP_COMPILE_ERROR; \ 49 xmlRegexpErrCompile(ctxt, str); 50#define NEXT ctxt->cur++ 51#define CUR (*(ctxt->cur)) 52#define NXT(index) (ctxt->cur[index]) 53 54#define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l) 55#define NEXTL(l) ctxt->cur += l; 56#define XML_REG_STRING_SEPARATOR '|' 57 58/** 59 * TODO: 60 * 61 * macro to flag unimplemented blocks 62 */ 63#define TODO \ 64 xmlGenericError(xmlGenericErrorContext, \ 65 "Unimplemented block at %s:%d\n", \ 66 __FILE__, __LINE__); 67 68/************************************************************************ 69 * * 70 * Datatypes and structures * 71 * * 72 ************************************************************************/ 73 74/* 75 * Note: the order of the enums below is significant, do not shuffle 76 */ 77typedef enum { 78 XML_REGEXP_EPSILON = 1, 79 XML_REGEXP_CHARVAL, 80 XML_REGEXP_RANGES, 81 XML_REGEXP_SUBREG, /* used for () sub regexps */ 82 XML_REGEXP_STRING, 83 XML_REGEXP_ANYCHAR, /* . */ 84 XML_REGEXP_ANYSPACE, /* \s */ 85 XML_REGEXP_NOTSPACE, /* \S */ 86 XML_REGEXP_INITNAME, /* \l */ 87 XML_REGEXP_NOTINITNAME, /* \L */ 88 XML_REGEXP_NAMECHAR, /* \c */ 89 XML_REGEXP_NOTNAMECHAR, /* \C */ 90 XML_REGEXP_DECIMAL, /* \d */ 91 XML_REGEXP_NOTDECIMAL, /* \D */ 92 XML_REGEXP_REALCHAR, /* \w */ 93 XML_REGEXP_NOTREALCHAR, /* \W */ 94 XML_REGEXP_LETTER = 100, 95 XML_REGEXP_LETTER_UPPERCASE, 96 XML_REGEXP_LETTER_LOWERCASE, 97 XML_REGEXP_LETTER_TITLECASE, 98 XML_REGEXP_LETTER_MODIFIER, 99 XML_REGEXP_LETTER_OTHERS, 100 XML_REGEXP_MARK, 101 XML_REGEXP_MARK_NONSPACING, 102 XML_REGEXP_MARK_SPACECOMBINING, 103 XML_REGEXP_MARK_ENCLOSING, 104 XML_REGEXP_NUMBER, 105 XML_REGEXP_NUMBER_DECIMAL, 106 XML_REGEXP_NUMBER_LETTER, 107 XML_REGEXP_NUMBER_OTHERS, 108 XML_REGEXP_PUNCT, 109 XML_REGEXP_PUNCT_CONNECTOR, 110 XML_REGEXP_PUNCT_DASH, 111 XML_REGEXP_PUNCT_OPEN, 112 XML_REGEXP_PUNCT_CLOSE, 113 XML_REGEXP_PUNCT_INITQUOTE, 114 XML_REGEXP_PUNCT_FINQUOTE, 115 XML_REGEXP_PUNCT_OTHERS, 116 XML_REGEXP_SEPAR, 117 XML_REGEXP_SEPAR_SPACE, 118 XML_REGEXP_SEPAR_LINE, 119 XML_REGEXP_SEPAR_PARA, 120 XML_REGEXP_SYMBOL, 121 XML_REGEXP_SYMBOL_MATH, 122 XML_REGEXP_SYMBOL_CURRENCY, 123 XML_REGEXP_SYMBOL_MODIFIER, 124 XML_REGEXP_SYMBOL_OTHERS, 125 XML_REGEXP_OTHER, 126 XML_REGEXP_OTHER_CONTROL, 127 XML_REGEXP_OTHER_FORMAT, 128 XML_REGEXP_OTHER_PRIVATE, 129 XML_REGEXP_OTHER_NA, 130 XML_REGEXP_BLOCK_NAME 131} xmlRegAtomType; 132 133typedef enum { 134 XML_REGEXP_QUANT_EPSILON = 1, 135 XML_REGEXP_QUANT_ONCE, 136 XML_REGEXP_QUANT_OPT, 137 XML_REGEXP_QUANT_MULT, 138 XML_REGEXP_QUANT_PLUS, 139 XML_REGEXP_QUANT_ONCEONLY, 140 XML_REGEXP_QUANT_ALL, 141 XML_REGEXP_QUANT_RANGE 142} xmlRegQuantType; 143 144typedef enum { 145 XML_REGEXP_START_STATE = 1, 146 XML_REGEXP_FINAL_STATE, 147 XML_REGEXP_TRANS_STATE, 148 XML_REGEXP_SINK_STATE 149} xmlRegStateType; 150 151typedef enum { 152 XML_REGEXP_MARK_NORMAL = 0, 153 XML_REGEXP_MARK_START, 154 XML_REGEXP_MARK_VISITED 155} xmlRegMarkedType; 156 157typedef struct _xmlRegRange xmlRegRange; 158typedef xmlRegRange *xmlRegRangePtr; 159 160struct _xmlRegRange { 161 int neg; /* 0 normal, 1 not, 2 exclude */ 162 xmlRegAtomType type; 163 int start; 164 int end; 165 xmlChar *blockName; 166}; 167 168typedef struct _xmlRegAtom xmlRegAtom; 169typedef xmlRegAtom *xmlRegAtomPtr; 170 171typedef struct _xmlAutomataState xmlRegState; 172typedef xmlRegState *xmlRegStatePtr; 173 174struct _xmlRegAtom { 175 int no; 176 xmlRegAtomType type; 177 xmlRegQuantType quant; 178 int min; 179 int max; 180 181 void *valuep; 182 void *valuep2; 183 int neg; 184 int codepoint; 185 xmlRegStatePtr start; 186 xmlRegStatePtr stop; 187 int maxRanges; 188 int nbRanges; 189 xmlRegRangePtr *ranges; 190 void *data; 191}; 192 193typedef struct _xmlRegCounter xmlRegCounter; 194typedef xmlRegCounter *xmlRegCounterPtr; 195 196struct _xmlRegCounter { 197 int min; 198 int max; 199}; 200 201typedef struct _xmlRegTrans xmlRegTrans; 202typedef xmlRegTrans *xmlRegTransPtr; 203 204struct _xmlRegTrans { 205 xmlRegAtomPtr atom; 206 int to; 207 int counter; 208 int count; 209 int nd; 210}; 211 212struct _xmlAutomataState { 213 xmlRegStateType type; 214 xmlRegMarkedType mark; 215 xmlRegMarkedType reached; 216 int no; 217 int maxTrans; 218 int nbTrans; 219 xmlRegTrans *trans; 220 /* knowing states ponting to us can speed things up */ 221 int maxTransTo; 222 int nbTransTo; 223 int *transTo; 224}; 225 226typedef struct _xmlAutomata xmlRegParserCtxt; 227typedef xmlRegParserCtxt *xmlRegParserCtxtPtr; 228 229struct _xmlAutomata { 230 xmlChar *string; 231 xmlChar *cur; 232 233 int error; 234 int neg; 235 236 xmlRegStatePtr start; 237 xmlRegStatePtr end; 238 xmlRegStatePtr state; 239 240 xmlRegAtomPtr atom; 241 242 int maxAtoms; 243 int nbAtoms; 244 xmlRegAtomPtr *atoms; 245 246 int maxStates; 247 int nbStates; 248 xmlRegStatePtr *states; 249 250 int maxCounters; 251 int nbCounters; 252 xmlRegCounter *counters; 253 254 int determinist; 255 int negs; 256}; 257 258struct _xmlRegexp { 259 xmlChar *string; 260 int nbStates; 261 xmlRegStatePtr *states; 262 int nbAtoms; 263 xmlRegAtomPtr *atoms; 264 int nbCounters; 265 xmlRegCounter *counters; 266 int determinist; 267 /* 268 * That's the compact form for determinists automatas 269 */ 270 int nbstates; 271 int *compact; 272 void **transdata; 273 int nbstrings; 274 xmlChar **stringMap; 275}; 276 277typedef struct _xmlRegExecRollback xmlRegExecRollback; 278typedef xmlRegExecRollback *xmlRegExecRollbackPtr; 279 280struct _xmlRegExecRollback { 281 xmlRegStatePtr state;/* the current state */ 282 int index; /* the index in the input stack */ 283 int nextbranch; /* the next transition to explore in that state */ 284 int *counts; /* save the automata state if it has some */ 285}; 286 287typedef struct _xmlRegInputToken xmlRegInputToken; 288typedef xmlRegInputToken *xmlRegInputTokenPtr; 289 290struct _xmlRegInputToken { 291 xmlChar *value; 292 void *data; 293}; 294 295struct _xmlRegExecCtxt { 296 int status; /* execution status != 0 indicate an error */ 297 int determinist; /* did we find an indeterministic behaviour */ 298 xmlRegexpPtr comp; /* the compiled regexp */ 299 xmlRegExecCallbacks callback; 300 void *data; 301 302 xmlRegStatePtr state;/* the current state */ 303 int transno; /* the current transition on that state */ 304 int transcount; /* the number of chars in char counted transitions */ 305 306 /* 307 * A stack of rollback states 308 */ 309 int maxRollbacks; 310 int nbRollbacks; 311 xmlRegExecRollback *rollbacks; 312 313 /* 314 * The state of the automata if any 315 */ 316 int *counts; 317 318 /* 319 * The input stack 320 */ 321 int inputStackMax; 322 int inputStackNr; 323 int index; 324 int *charStack; 325 const xmlChar *inputString; /* when operating on characters */ 326 xmlRegInputTokenPtr inputStack;/* when operating on strings */ 327 328 /* 329 * error handling 330 */ 331 int errStateNo; /* the error state number */ 332 xmlRegStatePtr errState; /* the error state */ 333 xmlChar *errString; /* the string raising the error */ 334 int *errCounts; /* counters at the error state */ 335 int nbPush; 336}; 337 338#define REGEXP_ALL_COUNTER 0x123456 339#define REGEXP_ALL_LAX_COUNTER 0x123457 340 341static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top); 342static void xmlRegFreeState(xmlRegStatePtr state); 343static void xmlRegFreeAtom(xmlRegAtomPtr atom); 344static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr); 345static int xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint); 346static int xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, 347 int neg, int start, int end, const xmlChar *blockName); 348 349/************************************************************************ 350 * * 351 * Regexp memory error handler * 352 * * 353 ************************************************************************/ 354/** 355 * xmlRegexpErrMemory: 356 * @extra: extra information 357 * 358 * Handle an out of memory condition 359 */ 360static void 361xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra) 362{ 363 const char *regexp = NULL; 364 if (ctxt != NULL) { 365 regexp = (const char *) ctxt->string; 366 ctxt->error = XML_ERR_NO_MEMORY; 367 } 368 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP, 369 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra, 370 regexp, NULL, 0, 0, 371 "Memory allocation failed : %s\n", extra); 372} 373 374/** 375 * xmlRegexpErrCompile: 376 * @extra: extra information 377 * 378 * Handle a compilation failure 379 */ 380static void 381xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra) 382{ 383 const char *regexp = NULL; 384 int idx = 0; 385 386 if (ctxt != NULL) { 387 regexp = (const char *) ctxt->string; 388 idx = ctxt->cur - ctxt->string; 389 ctxt->error = XML_REGEXP_COMPILE_ERROR; 390 } 391 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP, 392 XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra, 393 regexp, NULL, idx, 0, 394 "failed to compile: %s\n", extra); 395} 396 397/************************************************************************ 398 * * 399 * Allocation/Deallocation * 400 * * 401 ************************************************************************/ 402 403static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt); 404/** 405 * xmlRegEpxFromParse: 406 * @ctxt: the parser context used to build it 407 * 408 * Allocate a new regexp and fill it with the result from the parser 409 * 410 * Returns the new regexp or NULL in case of error 411 */ 412static xmlRegexpPtr 413xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) { 414 xmlRegexpPtr ret; 415 416 ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp)); 417 if (ret == NULL) { 418 xmlRegexpErrMemory(ctxt, "compiling regexp"); 419 return(NULL); 420 } 421 memset(ret, 0, sizeof(xmlRegexp)); 422 ret->string = ctxt->string; 423 ret->nbStates = ctxt->nbStates; 424 ret->states = ctxt->states; 425 ret->nbAtoms = ctxt->nbAtoms; 426 ret->atoms = ctxt->atoms; 427 ret->nbCounters = ctxt->nbCounters; 428 ret->counters = ctxt->counters; 429 ret->determinist = ctxt->determinist; 430 if (ret->determinist == -1) { 431 xmlRegexpIsDeterminist(ret); 432 } 433 434 if ((ret->determinist != 0) && 435 (ret->nbCounters == 0) && 436 (ctxt->negs == 0) && 437 (ret->atoms != NULL) && 438 (ret->atoms[0] != NULL) && 439 (ret->atoms[0]->type == XML_REGEXP_STRING)) { 440 int i, j, nbstates = 0, nbatoms = 0; 441 int *stateRemap; 442 int *stringRemap; 443 int *transitions; 444 void **transdata; 445 xmlChar **stringMap; 446 xmlChar *value; 447 448 /* 449 * Switch to a compact representation 450 * 1/ counting the effective number of states left 451 * 2/ counting the unique number of atoms, and check that 452 * they are all of the string type 453 * 3/ build a table state x atom for the transitions 454 */ 455 456 stateRemap = xmlMalloc(ret->nbStates * sizeof(int)); 457 if (stateRemap == NULL) { 458 xmlRegexpErrMemory(ctxt, "compiling regexp"); 459 xmlFree(ret); 460 return(NULL); 461 } 462 for (i = 0;i < ret->nbStates;i++) { 463 if (ret->states[i] != NULL) { 464 stateRemap[i] = nbstates; 465 nbstates++; 466 } else { 467 stateRemap[i] = -1; 468 } 469 } 470#ifdef DEBUG_COMPACTION 471 printf("Final: %d states\n", nbstates); 472#endif 473 stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *)); 474 if (stringMap == NULL) { 475 xmlRegexpErrMemory(ctxt, "compiling regexp"); 476 xmlFree(stateRemap); 477 xmlFree(ret); 478 return(NULL); 479 } 480 stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int)); 481 if (stringRemap == NULL) { 482 xmlRegexpErrMemory(ctxt, "compiling regexp"); 483 xmlFree(stringMap); 484 xmlFree(stateRemap); 485 xmlFree(ret); 486 return(NULL); 487 } 488 for (i = 0;i < ret->nbAtoms;i++) { 489 if ((ret->atoms[i]->type == XML_REGEXP_STRING) && 490 (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) { 491 value = ret->atoms[i]->valuep; 492 for (j = 0;j < nbatoms;j++) { 493 if (xmlStrEqual(stringMap[j], value)) { 494 stringRemap[i] = j; 495 break; 496 } 497 } 498 if (j >= nbatoms) { 499 stringRemap[i] = nbatoms; 500 stringMap[nbatoms] = xmlStrdup(value); 501 if (stringMap[nbatoms] == NULL) { 502 for (i = 0;i < nbatoms;i++) 503 xmlFree(stringMap[i]); 504 xmlFree(stringRemap); 505 xmlFree(stringMap); 506 xmlFree(stateRemap); 507 xmlFree(ret); 508 return(NULL); 509 } 510 nbatoms++; 511 } 512 } else { 513 xmlFree(stateRemap); 514 xmlFree(stringRemap); 515 for (i = 0;i < nbatoms;i++) 516 xmlFree(stringMap[i]); 517 xmlFree(stringMap); 518 xmlFree(ret); 519 return(NULL); 520 } 521 } 522#ifdef DEBUG_COMPACTION 523 printf("Final: %d atoms\n", nbatoms); 524#endif 525 transitions = (int *) xmlMalloc((nbstates + 1) * 526 (nbatoms + 1) * sizeof(int)); 527 if (transitions == NULL) { 528 xmlFree(stateRemap); 529 xmlFree(stringRemap); 530 xmlFree(stringMap); 531 xmlFree(ret); 532 return(NULL); 533 } 534 memset(transitions, 0, (nbstates + 1) * (nbatoms + 1) * sizeof(int)); 535 536 /* 537 * Allocate the transition table. The first entry for each 538 * state corresponds to the state type. 539 */ 540 transdata = NULL; 541 542 for (i = 0;i < ret->nbStates;i++) { 543 int stateno, atomno, targetno, prev; 544 xmlRegStatePtr state; 545 xmlRegTransPtr trans; 546 547 stateno = stateRemap[i]; 548 if (stateno == -1) 549 continue; 550 state = ret->states[i]; 551 552 transitions[stateno * (nbatoms + 1)] = state->type; 553 554 for (j = 0;j < state->nbTrans;j++) { 555 trans = &(state->trans[j]); 556 if ((trans->to == -1) || (trans->atom == NULL)) 557 continue; 558 atomno = stringRemap[trans->atom->no]; 559 if ((trans->atom->data != NULL) && (transdata == NULL)) { 560 transdata = (void **) xmlMalloc(nbstates * nbatoms * 561 sizeof(void *)); 562 if (transdata != NULL) 563 memset(transdata, 0, 564 nbstates * nbatoms * sizeof(void *)); 565 else { 566 xmlRegexpErrMemory(ctxt, "compiling regexp"); 567 break; 568 } 569 } 570 targetno = stateRemap[trans->to]; 571 /* 572 * if the same atom can generate transitions to 2 different 573 * states then it means the automata is not determinist and 574 * the compact form can't be used ! 575 */ 576 prev = transitions[stateno * (nbatoms + 1) + atomno + 1]; 577 if (prev != 0) { 578 if (prev != targetno + 1) { 579 ret->determinist = 0; 580#ifdef DEBUG_COMPACTION 581 printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n", 582 i, j, trans->atom->no, trans->to, atomno, targetno); 583 printf(" previous to is %d\n", prev); 584#endif 585 if (transdata != NULL) 586 xmlFree(transdata); 587 xmlFree(transitions); 588 xmlFree(stateRemap); 589 xmlFree(stringRemap); 590 for (i = 0;i < nbatoms;i++) 591 xmlFree(stringMap[i]); 592 xmlFree(stringMap); 593 goto not_determ; 594 } 595 } else { 596#if 0 597 printf("State %d trans %d: atom %d to %d : %d to %d\n", 598 i, j, trans->atom->no, trans->to, atomno, targetno); 599#endif 600 transitions[stateno * (nbatoms + 1) + atomno + 1] = 601 targetno + 1; /* to avoid 0 */ 602 if (transdata != NULL) 603 transdata[stateno * nbatoms + atomno] = 604 trans->atom->data; 605 } 606 } 607 } 608 ret->determinist = 1; 609#ifdef DEBUG_COMPACTION 610 /* 611 * Debug 612 */ 613 for (i = 0;i < nbstates;i++) { 614 for (j = 0;j < nbatoms + 1;j++) { 615 printf("%02d ", transitions[i * (nbatoms + 1) + j]); 616 } 617 printf("\n"); 618 } 619 printf("\n"); 620#endif 621 /* 622 * Cleanup of the old data 623 */ 624 if (ret->states != NULL) { 625 for (i = 0;i < ret->nbStates;i++) 626 xmlRegFreeState(ret->states[i]); 627 xmlFree(ret->states); 628 } 629 ret->states = NULL; 630 ret->nbStates = 0; 631 if (ret->atoms != NULL) { 632 for (i = 0;i < ret->nbAtoms;i++) 633 xmlRegFreeAtom(ret->atoms[i]); 634 xmlFree(ret->atoms); 635 } 636 ret->atoms = NULL; 637 ret->nbAtoms = 0; 638 639 ret->compact = transitions; 640 ret->transdata = transdata; 641 ret->stringMap = stringMap; 642 ret->nbstrings = nbatoms; 643 ret->nbstates = nbstates; 644 xmlFree(stateRemap); 645 xmlFree(stringRemap); 646 } 647not_determ: 648 ctxt->string = NULL; 649 ctxt->nbStates = 0; 650 ctxt->states = NULL; 651 ctxt->nbAtoms = 0; 652 ctxt->atoms = NULL; 653 ctxt->nbCounters = 0; 654 ctxt->counters = NULL; 655 return(ret); 656} 657 658/** 659 * xmlRegNewParserCtxt: 660 * @string: the string to parse 661 * 662 * Allocate a new regexp parser context 663 * 664 * Returns the new context or NULL in case of error 665 */ 666static xmlRegParserCtxtPtr 667xmlRegNewParserCtxt(const xmlChar *string) { 668 xmlRegParserCtxtPtr ret; 669 670 ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt)); 671 if (ret == NULL) 672 return(NULL); 673 memset(ret, 0, sizeof(xmlRegParserCtxt)); 674 if (string != NULL) 675 ret->string = xmlStrdup(string); 676 ret->cur = ret->string; 677 ret->neg = 0; 678 ret->negs = 0; 679 ret->error = 0; 680 ret->determinist = -1; 681 return(ret); 682} 683 684/** 685 * xmlRegNewRange: 686 * @ctxt: the regexp parser context 687 * @neg: is that negative 688 * @type: the type of range 689 * @start: the start codepoint 690 * @end: the end codepoint 691 * 692 * Allocate a new regexp range 693 * 694 * Returns the new range or NULL in case of error 695 */ 696static xmlRegRangePtr 697xmlRegNewRange(xmlRegParserCtxtPtr ctxt, 698 int neg, xmlRegAtomType type, int start, int end) { 699 xmlRegRangePtr ret; 700 701 ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange)); 702 if (ret == NULL) { 703 xmlRegexpErrMemory(ctxt, "allocating range"); 704 return(NULL); 705 } 706 ret->neg = neg; 707 ret->type = type; 708 ret->start = start; 709 ret->end = end; 710 return(ret); 711} 712 713/** 714 * xmlRegFreeRange: 715 * @range: the regexp range 716 * 717 * Free a regexp range 718 */ 719static void 720xmlRegFreeRange(xmlRegRangePtr range) { 721 if (range == NULL) 722 return; 723 724 if (range->blockName != NULL) 725 xmlFree(range->blockName); 726 xmlFree(range); 727} 728 729/** 730 * xmlRegNewAtom: 731 * @ctxt: the regexp parser context 732 * @type: the type of atom 733 * 734 * Allocate a new regexp range 735 * 736 * Returns the new atom or NULL in case of error 737 */ 738static xmlRegAtomPtr 739xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) { 740 xmlRegAtomPtr ret; 741 742 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom)); 743 if (ret == NULL) { 744 xmlRegexpErrMemory(ctxt, "allocating atom"); 745 return(NULL); 746 } 747 memset(ret, 0, sizeof(xmlRegAtom)); 748 ret->type = type; 749 ret->quant = XML_REGEXP_QUANT_ONCE; 750 ret->min = 0; 751 ret->max = 0; 752 return(ret); 753} 754 755/** 756 * xmlRegFreeAtom: 757 * @atom: the regexp atom 758 * 759 * Free a regexp atom 760 */ 761static void 762xmlRegFreeAtom(xmlRegAtomPtr atom) { 763 int i; 764 765 if (atom == NULL) 766 return; 767 768 for (i = 0;i < atom->nbRanges;i++) 769 xmlRegFreeRange(atom->ranges[i]); 770 if (atom->ranges != NULL) 771 xmlFree(atom->ranges); 772 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep != NULL)) 773 xmlFree(atom->valuep); 774 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep2 != NULL)) 775 xmlFree(atom->valuep2); 776 if ((atom->type == XML_REGEXP_BLOCK_NAME) && (atom->valuep != NULL)) 777 xmlFree(atom->valuep); 778 xmlFree(atom); 779} 780 781static xmlRegStatePtr 782xmlRegNewState(xmlRegParserCtxtPtr ctxt) { 783 xmlRegStatePtr ret; 784 785 ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState)); 786 if (ret == NULL) { 787 xmlRegexpErrMemory(ctxt, "allocating state"); 788 return(NULL); 789 } 790 memset(ret, 0, sizeof(xmlRegState)); 791 ret->type = XML_REGEXP_TRANS_STATE; 792 ret->mark = XML_REGEXP_MARK_NORMAL; 793 return(ret); 794} 795 796/** 797 * xmlRegFreeState: 798 * @state: the regexp state 799 * 800 * Free a regexp state 801 */ 802static void 803xmlRegFreeState(xmlRegStatePtr state) { 804 if (state == NULL) 805 return; 806 807 if (state->trans != NULL) 808 xmlFree(state->trans); 809 if (state->transTo != NULL) 810 xmlFree(state->transTo); 811 xmlFree(state); 812} 813 814/** 815 * xmlRegFreeParserCtxt: 816 * @ctxt: the regexp parser context 817 * 818 * Free a regexp parser context 819 */ 820static void 821xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) { 822 int i; 823 if (ctxt == NULL) 824 return; 825 826 if (ctxt->string != NULL) 827 xmlFree(ctxt->string); 828 if (ctxt->states != NULL) { 829 for (i = 0;i < ctxt->nbStates;i++) 830 xmlRegFreeState(ctxt->states[i]); 831 xmlFree(ctxt->states); 832 } 833 if (ctxt->atoms != NULL) { 834 for (i = 0;i < ctxt->nbAtoms;i++) 835 xmlRegFreeAtom(ctxt->atoms[i]); 836 xmlFree(ctxt->atoms); 837 } 838 if (ctxt->counters != NULL) 839 xmlFree(ctxt->counters); 840 xmlFree(ctxt); 841} 842 843/************************************************************************ 844 * * 845 * Display of Data structures * 846 * * 847 ************************************************************************/ 848 849static void 850xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) { 851 switch (type) { 852 case XML_REGEXP_EPSILON: 853 fprintf(output, "epsilon "); break; 854 case XML_REGEXP_CHARVAL: 855 fprintf(output, "charval "); break; 856 case XML_REGEXP_RANGES: 857 fprintf(output, "ranges "); break; 858 case XML_REGEXP_SUBREG: 859 fprintf(output, "subexpr "); break; 860 case XML_REGEXP_STRING: 861 fprintf(output, "string "); break; 862 case XML_REGEXP_ANYCHAR: 863 fprintf(output, "anychar "); break; 864 case XML_REGEXP_ANYSPACE: 865 fprintf(output, "anyspace "); break; 866 case XML_REGEXP_NOTSPACE: 867 fprintf(output, "notspace "); break; 868 case XML_REGEXP_INITNAME: 869 fprintf(output, "initname "); break; 870 case XML_REGEXP_NOTINITNAME: 871 fprintf(output, "notinitname "); break; 872 case XML_REGEXP_NAMECHAR: 873 fprintf(output, "namechar "); break; 874 case XML_REGEXP_NOTNAMECHAR: 875 fprintf(output, "notnamechar "); break; 876 case XML_REGEXP_DECIMAL: 877 fprintf(output, "decimal "); break; 878 case XML_REGEXP_NOTDECIMAL: 879 fprintf(output, "notdecimal "); break; 880 case XML_REGEXP_REALCHAR: 881 fprintf(output, "realchar "); break; 882 case XML_REGEXP_NOTREALCHAR: 883 fprintf(output, "notrealchar "); break; 884 case XML_REGEXP_LETTER: 885 fprintf(output, "LETTER "); break; 886 case XML_REGEXP_LETTER_UPPERCASE: 887 fprintf(output, "LETTER_UPPERCASE "); break; 888 case XML_REGEXP_LETTER_LOWERCASE: 889 fprintf(output, "LETTER_LOWERCASE "); break; 890 case XML_REGEXP_LETTER_TITLECASE: 891 fprintf(output, "LETTER_TITLECASE "); break; 892 case XML_REGEXP_LETTER_MODIFIER: 893 fprintf(output, "LETTER_MODIFIER "); break; 894 case XML_REGEXP_LETTER_OTHERS: 895 fprintf(output, "LETTER_OTHERS "); break; 896 case XML_REGEXP_MARK: 897 fprintf(output, "MARK "); break; 898 case XML_REGEXP_MARK_NONSPACING: 899 fprintf(output, "MARK_NONSPACING "); break; 900 case XML_REGEXP_MARK_SPACECOMBINING: 901 fprintf(output, "MARK_SPACECOMBINING "); break; 902 case XML_REGEXP_MARK_ENCLOSING: 903 fprintf(output, "MARK_ENCLOSING "); break; 904 case XML_REGEXP_NUMBER: 905 fprintf(output, "NUMBER "); break; 906 case XML_REGEXP_NUMBER_DECIMAL: 907 fprintf(output, "NUMBER_DECIMAL "); break; 908 case XML_REGEXP_NUMBER_LETTER: 909 fprintf(output, "NUMBER_LETTER "); break; 910 case XML_REGEXP_NUMBER_OTHERS: 911 fprintf(output, "NUMBER_OTHERS "); break; 912 case XML_REGEXP_PUNCT: 913 fprintf(output, "PUNCT "); break; 914 case XML_REGEXP_PUNCT_CONNECTOR: 915 fprintf(output, "PUNCT_CONNECTOR "); break; 916 case XML_REGEXP_PUNCT_DASH: 917 fprintf(output, "PUNCT_DASH "); break; 918 case XML_REGEXP_PUNCT_OPEN: 919 fprintf(output, "PUNCT_OPEN "); break; 920 case XML_REGEXP_PUNCT_CLOSE: 921 fprintf(output, "PUNCT_CLOSE "); break; 922 case XML_REGEXP_PUNCT_INITQUOTE: 923 fprintf(output, "PUNCT_INITQUOTE "); break; 924 case XML_REGEXP_PUNCT_FINQUOTE: 925 fprintf(output, "PUNCT_FINQUOTE "); break; 926 case XML_REGEXP_PUNCT_OTHERS: 927 fprintf(output, "PUNCT_OTHERS "); break; 928 case XML_REGEXP_SEPAR: 929 fprintf(output, "SEPAR "); break; 930 case XML_REGEXP_SEPAR_SPACE: 931 fprintf(output, "SEPAR_SPACE "); break; 932 case XML_REGEXP_SEPAR_LINE: 933 fprintf(output, "SEPAR_LINE "); break; 934 case XML_REGEXP_SEPAR_PARA: 935 fprintf(output, "SEPAR_PARA "); break; 936 case XML_REGEXP_SYMBOL: 937 fprintf(output, "SYMBOL "); break; 938 case XML_REGEXP_SYMBOL_MATH: 939 fprintf(output, "SYMBOL_MATH "); break; 940 case XML_REGEXP_SYMBOL_CURRENCY: 941 fprintf(output, "SYMBOL_CURRENCY "); break; 942 case XML_REGEXP_SYMBOL_MODIFIER: 943 fprintf(output, "SYMBOL_MODIFIER "); break; 944 case XML_REGEXP_SYMBOL_OTHERS: 945 fprintf(output, "SYMBOL_OTHERS "); break; 946 case XML_REGEXP_OTHER: 947 fprintf(output, "OTHER "); break; 948 case XML_REGEXP_OTHER_CONTROL: 949 fprintf(output, "OTHER_CONTROL "); break; 950 case XML_REGEXP_OTHER_FORMAT: 951 fprintf(output, "OTHER_FORMAT "); break; 952 case XML_REGEXP_OTHER_PRIVATE: 953 fprintf(output, "OTHER_PRIVATE "); break; 954 case XML_REGEXP_OTHER_NA: 955 fprintf(output, "OTHER_NA "); break; 956 case XML_REGEXP_BLOCK_NAME: 957 fprintf(output, "BLOCK "); break; 958 } 959} 960 961static void 962xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) { 963 switch (type) { 964 case XML_REGEXP_QUANT_EPSILON: 965 fprintf(output, "epsilon "); break; 966 case XML_REGEXP_QUANT_ONCE: 967 fprintf(output, "once "); break; 968 case XML_REGEXP_QUANT_OPT: 969 fprintf(output, "? "); break; 970 case XML_REGEXP_QUANT_MULT: 971 fprintf(output, "* "); break; 972 case XML_REGEXP_QUANT_PLUS: 973 fprintf(output, "+ "); break; 974 case XML_REGEXP_QUANT_RANGE: 975 fprintf(output, "range "); break; 976 case XML_REGEXP_QUANT_ONCEONLY: 977 fprintf(output, "onceonly "); break; 978 case XML_REGEXP_QUANT_ALL: 979 fprintf(output, "all "); break; 980 } 981} 982static void 983xmlRegPrintRange(FILE *output, xmlRegRangePtr range) { 984 fprintf(output, " range: "); 985 if (range->neg) 986 fprintf(output, "negative "); 987 xmlRegPrintAtomType(output, range->type); 988 fprintf(output, "%c - %c\n", range->start, range->end); 989} 990 991static void 992xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) { 993 fprintf(output, " atom: "); 994 if (atom == NULL) { 995 fprintf(output, "NULL\n"); 996 return; 997 } 998 if (atom->neg) 999 fprintf(output, "not "); 1000 xmlRegPrintAtomType(output, atom->type); 1001 xmlRegPrintQuantType(output, atom->quant); 1002 if (atom->quant == XML_REGEXP_QUANT_RANGE) 1003 fprintf(output, "%d-%d ", atom->min, atom->max); 1004 if (atom->type == XML_REGEXP_STRING) 1005 fprintf(output, "'%s' ", (char *) atom->valuep); 1006 if (atom->type == XML_REGEXP_CHARVAL) 1007 fprintf(output, "char %c\n", atom->codepoint); 1008 else if (atom->type == XML_REGEXP_RANGES) { 1009 int i; 1010 fprintf(output, "%d entries\n", atom->nbRanges); 1011 for (i = 0; i < atom->nbRanges;i++) 1012 xmlRegPrintRange(output, atom->ranges[i]); 1013 } else if (atom->type == XML_REGEXP_SUBREG) { 1014 fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no); 1015 } else { 1016 fprintf(output, "\n"); 1017 } 1018} 1019 1020static void 1021xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) { 1022 fprintf(output, " trans: "); 1023 if (trans == NULL) { 1024 fprintf(output, "NULL\n"); 1025 return; 1026 } 1027 if (trans->to < 0) { 1028 fprintf(output, "removed\n"); 1029 return; 1030 } 1031 if (trans->nd != 0) { 1032 if (trans->nd == 2) 1033 fprintf(output, "last not determinist, "); 1034 else 1035 fprintf(output, "not determinist, "); 1036 } 1037 if (trans->counter >= 0) { 1038 fprintf(output, "counted %d, ", trans->counter); 1039 } 1040 if (trans->count == REGEXP_ALL_COUNTER) { 1041 fprintf(output, "all transition, "); 1042 } else if (trans->count >= 0) { 1043 fprintf(output, "count based %d, ", trans->count); 1044 } 1045 if (trans->atom == NULL) { 1046 fprintf(output, "epsilon to %d\n", trans->to); 1047 return; 1048 } 1049 if (trans->atom->type == XML_REGEXP_CHARVAL) 1050 fprintf(output, "char %c ", trans->atom->codepoint); 1051 fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to); 1052} 1053 1054static void 1055xmlRegPrintState(FILE *output, xmlRegStatePtr state) { 1056 int i; 1057 1058 fprintf(output, " state: "); 1059 if (state == NULL) { 1060 fprintf(output, "NULL\n"); 1061 return; 1062 } 1063 if (state->type == XML_REGEXP_START_STATE) 1064 fprintf(output, "START "); 1065 if (state->type == XML_REGEXP_FINAL_STATE) 1066 fprintf(output, "FINAL "); 1067 1068 fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans); 1069 for (i = 0;i < state->nbTrans; i++) { 1070 xmlRegPrintTrans(output, &(state->trans[i])); 1071 } 1072} 1073 1074#ifdef DEBUG_REGEXP_GRAPH 1075static void 1076xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) { 1077 int i; 1078 1079 fprintf(output, " ctxt: "); 1080 if (ctxt == NULL) { 1081 fprintf(output, "NULL\n"); 1082 return; 1083 } 1084 fprintf(output, "'%s' ", ctxt->string); 1085 if (ctxt->error) 1086 fprintf(output, "error "); 1087 if (ctxt->neg) 1088 fprintf(output, "neg "); 1089 fprintf(output, "\n"); 1090 fprintf(output, "%d atoms:\n", ctxt->nbAtoms); 1091 for (i = 0;i < ctxt->nbAtoms; i++) { 1092 fprintf(output, " %02d ", i); 1093 xmlRegPrintAtom(output, ctxt->atoms[i]); 1094 } 1095 if (ctxt->atom != NULL) { 1096 fprintf(output, "current atom:\n"); 1097 xmlRegPrintAtom(output, ctxt->atom); 1098 } 1099 fprintf(output, "%d states:", ctxt->nbStates); 1100 if (ctxt->start != NULL) 1101 fprintf(output, " start: %d", ctxt->start->no); 1102 if (ctxt->end != NULL) 1103 fprintf(output, " end: %d", ctxt->end->no); 1104 fprintf(output, "\n"); 1105 for (i = 0;i < ctxt->nbStates; i++) { 1106 xmlRegPrintState(output, ctxt->states[i]); 1107 } 1108 fprintf(output, "%d counters:\n", ctxt->nbCounters); 1109 for (i = 0;i < ctxt->nbCounters; i++) { 1110 fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min, 1111 ctxt->counters[i].max); 1112 } 1113} 1114#endif 1115 1116/************************************************************************ 1117 * * 1118 * Finite Automata structures manipulations * 1119 * * 1120 ************************************************************************/ 1121 1122static void 1123xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom, 1124 int neg, xmlRegAtomType type, int start, int end, 1125 xmlChar *blockName) { 1126 xmlRegRangePtr range; 1127 1128 if (atom == NULL) { 1129 ERROR("add range: atom is NULL"); 1130 return; 1131 } 1132 if (atom->type != XML_REGEXP_RANGES) { 1133 ERROR("add range: atom is not ranges"); 1134 return; 1135 } 1136 if (atom->maxRanges == 0) { 1137 atom->maxRanges = 4; 1138 atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges * 1139 sizeof(xmlRegRangePtr)); 1140 if (atom->ranges == NULL) { 1141 xmlRegexpErrMemory(ctxt, "adding ranges"); 1142 atom->maxRanges = 0; 1143 return; 1144 } 1145 } else if (atom->nbRanges >= atom->maxRanges) { 1146 xmlRegRangePtr *tmp; 1147 atom->maxRanges *= 2; 1148 tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges * 1149 sizeof(xmlRegRangePtr)); 1150 if (tmp == NULL) { 1151 xmlRegexpErrMemory(ctxt, "adding ranges"); 1152 atom->maxRanges /= 2; 1153 return; 1154 } 1155 atom->ranges = tmp; 1156 } 1157 range = xmlRegNewRange(ctxt, neg, type, start, end); 1158 if (range == NULL) 1159 return; 1160 range->blockName = blockName; 1161 atom->ranges[atom->nbRanges++] = range; 1162 1163} 1164 1165static int 1166xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) { 1167 if (ctxt->maxCounters == 0) { 1168 ctxt->maxCounters = 4; 1169 ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters * 1170 sizeof(xmlRegCounter)); 1171 if (ctxt->counters == NULL) { 1172 xmlRegexpErrMemory(ctxt, "allocating counter"); 1173 ctxt->maxCounters = 0; 1174 return(-1); 1175 } 1176 } else if (ctxt->nbCounters >= ctxt->maxCounters) { 1177 xmlRegCounter *tmp; 1178 ctxt->maxCounters *= 2; 1179 tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters * 1180 sizeof(xmlRegCounter)); 1181 if (tmp == NULL) { 1182 xmlRegexpErrMemory(ctxt, "allocating counter"); 1183 ctxt->maxCounters /= 2; 1184 return(-1); 1185 } 1186 ctxt->counters = tmp; 1187 } 1188 ctxt->counters[ctxt->nbCounters].min = -1; 1189 ctxt->counters[ctxt->nbCounters].max = -1; 1190 return(ctxt->nbCounters++); 1191} 1192 1193static int 1194xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) { 1195 if (atom == NULL) { 1196 ERROR("atom push: atom is NULL"); 1197 return(-1); 1198 } 1199 if (ctxt->maxAtoms == 0) { 1200 ctxt->maxAtoms = 4; 1201 ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms * 1202 sizeof(xmlRegAtomPtr)); 1203 if (ctxt->atoms == NULL) { 1204 xmlRegexpErrMemory(ctxt, "pushing atom"); 1205 ctxt->maxAtoms = 0; 1206 return(-1); 1207 } 1208 } else if (ctxt->nbAtoms >= ctxt->maxAtoms) { 1209 xmlRegAtomPtr *tmp; 1210 ctxt->maxAtoms *= 2; 1211 tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms * 1212 sizeof(xmlRegAtomPtr)); 1213 if (tmp == NULL) { 1214 xmlRegexpErrMemory(ctxt, "allocating counter"); 1215 ctxt->maxAtoms /= 2; 1216 return(-1); 1217 } 1218 ctxt->atoms = tmp; 1219 } 1220 atom->no = ctxt->nbAtoms; 1221 ctxt->atoms[ctxt->nbAtoms++] = atom; 1222 return(0); 1223} 1224 1225static void 1226xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target, 1227 int from) { 1228 if (target->maxTransTo == 0) { 1229 target->maxTransTo = 8; 1230 target->transTo = (int *) xmlMalloc(target->maxTransTo * 1231 sizeof(int)); 1232 if (target->transTo == NULL) { 1233 xmlRegexpErrMemory(ctxt, "adding transition"); 1234 target->maxTransTo = 0; 1235 return; 1236 } 1237 } else if (target->nbTransTo >= target->maxTransTo) { 1238 int *tmp; 1239 target->maxTransTo *= 2; 1240 tmp = (int *) xmlRealloc(target->transTo, target->maxTransTo * 1241 sizeof(int)); 1242 if (tmp == NULL) { 1243 xmlRegexpErrMemory(ctxt, "adding transition"); 1244 target->maxTransTo /= 2; 1245 return; 1246 } 1247 target->transTo = tmp; 1248 } 1249 target->transTo[target->nbTransTo] = from; 1250 target->nbTransTo++; 1251} 1252 1253static void 1254xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, 1255 xmlRegAtomPtr atom, xmlRegStatePtr target, 1256 int counter, int count) { 1257 1258 int nrtrans; 1259 1260 if (state == NULL) { 1261 ERROR("add state: state is NULL"); 1262 return; 1263 } 1264 if (target == NULL) { 1265 ERROR("add state: target is NULL"); 1266 return; 1267 } 1268 /* 1269 * Other routines follow the philosophy 'When in doubt, add a transition' 1270 * so we check here whether such a transition is already present and, if 1271 * so, silently ignore this request. 1272 */ 1273 1274 for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) { 1275 xmlRegTransPtr trans = &(state->trans[nrtrans]); 1276 if ((trans->atom == atom) && 1277 (trans->to == target->no) && 1278 (trans->counter == counter) && 1279 (trans->count == count)) { 1280#ifdef DEBUG_REGEXP_GRAPH 1281 printf("Ignoring duplicate transition from %d to %d\n", 1282 state->no, target->no); 1283#endif 1284 return; 1285 } 1286 } 1287 1288 if (state->maxTrans == 0) { 1289 state->maxTrans = 8; 1290 state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans * 1291 sizeof(xmlRegTrans)); 1292 if (state->trans == NULL) { 1293 xmlRegexpErrMemory(ctxt, "adding transition"); 1294 state->maxTrans = 0; 1295 return; 1296 } 1297 } else if (state->nbTrans >= state->maxTrans) { 1298 xmlRegTrans *tmp; 1299 state->maxTrans *= 2; 1300 tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans * 1301 sizeof(xmlRegTrans)); 1302 if (tmp == NULL) { 1303 xmlRegexpErrMemory(ctxt, "adding transition"); 1304 state->maxTrans /= 2; 1305 return; 1306 } 1307 state->trans = tmp; 1308 } 1309#ifdef DEBUG_REGEXP_GRAPH 1310 printf("Add trans from %d to %d ", state->no, target->no); 1311 if (count == REGEXP_ALL_COUNTER) 1312 printf("all transition\n"); 1313 else if (count >= 0) 1314 printf("count based %d\n", count); 1315 else if (counter >= 0) 1316 printf("counted %d\n", counter); 1317 else if (atom == NULL) 1318 printf("epsilon transition\n"); 1319 else if (atom != NULL) 1320 xmlRegPrintAtom(stdout, atom); 1321#endif 1322 1323 state->trans[state->nbTrans].atom = atom; 1324 state->trans[state->nbTrans].to = target->no; 1325 state->trans[state->nbTrans].counter = counter; 1326 state->trans[state->nbTrans].count = count; 1327 state->trans[state->nbTrans].nd = 0; 1328 state->nbTrans++; 1329 xmlRegStateAddTransTo(ctxt, target, state->no); 1330} 1331 1332static int 1333xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) { 1334 if (state == NULL) return(-1); 1335 if (ctxt->maxStates == 0) { 1336 ctxt->maxStates = 4; 1337 ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates * 1338 sizeof(xmlRegStatePtr)); 1339 if (ctxt->states == NULL) { 1340 xmlRegexpErrMemory(ctxt, "adding state"); 1341 ctxt->maxStates = 0; 1342 return(-1); 1343 } 1344 } else if (ctxt->nbStates >= ctxt->maxStates) { 1345 xmlRegStatePtr *tmp; 1346 ctxt->maxStates *= 2; 1347 tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates * 1348 sizeof(xmlRegStatePtr)); 1349 if (tmp == NULL) { 1350 xmlRegexpErrMemory(ctxt, "adding state"); 1351 ctxt->maxStates /= 2; 1352 return(-1); 1353 } 1354 ctxt->states = tmp; 1355 } 1356 state->no = ctxt->nbStates; 1357 ctxt->states[ctxt->nbStates++] = state; 1358 return(0); 1359} 1360 1361/** 1362 * xmlFAGenerateAllTransition: 1363 * @ctxt: a regexp parser context 1364 * @from: the from state 1365 * @to: the target state or NULL for building a new one 1366 * @lax: 1367 * 1368 */ 1369static void 1370xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt, 1371 xmlRegStatePtr from, xmlRegStatePtr to, 1372 int lax) { 1373 if (to == NULL) { 1374 to = xmlRegNewState(ctxt); 1375 xmlRegStatePush(ctxt, to); 1376 ctxt->state = to; 1377 } 1378 if (lax) 1379 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER); 1380 else 1381 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER); 1382} 1383 1384/** 1385 * xmlFAGenerateEpsilonTransition: 1386 * @ctxt: a regexp parser context 1387 * @from: the from state 1388 * @to: the target state or NULL for building a new one 1389 * 1390 */ 1391static void 1392xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt, 1393 xmlRegStatePtr from, xmlRegStatePtr to) { 1394 if (to == NULL) { 1395 to = xmlRegNewState(ctxt); 1396 xmlRegStatePush(ctxt, to); 1397 ctxt->state = to; 1398 } 1399 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1); 1400} 1401 1402/** 1403 * xmlFAGenerateCountedEpsilonTransition: 1404 * @ctxt: a regexp parser context 1405 * @from: the from state 1406 * @to: the target state or NULL for building a new one 1407 * counter: the counter for that transition 1408 * 1409 */ 1410static void 1411xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt, 1412 xmlRegStatePtr from, xmlRegStatePtr to, int counter) { 1413 if (to == NULL) { 1414 to = xmlRegNewState(ctxt); 1415 xmlRegStatePush(ctxt, to); 1416 ctxt->state = to; 1417 } 1418 xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1); 1419} 1420 1421/** 1422 * xmlFAGenerateCountedTransition: 1423 * @ctxt: a regexp parser context 1424 * @from: the from state 1425 * @to: the target state or NULL for building a new one 1426 * counter: the counter for that transition 1427 * 1428 */ 1429static void 1430xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt, 1431 xmlRegStatePtr from, xmlRegStatePtr to, int counter) { 1432 if (to == NULL) { 1433 to = xmlRegNewState(ctxt); 1434 xmlRegStatePush(ctxt, to); 1435 ctxt->state = to; 1436 } 1437 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter); 1438} 1439 1440/** 1441 * xmlFAGenerateTransitions: 1442 * @ctxt: a regexp parser context 1443 * @from: the from state 1444 * @to: the target state or NULL for building a new one 1445 * @atom: the atom generating the transition 1446 * 1447 * Returns 0 if success and -1 in case of error. 1448 */ 1449static int 1450xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from, 1451 xmlRegStatePtr to, xmlRegAtomPtr atom) { 1452 if (atom == NULL) { 1453 ERROR("genrate transition: atom == NULL"); 1454 return(-1); 1455 } 1456 if (atom->type == XML_REGEXP_SUBREG) { 1457 /* 1458 * this is a subexpression handling one should not need to 1459 * create a new node except for XML_REGEXP_QUANT_RANGE. 1460 */ 1461 if (xmlRegAtomPush(ctxt, atom) < 0) { 1462 return(-1); 1463 } 1464 if ((to != NULL) && (atom->stop != to) && 1465 (atom->quant != XML_REGEXP_QUANT_RANGE)) { 1466 /* 1467 * Generate an epsilon transition to link to the target 1468 */ 1469 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to); 1470#ifdef DV 1471 } else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) && 1472 (atom->quant != XML_REGEXP_QUANT_ONCE)) { 1473 to = xmlRegNewState(ctxt); 1474 xmlRegStatePush(ctxt, to); 1475 ctxt->state = to; 1476 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to); 1477#endif 1478 } 1479 switch (atom->quant) { 1480 case XML_REGEXP_QUANT_OPT: 1481 atom->quant = XML_REGEXP_QUANT_ONCE; 1482 /* 1483 * transition done to the state after end of atom. 1484 * 1. set transition from atom start to new state 1485 * 2. set transition from atom end to this state. 1486 */ 1487 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 0); 1488 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, ctxt->state); 1489 break; 1490 case XML_REGEXP_QUANT_MULT: 1491 atom->quant = XML_REGEXP_QUANT_ONCE; 1492 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop); 1493 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); 1494 break; 1495 case XML_REGEXP_QUANT_PLUS: 1496 atom->quant = XML_REGEXP_QUANT_ONCE; 1497 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start); 1498 break; 1499 case XML_REGEXP_QUANT_RANGE: { 1500 int counter; 1501 xmlRegStatePtr newstate; 1502 1503 /* 1504 * This one is nasty: 1505 * 1/ if range has minOccurs == 0, create a new state 1506 * and create epsilon transitions from atom->start 1507 * to atom->stop, as well as atom->start to the new 1508 * state 1509 * 2/ register a new counter 1510 * 3/ register an epsilon transition associated to 1511 * this counter going from atom->stop to atom->start 1512 * 4/ create a new state 1513 * 5/ generate a counted transition from atom->stop to 1514 * that state 1515 */ 1516 if (atom->min == 0) { 1517 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 1518 atom->stop); 1519 newstate = xmlRegNewState(ctxt); 1520 xmlRegStatePush(ctxt, newstate); 1521 ctxt->state = newstate; 1522 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 1523 newstate); 1524 } 1525 counter = xmlRegGetCounter(ctxt); 1526 ctxt->counters[counter].min = atom->min - 1; 1527 ctxt->counters[counter].max = atom->max - 1; 1528 atom->min = 0; 1529 atom->max = 0; 1530 atom->quant = XML_REGEXP_QUANT_ONCE; 1531 if (to != NULL) { 1532 newstate = to; 1533 } else { 1534 newstate = xmlRegNewState(ctxt); 1535 xmlRegStatePush(ctxt, newstate); 1536 } 1537 ctxt->state = newstate; 1538 xmlFAGenerateCountedTransition(ctxt, atom->stop, 1539 newstate, counter); 1540 1541 /* 1542 * first check count and if OK jump forward; 1543 * if checking fail increment count and jump back 1544 */ 1545 xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop, 1546 atom->start, counter); 1547 } 1548 default: 1549 break; 1550 } 1551 return(0); 1552 } 1553 if ((atom->min == 0) && (atom->max == 0) && 1554 (atom->quant == XML_REGEXP_QUANT_RANGE)) { 1555 /* 1556 * we can discard the atom and generate an epsilon transition instead 1557 */ 1558 if (to == NULL) { 1559 to = xmlRegNewState(ctxt); 1560 if (to != NULL) 1561 xmlRegStatePush(ctxt, to); 1562 else { 1563 return(-1); 1564 } 1565 } 1566 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1567 ctxt->state = to; 1568 xmlRegFreeAtom(atom); 1569 return(0); 1570 } 1571 if (to == NULL) { 1572 to = xmlRegNewState(ctxt); 1573 if (to != NULL) 1574 xmlRegStatePush(ctxt, to); 1575 else { 1576 return(-1); 1577 } 1578 } 1579 if (xmlRegAtomPush(ctxt, atom) < 0) { 1580 return(-1); 1581 } 1582 xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1); 1583 ctxt->state = to; 1584 switch (atom->quant) { 1585 case XML_REGEXP_QUANT_OPT: 1586 atom->quant = XML_REGEXP_QUANT_ONCE; 1587 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1588 break; 1589 case XML_REGEXP_QUANT_MULT: 1590 atom->quant = XML_REGEXP_QUANT_ONCE; 1591 xmlFAGenerateEpsilonTransition(ctxt, from, to); 1592 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); 1593 break; 1594 case XML_REGEXP_QUANT_PLUS: 1595 atom->quant = XML_REGEXP_QUANT_ONCE; 1596 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1); 1597 break; 1598 default: 1599 break; 1600 } 1601 return(0); 1602} 1603 1604/** 1605 * xmlFAReduceEpsilonTransitions: 1606 * @ctxt: a regexp parser context 1607 * @fromnr: the from state 1608 * @tonr: the to state 1609 * @counter: should that transition be associated to a counted 1610 * 1611 */ 1612static void 1613xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr, 1614 int tonr, int counter) { 1615 int transnr; 1616 xmlRegStatePtr from; 1617 xmlRegStatePtr to; 1618 1619#ifdef DEBUG_REGEXP_GRAPH 1620 printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr); 1621#endif 1622 from = ctxt->states[fromnr]; 1623 if (from == NULL) 1624 return; 1625 to = ctxt->states[tonr]; 1626 if (to == NULL) 1627 return; 1628 if ((to->mark == XML_REGEXP_MARK_START) || 1629 (to->mark == XML_REGEXP_MARK_VISITED)) 1630 return; 1631 1632 to->mark = XML_REGEXP_MARK_VISITED; 1633 if (to->type == XML_REGEXP_FINAL_STATE) { 1634#ifdef DEBUG_REGEXP_GRAPH 1635 printf("State %d is final, so %d becomes final\n", tonr, fromnr); 1636#endif 1637 from->type = XML_REGEXP_FINAL_STATE; 1638 } 1639 for (transnr = 0;transnr < to->nbTrans;transnr++) { 1640 if (to->trans[transnr].to < 0) 1641 continue; 1642 if (to->trans[transnr].atom == NULL) { 1643 /* 1644 * Don't remove counted transitions 1645 * Don't loop either 1646 */ 1647 if (to->trans[transnr].to != fromnr) { 1648 if (to->trans[transnr].count >= 0) { 1649 int newto = to->trans[transnr].to; 1650 1651 xmlRegStateAddTrans(ctxt, from, NULL, 1652 ctxt->states[newto], 1653 -1, to->trans[transnr].count); 1654 } else { 1655#ifdef DEBUG_REGEXP_GRAPH 1656 printf("Found epsilon trans %d from %d to %d\n", 1657 transnr, tonr, to->trans[transnr].to); 1658#endif 1659 if (to->trans[transnr].counter >= 0) { 1660 xmlFAReduceEpsilonTransitions(ctxt, fromnr, 1661 to->trans[transnr].to, 1662 to->trans[transnr].counter); 1663 } else { 1664 xmlFAReduceEpsilonTransitions(ctxt, fromnr, 1665 to->trans[transnr].to, 1666 counter); 1667 } 1668 } 1669 } 1670 } else { 1671 int newto = to->trans[transnr].to; 1672 1673 if (to->trans[transnr].counter >= 0) { 1674 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 1675 ctxt->states[newto], 1676 to->trans[transnr].counter, -1); 1677 } else { 1678 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, 1679 ctxt->states[newto], counter, -1); 1680 } 1681 } 1682 } 1683 to->mark = XML_REGEXP_MARK_NORMAL; 1684} 1685 1686/** 1687 * xmlFAEliminateSimpleEpsilonTransitions: 1688 * @ctxt: a regexp parser context 1689 * 1690 * Eliminating general epsilon transitions can get costly in the general 1691 * algorithm due to the large amount of generated new transitions and 1692 * associated comparisons. However for simple epsilon transition used just 1693 * to separate building blocks when generating the automata this can be 1694 * reduced to state elimination: 1695 * - if there exists an epsilon from X to Y 1696 * - if there is no other transition from X 1697 * then X and Y are semantically equivalent and X can be eliminated 1698 * If X is the start state then make Y the start state, else replace the 1699 * target of all transitions to X by transitions to Y. 1700 */ 1701static void 1702xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { 1703 int statenr, i, j, newto; 1704 xmlRegStatePtr state, tmp; 1705 1706 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1707 state = ctxt->states[statenr]; 1708 if (state == NULL) 1709 continue; 1710 if (state->nbTrans != 1) 1711 continue; 1712 /* is the only transition out a basic transition */ 1713 if ((state->trans[0].atom == NULL) && 1714 (state->trans[0].to >= 0) && 1715 (state->trans[0].to != statenr) && 1716 (state->trans[0].counter < 0) && 1717 (state->trans[0].count < 0)) { 1718 newto = state->trans[0].to; 1719 1720 if (state->type == XML_REGEXP_START_STATE) { 1721#ifdef DEBUG_REGEXP_GRAPH 1722 printf("Found simple epsilon trans from start %d to %d\n", 1723 statenr, newto); 1724#endif 1725 } else { 1726#ifdef DEBUG_REGEXP_GRAPH 1727 printf("Found simple epsilon trans from %d to %d\n", 1728 statenr, newto); 1729#endif 1730 for (i = 0;i < state->nbTransTo;i++) { 1731 tmp = ctxt->states[state->transTo[i]]; 1732 for (j = 0;j < tmp->nbTrans;j++) { 1733 if (tmp->trans[j].to == statenr) { 1734 tmp->trans[j].to = newto; 1735#ifdef DEBUG_REGEXP_GRAPH 1736 printf("Changed transition %d on %d to go to %d\n", 1737 j, tmp->no, newto); 1738#endif 1739 xmlRegStateAddTransTo(ctxt, ctxt->states[newto], 1740 tmp->no); 1741 } 1742 } 1743 } 1744#if 0 1745 for (i = 0;i < ctxt->nbStates;i++) { 1746 tmp = ctxt->states[i]; 1747 for (j = 0;j < tmp->nbTrans;j++) { 1748 if (tmp->trans[j].to == statenr) { 1749 tmp->trans[j].to = newto; 1750#ifdef DEBUG_REGEXP_GRAPH 1751 printf("Changed transition %d on %d to go to %d\n", 1752 j, tmp->no, newto); 1753#endif 1754 } 1755 } 1756 } 1757#endif 1758 if (state->type == XML_REGEXP_FINAL_STATE) 1759 ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE; 1760 /* eliminate the transition completely */ 1761 state->nbTrans = 0; 1762 1763 1764 } 1765 1766 } 1767 } 1768} 1769/** 1770 * xmlFAEliminateEpsilonTransitions: 1771 * @ctxt: a regexp parser context 1772 * 1773 */ 1774static void 1775xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { 1776 int statenr, transnr; 1777 xmlRegStatePtr state; 1778 int has_epsilon; 1779 1780 if (ctxt->states == NULL) return; 1781 1782 xmlFAEliminateSimpleEpsilonTransitions(ctxt); 1783 1784 has_epsilon = 0; 1785 1786 /* 1787 * build the completed transitions bypassing the epsilons 1788 * Use a marking algorithm to avoid loops 1789 * mark sink states too. 1790 */ 1791 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1792 state = ctxt->states[statenr]; 1793 if (state == NULL) 1794 continue; 1795 if ((state->nbTrans == 0) && 1796 (state->type != XML_REGEXP_FINAL_STATE)) { 1797 state->type = XML_REGEXP_SINK_STATE; 1798 } 1799 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1800 if ((state->trans[transnr].atom == NULL) && 1801 (state->trans[transnr].to >= 0)) { 1802 if (state->trans[transnr].to == statenr) { 1803 state->trans[transnr].to = -1; 1804#ifdef DEBUG_REGEXP_GRAPH 1805 printf("Removed loopback epsilon trans %d on %d\n", 1806 transnr, statenr); 1807#endif 1808 } else if (state->trans[transnr].count < 0) { 1809 int newto = state->trans[transnr].to; 1810 1811#ifdef DEBUG_REGEXP_GRAPH 1812 printf("Found epsilon trans %d from %d to %d\n", 1813 transnr, statenr, newto); 1814#endif 1815 state->mark = XML_REGEXP_MARK_START; 1816 has_epsilon = 1; 1817 xmlFAReduceEpsilonTransitions(ctxt, statenr, 1818 newto, state->trans[transnr].counter); 1819 state->mark = XML_REGEXP_MARK_NORMAL; 1820#ifdef DEBUG_REGEXP_GRAPH 1821 } else { 1822 printf("Found counted transition %d on %d\n", 1823 transnr, statenr); 1824#endif 1825 } 1826 } 1827 } 1828 } 1829 /* 1830 * Eliminate the epsilon transitions 1831 */ 1832 if (has_epsilon) { 1833 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1834 state = ctxt->states[statenr]; 1835 if (state == NULL) 1836 continue; 1837 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1838 xmlRegTransPtr trans = &(state->trans[transnr]); 1839 if ((trans->atom == NULL) && 1840 (trans->count < 0) && 1841 (trans->to >= 0)) { 1842 trans->to = -1; 1843 } 1844 } 1845 } 1846 } 1847 1848 /* 1849 * Use this pass to detect unreachable states too 1850 */ 1851 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1852 state = ctxt->states[statenr]; 1853 if (state != NULL) 1854 state->reached = XML_REGEXP_MARK_NORMAL; 1855 } 1856 state = ctxt->states[0]; 1857 if (state != NULL) 1858 state->reached = XML_REGEXP_MARK_START; 1859 while (state != NULL) { 1860 xmlRegStatePtr target = NULL; 1861 state->reached = XML_REGEXP_MARK_VISITED; 1862 /* 1863 * Mark all states reachable from the current reachable state 1864 */ 1865 for (transnr = 0;transnr < state->nbTrans;transnr++) { 1866 if ((state->trans[transnr].to >= 0) && 1867 ((state->trans[transnr].atom != NULL) || 1868 (state->trans[transnr].count >= 0))) { 1869 int newto = state->trans[transnr].to; 1870 1871 if (ctxt->states[newto] == NULL) 1872 continue; 1873 if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) { 1874 ctxt->states[newto]->reached = XML_REGEXP_MARK_START; 1875 target = ctxt->states[newto]; 1876 } 1877 } 1878 } 1879 1880 /* 1881 * find the next accessible state not explored 1882 */ 1883 if (target == NULL) { 1884 for (statenr = 1;statenr < ctxt->nbStates;statenr++) { 1885 state = ctxt->states[statenr]; 1886 if ((state != NULL) && (state->reached == 1887 XML_REGEXP_MARK_START)) { 1888 target = state; 1889 break; 1890 } 1891 } 1892 } 1893 state = target; 1894 } 1895 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 1896 state = ctxt->states[statenr]; 1897 if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) { 1898#ifdef DEBUG_REGEXP_GRAPH 1899 printf("Removed unreachable state %d\n", statenr); 1900#endif 1901 xmlRegFreeState(state); 1902 ctxt->states[statenr] = NULL; 1903 } 1904 } 1905 1906} 1907 1908static int 1909xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) { 1910 int ret = 0; 1911 1912 if ((range1->type == XML_REGEXP_RANGES) || 1913 (range2->type == XML_REGEXP_RANGES) || 1914 (range2->type == XML_REGEXP_SUBREG) || 1915 (range1->type == XML_REGEXP_SUBREG) || 1916 (range1->type == XML_REGEXP_STRING) || 1917 (range2->type == XML_REGEXP_STRING)) 1918 return(-1); 1919 1920 /* put them in order */ 1921 if (range1->type > range2->type) { 1922 xmlRegRangePtr tmp; 1923 1924 tmp = range1; 1925 range1 = range2; 1926 range2 = tmp; 1927 } 1928 if ((range1->type == XML_REGEXP_ANYCHAR) || 1929 (range2->type == XML_REGEXP_ANYCHAR)) { 1930 ret = 1; 1931 } else if ((range1->type == XML_REGEXP_EPSILON) || 1932 (range2->type == XML_REGEXP_EPSILON)) { 1933 return(0); 1934 } else if (range1->type == range2->type) { 1935 if ((range1->type != XML_REGEXP_CHARVAL) || 1936 (range1->end < range2->start) || 1937 (range2->end < range1->start)) 1938 ret = 1; 1939 else 1940 ret = 0; 1941 } else if (range1->type == XML_REGEXP_CHARVAL) { 1942 int codepoint; 1943 int neg = 0; 1944 1945 /* 1946 * just check all codepoints in the range for acceptance, 1947 * this is usually way cheaper since done only once at 1948 * compilation than testing over and over at runtime or 1949 * pushing too many states when evaluating. 1950 */ 1951 if (((range1->neg == 0) && (range2->neg != 0)) || 1952 ((range1->neg != 0) && (range2->neg == 0))) 1953 neg = 1; 1954 1955 for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) { 1956 ret = xmlRegCheckCharacterRange(range2->type, codepoint, 1957 0, range2->start, range2->end, 1958 range2->blockName); 1959 if (ret < 0) 1960 return(-1); 1961 if (((neg == 1) && (ret == 0)) || 1962 ((neg == 0) && (ret == 1))) 1963 return(1); 1964 } 1965 return(0); 1966 } else if ((range1->type == XML_REGEXP_BLOCK_NAME) || 1967 (range2->type == XML_REGEXP_BLOCK_NAME)) { 1968 if (range1->type == range2->type) { 1969 ret = xmlStrEqual(range1->blockName, range2->blockName); 1970 } else { 1971 /* 1972 * comparing a block range with anything else is way 1973 * too costly, and maintining the table is like too much 1974 * memory too, so let's force the automata to save state 1975 * here. 1976 */ 1977 return(1); 1978 } 1979 } else if ((range1->type < XML_REGEXP_LETTER) || 1980 (range2->type < XML_REGEXP_LETTER)) { 1981 if ((range1->type == XML_REGEXP_ANYSPACE) && 1982 (range2->type == XML_REGEXP_NOTSPACE)) 1983 ret = 0; 1984 else if ((range1->type == XML_REGEXP_INITNAME) && 1985 (range2->type == XML_REGEXP_NOTINITNAME)) 1986 ret = 0; 1987 else if ((range1->type == XML_REGEXP_NAMECHAR) && 1988 (range2->type == XML_REGEXP_NOTNAMECHAR)) 1989 ret = 0; 1990 else if ((range1->type == XML_REGEXP_DECIMAL) && 1991 (range2->type == XML_REGEXP_NOTDECIMAL)) 1992 ret = 0; 1993 else if ((range1->type == XML_REGEXP_REALCHAR) && 1994 (range2->type == XML_REGEXP_NOTREALCHAR)) 1995 ret = 0; 1996 else { 1997 /* same thing to limit complexity */ 1998 return(1); 1999 } 2000 } else { 2001 ret = 0; 2002 /* range1->type < range2->type here */ 2003 switch (range1->type) { 2004 case XML_REGEXP_LETTER: 2005 /* all disjoint except in the subgroups */ 2006 if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) || 2007 (range2->type == XML_REGEXP_LETTER_LOWERCASE) || 2008 (range2->type == XML_REGEXP_LETTER_TITLECASE) || 2009 (range2->type == XML_REGEXP_LETTER_MODIFIER) || 2010 (range2->type == XML_REGEXP_LETTER_OTHERS)) 2011 ret = 1; 2012 break; 2013 case XML_REGEXP_MARK: 2014 if ((range2->type == XML_REGEXP_MARK_NONSPACING) || 2015 (range2->type == XML_REGEXP_MARK_SPACECOMBINING) || 2016 (range2->type == XML_REGEXP_MARK_ENCLOSING)) 2017 ret = 1; 2018 break; 2019 case XML_REGEXP_NUMBER: 2020 if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) || 2021 (range2->type == XML_REGEXP_NUMBER_LETTER) || 2022 (range2->type == XML_REGEXP_NUMBER_OTHERS)) 2023 ret = 1; 2024 break; 2025 case XML_REGEXP_PUNCT: 2026 if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) || 2027 (range2->type == XML_REGEXP_PUNCT_DASH) || 2028 (range2->type == XML_REGEXP_PUNCT_OPEN) || 2029 (range2->type == XML_REGEXP_PUNCT_CLOSE) || 2030 (range2->type == XML_REGEXP_PUNCT_INITQUOTE) || 2031 (range2->type == XML_REGEXP_PUNCT_FINQUOTE) || 2032 (range2->type == XML_REGEXP_PUNCT_OTHERS)) 2033 ret = 1; 2034 break; 2035 case XML_REGEXP_SEPAR: 2036 if ((range2->type == XML_REGEXP_SEPAR_SPACE) || 2037 (range2->type == XML_REGEXP_SEPAR_LINE) || 2038 (range2->type == XML_REGEXP_SEPAR_PARA)) 2039 ret = 1; 2040 break; 2041 case XML_REGEXP_SYMBOL: 2042 if ((range2->type == XML_REGEXP_SYMBOL_MATH) || 2043 (range2->type == XML_REGEXP_SYMBOL_CURRENCY) || 2044 (range2->type == XML_REGEXP_SYMBOL_MODIFIER) || 2045 (range2->type == XML_REGEXP_SYMBOL_OTHERS)) 2046 ret = 1; 2047 break; 2048 case XML_REGEXP_OTHER: 2049 if ((range2->type == XML_REGEXP_OTHER_CONTROL) || 2050 (range2->type == XML_REGEXP_OTHER_FORMAT) || 2051 (range2->type == XML_REGEXP_OTHER_PRIVATE)) 2052 ret = 1; 2053 break; 2054 default: 2055 if ((range2->type >= XML_REGEXP_LETTER) && 2056 (range2->type < XML_REGEXP_BLOCK_NAME)) 2057 ret = 0; 2058 else { 2059 /* safety net ! */ 2060 return(1); 2061 } 2062 } 2063 } 2064 if (((range1->neg == 0) && (range2->neg != 0)) || 2065 ((range1->neg != 0) && (range2->neg == 0))) 2066 ret = !ret; 2067 return(1); 2068} 2069 2070/** 2071 * xmlFACompareAtomTypes: 2072 * @type1: an atom type 2073 * @type2: an atom type 2074 * 2075 * Compares two atoms type to check whether they intersect in some ways, 2076 * this is used by xmlFACompareAtoms only 2077 * 2078 * Returns 1 if they may intersect and 0 otherwise 2079 */ 2080static int 2081xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) { 2082 if ((type1 == XML_REGEXP_EPSILON) || 2083 (type1 == XML_REGEXP_CHARVAL) || 2084 (type1 == XML_REGEXP_RANGES) || 2085 (type1 == XML_REGEXP_SUBREG) || 2086 (type1 == XML_REGEXP_STRING) || 2087 (type1 == XML_REGEXP_ANYCHAR)) 2088 return(1); 2089 if ((type2 == XML_REGEXP_EPSILON) || 2090 (type2 == XML_REGEXP_CHARVAL) || 2091 (type2 == XML_REGEXP_RANGES) || 2092 (type2 == XML_REGEXP_SUBREG) || 2093 (type2 == XML_REGEXP_STRING) || 2094 (type2 == XML_REGEXP_ANYCHAR)) 2095 return(1); 2096 2097 if (type1 == type2) return(1); 2098 2099 /* simplify subsequent compares by making sure type1 < type2 */ 2100 if (type1 > type2) { 2101 xmlRegAtomType tmp = type1; 2102 type1 = type2; 2103 type2 = tmp; 2104 } 2105 switch (type1) { 2106 case XML_REGEXP_ANYSPACE: /* \s */ 2107 /* can't be a letter, number, mark, pontuation, symbol */ 2108 if ((type2 == XML_REGEXP_NOTSPACE) || 2109 ((type2 >= XML_REGEXP_LETTER) && 2110 (type2 <= XML_REGEXP_LETTER_OTHERS)) || 2111 ((type2 >= XML_REGEXP_NUMBER) && 2112 (type2 <= XML_REGEXP_NUMBER_OTHERS)) || 2113 ((type2 >= XML_REGEXP_MARK) && 2114 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2115 ((type2 >= XML_REGEXP_PUNCT) && 2116 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2117 ((type2 >= XML_REGEXP_SYMBOL) && 2118 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) 2119 ) return(0); 2120 break; 2121 case XML_REGEXP_NOTSPACE: /* \S */ 2122 break; 2123 case XML_REGEXP_INITNAME: /* \l */ 2124 /* can't be a number, mark, separator, pontuation, symbol or other */ 2125 if ((type2 == XML_REGEXP_NOTINITNAME) || 2126 ((type2 >= XML_REGEXP_NUMBER) && 2127 (type2 <= XML_REGEXP_NUMBER_OTHERS)) || 2128 ((type2 >= XML_REGEXP_MARK) && 2129 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2130 ((type2 >= XML_REGEXP_SEPAR) && 2131 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2132 ((type2 >= XML_REGEXP_PUNCT) && 2133 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2134 ((type2 >= XML_REGEXP_SYMBOL) && 2135 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2136 ((type2 >= XML_REGEXP_OTHER) && 2137 (type2 <= XML_REGEXP_OTHER_NA)) 2138 ) return(0); 2139 break; 2140 case XML_REGEXP_NOTINITNAME: /* \L */ 2141 break; 2142 case XML_REGEXP_NAMECHAR: /* \c */ 2143 /* can't be a mark, separator, pontuation, symbol or other */ 2144 if ((type2 == XML_REGEXP_NOTNAMECHAR) || 2145 ((type2 >= XML_REGEXP_MARK) && 2146 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2147 ((type2 >= XML_REGEXP_PUNCT) && 2148 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2149 ((type2 >= XML_REGEXP_SEPAR) && 2150 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2151 ((type2 >= XML_REGEXP_SYMBOL) && 2152 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2153 ((type2 >= XML_REGEXP_OTHER) && 2154 (type2 <= XML_REGEXP_OTHER_NA)) 2155 ) return(0); 2156 break; 2157 case XML_REGEXP_NOTNAMECHAR: /* \C */ 2158 break; 2159 case XML_REGEXP_DECIMAL: /* \d */ 2160 /* can't be a letter, mark, separator, pontuation, symbol or other */ 2161 if ((type2 == XML_REGEXP_NOTDECIMAL) || 2162 (type2 == XML_REGEXP_REALCHAR) || 2163 ((type2 >= XML_REGEXP_LETTER) && 2164 (type2 <= XML_REGEXP_LETTER_OTHERS)) || 2165 ((type2 >= XML_REGEXP_MARK) && 2166 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2167 ((type2 >= XML_REGEXP_PUNCT) && 2168 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2169 ((type2 >= XML_REGEXP_SEPAR) && 2170 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2171 ((type2 >= XML_REGEXP_SYMBOL) && 2172 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2173 ((type2 >= XML_REGEXP_OTHER) && 2174 (type2 <= XML_REGEXP_OTHER_NA)) 2175 )return(0); 2176 break; 2177 case XML_REGEXP_NOTDECIMAL: /* \D */ 2178 break; 2179 case XML_REGEXP_REALCHAR: /* \w */ 2180 /* can't be a mark, separator, pontuation, symbol or other */ 2181 if ((type2 == XML_REGEXP_NOTDECIMAL) || 2182 ((type2 >= XML_REGEXP_MARK) && 2183 (type2 <= XML_REGEXP_MARK_ENCLOSING)) || 2184 ((type2 >= XML_REGEXP_PUNCT) && 2185 (type2 <= XML_REGEXP_PUNCT_OTHERS)) || 2186 ((type2 >= XML_REGEXP_SEPAR) && 2187 (type2 <= XML_REGEXP_SEPAR_PARA)) || 2188 ((type2 >= XML_REGEXP_SYMBOL) && 2189 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || 2190 ((type2 >= XML_REGEXP_OTHER) && 2191 (type2 <= XML_REGEXP_OTHER_NA)) 2192 )return(0); 2193 break; 2194 case XML_REGEXP_NOTREALCHAR: /* \W */ 2195 break; 2196 /* 2197 * at that point we know both type 1 and type2 are from 2198 * character categories are ordered and are different, 2199 * it becomes simple because this is a partition 2200 */ 2201 case XML_REGEXP_LETTER: 2202 if (type2 <= XML_REGEXP_LETTER_OTHERS) 2203 return(1); 2204 return(0); 2205 case XML_REGEXP_LETTER_UPPERCASE: 2206 case XML_REGEXP_LETTER_LOWERCASE: 2207 case XML_REGEXP_LETTER_TITLECASE: 2208 case XML_REGEXP_LETTER_MODIFIER: 2209 case XML_REGEXP_LETTER_OTHERS: 2210 return(0); 2211 case XML_REGEXP_MARK: 2212 if (type2 <= XML_REGEXP_MARK_ENCLOSING) 2213 return(1); 2214 return(0); 2215 case XML_REGEXP_MARK_NONSPACING: 2216 case XML_REGEXP_MARK_SPACECOMBINING: 2217 case XML_REGEXP_MARK_ENCLOSING: 2218 return(0); 2219 case XML_REGEXP_NUMBER: 2220 if (type2 <= XML_REGEXP_NUMBER_OTHERS) 2221 return(1); 2222 return(0); 2223 case XML_REGEXP_NUMBER_DECIMAL: 2224 case XML_REGEXP_NUMBER_LETTER: 2225 case XML_REGEXP_NUMBER_OTHERS: 2226 return(0); 2227 case XML_REGEXP_PUNCT: 2228 if (type2 <= XML_REGEXP_PUNCT_OTHERS) 2229 return(1); 2230 return(0); 2231 case XML_REGEXP_PUNCT_CONNECTOR: 2232 case XML_REGEXP_PUNCT_DASH: 2233 case XML_REGEXP_PUNCT_OPEN: 2234 case XML_REGEXP_PUNCT_CLOSE: 2235 case XML_REGEXP_PUNCT_INITQUOTE: 2236 case XML_REGEXP_PUNCT_FINQUOTE: 2237 case XML_REGEXP_PUNCT_OTHERS: 2238 return(0); 2239 case XML_REGEXP_SEPAR: 2240 if (type2 <= XML_REGEXP_SEPAR_PARA) 2241 return(1); 2242 return(0); 2243 case XML_REGEXP_SEPAR_SPACE: 2244 case XML_REGEXP_SEPAR_LINE: 2245 case XML_REGEXP_SEPAR_PARA: 2246 return(0); 2247 case XML_REGEXP_SYMBOL: 2248 if (type2 <= XML_REGEXP_SYMBOL_OTHERS) 2249 return(1); 2250 return(0); 2251 case XML_REGEXP_SYMBOL_MATH: 2252 case XML_REGEXP_SYMBOL_CURRENCY: 2253 case XML_REGEXP_SYMBOL_MODIFIER: 2254 case XML_REGEXP_SYMBOL_OTHERS: 2255 return(0); 2256 case XML_REGEXP_OTHER: 2257 if (type2 <= XML_REGEXP_OTHER_NA) 2258 return(1); 2259 return(0); 2260 case XML_REGEXP_OTHER_CONTROL: 2261 case XML_REGEXP_OTHER_FORMAT: 2262 case XML_REGEXP_OTHER_PRIVATE: 2263 case XML_REGEXP_OTHER_NA: 2264 return(0); 2265 default: 2266 break; 2267 } 2268 return(1); 2269} 2270 2271/** 2272 * xmlFAEqualAtoms: 2273 * @atom1: an atom 2274 * @atom2: an atom 2275 * 2276 * Compares two atoms to check whether they are the same exactly 2277 * this is used to remove equivalent transitions 2278 * 2279 * Returns 1 if same and 0 otherwise 2280 */ 2281static int 2282xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) { 2283 int ret = 0; 2284 2285 if (atom1 == atom2) 2286 return(1); 2287 if ((atom1 == NULL) || (atom2 == NULL)) 2288 return(0); 2289 2290 if (atom1->type != atom2->type) 2291 return(0); 2292 switch (atom1->type) { 2293 case XML_REGEXP_EPSILON: 2294 ret = 0; 2295 break; 2296 case XML_REGEXP_STRING: 2297 ret = xmlStrEqual((xmlChar *)atom1->valuep, 2298 (xmlChar *)atom2->valuep); 2299 break; 2300 case XML_REGEXP_CHARVAL: 2301 ret = (atom1->codepoint == atom2->codepoint); 2302 break; 2303 case XML_REGEXP_RANGES: 2304 /* too hard to do in the general case */ 2305 ret = 0; 2306 default: 2307 break; 2308 } 2309 return(ret); 2310} 2311 2312/** 2313 * xmlFACompareAtoms: 2314 * @atom1: an atom 2315 * @atom2: an atom 2316 * 2317 * Compares two atoms to check whether they intersect in some ways, 2318 * this is used by xmlFAComputesDeterminism and xmlFARecurseDeterminism only 2319 * 2320 * Returns 1 if yes and 0 otherwise 2321 */ 2322static int 2323xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) { 2324 int ret = 1; 2325 2326 if (atom1 == atom2) 2327 return(1); 2328 if ((atom1 == NULL) || (atom2 == NULL)) 2329 return(0); 2330 2331 if ((atom1->type == XML_REGEXP_ANYCHAR) || 2332 (atom2->type == XML_REGEXP_ANYCHAR)) 2333 return(1); 2334 2335 if (atom1->type > atom2->type) { 2336 xmlRegAtomPtr tmp; 2337 tmp = atom1; 2338 atom1 = atom2; 2339 atom2 = tmp; 2340 } 2341 if (atom1->type != atom2->type) { 2342 ret = xmlFACompareAtomTypes(atom1->type, atom2->type); 2343 /* if they can't intersect at the type level break now */ 2344 if (ret == 0) 2345 return(0); 2346 } 2347 switch (atom1->type) { 2348 case XML_REGEXP_STRING: 2349 ret = xmlRegStrEqualWildcard((xmlChar *)atom1->valuep, 2350 (xmlChar *)atom2->valuep); 2351 break; 2352 case XML_REGEXP_EPSILON: 2353 goto not_determinist; 2354 case XML_REGEXP_CHARVAL: 2355 if (atom2->type == XML_REGEXP_CHARVAL) { 2356 ret = (atom1->codepoint == atom2->codepoint); 2357 } else { 2358 ret = xmlRegCheckCharacter(atom2, atom1->codepoint); 2359 if (ret < 0) 2360 ret = 1; 2361 } 2362 break; 2363 case XML_REGEXP_RANGES: 2364 if (atom2->type == XML_REGEXP_RANGES) { 2365 int i, j, res; 2366 xmlRegRangePtr r1, r2; 2367 2368 /* 2369 * need to check that none of the ranges eventually matches 2370 */ 2371 for (i = 0;i < atom1->nbRanges;i++) { 2372 for (j = 0;j < atom2->nbRanges;j++) { 2373 r1 = atom1->ranges[i]; 2374 r2 = atom2->ranges[j]; 2375 res = xmlFACompareRanges(r1, r2); 2376 if (res == 1) { 2377 ret = 1; 2378 goto done; 2379 } 2380 } 2381 } 2382 ret = 0; 2383 } 2384 break; 2385 default: 2386 goto not_determinist; 2387 } 2388done: 2389 if (atom1->neg != atom2->neg) { 2390 ret = !ret; 2391 } 2392 if (ret == 0) 2393 return(0); 2394not_determinist: 2395 return(1); 2396} 2397 2398/** 2399 * xmlFARecurseDeterminism: 2400 * @ctxt: a regexp parser context 2401 * 2402 * Check whether the associated regexp is determinist, 2403 * should be called after xmlFAEliminateEpsilonTransitions() 2404 * 2405 */ 2406static int 2407xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state, 2408 int to, xmlRegAtomPtr atom) { 2409 int ret = 1; 2410 int res; 2411 int transnr, nbTrans; 2412 xmlRegTransPtr t1; 2413 2414 if (state == NULL) 2415 return(ret); 2416 /* 2417 * don't recurse on transitions potentially added in the course of 2418 * the elimination. 2419 */ 2420 nbTrans = state->nbTrans; 2421 for (transnr = 0;transnr < nbTrans;transnr++) { 2422 t1 = &(state->trans[transnr]); 2423 /* 2424 * check transitions conflicting with the one looked at 2425 */ 2426 if (t1->atom == NULL) { 2427 if (t1->to == -1) 2428 continue; 2429 res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], 2430 to, atom); 2431 if (res == 0) { 2432 ret = 0; 2433 /* t1->nd = 1; */ 2434 } 2435 continue; 2436 } 2437 if (t1->to != to) 2438 continue; 2439 if (xmlFACompareAtoms(t1->atom, atom)) { 2440 ret = 0; 2441 /* mark the transition as non-deterministic */ 2442 t1->nd = 1; 2443 } 2444 } 2445 return(ret); 2446} 2447 2448/** 2449 * xmlFAComputesDeterminism: 2450 * @ctxt: a regexp parser context 2451 * 2452 * Check whether the associated regexp is determinist, 2453 * should be called after xmlFAEliminateEpsilonTransitions() 2454 * 2455 */ 2456static int 2457xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) { 2458 int statenr, transnr; 2459 xmlRegStatePtr state; 2460 xmlRegTransPtr t1, t2, last; 2461 int i; 2462 int ret = 1; 2463 2464#ifdef DEBUG_REGEXP_GRAPH 2465 printf("xmlFAComputesDeterminism\n"); 2466 xmlRegPrintCtxt(stdout, ctxt); 2467#endif 2468 if (ctxt->determinist != -1) 2469 return(ctxt->determinist); 2470 2471 /* 2472 * First cleanup the automata removing cancelled transitions 2473 */ 2474 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2475 state = ctxt->states[statenr]; 2476 if (state == NULL) 2477 continue; 2478 if (state->nbTrans < 2) 2479 continue; 2480 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2481 t1 = &(state->trans[transnr]); 2482 /* 2483 * Determinism checks in case of counted or all transitions 2484 * will have to be handled separately 2485 */ 2486 if (t1->atom == NULL) { 2487 /* t1->nd = 1; */ 2488 continue; 2489 } 2490 if (t1->to == -1) /* eliminated */ 2491 continue; 2492 for (i = 0;i < transnr;i++) { 2493 t2 = &(state->trans[i]); 2494 if (t2->to == -1) /* eliminated */ 2495 continue; 2496 if (t2->atom != NULL) { 2497 if (t1->to == t2->to) { 2498 if (xmlFAEqualAtoms(t1->atom, t2->atom)) 2499 t2->to = -1; /* eliminated */ 2500 } 2501 } 2502 } 2503 } 2504 } 2505 2506 /* 2507 * Check for all states that there aren't 2 transitions 2508 * with the same atom and a different target. 2509 */ 2510 for (statenr = 0;statenr < ctxt->nbStates;statenr++) { 2511 state = ctxt->states[statenr]; 2512 if (state == NULL) 2513 continue; 2514 if (state->nbTrans < 2) 2515 continue; 2516 last = NULL; 2517 for (transnr = 0;transnr < state->nbTrans;transnr++) { 2518 t1 = &(state->trans[transnr]); 2519 /* 2520 * Determinism checks in case of counted or all transitions 2521 * will have to be handled separately 2522 */ 2523 if (t1->atom == NULL) { 2524 continue; 2525 } 2526 if (t1->to == -1) /* eliminated */ 2527 continue; 2528 for (i = 0;i < transnr;i++) { 2529 t2 = &(state->trans[i]); 2530 if (t2->to == -1) /* eliminated */ 2531 continue; 2532 if (t2->atom != NULL) { 2533 /* not determinist ! */ 2534 if (xmlFACompareAtoms(t1->atom, t2->atom)) { 2535 ret = 0; 2536 /* mark the transitions as non-deterministic ones */ 2537 t1->nd = 1; 2538 t2->nd = 1; 2539 last = t1; 2540 } 2541 } else if (t1->to != -1) { 2542 /* 2543 * do the closure in case of remaining specific 2544 * epsilon transitions like choices or all 2545 */ 2546 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to], 2547 t2->to, t2->atom); 2548 /* don't shortcut the computation so all non deterministic 2549 transition get marked down 2550 if (ret == 0) 2551 return(0); 2552 */ 2553 if (ret == 0) { 2554 t1->nd = 1; 2555 /* t2->nd = 1; */ 2556 last = t1; 2557 } 2558 } 2559 } 2560 /* don't shortcut the computation so all non deterministic 2561 transition get marked down 2562 if (ret == 0) 2563 break; */ 2564 } 2565 2566 /* 2567 * mark specifically the last non-deterministic transition 2568 * from a state since there is no need to set-up rollback 2569 * from it 2570 */ 2571 if (last != NULL) { 2572 last->nd = 2; 2573 } 2574 2575 /* don't shortcut the computation so all non deterministic 2576 transition get marked down 2577 if (ret == 0) 2578 break; */ 2579 } 2580 2581 ctxt->determinist = ret; 2582 return(ret); 2583} 2584 2585/************************************************************************ 2586 * * 2587 * Routines to check input against transition atoms * 2588 * * 2589 ************************************************************************/ 2590 2591static int 2592xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg, 2593 int start, int end, const xmlChar *blockName) { 2594 int ret = 0; 2595 2596 switch (type) { 2597 case XML_REGEXP_STRING: 2598 case XML_REGEXP_SUBREG: 2599 case XML_REGEXP_RANGES: 2600 case XML_REGEXP_EPSILON: 2601 return(-1); 2602 case XML_REGEXP_ANYCHAR: 2603 ret = ((codepoint != '\n') && (codepoint != '\r')); 2604 break; 2605 case XML_REGEXP_CHARVAL: 2606 ret = ((codepoint >= start) && (codepoint <= end)); 2607 break; 2608 case XML_REGEXP_NOTSPACE: 2609 neg = !neg; 2610 case XML_REGEXP_ANYSPACE: 2611 ret = ((codepoint == '\n') || (codepoint == '\r') || 2612 (codepoint == '\t') || (codepoint == ' ')); 2613 break; 2614 case XML_REGEXP_NOTINITNAME: 2615 neg = !neg; 2616 case XML_REGEXP_INITNAME: 2617 ret = (IS_LETTER(codepoint) || 2618 (codepoint == '_') || (codepoint == ':')); 2619 break; 2620 case XML_REGEXP_NOTNAMECHAR: 2621 neg = !neg; 2622 case XML_REGEXP_NAMECHAR: 2623 ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) || 2624 (codepoint == '.') || (codepoint == '-') || 2625 (codepoint == '_') || (codepoint == ':') || 2626 IS_COMBINING(codepoint) || IS_EXTENDER(codepoint)); 2627 break; 2628 case XML_REGEXP_NOTDECIMAL: 2629 neg = !neg; 2630 case XML_REGEXP_DECIMAL: 2631 ret = xmlUCSIsCatNd(codepoint); 2632 break; 2633 case XML_REGEXP_REALCHAR: 2634 neg = !neg; 2635 case XML_REGEXP_NOTREALCHAR: 2636 ret = xmlUCSIsCatP(codepoint); 2637 if (ret == 0) 2638 ret = xmlUCSIsCatZ(codepoint); 2639 if (ret == 0) 2640 ret = xmlUCSIsCatC(codepoint); 2641 break; 2642 case XML_REGEXP_LETTER: 2643 ret = xmlUCSIsCatL(codepoint); 2644 break; 2645 case XML_REGEXP_LETTER_UPPERCASE: 2646 ret = xmlUCSIsCatLu(codepoint); 2647 break; 2648 case XML_REGEXP_LETTER_LOWERCASE: 2649 ret = xmlUCSIsCatLl(codepoint); 2650 break; 2651 case XML_REGEXP_LETTER_TITLECASE: 2652 ret = xmlUCSIsCatLt(codepoint); 2653 break; 2654 case XML_REGEXP_LETTER_MODIFIER: 2655 ret = xmlUCSIsCatLm(codepoint); 2656 break; 2657 case XML_REGEXP_LETTER_OTHERS: 2658 ret = xmlUCSIsCatLo(codepoint); 2659 break; 2660 case XML_REGEXP_MARK: 2661 ret = xmlUCSIsCatM(codepoint); 2662 break; 2663 case XML_REGEXP_MARK_NONSPACING: 2664 ret = xmlUCSIsCatMn(codepoint); 2665 break; 2666 case XML_REGEXP_MARK_SPACECOMBINING: 2667 ret = xmlUCSIsCatMc(codepoint); 2668 break; 2669 case XML_REGEXP_MARK_ENCLOSING: 2670 ret = xmlUCSIsCatMe(codepoint); 2671 break; 2672 case XML_REGEXP_NUMBER: 2673 ret = xmlUCSIsCatN(codepoint); 2674 break; 2675 case XML_REGEXP_NUMBER_DECIMAL: 2676 ret = xmlUCSIsCatNd(codepoint); 2677 break; 2678 case XML_REGEXP_NUMBER_LETTER: 2679 ret = xmlUCSIsCatNl(codepoint); 2680 break; 2681 case XML_REGEXP_NUMBER_OTHERS: 2682 ret = xmlUCSIsCatNo(codepoint); 2683 break; 2684 case XML_REGEXP_PUNCT: 2685 ret = xmlUCSIsCatP(codepoint); 2686 break; 2687 case XML_REGEXP_PUNCT_CONNECTOR: 2688 ret = xmlUCSIsCatPc(codepoint); 2689 break; 2690 case XML_REGEXP_PUNCT_DASH: 2691 ret = xmlUCSIsCatPd(codepoint); 2692 break; 2693 case XML_REGEXP_PUNCT_OPEN: 2694 ret = xmlUCSIsCatPs(codepoint); 2695 break; 2696 case XML_REGEXP_PUNCT_CLOSE: 2697 ret = xmlUCSIsCatPe(codepoint); 2698 break; 2699 case XML_REGEXP_PUNCT_INITQUOTE: 2700 ret = xmlUCSIsCatPi(codepoint); 2701 break; 2702 case XML_REGEXP_PUNCT_FINQUOTE: 2703 ret = xmlUCSIsCatPf(codepoint); 2704 break; 2705 case XML_REGEXP_PUNCT_OTHERS: 2706 ret = xmlUCSIsCatPo(codepoint); 2707 break; 2708 case XML_REGEXP_SEPAR: 2709 ret = xmlUCSIsCatZ(codepoint); 2710 break; 2711 case XML_REGEXP_SEPAR_SPACE: 2712 ret = xmlUCSIsCatZs(codepoint); 2713 break; 2714 case XML_REGEXP_SEPAR_LINE: 2715 ret = xmlUCSIsCatZl(codepoint); 2716 break; 2717 case XML_REGEXP_SEPAR_PARA: 2718 ret = xmlUCSIsCatZp(codepoint); 2719 break; 2720 case XML_REGEXP_SYMBOL: 2721 ret = xmlUCSIsCatS(codepoint); 2722 break; 2723 case XML_REGEXP_SYMBOL_MATH: 2724 ret = xmlUCSIsCatSm(codepoint); 2725 break; 2726 case XML_REGEXP_SYMBOL_CURRENCY: 2727 ret = xmlUCSIsCatSc(codepoint); 2728 break; 2729 case XML_REGEXP_SYMBOL_MODIFIER: 2730 ret = xmlUCSIsCatSk(codepoint); 2731 break; 2732 case XML_REGEXP_SYMBOL_OTHERS: 2733 ret = xmlUCSIsCatSo(codepoint); 2734 break; 2735 case XML_REGEXP_OTHER: 2736 ret = xmlUCSIsCatC(codepoint); 2737 break; 2738 case XML_REGEXP_OTHER_CONTROL: 2739 ret = xmlUCSIsCatCc(codepoint); 2740 break; 2741 case XML_REGEXP_OTHER_FORMAT: 2742 ret = xmlUCSIsCatCf(codepoint); 2743 break; 2744 case XML_REGEXP_OTHER_PRIVATE: 2745 ret = xmlUCSIsCatCo(codepoint); 2746 break; 2747 case XML_REGEXP_OTHER_NA: 2748 /* ret = xmlUCSIsCatCn(codepoint); */ 2749 /* Seems it doesn't exist anymore in recent Unicode releases */ 2750 ret = 0; 2751 break; 2752 case XML_REGEXP_BLOCK_NAME: 2753 ret = xmlUCSIsBlock(codepoint, (const char *) blockName); 2754 break; 2755 } 2756 if (neg) 2757 return(!ret); 2758 return(ret); 2759} 2760 2761static int 2762xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) { 2763 int i, ret = 0; 2764 xmlRegRangePtr range; 2765 2766 if ((atom == NULL) || (!IS_CHAR(codepoint))) 2767 return(-1); 2768 2769 switch (atom->type) { 2770 case XML_REGEXP_SUBREG: 2771 case XML_REGEXP_EPSILON: 2772 return(-1); 2773 case XML_REGEXP_CHARVAL: 2774 return(codepoint == atom->codepoint); 2775 case XML_REGEXP_RANGES: { 2776 int accept = 0; 2777 2778 for (i = 0;i < atom->nbRanges;i++) { 2779 range = atom->ranges[i]; 2780 if (range->neg == 2) { 2781 ret = xmlRegCheckCharacterRange(range->type, codepoint, 2782 0, range->start, range->end, 2783 range->blockName); 2784 if (ret != 0) 2785 return(0); /* excluded char */ 2786 } else if (range->neg) { 2787 ret = xmlRegCheckCharacterRange(range->type, codepoint, 2788 0, range->start, range->end, 2789 range->blockName); 2790 if (ret == 0) 2791 accept = 1; 2792 else 2793 return(0); 2794 } else { 2795 ret = xmlRegCheckCharacterRange(range->type, codepoint, 2796 0, range->start, range->end, 2797 range->blockName); 2798 if (ret != 0) 2799 accept = 1; /* might still be excluded */ 2800 } 2801 } 2802 return(accept); 2803 } 2804 case XML_REGEXP_STRING: 2805 printf("TODO: XML_REGEXP_STRING\n"); 2806 return(-1); 2807 case XML_REGEXP_ANYCHAR: 2808 case XML_REGEXP_ANYSPACE: 2809 case XML_REGEXP_NOTSPACE: 2810 case XML_REGEXP_INITNAME: 2811 case XML_REGEXP_NOTINITNAME: 2812 case XML_REGEXP_NAMECHAR: 2813 case XML_REGEXP_NOTNAMECHAR: 2814 case XML_REGEXP_DECIMAL: 2815 case XML_REGEXP_NOTDECIMAL: 2816 case XML_REGEXP_REALCHAR: 2817 case XML_REGEXP_NOTREALCHAR: 2818 case XML_REGEXP_LETTER: 2819 case XML_REGEXP_LETTER_UPPERCASE: 2820 case XML_REGEXP_LETTER_LOWERCASE: 2821 case XML_REGEXP_LETTER_TITLECASE: 2822 case XML_REGEXP_LETTER_MODIFIER: 2823 case XML_REGEXP_LETTER_OTHERS: 2824 case XML_REGEXP_MARK: 2825 case XML_REGEXP_MARK_NONSPACING: 2826 case XML_REGEXP_MARK_SPACECOMBINING: 2827 case XML_REGEXP_MARK_ENCLOSING: 2828 case XML_REGEXP_NUMBER: 2829 case XML_REGEXP_NUMBER_DECIMAL: 2830 case XML_REGEXP_NUMBER_LETTER: 2831 case XML_REGEXP_NUMBER_OTHERS: 2832 case XML_REGEXP_PUNCT: 2833 case XML_REGEXP_PUNCT_CONNECTOR: 2834 case XML_REGEXP_PUNCT_DASH: 2835 case XML_REGEXP_PUNCT_OPEN: 2836 case XML_REGEXP_PUNCT_CLOSE: 2837 case XML_REGEXP_PUNCT_INITQUOTE: 2838 case XML_REGEXP_PUNCT_FINQUOTE: 2839 case XML_REGEXP_PUNCT_OTHERS: 2840 case XML_REGEXP_SEPAR: 2841 case XML_REGEXP_SEPAR_SPACE: 2842 case XML_REGEXP_SEPAR_LINE: 2843 case XML_REGEXP_SEPAR_PARA: 2844 case XML_REGEXP_SYMBOL: 2845 case XML_REGEXP_SYMBOL_MATH: 2846 case XML_REGEXP_SYMBOL_CURRENCY: 2847 case XML_REGEXP_SYMBOL_MODIFIER: 2848 case XML_REGEXP_SYMBOL_OTHERS: 2849 case XML_REGEXP_OTHER: 2850 case XML_REGEXP_OTHER_CONTROL: 2851 case XML_REGEXP_OTHER_FORMAT: 2852 case XML_REGEXP_OTHER_PRIVATE: 2853 case XML_REGEXP_OTHER_NA: 2854 case XML_REGEXP_BLOCK_NAME: 2855 ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0, 2856 (const xmlChar *)atom->valuep); 2857 if (atom->neg) 2858 ret = !ret; 2859 break; 2860 } 2861 return(ret); 2862} 2863 2864/************************************************************************ 2865 * * 2866 * Saving and restoring state of an execution context * 2867 * * 2868 ************************************************************************/ 2869 2870#ifdef DEBUG_REGEXP_EXEC 2871static void 2872xmlFARegDebugExec(xmlRegExecCtxtPtr exec) { 2873 printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index); 2874 if (exec->inputStack != NULL) { 2875 int i; 2876 printf(": "); 2877 for (i = 0;(i < 3) && (i < exec->inputStackNr);i++) 2878 printf("%s ", exec->inputStack[exec->inputStackNr - (i + 1)]); 2879 } else { 2880 printf(": %s", &(exec->inputString[exec->index])); 2881 } 2882 printf("\n"); 2883} 2884#endif 2885 2886static void 2887xmlFARegExecSave(xmlRegExecCtxtPtr exec) { 2888#ifdef DEBUG_REGEXP_EXEC 2889 printf("saving "); 2890 exec->transno++; 2891 xmlFARegDebugExec(exec); 2892 exec->transno--; 2893#endif 2894#ifdef MAX_PUSH 2895 if (exec->nbPush > MAX_PUSH) { 2896 return; 2897 } 2898 exec->nbPush++; 2899#endif 2900 2901 if (exec->maxRollbacks == 0) { 2902 exec->maxRollbacks = 4; 2903 exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks * 2904 sizeof(xmlRegExecRollback)); 2905 if (exec->rollbacks == NULL) { 2906 xmlRegexpErrMemory(NULL, "saving regexp"); 2907 exec->maxRollbacks = 0; 2908 return; 2909 } 2910 memset(exec->rollbacks, 0, 2911 exec->maxRollbacks * sizeof(xmlRegExecRollback)); 2912 } else if (exec->nbRollbacks >= exec->maxRollbacks) { 2913 xmlRegExecRollback *tmp; 2914 int len = exec->maxRollbacks; 2915 2916 exec->maxRollbacks *= 2; 2917 tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks, 2918 exec->maxRollbacks * sizeof(xmlRegExecRollback)); 2919 if (tmp == NULL) { 2920 xmlRegexpErrMemory(NULL, "saving regexp"); 2921 exec->maxRollbacks /= 2; 2922 return; 2923 } 2924 exec->rollbacks = tmp; 2925 tmp = &exec->rollbacks[len]; 2926 memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback)); 2927 } 2928 exec->rollbacks[exec->nbRollbacks].state = exec->state; 2929 exec->rollbacks[exec->nbRollbacks].index = exec->index; 2930 exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1; 2931 if (exec->comp->nbCounters > 0) { 2932 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 2933 exec->rollbacks[exec->nbRollbacks].counts = (int *) 2934 xmlMalloc(exec->comp->nbCounters * sizeof(int)); 2935 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 2936 xmlRegexpErrMemory(NULL, "saving regexp"); 2937 exec->status = -5; 2938 return; 2939 } 2940 } 2941 memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts, 2942 exec->comp->nbCounters * sizeof(int)); 2943 } 2944 exec->nbRollbacks++; 2945} 2946 2947static void 2948xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) { 2949 if (exec->nbRollbacks <= 0) { 2950 exec->status = -1; 2951#ifdef DEBUG_REGEXP_EXEC 2952 printf("rollback failed on empty stack\n"); 2953#endif 2954 return; 2955 } 2956 exec->nbRollbacks--; 2957 exec->state = exec->rollbacks[exec->nbRollbacks].state; 2958 exec->index = exec->rollbacks[exec->nbRollbacks].index; 2959 exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch; 2960 if (exec->comp->nbCounters > 0) { 2961 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) { 2962 fprintf(stderr, "exec save: allocation failed"); 2963 exec->status = -6; 2964 return; 2965 } 2966 memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts, 2967 exec->comp->nbCounters * sizeof(int)); 2968 } 2969 2970#ifdef DEBUG_REGEXP_EXEC 2971 printf("restored "); 2972 xmlFARegDebugExec(exec); 2973#endif 2974} 2975 2976/************************************************************************ 2977 * * 2978 * Verifier, running an input against a compiled regexp * 2979 * * 2980 ************************************************************************/ 2981 2982static int 2983xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) { 2984 xmlRegExecCtxt execval; 2985 xmlRegExecCtxtPtr exec = &execval; 2986 int ret, codepoint = 0, len, deter; 2987 2988 exec->inputString = content; 2989 exec->index = 0; 2990 exec->nbPush = 0; 2991 exec->determinist = 1; 2992 exec->maxRollbacks = 0; 2993 exec->nbRollbacks = 0; 2994 exec->rollbacks = NULL; 2995 exec->status = 0; 2996 exec->comp = comp; 2997 exec->state = comp->states[0]; 2998 exec->transno = 0; 2999 exec->transcount = 0; 3000 exec->inputStack = NULL; 3001 exec->inputStackMax = 0; 3002 if (comp->nbCounters > 0) { 3003 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)); 3004 if (exec->counts == NULL) { 3005 xmlRegexpErrMemory(NULL, "running regexp"); 3006 return(-1); 3007 } 3008 memset(exec->counts, 0, comp->nbCounters * sizeof(int)); 3009 } else 3010 exec->counts = NULL; 3011 while ((exec->status == 0) && 3012 ((exec->inputString[exec->index] != 0) || 3013 (exec->state->type != XML_REGEXP_FINAL_STATE))) { 3014 xmlRegTransPtr trans; 3015 xmlRegAtomPtr atom; 3016 3017 /* 3018 * If end of input on non-terminal state, rollback, however we may 3019 * still have epsilon like transition for counted transitions 3020 * on counters, in that case don't break too early. Additionally, 3021 * if we are working on a range like "AB{0,2}", where B is not present, 3022 * we don't want to break. 3023 */ 3024 len = 1; 3025 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) { 3026 /* 3027 * if there is a transition, we must check if 3028 * atom allows minOccurs of 0 3029 */ 3030 if (exec->transno < exec->state->nbTrans) { 3031 trans = &exec->state->trans[exec->transno]; 3032 if (trans->to >=0) { 3033 atom = trans->atom; 3034 if (!((atom->min == 0) && (atom->max > 0))) 3035 goto rollback; 3036 } 3037 } else 3038 goto rollback; 3039 } 3040 3041 exec->transcount = 0; 3042 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 3043 trans = &exec->state->trans[exec->transno]; 3044 if (trans->to < 0) 3045 continue; 3046 atom = trans->atom; 3047 ret = 0; 3048 deter = 1; 3049 if (trans->count >= 0) { 3050 int count; 3051 xmlRegCounterPtr counter; 3052 3053 if (exec->counts == NULL) { 3054 exec->status = -1; 3055 goto error; 3056 } 3057 /* 3058 * A counted transition. 3059 */ 3060 3061 count = exec->counts[trans->count]; 3062 counter = &exec->comp->counters[trans->count]; 3063#ifdef DEBUG_REGEXP_EXEC 3064 printf("testing count %d: val %d, min %d, max %d\n", 3065 trans->count, count, counter->min, counter->max); 3066#endif 3067 ret = ((count >= counter->min) && (count <= counter->max)); 3068 if ((ret) && (counter->min != counter->max)) 3069 deter = 0; 3070 } else if (atom == NULL) { 3071 fprintf(stderr, "epsilon transition left at runtime\n"); 3072 exec->status = -2; 3073 break; 3074 } else if (exec->inputString[exec->index] != 0) { 3075 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); 3076 ret = xmlRegCheckCharacter(atom, codepoint); 3077 if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) { 3078 xmlRegStatePtr to = comp->states[trans->to]; 3079 3080 /* 3081 * this is a multiple input sequence 3082 * If there is a counter associated increment it now. 3083 * before potentially saving and rollback 3084 */ 3085 if (trans->counter >= 0) { 3086 if (exec->counts == NULL) { 3087 exec->status = -1; 3088 goto error; 3089 } 3090#ifdef DEBUG_REGEXP_EXEC 3091 printf("Increasing count %d\n", trans->counter); 3092#endif 3093 exec->counts[trans->counter]++; 3094 } 3095 if (exec->state->nbTrans > exec->transno + 1) { 3096 xmlFARegExecSave(exec); 3097 } 3098 exec->transcount = 1; 3099 do { 3100 /* 3101 * Try to progress as much as possible on the input 3102 */ 3103 if (exec->transcount == atom->max) { 3104 break; 3105 } 3106 exec->index += len; 3107 /* 3108 * End of input: stop here 3109 */ 3110 if (exec->inputString[exec->index] == 0) { 3111 exec->index -= len; 3112 break; 3113 } 3114 if (exec->transcount >= atom->min) { 3115 int transno = exec->transno; 3116 xmlRegStatePtr state = exec->state; 3117 3118 /* 3119 * The transition is acceptable save it 3120 */ 3121 exec->transno = -1; /* trick */ 3122 exec->state = to; 3123 xmlFARegExecSave(exec); 3124 exec->transno = transno; 3125 exec->state = state; 3126 } 3127 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), 3128 len); 3129 ret = xmlRegCheckCharacter(atom, codepoint); 3130 exec->transcount++; 3131 } while (ret == 1); 3132 if (exec->transcount < atom->min) 3133 ret = 0; 3134 3135 /* 3136 * If the last check failed but one transition was found 3137 * possible, rollback 3138 */ 3139 if (ret < 0) 3140 ret = 0; 3141 if (ret == 0) { 3142 goto rollback; 3143 } 3144 if (trans->counter >= 0) { 3145 if (exec->counts == NULL) { 3146 exec->status = -1; 3147 goto error; 3148 } 3149#ifdef DEBUG_REGEXP_EXEC 3150 printf("Decreasing count %d\n", trans->counter); 3151#endif 3152 exec->counts[trans->counter]--; 3153 } 3154 } else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) { 3155 /* 3156 * we don't match on the codepoint, but minOccurs of 0 3157 * says that's ok. Setting len to 0 inhibits stepping 3158 * over the codepoint. 3159 */ 3160 exec->transcount = 1; 3161 len = 0; 3162 ret = 1; 3163 } 3164 } else if ((atom->min == 0) && (atom->max > 0)) { 3165 /* another spot to match when minOccurs is 0 */ 3166 exec->transcount = 1; 3167 len = 0; 3168 ret = 1; 3169 } 3170 if (ret == 1) { 3171 if ((trans->nd == 1) || 3172 ((trans->count >= 0) && (deter == 0) && 3173 (exec->state->nbTrans > exec->transno + 1))) { 3174#ifdef DEBUG_REGEXP_EXEC 3175 if (trans->nd == 1) 3176 printf("Saving on nd transition atom %d for %c at %d\n", 3177 trans->atom->no, codepoint, exec->index); 3178 else 3179 printf("Saving on counted transition count %d for %c at %d\n", 3180 trans->count, codepoint, exec->index); 3181#endif 3182 xmlFARegExecSave(exec); 3183 } 3184 if (trans->counter >= 0) { 3185 if (exec->counts == NULL) { 3186 exec->status = -1; 3187 goto error; 3188 } 3189#ifdef DEBUG_REGEXP_EXEC 3190 printf("Increasing count %d\n", trans->counter); 3191#endif 3192 exec->counts[trans->counter]++; 3193 } 3194 if ((trans->count >= 0) && 3195 (trans->count < REGEXP_ALL_COUNTER)) { 3196 if (exec->counts == NULL) { 3197 exec->status = -1; 3198 goto error; 3199 } 3200#ifdef DEBUG_REGEXP_EXEC 3201 printf("resetting count %d on transition\n", 3202 trans->count); 3203#endif 3204 exec->counts[trans->count] = 0; 3205 } 3206#ifdef DEBUG_REGEXP_EXEC 3207 printf("entering state %d\n", trans->to); 3208#endif 3209 exec->state = comp->states[trans->to]; 3210 exec->transno = 0; 3211 if (trans->atom != NULL) { 3212 exec->index += len; 3213 } 3214 goto progress; 3215 } else if (ret < 0) { 3216 exec->status = -4; 3217 break; 3218 } 3219 } 3220 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 3221rollback: 3222 /* 3223 * Failed to find a way out 3224 */ 3225 exec->determinist = 0; 3226#ifdef DEBUG_REGEXP_EXEC 3227 printf("rollback from state %d on %d:%c\n", exec->state->no, 3228 codepoint,codepoint); 3229#endif 3230 xmlFARegExecRollBack(exec); 3231 } 3232progress: 3233 continue; 3234 } 3235error: 3236 if (exec->rollbacks != NULL) { 3237 if (exec->counts != NULL) { 3238 int i; 3239 3240 for (i = 0;i < exec->maxRollbacks;i++) 3241 if (exec->rollbacks[i].counts != NULL) 3242 xmlFree(exec->rollbacks[i].counts); 3243 } 3244 xmlFree(exec->rollbacks); 3245 } 3246 if (exec->counts != NULL) 3247 xmlFree(exec->counts); 3248 if (exec->status == 0) 3249 return(1); 3250 if (exec->status == -1) { 3251 if (exec->nbPush > MAX_PUSH) 3252 return(-1); 3253 return(0); 3254 } 3255 return(exec->status); 3256} 3257 3258/************************************************************************ 3259 * * 3260 * Progressive interface to the verifier one atom at a time * 3261 * * 3262 ************************************************************************/ 3263#ifdef DEBUG_ERR 3264static void testerr(xmlRegExecCtxtPtr exec); 3265#endif 3266 3267/** 3268 * xmlRegNewExecCtxt: 3269 * @comp: a precompiled regular expression 3270 * @callback: a callback function used for handling progresses in the 3271 * automata matching phase 3272 * @data: the context data associated to the callback in this context 3273 * 3274 * Build a context used for progressive evaluation of a regexp. 3275 * 3276 * Returns the new context 3277 */ 3278xmlRegExecCtxtPtr 3279xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) { 3280 xmlRegExecCtxtPtr exec; 3281 3282 if (comp == NULL) 3283 return(NULL); 3284 if ((comp->compact == NULL) && (comp->states == NULL)) 3285 return(NULL); 3286 exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt)); 3287 if (exec == NULL) { 3288 xmlRegexpErrMemory(NULL, "creating execution context"); 3289 return(NULL); 3290 } 3291 memset(exec, 0, sizeof(xmlRegExecCtxt)); 3292 exec->inputString = NULL; 3293 exec->index = 0; 3294 exec->determinist = 1; 3295 exec->maxRollbacks = 0; 3296 exec->nbRollbacks = 0; 3297 exec->rollbacks = NULL; 3298 exec->status = 0; 3299 exec->comp = comp; 3300 if (comp->compact == NULL) 3301 exec->state = comp->states[0]; 3302 exec->transno = 0; 3303 exec->transcount = 0; 3304 exec->callback = callback; 3305 exec->data = data; 3306 if (comp->nbCounters > 0) { 3307 /* 3308 * For error handling, exec->counts is allocated twice the size 3309 * the second half is used to store the data in case of rollback 3310 */ 3311 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int) 3312 * 2); 3313 if (exec->counts == NULL) { 3314 xmlRegexpErrMemory(NULL, "creating execution context"); 3315 xmlFree(exec); 3316 return(NULL); 3317 } 3318 memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2); 3319 exec->errCounts = &exec->counts[comp->nbCounters]; 3320 } else { 3321 exec->counts = NULL; 3322 exec->errCounts = NULL; 3323 } 3324 exec->inputStackMax = 0; 3325 exec->inputStackNr = 0; 3326 exec->inputStack = NULL; 3327 exec->errStateNo = -1; 3328 exec->errString = NULL; 3329 exec->nbPush = 0; 3330 return(exec); 3331} 3332 3333/** 3334 * xmlRegFreeExecCtxt: 3335 * @exec: a regular expression evaulation context 3336 * 3337 * Free the structures associated to a regular expression evaulation context. 3338 */ 3339void 3340xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) { 3341 if (exec == NULL) 3342 return; 3343 3344 if (exec->rollbacks != NULL) { 3345 if (exec->counts != NULL) { 3346 int i; 3347 3348 for (i = 0;i < exec->maxRollbacks;i++) 3349 if (exec->rollbacks[i].counts != NULL) 3350 xmlFree(exec->rollbacks[i].counts); 3351 } 3352 xmlFree(exec->rollbacks); 3353 } 3354 if (exec->counts != NULL) 3355 xmlFree(exec->counts); 3356 if (exec->inputStack != NULL) { 3357 int i; 3358 3359 for (i = 0;i < exec->inputStackNr;i++) { 3360 if (exec->inputStack[i].value != NULL) 3361 xmlFree(exec->inputStack[i].value); 3362 } 3363 xmlFree(exec->inputStack); 3364 } 3365 if (exec->errString != NULL) 3366 xmlFree(exec->errString); 3367 xmlFree(exec); 3368} 3369 3370static void 3371xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value, 3372 void *data) { 3373#ifdef DEBUG_PUSH 3374 printf("saving value: %d:%s\n", exec->inputStackNr, value); 3375#endif 3376 if (exec->inputStackMax == 0) { 3377 exec->inputStackMax = 4; 3378 exec->inputStack = (xmlRegInputTokenPtr) 3379 xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken)); 3380 if (exec->inputStack == NULL) { 3381 xmlRegexpErrMemory(NULL, "pushing input string"); 3382 exec->inputStackMax = 0; 3383 return; 3384 } 3385 } else if (exec->inputStackNr + 1 >= exec->inputStackMax) { 3386 xmlRegInputTokenPtr tmp; 3387 3388 exec->inputStackMax *= 2; 3389 tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack, 3390 exec->inputStackMax * sizeof(xmlRegInputToken)); 3391 if (tmp == NULL) { 3392 xmlRegexpErrMemory(NULL, "pushing input string"); 3393 exec->inputStackMax /= 2; 3394 return; 3395 } 3396 exec->inputStack = tmp; 3397 } 3398 exec->inputStack[exec->inputStackNr].value = xmlStrdup(value); 3399 exec->inputStack[exec->inputStackNr].data = data; 3400 exec->inputStackNr++; 3401 exec->inputStack[exec->inputStackNr].value = NULL; 3402 exec->inputStack[exec->inputStackNr].data = NULL; 3403} 3404 3405/** 3406 * xmlRegStrEqualWildcard: 3407 * @expStr: the string to be evaluated 3408 * @valStr: the validation string 3409 * 3410 * Checks if both strings are equal or have the same content. "*" 3411 * can be used as a wildcard in @valStr; "|" is used as a seperator of 3412 * substrings in both @expStr and @valStr. 3413 * 3414 * Returns 1 if the comparison is satisfied and the number of substrings 3415 * is equal, 0 otherwise. 3416 */ 3417 3418static int 3419xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) { 3420 if (expStr == valStr) return(1); 3421 if (expStr == NULL) return(0); 3422 if (valStr == NULL) return(0); 3423 do { 3424 /* 3425 * Eval if we have a wildcard for the current item. 3426 */ 3427 if (*expStr != *valStr) { 3428 /* if one of them starts with a wildcard make valStr be it */ 3429 if (*valStr == '*') { 3430 const xmlChar *tmp; 3431 3432 tmp = valStr; 3433 valStr = expStr; 3434 expStr = tmp; 3435 } 3436 if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) { 3437 do { 3438 if (*valStr == XML_REG_STRING_SEPARATOR) 3439 break; 3440 valStr++; 3441 } while (*valStr != 0); 3442 continue; 3443 } else 3444 return(0); 3445 } 3446 expStr++; 3447 valStr++; 3448 } while (*valStr != 0); 3449 if (*expStr != 0) 3450 return (0); 3451 else 3452 return (1); 3453} 3454 3455/** 3456 * xmlRegCompactPushString: 3457 * @exec: a regexp execution context 3458 * @comp: the precompiled exec with a compact table 3459 * @value: a string token input 3460 * @data: data associated to the token to reuse in callbacks 3461 * 3462 * Push one input token in the execution context 3463 * 3464 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 3465 * a negative value in case of error. 3466 */ 3467static int 3468xmlRegCompactPushString(xmlRegExecCtxtPtr exec, 3469 xmlRegexpPtr comp, 3470 const xmlChar *value, 3471 void *data) { 3472 int state = exec->index; 3473 int i, target; 3474 3475 if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL)) 3476 return(-1); 3477 3478 if (value == NULL) { 3479 /* 3480 * are we at a final state ? 3481 */ 3482 if (comp->compact[state * (comp->nbstrings + 1)] == 3483 XML_REGEXP_FINAL_STATE) 3484 return(1); 3485 return(0); 3486 } 3487 3488#ifdef DEBUG_PUSH 3489 printf("value pushed: %s\n", value); 3490#endif 3491 3492 /* 3493 * Examine all outside transitions from current state 3494 */ 3495 for (i = 0;i < comp->nbstrings;i++) { 3496 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 3497 if ((target > 0) && (target <= comp->nbstates)) { 3498 target--; /* to avoid 0 */ 3499 if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) { 3500 exec->index = target; 3501 if ((exec->callback != NULL) && (comp->transdata != NULL)) { 3502 exec->callback(exec->data, value, 3503 comp->transdata[state * comp->nbstrings + i], data); 3504 } 3505#ifdef DEBUG_PUSH 3506 printf("entering state %d\n", target); 3507#endif 3508 if (comp->compact[target * (comp->nbstrings + 1)] == 3509 XML_REGEXP_SINK_STATE) 3510 goto error; 3511 3512 if (comp->compact[target * (comp->nbstrings + 1)] == 3513 XML_REGEXP_FINAL_STATE) 3514 return(1); 3515 return(0); 3516 } 3517 } 3518 } 3519 /* 3520 * Failed to find an exit transition out from current state for the 3521 * current token 3522 */ 3523#ifdef DEBUG_PUSH 3524 printf("failed to find a transition for %s on state %d\n", value, state); 3525#endif 3526error: 3527 if (exec->errString != NULL) 3528 xmlFree(exec->errString); 3529 exec->errString = xmlStrdup(value); 3530 exec->errStateNo = state; 3531 exec->status = -1; 3532#ifdef DEBUG_ERR 3533 testerr(exec); 3534#endif 3535 return(-1); 3536} 3537 3538/** 3539 * xmlRegExecPushStringInternal: 3540 * @exec: a regexp execution context or NULL to indicate the end 3541 * @value: a string token input 3542 * @data: data associated to the token to reuse in callbacks 3543 * @compound: value was assembled from 2 strings 3544 * 3545 * Push one input token in the execution context 3546 * 3547 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 3548 * a negative value in case of error. 3549 */ 3550static int 3551xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value, 3552 void *data, int compound) { 3553 xmlRegTransPtr trans; 3554 xmlRegAtomPtr atom; 3555 int ret; 3556 int final = 0; 3557 int progress = 1; 3558 3559 if (exec == NULL) 3560 return(-1); 3561 if (exec->comp == NULL) 3562 return(-1); 3563 if (exec->status != 0) 3564 return(exec->status); 3565 3566 if (exec->comp->compact != NULL) 3567 return(xmlRegCompactPushString(exec, exec->comp, value, data)); 3568 3569 if (value == NULL) { 3570 if (exec->state->type == XML_REGEXP_FINAL_STATE) 3571 return(1); 3572 final = 1; 3573 } 3574 3575#ifdef DEBUG_PUSH 3576 printf("value pushed: %s\n", value); 3577#endif 3578 /* 3579 * If we have an active rollback stack push the new value there 3580 * and get back to where we were left 3581 */ 3582 if ((value != NULL) && (exec->inputStackNr > 0)) { 3583 xmlFARegExecSaveInputString(exec, value, data); 3584 value = exec->inputStack[exec->index].value; 3585 data = exec->inputStack[exec->index].data; 3586#ifdef DEBUG_PUSH 3587 printf("value loaded: %s\n", value); 3588#endif 3589 } 3590 3591 while ((exec->status == 0) && 3592 ((value != NULL) || 3593 ((final == 1) && 3594 (exec->state->type != XML_REGEXP_FINAL_STATE)))) { 3595 3596 /* 3597 * End of input on non-terminal state, rollback, however we may 3598 * still have epsilon like transition for counted transitions 3599 * on counters, in that case don't break too early. 3600 */ 3601 if ((value == NULL) && (exec->counts == NULL)) 3602 goto rollback; 3603 3604 exec->transcount = 0; 3605 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 3606 trans = &exec->state->trans[exec->transno]; 3607 if (trans->to < 0) 3608 continue; 3609 atom = trans->atom; 3610 ret = 0; 3611 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 3612 int i; 3613 int count; 3614 xmlRegTransPtr t; 3615 xmlRegCounterPtr counter; 3616 3617 ret = 0; 3618 3619#ifdef DEBUG_PUSH 3620 printf("testing all lax %d\n", trans->count); 3621#endif 3622 /* 3623 * Check all counted transitions from the current state 3624 */ 3625 if ((value == NULL) && (final)) { 3626 ret = 1; 3627 } else if (value != NULL) { 3628 for (i = 0;i < exec->state->nbTrans;i++) { 3629 t = &exec->state->trans[i]; 3630 if ((t->counter < 0) || (t == trans)) 3631 continue; 3632 counter = &exec->comp->counters[t->counter]; 3633 count = exec->counts[t->counter]; 3634 if ((count < counter->max) && 3635 (t->atom != NULL) && 3636 (xmlStrEqual(value, t->atom->valuep))) { 3637 ret = 0; 3638 break; 3639 } 3640 if ((count >= counter->min) && 3641 (count < counter->max) && 3642 (t->atom != NULL) && 3643 (xmlStrEqual(value, t->atom->valuep))) { 3644 ret = 1; 3645 break; 3646 } 3647 } 3648 } 3649 } else if (trans->count == REGEXP_ALL_COUNTER) { 3650 int i; 3651 int count; 3652 xmlRegTransPtr t; 3653 xmlRegCounterPtr counter; 3654 3655 ret = 1; 3656 3657#ifdef DEBUG_PUSH 3658 printf("testing all %d\n", trans->count); 3659#endif 3660 /* 3661 * Check all counted transitions from the current state 3662 */ 3663 for (i = 0;i < exec->state->nbTrans;i++) { 3664 t = &exec->state->trans[i]; 3665 if ((t->counter < 0) || (t == trans)) 3666 continue; 3667 counter = &exec->comp->counters[t->counter]; 3668 count = exec->counts[t->counter]; 3669 if ((count < counter->min) || (count > counter->max)) { 3670 ret = 0; 3671 break; 3672 } 3673 } 3674 } else if (trans->count >= 0) { 3675 int count; 3676 xmlRegCounterPtr counter; 3677 3678 /* 3679 * A counted transition. 3680 */ 3681 3682 count = exec->counts[trans->count]; 3683 counter = &exec->comp->counters[trans->count]; 3684#ifdef DEBUG_PUSH 3685 printf("testing count %d: val %d, min %d, max %d\n", 3686 trans->count, count, counter->min, counter->max); 3687#endif 3688 ret = ((count >= counter->min) && (count <= counter->max)); 3689 } else if (atom == NULL) { 3690 fprintf(stderr, "epsilon transition left at runtime\n"); 3691 exec->status = -2; 3692 break; 3693 } else if (value != NULL) { 3694 ret = xmlRegStrEqualWildcard(atom->valuep, value); 3695 if (atom->neg) { 3696 ret = !ret; 3697 if (!compound) 3698 ret = 0; 3699 } 3700 if ((ret == 1) && (trans->counter >= 0)) { 3701 xmlRegCounterPtr counter; 3702 int count; 3703 3704 count = exec->counts[trans->counter]; 3705 counter = &exec->comp->counters[trans->counter]; 3706 if (count >= counter->max) 3707 ret = 0; 3708 } 3709 3710 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 3711 xmlRegStatePtr to = exec->comp->states[trans->to]; 3712 3713 /* 3714 * this is a multiple input sequence 3715 */ 3716 if (exec->state->nbTrans > exec->transno + 1) { 3717 if (exec->inputStackNr <= 0) { 3718 xmlFARegExecSaveInputString(exec, value, data); 3719 } 3720 xmlFARegExecSave(exec); 3721 } 3722 exec->transcount = 1; 3723 do { 3724 /* 3725 * Try to progress as much as possible on the input 3726 */ 3727 if (exec->transcount == atom->max) { 3728 break; 3729 } 3730 exec->index++; 3731 value = exec->inputStack[exec->index].value; 3732 data = exec->inputStack[exec->index].data; 3733#ifdef DEBUG_PUSH 3734 printf("value loaded: %s\n", value); 3735#endif 3736 3737 /* 3738 * End of input: stop here 3739 */ 3740 if (value == NULL) { 3741 exec->index --; 3742 break; 3743 } 3744 if (exec->transcount >= atom->min) { 3745 int transno = exec->transno; 3746 xmlRegStatePtr state = exec->state; 3747 3748 /* 3749 * The transition is acceptable save it 3750 */ 3751 exec->transno = -1; /* trick */ 3752 exec->state = to; 3753 if (exec->inputStackNr <= 0) { 3754 xmlFARegExecSaveInputString(exec, value, data); 3755 } 3756 xmlFARegExecSave(exec); 3757 exec->transno = transno; 3758 exec->state = state; 3759 } 3760 ret = xmlStrEqual(value, atom->valuep); 3761 exec->transcount++; 3762 } while (ret == 1); 3763 if (exec->transcount < atom->min) 3764 ret = 0; 3765 3766 /* 3767 * If the last check failed but one transition was found 3768 * possible, rollback 3769 */ 3770 if (ret < 0) 3771 ret = 0; 3772 if (ret == 0) { 3773 goto rollback; 3774 } 3775 } 3776 } 3777 if (ret == 1) { 3778 if ((exec->callback != NULL) && (atom != NULL) && 3779 (data != NULL)) { 3780 exec->callback(exec->data, atom->valuep, 3781 atom->data, data); 3782 } 3783 if (exec->state->nbTrans > exec->transno + 1) { 3784 if (exec->inputStackNr <= 0) { 3785 xmlFARegExecSaveInputString(exec, value, data); 3786 } 3787 xmlFARegExecSave(exec); 3788 } 3789 if (trans->counter >= 0) { 3790#ifdef DEBUG_PUSH 3791 printf("Increasing count %d\n", trans->counter); 3792#endif 3793 exec->counts[trans->counter]++; 3794 } 3795 if ((trans->count >= 0) && 3796 (trans->count < REGEXP_ALL_COUNTER)) { 3797#ifdef DEBUG_REGEXP_EXEC 3798 printf("resetting count %d on transition\n", 3799 trans->count); 3800#endif 3801 exec->counts[trans->count] = 0; 3802 } 3803#ifdef DEBUG_PUSH 3804 printf("entering state %d\n", trans->to); 3805#endif 3806 if ((exec->comp->states[trans->to] != NULL) && 3807 (exec->comp->states[trans->to]->type == 3808 XML_REGEXP_SINK_STATE)) { 3809 /* 3810 * entering a sink state, save the current state as error 3811 * state. 3812 */ 3813 if (exec->errString != NULL) 3814 xmlFree(exec->errString); 3815 exec->errString = xmlStrdup(value); 3816 exec->errState = exec->state; 3817 memcpy(exec->errCounts, exec->counts, 3818 exec->comp->nbCounters * sizeof(int)); 3819 } 3820 exec->state = exec->comp->states[trans->to]; 3821 exec->transno = 0; 3822 if (trans->atom != NULL) { 3823 if (exec->inputStack != NULL) { 3824 exec->index++; 3825 if (exec->index < exec->inputStackNr) { 3826 value = exec->inputStack[exec->index].value; 3827 data = exec->inputStack[exec->index].data; 3828#ifdef DEBUG_PUSH 3829 printf("value loaded: %s\n", value); 3830#endif 3831 } else { 3832 value = NULL; 3833 data = NULL; 3834#ifdef DEBUG_PUSH 3835 printf("end of input\n"); 3836#endif 3837 } 3838 } else { 3839 value = NULL; 3840 data = NULL; 3841#ifdef DEBUG_PUSH 3842 printf("end of input\n"); 3843#endif 3844 } 3845 } 3846 goto progress; 3847 } else if (ret < 0) { 3848 exec->status = -4; 3849 break; 3850 } 3851 } 3852 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 3853rollback: 3854 /* 3855 * if we didn't yet rollback on the current input 3856 * store the current state as the error state. 3857 */ 3858 if ((progress) && (exec->state != NULL) && 3859 (exec->state->type != XML_REGEXP_SINK_STATE)) { 3860 progress = 0; 3861 if (exec->errString != NULL) 3862 xmlFree(exec->errString); 3863 exec->errString = xmlStrdup(value); 3864 exec->errState = exec->state; 3865 memcpy(exec->errCounts, exec->counts, 3866 exec->comp->nbCounters * sizeof(int)); 3867 } 3868 3869 /* 3870 * Failed to find a way out 3871 */ 3872 exec->determinist = 0; 3873 xmlFARegExecRollBack(exec); 3874 if (exec->status == 0) { 3875 value = exec->inputStack[exec->index].value; 3876 data = exec->inputStack[exec->index].data; 3877#ifdef DEBUG_PUSH 3878 printf("value loaded: %s\n", value); 3879#endif 3880 } 3881 } 3882 continue; 3883progress: 3884 progress = 1; 3885 continue; 3886 } 3887 if (exec->status == 0) { 3888 return(exec->state->type == XML_REGEXP_FINAL_STATE); 3889 } 3890#ifdef DEBUG_ERR 3891 if (exec->status < 0) { 3892 testerr(exec); 3893 } 3894#endif 3895 return(exec->status); 3896} 3897 3898/** 3899 * xmlRegExecPushString: 3900 * @exec: a regexp execution context or NULL to indicate the end 3901 * @value: a string token input 3902 * @data: data associated to the token to reuse in callbacks 3903 * 3904 * Push one input token in the execution context 3905 * 3906 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 3907 * a negative value in case of error. 3908 */ 3909int 3910xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value, 3911 void *data) { 3912 return(xmlRegExecPushStringInternal(exec, value, data, 0)); 3913} 3914 3915/** 3916 * xmlRegExecPushString2: 3917 * @exec: a regexp execution context or NULL to indicate the end 3918 * @value: the first string token input 3919 * @value2: the second string token input 3920 * @data: data associated to the token to reuse in callbacks 3921 * 3922 * Push one input token in the execution context 3923 * 3924 * Returns: 1 if the regexp reached a final state, 0 if non-final, and 3925 * a negative value in case of error. 3926 */ 3927int 3928xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value, 3929 const xmlChar *value2, void *data) { 3930 xmlChar buf[150]; 3931 int lenn, lenp, ret; 3932 xmlChar *str; 3933 3934 if (exec == NULL) 3935 return(-1); 3936 if (exec->comp == NULL) 3937 return(-1); 3938 if (exec->status != 0) 3939 return(exec->status); 3940 3941 if (value2 == NULL) 3942 return(xmlRegExecPushString(exec, value, data)); 3943 3944 lenn = strlen((char *) value2); 3945 lenp = strlen((char *) value); 3946 3947 if (150 < lenn + lenp + 2) { 3948 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 3949 if (str == NULL) { 3950 exec->status = -1; 3951 return(-1); 3952 } 3953 } else { 3954 str = buf; 3955 } 3956 memcpy(&str[0], value, lenp); 3957 str[lenp] = XML_REG_STRING_SEPARATOR; 3958 memcpy(&str[lenp + 1], value2, lenn); 3959 str[lenn + lenp + 1] = 0; 3960 3961 if (exec->comp->compact != NULL) 3962 ret = xmlRegCompactPushString(exec, exec->comp, str, data); 3963 else 3964 ret = xmlRegExecPushStringInternal(exec, str, data, 1); 3965 3966 if (str != buf) 3967 xmlFree(str); 3968 return(ret); 3969} 3970 3971/** 3972 * xmlRegExecGetValues: 3973 * @exec: a regexp execution context 3974 * @err: error extraction or normal one 3975 * @nbval: pointer to the number of accepted values IN/OUT 3976 * @nbneg: return number of negative transitions 3977 * @values: pointer to the array of acceptable values 3978 * @terminal: return value if this was a terminal state 3979 * 3980 * Extract informations from the regexp execution, internal routine to 3981 * implement xmlRegExecNextValues() and xmlRegExecErrInfo() 3982 * 3983 * Returns: 0 in case of success or -1 in case of error. 3984 */ 3985static int 3986xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err, 3987 int *nbval, int *nbneg, 3988 xmlChar **values, int *terminal) { 3989 int maxval; 3990 int nb = 0; 3991 3992 if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) || 3993 (values == NULL) || (*nbval <= 0)) 3994 return(-1); 3995 3996 maxval = *nbval; 3997 *nbval = 0; 3998 *nbneg = 0; 3999 if ((exec->comp != NULL) && (exec->comp->compact != NULL)) { 4000 xmlRegexpPtr comp; 4001 int target, i, state; 4002 4003 comp = exec->comp; 4004 4005 if (err) { 4006 if (exec->errStateNo == -1) return(-1); 4007 state = exec->errStateNo; 4008 } else { 4009 state = exec->index; 4010 } 4011 if (terminal != NULL) { 4012 if (comp->compact[state * (comp->nbstrings + 1)] == 4013 XML_REGEXP_FINAL_STATE) 4014 *terminal = 1; 4015 else 4016 *terminal = 0; 4017 } 4018 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) { 4019 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 4020 if ((target > 0) && (target <= comp->nbstates) && 4021 (comp->compact[(target - 1) * (comp->nbstrings + 1)] != 4022 XML_REGEXP_SINK_STATE)) { 4023 values[nb++] = comp->stringMap[i]; 4024 (*nbval)++; 4025 } 4026 } 4027 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) { 4028 target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; 4029 if ((target > 0) && (target <= comp->nbstates) && 4030 (comp->compact[(target - 1) * (comp->nbstrings + 1)] == 4031 XML_REGEXP_SINK_STATE)) { 4032 values[nb++] = comp->stringMap[i]; 4033 (*nbneg)++; 4034 } 4035 } 4036 } else { 4037 int transno; 4038 xmlRegTransPtr trans; 4039 xmlRegAtomPtr atom; 4040 xmlRegStatePtr state; 4041 4042 if (terminal != NULL) { 4043 if (exec->state->type == XML_REGEXP_FINAL_STATE) 4044 *terminal = 1; 4045 else 4046 *terminal = 0; 4047 } 4048 4049 if (err) { 4050 if (exec->errState == NULL) return(-1); 4051 state = exec->errState; 4052 } else { 4053 if (exec->state == NULL) return(-1); 4054 state = exec->state; 4055 } 4056 for (transno = 0; 4057 (transno < state->nbTrans) && (nb < maxval); 4058 transno++) { 4059 trans = &state->trans[transno]; 4060 if (trans->to < 0) 4061 continue; 4062 atom = trans->atom; 4063 if ((atom == NULL) || (atom->valuep == NULL)) 4064 continue; 4065 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 4066 /* this should not be reached but ... */ 4067 TODO; 4068 } else if (trans->count == REGEXP_ALL_COUNTER) { 4069 /* this should not be reached but ... */ 4070 TODO; 4071 } else if (trans->counter >= 0) { 4072 xmlRegCounterPtr counter = NULL; 4073 int count; 4074 4075 if (err) 4076 count = exec->errCounts[trans->counter]; 4077 else 4078 count = exec->counts[trans->counter]; 4079 if (exec->comp != NULL) 4080 counter = &exec->comp->counters[trans->counter]; 4081 if ((counter == NULL) || (count < counter->max)) { 4082 if (atom->neg) 4083 values[nb++] = (xmlChar *) atom->valuep2; 4084 else 4085 values[nb++] = (xmlChar *) atom->valuep; 4086 (*nbval)++; 4087 } 4088 } else { 4089 if ((exec->comp->states[trans->to] != NULL) && 4090 (exec->comp->states[trans->to]->type != 4091 XML_REGEXP_SINK_STATE)) { 4092 if (atom->neg) 4093 values[nb++] = (xmlChar *) atom->valuep2; 4094 else 4095 values[nb++] = (xmlChar *) atom->valuep; 4096 (*nbval)++; 4097 } 4098 } 4099 } 4100 for (transno = 0; 4101 (transno < state->nbTrans) && (nb < maxval); 4102 transno++) { 4103 trans = &state->trans[transno]; 4104 if (trans->to < 0) 4105 continue; 4106 atom = trans->atom; 4107 if ((atom == NULL) || (atom->valuep == NULL)) 4108 continue; 4109 if (trans->count == REGEXP_ALL_LAX_COUNTER) { 4110 continue; 4111 } else if (trans->count == REGEXP_ALL_COUNTER) { 4112 continue; 4113 } else if (trans->counter >= 0) { 4114 continue; 4115 } else { 4116 if ((exec->comp->states[trans->to] != NULL) && 4117 (exec->comp->states[trans->to]->type == 4118 XML_REGEXP_SINK_STATE)) { 4119 if (atom->neg) 4120 values[nb++] = (xmlChar *) atom->valuep2; 4121 else 4122 values[nb++] = (xmlChar *) atom->valuep; 4123 (*nbneg)++; 4124 } 4125 } 4126 } 4127 } 4128 return(0); 4129} 4130 4131/** 4132 * xmlRegExecNextValues: 4133 * @exec: a regexp execution context 4134 * @nbval: pointer to the number of accepted values IN/OUT 4135 * @nbneg: return number of negative transitions 4136 * @values: pointer to the array of acceptable values 4137 * @terminal: return value if this was a terminal state 4138 * 4139 * Extract informations from the regexp execution, 4140 * the parameter @values must point to an array of @nbval string pointers 4141 * on return nbval will contain the number of possible strings in that 4142 * state and the @values array will be updated with them. The string values 4143 * returned will be freed with the @exec context and don't need to be 4144 * deallocated. 4145 * 4146 * Returns: 0 in case of success or -1 in case of error. 4147 */ 4148int 4149xmlRegExecNextValues(xmlRegExecCtxtPtr exec, int *nbval, int *nbneg, 4150 xmlChar **values, int *terminal) { 4151 return(xmlRegExecGetValues(exec, 0, nbval, nbneg, values, terminal)); 4152} 4153 4154/** 4155 * xmlRegExecErrInfo: 4156 * @exec: a regexp execution context generating an error 4157 * @string: return value for the error string 4158 * @nbval: pointer to the number of accepted values IN/OUT 4159 * @nbneg: return number of negative transitions 4160 * @values: pointer to the array of acceptable values 4161 * @terminal: return value if this was a terminal state 4162 * 4163 * Extract error informations from the regexp execution, the parameter 4164 * @string will be updated with the value pushed and not accepted, 4165 * the parameter @values must point to an array of @nbval string pointers 4166 * on return nbval will contain the number of possible strings in that 4167 * state and the @values array will be updated with them. The string values 4168 * returned will be freed with the @exec context and don't need to be 4169 * deallocated. 4170 * 4171 * Returns: 0 in case of success or -1 in case of error. 4172 */ 4173int 4174xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string, 4175 int *nbval, int *nbneg, xmlChar **values, int *terminal) { 4176 if (exec == NULL) 4177 return(-1); 4178 if (string != NULL) { 4179 if (exec->status != 0) 4180 *string = exec->errString; 4181 else 4182 *string = NULL; 4183 } 4184 return(xmlRegExecGetValues(exec, 1, nbval, nbneg, values, terminal)); 4185} 4186 4187#ifdef DEBUG_ERR 4188static void testerr(xmlRegExecCtxtPtr exec) { 4189 const xmlChar *string; 4190 xmlChar *values[5]; 4191 int nb = 5; 4192 int nbneg; 4193 int terminal; 4194 xmlRegExecErrInfo(exec, &string, &nb, &nbneg, &values[0], &terminal); 4195} 4196#endif 4197 4198#if 0 4199static int 4200xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) { 4201 xmlRegTransPtr trans; 4202 xmlRegAtomPtr atom; 4203 int ret; 4204 int codepoint, len; 4205 4206 if (exec == NULL) 4207 return(-1); 4208 if (exec->status != 0) 4209 return(exec->status); 4210 4211 while ((exec->status == 0) && 4212 ((exec->inputString[exec->index] != 0) || 4213 (exec->state->type != XML_REGEXP_FINAL_STATE))) { 4214 4215 /* 4216 * End of input on non-terminal state, rollback, however we may 4217 * still have epsilon like transition for counted transitions 4218 * on counters, in that case don't break too early. 4219 */ 4220 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) 4221 goto rollback; 4222 4223 exec->transcount = 0; 4224 for (;exec->transno < exec->state->nbTrans;exec->transno++) { 4225 trans = &exec->state->trans[exec->transno]; 4226 if (trans->to < 0) 4227 continue; 4228 atom = trans->atom; 4229 ret = 0; 4230 if (trans->count >= 0) { 4231 int count; 4232 xmlRegCounterPtr counter; 4233 4234 /* 4235 * A counted transition. 4236 */ 4237 4238 count = exec->counts[trans->count]; 4239 counter = &exec->comp->counters[trans->count]; 4240#ifdef DEBUG_REGEXP_EXEC 4241 printf("testing count %d: val %d, min %d, max %d\n", 4242 trans->count, count, counter->min, counter->max); 4243#endif 4244 ret = ((count >= counter->min) && (count <= counter->max)); 4245 } else if (atom == NULL) { 4246 fprintf(stderr, "epsilon transition left at runtime\n"); 4247 exec->status = -2; 4248 break; 4249 } else if (exec->inputString[exec->index] != 0) { 4250 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len); 4251 ret = xmlRegCheckCharacter(atom, codepoint); 4252 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) { 4253 xmlRegStatePtr to = exec->comp->states[trans->to]; 4254 4255 /* 4256 * this is a multiple input sequence 4257 */ 4258 if (exec->state->nbTrans > exec->transno + 1) { 4259 xmlFARegExecSave(exec); 4260 } 4261 exec->transcount = 1; 4262 do { 4263 /* 4264 * Try to progress as much as possible on the input 4265 */ 4266 if (exec->transcount == atom->max) { 4267 break; 4268 } 4269 exec->index += len; 4270 /* 4271 * End of input: stop here 4272 */ 4273 if (exec->inputString[exec->index] == 0) { 4274 exec->index -= len; 4275 break; 4276 } 4277 if (exec->transcount >= atom->min) { 4278 int transno = exec->transno; 4279 xmlRegStatePtr state = exec->state; 4280 4281 /* 4282 * The transition is acceptable save it 4283 */ 4284 exec->transno = -1; /* trick */ 4285 exec->state = to; 4286 xmlFARegExecSave(exec); 4287 exec->transno = transno; 4288 exec->state = state; 4289 } 4290 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), 4291 len); 4292 ret = xmlRegCheckCharacter(atom, codepoint); 4293 exec->transcount++; 4294 } while (ret == 1); 4295 if (exec->transcount < atom->min) 4296 ret = 0; 4297 4298 /* 4299 * If the last check failed but one transition was found 4300 * possible, rollback 4301 */ 4302 if (ret < 0) 4303 ret = 0; 4304 if (ret == 0) { 4305 goto rollback; 4306 } 4307 } 4308 } 4309 if (ret == 1) { 4310 if (exec->state->nbTrans > exec->transno + 1) { 4311 xmlFARegExecSave(exec); 4312 } 4313 /* 4314 * restart count for expressions like this ((abc){2})* 4315 */ 4316 if (trans->count >= 0) { 4317#ifdef DEBUG_REGEXP_EXEC 4318 printf("Reset count %d\n", trans->count); 4319#endif 4320 exec->counts[trans->count] = 0; 4321 } 4322 if (trans->counter >= 0) { 4323#ifdef DEBUG_REGEXP_EXEC 4324 printf("Increasing count %d\n", trans->counter); 4325#endif 4326 exec->counts[trans->counter]++; 4327 } 4328#ifdef DEBUG_REGEXP_EXEC 4329 printf("entering state %d\n", trans->to); 4330#endif 4331 exec->state = exec->comp->states[trans->to]; 4332 exec->transno = 0; 4333 if (trans->atom != NULL) { 4334 exec->index += len; 4335 } 4336 goto progress; 4337 } else if (ret < 0) { 4338 exec->status = -4; 4339 break; 4340 } 4341 } 4342 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) { 4343rollback: 4344 /* 4345 * Failed to find a way out 4346 */ 4347 exec->determinist = 0; 4348 xmlFARegExecRollBack(exec); 4349 } 4350progress: 4351 continue; 4352 } 4353} 4354#endif 4355/************************************************************************ 4356 * * 4357 * Parser for the Schemas Datatype Regular Expressions * 4358 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs * 4359 * * 4360 ************************************************************************/ 4361 4362/** 4363 * xmlFAIsChar: 4364 * @ctxt: a regexp parser context 4365 * 4366 * [10] Char ::= [^.\?*+()|#x5B#x5D] 4367 */ 4368static int 4369xmlFAIsChar(xmlRegParserCtxtPtr ctxt) { 4370 int cur; 4371 int len; 4372 4373 cur = CUR_SCHAR(ctxt->cur, len); 4374 if ((cur == '.') || (cur == '\\') || (cur == '?') || 4375 (cur == '*') || (cur == '+') || (cur == '(') || 4376 (cur == ')') || (cur == '|') || (cur == 0x5B) || 4377 (cur == 0x5D) || (cur == 0)) 4378 return(-1); 4379 return(cur); 4380} 4381 4382/** 4383 * xmlFAParseCharProp: 4384 * @ctxt: a regexp parser context 4385 * 4386 * [27] charProp ::= IsCategory | IsBlock 4387 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation | 4388 * Separators | Symbols | Others 4389 * [29] Letters ::= 'L' [ultmo]? 4390 * [30] Marks ::= 'M' [nce]? 4391 * [31] Numbers ::= 'N' [dlo]? 4392 * [32] Punctuation ::= 'P' [cdseifo]? 4393 * [33] Separators ::= 'Z' [slp]? 4394 * [34] Symbols ::= 'S' [mcko]? 4395 * [35] Others ::= 'C' [cfon]? 4396 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+ 4397 */ 4398static void 4399xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) { 4400 int cur; 4401 xmlRegAtomType type = (xmlRegAtomType) 0; 4402 xmlChar *blockName = NULL; 4403 4404 cur = CUR; 4405 if (cur == 'L') { 4406 NEXT; 4407 cur = CUR; 4408 if (cur == 'u') { 4409 NEXT; 4410 type = XML_REGEXP_LETTER_UPPERCASE; 4411 } else if (cur == 'l') { 4412 NEXT; 4413 type = XML_REGEXP_LETTER_LOWERCASE; 4414 } else if (cur == 't') { 4415 NEXT; 4416 type = XML_REGEXP_LETTER_TITLECASE; 4417 } else if (cur == 'm') { 4418 NEXT; 4419 type = XML_REGEXP_LETTER_MODIFIER; 4420 } else if (cur == 'o') { 4421 NEXT; 4422 type = XML_REGEXP_LETTER_OTHERS; 4423 } else { 4424 type = XML_REGEXP_LETTER; 4425 } 4426 } else if (cur == 'M') { 4427 NEXT; 4428 cur = CUR; 4429 if (cur == 'n') { 4430 NEXT; 4431 /* nonspacing */ 4432 type = XML_REGEXP_MARK_NONSPACING; 4433 } else if (cur == 'c') { 4434 NEXT; 4435 /* spacing combining */ 4436 type = XML_REGEXP_MARK_SPACECOMBINING; 4437 } else if (cur == 'e') { 4438 NEXT; 4439 /* enclosing */ 4440 type = XML_REGEXP_MARK_ENCLOSING; 4441 } else { 4442 /* all marks */ 4443 type = XML_REGEXP_MARK; 4444 } 4445 } else if (cur == 'N') { 4446 NEXT; 4447 cur = CUR; 4448 if (cur == 'd') { 4449 NEXT; 4450 /* digital */ 4451 type = XML_REGEXP_NUMBER_DECIMAL; 4452 } else if (cur == 'l') { 4453 NEXT; 4454 /* letter */ 4455 type = XML_REGEXP_NUMBER_LETTER; 4456 } else if (cur == 'o') { 4457 NEXT; 4458 /* other */ 4459 type = XML_REGEXP_NUMBER_OTHERS; 4460 } else { 4461 /* all numbers */ 4462 type = XML_REGEXP_NUMBER; 4463 } 4464 } else if (cur == 'P') { 4465 NEXT; 4466 cur = CUR; 4467 if (cur == 'c') { 4468 NEXT; 4469 /* connector */ 4470 type = XML_REGEXP_PUNCT_CONNECTOR; 4471 } else if (cur == 'd') { 4472 NEXT; 4473 /* dash */ 4474 type = XML_REGEXP_PUNCT_DASH; 4475 } else if (cur == 's') { 4476 NEXT; 4477 /* open */ 4478 type = XML_REGEXP_PUNCT_OPEN; 4479 } else if (cur == 'e') { 4480 NEXT; 4481 /* close */ 4482 type = XML_REGEXP_PUNCT_CLOSE; 4483 } else if (cur == 'i') { 4484 NEXT; 4485 /* initial quote */ 4486 type = XML_REGEXP_PUNCT_INITQUOTE; 4487 } else if (cur == 'f') { 4488 NEXT; 4489 /* final quote */ 4490 type = XML_REGEXP_PUNCT_FINQUOTE; 4491 } else if (cur == 'o') { 4492 NEXT; 4493 /* other */ 4494 type = XML_REGEXP_PUNCT_OTHERS; 4495 } else { 4496 /* all punctuation */ 4497 type = XML_REGEXP_PUNCT; 4498 } 4499 } else if (cur == 'Z') { 4500 NEXT; 4501 cur = CUR; 4502 if (cur == 's') { 4503 NEXT; 4504 /* space */ 4505 type = XML_REGEXP_SEPAR_SPACE; 4506 } else if (cur == 'l') { 4507 NEXT; 4508 /* line */ 4509 type = XML_REGEXP_SEPAR_LINE; 4510 } else if (cur == 'p') { 4511 NEXT; 4512 /* paragraph */ 4513 type = XML_REGEXP_SEPAR_PARA; 4514 } else { 4515 /* all separators */ 4516 type = XML_REGEXP_SEPAR; 4517 } 4518 } else if (cur == 'S') { 4519 NEXT; 4520 cur = CUR; 4521 if (cur == 'm') { 4522 NEXT; 4523 type = XML_REGEXP_SYMBOL_MATH; 4524 /* math */ 4525 } else if (cur == 'c') { 4526 NEXT; 4527 type = XML_REGEXP_SYMBOL_CURRENCY; 4528 /* currency */ 4529 } else if (cur == 'k') { 4530 NEXT; 4531 type = XML_REGEXP_SYMBOL_MODIFIER; 4532 /* modifiers */ 4533 } else if (cur == 'o') { 4534 NEXT; 4535 type = XML_REGEXP_SYMBOL_OTHERS; 4536 /* other */ 4537 } else { 4538 /* all symbols */ 4539 type = XML_REGEXP_SYMBOL; 4540 } 4541 } else if (cur == 'C') { 4542 NEXT; 4543 cur = CUR; 4544 if (cur == 'c') { 4545 NEXT; 4546 /* control */ 4547 type = XML_REGEXP_OTHER_CONTROL; 4548 } else if (cur == 'f') { 4549 NEXT; 4550 /* format */ 4551 type = XML_REGEXP_OTHER_FORMAT; 4552 } else if (cur == 'o') { 4553 NEXT; 4554 /* private use */ 4555 type = XML_REGEXP_OTHER_PRIVATE; 4556 } else if (cur == 'n') { 4557 NEXT; 4558 /* not assigned */ 4559 type = XML_REGEXP_OTHER_NA; 4560 } else { 4561 /* all others */ 4562 type = XML_REGEXP_OTHER; 4563 } 4564 } else if (cur == 'I') { 4565 const xmlChar *start; 4566 NEXT; 4567 cur = CUR; 4568 if (cur != 's') { 4569 ERROR("IsXXXX expected"); 4570 return; 4571 } 4572 NEXT; 4573 start = ctxt->cur; 4574 cur = CUR; 4575 if (((cur >= 'a') && (cur <= 'z')) || 4576 ((cur >= 'A') && (cur <= 'Z')) || 4577 ((cur >= '0') && (cur <= '9')) || 4578 (cur == 0x2D)) { 4579 NEXT; 4580 cur = CUR; 4581 while (((cur >= 'a') && (cur <= 'z')) || 4582 ((cur >= 'A') && (cur <= 'Z')) || 4583 ((cur >= '0') && (cur <= '9')) || 4584 (cur == 0x2D)) { 4585 NEXT; 4586 cur = CUR; 4587 } 4588 } 4589 type = XML_REGEXP_BLOCK_NAME; 4590 blockName = xmlStrndup(start, ctxt->cur - start); 4591 } else { 4592 ERROR("Unknown char property"); 4593 return; 4594 } 4595 if (ctxt->atom == NULL) { 4596 ctxt->atom = xmlRegNewAtom(ctxt, type); 4597 if (ctxt->atom != NULL) 4598 ctxt->atom->valuep = blockName; 4599 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4600 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4601 type, 0, 0, blockName); 4602 } 4603} 4604 4605/** 4606 * xmlFAParseCharClassEsc: 4607 * @ctxt: a regexp parser context 4608 * 4609 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc ) 4610 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E] 4611 * [25] catEsc ::= '\p{' charProp '}' 4612 * [26] complEsc ::= '\P{' charProp '}' 4613 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW]) 4614 */ 4615static void 4616xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) { 4617 int cur; 4618 4619 if (CUR == '.') { 4620 if (ctxt->atom == NULL) { 4621 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR); 4622 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4623 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4624 XML_REGEXP_ANYCHAR, 0, 0, NULL); 4625 } 4626 NEXT; 4627 return; 4628 } 4629 if (CUR != '\\') { 4630 ERROR("Escaped sequence: expecting \\"); 4631 return; 4632 } 4633 NEXT; 4634 cur = CUR; 4635 if (cur == 'p') { 4636 NEXT; 4637 if (CUR != '{') { 4638 ERROR("Expecting '{'"); 4639 return; 4640 } 4641 NEXT; 4642 xmlFAParseCharProp(ctxt); 4643 if (CUR != '}') { 4644 ERROR("Expecting '}'"); 4645 return; 4646 } 4647 NEXT; 4648 } else if (cur == 'P') { 4649 NEXT; 4650 if (CUR != '{') { 4651 ERROR("Expecting '{'"); 4652 return; 4653 } 4654 NEXT; 4655 xmlFAParseCharProp(ctxt); 4656 ctxt->atom->neg = 1; 4657 if (CUR != '}') { 4658 ERROR("Expecting '}'"); 4659 return; 4660 } 4661 NEXT; 4662 } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') || 4663 (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') || 4664 (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') || 4665 (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) || 4666 (cur == 0x5E)) { 4667 if (ctxt->atom == NULL) { 4668 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); 4669 if (ctxt->atom != NULL) { 4670 switch (cur) { 4671 case 'n': 4672 ctxt->atom->codepoint = '\n'; 4673 break; 4674 case 'r': 4675 ctxt->atom->codepoint = '\r'; 4676 break; 4677 case 't': 4678 ctxt->atom->codepoint = '\t'; 4679 break; 4680 default: 4681 ctxt->atom->codepoint = cur; 4682 } 4683 } 4684 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4685 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4686 XML_REGEXP_CHARVAL, cur, cur, NULL); 4687 } 4688 NEXT; 4689 } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') || 4690 (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') || 4691 (cur == 'w') || (cur == 'W')) { 4692 xmlRegAtomType type = XML_REGEXP_ANYSPACE; 4693 4694 switch (cur) { 4695 case 's': 4696 type = XML_REGEXP_ANYSPACE; 4697 break; 4698 case 'S': 4699 type = XML_REGEXP_NOTSPACE; 4700 break; 4701 case 'i': 4702 type = XML_REGEXP_INITNAME; 4703 break; 4704 case 'I': 4705 type = XML_REGEXP_NOTINITNAME; 4706 break; 4707 case 'c': 4708 type = XML_REGEXP_NAMECHAR; 4709 break; 4710 case 'C': 4711 type = XML_REGEXP_NOTNAMECHAR; 4712 break; 4713 case 'd': 4714 type = XML_REGEXP_DECIMAL; 4715 break; 4716 case 'D': 4717 type = XML_REGEXP_NOTDECIMAL; 4718 break; 4719 case 'w': 4720 type = XML_REGEXP_REALCHAR; 4721 break; 4722 case 'W': 4723 type = XML_REGEXP_NOTREALCHAR; 4724 break; 4725 } 4726 NEXT; 4727 if (ctxt->atom == NULL) { 4728 ctxt->atom = xmlRegNewAtom(ctxt, type); 4729 } else if (ctxt->atom->type == XML_REGEXP_RANGES) { 4730 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4731 type, 0, 0, NULL); 4732 } 4733 } 4734} 4735 4736/** 4737 * xmlFAParseCharRef: 4738 * @ctxt: a regexp parser context 4739 * 4740 * [19] XmlCharRef ::= ( '&#' [0-9]+ ';' ) | (' &#x' [0-9a-fA-F]+ ';' ) 4741 */ 4742static int 4743xmlFAParseCharRef(xmlRegParserCtxtPtr ctxt) { 4744 int ret = 0, cur; 4745 4746 if ((CUR != '&') || (NXT(1) != '#')) 4747 return(-1); 4748 NEXT; 4749 NEXT; 4750 cur = CUR; 4751 if (cur == 'x') { 4752 NEXT; 4753 cur = CUR; 4754 if (((cur >= '0') && (cur <= '9')) || 4755 ((cur >= 'a') && (cur <= 'f')) || 4756 ((cur >= 'A') && (cur <= 'F'))) { 4757 while (((cur >= '0') && (cur <= '9')) || 4758 ((cur >= 'a') && (cur <= 'f')) || 4759 ((cur >= 'A') && (cur <= 'F'))) { 4760 if ((cur >= '0') && (cur <= '9')) 4761 ret = ret * 16 + cur - '0'; 4762 else if ((cur >= 'a') && (cur <= 'f')) 4763 ret = ret * 16 + 10 + (cur - 'a'); 4764 else 4765 ret = ret * 16 + 10 + (cur - 'A'); 4766 NEXT; 4767 cur = CUR; 4768 } 4769 } else { 4770 ERROR("Char ref: expecting [0-9A-F]"); 4771 return(-1); 4772 } 4773 } else { 4774 if ((cur >= '0') && (cur <= '9')) { 4775 while ((cur >= '0') && (cur <= '9')) { 4776 ret = ret * 10 + cur - '0'; 4777 NEXT; 4778 cur = CUR; 4779 } 4780 } else { 4781 ERROR("Char ref: expecting [0-9]"); 4782 return(-1); 4783 } 4784 } 4785 if (cur != ';') { 4786 ERROR("Char ref: expecting ';'"); 4787 return(-1); 4788 } else { 4789 NEXT; 4790 } 4791 return(ret); 4792} 4793 4794/** 4795 * xmlFAParseCharRange: 4796 * @ctxt: a regexp parser context 4797 * 4798 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash 4799 * [18] seRange ::= charOrEsc '-' charOrEsc 4800 * [20] charOrEsc ::= XmlChar | SingleCharEsc 4801 * [21] XmlChar ::= [^\#x2D#x5B#x5D] 4802 * [22] XmlCharIncDash ::= [^\#x5B#x5D] 4803 */ 4804static void 4805xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) { 4806 int cur, len; 4807 int start = -1; 4808 int end = -1; 4809 4810 if (CUR == '\0') { 4811 ERROR("Expecting ']'"); 4812 return; 4813 } 4814 4815 if ((CUR == '&') && (NXT(1) == '#')) { 4816 end = start = xmlFAParseCharRef(ctxt); 4817 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4818 XML_REGEXP_CHARVAL, start, end, NULL); 4819 return; 4820 } 4821 cur = CUR; 4822 if (cur == '\\') { 4823 NEXT; 4824 cur = CUR; 4825 switch (cur) { 4826 case 'n': start = 0xA; break; 4827 case 'r': start = 0xD; break; 4828 case 't': start = 0x9; break; 4829 case '\\': case '|': case '.': case '-': case '^': case '?': 4830 case '*': case '+': case '{': case '}': case '(': case ')': 4831 case '[': case ']': 4832 start = cur; break; 4833 default: 4834 ERROR("Invalid escape value"); 4835 return; 4836 } 4837 end = start; 4838 len = 1; 4839 } else if ((cur != 0x5B) && (cur != 0x5D)) { 4840 end = start = CUR_SCHAR(ctxt->cur, len); 4841 } else { 4842 ERROR("Expecting a char range"); 4843 return; 4844 } 4845 NEXTL(len); 4846 if (start == '-') { 4847 return; 4848 } 4849 cur = CUR; 4850 if ((cur != '-') || (NXT(1) == ']')) { 4851 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4852 XML_REGEXP_CHARVAL, start, end, NULL); 4853 return; 4854 } 4855 NEXT; 4856 cur = CUR; 4857 if (cur == '\\') { 4858 NEXT; 4859 cur = CUR; 4860 switch (cur) { 4861 case 'n': end = 0xA; break; 4862 case 'r': end = 0xD; break; 4863 case 't': end = 0x9; break; 4864 case '\\': case '|': case '.': case '-': case '^': case '?': 4865 case '*': case '+': case '{': case '}': case '(': case ')': 4866 case '[': case ']': 4867 end = cur; break; 4868 default: 4869 ERROR("Invalid escape value"); 4870 return; 4871 } 4872 len = 1; 4873 } else if ((cur != 0x5B) && (cur != 0x5D)) { 4874 end = CUR_SCHAR(ctxt->cur, len); 4875 } else { 4876 ERROR("Expecting the end of a char range"); 4877 return; 4878 } 4879 NEXTL(len); 4880 /* TODO check that the values are acceptable character ranges for XML */ 4881 if (end < start) { 4882 ERROR("End of range is before start of range"); 4883 } else { 4884 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg, 4885 XML_REGEXP_CHARVAL, start, end, NULL); 4886 } 4887 return; 4888} 4889 4890/** 4891 * xmlFAParsePosCharGroup: 4892 * @ctxt: a regexp parser context 4893 * 4894 * [14] posCharGroup ::= ( charRange | charClassEsc )+ 4895 */ 4896static void 4897xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) { 4898 do { 4899 if ((CUR == '\\') || (CUR == '.')) { 4900 xmlFAParseCharClassEsc(ctxt); 4901 } else { 4902 xmlFAParseCharRange(ctxt); 4903 } 4904 } while ((CUR != ']') && (CUR != '^') && (CUR != '-') && 4905 (CUR != 0) && (ctxt->error == 0)); 4906} 4907 4908/** 4909 * xmlFAParseCharGroup: 4910 * @ctxt: a regexp parser context 4911 * 4912 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub 4913 * [15] negCharGroup ::= '^' posCharGroup 4914 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr 4915 * [12] charClassExpr ::= '[' charGroup ']' 4916 */ 4917static void 4918xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) { 4919 int n = ctxt->neg; 4920 while ((CUR != ']') && (ctxt->error == 0)) { 4921 if (CUR == '^') { 4922 int neg = ctxt->neg; 4923 4924 NEXT; 4925 ctxt->neg = !ctxt->neg; 4926 xmlFAParsePosCharGroup(ctxt); 4927 ctxt->neg = neg; 4928 } else if ((CUR == '-') && (NXT(1) == '[')) { 4929 int neg = ctxt->neg; 4930 ctxt->neg = 2; 4931 NEXT; /* eat the '-' */ 4932 NEXT; /* eat the '[' */ 4933 xmlFAParseCharGroup(ctxt); 4934 if (CUR == ']') { 4935 NEXT; 4936 } else { 4937 ERROR("charClassExpr: ']' expected"); 4938 break; 4939 } 4940 ctxt->neg = neg; 4941 break; 4942 } else if (CUR != ']') { 4943 xmlFAParsePosCharGroup(ctxt); 4944 } 4945 } 4946 ctxt->neg = n; 4947} 4948 4949/** 4950 * xmlFAParseCharClass: 4951 * @ctxt: a regexp parser context 4952 * 4953 * [11] charClass ::= charClassEsc | charClassExpr 4954 * [12] charClassExpr ::= '[' charGroup ']' 4955 */ 4956static void 4957xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) { 4958 if (CUR == '[') { 4959 NEXT; 4960 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES); 4961 if (ctxt->atom == NULL) 4962 return; 4963 xmlFAParseCharGroup(ctxt); 4964 if (CUR == ']') { 4965 NEXT; 4966 } else { 4967 ERROR("xmlFAParseCharClass: ']' expected"); 4968 } 4969 } else { 4970 xmlFAParseCharClassEsc(ctxt); 4971 } 4972} 4973 4974/** 4975 * xmlFAParseQuantExact: 4976 * @ctxt: a regexp parser context 4977 * 4978 * [8] QuantExact ::= [0-9]+ 4979 * 4980 * Returns 0 if success or -1 in case of error 4981 */ 4982static int 4983xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) { 4984 int ret = 0; 4985 int ok = 0; 4986 4987 while ((CUR >= '0') && (CUR <= '9')) { 4988 ret = ret * 10 + (CUR - '0'); 4989 ok = 1; 4990 NEXT; 4991 } 4992 if (ok != 1) { 4993 return(-1); 4994 } 4995 return(ret); 4996} 4997 4998/** 4999 * xmlFAParseQuantifier: 5000 * @ctxt: a regexp parser context 5001 * 5002 * [4] quantifier ::= [?*+] | ( '{' quantity '}' ) 5003 * [5] quantity ::= quantRange | quantMin | QuantExact 5004 * [6] quantRange ::= QuantExact ',' QuantExact 5005 * [7] quantMin ::= QuantExact ',' 5006 * [8] QuantExact ::= [0-9]+ 5007 */ 5008static int 5009xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) { 5010 int cur; 5011 5012 cur = CUR; 5013 if ((cur == '?') || (cur == '*') || (cur == '+')) { 5014 if (ctxt->atom != NULL) { 5015 if (cur == '?') 5016 ctxt->atom->quant = XML_REGEXP_QUANT_OPT; 5017 else if (cur == '*') 5018 ctxt->atom->quant = XML_REGEXP_QUANT_MULT; 5019 else if (cur == '+') 5020 ctxt->atom->quant = XML_REGEXP_QUANT_PLUS; 5021 } 5022 NEXT; 5023 return(1); 5024 } 5025 if (cur == '{') { 5026 int min = 0, max = 0; 5027 5028 NEXT; 5029 cur = xmlFAParseQuantExact(ctxt); 5030 if (cur >= 0) 5031 min = cur; 5032 if (CUR == ',') { 5033 NEXT; 5034 if (CUR == '}') 5035 max = INT_MAX; 5036 else { 5037 cur = xmlFAParseQuantExact(ctxt); 5038 if (cur >= 0) 5039 max = cur; 5040 else { 5041 ERROR("Improper quantifier"); 5042 } 5043 } 5044 } 5045 if (CUR == '}') { 5046 NEXT; 5047 } else { 5048 ERROR("Unterminated quantifier"); 5049 } 5050 if (max == 0) 5051 max = min; 5052 if (ctxt->atom != NULL) { 5053 ctxt->atom->quant = XML_REGEXP_QUANT_RANGE; 5054 ctxt->atom->min = min; 5055 ctxt->atom->max = max; 5056 } 5057 return(1); 5058 } 5059 return(0); 5060} 5061 5062/** 5063 * xmlFAParseAtom: 5064 * @ctxt: a regexp parser context 5065 * 5066 * [9] atom ::= Char | charClass | ( '(' regExp ')' ) 5067 */ 5068static int 5069xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) { 5070 int codepoint, len; 5071 5072 codepoint = xmlFAIsChar(ctxt); 5073 if (codepoint > 0) { 5074 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL); 5075 if (ctxt->atom == NULL) 5076 return(-1); 5077 codepoint = CUR_SCHAR(ctxt->cur, len); 5078 ctxt->atom->codepoint = codepoint; 5079 NEXTL(len); 5080 return(1); 5081 } else if (CUR == '|') { 5082 return(0); 5083 } else if (CUR == 0) { 5084 return(0); 5085 } else if (CUR == ')') { 5086 return(0); 5087 } else if (CUR == '(') { 5088 xmlRegStatePtr start, oldend; 5089 5090 NEXT; 5091 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL); 5092 start = ctxt->state; 5093 oldend = ctxt->end; 5094 ctxt->end = NULL; 5095 ctxt->atom = NULL; 5096 xmlFAParseRegExp(ctxt, 0); 5097 if (CUR == ')') { 5098 NEXT; 5099 } else { 5100 ERROR("xmlFAParseAtom: expecting ')'"); 5101 } 5102 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG); 5103 if (ctxt->atom == NULL) 5104 return(-1); 5105 ctxt->atom->start = start; 5106 ctxt->atom->stop = ctxt->state; 5107 ctxt->end = oldend; 5108 return(1); 5109 } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) { 5110 xmlFAParseCharClass(ctxt); 5111 return(1); 5112 } 5113 return(0); 5114} 5115 5116/** 5117 * xmlFAParsePiece: 5118 * @ctxt: a regexp parser context 5119 * 5120 * [3] piece ::= atom quantifier? 5121 */ 5122static int 5123xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) { 5124 int ret; 5125 5126 ctxt->atom = NULL; 5127 ret = xmlFAParseAtom(ctxt); 5128 if (ret == 0) 5129 return(0); 5130 if (ctxt->atom == NULL) { 5131 ERROR("internal: no atom generated"); 5132 } 5133 xmlFAParseQuantifier(ctxt); 5134 return(1); 5135} 5136 5137/** 5138 * xmlFAParseBranch: 5139 * @ctxt: a regexp parser context 5140 * @to: optional target to the end of the branch 5141 * 5142 * @to is used to optimize by removing duplicate path in automata 5143 * in expressions like (a|b)(c|d) 5144 * 5145 * [2] branch ::= piece* 5146 */ 5147static int 5148xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr to) { 5149 xmlRegStatePtr previous; 5150 int ret; 5151 5152 previous = ctxt->state; 5153 ret = xmlFAParsePiece(ctxt); 5154 if (ret != 0) { 5155 if (xmlFAGenerateTransitions(ctxt, previous, 5156 (CUR=='|' || CUR==')') ? to : NULL, ctxt->atom) < 0) 5157 return(-1); 5158 previous = ctxt->state; 5159 ctxt->atom = NULL; 5160 } 5161 while ((ret != 0) && (ctxt->error == 0)) { 5162 ret = xmlFAParsePiece(ctxt); 5163 if (ret != 0) { 5164 if (xmlFAGenerateTransitions(ctxt, previous, 5165 (CUR=='|' || CUR==')') ? to : NULL, ctxt->atom) < 0) 5166 return(-1); 5167 previous = ctxt->state; 5168 ctxt->atom = NULL; 5169 } 5170 } 5171 return(0); 5172} 5173 5174/** 5175 * xmlFAParseRegExp: 5176 * @ctxt: a regexp parser context 5177 * @top: is this the top-level expression ? 5178 * 5179 * [1] regExp ::= branch ( '|' branch )* 5180 */ 5181static void 5182xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) { 5183 xmlRegStatePtr start, end; 5184 5185 /* if not top start should have been generated by an epsilon trans */ 5186 start = ctxt->state; 5187 ctxt->end = NULL; 5188 xmlFAParseBranch(ctxt, NULL); 5189 if (top) { 5190#ifdef DEBUG_REGEXP_GRAPH 5191 printf("State %d is final\n", ctxt->state->no); 5192#endif 5193 ctxt->state->type = XML_REGEXP_FINAL_STATE; 5194 } 5195 if (CUR != '|') { 5196 ctxt->end = ctxt->state; 5197 return; 5198 } 5199 end = ctxt->state; 5200 while ((CUR == '|') && (ctxt->error == 0)) { 5201 NEXT; 5202 ctxt->state = start; 5203 ctxt->end = NULL; 5204 xmlFAParseBranch(ctxt, end); 5205 } 5206 if (!top) { 5207 ctxt->state = end; 5208 ctxt->end = end; 5209 } 5210} 5211 5212/************************************************************************ 5213 * * 5214 * The basic API * 5215 * * 5216 ************************************************************************/ 5217 5218/** 5219 * xmlRegexpPrint: 5220 * @output: the file for the output debug 5221 * @regexp: the compiled regexp 5222 * 5223 * Print the content of the compiled regular expression 5224 */ 5225void 5226xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) { 5227 int i; 5228 5229 if (output == NULL) 5230 return; 5231 fprintf(output, " regexp: "); 5232 if (regexp == NULL) { 5233 fprintf(output, "NULL\n"); 5234 return; 5235 } 5236 fprintf(output, "'%s' ", regexp->string); 5237 fprintf(output, "\n"); 5238 fprintf(output, "%d atoms:\n", regexp->nbAtoms); 5239 for (i = 0;i < regexp->nbAtoms; i++) { 5240 fprintf(output, " %02d ", i); 5241 xmlRegPrintAtom(output, regexp->atoms[i]); 5242 } 5243 fprintf(output, "%d states:", regexp->nbStates); 5244 fprintf(output, "\n"); 5245 for (i = 0;i < regexp->nbStates; i++) { 5246 xmlRegPrintState(output, regexp->states[i]); 5247 } 5248 fprintf(output, "%d counters:\n", regexp->nbCounters); 5249 for (i = 0;i < regexp->nbCounters; i++) { 5250 fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min, 5251 regexp->counters[i].max); 5252 } 5253} 5254 5255/** 5256 * xmlRegexpCompile: 5257 * @regexp: a regular expression string 5258 * 5259 * Parses a regular expression conforming to XML Schemas Part 2 Datatype 5260 * Appendix F and builds an automata suitable for testing strings against 5261 * that regular expression 5262 * 5263 * Returns the compiled expression or NULL in case of error 5264 */ 5265xmlRegexpPtr 5266xmlRegexpCompile(const xmlChar *regexp) { 5267 xmlRegexpPtr ret; 5268 xmlRegParserCtxtPtr ctxt; 5269 5270 ctxt = xmlRegNewParserCtxt(regexp); 5271 if (ctxt == NULL) 5272 return(NULL); 5273 5274 /* initialize the parser */ 5275 ctxt->end = NULL; 5276 ctxt->start = ctxt->state = xmlRegNewState(ctxt); 5277 xmlRegStatePush(ctxt, ctxt->start); 5278 5279 /* parse the expression building an automata */ 5280 xmlFAParseRegExp(ctxt, 1); 5281 if (CUR != 0) { 5282 ERROR("xmlFAParseRegExp: extra characters"); 5283 } 5284 ctxt->end = ctxt->state; 5285 ctxt->start->type = XML_REGEXP_START_STATE; 5286 ctxt->end->type = XML_REGEXP_FINAL_STATE; 5287 5288 /* remove the Epsilon except for counted transitions */ 5289 xmlFAEliminateEpsilonTransitions(ctxt); 5290 5291 5292 if (ctxt->error != 0) { 5293 xmlRegFreeParserCtxt(ctxt); 5294 return(NULL); 5295 } 5296 ret = xmlRegEpxFromParse(ctxt); 5297 xmlRegFreeParserCtxt(ctxt); 5298 return(ret); 5299} 5300 5301/** 5302 * xmlRegexpExec: 5303 * @comp: the compiled regular expression 5304 * @content: the value to check against the regular expression 5305 * 5306 * Check if the regular expression generates the value 5307 * 5308 * Returns 1 if it matches, 0 if not and a negative value in case of error 5309 */ 5310int 5311xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) { 5312 if ((comp == NULL) || (content == NULL)) 5313 return(-1); 5314 return(xmlFARegExec(comp, content)); 5315} 5316 5317/** 5318 * xmlRegexpIsDeterminist: 5319 * @comp: the compiled regular expression 5320 * 5321 * Check if the regular expression is determinist 5322 * 5323 * Returns 1 if it yes, 0 if not and a negative value in case of error 5324 */ 5325int 5326xmlRegexpIsDeterminist(xmlRegexpPtr comp) { 5327 xmlAutomataPtr am; 5328 int ret; 5329 5330 if (comp == NULL) 5331 return(-1); 5332 if (comp->determinist != -1) 5333 return(comp->determinist); 5334 5335 am = xmlNewAutomata(); 5336 if (am->states != NULL) { 5337 int i; 5338 5339 for (i = 0;i < am->nbStates;i++) 5340 xmlRegFreeState(am->states[i]); 5341 xmlFree(am->states); 5342 } 5343 am->nbAtoms = comp->nbAtoms; 5344 am->atoms = comp->atoms; 5345 am->nbStates = comp->nbStates; 5346 am->states = comp->states; 5347 am->determinist = -1; 5348 ret = xmlFAComputesDeterminism(am); 5349 am->atoms = NULL; 5350 am->states = NULL; 5351 xmlFreeAutomata(am); 5352 return(ret); 5353} 5354 5355/** 5356 * xmlRegFreeRegexp: 5357 * @regexp: the regexp 5358 * 5359 * Free a regexp 5360 */ 5361void 5362xmlRegFreeRegexp(xmlRegexpPtr regexp) { 5363 int i; 5364 if (regexp == NULL) 5365 return; 5366 5367 if (regexp->string != NULL) 5368 xmlFree(regexp->string); 5369 if (regexp->states != NULL) { 5370 for (i = 0;i < regexp->nbStates;i++) 5371 xmlRegFreeState(regexp->states[i]); 5372 xmlFree(regexp->states); 5373 } 5374 if (regexp->atoms != NULL) { 5375 for (i = 0;i < regexp->nbAtoms;i++) 5376 xmlRegFreeAtom(regexp->atoms[i]); 5377 xmlFree(regexp->atoms); 5378 } 5379 if (regexp->counters != NULL) 5380 xmlFree(regexp->counters); 5381 if (regexp->compact != NULL) 5382 xmlFree(regexp->compact); 5383 if (regexp->transdata != NULL) 5384 xmlFree(regexp->transdata); 5385 if (regexp->stringMap != NULL) { 5386 for (i = 0; i < regexp->nbstrings;i++) 5387 xmlFree(regexp->stringMap[i]); 5388 xmlFree(regexp->stringMap); 5389 } 5390 5391 xmlFree(regexp); 5392} 5393 5394#ifdef LIBXML_AUTOMATA_ENABLED 5395/************************************************************************ 5396 * * 5397 * The Automata interface * 5398 * * 5399 ************************************************************************/ 5400 5401/** 5402 * xmlNewAutomata: 5403 * 5404 * Create a new automata 5405 * 5406 * Returns the new object or NULL in case of failure 5407 */ 5408xmlAutomataPtr 5409xmlNewAutomata(void) { 5410 xmlAutomataPtr ctxt; 5411 5412 ctxt = xmlRegNewParserCtxt(NULL); 5413 if (ctxt == NULL) 5414 return(NULL); 5415 5416 /* initialize the parser */ 5417 ctxt->end = NULL; 5418 ctxt->start = ctxt->state = xmlRegNewState(ctxt); 5419 if (ctxt->start == NULL) { 5420 xmlFreeAutomata(ctxt); 5421 return(NULL); 5422 } 5423 ctxt->start->type = XML_REGEXP_START_STATE; 5424 if (xmlRegStatePush(ctxt, ctxt->start) < 0) { 5425 xmlRegFreeState(ctxt->start); 5426 xmlFreeAutomata(ctxt); 5427 return(NULL); 5428 } 5429 5430 return(ctxt); 5431} 5432 5433/** 5434 * xmlFreeAutomata: 5435 * @am: an automata 5436 * 5437 * Free an automata 5438 */ 5439void 5440xmlFreeAutomata(xmlAutomataPtr am) { 5441 if (am == NULL) 5442 return; 5443 xmlRegFreeParserCtxt(am); 5444} 5445 5446/** 5447 * xmlAutomataGetInitState: 5448 * @am: an automata 5449 * 5450 * Initial state lookup 5451 * 5452 * Returns the initial state of the automata 5453 */ 5454xmlAutomataStatePtr 5455xmlAutomataGetInitState(xmlAutomataPtr am) { 5456 if (am == NULL) 5457 return(NULL); 5458 return(am->start); 5459} 5460 5461/** 5462 * xmlAutomataSetFinalState: 5463 * @am: an automata 5464 * @state: a state in this automata 5465 * 5466 * Makes that state a final state 5467 * 5468 * Returns 0 or -1 in case of error 5469 */ 5470int 5471xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) { 5472 if ((am == NULL) || (state == NULL)) 5473 return(-1); 5474 state->type = XML_REGEXP_FINAL_STATE; 5475 return(0); 5476} 5477 5478/** 5479 * xmlAutomataNewTransition: 5480 * @am: an automata 5481 * @from: the starting point of the transition 5482 * @to: the target point of the transition or NULL 5483 * @token: the input string associated to that transition 5484 * @data: data passed to the callback function if the transition is activated 5485 * 5486 * If @to is NULL, this creates first a new target state in the automata 5487 * and then adds a transition from the @from state to the target state 5488 * activated by the value of @token 5489 * 5490 * Returns the target state or NULL in case of error 5491 */ 5492xmlAutomataStatePtr 5493xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from, 5494 xmlAutomataStatePtr to, const xmlChar *token, 5495 void *data) { 5496 xmlRegAtomPtr atom; 5497 5498 if ((am == NULL) || (from == NULL) || (token == NULL)) 5499 return(NULL); 5500 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5501 if (atom == NULL) 5502 return(NULL); 5503 atom->data = data; 5504 if (atom == NULL) 5505 return(NULL); 5506 atom->valuep = xmlStrdup(token); 5507 5508 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5509 xmlRegFreeAtom(atom); 5510 return(NULL); 5511 } 5512 if (to == NULL) 5513 return(am->state); 5514 return(to); 5515} 5516 5517/** 5518 * xmlAutomataNewTransition2: 5519 * @am: an automata 5520 * @from: the starting point of the transition 5521 * @to: the target point of the transition or NULL 5522 * @token: the first input string associated to that transition 5523 * @token2: the second input string associated to that transition 5524 * @data: data passed to the callback function if the transition is activated 5525 * 5526 * If @to is NULL, this creates first a new target state in the automata 5527 * and then adds a transition from the @from state to the target state 5528 * activated by the value of @token 5529 * 5530 * Returns the target state or NULL in case of error 5531 */ 5532xmlAutomataStatePtr 5533xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from, 5534 xmlAutomataStatePtr to, const xmlChar *token, 5535 const xmlChar *token2, void *data) { 5536 xmlRegAtomPtr atom; 5537 5538 if ((am == NULL) || (from == NULL) || (token == NULL)) 5539 return(NULL); 5540 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5541 if (atom == NULL) 5542 return(NULL); 5543 atom->data = data; 5544 if ((token2 == NULL) || (*token2 == 0)) { 5545 atom->valuep = xmlStrdup(token); 5546 } else { 5547 int lenn, lenp; 5548 xmlChar *str; 5549 5550 lenn = strlen((char *) token2); 5551 lenp = strlen((char *) token); 5552 5553 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5554 if (str == NULL) { 5555 xmlRegFreeAtom(atom); 5556 return(NULL); 5557 } 5558 memcpy(&str[0], token, lenp); 5559 str[lenp] = '|'; 5560 memcpy(&str[lenp + 1], token2, lenn); 5561 str[lenn + lenp + 1] = 0; 5562 5563 atom->valuep = str; 5564 } 5565 5566 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5567 xmlRegFreeAtom(atom); 5568 return(NULL); 5569 } 5570 if (to == NULL) 5571 return(am->state); 5572 return(to); 5573} 5574 5575/** 5576 * xmlAutomataNewNegTrans: 5577 * @am: an automata 5578 * @from: the starting point of the transition 5579 * @to: the target point of the transition or NULL 5580 * @token: the first input string associated to that transition 5581 * @token2: the second input string associated to that transition 5582 * @data: data passed to the callback function if the transition is activated 5583 * 5584 * If @to is NULL, this creates first a new target state in the automata 5585 * and then adds a transition from the @from state to the target state 5586 * activated by any value except (@token,@token2) 5587 * Note that if @token2 is not NULL, then (X, NULL) won't match to follow 5588 # the semantic of XSD ##other 5589 * 5590 * Returns the target state or NULL in case of error 5591 */ 5592xmlAutomataStatePtr 5593xmlAutomataNewNegTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 5594 xmlAutomataStatePtr to, const xmlChar *token, 5595 const xmlChar *token2, void *data) { 5596 xmlRegAtomPtr atom; 5597 xmlChar err_msg[200]; 5598 5599 if ((am == NULL) || (from == NULL) || (token == NULL)) 5600 return(NULL); 5601 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5602 if (atom == NULL) 5603 return(NULL); 5604 atom->data = data; 5605 atom->neg = 1; 5606 if ((token2 == NULL) || (*token2 == 0)) { 5607 atom->valuep = xmlStrdup(token); 5608 } else { 5609 int lenn, lenp; 5610 xmlChar *str; 5611 5612 lenn = strlen((char *) token2); 5613 lenp = strlen((char *) token); 5614 5615 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5616 if (str == NULL) { 5617 xmlRegFreeAtom(atom); 5618 return(NULL); 5619 } 5620 memcpy(&str[0], token, lenp); 5621 str[lenp] = '|'; 5622 memcpy(&str[lenp + 1], token2, lenn); 5623 str[lenn + lenp + 1] = 0; 5624 5625 atom->valuep = str; 5626 } 5627 snprintf((char *) err_msg, 199, "not %s", (const char *) atom->valuep); 5628 err_msg[199] = 0; 5629 atom->valuep2 = xmlStrdup(err_msg); 5630 5631 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) { 5632 xmlRegFreeAtom(atom); 5633 return(NULL); 5634 } 5635 am->negs++; 5636 if (to == NULL) 5637 return(am->state); 5638 return(to); 5639} 5640 5641/** 5642 * xmlAutomataNewCountTrans2: 5643 * @am: an automata 5644 * @from: the starting point of the transition 5645 * @to: the target point of the transition or NULL 5646 * @token: the input string associated to that transition 5647 * @token2: the second input string associated to that transition 5648 * @min: the minimum successive occurences of token 5649 * @max: the maximum successive occurences of token 5650 * @data: data associated to the transition 5651 * 5652 * If @to is NULL, this creates first a new target state in the automata 5653 * and then adds a transition from the @from state to the target state 5654 * activated by a succession of input of value @token and @token2 and 5655 * whose number is between @min and @max 5656 * 5657 * Returns the target state or NULL in case of error 5658 */ 5659xmlAutomataStatePtr 5660xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from, 5661 xmlAutomataStatePtr to, const xmlChar *token, 5662 const xmlChar *token2, 5663 int min, int max, void *data) { 5664 xmlRegAtomPtr atom; 5665 int counter; 5666 5667 if ((am == NULL) || (from == NULL) || (token == NULL)) 5668 return(NULL); 5669 if (min < 0) 5670 return(NULL); 5671 if ((max < min) || (max < 1)) 5672 return(NULL); 5673 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5674 if (atom == NULL) 5675 return(NULL); 5676 if ((token2 == NULL) || (*token2 == 0)) { 5677 atom->valuep = xmlStrdup(token); 5678 } else { 5679 int lenn, lenp; 5680 xmlChar *str; 5681 5682 lenn = strlen((char *) token2); 5683 lenp = strlen((char *) token); 5684 5685 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5686 if (str == NULL) { 5687 xmlRegFreeAtom(atom); 5688 return(NULL); 5689 } 5690 memcpy(&str[0], token, lenp); 5691 str[lenp] = '|'; 5692 memcpy(&str[lenp + 1], token2, lenn); 5693 str[lenn + lenp + 1] = 0; 5694 5695 atom->valuep = str; 5696 } 5697 atom->data = data; 5698 if (min == 0) 5699 atom->min = 1; 5700 else 5701 atom->min = min; 5702 atom->max = max; 5703 5704 /* 5705 * associate a counter to the transition. 5706 */ 5707 counter = xmlRegGetCounter(am); 5708 am->counters[counter].min = min; 5709 am->counters[counter].max = max; 5710 5711 /* xmlFAGenerateTransitions(am, from, to, atom); */ 5712 if (to == NULL) { 5713 to = xmlRegNewState(am); 5714 xmlRegStatePush(am, to); 5715 } 5716 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 5717 xmlRegAtomPush(am, atom); 5718 am->state = to; 5719 5720 if (to == NULL) 5721 to = am->state; 5722 if (to == NULL) 5723 return(NULL); 5724 if (min == 0) 5725 xmlFAGenerateEpsilonTransition(am, from, to); 5726 return(to); 5727} 5728 5729/** 5730 * xmlAutomataNewCountTrans: 5731 * @am: an automata 5732 * @from: the starting point of the transition 5733 * @to: the target point of the transition or NULL 5734 * @token: the input string associated to that transition 5735 * @min: the minimum successive occurences of token 5736 * @max: the maximum successive occurences of token 5737 * @data: data associated to the transition 5738 * 5739 * If @to is NULL, this creates first a new target state in the automata 5740 * and then adds a transition from the @from state to the target state 5741 * activated by a succession of input of value @token and whose number 5742 * is between @min and @max 5743 * 5744 * Returns the target state or NULL in case of error 5745 */ 5746xmlAutomataStatePtr 5747xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 5748 xmlAutomataStatePtr to, const xmlChar *token, 5749 int min, int max, void *data) { 5750 xmlRegAtomPtr atom; 5751 int counter; 5752 5753 if ((am == NULL) || (from == NULL) || (token == NULL)) 5754 return(NULL); 5755 if (min < 0) 5756 return(NULL); 5757 if ((max < min) || (max < 1)) 5758 return(NULL); 5759 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5760 if (atom == NULL) 5761 return(NULL); 5762 atom->valuep = xmlStrdup(token); 5763 atom->data = data; 5764 if (min == 0) 5765 atom->min = 1; 5766 else 5767 atom->min = min; 5768 atom->max = max; 5769 5770 /* 5771 * associate a counter to the transition. 5772 */ 5773 counter = xmlRegGetCounter(am); 5774 am->counters[counter].min = min; 5775 am->counters[counter].max = max; 5776 5777 /* xmlFAGenerateTransitions(am, from, to, atom); */ 5778 if (to == NULL) { 5779 to = xmlRegNewState(am); 5780 xmlRegStatePush(am, to); 5781 } 5782 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 5783 xmlRegAtomPush(am, atom); 5784 am->state = to; 5785 5786 if (to == NULL) 5787 to = am->state; 5788 if (to == NULL) 5789 return(NULL); 5790 if (min == 0) 5791 xmlFAGenerateEpsilonTransition(am, from, to); 5792 return(to); 5793} 5794 5795/** 5796 * xmlAutomataNewOnceTrans2: 5797 * @am: an automata 5798 * @from: the starting point of the transition 5799 * @to: the target point of the transition or NULL 5800 * @token: the input string associated to that transition 5801 * @token2: the second input string associated to that transition 5802 * @min: the minimum successive occurences of token 5803 * @max: the maximum successive occurences of token 5804 * @data: data associated to the transition 5805 * 5806 * If @to is NULL, this creates first a new target state in the automata 5807 * and then adds a transition from the @from state to the target state 5808 * activated by a succession of input of value @token and @token2 and whose 5809 * number is between @min and @max, moreover that transition can only be 5810 * crossed once. 5811 * 5812 * Returns the target state or NULL in case of error 5813 */ 5814xmlAutomataStatePtr 5815xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from, 5816 xmlAutomataStatePtr to, const xmlChar *token, 5817 const xmlChar *token2, 5818 int min, int max, void *data) { 5819 xmlRegAtomPtr atom; 5820 int counter; 5821 5822 if ((am == NULL) || (from == NULL) || (token == NULL)) 5823 return(NULL); 5824 if (min < 1) 5825 return(NULL); 5826 if ((max < min) || (max < 1)) 5827 return(NULL); 5828 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5829 if (atom == NULL) 5830 return(NULL); 5831 if ((token2 == NULL) || (*token2 == 0)) { 5832 atom->valuep = xmlStrdup(token); 5833 } else { 5834 int lenn, lenp; 5835 xmlChar *str; 5836 5837 lenn = strlen((char *) token2); 5838 lenp = strlen((char *) token); 5839 5840 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2); 5841 if (str == NULL) { 5842 xmlRegFreeAtom(atom); 5843 return(NULL); 5844 } 5845 memcpy(&str[0], token, lenp); 5846 str[lenp] = '|'; 5847 memcpy(&str[lenp + 1], token2, lenn); 5848 str[lenn + lenp + 1] = 0; 5849 5850 atom->valuep = str; 5851 } 5852 atom->data = data; 5853 atom->quant = XML_REGEXP_QUANT_ONCEONLY; 5854 atom->min = min; 5855 atom->max = max; 5856 /* 5857 * associate a counter to the transition. 5858 */ 5859 counter = xmlRegGetCounter(am); 5860 am->counters[counter].min = 1; 5861 am->counters[counter].max = 1; 5862 5863 /* xmlFAGenerateTransitions(am, from, to, atom); */ 5864 if (to == NULL) { 5865 to = xmlRegNewState(am); 5866 xmlRegStatePush(am, to); 5867 } 5868 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 5869 xmlRegAtomPush(am, atom); 5870 am->state = to; 5871 return(to); 5872} 5873 5874 5875 5876/** 5877 * xmlAutomataNewOnceTrans: 5878 * @am: an automata 5879 * @from: the starting point of the transition 5880 * @to: the target point of the transition or NULL 5881 * @token: the input string associated to that transition 5882 * @min: the minimum successive occurences of token 5883 * @max: the maximum successive occurences of token 5884 * @data: data associated to the transition 5885 * 5886 * If @to is NULL, this creates first a new target state in the automata 5887 * and then adds a transition from the @from state to the target state 5888 * activated by a succession of input of value @token and whose number 5889 * is between @min and @max, moreover that transition can only be crossed 5890 * once. 5891 * 5892 * Returns the target state or NULL in case of error 5893 */ 5894xmlAutomataStatePtr 5895xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 5896 xmlAutomataStatePtr to, const xmlChar *token, 5897 int min, int max, void *data) { 5898 xmlRegAtomPtr atom; 5899 int counter; 5900 5901 if ((am == NULL) || (from == NULL) || (token == NULL)) 5902 return(NULL); 5903 if (min < 1) 5904 return(NULL); 5905 if ((max < min) || (max < 1)) 5906 return(NULL); 5907 atom = xmlRegNewAtom(am, XML_REGEXP_STRING); 5908 if (atom == NULL) 5909 return(NULL); 5910 atom->valuep = xmlStrdup(token); 5911 atom->data = data; 5912 atom->quant = XML_REGEXP_QUANT_ONCEONLY; 5913 atom->min = min; 5914 atom->max = max; 5915 /* 5916 * associate a counter to the transition. 5917 */ 5918 counter = xmlRegGetCounter(am); 5919 am->counters[counter].min = 1; 5920 am->counters[counter].max = 1; 5921 5922 /* xmlFAGenerateTransitions(am, from, to, atom); */ 5923 if (to == NULL) { 5924 to = xmlRegNewState(am); 5925 xmlRegStatePush(am, to); 5926 } 5927 xmlRegStateAddTrans(am, from, atom, to, counter, -1); 5928 xmlRegAtomPush(am, atom); 5929 am->state = to; 5930 return(to); 5931} 5932 5933/** 5934 * xmlAutomataNewState: 5935 * @am: an automata 5936 * 5937 * Create a new disconnected state in the automata 5938 * 5939 * Returns the new state or NULL in case of error 5940 */ 5941xmlAutomataStatePtr 5942xmlAutomataNewState(xmlAutomataPtr am) { 5943 xmlAutomataStatePtr to; 5944 5945 if (am == NULL) 5946 return(NULL); 5947 to = xmlRegNewState(am); 5948 xmlRegStatePush(am, to); 5949 return(to); 5950} 5951 5952/** 5953 * xmlAutomataNewEpsilon: 5954 * @am: an automata 5955 * @from: the starting point of the transition 5956 * @to: the target point of the transition or NULL 5957 * 5958 * If @to is NULL, this creates first a new target state in the automata 5959 * and then adds an epsilon transition from the @from state to the 5960 * target state 5961 * 5962 * Returns the target state or NULL in case of error 5963 */ 5964xmlAutomataStatePtr 5965xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from, 5966 xmlAutomataStatePtr to) { 5967 if ((am == NULL) || (from == NULL)) 5968 return(NULL); 5969 xmlFAGenerateEpsilonTransition(am, from, to); 5970 if (to == NULL) 5971 return(am->state); 5972 return(to); 5973} 5974 5975/** 5976 * xmlAutomataNewAllTrans: 5977 * @am: an automata 5978 * @from: the starting point of the transition 5979 * @to: the target point of the transition or NULL 5980 * @lax: allow to transition if not all all transitions have been activated 5981 * 5982 * If @to is NULL, this creates first a new target state in the automata 5983 * and then adds a an ALL transition from the @from state to the 5984 * target state. That transition is an epsilon transition allowed only when 5985 * all transitions from the @from node have been activated. 5986 * 5987 * Returns the target state or NULL in case of error 5988 */ 5989xmlAutomataStatePtr 5990xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 5991 xmlAutomataStatePtr to, int lax) { 5992 if ((am == NULL) || (from == NULL)) 5993 return(NULL); 5994 xmlFAGenerateAllTransition(am, from, to, lax); 5995 if (to == NULL) 5996 return(am->state); 5997 return(to); 5998} 5999 6000/** 6001 * xmlAutomataNewCounter: 6002 * @am: an automata 6003 * @min: the minimal value on the counter 6004 * @max: the maximal value on the counter 6005 * 6006 * Create a new counter 6007 * 6008 * Returns the counter number or -1 in case of error 6009 */ 6010int 6011xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) { 6012 int ret; 6013 6014 if (am == NULL) 6015 return(-1); 6016 6017 ret = xmlRegGetCounter(am); 6018 if (ret < 0) 6019 return(-1); 6020 am->counters[ret].min = min; 6021 am->counters[ret].max = max; 6022 return(ret); 6023} 6024 6025/** 6026 * xmlAutomataNewCountedTrans: 6027 * @am: an automata 6028 * @from: the starting point of the transition 6029 * @to: the target point of the transition or NULL 6030 * @counter: the counter associated to that transition 6031 * 6032 * If @to is NULL, this creates first a new target state in the automata 6033 * and then adds an epsilon transition from the @from state to the target state 6034 * which will increment the counter provided 6035 * 6036 * Returns the target state or NULL in case of error 6037 */ 6038xmlAutomataStatePtr 6039xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6040 xmlAutomataStatePtr to, int counter) { 6041 if ((am == NULL) || (from == NULL) || (counter < 0)) 6042 return(NULL); 6043 xmlFAGenerateCountedEpsilonTransition(am, from, to, counter); 6044 if (to == NULL) 6045 return(am->state); 6046 return(to); 6047} 6048 6049/** 6050 * xmlAutomataNewCounterTrans: 6051 * @am: an automata 6052 * @from: the starting point of the transition 6053 * @to: the target point of the transition or NULL 6054 * @counter: the counter associated to that transition 6055 * 6056 * If @to is NULL, this creates first a new target state in the automata 6057 * and then adds an epsilon transition from the @from state to the target state 6058 * which will be allowed only if the counter is within the right range. 6059 * 6060 * Returns the target state or NULL in case of error 6061 */ 6062xmlAutomataStatePtr 6063xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, 6064 xmlAutomataStatePtr to, int counter) { 6065 if ((am == NULL) || (from == NULL) || (counter < 0)) 6066 return(NULL); 6067 xmlFAGenerateCountedTransition(am, from, to, counter); 6068 if (to == NULL) 6069 return(am->state); 6070 return(to); 6071} 6072 6073/** 6074 * xmlAutomataCompile: 6075 * @am: an automata 6076 * 6077 * Compile the automata into a Reg Exp ready for being executed. 6078 * The automata should be free after this point. 6079 * 6080 * Returns the compiled regexp or NULL in case of error 6081 */ 6082xmlRegexpPtr 6083xmlAutomataCompile(xmlAutomataPtr am) { 6084 xmlRegexpPtr ret; 6085 6086 if ((am == NULL) || (am->error != 0)) return(NULL); 6087 xmlFAEliminateEpsilonTransitions(am); 6088 /* xmlFAComputesDeterminism(am); */ 6089 ret = xmlRegEpxFromParse(am); 6090 6091 return(ret); 6092} 6093 6094/** 6095 * xmlAutomataIsDeterminist: 6096 * @am: an automata 6097 * 6098 * Checks if an automata is determinist. 6099 * 6100 * Returns 1 if true, 0 if not, and -1 in case of error 6101 */ 6102int 6103xmlAutomataIsDeterminist(xmlAutomataPtr am) { 6104 int ret; 6105 6106 if (am == NULL) 6107 return(-1); 6108 6109 ret = xmlFAComputesDeterminism(am); 6110 return(ret); 6111} 6112#endif /* LIBXML_AUTOMATA_ENABLED */ 6113 6114#ifdef LIBXML_EXPR_ENABLED 6115/************************************************************************ 6116 * * 6117 * Formal Expression handling code * 6118 * * 6119 ************************************************************************/ 6120/************************************************************************ 6121 * * 6122 * Expression handling context * 6123 * * 6124 ************************************************************************/ 6125 6126struct _xmlExpCtxt { 6127 xmlDictPtr dict; 6128 xmlExpNodePtr *table; 6129 int size; 6130 int nbElems; 6131 int nb_nodes; 6132 const char *expr; 6133 const char *cur; 6134 int nb_cons; 6135 int tabSize; 6136}; 6137 6138/** 6139 * xmlExpNewCtxt: 6140 * @maxNodes: the maximum number of nodes 6141 * @dict: optional dictionnary to use internally 6142 * 6143 * Creates a new context for manipulating expressions 6144 * 6145 * Returns the context or NULL in case of error 6146 */ 6147xmlExpCtxtPtr 6148xmlExpNewCtxt(int maxNodes, xmlDictPtr dict) { 6149 xmlExpCtxtPtr ret; 6150 int size = 256; 6151 6152 if (maxNodes <= 4096) 6153 maxNodes = 4096; 6154 6155 ret = (xmlExpCtxtPtr) xmlMalloc(sizeof(xmlExpCtxt)); 6156 if (ret == NULL) 6157 return(NULL); 6158 memset(ret, 0, sizeof(xmlExpCtxt)); 6159 ret->size = size; 6160 ret->nbElems = 0; 6161 ret->table = xmlMalloc(size * sizeof(xmlExpNodePtr)); 6162 if (ret->table == NULL) { 6163 xmlFree(ret); 6164 return(NULL); 6165 } 6166 memset(ret->table, 0, size * sizeof(xmlExpNodePtr)); 6167 if (dict == NULL) { 6168 ret->dict = xmlDictCreate(); 6169 if (ret->dict == NULL) { 6170 xmlFree(ret->table); 6171 xmlFree(ret); 6172 return(NULL); 6173 } 6174 } else { 6175 ret->dict = dict; 6176 xmlDictReference(ret->dict); 6177 } 6178 return(ret); 6179} 6180 6181/** 6182 * xmlExpFreeCtxt: 6183 * @ctxt: an expression context 6184 * 6185 * Free an expression context 6186 */ 6187void 6188xmlExpFreeCtxt(xmlExpCtxtPtr ctxt) { 6189 if (ctxt == NULL) 6190 return; 6191 xmlDictFree(ctxt->dict); 6192 if (ctxt->table != NULL) 6193 xmlFree(ctxt->table); 6194 xmlFree(ctxt); 6195} 6196 6197/************************************************************************ 6198 * * 6199 * Structure associated to an expression node * 6200 * * 6201 ************************************************************************/ 6202#define MAX_NODES 10000 6203 6204/* #define DEBUG_DERIV */ 6205 6206/* 6207 * TODO: 6208 * - Wildcards 6209 * - public API for creation 6210 * 6211 * Started 6212 * - regression testing 6213 * 6214 * Done 6215 * - split into module and test tool 6216 * - memleaks 6217 */ 6218 6219typedef enum { 6220 XML_EXP_NILABLE = (1 << 0) 6221} xmlExpNodeInfo; 6222 6223#define IS_NILLABLE(node) ((node)->info & XML_EXP_NILABLE) 6224 6225struct _xmlExpNode { 6226 unsigned char type;/* xmlExpNodeType */ 6227 unsigned char info;/* OR of xmlExpNodeInfo */ 6228 unsigned short key; /* the hash key */ 6229 unsigned int ref; /* The number of references */ 6230 int c_max; /* the maximum length it can consume */ 6231 xmlExpNodePtr exp_left; 6232 xmlExpNodePtr next;/* the next node in the hash table or free list */ 6233 union { 6234 struct { 6235 int f_min; 6236 int f_max; 6237 } count; 6238 struct { 6239 xmlExpNodePtr f_right; 6240 } children; 6241 const xmlChar *f_str; 6242 } field; 6243}; 6244 6245#define exp_min field.count.f_min 6246#define exp_max field.count.f_max 6247/* #define exp_left field.children.f_left */ 6248#define exp_right field.children.f_right 6249#define exp_str field.f_str 6250 6251static xmlExpNodePtr xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type); 6252static xmlExpNode forbiddenExpNode = { 6253 XML_EXP_FORBID, 0, 0, 0, 0, NULL, NULL, {{ 0, 0}} 6254}; 6255xmlExpNodePtr forbiddenExp = &forbiddenExpNode; 6256static xmlExpNode emptyExpNode = { 6257 XML_EXP_EMPTY, 1, 0, 0, 0, NULL, NULL, {{ 0, 0}} 6258}; 6259xmlExpNodePtr emptyExp = &emptyExpNode; 6260 6261/************************************************************************ 6262 * * 6263 * The custom hash table for unicity and canonicalization * 6264 * of sub-expressions pointers * 6265 * * 6266 ************************************************************************/ 6267/* 6268 * xmlExpHashNameComputeKey: 6269 * Calculate the hash key for a token 6270 */ 6271static unsigned short 6272xmlExpHashNameComputeKey(const xmlChar *name) { 6273 unsigned short value = 0L; 6274 char ch; 6275 6276 if (name != NULL) { 6277 value += 30 * (*name); 6278 while ((ch = *name++) != 0) { 6279 value = value ^ ((value << 5) + (value >> 3) + (unsigned long)ch); 6280 } 6281 } 6282 return (value); 6283} 6284 6285/* 6286 * xmlExpHashComputeKey: 6287 * Calculate the hash key for a compound expression 6288 */ 6289static unsigned short 6290xmlExpHashComputeKey(xmlExpNodeType type, xmlExpNodePtr left, 6291 xmlExpNodePtr right) { 6292 unsigned long value; 6293 unsigned short ret; 6294 6295 switch (type) { 6296 case XML_EXP_SEQ: 6297 value = left->key; 6298 value += right->key; 6299 value *= 3; 6300 ret = (unsigned short) value; 6301 break; 6302 case XML_EXP_OR: 6303 value = left->key; 6304 value += right->key; 6305 value *= 7; 6306 ret = (unsigned short) value; 6307 break; 6308 case XML_EXP_COUNT: 6309 value = left->key; 6310 value += right->key; 6311 ret = (unsigned short) value; 6312 break; 6313 default: 6314 ret = 0; 6315 } 6316 return(ret); 6317} 6318 6319 6320static xmlExpNodePtr 6321xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type) { 6322 xmlExpNodePtr ret; 6323 6324 if (ctxt->nb_nodes >= MAX_NODES) 6325 return(NULL); 6326 ret = (xmlExpNodePtr) xmlMalloc(sizeof(xmlExpNode)); 6327 if (ret == NULL) 6328 return(NULL); 6329 memset(ret, 0, sizeof(xmlExpNode)); 6330 ret->type = type; 6331 ret->next = NULL; 6332 ctxt->nb_nodes++; 6333 ctxt->nb_cons++; 6334 return(ret); 6335} 6336 6337/** 6338 * xmlExpHashGetEntry: 6339 * @table: the hash table 6340 * 6341 * Get the unique entry from the hash table. The entry is created if 6342 * needed. @left and @right are consumed, i.e. their ref count will 6343 * be decremented by the operation. 6344 * 6345 * Returns the pointer or NULL in case of error 6346 */ 6347static xmlExpNodePtr 6348xmlExpHashGetEntry(xmlExpCtxtPtr ctxt, xmlExpNodeType type, 6349 xmlExpNodePtr left, xmlExpNodePtr right, 6350 const xmlChar *name, int min, int max) { 6351 unsigned short kbase, key; 6352 xmlExpNodePtr entry; 6353 xmlExpNodePtr insert; 6354 6355 if (ctxt == NULL) 6356 return(NULL); 6357 6358 /* 6359 * Check for duplicate and insertion location. 6360 */ 6361 if (type == XML_EXP_ATOM) { 6362 kbase = xmlExpHashNameComputeKey(name); 6363 } else if (type == XML_EXP_COUNT) { 6364 /* COUNT reduction rule 1 */ 6365 /* a{1} -> a */ 6366 if (min == max) { 6367 if (min == 1) { 6368 return(left); 6369 } 6370 if (min == 0) { 6371 xmlExpFree(ctxt, left); 6372 return(emptyExp); 6373 } 6374 } 6375 if (min < 0) { 6376 xmlExpFree(ctxt, left); 6377 return(forbiddenExp); 6378 } 6379 if (max == -1) 6380 kbase = min + 79; 6381 else 6382 kbase = max - min; 6383 kbase += left->key; 6384 } else if (type == XML_EXP_OR) { 6385 /* Forbid reduction rules */ 6386 if (left->type == XML_EXP_FORBID) { 6387 xmlExpFree(ctxt, left); 6388 return(right); 6389 } 6390 if (right->type == XML_EXP_FORBID) { 6391 xmlExpFree(ctxt, right); 6392 return(left); 6393 } 6394 6395 /* OR reduction rule 1 */ 6396 /* a | a reduced to a */ 6397 if (left == right) { 6398 left->ref--; 6399 return(left); 6400 } 6401 /* OR canonicalization rule 1 */ 6402 /* linearize (a | b) | c into a | (b | c) */ 6403 if ((left->type == XML_EXP_OR) && (right->type != XML_EXP_OR)) { 6404 xmlExpNodePtr tmp = left; 6405 left = right; 6406 right = tmp; 6407 } 6408 /* OR reduction rule 2 */ 6409 /* a | (a | b) and b | (a | b) are reduced to a | b */ 6410 if (right->type == XML_EXP_OR) { 6411 if ((left == right->exp_left) || 6412 (left == right->exp_right)) { 6413 xmlExpFree(ctxt, left); 6414 return(right); 6415 } 6416 } 6417 /* OR canonicalization rule 2 */ 6418 /* linearize (a | b) | c into a | (b | c) */ 6419 if (left->type == XML_EXP_OR) { 6420 xmlExpNodePtr tmp; 6421 6422 /* OR canonicalization rule 2 */ 6423 if ((left->exp_right->type != XML_EXP_OR) && 6424 (left->exp_right->key < left->exp_left->key)) { 6425 tmp = left->exp_right; 6426 left->exp_right = left->exp_left; 6427 left->exp_left = tmp; 6428 } 6429 left->exp_right->ref++; 6430 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_right, right, 6431 NULL, 0, 0); 6432 left->exp_left->ref++; 6433 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_left, tmp, 6434 NULL, 0, 0); 6435 6436 xmlExpFree(ctxt, left); 6437 return(tmp); 6438 } 6439 if (right->type == XML_EXP_OR) { 6440 /* Ordering in the tree */ 6441 /* C | (A | B) -> A | (B | C) */ 6442 if (left->key > right->exp_right->key) { 6443 xmlExpNodePtr tmp; 6444 right->exp_right->ref++; 6445 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_right, 6446 left, NULL, 0, 0); 6447 right->exp_left->ref++; 6448 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left, 6449 tmp, NULL, 0, 0); 6450 xmlExpFree(ctxt, right); 6451 return(tmp); 6452 } 6453 /* Ordering in the tree */ 6454 /* B | (A | C) -> A | (B | C) */ 6455 if (left->key > right->exp_left->key) { 6456 xmlExpNodePtr tmp; 6457 right->exp_right->ref++; 6458 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, 6459 right->exp_right, NULL, 0, 0); 6460 right->exp_left->ref++; 6461 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left, 6462 tmp, NULL, 0, 0); 6463 xmlExpFree(ctxt, right); 6464 return(tmp); 6465 } 6466 } 6467 /* we know both types are != XML_EXP_OR here */ 6468 else if (left->key > right->key) { 6469 xmlExpNodePtr tmp = left; 6470 left = right; 6471 right = tmp; 6472 } 6473 kbase = xmlExpHashComputeKey(type, left, right); 6474 } else if (type == XML_EXP_SEQ) { 6475 /* Forbid reduction rules */ 6476 if (left->type == XML_EXP_FORBID) { 6477 xmlExpFree(ctxt, right); 6478 return(left); 6479 } 6480 if (right->type == XML_EXP_FORBID) { 6481 xmlExpFree(ctxt, left); 6482 return(right); 6483 } 6484 /* Empty reduction rules */ 6485 if (right->type == XML_EXP_EMPTY) { 6486 return(left); 6487 } 6488 if (left->type == XML_EXP_EMPTY) { 6489 return(right); 6490 } 6491 kbase = xmlExpHashComputeKey(type, left, right); 6492 } else 6493 return(NULL); 6494 6495 key = kbase % ctxt->size; 6496 if (ctxt->table[key] != NULL) { 6497 for (insert = ctxt->table[key]; insert != NULL; 6498 insert = insert->next) { 6499 if ((insert->key == kbase) && 6500 (insert->type == type)) { 6501 if (type == XML_EXP_ATOM) { 6502 if (name == insert->exp_str) { 6503 insert->ref++; 6504 return(insert); 6505 } 6506 } else if (type == XML_EXP_COUNT) { 6507 if ((insert->exp_min == min) && (insert->exp_max == max) && 6508 (insert->exp_left == left)) { 6509 insert->ref++; 6510 left->ref--; 6511 return(insert); 6512 } 6513 } else if ((insert->exp_left == left) && 6514 (insert->exp_right == right)) { 6515 insert->ref++; 6516 left->ref--; 6517 right->ref--; 6518 return(insert); 6519 } 6520 } 6521 } 6522 } 6523 6524 entry = xmlExpNewNode(ctxt, type); 6525 if (entry == NULL) 6526 return(NULL); 6527 entry->key = kbase; 6528 if (type == XML_EXP_ATOM) { 6529 entry->exp_str = name; 6530 entry->c_max = 1; 6531 } else if (type == XML_EXP_COUNT) { 6532 entry->exp_min = min; 6533 entry->exp_max = max; 6534 entry->exp_left = left; 6535 if ((min == 0) || (IS_NILLABLE(left))) 6536 entry->info |= XML_EXP_NILABLE; 6537 if (max < 0) 6538 entry->c_max = -1; 6539 else 6540 entry->c_max = max * entry->exp_left->c_max; 6541 } else { 6542 entry->exp_left = left; 6543 entry->exp_right = right; 6544 if (type == XML_EXP_OR) { 6545 if ((IS_NILLABLE(left)) || (IS_NILLABLE(right))) 6546 entry->info |= XML_EXP_NILABLE; 6547 if ((entry->exp_left->c_max == -1) || 6548 (entry->exp_right->c_max == -1)) 6549 entry->c_max = -1; 6550 else if (entry->exp_left->c_max > entry->exp_right->c_max) 6551 entry->c_max = entry->exp_left->c_max; 6552 else 6553 entry->c_max = entry->exp_right->c_max; 6554 } else { 6555 if ((IS_NILLABLE(left)) && (IS_NILLABLE(right))) 6556 entry->info |= XML_EXP_NILABLE; 6557 if ((entry->exp_left->c_max == -1) || 6558 (entry->exp_right->c_max == -1)) 6559 entry->c_max = -1; 6560 else 6561 entry->c_max = entry->exp_left->c_max + entry->exp_right->c_max; 6562 } 6563 } 6564 entry->ref = 1; 6565 if (ctxt->table[key] != NULL) 6566 entry->next = ctxt->table[key]; 6567 6568 ctxt->table[key] = entry; 6569 ctxt->nbElems++; 6570 6571 return(entry); 6572} 6573 6574/** 6575 * xmlExpFree: 6576 * @ctxt: the expression context 6577 * @exp: the expression 6578 * 6579 * Dereference the expression 6580 */ 6581void 6582xmlExpFree(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp) { 6583 if ((exp == NULL) || (exp == forbiddenExp) || (exp == emptyExp)) 6584 return; 6585 exp->ref--; 6586 if (exp->ref == 0) { 6587 unsigned short key; 6588 6589 /* Unlink it first from the hash table */ 6590 key = exp->key % ctxt->size; 6591 if (ctxt->table[key] == exp) { 6592 ctxt->table[key] = exp->next; 6593 } else { 6594 xmlExpNodePtr tmp; 6595 6596 tmp = ctxt->table[key]; 6597 while (tmp != NULL) { 6598 if (tmp->next == exp) { 6599 tmp->next = exp->next; 6600 break; 6601 } 6602 tmp = tmp->next; 6603 } 6604 } 6605 6606 if ((exp->type == XML_EXP_SEQ) || (exp->type == XML_EXP_OR)) { 6607 xmlExpFree(ctxt, exp->exp_left); 6608 xmlExpFree(ctxt, exp->exp_right); 6609 } else if (exp->type == XML_EXP_COUNT) { 6610 xmlExpFree(ctxt, exp->exp_left); 6611 } 6612 xmlFree(exp); 6613 ctxt->nb_nodes--; 6614 } 6615} 6616 6617/** 6618 * xmlExpRef: 6619 * @exp: the expression 6620 * 6621 * Increase the reference count of the expression 6622 */ 6623void 6624xmlExpRef(xmlExpNodePtr exp) { 6625 if (exp != NULL) 6626 exp->ref++; 6627} 6628 6629/** 6630 * xmlExpNewAtom: 6631 * @ctxt: the expression context 6632 * @name: the atom name 6633 * @len: the atom name lenght in byte (or -1); 6634 * 6635 * Get the atom associated to this name from that context 6636 * 6637 * Returns the node or NULL in case of error 6638 */ 6639xmlExpNodePtr 6640xmlExpNewAtom(xmlExpCtxtPtr ctxt, const xmlChar *name, int len) { 6641 if ((ctxt == NULL) || (name == NULL)) 6642 return(NULL); 6643 name = xmlDictLookup(ctxt->dict, name, len); 6644 if (name == NULL) 6645 return(NULL); 6646 return(xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, name, 0, 0)); 6647} 6648 6649/** 6650 * xmlExpNewOr: 6651 * @ctxt: the expression context 6652 * @left: left expression 6653 * @right: right expression 6654 * 6655 * Get the atom associated to the choice @left | @right 6656 * Note that @left and @right are consumed in the operation, to keep 6657 * an handle on them use xmlExpRef() and use xmlExpFree() to release them, 6658 * this is true even in case of failure (unless ctxt == NULL). 6659 * 6660 * Returns the node or NULL in case of error 6661 */ 6662xmlExpNodePtr 6663xmlExpNewOr(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) { 6664 if (ctxt == NULL) 6665 return(NULL); 6666 if ((left == NULL) || (right == NULL)) { 6667 xmlExpFree(ctxt, left); 6668 xmlExpFree(ctxt, right); 6669 return(NULL); 6670 } 6671 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, right, NULL, 0, 0)); 6672} 6673 6674/** 6675 * xmlExpNewSeq: 6676 * @ctxt: the expression context 6677 * @left: left expression 6678 * @right: right expression 6679 * 6680 * Get the atom associated to the sequence @left , @right 6681 * Note that @left and @right are consumed in the operation, to keep 6682 * an handle on them use xmlExpRef() and use xmlExpFree() to release them, 6683 * this is true even in case of failure (unless ctxt == NULL). 6684 * 6685 * Returns the node or NULL in case of error 6686 */ 6687xmlExpNodePtr 6688xmlExpNewSeq(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) { 6689 if (ctxt == NULL) 6690 return(NULL); 6691 if ((left == NULL) || (right == NULL)) { 6692 xmlExpFree(ctxt, left); 6693 xmlExpFree(ctxt, right); 6694 return(NULL); 6695 } 6696 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, left, right, NULL, 0, 0)); 6697} 6698 6699/** 6700 * xmlExpNewRange: 6701 * @ctxt: the expression context 6702 * @subset: the expression to be repeated 6703 * @min: the lower bound for the repetition 6704 * @max: the upper bound for the repetition, -1 means infinite 6705 * 6706 * Get the atom associated to the range (@subset){@min, @max} 6707 * Note that @subset is consumed in the operation, to keep 6708 * an handle on it use xmlExpRef() and use xmlExpFree() to release it, 6709 * this is true even in case of failure (unless ctxt == NULL). 6710 * 6711 * Returns the node or NULL in case of error 6712 */ 6713xmlExpNodePtr 6714xmlExpNewRange(xmlExpCtxtPtr ctxt, xmlExpNodePtr subset, int min, int max) { 6715 if (ctxt == NULL) 6716 return(NULL); 6717 if ((subset == NULL) || (min < 0) || (max < -1) || 6718 ((max >= 0) && (min > max))) { 6719 xmlExpFree(ctxt, subset); 6720 return(NULL); 6721 } 6722 return(xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, subset, 6723 NULL, NULL, min, max)); 6724} 6725 6726/************************************************************************ 6727 * * 6728 * Public API for operations on expressions * 6729 * * 6730 ************************************************************************/ 6731 6732static int 6733xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 6734 const xmlChar**list, int len, int nb) { 6735 int tmp, tmp2; 6736tail: 6737 switch (exp->type) { 6738 case XML_EXP_EMPTY: 6739 return(0); 6740 case XML_EXP_ATOM: 6741 for (tmp = 0;tmp < nb;tmp++) 6742 if (list[tmp] == exp->exp_str) 6743 return(0); 6744 if (nb >= len) 6745 return(-2); 6746 list[nb++] = exp->exp_str; 6747 return(1); 6748 case XML_EXP_COUNT: 6749 exp = exp->exp_left; 6750 goto tail; 6751 case XML_EXP_SEQ: 6752 case XML_EXP_OR: 6753 tmp = xmlExpGetLanguageInt(ctxt, exp->exp_left, list, len, nb); 6754 if (tmp < 0) 6755 return(tmp); 6756 tmp2 = xmlExpGetLanguageInt(ctxt, exp->exp_right, list, len, 6757 nb + tmp); 6758 if (tmp2 < 0) 6759 return(tmp2); 6760 return(tmp + tmp2); 6761 } 6762 return(-1); 6763} 6764 6765/** 6766 * xmlExpGetLanguage: 6767 * @ctxt: the expression context 6768 * @exp: the expression 6769 * @langList: where to store the tokens 6770 * @len: the allocated lenght of @list 6771 * 6772 * Find all the strings used in @exp and store them in @list 6773 * 6774 * Returns the number of unique strings found, -1 in case of errors and 6775 * -2 if there is more than @len strings 6776 */ 6777int 6778xmlExpGetLanguage(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 6779 const xmlChar**langList, int len) { 6780 if ((ctxt == NULL) || (exp == NULL) || (langList == NULL) || (len <= 0)) 6781 return(-1); 6782 return(xmlExpGetLanguageInt(ctxt, exp, langList, len, 0)); 6783} 6784 6785static int 6786xmlExpGetStartInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 6787 const xmlChar**list, int len, int nb) { 6788 int tmp, tmp2; 6789tail: 6790 switch (exp->type) { 6791 case XML_EXP_FORBID: 6792 return(0); 6793 case XML_EXP_EMPTY: 6794 return(0); 6795 case XML_EXP_ATOM: 6796 for (tmp = 0;tmp < nb;tmp++) 6797 if (list[tmp] == exp->exp_str) 6798 return(0); 6799 if (nb >= len) 6800 return(-2); 6801 list[nb++] = exp->exp_str; 6802 return(1); 6803 case XML_EXP_COUNT: 6804 exp = exp->exp_left; 6805 goto tail; 6806 case XML_EXP_SEQ: 6807 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb); 6808 if (tmp < 0) 6809 return(tmp); 6810 if (IS_NILLABLE(exp->exp_left)) { 6811 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len, 6812 nb + tmp); 6813 if (tmp2 < 0) 6814 return(tmp2); 6815 tmp += tmp2; 6816 } 6817 return(tmp); 6818 case XML_EXP_OR: 6819 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb); 6820 if (tmp < 0) 6821 return(tmp); 6822 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len, 6823 nb + tmp); 6824 if (tmp2 < 0) 6825 return(tmp2); 6826 return(tmp + tmp2); 6827 } 6828 return(-1); 6829} 6830 6831/** 6832 * xmlExpGetStart: 6833 * @ctxt: the expression context 6834 * @exp: the expression 6835 * @tokList: where to store the tokens 6836 * @len: the allocated lenght of @list 6837 * 6838 * Find all the strings that appears at the start of the languages 6839 * accepted by @exp and store them in @list. E.g. for (a, b) | c 6840 * it will return the list [a, c] 6841 * 6842 * Returns the number of unique strings found, -1 in case of errors and 6843 * -2 if there is more than @len strings 6844 */ 6845int 6846xmlExpGetStart(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 6847 const xmlChar**tokList, int len) { 6848 if ((ctxt == NULL) || (exp == NULL) || (tokList == NULL) || (len <= 0)) 6849 return(-1); 6850 return(xmlExpGetStartInt(ctxt, exp, tokList, len, 0)); 6851} 6852 6853/** 6854 * xmlExpIsNillable: 6855 * @exp: the expression 6856 * 6857 * Finds if the expression is nillable, i.e. if it accepts the empty sequqnce 6858 * 6859 * Returns 1 if nillable, 0 if not and -1 in case of error 6860 */ 6861int 6862xmlExpIsNillable(xmlExpNodePtr exp) { 6863 if (exp == NULL) 6864 return(-1); 6865 return(IS_NILLABLE(exp) != 0); 6866} 6867 6868static xmlExpNodePtr 6869xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, const xmlChar *str) 6870{ 6871 xmlExpNodePtr ret; 6872 6873 switch (exp->type) { 6874 case XML_EXP_EMPTY: 6875 return(forbiddenExp); 6876 case XML_EXP_FORBID: 6877 return(forbiddenExp); 6878 case XML_EXP_ATOM: 6879 if (exp->exp_str == str) { 6880#ifdef DEBUG_DERIV 6881 printf("deriv atom: equal => Empty\n"); 6882#endif 6883 ret = emptyExp; 6884 } else { 6885#ifdef DEBUG_DERIV 6886 printf("deriv atom: mismatch => forbid\n"); 6887#endif 6888 /* TODO wildcards here */ 6889 ret = forbiddenExp; 6890 } 6891 return(ret); 6892 case XML_EXP_OR: { 6893 xmlExpNodePtr tmp; 6894 6895#ifdef DEBUG_DERIV 6896 printf("deriv or: => or(derivs)\n"); 6897#endif 6898 tmp = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); 6899 if (tmp == NULL) { 6900 return(NULL); 6901 } 6902 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str); 6903 if (ret == NULL) { 6904 xmlExpFree(ctxt, tmp); 6905 return(NULL); 6906 } 6907 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, 6908 NULL, 0, 0); 6909 return(ret); 6910 } 6911 case XML_EXP_SEQ: 6912#ifdef DEBUG_DERIV 6913 printf("deriv seq: starting with left\n"); 6914#endif 6915 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); 6916 if (ret == NULL) { 6917 return(NULL); 6918 } else if (ret == forbiddenExp) { 6919 if (IS_NILLABLE(exp->exp_left)) { 6920#ifdef DEBUG_DERIV 6921 printf("deriv seq: left failed but nillable\n"); 6922#endif 6923 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str); 6924 } 6925 } else { 6926#ifdef DEBUG_DERIV 6927 printf("deriv seq: left match => sequence\n"); 6928#endif 6929 exp->exp_right->ref++; 6930 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, exp->exp_right, 6931 NULL, 0, 0); 6932 } 6933 return(ret); 6934 case XML_EXP_COUNT: { 6935 int min, max; 6936 xmlExpNodePtr tmp; 6937 6938 if (exp->exp_max == 0) 6939 return(forbiddenExp); 6940 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str); 6941 if (ret == NULL) 6942 return(NULL); 6943 if (ret == forbiddenExp) { 6944#ifdef DEBUG_DERIV 6945 printf("deriv count: pattern mismatch => forbid\n"); 6946#endif 6947 return(ret); 6948 } 6949 if (exp->exp_max == 1) 6950 return(ret); 6951 if (exp->exp_max < 0) /* unbounded */ 6952 max = -1; 6953 else 6954 max = exp->exp_max - 1; 6955 if (exp->exp_min > 0) 6956 min = exp->exp_min - 1; 6957 else 6958 min = 0; 6959 exp->exp_left->ref++; 6960 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, NULL, 6961 NULL, min, max); 6962 if (ret == emptyExp) { 6963#ifdef DEBUG_DERIV 6964 printf("deriv count: match to empty => new count\n"); 6965#endif 6966 return(tmp); 6967 } 6968#ifdef DEBUG_DERIV 6969 printf("deriv count: match => sequence with new count\n"); 6970#endif 6971 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, tmp, 6972 NULL, 0, 0)); 6973 } 6974 } 6975 return(NULL); 6976} 6977 6978/** 6979 * xmlExpStringDerive: 6980 * @ctxt: the expression context 6981 * @exp: the expression 6982 * @str: the string 6983 * @len: the string len in bytes if available 6984 * 6985 * Do one step of Brzozowski derivation of the expression @exp with 6986 * respect to the input string 6987 * 6988 * Returns the resulting expression or NULL in case of internal error 6989 */ 6990xmlExpNodePtr 6991xmlExpStringDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 6992 const xmlChar *str, int len) { 6993 const xmlChar *input; 6994 6995 if ((exp == NULL) || (ctxt == NULL) || (str == NULL)) { 6996 return(NULL); 6997 } 6998 /* 6999 * check the string is in the dictionnary, if yes use an interned 7000 * copy, otherwise we know it's not an acceptable input 7001 */ 7002 input = xmlDictExists(ctxt->dict, str, len); 7003 if (input == NULL) { 7004 return(forbiddenExp); 7005 } 7006 return(xmlExpStringDeriveInt(ctxt, exp, input)); 7007} 7008 7009static int 7010xmlExpCheckCard(xmlExpNodePtr exp, xmlExpNodePtr sub) { 7011 int ret = 1; 7012 7013 if (sub->c_max == -1) { 7014 if (exp->c_max != -1) 7015 ret = 0; 7016 } else if ((exp->c_max >= 0) && (exp->c_max < sub->c_max)) { 7017 ret = 0; 7018 } 7019#if 0 7020 if ((IS_NILLABLE(sub)) && (!IS_NILLABLE(exp))) 7021 ret = 0; 7022#endif 7023 return(ret); 7024} 7025 7026static xmlExpNodePtr xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, 7027 xmlExpNodePtr sub); 7028/** 7029 * xmlExpDivide: 7030 * @ctxt: the expressions context 7031 * @exp: the englobing expression 7032 * @sub: the subexpression 7033 * @mult: the multiple expression 7034 * @remain: the remain from the derivation of the multiple 7035 * 7036 * Check if exp is a multiple of sub, i.e. if there is a finite number n 7037 * so that sub{n} subsume exp 7038 * 7039 * Returns the multiple value if successful, 0 if it is not a multiple 7040 * and -1 in case of internel error. 7041 */ 7042 7043static int 7044xmlExpDivide(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub, 7045 xmlExpNodePtr *mult, xmlExpNodePtr *remain) { 7046 int i; 7047 xmlExpNodePtr tmp, tmp2; 7048 7049 if (mult != NULL) *mult = NULL; 7050 if (remain != NULL) *remain = NULL; 7051 if (exp->c_max == -1) return(0); 7052 if (IS_NILLABLE(exp) && (!IS_NILLABLE(sub))) return(0); 7053 7054 for (i = 1;i <= exp->c_max;i++) { 7055 sub->ref++; 7056 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, 7057 sub, NULL, NULL, i, i); 7058 if (tmp == NULL) { 7059 return(-1); 7060 } 7061 if (!xmlExpCheckCard(tmp, exp)) { 7062 xmlExpFree(ctxt, tmp); 7063 continue; 7064 } 7065 tmp2 = xmlExpExpDeriveInt(ctxt, tmp, exp); 7066 if (tmp2 == NULL) { 7067 xmlExpFree(ctxt, tmp); 7068 return(-1); 7069 } 7070 if ((tmp2 != forbiddenExp) && (IS_NILLABLE(tmp2))) { 7071 if (remain != NULL) 7072 *remain = tmp2; 7073 else 7074 xmlExpFree(ctxt, tmp2); 7075 if (mult != NULL) 7076 *mult = tmp; 7077 else 7078 xmlExpFree(ctxt, tmp); 7079#ifdef DEBUG_DERIV 7080 printf("Divide succeeded %d\n", i); 7081#endif 7082 return(i); 7083 } 7084 xmlExpFree(ctxt, tmp); 7085 xmlExpFree(ctxt, tmp2); 7086 } 7087#ifdef DEBUG_DERIV 7088 printf("Divide failed\n"); 7089#endif 7090 return(0); 7091} 7092 7093/** 7094 * xmlExpExpDeriveInt: 7095 * @ctxt: the expressions context 7096 * @exp: the englobing expression 7097 * @sub: the subexpression 7098 * 7099 * Try to do a step of Brzozowski derivation but at a higher level 7100 * the input being a subexpression. 7101 * 7102 * Returns the resulting expression or NULL in case of internal error 7103 */ 7104static xmlExpNodePtr 7105xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { 7106 xmlExpNodePtr ret, tmp, tmp2, tmp3; 7107 const xmlChar **tab; 7108 int len, i; 7109 7110 /* 7111 * In case of equality and if the expression can only consume a finite 7112 * amount, then the derivation is empty 7113 */ 7114 if ((exp == sub) && (exp->c_max >= 0)) { 7115#ifdef DEBUG_DERIV 7116 printf("Equal(exp, sub) and finite -> Empty\n"); 7117#endif 7118 return(emptyExp); 7119 } 7120 /* 7121 * decompose sub sequence first 7122 */ 7123 if (sub->type == XML_EXP_EMPTY) { 7124#ifdef DEBUG_DERIV 7125 printf("Empty(sub) -> Empty\n"); 7126#endif 7127 exp->ref++; 7128 return(exp); 7129 } 7130 if (sub->type == XML_EXP_SEQ) { 7131#ifdef DEBUG_DERIV 7132 printf("Seq(sub) -> decompose\n"); 7133#endif 7134 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left); 7135 if (tmp == NULL) 7136 return(NULL); 7137 if (tmp == forbiddenExp) 7138 return(tmp); 7139 ret = xmlExpExpDeriveInt(ctxt, tmp, sub->exp_right); 7140 xmlExpFree(ctxt, tmp); 7141 return(ret); 7142 } 7143 if (sub->type == XML_EXP_OR) { 7144#ifdef DEBUG_DERIV 7145 printf("Or(sub) -> decompose\n"); 7146#endif 7147 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left); 7148 if (tmp == forbiddenExp) 7149 return(tmp); 7150 if (tmp == NULL) 7151 return(NULL); 7152 ret = xmlExpExpDeriveInt(ctxt, exp, sub->exp_right); 7153 if ((ret == NULL) || (ret == forbiddenExp)) { 7154 xmlExpFree(ctxt, tmp); 7155 return(ret); 7156 } 7157 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, NULL, 0, 0)); 7158 } 7159 if (!xmlExpCheckCard(exp, sub)) { 7160#ifdef DEBUG_DERIV 7161 printf("CheckCard(exp, sub) failed -> Forbid\n"); 7162#endif 7163 return(forbiddenExp); 7164 } 7165 switch (exp->type) { 7166 case XML_EXP_EMPTY: 7167 if (sub == emptyExp) 7168 return(emptyExp); 7169#ifdef DEBUG_DERIV 7170 printf("Empty(exp) -> Forbid\n"); 7171#endif 7172 return(forbiddenExp); 7173 case XML_EXP_FORBID: 7174#ifdef DEBUG_DERIV 7175 printf("Forbid(exp) -> Forbid\n"); 7176#endif 7177 return(forbiddenExp); 7178 case XML_EXP_ATOM: 7179 if (sub->type == XML_EXP_ATOM) { 7180 /* TODO: handle wildcards */ 7181 if (exp->exp_str == sub->exp_str) { 7182#ifdef DEBUG_DERIV 7183 printf("Atom match -> Empty\n"); 7184#endif 7185 return(emptyExp); 7186 } 7187#ifdef DEBUG_DERIV 7188 printf("Atom mismatch -> Forbid\n"); 7189#endif 7190 return(forbiddenExp); 7191 } 7192 if ((sub->type == XML_EXP_COUNT) && 7193 (sub->exp_max == 1) && 7194 (sub->exp_left->type == XML_EXP_ATOM)) { 7195 /* TODO: handle wildcards */ 7196 if (exp->exp_str == sub->exp_left->exp_str) { 7197#ifdef DEBUG_DERIV 7198 printf("Atom match -> Empty\n"); 7199#endif 7200 return(emptyExp); 7201 } 7202#ifdef DEBUG_DERIV 7203 printf("Atom mismatch -> Forbid\n"); 7204#endif 7205 return(forbiddenExp); 7206 } 7207#ifdef DEBUG_DERIV 7208 printf("Compex exp vs Atom -> Forbid\n"); 7209#endif 7210 return(forbiddenExp); 7211 case XML_EXP_SEQ: 7212 /* try to get the sequence consumed only if possible */ 7213 if (xmlExpCheckCard(exp->exp_left, sub)) { 7214 /* See if the sequence can be consumed directly */ 7215#ifdef DEBUG_DERIV 7216 printf("Seq trying left only\n"); 7217#endif 7218 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); 7219 if ((ret != forbiddenExp) && (ret != NULL)) { 7220#ifdef DEBUG_DERIV 7221 printf("Seq trying left only worked\n"); 7222#endif 7223 /* 7224 * TODO: assumption here that we are determinist 7225 * i.e. we won't get to a nillable exp left 7226 * subset which could be matched by the right 7227 * part too. 7228 * e.g.: (a | b)+,(a | c) and 'a+,a' 7229 */ 7230 exp->exp_right->ref++; 7231 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, 7232 exp->exp_right, NULL, 0, 0)); 7233 } 7234#ifdef DEBUG_DERIV 7235 } else { 7236 printf("Seq: left too short\n"); 7237#endif 7238 } 7239 /* Try instead to decompose */ 7240 if (sub->type == XML_EXP_COUNT) { 7241 int min, max; 7242 7243#ifdef DEBUG_DERIV 7244 printf("Seq: sub is a count\n"); 7245#endif 7246 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left); 7247 if (ret == NULL) 7248 return(NULL); 7249 if (ret != forbiddenExp) { 7250#ifdef DEBUG_DERIV 7251 printf("Seq , Count match on left\n"); 7252#endif 7253 if (sub->exp_max < 0) 7254 max = -1; 7255 else 7256 max = sub->exp_max -1; 7257 if (sub->exp_min > 0) 7258 min = sub->exp_min -1; 7259 else 7260 min = 0; 7261 exp->exp_right->ref++; 7262 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, 7263 exp->exp_right, NULL, 0, 0); 7264 if (tmp == NULL) 7265 return(NULL); 7266 7267 sub->exp_left->ref++; 7268 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, 7269 sub->exp_left, NULL, NULL, min, max); 7270 if (tmp2 == NULL) { 7271 xmlExpFree(ctxt, tmp); 7272 return(NULL); 7273 } 7274 ret = xmlExpExpDeriveInt(ctxt, tmp, tmp2); 7275 xmlExpFree(ctxt, tmp); 7276 xmlExpFree(ctxt, tmp2); 7277 return(ret); 7278 } 7279 } 7280 /* we made no progress on structured operations */ 7281 break; 7282 case XML_EXP_OR: 7283#ifdef DEBUG_DERIV 7284 printf("Or , trying both side\n"); 7285#endif 7286 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); 7287 if (ret == NULL) 7288 return(NULL); 7289 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_right, sub); 7290 if (tmp == NULL) { 7291 xmlExpFree(ctxt, ret); 7292 return(NULL); 7293 } 7294 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp, NULL, 0, 0)); 7295 case XML_EXP_COUNT: { 7296 int min, max; 7297 7298 if (sub->type == XML_EXP_COUNT) { 7299 /* 7300 * Try to see if the loop is completely subsumed 7301 */ 7302 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left); 7303 if (tmp == NULL) 7304 return(NULL); 7305 if (tmp == forbiddenExp) { 7306 int mult; 7307 7308#ifdef DEBUG_DERIV 7309 printf("Count, Count inner don't subsume\n"); 7310#endif 7311 mult = xmlExpDivide(ctxt, sub->exp_left, exp->exp_left, 7312 NULL, &tmp); 7313 if (mult <= 0) { 7314#ifdef DEBUG_DERIV 7315 printf("Count, Count not multiple => forbidden\n"); 7316#endif 7317 return(forbiddenExp); 7318 } 7319 if (sub->exp_max == -1) { 7320 max = -1; 7321 if (exp->exp_max == -1) { 7322 if (exp->exp_min <= sub->exp_min * mult) 7323 min = 0; 7324 else 7325 min = exp->exp_min - sub->exp_min * mult; 7326 } else { 7327#ifdef DEBUG_DERIV 7328 printf("Count, Count finite can't subsume infinite\n"); 7329#endif 7330 xmlExpFree(ctxt, tmp); 7331 return(forbiddenExp); 7332 } 7333 } else { 7334 if (exp->exp_max == -1) { 7335#ifdef DEBUG_DERIV 7336 printf("Infinite loop consume mult finite loop\n"); 7337#endif 7338 if (exp->exp_min > sub->exp_min * mult) { 7339 max = -1; 7340 min = exp->exp_min - sub->exp_min * mult; 7341 } else { 7342 max = -1; 7343 min = 0; 7344 } 7345 } else { 7346 if (exp->exp_max < sub->exp_max * mult) { 7347#ifdef DEBUG_DERIV 7348 printf("loops max mult mismatch => forbidden\n"); 7349#endif 7350 xmlExpFree(ctxt, tmp); 7351 return(forbiddenExp); 7352 } 7353 if (sub->exp_max * mult > exp->exp_min) 7354 min = 0; 7355 else 7356 min = exp->exp_min - sub->exp_max * mult; 7357 max = exp->exp_max - sub->exp_max * mult; 7358 } 7359 } 7360 } else if (!IS_NILLABLE(tmp)) { 7361 /* 7362 * TODO: loop here to try to grow if working on finite 7363 * blocks. 7364 */ 7365#ifdef DEBUG_DERIV 7366 printf("Count, Count remain not nillable => forbidden\n"); 7367#endif 7368 xmlExpFree(ctxt, tmp); 7369 return(forbiddenExp); 7370 } else if (sub->exp_max == -1) { 7371 if (exp->exp_max == -1) { 7372 if (exp->exp_min <= sub->exp_min) { 7373#ifdef DEBUG_DERIV 7374 printf("Infinite loops Okay => COUNT(0,Inf)\n"); 7375#endif 7376 max = -1; 7377 min = 0; 7378 } else { 7379#ifdef DEBUG_DERIV 7380 printf("Infinite loops min => Count(X,Inf)\n"); 7381#endif 7382 max = -1; 7383 min = exp->exp_min - sub->exp_min; 7384 } 7385 } else if (exp->exp_min > sub->exp_min) { 7386#ifdef DEBUG_DERIV 7387 printf("loops min mismatch 1 => forbidden ???\n"); 7388#endif 7389 xmlExpFree(ctxt, tmp); 7390 return(forbiddenExp); 7391 } else { 7392 max = -1; 7393 min = 0; 7394 } 7395 } else { 7396 if (exp->exp_max == -1) { 7397#ifdef DEBUG_DERIV 7398 printf("Infinite loop consume finite loop\n"); 7399#endif 7400 if (exp->exp_min > sub->exp_min) { 7401 max = -1; 7402 min = exp->exp_min - sub->exp_min; 7403 } else { 7404 max = -1; 7405 min = 0; 7406 } 7407 } else { 7408 if (exp->exp_max < sub->exp_max) { 7409#ifdef DEBUG_DERIV 7410 printf("loops max mismatch => forbidden\n"); 7411#endif 7412 xmlExpFree(ctxt, tmp); 7413 return(forbiddenExp); 7414 } 7415 if (sub->exp_max > exp->exp_min) 7416 min = 0; 7417 else 7418 min = exp->exp_min - sub->exp_max; 7419 max = exp->exp_max - sub->exp_max; 7420 } 7421 } 7422#ifdef DEBUG_DERIV 7423 printf("loops match => SEQ(COUNT())\n"); 7424#endif 7425 exp->exp_left->ref++; 7426 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, 7427 NULL, NULL, min, max); 7428 if (tmp2 == NULL) { 7429 return(NULL); 7430 } 7431 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2, 7432 NULL, 0, 0); 7433 return(ret); 7434 } 7435 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub); 7436 if (tmp == NULL) 7437 return(NULL); 7438 if (tmp == forbiddenExp) { 7439#ifdef DEBUG_DERIV 7440 printf("loop mismatch => forbidden\n"); 7441#endif 7442 return(forbiddenExp); 7443 } 7444 if (exp->exp_min > 0) 7445 min = exp->exp_min - 1; 7446 else 7447 min = 0; 7448 if (exp->exp_max < 0) 7449 max = -1; 7450 else 7451 max = exp->exp_max - 1; 7452 7453#ifdef DEBUG_DERIV 7454 printf("loop match => SEQ(COUNT())\n"); 7455#endif 7456 exp->exp_left->ref++; 7457 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, 7458 NULL, NULL, min, max); 7459 if (tmp2 == NULL) 7460 return(NULL); 7461 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2, 7462 NULL, 0, 0); 7463 return(ret); 7464 } 7465 } 7466 7467#ifdef DEBUG_DERIV 7468 printf("Fallback to derivative\n"); 7469#endif 7470 if (IS_NILLABLE(sub)) { 7471 if (!(IS_NILLABLE(exp))) 7472 return(forbiddenExp); 7473 else 7474 ret = emptyExp; 7475 } else 7476 ret = NULL; 7477 /* 7478 * here the structured derivation made no progress so 7479 * we use the default token based derivation to force one more step 7480 */ 7481 if (ctxt->tabSize == 0) 7482 ctxt->tabSize = 40; 7483 7484 tab = (const xmlChar **) xmlMalloc(ctxt->tabSize * 7485 sizeof(const xmlChar *)); 7486 if (tab == NULL) { 7487 return(NULL); 7488 } 7489 7490 /* 7491 * collect all the strings accepted by the subexpression on input 7492 */ 7493 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0); 7494 while (len < 0) { 7495 const xmlChar **temp; 7496 temp = (const xmlChar **) xmlRealloc((xmlChar **) tab, ctxt->tabSize * 2 * 7497 sizeof(const xmlChar *)); 7498 if (temp == NULL) { 7499 xmlFree((xmlChar **) tab); 7500 return(NULL); 7501 } 7502 tab = temp; 7503 ctxt->tabSize *= 2; 7504 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0); 7505 } 7506 for (i = 0;i < len;i++) { 7507 tmp = xmlExpStringDeriveInt(ctxt, exp, tab[i]); 7508 if ((tmp == NULL) || (tmp == forbiddenExp)) { 7509 xmlExpFree(ctxt, ret); 7510 xmlFree((xmlChar **) tab); 7511 return(tmp); 7512 } 7513 tmp2 = xmlExpStringDeriveInt(ctxt, sub, tab[i]); 7514 if ((tmp2 == NULL) || (tmp2 == forbiddenExp)) { 7515 xmlExpFree(ctxt, tmp); 7516 xmlExpFree(ctxt, ret); 7517 xmlFree((xmlChar **) tab); 7518 return(tmp); 7519 } 7520 tmp3 = xmlExpExpDeriveInt(ctxt, tmp, tmp2); 7521 xmlExpFree(ctxt, tmp); 7522 xmlExpFree(ctxt, tmp2); 7523 7524 if ((tmp3 == NULL) || (tmp3 == forbiddenExp)) { 7525 xmlExpFree(ctxt, ret); 7526 xmlFree((xmlChar **) tab); 7527 return(tmp3); 7528 } 7529 7530 if (ret == NULL) 7531 ret = tmp3; 7532 else { 7533 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp3, NULL, 0, 0); 7534 if (ret == NULL) { 7535 xmlFree((xmlChar **) tab); 7536 return(NULL); 7537 } 7538 } 7539 } 7540 xmlFree((xmlChar **) tab); 7541 return(ret); 7542} 7543 7544/** 7545 * xmlExpExpDerive: 7546 * @ctxt: the expressions context 7547 * @exp: the englobing expression 7548 * @sub: the subexpression 7549 * 7550 * Evaluates the expression resulting from @exp consuming a sub expression @sub 7551 * Based on algebraic derivation and sometimes direct Brzozowski derivation 7552 * it usually tatkes less than linear time and can handle expressions generating 7553 * infinite languages. 7554 * 7555 * Returns the resulting expression or NULL in case of internal error, the 7556 * result must be freed 7557 */ 7558xmlExpNodePtr 7559xmlExpExpDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { 7560 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL)) 7561 return(NULL); 7562 7563 /* 7564 * O(1) speedups 7565 */ 7566 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) { 7567#ifdef DEBUG_DERIV 7568 printf("Sub nillable and not exp : can't subsume\n"); 7569#endif 7570 return(forbiddenExp); 7571 } 7572 if (xmlExpCheckCard(exp, sub) == 0) { 7573#ifdef DEBUG_DERIV 7574 printf("sub generate longuer sequances than exp : can't subsume\n"); 7575#endif 7576 return(forbiddenExp); 7577 } 7578 return(xmlExpExpDeriveInt(ctxt, exp, sub)); 7579} 7580 7581/** 7582 * xmlExpSubsume: 7583 * @ctxt: the expressions context 7584 * @exp: the englobing expression 7585 * @sub: the subexpression 7586 * 7587 * Check whether @exp accepts all the languages accexpted by @sub 7588 * the input being a subexpression. 7589 * 7590 * Returns 1 if true 0 if false and -1 in case of failure. 7591 */ 7592int 7593xmlExpSubsume(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) { 7594 xmlExpNodePtr tmp; 7595 7596 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL)) 7597 return(-1); 7598 7599 /* 7600 * TODO: speedup by checking the language of sub is a subset of the 7601 * language of exp 7602 */ 7603 /* 7604 * O(1) speedups 7605 */ 7606 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) { 7607#ifdef DEBUG_DERIV 7608 printf("Sub nillable and not exp : can't subsume\n"); 7609#endif 7610 return(0); 7611 } 7612 if (xmlExpCheckCard(exp, sub) == 0) { 7613#ifdef DEBUG_DERIV 7614 printf("sub generate longuer sequances than exp : can't subsume\n"); 7615#endif 7616 return(0); 7617 } 7618 tmp = xmlExpExpDeriveInt(ctxt, exp, sub); 7619#ifdef DEBUG_DERIV 7620 printf("Result derivation :\n"); 7621 PRINT_EXP(tmp); 7622#endif 7623 if (tmp == NULL) 7624 return(-1); 7625 if (tmp == forbiddenExp) 7626 return(0); 7627 if (tmp == emptyExp) 7628 return(1); 7629 if ((tmp != NULL) && (IS_NILLABLE(tmp))) { 7630 xmlExpFree(ctxt, tmp); 7631 return(1); 7632 } 7633 xmlExpFree(ctxt, tmp); 7634 return(0); 7635} 7636 7637/************************************************************************ 7638 * * 7639 * Parsing expression * 7640 * * 7641 ************************************************************************/ 7642 7643static xmlExpNodePtr xmlExpParseExpr(xmlExpCtxtPtr ctxt); 7644 7645#undef CUR 7646#define CUR (*ctxt->cur) 7647#undef NEXT 7648#define NEXT ctxt->cur++; 7649#undef IS_BLANK 7650#define IS_BLANK(c) ((c == ' ') || (c == '\n') || (c == '\r') || (c == '\t')) 7651#define SKIP_BLANKS while (IS_BLANK(*ctxt->cur)) ctxt->cur++; 7652 7653static int 7654xmlExpParseNumber(xmlExpCtxtPtr ctxt) { 7655 int ret = 0; 7656 7657 SKIP_BLANKS 7658 if (CUR == '*') { 7659 NEXT 7660 return(-1); 7661 } 7662 if ((CUR < '0') || (CUR > '9')) 7663 return(-1); 7664 while ((CUR >= '0') && (CUR <= '9')) { 7665 ret = ret * 10 + (CUR - '0'); 7666 NEXT 7667 } 7668 return(ret); 7669} 7670 7671static xmlExpNodePtr 7672xmlExpParseOr(xmlExpCtxtPtr ctxt) { 7673 const char *base; 7674 xmlExpNodePtr ret; 7675 const xmlChar *val; 7676 7677 SKIP_BLANKS 7678 base = ctxt->cur; 7679 if (*ctxt->cur == '(') { 7680 NEXT 7681 ret = xmlExpParseExpr(ctxt); 7682 SKIP_BLANKS 7683 if (*ctxt->cur != ')') { 7684 fprintf(stderr, "unbalanced '(' : %s\n", base); 7685 xmlExpFree(ctxt, ret); 7686 return(NULL); 7687 } 7688 NEXT; 7689 SKIP_BLANKS 7690 goto parse_quantifier; 7691 } 7692 while ((CUR != 0) && (!(IS_BLANK(CUR))) && (CUR != '(') && 7693 (CUR != ')') && (CUR != '|') && (CUR != ',') && (CUR != '{') && 7694 (CUR != '*') && (CUR != '+') && (CUR != '?') && (CUR != '}')) 7695 NEXT; 7696 val = xmlDictLookup(ctxt->dict, BAD_CAST base, ctxt->cur - base); 7697 if (val == NULL) 7698 return(NULL); 7699 ret = xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, val, 0, 0); 7700 if (ret == NULL) 7701 return(NULL); 7702 SKIP_BLANKS 7703parse_quantifier: 7704 if (CUR == '{') { 7705 int min, max; 7706 7707 NEXT 7708 min = xmlExpParseNumber(ctxt); 7709 if (min < 0) { 7710 xmlExpFree(ctxt, ret); 7711 return(NULL); 7712 } 7713 SKIP_BLANKS 7714 if (CUR == ',') { 7715 NEXT 7716 max = xmlExpParseNumber(ctxt); 7717 SKIP_BLANKS 7718 } else 7719 max = min; 7720 if (CUR != '}') { 7721 xmlExpFree(ctxt, ret); 7722 return(NULL); 7723 } 7724 NEXT 7725 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7726 min, max); 7727 SKIP_BLANKS 7728 } else if (CUR == '?') { 7729 NEXT 7730 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7731 0, 1); 7732 SKIP_BLANKS 7733 } else if (CUR == '+') { 7734 NEXT 7735 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7736 1, -1); 7737 SKIP_BLANKS 7738 } else if (CUR == '*') { 7739 NEXT 7740 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL, 7741 0, -1); 7742 SKIP_BLANKS 7743 } 7744 return(ret); 7745} 7746 7747 7748static xmlExpNodePtr 7749xmlExpParseSeq(xmlExpCtxtPtr ctxt) { 7750 xmlExpNodePtr ret, right; 7751 7752 ret = xmlExpParseOr(ctxt); 7753 SKIP_BLANKS 7754 while (CUR == '|') { 7755 NEXT 7756 right = xmlExpParseOr(ctxt); 7757 if (right == NULL) { 7758 xmlExpFree(ctxt, ret); 7759 return(NULL); 7760 } 7761 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, right, NULL, 0, 0); 7762 if (ret == NULL) 7763 return(NULL); 7764 } 7765 return(ret); 7766} 7767 7768static xmlExpNodePtr 7769xmlExpParseExpr(xmlExpCtxtPtr ctxt) { 7770 xmlExpNodePtr ret, right; 7771 7772 ret = xmlExpParseSeq(ctxt); 7773 SKIP_BLANKS 7774 while (CUR == ',') { 7775 NEXT 7776 right = xmlExpParseSeq(ctxt); 7777 if (right == NULL) { 7778 xmlExpFree(ctxt, ret); 7779 return(NULL); 7780 } 7781 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, right, NULL, 0, 0); 7782 if (ret == NULL) 7783 return(NULL); 7784 } 7785 return(ret); 7786} 7787 7788/** 7789 * xmlExpParse: 7790 * @ctxt: the expressions context 7791 * @expr: the 0 terminated string 7792 * 7793 * Minimal parser for regexps, it understand the following constructs 7794 * - string terminals 7795 * - choice operator | 7796 * - sequence operator , 7797 * - subexpressions (...) 7798 * - usual cardinality operators + * and ? 7799 * - finite sequences { min, max } 7800 * - infinite sequences { min, * } 7801 * There is minimal checkings made especially no checking on strings values 7802 * 7803 * Returns a new expression or NULL in case of failure 7804 */ 7805xmlExpNodePtr 7806xmlExpParse(xmlExpCtxtPtr ctxt, const char *expr) { 7807 xmlExpNodePtr ret; 7808 7809 ctxt->expr = expr; 7810 ctxt->cur = expr; 7811 7812 ret = xmlExpParseExpr(ctxt); 7813 SKIP_BLANKS 7814 if (*ctxt->cur != 0) { 7815 xmlExpFree(ctxt, ret); 7816 return(NULL); 7817 } 7818 return(ret); 7819} 7820 7821static void 7822xmlExpDumpInt(xmlBufferPtr buf, xmlExpNodePtr expr, int glob) { 7823 xmlExpNodePtr c; 7824 7825 if (expr == NULL) return; 7826 if (glob) xmlBufferWriteChar(buf, "("); 7827 switch (expr->type) { 7828 case XML_EXP_EMPTY: 7829 xmlBufferWriteChar(buf, "empty"); 7830 break; 7831 case XML_EXP_FORBID: 7832 xmlBufferWriteChar(buf, "forbidden"); 7833 break; 7834 case XML_EXP_ATOM: 7835 xmlBufferWriteCHAR(buf, expr->exp_str); 7836 break; 7837 case XML_EXP_SEQ: 7838 c = expr->exp_left; 7839 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 7840 xmlExpDumpInt(buf, c, 1); 7841 else 7842 xmlExpDumpInt(buf, c, 0); 7843 xmlBufferWriteChar(buf, " , "); 7844 c = expr->exp_right; 7845 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 7846 xmlExpDumpInt(buf, c, 1); 7847 else 7848 xmlExpDumpInt(buf, c, 0); 7849 break; 7850 case XML_EXP_OR: 7851 c = expr->exp_left; 7852 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 7853 xmlExpDumpInt(buf, c, 1); 7854 else 7855 xmlExpDumpInt(buf, c, 0); 7856 xmlBufferWriteChar(buf, " | "); 7857 c = expr->exp_right; 7858 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 7859 xmlExpDumpInt(buf, c, 1); 7860 else 7861 xmlExpDumpInt(buf, c, 0); 7862 break; 7863 case XML_EXP_COUNT: { 7864 char rep[40]; 7865 7866 c = expr->exp_left; 7867 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR)) 7868 xmlExpDumpInt(buf, c, 1); 7869 else 7870 xmlExpDumpInt(buf, c, 0); 7871 if ((expr->exp_min == 0) && (expr->exp_max == 1)) { 7872 rep[0] = '?'; 7873 rep[1] = 0; 7874 } else if ((expr->exp_min == 0) && (expr->exp_max == -1)) { 7875 rep[0] = '*'; 7876 rep[1] = 0; 7877 } else if ((expr->exp_min == 1) && (expr->exp_max == -1)) { 7878 rep[0] = '+'; 7879 rep[1] = 0; 7880 } else if (expr->exp_max == expr->exp_min) { 7881 snprintf(rep, 39, "{%d}", expr->exp_min); 7882 } else if (expr->exp_max < 0) { 7883 snprintf(rep, 39, "{%d,inf}", expr->exp_min); 7884 } else { 7885 snprintf(rep, 39, "{%d,%d}", expr->exp_min, expr->exp_max); 7886 } 7887 rep[39] = 0; 7888 xmlBufferWriteChar(buf, rep); 7889 break; 7890 } 7891 default: 7892 fprintf(stderr, "Error in tree\n"); 7893 } 7894 if (glob) 7895 xmlBufferWriteChar(buf, ")"); 7896} 7897/** 7898 * xmlExpDump: 7899 * @buf: a buffer to receive the output 7900 * @expr: the compiled expression 7901 * 7902 * Serialize the expression as compiled to the buffer 7903 */ 7904void 7905xmlExpDump(xmlBufferPtr buf, xmlExpNodePtr expr) { 7906 if ((buf == NULL) || (expr == NULL)) 7907 return; 7908 xmlExpDumpInt(buf, expr, 0); 7909} 7910 7911/** 7912 * xmlExpMaxToken: 7913 * @expr: a compiled expression 7914 * 7915 * Indicate the maximum number of input a expression can accept 7916 * 7917 * Returns the maximum length or -1 in case of error 7918 */ 7919int 7920xmlExpMaxToken(xmlExpNodePtr expr) { 7921 if (expr == NULL) 7922 return(-1); 7923 return(expr->c_max); 7924} 7925 7926/** 7927 * xmlExpCtxtNbNodes: 7928 * @ctxt: an expression context 7929 * 7930 * Debugging facility provides the number of allocated nodes at a that point 7931 * 7932 * Returns the number of nodes in use or -1 in case of error 7933 */ 7934int 7935xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt) { 7936 if (ctxt == NULL) 7937 return(-1); 7938 return(ctxt->nb_nodes); 7939} 7940 7941/** 7942 * xmlExpCtxtNbCons: 7943 * @ctxt: an expression context 7944 * 7945 * Debugging facility provides the number of allocated nodes over lifetime 7946 * 7947 * Returns the number of nodes ever allocated or -1 in case of error 7948 */ 7949int 7950xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt) { 7951 if (ctxt == NULL) 7952 return(-1); 7953 return(ctxt->nb_cons); 7954} 7955 7956#endif /* LIBXML_EXPR_ENABLED */ 7957#define bottom_xmlregexp 7958#include "elfgcchack.h" 7959#endif /* LIBXML_REGEXP_ENABLED */ 7960