1/* 2 * tclRegexp.c -- 3 * 4 * This file contains the public interfaces to the Tcl regular 5 * expression mechanism. 6 * 7 * Copyright (c) 1998 by Sun Microsystems, Inc. 8 * Copyright (c) 1998-1999 by Scriptics Corporation. 9 * 10 * See the file "license.terms" for information on usage and redistribution 11 * of this file, and for a DISCLAIMER OF ALL WARRANTIES. 12 * 13 * RCS: @(#) $Id: tclRegexp.c,v 1.14.4.2 2006/04/07 01:14:28 hobbs Exp $ 14 */ 15 16#include "tclInt.h" 17#include "tclPort.h" 18#include "tclRegexp.h" 19 20/* 21 *---------------------------------------------------------------------- 22 * The routines in this file use Henry Spencer's regular expression 23 * package contained in the following additional source files: 24 * 25 * regc_color.c regc_cvec.c regc_lex.c 26 * regc_nfa.c regcomp.c regcustom.h 27 * rege_dfa.c regerror.c regerrs.h 28 * regex.h regexec.c regfree.c 29 * regfronts.c regguts.h 30 * 31 * Copyright (c) 1998 Henry Spencer. All rights reserved. 32 * 33 * Development of this software was funded, in part, by Cray Research Inc., 34 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics 35 * Corporation, none of whom are responsible for the results. The author 36 * thanks all of them. 37 * 38 * Redistribution and use in source and binary forms -- with or without 39 * modification -- are permitted for any purpose, provided that 40 * redistributions in source form retain this entire copyright notice and 41 * indicate the origin and nature of any modifications. 42 * 43 * I'd appreciate being given credit for this package in the documentation 44 * of software which uses it, but that is not a requirement. 45 * 46 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, 47 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY 48 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL 49 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 50 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 51 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 52 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 53 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 54 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 55 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 56 * 57 * *** NOTE: this code has been altered slightly for use in Tcl: *** 58 * *** 1. Names have been changed, e.g. from re_comp to *** 59 * *** TclRegComp, to avoid clashes with other *** 60 * *** regexp implementations used by applications. *** 61 */ 62 63/* 64 * Thread local storage used to maintain a per-thread cache of compiled 65 * regular expressions. 66 */ 67 68#define NUM_REGEXPS 30 69 70typedef struct ThreadSpecificData { 71 int initialized; /* Set to 1 when the module is initialized. */ 72 char *patterns[NUM_REGEXPS];/* Strings corresponding to compiled 73 * regular expression patterns. NULL 74 * means that this slot isn't used. 75 * Malloc-ed. */ 76 int patLengths[NUM_REGEXPS];/* Number of non-null characters in 77 * corresponding entry in patterns. 78 * -1 means entry isn't used. */ 79 struct TclRegexp *regexps[NUM_REGEXPS]; 80 /* Compiled forms of above strings. Also 81 * malloc-ed, or NULL if not in use yet. */ 82} ThreadSpecificData; 83 84static Tcl_ThreadDataKey dataKey; 85 86/* 87 * Declarations for functions used only in this file. 88 */ 89 90static TclRegexp * CompileRegexp _ANSI_ARGS_((Tcl_Interp *interp, 91 CONST char *pattern, int length, int flags)); 92static void DupRegexpInternalRep _ANSI_ARGS_((Tcl_Obj *srcPtr, 93 Tcl_Obj *copyPtr)); 94static void FinalizeRegexp _ANSI_ARGS_((ClientData clientData)); 95static void FreeRegexp _ANSI_ARGS_((TclRegexp *regexpPtr)); 96static void FreeRegexpInternalRep _ANSI_ARGS_((Tcl_Obj *objPtr)); 97static int RegExpExecUniChar _ANSI_ARGS_((Tcl_Interp *interp, 98 Tcl_RegExp re, CONST Tcl_UniChar *uniString, 99 int numChars, int nmatches, int flags)); 100static int SetRegexpFromAny _ANSI_ARGS_((Tcl_Interp *interp, 101 Tcl_Obj *objPtr)); 102 103/* 104 * The regular expression Tcl object type. This serves as a cache 105 * of the compiled form of the regular expression. 106 */ 107 108static Tcl_ObjType tclRegexpType = { 109 "regexp", /* name */ 110 FreeRegexpInternalRep, /* freeIntRepProc */ 111 DupRegexpInternalRep, /* dupIntRepProc */ 112 NULL, /* updateStringProc */ 113 SetRegexpFromAny /* setFromAnyProc */ 114}; 115 116 117/* 118 *---------------------------------------------------------------------- 119 * 120 * Tcl_RegExpCompile -- 121 * 122 * Compile a regular expression into a form suitable for fast 123 * matching. This procedure is DEPRECATED in favor of the 124 * object version of the command. 125 * 126 * Results: 127 * The return value is a pointer to the compiled form of string, 128 * suitable for passing to Tcl_RegExpExec. This compiled form 129 * is only valid up until the next call to this procedure, so 130 * don't keep these around for a long time! If an error occurred 131 * while compiling the pattern, then NULL is returned and an error 132 * message is left in the interp's result. 133 * 134 * Side effects: 135 * Updates the cache of compiled regexps. 136 * 137 *---------------------------------------------------------------------- 138 */ 139 140Tcl_RegExp 141Tcl_RegExpCompile(interp, string) 142 Tcl_Interp *interp; /* For use in error reporting and 143 * to access the interp regexp cache. */ 144 CONST char *string; /* String for which to produce 145 * compiled regular expression. */ 146{ 147 return (Tcl_RegExp) CompileRegexp(interp, string, (int) strlen(string), 148 REG_ADVANCED); 149} 150 151/* 152 *---------------------------------------------------------------------- 153 * 154 * Tcl_RegExpExec -- 155 * 156 * Execute the regular expression matcher using a compiled form 157 * of a regular expression and save information about any match 158 * that is found. 159 * 160 * Results: 161 * If an error occurs during the matching operation then -1 162 * is returned and the interp's result contains an error message. 163 * Otherwise the return value is 1 if a matching range is 164 * found and 0 if there is no matching range. 165 * 166 * Side effects: 167 * None. 168 * 169 *---------------------------------------------------------------------- 170 */ 171 172int 173Tcl_RegExpExec(interp, re, string, start) 174 Tcl_Interp *interp; /* Interpreter to use for error reporting. */ 175 Tcl_RegExp re; /* Compiled regular expression; must have 176 * been returned by previous call to 177 * Tcl_GetRegExpFromObj. */ 178 CONST char *string; /* String against which to match re. */ 179 CONST char *start; /* If string is part of a larger string, 180 * this identifies beginning of larger 181 * string, so that "^" won't match. */ 182{ 183 int flags, result, numChars; 184 TclRegexp *regexp = (TclRegexp *)re; 185 Tcl_DString ds; 186 CONST Tcl_UniChar *ustr; 187 188 /* 189 * If the starting point is offset from the beginning of the buffer, 190 * then we need to tell the regexp engine not to match "^". 191 */ 192 193 if (string > start) { 194 flags = REG_NOTBOL; 195 } else { 196 flags = 0; 197 } 198 199 /* 200 * Remember the string for use by Tcl_RegExpRange(). 201 */ 202 203 regexp->string = string; 204 regexp->objPtr = NULL; 205 206 /* 207 * Convert the string to Unicode and perform the match. 208 */ 209 210 Tcl_DStringInit(&ds); 211 ustr = Tcl_UtfToUniCharDString(string, -1, &ds); 212 numChars = Tcl_DStringLength(&ds) / sizeof(Tcl_UniChar); 213 result = RegExpExecUniChar(interp, re, ustr, numChars, 214 -1 /* nmatches */, flags); 215 Tcl_DStringFree(&ds); 216 217 return result; 218} 219 220/* 221 *--------------------------------------------------------------------------- 222 * 223 * Tcl_RegExpRange -- 224 * 225 * Returns pointers describing the range of a regular expression match, 226 * or one of the subranges within the match. 227 * 228 * Results: 229 * The variables at *startPtr and *endPtr are modified to hold the 230 * addresses of the endpoints of the range given by index. If the 231 * specified range doesn't exist then NULLs are returned. 232 * 233 * Side effects: 234 * None. 235 * 236 *--------------------------------------------------------------------------- 237 */ 238 239void 240Tcl_RegExpRange(re, index, startPtr, endPtr) 241 Tcl_RegExp re; /* Compiled regular expression that has 242 * been passed to Tcl_RegExpExec. */ 243 int index; /* 0 means give the range of the entire 244 * match, > 0 means give the range of 245 * a matching subrange. */ 246 CONST char **startPtr; /* Store address of first character in 247 * (sub-) range here. */ 248 CONST char **endPtr; /* Store address of character just after last 249 * in (sub-) range here. */ 250{ 251 TclRegexp *regexpPtr = (TclRegexp *) re; 252 CONST char *string; 253 254 if ((size_t) index > regexpPtr->re.re_nsub) { 255 *startPtr = *endPtr = NULL; 256 } else if (regexpPtr->matches[index].rm_so < 0) { 257 *startPtr = *endPtr = NULL; 258 } else { 259 if (regexpPtr->objPtr) { 260 string = Tcl_GetString(regexpPtr->objPtr); 261 } else { 262 string = regexpPtr->string; 263 } 264 *startPtr = Tcl_UtfAtIndex(string, regexpPtr->matches[index].rm_so); 265 *endPtr = Tcl_UtfAtIndex(string, regexpPtr->matches[index].rm_eo); 266 } 267} 268 269/* 270 *--------------------------------------------------------------------------- 271 * 272 * RegExpExecUniChar -- 273 * 274 * Execute the regular expression matcher using a compiled form of a 275 * regular expression and save information about any match that is 276 * found. 277 * 278 * Results: 279 * If an error occurs during the matching operation then -1 is 280 * returned and an error message is left in interp's result. 281 * Otherwise the return value is 1 if a matching range was found or 282 * 0 if there was no matching range. 283 * 284 * Side effects: 285 * None. 286 * 287 *---------------------------------------------------------------------- 288 */ 289 290static int 291RegExpExecUniChar(interp, re, wString, numChars, nmatches, flags) 292 Tcl_Interp *interp; /* Interpreter to use for error reporting. */ 293 Tcl_RegExp re; /* Compiled regular expression; returned by 294 * a previous call to Tcl_GetRegExpFromObj */ 295 CONST Tcl_UniChar *wString; /* String against which to match re. */ 296 int numChars; /* Length of Tcl_UniChar string (must 297 * be >= 0). */ 298 int nmatches; /* How many subexpression matches (counting 299 * the whole match as subexpression 0) are 300 * of interest. -1 means "don't know". */ 301 int flags; /* Regular expression flags. */ 302{ 303 int status; 304 TclRegexp *regexpPtr = (TclRegexp *) re; 305 size_t last = regexpPtr->re.re_nsub + 1; 306 size_t nm = last; 307 308 if (nmatches >= 0 && (size_t) nmatches < nm) { 309 nm = (size_t) nmatches; 310 } 311 312 status = TclReExec(®expPtr->re, wString, (size_t) numChars, 313 ®expPtr->details, nm, regexpPtr->matches, flags); 314 315 /* 316 * Check for errors. 317 */ 318 319 if (status != REG_OKAY) { 320 if (status == REG_NOMATCH) { 321 return 0; 322 } 323 if (interp != NULL) { 324 TclRegError(interp, "error while matching regular expression: ", 325 status); 326 } 327 return -1; 328 } 329 return 1; 330} 331 332/* 333 *--------------------------------------------------------------------------- 334 * 335 * TclRegExpRangeUniChar -- 336 * 337 * Returns pointers describing the range of a regular expression match, 338 * or one of the subranges within the match, or the hypothetical range 339 * represented by the rm_extend field of the rm_detail_t. 340 * 341 * Results: 342 * The variables at *startPtr and *endPtr are modified to hold the 343 * offsets of the endpoints of the range given by index. If the 344 * specified range doesn't exist then -1s are supplied. 345 * 346 * Side effects: 347 * None. 348 * 349 *--------------------------------------------------------------------------- 350 */ 351 352void 353TclRegExpRangeUniChar(re, index, startPtr, endPtr) 354 Tcl_RegExp re; /* Compiled regular expression that has 355 * been passed to Tcl_RegExpExec. */ 356 int index; /* 0 means give the range of the entire 357 * match, > 0 means give the range of 358 * a matching subrange, -1 means the 359 * range of the rm_extend field. */ 360 int *startPtr; /* Store address of first character in 361 * (sub-) range here. */ 362 int *endPtr; /* Store address of character just after last 363 * in (sub-) range here. */ 364{ 365 TclRegexp *regexpPtr = (TclRegexp *) re; 366 367 if ((regexpPtr->flags®_EXPECT) && index == -1) { 368 *startPtr = regexpPtr->details.rm_extend.rm_so; 369 *endPtr = regexpPtr->details.rm_extend.rm_eo; 370 } else if ((size_t) index > regexpPtr->re.re_nsub) { 371 *startPtr = -1; 372 *endPtr = -1; 373 } else { 374 *startPtr = regexpPtr->matches[index].rm_so; 375 *endPtr = regexpPtr->matches[index].rm_eo; 376 } 377} 378 379/* 380 *---------------------------------------------------------------------- 381 * 382 * Tcl_RegExpMatch -- 383 * 384 * See if a string matches a regular expression. 385 * 386 * Results: 387 * If an error occurs during the matching operation then -1 388 * is returned and the interp's result contains an error message. 389 * Otherwise the return value is 1 if "string" matches "pattern" 390 * and 0 otherwise. 391 * 392 * Side effects: 393 * None. 394 * 395 *---------------------------------------------------------------------- 396 */ 397 398int 399Tcl_RegExpMatch(interp, string, pattern) 400 Tcl_Interp *interp; /* Used for error reporting. May be NULL. */ 401 CONST char *string; /* String. */ 402 CONST char *pattern; /* Regular expression to match against 403 * string. */ 404{ 405 Tcl_RegExp re; 406 407 re = Tcl_RegExpCompile(interp, pattern); 408 if (re == NULL) { 409 return -1; 410 } 411 return Tcl_RegExpExec(interp, re, string, string); 412} 413 414/* 415 *---------------------------------------------------------------------- 416 * 417 * Tcl_RegExpExecObj -- 418 * 419 * Execute a precompiled regexp against the given object. 420 * 421 * Results: 422 * If an error occurs during the matching operation then -1 423 * is returned and the interp's result contains an error message. 424 * Otherwise the return value is 1 if "string" matches "pattern" 425 * and 0 otherwise. 426 * 427 * Side effects: 428 * Converts the object to a Unicode object. 429 * 430 *---------------------------------------------------------------------- 431 */ 432 433int 434Tcl_RegExpExecObj(interp, re, objPtr, offset, nmatches, flags) 435 Tcl_Interp *interp; /* Interpreter to use for error reporting. */ 436 Tcl_RegExp re; /* Compiled regular expression; must have 437 * been returned by previous call to 438 * Tcl_GetRegExpFromObj. */ 439 Tcl_Obj *objPtr; /* String against which to match re. */ 440 int offset; /* Character index that marks where matching 441 * should begin. */ 442 int nmatches; /* How many subexpression matches (counting 443 * the whole match as subexpression 0) are 444 * of interest. -1 means all of them. */ 445 int flags; /* Regular expression execution flags. */ 446{ 447 TclRegexp *regexpPtr = (TclRegexp *) re; 448 Tcl_UniChar *udata; 449 int length; 450 451 /* 452 * Save the target object so we can extract strings from it later. 453 */ 454 455 regexpPtr->string = NULL; 456 regexpPtr->objPtr = objPtr; 457 458 udata = Tcl_GetUnicodeFromObj(objPtr, &length); 459 460 if (offset > length) { 461 offset = length; 462 } 463 udata += offset; 464 length -= offset; 465 466 return RegExpExecUniChar(interp, re, udata, length, nmatches, flags); 467} 468 469/* 470 *---------------------------------------------------------------------- 471 * 472 * Tcl_RegExpMatchObj -- 473 * 474 * See if an object matches a regular expression. 475 * 476 * Results: 477 * If an error occurs during the matching operation then -1 478 * is returned and the interp's result contains an error message. 479 * Otherwise the return value is 1 if "string" matches "pattern" 480 * and 0 otherwise. 481 * 482 * Side effects: 483 * Changes the internal rep of the pattern and string objects. 484 * 485 *---------------------------------------------------------------------- 486 */ 487 488int 489Tcl_RegExpMatchObj(interp, stringObj, patternObj) 490 Tcl_Interp *interp; /* Used for error reporting. May be NULL. */ 491 Tcl_Obj *stringObj; /* Object containing the String to search. */ 492 Tcl_Obj *patternObj; /* Regular expression to match against 493 * string. */ 494{ 495 Tcl_RegExp re; 496 497 re = Tcl_GetRegExpFromObj(interp, patternObj, 498 TCL_REG_ADVANCED | TCL_REG_NOSUB); 499 if (re == NULL) { 500 return -1; 501 } 502 return Tcl_RegExpExecObj(interp, re, stringObj, 0 /* offset */, 503 0 /* nmatches */, 0 /* flags */); 504} 505 506/* 507 *---------------------------------------------------------------------- 508 * 509 * Tcl_RegExpGetInfo -- 510 * 511 * Retrieve information about the current match. 512 * 513 * Results: 514 * None. 515 * 516 * Side effects: 517 * None. 518 * 519 *---------------------------------------------------------------------- 520 */ 521 522void 523Tcl_RegExpGetInfo(regexp, infoPtr) 524 Tcl_RegExp regexp; /* Pattern from which to get subexpressions. */ 525 Tcl_RegExpInfo *infoPtr; /* Match information is stored here. */ 526{ 527 TclRegexp *regexpPtr = (TclRegexp *) regexp; 528 529 infoPtr->nsubs = regexpPtr->re.re_nsub; 530 infoPtr->matches = (Tcl_RegExpIndices *) regexpPtr->matches; 531 infoPtr->extendStart = regexpPtr->details.rm_extend.rm_so; 532} 533 534/* 535 *---------------------------------------------------------------------- 536 * 537 * Tcl_GetRegExpFromObj -- 538 * 539 * Compile a regular expression into a form suitable for fast 540 * matching. This procedure caches the result in a Tcl_Obj. 541 * 542 * Results: 543 * The return value is a pointer to the compiled form of string, 544 * suitable for passing to Tcl_RegExpExec. If an error occurred 545 * while compiling the pattern, then NULL is returned and an error 546 * message is left in the interp's result. 547 * 548 * Side effects: 549 * Updates the native rep of the Tcl_Obj. 550 * 551 *---------------------------------------------------------------------- 552 */ 553 554Tcl_RegExp 555Tcl_GetRegExpFromObj(interp, objPtr, flags) 556 Tcl_Interp *interp; /* For use in error reporting, and to access 557 * the interp regexp cache. */ 558 Tcl_Obj *objPtr; /* Object whose string rep contains regular 559 * expression pattern. Internal rep will be 560 * changed to compiled form of this regular 561 * expression. */ 562 int flags; /* Regular expression compilation flags. */ 563{ 564 int length; 565 Tcl_ObjType *typePtr; 566 TclRegexp *regexpPtr; 567 char *pattern; 568 569 typePtr = objPtr->typePtr; 570 regexpPtr = (TclRegexp *) objPtr->internalRep.otherValuePtr; 571 572 if ((typePtr != &tclRegexpType) || (regexpPtr->flags != flags)) { 573 pattern = Tcl_GetStringFromObj(objPtr, &length); 574 575 regexpPtr = CompileRegexp(interp, pattern, length, flags); 576 if (regexpPtr == NULL) { 577 return NULL; 578 } 579 580 /* 581 * Add a reference to the regexp so it will persist even if it is 582 * pushed out of the current thread's regexp cache. This reference 583 * will be removed when the object's internal rep is freed. 584 */ 585 586 regexpPtr->refCount++; 587 588 /* 589 * Free the old representation and set our type. 590 */ 591 592 if ((typePtr != NULL) && (typePtr->freeIntRepProc != NULL)) { 593 (*typePtr->freeIntRepProc)(objPtr); 594 } 595 objPtr->internalRep.otherValuePtr = (VOID *) regexpPtr; 596 objPtr->typePtr = &tclRegexpType; 597 } 598 return (Tcl_RegExp) regexpPtr; 599} 600 601/* 602 *---------------------------------------------------------------------- 603 * 604 * TclRegAbout -- 605 * 606 * Return information about a compiled regular expression. 607 * 608 * Results: 609 * The return value is -1 for failure, 0 for success, although at 610 * the moment there's nothing that could fail. On success, a list 611 * is left in the interp's result: first element is the subexpression 612 * count, second is a list of re_info bit names. 613 * 614 * Side effects: 615 * None. 616 * 617 *---------------------------------------------------------------------- 618 */ 619 620int 621TclRegAbout(interp, re) 622 Tcl_Interp *interp; /* For use in variable assignment. */ 623 Tcl_RegExp re; /* The compiled regular expression. */ 624{ 625 TclRegexp *regexpPtr = (TclRegexp *)re; 626 char buf[TCL_INTEGER_SPACE]; 627 static struct infoname { 628 int bit; 629 char *text; 630 } infonames[] = { 631 {REG_UBACKREF, "REG_UBACKREF"}, 632 {REG_ULOOKAHEAD, "REG_ULOOKAHEAD"}, 633 {REG_UBOUNDS, "REG_UBOUNDS"}, 634 {REG_UBRACES, "REG_UBRACES"}, 635 {REG_UBSALNUM, "REG_UBSALNUM"}, 636 {REG_UPBOTCH, "REG_UPBOTCH"}, 637 {REG_UBBS, "REG_UBBS"}, 638 {REG_UNONPOSIX, "REG_UNONPOSIX"}, 639 {REG_UUNSPEC, "REG_UUNSPEC"}, 640 {REG_UUNPORT, "REG_UUNPORT"}, 641 {REG_ULOCALE, "REG_ULOCALE"}, 642 {REG_UEMPTYMATCH, "REG_UEMPTYMATCH"}, 643 {REG_UIMPOSSIBLE, "REG_UIMPOSSIBLE"}, 644 {REG_USHORTEST, "REG_USHORTEST"}, 645 {0, ""} 646 }; 647 struct infoname *inf; 648 int n; 649 650 Tcl_ResetResult(interp); 651 652 sprintf(buf, "%u", (unsigned)(regexpPtr->re.re_nsub)); 653 Tcl_AppendElement(interp, buf); 654 655 /* 656 * Must count bits before generating list, because we must know 657 * whether {} are needed before we start appending names. 658 */ 659 n = 0; 660 for (inf = infonames; inf->bit != 0; inf++) { 661 if (regexpPtr->re.re_info&inf->bit) { 662 n++; 663 } 664 } 665 if (n != 1) { 666 Tcl_AppendResult(interp, " {", NULL); 667 } 668 for (inf = infonames; inf->bit != 0; inf++) { 669 if (regexpPtr->re.re_info&inf->bit) { 670 Tcl_AppendElement(interp, inf->text); 671 } 672 } 673 if (n != 1) { 674 Tcl_AppendResult(interp, "}", NULL); 675 } 676 677 return 0; 678} 679 680/* 681 *---------------------------------------------------------------------- 682 * 683 * TclRegError -- 684 * 685 * Generate an error message based on the regexp status code. 686 * 687 * Results: 688 * Places an error in the interpreter. 689 * 690 * Side effects: 691 * Sets errorCode as well. 692 * 693 *---------------------------------------------------------------------- 694 */ 695 696void 697TclRegError(interp, msg, status) 698 Tcl_Interp *interp; /* Interpreter for error reporting. */ 699 CONST char *msg; /* Message to prepend to error. */ 700 int status; /* Status code to report. */ 701{ 702 char buf[100]; /* ample in practice */ 703 char cbuf[100]; /* lots in practice */ 704 size_t n; 705 char *p; 706 707 Tcl_ResetResult(interp); 708 n = TclReError(status, (regex_t *)NULL, buf, sizeof(buf)); 709 p = (n > sizeof(buf)) ? "..." : ""; 710 Tcl_AppendResult(interp, msg, buf, p, NULL); 711 712 sprintf(cbuf, "%d", status); 713 (VOID) TclReError(REG_ITOA, (regex_t *)NULL, cbuf, sizeof(cbuf)); 714 Tcl_SetErrorCode(interp, "REGEXP", cbuf, buf, NULL); 715} 716 717 718/* 719 *---------------------------------------------------------------------- 720 * 721 * FreeRegexpInternalRep -- 722 * 723 * Deallocate the storage associated with a regexp object's internal 724 * representation. 725 * 726 * Results: 727 * None. 728 * 729 * Side effects: 730 * Frees the compiled regular expression. 731 * 732 *---------------------------------------------------------------------- 733 */ 734 735static void 736FreeRegexpInternalRep(objPtr) 737 Tcl_Obj *objPtr; /* Regexp object with internal rep to free. */ 738{ 739 TclRegexp *regexpRepPtr = (TclRegexp *) objPtr->internalRep.otherValuePtr; 740 741 /* 742 * If this is the last reference to the regexp, free it. 743 */ 744 745 if (--(regexpRepPtr->refCount) <= 0) { 746 FreeRegexp(regexpRepPtr); 747 } 748} 749 750/* 751 *---------------------------------------------------------------------- 752 * 753 * DupRegexpInternalRep -- 754 * 755 * We copy the reference to the compiled regexp and bump its 756 * reference count. 757 * 758 * Results: 759 * None. 760 * 761 * Side effects: 762 * Increments the reference count of the regexp. 763 * 764 *---------------------------------------------------------------------- 765 */ 766 767static void 768DupRegexpInternalRep(srcPtr, copyPtr) 769 Tcl_Obj *srcPtr; /* Object with internal rep to copy. */ 770 Tcl_Obj *copyPtr; /* Object with internal rep to set. */ 771{ 772 TclRegexp *regexpPtr = (TclRegexp *) srcPtr->internalRep.otherValuePtr; 773 regexpPtr->refCount++; 774 copyPtr->internalRep.otherValuePtr = srcPtr->internalRep.otherValuePtr; 775 copyPtr->typePtr = &tclRegexpType; 776} 777 778/* 779 *---------------------------------------------------------------------- 780 * 781 * SetRegexpFromAny -- 782 * 783 * Attempt to generate a compiled regular expression for the Tcl object 784 * "objPtr". 785 * 786 * Results: 787 * The return value is TCL_OK or TCL_ERROR. If an error occurs during 788 * conversion, an error message is left in the interpreter's result 789 * unless "interp" is NULL. 790 * 791 * Side effects: 792 * If no error occurs, a regular expression is stored as "objPtr"s 793 * internal representation. 794 * 795 *---------------------------------------------------------------------- 796 */ 797 798static int 799SetRegexpFromAny(interp, objPtr) 800 Tcl_Interp *interp; /* Used for error reporting if not NULL. */ 801 Tcl_Obj *objPtr; /* The object to convert. */ 802{ 803 if (Tcl_GetRegExpFromObj(interp, objPtr, REG_ADVANCED) == NULL) { 804 return TCL_ERROR; 805 } 806 return TCL_OK; 807} 808 809/* 810 *--------------------------------------------------------------------------- 811 * 812 * CompileRegexp -- 813 * 814 * Attempt to compile the given regexp pattern. If the compiled 815 * regular expression can be found in the per-thread cache, it 816 * will be used instead of compiling a new copy. 817 * 818 * Results: 819 * The return value is a pointer to a newly allocated TclRegexp 820 * that represents the compiled pattern, or NULL if the pattern 821 * could not be compiled. If NULL is returned, an error message is 822 * left in the interp's result. 823 * 824 * Side effects: 825 * The thread-local regexp cache is updated and a new TclRegexp may 826 * be allocated. 827 * 828 *---------------------------------------------------------------------- 829 */ 830 831static TclRegexp * 832CompileRegexp(interp, string, length, flags) 833 Tcl_Interp *interp; /* Used for error reporting if not NULL. */ 834 CONST char *string; /* The regexp to compile (UTF-8). */ 835 int length; /* The length of the string in bytes. */ 836 int flags; /* Compilation flags. */ 837{ 838 TclRegexp *regexpPtr; 839 CONST Tcl_UniChar *uniString; 840 int numChars; 841 Tcl_DString stringBuf; 842 int status, i; 843 ThreadSpecificData *tsdPtr = TCL_TSD_INIT(&dataKey); 844 845 if (!tsdPtr->initialized) { 846 tsdPtr->initialized = 1; 847 Tcl_CreateThreadExitHandler(FinalizeRegexp, NULL); 848 } 849 850 /* 851 * This routine maintains a second-level regular expression cache in 852 * addition to the per-object regexp cache. The per-thread cache is needed 853 * to handle the case where for various reasons the object is lost between 854 * invocations of the regexp command, but the literal pattern is the same. 855 */ 856 857 /* 858 * Check the per-thread compiled regexp cache. We can only reuse 859 * a regexp if it has the same pattern and the same flags. 860 */ 861 862 for (i = 0; (i < NUM_REGEXPS) && (tsdPtr->patterns[i] != NULL); i++) { 863 if ((length == tsdPtr->patLengths[i]) 864 && (tsdPtr->regexps[i]->flags == flags) 865 && (strcmp(string, tsdPtr->patterns[i]) == 0)) { 866 /* 867 * Move the matched pattern to the first slot in the 868 * cache and shift the other patterns down one position. 869 */ 870 871 if (i != 0) { 872 int j; 873 char *cachedString; 874 875 cachedString = tsdPtr->patterns[i]; 876 regexpPtr = tsdPtr->regexps[i]; 877 for (j = i-1; j >= 0; j--) { 878 tsdPtr->patterns[j+1] = tsdPtr->patterns[j]; 879 tsdPtr->patLengths[j+1] = tsdPtr->patLengths[j]; 880 tsdPtr->regexps[j+1] = tsdPtr->regexps[j]; 881 } 882 tsdPtr->patterns[0] = cachedString; 883 tsdPtr->patLengths[0] = length; 884 tsdPtr->regexps[0] = regexpPtr; 885 } 886 return tsdPtr->regexps[0]; 887 } 888 } 889 890 /* 891 * This is a new expression, so compile it and add it to the cache. 892 */ 893 894 regexpPtr = (TclRegexp *) ckalloc(sizeof(TclRegexp)); 895 regexpPtr->objPtr = NULL; 896 regexpPtr->string = NULL; 897 regexpPtr->details.rm_extend.rm_so = -1; 898 regexpPtr->details.rm_extend.rm_eo = -1; 899 900 /* 901 * Get the up-to-date string representation and map to unicode. 902 */ 903 904 Tcl_DStringInit(&stringBuf); 905 uniString = Tcl_UtfToUniCharDString(string, length, &stringBuf); 906 numChars = Tcl_DStringLength(&stringBuf) / sizeof(Tcl_UniChar); 907 908 /* 909 * Compile the string and check for errors. 910 */ 911 912 regexpPtr->flags = flags; 913 status = TclReComp(®expPtr->re, uniString, (size_t) numChars, flags); 914 Tcl_DStringFree(&stringBuf); 915 916 if (status != REG_OKAY) { 917 /* 918 * Clean up and report errors in the interpreter, if possible. 919 */ 920 921 ckfree((char *)regexpPtr); 922 if (interp) { 923 TclRegError(interp, 924 "couldn't compile regular expression pattern: ", 925 status); 926 } 927 return NULL; 928 } 929 930 /* 931 * Allocate enough space for all of the subexpressions, plus one 932 * extra for the entire pattern. 933 */ 934 935 regexpPtr->matches = (regmatch_t *) ckalloc( 936 sizeof(regmatch_t) * (regexpPtr->re.re_nsub + 1)); 937 938 /* 939 * Initialize the refcount to one initially, since it is in the cache. 940 */ 941 942 regexpPtr->refCount = 1; 943 944 /* 945 * Free the last regexp, if necessary, and make room at the head of the 946 * list for the new regexp. 947 */ 948 949 if (tsdPtr->patterns[NUM_REGEXPS-1] != NULL) { 950 TclRegexp *oldRegexpPtr = tsdPtr->regexps[NUM_REGEXPS-1]; 951 if (--(oldRegexpPtr->refCount) <= 0) { 952 FreeRegexp(oldRegexpPtr); 953 } 954 ckfree(tsdPtr->patterns[NUM_REGEXPS-1]); 955 } 956 for (i = NUM_REGEXPS - 2; i >= 0; i--) { 957 tsdPtr->patterns[i+1] = tsdPtr->patterns[i]; 958 tsdPtr->patLengths[i+1] = tsdPtr->patLengths[i]; 959 tsdPtr->regexps[i+1] = tsdPtr->regexps[i]; 960 } 961 tsdPtr->patterns[0] = (char *) ckalloc((unsigned) (length+1)); 962 strcpy(tsdPtr->patterns[0], string); 963 tsdPtr->patLengths[0] = length; 964 tsdPtr->regexps[0] = regexpPtr; 965 966 return regexpPtr; 967} 968 969/* 970 *---------------------------------------------------------------------- 971 * 972 * FreeRegexp -- 973 * 974 * Release the storage associated with a TclRegexp. 975 * 976 * Results: 977 * None. 978 * 979 * Side effects: 980 * None. 981 * 982 *---------------------------------------------------------------------- 983 */ 984 985static void 986FreeRegexp(regexpPtr) 987 TclRegexp *regexpPtr; /* Compiled regular expression to free. */ 988{ 989 TclReFree(®expPtr->re); 990 if (regexpPtr->matches) { 991 ckfree((char *) regexpPtr->matches); 992 } 993 ckfree((char *) regexpPtr); 994} 995 996/* 997 *---------------------------------------------------------------------- 998 * 999 * FinalizeRegexp -- 1000 * 1001 * Release the storage associated with the per-thread regexp 1002 * cache. 1003 * 1004 * Results: 1005 * None. 1006 * 1007 * Side effects: 1008 * None. 1009 * 1010 *---------------------------------------------------------------------- 1011 */ 1012 1013static void 1014FinalizeRegexp(clientData) 1015 ClientData clientData; /* Not used. */ 1016{ 1017 int i; 1018 TclRegexp *regexpPtr; 1019 ThreadSpecificData *tsdPtr = TCL_TSD_INIT(&dataKey); 1020 1021 for (i = 0; (i < NUM_REGEXPS) && (tsdPtr->patterns[i] != NULL); i++) { 1022 regexpPtr = tsdPtr->regexps[i]; 1023 if (--(regexpPtr->refCount) <= 0) { 1024 FreeRegexp(regexpPtr); 1025 } 1026 ckfree(tsdPtr->patterns[i]); 1027 tsdPtr->patterns[i] = NULL; 1028 } 1029 /* 1030 * We may find ourselves reinitialized if another finalization routine 1031 * invokes regexps. 1032 */ 1033 tsdPtr->initialized = 0; 1034} 1035