1/*
2 * tclRegexp.c --
3 *
4 *	This file contains the public interfaces to the Tcl regular
5 *	expression mechanism.
6 *
7 * Copyright (c) 1998 by Sun Microsystems, Inc.
8 * Copyright (c) 1998-1999 by Scriptics Corporation.
9 *
10 * See the file "license.terms" for information on usage and redistribution
11 * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
12 *
13 * RCS: @(#) $Id: tclRegexp.c,v 1.14.4.2 2006/04/07 01:14:28 hobbs Exp $
14 */
15
16#include "tclInt.h"
17#include "tclPort.h"
18#include "tclRegexp.h"
19
20/*
21 *----------------------------------------------------------------------
22 * The routines in this file use Henry Spencer's regular expression
23 * package contained in the following additional source files:
24 *
25 *	regc_color.c	regc_cvec.c	regc_lex.c
26 *	regc_nfa.c	regcomp.c	regcustom.h
27 *	rege_dfa.c	regerror.c	regerrs.h
28 *	regex.h		regexec.c	regfree.c
29 *	regfronts.c	regguts.h
30 *
31 * Copyright (c) 1998 Henry Spencer.  All rights reserved.
32 *
33 * Development of this software was funded, in part, by Cray Research Inc.,
34 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
35 * Corporation, none of whom are responsible for the results.  The author
36 * thanks all of them.
37 *
38 * Redistribution and use in source and binary forms -- with or without
39 * modification -- are permitted for any purpose, provided that
40 * redistributions in source form retain this entire copyright notice and
41 * indicate the origin and nature of any modifications.
42 *
43 * I'd appreciate being given credit for this package in the documentation
44 * of software which uses it, but that is not a requirement.
45 *
46 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
47 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
48 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
49 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
50 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
51 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
52 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
53 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
54 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
55 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
56 *
57 * *** NOTE: this code has been altered slightly for use in Tcl: ***
58 * *** 1. Names have been changed, e.g. from re_comp to		 ***
59 * ***    TclRegComp, to avoid clashes with other 		 ***
60 * ***    regexp implementations used by applications. 		 ***
61 */
62
63/*
64 * Thread local storage used to maintain a per-thread cache of compiled
65 * regular expressions.
66 */
67
68#define NUM_REGEXPS 30
69
70typedef struct ThreadSpecificData {
71    int initialized;		/* Set to 1 when the module is initialized. */
72    char *patterns[NUM_REGEXPS];/* Strings corresponding to compiled
73				 * regular expression patterns.	 NULL
74				 * means that this slot isn't used.
75				 * Malloc-ed. */
76    int patLengths[NUM_REGEXPS];/* Number of non-null characters in
77				 * corresponding entry in patterns.
78				 * -1 means entry isn't used. */
79    struct TclRegexp *regexps[NUM_REGEXPS];
80				/* Compiled forms of above strings.  Also
81				 * malloc-ed, or NULL if not in use yet. */
82} ThreadSpecificData;
83
84static Tcl_ThreadDataKey dataKey;
85
86/*
87 * Declarations for functions used only in this file.
88 */
89
90static TclRegexp *	CompileRegexp _ANSI_ARGS_((Tcl_Interp *interp,
91			    CONST char *pattern, int length, int flags));
92static void		DupRegexpInternalRep _ANSI_ARGS_((Tcl_Obj *srcPtr,
93			    Tcl_Obj *copyPtr));
94static void		FinalizeRegexp _ANSI_ARGS_((ClientData clientData));
95static void		FreeRegexp _ANSI_ARGS_((TclRegexp *regexpPtr));
96static void		FreeRegexpInternalRep _ANSI_ARGS_((Tcl_Obj *objPtr));
97static int		RegExpExecUniChar _ANSI_ARGS_((Tcl_Interp *interp,
98			    Tcl_RegExp re, CONST Tcl_UniChar *uniString,
99			    int numChars, int nmatches, int flags));
100static int		SetRegexpFromAny _ANSI_ARGS_((Tcl_Interp *interp,
101			    Tcl_Obj *objPtr));
102
103/*
104 * The regular expression Tcl object type.  This serves as a cache
105 * of the compiled form of the regular expression.
106 */
107
108static Tcl_ObjType tclRegexpType = {
109    "regexp",				/* name */
110    FreeRegexpInternalRep,		/* freeIntRepProc */
111    DupRegexpInternalRep,		/* dupIntRepProc */
112    NULL,				/* updateStringProc */
113    SetRegexpFromAny			/* setFromAnyProc */
114};
115
116
117/*
118 *----------------------------------------------------------------------
119 *
120 * Tcl_RegExpCompile --
121 *
122 *	Compile a regular expression into a form suitable for fast
123 *	matching.  This procedure is DEPRECATED in favor of the
124 *	object version of the command.
125 *
126 * Results:
127 *	The return value is a pointer to the compiled form of string,
128 *	suitable for passing to Tcl_RegExpExec.  This compiled form
129 *	is only valid up until the next call to this procedure, so
130 *	don't keep these around for a long time!  If an error occurred
131 *	while compiling the pattern, then NULL is returned and an error
132 *	message is left in the interp's result.
133 *
134 * Side effects:
135 *	Updates the cache of compiled regexps.
136 *
137 *----------------------------------------------------------------------
138 */
139
140Tcl_RegExp
141Tcl_RegExpCompile(interp, string)
142    Tcl_Interp *interp;		/* For use in error reporting and
143				 * to access the interp regexp cache. */
144    CONST char *string;		/* String for which to produce
145				 * compiled regular expression. */
146{
147    return (Tcl_RegExp) CompileRegexp(interp, string, (int) strlen(string),
148	    REG_ADVANCED);
149}
150
151/*
152 *----------------------------------------------------------------------
153 *
154 * Tcl_RegExpExec --
155 *
156 *	Execute the regular expression matcher using a compiled form
157 *	of a regular expression and save information about any match
158 *	that is found.
159 *
160 * Results:
161 *	If an error occurs during the matching operation then -1
162 *	is returned and the interp's result contains an error message.
163 *	Otherwise the return value is 1 if a matching range is
164 *	found and 0 if there is no matching range.
165 *
166 * Side effects:
167 *	None.
168 *
169 *----------------------------------------------------------------------
170 */
171
172int
173Tcl_RegExpExec(interp, re, string, start)
174    Tcl_Interp *interp;		/* Interpreter to use for error reporting. */
175    Tcl_RegExp re;		/* Compiled regular expression;  must have
176				 * been returned by previous call to
177				 * Tcl_GetRegExpFromObj. */
178    CONST char *string;		/* String against which to match re. */
179    CONST char *start;		/* If string is part of a larger string,
180				 * this identifies beginning of larger
181				 * string, so that "^" won't match. */
182{
183    int flags, result, numChars;
184    TclRegexp *regexp = (TclRegexp *)re;
185    Tcl_DString ds;
186    CONST Tcl_UniChar *ustr;
187
188    /*
189     * If the starting point is offset from the beginning of the buffer,
190     * then we need to tell the regexp engine not to match "^".
191     */
192
193    if (string > start) {
194	flags = REG_NOTBOL;
195    } else {
196	flags = 0;
197    }
198
199    /*
200     * Remember the string for use by Tcl_RegExpRange().
201     */
202
203    regexp->string = string;
204    regexp->objPtr = NULL;
205
206    /*
207     * Convert the string to Unicode and perform the match.
208     */
209
210    Tcl_DStringInit(&ds);
211    ustr = Tcl_UtfToUniCharDString(string, -1, &ds);
212    numChars = Tcl_DStringLength(&ds) / sizeof(Tcl_UniChar);
213    result = RegExpExecUniChar(interp, re, ustr, numChars,
214	    -1 /* nmatches */, flags);
215    Tcl_DStringFree(&ds);
216
217    return result;
218}
219
220/*
221 *---------------------------------------------------------------------------
222 *
223 * Tcl_RegExpRange --
224 *
225 *	Returns pointers describing the range of a regular expression match,
226 *	or one of the subranges within the match.
227 *
228 * Results:
229 *	The variables at *startPtr and *endPtr are modified to hold the
230 *	addresses of the endpoints of the range given by index.  If the
231 *	specified range doesn't exist then NULLs are returned.
232 *
233 * Side effects:
234 *	None.
235 *
236 *---------------------------------------------------------------------------
237 */
238
239void
240Tcl_RegExpRange(re, index, startPtr, endPtr)
241    Tcl_RegExp re;		/* Compiled regular expression that has
242				 * been passed to Tcl_RegExpExec. */
243    int index;			/* 0 means give the range of the entire
244				 * match, > 0 means give the range of
245				 * a matching subrange. */
246    CONST char **startPtr;	/* Store address of first character in
247				 * (sub-) range here. */
248    CONST char **endPtr;	/* Store address of character just after last
249				 * in (sub-) range here. */
250{
251    TclRegexp *regexpPtr = (TclRegexp *) re;
252    CONST char *string;
253
254    if ((size_t) index > regexpPtr->re.re_nsub) {
255	*startPtr = *endPtr = NULL;
256    } else if (regexpPtr->matches[index].rm_so < 0) {
257	*startPtr = *endPtr = NULL;
258    } else {
259	if (regexpPtr->objPtr) {
260	    string = Tcl_GetString(regexpPtr->objPtr);
261	} else {
262	    string = regexpPtr->string;
263	}
264	*startPtr = Tcl_UtfAtIndex(string, regexpPtr->matches[index].rm_so);
265	*endPtr = Tcl_UtfAtIndex(string, regexpPtr->matches[index].rm_eo);
266    }
267}
268
269/*
270 *---------------------------------------------------------------------------
271 *
272 * RegExpExecUniChar --
273 *
274 *	Execute the regular expression matcher using a compiled form of a
275 *	regular expression and save information about any match that is
276 *	found.
277 *
278 * Results:
279 *	If an error occurs during the matching operation then -1 is
280 *	returned and an error message is left in interp's result.
281 *	Otherwise the return value is 1 if a matching range was found or
282 *	0 if there was no matching range.
283 *
284 * Side effects:
285 *	None.
286 *
287 *----------------------------------------------------------------------
288 */
289
290static int
291RegExpExecUniChar(interp, re, wString, numChars, nmatches, flags)
292    Tcl_Interp *interp;		/* Interpreter to use for error reporting. */
293    Tcl_RegExp re;		/* Compiled regular expression; returned by
294				 * a previous call to Tcl_GetRegExpFromObj */
295    CONST Tcl_UniChar *wString;	/* String against which to match re. */
296    int numChars;		/* Length of Tcl_UniChar string (must
297				 * be >= 0). */
298    int nmatches;		/* How many subexpression matches (counting
299				 * the whole match as subexpression 0) are
300				 * of interest.  -1 means "don't know". */
301    int flags;			/* Regular expression flags. */
302{
303    int status;
304    TclRegexp *regexpPtr = (TclRegexp *) re;
305    size_t last = regexpPtr->re.re_nsub + 1;
306    size_t nm = last;
307
308    if (nmatches >= 0 && (size_t) nmatches < nm) {
309	nm = (size_t) nmatches;
310    }
311
312    status = TclReExec(&regexpPtr->re, wString, (size_t) numChars,
313	    &regexpPtr->details, nm, regexpPtr->matches, flags);
314
315    /*
316     * Check for errors.
317     */
318
319    if (status != REG_OKAY) {
320	if (status == REG_NOMATCH) {
321	    return 0;
322	}
323	if (interp != NULL) {
324	    TclRegError(interp, "error while matching regular expression: ",
325		    status);
326	}
327	return -1;
328    }
329    return 1;
330}
331
332/*
333 *---------------------------------------------------------------------------
334 *
335 * TclRegExpRangeUniChar --
336 *
337 *	Returns pointers describing the range of a regular expression match,
338 *	or one of the subranges within the match, or the hypothetical range
339 *	represented by the rm_extend field of the rm_detail_t.
340 *
341 * Results:
342 *	The variables at *startPtr and *endPtr are modified to hold the
343 *	offsets of the endpoints of the range given by index.  If the
344 *	specified range doesn't exist then -1s are supplied.
345 *
346 * Side effects:
347 *	None.
348 *
349 *---------------------------------------------------------------------------
350 */
351
352void
353TclRegExpRangeUniChar(re, index, startPtr, endPtr)
354    Tcl_RegExp re;		/* Compiled regular expression that has
355				 * been passed to Tcl_RegExpExec. */
356    int index;			/* 0 means give the range of the entire
357				 * match, > 0 means give the range of
358				 * a matching subrange, -1 means the
359				 * range of the rm_extend field. */
360    int *startPtr;		/* Store address of first character in
361				 * (sub-) range here. */
362    int *endPtr;		/* Store address of character just after last
363				 * in (sub-) range here. */
364{
365    TclRegexp *regexpPtr = (TclRegexp *) re;
366
367    if ((regexpPtr->flags&REG_EXPECT) && index == -1) {
368	*startPtr = regexpPtr->details.rm_extend.rm_so;
369	*endPtr = regexpPtr->details.rm_extend.rm_eo;
370    } else if ((size_t) index > regexpPtr->re.re_nsub) {
371	*startPtr = -1;
372	*endPtr = -1;
373    } else {
374	*startPtr = regexpPtr->matches[index].rm_so;
375	*endPtr = regexpPtr->matches[index].rm_eo;
376    }
377}
378
379/*
380 *----------------------------------------------------------------------
381 *
382 * Tcl_RegExpMatch --
383 *
384 *	See if a string matches a regular expression.
385 *
386 * Results:
387 *	If an error occurs during the matching operation then -1
388 *	is returned and the interp's result contains an error message.
389 *	Otherwise the return value is 1 if "string" matches "pattern"
390 *	and 0 otherwise.
391 *
392 * Side effects:
393 *	None.
394 *
395 *----------------------------------------------------------------------
396 */
397
398int
399Tcl_RegExpMatch(interp, string, pattern)
400    Tcl_Interp *interp;		/* Used for error reporting. May be NULL. */
401    CONST char *string;		/* String. */
402    CONST char *pattern;	/* Regular expression to match against
403				 * string. */
404{
405    Tcl_RegExp re;
406
407    re = Tcl_RegExpCompile(interp, pattern);
408    if (re == NULL) {
409	return -1;
410    }
411    return Tcl_RegExpExec(interp, re, string, string);
412}
413
414/*
415 *----------------------------------------------------------------------
416 *
417 * Tcl_RegExpExecObj --
418 *
419 *	Execute a precompiled regexp against the given object.
420 *
421 * Results:
422 *	If an error occurs during the matching operation then -1
423 *	is returned and the interp's result contains an error message.
424 *	Otherwise the return value is 1 if "string" matches "pattern"
425 *	and 0 otherwise.
426 *
427 * Side effects:
428 *	Converts the object to a Unicode object.
429 *
430 *----------------------------------------------------------------------
431 */
432
433int
434Tcl_RegExpExecObj(interp, re, objPtr, offset, nmatches, flags)
435    Tcl_Interp *interp;		/* Interpreter to use for error reporting. */
436    Tcl_RegExp re;		/* Compiled regular expression;  must have
437				 * been returned by previous call to
438				 * Tcl_GetRegExpFromObj. */
439    Tcl_Obj *objPtr;		/* String against which to match re. */
440    int offset;			/* Character index that marks where matching
441				 * should begin. */
442    int nmatches;		/* How many subexpression matches (counting
443				 * the whole match as subexpression 0) are
444				 * of interest.  -1 means all of them. */
445    int flags;			/* Regular expression execution flags. */
446{
447    TclRegexp *regexpPtr = (TclRegexp *) re;
448    Tcl_UniChar *udata;
449    int length;
450
451    /*
452     * Save the target object so we can extract strings from it later.
453     */
454
455    regexpPtr->string = NULL;
456    regexpPtr->objPtr = objPtr;
457
458    udata = Tcl_GetUnicodeFromObj(objPtr, &length);
459
460    if (offset > length) {
461	offset = length;
462    }
463    udata += offset;
464    length -= offset;
465
466    return RegExpExecUniChar(interp, re, udata, length, nmatches, flags);
467}
468
469/*
470 *----------------------------------------------------------------------
471 *
472 * Tcl_RegExpMatchObj --
473 *
474 *	See if an object matches a regular expression.
475 *
476 * Results:
477 *	If an error occurs during the matching operation then -1
478 *	is returned and the interp's result contains an error message.
479 *	Otherwise the return value is 1 if "string" matches "pattern"
480 *	and 0 otherwise.
481 *
482 * Side effects:
483 *	Changes the internal rep of the pattern and string objects.
484 *
485 *----------------------------------------------------------------------
486 */
487
488int
489Tcl_RegExpMatchObj(interp, stringObj, patternObj)
490    Tcl_Interp *interp;		/* Used for error reporting. May be NULL. */
491    Tcl_Obj *stringObj;		/* Object containing the String to search. */
492    Tcl_Obj *patternObj;	/* Regular expression to match against
493				 * string. */
494{
495    Tcl_RegExp re;
496
497    re = Tcl_GetRegExpFromObj(interp, patternObj,
498	    TCL_REG_ADVANCED | TCL_REG_NOSUB);
499    if (re == NULL) {
500	return -1;
501    }
502    return Tcl_RegExpExecObj(interp, re, stringObj, 0 /* offset */,
503	    0 /* nmatches */, 0 /* flags */);
504}
505
506/*
507 *----------------------------------------------------------------------
508 *
509 * Tcl_RegExpGetInfo --
510 *
511 *	Retrieve information about the current match.
512 *
513 * Results:
514 *	None.
515 *
516 * Side effects:
517 *	None.
518 *
519 *----------------------------------------------------------------------
520 */
521
522void
523Tcl_RegExpGetInfo(regexp, infoPtr)
524    Tcl_RegExp regexp;		/* Pattern from which to get subexpressions. */
525    Tcl_RegExpInfo *infoPtr;	/* Match information is stored here.  */
526{
527    TclRegexp *regexpPtr = (TclRegexp *) regexp;
528
529    infoPtr->nsubs = regexpPtr->re.re_nsub;
530    infoPtr->matches = (Tcl_RegExpIndices *) regexpPtr->matches;
531    infoPtr->extendStart = regexpPtr->details.rm_extend.rm_so;
532}
533
534/*
535 *----------------------------------------------------------------------
536 *
537 * Tcl_GetRegExpFromObj --
538 *
539 *	Compile a regular expression into a form suitable for fast
540 *	matching.  This procedure caches the result in a Tcl_Obj.
541 *
542 * Results:
543 *	The return value is a pointer to the compiled form of string,
544 *	suitable for passing to Tcl_RegExpExec.  If an error occurred
545 *	while compiling the pattern, then NULL is returned and an error
546 *	message is left in the interp's result.
547 *
548 * Side effects:
549 *	Updates the native rep of the Tcl_Obj.
550 *
551 *----------------------------------------------------------------------
552 */
553
554Tcl_RegExp
555Tcl_GetRegExpFromObj(interp, objPtr, flags)
556    Tcl_Interp *interp;		/* For use in error reporting, and to access
557				 * the interp regexp cache. */
558    Tcl_Obj *objPtr;		/* Object whose string rep contains regular
559				 * expression pattern.  Internal rep will be
560				 * changed to compiled form of this regular
561				 * expression. */
562    int flags;			/* Regular expression compilation flags. */
563{
564    int length;
565    Tcl_ObjType *typePtr;
566    TclRegexp *regexpPtr;
567    char *pattern;
568
569    typePtr = objPtr->typePtr;
570    regexpPtr = (TclRegexp *) objPtr->internalRep.otherValuePtr;
571
572    if ((typePtr != &tclRegexpType) || (regexpPtr->flags != flags)) {
573	pattern = Tcl_GetStringFromObj(objPtr, &length);
574
575	regexpPtr = CompileRegexp(interp, pattern, length, flags);
576	if (regexpPtr == NULL) {
577	    return NULL;
578	}
579
580	/*
581	 * Add a reference to the regexp so it will persist even if it is
582	 * pushed out of the current thread's regexp cache.  This reference
583	 * will be removed when the object's internal rep is freed.
584	 */
585
586	regexpPtr->refCount++;
587
588	/*
589	 * Free the old representation and set our type.
590	 */
591
592	if ((typePtr != NULL) && (typePtr->freeIntRepProc != NULL)) {
593	    (*typePtr->freeIntRepProc)(objPtr);
594	}
595	objPtr->internalRep.otherValuePtr = (VOID *) regexpPtr;
596	objPtr->typePtr = &tclRegexpType;
597    }
598    return (Tcl_RegExp) regexpPtr;
599}
600
601/*
602 *----------------------------------------------------------------------
603 *
604 * TclRegAbout --
605 *
606 *	Return information about a compiled regular expression.
607 *
608 * Results:
609 *	The return value is -1 for failure, 0 for success, although at
610 *	the moment there's nothing that could fail.  On success, a list
611 *	is left in the interp's result:  first element is the subexpression
612 *	count, second is a list of re_info bit names.
613 *
614 * Side effects:
615 *	None.
616 *
617 *----------------------------------------------------------------------
618 */
619
620int
621TclRegAbout(interp, re)
622    Tcl_Interp *interp;		/* For use in variable assignment. */
623    Tcl_RegExp re;		/* The compiled regular expression. */
624{
625    TclRegexp *regexpPtr = (TclRegexp *)re;
626    char buf[TCL_INTEGER_SPACE];
627    static struct infoname {
628	int bit;
629	char *text;
630    } infonames[] = {
631	{REG_UBACKREF,		"REG_UBACKREF"},
632	{REG_ULOOKAHEAD,	"REG_ULOOKAHEAD"},
633	{REG_UBOUNDS,		"REG_UBOUNDS"},
634	{REG_UBRACES,		"REG_UBRACES"},
635	{REG_UBSALNUM,		"REG_UBSALNUM"},
636	{REG_UPBOTCH,		"REG_UPBOTCH"},
637	{REG_UBBS,		"REG_UBBS"},
638	{REG_UNONPOSIX,		"REG_UNONPOSIX"},
639	{REG_UUNSPEC,		"REG_UUNSPEC"},
640	{REG_UUNPORT,		"REG_UUNPORT"},
641	{REG_ULOCALE,		"REG_ULOCALE"},
642	{REG_UEMPTYMATCH,	"REG_UEMPTYMATCH"},
643	{REG_UIMPOSSIBLE,	"REG_UIMPOSSIBLE"},
644	{REG_USHORTEST,		"REG_USHORTEST"},
645	{0,			""}
646    };
647    struct infoname *inf;
648    int n;
649
650    Tcl_ResetResult(interp);
651
652    sprintf(buf, "%u", (unsigned)(regexpPtr->re.re_nsub));
653    Tcl_AppendElement(interp, buf);
654
655    /*
656     * Must count bits before generating list, because we must know
657     * whether {} are needed before we start appending names.
658     */
659    n = 0;
660    for (inf = infonames; inf->bit != 0; inf++) {
661	if (regexpPtr->re.re_info&inf->bit) {
662	    n++;
663	}
664    }
665    if (n != 1) {
666	Tcl_AppendResult(interp, " {", NULL);
667    }
668    for (inf = infonames; inf->bit != 0; inf++) {
669	if (regexpPtr->re.re_info&inf->bit) {
670	    Tcl_AppendElement(interp, inf->text);
671	}
672    }
673    if (n != 1) {
674	Tcl_AppendResult(interp, "}", NULL);
675    }
676
677    return 0;
678}
679
680/*
681 *----------------------------------------------------------------------
682 *
683 * TclRegError --
684 *
685 *	Generate an error message based on the regexp status code.
686 *
687 * Results:
688 *	Places an error in the interpreter.
689 *
690 * Side effects:
691 *	Sets errorCode as well.
692 *
693 *----------------------------------------------------------------------
694 */
695
696void
697TclRegError(interp, msg, status)
698    Tcl_Interp *interp;		/* Interpreter for error reporting. */
699    CONST char *msg;		/* Message to prepend to error. */
700    int status;			/* Status code to report. */
701{
702    char buf[100];		/* ample in practice */
703    char cbuf[100];		/* lots in practice */
704    size_t n;
705    char *p;
706
707    Tcl_ResetResult(interp);
708    n = TclReError(status, (regex_t *)NULL, buf, sizeof(buf));
709    p = (n > sizeof(buf)) ? "..." : "";
710    Tcl_AppendResult(interp, msg, buf, p, NULL);
711
712    sprintf(cbuf, "%d", status);
713    (VOID) TclReError(REG_ITOA, (regex_t *)NULL, cbuf, sizeof(cbuf));
714    Tcl_SetErrorCode(interp, "REGEXP", cbuf, buf, NULL);
715}
716
717
718/*
719 *----------------------------------------------------------------------
720 *
721 * FreeRegexpInternalRep --
722 *
723 *	Deallocate the storage associated with a regexp object's internal
724 *	representation.
725 *
726 * Results:
727 *	None.
728 *
729 * Side effects:
730 *	Frees the compiled regular expression.
731 *
732 *----------------------------------------------------------------------
733 */
734
735static void
736FreeRegexpInternalRep(objPtr)
737    Tcl_Obj *objPtr;		/* Regexp object with internal rep to free. */
738{
739    TclRegexp *regexpRepPtr = (TclRegexp *) objPtr->internalRep.otherValuePtr;
740
741    /*
742     * If this is the last reference to the regexp, free it.
743     */
744
745    if (--(regexpRepPtr->refCount) <= 0) {
746	FreeRegexp(regexpRepPtr);
747    }
748}
749
750/*
751 *----------------------------------------------------------------------
752 *
753 * DupRegexpInternalRep --
754 *
755 *	We copy the reference to the compiled regexp and bump its
756 *	reference count.
757 *
758 * Results:
759 *	None.
760 *
761 * Side effects:
762 *	Increments the reference count of the regexp.
763 *
764 *----------------------------------------------------------------------
765 */
766
767static void
768DupRegexpInternalRep(srcPtr, copyPtr)
769    Tcl_Obj *srcPtr;		/* Object with internal rep to copy. */
770    Tcl_Obj *copyPtr;		/* Object with internal rep to set. */
771{
772    TclRegexp *regexpPtr = (TclRegexp *) srcPtr->internalRep.otherValuePtr;
773    regexpPtr->refCount++;
774    copyPtr->internalRep.otherValuePtr = srcPtr->internalRep.otherValuePtr;
775    copyPtr->typePtr = &tclRegexpType;
776}
777
778/*
779 *----------------------------------------------------------------------
780 *
781 * SetRegexpFromAny --
782 *
783 *	Attempt to generate a compiled regular expression for the Tcl object
784 *	"objPtr".
785 *
786 * Results:
787 *	The return value is TCL_OK or TCL_ERROR. If an error occurs during
788 *	conversion, an error message is left in the interpreter's result
789 *	unless "interp" is NULL.
790 *
791 * Side effects:
792 *	If no error occurs, a regular expression is stored as "objPtr"s
793 *	internal representation.
794 *
795 *----------------------------------------------------------------------
796 */
797
798static int
799SetRegexpFromAny(interp, objPtr)
800    Tcl_Interp *interp;		/* Used for error reporting if not NULL. */
801    Tcl_Obj *objPtr;		/* The object to convert. */
802{
803    if (Tcl_GetRegExpFromObj(interp, objPtr, REG_ADVANCED) == NULL) {
804	return TCL_ERROR;
805    }
806    return TCL_OK;
807}
808
809/*
810 *---------------------------------------------------------------------------
811 *
812 * CompileRegexp --
813 *
814 *	Attempt to compile the given regexp pattern.  If the compiled
815 *	regular expression can be found in the per-thread cache, it
816 *	will be used instead of compiling a new copy.
817 *
818 * Results:
819 *	The return value is a pointer to a newly allocated TclRegexp
820 *	that represents the compiled pattern, or NULL if the pattern
821 *	could not be compiled.  If NULL is returned, an error message is
822 *	left in the interp's result.
823 *
824 * Side effects:
825 *	The thread-local regexp cache is updated and a new TclRegexp may
826 *	be allocated.
827 *
828 *----------------------------------------------------------------------
829 */
830
831static TclRegexp *
832CompileRegexp(interp, string, length, flags)
833    Tcl_Interp *interp;		/* Used for error reporting if not NULL. */
834    CONST char *string;		/* The regexp to compile (UTF-8). */
835    int length;			/* The length of the string in bytes. */
836    int flags;			/* Compilation flags. */
837{
838    TclRegexp *regexpPtr;
839    CONST Tcl_UniChar *uniString;
840    int numChars;
841    Tcl_DString stringBuf;
842    int status, i;
843    ThreadSpecificData *tsdPtr = TCL_TSD_INIT(&dataKey);
844
845    if (!tsdPtr->initialized) {
846	tsdPtr->initialized = 1;
847	Tcl_CreateThreadExitHandler(FinalizeRegexp, NULL);
848    }
849
850    /*
851     * This routine maintains a second-level regular expression cache in
852     * addition to the per-object regexp cache.  The per-thread cache is needed
853     * to handle the case where for various reasons the object is lost between
854     * invocations of the regexp command, but the literal pattern is the same.
855     */
856
857    /*
858     * Check the per-thread compiled regexp cache.  We can only reuse
859     * a regexp if it has the same pattern and the same flags.
860     */
861
862    for (i = 0; (i < NUM_REGEXPS) && (tsdPtr->patterns[i] != NULL); i++) {
863	if ((length == tsdPtr->patLengths[i])
864		&& (tsdPtr->regexps[i]->flags == flags)
865		&& (strcmp(string, tsdPtr->patterns[i]) == 0)) {
866	    /*
867	     * Move the matched pattern to the first slot in the
868	     * cache and shift the other patterns down one position.
869	     */
870
871	    if (i != 0) {
872		int j;
873		char *cachedString;
874
875		cachedString = tsdPtr->patterns[i];
876		regexpPtr = tsdPtr->regexps[i];
877		for (j = i-1; j >= 0; j--) {
878		    tsdPtr->patterns[j+1] = tsdPtr->patterns[j];
879		    tsdPtr->patLengths[j+1] = tsdPtr->patLengths[j];
880		    tsdPtr->regexps[j+1] = tsdPtr->regexps[j];
881		}
882		tsdPtr->patterns[0] = cachedString;
883		tsdPtr->patLengths[0] = length;
884		tsdPtr->regexps[0] = regexpPtr;
885	    }
886	    return tsdPtr->regexps[0];
887	}
888    }
889
890    /*
891     * This is a new expression, so compile it and add it to the cache.
892     */
893
894    regexpPtr = (TclRegexp *) ckalloc(sizeof(TclRegexp));
895    regexpPtr->objPtr = NULL;
896    regexpPtr->string = NULL;
897    regexpPtr->details.rm_extend.rm_so = -1;
898    regexpPtr->details.rm_extend.rm_eo = -1;
899
900    /*
901     * Get the up-to-date string representation and map to unicode.
902     */
903
904    Tcl_DStringInit(&stringBuf);
905    uniString = Tcl_UtfToUniCharDString(string, length, &stringBuf);
906    numChars = Tcl_DStringLength(&stringBuf) / sizeof(Tcl_UniChar);
907
908    /*
909     * Compile the string and check for errors.
910     */
911
912    regexpPtr->flags = flags;
913    status = TclReComp(&regexpPtr->re, uniString, (size_t) numChars, flags);
914    Tcl_DStringFree(&stringBuf);
915
916    if (status != REG_OKAY) {
917	/*
918	 * Clean up and report errors in the interpreter, if possible.
919	 */
920
921	ckfree((char *)regexpPtr);
922	if (interp) {
923	    TclRegError(interp,
924		    "couldn't compile regular expression pattern: ",
925		    status);
926	}
927	return NULL;
928    }
929
930    /*
931     * Allocate enough space for all of the subexpressions, plus one
932     * extra for the entire pattern.
933     */
934
935    regexpPtr->matches = (regmatch_t *) ckalloc(
936	    sizeof(regmatch_t) * (regexpPtr->re.re_nsub + 1));
937
938    /*
939     * Initialize the refcount to one initially, since it is in the cache.
940     */
941
942    regexpPtr->refCount = 1;
943
944    /*
945     * Free the last regexp, if necessary, and make room at the head of the
946     * list for the new regexp.
947     */
948
949    if (tsdPtr->patterns[NUM_REGEXPS-1] != NULL) {
950	TclRegexp *oldRegexpPtr = tsdPtr->regexps[NUM_REGEXPS-1];
951	if (--(oldRegexpPtr->refCount) <= 0) {
952	    FreeRegexp(oldRegexpPtr);
953	}
954	ckfree(tsdPtr->patterns[NUM_REGEXPS-1]);
955    }
956    for (i = NUM_REGEXPS - 2; i >= 0; i--) {
957	tsdPtr->patterns[i+1] = tsdPtr->patterns[i];
958	tsdPtr->patLengths[i+1] = tsdPtr->patLengths[i];
959	tsdPtr->regexps[i+1] = tsdPtr->regexps[i];
960    }
961    tsdPtr->patterns[0] = (char *) ckalloc((unsigned) (length+1));
962    strcpy(tsdPtr->patterns[0], string);
963    tsdPtr->patLengths[0] = length;
964    tsdPtr->regexps[0] = regexpPtr;
965
966    return regexpPtr;
967}
968
969/*
970 *----------------------------------------------------------------------
971 *
972 * FreeRegexp --
973 *
974 *	Release the storage associated with a TclRegexp.
975 *
976 * Results:
977 *	None.
978 *
979 * Side effects:
980 *	None.
981 *
982 *----------------------------------------------------------------------
983 */
984
985static void
986FreeRegexp(regexpPtr)
987    TclRegexp *regexpPtr;	/* Compiled regular expression to free. */
988{
989    TclReFree(&regexpPtr->re);
990    if (regexpPtr->matches) {
991	ckfree((char *) regexpPtr->matches);
992    }
993    ckfree((char *) regexpPtr);
994}
995
996/*
997 *----------------------------------------------------------------------
998 *
999 * FinalizeRegexp --
1000 *
1001 *	Release the storage associated with the per-thread regexp
1002 *	cache.
1003 *
1004 * Results:
1005 *	None.
1006 *
1007 * Side effects:
1008 *	None.
1009 *
1010 *----------------------------------------------------------------------
1011 */
1012
1013static void
1014FinalizeRegexp(clientData)
1015    ClientData clientData;	/* Not used. */
1016{
1017    int i;
1018    TclRegexp *regexpPtr;
1019    ThreadSpecificData *tsdPtr = TCL_TSD_INIT(&dataKey);
1020
1021    for (i = 0; (i < NUM_REGEXPS) && (tsdPtr->patterns[i] != NULL); i++) {
1022	regexpPtr = tsdPtr->regexps[i];
1023	if (--(regexpPtr->refCount) <= 0) {
1024	    FreeRegexp(regexpPtr);
1025	}
1026	ckfree(tsdPtr->patterns[i]);
1027	tsdPtr->patterns[i] = NULL;
1028    }
1029    /*
1030     * We may find ourselves reinitialized if another finalization routine
1031     * invokes regexps.
1032     */
1033    tsdPtr->initialized = 0;
1034}
1035