1/*
2 * tclUtf.c --
3 *
4 *	Routines for manipulating UTF-8 strings.
5 *
6 * Copyright (c) 1997-1998 Sun Microsystems, Inc.
7 *
8 * See the file "license.terms" for information on usage and redistribution of
9 * this file, and for a DISCLAIMER OF ALL WARRANTIES.
10 *
11 * RCS: @(#) $Id: tclUtf.c,v 1.37 2005/10/31 15:59:41 dkf Exp $
12 */
13
14#include "tclInt.h"
15
16/*
17 * Include the static character classification tables and macros.
18 */
19
20#include "tclUniData.c"
21
/*
 * The following macros are used for fast character category tests. The x_BITS
 * values are shifted right by the category value to determine whether the
 * given category is included in the set.
 *
 * Each x_BITS macro is a bit mask in which bit N is set when the character
 * category whose numeric value is N belongs to the set. The category
 * constants (UPPERCASE_LETTER and friends) are not defined in this part of
 * the file; presumably they come from the included table file.
 */

/*
 * Letters: upper case, lower case, title case, modifier and other letters.
 */

#define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \
	| (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1<<OTHER_LETTER))

/*
 * Decimal digits only; letter numbers and other numbers are not included.
 */

#define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)

/*
 * Space, line and paragraph separators.
 */

#define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \
	| (1 << PARAGRAPH_SEPARATOR))

/*
 * Connector punctuation only.
 */

#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION)

/*
 * Everything in ALPHA_BITS, DIGIT_BITS and SPACE_BITS plus the mark, letter
 * number, other number, punctuation and symbol categories listed below.
 */

#define PRINT_BITS (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | \
	(1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \
	(1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \
	(1 << OTHER_NUMBER) | (1 << CONNECTOR_PUNCTUATION) | \
	(1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
	(1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
	(1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION) | \
	(1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \
	(1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL))

/*
 * The seven punctuation categories: connector, dash, open, close, initial
 * quote, final quote and other punctuation.
 */

#define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \
	(1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
	(1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
	(1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION))
52
/*
 * Unicode characters less than this value are represented by themselves in
 * UTF-8 strings. The one exception in this file is the character 0: the
 * encoder only stores values strictly between 0 and UNICODE_SELF as a single
 * byte, so 0 is written as a two-byte sequence instead.
 */

#define UNICODE_SELF	0x80
59
/*
 * The following structures are used when mapping between Unicode (UCS-2) and
 * UTF-8.
 *
 * totalBytes is indexed by the first byte of a UTF-8 sequence and gives the
 * number of bytes that sequence is expected to occupy:
 *
 *	0x00 - 0xBF	1  (single bytes, including naked trail bytes)
 *	0xC0 - 0xDF	2
 *	0xE0 - 0xEF	3
 *	0xF0 - 0xF7	4 when TCL_UTF_MAX > 3, otherwise 1
 *	0xF8 - 0xFB	5 when TCL_UTF_MAX > 4, otherwise 1
 *	0xFC - 0xFF	6 when TCL_UTF_MAX > 5, otherwise 1
 *
 * A lead byte for a sequence longer than the build supports is therefore
 * treated as a one-byte character.
 */

static CONST unsigned char totalBytes[256] = {
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
#if TCL_UTF_MAX > 3
    4,4,4,4,4,4,4,4,
#else
    1,1,1,1,1,1,1,1,
#endif
#if TCL_UTF_MAX > 4
    5,5,5,5,
#else
    1,1,1,1,
#endif
#if TCL_UTF_MAX > 5
    6,6,6,6
#else
    1,1,1,1
#endif
};
90
91/*
92 * Functions used only in this module.
93 */
94
95static int		UtfCount(int ch);
96
97/*
98 *---------------------------------------------------------------------------
99 *
100 * UtfCount --
101 *
102 *	Find the number of bytes in the Utf character "ch".
103 *
104 * Results:
105 *	The return values is the number of bytes in the Utf character "ch".
106 *
107 * Side effects:
108 *	None.
109 *
110 *---------------------------------------------------------------------------
111 */
112
113INLINE static int
114UtfCount(
115    int ch)			/* The Tcl_UniChar whose size is returned. */
116{
117    if ((ch > 0) && (ch < UNICODE_SELF)) {
118	return 1;
119    }
120    if (ch <= 0x7FF) {
121	return 2;
122    }
123    if (ch <= 0xFFFF) {
124	return 3;
125    }
126#if TCL_UTF_MAX > 3
127    if (ch <= 0x1FFFFF) {
128	return 4;
129    }
130    if (ch <= 0x3FFFFFF) {
131	return 5;
132    }
133    if (ch <= 0x7FFFFFFF) {
134	return 6;
135    }
136#endif
137    return 3;
138}
139
140/*
141 *---------------------------------------------------------------------------
142 *
143 * Tcl_UniCharToUtf --
144 *
145 *	Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the
146 *	provided buffer. Equivalent to Plan 9 runetochar().
147 *
148 * Results:
149 *	The return values is the number of bytes in the buffer that were
150 *	consumed.
151 *
152 * Side effects:
153 *	None.
154 *
155 *---------------------------------------------------------------------------
156 */
157
158INLINE int
159Tcl_UniCharToUtf(
160    int ch,			/* The Tcl_UniChar to be stored in the
161				 * buffer. */
162    char *buf)			/* Buffer in which the UTF-8 representation of
163				 * the Tcl_UniChar is stored. Buffer must be
164				 * large enough to hold the UTF-8 character
165				 * (at most TCL_UTF_MAX bytes). */
166{
167    if ((ch > 0) && (ch < UNICODE_SELF)) {
168	buf[0] = (char) ch;
169	return 1;
170    }
171    if (ch >= 0) {
172	if (ch <= 0x7FF) {
173	    buf[1] = (char) ((ch | 0x80) & 0xBF);
174	    buf[0] = (char) ((ch >> 6) | 0xC0);
175	    return 2;
176	}
177	if (ch <= 0xFFFF) {
178	three:
179	    buf[2] = (char) ((ch | 0x80) & 0xBF);
180	    buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
181	    buf[0] = (char) ((ch >> 12) | 0xE0);
182	    return 3;
183	}
184
185#if TCL_UTF_MAX > 3
186	if (ch <= 0x1FFFFF) {
187	    buf[3] = (char) ((ch | 0x80) & 0xBF);
188	    buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
189	    buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
190	    buf[0] = (char) ((ch >> 18) | 0xF0);
191	    return 4;
192	}
193	if (ch <= 0x3FFFFFF) {
194	    buf[4] = (char) ((ch | 0x80) & 0xBF);
195	    buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
196	    buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
197	    buf[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
198	    buf[0] = (char) ((ch >> 24) | 0xF8);
199	    return 5;
200	}
201	if (ch <= 0x7FFFFFFF) {
202	    buf[5] = (char) ((ch | 0x80) & 0xBF);
203	    buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
204	    buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
205	    buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
206	    buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
207	    buf[0] = (char) ((ch >> 30) | 0xFC);
208	    return 6;
209	}
210#endif
211    }
212
213    ch = 0xFFFD;
214    goto three;
215}
216
217/*
218 *---------------------------------------------------------------------------
219 *
220 * Tcl_UniCharToUtfDString --
221 *
222 *	Convert the given Unicode string to UTF-8.
223 *
224 * Results:
225 *	The return value is a pointer to the UTF-8 representation of the
226 *	Unicode string. Storage for the return value is appended to the end of
227 *	dsPtr.
228 *
229 * Side effects:
230 *	None.
231 *
232 *---------------------------------------------------------------------------
233 */
234
235char *
236Tcl_UniCharToUtfDString(
237    CONST Tcl_UniChar *uniStr,	/* Unicode string to convert to UTF-8. */
238    int uniLength,		/* Length of Unicode string in Tcl_UniChars
239				 * (must be >= 0). */
240    Tcl_DString *dsPtr)		/* UTF-8 representation of string is appended
241				 * to this previously initialized DString. */
242{
243    CONST Tcl_UniChar *w, *wEnd;
244    char *p, *string;
245    int oldLength;
246
247    /*
248     * UTF-8 string length in bytes will be <= Unicode string length *
249     * TCL_UTF_MAX.
250     */
251
252    oldLength = Tcl_DStringLength(dsPtr);
253    Tcl_DStringSetLength(dsPtr, (oldLength + uniLength + 1) * TCL_UTF_MAX);
254    string = Tcl_DStringValue(dsPtr) + oldLength;
255
256    p = string;
257    wEnd = uniStr + uniLength;
258    for (w = uniStr; w < wEnd; ) {
259	p += Tcl_UniCharToUtf(*w, p);
260	w++;
261    }
262    Tcl_DStringSetLength(dsPtr, oldLength + (p - string));
263
264    return string;
265}
266
/*
 *---------------------------------------------------------------------------
 *
 * Tcl_UtfToUniChar --
 *
 *	Extract the Tcl_UniChar represented by the UTF-8 string. Bad UTF-8
 *	sequences are converted to valid Tcl_UniChars and processing
 *	continues. Equivalent to Plan 9 chartorune().
 *
 *	A lead byte that is not followed by the trail bytes it calls for is
 *	returned as a character by itself and only that one byte is consumed.
 *	Overlong forms are not rejected; for example the two bytes \xC0\x80
 *	decode to the character 0.
 *
 *	The caller must ensure that the source buffer is long enough that this
 *	routine does not run off the end and dereference non-existent memory
 *	looking for trail bytes. If the source buffer is known to be '\0'
 *	terminated, this cannot happen. Otherwise, the caller should call
 *	Tcl_UtfCharComplete() before calling this routine to ensure that
 *	enough bytes remain in the string.
 *
 * Results:
 *	*chPtr is filled with the Tcl_UniChar, and the return value is the
 *	number of bytes from the UTF-8 string that were consumed.
 *
 * Side effects:
 *	None.
 *
 *---------------------------------------------------------------------------
 */

int
Tcl_UtfToUniChar(
    register CONST char *src,	/* The UTF-8 string. */
    register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by
				 * the UTF-8 string. */
{
    register int byte;

    /*
     * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.
     */

    byte = *((unsigned char *) src);
    if (byte < 0xC0) {
	/*
	 * Handles properly formed UTF-8 characters between 0x01 and 0x7F.
	 * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid
	 * characters representing themselves.
	 */

	*chPtr = (Tcl_UniChar) byte;
	return 1;
    } else if (byte < 0xE0) {
	/*
	 * Lead byte 0xC0..0xDF: five payload bits here, six in the trail
	 * byte.
	 */

	if ((src[1] & 0xC0) == 0x80) {
	    /*
	     * Two-byte-character lead-byte followed by a trail-byte.
	     */

	    *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (src[1] & 0x3F));
	    return 2;
	}

	/*
	 * A two-byte-character lead-byte not followed by trail-byte
	 * represents itself.
	 */

	*chPtr = (Tcl_UniChar) byte;
	return 1;
    } else if (byte < 0xF0) {
	/*
	 * Lead byte 0xE0..0xEF: four payload bits here, six in each of the
	 * two trail bytes. src[2] is only examined when src[1] is a trail
	 * byte.
	 */

	if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) {
	    /*
	     * Three-byte-character lead byte followed by two trail bytes.
	     */

	    *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)
		    | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
	    return 3;
	}

	/*
	 * A three-byte-character lead-byte not followed by two trail-bytes
	 * represents itself.
	 */

	*chPtr = (Tcl_UniChar) byte;
	return 1;
    }
#if TCL_UTF_MAX > 3
    {
	int ch, total, trail;

	/*
	 * Lead bytes 0xF0 and above. The table gives the expected sequence
	 * length; an entry of 1 means this build does not decode a sequence
	 * that long, and the byte falls through to the final return below.
	 */

	total = totalBytes[byte];
	trail = total - 1;
	if (trail > 0) {
	    /*
	     * Keep only the payload bits of the lead byte; the longer the
	     * sequence, the fewer of them there are.
	     */

	    ch = byte & (0x3F >> trail);
	    do {
		src++;
		if ((*src & 0xC0) != 0x80) {
		    /*
		     * Not a trail byte: the lead byte represents itself.
		     */

		    *chPtr = byte;
		    return 1;
		}
		ch <<= 6;
		ch |= (*src & 0x3F);
		trail--;
	    } while (trail > 0);
	    *chPtr = ch;
	    return total;
	}
    }
#endif

    /*
     * A lead byte for a sequence longer than this build decodes represents
     * itself.
     */

    *chPtr = (Tcl_UniChar) byte;
    return 1;
}
378
379/*
380 *---------------------------------------------------------------------------
381 *
382 * Tcl_UtfToUniCharDString --
383 *
384 *	Convert the UTF-8 string to Unicode.
385 *
386 * Results:
387 *	The return value is a pointer to the Unicode representation of the
388 *	UTF-8 string. Storage for the return value is appended to the end of
389 *	dsPtr. The Unicode string is terminated with a Unicode NULL character.
390 *
391 * Side effects:
392 *	None.
393 *
394 *---------------------------------------------------------------------------
395 */
396
397Tcl_UniChar *
398Tcl_UtfToUniCharDString(
399    CONST char *src,		/* UTF-8 string to convert to Unicode. */
400    int length,			/* Length of UTF-8 string in bytes, or -1 for
401				 * strlen(). */
402    Tcl_DString *dsPtr)		/* Unicode representation of string is
403				 * appended to this previously initialized
404				 * DString. */
405{
406    Tcl_UniChar *w, *wString;
407    CONST char *p, *end;
408    int oldLength;
409
410    if (length < 0) {
411	length = strlen(src);
412    }
413
414    /*
415     * Unicode string length in Tcl_UniChars will be <= UTF-8 string length in
416     * bytes.
417     */
418
419    oldLength = Tcl_DStringLength(dsPtr);
420    Tcl_DStringSetLength(dsPtr,
421	    (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar)));
422    wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);
423
424    w = wString;
425    end = src + length;
426    for (p = src; p < end; ) {
427	p += TclUtfToUniChar(p, w);
428	w++;
429    }
430    *w = '\0';
431    Tcl_DStringSetLength(dsPtr,
432	    (oldLength + ((char *) w - (char *) wString)));
433
434    return wString;
435}
436
437/*
438 *---------------------------------------------------------------------------
439 *
440 * Tcl_UtfCharComplete --
441 *
442 *	Determine if the UTF-8 string of the given length is long enough to be
443 *	decoded by Tcl_UtfToUniChar(). This does not ensure that the UTF-8
444 *	string is properly formed. Equivalent to Plan 9 fullrune().
445 *
446 * Results:
447 *	The return value is 0 if the string is not long enough, non-zero
448 *	otherwise.
449 *
450 * Side effects:
451 *	None.
452 *
453 *---------------------------------------------------------------------------
454 */
455
456int
457Tcl_UtfCharComplete(
458    CONST char *src,		/* String to check if first few bytes contain
459				 * a complete UTF-8 character. */
460    int length)			/* Length of above string in bytes. */
461{
462    int ch;
463
464    ch = *((unsigned char *) src);
465    return length >= totalBytes[ch];
466}
467
468/*
469 *---------------------------------------------------------------------------
470 *
471 * Tcl_NumUtfChars --
472 *
473 *	Returns the number of characters (not bytes) in the UTF-8 string, not
474 *	including the terminating NULL byte. This is equivalent to Plan 9
475 *	utflen() and utfnlen().
476 *
477 * Results:
478 *	As above.
479 *
480 * Side effects:
481 *	None.
482 *
483 *---------------------------------------------------------------------------
484 */
485
486int
487Tcl_NumUtfChars(
488    register CONST char *src,	/* The UTF-8 string to measure. */
489    int length)			/* The length of the string in bytes, or -1
490				 * for strlen(string). */
491{
492    Tcl_UniChar ch;
493    register Tcl_UniChar *chPtr = &ch;
494    register int i;
495
496    /*
497     * The separate implementations are faster.
498     *
499     * Since this is a time-sensitive function, we also do the check for the
500     * single-byte char case specially.
501     */
502
503    i = 0;
504    if (length < 0) {
505	while (*src != '\0') {
506	    src += TclUtfToUniChar(src, chPtr);
507	    i++;
508	}
509    } else {
510	register int n;
511
512	while (length > 0) {
513	    if (UCHAR(*src) < 0xC0) {
514		length--;
515		src++;
516	    } else {
517		n = Tcl_UtfToUniChar(src, chPtr);
518		length -= n;
519		src += n;
520	    }
521	    i++;
522	}
523    }
524    return i;
525}
526
527/*
528 *---------------------------------------------------------------------------
529 *
530 * Tcl_UtfFindFirst --
531 *
532 *	Returns a pointer to the first occurance of the given Tcl_UniChar in
533 *	the NULL-terminated UTF-8 string. The NULL terminator is considered
534 *	part of the UTF-8 string. Equivalent to Plan 9 utfrune().
535 *
536 * Results:
537 *	As above. If the Tcl_UniChar does not exist in the given string, the
538 *	return value is NULL.
539 *
540 * Side effects:
541 *	None.
542 *
543 *---------------------------------------------------------------------------
544 */
545
546CONST char *
547Tcl_UtfFindFirst(
548    CONST char *src,		/* The UTF-8 string to be searched. */
549    int ch)			/* The Tcl_UniChar to search for. */
550{
551    int len;
552    Tcl_UniChar find;
553
554    while (1) {
555	len = TclUtfToUniChar(src, &find);
556	if (find == ch) {
557	    return src;
558	}
559	if (*src == '\0') {
560	    return NULL;
561	}
562	src += len;
563    }
564}
565
566/*
567 *---------------------------------------------------------------------------
568 *
569 * Tcl_UtfFindLast --
570 *
571 *	Returns a pointer to the last occurance of the given Tcl_UniChar in
572 *	the NULL-terminated UTF-8 string. The NULL terminator is considered
573 *	part of the UTF-8 string. Equivalent to Plan 9 utfrrune().
574 *
575 * Results:
576 *	As above. If the Tcl_UniChar does not exist in the given string, the
577 *	return value is NULL.
578 *
579 * Side effects:
580 *	None.
581 *
582 *---------------------------------------------------------------------------
583 */
584
585CONST char *
586Tcl_UtfFindLast(
587    CONST char *src,		/* The UTF-8 string to be searched. */
588    int ch)			/* The Tcl_UniChar to search for. */
589{
590    int len;
591    Tcl_UniChar find;
592    CONST char *last;
593
594    last = NULL;
595    while (1) {
596	len = TclUtfToUniChar(src, &find);
597	if (find == ch) {
598	    last = src;
599	}
600	if (*src == '\0') {
601	    break;
602	}
603	src += len;
604    }
605    return last;
606}
607
608/*
609 *---------------------------------------------------------------------------
610 *
611 * Tcl_UtfNext --
612 *
613 *	Given a pointer to some current location in a UTF-8 string, move
614 *	forward one character. The caller must ensure that they are not asking
615 *	for the next character after the last character in the string.
616 *
617 * Results:
618 *	The return value is the pointer to the next character in the UTF-8
619 *	string.
620 *
621 * Side effects:
622 *	None.
623 *
624 *---------------------------------------------------------------------------
625 */
626
627CONST char *
628Tcl_UtfNext(
629    CONST char *src)		/* The current location in the string. */
630{
631    Tcl_UniChar ch;
632
633    return src + TclUtfToUniChar(src, &ch);
634}
635
/*
 *---------------------------------------------------------------------------
 *
 * Tcl_UtfPrev --
 *
 *	Given a pointer to some current location in a UTF-8 string, move
 *	backwards one character. This works correctly when the pointer is in
 *	the middle of a UTF-8 character.
 *
 *	The search looks back over at most TCL_UTF_MAX bytes for a lead byte
 *	(0xC0 or above). If one is found its address is returned; otherwise
 *	the result is the byte immediately before the current location.
 *
 * Results:
 *	The return value is a pointer to the previous character in the UTF-8
 *	string. If the current location was already at the beginning of the
 *	string, the return value will also be a pointer to the beginning of
 *	the string.
 *
 * Side effects:
 *	None.
 *
 *---------------------------------------------------------------------------
 */

CONST char *
Tcl_UtfPrev(
    CONST char *src,		/* The current location in the string. */
    CONST char *start)		/* Pointer to the beginning of the string, to
				 * avoid going backwards too far. */
{
    CONST char *look;
    int i, byte;

    /*
     * From here on src is the fallback answer (one byte back), and look is
     * the position being examined while searching for a lead byte.
     */

    src--;
    look = src;
    for (i = 0; i < TCL_UTF_MAX; i++) {
	if (look < start) {
	    /*
	     * Ran off the front of the string. If even the fallback lies
	     * before the start, clamp it to the start.
	     */

	    if (src < start) {
		src = start;
	    }
	    break;
	}
	byte = *((unsigned char *) look);
	if (byte < 0x80) {
	    /*
	     * A single-byte character ends the search; any trail bytes seen
	     * so far had no lead byte, so fall back to one byte back.
	     */

	    break;
	}
	if (byte >= 0xC0) {
	    /*
	     * Found a lead byte.
	     * NOTE(review): the sequence length this lead byte calls for is
	     * not compared with the distance back to the starting point.
	     */

	    return look;
	}

	/*
	 * A trail byte (0x80..0xBF): keep looking further back.
	 */

	look--;
    }
    return src;
}
686
687/*
688 *---------------------------------------------------------------------------
689 *
690 * Tcl_UniCharAtIndex --
691 *
692 *	Returns the Unicode character represented at the specified character
693 *	(not byte) position in the UTF-8 string.
694 *
695 * Results:
696 *	As above.
697 *
698 * Side effects:
699 *	None.
700 *
701 *---------------------------------------------------------------------------
702 */
703
704Tcl_UniChar
705Tcl_UniCharAtIndex(
706    register CONST char *src,	/* The UTF-8 string to dereference. */
707    register int index)		/* The position of the desired character. */
708{
709    Tcl_UniChar ch;
710
711    while (index >= 0) {
712	index--;
713	src += TclUtfToUniChar(src, &ch);
714    }
715    return ch;
716}
717
718/*
719 *---------------------------------------------------------------------------
720 *
721 * Tcl_UtfAtIndex --
722 *
723 *	Returns a pointer to the specified character (not byte) position in
724 *	the UTF-8 string.
725 *
726 * Results:
727 *	As above.
728 *
729 * Side effects:
730 *	None.
731 *
732 *---------------------------------------------------------------------------
733 */
734
735CONST char *
736Tcl_UtfAtIndex(
737    register CONST char *src,	/* The UTF-8 string. */
738    register int index)		/* The position of the desired character. */
739{
740    Tcl_UniChar ch;
741
742    while (index > 0) {
743	index--;
744	src += TclUtfToUniChar(src, &ch);
745    }
746    return src;
747}
748
749/*
750 *---------------------------------------------------------------------------
751 *
752 * Tcl_UtfBackslash --
753 *
754 *	Figure out how to handle a backslash sequence.
755 *
756 * Results:
757 *	Stores the bytes represented by the backslash sequence in dst and
758 *	returns the number of bytes written to dst. At most TCL_UTF_MAX bytes
759 *	are written to dst; dst must have been large enough to accept those
760 *	bytes. If readPtr isn't NULL then it is filled in with a count of the
761 *	number of bytes in the backslash sequence.
762 *
763 * Side effects:
764 *	The maximum number of bytes it takes to represent a Unicode character
765 *	in UTF-8 is guaranteed to be less than the number of bytes used to
766 *	express the backslash sequence that represents that Unicode character.
767 *	If the target buffer into which the caller is going to store the bytes
768 *	that represent the Unicode character is at least as large as the
769 *	source buffer from which the backslashed sequence was extracted, no
770 *	buffer overruns should occur.
771 *
772 *---------------------------------------------------------------------------
773 */
774
775int
776Tcl_UtfBackslash(
777    CONST char *src,		/* Points to the backslash character of a
778				 * backslash sequence. */
779    int *readPtr,		/* Fill in with number of characters read from
780				 * src, unless NULL. */
781    char *dst)			/* Filled with the bytes represented by the
782				 * backslash sequence. */
783{
784#define LINE_LENGTH 128
785    int numRead;
786    int result;
787
788    result = TclParseBackslash(src, LINE_LENGTH, &numRead, dst);
789    if (numRead == LINE_LENGTH) {
790	/*
791	 * We ate a whole line. Pay the price of a strlen()
792	 */
793
794	result = TclParseBackslash(src, (int)strlen(src), &numRead, dst);
795    }
796    if (readPtr != NULL) {
797	*readPtr = numRead;
798    }
799    return result;
800}
801
802/*
803 *----------------------------------------------------------------------
804 *
805 * Tcl_UtfToUpper --
806 *
807 *	Convert lowercase characters to uppercase characters in a UTF string
808 *	in place. The conversion may shrink the UTF string.
809 *
810 * Results:
811 *	Returns the number of bytes in the resulting string excluding the
812 *	trailing null.
813 *
814 * Side effects:
815 *	Writes a terminating null after the last converted character.
816 *
817 *----------------------------------------------------------------------
818 */
819
820int
821Tcl_UtfToUpper(
822    char *str)			/* String to convert in place. */
823{
824    Tcl_UniChar ch, upChar;
825    char *src, *dst;
826    int bytes;
827
828    /*
829     * Iterate over the string until we hit the terminating null.
830     */
831
832    src = dst = str;
833    while (*src) {
834	bytes = TclUtfToUniChar(src, &ch);
835	upChar = Tcl_UniCharToUpper(ch);
836
837	/*
838	 * To keep badly formed Utf strings from getting inflated by the
839	 * conversion (thereby causing a segfault), only copy the upper case
840	 * char to dst if its size is <= the original char.
841	 */
842
843	if (bytes < UtfCount(upChar)) {
844	    memcpy(dst, src, (size_t) bytes);
845	    dst += bytes;
846	} else {
847	    dst += Tcl_UniCharToUtf(upChar, dst);
848	}
849	src += bytes;
850    }
851    *dst = '\0';
852    return (dst - str);
853}
854
855/*
856 *----------------------------------------------------------------------
857 *
858 * Tcl_UtfToLower --
859 *
860 *	Convert uppercase characters to lowercase characters in a UTF string
861 *	in place. The conversion may shrink the UTF string.
862 *
863 * Results:
864 *	Returns the number of bytes in the resulting string excluding the
865 *	trailing null.
866 *
867 * Side effects:
868 *	Writes a terminating null after the last converted character.
869 *
870 *----------------------------------------------------------------------
871 */
872
873int
874Tcl_UtfToLower(
875    char *str)			/* String to convert in place. */
876{
877    Tcl_UniChar ch, lowChar;
878    char *src, *dst;
879    int bytes;
880
881    /*
882     * Iterate over the string until we hit the terminating null.
883     */
884
885    src = dst = str;
886    while (*src) {
887	bytes = TclUtfToUniChar(src, &ch);
888	lowChar = Tcl_UniCharToLower(ch);
889
890	/*
891	 * To keep badly formed Utf strings from getting inflated by the
892	 * conversion (thereby causing a segfault), only copy the lower case
893	 * char to dst if its size is <= the original char.
894	 */
895
896	if (bytes < UtfCount(lowChar)) {
897	    memcpy(dst, src, (size_t) bytes);
898	    dst += bytes;
899	} else {
900	    dst += Tcl_UniCharToUtf(lowChar, dst);
901	}
902	src += bytes;
903    }
904    *dst = '\0';
905    return (dst - str);
906}
907
908/*
909 *----------------------------------------------------------------------
910 *
911 * Tcl_UtfToTitle --
912 *
913 *	Changes the first character of a UTF string to title case or uppercase
914 *	and the rest of the string to lowercase. The conversion happens in
915 *	place and may shrink the UTF string.
916 *
917 * Results:
918 *	Returns the number of bytes in the resulting string excluding the
919 *	trailing null.
920 *
921 * Side effects:
922 *	Writes a terminating null after the last converted character.
923 *
924 *----------------------------------------------------------------------
925 */
926
927int
928Tcl_UtfToTitle(
929    char *str)			/* String to convert in place. */
930{
931    Tcl_UniChar ch, titleChar, lowChar;
932    char *src, *dst;
933    int bytes;
934
935    /*
936     * Capitalize the first character and then lowercase the rest of the
937     * characters until we get to a null.
938     */
939
940    src = dst = str;
941
942    if (*src) {
943	bytes = TclUtfToUniChar(src, &ch);
944	titleChar = Tcl_UniCharToTitle(ch);
945
946	if (bytes < UtfCount(titleChar)) {
947	    memcpy(dst, src, (size_t) bytes);
948	    dst += bytes;
949	} else {
950	    dst += Tcl_UniCharToUtf(titleChar, dst);
951	}
952	src += bytes;
953    }
954    while (*src) {
955	bytes = TclUtfToUniChar(src, &ch);
956	lowChar = Tcl_UniCharToLower(ch);
957
958	if (bytes < UtfCount(lowChar)) {
959	    memcpy(dst, src, (size_t) bytes);
960	    dst += bytes;
961	} else {
962	    dst += Tcl_UniCharToUtf(lowChar, dst);
963	}
964	src += bytes;
965    }
966    *dst = '\0';
967    return (dst - str);
968}
969
970/*
971 *----------------------------------------------------------------------
972 *
973 * TclpUtfNcmp2 --
974 *
975 *	Compare at most numBytes bytes of utf-8 strings cs and ct. Both cs and
976 *	ct are assumed to be at least numBytes bytes long.
977 *
978 * Results:
979 *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
980 *
981 * Side effects:
982 *	None.
983 *
984 *----------------------------------------------------------------------
985 */
986
987int
988TclpUtfNcmp2(
989    CONST char *cs,		/* UTF string to compare to ct. */
990    CONST char *ct,		/* UTF string cs is compared to. */
991    unsigned long numBytes)	/* Number of *bytes* to compare. */
992{
993    /*
994     * We can't simply call 'memcmp(cs, ct, numBytes);' because we need to
995     * check for Tcl's \xC0\x80 non-utf-8 null encoding. Otherwise utf-8 lexes
996     * fine in the strcmp manner.
997     */
998
999    register int result = 0;
1000
1001    for ( ; numBytes != 0; numBytes--, cs++, ct++) {
1002	if (*cs != *ct) {
1003	    result = UCHAR(*cs) - UCHAR(*ct);
1004	    break;
1005	}
1006    }
1007    if (numBytes && ((UCHAR(*cs) == 0xC0) || (UCHAR(*ct) == 0xC0))) {
1008	unsigned char c1, c2;
1009
1010	c1 = ((UCHAR(*cs) == 0xC0) && (UCHAR(cs[1]) == 0x80)) ? 0 : UCHAR(*cs);
1011	c2 = ((UCHAR(*ct) == 0xC0) && (UCHAR(ct[1]) == 0x80)) ? 0 : UCHAR(*ct);
1012	result = (c1 - c2);
1013    }
1014    return result;
1015}
1016
1017/*
1018 *----------------------------------------------------------------------
1019 *
1020 * Tcl_UtfNcmp --
1021 *
1022 *	Compare at most numChars UTF chars of string cs to string ct. Both cs
1023 *	and ct are assumed to be at least numChars UTF chars long.
1024 *
1025 * Results:
1026 *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
1027 *
1028 * Side effects:
1029 *	None.
1030 *
1031 *----------------------------------------------------------------------
1032 */
1033
1034int
1035Tcl_UtfNcmp(
1036    CONST char *cs,		/* UTF string to compare to ct. */
1037    CONST char *ct,		/* UTF string cs is compared to. */
1038    unsigned long numChars)	/* Number of UTF chars to compare. */
1039{
1040    Tcl_UniChar ch1, ch2;
1041
1042    /*
1043     * Cannot use 'memcmp(cs, ct, n);' as byte representation of \u0000 (the
1044     * pair of bytes 0xc0,0x80) is larger than byte representation of \u0001
1045     * (the byte 0x01.)
1046     */
1047
1048    while (numChars-- > 0) {
1049	/*
1050	 * n must be interpreted as chars, not bytes. This should be called
1051	 * only when both strings are of at least n chars long (no need for \0
1052	 * check)
1053	 */
1054
1055	cs += TclUtfToUniChar(cs, &ch1);
1056	ct += TclUtfToUniChar(ct, &ch2);
1057	if (ch1 != ch2) {
1058	    return (ch1 - ch2);
1059	}
1060    }
1061    return 0;
1062}
1063
1064/*
1065 *----------------------------------------------------------------------
1066 *
1067 * Tcl_UtfNcasecmp --
1068 *
1069 *	Compare at most numChars UTF chars of string cs to string ct case
1070 *	insensitive. Both cs and ct are assumed to be at least numChars UTF
1071 *	chars long.
1072 *
1073 * Results:
1074 *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
1075 *
1076 * Side effects:
1077 *	None.
1078 *
1079 *----------------------------------------------------------------------
1080 */
1081
1082int
1083Tcl_UtfNcasecmp(
1084    CONST char *cs,		/* UTF string to compare to ct. */
1085    CONST char *ct,		/* UTF string cs is compared to. */
1086    unsigned long numChars)	/* Number of UTF chars to compare. */
1087{
1088    Tcl_UniChar ch1, ch2;
1089    while (numChars-- > 0) {
1090	/*
1091	 * n must be interpreted as chars, not bytes.
1092	 * This should be called only when both strings are of
1093	 * at least n chars long (no need for \0 check)
1094	 */
1095	cs += TclUtfToUniChar(cs, &ch1);
1096	ct += TclUtfToUniChar(ct, &ch2);
1097	if (ch1 != ch2) {
1098	    ch1 = Tcl_UniCharToLower(ch1);
1099	    ch2 = Tcl_UniCharToLower(ch2);
1100	    if (ch1 != ch2) {
1101		return (ch1 - ch2);
1102	    }
1103	}
1104    }
1105    return 0;
1106}
1107
1108/*
1109 *----------------------------------------------------------------------
1110 *
1111 * Tcl_UniCharToUpper --
1112 *
1113 *	Compute the uppercase equivalent of the given Unicode character.
1114 *
1115 * Results:
1116 *	Returns the uppercase Unicode character.
1117 *
1118 * Side effects:
1119 *	None.
1120 *
1121 *----------------------------------------------------------------------
1122 */
1123
1124Tcl_UniChar
1125Tcl_UniCharToUpper(
1126    int ch)			/* Unicode character to convert. */
1127{
1128    int info = GetUniCharInfo(ch);
1129
1130    if (GetCaseType(info) & 0x04) {
1131	return (Tcl_UniChar) (ch - GetDelta(info));
1132    } else {
1133	return ch;
1134    }
1135}
1136
1137/*
1138 *----------------------------------------------------------------------
1139 *
1140 * Tcl_UniCharToLower --
1141 *
1142 *	Compute the lowercase equivalent of the given Unicode character.
1143 *
1144 * Results:
1145 *	Returns the lowercase Unicode character.
1146 *
1147 * Side effects:
1148 *	None.
1149 *
1150 *----------------------------------------------------------------------
1151 */
1152
1153Tcl_UniChar
1154Tcl_UniCharToLower(
1155    int ch)			/* Unicode character to convert. */
1156{
1157    int info = GetUniCharInfo(ch);
1158
1159    if (GetCaseType(info) & 0x02) {
1160	return (Tcl_UniChar) (ch + GetDelta(info));
1161    } else {
1162	return ch;
1163    }
1164}
1165
1166/*
1167 *----------------------------------------------------------------------
1168 *
1169 * Tcl_UniCharToTitle --
1170 *
1171 *	Compute the titlecase equivalent of the given Unicode character.
1172 *
1173 * Results:
1174 *	Returns the titlecase Unicode character.
1175 *
1176 * Side effects:
1177 *	None.
1178 *
1179 *----------------------------------------------------------------------
1180 */
1181
1182Tcl_UniChar
1183Tcl_UniCharToTitle(
1184    int ch)			/* Unicode character to convert. */
1185{
1186    int info = GetUniCharInfo(ch);
1187    int mode = GetCaseType(info);
1188
1189    if (mode & 0x1) {
1190	/*
1191	 * Subtract or add one depending on the original case.
1192	 */
1193
1194	return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1));
1195    } else if (mode == 0x4) {
1196	return (Tcl_UniChar) (ch - GetDelta(info));
1197    } else {
1198	return ch;
1199    }
1200}
1201
1202/*
1203 *----------------------------------------------------------------------
1204 *
1205 * Tcl_UniCharLen --
1206 *
1207 *	Find the length of a UniChar string. The str input must be null
1208 *	terminated.
1209 *
1210 * Results:
1211 *	Returns the length of str in UniChars (not bytes).
1212 *
1213 * Side effects:
1214 *	None.
1215 *
1216 *----------------------------------------------------------------------
1217 */
1218
1219int
1220Tcl_UniCharLen(
1221    CONST Tcl_UniChar *uniStr)	/* Unicode string to find length of. */
1222{
1223    int len = 0;
1224
1225    while (*uniStr != '\0') {
1226	len++;
1227	uniStr++;
1228    }
1229    return len;
1230}
1231
1232/*
1233 *----------------------------------------------------------------------
1234 *
1235 * Tcl_UniCharNcmp --
1236 *
1237 *	Compare at most numChars unichars of string ucs to string uct.
1238 *	Both ucs and uct are assumed to be at least numChars unichars long.
1239 *
1240 * Results:
1241 *	Return <0 if ucs < uct, 0 if ucs == uct, or >0 if ucs > uct.
1242 *
1243 * Side effects:
1244 *	None.
1245 *
1246 *----------------------------------------------------------------------
1247 */
1248
1249int
1250Tcl_UniCharNcmp(
1251    CONST Tcl_UniChar *ucs,	/* Unicode string to compare to uct. */
1252    CONST Tcl_UniChar *uct,	/* Unicode string ucs is compared to. */
1253    unsigned long numChars)	/* Number of unichars to compare. */
1254{
1255#ifdef WORDS_BIGENDIAN
1256    /*
1257     * We are definitely on a big-endian machine; memcmp() is safe
1258     */
1259
1260    return memcmp(ucs, uct, numChars*sizeof(Tcl_UniChar));
1261
1262#else /* !WORDS_BIGENDIAN */
1263    /*
1264     * We can't simply call memcmp() because that is not lexically correct.
1265     */
1266
1267    for ( ; numChars != 0; ucs++, uct++, numChars--) {
1268	if (*ucs != *uct) {
1269	    return (*ucs - *uct);
1270	}
1271    }
1272    return 0;
1273#endif /* WORDS_BIGENDIAN */
1274}
1275
1276/*
1277 *----------------------------------------------------------------------
1278 *
1279 * Tcl_UniCharNcasecmp --
1280 *
1281 *	Compare at most numChars unichars of string ucs to string uct case
1282 *	insensitive. Both ucs and uct are assumed to be at least numChars
1283 *	unichars long.
1284 *
1285 * Results:
1286 *	Return <0 if ucs < uct, 0 if ucs == uct, or >0 if ucs > uct.
1287 *
1288 * Side effects:
1289 *	None.
1290 *
1291 *----------------------------------------------------------------------
1292 */
1293
1294int
1295Tcl_UniCharNcasecmp(
1296    CONST Tcl_UniChar *ucs,	/* Unicode string to compare to uct. */
1297    CONST Tcl_UniChar *uct,	/* Unicode string ucs is compared to. */
1298    unsigned long numChars)	/* Number of unichars to compare. */
1299{
1300    for ( ; numChars != 0; numChars--, ucs++, uct++) {
1301	if (*ucs != *uct) {
1302	    Tcl_UniChar lcs = Tcl_UniCharToLower(*ucs);
1303	    Tcl_UniChar lct = Tcl_UniCharToLower(*uct);
1304
1305	    if (lcs != lct) {
1306		return (lcs - lct);
1307	    }
1308	}
1309    }
1310    return 0;
1311}
1312
1313/*
1314 *----------------------------------------------------------------------
1315 *
1316 * Tcl_UniCharIsAlnum --
1317 *
1318 *	Test if a character is an alphanumeric Unicode character.
1319 *
1320 * Results:
1321 *	Returns 1 if character is alphanumeric.
1322 *
1323 * Side effects:
1324 *	None.
1325 *
1326 *----------------------------------------------------------------------
1327 */
1328
1329int
1330Tcl_UniCharIsAlnum(
1331    int ch)			/* Unicode character to test. */
1332{
1333    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1334
1335    return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1);
1336}
1337
1338/*
1339 *----------------------------------------------------------------------
1340 *
1341 * Tcl_UniCharIsAlpha --
1342 *
1343 *	Test if a character is an alphabetic Unicode character.
1344 *
1345 * Results:
1346 *	Returns 1 if character is alphabetic.
1347 *
1348 * Side effects:
1349 *	None.
1350 *
1351 *----------------------------------------------------------------------
1352 */
1353
1354int
1355Tcl_UniCharIsAlpha(
1356    int ch)			/* Unicode character to test. */
1357{
1358    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1359    return ((ALPHA_BITS >> category) & 1);
1360}
1361
1362/*
1363 *----------------------------------------------------------------------
1364 *
1365 * Tcl_UniCharIsControl --
1366 *
1367 *	Test if a character is a Unicode control character.
1368 *
1369 * Results:
1370 *	Returns non-zero if character is a control.
1371 *
1372 * Side effects:
1373 *	None.
1374 *
1375 *----------------------------------------------------------------------
1376 */
1377
1378int
1379Tcl_UniCharIsControl(
1380    int ch)			/* Unicode character to test. */
1381{
1382    return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL);
1383}
1384
1385/*
1386 *----------------------------------------------------------------------
1387 *
1388 * Tcl_UniCharIsDigit --
1389 *
1390 *	Test if a character is a numeric Unicode character.
1391 *
1392 * Results:
1393 *	Returns non-zero if character is a digit.
1394 *
1395 * Side effects:
1396 *	None.
1397 *
1398 *----------------------------------------------------------------------
1399 */
1400
1401int
1402Tcl_UniCharIsDigit(
1403    int ch)			/* Unicode character to test. */
1404{
1405    return (GetUniCharInfo(ch)&UNICODE_CATEGORY_MASK) == DECIMAL_DIGIT_NUMBER;
1406}
1407
1408/*
1409 *----------------------------------------------------------------------
1410 *
1411 * Tcl_UniCharIsGraph --
1412 *
1413 *	Test if a character is any Unicode print character except space.
1414 *
1415 * Results:
1416 *	Returns non-zero if character is printable, but not space.
1417 *
1418 * Side effects:
1419 *	None.
1420 *
1421 *----------------------------------------------------------------------
1422 */
1423
1424int
1425Tcl_UniCharIsGraph(
1426    int ch)			/* Unicode character to test. */
1427{
1428    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1429    return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' '));
1430}
1431
1432/*
1433 *----------------------------------------------------------------------
1434 *
1435 * Tcl_UniCharIsLower --
1436 *
1437 *	Test if a character is a lowercase Unicode character.
1438 *
1439 * Results:
1440 *	Returns non-zero if character is lowercase.
1441 *
1442 * Side effects:
1443 *	None.
1444 *
1445 *----------------------------------------------------------------------
1446 */
1447
1448int
1449Tcl_UniCharIsLower(
1450    int ch)			/* Unicode character to test. */
1451{
1452    return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER);
1453}
1454
1455/*
1456 *----------------------------------------------------------------------
1457 *
1458 * Tcl_UniCharIsPrint --
1459 *
1460 *	Test if a character is a Unicode print character.
1461 *
1462 * Results:
1463 *	Returns non-zero if character is printable.
1464 *
1465 * Side effects:
1466 *	None.
1467 *
1468 *----------------------------------------------------------------------
1469 */
1470
1471int
1472Tcl_UniCharIsPrint(
1473    int ch)			/* Unicode character to test. */
1474{
1475    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1476    return ((PRINT_BITS >> category) & 1);
1477}
1478
1479/*
1480 *----------------------------------------------------------------------
1481 *
1482 * Tcl_UniCharIsPunct --
1483 *
1484 *	Test if a character is a Unicode punctuation character.
1485 *
1486 * Results:
1487 *	Returns non-zero if character is punct.
1488 *
1489 * Side effects:
1490 *	None.
1491 *
1492 *----------------------------------------------------------------------
1493 */
1494
1495int
1496Tcl_UniCharIsPunct(
1497    int ch)			/* Unicode character to test. */
1498{
1499    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1500    return ((PUNCT_BITS >> category) & 1);
1501}
1502
1503/*
1504 *----------------------------------------------------------------------
1505 *
1506 * Tcl_UniCharIsSpace --
1507 *
1508 *	Test if a character is a whitespace Unicode character.
1509 *
1510 * Results:
1511 *	Returns non-zero if character is a space.
1512 *
1513 * Side effects:
1514 *	None.
1515 *
1516 *----------------------------------------------------------------------
1517 */
1518
1519int
1520Tcl_UniCharIsSpace(
1521    int ch)			/* Unicode character to test. */
1522{
1523    register int category;
1524
1525    /*
1526     * If the character is within the first 127 characters, just use the
1527     * standard C function, otherwise consult the Unicode table.
1528     */
1529
1530    if (ch < 0x80) {
1531	return isspace(UCHAR(ch)); /* INTL: ISO space */
1532    } else {
1533	category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1534	return ((SPACE_BITS >> category) & 1);
1535    }
1536}
1537
1538/*
1539 *----------------------------------------------------------------------
1540 *
1541 * Tcl_UniCharIsUpper --
1542 *
1543 *	Test if a character is a uppercase Unicode character.
1544 *
1545 * Results:
1546 *	Returns non-zero if character is uppercase.
1547 *
1548 * Side effects:
1549 *	None.
1550 *
1551 *----------------------------------------------------------------------
1552 */
1553
1554int
1555Tcl_UniCharIsUpper(
1556    int ch)			/* Unicode character to test. */
1557{
1558    return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER);
1559}
1560
1561/*
1562 *----------------------------------------------------------------------
1563 *
1564 * Tcl_UniCharIsWordChar --
1565 *
1566 *	Test if a character is alphanumeric or a connector punctuation mark.
1567 *
1568 * Results:
1569 *	Returns 1 if character is a word character.
1570 *
1571 * Side effects:
1572 *	None.
1573 *
1574 *----------------------------------------------------------------------
1575 */
1576
1577int
1578Tcl_UniCharIsWordChar(
1579    int ch)			/* Unicode character to test. */
1580{
1581    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1582
1583    return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1);
1584}
1585
1586/*
1587 *----------------------------------------------------------------------
1588 *
1589 * Tcl_UniCharCaseMatch --
1590 *
1591 *	See if a particular Unicode string matches a particular pattern.
1592 *	Allows case insensitivity. This is the Unicode equivalent of the char*
1593 *	Tcl_StringCaseMatch. The UniChar strings must be NULL-terminated.
1594 *	This has no provision for counted UniChar strings, thus should not be
1595 *	used where NULLs are expected in the UniChar string. Use
1596 *	TclUniCharMatch where possible.
1597 *
1598 * Results:
1599 *	The return value is 1 if string matches pattern, and 0 otherwise. The
1600 *	matching operation permits the following special characters in the
1601 *	pattern: *?\[] (see the manual entry for details on what these mean).
1602 *
1603 * Side effects:
1604 *	None.
1605 *
1606 *----------------------------------------------------------------------
1607 */
1608
1609int
1610Tcl_UniCharCaseMatch(
1611    CONST Tcl_UniChar *uniStr,	/* Unicode String. */
1612    CONST Tcl_UniChar *uniPattern,
1613				/* Pattern, which may contain special
1614				 * characters. */
1615    int nocase)			/* 0 for case sensitive, 1 for insensitive */
1616{
1617    Tcl_UniChar ch1, p;
1618
1619    while (1) {
1620	p = *uniPattern;
1621
1622	/*
1623	 * See if we're at the end of both the pattern and the string. If so,
1624	 * we succeeded. If we're at the end of the pattern but not at the end
1625	 * of the string, we failed.
1626	 */
1627
1628	if (p == 0) {
1629	    return (*uniStr == 0);
1630	}
1631	if ((*uniStr == 0) && (p != '*')) {
1632	    return 0;
1633	}
1634
1635	/*
1636	 * Check for a "*" as the next pattern character. It matches any
1637	 * substring. We handle this by skipping all the characters up to the
1638	 * next matching one in the pattern, and then calling ourselves
1639	 * recursively for each postfix of string, until either we match or we
1640	 * reach the end of the string.
1641	 */
1642
1643	if (p == '*') {
1644	    /*
1645	     * Skip all successive *'s in the pattern
1646	     */
1647
1648	    while (*(++uniPattern) == '*') {
1649		/* empty body */
1650	    }
1651	    p = *uniPattern;
1652	    if (p == 0) {
1653		return 1;
1654	    }
1655	    if (nocase) {
1656		p = Tcl_UniCharToLower(p);
1657	    }
1658	    while (1) {
1659		/*
1660		 * Optimization for matching - cruise through the string
1661		 * quickly if the next char in the pattern isn't a special
1662		 * character
1663		 */
1664
1665		if ((p != '[') && (p != '?') && (p != '\\')) {
1666		    if (nocase) {
1667			while (*uniStr && (p != *uniStr)
1668				&& (p != Tcl_UniCharToLower(*uniStr))) {
1669			    uniStr++;
1670			}
1671		    } else {
1672			while (*uniStr && (p != *uniStr)) {
1673			    uniStr++;
1674			}
1675		    }
1676		}
1677		if (Tcl_UniCharCaseMatch(uniStr, uniPattern, nocase)) {
1678		    return 1;
1679		}
1680		if (*uniStr == 0) {
1681		    return 0;
1682		}
1683		uniStr++;
1684	    }
1685	}
1686
1687	/*
1688	 * Check for a "?" as the next pattern character. It matches any
1689	 * single character.
1690	 */
1691
1692	if (p == '?') {
1693	    uniPattern++;
1694	    uniStr++;
1695	    continue;
1696	}
1697
1698	/*
1699	 * Check for a "[" as the next pattern character. It is followed by a
1700	 * list of characters that are acceptable, or by a range (two
1701	 * characters separated by "-").
1702	 */
1703
1704	if (p == '[') {
1705	    Tcl_UniChar startChar, endChar;
1706
1707	    uniPattern++;
1708	    ch1 = (nocase ? Tcl_UniCharToLower(*uniStr) : *uniStr);
1709	    uniStr++;
1710	    while (1) {
1711		if ((*uniPattern == ']') || (*uniPattern == 0)) {
1712		    return 0;
1713		}
1714		startChar = (nocase ? Tcl_UniCharToLower(*uniPattern)
1715			: *uniPattern);
1716		uniPattern++;
1717		if (*uniPattern == '-') {
1718		    uniPattern++;
1719		    if (*uniPattern == 0) {
1720			return 0;
1721		    }
1722		    endChar = (nocase ? Tcl_UniCharToLower(*uniPattern)
1723			    : *uniPattern);
1724		    uniPattern++;
1725		    if (((startChar <= ch1) && (ch1 <= endChar))
1726			    || ((endChar <= ch1) && (ch1 <= startChar))) {
1727			/*
1728			 * Matches ranges of form [a-z] or [z-a].
1729			 */
1730			break;
1731		    }
1732		} else if (startChar == ch1) {
1733		    break;
1734		}
1735	    }
1736	    while (*uniPattern != ']') {
1737		if (*uniPattern == 0) {
1738		    uniPattern--;
1739		    break;
1740		}
1741		uniPattern++;
1742	    }
1743	    uniPattern++;
1744	    continue;
1745	}
1746
1747	/*
1748	 * If the next pattern character is '\', just strip off the '\' so we
1749	 * do exact matching on the character that follows.
1750	 */
1751
1752	if (p == '\\') {
1753	    if (*(++uniPattern) == '\0') {
1754		return 0;
1755	    }
1756	}
1757
1758	/*
1759	 * There's no special character. Just make sure that the next bytes of
1760	 * each string match.
1761	 */
1762
1763	if (nocase) {
1764	    if (Tcl_UniCharToLower(*uniStr) !=
1765		    Tcl_UniCharToLower(*uniPattern)) {
1766		return 0;
1767	    }
1768	} else if (*uniStr != *uniPattern) {
1769	    return 0;
1770	}
1771	uniStr++;
1772	uniPattern++;
1773    }
1774}
1775
1776/*
1777 *----------------------------------------------------------------------
1778 *
1779 * TclUniCharMatch --
1780 *
1781 *	See if a particular Unicode string matches a particular pattern.
1782 *	Allows case insensitivity. This is the Unicode equivalent of the char*
1783 *	Tcl_StringCaseMatch. This variant of Tcl_UniCharCaseMatch uses counted
1784 *	Strings, so embedded NULLs are allowed.
1785 *
1786 * Results:
1787 *	The return value is 1 if string matches pattern, and 0 otherwise. The
1788 *	matching operation permits the following special characters in the
1789 *	pattern: *?\[] (see the manual entry for details on what these mean).
1790 *
1791 * Side effects:
1792 *	None.
1793 *
1794 *----------------------------------------------------------------------
1795 */
1796
1797int
1798TclUniCharMatch(
1799    CONST Tcl_UniChar *string,	/* Unicode String. */
1800    int strLen,			/* Length of String */
1801    CONST Tcl_UniChar *pattern,	/* Pattern, which may contain special
1802				 * characters. */
1803    int ptnLen,			/* Length of Pattern */
1804    int nocase)			/* 0 for case sensitive, 1 for insensitive */
1805{
1806    CONST Tcl_UniChar *stringEnd, *patternEnd;
1807    Tcl_UniChar p;
1808
1809    stringEnd = string + strLen;
1810    patternEnd = pattern + ptnLen;
1811
1812    while (1) {
1813	/*
1814	 * See if we're at the end of both the pattern and the string. If so,
1815	 * we succeeded. If we're at the end of the pattern but not at the end
1816	 * of the string, we failed.
1817	 */
1818
1819	if (pattern == patternEnd) {
1820	    return (string == stringEnd);
1821	}
1822	p = *pattern;
1823	if ((string == stringEnd) && (p != '*')) {
1824	    return 0;
1825	}
1826
1827	/*
1828	 * Check for a "*" as the next pattern character. It matches any
1829	 * substring. We handle this by skipping all the characters up to the
1830	 * next matching one in the pattern, and then calling ourselves
1831	 * recursively for each postfix of string, until either we match or we
1832	 * reach the end of the string.
1833	 */
1834
1835	if (p == '*') {
1836	    /*
1837	     * Skip all successive *'s in the pattern.
1838	     */
1839
1840	    while (*(++pattern) == '*') {
1841		/* empty body */
1842	    }
1843	    if (pattern == patternEnd) {
1844		return 1;
1845	    }
1846	    p = *pattern;
1847	    if (nocase) {
1848		p = Tcl_UniCharToLower(p);
1849	    }
1850	    while (1) {
1851		/*
1852		 * Optimization for matching - cruise through the string
1853		 * quickly if the next char in the pattern isn't a special
1854		 * character.
1855		 */
1856
1857		if ((p != '[') && (p != '?') && (p != '\\')) {
1858		    if (nocase) {
1859			while ((string < stringEnd) && (p != *string)
1860				&& (p != Tcl_UniCharToLower(*string))) {
1861			    string++;
1862			}
1863		    } else {
1864			while ((string < stringEnd) && (p != *string)) {
1865			    string++;
1866			}
1867		    }
1868		}
1869		if (TclUniCharMatch(string, stringEnd - string,
1870			pattern, patternEnd - pattern, nocase)) {
1871		    return 1;
1872		}
1873		if (string == stringEnd) {
1874		    return 0;
1875		}
1876		string++;
1877	    }
1878	}
1879
1880	/*
1881	 * Check for a "?" as the next pattern character. It matches any
1882	 * single character.
1883	 */
1884
1885	if (p == '?') {
1886	    pattern++;
1887	    string++;
1888	    continue;
1889	}
1890
1891	/*
1892	 * Check for a "[" as the next pattern character. It is followed by a
1893	 * list of characters that are acceptable, or by a range (two
1894	 * characters separated by "-").
1895	 */
1896
1897	if (p == '[') {
1898	    Tcl_UniChar ch1, startChar, endChar;
1899
1900	    pattern++;
1901	    ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string);
1902	    string++;
1903	    while (1) {
1904		if ((*pattern == ']') || (pattern == patternEnd)) {
1905		    return 0;
1906		}
1907		startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern);
1908		pattern++;
1909		if (*pattern == '-') {
1910		    pattern++;
1911		    if (pattern == patternEnd) {
1912			return 0;
1913		    }
1914		    endChar = (nocase ? Tcl_UniCharToLower(*pattern)
1915			    : *pattern);
1916		    pattern++;
1917		    if (((startChar <= ch1) && (ch1 <= endChar))
1918			    || ((endChar <= ch1) && (ch1 <= startChar))) {
1919			/*
1920			 * Matches ranges of form [a-z] or [z-a].
1921			 */
1922			break;
1923		    }
1924		} else if (startChar == ch1) {
1925		    break;
1926		}
1927	    }
1928	    while (*pattern != ']') {
1929		if (pattern == patternEnd) {
1930		    pattern--;
1931		    break;
1932		}
1933		pattern++;
1934	    }
1935	    pattern++;
1936	    continue;
1937	}
1938
1939	/*
1940	 * If the next pattern character is '\', just strip off the '\' so we
1941	 * do exact matching on the character that follows.
1942	 */
1943
1944	if (p == '\\') {
1945	    if (++pattern == patternEnd) {
1946		return 0;
1947	    }
1948	}
1949
1950	/*
1951	 * There's no special character. Just make sure that the next bytes of
1952	 * each string match.
1953	 */
1954
1955	if (nocase) {
1956	    if (Tcl_UniCharToLower(*string) != Tcl_UniCharToLower(*pattern)) {
1957		return 0;
1958	    }
1959	} else if (*string != *pattern) {
1960	    return 0;
1961	}
1962	string++;
1963	pattern++;
1964    }
1965}
1966
1967/*
1968 * Local Variables:
1969 * mode: c
1970 * c-basic-offset: 4
1971 * fill-column: 78
1972 * End:
1973 */
1974