/*
 * tclUtf.c --
 *
 *      Routines for manipulating UTF-8 strings.
 *
 * Copyright (c) 1997-1998 Sun Microsystems, Inc.
 *
 * See the file "license.terms" for information on usage and redistribution of
 * this file, and for a DISCLAIMER OF ALL WARRANTIES.
 *
 * RCS: @(#) $Id: tclUtf.c,v 1.37 2005/10/31 15:59:41 dkf Exp $
 */

#include "tclInt.h"

/*
 * Include the static character classification tables and macros.
 */

#include "tclUniData.c"

/*
 * The following macros are used for fast character category tests. Each
 * x_BITS value is a bit set in which bit N is on when category N belongs to
 * the set. A test shifts the x_BITS value right by the character's category
 * value and looks at the low bit to determine whether the given category is
 * included in the set.
 */

/*
 * Letter categories: uppercase, lowercase, titlecase, modifier and other.
 */

#define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \
        | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1<<OTHER_LETTER))

/*
 * The decimal digit category only.
 */

#define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)

/*
 * Separator categories: space, line and paragraph separators.
 */

#define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \
        | (1 << PARAGRAPH_SEPARATOR))

/*
 * The connector punctuation category only.
 */

#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION)

/*
 * Union of the letter, digit and separator sets with the mark, number,
 * punctuation and symbol categories listed below.
 */

#define PRINT_BITS (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | \
        (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \
        (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \
        (1 << OTHER_NUMBER) | (1 << CONNECTOR_PUNCTUATION) | \
        (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
        (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
        (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION) | \
        (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \
        (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL))

/*
 * Punctuation categories: connector, dash, open, close, initial quote,
 * final quote and other punctuation.
 */

#define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \
        (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
        (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
        (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION))

/*
 * Unicode characters less than this value are represented by themselves in
 * UTF-8 strings.
56 */ 57 58#define UNICODE_SELF 0x80 59 60/* 61 * The following structures are used when mapping between Unicode (UCS-2) and 62 * UTF-8. 63 */ 64 65static CONST unsigned char totalBytes[256] = { 66 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 67 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 68 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 69 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 70 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 71 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 72 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 73 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 74#if TCL_UTF_MAX > 3 75 4,4,4,4,4,4,4,4, 76#else 77 1,1,1,1,1,1,1,1, 78#endif 79#if TCL_UTF_MAX > 4 80 5,5,5,5, 81#else 82 1,1,1,1, 83#endif 84#if TCL_UTF_MAX > 5 85 6,6,6,6 86#else 87 1,1,1,1 88#endif 89}; 90 91/* 92 * Functions used only in this module. 93 */ 94 95static int UtfCount(int ch); 96 97/* 98 *--------------------------------------------------------------------------- 99 * 100 * UtfCount -- 101 * 102 * Find the number of bytes in the Utf character "ch". 103 * 104 * Results: 105 * The return values is the number of bytes in the Utf character "ch". 106 * 107 * Side effects: 108 * None. 109 * 110 *--------------------------------------------------------------------------- 111 */ 112 113INLINE static int 114UtfCount( 115 int ch) /* The Tcl_UniChar whose size is returned. 
*/ 116{ 117 if ((ch > 0) && (ch < UNICODE_SELF)) { 118 return 1; 119 } 120 if (ch <= 0x7FF) { 121 return 2; 122 } 123 if (ch <= 0xFFFF) { 124 return 3; 125 } 126#if TCL_UTF_MAX > 3 127 if (ch <= 0x1FFFFF) { 128 return 4; 129 } 130 if (ch <= 0x3FFFFFF) { 131 return 5; 132 } 133 if (ch <= 0x7FFFFFFF) { 134 return 6; 135 } 136#endif 137 return 3; 138} 139 140/* 141 *--------------------------------------------------------------------------- 142 * 143 * Tcl_UniCharToUtf -- 144 * 145 * Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the 146 * provided buffer. Equivalent to Plan 9 runetochar(). 147 * 148 * Results: 149 * The return values is the number of bytes in the buffer that were 150 * consumed. 151 * 152 * Side effects: 153 * None. 154 * 155 *--------------------------------------------------------------------------- 156 */ 157 158INLINE int 159Tcl_UniCharToUtf( 160 int ch, /* The Tcl_UniChar to be stored in the 161 * buffer. */ 162 char *buf) /* Buffer in which the UTF-8 representation of 163 * the Tcl_UniChar is stored. Buffer must be 164 * large enough to hold the UTF-8 character 165 * (at most TCL_UTF_MAX bytes). 
*/ 166{ 167 if ((ch > 0) && (ch < UNICODE_SELF)) { 168 buf[0] = (char) ch; 169 return 1; 170 } 171 if (ch >= 0) { 172 if (ch <= 0x7FF) { 173 buf[1] = (char) ((ch | 0x80) & 0xBF); 174 buf[0] = (char) ((ch >> 6) | 0xC0); 175 return 2; 176 } 177 if (ch <= 0xFFFF) { 178 three: 179 buf[2] = (char) ((ch | 0x80) & 0xBF); 180 buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF); 181 buf[0] = (char) ((ch >> 12) | 0xE0); 182 return 3; 183 } 184 185#if TCL_UTF_MAX > 3 186 if (ch <= 0x1FFFFF) { 187 buf[3] = (char) ((ch | 0x80) & 0xBF); 188 buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF); 189 buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF); 190 buf[0] = (char) ((ch >> 18) | 0xF0); 191 return 4; 192 } 193 if (ch <= 0x3FFFFFF) { 194 buf[4] = (char) ((ch | 0x80) & 0xBF); 195 buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF); 196 buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF); 197 buf[1] = (char) (((ch >> 18) | 0x80) & 0xBF); 198 buf[0] = (char) ((ch >> 24) | 0xF8); 199 return 5; 200 } 201 if (ch <= 0x7FFFFFFF) { 202 buf[5] = (char) ((ch | 0x80) & 0xBF); 203 buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF); 204 buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF); 205 buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF); 206 buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF); 207 buf[0] = (char) ((ch >> 30) | 0xFC); 208 return 6; 209 } 210#endif 211 } 212 213 ch = 0xFFFD; 214 goto three; 215} 216 217/* 218 *--------------------------------------------------------------------------- 219 * 220 * Tcl_UniCharToUtfDString -- 221 * 222 * Convert the given Unicode string to UTF-8. 223 * 224 * Results: 225 * The return value is a pointer to the UTF-8 representation of the 226 * Unicode string. Storage for the return value is appended to the end of 227 * dsPtr. 228 * 229 * Side effects: 230 * None. 231 * 232 *--------------------------------------------------------------------------- 233 */ 234 235char * 236Tcl_UniCharToUtfDString( 237 CONST Tcl_UniChar *uniStr, /* Unicode string to convert to UTF-8. 
*/ 238 int uniLength, /* Length of Unicode string in Tcl_UniChars 239 * (must be >= 0). */ 240 Tcl_DString *dsPtr) /* UTF-8 representation of string is appended 241 * to this previously initialized DString. */ 242{ 243 CONST Tcl_UniChar *w, *wEnd; 244 char *p, *string; 245 int oldLength; 246 247 /* 248 * UTF-8 string length in bytes will be <= Unicode string length * 249 * TCL_UTF_MAX. 250 */ 251 252 oldLength = Tcl_DStringLength(dsPtr); 253 Tcl_DStringSetLength(dsPtr, (oldLength + uniLength + 1) * TCL_UTF_MAX); 254 string = Tcl_DStringValue(dsPtr) + oldLength; 255 256 p = string; 257 wEnd = uniStr + uniLength; 258 for (w = uniStr; w < wEnd; ) { 259 p += Tcl_UniCharToUtf(*w, p); 260 w++; 261 } 262 Tcl_DStringSetLength(dsPtr, oldLength + (p - string)); 263 264 return string; 265} 266 267/* 268 *--------------------------------------------------------------------------- 269 * 270 * Tcl_UtfToUniChar -- 271 * 272 * Extract the Tcl_UniChar represented by the UTF-8 string. Bad UTF-8 273 * sequences are converted to valid Tcl_UniChars and processing 274 * continues. Equivalent to Plan 9 chartorune(). 275 * 276 * The caller must ensure that the source buffer is long enough that this 277 * routine does not run off the end and dereference non-existent memory 278 * looking for trail bytes. If the source buffer is known to be '\0' 279 * terminated, this cannot happen. Otherwise, the caller should call 280 * Tcl_UtfCharComplete() before calling this routine to ensure that 281 * enough bytes remain in the string. 282 * 283 * Results: 284 * *chPtr is filled with the Tcl_UniChar, and the return value is the 285 * number of bytes from the UTF-8 string that were consumed. 286 * 287 * Side effects: 288 * None. 289 * 290 *--------------------------------------------------------------------------- 291 */ 292 293int 294Tcl_UtfToUniChar( 295 register CONST char *src, /* The UTF-8 string. 
*/ 296 register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by 297 * the UTF-8 string. */ 298{ 299 register int byte; 300 301 /* 302 * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones. 303 */ 304 305 byte = *((unsigned char *) src); 306 if (byte < 0xC0) { 307 /* 308 * Handles properly formed UTF-8 characters between 0x01 and 0x7F. 309 * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid 310 * characters representing themselves. 311 */ 312 313 *chPtr = (Tcl_UniChar) byte; 314 return 1; 315 } else if (byte < 0xE0) { 316 if ((src[1] & 0xC0) == 0x80) { 317 /* 318 * Two-byte-character lead-byte followed by a trail-byte. 319 */ 320 321 *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (src[1] & 0x3F)); 322 return 2; 323 } 324 325 /* 326 * A two-byte-character lead-byte not followed by trail-byte 327 * represents itself. 328 */ 329 330 *chPtr = (Tcl_UniChar) byte; 331 return 1; 332 } else if (byte < 0xF0) { 333 if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) { 334 /* 335 * Three-byte-character lead byte followed by two trail bytes. 336 */ 337 338 *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12) 339 | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F)); 340 return 3; 341 } 342 343 /* 344 * A three-byte-character lead-byte not followed by two trail-bytes 345 * represents itself. 
346 */ 347 348 *chPtr = (Tcl_UniChar) byte; 349 return 1; 350 } 351#if TCL_UTF_MAX > 3 352 { 353 int ch, total, trail; 354 355 total = totalBytes[byte]; 356 trail = total - 1; 357 if (trail > 0) { 358 ch = byte & (0x3F >> trail); 359 do { 360 src++; 361 if ((*src & 0xC0) != 0x80) { 362 *chPtr = byte; 363 return 1; 364 } 365 ch <<= 6; 366 ch |= (*src & 0x3F); 367 trail--; 368 } while (trail > 0); 369 *chPtr = ch; 370 return total; 371 } 372 } 373#endif 374 375 *chPtr = (Tcl_UniChar) byte; 376 return 1; 377} 378 379/* 380 *--------------------------------------------------------------------------- 381 * 382 * Tcl_UtfToUniCharDString -- 383 * 384 * Convert the UTF-8 string to Unicode. 385 * 386 * Results: 387 * The return value is a pointer to the Unicode representation of the 388 * UTF-8 string. Storage for the return value is appended to the end of 389 * dsPtr. The Unicode string is terminated with a Unicode NULL character. 390 * 391 * Side effects: 392 * None. 393 * 394 *--------------------------------------------------------------------------- 395 */ 396 397Tcl_UniChar * 398Tcl_UtfToUniCharDString( 399 CONST char *src, /* UTF-8 string to convert to Unicode. */ 400 int length, /* Length of UTF-8 string in bytes, or -1 for 401 * strlen(). */ 402 Tcl_DString *dsPtr) /* Unicode representation of string is 403 * appended to this previously initialized 404 * DString. */ 405{ 406 Tcl_UniChar *w, *wString; 407 CONST char *p, *end; 408 int oldLength; 409 410 if (length < 0) { 411 length = strlen(src); 412 } 413 414 /* 415 * Unicode string length in Tcl_UniChars will be <= UTF-8 string length in 416 * bytes. 
417 */ 418 419 oldLength = Tcl_DStringLength(dsPtr); 420 Tcl_DStringSetLength(dsPtr, 421 (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar))); 422 wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength); 423 424 w = wString; 425 end = src + length; 426 for (p = src; p < end; ) { 427 p += TclUtfToUniChar(p, w); 428 w++; 429 } 430 *w = '\0'; 431 Tcl_DStringSetLength(dsPtr, 432 (oldLength + ((char *) w - (char *) wString))); 433 434 return wString; 435} 436 437/* 438 *--------------------------------------------------------------------------- 439 * 440 * Tcl_UtfCharComplete -- 441 * 442 * Determine if the UTF-8 string of the given length is long enough to be 443 * decoded by Tcl_UtfToUniChar(). This does not ensure that the UTF-8 444 * string is properly formed. Equivalent to Plan 9 fullrune(). 445 * 446 * Results: 447 * The return value is 0 if the string is not long enough, non-zero 448 * otherwise. 449 * 450 * Side effects: 451 * None. 452 * 453 *--------------------------------------------------------------------------- 454 */ 455 456int 457Tcl_UtfCharComplete( 458 CONST char *src, /* String to check if first few bytes contain 459 * a complete UTF-8 character. */ 460 int length) /* Length of above string in bytes. */ 461{ 462 int ch; 463 464 ch = *((unsigned char *) src); 465 return length >= totalBytes[ch]; 466} 467 468/* 469 *--------------------------------------------------------------------------- 470 * 471 * Tcl_NumUtfChars -- 472 * 473 * Returns the number of characters (not bytes) in the UTF-8 string, not 474 * including the terminating NULL byte. This is equivalent to Plan 9 475 * utflen() and utfnlen(). 476 * 477 * Results: 478 * As above. 479 * 480 * Side effects: 481 * None. 482 * 483 *--------------------------------------------------------------------------- 484 */ 485 486int 487Tcl_NumUtfChars( 488 register CONST char *src, /* The UTF-8 string to measure. 
*/ 489 int length) /* The length of the string in bytes, or -1 490 * for strlen(string). */ 491{ 492 Tcl_UniChar ch; 493 register Tcl_UniChar *chPtr = &ch; 494 register int i; 495 496 /* 497 * The separate implementations are faster. 498 * 499 * Since this is a time-sensitive function, we also do the check for the 500 * single-byte char case specially. 501 */ 502 503 i = 0; 504 if (length < 0) { 505 while (*src != '\0') { 506 src += TclUtfToUniChar(src, chPtr); 507 i++; 508 } 509 } else { 510 register int n; 511 512 while (length > 0) { 513 if (UCHAR(*src) < 0xC0) { 514 length--; 515 src++; 516 } else { 517 n = Tcl_UtfToUniChar(src, chPtr); 518 length -= n; 519 src += n; 520 } 521 i++; 522 } 523 } 524 return i; 525} 526 527/* 528 *--------------------------------------------------------------------------- 529 * 530 * Tcl_UtfFindFirst -- 531 * 532 * Returns a pointer to the first occurance of the given Tcl_UniChar in 533 * the NULL-terminated UTF-8 string. The NULL terminator is considered 534 * part of the UTF-8 string. Equivalent to Plan 9 utfrune(). 535 * 536 * Results: 537 * As above. If the Tcl_UniChar does not exist in the given string, the 538 * return value is NULL. 539 * 540 * Side effects: 541 * None. 542 * 543 *--------------------------------------------------------------------------- 544 */ 545 546CONST char * 547Tcl_UtfFindFirst( 548 CONST char *src, /* The UTF-8 string to be searched. */ 549 int ch) /* The Tcl_UniChar to search for. */ 550{ 551 int len; 552 Tcl_UniChar find; 553 554 while (1) { 555 len = TclUtfToUniChar(src, &find); 556 if (find == ch) { 557 return src; 558 } 559 if (*src == '\0') { 560 return NULL; 561 } 562 src += len; 563 } 564} 565 566/* 567 *--------------------------------------------------------------------------- 568 * 569 * Tcl_UtfFindLast -- 570 * 571 * Returns a pointer to the last occurance of the given Tcl_UniChar in 572 * the NULL-terminated UTF-8 string. The NULL terminator is considered 573 * part of the UTF-8 string. 
Equivalent to Plan 9 utfrrune(). 574 * 575 * Results: 576 * As above. If the Tcl_UniChar does not exist in the given string, the 577 * return value is NULL. 578 * 579 * Side effects: 580 * None. 581 * 582 *--------------------------------------------------------------------------- 583 */ 584 585CONST char * 586Tcl_UtfFindLast( 587 CONST char *src, /* The UTF-8 string to be searched. */ 588 int ch) /* The Tcl_UniChar to search for. */ 589{ 590 int len; 591 Tcl_UniChar find; 592 CONST char *last; 593 594 last = NULL; 595 while (1) { 596 len = TclUtfToUniChar(src, &find); 597 if (find == ch) { 598 last = src; 599 } 600 if (*src == '\0') { 601 break; 602 } 603 src += len; 604 } 605 return last; 606} 607 608/* 609 *--------------------------------------------------------------------------- 610 * 611 * Tcl_UtfNext -- 612 * 613 * Given a pointer to some current location in a UTF-8 string, move 614 * forward one character. The caller must ensure that they are not asking 615 * for the next character after the last character in the string. 616 * 617 * Results: 618 * The return value is the pointer to the next character in the UTF-8 619 * string. 620 * 621 * Side effects: 622 * None. 623 * 624 *--------------------------------------------------------------------------- 625 */ 626 627CONST char * 628Tcl_UtfNext( 629 CONST char *src) /* The current location in the string. */ 630{ 631 Tcl_UniChar ch; 632 633 return src + TclUtfToUniChar(src, &ch); 634} 635 636/* 637 *--------------------------------------------------------------------------- 638 * 639 * Tcl_UtfPrev -- 640 * 641 * Given a pointer to some current location in a UTF-8 string, move 642 * backwards one character. This works correctly when the pointer is in 643 * the middle of a UTF-8 character. 644 * 645 * Results: 646 * The return value is a pointer to the previous character in the UTF-8 647 * string. 
If the current location was already at the beginning of the 648 * string, the return value will also be a pointer to the beginning of 649 * the string. 650 * 651 * Side effects: 652 * None. 653 * 654 *--------------------------------------------------------------------------- 655 */ 656 657CONST char * 658Tcl_UtfPrev( 659 CONST char *src, /* The current location in the string. */ 660 CONST char *start) /* Pointer to the beginning of the string, to 661 * avoid going backwards too far. */ 662{ 663 CONST char *look; 664 int i, byte; 665 666 src--; 667 look = src; 668 for (i = 0; i < TCL_UTF_MAX; i++) { 669 if (look < start) { 670 if (src < start) { 671 src = start; 672 } 673 break; 674 } 675 byte = *((unsigned char *) look); 676 if (byte < 0x80) { 677 break; 678 } 679 if (byte >= 0xC0) { 680 return look; 681 } 682 look--; 683 } 684 return src; 685} 686 687/* 688 *--------------------------------------------------------------------------- 689 * 690 * Tcl_UniCharAtIndex -- 691 * 692 * Returns the Unicode character represented at the specified character 693 * (not byte) position in the UTF-8 string. 694 * 695 * Results: 696 * As above. 697 * 698 * Side effects: 699 * None. 700 * 701 *--------------------------------------------------------------------------- 702 */ 703 704Tcl_UniChar 705Tcl_UniCharAtIndex( 706 register CONST char *src, /* The UTF-8 string to dereference. */ 707 register int index) /* The position of the desired character. */ 708{ 709 Tcl_UniChar ch; 710 711 while (index >= 0) { 712 index--; 713 src += TclUtfToUniChar(src, &ch); 714 } 715 return ch; 716} 717 718/* 719 *--------------------------------------------------------------------------- 720 * 721 * Tcl_UtfAtIndex -- 722 * 723 * Returns a pointer to the specified character (not byte) position in 724 * the UTF-8 string. 725 * 726 * Results: 727 * As above. 728 * 729 * Side effects: 730 * None. 
731 * 732 *--------------------------------------------------------------------------- 733 */ 734 735CONST char * 736Tcl_UtfAtIndex( 737 register CONST char *src, /* The UTF-8 string. */ 738 register int index) /* The position of the desired character. */ 739{ 740 Tcl_UniChar ch; 741 742 while (index > 0) { 743 index--; 744 src += TclUtfToUniChar(src, &ch); 745 } 746 return src; 747} 748 749/* 750 *--------------------------------------------------------------------------- 751 * 752 * Tcl_UtfBackslash -- 753 * 754 * Figure out how to handle a backslash sequence. 755 * 756 * Results: 757 * Stores the bytes represented by the backslash sequence in dst and 758 * returns the number of bytes written to dst. At most TCL_UTF_MAX bytes 759 * are written to dst; dst must have been large enough to accept those 760 * bytes. If readPtr isn't NULL then it is filled in with a count of the 761 * number of bytes in the backslash sequence. 762 * 763 * Side effects: 764 * The maximum number of bytes it takes to represent a Unicode character 765 * in UTF-8 is guaranteed to be less than the number of bytes used to 766 * express the backslash sequence that represents that Unicode character. 767 * If the target buffer into which the caller is going to store the bytes 768 * that represent the Unicode character is at least as large as the 769 * source buffer from which the backslashed sequence was extracted, no 770 * buffer overruns should occur. 771 * 772 *--------------------------------------------------------------------------- 773 */ 774 775int 776Tcl_UtfBackslash( 777 CONST char *src, /* Points to the backslash character of a 778 * backslash sequence. */ 779 int *readPtr, /* Fill in with number of characters read from 780 * src, unless NULL. */ 781 char *dst) /* Filled with the bytes represented by the 782 * backslash sequence. 
*/ 783{ 784#define LINE_LENGTH 128 785 int numRead; 786 int result; 787 788 result = TclParseBackslash(src, LINE_LENGTH, &numRead, dst); 789 if (numRead == LINE_LENGTH) { 790 /* 791 * We ate a whole line. Pay the price of a strlen() 792 */ 793 794 result = TclParseBackslash(src, (int)strlen(src), &numRead, dst); 795 } 796 if (readPtr != NULL) { 797 *readPtr = numRead; 798 } 799 return result; 800} 801 802/* 803 *---------------------------------------------------------------------- 804 * 805 * Tcl_UtfToUpper -- 806 * 807 * Convert lowercase characters to uppercase characters in a UTF string 808 * in place. The conversion may shrink the UTF string. 809 * 810 * Results: 811 * Returns the number of bytes in the resulting string excluding the 812 * trailing null. 813 * 814 * Side effects: 815 * Writes a terminating null after the last converted character. 816 * 817 *---------------------------------------------------------------------- 818 */ 819 820int 821Tcl_UtfToUpper( 822 char *str) /* String to convert in place. */ 823{ 824 Tcl_UniChar ch, upChar; 825 char *src, *dst; 826 int bytes; 827 828 /* 829 * Iterate over the string until we hit the terminating null. 830 */ 831 832 src = dst = str; 833 while (*src) { 834 bytes = TclUtfToUniChar(src, &ch); 835 upChar = Tcl_UniCharToUpper(ch); 836 837 /* 838 * To keep badly formed Utf strings from getting inflated by the 839 * conversion (thereby causing a segfault), only copy the upper case 840 * char to dst if its size is <= the original char. 841 */ 842 843 if (bytes < UtfCount(upChar)) { 844 memcpy(dst, src, (size_t) bytes); 845 dst += bytes; 846 } else { 847 dst += Tcl_UniCharToUtf(upChar, dst); 848 } 849 src += bytes; 850 } 851 *dst = '\0'; 852 return (dst - str); 853} 854 855/* 856 *---------------------------------------------------------------------- 857 * 858 * Tcl_UtfToLower -- 859 * 860 * Convert uppercase characters to lowercase characters in a UTF string 861 * in place. The conversion may shrink the UTF string. 
862 * 863 * Results: 864 * Returns the number of bytes in the resulting string excluding the 865 * trailing null. 866 * 867 * Side effects: 868 * Writes a terminating null after the last converted character. 869 * 870 *---------------------------------------------------------------------- 871 */ 872 873int 874Tcl_UtfToLower( 875 char *str) /* String to convert in place. */ 876{ 877 Tcl_UniChar ch, lowChar; 878 char *src, *dst; 879 int bytes; 880 881 /* 882 * Iterate over the string until we hit the terminating null. 883 */ 884 885 src = dst = str; 886 while (*src) { 887 bytes = TclUtfToUniChar(src, &ch); 888 lowChar = Tcl_UniCharToLower(ch); 889 890 /* 891 * To keep badly formed Utf strings from getting inflated by the 892 * conversion (thereby causing a segfault), only copy the lower case 893 * char to dst if its size is <= the original char. 894 */ 895 896 if (bytes < UtfCount(lowChar)) { 897 memcpy(dst, src, (size_t) bytes); 898 dst += bytes; 899 } else { 900 dst += Tcl_UniCharToUtf(lowChar, dst); 901 } 902 src += bytes; 903 } 904 *dst = '\0'; 905 return (dst - str); 906} 907 908/* 909 *---------------------------------------------------------------------- 910 * 911 * Tcl_UtfToTitle -- 912 * 913 * Changes the first character of a UTF string to title case or uppercase 914 * and the rest of the string to lowercase. The conversion happens in 915 * place and may shrink the UTF string. 916 * 917 * Results: 918 * Returns the number of bytes in the resulting string excluding the 919 * trailing null. 920 * 921 * Side effects: 922 * Writes a terminating null after the last converted character. 923 * 924 *---------------------------------------------------------------------- 925 */ 926 927int 928Tcl_UtfToTitle( 929 char *str) /* String to convert in place. */ 930{ 931 Tcl_UniChar ch, titleChar, lowChar; 932 char *src, *dst; 933 int bytes; 934 935 /* 936 * Capitalize the first character and then lowercase the rest of the 937 * characters until we get to a null. 
938 */ 939 940 src = dst = str; 941 942 if (*src) { 943 bytes = TclUtfToUniChar(src, &ch); 944 titleChar = Tcl_UniCharToTitle(ch); 945 946 if (bytes < UtfCount(titleChar)) { 947 memcpy(dst, src, (size_t) bytes); 948 dst += bytes; 949 } else { 950 dst += Tcl_UniCharToUtf(titleChar, dst); 951 } 952 src += bytes; 953 } 954 while (*src) { 955 bytes = TclUtfToUniChar(src, &ch); 956 lowChar = Tcl_UniCharToLower(ch); 957 958 if (bytes < UtfCount(lowChar)) { 959 memcpy(dst, src, (size_t) bytes); 960 dst += bytes; 961 } else { 962 dst += Tcl_UniCharToUtf(lowChar, dst); 963 } 964 src += bytes; 965 } 966 *dst = '\0'; 967 return (dst - str); 968} 969 970/* 971 *---------------------------------------------------------------------- 972 * 973 * TclpUtfNcmp2 -- 974 * 975 * Compare at most numBytes bytes of utf-8 strings cs and ct. Both cs and 976 * ct are assumed to be at least numBytes bytes long. 977 * 978 * Results: 979 * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. 980 * 981 * Side effects: 982 * None. 983 * 984 *---------------------------------------------------------------------- 985 */ 986 987int 988TclpUtfNcmp2( 989 CONST char *cs, /* UTF string to compare to ct. */ 990 CONST char *ct, /* UTF string cs is compared to. */ 991 unsigned long numBytes) /* Number of *bytes* to compare. */ 992{ 993 /* 994 * We can't simply call 'memcmp(cs, ct, numBytes);' because we need to 995 * check for Tcl's \xC0\x80 non-utf-8 null encoding. Otherwise utf-8 lexes 996 * fine in the strcmp manner. 997 */ 998 999 register int result = 0; 1000 1001 for ( ; numBytes != 0; numBytes--, cs++, ct++) { 1002 if (*cs != *ct) { 1003 result = UCHAR(*cs) - UCHAR(*ct); 1004 break; 1005 } 1006 } 1007 if (numBytes && ((UCHAR(*cs) == 0xC0) || (UCHAR(*ct) == 0xC0))) { 1008 unsigned char c1, c2; 1009 1010 c1 = ((UCHAR(*cs) == 0xC0) && (UCHAR(cs[1]) == 0x80)) ? 0 : UCHAR(*cs); 1011 c2 = ((UCHAR(*ct) == 0xC0) && (UCHAR(ct[1]) == 0x80)) ? 
0 : UCHAR(*ct); 1012 result = (c1 - c2); 1013 } 1014 return result; 1015} 1016 1017/* 1018 *---------------------------------------------------------------------- 1019 * 1020 * Tcl_UtfNcmp -- 1021 * 1022 * Compare at most numChars UTF chars of string cs to string ct. Both cs 1023 * and ct are assumed to be at least numChars UTF chars long. 1024 * 1025 * Results: 1026 * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. 1027 * 1028 * Side effects: 1029 * None. 1030 * 1031 *---------------------------------------------------------------------- 1032 */ 1033 1034int 1035Tcl_UtfNcmp( 1036 CONST char *cs, /* UTF string to compare to ct. */ 1037 CONST char *ct, /* UTF string cs is compared to. */ 1038 unsigned long numChars) /* Number of UTF chars to compare. */ 1039{ 1040 Tcl_UniChar ch1, ch2; 1041 1042 /* 1043 * Cannot use 'memcmp(cs, ct, n);' as byte representation of \u0000 (the 1044 * pair of bytes 0xc0,0x80) is larger than byte representation of \u0001 1045 * (the byte 0x01.) 1046 */ 1047 1048 while (numChars-- > 0) { 1049 /* 1050 * n must be interpreted as chars, not bytes. This should be called 1051 * only when both strings are of at least n chars long (no need for \0 1052 * check) 1053 */ 1054 1055 cs += TclUtfToUniChar(cs, &ch1); 1056 ct += TclUtfToUniChar(ct, &ch2); 1057 if (ch1 != ch2) { 1058 return (ch1 - ch2); 1059 } 1060 } 1061 return 0; 1062} 1063 1064/* 1065 *---------------------------------------------------------------------- 1066 * 1067 * Tcl_UtfNcasecmp -- 1068 * 1069 * Compare at most numChars UTF chars of string cs to string ct case 1070 * insensitive. Both cs and ct are assumed to be at least numChars UTF 1071 * chars long. 1072 * 1073 * Results: 1074 * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. 1075 * 1076 * Side effects: 1077 * None. 1078 * 1079 *---------------------------------------------------------------------- 1080 */ 1081 1082int 1083Tcl_UtfNcasecmp( 1084 CONST char *cs, /* UTF string to compare to ct. 
*/ 1085 CONST char *ct, /* UTF string cs is compared to. */ 1086 unsigned long numChars) /* Number of UTF chars to compare. */ 1087{ 1088 Tcl_UniChar ch1, ch2; 1089 while (numChars-- > 0) { 1090 /* 1091 * n must be interpreted as chars, not bytes. 1092 * This should be called only when both strings are of 1093 * at least n chars long (no need for \0 check) 1094 */ 1095 cs += TclUtfToUniChar(cs, &ch1); 1096 ct += TclUtfToUniChar(ct, &ch2); 1097 if (ch1 != ch2) { 1098 ch1 = Tcl_UniCharToLower(ch1); 1099 ch2 = Tcl_UniCharToLower(ch2); 1100 if (ch1 != ch2) { 1101 return (ch1 - ch2); 1102 } 1103 } 1104 } 1105 return 0; 1106} 1107 1108/* 1109 *---------------------------------------------------------------------- 1110 * 1111 * Tcl_UniCharToUpper -- 1112 * 1113 * Compute the uppercase equivalent of the given Unicode character. 1114 * 1115 * Results: 1116 * Returns the uppercase Unicode character. 1117 * 1118 * Side effects: 1119 * None. 1120 * 1121 *---------------------------------------------------------------------- 1122 */ 1123 1124Tcl_UniChar 1125Tcl_UniCharToUpper( 1126 int ch) /* Unicode character to convert. */ 1127{ 1128 int info = GetUniCharInfo(ch); 1129 1130 if (GetCaseType(info) & 0x04) { 1131 return (Tcl_UniChar) (ch - GetDelta(info)); 1132 } else { 1133 return ch; 1134 } 1135} 1136 1137/* 1138 *---------------------------------------------------------------------- 1139 * 1140 * Tcl_UniCharToLower -- 1141 * 1142 * Compute the lowercase equivalent of the given Unicode character. 1143 * 1144 * Results: 1145 * Returns the lowercase Unicode character. 1146 * 1147 * Side effects: 1148 * None. 1149 * 1150 *---------------------------------------------------------------------- 1151 */ 1152 1153Tcl_UniChar 1154Tcl_UniCharToLower( 1155 int ch) /* Unicode character to convert. 
*/ 1156{ 1157 int info = GetUniCharInfo(ch); 1158 1159 if (GetCaseType(info) & 0x02) { 1160 return (Tcl_UniChar) (ch + GetDelta(info)); 1161 } else { 1162 return ch; 1163 } 1164} 1165 1166/* 1167 *---------------------------------------------------------------------- 1168 * 1169 * Tcl_UniCharToTitle -- 1170 * 1171 * Compute the titlecase equivalent of the given Unicode character. 1172 * 1173 * Results: 1174 * Returns the titlecase Unicode character. 1175 * 1176 * Side effects: 1177 * None. 1178 * 1179 *---------------------------------------------------------------------- 1180 */ 1181 1182Tcl_UniChar 1183Tcl_UniCharToTitle( 1184 int ch) /* Unicode character to convert. */ 1185{ 1186 int info = GetUniCharInfo(ch); 1187 int mode = GetCaseType(info); 1188 1189 if (mode & 0x1) { 1190 /* 1191 * Subtract or add one depending on the original case. 1192 */ 1193 1194 return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1)); 1195 } else if (mode == 0x4) { 1196 return (Tcl_UniChar) (ch - GetDelta(info)); 1197 } else { 1198 return ch; 1199 } 1200} 1201 1202/* 1203 *---------------------------------------------------------------------- 1204 * 1205 * Tcl_UniCharLen -- 1206 * 1207 * Find the length of a UniChar string. The str input must be null 1208 * terminated. 1209 * 1210 * Results: 1211 * Returns the length of str in UniChars (not bytes). 1212 * 1213 * Side effects: 1214 * None. 1215 * 1216 *---------------------------------------------------------------------- 1217 */ 1218 1219int 1220Tcl_UniCharLen( 1221 CONST Tcl_UniChar *uniStr) /* Unicode string to find length of. */ 1222{ 1223 int len = 0; 1224 1225 while (*uniStr != '\0') { 1226 len++; 1227 uniStr++; 1228 } 1229 return len; 1230} 1231 1232/* 1233 *---------------------------------------------------------------------- 1234 * 1235 * Tcl_UniCharNcmp -- 1236 * 1237 * Compare at most numChars unichars of string ucs to string uct. 1238 * Both ucs and uct are assumed to be at least numChars unichars long. 
1239 * 1240 * Results: 1241 * Return <0 if ucs < uct, 0 if ucs == uct, or >0 if ucs > uct. 1242 * 1243 * Side effects: 1244 * None. 1245 * 1246 *---------------------------------------------------------------------- 1247 */ 1248 1249int 1250Tcl_UniCharNcmp( 1251 CONST Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ 1252 CONST Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ 1253 unsigned long numChars) /* Number of unichars to compare. */ 1254{ 1255#ifdef WORDS_BIGENDIAN 1256 /* 1257 * We are definitely on a big-endian machine; memcmp() is safe 1258 */ 1259 1260 return memcmp(ucs, uct, numChars*sizeof(Tcl_UniChar)); 1261 1262#else /* !WORDS_BIGENDIAN */ 1263 /* 1264 * We can't simply call memcmp() because that is not lexically correct. 1265 */ 1266 1267 for ( ; numChars != 0; ucs++, uct++, numChars--) { 1268 if (*ucs != *uct) { 1269 return (*ucs - *uct); 1270 } 1271 } 1272 return 0; 1273#endif /* WORDS_BIGENDIAN */ 1274} 1275 1276/* 1277 *---------------------------------------------------------------------- 1278 * 1279 * Tcl_UniCharNcasecmp -- 1280 * 1281 * Compare at most numChars unichars of string ucs to string uct case 1282 * insensitive. Both ucs and uct are assumed to be at least numChars 1283 * unichars long. 1284 * 1285 * Results: 1286 * Return <0 if ucs < uct, 0 if ucs == uct, or >0 if ucs > uct. 1287 * 1288 * Side effects: 1289 * None. 1290 * 1291 *---------------------------------------------------------------------- 1292 */ 1293 1294int 1295Tcl_UniCharNcasecmp( 1296 CONST Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ 1297 CONST Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ 1298 unsigned long numChars) /* Number of unichars to compare. 
*/ 1299{ 1300 for ( ; numChars != 0; numChars--, ucs++, uct++) { 1301 if (*ucs != *uct) { 1302 Tcl_UniChar lcs = Tcl_UniCharToLower(*ucs); 1303 Tcl_UniChar lct = Tcl_UniCharToLower(*uct); 1304 1305 if (lcs != lct) { 1306 return (lcs - lct); 1307 } 1308 } 1309 } 1310 return 0; 1311} 1312 1313/* 1314 *---------------------------------------------------------------------- 1315 * 1316 * Tcl_UniCharIsAlnum -- 1317 * 1318 * Test if a character is an alphanumeric Unicode character. 1319 * 1320 * Results: 1321 * Returns 1 if character is alphanumeric. 1322 * 1323 * Side effects: 1324 * None. 1325 * 1326 *---------------------------------------------------------------------- 1327 */ 1328 1329int 1330Tcl_UniCharIsAlnum( 1331 int ch) /* Unicode character to test. */ 1332{ 1333 register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); 1334 1335 return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1); 1336} 1337 1338/* 1339 *---------------------------------------------------------------------- 1340 * 1341 * Tcl_UniCharIsAlpha -- 1342 * 1343 * Test if a character is an alphabetic Unicode character. 1344 * 1345 * Results: 1346 * Returns 1 if character is alphabetic. 1347 * 1348 * Side effects: 1349 * None. 1350 * 1351 *---------------------------------------------------------------------- 1352 */ 1353 1354int 1355Tcl_UniCharIsAlpha( 1356 int ch) /* Unicode character to test. */ 1357{ 1358 register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); 1359 return ((ALPHA_BITS >> category) & 1); 1360} 1361 1362/* 1363 *---------------------------------------------------------------------- 1364 * 1365 * Tcl_UniCharIsControl -- 1366 * 1367 * Test if a character is a Unicode control character. 1368 * 1369 * Results: 1370 * Returns non-zero if character is a control. 1371 * 1372 * Side effects: 1373 * None. 
1374 * 1375 *---------------------------------------------------------------------- 1376 */ 1377 1378int 1379Tcl_UniCharIsControl( 1380 int ch) /* Unicode character to test. */ 1381{ 1382 return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL); 1383} 1384 1385/* 1386 *---------------------------------------------------------------------- 1387 * 1388 * Tcl_UniCharIsDigit -- 1389 * 1390 * Test if a character is a numeric Unicode character. 1391 * 1392 * Results: 1393 * Returns non-zero if character is a digit. 1394 * 1395 * Side effects: 1396 * None. 1397 * 1398 *---------------------------------------------------------------------- 1399 */ 1400 1401int 1402Tcl_UniCharIsDigit( 1403 int ch) /* Unicode character to test. */ 1404{ 1405 return (GetUniCharInfo(ch)&UNICODE_CATEGORY_MASK) == DECIMAL_DIGIT_NUMBER; 1406} 1407 1408/* 1409 *---------------------------------------------------------------------- 1410 * 1411 * Tcl_UniCharIsGraph -- 1412 * 1413 * Test if a character is any Unicode print character except space. 1414 * 1415 * Results: 1416 * Returns non-zero if character is printable, but not space. 1417 * 1418 * Side effects: 1419 * None. 1420 * 1421 *---------------------------------------------------------------------- 1422 */ 1423 1424int 1425Tcl_UniCharIsGraph( 1426 int ch) /* Unicode character to test. */ 1427{ 1428 register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); 1429 return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' ')); 1430} 1431 1432/* 1433 *---------------------------------------------------------------------- 1434 * 1435 * Tcl_UniCharIsLower -- 1436 * 1437 * Test if a character is a lowercase Unicode character. 1438 * 1439 * Results: 1440 * Returns non-zero if character is lowercase. 1441 * 1442 * Side effects: 1443 * None. 1444 * 1445 *---------------------------------------------------------------------- 1446 */ 1447 1448int 1449Tcl_UniCharIsLower( 1450 int ch) /* Unicode character to test. 
*/ 1451{ 1452 return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER); 1453} 1454 1455/* 1456 *---------------------------------------------------------------------- 1457 * 1458 * Tcl_UniCharIsPrint -- 1459 * 1460 * Test if a character is a Unicode print character. 1461 * 1462 * Results: 1463 * Returns non-zero if character is printable. 1464 * 1465 * Side effects: 1466 * None. 1467 * 1468 *---------------------------------------------------------------------- 1469 */ 1470 1471int 1472Tcl_UniCharIsPrint( 1473 int ch) /* Unicode character to test. */ 1474{ 1475 register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); 1476 return ((PRINT_BITS >> category) & 1); 1477} 1478 1479/* 1480 *---------------------------------------------------------------------- 1481 * 1482 * Tcl_UniCharIsPunct -- 1483 * 1484 * Test if a character is a Unicode punctuation character. 1485 * 1486 * Results: 1487 * Returns non-zero if character is punct. 1488 * 1489 * Side effects: 1490 * None. 1491 * 1492 *---------------------------------------------------------------------- 1493 */ 1494 1495int 1496Tcl_UniCharIsPunct( 1497 int ch) /* Unicode character to test. */ 1498{ 1499 register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); 1500 return ((PUNCT_BITS >> category) & 1); 1501} 1502 1503/* 1504 *---------------------------------------------------------------------- 1505 * 1506 * Tcl_UniCharIsSpace -- 1507 * 1508 * Test if a character is a whitespace Unicode character. 1509 * 1510 * Results: 1511 * Returns non-zero if character is a space. 1512 * 1513 * Side effects: 1514 * None. 1515 * 1516 *---------------------------------------------------------------------- 1517 */ 1518 1519int 1520Tcl_UniCharIsSpace( 1521 int ch) /* Unicode character to test. */ 1522{ 1523 register int category; 1524 1525 /* 1526 * If the character is within the first 127 characters, just use the 1527 * standard C function, otherwise consult the Unicode table. 
1528 */ 1529 1530 if (ch < 0x80) { 1531 return isspace(UCHAR(ch)); /* INTL: ISO space */ 1532 } else { 1533 category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); 1534 return ((SPACE_BITS >> category) & 1); 1535 } 1536} 1537 1538/* 1539 *---------------------------------------------------------------------- 1540 * 1541 * Tcl_UniCharIsUpper -- 1542 * 1543 * Test if a character is a uppercase Unicode character. 1544 * 1545 * Results: 1546 * Returns non-zero if character is uppercase. 1547 * 1548 * Side effects: 1549 * None. 1550 * 1551 *---------------------------------------------------------------------- 1552 */ 1553 1554int 1555Tcl_UniCharIsUpper( 1556 int ch) /* Unicode character to test. */ 1557{ 1558 return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER); 1559} 1560 1561/* 1562 *---------------------------------------------------------------------- 1563 * 1564 * Tcl_UniCharIsWordChar -- 1565 * 1566 * Test if a character is alphanumeric or a connector punctuation mark. 1567 * 1568 * Results: 1569 * Returns 1 if character is a word character. 1570 * 1571 * Side effects: 1572 * None. 1573 * 1574 *---------------------------------------------------------------------- 1575 */ 1576 1577int 1578Tcl_UniCharIsWordChar( 1579 int ch) /* Unicode character to test. */ 1580{ 1581 register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); 1582 1583 return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1); 1584} 1585 1586/* 1587 *---------------------------------------------------------------------- 1588 * 1589 * Tcl_UniCharCaseMatch -- 1590 * 1591 * See if a particular Unicode string matches a particular pattern. 1592 * Allows case insensitivity. This is the Unicode equivalent of the char* 1593 * Tcl_StringCaseMatch. The UniChar strings must be NULL-terminated. 1594 * This has no provision for counted UniChar strings, thus should not be 1595 * used where NULLs are expected in the UniChar string. 
Use 1596 * TclUniCharMatch where possible. 1597 * 1598 * Results: 1599 * The return value is 1 if string matches pattern, and 0 otherwise. The 1600 * matching operation permits the following special characters in the 1601 * pattern: *?\[] (see the manual entry for details on what these mean). 1602 * 1603 * Side effects: 1604 * None. 1605 * 1606 *---------------------------------------------------------------------- 1607 */ 1608 1609int 1610Tcl_UniCharCaseMatch( 1611 CONST Tcl_UniChar *uniStr, /* Unicode String. */ 1612 CONST Tcl_UniChar *uniPattern, 1613 /* Pattern, which may contain special 1614 * characters. */ 1615 int nocase) /* 0 for case sensitive, 1 for insensitive */ 1616{ 1617 Tcl_UniChar ch1, p; 1618 1619 while (1) { 1620 p = *uniPattern; 1621 1622 /* 1623 * See if we're at the end of both the pattern and the string. If so, 1624 * we succeeded. If we're at the end of the pattern but not at the end 1625 * of the string, we failed. 1626 */ 1627 1628 if (p == 0) { 1629 return (*uniStr == 0); 1630 } 1631 if ((*uniStr == 0) && (p != '*')) { 1632 return 0; 1633 } 1634 1635 /* 1636 * Check for a "*" as the next pattern character. It matches any 1637 * substring. We handle this by skipping all the characters up to the 1638 * next matching one in the pattern, and then calling ourselves 1639 * recursively for each postfix of string, until either we match or we 1640 * reach the end of the string. 
1641 */ 1642 1643 if (p == '*') { 1644 /* 1645 * Skip all successive *'s in the pattern 1646 */ 1647 1648 while (*(++uniPattern) == '*') { 1649 /* empty body */ 1650 } 1651 p = *uniPattern; 1652 if (p == 0) { 1653 return 1; 1654 } 1655 if (nocase) { 1656 p = Tcl_UniCharToLower(p); 1657 } 1658 while (1) { 1659 /* 1660 * Optimization for matching - cruise through the string 1661 * quickly if the next char in the pattern isn't a special 1662 * character 1663 */ 1664 1665 if ((p != '[') && (p != '?') && (p != '\\')) { 1666 if (nocase) { 1667 while (*uniStr && (p != *uniStr) 1668 && (p != Tcl_UniCharToLower(*uniStr))) { 1669 uniStr++; 1670 } 1671 } else { 1672 while (*uniStr && (p != *uniStr)) { 1673 uniStr++; 1674 } 1675 } 1676 } 1677 if (Tcl_UniCharCaseMatch(uniStr, uniPattern, nocase)) { 1678 return 1; 1679 } 1680 if (*uniStr == 0) { 1681 return 0; 1682 } 1683 uniStr++; 1684 } 1685 } 1686 1687 /* 1688 * Check for a "?" as the next pattern character. It matches any 1689 * single character. 1690 */ 1691 1692 if (p == '?') { 1693 uniPattern++; 1694 uniStr++; 1695 continue; 1696 } 1697 1698 /* 1699 * Check for a "[" as the next pattern character. It is followed by a 1700 * list of characters that are acceptable, or by a range (two 1701 * characters separated by "-"). 1702 */ 1703 1704 if (p == '[') { 1705 Tcl_UniChar startChar, endChar; 1706 1707 uniPattern++; 1708 ch1 = (nocase ? Tcl_UniCharToLower(*uniStr) : *uniStr); 1709 uniStr++; 1710 while (1) { 1711 if ((*uniPattern == ']') || (*uniPattern == 0)) { 1712 return 0; 1713 } 1714 startChar = (nocase ? Tcl_UniCharToLower(*uniPattern) 1715 : *uniPattern); 1716 uniPattern++; 1717 if (*uniPattern == '-') { 1718 uniPattern++; 1719 if (*uniPattern == 0) { 1720 return 0; 1721 } 1722 endChar = (nocase ? Tcl_UniCharToLower(*uniPattern) 1723 : *uniPattern); 1724 uniPattern++; 1725 if (((startChar <= ch1) && (ch1 <= endChar)) 1726 || ((endChar <= ch1) && (ch1 <= startChar))) { 1727 /* 1728 * Matches ranges of form [a-z] or [z-a]. 
1729 */ 1730 break; 1731 } 1732 } else if (startChar == ch1) { 1733 break; 1734 } 1735 } 1736 while (*uniPattern != ']') { 1737 if (*uniPattern == 0) { 1738 uniPattern--; 1739 break; 1740 } 1741 uniPattern++; 1742 } 1743 uniPattern++; 1744 continue; 1745 } 1746 1747 /* 1748 * If the next pattern character is '\', just strip off the '\' so we 1749 * do exact matching on the character that follows. 1750 */ 1751 1752 if (p == '\\') { 1753 if (*(++uniPattern) == '\0') { 1754 return 0; 1755 } 1756 } 1757 1758 /* 1759 * There's no special character. Just make sure that the next bytes of 1760 * each string match. 1761 */ 1762 1763 if (nocase) { 1764 if (Tcl_UniCharToLower(*uniStr) != 1765 Tcl_UniCharToLower(*uniPattern)) { 1766 return 0; 1767 } 1768 } else if (*uniStr != *uniPattern) { 1769 return 0; 1770 } 1771 uniStr++; 1772 uniPattern++; 1773 } 1774} 1775 1776/* 1777 *---------------------------------------------------------------------- 1778 * 1779 * TclUniCharMatch -- 1780 * 1781 * See if a particular Unicode string matches a particular pattern. 1782 * Allows case insensitivity. This is the Unicode equivalent of the char* 1783 * Tcl_StringCaseMatch. This variant of Tcl_UniCharCaseMatch uses counted 1784 * Strings, so embedded NULLs are allowed. 1785 * 1786 * Results: 1787 * The return value is 1 if string matches pattern, and 0 otherwise. The 1788 * matching operation permits the following special characters in the 1789 * pattern: *?\[] (see the manual entry for details on what these mean). 1790 * 1791 * Side effects: 1792 * None. 1793 * 1794 *---------------------------------------------------------------------- 1795 */ 1796 1797int 1798TclUniCharMatch( 1799 CONST Tcl_UniChar *string, /* Unicode String. */ 1800 int strLen, /* Length of String */ 1801 CONST Tcl_UniChar *pattern, /* Pattern, which may contain special 1802 * characters. 
*/ 1803 int ptnLen, /* Length of Pattern */ 1804 int nocase) /* 0 for case sensitive, 1 for insensitive */ 1805{ 1806 CONST Tcl_UniChar *stringEnd, *patternEnd; 1807 Tcl_UniChar p; 1808 1809 stringEnd = string + strLen; 1810 patternEnd = pattern + ptnLen; 1811 1812 while (1) { 1813 /* 1814 * See if we're at the end of both the pattern and the string. If so, 1815 * we succeeded. If we're at the end of the pattern but not at the end 1816 * of the string, we failed. 1817 */ 1818 1819 if (pattern == patternEnd) { 1820 return (string == stringEnd); 1821 } 1822 p = *pattern; 1823 if ((string == stringEnd) && (p != '*')) { 1824 return 0; 1825 } 1826 1827 /* 1828 * Check for a "*" as the next pattern character. It matches any 1829 * substring. We handle this by skipping all the characters up to the 1830 * next matching one in the pattern, and then calling ourselves 1831 * recursively for each postfix of string, until either we match or we 1832 * reach the end of the string. 1833 */ 1834 1835 if (p == '*') { 1836 /* 1837 * Skip all successive *'s in the pattern. 1838 */ 1839 1840 while (*(++pattern) == '*') { 1841 /* empty body */ 1842 } 1843 if (pattern == patternEnd) { 1844 return 1; 1845 } 1846 p = *pattern; 1847 if (nocase) { 1848 p = Tcl_UniCharToLower(p); 1849 } 1850 while (1) { 1851 /* 1852 * Optimization for matching - cruise through the string 1853 * quickly if the next char in the pattern isn't a special 1854 * character. 1855 */ 1856 1857 if ((p != '[') && (p != '?') && (p != '\\')) { 1858 if (nocase) { 1859 while ((string < stringEnd) && (p != *string) 1860 && (p != Tcl_UniCharToLower(*string))) { 1861 string++; 1862 } 1863 } else { 1864 while ((string < stringEnd) && (p != *string)) { 1865 string++; 1866 } 1867 } 1868 } 1869 if (TclUniCharMatch(string, stringEnd - string, 1870 pattern, patternEnd - pattern, nocase)) { 1871 return 1; 1872 } 1873 if (string == stringEnd) { 1874 return 0; 1875 } 1876 string++; 1877 } 1878 } 1879 1880 /* 1881 * Check for a "?" 
as the next pattern character. It matches any 1882 * single character. 1883 */ 1884 1885 if (p == '?') { 1886 pattern++; 1887 string++; 1888 continue; 1889 } 1890 1891 /* 1892 * Check for a "[" as the next pattern character. It is followed by a 1893 * list of characters that are acceptable, or by a range (two 1894 * characters separated by "-"). 1895 */ 1896 1897 if (p == '[') { 1898 Tcl_UniChar ch1, startChar, endChar; 1899 1900 pattern++; 1901 ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string); 1902 string++; 1903 while (1) { 1904 if ((*pattern == ']') || (pattern == patternEnd)) { 1905 return 0; 1906 } 1907 startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern); 1908 pattern++; 1909 if (*pattern == '-') { 1910 pattern++; 1911 if (pattern == patternEnd) { 1912 return 0; 1913 } 1914 endChar = (nocase ? Tcl_UniCharToLower(*pattern) 1915 : *pattern); 1916 pattern++; 1917 if (((startChar <= ch1) && (ch1 <= endChar)) 1918 || ((endChar <= ch1) && (ch1 <= startChar))) { 1919 /* 1920 * Matches ranges of form [a-z] or [z-a]. 1921 */ 1922 break; 1923 } 1924 } else if (startChar == ch1) { 1925 break; 1926 } 1927 } 1928 while (*pattern != ']') { 1929 if (pattern == patternEnd) { 1930 pattern--; 1931 break; 1932 } 1933 pattern++; 1934 } 1935 pattern++; 1936 continue; 1937 } 1938 1939 /* 1940 * If the next pattern character is '\', just strip off the '\' so we 1941 * do exact matching on the character that follows. 1942 */ 1943 1944 if (p == '\\') { 1945 if (++pattern == patternEnd) { 1946 return 0; 1947 } 1948 } 1949 1950 /* 1951 * There's no special character. Just make sure that the next bytes of 1952 * each string match. 
1953 */ 1954 1955 if (nocase) { 1956 if (Tcl_UniCharToLower(*string) != Tcl_UniCharToLower(*pattern)) { 1957 return 0; 1958 } 1959 } else if (*string != *pattern) { 1960 return 0; 1961 } 1962 string++; 1963 pattern++; 1964 } 1965} 1966 1967/* 1968 * Local Variables: 1969 * mode: c 1970 * c-basic-offset: 4 1971 * fill-column: 78 1972 * End: 1973 */ 1974