1/* 2 * tclEncoding.c -- 3 * 4 * Contains the implementation of the encoding conversion package. 5 * 6 * Copyright (c) 1996-1998 Sun Microsystems, Inc. 7 * 8 * See the file "license.terms" for information on usage and redistribution of 9 * this file, and for a DISCLAIMER OF ALL WARRANTIES. 10 * 11 * RCS: @(#) $Id: tclEncoding.c,v 1.59.2.2 2009/11/16 12:14:45 ferrieux Exp $ 12 */ 13 14#include "tclInt.h" 15 16typedef size_t (LengthProc)(CONST char *src); 17 18/* 19 * The following data structure represents an encoding, which describes how to 20 * convert between various character sets and UTF-8. 21 */ 22 23typedef struct Encoding { 24 char *name; /* Name of encoding. Malloced because (1) hash 25 * table entry that owns this encoding may be 26 * freed prior to this encoding being freed, 27 * (2) string passed in the Tcl_EncodingType 28 * structure may not be persistent. */ 29 Tcl_EncodingConvertProc *toUtfProc; 30 /* Function to convert from external encoding 31 * into UTF-8. */ 32 Tcl_EncodingConvertProc *fromUtfProc; 33 /* Function to convert from UTF-8 into 34 * external encoding. */ 35 Tcl_EncodingFreeProc *freeProc; 36 /* If non-NULL, function to call when this 37 * encoding is deleted. */ 38 int nullSize; /* Number of 0x00 bytes that signify 39 * end-of-string in this encoding. This number 40 * is used to determine the source string 41 * length when the srcLen argument is 42 * negative. This number can be 1 or 2. */ 43 ClientData clientData; /* Arbitrary value associated with encoding 44 * type. Passed to conversion functions. */ 45 LengthProc *lengthProc; /* Function to compute length of 46 * null-terminated strings in this encoding. 47 * If nullSize is 1, this is strlen; if 48 * nullSize is 2, this is a function that 49 * returns the number of bytes in a 0x0000 50 * terminated string. */ 51 int refCount; /* Number of uses of this structure. */ 52 Tcl_HashEntry *hPtr; /* Hash table entry that owns this encoding. */ 53} Encoding; 54 55/* 56 * The following structure is the clientData for a dynamically-loaded, 57 * table-driven encoding created by LoadTableEncoding(). It maps between 58 * Unicode and a single-byte, double-byte, or multibyte (1 or 2 bytes only) 59 * encoding. 60 */ 61 62typedef struct TableEncodingData { 63 int fallback; /* Character (in this encoding) to substitute 64 * when this encoding cannot represent a UTF-8 65 * character. */ 66 char prefixBytes[256]; /* If a byte in the input stream is a lead 67 * byte for a 2-byte sequence, the 68 * corresponding entry in this array is 1, 69 * otherwise it is 0. */ 70 unsigned short **toUnicode; /* Two dimensional sparse matrix to map 71 * characters from the encoding to Unicode. 72 * Each element of the toUnicode array points 73 * to an array of 256 shorts. If there is no 74 * corresponding character in Unicode, the 75 * value in the matrix is 0x0000. 76 * malloc'd. */ 77 unsigned short **fromUnicode; 78 /* Two dimensional sparse matrix to map 79 * characters from Unicode to the encoding. 80 * Each element of the fromUnicode array 81 * points to an array of 256 shorts. If there 82 * is no corresponding character the encoding, 83 * the value in the matrix is 0x0000. 84 * malloc'd. */ 85} TableEncodingData; 86 87/* 88 * The following structures is the clientData for a dynamically-loaded, 89 * escape-driven encoding that is itself comprised of other simpler encodings. 90 * An example is "iso-2022-jp", which uses escape sequences to switch between 91 * ascii, jis0208, jis0212, gb2312, and ksc5601. Note that "escape-driven" 92 * does not necessarily mean that the ESCAPE character is the character used 93 * for switching character sets. 94 */ 95 96typedef struct EscapeSubTable { 97 unsigned int sequenceLen; /* Length of following string. */ 98 char sequence[16]; /* Escape code that marks this encoding. */ 99 char name[32]; /* Name for encoding. */ 100 Encoding *encodingPtr; /* Encoding loaded using above name, or NULL 101 * if this sub-encoding has not been needed 102 * yet. */ 103} EscapeSubTable; 104 105typedef struct EscapeEncodingData { 106 int fallback; /* Character (in this encoding) to substitute 107 * when this encoding cannot represent a UTF-8 108 * character. */ 109 unsigned int initLen; /* Length of following string. */ 110 char init[16]; /* String to emit or expect before first char 111 * in conversion. */ 112 unsigned int finalLen; /* Length of following string. */ 113 char final[16]; /* String to emit or expect after last char in 114 * conversion. */ 115 char prefixBytes[256]; /* If a byte in the input stream is the first 116 * character of one of the escape sequences in 117 * the following array, the corresponding 118 * entry in this array is 1, otherwise it is 119 * 0. */ 120 int numSubTables; /* Length of following array. */ 121 EscapeSubTable subTables[1];/* Information about each EscapeSubTable used 122 * by this encoding type. The actual size will 123 * be as large as necessary to hold all 124 * EscapeSubTables. */ 125} EscapeEncodingData; 126 127/* 128 * Constants used when loading an encoding file to identify the type of the 129 * file. 130 */ 131 132#define ENCODING_SINGLEBYTE 0 133#define ENCODING_DOUBLEBYTE 1 134#define ENCODING_MULTIBYTE 2 135#define ENCODING_ESCAPE 3 136 137/* 138 * A list of directories in which Tcl should look for *.enc files. This list 139 * is shared by all threads. Access is governed by a mutex lock. 140 */ 141 142static TclInitProcessGlobalValueProc InitializeEncodingSearchPath; 143static ProcessGlobalValue encodingSearchPath = { 144 0, 0, NULL, NULL, InitializeEncodingSearchPath, NULL, NULL 145}; 146 147/* 148 * A map from encoding names to the directories in which their data files have 149 * been seen. The string value of the map is shared by all threads. Access to 150 * the shared string is governed by a mutex lock. 151 */ 152 153static ProcessGlobalValue encodingFileMap = { 154 0, 0, NULL, NULL, NULL, NULL, NULL 155}; 156 157/* 158 * A list of directories making up the "library path". Historically this 159 * search path has served many uses, but the only one remaining is a base for 160 * the encodingSearchPath above. If the application does not explicitly set 161 * the encodingSearchPath, then it will be initialized by appending /encoding 162 * to each directory in this "libraryPath". 163 */ 164 165static ProcessGlobalValue libraryPath = { 166 0, 0, NULL, NULL, TclpInitLibraryPath, NULL, NULL 167}; 168 169static int encodingsInitialized = 0; 170 171/* 172 * Hash table that keeps track of all loaded Encodings. Keys are the string 173 * names that represent the encoding, values are (Encoding *). 174 */ 175 176static Tcl_HashTable encodingTable; 177TCL_DECLARE_MUTEX(encodingMutex) 178 179/* 180 * The following are used to hold the default and current system encodings. 181 * If NULL is passed to one of the conversion routines, the current setting of 182 * the system encoding will be used to perform the conversion. 183 */ 184 185static Tcl_Encoding defaultEncoding; 186static Tcl_Encoding systemEncoding; 187 188/* 189 * The following variable is used in the sparse matrix code for a 190 * TableEncoding to represent a page in the table that has no entries. 191 */ 192 193static unsigned short emptyPage[256]; 194 195/* 196 * Functions used only in this module. 197 */ 198 199static int BinaryProc(ClientData clientData, 200 CONST char *src, int srcLen, int flags, 201 Tcl_EncodingState *statePtr, char *dst, int dstLen, 202 int *srcReadPtr, int *dstWrotePtr, 203 int *dstCharsPtr); 204static void DupEncodingIntRep(Tcl_Obj *srcPtr, Tcl_Obj *dupPtr); 205static void EscapeFreeProc(ClientData clientData); 206static int EscapeFromUtfProc(ClientData clientData, 207 CONST char *src, int srcLen, int flags, 208 Tcl_EncodingState *statePtr, char *dst, int dstLen, 209 int *srcReadPtr, int *dstWrotePtr, 210 int *dstCharsPtr); 211static int EscapeToUtfProc(ClientData clientData, 212 CONST char *src, int srcLen, int flags, 213 Tcl_EncodingState *statePtr, char *dst, int dstLen, 214 int *srcReadPtr, int *dstWrotePtr, 215 int *dstCharsPtr); 216static void FillEncodingFileMap(void); 217static void FreeEncoding(Tcl_Encoding encoding); 218static void FreeEncodingIntRep(Tcl_Obj *objPtr); 219static Encoding * GetTableEncoding(EscapeEncodingData *dataPtr, 220 int state); 221static Tcl_Encoding LoadEncodingFile(Tcl_Interp *interp, CONST char *name); 222static Tcl_Encoding LoadTableEncoding(CONST char *name, int type, 223 Tcl_Channel chan); 224static Tcl_Encoding LoadEscapeEncoding(CONST char *name, Tcl_Channel chan); 225static Tcl_Channel OpenEncodingFileChannel(Tcl_Interp *interp, 226 CONST char *name); 227static void TableFreeProc(ClientData clientData); 228static int TableFromUtfProc(ClientData clientData, 229 CONST char *src, int srcLen, int flags, 230 Tcl_EncodingState *statePtr, char *dst, int dstLen, 231 int *srcReadPtr, int *dstWrotePtr, 232 int *dstCharsPtr); 233static int TableToUtfProc(ClientData clientData, CONST char *src, 234 int srcLen, int flags, Tcl_EncodingState *statePtr, 235 char *dst, int dstLen, int *srcReadPtr, 236 int *dstWrotePtr, int *dstCharsPtr); 237static size_t unilen(CONST char *src); 238static int UnicodeToUtfProc(ClientData clientData, 239 CONST char *src, int srcLen, int flags, 240 Tcl_EncodingState *statePtr, char *dst, int dstLen, 241 int *srcReadPtr, int *dstWrotePtr, 242 int *dstCharsPtr); 243static int UtfToUnicodeProc(ClientData clientData, 244 CONST char *src, int srcLen, int flags, 245 Tcl_EncodingState *statePtr, char *dst, int dstLen, 246 int *srcReadPtr, int *dstWrotePtr, 247 int *dstCharsPtr); 248static int UtfToUtfProc(ClientData clientData, 249 CONST char *src, int srcLen, int flags, 250 Tcl_EncodingState *statePtr, char *dst, int dstLen, 251 int *srcReadPtr, int *dstWrotePtr, 252 int *dstCharsPtr, int pureNullMode); 253static int UtfIntToUtfExtProc(ClientData clientData, 254 CONST char *src, int srcLen, int flags, 255 Tcl_EncodingState *statePtr, char *dst, int dstLen, 256 int *srcReadPtr, int *dstWrotePtr, 257 int *dstCharsPtr); 258static int UtfExtToUtfIntProc(ClientData clientData, 259 CONST char *src, int srcLen, int flags, 260 Tcl_EncodingState *statePtr, char *dst, int dstLen, 261 int *srcReadPtr, int *dstWrotePtr, 262 int *dstCharsPtr); 263static int Iso88591FromUtfProc(ClientData clientData, 264 CONST char *src, int srcLen, int flags, 265 Tcl_EncodingState *statePtr, char *dst, int dstLen, 266 int *srcReadPtr, int *dstWrotePtr, 267 int *dstCharsPtr); 268static int Iso88591ToUtfProc(ClientData clientData, 269 CONST char *src, int srcLen, int flags, 270 Tcl_EncodingState *statePtr, char *dst, 271 int dstLen, int *srcReadPtr, int *dstWrotePtr, 272 int *dstCharsPtr); 273 274/* 275 * A Tcl_ObjType for holding a cached Tcl_Encoding in the otherValuePtr field 276 * of the intrep. This should help the lifetime of encodings be more useful. 277 * See concerns raised in [Bug 1077262]. 278 */ 279 280static Tcl_ObjType encodingType = { 281 "encoding", FreeEncodingIntRep, DupEncodingIntRep, NULL, NULL 282}; 283 284/* 285 *---------------------------------------------------------------------- 286 * 287 * Tcl_GetEncodingFromObj -- 288 * 289 * Writes to (*encodingPtr) the Tcl_Encoding value of (*objPtr), if 290 * possible, and returns TCL_OK. If no such encoding exists, TCL_ERROR is 291 * returned, and if interp is non-NULL, an error message is written 292 * there. 293 * 294 * Results: 295 * Standard Tcl return code. 296 * 297 * Side effects: 298 * Caches the Tcl_Encoding value as the internal rep of (*objPtr). 299 * 300 *---------------------------------------------------------------------- 301 */ 302 303int 304Tcl_GetEncodingFromObj( 305 Tcl_Interp *interp, 306 Tcl_Obj *objPtr, 307 Tcl_Encoding *encodingPtr) 308{ 309 CONST char *name = Tcl_GetString(objPtr); 310 if (objPtr->typePtr != &encodingType) { 311 Tcl_Encoding encoding = Tcl_GetEncoding(interp, name); 312 313 if (encoding == NULL) { 314 return TCL_ERROR; 315 } 316 TclFreeIntRep(objPtr); 317 objPtr->internalRep.otherValuePtr = (VOID *) encoding; 318 objPtr->typePtr = &encodingType; 319 } 320 *encodingPtr = Tcl_GetEncoding(NULL, name); 321 return TCL_OK; 322} 323 324/* 325 *---------------------------------------------------------------------- 326 * 327 * FreeEncodingIntRep -- 328 * 329 * The Tcl_FreeInternalRepProc for the "encoding" Tcl_ObjType. 330 * 331 *---------------------------------------------------------------------- 332 */ 333 334static void 335FreeEncodingIntRep( 336 Tcl_Obj *objPtr) 337{ 338 Tcl_FreeEncoding((Tcl_Encoding) objPtr->internalRep.otherValuePtr); 339} 340 341/* 342 *---------------------------------------------------------------------- 343 * 344 * DupEncodingIntRep -- 345 * 346 * The Tcl_DupInternalRepProc for the "encoding" Tcl_ObjType. 347 * 348 *---------------------------------------------------------------------- 349 */ 350 351static void 352DupEncodingIntRep( 353 Tcl_Obj *srcPtr, 354 Tcl_Obj *dupPtr) 355{ 356 dupPtr->internalRep.otherValuePtr = (VOID *) 357 Tcl_GetEncoding(NULL, srcPtr->bytes); 358} 359 360/* 361 *---------------------------------------------------------------------- 362 * 363 * Tcl_GetEncodingSearchPath -- 364 * 365 * Keeps the per-thread copy of the encoding search path current with 366 * changes to the global copy. 367 * 368 * Results: 369 * Returns a "list" (Tcl_Obj *) that contains the encoding search path. 370 * 371 *---------------------------------------------------------------------- 372 */ 373 374Tcl_Obj * 375Tcl_GetEncodingSearchPath(void) 376{ 377 return TclGetProcessGlobalValue(&encodingSearchPath); 378} 379 380/* 381 *---------------------------------------------------------------------- 382 * 383 * Tcl_SetEncodingSearchPath -- 384 * 385 * Keeps the per-thread copy of the encoding search path current with 386 * changes to the global copy. 387 * 388 *---------------------------------------------------------------------- 389 */ 390 391int 392Tcl_SetEncodingSearchPath( 393 Tcl_Obj *searchPath) 394{ 395 int dummy; 396 397 if (TCL_ERROR == Tcl_ListObjLength(NULL, searchPath, &dummy)) { 398 return TCL_ERROR; 399 } 400 TclSetProcessGlobalValue(&encodingSearchPath, searchPath, NULL); 401 return TCL_OK; 402} 403 404/* 405 *---------------------------------------------------------------------- 406 * 407 * TclGetLibraryPath -- 408 * 409 * Keeps the per-thread copy of the library path current with changes to 410 * the global copy. 411 * 412 * Results: 413 * Returns a "list" (Tcl_Obj *) that contains the library path. 414 * 415 *---------------------------------------------------------------------- 416 */ 417 418Tcl_Obj * 419TclGetLibraryPath(void) 420{ 421 return TclGetProcessGlobalValue(&libraryPath); 422} 423 424/* 425 *---------------------------------------------------------------------- 426 * 427 * TclSetLibraryPath -- 428 * 429 * Keeps the per-thread copy of the library path current with changes to 430 * the global copy. 431 * 432 * NOTE: this routine returns void, so there's no way to report the error 433 * that searchPath is not a valid list. In that case, this routine will 434 * silently do nothing. 435 * 436 *---------------------------------------------------------------------- 437 */ 438 439void 440TclSetLibraryPath( 441 Tcl_Obj *path) 442{ 443 int dummy; 444 445 if (TCL_ERROR == Tcl_ListObjLength(NULL, path, &dummy)) { 446 return; 447 } 448 TclSetProcessGlobalValue(&libraryPath, path, NULL); 449} 450 451/* 452 *--------------------------------------------------------------------------- 453 * 454 * FillEncodingFileMap -- 455 * 456 * Called to bring the encoding file map in sync with the current value 457 * of the encoding search path. 458 * 459 * Scan the directories on the encoding search path, find the *.enc 460 * files, and store the found pathnames in a map associated with the 461 * encoding name. 462 * 463 * In particular, if $dir is on the encoding search path, and the file 464 * $dir/foo.enc is found, then store a "foo" -> $dir entry in the map. 465 * Later, any need for the "foo" encoding will quickly * be able to 466 * construct the $dir/foo.enc pathname for reading the encoding data. 467 * 468 * Results: 469 * None. 470 * 471 * Side effects: 472 * Entries are added to the encoding file map. 473 * 474 *--------------------------------------------------------------------------- 475 */ 476 477static void 478FillEncodingFileMap(void) 479{ 480 int i, numDirs = 0; 481 Tcl_Obj *map, *searchPath; 482 483 searchPath = Tcl_GetEncodingSearchPath(); 484 Tcl_IncrRefCount(searchPath); 485 Tcl_ListObjLength(NULL, searchPath, &numDirs); 486 map = Tcl_NewDictObj(); 487 Tcl_IncrRefCount(map); 488 489 for (i = numDirs-1; i >= 0; i--) { 490 /* 491 * Iterate backwards through the search path so as we overwrite 492 * entries found, we favor files earlier on the search path. 493 */ 494 495 int j, numFiles; 496 Tcl_Obj *directory, *matchFileList = Tcl_NewObj(); 497 Tcl_Obj **filev; 498 Tcl_GlobTypeData readableFiles = { 499 TCL_GLOB_TYPE_FILE, TCL_GLOB_PERM_R, NULL, NULL 500 }; 501 502 Tcl_ListObjIndex(NULL, searchPath, i, &directory); 503 Tcl_IncrRefCount(directory); 504 Tcl_IncrRefCount(matchFileList); 505 Tcl_FSMatchInDirectory(NULL, matchFileList, directory, "*.enc", 506 &readableFiles); 507 508 Tcl_ListObjGetElements(NULL, matchFileList, &numFiles, &filev); 509 for (j=0; j<numFiles; j++) { 510 Tcl_Obj *encodingName, *file; 511 512 file = TclPathPart(NULL, filev[j], TCL_PATH_TAIL); 513 encodingName = TclPathPart(NULL, file, TCL_PATH_ROOT); 514 Tcl_DictObjPut(NULL, map, encodingName, directory); 515 Tcl_DecrRefCount(file); 516 Tcl_DecrRefCount(encodingName); 517 } 518 Tcl_DecrRefCount(matchFileList); 519 Tcl_DecrRefCount(directory); 520 } 521 Tcl_DecrRefCount(searchPath); 522 TclSetProcessGlobalValue(&encodingFileMap, map, NULL); 523 Tcl_DecrRefCount(map); 524} 525 526/* 527 *--------------------------------------------------------------------------- 528 * 529 * TclInitEncodingSubsystem -- 530 * 531 * Initialize all resources used by this subsystem on a per-process 532 * basis. 533 * 534 * Results: 535 * None. 536 * 537 * Side effects: 538 * Depends on the memory, object, and IO subsystems. 539 * 540 *--------------------------------------------------------------------------- 541 */ 542 543void 544TclInitEncodingSubsystem(void) 545{ 546 Tcl_EncodingType type; 547 548 if (encodingsInitialized) { 549 return; 550 } 551 552 Tcl_MutexLock(&encodingMutex); 553 Tcl_InitHashTable(&encodingTable, TCL_STRING_KEYS); 554 Tcl_MutexUnlock(&encodingMutex); 555 556 /* 557 * Create a few initial encodings. Note that the UTF-8 to UTF-8 558 * translation is not a no-op, because it will turn a stream of improperly 559 * formed UTF-8 into a properly formed stream. 560 */ 561 562 type.encodingName = "identity"; 563 type.toUtfProc = BinaryProc; 564 type.fromUtfProc = BinaryProc; 565 type.freeProc = NULL; 566 type.nullSize = 1; 567 type.clientData = NULL; 568 569 defaultEncoding = Tcl_CreateEncoding(&type); 570 systemEncoding = Tcl_GetEncoding(NULL, type.encodingName); 571 572 type.encodingName = "utf-8"; 573 type.toUtfProc = UtfExtToUtfIntProc; 574 type.fromUtfProc = UtfIntToUtfExtProc; 575 type.freeProc = NULL; 576 type.nullSize = 1; 577 type.clientData = NULL; 578 Tcl_CreateEncoding(&type); 579 580 type.encodingName = "unicode"; 581 type.toUtfProc = UnicodeToUtfProc; 582 type.fromUtfProc = UtfToUnicodeProc; 583 type.freeProc = NULL; 584 type.nullSize = 2; 585 type.clientData = NULL; 586 Tcl_CreateEncoding(&type); 587 588 /* 589 * Need the iso8859-1 encoding in order to process binary data, so force 590 * it to always be embedded. Note that this encoding *must* be a proper 591 * table encoding or some of the escape encodings crash! Hence the ugly 592 * code to duplicate the structure of a table encoding here. 593 */ 594 595 { 596 TableEncodingData *dataPtr = (TableEncodingData *) 597 ckalloc(sizeof(TableEncodingData)); 598 unsigned size; 599 unsigned short i; 600 601 memset(dataPtr, 0, sizeof(TableEncodingData)); 602 dataPtr->fallback = '?'; 603 604 size = 256*(sizeof(unsigned short *) + sizeof(unsigned short)); 605 dataPtr->toUnicode = (unsigned short **) ckalloc(size); 606 memset(dataPtr->toUnicode, 0, size); 607 dataPtr->fromUnicode = (unsigned short **) ckalloc(size); 608 memset(dataPtr->fromUnicode, 0, size); 609 610 dataPtr->toUnicode[0] = (unsigned short *) (dataPtr->toUnicode + 256); 611 dataPtr->fromUnicode[0] = (unsigned short *) 612 (dataPtr->fromUnicode + 256); 613 for (i=1 ; i<256 ; i++) { 614 dataPtr->toUnicode[i] = emptyPage; 615 dataPtr->fromUnicode[i] = emptyPage; 616 } 617 618 for (i=0 ; i<256 ; i++) { 619 dataPtr->toUnicode[0][i] = i; 620 dataPtr->fromUnicode[0][i] = i; 621 } 622 623 type.encodingName = "iso8859-1"; 624 type.toUtfProc = Iso88591ToUtfProc; 625 type.fromUtfProc = Iso88591FromUtfProc; 626 type.freeProc = TableFreeProc; 627 type.nullSize = 1; 628 type.clientData = dataPtr; 629 Tcl_CreateEncoding(&type); 630 } 631 632 encodingsInitialized = 1; 633} 634 635/* 636 *---------------------------------------------------------------------- 637 * 638 * TclFinalizeEncodingSubsystem -- 639 * 640 * Release the state associated with the encoding subsystem. 641 * 642 * Results: 643 * None. 644 * 645 * Side effects: 646 * Frees all of the encodings. 647 * 648 *---------------------------------------------------------------------- 649 */ 650 651void 652TclFinalizeEncodingSubsystem(void) 653{ 654 Tcl_HashSearch search; 655 Tcl_HashEntry *hPtr; 656 657 Tcl_MutexLock(&encodingMutex); 658 encodingsInitialized = 0; 659 FreeEncoding(systemEncoding); 660 661 hPtr = Tcl_FirstHashEntry(&encodingTable, &search); 662 while (hPtr != NULL) { 663 /* 664 * Call FreeEncoding instead of doing it directly to handle refcounts 665 * like escape encodings use. [Bug 524674] Make sure to call 666 * Tcl_FirstHashEntry repeatedly so that all encodings are eventually 667 * cleaned up. 668 */ 669 670 FreeEncoding((Tcl_Encoding) Tcl_GetHashValue(hPtr)); 671 hPtr = Tcl_FirstHashEntry(&encodingTable, &search); 672 } 673 674 Tcl_DeleteHashTable(&encodingTable); 675 Tcl_MutexUnlock(&encodingMutex); 676} 677 678/* 679 *------------------------------------------------------------------------- 680 * 681 * Tcl_GetDefaultEncodingDir -- 682 * 683 * Legacy public interface to retrieve first directory in the encoding 684 * searchPath. 685 * 686 * Results: 687 * The directory pathname, as a string, or NULL for an empty encoding 688 * search path. 689 * 690 * Side effects: 691 * None. 692 * 693 *------------------------------------------------------------------------- 694 */ 695 696CONST char * 697Tcl_GetDefaultEncodingDir(void) 698{ 699 int numDirs; 700 Tcl_Obj *first, *searchPath = Tcl_GetEncodingSearchPath(); 701 702 Tcl_ListObjLength(NULL, searchPath, &numDirs); 703 if (numDirs == 0) { 704 return NULL; 705 } 706 Tcl_ListObjIndex(NULL, searchPath, 0, &first); 707 708 return Tcl_GetString(first); 709} 710 711/* 712 *------------------------------------------------------------------------- 713 * 714 * Tcl_SetDefaultEncodingDir -- 715 * 716 * Legacy public interface to set the first directory in the encoding 717 * search path. 718 * 719 * Results: 720 * None. 721 * 722 * Side effects: 723 * Modifies the encoding search path. 724 * 725 *------------------------------------------------------------------------- 726 */ 727 728void 729Tcl_SetDefaultEncodingDir( 730 CONST char *path) 731{ 732 Tcl_Obj *searchPath = Tcl_GetEncodingSearchPath(); 733 Tcl_Obj *directory = Tcl_NewStringObj(path, -1); 734 735 searchPath = Tcl_DuplicateObj(searchPath); 736 Tcl_ListObjReplace(NULL, searchPath, 0, 0, 1, &directory); 737 Tcl_SetEncodingSearchPath(searchPath); 738} 739 740/* 741 *------------------------------------------------------------------------- 742 * 743 * Tcl_GetEncoding -- 744 * 745 * Given the name of a encoding, find the corresponding Tcl_Encoding 746 * token. If the encoding did not already exist, Tcl attempts to 747 * dynamically load an encoding by that name. 748 * 749 * Results: 750 * Returns a token that represents the encoding. If the name didn't refer 751 * to any known or loadable encoding, NULL is returned. If NULL was 752 * returned, an error message is left in interp's result object, unless 753 * interp was NULL. 754 * 755 * Side effects: 756 * The new encoding type is entered into a table visible to all 757 * interpreters, keyed off the encoding's name. For each call to this 758 * function, there should eventually be a call to Tcl_FreeEncoding, so 759 * that the database can be cleaned up when encodings aren't needed 760 * anymore. 761 * 762 *------------------------------------------------------------------------- 763 */ 764 765Tcl_Encoding 766Tcl_GetEncoding( 767 Tcl_Interp *interp, /* Interp for error reporting, if not NULL. */ 768 CONST char *name) /* The name of the desired encoding. */ 769{ 770 Tcl_HashEntry *hPtr; 771 Encoding *encodingPtr; 772 773 Tcl_MutexLock(&encodingMutex); 774 if (name == NULL) { 775 encodingPtr = (Encoding *) systemEncoding; 776 encodingPtr->refCount++; 777 Tcl_MutexUnlock(&encodingMutex); 778 return systemEncoding; 779 } 780 781 hPtr = Tcl_FindHashEntry(&encodingTable, name); 782 if (hPtr != NULL) { 783 encodingPtr = (Encoding *) Tcl_GetHashValue(hPtr); 784 encodingPtr->refCount++; 785 Tcl_MutexUnlock(&encodingMutex); 786 return (Tcl_Encoding) encodingPtr; 787 } 788 Tcl_MutexUnlock(&encodingMutex); 789 790 return LoadEncodingFile(interp, name); 791} 792 793/* 794 *--------------------------------------------------------------------------- 795 * 796 * Tcl_FreeEncoding -- 797 * 798 * This function is called to release an encoding allocated by 799 * Tcl_CreateEncoding() or Tcl_GetEncoding(). 800 * 801 * Results: 802 * None. 803 * 804 * Side effects: 805 * The reference count associated with the encoding is decremented and 806 * the encoding may be deleted if nothing is using it anymore. 807 * 808 *--------------------------------------------------------------------------- 809 */ 810 811void 812Tcl_FreeEncoding( 813 Tcl_Encoding encoding) 814{ 815 Tcl_MutexLock(&encodingMutex); 816 FreeEncoding(encoding); 817 Tcl_MutexUnlock(&encodingMutex); 818} 819 820/* 821 *---------------------------------------------------------------------- 822 * 823 * FreeEncoding -- 824 * 825 * This function is called to release an encoding by functions that 826 * already have the encodingMutex. 827 * 828 * Results: 829 * None. 830 * 831 * Side effects: 832 * The reference count associated with the encoding is decremented and 833 * the encoding may be deleted if nothing is using it anymore. 834 * 835 *---------------------------------------------------------------------- 836 */ 837 838static void 839FreeEncoding( 840 Tcl_Encoding encoding) 841{ 842 Encoding *encodingPtr; 843 844 encodingPtr = (Encoding *) encoding; 845 if (encodingPtr == NULL) { 846 return; 847 } 848 if (encodingPtr->refCount<=0) { 849 Tcl_Panic("FreeEncoding: refcount problem !!!"); 850 } 851 encodingPtr->refCount--; 852 if (encodingPtr->refCount == 0) { 853 if (encodingPtr->freeProc != NULL) { 854 (*encodingPtr->freeProc)(encodingPtr->clientData); 855 } 856 if (encodingPtr->hPtr != NULL) { 857 Tcl_DeleteHashEntry(encodingPtr->hPtr); 858 } 859 ckfree((char *) encodingPtr->name); 860 ckfree((char *) encodingPtr); 861 } 862} 863 864/* 865 *------------------------------------------------------------------------- 866 * 867 * Tcl_GetEncodingName -- 868 * 869 * Given an encoding, return the name that was used to constuct the 870 * encoding. 871 * 872 * Results: 873 * The name of the encoding. 874 * 875 * Side effects: 876 * None. 877 * 878 *--------------------------------------------------------------------------- 879 */ 880 881CONST char * 882Tcl_GetEncodingName( 883 Tcl_Encoding encoding) /* The encoding whose name to fetch. */ 884{ 885 if (encoding == NULL) { 886 encoding = systemEncoding; 887 } 888 889 return ((Encoding *) encoding)->name; 890} 891 892/* 893 *------------------------------------------------------------------------- 894 * 895 * Tcl_GetEncodingNames -- 896 * 897 * Get the list of all known encodings, including the ones stored as 898 * files on disk in the encoding path. 899 * 900 * Results: 901 * Modifies interp's result object to hold a list of all the available 902 * encodings. 903 * 904 * Side effects: 905 * None. 906 * 907 *------------------------------------------------------------------------- 908 */ 909 910void 911Tcl_GetEncodingNames( 912 Tcl_Interp *interp) /* Interp to hold result. */ 913{ 914 Tcl_HashTable table; 915 Tcl_HashSearch search; 916 Tcl_HashEntry *hPtr; 917 Tcl_Obj *map, *name, *result = Tcl_NewObj(); 918 Tcl_DictSearch mapSearch; 919 int dummy, done = 0; 920 921 Tcl_InitObjHashTable(&table); 922 923 /* 924 * Copy encoding names from loaded encoding table to table. 925 */ 926 927 Tcl_MutexLock(&encodingMutex); 928 for (hPtr = Tcl_FirstHashEntry(&encodingTable, &search); hPtr != NULL; 929 hPtr = Tcl_NextHashEntry(&search)) { 930 Encoding *encodingPtr = (Encoding *) Tcl_GetHashValue(hPtr); 931 Tcl_CreateHashEntry(&table, 932 (char *) Tcl_NewStringObj(encodingPtr->name, -1), &dummy); 933 } 934 Tcl_MutexUnlock(&encodingMutex); 935 936 FillEncodingFileMap(); 937 map = TclGetProcessGlobalValue(&encodingFileMap); 938 939 /* 940 * Copy encoding names from encoding file map to table. 941 */ 942 943 Tcl_DictObjFirst(NULL, map, &mapSearch, &name, NULL, &done); 944 for (; !done; Tcl_DictObjNext(&mapSearch, &name, NULL, &done)) { 945 Tcl_CreateHashEntry(&table, (char *) name, &dummy); 946 } 947 948 /* 949 * Pull all encoding names from table into the result list. 950 */ 951 952 for (hPtr = Tcl_FirstHashEntry(&table, &search); hPtr != NULL; 953 hPtr = Tcl_NextHashEntry(&search)) { 954 Tcl_ListObjAppendElement(NULL, result, 955 (Tcl_Obj *) Tcl_GetHashKey(&table, hPtr)); 956 } 957 Tcl_SetObjResult(interp, result); 958 Tcl_DeleteHashTable(&table); 959} 960 961/* 962 *------------------------------------------------------------------------ 963 * 964 * Tcl_SetSystemEncoding -- 965 * 966 * Sets the default encoding that should be used whenever the user passes 967 * a NULL value in to one of the conversion routines. If the supplied 968 * name is NULL, the system encoding is reset to the default system 969 * encoding. 970 * 971 * Results: 972 * The return value is TCL_OK if the system encoding was successfully set 973 * to the encoding specified by name, TCL_ERROR otherwise. If TCL_ERROR 974 * is returned, an error message is left in interp's result object, 975 * unless interp was NULL. 976 * 977 * Side effects: 978 * The reference count of the new system encoding is incremented. The 979 * reference count of the old system encoding is decremented and it may 980 * be freed. 981 * 982 *------------------------------------------------------------------------ 983 */ 984 985int 986Tcl_SetSystemEncoding( 987 Tcl_Interp *interp, /* Interp for error reporting, if not NULL. */ 988 CONST char *name) /* The name of the desired encoding, or NULL 989 * to reset to default encoding. */ 990{ 991 Tcl_Encoding encoding; 992 Encoding *encodingPtr; 993 994 if (name == NULL) { 995 Tcl_MutexLock(&encodingMutex); 996 encoding = defaultEncoding; 997 encodingPtr = (Encoding *) encoding; 998 encodingPtr->refCount++; 999 Tcl_MutexUnlock(&encodingMutex); 1000 } else { 1001 encoding = Tcl_GetEncoding(interp, name); 1002 if (encoding == NULL) { 1003 return TCL_ERROR; 1004 } 1005 } 1006 1007 Tcl_MutexLock(&encodingMutex); 1008 FreeEncoding(systemEncoding); 1009 systemEncoding = encoding; 1010 Tcl_MutexUnlock(&encodingMutex); 1011 1012 return TCL_OK; 1013} 1014 1015/* 1016 *--------------------------------------------------------------------------- 1017 * 1018 * Tcl_CreateEncoding -- 1019 * 1020 * This function is called to define a new encoding and the functions 1021 * that are used to convert between the specified encoding and Unicode. 1022 * 1023 * Results: 1024 * Returns a token that represents the encoding. If an encoding with the 1025 * same name already existed, the old encoding token remains valid and 1026 * continues to behave as it used to, and will eventually be garbage 1027 * collected when the last reference to it goes away. Any subsequent 1028 * calls to Tcl_GetEncoding with the specified name will retrieve the 1029 * most recent encoding token. 1030 * 1031 * Side effects: 1032 * The new encoding type is entered into a table visible to all 1033 * interpreters, keyed off the encoding's name. For each call to this 1034 * function, there should eventually be a call to Tcl_FreeEncoding, so 1035 * that the database can be cleaned up when encodings aren't needed 1036 * anymore. 1037 * 1038 *--------------------------------------------------------------------------- 1039 */ 1040 1041Tcl_Encoding 1042Tcl_CreateEncoding( 1043 const Tcl_EncodingType *typePtr) 1044 /* The encoding type. */ 1045{ 1046 Tcl_HashEntry *hPtr; 1047 int isNew; 1048 Encoding *encodingPtr; 1049 char *name; 1050 1051 Tcl_MutexLock(&encodingMutex); 1052 hPtr = Tcl_CreateHashEntry(&encodingTable, typePtr->encodingName, &isNew); 1053 if (isNew == 0) { 1054 /* 1055 * Remove old encoding from hash table, but don't delete it until last 1056 * reference goes away. 1057 */ 1058 1059 encodingPtr = (Encoding *) Tcl_GetHashValue(hPtr); 1060 encodingPtr->hPtr = NULL; 1061 } 1062 1063 name = ckalloc((unsigned) strlen(typePtr->encodingName) + 1); 1064 1065 encodingPtr = (Encoding *) ckalloc(sizeof(Encoding)); 1066 encodingPtr->name = strcpy(name, typePtr->encodingName); 1067 encodingPtr->toUtfProc = typePtr->toUtfProc; 1068 encodingPtr->fromUtfProc = typePtr->fromUtfProc; 1069 encodingPtr->freeProc = typePtr->freeProc; 1070 encodingPtr->nullSize = typePtr->nullSize; 1071 encodingPtr->clientData = typePtr->clientData; 1072 if (typePtr->nullSize == 1) { 1073 encodingPtr->lengthProc = (LengthProc *) strlen; 1074 } else { 1075 encodingPtr->lengthProc = (LengthProc *) unilen; 1076 } 1077 encodingPtr->refCount = 1; 1078 encodingPtr->hPtr = hPtr; 1079 Tcl_SetHashValue(hPtr, encodingPtr); 1080 1081 Tcl_MutexUnlock(&encodingMutex); 1082 1083 return (Tcl_Encoding) encodingPtr; 1084} 1085 1086/* 1087 *------------------------------------------------------------------------- 1088 * 1089 * Tcl_ExternalToUtfDString -- 1090 * 1091 * Convert a source buffer from the specified encoding into UTF-8. If any 1092 * of the bytes in the source buffer are invalid or cannot be represented 1093 * in the target encoding, a default fallback character will be 1094 * substituted. 1095 * 1096 * Results: 1097 * The converted bytes are stored in the DString, which is then NULL 1098 * terminated. The return value is a pointer to the value stored in the 1099 * DString. 1100 * 1101 * Side effects: 1102 * None. 1103 * 1104 *------------------------------------------------------------------------- 1105 */ 1106 1107char * 1108Tcl_ExternalToUtfDString( 1109 Tcl_Encoding encoding, /* The encoding for the source string, or NULL 1110 * for the default system encoding. */ 1111 CONST char *src, /* Source string in specified encoding. */ 1112 int srcLen, /* Source string length in bytes, or < 0 for 1113 * encoding-specific string length. */ 1114 Tcl_DString *dstPtr) /* Uninitialized or free DString in which the 1115 * converted string is stored. */ 1116{ 1117 char *dst; 1118 Tcl_EncodingState state; 1119 Encoding *encodingPtr; 1120 int flags, dstLen, result, soFar, srcRead, dstWrote, dstChars; 1121 1122 Tcl_DStringInit(dstPtr); 1123 dst = Tcl_DStringValue(dstPtr); 1124 dstLen = dstPtr->spaceAvl - 1; 1125 1126 if (encoding == NULL) { 1127 encoding = systemEncoding; 1128 } 1129 encodingPtr = (Encoding *) encoding; 1130 1131 if (src == NULL) { 1132 srcLen = 0; 1133 } else if (srcLen < 0) { 1134 srcLen = (*encodingPtr->lengthProc)(src); 1135 } 1136 1137 flags = TCL_ENCODING_START | TCL_ENCODING_END; 1138 1139 while (1) { 1140 result = (*encodingPtr->toUtfProc)(encodingPtr->clientData, src, 1141 srcLen, flags, &state, dst, dstLen, &srcRead, &dstWrote, 1142 &dstChars); 1143 soFar = dst + dstWrote - Tcl_DStringValue(dstPtr); 1144 1145 if (result != TCL_CONVERT_NOSPACE) { 1146 Tcl_DStringSetLength(dstPtr, soFar); 1147 return Tcl_DStringValue(dstPtr); 1148 } 1149 1150 flags &= ~TCL_ENCODING_START; 1151 src += srcRead; 1152 srcLen -= srcRead; 1153 if (Tcl_DStringLength(dstPtr) == 0) { 1154 Tcl_DStringSetLength(dstPtr, dstLen); 1155 } 1156 Tcl_DStringSetLength(dstPtr, 2 * Tcl_DStringLength(dstPtr) + 1); 1157 dst = Tcl_DStringValue(dstPtr) + soFar; 1158 dstLen = Tcl_DStringLength(dstPtr) - soFar - 1; 1159 } 1160} 1161 1162/* 1163 *------------------------------------------------------------------------- 1164 * 1165 * Tcl_ExternalToUtf -- 1166 * 1167 * Convert a source buffer from the specified encoding into UTF-8. 1168 * 1169 * Results: 1170 * The return value is one of TCL_OK, TCL_CONVERT_MULTIBYTE, 1171 * TCL_CONVERT_SYNTAX, TCL_CONVERT_UNKNOWN, or TCL_CONVERT_NOSPACE, as 1172 * documented in tcl.h. 1173 * 1174 * Side effects: 1175 * The converted bytes are stored in the output buffer. 1176 * 1177 *------------------------------------------------------------------------- 1178 */ 1179 1180int 1181Tcl_ExternalToUtf( 1182 Tcl_Interp *interp, /* Interp for error return, if not NULL. */ 1183 Tcl_Encoding encoding, /* The encoding for the source string, or NULL 1184 * for the default system encoding. */ 1185 CONST char *src, /* Source string in specified encoding. */ 1186 int srcLen, /* Source string length in bytes, or < 0 for 1187 * encoding-specific string length. */ 1188 int flags, /* Conversion control flags. */ 1189 Tcl_EncodingState *statePtr,/* Place for conversion routine to store state 1190 * information used during a piecewise 1191 * conversion. Contents of statePtr are 1192 * initialized and/or reset by conversion 1193 * routine under control of flags argument. */ 1194 char *dst, /* Output buffer in which converted string is 1195 * stored. */ 1196 int dstLen, /* The maximum length of output buffer in 1197 * bytes. */ 1198 int *srcReadPtr, /* Filled with the number of bytes from the 1199 * source string that were converted. This may 1200 * be less than the original source length if 1201 * there was a problem converting some source 1202 * characters. */ 1203 int *dstWrotePtr, /* Filled with the number of bytes that were 1204 * stored in the output buffer as a result of 1205 * the conversion. */ 1206 int *dstCharsPtr) /* Filled with the number of characters that 1207 * correspond to the bytes stored in the 1208 * output buffer. */ 1209{ 1210 Encoding *encodingPtr; 1211 int result, srcRead, dstWrote, dstChars; 1212 Tcl_EncodingState state; 1213 1214 if (encoding == NULL) { 1215 encoding = systemEncoding; 1216 } 1217 encodingPtr = (Encoding *) encoding; 1218 1219 if (src == NULL) { 1220 srcLen = 0; 1221 } else if (srcLen < 0) { 1222 srcLen = (*encodingPtr->lengthProc)(src); 1223 } 1224 if (statePtr == NULL) { 1225 flags |= TCL_ENCODING_START | TCL_ENCODING_END; 1226 statePtr = &state; 1227 } 1228 if (srcReadPtr == NULL) { 1229 srcReadPtr = &srcRead; 1230 } 1231 if (dstWrotePtr == NULL) { 1232 dstWrotePtr = &dstWrote; 1233 } 1234 if (dstCharsPtr == NULL) { 1235 dstCharsPtr = &dstChars; 1236 } 1237 1238 /* 1239 * If there are any null characters in the middle of the buffer, they will 1240 * converted to the UTF-8 null character (\xC080). To get the actual \0 at 1241 * the end of the destination buffer, we need to append it manually. 1242 */ 1243 1244 dstLen--; 1245 result = (*encodingPtr->toUtfProc)(encodingPtr->clientData, src, srcLen, 1246 flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr, 1247 dstCharsPtr); 1248 dst[*dstWrotePtr] = '\0'; 1249 1250 return result; 1251} 1252 1253/* 1254 *------------------------------------------------------------------------- 1255 * 1256 * Tcl_UtfToExternalDString -- 1257 * 1258 * Convert a source buffer from UTF-8 into the specified encoding. If any 1259 * of the bytes in the source buffer are invalid or cannot be represented 1260 * in the target encoding, a default fallback character will be 1261 * substituted. 1262 * 1263 * Results: 1264 * The converted bytes are stored in the DString, which is then NULL 1265 * terminated in an encoding-specific manner. The return value is a 1266 * pointer to the value stored in the DString. 1267 * 1268 * Side effects: 1269 * None. 1270 * 1271 *------------------------------------------------------------------------- 1272 */ 1273 1274char * 1275Tcl_UtfToExternalDString( 1276 Tcl_Encoding encoding, /* The encoding for the converted string, or 1277 * NULL for the default system encoding. */ 1278 CONST char *src, /* Source string in UTF-8. */ 1279 int srcLen, /* Source string length in bytes, or < 0 for 1280 * strlen(). */ 1281 Tcl_DString *dstPtr) /* Uninitialized or free DString in which the 1282 * converted string is stored. */ 1283{ 1284 char *dst; 1285 Tcl_EncodingState state; 1286 Encoding *encodingPtr; 1287 int flags, dstLen, result, soFar, srcRead, dstWrote, dstChars; 1288 1289 Tcl_DStringInit(dstPtr); 1290 dst = Tcl_DStringValue(dstPtr); 1291 dstLen = dstPtr->spaceAvl - 1; 1292 1293 if (encoding == NULL) { 1294 encoding = systemEncoding; 1295 } 1296 encodingPtr = (Encoding *) encoding; 1297 1298 if (src == NULL) { 1299 srcLen = 0; 1300 } else if (srcLen < 0) { 1301 srcLen = strlen(src); 1302 } 1303 flags = TCL_ENCODING_START | TCL_ENCODING_END; 1304 while (1) { 1305 result = (*encodingPtr->fromUtfProc)(encodingPtr->clientData, src, 1306 srcLen, flags, &state, dst, dstLen, &srcRead, &dstWrote, 1307 &dstChars); 1308 soFar = dst + dstWrote - Tcl_DStringValue(dstPtr); 1309 1310 if (result != TCL_CONVERT_NOSPACE) { 1311 if (encodingPtr->nullSize == 2) { 1312 Tcl_DStringSetLength(dstPtr, soFar + 1); 1313 } 1314 Tcl_DStringSetLength(dstPtr, soFar); 1315 return Tcl_DStringValue(dstPtr); 1316 } 1317 1318 flags &= ~TCL_ENCODING_START; 1319 src += srcRead; 1320 srcLen -= srcRead; 1321 if (Tcl_DStringLength(dstPtr) == 0) { 1322 Tcl_DStringSetLength(dstPtr, dstLen); 1323 } 1324 Tcl_DStringSetLength(dstPtr, 2 * Tcl_DStringLength(dstPtr) + 1); 1325 dst = Tcl_DStringValue(dstPtr) + soFar; 1326 dstLen = Tcl_DStringLength(dstPtr) - soFar - 1; 1327 } 1328} 1329 1330/* 1331 *------------------------------------------------------------------------- 1332 * 1333 * Tcl_UtfToExternal -- 1334 * 1335 * Convert a buffer from UTF-8 into the specified encoding. 1336 * 1337 * Results: 1338 * The return value is one of TCL_OK, TCL_CONVERT_MULTIBYTE, 1339 * TCL_CONVERT_SYNTAX, TCL_CONVERT_UNKNOWN, or TCL_CONVERT_NOSPACE, as 1340 * documented in tcl.h. 1341 * 1342 * Side effects: 1343 * The converted bytes are stored in the output buffer. 1344 * 1345 *------------------------------------------------------------------------- 1346 */ 1347 1348int 1349Tcl_UtfToExternal( 1350 Tcl_Interp *interp, /* Interp for error return, if not NULL. */ 1351 Tcl_Encoding encoding, /* The encoding for the converted string, or 1352 * NULL for the default system encoding. */ 1353 CONST char *src, /* Source string in UTF-8. */ 1354 int srcLen, /* Source string length in bytes, or < 0 for 1355 * strlen(). */ 1356 int flags, /* Conversion control flags. */ 1357 Tcl_EncodingState *statePtr,/* Place for conversion routine to store state 1358 * information used during a piecewise 1359 * conversion. Contents of statePtr are 1360 * initialized and/or reset by conversion 1361 * routine under control of flags argument. */ 1362 char *dst, /* Output buffer in which converted string 1363 * is stored. */ 1364 int dstLen, /* The maximum length of output buffer in 1365 * bytes. */ 1366 int *srcReadPtr, /* Filled with the number of bytes from the 1367 * source string that were converted. This may 1368 * be less than the original source length if 1369 * there was a problem converting some source 1370 * characters. */ 1371 int *dstWrotePtr, /* Filled with the number of bytes that were 1372 * stored in the output buffer as a result of 1373 * the conversion. */ 1374 int *dstCharsPtr) /* Filled with the number of characters that 1375 * correspond to the bytes stored in the 1376 * output buffer. */ 1377{ 1378 Encoding *encodingPtr; 1379 int result, srcRead, dstWrote, dstChars; 1380 Tcl_EncodingState state; 1381 1382 if (encoding == NULL) { 1383 encoding = systemEncoding; 1384 } 1385 encodingPtr = (Encoding *) encoding; 1386 1387 if (src == NULL) { 1388 srcLen = 0; 1389 } else if (srcLen < 0) { 1390 srcLen = strlen(src); 1391 } 1392 if (statePtr == NULL) { 1393 flags |= TCL_ENCODING_START | TCL_ENCODING_END; 1394 statePtr = &state; 1395 } 1396 if (srcReadPtr == NULL) { 1397 srcReadPtr = &srcRead; 1398 } 1399 if (dstWrotePtr == NULL) { 1400 dstWrotePtr = &dstWrote; 1401 } 1402 if (dstCharsPtr == NULL) { 1403 dstCharsPtr = &dstChars; 1404 } 1405 1406 dstLen -= encodingPtr->nullSize; 1407 result = (*encodingPtr->fromUtfProc)(encodingPtr->clientData, src, srcLen, 1408 flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr, 1409 dstCharsPtr); 1410 if (encodingPtr->nullSize == 2) { 1411 dst[*dstWrotePtr + 1] = '\0'; 1412 } 1413 dst[*dstWrotePtr] = '\0'; 1414 1415 return result; 1416} 1417 1418/* 1419 *--------------------------------------------------------------------------- 1420 * 1421 * Tcl_FindExecutable -- 1422 * 1423 * This function computes the absolute path name of the current 1424 * application, given its argv[0] value. 1425 * 1426 * Results: 1427 * None. 1428 * 1429 * Side effects: 1430 * The absolute pathname for the application is computed and stored to be 1431 * returned later be [info nameofexecutable]. 1432 * 1433 *--------------------------------------------------------------------------- 1434 */ 1435 1436void 1437Tcl_FindExecutable( 1438 CONST char *argv0) /* The value of the application's argv[0] 1439 * (native). */ 1440{ 1441 TclInitSubsystems(); 1442 TclpSetInitialEncodings(); 1443 TclpFindExecutable(argv0); 1444} 1445 1446/* 1447 *--------------------------------------------------------------------------- 1448 * 1449 * OpenEncodingFileChannel -- 1450 * 1451 * Open the file believed to hold data for the encoding, "name". 1452 * 1453 * Results: 1454 * Returns the readable Tcl_Channel from opening the file, or NULL if the 1455 * file could not be successfully opened. If NULL was returned, an error 1456 * message is left in interp's result object, unless interp was NULL. 1457 * 1458 * Side effects: 1459 * Channel may be opened. Information about the filesystem may be cached 1460 * to speed later calls. 1461 * 1462 *--------------------------------------------------------------------------- 1463 */ 1464 1465static Tcl_Channel 1466OpenEncodingFileChannel( 1467 Tcl_Interp *interp, /* Interp for error reporting, if not NULL. */ 1468 CONST char *name) /* The name of the encoding file on disk and 1469 * also the name for new encoding. */ 1470{ 1471 Tcl_Obj *nameObj = Tcl_NewStringObj(name, -1); 1472 Tcl_Obj *fileNameObj = Tcl_DuplicateObj(nameObj); 1473 Tcl_Obj *searchPath = Tcl_DuplicateObj(Tcl_GetEncodingSearchPath()); 1474 Tcl_Obj *map = TclGetProcessGlobalValue(&encodingFileMap); 1475 Tcl_Obj **dir, *path, *directory = NULL; 1476 Tcl_Channel chan = NULL; 1477 int i, numDirs; 1478 1479 Tcl_ListObjGetElements(NULL, searchPath, &numDirs, &dir); 1480 Tcl_IncrRefCount(nameObj); 1481 Tcl_AppendToObj(fileNameObj, ".enc", -1); 1482 Tcl_IncrRefCount(fileNameObj); 1483 Tcl_DictObjGet(NULL, map, nameObj, &directory); 1484 1485 /* 1486 * Check that any cached directory is still on the encoding search path. 1487 */ 1488 1489 if (NULL != directory) { 1490 int verified = 0; 1491 1492 for (i=0; i<numDirs && !verified; i++) { 1493 if (dir[i] == directory) { 1494 verified = 1; 1495 } 1496 } 1497 if (!verified) { 1498 CONST char *dirString = Tcl_GetString(directory); 1499 for (i=0; i<numDirs && !verified; i++) { 1500 if (strcmp(dirString, Tcl_GetString(dir[i])) == 0) { 1501 verified = 1; 1502 } 1503 } 1504 } 1505 if (!verified) { 1506 /* 1507 * Directory no longer on the search path. Remove from cache. 1508 */ 1509 1510 map = Tcl_DuplicateObj(map); 1511 Tcl_DictObjRemove(NULL, map, nameObj); 1512 TclSetProcessGlobalValue(&encodingFileMap, map, NULL); 1513 directory = NULL; 1514 } 1515 } 1516 1517 if (NULL != directory) { 1518 /* 1519 * Got a directory from the cache. Try to use it first. 1520 */ 1521 1522 Tcl_IncrRefCount(directory); 1523 path = Tcl_FSJoinToPath(directory, 1, &fileNameObj); 1524 Tcl_IncrRefCount(path); 1525 Tcl_DecrRefCount(directory); 1526 chan = Tcl_FSOpenFileChannel(NULL, path, "r", 0); 1527 Tcl_DecrRefCount(path); 1528 } 1529 1530 /* 1531 * Scan the search path until we find it. 1532 */ 1533 1534 for (i=0; i<numDirs && (chan == NULL); i++) { 1535 path = Tcl_FSJoinToPath(dir[i], 1, &fileNameObj); 1536 Tcl_IncrRefCount(path); 1537 chan = Tcl_FSOpenFileChannel(NULL, path, "r", 0); 1538 Tcl_DecrRefCount(path); 1539 if (chan != NULL) { 1540 /* 1541 * Save directory in the cache. 1542 */ 1543 1544 map = Tcl_DuplicateObj(TclGetProcessGlobalValue(&encodingFileMap)); 1545 Tcl_DictObjPut(NULL, map, nameObj, dir[i]); 1546 TclSetProcessGlobalValue(&encodingFileMap, map, NULL); 1547 } 1548 } 1549 1550 if ((NULL == chan) && (interp != NULL)) { 1551 Tcl_AppendResult(interp, "unknown encoding \"", name, "\"", NULL); 1552 Tcl_SetErrorCode(interp, "TCL", "LOOKUP", "ENCODING", name, NULL); 1553 } 1554 Tcl_DecrRefCount(fileNameObj); 1555 Tcl_DecrRefCount(nameObj); 1556 Tcl_DecrRefCount(searchPath); 1557 1558 return chan; 1559} 1560 1561/* 1562 *--------------------------------------------------------------------------- 1563 * 1564 * LoadEncodingFile -- 1565 * 1566 * Read a file that describes an encoding and create a new Encoding from 1567 * the data. 1568 * 1569 * Results: 1570 * The return value is the newly loaded Encoding, or NULL if the file 1571 * didn't exist of was in the incorrect format. If NULL was returned, an 1572 * error message is left in interp's result object, unless interp was 1573 * NULL. 1574 * 1575 * Side effects: 1576 * File read from disk. 1577 * 1578 *--------------------------------------------------------------------------- 1579 */ 1580 1581static Tcl_Encoding 1582LoadEncodingFile( 1583 Tcl_Interp *interp, /* Interp for error reporting, if not NULL. */ 1584 CONST char *name) /* The name of the encoding file on disk and 1585 * also the name for new encoding. */ 1586{ 1587 Tcl_Channel chan = NULL; 1588 Tcl_Encoding encoding = NULL; 1589 int ch; 1590 1591 chan = OpenEncodingFileChannel(interp, name); 1592 if (chan == NULL) { 1593 return NULL; 1594 } 1595 1596 Tcl_SetChannelOption(NULL, chan, "-encoding", "utf-8"); 1597 1598 while (1) { 1599 Tcl_DString ds; 1600 1601 Tcl_DStringInit(&ds); 1602 Tcl_Gets(chan, &ds); 1603 ch = Tcl_DStringValue(&ds)[0]; 1604 Tcl_DStringFree(&ds); 1605 if (ch != '#') { 1606 break; 1607 } 1608 } 1609 1610 switch (ch) { 1611 case 'S': 1612 encoding = LoadTableEncoding(name, ENCODING_SINGLEBYTE, chan); 1613 break; 1614 case 'D': 1615 encoding = LoadTableEncoding(name, ENCODING_DOUBLEBYTE, chan); 1616 break; 1617 case 'M': 1618 encoding = LoadTableEncoding(name, ENCODING_MULTIBYTE, chan); 1619 break; 1620 case 'E': 1621 encoding = LoadEscapeEncoding(name, chan); 1622 break; 1623 } 1624 if ((encoding == NULL) && (interp != NULL)) { 1625 Tcl_AppendResult(interp, "invalid encoding file \"", name, "\"", NULL); 1626 } 1627 Tcl_Close(NULL, chan); 1628 1629 return encoding; 1630} 1631 1632/* 1633 *------------------------------------------------------------------------- 1634 * 1635 * LoadTableEncoding -- 1636 * 1637 * Helper function for LoadEncodingTable(). Loads a table to that 1638 * converts between Unicode and some other encoding and creates an 1639 * encoding (using a TableEncoding structure) from that information. 1640 * 1641 * File contains binary data, but begins with a marker to indicate 1642 * byte-ordering, so that same binary file can be read on either endian 1643 * platforms. 1644 * 1645 * Results: 1646 * The return value is the new encoding, or NULL if the encoding could 1647 * not be created (because the file contained invalid data). 1648 * 1649 * Side effects: 1650 * None. 1651 * 1652 *------------------------------------------------------------------------- 1653 */ 1654 1655static Tcl_Encoding 1656LoadTableEncoding( 1657 CONST char *name, /* Name for new encoding. */ 1658 int type, /* Type of encoding (ENCODING_?????). */ 1659 Tcl_Channel chan) /* File containing new encoding. */ 1660{ 1661 Tcl_DString lineString; 1662 Tcl_Obj *objPtr; 1663 char *line; 1664 int i, hi, lo, numPages, symbol, fallback; 1665 unsigned char used[256]; 1666 unsigned int size; 1667 TableEncodingData *dataPtr; 1668 unsigned short *pageMemPtr; 1669 Tcl_EncodingType encType; 1670 1671 /* 1672 * Speed over memory. Use a full 256 character table to decode hex 1673 * sequences in the encoding files. 1674 */ 1675 1676 static char staticHex[] = { 1677 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 ... 15 */ 1678 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 ... 31 */ 1679 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 32 ... 47 */ 1680 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0, /* 48 ... 63 */ 1681 0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 64 ... 79 */ 1682 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80 ... 95 */ 1683 0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 96 ... 111 */ 1684 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0, /* 112 ... 127 */ 1685 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 128 ... 143 */ 1686 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 144 ... 159 */ 1687 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 160 ... 175 */ 1688 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 176 ... 191 */ 1689 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 192 ... 207 */ 1690 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 208 ... 223 */ 1691 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 224 ... 239 */ 1692 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 240 ... 255 */ 1693 }; 1694 1695 Tcl_DStringInit(&lineString); 1696 Tcl_Gets(chan, &lineString); 1697 line = Tcl_DStringValue(&lineString); 1698 1699 fallback = (int) strtol(line, &line, 16); 1700 symbol = (int) strtol(line, &line, 10); 1701 numPages = (int) strtol(line, &line, 10); 1702 Tcl_DStringFree(&lineString); 1703 1704 if (numPages < 0) { 1705 numPages = 0; 1706 } else if (numPages > 256) { 1707 numPages = 256; 1708 } 1709 1710 memset(used, 0, sizeof(used)); 1711 1712#undef PAGESIZE 1713#define PAGESIZE (256 * sizeof(unsigned short)) 1714 1715 dataPtr = (TableEncodingData *) ckalloc(sizeof(TableEncodingData)); 1716 memset(dataPtr, 0, sizeof(TableEncodingData)); 1717 1718 dataPtr->fallback = fallback; 1719 1720 /* 1721 * Read the table that maps characters to Unicode. Performs a single 1722 * malloc to get the memory for the array and all the pages needed by the 1723 * array. 1724 */ 1725 1726 size = 256 * sizeof(unsigned short *) + numPages * PAGESIZE; 1727 dataPtr->toUnicode = (unsigned short **) ckalloc(size); 1728 memset(dataPtr->toUnicode, 0, size); 1729 pageMemPtr = (unsigned short *) (dataPtr->toUnicode + 256); 1730 1731 TclNewObj(objPtr); 1732 Tcl_IncrRefCount(objPtr); 1733 for (i = 0; i < numPages; i++) { 1734 int ch; 1735 char *p; 1736 1737 Tcl_ReadChars(chan, objPtr, 3 + 16 * (16 * 4 + 1), 0); 1738 p = Tcl_GetString(objPtr); 1739 hi = (staticHex[UCHAR(p[0])] << 4) + staticHex[UCHAR(p[1])]; 1740 dataPtr->toUnicode[hi] = pageMemPtr; 1741 p += 2; 1742 for (lo = 0; lo < 256; lo++) { 1743 if ((lo & 0x0f) == 0) { 1744 p++; 1745 } 1746 ch = (staticHex[UCHAR(p[0])] << 12) + (staticHex[UCHAR(p[1])] << 8) 1747 + (staticHex[UCHAR(p[2])] << 4) + staticHex[UCHAR(p[3])]; 1748 if (ch != 0) { 1749 used[ch >> 8] = 1; 1750 } 1751 *pageMemPtr = (unsigned short) ch; 1752 pageMemPtr++; 1753 p += 4; 1754 } 1755 } 1756 TclDecrRefCount(objPtr); 1757 1758 if (type == ENCODING_DOUBLEBYTE) { 1759 memset(dataPtr->prefixBytes, 1, sizeof(dataPtr->prefixBytes)); 1760 } else { 1761 for (hi = 1; hi < 256; hi++) { 1762 if (dataPtr->toUnicode[hi] != NULL) { 1763 dataPtr->prefixBytes[hi] = 1; 1764 } 1765 } 1766 } 1767 1768 /* 1769 * Invert toUnicode array to produce the fromUnicode array. Performs a 1770 * single malloc to get the memory for the array and all the pages needed 1771 * by the array. While reading in the toUnicode array, we remembered what 1772 * pages that would be needed for the fromUnicode array. 1773 */ 1774 1775 if (symbol) { 1776 used[0] = 1; 1777 } 1778 numPages = 0; 1779 for (hi = 0; hi < 256; hi++) { 1780 if (used[hi]) { 1781 numPages++; 1782 } 1783 } 1784 size = 256 * sizeof(unsigned short *) + numPages * PAGESIZE; 1785 dataPtr->fromUnicode = (unsigned short **) ckalloc(size); 1786 memset(dataPtr->fromUnicode, 0, size); 1787 pageMemPtr = (unsigned short *) (dataPtr->fromUnicode + 256); 1788 1789 for (hi = 0; hi < 256; hi++) { 1790 if (dataPtr->toUnicode[hi] == NULL) { 1791 dataPtr->toUnicode[hi] = emptyPage; 1792 } else { 1793 for (lo = 0; lo < 256; lo++) { 1794 int ch; 1795 1796 ch = dataPtr->toUnicode[hi][lo]; 1797 if (ch != 0) { 1798 unsigned short *page; 1799 1800 page = dataPtr->fromUnicode[ch >> 8]; 1801 if (page == NULL) { 1802 page = pageMemPtr; 1803 pageMemPtr += 256; 1804 dataPtr->fromUnicode[ch >> 8] = page; 1805 } 1806 page[ch & 0xff] = (unsigned short) ((hi << 8) + lo); 1807 } 1808 } 1809 } 1810 } 1811 if (type == ENCODING_MULTIBYTE) { 1812 /* 1813 * If multibyte encodings don't have a backslash character, define 1814 * one. Otherwise, on Windows, native file names won't work because 1815 * the backslash in the file name will map to the unknown character 1816 * (question mark) when converting from UTF-8 to external encoding. 1817 */ 1818 1819 if (dataPtr->fromUnicode[0] != NULL) { 1820 if (dataPtr->fromUnicode[0]['\\'] == '\0') { 1821 dataPtr->fromUnicode[0]['\\'] = '\\'; 1822 } 1823 } 1824 } 1825 if (symbol) { 1826 unsigned short *page; 1827 1828 /* 1829 * Make a special symbol encoding that not only maps the symbol 1830 * characters from their Unicode code points down into page 0, but 1831 * also ensure that the characters on page 0 map to themselves. This 1832 * is so that a symbol font can be used to display a simple string 1833 * like "abcd" and have alpha, beta, chi, delta show up, rather than 1834 * have "unknown" chars show up because strictly speaking the symbol 1835 * font doesn't have glyphs for those low ascii chars. 1836 */ 1837 1838 page = dataPtr->fromUnicode[0]; 1839 if (page == NULL) { 1840 page = pageMemPtr; 1841 dataPtr->fromUnicode[0] = page; 1842 } 1843 for (lo = 0; lo < 256; lo++) { 1844 if (dataPtr->toUnicode[0][lo] != 0) { 1845 page[lo] = (unsigned short) lo; 1846 } 1847 } 1848 } 1849 for (hi = 0; hi < 256; hi++) { 1850 if (dataPtr->fromUnicode[hi] == NULL) { 1851 dataPtr->fromUnicode[hi] = emptyPage; 1852 } 1853 } 1854 1855 /* 1856 * For trailing 'R'everse encoding, see [Patch 689341] 1857 */ 1858 1859 Tcl_DStringInit(&lineString); 1860 do { 1861 int len; 1862 1863 /* 1864 * Skip leading empty lines. 1865 */ 1866 1867 while ((len = Tcl_Gets(chan, &lineString)) == 0) { 1868 /* empty body */ 1869 } 1870 1871 if (len < 0) { 1872 break; 1873 } 1874 line = Tcl_DStringValue(&lineString); 1875 if (line[0] != 'R') { 1876 break; 1877 } 1878 for (Tcl_DStringSetLength(&lineString, 0); 1879 (len = Tcl_Gets(chan, &lineString)) >= 0; 1880 Tcl_DStringSetLength(&lineString, 0)) { 1881 unsigned char* p; 1882 int to, from; 1883 1884 if (len < 5) { 1885 continue; 1886 } 1887 p = (unsigned char*) Tcl_DStringValue(&lineString); 1888 to = (staticHex[p[0]] << 12) + (staticHex[p[1]] << 8) 1889 + (staticHex[p[2]] << 4) + staticHex[p[3]]; 1890 if (to == 0) { 1891 continue; 1892 } 1893 for (p += 5, len -= 5; len >= 0 && *p; p += 5, len -= 5) { 1894 from = (staticHex[p[0]] << 12) + (staticHex[p[1]] << 8) 1895 + (staticHex[p[2]] << 4) + staticHex[p[3]]; 1896 if (from == 0) { 1897 continue; 1898 } 1899 dataPtr->fromUnicode[from >> 8][from & 0xff] = to; 1900 } 1901 } 1902 } while (0); 1903 Tcl_DStringFree(&lineString); 1904 1905 encType.encodingName = name; 1906 encType.toUtfProc = TableToUtfProc; 1907 encType.fromUtfProc = TableFromUtfProc; 1908 encType.freeProc = TableFreeProc; 1909 encType.nullSize = (type == ENCODING_DOUBLEBYTE) ? 2 : 1; 1910 encType.clientData = (ClientData) dataPtr; 1911 1912 return Tcl_CreateEncoding(&encType); 1913} 1914 1915/* 1916 *------------------------------------------------------------------------- 1917 * 1918 * LoadEscapeEncoding -- 1919 * 1920 * Helper function for LoadEncodingTable(). Loads a state machine that 1921 * converts between Unicode and some other encoding. 1922 * 1923 * File contains text data that describes the escape sequences that are 1924 * used to choose an encoding and the associated names for the 1925 * sub-encodings. 1926 * 1927 * Results: 1928 * The return value is the new encoding, or NULL if the encoding could 1929 * not be created (because the file contained invalid data). 1930 * 1931 * Side effects: 1932 * None. 1933 * 1934 *------------------------------------------------------------------------- 1935 */ 1936 1937static Tcl_Encoding 1938LoadEscapeEncoding( 1939 CONST char *name, /* Name for new encoding. */ 1940 Tcl_Channel chan) /* File containing new encoding. */ 1941{ 1942 int i; 1943 unsigned int size; 1944 Tcl_DString escapeData; 1945 char init[16], final[16]; 1946 EscapeEncodingData *dataPtr; 1947 Tcl_EncodingType type; 1948 1949 init[0] = '\0'; 1950 final[0] = '\0'; 1951 Tcl_DStringInit(&escapeData); 1952 1953 while (1) { 1954 int argc; 1955 CONST char **argv; 1956 char *line; 1957 Tcl_DString lineString; 1958 1959 Tcl_DStringInit(&lineString); 1960 if (Tcl_Gets(chan, &lineString) < 0) { 1961 break; 1962 } 1963 line = Tcl_DStringValue(&lineString); 1964 if (Tcl_SplitList(NULL, line, &argc, &argv) != TCL_OK) { 1965 continue; 1966 } 1967 if (argc >= 2) { 1968 if (strcmp(argv[0], "name") == 0) { 1969 /* do nothing */ 1970 } else if (strcmp(argv[0], "init") == 0) { 1971 strncpy(init, argv[1], sizeof(init)); 1972 init[sizeof(init) - 1] = '\0'; 1973 } else if (strcmp(argv[0], "final") == 0) { 1974 strncpy(final, argv[1], sizeof(final)); 1975 final[sizeof(final) - 1] = '\0'; 1976 } else { 1977 EscapeSubTable est; 1978 Encoding *e; 1979 1980 strncpy(est.sequence, argv[1], sizeof(est.sequence)); 1981 est.sequence[sizeof(est.sequence) - 1] = '\0'; 1982 est.sequenceLen = strlen(est.sequence); 1983 1984 strncpy(est.name, argv[0], sizeof(est.name)); 1985 est.name[sizeof(est.name) - 1] = '\0'; 1986 1987 /* 1988 * To avoid infinite recursion in [encoding system iso2022-*] 1989 */ 1990 1991 e = (Encoding *) Tcl_GetEncoding(NULL, est.name); 1992 if (e && e->toUtfProc != TableToUtfProc && 1993 e->toUtfProc != Iso88591ToUtfProc) { 1994 Tcl_FreeEncoding((Tcl_Encoding) e); 1995 e = NULL; 1996 } 1997 est.encodingPtr = e; 1998 Tcl_DStringAppend(&escapeData, (char *) &est, sizeof(est)); 1999 } 2000 } 2001 ckfree((char *) argv); 2002 Tcl_DStringFree(&lineString); 2003 } 2004 2005 size = sizeof(EscapeEncodingData) - sizeof(EscapeSubTable) 2006 + Tcl_DStringLength(&escapeData); 2007 dataPtr = (EscapeEncodingData *) ckalloc(size); 2008 dataPtr->initLen = strlen(init); 2009 strcpy(dataPtr->init, init); 2010 dataPtr->finalLen = strlen(final); 2011 strcpy(dataPtr->final, final); 2012 dataPtr->numSubTables = 2013 Tcl_DStringLength(&escapeData) / sizeof(EscapeSubTable); 2014 memcpy(dataPtr->subTables, Tcl_DStringValue(&escapeData), 2015 (size_t) Tcl_DStringLength(&escapeData)); 2016 Tcl_DStringFree(&escapeData); 2017 2018 memset(dataPtr->prefixBytes, 0, sizeof(dataPtr->prefixBytes)); 2019 for (i = 0; i < dataPtr->numSubTables; i++) { 2020 dataPtr->prefixBytes[UCHAR(dataPtr->subTables[i].sequence[0])] = 1; 2021 } 2022 if (dataPtr->init[0] != '\0') { 2023 dataPtr->prefixBytes[UCHAR(dataPtr->init[0])] = 1; 2024 } 2025 if (dataPtr->final[0] != '\0') { 2026 dataPtr->prefixBytes[UCHAR(dataPtr->final[0])] = 1; 2027 } 2028 2029 type.encodingName = name; 2030 type.toUtfProc = EscapeToUtfProc; 2031 type.fromUtfProc = EscapeFromUtfProc; 2032 type.freeProc = EscapeFreeProc; 2033 type.nullSize = 1; 2034 type.clientData = (ClientData) dataPtr; 2035 2036 return Tcl_CreateEncoding(&type); 2037} 2038 2039/* 2040 *------------------------------------------------------------------------- 2041 * 2042 * BinaryProc -- 2043 * 2044 * The default conversion when no other conversion is specified. No 2045 * translation is done; source bytes are copied directly to destination 2046 * bytes. 2047 * 2048 * Results: 2049 * Returns TCL_OK if conversion was successful. 2050 * 2051 * Side effects: 2052 * None. 2053 * 2054 *------------------------------------------------------------------------- 2055 */ 2056 2057static int 2058BinaryProc( 2059 ClientData clientData, /* Not used. */ 2060 CONST char *src, /* Source string (unknown encoding). */ 2061 int srcLen, /* Source string length in bytes. */ 2062 int flags, /* Conversion control flags. */ 2063 Tcl_EncodingState *statePtr,/* Place for conversion routine to store state 2064 * information used during a piecewise 2065 * conversion. Contents of statePtr are 2066 * initialized and/or reset by conversion 2067 * routine under control of flags argument. */ 2068 char *dst, /* Output buffer in which converted string is 2069 * stored. */ 2070 int dstLen, /* The maximum length of output buffer in 2071 * bytes. */ 2072 int *srcReadPtr, /* Filled with the number of bytes from the 2073 * source string that were converted. */ 2074 int *dstWrotePtr, /* Filled with the number of bytes that were 2075 * stored in the output buffer as a result of 2076 * the conversion. */ 2077 int *dstCharsPtr) /* Filled with the number of characters that 2078 * correspond to the bytes stored in the 2079 * output buffer. */ 2080{ 2081 int result; 2082 2083 result = TCL_OK; 2084 dstLen -= TCL_UTF_MAX - 1; 2085 if (dstLen < 0) { 2086 dstLen = 0; 2087 } 2088 if (srcLen > dstLen) { 2089 srcLen = dstLen; 2090 result = TCL_CONVERT_NOSPACE; 2091 } 2092 2093 *srcReadPtr = srcLen; 2094 *dstWrotePtr = srcLen; 2095 *dstCharsPtr = srcLen; 2096 memcpy(dst, src, (size_t) srcLen); 2097 return result; 2098} 2099 2100/* 2101 *------------------------------------------------------------------------- 2102 * 2103 * UtfExtToUtfIntProc -- 2104 * 2105 * Convert from UTF-8 to UTF-8. While converting null-bytes from the 2106 * Tcl's internal representation (0xc0, 0x80) to the official 2107 * representation (0x00). See UtfToUtfProc for details. 2108 * 2109 * Results: 2110 * Returns TCL_OK if conversion was successful. 2111 * 2112 * Side effects: 2113 * None. 2114 * 2115 *------------------------------------------------------------------------- 2116 */ 2117 2118static int 2119UtfIntToUtfExtProc( 2120 ClientData clientData, /* Not used. */ 2121 CONST char *src, /* Source string in UTF-8. */ 2122 int srcLen, /* Source string length in bytes. */ 2123 int flags, /* Conversion control flags. */ 2124 Tcl_EncodingState *statePtr,/* Place for conversion routine to store state 2125 * information used during a piecewise 2126 * conversion. Contents of statePtr are 2127 * initialized and/or reset by conversion 2128 * routine under control of flags argument. */ 2129 char *dst, /* Output buffer in which converted string 2130 * is stored. */ 2131 int dstLen, /* The maximum length of output buffer in 2132 * bytes. */ 2133 int *srcReadPtr, /* Filled with the number of bytes from the 2134 * source string that were converted. This may 2135 * be less than the original source length if 2136 * there was a problem converting some source 2137 * characters. */ 2138 int *dstWrotePtr, /* Filled with the number of bytes that were 2139 * stored in the output buffer as a result of 2140 * the conversion. */ 2141 int *dstCharsPtr) /* Filled with the number of characters that 2142 * correspond to the bytes stored in the 2143 * output buffer. */ 2144{ 2145 return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen, 2146 srcReadPtr, dstWrotePtr, dstCharsPtr, 1); 2147} 2148 2149/* 2150 *------------------------------------------------------------------------- 2151 * 2152 * UtfExtToUtfIntProc -- 2153 * 2154 * Convert from UTF-8 to UTF-8 while converting null-bytes from the 2155 * official representation (0x00) to Tcl's internal representation (0xc0, 2156 * 0x80). See UtfToUtfProc for details. 2157 * 2158 * Results: 2159 * Returns TCL_OK if conversion was successful. 2160 * 2161 * Side effects: 2162 * None. 2163 * 2164 *------------------------------------------------------------------------- 2165 */ 2166static int 2167UtfExtToUtfIntProc( 2168 ClientData clientData, /* Not used. */ 2169 CONST char *src, /* Source string in UTF-8. */ 2170 int srcLen, /* Source string length in bytes. */ 2171 int flags, /* Conversion control flags. */ 2172 Tcl_EncodingState *statePtr,/* Place for conversion routine to store state 2173 * information used during a piecewise 2174 * conversion. Contents of statePtr are 2175 * initialized and/or reset by conversion 2176 * routine under control of flags argument. */ 2177 char *dst, /* Output buffer in which converted string is 2178 * stored. */ 2179 int dstLen, /* The maximum length of output buffer in 2180 * bytes. */ 2181 int *srcReadPtr, /* Filled with the number of bytes from the 2182 * source string that were converted. This may 2183 * be less than the original source length if 2184 * there was a problem converting some source 2185 * characters. */ 2186 int *dstWrotePtr, /* Filled with the number of bytes that were 2187 * stored in the output buffer as a result of 2188 * the conversion. */ 2189 int *dstCharsPtr) /* Filled with the number of characters that 2190 * correspond to the bytes stored in the 2191 * output buffer. */ 2192{ 2193 return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen, 2194 srcReadPtr, dstWrotePtr, dstCharsPtr, 0); 2195} 2196 2197/* 2198 *------------------------------------------------------------------------- 2199 * 2200 * UtfToUtfProc -- 2201 * 2202 * Convert from UTF-8 to UTF-8. Note that the UTF-8 to UTF-8 translation 2203 * is not a no-op, because it will turn a stream of improperly formed 2204 * UTF-8 into a properly formed stream. 2205 * 2206 * Results: 2207 * Returns TCL_OK if conversion was successful. 2208 * 2209 * Side effects: 2210 * None. 2211 * 2212 *------------------------------------------------------------------------- 2213 */ 2214 2215static int 2216UtfToUtfProc( 2217 ClientData clientData, /* Not used. */ 2218 CONST char *src, /* Source string in UTF-8. */ 2219 int srcLen, /* Source string length in bytes. */ 2220 int flags, /* Conversion control flags. */ 2221 Tcl_EncodingState *statePtr,/* Place for conversion routine to store state 2222 * information used during a piecewise 2223 * conversion. Contents of statePtr are 2224 * initialized and/or reset by conversion 2225 * routine under control of flags argument. */ 2226 char *dst, /* Output buffer in which converted string is 2227 * stored. */ 2228 int dstLen, /* The maximum length of output buffer in 2229 * bytes. */ 2230 int *srcReadPtr, /* Filled with the number of bytes from the 2231 * source string that were converted. This may 2232 * be less than the original source length if 2233 * there was a problem converting some source 2234 * characters. */ 2235 int *dstWrotePtr, /* Filled with the number of bytes that were 2236 * stored in the output buffer as a result of 2237 * the conversion. */ 2238 int *dstCharsPtr, /* Filled with the number of characters that 2239 * correspond to the bytes stored in the 2240 * output buffer. */ 2241 int pureNullMode) /* Convert embedded nulls from internal 2242 * representation to real null-bytes or vice 2243 * versa. */ 2244{ 2245 CONST char *srcStart, *srcEnd, *srcClose; 2246 char *dstStart, *dstEnd; 2247 int result, numChars; 2248 Tcl_UniChar ch; 2249 2250 result = TCL_OK; 2251 2252 srcStart = src; 2253 srcEnd = src + srcLen; 2254 srcClose = srcEnd; 2255 if ((flags & TCL_ENCODING_END) == 0) { 2256 srcClose -= TCL_UTF_MAX; 2257 } 2258 2259 dstStart = dst; 2260 dstEnd = dst + dstLen - TCL_UTF_MAX; 2261 2262 for (numChars = 0; src < srcEnd; numChars++) { 2263 if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { 2264 /* 2265 * If there is more string to follow, this will ensure that the 2266 * last UTF-8 character in the source buffer hasn't been cut off. 2267 */ 2268 2269 result = TCL_CONVERT_MULTIBYTE; 2270 break; 2271 } 2272 if (dst > dstEnd) { 2273 result = TCL_CONVERT_NOSPACE; 2274 break; 2275 } 2276 if (UCHAR(*src) < 0x80 && !(UCHAR(*src) == 0 && pureNullMode == 0)) { 2277 /* 2278 * Copy 7bit chatacters, but skip null-bytes when we are in input 2279 * mode, so that they get converted to 0xc080. 2280 */ 2281 2282 *dst++ = *src++; 2283 } else if (pureNullMode == 1 && UCHAR(*src) == 0xc0 && 2284 UCHAR(*(src+1)) == 0x80) { 2285 /* 2286 * Convert 0xc080 to real nulls when we are in output mode. 2287 */ 2288 2289 *dst++ = 0; 2290 src += 2; 2291 } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) { 2292 /* 2293 * Always check before using Tcl_UtfToUniChar. Not doing can so 2294 * cause it run beyond the endof the buffer! If we happen such an 2295 * incomplete char its byts are made to represent themselves. 2296 */ 2297 2298 ch = (unsigned char) *src; 2299 src += 1; 2300 dst += Tcl_UniCharToUtf(ch, dst); 2301 } else { 2302 src += Tcl_UtfToUniChar(src, &ch); 2303 dst += Tcl_UniCharToUtf(ch, dst); 2304 } 2305 } 2306 2307 *srcReadPtr = src - srcStart; 2308 *dstWrotePtr = dst - dstStart; 2309 *dstCharsPtr = numChars; 2310 return result; 2311} 2312 2313/* 2314 *------------------------------------------------------------------------- 2315 * 2316 * UnicodeToUtfProc -- 2317 * 2318 * Convert from Unicode to UTF-8. 2319 * 2320 * Results: 2321 * Returns TCL_OK if conversion was successful. 2322 * 2323 * Side effects: 2324 * None. 2325 * 2326 *------------------------------------------------------------------------- 2327 */ 2328 2329static int 2330UnicodeToUtfProc( 2331 ClientData clientData, /* Not used. */ 2332 CONST char *src, /* Source string in Unicode. */ 2333 int srcLen, /* Source string length in bytes. */ 2334 int flags, /* Conversion control flags. */ 2335 Tcl_EncodingState *statePtr,/* Place for conversion routine to store state 2336 * information used during a piecewise 2337 * conversion. Contents of statePtr are 2338 * initialized and/or reset by conversion 2339 * routine under control of flags argument. */ 2340 char *dst, /* Output buffer in which converted string is 2341 * stored. */ 2342 int dstLen, /* The maximum length of output buffer in 2343 * bytes. */ 2344 int *srcReadPtr, /* Filled with the number of bytes from the 2345 * source string that were converted. This may 2346 * be less than the original source length if 2347 * there was a problem converting some source 2348 * characters. */ 2349 int *dstWrotePtr, /* Filled with the number of bytes that were 2350 * stored in the output buffer as a result of 2351 * the conversion. */ 2352 int *dstCharsPtr) /* Filled with the number of characters that 2353 * correspond to the bytes stored in the 2354 * output buffer. */ 2355{ 2356 CONST char *srcStart, *srcEnd; 2357 char *dstEnd, *dstStart; 2358 int result, numChars; 2359 Tcl_UniChar ch; 2360 2361 result = TCL_OK; 2362 if ((srcLen % sizeof(Tcl_UniChar)) != 0) { 2363 result = TCL_CONVERT_MULTIBYTE; 2364 srcLen /= sizeof(Tcl_UniChar); 2365 srcLen *= sizeof(Tcl_UniChar); 2366 } 2367 2368 srcStart = src; 2369 srcEnd = src + srcLen; 2370 2371 dstStart = dst; 2372 dstEnd = dst + dstLen - TCL_UTF_MAX; 2373 2374 for (numChars = 0; src < srcEnd; numChars++) { 2375 if (dst > dstEnd) { 2376 result = TCL_CONVERT_NOSPACE; 2377 break; 2378 } 2379 /* 2380 * Special case for 1-byte utf chars for speed. Make sure we 2381 * work with Tcl_UniChar-size data. 2382 */ 2383 ch = *(Tcl_UniChar *)src; 2384 if (ch && ch < 0x80) { 2385 *dst++ = (ch & 0xFF); 2386 } else { 2387 dst += Tcl_UniCharToUtf(ch, dst); 2388 } 2389 src += sizeof(Tcl_UniChar); 2390 } 2391 2392 *srcReadPtr = src - srcStart; 2393 *dstWrotePtr = dst - dstStart; 2394 *dstCharsPtr = numChars; 2395 return result; 2396} 2397 2398/* 2399 *------------------------------------------------------------------------- 2400 * 2401 * UtfToUnicodeProc -- 2402 * 2403 * Convert from UTF-8 to Unicode. 2404 * 2405 * Results: 2406 * Returns TCL_OK if conversion was successful. 2407 * 2408 * Side effects: 2409 * None. 2410 * 2411 *------------------------------------------------------------------------- 2412 */ 2413 2414static int 2415UtfToUnicodeProc( 2416 ClientData clientData, /* TableEncodingData that specifies 2417 * encoding. */ 2418 CONST char *src, /* Source string in UTF-8. */ 2419 int srcLen, /* Source string length in bytes. */ 2420 int flags, /* Conversion control flags. */ 2421 Tcl_EncodingState *statePtr,/* Place for conversion routine to store state 2422 * information used during a piecewise 2423 * conversion. Contents of statePtr are 2424 * initialized and/or reset by conversion 2425 * routine under control of flags argument. */ 2426 char *dst, /* Output buffer in which converted string is 2427 * stored. */ 2428 int dstLen, /* The maximum length of output buffer in 2429 * bytes. */ 2430 int *srcReadPtr, /* Filled with the number of bytes from the 2431 * source string that were converted. This may 2432 * be less than the original source length if 2433 * there was a problem converting some source 2434 * characters. */ 2435 int *dstWrotePtr, /* Filled with the number of bytes that were 2436 * stored in the output buffer as a result of 2437 * the conversion. */ 2438 int *dstCharsPtr) /* Filled with the number of characters that 2439 * correspond to the bytes stored in the 2440 * output buffer. */ 2441{ 2442 CONST char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd; 2443 int result, numChars; 2444 Tcl_UniChar ch; 2445 2446 srcStart = src; 2447 srcEnd = src + srcLen; 2448 srcClose = srcEnd; 2449 if ((flags & TCL_ENCODING_END) == 0) { 2450 srcClose -= TCL_UTF_MAX; 2451 } 2452 2453 dstStart = dst; 2454 dstEnd = dst + dstLen - sizeof(Tcl_UniChar); 2455 2456 result = TCL_OK; 2457 for (numChars = 0; src < srcEnd; numChars++) { 2458 if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { 2459 /* 2460 * If there is more string to follow, this will ensure that the 2461 * last UTF-8 character in the source buffer hasn't been cut off. 2462 */ 2463 2464 result = TCL_CONVERT_MULTIBYTE; 2465 break; 2466 } 2467 if (dst > dstEnd) { 2468 result = TCL_CONVERT_NOSPACE; 2469 break; 2470 } 2471 src += TclUtfToUniChar(src, &ch); 2472 /* 2473 * Need to handle this in a way that won't cause misalignment 2474 * by casting dst to a Tcl_UniChar. [Bug 1122671] 2475 * XXX: This hard-codes the assumed size of Tcl_UniChar as 2. 2476 */ 2477#ifdef WORDS_BIGENDIAN 2478 *dst++ = (ch >> 8); 2479 *dst++ = (ch & 0xFF); 2480#else 2481 *dst++ = (ch & 0xFF); 2482 *dst++ = (ch >> 8); 2483#endif 2484 } 2485 *srcReadPtr = src - srcStart; 2486 *dstWrotePtr = dst - dstStart; 2487 *dstCharsPtr = numChars; 2488 return result; 2489} 2490 2491/* 2492 *------------------------------------------------------------------------- 2493 * 2494 * TableToUtfProc -- 2495 * 2496 * Convert from the encoding specified by the TableEncodingData into 2497 * UTF-8. 2498 * 2499 * Results: 2500 * Returns TCL_OK if conversion was successful. 2501 * 2502 * Side effects: 2503 * None. 2504 * 2505 *------------------------------------------------------------------------- 2506 */ 2507 2508static int 2509TableToUtfProc( 2510 ClientData clientData, /* TableEncodingData that specifies 2511 * encoding. */ 2512 CONST char *src, /* Source string in specified encoding. */ 2513 int srcLen, /* Source string length in bytes. */ 2514 int flags, /* Conversion control flags. */ 2515 Tcl_EncodingState *statePtr,/* Place for conversion routine to store state 2516 * information used during a piecewise 2517 * conversion. Contents of statePtr are 2518 * initialized and/or reset by conversion 2519 * routine under control of flags argument. */ 2520 char *dst, /* Output buffer in which converted string is 2521 * stored. */ 2522 int dstLen, /* The maximum length of output buffer in 2523 * bytes. */ 2524 int *srcReadPtr, /* Filled with the number of bytes from the 2525 * source string that were converted. This may 2526 * be less than the original source length if 2527 * there was a problem converting some source 2528 * characters. */ 2529 int *dstWrotePtr, /* Filled with the number of bytes that were 2530 * stored in the output buffer as a result of 2531 * the conversion. */ 2532 int *dstCharsPtr) /* Filled with the number of characters that 2533 * correspond to the bytes stored in the 2534 * output buffer. */ 2535{ 2536 CONST char *srcStart, *srcEnd; 2537 char *dstEnd, *dstStart, *prefixBytes; 2538 int result, byte, numChars; 2539 Tcl_UniChar ch; 2540 unsigned short **toUnicode; 2541 unsigned short *pageZero; 2542 TableEncodingData *dataPtr; 2543 2544 srcStart = src; 2545 srcEnd = src + srcLen; 2546 2547 dstStart = dst; 2548 dstEnd = dst + dstLen - TCL_UTF_MAX; 2549 2550 dataPtr = (TableEncodingData *) clientData; 2551 toUnicode = dataPtr->toUnicode; 2552 prefixBytes = dataPtr->prefixBytes; 2553 pageZero = toUnicode[0]; 2554 2555 result = TCL_OK; 2556 for (numChars = 0; src < srcEnd; numChars++) { 2557 if (dst > dstEnd) { 2558 result = TCL_CONVERT_NOSPACE; 2559 break; 2560 } 2561 byte = *((unsigned char *) src); 2562 if (prefixBytes[byte]) { 2563 src++; 2564 if (src >= srcEnd) { 2565 src--; 2566 result = TCL_CONVERT_MULTIBYTE; 2567 break; 2568 } 2569 ch = toUnicode[byte][*((unsigned char *) src)]; 2570 } else { 2571 ch = pageZero[byte]; 2572 } 2573 if ((ch == 0) && (byte != 0)) { 2574 if (flags & TCL_ENCODING_STOPONERROR) { 2575 result = TCL_CONVERT_SYNTAX; 2576 break; 2577 } 2578 if (prefixBytes[byte]) { 2579 src--; 2580 } 2581 ch = (Tcl_UniChar) byte; 2582 } 2583 /* 2584 * Special case for 1-byte utf chars for speed. 2585 */ 2586 if (ch && ch < 0x80) { 2587 *dst++ = (char) ch; 2588 } else { 2589 dst += Tcl_UniCharToUtf(ch, dst); 2590 } 2591 src++; 2592 } 2593 2594 *srcReadPtr = src - srcStart; 2595 *dstWrotePtr = dst - dstStart; 2596 *dstCharsPtr = numChars; 2597 return result; 2598} 2599 2600/* 2601 *------------------------------------------------------------------------- 2602 * 2603 * TableFromUtfProc -- 2604 * 2605 * Convert from UTF-8 into the encoding specified by the 2606 * TableEncodingData. 2607 * 2608 * Results: 2609 * Returns TCL_OK if conversion was successful. 2610 * 2611 * Side effects: 2612 * None. 2613 * 2614 *------------------------------------------------------------------------- 2615 */ 2616 2617static int 2618TableFromUtfProc( 2619 ClientData clientData, /* TableEncodingData that specifies 2620 * encoding. */ 2621 CONST char *src, /* Source string in UTF-8. */ 2622 int srcLen, /* Source string length in bytes. */ 2623 int flags, /* Conversion control flags. */ 2624 Tcl_EncodingState *statePtr,/* Place for conversion routine to store state 2625 * information used during a piecewise 2626 * conversion. Contents of statePtr are 2627 * initialized and/or reset by conversion 2628 * routine under control of flags argument. */ 2629 char *dst, /* Output buffer in which converted string is 2630 * stored. */ 2631 int dstLen, /* The maximum length of output buffer in 2632 * bytes. */ 2633 int *srcReadPtr, /* Filled with the number of bytes from the 2634 * source string that were converted. This may 2635 * be less than the original source length if 2636 * there was a problem converting some source 2637 * characters. */ 2638 int *dstWrotePtr, /* Filled with the number of bytes that were 2639 * stored in the output buffer as a result of 2640 * the conversion. */ 2641 int *dstCharsPtr) /* Filled with the number of characters that 2642 * correspond to the bytes stored in the 2643 * output buffer. */ 2644{ 2645 CONST char *srcStart, *srcEnd, *srcClose; 2646 char *dstStart, *dstEnd, *prefixBytes; 2647 Tcl_UniChar ch; 2648 int result, len, word, numChars; 2649 TableEncodingData *dataPtr; 2650 unsigned short **fromUnicode; 2651 2652 result = TCL_OK; 2653 2654 dataPtr = (TableEncodingData *) clientData; 2655 prefixBytes = dataPtr->prefixBytes; 2656 fromUnicode = dataPtr->fromUnicode; 2657 2658 srcStart = src; 2659 srcEnd = src + srcLen; 2660 srcClose = srcEnd; 2661 if ((flags & TCL_ENCODING_END) == 0) { 2662 srcClose -= TCL_UTF_MAX; 2663 } 2664 2665 dstStart = dst; 2666 dstEnd = dst + dstLen - 1; 2667 2668 for (numChars = 0; src < srcEnd; numChars++) { 2669 if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { 2670 /* 2671 * If there is more string to follow, this will ensure that the 2672 * last UTF-8 character in the source buffer hasn't been cut off. 2673 */ 2674 2675 result = TCL_CONVERT_MULTIBYTE; 2676 break; 2677 } 2678 len = TclUtfToUniChar(src, &ch); 2679 2680#if TCL_UTF_MAX > 3 2681 /* 2682 * This prevents a crash condition. More evaluation is required for 2683 * full support of int Tcl_UniChar. [Bug 1004065] 2684 */ 2685 2686 if (ch & 0xffff0000) { 2687 word = 0; 2688 } else 2689#endif 2690 word = fromUnicode[(ch >> 8)][ch & 0xff]; 2691 2692 if ((word == 0) && (ch != 0)) { 2693 if (flags & TCL_ENCODING_STOPONERROR) { 2694 result = TCL_CONVERT_UNKNOWN; 2695 break; 2696 } 2697 word = dataPtr->fallback; 2698 } 2699 if (prefixBytes[(word >> 8)] != 0) { 2700 if (dst + 1 > dstEnd) { 2701 result = TCL_CONVERT_NOSPACE; 2702 break; 2703 } 2704 dst[0] = (char) (word >> 8); 2705 dst[1] = (char) word; 2706 dst += 2; 2707 } else { 2708 if (dst > dstEnd) { 2709 result = TCL_CONVERT_NOSPACE; 2710 break; 2711 } 2712 dst[0] = (char) word; 2713 dst++; 2714 } 2715 src += len; 2716 } 2717 2718 *srcReadPtr = src - srcStart; 2719 *dstWrotePtr = dst - dstStart; 2720 *dstCharsPtr = numChars; 2721 return result; 2722} 2723 2724/* 2725 *------------------------------------------------------------------------- 2726 * 2727 * Iso88591ToUtfProc -- 2728 * 2729 * Convert from the "iso8859-1" encoding into UTF-8. 2730 * 2731 * Results: 2732 * Returns TCL_OK if conversion was successful. 2733 * 2734 * Side effects: 2735 * None. 2736 * 2737 *------------------------------------------------------------------------- 2738 */ 2739 2740static int 2741Iso88591ToUtfProc( 2742 ClientData clientData, /* Ignored. */ 2743 CONST char *src, /* Source string in specified encoding. */ 2744 int srcLen, /* Source string length in bytes. */ 2745 int flags, /* Conversion control flags. */ 2746 Tcl_EncodingState *statePtr,/* Place for conversion routine to store state 2747 * information used during a piecewise 2748 * conversion. Contents of statePtr are 2749 * initialized and/or reset by conversion 2750 * routine under control of flags argument. */ 2751 char *dst, /* Output buffer in which converted string is 2752 * stored. */ 2753 int dstLen, /* The maximum length of output buffer in 2754 * bytes. */ 2755 int *srcReadPtr, /* Filled with the number of bytes from the 2756 * source string that were converted. This may 2757 * be less than the original source length if 2758 * there was a problem converting some source 2759 * characters. */ 2760 int *dstWrotePtr, /* Filled with the number of bytes that were 2761 * stored in the output buffer as a result of 2762 * the conversion. */ 2763 int *dstCharsPtr) /* Filled with the number of characters that 2764 * correspond to the bytes stored in the 2765 * output buffer. */ 2766{ 2767 CONST char *srcStart, *srcEnd; 2768 char *dstEnd, *dstStart; 2769 int result, numChars; 2770 2771 srcStart = src; 2772 srcEnd = src + srcLen; 2773 2774 dstStart = dst; 2775 dstEnd = dst + dstLen - TCL_UTF_MAX; 2776 2777 result = TCL_OK; 2778 for (numChars = 0; src < srcEnd; numChars++) { 2779 Tcl_UniChar ch; 2780 2781 if (dst > dstEnd) { 2782 result = TCL_CONVERT_NOSPACE; 2783 break; 2784 } 2785 ch = (Tcl_UniChar) *((unsigned char *) src); 2786 /* 2787 * Special case for 1-byte utf chars for speed. 2788 */ 2789 if (ch && ch < 0x80) { 2790 *dst++ = (char) ch; 2791 } else { 2792 dst += Tcl_UniCharToUtf(ch, dst); 2793 } 2794 src++; 2795 } 2796 2797 *srcReadPtr = src - srcStart; 2798 *dstWrotePtr = dst - dstStart; 2799 *dstCharsPtr = numChars; 2800 return result; 2801} 2802 2803/* 2804 *------------------------------------------------------------------------- 2805 * 2806 * Iso88591FromUtfProc -- 2807 * 2808 * Convert from UTF-8 into the encoding "iso8859-1". 2809 * 2810 * Results: 2811 * Returns TCL_OK if conversion was successful. 2812 * 2813 * Side effects: 2814 * None. 2815 * 2816 *------------------------------------------------------------------------- 2817 */ 2818 2819static int 2820Iso88591FromUtfProc( 2821 ClientData clientData, /* Ignored. */ 2822 CONST char *src, /* Source string in UTF-8. */ 2823 int srcLen, /* Source string length in bytes. */ 2824 int flags, /* Conversion control flags. */ 2825 Tcl_EncodingState *statePtr,/* Place for conversion routine to store state 2826 * information used during a piecewise 2827 * conversion. Contents of statePtr are 2828 * initialized and/or reset by conversion 2829 * routine under control of flags argument. */ 2830 char *dst, /* Output buffer in which converted string is 2831 * stored. */ 2832 int dstLen, /* The maximum length of output buffer in 2833 * bytes. */ 2834 int *srcReadPtr, /* Filled with the number of bytes from the 2835 * source string that were converted. This may 2836 * be less than the original source length if 2837 * there was a problem converting some source 2838 * characters. */ 2839 int *dstWrotePtr, /* Filled with the number of bytes that were 2840 * stored in the output buffer as a result of 2841 * the conversion. */ 2842 int *dstCharsPtr) /* Filled with the number of characters that 2843 * correspond to the bytes stored in the 2844 * output buffer. */ 2845{ 2846 CONST char *srcStart, *srcEnd, *srcClose; 2847 char *dstStart, *dstEnd; 2848 int result, numChars; 2849 2850 result = TCL_OK; 2851 2852 srcStart = src; 2853 srcEnd = src + srcLen; 2854 srcClose = srcEnd; 2855 if ((flags & TCL_ENCODING_END) == 0) { 2856 srcClose -= TCL_UTF_MAX; 2857 } 2858 2859 dstStart = dst; 2860 dstEnd = dst + dstLen - 1; 2861 2862 for (numChars = 0; src < srcEnd; numChars++) { 2863 Tcl_UniChar ch; 2864 int len; 2865 2866 if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { 2867 /* 2868 * If there is more string to follow, this will ensure that the 2869 * last UTF-8 character in the source buffer hasn't been cut off. 2870 */ 2871 2872 result = TCL_CONVERT_MULTIBYTE; 2873 break; 2874 } 2875 len = TclUtfToUniChar(src, &ch); 2876 2877 /* 2878 * Check for illegal characters. 2879 */ 2880 2881 if (ch > 0xff) { 2882 if (flags & TCL_ENCODING_STOPONERROR) { 2883 result = TCL_CONVERT_UNKNOWN; 2884 break; 2885 } 2886 2887 /* 2888 * Plunge on, using '?' as a fallback character. 2889 */ 2890 2891 ch = (Tcl_UniChar) '?'; 2892 } 2893 2894 if (dst > dstEnd) { 2895 result = TCL_CONVERT_NOSPACE; 2896 break; 2897 } 2898 *(dst++) = (char) ch; 2899 src += len; 2900 } 2901 2902 *srcReadPtr = src - srcStart; 2903 *dstWrotePtr = dst - dstStart; 2904 *dstCharsPtr = numChars; 2905 return result; 2906} 2907 2908/* 2909 *--------------------------------------------------------------------------- 2910 * 2911 * TableFreeProc -- 2912 * 2913 * This function is invoked when an encoding is deleted. It deletes the 2914 * memory used by the TableEncodingData. 2915 * 2916 * Results: 2917 * None. 2918 * 2919 * Side effects: 2920 * Memory freed. 2921 * 2922 *--------------------------------------------------------------------------- 2923 */ 2924 2925static void 2926TableFreeProc( 2927 ClientData clientData) /* TableEncodingData that specifies 2928 * encoding. */ 2929{ 2930 TableEncodingData *dataPtr; 2931 2932 /* 2933 * Make sure we aren't freeing twice on shutdown. [Bug 219314] 2934 */ 2935 2936 dataPtr = (TableEncodingData *) clientData; 2937 ckfree((char *) dataPtr->toUnicode); 2938 ckfree((char *) dataPtr->fromUnicode); 2939 ckfree((char *) dataPtr); 2940} 2941 2942/* 2943 *------------------------------------------------------------------------- 2944 * 2945 * EscapeToUtfProc -- 2946 * 2947 * Convert from the encoding specified by the EscapeEncodingData into 2948 * UTF-8. 2949 * 2950 * Results: 2951 * Returns TCL_OK if conversion was successful. 2952 * 2953 * Side effects: 2954 * None. 2955 * 2956 *------------------------------------------------------------------------- 2957 */ 2958 2959static int 2960EscapeToUtfProc( 2961 ClientData clientData, /* EscapeEncodingData that specifies 2962 * encoding. */ 2963 CONST char *src, /* Source string in specified encoding. */ 2964 int srcLen, /* Source string length in bytes. */ 2965 int flags, /* Conversion control flags. */ 2966 Tcl_EncodingState *statePtr,/* Place for conversion routine to store state 2967 * information used during a piecewise 2968 * conversion. Contents of statePtr are 2969 * initialized and/or reset by conversion 2970 * routine under control of flags argument. */ 2971 char *dst, /* Output buffer in which converted string is 2972 * stored. */ 2973 int dstLen, /* The maximum length of output buffer in 2974 * bytes. */ 2975 int *srcReadPtr, /* Filled with the number of bytes from the 2976 * source string that were converted. This may 2977 * be less than the original source length if 2978 * there was a problem converting some source 2979 * characters. */ 2980 int *dstWrotePtr, /* Filled with the number of bytes that were 2981 * stored in the output buffer as a result of 2982 * the conversion. */ 2983 int *dstCharsPtr) /* Filled with the number of characters that 2984 * correspond to the bytes stored in the 2985 * output buffer. */ 2986{ 2987 EscapeEncodingData *dataPtr; 2988 char *prefixBytes, *tablePrefixBytes; 2989 unsigned short **tableToUnicode; 2990 Encoding *encodingPtr; 2991 int state, result, numChars; 2992 CONST char *srcStart, *srcEnd; 2993 char *dstStart, *dstEnd; 2994 2995 result = TCL_OK; 2996 2997 tablePrefixBytes = NULL; /* lint. */ 2998 tableToUnicode = NULL; /* lint. */ 2999 3000 dataPtr = (EscapeEncodingData *) clientData; 3001 prefixBytes = dataPtr->prefixBytes; 3002 encodingPtr = NULL; 3003 3004 srcStart = src; 3005 srcEnd = src + srcLen; 3006 3007 dstStart = dst; 3008 dstEnd = dst + dstLen - TCL_UTF_MAX; 3009 3010 state = PTR2INT(*statePtr); 3011 if (flags & TCL_ENCODING_START) { 3012 state = 0; 3013 } 3014 3015 for (numChars = 0; src < srcEnd; ) { 3016 int byte, hi, lo, ch; 3017 3018 if (dst > dstEnd) { 3019 result = TCL_CONVERT_NOSPACE; 3020 break; 3021 } 3022 byte = *((unsigned char *) src); 3023 if (prefixBytes[byte]) { 3024 unsigned int left, len, longest; 3025 int checked, i; 3026 EscapeSubTable *subTablePtr; 3027 3028 /* 3029 * Saw the beginning of an escape sequence. 3030 */ 3031 3032 left = srcEnd - src; 3033 len = dataPtr->initLen; 3034 longest = len; 3035 checked = 0; 3036 3037 if (len <= left) { 3038 checked++; 3039 if ((len > 0) && (memcmp(src, dataPtr->init, len) == 0)) { 3040 /* 3041 * If we see initialization string, skip it, even if we're 3042 * not at the beginning of the buffer. 3043 */ 3044 3045 src += len; 3046 continue; 3047 } 3048 } 3049 3050 len = dataPtr->finalLen; 3051 if (len > longest) { 3052 longest = len; 3053 } 3054 3055 if (len <= left) { 3056 checked++; 3057 if ((len > 0) && (memcmp(src, dataPtr->final, len) == 0)) { 3058 /* 3059 * If we see finalization string, skip it, even if we're 3060 * not at the end of the buffer. 3061 */ 3062 3063 src += len; 3064 continue; 3065 } 3066 } 3067 3068 subTablePtr = dataPtr->subTables; 3069 for (i = 0; i < dataPtr->numSubTables; i++) { 3070 len = subTablePtr->sequenceLen; 3071 if (len > longest) { 3072 longest = len; 3073 } 3074 if (len <= left) { 3075 checked++; 3076 if ((len > 0) && 3077 (memcmp(src, subTablePtr->sequence, len) == 0)) { 3078 state = i; 3079 encodingPtr = NULL; 3080 subTablePtr = NULL; 3081 src += len; 3082 break; 3083 } 3084 } 3085 subTablePtr++; 3086 } 3087 3088 if (subTablePtr == NULL) { 3089 /* 3090 * A match was found, the escape sequence was consumed, and 3091 * the state was updated. 3092 */ 3093 3094 continue; 3095 } 3096 3097 /* 3098 * We have a split-up or unrecognized escape sequence. If we 3099 * checked all the sequences, then it's a syntax error, otherwise 3100 * we need more bytes to determine a match. 3101 */ 3102 3103 if ((checked == dataPtr->numSubTables + 2) 3104 || (flags & TCL_ENCODING_END)) { 3105 if ((flags & TCL_ENCODING_STOPONERROR) == 0) { 3106 /* 3107 * Skip the unknown escape sequence. 3108 */ 3109 3110 src += longest; 3111 continue; 3112 } 3113 result = TCL_CONVERT_SYNTAX; 3114 } else { 3115 result = TCL_CONVERT_MULTIBYTE; 3116 } 3117 break; 3118 } 3119 3120 if (encodingPtr == NULL) { 3121 TableEncodingData *tableDataPtr; 3122 3123 encodingPtr = GetTableEncoding(dataPtr, state); 3124 tableDataPtr = (TableEncodingData *) encodingPtr->clientData; 3125 tablePrefixBytes = tableDataPtr->prefixBytes; 3126 tableToUnicode = tableDataPtr->toUnicode; 3127 } 3128 3129 if (tablePrefixBytes[byte]) { 3130 src++; 3131 if (src >= srcEnd) { 3132 src--; 3133 result = TCL_CONVERT_MULTIBYTE; 3134 break; 3135 } 3136 hi = byte; 3137 lo = *((unsigned char *) src); 3138 } else { 3139 hi = 0; 3140 lo = byte; 3141 } 3142 3143 ch = tableToUnicode[hi][lo]; 3144 dst += Tcl_UniCharToUtf(ch, dst); 3145 src++; 3146 numChars++; 3147 } 3148 3149 *statePtr = (Tcl_EncodingState) INT2PTR(state); 3150 *srcReadPtr = src - srcStart; 3151 *dstWrotePtr = dst - dstStart; 3152 *dstCharsPtr = numChars; 3153 return result; 3154} 3155 3156/* 3157 *------------------------------------------------------------------------- 3158 * 3159 * EscapeFromUtfProc -- 3160 * 3161 * Convert from UTF-8 into the encoding specified by the 3162 * EscapeEncodingData. 3163 * 3164 * Results: 3165 * Returns TCL_OK if conversion was successful. 3166 * 3167 * Side effects: 3168 * None. 3169 * 3170 *------------------------------------------------------------------------- 3171 */ 3172 3173static int 3174EscapeFromUtfProc( 3175 ClientData clientData, /* EscapeEncodingData that specifies 3176 * encoding. */ 3177 CONST char *src, /* Source string in UTF-8. */ 3178 int srcLen, /* Source string length in bytes. */ 3179 int flags, /* Conversion control flags. */ 3180 Tcl_EncodingState *statePtr,/* Place for conversion routine to store state 3181 * information used during a piecewise 3182 * conversion. Contents of statePtr are 3183 * initialized and/or reset by conversion 3184 * routine under control of flags argument. */ 3185 char *dst, /* Output buffer in which converted string is 3186 * stored. */ 3187 int dstLen, /* The maximum length of output buffer in 3188 * bytes. */ 3189 int *srcReadPtr, /* Filled with the number of bytes from the 3190 * source string that were converted. This may 3191 * be less than the original source length if 3192 * there was a problem converting some source 3193 * characters. */ 3194 int *dstWrotePtr, /* Filled with the number of bytes that were 3195 * stored in the output buffer as a result of 3196 * the conversion. */ 3197 int *dstCharsPtr) /* Filled with the number of characters that 3198 * correspond to the bytes stored in the 3199 * output buffer. */ 3200{ 3201 EscapeEncodingData *dataPtr; 3202 Encoding *encodingPtr; 3203 CONST char *srcStart, *srcEnd, *srcClose; 3204 char *dstStart, *dstEnd; 3205 int state, result, numChars; 3206 TableEncodingData *tableDataPtr; 3207 char *tablePrefixBytes; 3208 unsigned short **tableFromUnicode; 3209 3210 result = TCL_OK; 3211 3212 dataPtr = (EscapeEncodingData *) clientData; 3213 3214 srcStart = src; 3215 srcEnd = src + srcLen; 3216 srcClose = srcEnd; 3217 if ((flags & TCL_ENCODING_END) == 0) { 3218 srcClose -= TCL_UTF_MAX; 3219 } 3220 3221 dstStart = dst; 3222 dstEnd = dst + dstLen - 1; 3223 3224 /* 3225 * RFC1468 states that the text starts in ASCII, and switches to Japanese 3226 * characters, and that the text must end in ASCII. [Patch 474358] 3227 */ 3228 3229 if (flags & TCL_ENCODING_START) { 3230 state = 0; 3231 if ((dst + dataPtr->initLen) > dstEnd) { 3232 *srcReadPtr = 0; 3233 *dstWrotePtr = 0; 3234 return TCL_CONVERT_NOSPACE; 3235 } 3236 memcpy(dst, dataPtr->init, (size_t)dataPtr->initLen); 3237 dst += dataPtr->initLen; 3238 } else { 3239 state = PTR2INT(*statePtr); 3240 } 3241 3242 encodingPtr = GetTableEncoding(dataPtr, state); 3243 tableDataPtr = (TableEncodingData *) encodingPtr->clientData; 3244 tablePrefixBytes = tableDataPtr->prefixBytes; 3245 tableFromUnicode = tableDataPtr->fromUnicode; 3246 3247 for (numChars = 0; src < srcEnd; numChars++) { 3248 unsigned int len; 3249 int word; 3250 Tcl_UniChar ch; 3251 3252 if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { 3253 /* 3254 * If there is more string to follow, this will ensure that the 3255 * last UTF-8 character in the source buffer hasn't been cut off. 3256 */ 3257 3258 result = TCL_CONVERT_MULTIBYTE; 3259 break; 3260 } 3261 len = TclUtfToUniChar(src, &ch); 3262 word = tableFromUnicode[(ch >> 8)][ch & 0xff]; 3263 3264 if ((word == 0) && (ch != 0)) { 3265 int oldState; 3266 EscapeSubTable *subTablePtr; 3267 3268 oldState = state; 3269 for (state = 0; state < dataPtr->numSubTables; state++) { 3270 encodingPtr = GetTableEncoding(dataPtr, state); 3271 tableDataPtr = (TableEncodingData *) encodingPtr->clientData; 3272 word = tableDataPtr->fromUnicode[(ch >> 8)][ch & 0xff]; 3273 if (word != 0) { 3274 break; 3275 } 3276 } 3277 3278 if (word == 0) { 3279 state = oldState; 3280 if (flags & TCL_ENCODING_STOPONERROR) { 3281 result = TCL_CONVERT_UNKNOWN; 3282 break; 3283 } 3284 encodingPtr = GetTableEncoding(dataPtr, state); 3285 tableDataPtr = (TableEncodingData *) encodingPtr->clientData; 3286 word = tableDataPtr->fallback; 3287 } 3288 3289 tablePrefixBytes = tableDataPtr->prefixBytes; 3290 tableFromUnicode = tableDataPtr->fromUnicode; 3291 3292 /* 3293 * The state variable has the value of oldState when word is 0. 3294 * In this case, the escape sequense should not be copied to dst 3295 * because the current character set is not changed. 3296 */ 3297 3298 if (state != oldState) { 3299 subTablePtr = &dataPtr->subTables[state]; 3300 if ((dst + subTablePtr->sequenceLen) > dstEnd) { 3301 /* 3302 * If there is no space to write the escape sequence, the 3303 * state variable must be changed to the value of oldState 3304 * variable because this escape sequence must be written 3305 * in the next conversion. 3306 */ 3307 3308 state = oldState; 3309 result = TCL_CONVERT_NOSPACE; 3310 break; 3311 } 3312 memcpy(dst, subTablePtr->sequence, 3313 (size_t) subTablePtr->sequenceLen); 3314 dst += subTablePtr->sequenceLen; 3315 } 3316 } 3317 3318 if (tablePrefixBytes[(word >> 8)] != 0) { 3319 if (dst + 1 > dstEnd) { 3320 result = TCL_CONVERT_NOSPACE; 3321 break; 3322 } 3323 dst[0] = (char) (word >> 8); 3324 dst[1] = (char) word; 3325 dst += 2; 3326 } else { 3327 if (dst > dstEnd) { 3328 result = TCL_CONVERT_NOSPACE; 3329 break; 3330 } 3331 dst[0] = (char) word; 3332 dst++; 3333 } 3334 src += len; 3335 } 3336 3337 if ((result == TCL_OK) && (flags & TCL_ENCODING_END)) { 3338 unsigned int len = dataPtr->subTables[0].sequenceLen; 3339 /* 3340 * Certain encodings like iso2022-jp need to write 3341 * an escape sequence after all characters have 3342 * been converted. This logic checks that enough 3343 * room is available in the buffer for the escape bytes. 3344 * The TCL_ENCODING_END flag is cleared after a final 3345 * escape sequence has been added to the buffer so 3346 * that another call to this method does not attempt 3347 * to append escape bytes a second time. 3348 */ 3349 if ((dst + dataPtr->finalLen + (state?len:0)) > dstEnd) { 3350 result = TCL_CONVERT_NOSPACE; 3351 } else { 3352 if (state) { 3353 memcpy(dst, dataPtr->subTables[0].sequence, (size_t) len); 3354 dst += len; 3355 } 3356 memcpy(dst, dataPtr->final, (size_t) dataPtr->finalLen); 3357 dst += dataPtr->finalLen; 3358 state &= ~TCL_ENCODING_END; 3359 } 3360 } 3361 3362 *statePtr = (Tcl_EncodingState) INT2PTR(state); 3363 *srcReadPtr = src - srcStart; 3364 *dstWrotePtr = dst - dstStart; 3365 *dstCharsPtr = numChars; 3366 return result; 3367} 3368 3369/* 3370 *--------------------------------------------------------------------------- 3371 * 3372 * EscapeFreeProc -- 3373 * 3374 * This function is invoked when an EscapeEncodingData encoding is 3375 * deleted. It deletes the memory used by the encoding. 3376 * 3377 * Results: 3378 * None. 3379 * 3380 * Side effects: 3381 * Memory freed. 3382 * 3383 *--------------------------------------------------------------------------- 3384 */ 3385 3386static void 3387EscapeFreeProc( 3388 ClientData clientData) /* EscapeEncodingData that specifies 3389 * encoding. */ 3390{ 3391 EscapeEncodingData *dataPtr; 3392 EscapeSubTable *subTablePtr; 3393 int i; 3394 3395 dataPtr = (EscapeEncodingData *) clientData; 3396 if (dataPtr == NULL) { 3397 return; 3398 } 3399 /* 3400 * The subTables should be freed recursively in normal operation but not 3401 * during TclFinalizeEncodingSubsystem because they are also present as a 3402 * weak reference in the toplevel encodingTable (ie they don't have a +1 3403 * refcount for this), and unpredictable nuking order could remove them 3404 * from under the following loop's feet [Bug 2891556]. 3405 * 3406 * The encodingsInitialized flag, being reset on entry to TFES, can serve 3407 * as a "not in finalization" test. 3408 */ 3409 if (encodingsInitialized) 3410 { 3411 subTablePtr = dataPtr->subTables; 3412 for (i = 0; i < dataPtr->numSubTables; i++) { 3413 FreeEncoding((Tcl_Encoding) subTablePtr->encodingPtr); 3414 subTablePtr++; 3415 } 3416 } 3417 ckfree((char *) dataPtr); 3418} 3419 3420/* 3421 *--------------------------------------------------------------------------- 3422 * 3423 * GetTableEncoding -- 3424 * 3425 * Helper function for the EscapeEncodingData conversions. Gets the 3426 * encoding (of type TextEncodingData) that represents the specified 3427 * state. 3428 * 3429 * Results: 3430 * The return value is the encoding. 3431 * 3432 * Side effects: 3433 * If the encoding that represents the specified state has not already 3434 * been used by this EscapeEncoding, it will be loaded and cached in the 3435 * dataPtr. 3436 * 3437 *--------------------------------------------------------------------------- 3438 */ 3439 3440static Encoding * 3441GetTableEncoding( 3442 EscapeEncodingData *dataPtr,/* Contains names of encodings. */ 3443 int state) /* Index in dataPtr of desired Encoding. */ 3444{ 3445 EscapeSubTable *subTablePtr; 3446 Encoding *encodingPtr; 3447 3448 subTablePtr = &dataPtr->subTables[state]; 3449 encodingPtr = subTablePtr->encodingPtr; 3450 3451 if (encodingPtr == NULL) { 3452 encodingPtr = (Encoding *) Tcl_GetEncoding(NULL, subTablePtr->name); 3453 if ((encodingPtr == NULL) 3454 || (encodingPtr->toUtfProc != TableToUtfProc 3455 && encodingPtr->toUtfProc != Iso88591ToUtfProc)) { 3456 Tcl_Panic("EscapeToUtfProc: invalid sub table"); 3457 } 3458 subTablePtr->encodingPtr = encodingPtr; 3459 } 3460 3461 return encodingPtr; 3462} 3463 3464/* 3465 *--------------------------------------------------------------------------- 3466 * 3467 * unilen -- 3468 * 3469 * A helper function for the Tcl_ExternalToUtf functions. This function 3470 * is similar to strlen for double-byte characters: it returns the number 3471 * of bytes in a 0x0000 terminated string. 3472 * 3473 * Results: 3474 * As above. 3475 * 3476 * Side effects: 3477 * None. 3478 * 3479 *--------------------------------------------------------------------------- 3480 */ 3481 3482static size_t 3483unilen( 3484 CONST char *src) 3485{ 3486 unsigned short *p; 3487 3488 p = (unsigned short *) src; 3489 while (*p != 0x0000) { 3490 p++; 3491 } 3492 return (char *) p - src; 3493} 3494 3495/* 3496 *------------------------------------------------------------------------- 3497 * 3498 * InitializeEncodingSearchPath -- 3499 * 3500 * This is the fallback routine that sets the default value of the 3501 * encoding search path if the application has not set one via a call to 3502 * Tcl_SetEncodingSearchPath() by the first time the search path is needed 3503 * to load encoding data. 3504 * 3505 * The default encoding search path is produced by taking each directory 3506 * in the library path, appending a subdirectory named "encoding", and if 3507 * the resulting directory exists, adding it to the encoding search path. 3508 * 3509 * Results: 3510 * None. 3511 * 3512 * Side effects: 3513 * Sets the encoding search path to an initial value. 3514 * 3515 *------------------------------------------------------------------------- 3516 */ 3517 3518static void 3519InitializeEncodingSearchPath( 3520 char **valuePtr, 3521 int *lengthPtr, 3522 Tcl_Encoding *encodingPtr) 3523{ 3524 char *bytes; 3525 int i, numDirs, numBytes; 3526 Tcl_Obj *libPath, *encodingObj, *searchPath; 3527 3528 TclNewLiteralStringObj(encodingObj, "encoding"); 3529 TclNewObj(searchPath); 3530 Tcl_IncrRefCount(encodingObj); 3531 Tcl_IncrRefCount(searchPath); 3532 libPath = TclGetLibraryPath(); 3533 Tcl_IncrRefCount(libPath); 3534 Tcl_ListObjLength(NULL, libPath, &numDirs); 3535 3536 for (i = 0; i < numDirs; i++) { 3537 Tcl_Obj *directory, *path; 3538 Tcl_StatBuf stat; 3539 3540 Tcl_ListObjIndex(NULL, libPath, i, &directory); 3541 path = Tcl_FSJoinToPath(directory, 1, &encodingObj); 3542 Tcl_IncrRefCount(path); 3543 if ((0 == Tcl_FSStat(path, &stat)) && S_ISDIR(stat.st_mode)) { 3544 Tcl_ListObjAppendElement(NULL, searchPath, path); 3545 } 3546 Tcl_DecrRefCount(path); 3547 } 3548 3549 Tcl_DecrRefCount(libPath); 3550 Tcl_DecrRefCount(encodingObj); 3551 *encodingPtr = libraryPath.encoding; 3552 if (*encodingPtr) { 3553 ((Encoding *)(*encodingPtr))->refCount++; 3554 } 3555 bytes = Tcl_GetStringFromObj(searchPath, &numBytes); 3556 3557 *lengthPtr = numBytes; 3558 *valuePtr = ckalloc((unsigned int) numBytes + 1); 3559 memcpy(*valuePtr, bytes, (size_t) numBytes + 1); 3560 Tcl_DecrRefCount(searchPath); 3561} 3562 3563/* 3564 * Local Variables: 3565 * mode: c 3566 * c-basic-offset: 4 3567 * fill-column: 78 3568 * End: 3569 */ 3570 3571