1/* 2 * tclEncoding.c -- 3 * 4 * Contains the implementation of the encoding conversion package. 5 * 6 * Copyright (c) 1996-1998 Sun Microsystems, Inc. 7 * 8 * See the file "license.terms" for information on usage and redistribution 9 * of this file, and for a DISCLAIMER OF ALL WARRANTIES. 10 * 11 * RCS: @(#) $Id: tclEncoding.c,v 1.16.2.14 2007/02/12 19:25:42 andreas_kupries Exp $ 12 */ 13 14#include "tclInt.h" 15#include "tclPort.h" 16 17typedef size_t (LengthProc)_ANSI_ARGS_((CONST char *src)); 18 19/* 20 * The following data structure represents an encoding, which describes how 21 * to convert between various character sets and UTF-8. 22 */ 23 24typedef struct Encoding { 25 char *name; /* Name of encoding. Malloced because (1) 26 * hash table entry that owns this encoding 27 * may be freed prior to this encoding being 28 * freed, (2) string passed in the 29 * Tcl_EncodingType structure may not be 30 * persistent. */ 31 Tcl_EncodingConvertProc *toUtfProc; 32 /* Procedure to convert from external 33 * encoding into UTF-8. */ 34 Tcl_EncodingConvertProc *fromUtfProc; 35 /* Procedure to convert from UTF-8 into 36 * external encoding. */ 37 Tcl_EncodingFreeProc *freeProc; 38 /* If non-NULL, procedure to call when this 39 * encoding is deleted. */ 40 int nullSize; /* Number of 0x00 bytes that signify 41 * end-of-string in this encoding. This 42 * number is used to determine the source 43 * string length when the srcLen argument is 44 * negative. This number can be 1 or 2. */ 45 ClientData clientData; /* Arbitrary value associated with encoding 46 * type. Passed to conversion procedures. */ 47 LengthProc *lengthProc; /* Function to compute length of 48 * null-terminated strings in this encoding. 49 * If nullSize is 1, this is strlen; if 50 * nullSize is 2, this is a function that 51 * returns the number of bytes in a 0x0000 52 * terminated string. */ 53 int refCount; /* Number of uses of this structure. */ 54 Tcl_HashEntry *hPtr; /* Hash table entry that owns this encoding. */ 55} Encoding; 56 57/* 58 * The following structure is the clientData for a dynamically-loaded, 59 * table-driven encoding created by LoadTableEncoding(). It maps between 60 * Unicode and a single-byte, double-byte, or multibyte (1 or 2 bytes only) 61 * encoding. 62 */ 63 64typedef struct TableEncodingData { 65 int fallback; /* Character (in this encoding) to 66 * substitute when this encoding cannot 67 * represent a UTF-8 character. */ 68 char prefixBytes[256]; /* If a byte in the input stream is a lead 69 * byte for a 2-byte sequence, the 70 * corresponding entry in this array is 1, 71 * otherwise it is 0. */ 72 unsigned short **toUnicode; /* Two dimensional sparse matrix to map 73 * characters from the encoding to Unicode. 74 * Each element of the toUnicode array points 75 * to an array of 256 shorts. If there is no 76 * corresponding character in Unicode, the 77 * value in the matrix is 0x0000. malloc'd. */ 78 unsigned short **fromUnicode; 79 /* Two dimensional sparse matrix to map 80 * characters from Unicode to the encoding. 81 * Each element of the fromUnicode array 82 * points to an array of 256 shorts. If there 83 * is no corresponding character the encoding, 84 * the value in the matrix is 0x0000. 85 * malloc'd. */ 86} TableEncodingData; 87 88/* 89 * The following structures is the clientData for a dynamically-loaded, 90 * escape-driven encoding that is itself comprised of other simpler 91 * encodings. An example is "iso-2022-jp", which uses escape sequences to 92 * switch between ascii, jis0208, jis0212, gb2312, and ksc5601. Note that 93 * "escape-driven" does not necessarily mean that the ESCAPE character is 94 * the character used for switching character sets. 95 */ 96 97typedef struct EscapeSubTable { 98 unsigned int sequenceLen; /* Length of following string. */ 99 char sequence[16]; /* Escape code that marks this encoding. */ 100 char name[32]; /* Name for encoding. */ 101 Encoding *encodingPtr; /* Encoding loaded using above name, or NULL 102 * if this sub-encoding has not been needed 103 * yet. */ 104} EscapeSubTable; 105 106typedef struct EscapeEncodingData { 107 int fallback; /* Character (in this encoding) to 108 * substitute when this encoding cannot 109 * represent a UTF-8 character. */ 110 unsigned int initLen; /* Length of following string. */ 111 char init[16]; /* String to emit or expect before first char 112 * in conversion. */ 113 unsigned int finalLen; /* Length of following string. */ 114 char final[16]; /* String to emit or expect after last char 115 * in conversion. */ 116 char prefixBytes[256]; /* If a byte in the input stream is the 117 * first character of one of the escape 118 * sequences in the following array, the 119 * corresponding entry in this array is 1, 120 * otherwise it is 0. */ 121 int numSubTables; /* Length of following array. */ 122 EscapeSubTable subTables[1];/* Information about each EscapeSubTable 123 * used by this encoding type. The actual 124 * size will be as large as necessary to 125 * hold all EscapeSubTables. */ 126} EscapeEncodingData; 127 128/* 129 * Constants used when loading an encoding file to identify the type of the 130 * file. 131 */ 132 133#define ENCODING_SINGLEBYTE 0 134#define ENCODING_DOUBLEBYTE 1 135#define ENCODING_MULTIBYTE 2 136#define ENCODING_ESCAPE 3 137 138/* 139 * Initialize the default encoding directory. If this variable contains 140 * a non NULL value, it will be the first path used to locate the 141 * system encoding files. 142 */ 143 144char *tclDefaultEncodingDir = NULL; 145 146static int encodingsInitialized = 0; 147 148/* 149 * Hash table that keeps track of all loaded Encodings. Keys are 150 * the string names that represent the encoding, values are (Encoding *). 151 */ 152 153static Tcl_HashTable encodingTable; 154TCL_DECLARE_MUTEX(encodingMutex) 155 156/* 157 * The following are used to hold the default and current system encodings. 158 * If NULL is passed to one of the conversion routines, the current setting 159 * of the system encoding will be used to perform the conversion. 160 */ 161 162static Tcl_Encoding defaultEncoding; 163static Tcl_Encoding systemEncoding; 164 165/* 166 * The following variable is used in the sparse matrix code for a 167 * TableEncoding to represent a page in the table that has no entries. 168 */ 169 170static unsigned short emptyPage[256]; 171 172/* 173 * Procedures used only in this module. 174 */ 175 176static int BinaryProc _ANSI_ARGS_((ClientData clientData, 177 CONST char *src, int srcLen, int flags, 178 Tcl_EncodingState *statePtr, char *dst, int dstLen, 179 int *srcReadPtr, int *dstWrotePtr, 180 int *dstCharsPtr)); 181static void DupEncodingIntRep _ANSI_ARGS_((Tcl_Obj *srcPtr, 182 Tcl_Obj *dupPtr)); 183static void EscapeFreeProc _ANSI_ARGS_((ClientData clientData)); 184static int EscapeFromUtfProc _ANSI_ARGS_((ClientData clientData, 185 CONST char *src, int srcLen, int flags, 186 Tcl_EncodingState *statePtr, char *dst, int dstLen, 187 int *srcReadPtr, int *dstWrotePtr, 188 int *dstCharsPtr)); 189static int EscapeToUtfProc _ANSI_ARGS_((ClientData clientData, 190 CONST char *src, int srcLen, int flags, 191 Tcl_EncodingState *statePtr, char *dst, int dstLen, 192 int *srcReadPtr, int *dstWrotePtr, 193 int *dstCharsPtr)); 194static void FreeEncoding _ANSI_ARGS_((Tcl_Encoding encoding)); 195static void FreeEncodingIntRep _ANSI_ARGS_((Tcl_Obj *objPtr)); 196static Encoding * GetTableEncoding _ANSI_ARGS_(( 197 EscapeEncodingData *dataPtr, int state)); 198static Tcl_Encoding LoadEncodingFile _ANSI_ARGS_((Tcl_Interp *interp, 199 CONST char *name)); 200static Tcl_Encoding LoadTableEncoding _ANSI_ARGS_((Tcl_Interp *interp, 201 CONST char *name, int type, Tcl_Channel chan)); 202static Tcl_Encoding LoadEscapeEncoding _ANSI_ARGS_((CONST char *name, 203 Tcl_Channel chan)); 204static Tcl_Channel OpenEncodingFile _ANSI_ARGS_((CONST char *dir, 205 CONST char *name)); 206static void TableFreeProc _ANSI_ARGS_((ClientData clientData)); 207static int TableFromUtfProc _ANSI_ARGS_((ClientData clientData, 208 CONST char *src, int srcLen, int flags, 209 Tcl_EncodingState *statePtr, char *dst, int dstLen, 210 int *srcReadPtr, int *dstWrotePtr, 211 int *dstCharsPtr)); 212static int TableToUtfProc _ANSI_ARGS_((ClientData clientData, 213 CONST char *src, int srcLen, int flags, 214 Tcl_EncodingState *statePtr, char *dst, int dstLen, 215 int *srcReadPtr, int *dstWrotePtr, 216 int *dstCharsPtr)); 217static size_t unilen _ANSI_ARGS_((CONST char *src)); 218static int UnicodeToUtfProc _ANSI_ARGS_((ClientData clientData, 219 CONST char *src, int srcLen, int flags, 220 Tcl_EncodingState *statePtr, char *dst, int dstLen, 221 int *srcReadPtr, int *dstWrotePtr, 222 int *dstCharsPtr)); 223static int UtfToUnicodeProc _ANSI_ARGS_((ClientData clientData, 224 CONST char *src, int srcLen, int flags, 225 Tcl_EncodingState *statePtr, char *dst, int dstLen, 226 int *srcReadPtr, int *dstWrotePtr, 227 int *dstCharsPtr)); 228static int UtfToUtfProc _ANSI_ARGS_((ClientData clientData, 229 CONST char *src, int srcLen, int flags, 230 Tcl_EncodingState *statePtr, char *dst, int dstLen, 231 int *srcReadPtr, int *dstWrotePtr, 232 int *dstCharsPtr, int pureNullMode)); 233static int UtfIntToUtfExtProc _ANSI_ARGS_((ClientData clientData, 234 CONST char *src, int srcLen, int flags, 235 Tcl_EncodingState *statePtr, char *dst, int dstLen, 236 int *srcReadPtr, int *dstWrotePtr, 237 int *dstCharsPtr)); 238static int UtfExtToUtfIntProc _ANSI_ARGS_((ClientData clientData, 239 CONST char *src, int srcLen, int flags, 240 Tcl_EncodingState *statePtr, char *dst, int dstLen, 241 int *srcReadPtr, int *dstWrotePtr, 242 int *dstCharsPtr)); 243static int TclFindEncodings _ANSI_ARGS_((CONST char *argv0)); 244 245/* 246 * A Tcl_ObjType for holding a cached Tcl_Encoding as the intrep. 247 * This should help the lifetime of encodings be more useful. 248 * See concerns raised in [Bug 1077262]. 249 */ 250 251static Tcl_ObjType EncodingType = { 252 "encoding", FreeEncodingIntRep, DupEncodingIntRep, NULL, NULL 253}; 254 255 256/* 257 *---------------------------------------------------------------------- 258 * 259 * TclGetEncodingFromObj -- 260 * 261 * Writes to (*encodingPtr) the Tcl_Encoding value of (*objPtr), 262 * if possible, and returns TCL_OK. If no such encoding exists, 263 * TCL_ERROR is returned, and if interp is non-NULL, an error message 264 * is written there. 265 * 266 * Results: 267 * Standard Tcl return code. 268 * 269 * Side effects: 270 * Caches the Tcl_Encoding value as the internal rep of (*objPtr). 271 * 272 *---------------------------------------------------------------------- 273 */ 274int 275TclGetEncodingFromObj(interp, objPtr, encodingPtr) 276 Tcl_Interp *interp; 277 Tcl_Obj *objPtr; 278 Tcl_Encoding *encodingPtr; 279{ 280 CONST char *name = Tcl_GetString(objPtr); 281 if (objPtr->typePtr != &EncodingType) { 282 Tcl_Encoding encoding = Tcl_GetEncoding(interp, name); 283 284 if (encoding == NULL) { 285 return TCL_ERROR; 286 } 287 if (objPtr->typePtr && objPtr->typePtr->freeIntRepProc) { 288 objPtr->typePtr->freeIntRepProc(objPtr); 289 } 290 objPtr->internalRep.otherValuePtr = (VOID *) encoding; 291 objPtr->typePtr = &EncodingType; 292 } 293 *encodingPtr = Tcl_GetEncoding(NULL, name); 294 return TCL_OK; 295} 296 297/* 298 *---------------------------------------------------------------------- 299 * 300 * FreeEncodingIntRep -- 301 * 302 * The Tcl_FreeInternalRepProc for the "encoding" Tcl_ObjType. 303 * 304 *---------------------------------------------------------------------- 305 */ 306static void 307FreeEncodingIntRep(objPtr) 308 Tcl_Obj *objPtr; 309{ 310 Tcl_FreeEncoding((Tcl_Encoding) objPtr->internalRep.otherValuePtr); 311} 312 313/* 314 *---------------------------------------------------------------------- 315 * 316 * DupEncodingIntRep -- 317 * 318 * The Tcl_DupInternalRepProc for the "encoding" Tcl_ObjType. 319 * 320 *---------------------------------------------------------------------- 321 */ 322static void 323DupEncodingIntRep(srcPtr, dupPtr) 324 Tcl_Obj *srcPtr; 325 Tcl_Obj *dupPtr; 326{ 327 dupPtr->internalRep.otherValuePtr = (VOID *) 328 Tcl_GetEncoding(NULL, srcPtr->bytes); 329} 330 331/* 332 *--------------------------------------------------------------------------- 333 * 334 * TclInitEncodingSubsystem -- 335 * 336 * Initialize all resources used by this subsystem on a per-process 337 * basis. 338 * 339 * Results: 340 * None. 341 * 342 * Side effects: 343 * Depends on the memory, object, and IO subsystems. 344 * 345 *--------------------------------------------------------------------------- 346 */ 347 348void 349TclInitEncodingSubsystem() 350{ 351 Tcl_EncodingType type; 352 353 Tcl_MutexLock(&encodingMutex); 354 Tcl_InitHashTable(&encodingTable, TCL_STRING_KEYS); 355 Tcl_MutexUnlock(&encodingMutex); 356 357 /* 358 * Create a few initial encodings. Note that the UTF-8 to UTF-8 359 * translation is not a no-op, because it will turn a stream of 360 * improperly formed UTF-8 into a properly formed stream. 361 */ 362 363 type.encodingName = "identity"; 364 type.toUtfProc = BinaryProc; 365 type.fromUtfProc = BinaryProc; 366 type.freeProc = NULL; 367 type.nullSize = 1; 368 type.clientData = NULL; 369 370 defaultEncoding = Tcl_CreateEncoding(&type); 371 systemEncoding = Tcl_GetEncoding(NULL, type.encodingName); 372 373 type.encodingName = "utf-8"; 374 type.toUtfProc = UtfExtToUtfIntProc; 375 type.fromUtfProc = UtfIntToUtfExtProc; 376 type.freeProc = NULL; 377 type.nullSize = 1; 378 type.clientData = NULL; 379 Tcl_CreateEncoding(&type); 380 381 type.encodingName = "unicode"; 382 type.toUtfProc = UnicodeToUtfProc; 383 type.fromUtfProc = UtfToUnicodeProc; 384 type.freeProc = NULL; 385 type.nullSize = 2; 386 type.clientData = NULL; 387 Tcl_CreateEncoding(&type); 388} 389 390 391/* 392 *---------------------------------------------------------------------- 393 * 394 * TclFinalizeEncodingSubsystem -- 395 * 396 * Release the state associated with the encoding subsystem. 397 * 398 * Results: 399 * None. 400 * 401 * Side effects: 402 * Frees all of the encodings. 403 * 404 *---------------------------------------------------------------------- 405 */ 406 407void 408TclFinalizeEncodingSubsystem() 409{ 410 Tcl_HashSearch search; 411 Tcl_HashEntry *hPtr; 412 413 Tcl_MutexLock(&encodingMutex); 414 encodingsInitialized = 0; 415 FreeEncoding(systemEncoding); 416 hPtr = Tcl_FirstHashEntry(&encodingTable, &search); 417 while (hPtr != NULL) { 418 /* 419 * Call FreeEncoding instead of doing it directly to handle refcounts 420 * like escape encodings use. [Bug #524674] 421 * Make sure to call Tcl_FirstHashEntry repeatedly so that all 422 * encodings are eventually cleaned up. 423 */ 424 FreeEncoding((Tcl_Encoding) Tcl_GetHashValue(hPtr)); 425 hPtr = Tcl_FirstHashEntry(&encodingTable, &search); 426 } 427 Tcl_DeleteHashTable(&encodingTable); 428 Tcl_MutexUnlock(&encodingMutex); 429} 430 431/* 432 *------------------------------------------------------------------------- 433 * 434 * Tcl_GetDefaultEncodingDir -- 435 * 436 * 437 * Results: 438 * 439 * Side effects: 440 * 441 *------------------------------------------------------------------------- 442 */ 443 444CONST char * 445Tcl_GetDefaultEncodingDir() 446{ 447 return tclDefaultEncodingDir; 448} 449 450/* 451 *------------------------------------------------------------------------- 452 * 453 * Tcl_SetDefaultEncodingDir -- 454 * 455 * 456 * Results: 457 * 458 * Side effects: 459 * 460 *------------------------------------------------------------------------- 461 */ 462 463void 464Tcl_SetDefaultEncodingDir(path) 465 CONST char *path; 466{ 467 tclDefaultEncodingDir = (char *)ckalloc((unsigned) strlen(path) + 1); 468 strcpy(tclDefaultEncodingDir, path); 469} 470 471/* 472 *------------------------------------------------------------------------- 473 * 474 * Tcl_GetEncoding -- 475 * 476 * Given the name of a encoding, find the corresponding Tcl_Encoding 477 * token. If the encoding did not already exist, Tcl attempts to 478 * dynamically load an encoding by that name. 479 * 480 * Results: 481 * Returns a token that represents the encoding. If the name didn't 482 * refer to any known or loadable encoding, NULL is returned. If 483 * NULL was returned, an error message is left in interp's result 484 * object, unless interp was NULL. 485 * 486 * Side effects: 487 * The new encoding type is entered into a table visible to all 488 * interpreters, keyed off the encoding's name. For each call to 489 * this procedure, there should eventually be a call to 490 * Tcl_FreeEncoding, so that the database can be cleaned up when 491 * encodings aren't needed anymore. 492 * 493 *------------------------------------------------------------------------- 494 */ 495 496Tcl_Encoding 497Tcl_GetEncoding(interp, name) 498 Tcl_Interp *interp; /* Interp for error reporting, if not NULL. */ 499 CONST char *name; /* The name of the desired encoding. */ 500{ 501 Tcl_HashEntry *hPtr; 502 Encoding *encodingPtr; 503 504 Tcl_MutexLock(&encodingMutex); 505 if (name == NULL) { 506 encodingPtr = (Encoding *) systemEncoding; 507 encodingPtr->refCount++; 508 Tcl_MutexUnlock(&encodingMutex); 509 return systemEncoding; 510 } 511 512 hPtr = Tcl_FindHashEntry(&encodingTable, name); 513 if (hPtr != NULL) { 514 encodingPtr = (Encoding *) Tcl_GetHashValue(hPtr); 515 encodingPtr->refCount++; 516 Tcl_MutexUnlock(&encodingMutex); 517 return (Tcl_Encoding) encodingPtr; 518 } 519 Tcl_MutexUnlock(&encodingMutex); 520 return LoadEncodingFile(interp, name); 521} 522 523/* 524 *--------------------------------------------------------------------------- 525 * 526 * Tcl_FreeEncoding -- 527 * 528 * This procedure is called to release an encoding allocated by 529 * Tcl_CreateEncoding() or Tcl_GetEncoding(). 530 * 531 * Results: 532 * None. 533 * 534 * Side effects: 535 * The reference count associated with the encoding is decremented 536 * and the encoding may be deleted if nothing is using it anymore. 537 * 538 *--------------------------------------------------------------------------- 539 */ 540 541void 542Tcl_FreeEncoding(encoding) 543 Tcl_Encoding encoding; 544{ 545 Tcl_MutexLock(&encodingMutex); 546 FreeEncoding(encoding); 547 Tcl_MutexUnlock(&encodingMutex); 548} 549 550/* 551 *---------------------------------------------------------------------- 552 * 553 * FreeEncoding -- 554 * 555 * This procedure is called to release an encoding by procedures 556 * that already have the encodingMutex. 557 * 558 * Results: 559 * None. 560 * 561 * Side effects: 562 * The reference count associated with the encoding is decremented 563 * and the encoding may be deleted if nothing is using it anymore. 564 * 565 *---------------------------------------------------------------------- 566 */ 567 568static void 569FreeEncoding(encoding) 570 Tcl_Encoding encoding; 571{ 572 Encoding *encodingPtr; 573 574 encodingPtr = (Encoding *) encoding; 575 if (encodingPtr == NULL) { 576 return; 577 } 578 if (encodingPtr->refCount<=0) { 579 Tcl_Panic("FreeEncoding: refcount problem !!!"); 580 } 581 encodingPtr->refCount--; 582 if (encodingPtr->refCount == 0) { 583 if (encodingPtr->freeProc != NULL) { 584 (*encodingPtr->freeProc)(encodingPtr->clientData); 585 } 586 if (encodingPtr->hPtr != NULL) { 587 Tcl_DeleteHashEntry(encodingPtr->hPtr); 588 } 589 ckfree((char *) encodingPtr->name); 590 ckfree((char *) encodingPtr); 591 } 592} 593 594/* 595 *------------------------------------------------------------------------- 596 * 597 * Tcl_GetEncodingName -- 598 * 599 * Given an encoding, return the name that was used to constuct 600 * the encoding. 601 * 602 * Results: 603 * The name of the encoding. 604 * 605 * Side effects: 606 * None. 607 * 608 *--------------------------------------------------------------------------- 609 */ 610 611CONST char * 612Tcl_GetEncodingName(encoding) 613 Tcl_Encoding encoding; /* The encoding whose name to fetch. */ 614{ 615 Encoding *encodingPtr; 616 617 if (encoding == NULL) { 618 encoding = systemEncoding; 619 } 620 encodingPtr = (Encoding *) encoding; 621 return encodingPtr->name; 622} 623 624/* 625 *------------------------------------------------------------------------- 626 * 627 * Tcl_GetEncodingNames -- 628 * 629 * Get the list of all known encodings, including the ones stored 630 * as files on disk in the encoding path. 631 * 632 * Results: 633 * Modifies interp's result object to hold a list of all the available 634 * encodings. 635 * 636 * Side effects: 637 * None. 638 * 639 *------------------------------------------------------------------------- 640 */ 641 642void 643Tcl_GetEncodingNames(interp) 644 Tcl_Interp *interp; /* Interp to hold result. */ 645{ 646 Tcl_HashSearch search; 647 Tcl_HashEntry *hPtr; 648 Tcl_Obj *pathPtr, *resultPtr; 649 int dummy; 650 651 Tcl_HashTable table; 652 653 Tcl_MutexLock(&encodingMutex); 654 Tcl_InitHashTable(&table, TCL_STRING_KEYS); 655 hPtr = Tcl_FirstHashEntry(&encodingTable, &search); 656 while (hPtr != NULL) { 657 Encoding *encodingPtr; 658 659 encodingPtr = (Encoding *) Tcl_GetHashValue(hPtr); 660 Tcl_CreateHashEntry(&table, encodingPtr->name, &dummy); 661 hPtr = Tcl_NextHashEntry(&search); 662 } 663 Tcl_MutexUnlock(&encodingMutex); 664 665 pathPtr = TclGetLibraryPath(); 666 if (pathPtr != NULL) { 667 int i, objc; 668 Tcl_Obj **objv; 669 char globArgString[10]; 670 Tcl_Obj* encodingObj = Tcl_NewStringObj("encoding",-1); 671 Tcl_IncrRefCount(encodingObj); 672 673 objc = 0; 674 Tcl_ListObjGetElements(NULL, pathPtr, &objc, &objv); 675 676 for (i = 0; i < objc; i++) { 677 Tcl_Obj *searchIn; 678 679 /* 680 * Construct the path from the element of pathPtr, 681 * joined with 'encoding'. 682 */ 683 searchIn = Tcl_FSJoinToPath(objv[i],1,&encodingObj); 684 Tcl_IncrRefCount(searchIn); 685 Tcl_ResetResult(interp); 686 687 /* 688 * TclGlob() changes the contents of globArgString, which causes 689 * a segfault if we pass in a pointer to non-writeable memory. 690 * TclGlob() puts its results directly into interp. 691 */ 692 693 strcpy(globArgString, "*.enc"); 694 /* 695 * The GLOBMODE_TAILS flag returns just the tail of each file 696 * which is the encoding name with a .enc extension 697 */ 698 if ((TclGlob(interp, globArgString, searchIn, 699 TCL_GLOBMODE_TAILS, NULL) == TCL_OK)) { 700 int objc2 = 0; 701 Tcl_Obj **objv2; 702 int j; 703 704 Tcl_ListObjGetElements(NULL, Tcl_GetObjResult(interp), &objc2, 705 &objv2); 706 707 for (j = 0; j < objc2; j++) { 708 int length; 709 char *string; 710 string = Tcl_GetStringFromObj(objv2[j], &length); 711 length -= 4; 712 if (length > 0) { 713 string[length] = '\0'; 714 Tcl_CreateHashEntry(&table, string, &dummy); 715 string[length] = '.'; 716 } 717 } 718 } 719 Tcl_DecrRefCount(searchIn); 720 } 721 Tcl_DecrRefCount(encodingObj); 722 } 723 724 /* 725 * Clear any values placed in the result by globbing. 726 */ 727 728 Tcl_ResetResult(interp); 729 resultPtr = Tcl_GetObjResult(interp); 730 731 hPtr = Tcl_FirstHashEntry(&table, &search); 732 while (hPtr != NULL) { 733 Tcl_Obj *strPtr; 734 735 strPtr = Tcl_NewStringObj(Tcl_GetHashKey(&table, hPtr), -1); 736 Tcl_ListObjAppendElement(NULL, resultPtr, strPtr); 737 hPtr = Tcl_NextHashEntry(&search); 738 } 739 Tcl_DeleteHashTable(&table); 740} 741 742/* 743 *------------------------------------------------------------------------ 744 * 745 * Tcl_SetSystemEncoding -- 746 * 747 * Sets the default encoding that should be used whenever the user 748 * passes a NULL value in to one of the conversion routines. 749 * If the supplied name is NULL, the system encoding is reset to the 750 * default system encoding. 751 * 752 * Results: 753 * The return value is TCL_OK if the system encoding was successfully 754 * set to the encoding specified by name, TCL_ERROR otherwise. If 755 * TCL_ERROR is returned, an error message is left in interp's result 756 * object, unless interp was NULL. 757 * 758 * Side effects: 759 * The reference count of the new system encoding is incremented. 760 * The reference count of the old system encoding is decremented and 761 * it may be freed. 762 * 763 *------------------------------------------------------------------------ 764 */ 765 766int 767Tcl_SetSystemEncoding(interp, name) 768 Tcl_Interp *interp; /* Interp for error reporting, if not NULL. */ 769 CONST char *name; /* The name of the desired encoding, or NULL 770 * to reset to default encoding. */ 771{ 772 Tcl_Encoding encoding; 773 Encoding *encodingPtr; 774 775 if (name == NULL) { 776 Tcl_MutexLock(&encodingMutex); 777 encoding = defaultEncoding; 778 encodingPtr = (Encoding *) encoding; 779 encodingPtr->refCount++; 780 Tcl_MutexUnlock(&encodingMutex); 781 } else { 782 encoding = Tcl_GetEncoding(interp, name); 783 if (encoding == NULL) { 784 return TCL_ERROR; 785 } 786 } 787 788 Tcl_MutexLock(&encodingMutex); 789 FreeEncoding(systemEncoding); 790 systemEncoding = encoding; 791 Tcl_MutexUnlock(&encodingMutex); 792 793 return TCL_OK; 794} 795 796/* 797 *--------------------------------------------------------------------------- 798 * 799 * Tcl_CreateEncoding -- 800 * 801 * This procedure is called to define a new encoding and the procedures 802 * that are used to convert between the specified encoding and Unicode. 803 * 804 * Results: 805 * Returns a token that represents the encoding. If an encoding with 806 * the same name already existed, the old encoding token remains 807 * valid and continues to behave as it used to, and will eventually 808 * be garbage collected when the last reference to it goes away. Any 809 * subsequent calls to Tcl_GetEncoding with the specified name will 810 * retrieve the most recent encoding token. 811 * 812 * Side effects: 813 * The new encoding type is entered into a table visible to all 814 * interpreters, keyed off the encoding's name. For each call to 815 * this procedure, there should eventually be a call to 816 * Tcl_FreeEncoding, so that the database can be cleaned up when 817 * encodings aren't needed anymore. 818 * 819 *--------------------------------------------------------------------------- 820 */ 821 822Tcl_Encoding 823Tcl_CreateEncoding(typePtr) 824 Tcl_EncodingType *typePtr; /* The encoding type. */ 825{ 826 Tcl_HashEntry *hPtr; 827 int new; 828 Encoding *encodingPtr; 829 char *name; 830 831 Tcl_MutexLock(&encodingMutex); 832 hPtr = Tcl_CreateHashEntry(&encodingTable, typePtr->encodingName, &new); 833 if (new == 0) { 834 /* 835 * Remove old encoding from hash table, but don't delete it until 836 * last reference goes away. 837 */ 838 839 encodingPtr = (Encoding *) Tcl_GetHashValue(hPtr); 840 encodingPtr->hPtr = NULL; 841 } 842 843 name = ckalloc((unsigned) strlen(typePtr->encodingName) + 1); 844 845 encodingPtr = (Encoding *) ckalloc(sizeof(Encoding)); 846 encodingPtr->name = strcpy(name, typePtr->encodingName); 847 encodingPtr->toUtfProc = typePtr->toUtfProc; 848 encodingPtr->fromUtfProc = typePtr->fromUtfProc; 849 encodingPtr->freeProc = typePtr->freeProc; 850 encodingPtr->nullSize = typePtr->nullSize; 851 encodingPtr->clientData = typePtr->clientData; 852 if (typePtr->nullSize == 1) { 853 encodingPtr->lengthProc = (LengthProc *) strlen; 854 } else { 855 encodingPtr->lengthProc = (LengthProc *) unilen; 856 } 857 encodingPtr->refCount = 1; 858 encodingPtr->hPtr = hPtr; 859 Tcl_SetHashValue(hPtr, encodingPtr); 860 861 Tcl_MutexUnlock(&encodingMutex); 862 863 return (Tcl_Encoding) encodingPtr; 864} 865 866/* 867 *------------------------------------------------------------------------- 868 * 869 * Tcl_ExternalToUtfDString -- 870 * 871 * Convert a source buffer from the specified encoding into UTF-8. 872 * If any of the bytes in the source buffer are invalid or cannot 873 * be represented in the target encoding, a default fallback 874 * character will be substituted. 875 * 876 * Results: 877 * The converted bytes are stored in the DString, which is then NULL 878 * terminated. The return value is a pointer to the value stored 879 * in the DString. 880 * 881 * Side effects: 882 * None. 883 * 884 *------------------------------------------------------------------------- 885 */ 886 887char * 888Tcl_ExternalToUtfDString(encoding, src, srcLen, dstPtr) 889 Tcl_Encoding encoding; /* The encoding for the source string, or 890 * NULL for the default system encoding. */ 891 CONST char *src; /* Source string in specified encoding. */ 892 int srcLen; /* Source string length in bytes, or < 0 for 893 * encoding-specific string length. */ 894 Tcl_DString *dstPtr; /* Uninitialized or free DString in which 895 * the converted string is stored. */ 896{ 897 char *dst; 898 Tcl_EncodingState state; 899 Encoding *encodingPtr; 900 int flags, dstLen, result, soFar, srcRead, dstWrote, dstChars; 901 902 Tcl_DStringInit(dstPtr); 903 dst = Tcl_DStringValue(dstPtr); 904 dstLen = dstPtr->spaceAvl - 1; 905 906 if (encoding == NULL) { 907 encoding = systemEncoding; 908 } 909 encodingPtr = (Encoding *) encoding; 910 911 if (src == NULL) { 912 srcLen = 0; 913 } else if (srcLen < 0) { 914 srcLen = (*encodingPtr->lengthProc)(src); 915 } 916 flags = TCL_ENCODING_START | TCL_ENCODING_END; 917 while (1) { 918 result = (*encodingPtr->toUtfProc)(encodingPtr->clientData, src, 919 srcLen, flags, &state, dst, dstLen, &srcRead, &dstWrote, 920 &dstChars); 921 soFar = dst + dstWrote - Tcl_DStringValue(dstPtr); 922 if (result != TCL_CONVERT_NOSPACE) { 923 Tcl_DStringSetLength(dstPtr, soFar); 924 return Tcl_DStringValue(dstPtr); 925 } 926 flags &= ~TCL_ENCODING_START; 927 src += srcRead; 928 srcLen -= srcRead; 929 if (Tcl_DStringLength(dstPtr) == 0) { 930 Tcl_DStringSetLength(dstPtr, dstLen); 931 } 932 Tcl_DStringSetLength(dstPtr, 2 * Tcl_DStringLength(dstPtr) + 1); 933 dst = Tcl_DStringValue(dstPtr) + soFar; 934 dstLen = Tcl_DStringLength(dstPtr) - soFar - 1; 935 } 936} 937 938/* 939 *------------------------------------------------------------------------- 940 * 941 * Tcl_ExternalToUtf -- 942 * 943 * Convert a source buffer from the specified encoding into UTF-8. 944 * 945 * Results: 946 * The return value is one of TCL_OK, TCL_CONVERT_MULTIBYTE, 947 * TCL_CONVERT_SYNTAX, TCL_CONVERT_UNKNOWN, or TCL_CONVERT_NOSPACE, 948 * as documented in tcl.h. 949 * 950 * Side effects: 951 * The converted bytes are stored in the output buffer. 952 * 953 *------------------------------------------------------------------------- 954 */ 955 956int 957Tcl_ExternalToUtf(interp, encoding, src, srcLen, flags, statePtr, dst, 958 dstLen, srcReadPtr, dstWrotePtr, dstCharsPtr) 959 Tcl_Interp *interp; /* Interp for error return, if not NULL. */ 960 Tcl_Encoding encoding; /* The encoding for the source string, or 961 * NULL for the default system encoding. */ 962 CONST char *src; /* Source string in specified encoding. */ 963 int srcLen; /* Source string length in bytes, or < 0 for 964 * encoding-specific string length. */ 965 int flags; /* Conversion control flags. */ 966 Tcl_EncodingState *statePtr;/* Place for conversion routine to store 967 * state information used during a piecewise 968 * conversion. Contents of statePtr are 969 * initialized and/or reset by conversion 970 * routine under control of flags argument. */ 971 char *dst; /* Output buffer in which converted string 972 * is stored. */ 973 int dstLen; /* The maximum length of output buffer in 974 * bytes. */ 975 int *srcReadPtr; /* Filled with the number of bytes from the 976 * source string that were converted. This 977 * may be less than the original source length 978 * if there was a problem converting some 979 * source characters. */ 980 int *dstWrotePtr; /* Filled with the number of bytes that were 981 * stored in the output buffer as a result of 982 * the conversion. */ 983 int *dstCharsPtr; /* Filled with the number of characters that 984 * correspond to the bytes stored in the 985 * output buffer. */ 986{ 987 Encoding *encodingPtr; 988 int result, srcRead, dstWrote, dstChars; 989 Tcl_EncodingState state; 990 991 if (encoding == NULL) { 992 encoding = systemEncoding; 993 } 994 encodingPtr = (Encoding *) encoding; 995 996 if (src == NULL) { 997 srcLen = 0; 998 } else if (srcLen < 0) { 999 srcLen = (*encodingPtr->lengthProc)(src); 1000 } 1001 if (statePtr == NULL) { 1002 flags |= TCL_ENCODING_START | TCL_ENCODING_END; 1003 statePtr = &state; 1004 } 1005 if (srcReadPtr == NULL) { 1006 srcReadPtr = &srcRead; 1007 } 1008 if (dstWrotePtr == NULL) { 1009 dstWrotePtr = &dstWrote; 1010 } 1011 if (dstCharsPtr == NULL) { 1012 dstCharsPtr = &dstChars; 1013 } 1014 1015 /* 1016 * If there are any null characters in the middle of the buffer, they will 1017 * converted to the UTF-8 null character (\xC080). To get the actual 1018 * \0 at the end of the destination buffer, we need to append it manually. 1019 */ 1020 1021 dstLen--; 1022 result = (*encodingPtr->toUtfProc)(encodingPtr->clientData, src, srcLen, 1023 flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr, 1024 dstCharsPtr); 1025 dst[*dstWrotePtr] = '\0'; 1026 return result; 1027} 1028 1029/* 1030 *------------------------------------------------------------------------- 1031 * 1032 * Tcl_UtfToExternalDString -- 1033 * 1034 * Convert a source buffer from UTF-8 into the specified encoding. 1035 * If any of the bytes in the source buffer are invalid or cannot 1036 * be represented in the target encoding, a default fallback 1037 * character will be substituted. 1038 * 1039 * Results: 1040 * The converted bytes are stored in the DString, which is then 1041 * NULL terminated in an encoding-specific manner. The return value 1042 * is a pointer to the value stored in the DString. 1043 * 1044 * Side effects: 1045 * None. 1046 * 1047 *------------------------------------------------------------------------- 1048 */ 1049 1050char * 1051Tcl_UtfToExternalDString(encoding, src, srcLen, dstPtr) 1052 Tcl_Encoding encoding; /* The encoding for the converted string, 1053 * or NULL for the default system encoding. */ 1054 CONST char *src; /* Source string in UTF-8. */ 1055 int srcLen; /* Source string length in bytes, or < 0 for 1056 * strlen(). */ 1057 Tcl_DString *dstPtr; /* Uninitialized or free DString in which 1058 * the converted string is stored. */ 1059{ 1060 char *dst; 1061 Tcl_EncodingState state; 1062 Encoding *encodingPtr; 1063 int flags, dstLen, result, soFar, srcRead, dstWrote, dstChars; 1064 1065 Tcl_DStringInit(dstPtr); 1066 dst = Tcl_DStringValue(dstPtr); 1067 dstLen = dstPtr->spaceAvl - 1; 1068 1069 if (encoding == NULL) { 1070 encoding = systemEncoding; 1071 } 1072 encodingPtr = (Encoding *) encoding; 1073 1074 if (src == NULL) { 1075 srcLen = 0; 1076 } else if (srcLen < 0) { 1077 srcLen = strlen(src); 1078 } 1079 flags = TCL_ENCODING_START | TCL_ENCODING_END; 1080 while (1) { 1081 result = (*encodingPtr->fromUtfProc)(encodingPtr->clientData, src, 1082 srcLen, flags, &state, dst, dstLen, &srcRead, &dstWrote, 1083 &dstChars); 1084 soFar = dst + dstWrote - Tcl_DStringValue(dstPtr); 1085 if (result != TCL_CONVERT_NOSPACE) { 1086 if (encodingPtr->nullSize == 2) { 1087 Tcl_DStringSetLength(dstPtr, soFar + 1); 1088 } 1089 Tcl_DStringSetLength(dstPtr, soFar); 1090 return Tcl_DStringValue(dstPtr); 1091 } 1092 flags &= ~TCL_ENCODING_START; 1093 src += srcRead; 1094 srcLen -= srcRead; 1095 if (Tcl_DStringLength(dstPtr) == 0) { 1096 Tcl_DStringSetLength(dstPtr, dstLen); 1097 } 1098 Tcl_DStringSetLength(dstPtr, 2 * Tcl_DStringLength(dstPtr) + 1); 1099 dst = Tcl_DStringValue(dstPtr) + soFar; 1100 dstLen = Tcl_DStringLength(dstPtr) - soFar - 1; 1101 } 1102} 1103 1104/* 1105 *------------------------------------------------------------------------- 1106 * 1107 * Tcl_UtfToExternal -- 1108 * 1109 * Convert a buffer from UTF-8 into the specified encoding. 1110 * 1111 * Results: 1112 * The return value is one of TCL_OK, TCL_CONVERT_MULTIBYTE, 1113 * TCL_CONVERT_SYNTAX, TCL_CONVERT_UNKNOWN, or TCL_CONVERT_NOSPACE, 1114 * as documented in tcl.h. 1115 * 1116 * Side effects: 1117 * The converted bytes are stored in the output buffer. 1118 * 1119 *------------------------------------------------------------------------- 1120 */ 1121 1122int 1123Tcl_UtfToExternal(interp, encoding, src, srcLen, flags, statePtr, dst, 1124 dstLen, srcReadPtr, dstWrotePtr, dstCharsPtr) 1125 Tcl_Interp *interp; /* Interp for error return, if not NULL. */ 1126 Tcl_Encoding encoding; /* The encoding for the converted string, 1127 * or NULL for the default system encoding. */ 1128 CONST char *src; /* Source string in UTF-8. */ 1129 int srcLen; /* Source string length in bytes, or < 0 for 1130 * strlen(). */ 1131 int flags; /* Conversion control flags. */ 1132 Tcl_EncodingState *statePtr;/* Place for conversion routine to store 1133 * state information used during a piecewise 1134 * conversion. Contents of statePtr are 1135 * initialized and/or reset by conversion 1136 * routine under control of flags argument. */ 1137 char *dst; /* Output buffer in which converted string 1138 * is stored. */ 1139 int dstLen; /* The maximum length of output buffer in 1140 * bytes. */ 1141 int *srcReadPtr; /* Filled with the number of bytes from the 1142 * source string that were converted. This 1143 * may be less than the original source length 1144 * if there was a problem converting some 1145 * source characters. */ 1146 int *dstWrotePtr; /* Filled with the number of bytes that were 1147 * stored in the output buffer as a result of 1148 * the conversion. */ 1149 int *dstCharsPtr; /* Filled with the number of characters that 1150 * correspond to the bytes stored in the 1151 * output buffer. */ 1152{ 1153 Encoding *encodingPtr; 1154 int result, srcRead, dstWrote, dstChars; 1155 Tcl_EncodingState state; 1156 1157 if (encoding == NULL) { 1158 encoding = systemEncoding; 1159 } 1160 encodingPtr = (Encoding *) encoding; 1161 1162 if (src == NULL) { 1163 srcLen = 0; 1164 } else if (srcLen < 0) { 1165 srcLen = strlen(src); 1166 } 1167 if (statePtr == NULL) { 1168 flags |= TCL_ENCODING_START | TCL_ENCODING_END; 1169 statePtr = &state; 1170 } 1171 if (srcReadPtr == NULL) { 1172 srcReadPtr = &srcRead; 1173 } 1174 if (dstWrotePtr == NULL) { 1175 dstWrotePtr = &dstWrote; 1176 } 1177 if (dstCharsPtr == NULL) { 1178 dstCharsPtr = &dstChars; 1179 } 1180 1181 dstLen -= encodingPtr->nullSize; 1182 result = (*encodingPtr->fromUtfProc)(encodingPtr->clientData, src, srcLen, 1183 flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr, 1184 dstCharsPtr); 1185 if (encodingPtr->nullSize == 2) { 1186 dst[*dstWrotePtr + 1] = '\0'; 1187 } 1188 dst[*dstWrotePtr] = '\0'; 1189 1190 return result; 1191} 1192 1193/* 1194 *--------------------------------------------------------------------------- 1195 * 1196 * Tcl_FindExecutable -- 1197 * 1198 * This procedure computes the absolute path name of the current 1199 * application, given its argv[0] value. 1200 * 1201 * Results: 1202 * None. 1203 * 1204 * Side effects: 1205 * The variable tclExecutableName gets filled in with the file 1206 * name for the application, if we figured it out. If we couldn't 1207 * figure it out, tclExecutableName is set to NULL. 1208 * 1209 *--------------------------------------------------------------------------- 1210 */ 1211 1212void 1213Tcl_FindExecutable(argv0) 1214 CONST char *argv0; /* The value of the application's argv[0] 1215 * (native). */ 1216{ 1217 int mustCleanUtf; 1218 CONST char *name; 1219 Tcl_DString buffer, nameString; 1220 1221 TclInitSubsystems(argv0); 1222 1223 if (argv0 == NULL) { 1224 goto done; 1225 } 1226 if (tclExecutableName != NULL) { 1227 ckfree(tclExecutableName); 1228 tclExecutableName = NULL; 1229 } 1230 if ((name = TclpFindExecutable(argv0)) == NULL) { 1231 goto done; 1232 } 1233 1234 /* 1235 * The value returned from TclpNameOfExecutable is a UTF string that 1236 * is possibly dirty depending on when it was initialized. 1237 * TclFindEncodings will indicate whether we must "clean" the UTF (as 1238 * reported by the underlying system). To assure that the UTF string 1239 * is a properly encoded native string for this system, convert the 1240 * UTF string to the default native encoding before the default 1241 * encoding is initialized. Then, convert it back to UTF after the 1242 * system encoding is loaded. 1243 */ 1244 1245 Tcl_UtfToExternalDString(NULL, name, -1, &buffer); 1246 mustCleanUtf = TclFindEncodings(argv0); 1247 1248 /* 1249 * Now it is OK to convert the native string back to UTF and set 1250 * the value of the tclExecutableName. 1251 */ 1252 1253 if (mustCleanUtf) { 1254 Tcl_ExternalToUtfDString(NULL, Tcl_DStringValue(&buffer), -1, 1255 &nameString); 1256 tclExecutableName = (char *) 1257 ckalloc((unsigned) (Tcl_DStringLength(&nameString) + 1)); 1258 strcpy(tclExecutableName, Tcl_DStringValue(&nameString)); 1259 1260 Tcl_DStringFree(&nameString); 1261 } else { 1262 tclExecutableName = (char *) ckalloc((unsigned) (strlen(name) + 1)); 1263 strcpy(tclExecutableName, name); 1264 } 1265 Tcl_DStringFree(&buffer); 1266 return; 1267 1268 done: 1269 (void) TclFindEncodings(argv0); 1270} 1271 1272/* 1273 *--------------------------------------------------------------------------- 1274 * 1275 * LoadEncodingFile -- 1276 * 1277 * Read a file that describes an encoding and create a new Encoding 1278 * from the data. 1279 * 1280 * Results: 1281 * The return value is the newly loaded Encoding, or NULL if 1282 * the file didn't exist of was in the incorrect format. If NULL was 1283 * returned, an error message is left in interp's result object, 1284 * unless interp was NULL. 1285 * 1286 * Side effects: 1287 * File read from disk. 1288 * 1289 *--------------------------------------------------------------------------- 1290 */ 1291 1292static Tcl_Encoding 1293LoadEncodingFile(interp, name) 1294 Tcl_Interp *interp; /* Interp for error reporting, if not NULL. */ 1295 CONST char *name; /* The name of the encoding file on disk 1296 * and also the name for new encoding. */ 1297{ 1298 int objc, i, ch; 1299 Tcl_Obj **objv; 1300 Tcl_Obj *pathPtr; 1301 Tcl_Channel chan; 1302 Tcl_Encoding encoding; 1303 1304 pathPtr = TclGetLibraryPath(); 1305 if (pathPtr == NULL) { 1306 goto unknown; 1307 } 1308 objc = 0; 1309 Tcl_ListObjGetElements(NULL, pathPtr, &objc, &objv); 1310 1311 chan = NULL; 1312 for (i = 0; i < objc; i++) { 1313 chan = OpenEncodingFile(Tcl_GetString(objv[i]), name); 1314 if (chan != NULL) { 1315 break; 1316 } 1317 } 1318 1319 if (chan == NULL) { 1320 goto unknown; 1321 } 1322 1323 Tcl_SetChannelOption(NULL, chan, "-encoding", "utf-8"); 1324 1325 while (1) { 1326 Tcl_DString ds; 1327 1328 Tcl_DStringInit(&ds); 1329 Tcl_Gets(chan, &ds); 1330 ch = Tcl_DStringValue(&ds)[0]; 1331 Tcl_DStringFree(&ds); 1332 if (ch != '#') { 1333 break; 1334 } 1335 } 1336 1337 encoding = NULL; 1338 switch (ch) { 1339 case 'S': { 1340 encoding = LoadTableEncoding(interp, name, ENCODING_SINGLEBYTE, 1341 chan); 1342 break; 1343 } 1344 case 'D': { 1345 encoding = LoadTableEncoding(interp, name, ENCODING_DOUBLEBYTE, 1346 chan); 1347 break; 1348 } 1349 case 'M': { 1350 encoding = LoadTableEncoding(interp, name, ENCODING_MULTIBYTE, 1351 chan); 1352 break; 1353 } 1354 case 'E': { 1355 encoding = LoadEscapeEncoding(name, chan); 1356 break; 1357 } 1358 } 1359 if ((encoding == NULL) && (interp != NULL)) { 1360 Tcl_AppendResult(interp, "invalid encoding file \"", name, "\"", NULL); 1361 if (ch == 'E') { 1362 Tcl_AppendResult(interp, " or missing sub-encoding", NULL); 1363 } 1364 } 1365 Tcl_Close(NULL, chan); 1366 return encoding; 1367 1368 unknown: 1369 if (interp != NULL) { 1370 Tcl_AppendResult(interp, "unknown encoding \"", name, "\"", NULL); 1371 } 1372 return NULL; 1373} 1374 1375/* 1376 *---------------------------------------------------------------------- 1377 * 1378 * OpenEncodingFile -- 1379 * 1380 * Look for the file encoding/<name>.enc in the specified 1381 * directory. 1382 * 1383 * Results: 1384 * Returns an open file channel if the file exists. 1385 * 1386 * Side effects: 1387 * None. 1388 * 1389 *---------------------------------------------------------------------- 1390 */ 1391 1392static Tcl_Channel 1393OpenEncodingFile(dir, name) 1394 CONST char *dir; 1395 CONST char *name; 1396 1397{ 1398 CONST char *argv[3]; 1399 Tcl_DString pathString; 1400 CONST char *path; 1401 Tcl_Channel chan; 1402 Tcl_Obj *pathPtr; 1403 1404 argv[0] = dir; 1405 argv[1] = "encoding"; 1406 argv[2] = name; 1407 1408 Tcl_DStringInit(&pathString); 1409 Tcl_JoinPath(3, argv, &pathString); 1410 path = Tcl_DStringAppend(&pathString, ".enc", -1); 1411 pathPtr = Tcl_NewStringObj(path,-1); 1412 1413 Tcl_IncrRefCount(pathPtr); 1414 chan = Tcl_FSOpenFileChannel(NULL, pathPtr, "r", 0); 1415 Tcl_DecrRefCount(pathPtr); 1416 1417 Tcl_DStringFree(&pathString); 1418 1419 return chan; 1420} 1421 1422/* 1423 *------------------------------------------------------------------------- 1424 * 1425 * LoadTableEncoding -- 1426 * 1427 * Helper function for LoadEncodingTable(). Loads a table to that 1428 * converts between Unicode and some other encoding and creates an 1429 * encoding (using a TableEncoding structure) from that information. 1430 * 1431 * File contains binary data, but begins with a marker to indicate 1432 * byte-ordering, so that same binary file can be read on either 1433 * endian platforms. 1434 * 1435 * Results: 1436 * The return value is the new encoding, or NULL if the encoding 1437 * could not be created (because the file contained invalid data). 1438 * 1439 * Side effects: 1440 * None. 1441 * 1442 *------------------------------------------------------------------------- 1443 */ 1444 1445static Tcl_Encoding 1446LoadTableEncoding(interp, name, type, chan) 1447 Tcl_Interp *interp; /* Interp for temporary obj while reading. */ 1448 CONST char *name; /* Name for new encoding. */ 1449 int type; /* Type of encoding (ENCODING_?????). */ 1450 Tcl_Channel chan; /* File containing new encoding. */ 1451{ 1452 Tcl_DString lineString; 1453 Tcl_Obj *objPtr; 1454 char *line; 1455 int i, hi, lo, numPages, symbol, fallback; 1456 unsigned char used[256]; 1457 unsigned int size; 1458 TableEncodingData *dataPtr; 1459 unsigned short *pageMemPtr; 1460 Tcl_EncodingType encType; 1461 1462 /* 1463 * Speed over memory. Use a full 256 character table to decode hex 1464 * sequences in the encoding files. 1465 */ 1466 1467 static char staticHex[] = { 1468 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0 ... 15 */ 1469 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 ... 31 */ 1470 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 32 ... 47 */ 1471 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0, /* 48 ... 63 */ 1472 0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 64 ... 79 */ 1473 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80 ... 95 */ 1474 0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 96 ... 111 */ 1475 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0, /* 112 ... 127 */ 1476 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 128 ... 143 */ 1477 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 144 ... 159 */ 1478 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 160 ... 175 */ 1479 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 176 ... 191 */ 1480 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 192 ... 207 */ 1481 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 208 ... 223 */ 1482 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 224 ... 239 */ 1483 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 240 ... 255 */ 1484 }; 1485 1486 Tcl_DStringInit(&lineString); 1487 Tcl_Gets(chan, &lineString); 1488 line = Tcl_DStringValue(&lineString); 1489 1490 fallback = (int) strtol(line, &line, 16); 1491 symbol = (int) strtol(line, &line, 10); 1492 numPages = (int) strtol(line, &line, 10); 1493 Tcl_DStringFree(&lineString); 1494 1495 if (numPages < 0) { 1496 numPages = 0; 1497 } else if (numPages > 256) { 1498 numPages = 256; 1499 } 1500 1501 memset(used, 0, sizeof(used)); 1502 1503#undef PAGESIZE 1504#define PAGESIZE (256 * sizeof(unsigned short)) 1505 1506 dataPtr = (TableEncodingData *) ckalloc(sizeof(TableEncodingData)); 1507 memset(dataPtr, 0, sizeof(TableEncodingData)); 1508 1509 dataPtr->fallback = fallback; 1510 1511 /* 1512 * Read the table that maps characters to Unicode. Performs a single 1513 * malloc to get the memory for the array and all the pages needed by 1514 * the array. 1515 */ 1516 1517 size = 256 * sizeof(unsigned short *) + numPages * PAGESIZE; 1518 dataPtr->toUnicode = (unsigned short **) ckalloc(size); 1519 memset(dataPtr->toUnicode, 0, size); 1520 pageMemPtr = (unsigned short *) (dataPtr->toUnicode + 256); 1521 1522 if (interp == NULL) { 1523 objPtr = Tcl_NewObj(); 1524 } else { 1525 objPtr = Tcl_GetObjResult(interp); 1526 } 1527 for (i = 0; i < numPages; i++) { 1528 int ch; 1529 char *p; 1530 1531 Tcl_ReadChars(chan, objPtr, 3 + 16 * (16 * 4 + 1), 0); 1532 p = Tcl_GetString(objPtr); 1533 hi = (staticHex[(unsigned int)p[0]] << 4) + staticHex[(unsigned int)p[1]]; 1534 dataPtr->toUnicode[hi] = pageMemPtr; 1535 p += 2; 1536 for (lo = 0; lo < 256; lo++) { 1537 if ((lo & 0x0f) == 0) { 1538 p++; 1539 } 1540 ch = (staticHex[(unsigned int)p[0]] << 12) + (staticHex[(unsigned int)p[1]] << 8) 1541 + (staticHex[(unsigned int)p[2]] << 4) + staticHex[(unsigned int)p[3]]; 1542 if (ch != 0) { 1543 used[ch >> 8] = 1; 1544 } 1545 *pageMemPtr = (unsigned short) ch; 1546 pageMemPtr++; 1547 p += 4; 1548 } 1549 } 1550 if (interp == NULL) { 1551 Tcl_DecrRefCount(objPtr); 1552 } else { 1553 Tcl_ResetResult(interp); 1554 } 1555 1556 if (type == ENCODING_DOUBLEBYTE) { 1557 memset(dataPtr->prefixBytes, 1, sizeof(dataPtr->prefixBytes)); 1558 } else { 1559 for (hi = 1; hi < 256; hi++) { 1560 if (dataPtr->toUnicode[hi] != NULL) { 1561 dataPtr->prefixBytes[hi] = 1; 1562 } 1563 } 1564 } 1565 1566 /* 1567 * Invert toUnicode array to produce the fromUnicode array. Performs a 1568 * single malloc to get the memory for the array and all the pages 1569 * needed by the array. While reading in the toUnicode array, we 1570 * remembered what pages that would be needed for the fromUnicode array. 1571 */ 1572 1573 if (symbol) { 1574 used[0] = 1; 1575 } 1576 numPages = 0; 1577 for (hi = 0; hi < 256; hi++) { 1578 if (used[hi]) { 1579 numPages++; 1580 } 1581 } 1582 size = 256 * sizeof(unsigned short *) + numPages * PAGESIZE; 1583 dataPtr->fromUnicode = (unsigned short **) ckalloc(size); 1584 memset(dataPtr->fromUnicode, 0, size); 1585 pageMemPtr = (unsigned short *) (dataPtr->fromUnicode + 256); 1586 1587 for (hi = 0; hi < 256; hi++) { 1588 if (dataPtr->toUnicode[hi] == NULL) { 1589 dataPtr->toUnicode[hi] = emptyPage; 1590 } else { 1591 for (lo = 0; lo < 256; lo++) { 1592 int ch; 1593 1594 ch = dataPtr->toUnicode[hi][lo]; 1595 if (ch != 0) { 1596 unsigned short *page; 1597 1598 page = dataPtr->fromUnicode[ch >> 8]; 1599 if (page == NULL) { 1600 page = pageMemPtr; 1601 pageMemPtr += 256; 1602 dataPtr->fromUnicode[ch >> 8] = page; 1603 } 1604 page[ch & 0xff] = (unsigned short) ((hi << 8) + lo); 1605 } 1606 } 1607 } 1608 } 1609 if (type == ENCODING_MULTIBYTE) { 1610 /* 1611 * If multibyte encodings don't have a backslash character, define 1612 * one. Otherwise, on Windows, native file names won't work because 1613 * the backslash in the file name will map to the unknown character 1614 * (question mark) when converting from UTF-8 to external encoding. 1615 */ 1616 1617 if (dataPtr->fromUnicode[0] != NULL) { 1618 if (dataPtr->fromUnicode[0]['\\'] == '\0') { 1619 dataPtr->fromUnicode[0]['\\'] = '\\'; 1620 } 1621 } 1622 } 1623 if (symbol) { 1624 unsigned short *page; 1625 1626 /* 1627 * Make a special symbol encoding that not only maps the symbol 1628 * characters from their Unicode code points down into page 0, but 1629 * also ensure that the characters on page 0 map to themselves. 1630 * This is so that a symbol font can be used to display a simple 1631 * string like "abcd" and have alpha, beta, chi, delta show up, 1632 * rather than have "unknown" chars show up because strictly 1633 * speaking the symbol font doesn't have glyphs for those low ascii 1634 * chars. 1635 */ 1636 1637 page = dataPtr->fromUnicode[0]; 1638 if (page == NULL) { 1639 page = pageMemPtr; 1640 dataPtr->fromUnicode[0] = page; 1641 } 1642 for (lo = 0; lo < 256; lo++) { 1643 if (dataPtr->toUnicode[0][lo] != 0) { 1644 page[lo] = (unsigned short) lo; 1645 } 1646 } 1647 } 1648 for (hi = 0; hi < 256; hi++) { 1649 if (dataPtr->fromUnicode[hi] == NULL) { 1650 dataPtr->fromUnicode[hi] = emptyPage; 1651 } 1652 } 1653 /* 1654 * For trailing 'R'everse encoding, see [Patch #689341] 1655 */ 1656 Tcl_DStringInit(&lineString); 1657 do { 1658 int len; 1659 /* skip leading empty lines */ 1660 while ((len = Tcl_Gets(chan, &lineString)) == 0) 1661 ; 1662 if (len < 0) { 1663 break; 1664 } 1665 line = Tcl_DStringValue(&lineString); 1666 if (line[0] != 'R') { 1667 break; 1668 } 1669 for (Tcl_DStringSetLength(&lineString, 0); 1670 (len = Tcl_Gets(chan, &lineString)) >= 0; 1671 Tcl_DStringSetLength(&lineString, 0)) { 1672 unsigned char* p; 1673 int to, from; 1674 if (len < 5) { 1675 continue; 1676 } 1677 p = (unsigned char*) Tcl_DStringValue(&lineString); 1678 to = (staticHex[p[0]] << 12) + (staticHex[p[1]] << 8) 1679 + (staticHex[p[2]] << 4) + staticHex[p[3]]; 1680 if (to == 0) { 1681 continue; 1682 } 1683 for (p += 5, len -= 5; len >= 0 && *p; p += 5, len -= 5) { 1684 from = (staticHex[p[0]] << 12) + (staticHex[p[1]] << 8) 1685 + (staticHex[p[2]] << 4) + staticHex[p[3]]; 1686 if (from == 0) { 1687 continue; 1688 } 1689 dataPtr->fromUnicode[from >> 8][from & 0xff] = to; 1690 } 1691 } 1692 } while (0); 1693 Tcl_DStringFree(&lineString); 1694 1695 encType.encodingName = name; 1696 encType.toUtfProc = TableToUtfProc; 1697 encType.fromUtfProc = TableFromUtfProc; 1698 encType.freeProc = TableFreeProc; 1699 encType.nullSize = (type == ENCODING_DOUBLEBYTE) ? 2 : 1; 1700 encType.clientData = (ClientData) dataPtr; 1701 return Tcl_CreateEncoding(&encType); 1702} 1703 1704/* 1705 *------------------------------------------------------------------------- 1706 * 1707 * LoadEscapeEncoding -- 1708 * 1709 * Helper function for LoadEncodingTable(). Loads a state machine 1710 * that converts between Unicode and some other encoding. 1711 * 1712 * File contains text data that describes the escape sequences that 1713 * are used to choose an encoding and the associated names for the 1714 * sub-encodings. 1715 * 1716 * Results: 1717 * The return value is the new encoding, or NULL if the encoding 1718 * could not be created (because the file contained invalid data). 1719 * 1720 * Side effects: 1721 * None. 1722 * 1723 *------------------------------------------------------------------------- 1724 */ 1725 1726static Tcl_Encoding 1727LoadEscapeEncoding(name, chan) 1728 CONST char *name; /* Name for new encoding. */ 1729 Tcl_Channel chan; /* File containing new encoding. */ 1730{ 1731 int i, missingSubEncoding = 0; 1732 unsigned int size; 1733 Tcl_DString escapeData; 1734 char init[16], final[16]; 1735 EscapeEncodingData *dataPtr; 1736 Tcl_EncodingType type; 1737 1738 init[0] = '\0'; 1739 final[0] = '\0'; 1740 Tcl_DStringInit(&escapeData); 1741 1742 while (1) { 1743 int argc; 1744 CONST char **argv; 1745 char *line; 1746 Tcl_DString lineString; 1747 1748 Tcl_DStringInit(&lineString); 1749 if (Tcl_Gets(chan, &lineString) < 0) { 1750 break; 1751 } 1752 line = Tcl_DStringValue(&lineString); 1753 if (Tcl_SplitList(NULL, line, &argc, &argv) != TCL_OK) { 1754 continue; 1755 } 1756 if (argc >= 2) { 1757 if (strcmp(argv[0], "name") == 0) { 1758 ; 1759 } else if (strcmp(argv[0], "init") == 0) { 1760 strncpy(init, argv[1], sizeof(init)); 1761 init[sizeof(init) - 1] = '\0'; 1762 } else if (strcmp(argv[0], "final") == 0) { 1763 strncpy(final, argv[1], sizeof(final)); 1764 final[sizeof(final) - 1] = '\0'; 1765 } else { 1766 EscapeSubTable est; 1767 1768 strncpy(est.sequence, argv[1], sizeof(est.sequence)); 1769 est.sequence[sizeof(est.sequence) - 1] = '\0'; 1770 est.sequenceLen = strlen(est.sequence); 1771 1772 strncpy(est.name, argv[0], sizeof(est.name)); 1773 est.name[sizeof(est.name) - 1] = '\0'; 1774 1775 /* 1776 * Load the subencodings first so we're never stuck 1777 * trying to use a half-loaded system encoding to 1778 * open/read a *.enc file. 1779 */ 1780 1781 est.encodingPtr = (Encoding *) Tcl_GetEncoding(NULL, est.name); 1782 if ((est.encodingPtr == NULL) 1783 || (est.encodingPtr->toUtfProc != TableToUtfProc)) { 1784 missingSubEncoding = 1; 1785 } 1786 Tcl_DStringAppend(&escapeData, (char *) &est, sizeof(est)); 1787 } 1788 } 1789 ckfree((char *) argv); 1790 Tcl_DStringFree(&lineString); 1791 } 1792 if (missingSubEncoding) { 1793 Tcl_DStringFree(&escapeData); 1794 return NULL; 1795 } 1796 1797 size = sizeof(EscapeEncodingData) 1798 - sizeof(EscapeSubTable) + Tcl_DStringLength(&escapeData); 1799 dataPtr = (EscapeEncodingData *) ckalloc(size); 1800 dataPtr->initLen = strlen(init); 1801 strcpy(dataPtr->init, init); 1802 dataPtr->finalLen = strlen(final); 1803 strcpy(dataPtr->final, final); 1804 dataPtr->numSubTables = Tcl_DStringLength(&escapeData) / sizeof(EscapeSubTable); 1805 memcpy((VOID *) dataPtr->subTables, (VOID *) Tcl_DStringValue(&escapeData), 1806 (size_t) Tcl_DStringLength(&escapeData)); 1807 Tcl_DStringFree(&escapeData); 1808 1809 memset(dataPtr->prefixBytes, 0, sizeof(dataPtr->prefixBytes)); 1810 for (i = 0; i < dataPtr->numSubTables; i++) { 1811 dataPtr->prefixBytes[UCHAR(dataPtr->subTables[i].sequence[0])] = 1; 1812 } 1813 if (dataPtr->init[0] != '\0') { 1814 dataPtr->prefixBytes[UCHAR(dataPtr->init[0])] = 1; 1815 } 1816 if (dataPtr->final[0] != '\0') { 1817 dataPtr->prefixBytes[UCHAR(dataPtr->final[0])] = 1; 1818 } 1819 1820 type.encodingName = name; 1821 type.toUtfProc = EscapeToUtfProc; 1822 type.fromUtfProc = EscapeFromUtfProc; 1823 type.freeProc = EscapeFreeProc; 1824 type.nullSize = 1; 1825 type.clientData = (ClientData) dataPtr; 1826 1827 return Tcl_CreateEncoding(&type); 1828} 1829 1830/* 1831 *------------------------------------------------------------------------- 1832 * 1833 * BinaryProc -- 1834 * 1835 * The default conversion when no other conversion is specified. 1836 * No translation is done; source bytes are copied directly to 1837 * destination bytes. 1838 * 1839 * Results: 1840 * Returns TCL_OK if conversion was successful. 1841 * 1842 * Side effects: 1843 * None. 1844 * 1845 *------------------------------------------------------------------------- 1846 */ 1847 1848static int 1849BinaryProc(clientData, src, srcLen, flags, statePtr, dst, dstLen, 1850 srcReadPtr, dstWrotePtr, dstCharsPtr) 1851 ClientData clientData; /* Not used. */ 1852 CONST char *src; /* Source string (unknown encoding). */ 1853 int srcLen; /* Source string length in bytes. */ 1854 int flags; /* Conversion control flags. */ 1855 Tcl_EncodingState *statePtr;/* Place for conversion routine to store 1856 * state information used during a piecewise 1857 * conversion. Contents of statePtr are 1858 * initialized and/or reset by conversion 1859 * routine under control of flags argument. */ 1860 char *dst; /* Output buffer in which converted string 1861 * is stored. */ 1862 int dstLen; /* The maximum length of output buffer in 1863 * bytes. */ 1864 int *srcReadPtr; /* Filled with the number of bytes from the 1865 * source string that were converted. */ 1866 int *dstWrotePtr; /* Filled with the number of bytes that were 1867 * stored in the output buffer as a result of 1868 * the conversion. */ 1869 int *dstCharsPtr; /* Filled with the number of characters that 1870 * correspond to the bytes stored in the 1871 * output buffer. */ 1872{ 1873 int result; 1874 1875 result = TCL_OK; 1876 dstLen -= TCL_UTF_MAX - 1; 1877 if (dstLen < 0) { 1878 dstLen = 0; 1879 } 1880 if (srcLen > dstLen) { 1881 srcLen = dstLen; 1882 result = TCL_CONVERT_NOSPACE; 1883 } 1884 1885 *srcReadPtr = srcLen; 1886 *dstWrotePtr = srcLen; 1887 *dstCharsPtr = srcLen; 1888 memcpy((void *) dst, (void *) src, (size_t) srcLen); 1889 return result; 1890} 1891 1892 1893/* 1894 *------------------------------------------------------------------------- 1895 * 1896 * UtfExtToUtfIntProc -- 1897 * 1898 * Convert from UTF-8 to UTF-8. While converting null-bytes from 1899 * the Tcl's internal representation (0xc0, 0x80) to the official 1900 * representation (0x00). See UtfToUtfProc for details. 1901 * 1902 * Results: 1903 * Returns TCL_OK if conversion was successful. 1904 * 1905 * Side effects: 1906 * None. 1907 * 1908 *------------------------------------------------------------------------- 1909 */ 1910static int 1911UtfIntToUtfExtProc(clientData, src, srcLen, flags, statePtr, dst, dstLen, 1912 srcReadPtr, dstWrotePtr, dstCharsPtr) 1913 ClientData clientData; /* Not used. */ 1914 CONST char *src; /* Source string in UTF-8. */ 1915 int srcLen; /* Source string length in bytes. */ 1916 int flags; /* Conversion control flags. */ 1917 Tcl_EncodingState *statePtr;/* Place for conversion routine to store 1918 * state information used during a piecewise 1919 * conversion. Contents of statePtr are 1920 * initialized and/or reset by conversion 1921 * routine under control of flags argument. */ 1922 char *dst; /* Output buffer in which converted string 1923 * is stored. */ 1924 int dstLen; /* The maximum length of output buffer in 1925 * bytes. */ 1926 int *srcReadPtr; /* Filled with the number of bytes from the 1927 * source string that were converted. This 1928 * may be less than the original source length 1929 * if there was a problem converting some 1930 * source characters. */ 1931 int *dstWrotePtr; /* Filled with the number of bytes that were 1932 * stored in the output buffer as a result of 1933 * the conversion. */ 1934 int *dstCharsPtr; /* Filled with the number of characters that 1935 * correspond to the bytes stored in the 1936 * output buffer. */ 1937{ 1938 return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen, 1939 srcReadPtr, dstWrotePtr, dstCharsPtr, 1); 1940} 1941 1942/* 1943 *------------------------------------------------------------------------- 1944 * 1945 * UtfExtToUtfIntProc -- 1946 * 1947 * Convert from UTF-8 to UTF-8 while converting null-bytes from 1948 * the official representation (0x00) to Tcl's internal 1949 * representation (0xc0, 0x80). See UtfToUtfProc for details. 1950 * 1951 * Results: 1952 * Returns TCL_OK if conversion was successful. 1953 * 1954 * Side effects: 1955 * None. 1956 * 1957 *------------------------------------------------------------------------- 1958 */ 1959static int 1960UtfExtToUtfIntProc(clientData, src, srcLen, flags, statePtr, dst, dstLen, 1961 srcReadPtr, dstWrotePtr, dstCharsPtr) 1962 ClientData clientData; /* Not used. */ 1963 CONST char *src; /* Source string in UTF-8. */ 1964 int srcLen; /* Source string length in bytes. */ 1965 int flags; /* Conversion control flags. */ 1966 Tcl_EncodingState *statePtr;/* Place for conversion routine to store 1967 * state information used during a piecewise 1968 * conversion. Contents of statePtr are 1969 * initialized and/or reset by conversion 1970 * routine under control of flags argument. */ 1971 char *dst; /* Output buffer in which converted string 1972 * is stored. */ 1973 int dstLen; /* The maximum length of output buffer in 1974 * bytes. */ 1975 int *srcReadPtr; /* Filled with the number of bytes from the 1976 * source string that were converted. This 1977 * may be less than the original source length 1978 * if there was a problem converting some 1979 * source characters. */ 1980 int *dstWrotePtr; /* Filled with the number of bytes that were 1981 * stored in the output buffer as a result of 1982 * the conversion. */ 1983 int *dstCharsPtr; /* Filled with the number of characters that 1984 * correspond to the bytes stored in the 1985 * output buffer. */ 1986{ 1987 return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen, 1988 srcReadPtr, dstWrotePtr, dstCharsPtr, 0); 1989} 1990 1991/* 1992 *------------------------------------------------------------------------- 1993 * 1994 * UtfToUtfProc -- 1995 * 1996 * Convert from UTF-8 to UTF-8. Note that the UTF-8 to UTF-8 1997 * translation is not a no-op, because it will turn a stream of 1998 * improperly formed UTF-8 into a properly formed stream. 1999 * 2000 * Results: 2001 * Returns TCL_OK if conversion was successful. 2002 * 2003 * Side effects: 2004 * None. 2005 * 2006 *------------------------------------------------------------------------- 2007 */ 2008 2009static int 2010UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen, 2011 srcReadPtr, dstWrotePtr, dstCharsPtr, pureNullMode) 2012 ClientData clientData; /* Not used. */ 2013 CONST char *src; /* Source string in UTF-8. */ 2014 int srcLen; /* Source string length in bytes. */ 2015 int flags; /* Conversion control flags. */ 2016 Tcl_EncodingState *statePtr;/* Place for conversion routine to store 2017 * state information used during a piecewise 2018 * conversion. Contents of statePtr are 2019 * initialized and/or reset by conversion 2020 * routine under control of flags argument. */ 2021 char *dst; /* Output buffer in which converted string 2022 * is stored. */ 2023 int dstLen; /* The maximum length of output buffer in 2024 * bytes. */ 2025 int *srcReadPtr; /* Filled with the number of bytes from the 2026 * source string that were converted. This 2027 * may be less than the original source length 2028 * if there was a problem converting some 2029 * source characters. */ 2030 int *dstWrotePtr; /* Filled with the number of bytes that were 2031 * stored in the output buffer as a result of 2032 * the conversion. */ 2033 int *dstCharsPtr; /* Filled with the number of characters that 2034 * correspond to the bytes stored in the 2035 * output buffer. */ 2036 int pureNullMode; /* Convert embedded nulls from 2037 * internal representation to real 2038 * null-bytes or vice versa */ 2039 2040{ 2041 CONST char *srcStart, *srcEnd, *srcClose; 2042 char *dstStart, *dstEnd; 2043 int result, numChars; 2044 Tcl_UniChar ch; 2045 2046 result = TCL_OK; 2047 2048 srcStart = src; 2049 srcEnd = src + srcLen; 2050 srcClose = srcEnd; 2051 if ((flags & TCL_ENCODING_END) == 0) { 2052 srcClose -= TCL_UTF_MAX; 2053 } 2054 2055 dstStart = dst; 2056 dstEnd = dst + dstLen - TCL_UTF_MAX; 2057 2058 for (numChars = 0; src < srcEnd; numChars++) { 2059 if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { 2060 /* 2061 * If there is more string to follow, this will ensure that the 2062 * last UTF-8 character in the source buffer hasn't been cut off. 2063 */ 2064 2065 result = TCL_CONVERT_MULTIBYTE; 2066 break; 2067 } 2068 if (dst > dstEnd) { 2069 result = TCL_CONVERT_NOSPACE; 2070 break; 2071 } 2072 if (UCHAR(*src) < 0x80 && 2073 !(UCHAR(*src) == 0 && pureNullMode == 0)) { 2074 /* 2075 * Copy 7bit chatacters, but skip null-bytes when we are 2076 * in input mode, so that they get converted to 0xc080. 2077 */ 2078 *dst++ = *src++; 2079 } else if (pureNullMode == 1 && 2080 UCHAR(*src) == 0xc0 && 2081 UCHAR(*(src+1)) == 0x80) { 2082 /* 2083 * Convert 0xc080 to real nulls when we are in output mode. 2084 */ 2085 *dst++ = 0; 2086 src += 2; 2087 } else if (!Tcl_UtfCharComplete(src, srcEnd - src)) { 2088 /* Always check before using Tcl_UtfToUniChar. Not doing 2089 * can so cause it run beyond the endof the buffer! If we 2090 * happen such an incomplete char its bytes are made to 2091 * represent themselves. 2092 */ 2093 2094 ch = (unsigned char) *src; 2095 src += 1; 2096 dst += Tcl_UniCharToUtf(ch, dst); 2097 } else { 2098 src += Tcl_UtfToUniChar(src, &ch); 2099 dst += Tcl_UniCharToUtf(ch, dst); 2100 } 2101 } 2102 2103 *srcReadPtr = src - srcStart; 2104 *dstWrotePtr = dst - dstStart; 2105 *dstCharsPtr = numChars; 2106 return result; 2107} 2108 2109/* 2110 *------------------------------------------------------------------------- 2111 * 2112 * UnicodeToUtfProc -- 2113 * 2114 * Convert from Unicode to UTF-8. 2115 * 2116 * Results: 2117 * Returns TCL_OK if conversion was successful. 2118 * 2119 * Side effects: 2120 * None. 2121 * 2122 *------------------------------------------------------------------------- 2123 */ 2124 2125static int 2126UnicodeToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen, 2127 srcReadPtr, dstWrotePtr, dstCharsPtr) 2128 ClientData clientData; /* Not used. */ 2129 CONST char *src; /* Source string in Unicode. */ 2130 int srcLen; /* Source string length in bytes. */ 2131 int flags; /* Conversion control flags. */ 2132 Tcl_EncodingState *statePtr;/* Place for conversion routine to store 2133 * state information used during a piecewise 2134 * conversion. Contents of statePtr are 2135 * initialized and/or reset by conversion 2136 * routine under control of flags argument. */ 2137 char *dst; /* Output buffer in which converted string 2138 * is stored. */ 2139 int dstLen; /* The maximum length of output buffer in 2140 * bytes. */ 2141 int *srcReadPtr; /* Filled with the number of bytes from the 2142 * source string that were converted. This 2143 * may be less than the original source length 2144 * if there was a problem converting some 2145 * source characters. */ 2146 int *dstWrotePtr; /* Filled with the number of bytes that were 2147 * stored in the output buffer as a result of 2148 * the conversion. */ 2149 int *dstCharsPtr; /* Filled with the number of characters that 2150 * correspond to the bytes stored in the 2151 * output buffer. */ 2152{ 2153 CONST char *srcStart, *srcEnd; 2154 char *dstEnd, *dstStart; 2155 int result, numChars; 2156 Tcl_UniChar ch; 2157 2158 result = TCL_OK; 2159 if ((srcLen % sizeof(Tcl_UniChar)) != 0) { 2160 result = TCL_CONVERT_MULTIBYTE; 2161 srcLen /= sizeof(Tcl_UniChar); 2162 srcLen *= sizeof(Tcl_UniChar); 2163 } 2164 2165 srcStart = src; 2166 srcEnd = src + srcLen; 2167 2168 dstStart = dst; 2169 dstEnd = dst + dstLen - TCL_UTF_MAX; 2170 2171 for (numChars = 0; src < srcEnd; numChars++) { 2172 if (dst > dstEnd) { 2173 result = TCL_CONVERT_NOSPACE; 2174 break; 2175 } 2176 /* 2177 * Special case for 1-byte utf chars for speed. Make sure we 2178 * work with Tcl_UniChar-size data. 2179 */ 2180 ch = *(Tcl_UniChar *)src; 2181 if (ch && ch < 0x80) { 2182 *dst++ = (ch & 0xFF); 2183 } else { 2184 dst += Tcl_UniCharToUtf(ch, dst); 2185 } 2186 src += sizeof(Tcl_UniChar); 2187 } 2188 2189 *srcReadPtr = src - srcStart; 2190 *dstWrotePtr = dst - dstStart; 2191 *dstCharsPtr = numChars; 2192 return result; 2193} 2194 2195/* 2196 *------------------------------------------------------------------------- 2197 * 2198 * UtfToUnicodeProc -- 2199 * 2200 * Convert from UTF-8 to Unicode. 2201 * 2202 * Results: 2203 * Returns TCL_OK if conversion was successful. 2204 * 2205 * Side effects: 2206 * None. 2207 * 2208 *------------------------------------------------------------------------- 2209 */ 2210 2211static int 2212UtfToUnicodeProc(clientData, src, srcLen, flags, statePtr, dst, dstLen, 2213 srcReadPtr, dstWrotePtr, dstCharsPtr) 2214 ClientData clientData; /* TableEncodingData that specifies encoding. */ 2215 CONST char *src; /* Source string in UTF-8. */ 2216 int srcLen; /* Source string length in bytes. */ 2217 int flags; /* Conversion control flags. */ 2218 Tcl_EncodingState *statePtr;/* Place for conversion routine to store 2219 * state information used during a piecewise 2220 * conversion. Contents of statePtr are 2221 * initialized and/or reset by conversion 2222 * routine under control of flags argument. */ 2223 char *dst; /* Output buffer in which converted string 2224 * is stored. */ 2225 int dstLen; /* The maximum length of output buffer in 2226 * bytes. */ 2227 int *srcReadPtr; /* Filled with the number of bytes from the 2228 * source string that were converted. This 2229 * may be less than the original source length 2230 * if there was a problem converting some 2231 * source characters. */ 2232 int *dstWrotePtr; /* Filled with the number of bytes that were 2233 * stored in the output buffer as a result of 2234 * the conversion. */ 2235 int *dstCharsPtr; /* Filled with the number of characters that 2236 * correspond to the bytes stored in the 2237 * output buffer. */ 2238{ 2239 CONST char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd; 2240 int result, numChars; 2241 Tcl_UniChar ch; 2242 2243 srcStart = src; 2244 srcEnd = src + srcLen; 2245 srcClose = srcEnd; 2246 if ((flags & TCL_ENCODING_END) == 0) { 2247 srcClose -= TCL_UTF_MAX; 2248 } 2249 2250 dstStart = dst; 2251 dstEnd = dst + dstLen - sizeof(Tcl_UniChar); 2252 2253 result = TCL_OK; 2254 for (numChars = 0; src < srcEnd; numChars++) { 2255 if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { 2256 /* 2257 * If there is more string to follow, this will ensure that the 2258 * last UTF-8 character in the source buffer hasn't been cut off. 2259 */ 2260 2261 result = TCL_CONVERT_MULTIBYTE; 2262 break; 2263 } 2264 if (dst > dstEnd) { 2265 result = TCL_CONVERT_NOSPACE; 2266 break; 2267 } 2268 src += TclUtfToUniChar(src, &ch); 2269 /* 2270 * Need to handle this in a way that won't cause misalignment 2271 * by casting dst to a Tcl_UniChar. [Bug 1122671] 2272 * XXX: This hard-codes the assumed size of Tcl_UniChar as 2. 2273 */ 2274#ifdef WORDS_BIGENDIAN 2275 *dst++ = (ch >> 8); 2276 *dst++ = (ch & 0xFF); 2277#else 2278 *dst++ = (ch & 0xFF); 2279 *dst++ = (ch >> 8); 2280#endif 2281 } 2282 *srcReadPtr = src - srcStart; 2283 *dstWrotePtr = dst - dstStart; 2284 *dstCharsPtr = numChars; 2285 return result; 2286} 2287 2288/* 2289 *------------------------------------------------------------------------- 2290 * 2291 * TableToUtfProc -- 2292 * 2293 * Convert from the encoding specified by the TableEncodingData into 2294 * UTF-8. 2295 * 2296 * Results: 2297 * Returns TCL_OK if conversion was successful. 2298 * 2299 * Side effects: 2300 * None. 2301 * 2302 *------------------------------------------------------------------------- 2303 */ 2304 2305static int 2306TableToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen, 2307 srcReadPtr, dstWrotePtr, dstCharsPtr) 2308 ClientData clientData; /* TableEncodingData that specifies 2309 * encoding. */ 2310 CONST char *src; /* Source string in specified encoding. */ 2311 int srcLen; /* Source string length in bytes. */ 2312 int flags; /* Conversion control flags. */ 2313 Tcl_EncodingState *statePtr;/* Place for conversion routine to store 2314 * state information used during a piecewise 2315 * conversion. Contents of statePtr are 2316 * initialized and/or reset by conversion 2317 * routine under control of flags argument. */ 2318 char *dst; /* Output buffer in which converted string 2319 * is stored. */ 2320 int dstLen; /* The maximum length of output buffer in 2321 * bytes. */ 2322 int *srcReadPtr; /* Filled with the number of bytes from the 2323 * source string that were converted. This 2324 * may be less than the original source length 2325 * if there was a problem converting some 2326 * source characters. */ 2327 int *dstWrotePtr; /* Filled with the number of bytes that were 2328 * stored in the output buffer as a result of 2329 * the conversion. */ 2330 int *dstCharsPtr; /* Filled with the number of characters that 2331 * correspond to the bytes stored in the 2332 * output buffer. */ 2333{ 2334 CONST char *srcStart, *srcEnd; 2335 char *dstEnd, *dstStart, *prefixBytes; 2336 int result, byte, numChars; 2337 Tcl_UniChar ch; 2338 unsigned short **toUnicode; 2339 unsigned short *pageZero; 2340 TableEncodingData *dataPtr; 2341 2342 srcStart = src; 2343 srcEnd = src + srcLen; 2344 2345 dstStart = dst; 2346 dstEnd = dst + dstLen - TCL_UTF_MAX; 2347 2348 dataPtr = (TableEncodingData *) clientData; 2349 toUnicode = dataPtr->toUnicode; 2350 prefixBytes = dataPtr->prefixBytes; 2351 pageZero = toUnicode[0]; 2352 2353 result = TCL_OK; 2354 for (numChars = 0; src < srcEnd; numChars++) { 2355 if (dst > dstEnd) { 2356 result = TCL_CONVERT_NOSPACE; 2357 break; 2358 } 2359 byte = *((unsigned char *) src); 2360 if (prefixBytes[byte]) { 2361 src++; 2362 if (src >= srcEnd) { 2363 src--; 2364 result = TCL_CONVERT_MULTIBYTE; 2365 break; 2366 } 2367 ch = toUnicode[byte][*((unsigned char *) src)]; 2368 } else { 2369 ch = pageZero[byte]; 2370 } 2371 if ((ch == 0) && (byte != 0)) { 2372 if (flags & TCL_ENCODING_STOPONERROR) { 2373 result = TCL_CONVERT_SYNTAX; 2374 break; 2375 } 2376 if (prefixBytes[byte]) { 2377 src--; 2378 } 2379 ch = (Tcl_UniChar) byte; 2380 } 2381 /* 2382 * Special case for 1-byte utf chars for speed. 2383 */ 2384 if (ch && ch < 0x80) { 2385 *dst++ = (char) ch; 2386 } else { 2387 dst += Tcl_UniCharToUtf(ch, dst); 2388 } 2389 src++; 2390 } 2391 *srcReadPtr = src - srcStart; 2392 *dstWrotePtr = dst - dstStart; 2393 *dstCharsPtr = numChars; 2394 return result; 2395} 2396 2397/* 2398 *------------------------------------------------------------------------- 2399 * 2400 * TableFromUtfProc -- 2401 * 2402 * Convert from UTF-8 into the encoding specified by the 2403 * TableEncodingData. 2404 * 2405 * Results: 2406 * Returns TCL_OK if conversion was successful. 2407 * 2408 * Side effects: 2409 * None. 2410 * 2411 *------------------------------------------------------------------------- 2412 */ 2413 2414static int 2415TableFromUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen, 2416 srcReadPtr, dstWrotePtr, dstCharsPtr) 2417 ClientData clientData; /* TableEncodingData that specifies 2418 * encoding. */ 2419 CONST char *src; /* Source string in UTF-8. */ 2420 int srcLen; /* Source string length in bytes. */ 2421 int flags; /* Conversion control flags. */ 2422 Tcl_EncodingState *statePtr;/* Place for conversion routine to store 2423 * state information used during a piecewise 2424 * conversion. Contents of statePtr are 2425 * initialized and/or reset by conversion 2426 * routine under control of flags argument. */ 2427 char *dst; /* Output buffer in which converted string 2428 * is stored. */ 2429 int dstLen; /* The maximum length of output buffer in 2430 * bytes. */ 2431 int *srcReadPtr; /* Filled with the number of bytes from the 2432 * source string that were converted. This 2433 * may be less than the original source length 2434 * if there was a problem converting some 2435 * source characters. */ 2436 int *dstWrotePtr; /* Filled with the number of bytes that were 2437 * stored in the output buffer as a result of 2438 * the conversion. */ 2439 int *dstCharsPtr; /* Filled with the number of characters that 2440 * correspond to the bytes stored in the 2441 * output buffer. */ 2442{ 2443 CONST char *srcStart, *srcEnd, *srcClose; 2444 char *dstStart, *dstEnd, *prefixBytes; 2445 Tcl_UniChar ch; 2446 int result, len, word, numChars; 2447 TableEncodingData *dataPtr; 2448 unsigned short **fromUnicode; 2449 2450 result = TCL_OK; 2451 2452 dataPtr = (TableEncodingData *) clientData; 2453 prefixBytes = dataPtr->prefixBytes; 2454 fromUnicode = dataPtr->fromUnicode; 2455 2456 srcStart = src; 2457 srcEnd = src + srcLen; 2458 srcClose = srcEnd; 2459 if ((flags & TCL_ENCODING_END) == 0) { 2460 srcClose -= TCL_UTF_MAX; 2461 } 2462 2463 dstStart = dst; 2464 dstEnd = dst + dstLen - 1; 2465 2466 for (numChars = 0; src < srcEnd; numChars++) { 2467 if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { 2468 /* 2469 * If there is more string to follow, this will ensure that the 2470 * last UTF-8 character in the source buffer hasn't been cut off. 2471 */ 2472 2473 result = TCL_CONVERT_MULTIBYTE; 2474 break; 2475 } 2476 len = TclUtfToUniChar(src, &ch); 2477 2478#if TCL_UTF_MAX > 3 2479 /* 2480 * This prevents a crash condition. More evaluation is required 2481 * for full support of int Tcl_UniChar. [Bug 1004065] 2482 */ 2483 if (ch & 0xffff0000) { 2484 word = 0; 2485 } else 2486#endif 2487 word = fromUnicode[(ch >> 8)][ch & 0xff]; 2488 2489 if ((word == 0) && (ch != 0)) { 2490 if (flags & TCL_ENCODING_STOPONERROR) { 2491 result = TCL_CONVERT_UNKNOWN; 2492 break; 2493 } 2494 word = dataPtr->fallback; 2495 } 2496 if (prefixBytes[(word >> 8)] != 0) { 2497 if (dst + 1 > dstEnd) { 2498 result = TCL_CONVERT_NOSPACE; 2499 break; 2500 } 2501 dst[0] = (char) (word >> 8); 2502 dst[1] = (char) word; 2503 dst += 2; 2504 } else { 2505 if (dst > dstEnd) { 2506 result = TCL_CONVERT_NOSPACE; 2507 break; 2508 } 2509 dst[0] = (char) word; 2510 dst++; 2511 } 2512 src += len; 2513 } 2514 *srcReadPtr = src - srcStart; 2515 *dstWrotePtr = dst - dstStart; 2516 *dstCharsPtr = numChars; 2517 return result; 2518} 2519 2520/* 2521 *--------------------------------------------------------------------------- 2522 * 2523 * TableFreeProc -- 2524 * 2525 * This procedure is invoked when an encoding is deleted. It deletes 2526 * the memory used by the TableEncodingData. 2527 * 2528 * Results: 2529 * None. 2530 * 2531 * Side effects: 2532 * Memory freed. 2533 * 2534 *--------------------------------------------------------------------------- 2535 */ 2536 2537static void 2538TableFreeProc(clientData) 2539 ClientData clientData; /* TableEncodingData that specifies 2540 * encoding. */ 2541{ 2542 TableEncodingData *dataPtr; 2543 2544 /* 2545 * Make sure we aren't freeing twice on shutdown. [Bug #219314] 2546 */ 2547 2548 dataPtr = (TableEncodingData *) clientData; 2549 ckfree((char *) dataPtr->toUnicode); 2550 ckfree((char *) dataPtr->fromUnicode); 2551 ckfree((char *) dataPtr); 2552} 2553 2554/* 2555 *------------------------------------------------------------------------- 2556 * 2557 * EscapeToUtfProc -- 2558 * 2559 * Convert from the encoding specified by the EscapeEncodingData into 2560 * UTF-8. 2561 * 2562 * Results: 2563 * Returns TCL_OK if conversion was successful. 2564 * 2565 * Side effects: 2566 * None. 2567 * 2568 *------------------------------------------------------------------------- 2569 */ 2570 2571static int 2572EscapeToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen, 2573 srcReadPtr, dstWrotePtr, dstCharsPtr) 2574 ClientData clientData; /* EscapeEncodingData that specifies 2575 * encoding. */ 2576 CONST char *src; /* Source string in specified encoding. */ 2577 int srcLen; /* Source string length in bytes. */ 2578 int flags; /* Conversion control flags. */ 2579 Tcl_EncodingState *statePtr;/* Place for conversion routine to store 2580 * state information used during a piecewise 2581 * conversion. Contents of statePtr are 2582 * initialized and/or reset by conversion 2583 * routine under control of flags argument. */ 2584 char *dst; /* Output buffer in which converted string 2585 * is stored. */ 2586 int dstLen; /* The maximum length of output buffer in 2587 * bytes. */ 2588 int *srcReadPtr; /* Filled with the number of bytes from the 2589 * source string that were converted. This 2590 * may be less than the original source length 2591 * if there was a problem converting some 2592 * source characters. */ 2593 int *dstWrotePtr; /* Filled with the number of bytes that were 2594 * stored in the output buffer as a result of 2595 * the conversion. */ 2596 int *dstCharsPtr; /* Filled with the number of characters that 2597 * correspond to the bytes stored in the 2598 * output buffer. */ 2599{ 2600 EscapeEncodingData *dataPtr; 2601 char *prefixBytes, *tablePrefixBytes; 2602 unsigned short **tableToUnicode; 2603 Encoding *encodingPtr; 2604 int state, result, numChars; 2605 CONST char *srcStart, *srcEnd; 2606 char *dstStart, *dstEnd; 2607 2608 result = TCL_OK; 2609 2610 tablePrefixBytes = NULL; /* lint. */ 2611 tableToUnicode = NULL; /* lint. */ 2612 2613 dataPtr = (EscapeEncodingData *) clientData; 2614 prefixBytes = dataPtr->prefixBytes; 2615 encodingPtr = NULL; 2616 2617 srcStart = src; 2618 srcEnd = src + srcLen; 2619 2620 dstStart = dst; 2621 dstEnd = dst + dstLen - TCL_UTF_MAX; 2622 2623 state = (int) *statePtr; 2624 if (flags & TCL_ENCODING_START) { 2625 state = 0; 2626 } 2627 2628 for (numChars = 0; src < srcEnd; ) { 2629 int byte, hi, lo, ch; 2630 2631 if (dst > dstEnd) { 2632 result = TCL_CONVERT_NOSPACE; 2633 break; 2634 } 2635 byte = *((unsigned char *) src); 2636 if (prefixBytes[byte]) { 2637 unsigned int left, len, longest; 2638 int checked, i; 2639 EscapeSubTable *subTablePtr; 2640 2641 /* 2642 * Saw the beginning of an escape sequence. 2643 */ 2644 2645 left = srcEnd - src; 2646 len = dataPtr->initLen; 2647 longest = len; 2648 checked = 0; 2649 if (len <= left) { 2650 checked++; 2651 if ((len > 0) && 2652 (memcmp(src, dataPtr->init, len) == 0)) { 2653 /* 2654 * If we see initialization string, skip it, even if we're 2655 * not at the beginning of the buffer. 2656 */ 2657 2658 src += len; 2659 continue; 2660 } 2661 } 2662 len = dataPtr->finalLen; 2663 if (len > longest) { 2664 longest = len; 2665 } 2666 if (len <= left) { 2667 checked++; 2668 if ((len > 0) && 2669 (memcmp(src, dataPtr->final, len) == 0)) { 2670 /* 2671 * If we see finalization string, skip it, even if we're 2672 * not at the end of the buffer. 2673 */ 2674 2675 src += len; 2676 continue; 2677 } 2678 } 2679 subTablePtr = dataPtr->subTables; 2680 for (i = 0; i < dataPtr->numSubTables; i++) { 2681 len = subTablePtr->sequenceLen; 2682 if (len > longest) { 2683 longest = len; 2684 } 2685 if (len <= left) { 2686 checked++; 2687 if ((len > 0) && 2688 (memcmp(src, subTablePtr->sequence, len) == 0)) { 2689 state = i; 2690 encodingPtr = NULL; 2691 subTablePtr = NULL; 2692 src += len; 2693 break; 2694 } 2695 } 2696 subTablePtr++; 2697 } 2698 if (subTablePtr == NULL) { 2699 /* 2700 * A match was found, the escape sequence was consumed, and 2701 * the state was updated. 2702 */ 2703 2704 continue; 2705 } 2706 2707 /* 2708 * We have a split-up or unrecognized escape sequence. If we 2709 * checked all the sequences, then it's a syntax error, 2710 * otherwise we need more bytes to determine a match. 2711 */ 2712 2713 if ((checked == dataPtr->numSubTables + 2) 2714 || (flags & TCL_ENCODING_END)) { 2715 if ((flags & TCL_ENCODING_STOPONERROR) == 0) { 2716 /* 2717 * Skip the unknown escape sequence. 2718 */ 2719 2720 src += longest; 2721 continue; 2722 } 2723 result = TCL_CONVERT_SYNTAX; 2724 } else { 2725 result = TCL_CONVERT_MULTIBYTE; 2726 } 2727 break; 2728 } 2729 2730 if (encodingPtr == NULL) { 2731 TableEncodingData *tableDataPtr; 2732 2733 encodingPtr = GetTableEncoding(dataPtr, state); 2734 tableDataPtr = (TableEncodingData *) encodingPtr->clientData; 2735 tablePrefixBytes = tableDataPtr->prefixBytes; 2736 tableToUnicode = tableDataPtr->toUnicode; 2737 } 2738 if (tablePrefixBytes[byte]) { 2739 src++; 2740 if (src >= srcEnd) { 2741 src--; 2742 result = TCL_CONVERT_MULTIBYTE; 2743 break; 2744 } 2745 hi = byte; 2746 lo = *((unsigned char *) src); 2747 } else { 2748 hi = 0; 2749 lo = byte; 2750 } 2751 ch = tableToUnicode[hi][lo]; 2752 dst += Tcl_UniCharToUtf(ch, dst); 2753 src++; 2754 numChars++; 2755 } 2756 2757 *statePtr = (Tcl_EncodingState) state; 2758 *srcReadPtr = src - srcStart; 2759 *dstWrotePtr = dst - dstStart; 2760 *dstCharsPtr = numChars; 2761 return result; 2762} 2763 2764/* 2765 *------------------------------------------------------------------------- 2766 * 2767 * EscapeFromUtfProc -- 2768 * 2769 * Convert from UTF-8 into the encoding specified by the 2770 * EscapeEncodingData. 2771 * 2772 * Results: 2773 * Returns TCL_OK if conversion was successful. 2774 * 2775 * Side effects: 2776 * None. 2777 * 2778 *------------------------------------------------------------------------- 2779 */ 2780 2781static int 2782EscapeFromUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen, 2783 srcReadPtr, dstWrotePtr, dstCharsPtr) 2784 ClientData clientData; /* EscapeEncodingData that specifies 2785 * encoding. */ 2786 CONST char *src; /* Source string in UTF-8. */ 2787 int srcLen; /* Source string length in bytes. */ 2788 int flags; /* Conversion control flags. */ 2789 Tcl_EncodingState *statePtr;/* Place for conversion routine to store 2790 * state information used during a piecewise 2791 * conversion. Contents of statePtr are 2792 * initialized and/or reset by conversion 2793 * routine under control of flags argument. */ 2794 char *dst; /* Output buffer in which converted string 2795 * is stored. */ 2796 int dstLen; /* The maximum length of output buffer in 2797 * bytes. */ 2798 int *srcReadPtr; /* Filled with the number of bytes from the 2799 * source string that were converted. This 2800 * may be less than the original source length 2801 * if there was a problem converting some 2802 * source characters. */ 2803 int *dstWrotePtr; /* Filled with the number of bytes that were 2804 * stored in the output buffer as a result of 2805 * the conversion. */ 2806 int *dstCharsPtr; /* Filled with the number of characters that 2807 * correspond to the bytes stored in the 2808 * output buffer. */ 2809{ 2810 EscapeEncodingData *dataPtr; 2811 Encoding *encodingPtr; 2812 CONST char *srcStart, *srcEnd, *srcClose; 2813 char *dstStart, *dstEnd; 2814 int state, result, numChars; 2815 TableEncodingData *tableDataPtr; 2816 char *tablePrefixBytes; 2817 unsigned short **tableFromUnicode; 2818 2819 result = TCL_OK; 2820 2821 dataPtr = (EscapeEncodingData *) clientData; 2822 2823 srcStart = src; 2824 srcEnd = src + srcLen; 2825 srcClose = srcEnd; 2826 if ((flags & TCL_ENCODING_END) == 0) { 2827 srcClose -= TCL_UTF_MAX; 2828 } 2829 2830 dstStart = dst; 2831 dstEnd = dst + dstLen - 1; 2832 2833 /* 2834 * RFC1468 states that the text starts in ASCII, and switches to Japanese 2835 * characters, and that the text must end in ASCII. [Patch #474358] 2836 */ 2837 2838 if (flags & TCL_ENCODING_START) { 2839 state = 0; 2840 if ((dst + dataPtr->initLen) > dstEnd) { 2841 *srcReadPtr = 0; 2842 *dstWrotePtr = 0; 2843 return TCL_CONVERT_NOSPACE; 2844 } 2845 memcpy((VOID *) dst, (VOID *) dataPtr->init, 2846 (size_t) dataPtr->initLen); 2847 dst += dataPtr->initLen; 2848 } else { 2849 state = (int) *statePtr; 2850 } 2851 2852 encodingPtr = GetTableEncoding(dataPtr, state); 2853 tableDataPtr = (TableEncodingData *) encodingPtr->clientData; 2854 tablePrefixBytes = tableDataPtr->prefixBytes; 2855 tableFromUnicode = tableDataPtr->fromUnicode; 2856 2857 for (numChars = 0; src < srcEnd; numChars++) { 2858 unsigned int len; 2859 int word; 2860 Tcl_UniChar ch; 2861 2862 if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { 2863 /* 2864 * If there is more string to follow, this will ensure that the 2865 * last UTF-8 character in the source buffer hasn't been cut off. 2866 */ 2867 2868 result = TCL_CONVERT_MULTIBYTE; 2869 break; 2870 } 2871 len = TclUtfToUniChar(src, &ch); 2872 word = tableFromUnicode[(ch >> 8)][ch & 0xff]; 2873 2874 if ((word == 0) && (ch != 0)) { 2875 int oldState; 2876 EscapeSubTable *subTablePtr; 2877 2878 oldState = state; 2879 for (state = 0; state < dataPtr->numSubTables; state++) { 2880 encodingPtr = GetTableEncoding(dataPtr, state); 2881 tableDataPtr = (TableEncodingData *) encodingPtr->clientData; 2882 word = tableDataPtr->fromUnicode[(ch >> 8)][ch & 0xff]; 2883 if (word != 0) { 2884 break; 2885 } 2886 } 2887 2888 if (word == 0) { 2889 state = oldState; 2890 if (flags & TCL_ENCODING_STOPONERROR) { 2891 result = TCL_CONVERT_UNKNOWN; 2892 break; 2893 } 2894 encodingPtr = GetTableEncoding(dataPtr, state); 2895 tableDataPtr = (TableEncodingData *) encodingPtr->clientData; 2896 word = tableDataPtr->fallback; 2897 } 2898 2899 tablePrefixBytes = tableDataPtr->prefixBytes; 2900 tableFromUnicode = tableDataPtr->fromUnicode; 2901 2902 /* 2903 * The state variable has the value of oldState when word is 0. 2904 * In this case, the escape sequense should not be copied to dst 2905 * because the current character set is not changed. 2906 */ 2907 if (state != oldState) { 2908 subTablePtr = &dataPtr->subTables[state]; 2909 if ((dst + subTablePtr->sequenceLen) > dstEnd) { 2910 /* 2911 * If there is no space to write the escape sequence, the 2912 * state variable must be changed to the value of oldState 2913 * variable because this escape sequence must be written 2914 * in the next conversion. 2915 */ 2916 state = oldState; 2917 result = TCL_CONVERT_NOSPACE; 2918 break; 2919 } 2920 memcpy((VOID *) dst, (VOID *) subTablePtr->sequence, 2921 (size_t) subTablePtr->sequenceLen); 2922 dst += subTablePtr->sequenceLen; 2923 } 2924 } 2925 2926 if (tablePrefixBytes[(word >> 8)] != 0) { 2927 if (dst + 1 > dstEnd) { 2928 result = TCL_CONVERT_NOSPACE; 2929 break; 2930 } 2931 dst[0] = (char) (word >> 8); 2932 dst[1] = (char) word; 2933 dst += 2; 2934 } else { 2935 if (dst > dstEnd) { 2936 result = TCL_CONVERT_NOSPACE; 2937 break; 2938 } 2939 dst[0] = (char) word; 2940 dst++; 2941 } 2942 src += len; 2943 } 2944 2945 if ((result == TCL_OK) && (flags & TCL_ENCODING_END)) { 2946 unsigned int len = dataPtr->subTables[0].sequenceLen; 2947 /* 2948 * [Bug 1516109]. 2949 * Certain encodings like iso2022-jp need to write 2950 * an escape sequence after all characters have 2951 * been converted. This logic checks that enough 2952 * room is available in the buffer for the escape bytes. 2953 * The TCL_ENCODING_END flag is cleared after a final 2954 * escape sequence has been added to the buffer so 2955 * that another call to this method does not attempt 2956 * to append escape bytes a second time. 2957 */ 2958 if ((dst + dataPtr->finalLen + (state?len:0)) > dstEnd) { 2959 result = TCL_CONVERT_NOSPACE; 2960 } else { 2961 if (state) { 2962 memcpy((VOID *) dst, (VOID *) dataPtr->subTables[0].sequence, 2963 (size_t) len); 2964 dst += len; 2965 } 2966 memcpy((VOID *) dst, (VOID *) dataPtr->final, 2967 (size_t) dataPtr->finalLen); 2968 dst += dataPtr->finalLen; 2969 state &= ~TCL_ENCODING_END; 2970 } 2971 } 2972 2973 *statePtr = (Tcl_EncodingState) state; 2974 *srcReadPtr = src - srcStart; 2975 *dstWrotePtr = dst - dstStart; 2976 *dstCharsPtr = numChars; 2977 return result; 2978} 2979 2980/* 2981 *--------------------------------------------------------------------------- 2982 * 2983 * EscapeFreeProc -- 2984 * 2985 * This procedure is invoked when an EscapeEncodingData encoding is 2986 * deleted. It deletes the memory used by the encoding. 2987 * 2988 * Results: 2989 * None. 2990 * 2991 * Side effects: 2992 * Memory freed. 2993 * 2994 *--------------------------------------------------------------------------- 2995 */ 2996 2997static void 2998EscapeFreeProc(clientData) 2999 ClientData clientData; /* EscapeEncodingData that specifies encoding. */ 3000{ 3001 EscapeEncodingData *dataPtr; 3002 EscapeSubTable *subTablePtr; 3003 int i; 3004 3005 dataPtr = (EscapeEncodingData *) clientData; 3006 if (dataPtr == NULL) { 3007 return; 3008 } 3009 /* 3010 * The subTables should be freed recursively in normal operation but not 3011 * during TclFinalizeEncodingSubsystem because they are also present as a 3012 * weak reference in the toplevel encodingTable (ie they don't have a +1 3013 * refcount for this), and unpredictable nuking order could remove them 3014 * from under the following loop's feet [Bug 2891556]. 3015 * 3016 * The encodingsInitialized flag, being reset on entry to TFES, can serve 3017 * as a "not in finalization" test. 3018 */ 3019 if (encodingsInitialized) 3020 { 3021 subTablePtr = dataPtr->subTables; 3022 for (i = 0; i < dataPtr->numSubTables; i++) { 3023 FreeEncoding((Tcl_Encoding) subTablePtr->encodingPtr); 3024 subTablePtr++; 3025 } 3026 } 3027 ckfree((char *) dataPtr); 3028} 3029 3030/* 3031 *--------------------------------------------------------------------------- 3032 * 3033 * GetTableEncoding -- 3034 * 3035 * Helper function for the EscapeEncodingData conversions. Gets the 3036 * encoding (of type TextEncodingData) that represents the specified 3037 * state. 3038 * 3039 * Results: 3040 * The return value is the encoding. 3041 * 3042 * Side effects: 3043 * If the encoding that represents the specified state has not 3044 * already been used by this EscapeEncoding, it will be loaded 3045 * and cached in the dataPtr. 3046 * 3047 *--------------------------------------------------------------------------- 3048 */ 3049 3050static Encoding * 3051GetTableEncoding(dataPtr, state) 3052 EscapeEncodingData *dataPtr;/* Contains names of encodings. */ 3053 int state; /* Index in dataPtr of desired Encoding. */ 3054{ 3055 EscapeSubTable *subTablePtr; 3056 Encoding *encodingPtr; 3057 3058 subTablePtr = &dataPtr->subTables[state]; 3059 encodingPtr = subTablePtr->encodingPtr; 3060 if (encodingPtr == NULL) { 3061 /* 3062 * Now that escape encodings load their sub-encodings first, and 3063 * fail to load if any sub-encodings are missing, this branch should 3064 * never happen. 3065 */ 3066 encodingPtr = (Encoding *) Tcl_GetEncoding(NULL, subTablePtr->name); 3067 if ((encodingPtr == NULL) 3068 || (encodingPtr->toUtfProc != TableToUtfProc)) { 3069 panic("EscapeToUtfProc: invalid sub table"); 3070 } 3071 subTablePtr->encodingPtr = encodingPtr; 3072 } 3073 return encodingPtr; 3074} 3075 3076/* 3077 *--------------------------------------------------------------------------- 3078 * 3079 * unilen -- 3080 * 3081 * A helper function for the Tcl_ExternalToUtf functions. This 3082 * function is similar to strlen for double-byte characters: it 3083 * returns the number of bytes in a 0x0000 terminated string. 3084 * 3085 * Results: 3086 * As above. 3087 * 3088 * Side effects: 3089 * None. 3090 * 3091 *--------------------------------------------------------------------------- 3092 */ 3093 3094static size_t 3095unilen(src) 3096 CONST char *src; 3097{ 3098 unsigned short *p; 3099 3100 p = (unsigned short *) src; 3101 while (*p != 0x0000) { 3102 p++; 3103 } 3104 return (char *) p - src; 3105} 3106 3107/* 3108 *------------------------------------------------------------------------- 3109 * 3110 * TclFindEncodings -- 3111 * 3112 * Find and load the encoding file for this operating system. 3113 * Before this is called, Tcl makes assumptions about the 3114 * native string representation, but the true encoding is not 3115 * assured. 3116 * 3117 * Results: 3118 * Return result of TclpInitLibraryPath, which reports whether the 3119 * path is clean (0) or dirty (1) UTF. 3120 * 3121 * Side effects: 3122 * Varied, see the respective initialization routines. 3123 * 3124 *------------------------------------------------------------------------- 3125 */ 3126 3127static int 3128TclFindEncodings(argv0) 3129 CONST char *argv0; /* Name of executable from argv[0] to main() 3130 * in native multi-byte encoding. */ 3131{ 3132 int mustCleanUtf = 0; 3133 3134 if (encodingsInitialized == 0) { 3135 /* 3136 * Double check inside the mutex. There may be calls 3137 * back into this routine from some of the procedures below. 3138 */ 3139 3140 TclpInitLock(); 3141 if (encodingsInitialized == 0) { 3142 char *native; 3143 Tcl_Obj *pathPtr; 3144 Tcl_DString libPath, buffer; 3145 3146 /* 3147 * Have to set this bit here to avoid deadlock with the 3148 * routines below us that call into TclInitSubsystems. 3149 */ 3150 3151 encodingsInitialized = 1; 3152 3153 native = TclpFindExecutable(argv0); 3154 mustCleanUtf = TclpInitLibraryPath(native); 3155 3156 /* 3157 * The library path was set in the TclpInitLibraryPath routine. 3158 * The string set is a dirty UTF string. To preserve the value 3159 * convert the UTF string back to native before setting the new 3160 * default encoding. 3161 */ 3162 3163 pathPtr = TclGetLibraryPath(); 3164 if ((pathPtr != NULL) && mustCleanUtf) { 3165 Tcl_UtfToExternalDString(NULL, Tcl_GetString(pathPtr), -1, 3166 &libPath); 3167 } 3168 3169 TclpSetInitialEncodings(); 3170 3171 /* 3172 * Now convert the native string back to UTF. 3173 */ 3174 3175 if ((pathPtr != NULL) && mustCleanUtf) { 3176 Tcl_ExternalToUtfDString(NULL, Tcl_DStringValue(&libPath), -1, 3177 &buffer); 3178 pathPtr = Tcl_NewStringObj(Tcl_DStringValue(&buffer), -1); 3179 TclSetLibraryPath(pathPtr); 3180 3181 Tcl_DStringFree(&libPath); 3182 Tcl_DStringFree(&buffer); 3183 } 3184 } 3185 TclpInitUnlock(); 3186 } 3187 3188 return mustCleanUtf; 3189} 3190