1/*
2 * tclEncoding.c --
3 *
4 *	Contains the implementation of the encoding conversion package.
5 *
6 * Copyright (c) 1996-1998 Sun Microsystems, Inc.
7 *
8 * See the file "license.terms" for information on usage and redistribution
9 * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
10 *
11 * RCS: @(#) $Id: tclEncoding.c,v 1.16.2.14 2007/02/12 19:25:42 andreas_kupries Exp $
12 */
13
14#include "tclInt.h"
15#include "tclPort.h"
16
17typedef size_t (LengthProc)_ANSI_ARGS_((CONST char *src));
18
19/*
20 * The following data structure represents an encoding, which describes how
21 * to convert between various character sets and UTF-8.
22 */
23
24typedef struct Encoding {
25    char *name;			/* Name of encoding.  Malloced because (1)
26				 * hash table entry that owns this encoding
27				 * may be freed prior to this encoding being
28				 * freed, (2) string passed in the
29				 * Tcl_EncodingType structure may not be
30				 * persistent. */
31    Tcl_EncodingConvertProc *toUtfProc;
32				/* Procedure to convert from external
33				 * encoding into UTF-8. */
34    Tcl_EncodingConvertProc *fromUtfProc;
35				/* Procedure to convert from UTF-8 into
36				 * external encoding. */
37    Tcl_EncodingFreeProc *freeProc;
38				/* If non-NULL, procedure to call when this
39				 * encoding is deleted. */
40    int nullSize;		/* Number of 0x00 bytes that signify
41				 * end-of-string in this encoding.  This
42				 * number is used to determine the source
43				 * string length when the srcLen argument is
44				 * negative.  This number can be 1 or 2. */
45    ClientData clientData;	/* Arbitrary value associated with encoding
46				 * type.  Passed to conversion procedures. */
47    LengthProc *lengthProc;	/* Function to compute length of
48				 * null-terminated strings in this encoding.
49				 * If nullSize is 1, this is strlen; if
50				 * nullSize is 2, this is a function that
51				 * returns the number of bytes in a 0x0000
52				 * terminated string. */
53    int refCount;		/* Number of uses of this structure. */
54    Tcl_HashEntry *hPtr;	/* Hash table entry that owns this encoding. */
55} Encoding;
56
57/*
58 * The following structure is the clientData for a dynamically-loaded,
59 * table-driven encoding created by LoadTableEncoding().  It maps between
60 * Unicode and a single-byte, double-byte, or multibyte (1 or 2 bytes only)
61 * encoding.
62 */
63
64typedef struct TableEncodingData {
65    int fallback;		/* Character (in this encoding) to
66				 * substitute when this encoding cannot
67				 * represent a UTF-8 character. */
68    char prefixBytes[256];	/* If a byte in the input stream is a lead
69				 * byte for a 2-byte sequence, the
70				 * corresponding entry in this array is 1,
71				 * otherwise it is 0. */
72    unsigned short **toUnicode;	/* Two dimensional sparse matrix to map
73				 * characters from the encoding to Unicode.
74				 * Each element of the toUnicode array points
75				 * to an array of 256 shorts.  If there is no
76				 * corresponding character in Unicode, the
77				 * value in the matrix is 0x0000.  malloc'd. */
78    unsigned short **fromUnicode;
79				/* Two dimensional sparse matrix to map
80				 * characters from Unicode to the encoding.
81				 * Each element of the fromUnicode array
82				 * points to an array of 256 shorts.  If there
83				 * is no corresponding character the encoding,
84				 * the value in the matrix is 0x0000.
85				 * malloc'd. */
86} TableEncodingData;
87
88/*
89 * The following structures is the clientData for a dynamically-loaded,
90 * escape-driven encoding that is itself comprised of other simpler
91 * encodings.  An example is "iso-2022-jp", which uses escape sequences to
92 * switch between ascii, jis0208, jis0212, gb2312, and ksc5601.  Note that
93 * "escape-driven" does not necessarily mean that the ESCAPE character is
94 * the character used for switching character sets.
95 */
96
97typedef struct EscapeSubTable {
98    unsigned int sequenceLen;	/* Length of following string. */
99    char sequence[16];		/* Escape code that marks this encoding. */
100    char name[32];		/* Name for encoding. */
101    Encoding *encodingPtr;	/* Encoding loaded using above name, or NULL
102				 * if this sub-encoding has not been needed
103				 * yet. */
104} EscapeSubTable;
105
106typedef struct EscapeEncodingData {
107    int fallback;		/* Character (in this encoding) to
108				 * substitute when this encoding cannot
109				 * represent a UTF-8 character. */
110    unsigned int initLen;	/* Length of following string. */
111    char init[16];		/* String to emit or expect before first char
112				 * in conversion. */
113    unsigned int finalLen;	/* Length of following string. */
114    char final[16];		/* String to emit or expect after last char
115				 * in conversion. */
116    char prefixBytes[256];	/* If a byte in the input stream is the
117				 * first character of one of the escape
118				 * sequences in the following array, the
119				 * corresponding entry in this array is 1,
120				 * otherwise it is 0. */
121    int numSubTables;		/* Length of following array. */
122    EscapeSubTable subTables[1];/* Information about each EscapeSubTable
123				 * used by this encoding type.  The actual
124				 * size will be as large as necessary to
125				 * hold all EscapeSubTables. */
126} EscapeEncodingData;
127
128/*
129 * Constants used when loading an encoding file to identify the type of the
130 * file.
131 */
132
133#define ENCODING_SINGLEBYTE	0
134#define ENCODING_DOUBLEBYTE	1
135#define ENCODING_MULTIBYTE	2
136#define ENCODING_ESCAPE		3
137
138/*
139 * Initialize the default encoding directory.  If this variable contains
140 * a non NULL value, it will be the first path used to locate the
141 * system encoding files.
142 */
143
144char *tclDefaultEncodingDir = NULL;
145
146static int encodingsInitialized  = 0;
147
148/*
149 * Hash table that keeps track of all loaded Encodings.  Keys are
150 * the string names that represent the encoding, values are (Encoding *).
151 */
152
153static Tcl_HashTable encodingTable;
154TCL_DECLARE_MUTEX(encodingMutex)
155
156/*
157 * The following are used to hold the default and current system encodings.
158 * If NULL is passed to one of the conversion routines, the current setting
159 * of the system encoding will be used to perform the conversion.
160 */
161
162static Tcl_Encoding defaultEncoding;
163static Tcl_Encoding systemEncoding;
164
165/*
166 * The following variable is used in the sparse matrix code for a
167 * TableEncoding to represent a page in the table that has no entries.
168 */
169
170static unsigned short emptyPage[256];
171
172/*
173 * Procedures used only in this module.
174 */
175
176static int		BinaryProc _ANSI_ARGS_((ClientData clientData,
177			    CONST char *src, int srcLen, int flags,
178			    Tcl_EncodingState *statePtr, char *dst, int dstLen,
179			    int *srcReadPtr, int *dstWrotePtr,
180			    int *dstCharsPtr));
181static void		DupEncodingIntRep _ANSI_ARGS_((Tcl_Obj *srcPtr,
182			    Tcl_Obj *dupPtr));
183static void		EscapeFreeProc _ANSI_ARGS_((ClientData clientData));
184static int		EscapeFromUtfProc _ANSI_ARGS_((ClientData clientData,
185			    CONST char *src, int srcLen, int flags,
186			    Tcl_EncodingState *statePtr, char *dst, int dstLen,
187			    int *srcReadPtr, int *dstWrotePtr,
188			    int *dstCharsPtr));
189static int		EscapeToUtfProc _ANSI_ARGS_((ClientData clientData,
190			    CONST char *src, int srcLen, int flags,
191			    Tcl_EncodingState *statePtr, char *dst, int dstLen,
192			    int *srcReadPtr, int *dstWrotePtr,
193			    int *dstCharsPtr));
194static void		FreeEncoding _ANSI_ARGS_((Tcl_Encoding encoding));
195static void		FreeEncodingIntRep _ANSI_ARGS_((Tcl_Obj *objPtr));
196static Encoding *	GetTableEncoding _ANSI_ARGS_((
197			    EscapeEncodingData *dataPtr, int state));
198static Tcl_Encoding	LoadEncodingFile _ANSI_ARGS_((Tcl_Interp *interp,
199			    CONST char *name));
200static Tcl_Encoding	LoadTableEncoding _ANSI_ARGS_((Tcl_Interp *interp,
201			    CONST char *name, int type, Tcl_Channel chan));
202static Tcl_Encoding	LoadEscapeEncoding _ANSI_ARGS_((CONST char *name,
203			    Tcl_Channel chan));
204static Tcl_Channel	OpenEncodingFile _ANSI_ARGS_((CONST char *dir,
205			    CONST char *name));
206static void		TableFreeProc _ANSI_ARGS_((ClientData clientData));
207static int		TableFromUtfProc _ANSI_ARGS_((ClientData clientData,
208			    CONST char *src, int srcLen, int flags,
209			    Tcl_EncodingState *statePtr, char *dst, int dstLen,
210			    int *srcReadPtr, int *dstWrotePtr,
211			    int *dstCharsPtr));
212static int		TableToUtfProc _ANSI_ARGS_((ClientData clientData,
213			    CONST char *src, int srcLen, int flags,
214			    Tcl_EncodingState *statePtr, char *dst, int dstLen,
215			    int *srcReadPtr, int *dstWrotePtr,
216			    int *dstCharsPtr));
217static size_t		unilen _ANSI_ARGS_((CONST char *src));
218static int		UnicodeToUtfProc _ANSI_ARGS_((ClientData clientData,
219			    CONST char *src, int srcLen, int flags,
220			    Tcl_EncodingState *statePtr, char *dst, int dstLen,
221			    int *srcReadPtr, int *dstWrotePtr,
222			    int *dstCharsPtr));
223static int		UtfToUnicodeProc _ANSI_ARGS_((ClientData clientData,
224			    CONST char *src, int srcLen, int flags,
225			    Tcl_EncodingState *statePtr, char *dst, int dstLen,
226			    int *srcReadPtr, int *dstWrotePtr,
227			    int *dstCharsPtr));
228static int		UtfToUtfProc _ANSI_ARGS_((ClientData clientData,
229			    CONST char *src, int srcLen, int flags,
230			    Tcl_EncodingState *statePtr, char *dst, int dstLen,
231			    int *srcReadPtr, int *dstWrotePtr,
232			    int *dstCharsPtr, int pureNullMode));
233static int		UtfIntToUtfExtProc _ANSI_ARGS_((ClientData clientData,
234			    CONST char *src, int srcLen, int flags,
235			    Tcl_EncodingState *statePtr, char *dst, int dstLen,
236			    int *srcReadPtr, int *dstWrotePtr,
237			    int *dstCharsPtr));
238static int		UtfExtToUtfIntProc _ANSI_ARGS_((ClientData clientData,
239			    CONST char *src, int srcLen, int flags,
240			    Tcl_EncodingState *statePtr, char *dst, int dstLen,
241			    int *srcReadPtr, int *dstWrotePtr,
242			    int *dstCharsPtr));
243static int		TclFindEncodings _ANSI_ARGS_((CONST char *argv0));
244
245/*
246 * A Tcl_ObjType for holding a cached Tcl_Encoding as the intrep.
247 * This should help the lifetime of encodings be more useful.
248 * See concerns raised in [Bug 1077262].
249 */
250
251static Tcl_ObjType EncodingType = {
252    "encoding", FreeEncodingIntRep, DupEncodingIntRep, NULL, NULL
253};
254
255
256/*
257 *----------------------------------------------------------------------
258 *
259 * TclGetEncodingFromObj --
260 *
261 *      Writes to (*encodingPtr) the Tcl_Encoding value of (*objPtr),
262 *      if possible, and returns TCL_OK.  If no such encoding exists,
263 *      TCL_ERROR is returned, and if interp is non-NULL, an error message
264 *      is written there.
265 *
266 * Results:
267 *      Standard Tcl return code.
268 *
269 * Side effects:
270 * 	Caches the Tcl_Encoding value as the internal rep of (*objPtr).
271 *
272 *----------------------------------------------------------------------
273 */
274int
275TclGetEncodingFromObj(interp, objPtr, encodingPtr)
276    Tcl_Interp *interp;
277    Tcl_Obj *objPtr;
278    Tcl_Encoding *encodingPtr;
279{
280    CONST char *name = Tcl_GetString(objPtr);
281    if (objPtr->typePtr != &EncodingType) {
282	Tcl_Encoding encoding = Tcl_GetEncoding(interp, name);
283
284	if (encoding == NULL) {
285	    return TCL_ERROR;
286	}
287	if (objPtr->typePtr && objPtr->typePtr->freeIntRepProc) {
288	    objPtr->typePtr->freeIntRepProc(objPtr);
289	}
290	objPtr->internalRep.otherValuePtr = (VOID *) encoding;
291	objPtr->typePtr = &EncodingType;
292    }
293    *encodingPtr = Tcl_GetEncoding(NULL, name);
294    return TCL_OK;
295}
296
297/*
298 *----------------------------------------------------------------------
299 *
300 * FreeEncodingIntRep --
301 *
302 *      The Tcl_FreeInternalRepProc for the "encoding" Tcl_ObjType.
303 *
304 *----------------------------------------------------------------------
305 */
306static void
307FreeEncodingIntRep(objPtr)
308    Tcl_Obj *objPtr;
309{
310    Tcl_FreeEncoding((Tcl_Encoding) objPtr->internalRep.otherValuePtr);
311}
312
313/*
314 *----------------------------------------------------------------------
315 *
316 * DupEncodingIntRep --
317 *
318 *      The Tcl_DupInternalRepProc for the "encoding" Tcl_ObjType.
319 *
320 *----------------------------------------------------------------------
321 */
322static void
323DupEncodingIntRep(srcPtr, dupPtr)
324    Tcl_Obj *srcPtr;
325    Tcl_Obj *dupPtr;
326{
327    dupPtr->internalRep.otherValuePtr = (VOID *)
328	    Tcl_GetEncoding(NULL, srcPtr->bytes);
329}
330
331/*
332 *---------------------------------------------------------------------------
333 *
334 * TclInitEncodingSubsystem --
335 *
336 *	Initialize all resources used by this subsystem on a per-process
337 *	basis.
338 *
339 * Results:
340 *	None.
341 *
342 * Side effects:
343 *	Depends on the memory, object, and IO subsystems.
344 *
345 *---------------------------------------------------------------------------
346 */
347
348void
349TclInitEncodingSubsystem()
350{
351    Tcl_EncodingType type;
352
353    Tcl_MutexLock(&encodingMutex);
354    Tcl_InitHashTable(&encodingTable, TCL_STRING_KEYS);
355    Tcl_MutexUnlock(&encodingMutex);
356
357    /*
358     * Create a few initial encodings.  Note that the UTF-8 to UTF-8
359     * translation is not a no-op, because it will turn a stream of
360     * improperly formed UTF-8 into a properly formed stream.
361     */
362
363    type.encodingName	= "identity";
364    type.toUtfProc	= BinaryProc;
365    type.fromUtfProc	= BinaryProc;
366    type.freeProc	= NULL;
367    type.nullSize	= 1;
368    type.clientData	= NULL;
369
370    defaultEncoding	= Tcl_CreateEncoding(&type);
371    systemEncoding	= Tcl_GetEncoding(NULL, type.encodingName);
372
373    type.encodingName	= "utf-8";
374    type.toUtfProc	= UtfExtToUtfIntProc;
375    type.fromUtfProc	= UtfIntToUtfExtProc;
376    type.freeProc	= NULL;
377    type.nullSize	= 1;
378    type.clientData	= NULL;
379    Tcl_CreateEncoding(&type);
380
381    type.encodingName   = "unicode";
382    type.toUtfProc	= UnicodeToUtfProc;
383    type.fromUtfProc    = UtfToUnicodeProc;
384    type.freeProc	= NULL;
385    type.nullSize	= 2;
386    type.clientData	= NULL;
387    Tcl_CreateEncoding(&type);
388}
389
390
391/*
392 *----------------------------------------------------------------------
393 *
394 * TclFinalizeEncodingSubsystem --
395 *
396 *	Release the state associated with the encoding subsystem.
397 *
398 * Results:
399 *	None.
400 *
401 * Side effects:
402 *	Frees all of the encodings.
403 *
404 *----------------------------------------------------------------------
405 */
406
407void
408TclFinalizeEncodingSubsystem()
409{
410    Tcl_HashSearch search;
411    Tcl_HashEntry *hPtr;
412
413    Tcl_MutexLock(&encodingMutex);
414    encodingsInitialized  = 0;
415    FreeEncoding(systemEncoding);
416    hPtr = Tcl_FirstHashEntry(&encodingTable, &search);
417    while (hPtr != NULL) {
418	/*
419	 * Call FreeEncoding instead of doing it directly to handle refcounts
420	 * like escape encodings use.  [Bug #524674]
421	 * Make sure to call Tcl_FirstHashEntry repeatedly so that all
422	 * encodings are eventually cleaned up.
423	 */
424	FreeEncoding((Tcl_Encoding) Tcl_GetHashValue(hPtr));
425	hPtr = Tcl_FirstHashEntry(&encodingTable, &search);
426    }
427    Tcl_DeleteHashTable(&encodingTable);
428    Tcl_MutexUnlock(&encodingMutex);
429}
430
431/*
432 *-------------------------------------------------------------------------
433 *
434 * Tcl_GetDefaultEncodingDir --
435 *
436 *
437 * Results:
438 *
439 * Side effects:
440 *
441 *-------------------------------------------------------------------------
442 */
443
444CONST char *
445Tcl_GetDefaultEncodingDir()
446{
447    return tclDefaultEncodingDir;
448}
449
450/*
451 *-------------------------------------------------------------------------
452 *
453 * Tcl_SetDefaultEncodingDir --
454 *
455 *
456 * Results:
457 *
458 * Side effects:
459 *
460 *-------------------------------------------------------------------------
461 */
462
463void
464Tcl_SetDefaultEncodingDir(path)
465    CONST char *path;
466{
467    tclDefaultEncodingDir = (char *)ckalloc((unsigned) strlen(path) + 1);
468    strcpy(tclDefaultEncodingDir, path);
469}
470
471/*
472 *-------------------------------------------------------------------------
473 *
474 * Tcl_GetEncoding --
475 *
476 *	Given the name of a encoding, find the corresponding Tcl_Encoding
477 *	token.  If the encoding did not already exist, Tcl attempts to
478 *	dynamically load an encoding by that name.
479 *
480 * Results:
481 *	Returns a token that represents the encoding.  If the name didn't
482 *	refer to any known or loadable encoding, NULL is returned.  If
483 *	NULL was returned, an error message is left in interp's result
484 *	object, unless interp was NULL.
485 *
486 * Side effects:
487 *	The new encoding type is entered into a table visible to all
488 *	interpreters, keyed off the encoding's name.  For each call to
489 *	this procedure, there should eventually be a call to
490 *	Tcl_FreeEncoding, so that the database can be cleaned up when
491 *	encodings aren't needed anymore.
492 *
493 *-------------------------------------------------------------------------
494 */
495
496Tcl_Encoding
497Tcl_GetEncoding(interp, name)
498    Tcl_Interp *interp;		/* Interp for error reporting, if not NULL. */
499    CONST char *name;		/* The name of the desired encoding. */
500{
501    Tcl_HashEntry *hPtr;
502    Encoding *encodingPtr;
503
504    Tcl_MutexLock(&encodingMutex);
505    if (name == NULL) {
506	encodingPtr = (Encoding *) systemEncoding;
507	encodingPtr->refCount++;
508	Tcl_MutexUnlock(&encodingMutex);
509	return systemEncoding;
510    }
511
512    hPtr = Tcl_FindHashEntry(&encodingTable, name);
513    if (hPtr != NULL) {
514	encodingPtr = (Encoding *) Tcl_GetHashValue(hPtr);
515	encodingPtr->refCount++;
516	Tcl_MutexUnlock(&encodingMutex);
517	return (Tcl_Encoding) encodingPtr;
518    }
519    Tcl_MutexUnlock(&encodingMutex);
520    return LoadEncodingFile(interp, name);
521}
522
523/*
524 *---------------------------------------------------------------------------
525 *
526 * Tcl_FreeEncoding --
527 *
528 *	This procedure is called to release an encoding allocated by
529 *	Tcl_CreateEncoding() or Tcl_GetEncoding().
530 *
531 * Results:
532 *	None.
533 *
534 * Side effects:
535 *	The reference count associated with the encoding is decremented
536 *	and the encoding may be deleted if nothing is using it anymore.
537 *
538 *---------------------------------------------------------------------------
539 */
540
541void
542Tcl_FreeEncoding(encoding)
543    Tcl_Encoding encoding;
544{
545    Tcl_MutexLock(&encodingMutex);
546    FreeEncoding(encoding);
547    Tcl_MutexUnlock(&encodingMutex);
548}
549
550/*
551 *----------------------------------------------------------------------
552 *
553 * FreeEncoding --
554 *
555 *	This procedure is called to release an encoding by procedures
556 *	that already have the encodingMutex.
557 *
558 * Results:
559 *	None.
560 *
561 * Side effects:
562 *	The reference count associated with the encoding is decremented
563 *	and the encoding may be deleted if nothing is using it anymore.
564 *
565 *----------------------------------------------------------------------
566 */
567
568static void
569FreeEncoding(encoding)
570    Tcl_Encoding encoding;
571{
572    Encoding *encodingPtr;
573
574    encodingPtr = (Encoding *) encoding;
575    if (encodingPtr == NULL) {
576	return;
577    }
578    if (encodingPtr->refCount<=0) {
579	Tcl_Panic("FreeEncoding: refcount problem !!!");
580    }
581    encodingPtr->refCount--;
582    if (encodingPtr->refCount == 0) {
583	if (encodingPtr->freeProc != NULL) {
584	    (*encodingPtr->freeProc)(encodingPtr->clientData);
585	}
586	if (encodingPtr->hPtr != NULL) {
587	    Tcl_DeleteHashEntry(encodingPtr->hPtr);
588	}
589	ckfree((char *) encodingPtr->name);
590	ckfree((char *) encodingPtr);
591    }
592}
593
594/*
595 *-------------------------------------------------------------------------
596 *
597 * Tcl_GetEncodingName --
598 *
599 *	Given an encoding, return the name that was used to constuct
600 *	the encoding.
601 *
602 * Results:
603 *	The name of the encoding.
604 *
605 * Side effects:
606 *	None.
607 *
608 *---------------------------------------------------------------------------
609 */
610
611CONST char *
612Tcl_GetEncodingName(encoding)
613    Tcl_Encoding encoding;	/* The encoding whose name to fetch. */
614{
615    Encoding *encodingPtr;
616
617    if (encoding == NULL) {
618	encoding = systemEncoding;
619    }
620    encodingPtr = (Encoding *) encoding;
621    return encodingPtr->name;
622}
623
624/*
625 *-------------------------------------------------------------------------
626 *
627 * Tcl_GetEncodingNames --
628 *
629 *	Get the list of all known encodings, including the ones stored
630 *	as files on disk in the encoding path.
631 *
632 * Results:
633 *	Modifies interp's result object to hold a list of all the available
634 *	encodings.
635 *
636 * Side effects:
637 *	None.
638 *
639 *-------------------------------------------------------------------------
640 */
641
642void
643Tcl_GetEncodingNames(interp)
644    Tcl_Interp *interp;		/* Interp to hold result. */
645{
646    Tcl_HashSearch search;
647    Tcl_HashEntry *hPtr;
648    Tcl_Obj *pathPtr, *resultPtr;
649    int dummy;
650
651    Tcl_HashTable table;
652
653    Tcl_MutexLock(&encodingMutex);
654    Tcl_InitHashTable(&table, TCL_STRING_KEYS);
655    hPtr = Tcl_FirstHashEntry(&encodingTable, &search);
656    while (hPtr != NULL) {
657	Encoding *encodingPtr;
658
659	encodingPtr = (Encoding *) Tcl_GetHashValue(hPtr);
660	Tcl_CreateHashEntry(&table, encodingPtr->name, &dummy);
661	hPtr = Tcl_NextHashEntry(&search);
662    }
663    Tcl_MutexUnlock(&encodingMutex);
664
665    pathPtr = TclGetLibraryPath();
666    if (pathPtr != NULL) {
667	int i, objc;
668	Tcl_Obj **objv;
669	char globArgString[10];
670	Tcl_Obj* encodingObj = Tcl_NewStringObj("encoding",-1);
671	Tcl_IncrRefCount(encodingObj);
672
673	objc = 0;
674	Tcl_ListObjGetElements(NULL, pathPtr, &objc, &objv);
675
676	for (i = 0; i < objc; i++) {
677	    Tcl_Obj *searchIn;
678
679	    /*
680	     * Construct the path from the element of pathPtr,
681	     * joined with 'encoding'.
682	     */
683	    searchIn = Tcl_FSJoinToPath(objv[i],1,&encodingObj);
684	    Tcl_IncrRefCount(searchIn);
685	    Tcl_ResetResult(interp);
686
687	    /*
688	     * TclGlob() changes the contents of globArgString, which causes
689	     * a segfault if we pass in a pointer to non-writeable memory.
690	     * TclGlob() puts its results directly into interp.
691	     */
692
693	    strcpy(globArgString, "*.enc");
694	    /*
695	     * The GLOBMODE_TAILS flag returns just the tail of each file
696	     * which is the encoding name with a .enc extension
697	     */
698	    if ((TclGlob(interp, globArgString, searchIn,
699			 TCL_GLOBMODE_TAILS, NULL) == TCL_OK)) {
700		int objc2 = 0;
701		Tcl_Obj **objv2;
702		int j;
703
704		Tcl_ListObjGetElements(NULL, Tcl_GetObjResult(interp), &objc2,
705			&objv2);
706
707		for (j = 0; j < objc2; j++) {
708		    int length;
709		    char *string;
710		    string = Tcl_GetStringFromObj(objv2[j], &length);
711		    length -= 4;
712		    if (length > 0) {
713			string[length] = '\0';
714			Tcl_CreateHashEntry(&table, string, &dummy);
715			string[length] = '.';
716		    }
717		}
718	    }
719	    Tcl_DecrRefCount(searchIn);
720	}
721	Tcl_DecrRefCount(encodingObj);
722    }
723
724    /*
725     * Clear any values placed in the result by globbing.
726     */
727
728    Tcl_ResetResult(interp);
729    resultPtr = Tcl_GetObjResult(interp);
730
731    hPtr = Tcl_FirstHashEntry(&table, &search);
732    while (hPtr != NULL) {
733	Tcl_Obj *strPtr;
734
735	strPtr = Tcl_NewStringObj(Tcl_GetHashKey(&table, hPtr), -1);
736	Tcl_ListObjAppendElement(NULL, resultPtr, strPtr);
737	hPtr = Tcl_NextHashEntry(&search);
738    }
739    Tcl_DeleteHashTable(&table);
740}
741
742/*
743 *------------------------------------------------------------------------
744 *
745 * Tcl_SetSystemEncoding --
746 *
747 *	Sets the default encoding that should be used whenever the user
748 *	passes a NULL value in to one of the conversion routines.
749 *	If the supplied name is NULL, the system encoding is reset to the
750 *	default system encoding.
751 *
752 * Results:
753 *	The return value is TCL_OK if the system encoding was successfully
754 *	set to the encoding specified by name, TCL_ERROR otherwise.  If
755 *	TCL_ERROR is returned, an error message is left in interp's result
756 *	object, unless interp was NULL.
757 *
758 * Side effects:
759 *	The reference count of the new system encoding is incremented.
760 *	The reference count of the old system encoding is decremented and
761 *	it may be freed.
762 *
763 *------------------------------------------------------------------------
764 */
765
766int
767Tcl_SetSystemEncoding(interp, name)
768    Tcl_Interp *interp;		/* Interp for error reporting, if not NULL. */
769    CONST char *name;		/* The name of the desired encoding, or NULL
770				 * to reset to default encoding. */
771{
772    Tcl_Encoding encoding;
773    Encoding *encodingPtr;
774
775    if (name == NULL) {
776	Tcl_MutexLock(&encodingMutex);
777	encoding = defaultEncoding;
778	encodingPtr = (Encoding *) encoding;
779	encodingPtr->refCount++;
780	Tcl_MutexUnlock(&encodingMutex);
781    } else {
782	encoding = Tcl_GetEncoding(interp, name);
783	if (encoding == NULL) {
784	    return TCL_ERROR;
785	}
786    }
787
788    Tcl_MutexLock(&encodingMutex);
789    FreeEncoding(systemEncoding);
790    systemEncoding = encoding;
791    Tcl_MutexUnlock(&encodingMutex);
792
793    return TCL_OK;
794}
795
796/*
797 *---------------------------------------------------------------------------
798 *
799 * Tcl_CreateEncoding --
800 *
801 *	This procedure is called to define a new encoding and the procedures
802 *	that are used to convert between the specified encoding and Unicode.
803 *
804 * Results:
805 *	Returns a token that represents the encoding.  If an encoding with
806 *	the same name already existed, the old encoding token remains
807 *	valid and continues to behave as it used to, and will eventually
808 *	be garbage collected when the last reference to it goes away.  Any
809 *	subsequent calls to Tcl_GetEncoding with the specified name will
810 *	retrieve the most recent encoding token.
811 *
812 * Side effects:
813 *	The new encoding type is entered into a table visible to all
814 *	interpreters, keyed off the encoding's name.  For each call to
815 *	this procedure, there should eventually be a call to
816 *	Tcl_FreeEncoding, so that the database can be cleaned up when
817 *	encodings aren't needed anymore.
818 *
819 *---------------------------------------------------------------------------
820 */
821
822Tcl_Encoding
823Tcl_CreateEncoding(typePtr)
824    Tcl_EncodingType *typePtr;	/* The encoding type. */
825{
826    Tcl_HashEntry *hPtr;
827    int new;
828    Encoding *encodingPtr;
829    char *name;
830
831    Tcl_MutexLock(&encodingMutex);
832    hPtr = Tcl_CreateHashEntry(&encodingTable, typePtr->encodingName, &new);
833    if (new == 0) {
834	/*
835	 * Remove old encoding from hash table, but don't delete it until
836	 * last reference goes away.
837	 */
838
839	encodingPtr = (Encoding *) Tcl_GetHashValue(hPtr);
840	encodingPtr->hPtr = NULL;
841    }
842
843    name = ckalloc((unsigned) strlen(typePtr->encodingName) + 1);
844
845    encodingPtr = (Encoding *) ckalloc(sizeof(Encoding));
846    encodingPtr->name		= strcpy(name, typePtr->encodingName);
847    encodingPtr->toUtfProc	= typePtr->toUtfProc;
848    encodingPtr->fromUtfProc	= typePtr->fromUtfProc;
849    encodingPtr->freeProc	= typePtr->freeProc;
850    encodingPtr->nullSize	= typePtr->nullSize;
851    encodingPtr->clientData	= typePtr->clientData;
852    if (typePtr->nullSize == 1) {
853	encodingPtr->lengthProc = (LengthProc *) strlen;
854    } else {
855	encodingPtr->lengthProc = (LengthProc *) unilen;
856    }
857    encodingPtr->refCount	= 1;
858    encodingPtr->hPtr		= hPtr;
859    Tcl_SetHashValue(hPtr, encodingPtr);
860
861    Tcl_MutexUnlock(&encodingMutex);
862
863    return (Tcl_Encoding) encodingPtr;
864}
865
866/*
867 *-------------------------------------------------------------------------
868 *
869 * Tcl_ExternalToUtfDString --
870 *
871 *	Convert a source buffer from the specified encoding into UTF-8.
872 *	If any of the bytes in the source buffer are invalid or cannot
873 *	be represented in the target encoding, a default fallback
874 *	character will be substituted.
875 *
876 * Results:
877 *	The converted bytes are stored in the DString, which is then NULL
878 *	terminated.  The return value is a pointer to the value stored
879 *	in the DString.
880 *
881 * Side effects:
882 *	None.
883 *
884 *-------------------------------------------------------------------------
885 */
886
887char *
888Tcl_ExternalToUtfDString(encoding, src, srcLen, dstPtr)
889    Tcl_Encoding encoding;	/* The encoding for the source string, or
890				 * NULL for the default system encoding. */
891    CONST char *src;		/* Source string in specified encoding. */
892    int srcLen;			/* Source string length in bytes, or < 0 for
893				 * encoding-specific string length. */
894    Tcl_DString *dstPtr;	/* Uninitialized or free DString in which
895				 * the converted string is stored. */
896{
897    char *dst;
898    Tcl_EncodingState state;
899    Encoding *encodingPtr;
900    int flags, dstLen, result, soFar, srcRead, dstWrote, dstChars;
901
902    Tcl_DStringInit(dstPtr);
903    dst = Tcl_DStringValue(dstPtr);
904    dstLen = dstPtr->spaceAvl - 1;
905
906    if (encoding == NULL) {
907	encoding = systemEncoding;
908    }
909    encodingPtr = (Encoding *) encoding;
910
911    if (src == NULL) {
912	srcLen = 0;
913    } else if (srcLen < 0) {
914	srcLen = (*encodingPtr->lengthProc)(src);
915    }
916    flags = TCL_ENCODING_START | TCL_ENCODING_END;
917    while (1) {
918	result = (*encodingPtr->toUtfProc)(encodingPtr->clientData, src,
919		srcLen, flags, &state, dst, dstLen, &srcRead, &dstWrote,
920		&dstChars);
921	soFar = dst + dstWrote - Tcl_DStringValue(dstPtr);
922	if (result != TCL_CONVERT_NOSPACE) {
923	    Tcl_DStringSetLength(dstPtr, soFar);
924	    return Tcl_DStringValue(dstPtr);
925	}
926	flags &= ~TCL_ENCODING_START;
927	src += srcRead;
928	srcLen -= srcRead;
929	if (Tcl_DStringLength(dstPtr) == 0) {
930	    Tcl_DStringSetLength(dstPtr, dstLen);
931	}
932	Tcl_DStringSetLength(dstPtr, 2 * Tcl_DStringLength(dstPtr) + 1);
933	dst = Tcl_DStringValue(dstPtr) + soFar;
934	dstLen = Tcl_DStringLength(dstPtr) - soFar - 1;
935    }
936}
937
938/*
939 *-------------------------------------------------------------------------
940 *
941 * Tcl_ExternalToUtf --
942 *
943 *	Convert a source buffer from the specified encoding into UTF-8.
944 *
945 * Results:
946 *	The return value is one of TCL_OK, TCL_CONVERT_MULTIBYTE,
947 *	TCL_CONVERT_SYNTAX, TCL_CONVERT_UNKNOWN, or TCL_CONVERT_NOSPACE,
948 *	as documented in tcl.h.
949 *
950 * Side effects:
951 *	The converted bytes are stored in the output buffer.
952 *
953 *-------------------------------------------------------------------------
954 */
955
956int
957Tcl_ExternalToUtf(interp, encoding, src, srcLen, flags, statePtr, dst,
958	dstLen, srcReadPtr, dstWrotePtr, dstCharsPtr)
959    Tcl_Interp *interp;		/* Interp for error return, if not NULL. */
960    Tcl_Encoding encoding;	/* The encoding for the source string, or
961				 * NULL for the default system encoding. */
962    CONST char *src;		/* Source string in specified encoding. */
963    int srcLen;			/* Source string length in bytes, or < 0 for
964				 * encoding-specific string length. */
965    int flags;			/* Conversion control flags. */
966    Tcl_EncodingState *statePtr;/* Place for conversion routine to store
967				 * state information used during a piecewise
968				 * conversion.  Contents of statePtr are
969				 * initialized and/or reset by conversion
970				 * routine under control of flags argument. */
971    char *dst;			/* Output buffer in which converted string
972				 * is stored. */
973    int dstLen;			/* The maximum length of output buffer in
974				 * bytes. */
975    int *srcReadPtr;		/* Filled with the number of bytes from the
976				 * source string that were converted.  This
977				 * may be less than the original source length
978				 * if there was a problem converting some
979				 * source characters. */
980    int *dstWrotePtr;		/* Filled with the number of bytes that were
981				 * stored in the output buffer as a result of
982				 * the conversion. */
983    int *dstCharsPtr;		/* Filled with the number of characters that
984				 * correspond to the bytes stored in the
985				 * output buffer. */
986{
987    Encoding *encodingPtr;
988    int result, srcRead, dstWrote, dstChars;
989    Tcl_EncodingState state;
990
991    if (encoding == NULL) {
992	encoding = systemEncoding;
993    }
994    encodingPtr = (Encoding *) encoding;
995
996    if (src == NULL) {
997	srcLen = 0;
998    } else if (srcLen < 0) {
999	srcLen = (*encodingPtr->lengthProc)(src);
1000    }
1001    if (statePtr == NULL) {
1002	flags |= TCL_ENCODING_START | TCL_ENCODING_END;
1003	statePtr = &state;
1004    }
1005    if (srcReadPtr == NULL) {
1006	srcReadPtr = &srcRead;
1007    }
1008    if (dstWrotePtr == NULL) {
1009	dstWrotePtr = &dstWrote;
1010    }
1011    if (dstCharsPtr == NULL) {
1012	dstCharsPtr = &dstChars;
1013    }
1014
1015    /*
1016     * If there are any null characters in the middle of the buffer, they will
1017     * converted to the UTF-8 null character (\xC080).  To get the actual
1018     * \0 at the end of the destination buffer, we need to append it manually.
1019     */
1020
1021    dstLen--;
1022    result = (*encodingPtr->toUtfProc)(encodingPtr->clientData, src, srcLen,
1023	    flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr,
1024	    dstCharsPtr);
1025    dst[*dstWrotePtr] = '\0';
1026    return result;
1027}
1028
1029/*
1030 *-------------------------------------------------------------------------
1031 *
1032 * Tcl_UtfToExternalDString --
1033 *
1034 *	Convert a source buffer from UTF-8 into the specified encoding.
1035 *	If any of the bytes in the source buffer are invalid or cannot
1036 *	be represented in the target encoding, a default fallback
1037 *	character will be substituted.
1038 *
1039 * Results:
1040 *	The converted bytes are stored in the DString, which is then
1041 *	NULL terminated in an encoding-specific manner.  The return value
1042 *	is a pointer to the value stored in the DString.
1043 *
1044 * Side effects:
1045 *	None.
1046 *
1047 *-------------------------------------------------------------------------
1048 */
1049
1050char *
1051Tcl_UtfToExternalDString(encoding, src, srcLen, dstPtr)
1052    Tcl_Encoding encoding;	/* The encoding for the converted string,
1053				 * or NULL for the default system encoding. */
1054    CONST char *src;		/* Source string in UTF-8. */
1055    int srcLen;			/* Source string length in bytes, or < 0 for
1056				 * strlen(). */
1057    Tcl_DString *dstPtr;	/* Uninitialized or free DString in which
1058				 * the converted string is stored. */
1059{
1060    char *dst;
1061    Tcl_EncodingState state;
1062    Encoding *encodingPtr;
1063    int flags, dstLen, result, soFar, srcRead, dstWrote, dstChars;
1064
1065    Tcl_DStringInit(dstPtr);
1066    dst = Tcl_DStringValue(dstPtr);
1067    dstLen = dstPtr->spaceAvl - 1;
1068
1069    if (encoding == NULL) {
1070	encoding = systemEncoding;
1071    }
1072    encodingPtr = (Encoding *) encoding;
1073
1074    if (src == NULL) {
1075	srcLen = 0;
1076    } else if (srcLen < 0) {
1077	srcLen = strlen(src);
1078    }
1079    flags = TCL_ENCODING_START | TCL_ENCODING_END;
1080    while (1) {
1081	result = (*encodingPtr->fromUtfProc)(encodingPtr->clientData, src,
1082		srcLen, flags, &state, dst, dstLen, &srcRead, &dstWrote,
1083		&dstChars);
1084	soFar = dst + dstWrote - Tcl_DStringValue(dstPtr);
1085	if (result != TCL_CONVERT_NOSPACE) {
1086	    if (encodingPtr->nullSize == 2) {
1087	        Tcl_DStringSetLength(dstPtr, soFar + 1);
1088	    }
1089	    Tcl_DStringSetLength(dstPtr, soFar);
1090	    return Tcl_DStringValue(dstPtr);
1091	}
1092	flags &= ~TCL_ENCODING_START;
1093	src += srcRead;
1094	srcLen -= srcRead;
1095	if (Tcl_DStringLength(dstPtr) == 0) {
1096	    Tcl_DStringSetLength(dstPtr, dstLen);
1097	}
1098	Tcl_DStringSetLength(dstPtr, 2 * Tcl_DStringLength(dstPtr) + 1);
1099	dst = Tcl_DStringValue(dstPtr) + soFar;
1100	dstLen = Tcl_DStringLength(dstPtr) - soFar - 1;
1101    }
1102}
1103
1104/*
1105 *-------------------------------------------------------------------------
1106 *
1107 * Tcl_UtfToExternal --
1108 *
1109 *	Convert a buffer from UTF-8 into the specified encoding.
1110 *
1111 * Results:
1112 *	The return value is one of TCL_OK, TCL_CONVERT_MULTIBYTE,
1113 *	TCL_CONVERT_SYNTAX, TCL_CONVERT_UNKNOWN, or TCL_CONVERT_NOSPACE,
1114 *	as documented in tcl.h.
1115 *
1116 * Side effects:
1117 *	The converted bytes are stored in the output buffer.
1118 *
1119 *-------------------------------------------------------------------------
1120 */
1121
1122int
1123Tcl_UtfToExternal(interp, encoding, src, srcLen, flags, statePtr, dst,
1124	dstLen, srcReadPtr, dstWrotePtr, dstCharsPtr)
1125    Tcl_Interp *interp;		/* Interp for error return, if not NULL. */
1126    Tcl_Encoding encoding;	/* The encoding for the converted string,
1127				 * or NULL for the default system encoding. */
1128    CONST char *src;		/* Source string in UTF-8. */
1129    int srcLen;			/* Source string length in bytes, or < 0 for
1130				 * strlen(). */
1131    int flags;			/* Conversion control flags. */
1132    Tcl_EncodingState *statePtr;/* Place for conversion routine to store
1133				 * state information used during a piecewise
1134				 * conversion.  Contents of statePtr are
1135				 * initialized and/or reset by conversion
1136				 * routine under control of flags argument. */
1137    char *dst;			/* Output buffer in which converted string
1138				 * is stored. */
1139    int dstLen;			/* The maximum length of output buffer in
1140				 * bytes. */
1141    int *srcReadPtr;		/* Filled with the number of bytes from the
1142				 * source string that were converted.  This
1143				 * may be less than the original source length
1144				 * if there was a problem converting some
1145				 * source characters. */
1146    int *dstWrotePtr;		/* Filled with the number of bytes that were
1147				 * stored in the output buffer as a result of
1148				 * the conversion. */
1149    int *dstCharsPtr;		/* Filled with the number of characters that
1150				 * correspond to the bytes stored in the
1151				 * output buffer. */
1152{
1153    Encoding *encodingPtr;
1154    int result, srcRead, dstWrote, dstChars;
1155    Tcl_EncodingState state;
1156
1157    if (encoding == NULL) {
1158	encoding = systemEncoding;
1159    }
1160    encodingPtr = (Encoding *) encoding;
1161
1162    if (src == NULL) {
1163	srcLen = 0;
1164    } else if (srcLen < 0) {
1165	srcLen = strlen(src);
1166    }
1167    if (statePtr == NULL) {
1168	flags |= TCL_ENCODING_START | TCL_ENCODING_END;
1169	statePtr = &state;
1170    }
1171    if (srcReadPtr == NULL) {
1172	srcReadPtr = &srcRead;
1173    }
1174    if (dstWrotePtr == NULL) {
1175	dstWrotePtr = &dstWrote;
1176    }
1177    if (dstCharsPtr == NULL) {
1178	dstCharsPtr = &dstChars;
1179    }
1180
1181    dstLen -= encodingPtr->nullSize;
1182    result = (*encodingPtr->fromUtfProc)(encodingPtr->clientData, src, srcLen,
1183	    flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr,
1184	    dstCharsPtr);
1185    if (encodingPtr->nullSize == 2) {
1186	dst[*dstWrotePtr + 1] = '\0';
1187    }
1188    dst[*dstWrotePtr] = '\0';
1189
1190    return result;
1191}
1192
1193/*
1194 *---------------------------------------------------------------------------
1195 *
1196 * Tcl_FindExecutable --
1197 *
1198 *	This procedure computes the absolute path name of the current
1199 *	application, given its argv[0] value.
1200 *
1201 * Results:
1202 *	None.
1203 *
1204 * Side effects:
1205 *	The variable tclExecutableName gets filled in with the file
1206 *	name for the application, if we figured it out.  If we couldn't
1207 *	figure it out, tclExecutableName is set to NULL.
1208 *
1209 *---------------------------------------------------------------------------
1210 */
1211
1212void
1213Tcl_FindExecutable(argv0)
1214    CONST char *argv0;		/* The value of the application's argv[0]
1215				 * (native). */
1216{
1217    int mustCleanUtf;
1218    CONST char *name;
1219    Tcl_DString buffer, nameString;
1220
1221    TclInitSubsystems(argv0);
1222
1223    if (argv0 == NULL) {
1224	goto done;
1225    }
1226    if (tclExecutableName != NULL) {
1227	ckfree(tclExecutableName);
1228	tclExecutableName = NULL;
1229    }
1230    if ((name = TclpFindExecutable(argv0)) == NULL) {
1231	goto done;
1232    }
1233
1234    /*
1235     * The value returned from TclpNameOfExecutable is a UTF string that
1236     * is possibly dirty depending on when it was initialized.
1237     * TclFindEncodings will indicate whether we must "clean" the UTF (as
1238     * reported by the underlying system).  To assure that the UTF string
1239     * is a properly encoded native string for this system, convert the
1240     * UTF string to the default native encoding before the default
1241     * encoding is initialized.  Then, convert it back to UTF after the
1242     * system encoding is loaded.
1243     */
1244
1245    Tcl_UtfToExternalDString(NULL, name, -1, &buffer);
1246    mustCleanUtf = TclFindEncodings(argv0);
1247
1248    /*
1249     * Now it is OK to convert the native string back to UTF and set
1250     * the value of the tclExecutableName.
1251     */
1252
1253    if (mustCleanUtf) {
1254	Tcl_ExternalToUtfDString(NULL, Tcl_DStringValue(&buffer), -1,
1255		&nameString);
1256	tclExecutableName = (char *)
1257	    ckalloc((unsigned) (Tcl_DStringLength(&nameString) + 1));
1258	strcpy(tclExecutableName, Tcl_DStringValue(&nameString));
1259
1260	Tcl_DStringFree(&nameString);
1261    } else {
1262	tclExecutableName = (char *) ckalloc((unsigned) (strlen(name) + 1));
1263	strcpy(tclExecutableName, name);
1264    }
1265    Tcl_DStringFree(&buffer);
1266    return;
1267
1268    done:
1269    (void) TclFindEncodings(argv0);
1270}
1271
1272/*
1273 *---------------------------------------------------------------------------
1274 *
1275 * LoadEncodingFile --
1276 *
1277 *	Read a file that describes an encoding and create a new Encoding
1278 *	from the data.
1279 *
1280 * Results:
1281 *	The return value is the newly loaded Encoding, or NULL if
1282 *	the file didn't exist of was in the incorrect format.  If NULL was
1283 *	returned, an error message is left in interp's result object,
1284 *	unless interp was NULL.
1285 *
1286 * Side effects:
1287 *	File read from disk.
1288 *
1289 *---------------------------------------------------------------------------
1290 */
1291
1292static Tcl_Encoding
1293LoadEncodingFile(interp, name)
1294    Tcl_Interp *interp;		/* Interp for error reporting, if not NULL. */
1295    CONST char *name;		/* The name of the encoding file on disk
1296				 * and also the name for new encoding. */
1297{
1298    int objc, i, ch;
1299    Tcl_Obj **objv;
1300    Tcl_Obj *pathPtr;
1301    Tcl_Channel chan;
1302    Tcl_Encoding encoding;
1303
1304    pathPtr = TclGetLibraryPath();
1305    if (pathPtr == NULL) {
1306	goto unknown;
1307    }
1308    objc = 0;
1309    Tcl_ListObjGetElements(NULL, pathPtr, &objc, &objv);
1310
1311    chan = NULL;
1312    for (i = 0; i < objc; i++) {
1313	chan = OpenEncodingFile(Tcl_GetString(objv[i]), name);
1314	if (chan != NULL) {
1315	    break;
1316	}
1317    }
1318
1319    if (chan == NULL) {
1320	goto unknown;
1321    }
1322
1323    Tcl_SetChannelOption(NULL, chan, "-encoding", "utf-8");
1324
1325    while (1) {
1326	Tcl_DString ds;
1327
1328	Tcl_DStringInit(&ds);
1329	Tcl_Gets(chan, &ds);
1330	ch = Tcl_DStringValue(&ds)[0];
1331	Tcl_DStringFree(&ds);
1332	if (ch != '#') {
1333	    break;
1334	}
1335    }
1336
1337    encoding = NULL;
1338    switch (ch) {
1339	case 'S': {
1340	    encoding = LoadTableEncoding(interp, name, ENCODING_SINGLEBYTE,
1341		    chan);
1342	    break;
1343	}
1344	case 'D': {
1345	    encoding = LoadTableEncoding(interp, name, ENCODING_DOUBLEBYTE,
1346		    chan);
1347	    break;
1348	}
1349	case 'M': {
1350	    encoding = LoadTableEncoding(interp, name, ENCODING_MULTIBYTE,
1351		    chan);
1352	    break;
1353	}
1354	case 'E': {
1355	    encoding = LoadEscapeEncoding(name, chan);
1356	    break;
1357	}
1358    }
1359    if ((encoding == NULL) && (interp != NULL)) {
1360	Tcl_AppendResult(interp, "invalid encoding file \"", name, "\"", NULL);
1361	if (ch == 'E') {
1362	    Tcl_AppendResult(interp, " or missing sub-encoding", NULL);
1363	}
1364    }
1365    Tcl_Close(NULL, chan);
1366    return encoding;
1367
1368    unknown:
1369    if (interp != NULL) {
1370	Tcl_AppendResult(interp, "unknown encoding \"", name, "\"", NULL);
1371    }
1372    return NULL;
1373}
1374
1375/*
1376 *----------------------------------------------------------------------
1377 *
1378 * OpenEncodingFile --
1379 *
1380 *	Look for the file encoding/<name>.enc in the specified
1381 *	directory.
1382 *
1383 * Results:
1384 *	Returns an open file channel if the file exists.
1385 *
1386 * Side effects:
1387 *	None.
1388 *
1389 *----------------------------------------------------------------------
1390 */
1391
1392static Tcl_Channel
1393OpenEncodingFile(dir, name)
1394    CONST char *dir;
1395    CONST char *name;
1396
1397{
1398    CONST char *argv[3];
1399    Tcl_DString pathString;
1400    CONST char *path;
1401    Tcl_Channel chan;
1402    Tcl_Obj *pathPtr;
1403
1404    argv[0] = dir;
1405    argv[1] = "encoding";
1406    argv[2] = name;
1407
1408    Tcl_DStringInit(&pathString);
1409    Tcl_JoinPath(3, argv, &pathString);
1410    path = Tcl_DStringAppend(&pathString, ".enc", -1);
1411    pathPtr = Tcl_NewStringObj(path,-1);
1412
1413    Tcl_IncrRefCount(pathPtr);
1414    chan = Tcl_FSOpenFileChannel(NULL, pathPtr, "r", 0);
1415    Tcl_DecrRefCount(pathPtr);
1416
1417    Tcl_DStringFree(&pathString);
1418
1419    return chan;
1420}
1421
1422/*
1423 *-------------------------------------------------------------------------
1424 *
1425 * LoadTableEncoding --
1426 *
1427 *	Helper function for LoadEncodingTable().  Loads a table to that
1428 *	converts between Unicode and some other encoding and creates an
1429 *	encoding (using a TableEncoding structure) from that information.
1430 *
1431 *	File contains binary data, but begins with a marker to indicate
1432 *	byte-ordering, so that same binary file can be read on either
1433 *	endian platforms.
1434 *
1435 * Results:
1436 *	The return value is the new encoding, or NULL if the encoding
1437 *	could not be created (because the file contained invalid data).
1438 *
1439 * Side effects:
1440 *	None.
1441 *
1442 *-------------------------------------------------------------------------
1443 */
1444
1445static Tcl_Encoding
1446LoadTableEncoding(interp, name, type, chan)
1447    Tcl_Interp *interp;		/* Interp for temporary obj while reading. */
1448    CONST char *name;		/* Name for new encoding. */
1449    int type;			/* Type of encoding (ENCODING_?????). */
1450    Tcl_Channel chan;		/* File containing new encoding. */
1451{
1452    Tcl_DString lineString;
1453    Tcl_Obj *objPtr;
1454    char *line;
1455    int i, hi, lo, numPages, symbol, fallback;
1456    unsigned char used[256];
1457    unsigned int size;
1458    TableEncodingData *dataPtr;
1459    unsigned short *pageMemPtr;
1460    Tcl_EncodingType encType;
1461
1462    /*
1463     * Speed over memory. Use a full 256 character table to decode hex
1464     * sequences in the encoding files.
1465     */
1466
1467    static char staticHex[] = {
1468      0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*   0 ...  15 */
1469      0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*  16 ...  31 */
1470      0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*  32 ...  47 */
1471      0,  1,  2,  3,  4,  5,  6, 7, 8, 9, 0, 0, 0, 0, 0, 0, /*  48 ...  63 */
1472      0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*  64 ...  79 */
1473      0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*  80 ...  95 */
1474      0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*  96 ... 111 */
1475      0,  1,  2,  3,  4,  5,  6, 7, 8, 9, 0, 0, 0, 0, 0, 0, /* 112 ... 127 */
1476      0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 128 ... 143 */
1477      0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 144 ... 159 */
1478      0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 160 ... 175 */
1479      0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 176 ... 191 */
1480      0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 192 ... 207 */
1481      0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 208 ... 223 */
1482      0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 224 ... 239 */
1483      0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 240 ... 255 */
1484    };
1485
1486    Tcl_DStringInit(&lineString);
1487    Tcl_Gets(chan, &lineString);
1488    line = Tcl_DStringValue(&lineString);
1489
1490    fallback = (int) strtol(line, &line, 16);
1491    symbol = (int) strtol(line, &line, 10);
1492    numPages = (int) strtol(line, &line, 10);
1493    Tcl_DStringFree(&lineString);
1494
1495    if (numPages < 0) {
1496	numPages = 0;
1497    } else if (numPages > 256) {
1498	numPages = 256;
1499    }
1500
1501    memset(used, 0, sizeof(used));
1502
1503#undef PAGESIZE
1504#define PAGESIZE    (256 * sizeof(unsigned short))
1505
1506    dataPtr = (TableEncodingData *) ckalloc(sizeof(TableEncodingData));
1507    memset(dataPtr, 0, sizeof(TableEncodingData));
1508
1509    dataPtr->fallback = fallback;
1510
1511    /*
1512     * Read the table that maps characters to Unicode.  Performs a single
1513     * malloc to get the memory for the array and all the pages needed by
1514     * the array.
1515     */
1516
1517    size = 256 * sizeof(unsigned short *) + numPages * PAGESIZE;
1518    dataPtr->toUnicode = (unsigned short **) ckalloc(size);
1519    memset(dataPtr->toUnicode, 0, size);
1520    pageMemPtr = (unsigned short *) (dataPtr->toUnicode + 256);
1521
1522    if (interp == NULL) {
1523	objPtr = Tcl_NewObj();
1524    } else {
1525	objPtr = Tcl_GetObjResult(interp);
1526    }
1527    for (i = 0; i < numPages; i++) {
1528	int ch;
1529	char *p;
1530
1531	Tcl_ReadChars(chan, objPtr, 3 + 16 * (16 * 4 + 1), 0);
1532	p = Tcl_GetString(objPtr);
1533	hi = (staticHex[(unsigned int)p[0]] << 4) + staticHex[(unsigned int)p[1]];
1534	dataPtr->toUnicode[hi] = pageMemPtr;
1535	p += 2;
1536	for (lo = 0; lo < 256; lo++) {
1537	    if ((lo & 0x0f) == 0) {
1538		p++;
1539	    }
1540	    ch = (staticHex[(unsigned int)p[0]] << 12) + (staticHex[(unsigned int)p[1]] << 8)
1541		+ (staticHex[(unsigned int)p[2]] << 4) + staticHex[(unsigned int)p[3]];
1542	    if (ch != 0) {
1543		used[ch >> 8] = 1;
1544	    }
1545	    *pageMemPtr = (unsigned short) ch;
1546	    pageMemPtr++;
1547	    p += 4;
1548	}
1549    }
1550    if (interp == NULL) {
1551	Tcl_DecrRefCount(objPtr);
1552    } else {
1553	Tcl_ResetResult(interp);
1554    }
1555
1556    if (type == ENCODING_DOUBLEBYTE) {
1557	memset(dataPtr->prefixBytes, 1, sizeof(dataPtr->prefixBytes));
1558    } else {
1559	for (hi = 1; hi < 256; hi++) {
1560	    if (dataPtr->toUnicode[hi] != NULL) {
1561		dataPtr->prefixBytes[hi] = 1;
1562	    }
1563	}
1564    }
1565
1566    /*
1567     * Invert toUnicode array to produce the fromUnicode array.  Performs a
1568     * single malloc to get the memory for the array and all the pages
1569     * needed by the array.  While reading in the toUnicode array, we
1570     * remembered what pages that would be needed for the fromUnicode array.
1571     */
1572
1573    if (symbol) {
1574	used[0] = 1;
1575    }
1576    numPages = 0;
1577    for (hi = 0; hi < 256; hi++) {
1578	if (used[hi]) {
1579	    numPages++;
1580	}
1581    }
1582    size = 256 * sizeof(unsigned short *) + numPages * PAGESIZE;
1583    dataPtr->fromUnicode = (unsigned short **) ckalloc(size);
1584    memset(dataPtr->fromUnicode, 0, size);
1585    pageMemPtr = (unsigned short *) (dataPtr->fromUnicode + 256);
1586
1587    for (hi = 0; hi < 256; hi++) {
1588	if (dataPtr->toUnicode[hi] == NULL) {
1589	    dataPtr->toUnicode[hi] = emptyPage;
1590	} else {
1591	    for (lo = 0; lo < 256; lo++) {
1592		int ch;
1593
1594		ch = dataPtr->toUnicode[hi][lo];
1595		if (ch != 0) {
1596		    unsigned short *page;
1597
1598		    page = dataPtr->fromUnicode[ch >> 8];
1599		    if (page == NULL) {
1600			page = pageMemPtr;
1601			pageMemPtr += 256;
1602			dataPtr->fromUnicode[ch >> 8] = page;
1603		    }
1604		    page[ch & 0xff] = (unsigned short) ((hi << 8) + lo);
1605		}
1606	    }
1607	}
1608    }
1609    if (type == ENCODING_MULTIBYTE) {
1610	/*
1611	 * If multibyte encodings don't have a backslash character, define
1612	 * one.  Otherwise, on Windows, native file names won't work because
1613	 * the backslash in the file name will map to the unknown character
1614	 * (question mark) when converting from UTF-8 to external encoding.
1615	 */
1616
1617	if (dataPtr->fromUnicode[0] != NULL) {
1618	    if (dataPtr->fromUnicode[0]['\\'] == '\0') {
1619		dataPtr->fromUnicode[0]['\\'] = '\\';
1620	    }
1621	}
1622    }
1623    if (symbol) {
1624	unsigned short *page;
1625
1626	/*
1627	 * Make a special symbol encoding that not only maps the symbol
1628	 * characters from their Unicode code points down into page 0, but
1629	 * also ensure that the characters on page 0 map to themselves.
1630	 * This is so that a symbol font can be used to display a simple
1631	 * string like "abcd" and have alpha, beta, chi, delta show up,
1632	 * rather than have "unknown" chars show up because strictly
1633	 * speaking the symbol font doesn't have glyphs for those low ascii
1634	 * chars.
1635	 */
1636
1637	page = dataPtr->fromUnicode[0];
1638	if (page == NULL) {
1639	    page = pageMemPtr;
1640	    dataPtr->fromUnicode[0] = page;
1641	}
1642	for (lo = 0; lo < 256; lo++) {
1643	    if (dataPtr->toUnicode[0][lo] != 0) {
1644		page[lo] = (unsigned short) lo;
1645	    }
1646	}
1647    }
1648    for (hi = 0; hi < 256; hi++) {
1649	if (dataPtr->fromUnicode[hi] == NULL) {
1650	    dataPtr->fromUnicode[hi] = emptyPage;
1651	}
1652    }
1653    /*
1654     * For trailing 'R'everse encoding, see [Patch #689341]
1655     */
1656    Tcl_DStringInit(&lineString);
1657    do {
1658	int len;
1659	/* skip leading empty lines */
1660	while ((len = Tcl_Gets(chan, &lineString)) == 0)
1661	    ;
1662	if (len < 0) {
1663	    break;
1664	}
1665	line = Tcl_DStringValue(&lineString);
1666	if (line[0] != 'R') {
1667	    break;
1668	}
1669	for (Tcl_DStringSetLength(&lineString, 0);
1670	     (len = Tcl_Gets(chan, &lineString)) >= 0;
1671	     Tcl_DStringSetLength(&lineString, 0)) {
1672	    unsigned char* p;
1673	    int to, from;
1674	    if (len < 5) {
1675		continue;
1676	    }
1677	    p = (unsigned char*) Tcl_DStringValue(&lineString);
1678	    to = (staticHex[p[0]] << 12) + (staticHex[p[1]] << 8)
1679		+ (staticHex[p[2]] << 4) + staticHex[p[3]];
1680	    if (to == 0) {
1681	    	continue;
1682	    }
1683	    for (p += 5, len -= 5; len >= 0 && *p; p += 5, len -= 5) {
1684		from = (staticHex[p[0]] << 12) + (staticHex[p[1]] << 8)
1685			+ (staticHex[p[2]] << 4) + staticHex[p[3]];
1686	    	if (from == 0) {
1687		    continue;
1688		}
1689		dataPtr->fromUnicode[from >> 8][from & 0xff] = to;
1690	    }
1691	}
1692    } while (0);
1693    Tcl_DStringFree(&lineString);
1694
1695    encType.encodingName    = name;
1696    encType.toUtfProc	    = TableToUtfProc;
1697    encType.fromUtfProc	    = TableFromUtfProc;
1698    encType.freeProc	    = TableFreeProc;
1699    encType.nullSize	    = (type == ENCODING_DOUBLEBYTE) ? 2 : 1;
1700    encType.clientData	    = (ClientData) dataPtr;
1701    return Tcl_CreateEncoding(&encType);
1702}
1703
1704/*
1705 *-------------------------------------------------------------------------
1706 *
1707 * LoadEscapeEncoding --
1708 *
1709 *	Helper function for LoadEncodingTable().  Loads a state machine
1710 *	that converts between Unicode and some other encoding.
1711 *
1712 *	File contains text data that describes the escape sequences that
1713 *	are used to choose an encoding and the associated names for the
1714 *	sub-encodings.
1715 *
1716 * Results:
1717 *	The return value is the new encoding, or NULL if the encoding
1718 *	could not be created (because the file contained invalid data).
1719 *
1720 * Side effects:
1721 *	None.
1722 *
1723 *-------------------------------------------------------------------------
1724 */
1725
1726static Tcl_Encoding
1727LoadEscapeEncoding(name, chan)
1728    CONST char *name;		/* Name for new encoding. */
1729    Tcl_Channel chan;		/* File containing new encoding. */
1730{
1731    int i, missingSubEncoding = 0;
1732    unsigned int size;
1733    Tcl_DString escapeData;
1734    char init[16], final[16];
1735    EscapeEncodingData *dataPtr;
1736    Tcl_EncodingType type;
1737
1738    init[0] = '\0';
1739    final[0] = '\0';
1740    Tcl_DStringInit(&escapeData);
1741
1742    while (1) {
1743	int argc;
1744	CONST char **argv;
1745	char *line;
1746	Tcl_DString lineString;
1747
1748	Tcl_DStringInit(&lineString);
1749	if (Tcl_Gets(chan, &lineString) < 0) {
1750	    break;
1751	}
1752	line = Tcl_DStringValue(&lineString);
1753        if (Tcl_SplitList(NULL, line, &argc, &argv) != TCL_OK) {
1754	    continue;
1755	}
1756	if (argc >= 2) {
1757	    if (strcmp(argv[0], "name") == 0) {
1758		;
1759	    } else if (strcmp(argv[0], "init") == 0) {
1760		strncpy(init, argv[1], sizeof(init));
1761		init[sizeof(init) - 1] = '\0';
1762	    } else if (strcmp(argv[0], "final") == 0) {
1763		strncpy(final, argv[1], sizeof(final));
1764		final[sizeof(final) - 1] = '\0';
1765	    } else {
1766		EscapeSubTable est;
1767
1768		strncpy(est.sequence, argv[1], sizeof(est.sequence));
1769		est.sequence[sizeof(est.sequence) - 1] = '\0';
1770		est.sequenceLen = strlen(est.sequence);
1771
1772		strncpy(est.name, argv[0], sizeof(est.name));
1773		est.name[sizeof(est.name) - 1] = '\0';
1774
1775		/*
1776		 * Load the subencodings first so we're never stuck
1777		 * trying to use a half-loaded system encoding to
1778		 * open/read a *.enc file.
1779		 */
1780
1781		est.encodingPtr = (Encoding *) Tcl_GetEncoding(NULL, est.name);
1782		if ((est.encodingPtr == NULL)
1783			|| (est.encodingPtr->toUtfProc != TableToUtfProc)) {
1784		    missingSubEncoding = 1;
1785		}
1786		Tcl_DStringAppend(&escapeData, (char *) &est, sizeof(est));
1787	    }
1788	}
1789	ckfree((char *) argv);
1790	Tcl_DStringFree(&lineString);
1791    }
1792    if (missingSubEncoding) {
1793	Tcl_DStringFree(&escapeData);
1794	return NULL;
1795    }
1796
1797    size = sizeof(EscapeEncodingData)
1798	    - sizeof(EscapeSubTable) + Tcl_DStringLength(&escapeData);
1799    dataPtr = (EscapeEncodingData *) ckalloc(size);
1800    dataPtr->initLen = strlen(init);
1801    strcpy(dataPtr->init, init);
1802    dataPtr->finalLen = strlen(final);
1803    strcpy(dataPtr->final, final);
1804    dataPtr->numSubTables = Tcl_DStringLength(&escapeData) / sizeof(EscapeSubTable);
1805    memcpy((VOID *) dataPtr->subTables, (VOID *) Tcl_DStringValue(&escapeData),
1806	    (size_t) Tcl_DStringLength(&escapeData));
1807    Tcl_DStringFree(&escapeData);
1808
1809    memset(dataPtr->prefixBytes, 0, sizeof(dataPtr->prefixBytes));
1810    for (i = 0; i < dataPtr->numSubTables; i++) {
1811	dataPtr->prefixBytes[UCHAR(dataPtr->subTables[i].sequence[0])] = 1;
1812    }
1813    if (dataPtr->init[0] != '\0') {
1814	dataPtr->prefixBytes[UCHAR(dataPtr->init[0])] = 1;
1815    }
1816    if (dataPtr->final[0] != '\0') {
1817	dataPtr->prefixBytes[UCHAR(dataPtr->final[0])] = 1;
1818    }
1819
1820    type.encodingName	= name;
1821    type.toUtfProc	= EscapeToUtfProc;
1822    type.fromUtfProc    = EscapeFromUtfProc;
1823    type.freeProc	= EscapeFreeProc;
1824    type.nullSize	= 1;
1825    type.clientData	= (ClientData) dataPtr;
1826
1827    return Tcl_CreateEncoding(&type);
1828}
1829
1830/*
1831 *-------------------------------------------------------------------------
1832 *
1833 * BinaryProc --
1834 *
1835 *	The default conversion when no other conversion is specified.
1836 *	No translation is done; source bytes are copied directly to
1837 *	destination bytes.
1838 *
1839 * Results:
1840 *	Returns TCL_OK if conversion was successful.
1841 *
1842 * Side effects:
1843 *	None.
1844 *
1845 *-------------------------------------------------------------------------
1846 */
1847
1848static int
1849BinaryProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
1850	srcReadPtr, dstWrotePtr, dstCharsPtr)
1851    ClientData clientData;	/* Not used. */
1852    CONST char *src;		/* Source string (unknown encoding). */
1853    int srcLen;			/* Source string length in bytes. */
1854    int flags;			/* Conversion control flags. */
1855    Tcl_EncodingState *statePtr;/* Place for conversion routine to store
1856				 * state information used during a piecewise
1857				 * conversion.  Contents of statePtr are
1858				 * initialized and/or reset by conversion
1859				 * routine under control of flags argument. */
1860    char *dst;			/* Output buffer in which converted string
1861				 * is stored. */
1862    int dstLen;			/* The maximum length of output buffer in
1863				 * bytes. */
1864    int *srcReadPtr;		/* Filled with the number of bytes from the
1865				 * source string that were converted. */
1866    int *dstWrotePtr;		/* Filled with the number of bytes that were
1867				 * stored in the output buffer as a result of
1868				 * the conversion. */
1869    int *dstCharsPtr;		/* Filled with the number of characters that
1870				 * correspond to the bytes stored in the
1871				 * output buffer. */
1872{
1873    int result;
1874
1875    result = TCL_OK;
1876    dstLen -= TCL_UTF_MAX - 1;
1877    if (dstLen < 0) {
1878	dstLen = 0;
1879    }
1880    if (srcLen > dstLen) {
1881	srcLen = dstLen;
1882	result = TCL_CONVERT_NOSPACE;
1883    }
1884
1885    *srcReadPtr = srcLen;
1886    *dstWrotePtr = srcLen;
1887    *dstCharsPtr = srcLen;
1888    memcpy((void *) dst, (void *) src, (size_t) srcLen);
1889    return result;
1890}
1891
1892
1893/*
1894 *-------------------------------------------------------------------------
1895 *
1896 * UtfExtToUtfIntProc --
1897 *
1898 *	Convert from UTF-8 to UTF-8. While converting null-bytes from
1899 *	the Tcl's internal representation (0xc0, 0x80) to the official
1900 *	representation (0x00). See UtfToUtfProc for details.
1901 *
1902 * Results:
1903 *	Returns TCL_OK if conversion was successful.
1904 *
1905 * Side effects:
1906 *	None.
1907 *
1908 *-------------------------------------------------------------------------
1909 */
1910static int
1911UtfIntToUtfExtProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
1912	     srcReadPtr, dstWrotePtr, dstCharsPtr)
1913    ClientData clientData;	/* Not used. */
1914    CONST char *src;		/* Source string in UTF-8. */
1915    int srcLen;			/* Source string length in bytes. */
1916    int flags;			/* Conversion control flags. */
1917    Tcl_EncodingState *statePtr;/* Place for conversion routine to store
1918				 * state information used during a piecewise
1919				 * conversion.  Contents of statePtr are
1920				 * initialized and/or reset by conversion
1921				 * routine under control of flags argument. */
1922    char *dst;			/* Output buffer in which converted string
1923				 * is stored. */
1924    int dstLen;			/* The maximum length of output buffer in
1925				 * bytes. */
1926    int *srcReadPtr;		/* Filled with the number of bytes from the
1927				 * source string that were converted.  This
1928				 * may be less than the original source length
1929				 * if there was a problem converting some
1930				 * source characters. */
1931    int *dstWrotePtr;		/* Filled with the number of bytes that were
1932				 * stored in the output buffer as a result of
1933				 * the conversion. */
1934    int *dstCharsPtr;		/* Filled with the number of characters that
1935				 * correspond to the bytes stored in the
1936				 * output buffer. */
1937{
1938    return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
1939			srcReadPtr, dstWrotePtr, dstCharsPtr, 1);
1940}
1941
1942/*
1943 *-------------------------------------------------------------------------
1944 *
1945 * UtfExtToUtfIntProc --
1946 *
1947 *	Convert from UTF-8 to UTF-8 while converting null-bytes from
1948 *	the official representation (0x00) to Tcl's internal
1949 *	representation (0xc0, 0x80). See UtfToUtfProc for details.
1950 *
1951 * Results:
1952 *	Returns TCL_OK if conversion was successful.
1953 *
1954 * Side effects:
1955 *	None.
1956 *
1957 *-------------------------------------------------------------------------
1958 */
1959static int
1960UtfExtToUtfIntProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
1961	     srcReadPtr, dstWrotePtr, dstCharsPtr)
1962    ClientData clientData;	/* Not used. */
1963    CONST char *src;		/* Source string in UTF-8. */
1964    int srcLen;			/* Source string length in bytes. */
1965    int flags;			/* Conversion control flags. */
1966    Tcl_EncodingState *statePtr;/* Place for conversion routine to store
1967				 * state information used during a piecewise
1968				 * conversion.  Contents of statePtr are
1969				 * initialized and/or reset by conversion
1970				 * routine under control of flags argument. */
1971    char *dst;			/* Output buffer in which converted string
1972				 * is stored. */
1973    int dstLen;			/* The maximum length of output buffer in
1974				 * bytes. */
1975    int *srcReadPtr;		/* Filled with the number of bytes from the
1976				 * source string that were converted.  This
1977				 * may be less than the original source length
1978				 * if there was a problem converting some
1979				 * source characters. */
1980    int *dstWrotePtr;		/* Filled with the number of bytes that were
1981				 * stored in the output buffer as a result of
1982				 * the conversion. */
1983    int *dstCharsPtr;		/* Filled with the number of characters that
1984				 * correspond to the bytes stored in the
1985				 * output buffer. */
1986{
1987    return UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
1988			srcReadPtr, dstWrotePtr, dstCharsPtr, 0);
1989}
1990
1991/*
1992 *-------------------------------------------------------------------------
1993 *
1994 * UtfToUtfProc --
1995 *
1996 *	Convert from UTF-8 to UTF-8.  Note that the UTF-8 to UTF-8
1997 *	translation is not a no-op, because it will turn a stream of
1998 *	improperly formed UTF-8 into a properly formed stream.
1999 *
2000 * Results:
2001 *	Returns TCL_OK if conversion was successful.
2002 *
2003 * Side effects:
2004 *	None.
2005 *
2006 *-------------------------------------------------------------------------
2007 */
2008
2009static int
2010UtfToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
2011	     srcReadPtr, dstWrotePtr, dstCharsPtr, pureNullMode)
2012    ClientData clientData;	/* Not used. */
2013    CONST char *src;		/* Source string in UTF-8. */
2014    int srcLen;			/* Source string length in bytes. */
2015    int flags;			/* Conversion control flags. */
2016    Tcl_EncodingState *statePtr;/* Place for conversion routine to store
2017				 * state information used during a piecewise
2018				 * conversion.  Contents of statePtr are
2019				 * initialized and/or reset by conversion
2020				 * routine under control of flags argument. */
2021    char *dst;			/* Output buffer in which converted string
2022				 * is stored. */
2023    int dstLen;			/* The maximum length of output buffer in
2024				 * bytes. */
2025    int *srcReadPtr;		/* Filled with the number of bytes from the
2026				 * source string that were converted.  This
2027				 * may be less than the original source length
2028				 * if there was a problem converting some
2029				 * source characters. */
2030    int *dstWrotePtr;		/* Filled with the number of bytes that were
2031				 * stored in the output buffer as a result of
2032				 * the conversion. */
2033    int *dstCharsPtr;		/* Filled with the number of characters that
2034				 * correspond to the bytes stored in the
2035				 * output buffer. */
2036    int pureNullMode;		/* Convert embedded nulls from
2037				 * internal representation to real
2038				 * null-bytes or vice versa */
2039
2040{
2041    CONST char *srcStart, *srcEnd, *srcClose;
2042    char *dstStart, *dstEnd;
2043    int result, numChars;
2044    Tcl_UniChar ch;
2045
2046    result = TCL_OK;
2047
2048    srcStart = src;
2049    srcEnd = src + srcLen;
2050    srcClose = srcEnd;
2051    if ((flags & TCL_ENCODING_END) == 0) {
2052	srcClose -= TCL_UTF_MAX;
2053    }
2054
2055    dstStart = dst;
2056    dstEnd = dst + dstLen - TCL_UTF_MAX;
2057
2058    for (numChars = 0; src < srcEnd; numChars++) {
2059	if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
2060	    /*
2061	     * If there is more string to follow, this will ensure that the
2062	     * last UTF-8 character in the source buffer hasn't been cut off.
2063	     */
2064
2065	    result = TCL_CONVERT_MULTIBYTE;
2066	    break;
2067	}
2068	if (dst > dstEnd) {
2069	    result = TCL_CONVERT_NOSPACE;
2070	    break;
2071	}
2072	if (UCHAR(*src) < 0x80 &&
2073	    !(UCHAR(*src) == 0 && pureNullMode == 0)) {
2074	    /*
2075	     * Copy 7bit chatacters, but skip null-bytes when we are
2076	     * in input mode, so that they get converted to 0xc080.
2077	     */
2078	    *dst++ = *src++;
2079	} else if (pureNullMode == 1 &&
2080		   UCHAR(*src) == 0xc0 &&
2081		   UCHAR(*(src+1)) == 0x80) {
2082	    /*
2083	     * Convert 0xc080 to real nulls when we are in output mode.
2084	     */
2085	    *dst++ = 0;
2086	    src += 2;
2087	} else if (!Tcl_UtfCharComplete(src, srcEnd - src)) {
2088	    /* Always check before using Tcl_UtfToUniChar. Not doing
2089	     * can so cause it run beyond the endof the buffer!  If we
2090	     * happen such an incomplete char its bytes are made to
2091	     * represent themselves.
2092	     */
2093
2094	    ch = (unsigned char) *src;
2095	    src += 1;
2096	    dst += Tcl_UniCharToUtf(ch, dst);
2097	} else {
2098	    src += Tcl_UtfToUniChar(src, &ch);
2099	    dst += Tcl_UniCharToUtf(ch, dst);
2100	}
2101    }
2102
2103    *srcReadPtr  = src - srcStart;
2104    *dstWrotePtr = dst - dstStart;
2105    *dstCharsPtr = numChars;
2106    return result;
2107}
2108
2109/*
2110 *-------------------------------------------------------------------------
2111 *
2112 * UnicodeToUtfProc --
2113 *
2114 *	Convert from Unicode to UTF-8.
2115 *
2116 * Results:
2117 *	Returns TCL_OK if conversion was successful.
2118 *
2119 * Side effects:
2120 *	None.
2121 *
2122 *-------------------------------------------------------------------------
2123 */
2124
2125static int
2126UnicodeToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
2127	srcReadPtr, dstWrotePtr, dstCharsPtr)
2128    ClientData clientData;	/* Not used. */
2129    CONST char *src;		/* Source string in Unicode. */
2130    int srcLen;			/* Source string length in bytes. */
2131    int flags;			/* Conversion control flags. */
2132    Tcl_EncodingState *statePtr;/* Place for conversion routine to store
2133				 * state information used during a piecewise
2134				 * conversion.  Contents of statePtr are
2135				 * initialized and/or reset by conversion
2136				 * routine under control of flags argument. */
2137    char *dst;			/* Output buffer in which converted string
2138				 * is stored. */
2139    int dstLen;			/* The maximum length of output buffer in
2140				 * bytes. */
2141    int *srcReadPtr;		/* Filled with the number of bytes from the
2142				 * source string that were converted.  This
2143				 * may be less than the original source length
2144				 * if there was a problem converting some
2145				 * source characters. */
2146    int *dstWrotePtr;		/* Filled with the number of bytes that were
2147				 * stored in the output buffer as a result of
2148				 * the conversion. */
2149    int *dstCharsPtr;		/* Filled with the number of characters that
2150				 * correspond to the bytes stored in the
2151				 * output buffer. */
2152{
2153    CONST char *srcStart, *srcEnd;
2154    char *dstEnd, *dstStart;
2155    int result, numChars;
2156    Tcl_UniChar ch;
2157
2158    result = TCL_OK;
2159    if ((srcLen % sizeof(Tcl_UniChar)) != 0) {
2160	result = TCL_CONVERT_MULTIBYTE;
2161	srcLen /= sizeof(Tcl_UniChar);
2162	srcLen *= sizeof(Tcl_UniChar);
2163    }
2164
2165    srcStart = src;
2166    srcEnd = src + srcLen;
2167
2168    dstStart = dst;
2169    dstEnd = dst + dstLen - TCL_UTF_MAX;
2170
2171    for (numChars = 0; src < srcEnd; numChars++) {
2172	if (dst > dstEnd) {
2173	    result = TCL_CONVERT_NOSPACE;
2174	    break;
2175	}
2176	/*
2177	 * Special case for 1-byte utf chars for speed.  Make sure we
2178	 * work with Tcl_UniChar-size data.
2179	 */
2180	ch = *(Tcl_UniChar *)src;
2181	if (ch && ch < 0x80) {
2182	    *dst++ = (ch & 0xFF);
2183	} else {
2184	    dst += Tcl_UniCharToUtf(ch, dst);
2185	}
2186	src += sizeof(Tcl_UniChar);
2187    }
2188
2189    *srcReadPtr = src - srcStart;
2190    *dstWrotePtr = dst - dstStart;
2191    *dstCharsPtr = numChars;
2192    return result;
2193}
2194
2195/*
2196 *-------------------------------------------------------------------------
2197 *
2198 * UtfToUnicodeProc --
2199 *
2200 *	Convert from UTF-8 to Unicode.
2201 *
2202 * Results:
2203 *	Returns TCL_OK if conversion was successful.
2204 *
2205 * Side effects:
2206 *	None.
2207 *
2208 *-------------------------------------------------------------------------
2209 */
2210
2211static int
2212UtfToUnicodeProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
2213	srcReadPtr, dstWrotePtr, dstCharsPtr)
2214    ClientData clientData;	/* TableEncodingData that specifies encoding. */
2215    CONST char *src;		/* Source string in UTF-8. */
2216    int srcLen;			/* Source string length in bytes. */
2217    int flags;			/* Conversion control flags. */
2218    Tcl_EncodingState *statePtr;/* Place for conversion routine to store
2219				 * state information used during a piecewise
2220				 * conversion.  Contents of statePtr are
2221				 * initialized and/or reset by conversion
2222				 * routine under control of flags argument. */
2223    char *dst;			/* Output buffer in which converted string
2224				 * is stored. */
2225    int dstLen;			/* The maximum length of output buffer in
2226				 * bytes. */
2227    int *srcReadPtr;		/* Filled with the number of bytes from the
2228				 * source string that were converted.  This
2229				 * may be less than the original source length
2230				 * if there was a problem converting some
2231				 * source characters. */
2232    int *dstWrotePtr;		/* Filled with the number of bytes that were
2233				 * stored in the output buffer as a result of
2234				 * the conversion. */
2235    int *dstCharsPtr;		/* Filled with the number of characters that
2236				 * correspond to the bytes stored in the
2237				 * output buffer. */
2238{
2239    CONST char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd;
2240    int result, numChars;
2241    Tcl_UniChar ch;
2242
2243    srcStart = src;
2244    srcEnd = src + srcLen;
2245    srcClose = srcEnd;
2246    if ((flags & TCL_ENCODING_END) == 0) {
2247	srcClose -= TCL_UTF_MAX;
2248    }
2249
2250    dstStart = dst;
2251    dstEnd   = dst + dstLen - sizeof(Tcl_UniChar);
2252
2253    result = TCL_OK;
2254    for (numChars = 0; src < srcEnd; numChars++) {
2255	if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
2256	    /*
2257	     * If there is more string to follow, this will ensure that the
2258	     * last UTF-8 character in the source buffer hasn't been cut off.
2259	     */
2260
2261	    result = TCL_CONVERT_MULTIBYTE;
2262	    break;
2263	}
2264	if (dst > dstEnd) {
2265	    result = TCL_CONVERT_NOSPACE;
2266	    break;
2267        }
2268	src += TclUtfToUniChar(src, &ch);
2269	/*
2270	 * Need to handle this in a way that won't cause misalignment
2271	 * by casting dst to a Tcl_UniChar. [Bug 1122671]
2272	 * XXX: This hard-codes the assumed size of Tcl_UniChar as 2.
2273	 */
2274#ifdef WORDS_BIGENDIAN
2275	*dst++ = (ch >> 8);
2276	*dst++ = (ch & 0xFF);
2277#else
2278	*dst++ = (ch & 0xFF);
2279	*dst++ = (ch >> 8);
2280#endif
2281    }
2282    *srcReadPtr = src - srcStart;
2283    *dstWrotePtr = dst - dstStart;
2284    *dstCharsPtr = numChars;
2285    return result;
2286}
2287
2288/*
2289 *-------------------------------------------------------------------------
2290 *
2291 * TableToUtfProc --
2292 *
2293 *	Convert from the encoding specified by the TableEncodingData into
2294 *	UTF-8.
2295 *
2296 * Results:
2297 *	Returns TCL_OK if conversion was successful.
2298 *
2299 * Side effects:
2300 *	None.
2301 *
2302 *-------------------------------------------------------------------------
2303 */
2304
2305static int
2306TableToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
2307	srcReadPtr, dstWrotePtr, dstCharsPtr)
2308    ClientData clientData;	/* TableEncodingData that specifies
2309				 * encoding. */
2310    CONST char *src;		/* Source string in specified encoding. */
2311    int srcLen;			/* Source string length in bytes. */
2312    int flags;			/* Conversion control flags. */
2313    Tcl_EncodingState *statePtr;/* Place for conversion routine to store
2314				 * state information used during a piecewise
2315				 * conversion.  Contents of statePtr are
2316				 * initialized and/or reset by conversion
2317				 * routine under control of flags argument. */
2318    char *dst;			/* Output buffer in which converted string
2319				 * is stored. */
2320    int dstLen;			/* The maximum length of output buffer in
2321				 * bytes. */
2322    int *srcReadPtr;		/* Filled with the number of bytes from the
2323				 * source string that were converted.  This
2324				 * may be less than the original source length
2325				 * if there was a problem converting some
2326				 * source characters. */
2327    int *dstWrotePtr;		/* Filled with the number of bytes that were
2328				 * stored in the output buffer as a result of
2329				 * the conversion. */
2330    int *dstCharsPtr;		/* Filled with the number of characters that
2331				 * correspond to the bytes stored in the
2332				 * output buffer. */
2333{
2334    CONST char *srcStart, *srcEnd;
2335    char *dstEnd, *dstStart, *prefixBytes;
2336    int result, byte, numChars;
2337    Tcl_UniChar ch;
2338    unsigned short **toUnicode;
2339    unsigned short *pageZero;
2340    TableEncodingData *dataPtr;
2341
2342    srcStart = src;
2343    srcEnd = src + srcLen;
2344
2345    dstStart = dst;
2346    dstEnd = dst + dstLen - TCL_UTF_MAX;
2347
2348    dataPtr = (TableEncodingData *) clientData;
2349    toUnicode = dataPtr->toUnicode;
2350    prefixBytes = dataPtr->prefixBytes;
2351    pageZero = toUnicode[0];
2352
2353    result = TCL_OK;
2354    for (numChars = 0; src < srcEnd; numChars++) {
2355        if (dst > dstEnd) {
2356            result = TCL_CONVERT_NOSPACE;
2357            break;
2358        }
2359	byte = *((unsigned char *) src);
2360	if (prefixBytes[byte]) {
2361	    src++;
2362	    if (src >= srcEnd) {
2363		src--;
2364		result = TCL_CONVERT_MULTIBYTE;
2365		break;
2366	    }
2367	    ch = toUnicode[byte][*((unsigned char *) src)];
2368	} else {
2369	    ch = pageZero[byte];
2370	}
2371	if ((ch == 0) && (byte != 0)) {
2372	    if (flags & TCL_ENCODING_STOPONERROR) {
2373		result = TCL_CONVERT_SYNTAX;
2374		break;
2375	    }
2376	    if (prefixBytes[byte]) {
2377		src--;
2378	    }
2379	    ch = (Tcl_UniChar) byte;
2380	}
2381	/*
2382	 * Special case for 1-byte utf chars for speed.
2383	 */
2384	if (ch && ch < 0x80) {
2385	    *dst++ = (char) ch;
2386	} else {
2387	    dst += Tcl_UniCharToUtf(ch, dst);
2388	}
2389        src++;
2390    }
2391    *srcReadPtr = src - srcStart;
2392    *dstWrotePtr = dst - dstStart;
2393    *dstCharsPtr = numChars;
2394    return result;
2395}
2396
2397/*
2398 *-------------------------------------------------------------------------
2399 *
2400 * TableFromUtfProc --
2401 *
2402 *	Convert from UTF-8 into the encoding specified by the
2403 *	TableEncodingData.
2404 *
2405 * Results:
2406 *	Returns TCL_OK if conversion was successful.
2407 *
2408 * Side effects:
2409 *	None.
2410 *
2411 *-------------------------------------------------------------------------
2412 */
2413
2414static int
2415TableFromUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
2416	srcReadPtr, dstWrotePtr, dstCharsPtr)
2417    ClientData clientData;	/* TableEncodingData that specifies
2418				 * encoding. */
2419    CONST char *src;		/* Source string in UTF-8. */
2420    int srcLen;			/* Source string length in bytes. */
2421    int flags;			/* Conversion control flags. */
2422    Tcl_EncodingState *statePtr;/* Place for conversion routine to store
2423				 * state information used during a piecewise
2424				 * conversion.  Contents of statePtr are
2425				 * initialized and/or reset by conversion
2426				 * routine under control of flags argument. */
2427    char *dst;			/* Output buffer in which converted string
2428				 * is stored. */
2429    int dstLen;			/* The maximum length of output buffer in
2430				 * bytes. */
2431    int *srcReadPtr;		/* Filled with the number of bytes from the
2432				 * source string that were converted.  This
2433				 * may be less than the original source length
2434				 * if there was a problem converting some
2435				 * source characters. */
2436    int *dstWrotePtr;		/* Filled with the number of bytes that were
2437				 * stored in the output buffer as a result of
2438				 * the conversion. */
2439    int *dstCharsPtr;		/* Filled with the number of characters that
2440				 * correspond to the bytes stored in the
2441				 * output buffer. */
2442{
2443    CONST char *srcStart, *srcEnd, *srcClose;
2444    char *dstStart, *dstEnd, *prefixBytes;
2445    Tcl_UniChar ch;
2446    int result, len, word, numChars;
2447    TableEncodingData *dataPtr;
2448    unsigned short **fromUnicode;
2449
2450    result = TCL_OK;
2451
2452    dataPtr = (TableEncodingData *) clientData;
2453    prefixBytes = dataPtr->prefixBytes;
2454    fromUnicode = dataPtr->fromUnicode;
2455
2456    srcStart = src;
2457    srcEnd = src + srcLen;
2458    srcClose = srcEnd;
2459    if ((flags & TCL_ENCODING_END) == 0) {
2460	srcClose -= TCL_UTF_MAX;
2461    }
2462
2463    dstStart = dst;
2464    dstEnd = dst + dstLen - 1;
2465
2466    for (numChars = 0; src < srcEnd; numChars++) {
2467	if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
2468	    /*
2469	     * If there is more string to follow, this will ensure that the
2470	     * last UTF-8 character in the source buffer hasn't been cut off.
2471	     */
2472
2473	    result = TCL_CONVERT_MULTIBYTE;
2474	    break;
2475	}
2476	len = TclUtfToUniChar(src, &ch);
2477
2478#if TCL_UTF_MAX > 3
2479	/*
2480	 * This prevents a crash condition.  More evaluation is required
2481	 * for full support of int Tcl_UniChar. [Bug 1004065]
2482	 */
2483	if (ch & 0xffff0000) {
2484	    word = 0;
2485	} else
2486#endif
2487	    word = fromUnicode[(ch >> 8)][ch & 0xff];
2488
2489	if ((word == 0) && (ch != 0)) {
2490	    if (flags & TCL_ENCODING_STOPONERROR) {
2491		result = TCL_CONVERT_UNKNOWN;
2492		break;
2493	    }
2494	    word = dataPtr->fallback;
2495	}
2496	if (prefixBytes[(word >> 8)] != 0) {
2497	    if (dst + 1 > dstEnd) {
2498		result = TCL_CONVERT_NOSPACE;
2499		break;
2500	    }
2501	    dst[0] = (char) (word >> 8);
2502	    dst[1] = (char) word;
2503	    dst += 2;
2504	} else {
2505	    if (dst > dstEnd) {
2506		result = TCL_CONVERT_NOSPACE;
2507		break;
2508	    }
2509	    dst[0] = (char) word;
2510	    dst++;
2511	}
2512	src += len;
2513    }
2514    *srcReadPtr = src - srcStart;
2515    *dstWrotePtr = dst - dstStart;
2516    *dstCharsPtr = numChars;
2517    return result;
2518}
2519
2520/*
2521 *---------------------------------------------------------------------------
2522 *
2523 * TableFreeProc --
2524 *
2525 *	This procedure is invoked when an encoding is deleted.  It deletes
2526 *	the memory used by the TableEncodingData.
2527 *
2528 * Results:
2529 *	None.
2530 *
2531 * Side effects:
2532 *	Memory freed.
2533 *
2534 *---------------------------------------------------------------------------
2535 */
2536
2537static void
2538TableFreeProc(clientData)
2539    ClientData clientData;	/* TableEncodingData that specifies
2540				 * encoding. */
2541{
2542    TableEncodingData *dataPtr;
2543
2544    /*
2545     * Make sure we aren't freeing twice on shutdown.  [Bug #219314]
2546     */
2547
2548    dataPtr = (TableEncodingData *) clientData;
2549    ckfree((char *) dataPtr->toUnicode);
2550    ckfree((char *) dataPtr->fromUnicode);
2551    ckfree((char *) dataPtr);
2552}
2553
2554/*
2555 *-------------------------------------------------------------------------
2556 *
2557 * EscapeToUtfProc --
2558 *
2559 *	Convert from the encoding specified by the EscapeEncodingData into
2560 *	UTF-8.
2561 *
2562 * Results:
2563 *	Returns TCL_OK if conversion was successful.
2564 *
2565 * Side effects:
2566 *	None.
2567 *
2568 *-------------------------------------------------------------------------
2569 */
2570
2571static int
2572EscapeToUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
2573	srcReadPtr, dstWrotePtr, dstCharsPtr)
2574    ClientData clientData;	/* EscapeEncodingData that specifies
2575				 * encoding. */
2576    CONST char *src;		/* Source string in specified encoding. */
2577    int srcLen;			/* Source string length in bytes. */
2578    int flags;			/* Conversion control flags. */
2579    Tcl_EncodingState *statePtr;/* Place for conversion routine to store
2580				 * state information used during a piecewise
2581				 * conversion.  Contents of statePtr are
2582				 * initialized and/or reset by conversion
2583				 * routine under control of flags argument. */
2584    char *dst;			/* Output buffer in which converted string
2585				 * is stored. */
2586    int dstLen;			/* The maximum length of output buffer in
2587				 * bytes. */
2588    int *srcReadPtr;		/* Filled with the number of bytes from the
2589				 * source string that were converted.  This
2590				 * may be less than the original source length
2591				 * if there was a problem converting some
2592				 * source characters. */
2593    int *dstWrotePtr;		/* Filled with the number of bytes that were
2594				 * stored in the output buffer as a result of
2595				 * the conversion. */
2596    int *dstCharsPtr;		/* Filled with the number of characters that
2597				 * correspond to the bytes stored in the
2598				 * output buffer. */
2599{
2600    EscapeEncodingData *dataPtr;
2601    char *prefixBytes, *tablePrefixBytes;
2602    unsigned short **tableToUnicode;
2603    Encoding *encodingPtr;
2604    int state, result, numChars;
2605    CONST char *srcStart, *srcEnd;
2606    char *dstStart, *dstEnd;
2607
2608    result = TCL_OK;
2609
2610    tablePrefixBytes = NULL;	/* lint. */
2611    tableToUnicode = NULL;	/* lint. */
2612
2613    dataPtr = (EscapeEncodingData *) clientData;
2614    prefixBytes = dataPtr->prefixBytes;
2615    encodingPtr = NULL;
2616
2617    srcStart = src;
2618    srcEnd = src + srcLen;
2619
2620    dstStart = dst;
2621    dstEnd = dst + dstLen - TCL_UTF_MAX;
2622
2623    state = (int) *statePtr;
2624    if (flags & TCL_ENCODING_START) {
2625	state = 0;
2626    }
2627
2628    for (numChars = 0; src < srcEnd; ) {
2629	int byte, hi, lo, ch;
2630
2631        if (dst > dstEnd) {
2632            result = TCL_CONVERT_NOSPACE;
2633            break;
2634        }
2635	byte = *((unsigned char *) src);
2636	if (prefixBytes[byte]) {
2637	    unsigned int left, len, longest;
2638	    int checked, i;
2639	    EscapeSubTable *subTablePtr;
2640
2641	    /*
2642	     * Saw the beginning of an escape sequence.
2643	     */
2644
2645	    left = srcEnd - src;
2646	    len = dataPtr->initLen;
2647	    longest = len;
2648	    checked = 0;
2649	    if (len <= left) {
2650		checked++;
2651		if ((len > 0) &&
2652			(memcmp(src, dataPtr->init, len) == 0)) {
2653		    /*
2654		     * If we see initialization string, skip it, even if we're
2655		     * not at the beginning of the buffer.
2656		     */
2657
2658		    src += len;
2659		    continue;
2660		}
2661	    }
2662	    len = dataPtr->finalLen;
2663	    if (len > longest) {
2664		longest = len;
2665	    }
2666	    if (len <= left) {
2667		checked++;
2668		if ((len > 0) &&
2669			(memcmp(src, dataPtr->final, len) == 0)) {
2670		    /*
2671		     * If we see finalization string, skip it, even if we're
2672		     * not at the end of the buffer.
2673		     */
2674
2675		    src += len;
2676		    continue;
2677		}
2678	    }
2679	    subTablePtr = dataPtr->subTables;
2680	    for (i = 0; i < dataPtr->numSubTables; i++) {
2681		len = subTablePtr->sequenceLen;
2682		if (len > longest) {
2683		    longest = len;
2684		}
2685		if (len <= left) {
2686		    checked++;
2687		    if ((len > 0) &&
2688			    (memcmp(src, subTablePtr->sequence, len) == 0)) {
2689			state = i;
2690			encodingPtr = NULL;
2691			subTablePtr = NULL;
2692			src += len;
2693			break;
2694		    }
2695		}
2696		subTablePtr++;
2697	    }
2698	    if (subTablePtr == NULL) {
2699		/*
2700		 * A match was found, the escape sequence was consumed, and
2701		 * the state was updated.
2702		 */
2703
2704		continue;
2705	    }
2706
2707	    /*
2708	     * We have a split-up or unrecognized escape sequence.  If we
2709	     * checked all the sequences, then it's a syntax error,
2710	     * otherwise we need more bytes to determine a match.
2711	     */
2712
2713	    if ((checked == dataPtr->numSubTables + 2)
2714		    || (flags & TCL_ENCODING_END)) {
2715		if ((flags & TCL_ENCODING_STOPONERROR) == 0) {
2716		    /*
2717		     * Skip the unknown escape sequence.
2718		     */
2719
2720		    src += longest;
2721		    continue;
2722		}
2723		result = TCL_CONVERT_SYNTAX;
2724	    } else {
2725		result = TCL_CONVERT_MULTIBYTE;
2726	    }
2727	    break;
2728	}
2729
2730	if (encodingPtr == NULL) {
2731	    TableEncodingData *tableDataPtr;
2732
2733	    encodingPtr = GetTableEncoding(dataPtr, state);
2734	    tableDataPtr = (TableEncodingData *) encodingPtr->clientData;
2735	    tablePrefixBytes = tableDataPtr->prefixBytes;
2736	    tableToUnicode = tableDataPtr->toUnicode;
2737	}
2738	if (tablePrefixBytes[byte]) {
2739	    src++;
2740	    if (src >= srcEnd) {
2741		src--;
2742		result = TCL_CONVERT_MULTIBYTE;
2743		break;
2744	    }
2745	    hi = byte;
2746	    lo = *((unsigned char *) src);
2747	} else {
2748	    hi = 0;
2749	    lo = byte;
2750	}
2751	ch = tableToUnicode[hi][lo];
2752	dst += Tcl_UniCharToUtf(ch, dst);
2753	src++;
2754	numChars++;
2755    }
2756
2757    *statePtr = (Tcl_EncodingState) state;
2758    *srcReadPtr = src - srcStart;
2759    *dstWrotePtr = dst - dstStart;
2760    *dstCharsPtr = numChars;
2761    return result;
2762}
2763
2764/*
2765 *-------------------------------------------------------------------------
2766 *
2767 * EscapeFromUtfProc --
2768 *
2769 *	Convert from UTF-8 into the encoding specified by the
2770 *	EscapeEncodingData.
2771 *
2772 * Results:
2773 *	Returns TCL_OK if conversion was successful.
2774 *
2775 * Side effects:
2776 *	None.
2777 *
2778 *-------------------------------------------------------------------------
2779 */
2780
2781static int
2782EscapeFromUtfProc(clientData, src, srcLen, flags, statePtr, dst, dstLen,
2783	srcReadPtr, dstWrotePtr, dstCharsPtr)
2784    ClientData clientData;	/* EscapeEncodingData that specifies
2785				 * encoding. */
2786    CONST char *src;		/* Source string in UTF-8. */
2787    int srcLen;			/* Source string length in bytes. */
2788    int flags;			/* Conversion control flags. */
2789    Tcl_EncodingState *statePtr;/* Place for conversion routine to store
2790				 * state information used during a piecewise
2791				 * conversion.  Contents of statePtr are
2792				 * initialized and/or reset by conversion
2793				 * routine under control of flags argument. */
2794    char *dst;			/* Output buffer in which converted string
2795				 * is stored. */
2796    int dstLen;			/* The maximum length of output buffer in
2797				 * bytes. */
2798    int *srcReadPtr;		/* Filled with the number of bytes from the
2799				 * source string that were converted.  This
2800				 * may be less than the original source length
2801				 * if there was a problem converting some
2802				 * source characters. */
2803    int *dstWrotePtr;		/* Filled with the number of bytes that were
2804				 * stored in the output buffer as a result of
2805				 * the conversion. */
2806    int *dstCharsPtr;		/* Filled with the number of characters that
2807				 * correspond to the bytes stored in the
2808				 * output buffer. */
2809{
2810    EscapeEncodingData *dataPtr;
2811    Encoding *encodingPtr;
2812    CONST char *srcStart, *srcEnd, *srcClose;
2813    char *dstStart, *dstEnd;
2814    int state, result, numChars;
2815    TableEncodingData *tableDataPtr;
2816    char *tablePrefixBytes;
2817    unsigned short **tableFromUnicode;
2818
2819    result = TCL_OK;
2820
2821    dataPtr = (EscapeEncodingData *) clientData;
2822
2823    srcStart = src;
2824    srcEnd = src + srcLen;
2825    srcClose = srcEnd;
2826    if ((flags & TCL_ENCODING_END) == 0) {
2827	srcClose -= TCL_UTF_MAX;
2828    }
2829
2830    dstStart = dst;
2831    dstEnd = dst + dstLen - 1;
2832
2833    /*
2834     * RFC1468 states that the text starts in ASCII, and switches to Japanese
2835     * characters, and that the text must end in ASCII. [Patch #474358]
2836     */
2837
2838    if (flags & TCL_ENCODING_START) {
2839	state = 0;
2840	if ((dst + dataPtr->initLen) > dstEnd) {
2841	    *srcReadPtr = 0;
2842	    *dstWrotePtr = 0;
2843	    return TCL_CONVERT_NOSPACE;
2844	}
2845	memcpy((VOID *) dst, (VOID *) dataPtr->init,
2846		(size_t) dataPtr->initLen);
2847	dst += dataPtr->initLen;
2848    } else {
2849        state = (int) *statePtr;
2850    }
2851
2852    encodingPtr = GetTableEncoding(dataPtr, state);
2853    tableDataPtr = (TableEncodingData *) encodingPtr->clientData;
2854    tablePrefixBytes = tableDataPtr->prefixBytes;
2855    tableFromUnicode = tableDataPtr->fromUnicode;
2856
2857    for (numChars = 0; src < srcEnd; numChars++) {
2858	unsigned int len;
2859	int word;
2860	Tcl_UniChar ch;
2861
2862	if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
2863	    /*
2864	     * If there is more string to follow, this will ensure that the
2865	     * last UTF-8 character in the source buffer hasn't been cut off.
2866	     */
2867
2868	    result = TCL_CONVERT_MULTIBYTE;
2869	    break;
2870	}
2871	len = TclUtfToUniChar(src, &ch);
2872	word = tableFromUnicode[(ch >> 8)][ch & 0xff];
2873
2874	if ((word == 0) && (ch != 0)) {
2875	    int oldState;
2876	    EscapeSubTable *subTablePtr;
2877
2878	    oldState = state;
2879	    for (state = 0; state < dataPtr->numSubTables; state++) {
2880		encodingPtr = GetTableEncoding(dataPtr, state);
2881		tableDataPtr = (TableEncodingData *) encodingPtr->clientData;
2882	    	word = tableDataPtr->fromUnicode[(ch >> 8)][ch & 0xff];
2883		if (word != 0) {
2884		    break;
2885		}
2886	    }
2887
2888	    if (word == 0) {
2889		state = oldState;
2890		if (flags & TCL_ENCODING_STOPONERROR) {
2891		    result = TCL_CONVERT_UNKNOWN;
2892		    break;
2893		}
2894		encodingPtr = GetTableEncoding(dataPtr, state);
2895		tableDataPtr = (TableEncodingData *) encodingPtr->clientData;
2896		word = tableDataPtr->fallback;
2897	    }
2898
2899	    tablePrefixBytes = tableDataPtr->prefixBytes;
2900	    tableFromUnicode = tableDataPtr->fromUnicode;
2901
2902	    /*
2903	     * The state variable has the value of oldState when word is 0.
2904	     * In this case, the escape sequense should not be copied to dst
2905	     * because the current character set is not changed.
2906	     */
2907	    if (state != oldState) {
2908		subTablePtr = &dataPtr->subTables[state];
2909		if ((dst + subTablePtr->sequenceLen) > dstEnd) {
2910		    /*
2911		     * If there is no space to write the escape sequence, the
2912		     * state variable must be changed to the value of oldState
2913		     * variable because this escape sequence must be written
2914		     * in the next conversion.
2915		     */
2916		    state = oldState;
2917		    result = TCL_CONVERT_NOSPACE;
2918		    break;
2919		}
2920		memcpy((VOID *) dst, (VOID *) subTablePtr->sequence,
2921			(size_t) subTablePtr->sequenceLen);
2922		dst += subTablePtr->sequenceLen;
2923	    }
2924	}
2925
2926	if (tablePrefixBytes[(word >> 8)] != 0) {
2927	    if (dst + 1 > dstEnd) {
2928		result = TCL_CONVERT_NOSPACE;
2929		break;
2930	    }
2931	    dst[0] = (char) (word >> 8);
2932	    dst[1] = (char) word;
2933	    dst += 2;
2934	} else {
2935	    if (dst > dstEnd) {
2936		result = TCL_CONVERT_NOSPACE;
2937		break;
2938	    }
2939	    dst[0] = (char) word;
2940	    dst++;
2941	}
2942	src += len;
2943    }
2944
2945    if ((result == TCL_OK) && (flags & TCL_ENCODING_END)) {
2946	unsigned int len = dataPtr->subTables[0].sequenceLen;
2947	/*
2948	 * [Bug 1516109].
2949	 * Certain encodings like iso2022-jp need to write
2950	 * an escape sequence after all characters have
2951	 * been converted. This logic checks that enough
2952	 * room is available in the buffer for the escape bytes.
2953	 * The TCL_ENCODING_END flag is cleared after a final
2954	 * escape sequence has been added to the buffer so
2955	 * that another call to this method does not attempt
2956	 * to append escape bytes a second time.
2957	 */
2958	if ((dst + dataPtr->finalLen + (state?len:0)) > dstEnd) {
2959	    result = TCL_CONVERT_NOSPACE;
2960	} else {
2961	    if (state) {
2962		memcpy((VOID *) dst, (VOID *) dataPtr->subTables[0].sequence,
2963			(size_t) len);
2964		dst += len;
2965	    }
2966	    memcpy((VOID *) dst, (VOID *) dataPtr->final,
2967		    (size_t) dataPtr->finalLen);
2968	    dst += dataPtr->finalLen;
2969	    state &= ~TCL_ENCODING_END;
2970	}
2971    }
2972
2973    *statePtr = (Tcl_EncodingState) state;
2974    *srcReadPtr = src - srcStart;
2975    *dstWrotePtr = dst - dstStart;
2976    *dstCharsPtr = numChars;
2977    return result;
2978}
2979
2980/*
2981 *---------------------------------------------------------------------------
2982 *
2983 * EscapeFreeProc --
2984 *
2985 *	This procedure is invoked when an EscapeEncodingData encoding is
2986 *	deleted.  It deletes the memory used by the encoding.
2987 *
2988 * Results:
2989 *	None.
2990 *
2991 * Side effects:
2992 *	Memory freed.
2993 *
2994 *---------------------------------------------------------------------------
2995 */
2996
2997static void
2998EscapeFreeProc(clientData)
2999    ClientData clientData;	/* EscapeEncodingData that specifies encoding. */
3000{
3001    EscapeEncodingData *dataPtr;
3002    EscapeSubTable *subTablePtr;
3003    int i;
3004
3005    dataPtr = (EscapeEncodingData *) clientData;
3006    if (dataPtr == NULL) {
3007	return;
3008    }
3009    /*
3010     *  The subTables should be freed recursively in normal operation but not
3011     *  during TclFinalizeEncodingSubsystem because they are also present as a
3012     *  weak reference in the toplevel encodingTable (ie they don't have a +1
3013     *  refcount for this), and unpredictable nuking order could remove them
3014     *  from under the following loop's feet [Bug 2891556].
3015     *
3016     *  The encodingsInitialized flag, being reset on entry to TFES, can serve
3017     *  as a "not in finalization" test.
3018     */
3019    if (encodingsInitialized)
3020	{
3021	    subTablePtr = dataPtr->subTables;
3022	    for (i = 0; i < dataPtr->numSubTables; i++) {
3023		FreeEncoding((Tcl_Encoding) subTablePtr->encodingPtr);
3024		subTablePtr++;
3025	    }
3026	}
3027    ckfree((char *) dataPtr);
3028}
3029
3030/*
3031 *---------------------------------------------------------------------------
3032 *
3033 * GetTableEncoding --
3034 *
3035 *	Helper function for the EscapeEncodingData conversions.  Gets the
3036 *	encoding (of type TextEncodingData) that represents the specified
3037 *	state.
3038 *
3039 * Results:
3040 *	The return value is the encoding.
3041 *
3042 * Side effects:
3043 *	If the encoding that represents the specified state has not
3044 *	already been used by this EscapeEncoding, it will be loaded
3045 *	and cached in the dataPtr.
3046 *
3047 *---------------------------------------------------------------------------
3048 */
3049
3050static Encoding *
3051GetTableEncoding(dataPtr, state)
3052    EscapeEncodingData *dataPtr;/* Contains names of encodings. */
3053    int state;			/* Index in dataPtr of desired Encoding. */
3054{
3055    EscapeSubTable *subTablePtr;
3056    Encoding *encodingPtr;
3057
3058    subTablePtr = &dataPtr->subTables[state];
3059    encodingPtr = subTablePtr->encodingPtr;
3060    if (encodingPtr == NULL) {
3061	/*
3062	 * Now that escape encodings load their sub-encodings first, and
3063	 * fail to load if any sub-encodings are missing, this branch should
3064	 * never happen.
3065	 */
3066	encodingPtr = (Encoding *) Tcl_GetEncoding(NULL, subTablePtr->name);
3067	if ((encodingPtr == NULL)
3068		|| (encodingPtr->toUtfProc != TableToUtfProc)) {
3069	    panic("EscapeToUtfProc: invalid sub table");
3070	}
3071	subTablePtr->encodingPtr = encodingPtr;
3072    }
3073    return encodingPtr;
3074}
3075
3076/*
3077 *---------------------------------------------------------------------------
3078 *
3079 * unilen --
3080 *
3081 *	A helper function for the Tcl_ExternalToUtf functions.  This
3082 *	function is similar to strlen for double-byte characters: it
3083 *	returns the number of bytes in a 0x0000 terminated string.
3084 *
3085 * Results:
3086 *	As above.
3087 *
3088 * Side effects:
3089 *	None.
3090 *
3091 *---------------------------------------------------------------------------
3092 */
3093
3094static size_t
3095unilen(src)
3096    CONST char *src;
3097{
3098    unsigned short *p;
3099
3100    p = (unsigned short *) src;
3101    while (*p != 0x0000) {
3102	p++;
3103    }
3104    return (char *) p - src;
3105}
3106
3107/*
3108 *-------------------------------------------------------------------------
3109 *
3110 * TclFindEncodings --
3111 *
3112 *	Find and load the encoding file for this operating system.
3113 *	Before this is called, Tcl makes assumptions about the
3114 *	native string representation, but the true encoding is not
3115 *	assured.
3116 *
3117 * Results:
3118 *	Return result of TclpInitLibraryPath, which reports whether the
3119 *	path is clean (0) or dirty (1) UTF.
3120 *
3121 * Side effects:
3122 *	Varied, see the respective initialization routines.
3123 *
3124 *-------------------------------------------------------------------------
3125 */
3126
3127static int
3128TclFindEncodings(argv0)
3129    CONST char *argv0;		/* Name of executable from argv[0] to main()
3130				 * in native multi-byte encoding. */
3131{
3132    int mustCleanUtf = 0;
3133
3134    if (encodingsInitialized == 0) {
3135	/*
3136	 * Double check inside the mutex.  There may be calls
3137	 * back into this routine from some of the procedures below.
3138	 */
3139
3140	TclpInitLock();
3141	if (encodingsInitialized == 0) {
3142	    char *native;
3143	    Tcl_Obj *pathPtr;
3144	    Tcl_DString libPath, buffer;
3145
3146	    /*
3147	     * Have to set this bit here to avoid deadlock with the
3148	     * routines below us that call into TclInitSubsystems.
3149	     */
3150
3151	    encodingsInitialized = 1;
3152
3153	    native = TclpFindExecutable(argv0);
3154	    mustCleanUtf = TclpInitLibraryPath(native);
3155
3156	    /*
3157	     * The library path was set in the TclpInitLibraryPath routine.
3158	     * The string set is a dirty UTF string.  To preserve the value
3159	     * convert the UTF string back to native before setting the new
3160	     * default encoding.
3161	     */
3162
3163	    pathPtr = TclGetLibraryPath();
3164	    if ((pathPtr != NULL) && mustCleanUtf) {
3165		Tcl_UtfToExternalDString(NULL, Tcl_GetString(pathPtr), -1,
3166			&libPath);
3167	    }
3168
3169	    TclpSetInitialEncodings();
3170
3171	    /*
3172	     * Now convert the native string back to UTF.
3173	     */
3174
3175	    if ((pathPtr != NULL) && mustCleanUtf) {
3176		Tcl_ExternalToUtfDString(NULL, Tcl_DStringValue(&libPath), -1,
3177			&buffer);
3178		pathPtr = Tcl_NewStringObj(Tcl_DStringValue(&buffer), -1);
3179		TclSetLibraryPath(pathPtr);
3180
3181		Tcl_DStringFree(&libPath);
3182		Tcl_DStringFree(&buffer);
3183	    }
3184	}
3185	TclpInitUnlock();
3186    }
3187
3188    return mustCleanUtf;
3189}
3190