1/***********************************************************************
2*                                                                      *
3*               This software is part of the ast package               *
4*          Copyright (c) 1985-2011 AT&T Intellectual Property          *
5*                      and is licensed under the                       *
6*                  Common Public License, Version 1.0                  *
7*                    by AT&T Intellectual Property                     *
8*                                                                      *
9*                A copy of the License is available at                 *
10*            http://www.opensource.org/licenses/cpl1.0.txt             *
11*         (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9)         *
12*                                                                      *
13*              Information and Software Systems Research               *
14*                            AT&T Research                             *
15*                           Florham Park NJ                            *
16*                                                                      *
17*                 Glenn Fowler <gsf@research.att.com>                  *
18*                  David Korn <dgk@research.att.com>                   *
19*                   Phong Vo <kpv@research.att.com>                    *
20*                                                                      *
21***********************************************************************/
22#pragma prototyped
23
24/*
25 * Glenn Fowler
26 * AT&T Research
27 *
28 * iconv intercept
29 * minimally provides { utf*<=>bin ascii<=>ebcdic* }
30 */
31
32#include <ast.h>
33#include <dirent.h>
34
35#define DEBUG_TRACE		0
36#define _ICONV_LIST_PRIVATE_
37
38#include <ccode.h>
39#include <ctype.h>
40#include <iconv.h>
41
42#include "lclib.h"
43
44#if !_lib_iconv_open
45
46#define _ast_iconv_t		iconv_t
47#define _ast_iconv_f		iconv_f
48#define _ast_iconv_list_t	iconv_list_t
49#define _ast_iconv_open		iconv_open
50#define _ast_iconv		iconv
51#define _ast_iconv_close	iconv_close
52#define _ast_iconv_list		iconv_list
53#define _ast_iconv_move		iconv_move
54#define _ast_iconv_name		iconv_name
55#define _ast_iconv_write	iconv_write
56
57#endif
58
59#ifndef E2BIG
60#define E2BIG			ENOMEM
61#endif
62#ifndef EILSEQ
63#define EILSEQ			EIO
64#endif
65
/*
 * common exit for the converter functions
 *
 *	e	error code accumulated by the converter, 0 if none
 *	n	value to return on success
 *	fn	pointer to the remaining input byte count
 *
 * input left over with no other error is reported as E2BIG
 * any error sets errno and returns (size_t)(-1), otherwise n is returned
 * the do/while(0) wrapper makes the expansion a single statement so it
 * stays correct under an unbraced if/else
 */

#define RETURN(e,n,fn) \
	do { \
		if (*(fn) && !(e)) (e) = E2BIG; \
		if (e) { errno = (e); return (size_t)(-1); } \
		return (n); \
	} while (0)
70
71typedef struct Map_s
72{
73	char*			name;
74	const unsigned char*	map;
75	_ast_iconv_f		fun;
76	int			index;
77} Map_t;
78
79typedef struct Conv_s
80{
81	iconv_t			cvt;
82	char*			buf;
83	size_t			size;
84	Map_t			from;
85	Map_t			to;
86} Conv_t;
87
88static Conv_t*			freelist[4];
89static int			freeindex;
90
/*
 * codeset names that both denote the native codeset
 * _ast_iconv_open() replaces matches with the name_native pointer
 */

static const char		name_local[] = "local";
static const char		name_native[] = "native";
93
94static const _ast_iconv_list_t	codes[] =
95{
96	{
97	"utf",
98	"un|unicode|utf",
99	"multibyte 8-bit unicode",
100	"UTF-%s",
101	"8",
102	CC_UTF,
103	},
104
105	{
106	"ume",
107	"um|ume|utf?(-)7",
108	"multibyte 7-bit unicode",
109	"UTF-7",
110	0,
111	CC_UME,
112	},
113
114	{
115	"euc",
116	"(big|euc)*",
117	"euc family",
118	0,
119	0,
120	CC_ICONV,
121	},
122
123	{
124	"dos",
125	"dos?(-)?(855)",
126	"dos code page",
127	"DOS855",
128	0,
129	CC_ICONV,
130	},
131
132	{
133	"ucs",
134	"ucs?(-)?(2)?(be)|utf-16?(be)",
135	"unicode runes",
136	"UCS-%s",
137	"2",
138	CC_UCS,
139	},
140
141	{
142	"ucs-le",
143	"ucs?(-)?(2)le|utf-16le",
144	"little endian unicode runes",
145	"UCS-%sLE",
146	"2",
147	CC_SCU,
148	},
149
150	{ 0 },
151};
152
153#if _UWIN
154
155#include <ast_windows.h>
156
157#ifndef CP_UCS2
158#define CP_UCS2	0x0000
159#endif
160
161static char	_win_maps[] = "/reg/local_machine/SOFTWARE/Classes/MIME/Database/Charset";
162
163/*
164 * return the codeset index given its name or alias
165 * the map is in the what? oh, the registry
166 */
167
168static int
169_win_codeset(const char* name)
170{
171	register char*	s;
172	char*		e;
173	int		n;
174	Sfio_t*		sp;
175	char		aka[128];
176	char		tmp[128];
177
178#if DEBUG_TRACE
179error(DEBUG_TRACE, "AHA#%d _win_codeset name=%s", __LINE__, name);
180#endif
181	if (name == name_native)
182		return CP_ACP;
183	if (!strcasecmp(name, "utf") || !strcasecmp(name, "utf8") || !strcasecmp(name, "utf-8"))
184		return CP_UTF8;
185	if (!strcasecmp(name, "ucs") || !strcasecmp(name, "ucs2") || !strcasecmp(name, "ucs-2"))
186		return CP_UCS2;
187	if (name[0] == '0' && name[1] == 'x' && (n = strtol(name, &e, 0)) > 0 && !*e)
188		return n;
189	for (;;)
190	{
191		sfsprintf(tmp, sizeof(tmp), "%s/%s", _win_maps, name);
192		if (!(sp = sfopen(0, tmp, "r")))
193		{
194			s = (char*)name;
195			if ((s[0] == 'c' || s[0] == 'C') && (s[1] == 'p' || s[1] == 'P'))
196				s += 2;
197			if (!isdigit(s[0]))
198				break;
199			sfsprintf(tmp, sizeof(tmp), "%s/windows-%s", _win_maps, s);
200			if (!(sp = sfopen(0, tmp, "r")))
201				break;
202		}
203		for (;;)
204		{
205			if (!(s = sfgetr(sp, '\n', 0)))
206			{
207				sfclose(sp);
208				return -1;
209			}
210			if (!strncasecmp(s, "AliasForCharSet=", 16))
211			{
212				n = sfvalue(sp) - 17;
213				s += 16;
214				if (n >= sizeof(aka))
215					n = sizeof(aka) - 1;
216				memcpy(aka, s, n);
217				aka[n] = 0;
218				sfclose(sp);
219				name = (const char*)aka;
220				break;
221			}
222			if (!strncasecmp(s, "CodePage=", 9))
223			{
224				s += 9;
225				n = strtol(s, 0, 0);
226				sfclose(sp);
227				return n;
228			}
229		}
230	}
231	return -1;
232}
233
234/*
235 * get and check the codeset indices
236 */
237
238static _ast_iconv_t
239_win_iconv_open(register Conv_t* cc, const char* t, const char* f)
240{
241#if DEBUG_TRACE
242error(DEBUG_TRACE, "AHA#%d _win_iconv_open f=%s t=%s\n", __LINE__, f, t);
243#endif
244	if ((cc->from.index = _win_codeset(f)) < 0)
245		return (_ast_iconv_t)(-1);
246	if ((cc->to.index = _win_codeset(t)) < 0)
247		return (_ast_iconv_t)(-1);
248#if DEBUG_TRACE
249error(DEBUG_TRACE, "AHA#%d _win_iconv_open f=0x%04x t=0x%04x\n", __LINE__, cc->from.index, cc->to.index);
250#endif
251	return (_ast_iconv_t)cc;
252}
253
254/*
255 * even though the indices already check out
256 * they could still be rejected
257 */
258
259static size_t
260_win_iconv(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
261{
262	Conv_t*	cc = (Conv_t*)cd;
263	size_t	un;
264	size_t	tz;
265	size_t	fz;
266	size_t	bz;
267	size_t	pz;
268	size_t	oz;
269	LPWSTR	ub;
270
271#if DEBUG_TRACE
272error(DEBUG_TRACE, "AHA#%d _win_iconv from=0x%04x to=0x%04x\n", __LINE__, cc->from.index, cc->to.index);
273#endif
274	if (cc->from.index == cc->to.index)
275	{
276		/*
277		 * easy
278		 */
279
280		fz = tz = (*fn < *tn) ? *fn : *tn;
281		memcpy(*tb, *fb, fz);
282	}
283	else
284	{
285		ub = 0;
286		un = *fn;
287
288		/*
289		 * from => ucs-2
290		 */
291
292		if (cc->to.index == CP_UCS2)
293		{
294			if ((tz = MultiByteToWideChar(cc->from.index, 0, (LPCSTR)*fb, (int)*fn, (LPWSTR)*tb, *tn)) && tz <= *tn)
295			{
296				fz = *fn;
297				tz *= sizeof(WCHAR);
298			}
299			else
300			{
301				/*
302				 * target too small
303				 * binary search on input size to make it fit
304				 */
305
306				oz = 0;
307				pz = *fn / 2;
308				fz = *fn - pz;
309				for (;;)
310				{
311					while (!(tz = MultiByteToWideChar(cc->from.index, 0, (LPCSTR)*fb, (int)fz, (LPWSTR)*tb, 0)))
312						if (++fz >= *fn)
313							goto nope;
314					tz *= sizeof(WCHAR);
315					if (tz == *tn)
316						break;
317					if (!(pz /= 2))
318					{
319						if (!(fz = oz))
320							goto nope;
321						break;
322					}
323					if (tz > *tn)
324						fz -= pz;
325					else
326					{
327						oz = fz;
328						fz += pz;
329					}
330				}
331			}
332		}
333		else
334		{
335			if (cc->from.index == CP_UCS2)
336			{
337				un = *fn / sizeof(WCHAR);
338				ub = (LPWSTR)*fb;
339			}
340			else if (!(un = MultiByteToWideChar(cc->from.index, 0, (LPCSTR)*fb, (int)*fn, (LPWSTR)*tb, 0)))
341				goto nope;
342			else if (!(ub = (LPWSTR)malloc(un * sizeof(WCHAR))))
343				goto nope;
344			else if (!(un = MultiByteToWideChar(cc->from.index, 0, (LPCSTR)*fb, (int)*fn, (LPWSTR)ub, un)))
345				goto nope;
346
347			/*
348			 * ucs-2 => to
349			 */
350
351			if (tz = WideCharToMultiByte(cc->to.index, 0, (LPCWSTR)ub, un, *tb, *tn, 0, 0))
352				fz = *fn;
353			else
354			{
355				/*
356				 * target too small
357				 * binary search on input size to make it fit
358				 */
359
360				oz = 0;
361				pz = *fn / 2;
362				bz = *fn - pz;
363				for (;;)
364				{
365					while (!(fz = MultiByteToWideChar(cc->from.index, 0, (LPCSTR)*fb, (int)bz, (LPWSTR)ub, un)))
366						if (++bz > *fn)
367							goto nope;
368					if (!(tz = WideCharToMultiByte(cc->to.index, 0, (LPCWSTR)ub, fz, *tb, 0, 0, 0)))
369						goto nope;
370					if (tz == *tn)
371						break;
372					if (!(pz /= 2))
373					{
374						if (!(fz = oz))
375							goto nope;
376						break;
377					}
378					if (tz > *tn)
379						bz -= pz;
380					else
381					{
382						oz = bz;
383						bz += pz;
384					}
385				}
386				if (!(tz = WideCharToMultiByte(cc->to.index, 0, (LPCWSTR)ub, fz, *tb, tz, 0, 0)))
387					goto nope;
388#if DEBUG_TRACE
389error(DEBUG_TRACE, "AHA#%d _win_iconv *fn=%u fz=%u[%u] *tn=%u tz=%u\n", __LINE__, *fn, fz, fz * sizeof(WCHAR), *tn, tz);
390#endif
391#if 0
392				fz *= sizeof(WCHAR);
393#endif
394			}
395			if (ub != (LPWSTR)*fb)
396				free(ub);
397		}
398	}
399	*fb += fz;
400	*fn -= fz;
401	*tb += tz;
402	*tn -= tz;
403	return fz;
404 nope:
405	if (ub && ub != (LPWSTR)*fb)
406		free(ub);
407	errno = EINVAL;
408	return (size_t)(-1);
409}
410
411#endif
412
413/*
414 * return canonical character code set name for m
415 * if b!=0 then canonical name placed in b of size n
416 * <ccode.h> index returned
417 */
418
419int
420_ast_iconv_name(register const char* m, register char* b, size_t n)
421{
422	register const _ast_iconv_list_t*	cp;
423	const _ast_iconv_list_t*		bp;
424	register int				c;
425	register char*				e;
426	int					sub[2];
427	char					buf[16];
428#if DEBUG_TRACE
429	char*					o;
430#endif
431
432	if (!b)
433	{
434		b = buf;
435		n = sizeof(buf);
436	}
437#if DEBUG_TRACE
438	o = b;
439#endif
440	e = b + n - 1;
441	bp = 0;
442	n = 0;
443	cp = ccmaplist(NiL);
444#if DEBUG_TRACE
445if (error_info.trace < DEBUG_TRACE) sfprintf(sfstderr, "%s: debug-%d: AHA%d _ast_iconv_name m=\"%s\"\n", error_info.id, error_info.trace, __LINE__, m);
446#endif
447	for (;;)
448	{
449#if DEBUG_TRACE
450if (error_info.trace < DEBUG_TRACE) sfprintf(sfstderr, "%s: debug-%d: AHA%d _ast_iconv_name n=%d bp=%p cp=%p ccode=%d name=\"%s\"\n", error_info.id, error_info.trace, __LINE__, n, bp, cp, cp->ccode, cp->name);
451#endif
452		if (strgrpmatch(m, cp->match, sub, elementsof(sub) / 2, STR_MAXIMAL|STR_LEFT|STR_ICASE))
453		{
454			if (!(c = m[sub[1]]))
455			{
456				bp = cp;
457				break;
458			}
459			if (sub[1] > n && !isalpha(c))
460			{
461				bp = cp;
462				n = sub[1];
463			}
464		}
465		if (cp->ccode < 0)
466		{
467			if (!(++cp)->name)
468				break;
469		}
470		else if (!(cp = (const _ast_iconv_list_t*)ccmaplist((_ast_iconv_list_t*)cp)))
471			cp = codes;
472	}
473	if (cp = bp)
474	{
475		if (cp->canon)
476		{
477			if (cp->index)
478			{
479				for (m += sub[1]; *m && !isalnum(*m); m++);
480				if (!isdigit(*m))
481					m = cp->index;
482			}
483			else
484				m = "1";
485			b += sfsprintf(b, e - b, cp->canon, m);
486		}
487		else if (cp->ccode == CC_NATIVE)
488		{
489			if ((locales[AST_LC_CTYPE]->flags & LC_default) || !locales[AST_LC_CTYPE]->charset || !(m = locales[AST_LC_CTYPE]->charset->code) || streq(m, "iso8859-1"))
490				switch (CC_NATIVE)
491				{
492				case CC_EBCDIC:
493					m = (const char*)"EBCDIC";
494					break;
495				case CC_EBCDIC_I:
496					m = (const char*)"EBCDIC-I";
497					break;
498				case CC_EBCDIC_O:
499					m = (const char*)"EBCDIC-O";
500					break;
501				default:
502					m = (const char*)"ISO-8859-1";
503					break;
504				}
505			b += sfsprintf(b, e - b, "%s", m);
506		}
507		*b = 0;
508#if DEBUG_TRACE
509if (error_info.trace < DEBUG_TRACE) sfprintf(sfstderr, "%s: debug-%d: AHA%d _ast_iconv_name ccode=%d canon=\"%s\"\n", error_info.id, error_info.trace, __LINE__, cp->ccode, o);
510#endif
511		return cp->ccode;
512	}
513	while (b < e && (c = *m++))
514	{
515		if (islower(c))
516			c = toupper(c);
517		*b++ = c;
518	}
519	*b = 0;
520#if DEBUG_TRACE
521if (error_info.trace < DEBUG_TRACE) sfprintf(sfstderr, "%s: debug-%d: AHA%d _ast_iconv_name ccode=%d canon=\"%s\"\n", error_info.id, error_info.trace, __LINE__, CC_ICONV, o);
522#endif
523	return CC_ICONV;
524}
525
526/*
527 * convert utf-8 to bin
528 */
529
530static size_t
531utf2bin(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
532{
533	register unsigned char*		f;
534	register unsigned char*		fe;
535	register unsigned char*		t;
536	register unsigned char*		te;
537	register unsigned char*		p;
538	register int			c;
539	register int			w;
540	size_t				n;
541	int				e;
542
543	e = 0;
544	f = (unsigned char*)(*fb);
545	fe = f + (*fn);
546	t = (unsigned char*)(*tb);
547	te = t + (*tn);
548	while (t < te && f < fe)
549	{
550		p = f;
551		c = *f++;
552		if (c & 0x80)
553		{
554			if (!(c & 0x40))
555			{
556				f = p;
557				e = EILSEQ;
558				break;
559			}
560			if (c & 0x20)
561			{
562				w = (c & 0x0F) << 12;
563				if (f >= fe)
564				{
565					f = p;
566					e = EINVAL;
567					break;
568				}
569				c = *f++;
570				if (c & 0x40)
571				{
572					f = p;
573					e = EILSEQ;
574					break;
575				}
576				w |= (c & 0x3F) << 6;
577			}
578			else
579				w = (c & 0x1F) << 6;
580			if (f >= fe)
581			{
582				f = p;
583				e = EINVAL;
584				break;
585			}
586			c = *f++;
587			w |= (c & 0x3F);
588		}
589		else
590			w = c;
591		*t++ = w;
592	}
593	*fn -= (char*)f - (*fb);
594	*fb = (char*)f;
595	*tn -= (n = (char*)t - (*tb));
596	*tb = (char*)t;
597	RETURN(e, n, fn);
598}
599
600/*
601 * convert bin to utf-8
602 */
603
604static size_t
605bin2utf(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
606{
607	register unsigned char*		f;
608	register unsigned char*		fe;
609	register unsigned char*		t;
610	register unsigned char*		te;
611	register int			c;
612	wchar_t				w;
613	size_t				n;
614	int				e;
615
616	e = 0;
617	f = (unsigned char*)(*fb);
618	fe = f + (*fn);
619	t = (unsigned char*)(*tb);
620	te = t + (*tn);
621	while (f < fe && t < te)
622	{
623		if (!mbwide())
624		{
625			c = 1;
626			w = *f;
627		}
628		else if ((c = (*_ast_info.mb_towc)(&w, (char*)f, fe - f)) < 0)
629		{
630			e = EINVAL;
631			break;
632		}
633		else if (!c)
634			c = 1;
635		if (!(w & ~0x7F))
636			*t++ = w;
637		else
638		{
639			if (!(w & ~0x7FF))
640			{
641				if (t >= (te - 2))
642				{
643					e = E2BIG;
644					break;
645				}
646				*t++ = 0xC0 + (w >> 6);
647			}
648			else if (!(w & ~0xffff))
649			{
650				if (t >= (te - 3))
651				{
652					e = E2BIG;
653					break;
654				}
655				*t++ = 0xE0 + (w >> 12);
656				*t++ = 0x80 + ((w >> 6 ) & 0x3F);
657			}
658			else
659			{
660				e = EILSEQ;
661				break;
662			}
663			*t++ = 0x80 + (w & 0x3F);
664		}
665		f += c;
666	}
667	*fn -= (n = (char*)f - (*fb));
668	*fb = (char*)f;
669	*tn -= (char*)t - (*tb);
670	*tb = (char*)t;
671	RETURN(e, n, fn);
672}
673
/*
 * characters bin2ume() copies through unencoded
 */

static const unsigned char	ume_D[] =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?!\"#$%&*;<=>@[]^_`{|} \t\n";

/*
 * the 64 character encoding alphabet, indexed by 6 bit value
 */

static const unsigned char	ume_M[] =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

/*
 * lookup tables filled in by umeinit()
 * ume_d[c] is non-zero if c is in ume_D
 * ume_m[c] is the index of c in ume_M, NOE if c is not in ume_M
 */

static unsigned char		ume_d[UCHAR_MAX+1];

static unsigned char		ume_m[UCHAR_MAX+1];

#define NOE			0xFF
#define UMEINIT()		(ume_d[ume_D[0]]?0:umeinit())
686
687/*
688 * initialize the ume tables
689 */
690
691static int
692umeinit(void)
693{
694	register const unsigned char*	s;
695	register int			i;
696	register int			c;
697
698	if (!ume_d[ume_D[0]])
699	{
700		s = ume_D;
701		while (c = *s++)
702			ume_d[c] = 1;
703		memset(ume_m, NOE, sizeof(ume_m));
704		for (i = 0; c = ume_M[i]; i++)
705			ume_m[c] = i;
706	}
707	return 0;
708}
709
710/*
711 * convert utf-7 to bin
712 */
713
714static size_t
715ume2bin(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
716{
717	register unsigned char*		f;
718	register unsigned char*		fe;
719	register unsigned char*		t;
720	register unsigned char*		te;
721	register unsigned char*		p;
722	register int			s;
723	register int			c;
724	register int			w;
725	size_t				n;
726	int				e;
727
728	e = 0;
729	UMEINIT();
730	f = (unsigned char*)(*fb);
731	fe = f + (*fn);
732	t = (unsigned char*)(*tb);
733	te = t + (*tn);
734	s = 0;
735	while (f < fe && t < te)
736	{
737		p = f;
738		c = *f++;
739		if (s)
740		{
741			if (c == '-' && s > 1)
742				s = 0;
743			else if ((w = ume_m[c]) == NOE)
744			{
745				s = 0;
746				*t++ = c;
747			}
748			else if (f >= (fe - 2))
749			{
750				f = p;
751				e = EINVAL;
752				break;
753			}
754			else
755			{
756				s = 2;
757				w = (w << 6) | ume_m[*f++];
758				w = (w << 6) | ume_m[*f++];
759				if (!(w & ~0xFF))
760					*t++ = w;
761				else if (t >= (te - 1))
762				{
763					f = p;
764					e = E2BIG;
765					break;
766				}
767				else
768				{
769					*t++ = (w >> 8) & 0xFF;
770					*t++ = w & 0xFF;
771				}
772			}
773		}
774		else if (c == '+')
775			s = 1;
776		else
777			*t++ = c;
778	}
779	*fn -= (char*)f - (*fb);
780	*fb = (char*)f;
781	*tn -= (n = (char*)t - (*tb));
782	*tb = (char*)t;
783	RETURN(e, n, fn);
784}
785
786/*
787 * convert bin to utf-7
788 */
789
790static size_t
791bin2ume(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
792{
793	register unsigned char*		f;
794	register unsigned char*		fe;
795	register unsigned char*		t;
796	register unsigned char*		te;
797	register int			c;
798	register int			s;
799	wchar_t				w;
800	size_t				n;
801	int				e;
802
803	e = 0;
804	UMEINIT();
805	f = (unsigned char*)(*fb);
806	fe = f + (*fn);
807	t = (unsigned char*)(*tb);
808	te = t + (*tn);
809	s = 0;
810	while (f < fe && t < (te - s))
811	{
812		if (!mbwide())
813		{
814			c = 1;
815			w = *f;
816		}
817		else if ((c = (*_ast_info.mb_towc)(&w, (char*)f, fe - f)) < 0)
818		{
819			e = EINVAL;
820			break;
821		}
822		else if (!c)
823			c = 1;
824		if (!(w & ~0x7F) && ume_d[w])
825		{
826			if (s)
827			{
828				s = 0;
829				*t++ = '-';
830			}
831			*t++ = w;
832		}
833		else if (t >= (te - (4 + s)))
834		{
835			e = E2BIG;
836			break;
837		}
838		else
839		{
840			if (!s)
841			{
842				s = 1;
843				*t++ = '+';
844			}
845			*t++ = ume_M[(w >> 12) & 0x3F];
846			*t++ = ume_M[(w >> 6) & 0x3F];
847			*t++ = ume_M[w & 0x3F];
848		}
849		f += c;
850	}
851	if (s)
852		*t++ = '-';
853	*fn -= (n = (char*)f - (*fb));
854	*fb = (char*)f;
855	*tn -= (char*)t - (*tb);
856	*tb = (char*)t;
857	RETURN(e, n, fn);
858}
859
860/*
861 * convert ucs-2 to bin with no byte swap
862 */
863
864static size_t
865ucs2bin(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
866{
867	register unsigned char*		f;
868	register unsigned char*		fe;
869	register unsigned char*		t;
870	register unsigned char*		te;
871	register int			w;
872	size_t				n;
873	int				e;
874
875	e = 0;
876	f = (unsigned char*)(*fb);
877	fe = f + (*fn);
878	t = (unsigned char*)(*tb);
879	te = t + (*tn);
880	while (f < (fe - 1) && t < te)
881	{
882		w = *f++;
883		w = (w << 8) | *f++;
884		if (!(w & ~0xFF))
885			*t++ = w;
886		else if (t >= (te - 1))
887		{
888			f -= 2;
889			e = E2BIG;
890			break;
891		}
892		else
893		{
894			*t++ = (w >> 8) & 0xFF;
895			*t++ = w & 0xFF;
896		}
897	}
898	*fn -= (char*)f - (*fb);
899	*fb = (char*)f;
900	*tn -= (n = (char*)t - (*tb));
901	*tb = (char*)t;
902	RETURN(e, n, fn);
903}
904
905/*
906 * convert bin to ucs-2 with no byte swap
907 */
908
909static size_t
910bin2ucs(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
911{
912	register unsigned char*		f;
913	register unsigned char*		fe;
914	register unsigned char*		t;
915	register unsigned char*		te;
916	register int			c;
917	wchar_t				w;
918	size_t				n;
919	int				e;
920
921	e = 0;
922	f = (unsigned char*)(*fb);
923	fe = f + (*fn);
924	t = (unsigned char*)(*tb);
925	te = t + (*tn);
926	while (f < fe && t < (te - 1))
927	{
928		if (!mbwide())
929		{
930			c = 1;
931			w = *f;
932		}
933		if ((c = (*_ast_info.mb_towc)(&w, (char*)f, fe - f)) < 0)
934		{
935			e = EINVAL;
936			break;
937		}
938		else if (!c)
939			c = 1;
940		*t++ = (w >> 8) & 0xFF;
941		*t++ = w & 0xFF;
942		f += c;
943	}
944	*fn -= (n = (char*)f - (*fb));
945	*fb = (char*)f;
946	*tn -= (char*)t - (*tb);
947	*tb = (char*)t;
948	RETURN(e, n, fn);
949}
950
951/*
952 * convert ucs-2 to bin with byte swap
953 */
954
955static size_t
956scu2bin(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
957{
958	register unsigned char*		f;
959	register unsigned char*		fe;
960	register unsigned char*		t;
961	register unsigned char*		te;
962	register int			w;
963	size_t				n;
964	int				e;
965
966	e = 0;
967	f = (unsigned char*)(*fb);
968	fe = f + (*fn);
969	t = (unsigned char*)(*tb);
970	te = t + (*tn);
971	while (f < (fe - 1) && t < te)
972	{
973		w = *f++;
974		w = w | (*f++ << 8);
975		if (!(w & ~0xFF))
976			*t++ = w;
977		else if (t >= (te - 1))
978		{
979			f -= 2;
980			e = E2BIG;
981			break;
982		}
983		else
984		{
985			*t++ = (w >> 8) & 0xFF;
986			*t++ = w & 0xFF;
987		}
988	}
989	*fn -= (char*)f - (*fb);
990	*fb = (char*)f;
991	*tn -= (n = (char*)t - (*tb));
992	*tb = (char*)t;
993	RETURN(e, n, fn);
994}
995
996/*
997 * convert bin to ucs-2 with byte swap
998 */
999
1000static size_t
1001bin2scu(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
1002{
1003	register unsigned char*		f;
1004	register unsigned char*		fe;
1005	register unsigned char*		t;
1006	register unsigned char*		te;
1007	register int			c;
1008	wchar_t				w;
1009	size_t				n;
1010	int				e;
1011
1012	e = 0;
1013	f = (unsigned char*)(*fb);
1014	fe = f + (*fn);
1015	t = (unsigned char*)(*tb);
1016	te = t + (*tn);
1017	while (f < fe && t < (te - 1))
1018	{
1019		if (!mbwide())
1020		{
1021			c = 1;
1022			w = *f;
1023		}
1024		else if ((c = (*_ast_info.mb_towc)(&w, (char*)f, fe - f)) < 0)
1025		{
1026			e = EINVAL;
1027			break;
1028		}
1029		else if (!c)
1030			c = 1;
1031		*t++ = w & 0xFF;
1032		*t++ = (w >> 8) & 0xFF;
1033		f += c;
1034	}
1035	*fn -= (n = (char*)f - (*fb));
1036	*fb = (char*)f;
1037	*tn -= (char*)t - (*tb);
1038	*tb = (char*)t;
1039	RETURN(e, n, fn);
1040}
1041
1042/*
1043 * open a character code conversion map from f to t
1044 */
1045
1046_ast_iconv_t
1047_ast_iconv_open(const char* t, const char* f)
1048{
1049	register Conv_t*	cc;
1050	int			fc;
1051	int			tc;
1052	int			i;
1053
1054	char			fr[64];
1055	char			to[64];
1056
1057#if DEBUG_TRACE
1058error(DEBUG_TRACE, "AHA#%d _ast_iconv_open f=%s t=%s\n", __LINE__, f, t);
1059#endif
1060	if (!t || !*t || *t == '-' && !*(t + 1) || !strcasecmp(t, name_local) || !strcasecmp(t, name_native))
1061		t = name_native;
1062	if (!f || !*f || *f == '-' && !*(f + 1) || !strcasecmp(t, name_local) || !strcasecmp(f, name_native))
1063		f = name_native;
1064
1065	/*
1066	 * the ast identify is always (iconv_t)(0)
1067	 */
1068
1069	if (t == f)
1070		return (iconv_t)(0);
1071	fc = _ast_iconv_name(f, fr, sizeof(fr));
1072	tc = _ast_iconv_name(t, to, sizeof(to));
1073#if DEBUG_TRACE
1074error(DEBUG_TRACE, "AHA#%d _ast_iconv_open f=%s:%s:%d t=%s:%s:%d\n", __LINE__, f, fr, fc, t, to, tc);
1075#endif
1076	if (fc != CC_ICONV && fc == tc || streq(fr, to))
1077		return (iconv_t)(0);
1078
1079	/*
1080	 * first check the free list
1081	 */
1082
1083	for (i = 0; i < elementsof(freelist); i++)
1084		if ((cc = freelist[i]) && streq(to, cc->to.name) && streq(fr, cc->from.name))
1085		{
1086			freelist[i] = 0;
1087#if _lib_iconv_open
1088			/*
1089			 * reset the shift state if any
1090			 */
1091
1092			if (cc->cvt != (iconv_t)(-1))
1093				iconv(cc->cvt, NiL, NiL, NiL, NiL);
1094#endif
1095			return cc;
1096		}
1097
1098	/*
1099	 * allocate a new one
1100	 */
1101
1102	if (!(cc = newof(0, Conv_t, 1, strlen(to) + strlen(fr) + 2)))
1103		return (iconv_t)(-1);
1104	cc->to.name = (char*)(cc + 1);
1105	cc->from.name = strcopy(cc->to.name, to) + 1;
1106	strcpy(cc->from.name, fr);
1107	cc->cvt = (iconv_t)(-1);
1108
1109	/*
1110	 * 8 bit maps are the easiest
1111	 */
1112
1113	if (fc >= 0 && tc >= 0)
1114		cc->from.map = ccmap(fc, tc);
1115#if _lib_iconv_open
1116	else if ((cc->cvt = iconv_open(t, f)) != (iconv_t)(-1) || (cc->cvt = iconv_open(to, fr)) != (iconv_t)(-1))
1117		cc->from.fun = (_ast_iconv_f)iconv;
1118#endif
1119#if _UWIN
1120	else if ((cc->cvt = _win_iconv_open(cc, t, f)) != (_ast_iconv_t)(-1) || (cc->cvt = _win_iconv_open(cc, to, fr)) != (_ast_iconv_t)(-1))
1121		cc->from.fun = (_ast_iconv_f)_win_iconv;
1122#endif
1123	else
1124	{
1125		switch (fc)
1126		{
1127		case CC_UTF:
1128			cc->from.fun = utf2bin;
1129			break;
1130		case CC_UME:
1131			cc->from.fun = ume2bin;
1132			break;
1133		case CC_UCS:
1134			cc->from.fun = ucs2bin;
1135			break;
1136		case CC_SCU:
1137			cc->from.fun = scu2bin;
1138			break;
1139		case CC_ASCII:
1140			break;
1141		default:
1142			if (fc < 0)
1143				goto nope;
1144			cc->from.map = ccmap(fc, CC_ASCII);
1145			break;
1146		}
1147		switch (tc)
1148		{
1149		case CC_UTF:
1150			cc->to.fun = bin2utf;
1151			break;
1152		case CC_UME:
1153			cc->to.fun = bin2ume;
1154			break;
1155		case CC_UCS:
1156			cc->to.fun = bin2ucs;
1157			break;
1158		case CC_SCU:
1159			cc->to.fun = bin2scu;
1160			break;
1161		case CC_ASCII:
1162			break;
1163		default:
1164			if (tc < 0)
1165				goto nope;
1166			cc->to.map = ccmap(CC_ASCII, tc);
1167			break;
1168		}
1169	}
1170	return (iconv_t)cc;
1171 nope:
1172	return (iconv_t)(-1);
1173}
1174
1175/*
1176 * close a character code conversion map
1177 */
1178
1179int
1180_ast_iconv_close(_ast_iconv_t cd)
1181{
1182	Conv_t*	cc;
1183	Conv_t*	oc;
1184	int	i;
1185	int	r = 0;
1186
1187	if (cd == (_ast_iconv_t)(-1))
1188		return -1;
1189	if (!(cc = (Conv_t*)cd))
1190		return 0;
1191
1192	/*
1193	 * add to the free list
1194	 */
1195
1196	i = freeindex;
1197	for (;;)
1198	{
1199		if (++ i >= elementsof(freelist))
1200			i = 0;
1201		if (!freelist[i])
1202			break;
1203		if (i == freeindex)
1204		{
1205			if (++ i >= elementsof(freelist))
1206				i = 0;
1207
1208			/*
1209			 * close the oldest
1210			 */
1211
1212			if (oc = freelist[i])
1213			{
1214#if _lib_iconv_open
1215				if (oc->cvt != (iconv_t)(-1))
1216					r = iconv_close(oc->cvt);
1217#endif
1218				if (oc->buf)
1219					free(oc->buf);
1220				free(oc);
1221			}
1222			break;
1223		}
1224	}
1225	freelist[freeindex = i] = cc;
1226	return r;
1227}
1228
1229/*
1230 * copy *fb size *fn to *tb size *tn
1231 * fb,fn tb,tn updated on return
1232 */
1233
1234size_t
1235_ast_iconv(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
1236{
1237	Conv_t*				cc = (Conv_t*)cd;
1238	register unsigned char*		f;
1239	register unsigned char*		t;
1240	register unsigned char*		e;
1241	register const unsigned char*	m;
1242	register size_t			n;
1243	char*				b;
1244	char*				tfb;
1245	size_t				tfn;
1246	size_t				i;
1247
1248	if (!fb || !*fb)
1249	{
1250		/* TODO: reset to the initial state */
1251		if (!tb || !*tb)
1252			return 0;
1253		/* TODO: write the initial state shift sequence */
1254		return 0;
1255	}
1256	n = *tn;
1257	if (cc)
1258	{
1259		if (cc->from.fun)
1260		{
1261			if (cc->to.fun)
1262			{
1263				if (!cc->buf && !(cc->buf = oldof(0, char, cc->size = SF_BUFSIZE, 0)))
1264				{
1265					errno = ENOMEM;
1266					return -1;
1267				}
1268				b = cc->buf;
1269				i = cc->size;
1270				tfb = *fb;
1271				tfn = *fn;
1272				if ((*cc->from.fun)(cc->cvt, &tfb, &tfn, &b, &i) == (size_t)(-1))
1273					return -1;
1274				tfn = b - cc->buf;
1275				tfb = cc->buf;
1276				n = (*cc->to.fun)(cc->cvt, &tfb, &tfn, tb, tn);
1277				i = tfb - cc->buf;
1278				*fb += i;
1279				*fn -= i;
1280				return n;
1281			}
1282			if ((*cc->from.fun)(cc->cvt, fb, fn, tb, tn) == (size_t)(-1))
1283				return -1;
1284			n -= *tn;
1285			if (m = cc->to.map)
1286			{
1287				e = (unsigned char*)(*tb);
1288				for (t = e - n; t < e; t++)
1289					*t = m[*t];
1290			}
1291			return n;
1292		}
1293		else if (cc->to.fun)
1294		{
1295			if (!(m = cc->from.map))
1296				return (*cc->to.fun)(cc->cvt, fb, fn, tb, tn);
1297			if (!cc->buf && !(cc->buf = oldof(0, char, cc->size = SF_BUFSIZE, 0)))
1298			{
1299				errno = ENOMEM;
1300				return -1;
1301			}
1302			if ((n = *fn) > cc->size)
1303				n = cc->size;
1304			f = (unsigned char*)(*fb);
1305			e = f + n;
1306			t = (unsigned char*)(b = cc->buf);
1307			while (f < e)
1308				*t++ = m[*f++];
1309			n = (*cc->to.fun)(cc->cvt, &b, fn, tb, tn);
1310			*fb += b - cc->buf;
1311			return n;
1312		}
1313	}
1314	if (n > *fn)
1315		n = *fn;
1316	if (cc && (m = cc->from.map))
1317	{
1318		f = (unsigned char*)(*fb);
1319		e = f + n;
1320		t = (unsigned char*)(*tb);
1321		while (f < e)
1322			*t++ = m[*f++];
1323	}
1324	else
1325		memcpy(*tb, *fb, n);
1326	*fb += n;
1327	*fn -= n;
1328	*tb += n;
1329	*tn -= n;
1330	return n;
1331}
1332
1333#define OK		((size_t)-1)
1334
1335/*
1336 * write *fb size *fn to op
1337 * fb,fn updated on return
1338 * total bytes written to op returned
1339 */
1340
1341ssize_t
1342_ast_iconv_write(_ast_iconv_t cd, Sfio_t* op, char** fb, size_t* fn, Iconv_disc_t* disc)
1343{
1344	char*		fo = *fb;
1345	char*		tb;
1346	char*		ts;
1347	size_t*		e;
1348	size_t		tn;
1349	size_t		r;
1350	int		ok;
1351	Iconv_disc_t	compat;
1352
1353	/*
1354	 * the old api had optional size_t* instead of Iconv_disc_t*
1355	 */
1356
1357	if (!disc || disc->version < 20110101L || disc->version >= 30000101L)
1358	{
1359		e = (size_t*)disc;
1360		disc = &compat;
1361		iconv_init(disc, 0);
1362	}
1363	else
1364		e = 0;
1365	r = 0;
1366	tn = 0;
1367	ok = 1;
1368	while (ok && *fn > 0)
1369	{
1370		if (!(tb = (char*)sfreserve(op, -(tn + 1), SF_WRITE|SF_LOCKR)) || !(tn = sfvalue(op)))
1371		{
1372			if (!r)
1373				r = -1;
1374			break;
1375		}
1376		ts = tb;
1377#if DEBUG_TRACE
1378error(DEBUG_TRACE, "AHA#%d iconv_write ts=%p tn=%d", __LINE__, ts, tn);
1379		for (;;)
1380#else
1381		while (*fn > 0 && _ast_iconv(cd, fb, fn, &ts, &tn) == (size_t)(-1))
1382#endif
1383		{
1384#if DEBUG_TRACE
1385			ssize_t	_r;
1386error(DEBUG_TRACE, "AHA#%d iconv_write %d => %d `%-.*s'", __LINE__, *fn, tn, *fn, *fb);
1387			_r = _ast_iconv(cd, fb, fn, &ts, &tn);
1388error(DEBUG_TRACE, "AHA#%d iconv_write %d => %d [%d]", __LINE__, *fn, tn, _r);
1389			if (_r != (size_t)(-1) || !fn)
1390				break;
1391#endif
1392			switch (errno)
1393			{
1394			case E2BIG:
1395				break;
1396			case EINVAL:
1397				if (disc->errorf)
1398					(*disc->errorf)(NiL, disc, ERROR_SYSTEM|2, "incomplete multibyte sequence at offset %I*u", sizeof(fo), *fb - fo);
1399				goto bad;
1400			default:
1401				if (disc->errorf)
1402					(*disc->errorf)(NiL, disc, ERROR_SYSTEM|2, "invalid multibyte sequence at offset %I*u", sizeof(fo), *fb - fo);
1403			bad:
1404				disc->errors++;
1405				if (!(disc->flags & ICONV_FATAL))
1406				{
1407					if (!(disc->flags & ICONV_OMIT) && tn > 0)
1408					{
1409						*ts++ = (disc->fill >= 0) ? disc->fill : **fb;
1410						tn--;
1411					}
1412					(*fb)++;
1413					(*fn)--;
1414					continue;
1415				}
1416				ok = 0;
1417				break;
1418			}
1419			break;
1420		}
1421#if DEBUG_TRACE
1422error(DEBUG_TRACE, "AHA#%d iconv_write %d", __LINE__, ts - tb);
1423#endif
1424		sfwrite(op, tb, ts - tb);
1425		r += ts - tb;
1426	}
1427	if (e)
1428		*e = disc->errors;
1429	return r;
1430}
1431
1432/*
1433 * move n bytes from ip to op
1434 */
1435
1436ssize_t
1437_ast_iconv_move(_ast_iconv_t cd, Sfio_t* ip, Sfio_t* op, size_t n, Iconv_disc_t* disc)
1438{
1439	char*		fb;
1440	char*		fs;
1441	char*		tb;
1442	char*		ts;
1443	size_t*		e;
1444	size_t		fe;
1445	size_t		fn;
1446	size_t		fo;
1447	size_t		ft;
1448	size_t		tn;
1449	size_t		i;
1450	ssize_t		r = 0;
1451	int		ok = 1;
1452	int		locked;
1453	Iconv_disc_t	compat;
1454
1455	/*
1456	 * the old api had optional size_t* instead of Iconv_disc_t*
1457	 */
1458
1459	if (!disc || disc->version < 20110101L || disc->version >= 30000101L)
1460	{
1461		e = (size_t*)disc;
1462		disc = &compat;
1463		iconv_init(disc, 0);
1464	}
1465	else
1466		e = 0;
1467	tb = 0;
1468	fe = OK;
1469	ft = 0;
1470	fn = n;
1471	do
1472	{
1473		if (n != SF_UNBOUND)
1474			n = -((ssize_t)(n & (((size_t)(~0))>>1)));
1475		if ((!(fb = (char*)sfreserve(ip, n, locked = SF_LOCKR)) || !(fo = sfvalue(ip))) &&
1476		    (!(fb = (char*)sfreserve(ip, n, locked = 0)) || !(fo = sfvalue(ip))))
1477			break;
1478		fs = fb;
1479		fn = fo;
1480		if (!(tb = (char*)sfreserve(op, SF_UNBOUND, SF_WRITE|SF_LOCKR)))
1481		{
1482			if (!r)
1483				r = -1;
1484			break;
1485		}
1486		ts = tb;
1487		tn = sfvalue(op);
1488		while (fn > 0 && _ast_iconv(cd, &fs, &fn, &ts, &tn) == (size_t)(-1))
1489		{
1490			switch (errno)
1491			{
1492			case E2BIG:
1493				break;
1494			case EINVAL:
1495				if (fe == ft + (fo - fn))
1496				{
1497					fe = OK;
1498					if (disc->errorf)
1499						(*disc->errorf)(NiL, disc, ERROR_SYSTEM|2, "incomplete multibyte sequence at offset %I*u", sizeof(ft), ft + (fo - fn));
1500					goto bad;
1501				}
1502				fe = ft;
1503				break;
1504			default:
1505				if (disc->errorf)
1506					(*disc->errorf)(NiL, disc, ERROR_SYSTEM|2, "invalid multibyte sequence at offset %I*u", sizeof(ft), ft + (fo - fn));
1507			bad:
1508				disc->errors++;
1509				if (!(disc->flags & ICONV_FATAL))
1510				{
1511					if (!(disc->flags & ICONV_OMIT) && tn > 0)
1512					{
1513						*ts++ = (disc->fill >= 0) ? disc->fill : *fs;
1514						tn--;
1515					}
1516					fs++;
1517					fn--;
1518					continue;
1519				}
1520				ok = 0;
1521				break;
1522			}
1523			break;
1524		}
1525		sfwrite(op, tb, ts - tb);
1526		r += ts - tb;
1527		ts = tb;
1528		if (locked)
1529			sfread(ip, fb, fs - fb);
1530		else
1531			for (i = fn; --i >= (fs - fb);)
1532				sfungetc(ip, fb[i]);
1533		if (n != SF_UNBOUND)
1534		{
1535			if (n <= (fs - fb))
1536				break;
1537			n -= fs - fb;
1538		}
1539		ft += (fs - fb);
1540		if (fn == fo)
1541			fn++;
1542	} while (ok);
1543	if (fb && locked)
1544		sfread(ip, fb, 0);
1545	if (tb)
1546	{
1547		sfwrite(op, tb, 0);
1548		if (ts > tb)
1549		{
1550			sfwrite(op, tb, ts - tb);
1551			r += ts - tb;
1552		}
1553	}
1554	if (e)
1555		*e = disc->errors;
1556	return r;
1557}
1558
1559/*
1560 * iconv_list_t iterator
1561 * call with arg 0 to start
1562 * prev return value is current arg
1563 */
1564
1565_ast_iconv_list_t*
1566_ast_iconv_list(_ast_iconv_list_t* cp)
1567{
1568#if _UWIN
1569	struct dirent*	ent;
1570
1571	if (!cp)
1572	{
1573		if (!(cp = newof(0, _ast_iconv_list_t, 1, 0)))
1574			return ccmaplist(NiL);
1575		if (!(cp->data = opendir(_win_maps)))
1576		{
1577			free(cp);
1578			return ccmaplist(NiL);
1579		}
1580	}
1581	if (cp->data)
1582	{
1583		if (ent = readdir((DIR*)cp->data))
1584		{
1585			cp->name = cp->match = cp->desc = (const char*)ent->d_name;
1586			return cp;
1587		}
1588		closedir((DIR*)cp->data);
1589		free(cp);
1590		return ccmaplist(NiL);
1591	}
1592#else
1593	if (!cp)
1594		return ccmaplist(NiL);
1595#endif
1596	if (cp->ccode >= 0)
1597		return (cp = ccmaplist(cp)) ? cp : (_ast_iconv_list_t*)codes;
1598	return (++cp)->name ? cp : (_ast_iconv_list_t*)0;
1599}
1600