1/*
2   Unix SMB/CIFS implementation.
3   minimal iconv implementation
4   Copyright (C) Andrew Tridgell 2001
5   Copyright (C) Jelmer Vernooij 2002,2003
6
7   This program is free software; you can redistribute it and/or modify
8   it under the terms of the GNU General Public License as published by
9   the Free Software Foundation; either version 3 of the License, or
10   (at your option) any later version.
11
12   This program is distributed in the hope that it will be useful,
13   but WITHOUT ANY WARRANTY; without even the implied warranty of
14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   GNU General Public License for more details.
16
17   You should have received a copy of the GNU General Public License
18   along with this program.  If not, see <http://www.gnu.org/licenses/>.
19*/
20
21#include "includes.h"
22
23/*
24 * We have to use strcasecmp here as the character conversions
25 * haven't been initialised yet. JRA.
26 */
27
28#undef strcasecmp
29
30/**
31 * @file
32 *
33 * @brief Samba wrapper/stub for iconv character set conversion.
34 *
35 * iconv is the XPG2 interface for converting between character
36 * encodings.  This file provides a Samba wrapper around it, and also
37 * a simple reimplementation that is used if the system does not
38 * implement iconv.
39 *
40 * Samba only works with encodings that are supersets of ASCII: ascii
41 * characters like whitespace can be tested for directly, multibyte
42 * sequences start with a byte with the high bit set, and strings are
43 * terminated by a nul byte.
44 *
45 * Note that the only function provided by iconv is conversion between
46 * characters.  It doesn't directly support operations like
47 * uppercasing or comparison.  We have to convert to UCS-2 and compare
48 * there.
49 *
50 * @sa Samba Developers Guide
51 **/
52
/* NOTE(review): static_decl_charset is a macro defined outside this file;
 * presumably it declares the init hooks of statically linked charset
 * modules -- confirm against the build configuration. */
static_decl_charset;

/*
 * Converters implemented further down in this file.  They all share one
 * signature, which mirrors iconv(): a pointer to the current input
 * position and remaining input bytes, and a pointer to the current output
 * position and remaining output bytes.
 */
static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
			  char **outbuf, size_t *outbytesleft);
static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft);
static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft);
static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft);
static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft);
static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
64
65static struct charset_functions builtin_functions[] = {
66	/* windows is really neither UCS-2 not UTF-16 */
67	{"UCS-2LE",  iconv_copy, iconv_copy},
68	{"UTF-16LE",  iconv_copy, iconv_copy},
69	{"UCS-2BE",  iconv_swab, iconv_swab},
70	{"UTF-16BE",  iconv_swab, iconv_swab},
71
72	/* we include the UTF-8 alias to cope with differing locale settings */
73	{"UTF8",   utf8_pull,  utf8_push},
74	{"UTF-8",   utf8_pull,  utf8_push},
75	{"ASCII", ascii_pull, ascii_push},
76	{"646", ascii_pull, ascii_push},
77	{"ISO-8859-1", ascii_pull, latin1_push},
78	{"UCS2-HEX", ucs2hex_pull, ucs2hex_push},
79	{NULL, NULL, NULL}
80};
81
82static struct charset_functions *charsets = NULL;
83
84static struct charset_functions *find_charset_functions(const char *name)
85{
86	struct charset_functions *c = charsets;
87
88	while(c) {
89		if (strcasecmp(name, c->name) == 0) {
90			return c;
91		}
92		c = c->next;
93	}
94
95	return NULL;
96}
97
98NTSTATUS smb_register_charset(struct charset_functions *funcs)
99{
100	if (!funcs) {
101		return NT_STATUS_INVALID_PARAMETER;
102	}
103
104	DEBUG(5, ("Attempting to register new charset %s\n", funcs->name));
105	/* Check whether we already have this charset... */
106	if (find_charset_functions(funcs->name)) {
107		DEBUG(0, ("Duplicate charset %s, not registering\n", funcs->name));
108		return NT_STATUS_OBJECT_NAME_COLLISION;
109	}
110
111	funcs->next = funcs->prev = NULL;
112	DEBUG(5, ("Registered charset %s\n", funcs->name));
113	DLIST_ADD(charsets, funcs);
114	return NT_STATUS_OK;
115}
116
117static void lazy_initialize_iconv(void)
118{
119	static bool initialized;
120	int i;
121
122	if (!initialized) {
123		initialized = True;
124		for(i = 0; builtin_functions[i].name; i++)
125			smb_register_charset(&builtin_functions[i]);
126		static_init_charset;
127	}
128}
129
130#ifdef HAVE_NATIVE_ICONV
/*
 * Wrapper around the system iconv().
 *
 * When the conversion fails, the descriptor is reset to its initial
 * state so that no shift state is left behind for character sets like
 * SJIS.  errno from the failed conversion is preserved across that
 * reset.  The return value is whatever iconv() returned.
 */
static size_t sys_iconv(void *cd,
			const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft)
{
	size_t converted;
	int conv_errno;

	converted = iconv((iconv_t)cd,
			  (void *)inbuf, inbytesleft,
			  outbuf, outbytesleft);
	if (converted != (size_t)-1) {
		return converted;
	}

	/* the reset call may clobber errno, so save and restore it */
	conv_errno = errno;
	iconv(cd, NULL, NULL, NULL, NULL);
	errno = conv_errno;

	return converted;
}
148#endif
149
150/**
151 * This is a simple portable iconv() implementaion.
152 *
153 * It only knows about a very small number of character sets - just
154 * enough that Samba works on systems that don't have iconv.
155 **/
156size_t smb_iconv(smb_iconv_t cd,
157		 const char **inbuf, size_t *inbytesleft,
158		 char **outbuf, size_t *outbytesleft)
159{
160	char cvtbuf[2048];
161	char *bufp = cvtbuf;
162	size_t bufsize;
163
164	/* in many cases we can go direct */
165	if (cd->direct) {
166		return cd->direct(cd->cd_direct,
167				  inbuf, inbytesleft, outbuf, outbytesleft);
168	}
169
170
171	/* otherwise we have to do it chunks at a time */
172	while (*inbytesleft > 0) {
173		bufp = cvtbuf;
174		bufsize = sizeof(cvtbuf);
175
176		if (cd->pull(cd->cd_pull,
177			     inbuf, inbytesleft, &bufp, &bufsize) == -1
178		    && errno != E2BIG) return -1;
179
180		bufp = cvtbuf;
181		bufsize = sizeof(cvtbuf) - bufsize;
182
183		if (cd->push(cd->cd_push,
184			     (const char **)&bufp, &bufsize,
185			     outbuf, outbytesleft) == -1) return -1;
186	}
187
188	return 0;
189}
190
191
/*
 * Report whether name is one of the two little-endian two-byte charset
 * names, "UCS-2LE" or "UTF-16LE".  The comparison ignores case.
 */
static bool is_utf16(const char *name)
{
	bool is_ucs2le = (strcasecmp(name, "UCS-2LE") == 0);

	return is_ucs2le || strcasecmp(name, "UTF-16LE") == 0;
}
197
198/*
199  simple iconv_open() wrapper
200 */
201smb_iconv_t smb_iconv_open(const char *tocode, const char *fromcode)
202{
203	smb_iconv_t ret;
204	struct charset_functions *from, *to;
205
206	lazy_initialize_iconv();
207	from = charsets;
208	to = charsets;
209
210	ret = SMB_MALLOC_P(struct smb_iconv_s);
211	if (!ret) {
212		errno = ENOMEM;
213		return (smb_iconv_t)-1;
214	}
215	memset(ret, 0, sizeof(struct smb_iconv_s));
216
217	ret->from_name = SMB_STRDUP(fromcode);
218	ret->to_name = SMB_STRDUP(tocode);
219
220	/* check for the simplest null conversion */
221	if (strcasecmp(fromcode, tocode) == 0) {
222		ret->direct = iconv_copy;
223		return ret;
224	}
225
226	/* check if we have a builtin function for this conversion */
227	from = find_charset_functions(fromcode);
228	if(from)ret->pull = from->pull;
229
230	to = find_charset_functions(tocode);
231	if(to)ret->push = to->push;
232
233	/* check if we can use iconv for this conversion */
234#ifdef HAVE_NATIVE_ICONV
235	if (!ret->pull) {
236		ret->cd_pull = iconv_open("UTF-16LE", fromcode);
237		if (ret->cd_pull == (iconv_t)-1)
238			ret->cd_pull = iconv_open("UCS-2LE", fromcode);
239		if (ret->cd_pull != (iconv_t)-1)
240			ret->pull = sys_iconv;
241	}
242
243	if (!ret->push) {
244		ret->cd_push = iconv_open(tocode, "UTF-16LE");
245		if (ret->cd_push == (iconv_t)-1)
246			ret->cd_push = iconv_open(tocode, "UCS-2LE");
247		if (ret->cd_push != (iconv_t)-1)
248			ret->push = sys_iconv;
249	}
250#endif
251
252	/* check if there is a module available that can do this conversion */
253	if (!ret->pull && NT_STATUS_IS_OK(smb_probe_module("charset", fromcode))) {
254		if(!(from = find_charset_functions(fromcode)))
255			DEBUG(0, ("Module %s doesn't provide charset %s!\n", fromcode, fromcode));
256		else
257			ret->pull = from->pull;
258	}
259
260	if (!ret->push && NT_STATUS_IS_OK(smb_probe_module("charset", tocode))) {
261		if(!(to = find_charset_functions(tocode)))
262			DEBUG(0, ("Module %s doesn't provide charset %s!\n", tocode, tocode));
263		else
264			ret->push = to->push;
265	}
266
267	if (!ret->push || !ret->pull) {
268		SAFE_FREE(ret->from_name);
269		SAFE_FREE(ret->to_name);
270		SAFE_FREE(ret);
271		errno = EINVAL;
272		return (smb_iconv_t)-1;
273	}
274
275	/* check for conversion to/from ucs2 */
276	if (is_utf16(fromcode) && to) {
277		ret->direct = to->push;
278		ret->push = ret->pull = NULL;
279		return ret;
280	}
281
282	if (is_utf16(tocode) && from) {
283		ret->direct = from->pull;
284		ret->push = ret->pull = NULL;
285		return ret;
286	}
287
288	/* Check if we can do the conversion direct */
289#ifdef HAVE_NATIVE_ICONV
290	if (is_utf16(fromcode)) {
291		ret->direct = sys_iconv;
292		ret->cd_direct = ret->cd_push;
293		ret->cd_push = NULL;
294		return ret;
295	}
296	if (is_utf16(tocode)) {
297		ret->direct = sys_iconv;
298		ret->cd_direct = ret->cd_pull;
299		ret->cd_pull = NULL;
300		return ret;
301	}
302#endif
303
304	return ret;
305}
306
307/*
308  simple iconv_close() wrapper
309*/
310int smb_iconv_close (smb_iconv_t cd)
311{
312#ifdef HAVE_NATIVE_ICONV
313	if (cd->cd_direct) iconv_close((iconv_t)cd->cd_direct);
314	if (cd->cd_pull) iconv_close((iconv_t)cd->cd_pull);
315	if (cd->cd_push) iconv_close((iconv_t)cd->cd_push);
316#endif
317
318	SAFE_FREE(cd->from_name);
319	SAFE_FREE(cd->to_name);
320
321	memset(cd, 0, sizeof(*cd));
322	SAFE_FREE(cd);
323	return 0;
324}
325
326
/**********************************************************************
 The following functions implement the builtin character sets in Samba
 and also the "test" character sets that are designed to test
 multi-byte character set support for English users.
***********************************************************************/
332
/*
 * Widen single bytes to two-byte little-endian units: each input byte
 * becomes the low byte of a unit whose high byte is zero.
 *
 * cd is unused.  The four in/out arguments are advanced past whatever
 * was converted.  Returns 0 when all input was consumed, otherwise
 * (size_t)-1 with errno E2BIG (output space ran out).
 */
static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	const char *src = *inbuf;
	char *dst = *outbuf;
	size_t src_left = *inbytesleft;
	size_t dst_left = *outbytesleft;

	for (; src_left >= 1 && dst_left >= 2; src_left -= 1, dst_left -= 2) {
		*dst++ = *src++;
		*dst++ = 0;
	}

	*inbuf = src;
	*outbuf = dst;
	*inbytesleft = src_left;
	*outbytesleft = dst_left;

	if (src_left > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
352
/*
 * Narrow two-byte little-endian units to 7-bit bytes: the low byte of
 * each unit is written with its top bit cleared, the high byte is
 * dropped.
 *
 * cd is unused.  The four in/out arguments are advanced past whatever
 * was converted.  Returns the number of units whose high byte was
 * non-zero, or (size_t)-1 with errno EINVAL (a single trailing input
 * byte is left) or E2BIG (output space ran out).
 */
static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	const char *src = *inbuf;
	char *dst = *outbuf;
	size_t src_left = *inbytesleft;
	size_t dst_left = *outbytesleft;
	int irreversible = 0;

	while (src_left >= 2 && dst_left >= 1) {
		*dst = src[0] & 0x7F;
		if (src[1] != 0) {
			irreversible++;
		}
		src += 2;
		dst += 1;
		src_left -= 2;
		dst_left -= 1;
	}

	*inbuf = src;
	*outbuf = dst;
	*inbytesleft = src_left;
	*outbytesleft = dst_left;

	if (src_left == 1) {
		errno = EINVAL;
		return -1;
	}
	if (src_left > 1) {
		errno = E2BIG;
		return -1;
	}

	return irreversible;
}
379
/*
 * Narrow two-byte little-endian units to single bytes: the low byte of
 * each unit is copied unchanged, the high byte is dropped.
 *
 * cd is unused.  The four in/out arguments are advanced past whatever
 * was converted.  Returns the number of units whose high byte was
 * non-zero, or (size_t)-1 with errno EINVAL (a single trailing input
 * byte is left) or E2BIG (output space ran out).
 */
static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	const char *src = *inbuf;
	char *dst = *outbuf;
	size_t src_left = *inbytesleft;
	size_t dst_left = *outbytesleft;
	int irreversible = 0;

	while (src_left >= 2 && dst_left >= 1) {
		*dst = src[0];
		if (src[1] != 0) {
			irreversible++;
		}
		src += 2;
		dst += 1;
		src_left -= 2;
		dst_left -= 1;
	}

	*inbuf = src;
	*outbuf = dst;
	*inbytesleft = src_left;
	*outbytesleft = dst_left;

	if (src_left == 1) {
		errno = EINVAL;
		return -1;
	}
	if (src_left > 1) {
		errno = E2BIG;
		return -1;
	}

	return irreversible;
}
406
/* Value of one hexadecimal digit (either case), or -1 if ch is not one. */
static int ucs2hex_digit_value(char ch)
{
	if (ch >= '0' && ch <= '9') {
		return ch - '0';
	}
	if (ch >= 'a' && ch <= 'f') {
		return ch - 'a' + 10;
	}
	if (ch >= 'A' && ch <= 'F') {
		return ch - 'A' + 10;
	}
	return -1;
}

/*
 * Decode the "UCS2-HEX" form into two-byte little-endian units.
 *
 * Any byte other than '@' is widened to a unit with a zero high byte.
 * '@' must be followed by exactly four hexadecimal digits, which give
 * the 16-bit value of the unit.
 *
 * The digits are read one at a time from the counted input, so nothing
 * beyond *inbytesleft is touched and the input need not be
 * NUL-terminated.
 *
 * cd is unused.  The four in/out arguments are advanced past whatever
 * was converted.  Returns 0 when all input was consumed, otherwise
 * (size_t)-1 with errno EINVAL (fewer than five bytes left at an '@'),
 * EILSEQ ('@' not followed by four hex digits) or E2BIG (output space
 * ran out).
 */
static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 1 && *outbytesleft >= 2) {
		unsigned v = 0;
		int i;

		if ((*inbuf)[0] != '@') {
			/* plain byte: pass it through as the low byte */
			(*outbuf)[0] = (*inbuf)[0];
			(*outbuf)[1] = 0;
			(*inbytesleft)  -= 1;
			(*outbytesleft) -= 2;
			(*inbuf)  += 1;
			(*outbuf) += 2;
			continue;
		}

		/* it's a hex character: '@' plus four digits */
		if (*inbytesleft < 5) {
			errno = EINVAL;
			return -1;
		}

		for (i = 1; i <= 4; i++) {
			int digit = ucs2hex_digit_value((*inbuf)[i]);

			if (digit < 0) {
				errno = EILSEQ;
				return -1;
			}
			v = (v << 4) | (unsigned)digit;
		}

		(*outbuf)[0] = v & 0xff;
		(*outbuf)[1] = v >> 8;
		(*inbytesleft)  -= 5;
		(*outbytesleft) -= 2;
		(*inbuf)  += 5;
		(*outbuf) += 2;
	}

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
449
/*
 * Encode two-byte little-endian units into the "UCS2-HEX" form.
 *
 * A unit with a zero high byte and a 7-bit low byte other than '@' is
 * written as that single byte.  Every other unit is written as '@'
 * followed by four lower-case hexadecimal digits.
 *
 * cd is unused.  The four in/out arguments are advanced past whatever
 * was converted.  Returns 0 when all input was consumed, otherwise
 * (size_t)-1 with errno EINVAL (a single trailing input byte is left)
 * or E2BIG (output space ran out).
 */
static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 2 && *outbytesleft >= 1) {
		const char *unit = *inbuf;
		size_t written;

		if (unit[1] != 0 || (unit[0] & 0x80) != 0 || unit[0] == '@') {
			char escape[6];

			/* the escaped form needs five output bytes */
			if (*outbytesleft < 5) {
				errno = E2BIG;
				return -1;
			}
			snprintf(escape, 6, "@%04x", SVAL(*inbuf, 0));
			memcpy(*outbuf, escape, 5);
			written = 5;
		} else {
			(*outbuf)[0] = unit[0];
			written = 1;
		}

		(*inbytesleft)  -= 2;
		(*outbytesleft) -= written;
		(*inbuf)  += 2;
		(*outbuf) += written;
	}

	if (*inbytesleft == 1) {
		errno = EINVAL;
		return -1;
	}

	if (*inbytesleft > 1) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
490
/*
 * Copy input to output with the two bytes of every pair exchanged
 * (conversion between the little- and big-endian two-byte forms).
 *
 * As many bytes as fit in both buffers are processed.  If that count is
 * odd, the final output byte is set to zero and the odd input byte is
 * still counted as consumed.
 *
 * Lengths are kept in size_t so that large buffers are not truncated.
 * Each pair is read completely before it is written, so converting a
 * buffer in place (*inbuf == *outbuf) is well defined.
 *
 * cd is unused.  The four in/out arguments are advanced past whatever
 * was processed.  Returns 0 when all input was consumed, otherwise
 * (size_t)-1 with errno E2BIG.
 */
static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	size_t n = (*inbytesleft < *outbytesleft) ? *inbytesleft : *outbytesleft;
	size_t paired = n & ~(size_t)1;
	size_t i;

	for (i = 0; i < paired; i += 2) {
		char first = (*inbuf)[i];
		char second = (*inbuf)[i + 1];

		(*outbuf)[i] = second;
		(*outbuf)[i + 1] = first;
	}
	if (n & 1) {
		(*outbuf)[n - 1] = 0;
	}

	(*inbytesleft) -= n;
	(*outbytesleft) -= n;
	(*inbuf) += n;
	(*outbuf) += n;

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
515
/*
 * Identity conversion: copy as many bytes as fit in both buffers.
 *
 * The byte count is kept in size_t so that large buffers are not
 * truncated.  memmove() is used, so overlapping buffers are allowed.
 *
 * cd is unused.  The four in/out arguments are advanced past whatever
 * was copied.  Returns 0 when all input was consumed, otherwise
 * (size_t)-1 with errno E2BIG.
 */
static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	size_t n = (*inbytesleft < *outbytesleft) ? *inbytesleft : *outbytesleft;

	memmove(*outbuf, *inbuf, n);

	(*inbytesleft) -= n;
	(*outbytesleft) -= n;
	(*inbuf) += n;
	(*outbuf) += n;

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
537
/*
 * Decode UTF-8 into two-byte little-endian UTF-16 units.
 *
 * One- to three-byte sequences produce one unit; four-byte sequences
 * produce a surrogate pair (two units).  Sequences that are not
 * minimally packed, four-byte values above 0x10ffff, and three-byte
 * sequences that decode to the surrogate range 0xd800..0xdfff are
 * rejected: a lone surrogate is not a character and must not appear in
 * the UTF-16 output.
 *
 * cd is unused.  On return, success or failure, the four in/out
 * arguments describe the position reached; on failure *inbuf points at
 * the sequence that could not be converted.
 *
 * Returns 0 when all input was consumed, otherwise (size_t)-1 with
 * errno EILSEQ (malformed, truncated or forbidden sequence), EINVAL
 * (lead byte that starts none of the supported sequence lengths) or
 * E2BIG (output space ran out).
 */
static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	size_t in_left=*inbytesleft, out_left=*outbytesleft;
	const unsigned char *c = (const unsigned char *)*inbuf;
	unsigned char *uc = (unsigned char *)*outbuf;

	while (in_left >= 1 && out_left >= 2) {
		unsigned int codepoint;

		if ((c[0] & 0x80) == 0) {
			/* single byte, 7-bit */
			uc[0] = c[0];
			uc[1] = 0;
			c  += 1;
			in_left  -= 1;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xe0) == 0xc0) {
			/* two byte sequence */
			if (in_left < 2 ||
			    (c[1] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto error;
			}
			codepoint = (c[1]&0x3f) | ((c[0]&0x1f)<<6);
			if (codepoint < 0x80) {
				/* don't accept UTF-8 characters that are not minimally packed */
				errno = EILSEQ;
				goto error;
			}
			uc[1] = codepoint >> 8;
			uc[0] = codepoint & 0xff;
			c  += 2;
			in_left  -= 2;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xf0) == 0xe0) {
			/* three byte sequence */
			if (in_left < 3 ||
			    (c[1] & 0xc0) != 0x80 ||
			    (c[2] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto error;
			}
			codepoint = (c[2]&0x3f) | ((c[1]&0x3f)<<6) | ((c[0]&0xf)<<12);
			if (codepoint < 0x800) {
				/* don't accept UTF-8 characters that are not minimally packed */
				errno = EILSEQ;
				goto error;
			}
			if (codepoint >= 0xd800 && codepoint <= 0xdfff) {
				/*
				 * half of a surrogate pair; characters above
				 * 0xffff must arrive as a four byte sequence
				 */
				errno = EILSEQ;
				goto error;
			}
			uc[1] = codepoint >> 8;
			uc[0] = codepoint & 0xff;
			c  += 3;
			in_left  -= 3;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xf8) == 0xf0) {
			/* four byte sequence, becomes a surrogate pair */
			if (in_left < 4 ||
			    (c[1] & 0xc0) != 0x80 ||
			    (c[2] & 0xc0) != 0x80 ||
			    (c[3] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto error;
			}
			codepoint =
				(c[3]&0x3f) |
				((c[2]&0x3f)<<6) |
				((c[1]&0x3f)<<12) |
				((c[0]&0x7)<<18);
			if (codepoint < 0x10000 || codepoint > 0x10ffff) {
				/* not minimally packed, or beyond the last valid character */
				errno = EILSEQ;
				goto error;
			}

			/* 20 bits remain: top 10 go in the first unit, low 10 in the second */
			codepoint -= 0x10000;

			if (out_left < 4) {
				errno = E2BIG;
				goto error;
			}

			uc[0] = (codepoint>>10) & 0xFF;
			uc[1] = (codepoint>>18) | 0xd8;
			uc[2] = codepoint & 0xFF;
			uc[3] = ((codepoint>>8) & 0x3) | 0xdc;
			c  += 4;
			in_left  -= 4;
			out_left -= 4;
			uc += 4;
			continue;
		}

		/* we don't handle 5 byte sequences */
		errno = EINVAL;
		goto error;
	}

	if (in_left > 0) {
		errno = E2BIG;
		goto error;
	}

	*inbytesleft = in_left;
	*outbytesleft = out_left;
	*inbuf = (const char *)c;
	*outbuf = (char *)uc;
	return 0;

error:
	/* report how far we got so the caller can see the failing position */
	*inbytesleft = in_left;
	*outbytesleft = out_left;
	*inbuf = (const char *)c;
	*outbuf = (char *)uc;
	return -1;
}
661
662static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
663			char **outbuf, size_t *outbytesleft)
664{
665	size_t in_left=*inbytesleft, out_left=*outbytesleft;
666	uint8 *c = (uint8 *)*outbuf;
667	const uint8 *uc = (const uint8 *)*inbuf;
668
669	while (in_left >= 2 && out_left >= 1) {
670		unsigned int codepoint;
671
672		if (uc[1] == 0 && !(uc[0] & 0x80)) {
673			/* simplest case */
674			c[0] = uc[0];
675			in_left  -= 2;
676			out_left -= 1;
677			uc += 2;
678			c  += 1;
679			continue;
680		}
681
682		if ((uc[1]&0xf8) == 0) {
683			/* next simplest case */
684			if (out_left < 2) {
685				errno = E2BIG;
686				goto error;
687			}
688			c[0] = 0xc0 | (uc[0]>>6) | (uc[1]<<2);
689			c[1] = 0x80 | (uc[0] & 0x3f);
690			in_left  -= 2;
691			out_left -= 2;
692			uc += 2;
693			c  += 2;
694			continue;
695		}
696
697		if ((uc[1] & 0xfc) == 0xdc) {
698			/* its the second part of a 4 byte sequence. Illegal */
699			if (in_left < 4) {
700				errno = EINVAL;
701			} else {
702				errno = EILSEQ;
703			}
704			goto error;
705		}
706
707		if ((uc[1] & 0xfc) != 0xd8) {
708			codepoint = uc[0] | (uc[1]<<8);
709			if (out_left < 3) {
710				errno = E2BIG;
711				goto error;
712			}
713			c[0] = 0xe0 | (codepoint >> 12);
714			c[1] = 0x80 | ((codepoint >> 6) & 0x3f);
715			c[2] = 0x80 | (codepoint & 0x3f);
716
717			in_left  -= 2;
718			out_left -= 3;
719			uc  += 2;
720			c   += 3;
721			continue;
722		}
723
724		/* its the first part of a 4 byte sequence */
725		if (in_left < 4) {
726			errno = EINVAL;
727			goto error;
728		}
729		if ((uc[3] & 0xfc) != 0xdc) {
730			errno = EILSEQ;
731			goto error;
732		}
733		codepoint = 0x10000 + (uc[2] | ((uc[3] & 0x3)<<8) |
734				       (uc[0]<<10) | ((uc[1] & 0x3)<<18));
735
736		if (out_left < 4) {
737			errno = E2BIG;
738			goto error;
739		}
740		c[0] = 0xf0 | (codepoint >> 18);
741		c[1] = 0x80 | ((codepoint >> 12) & 0x3f);
742		c[2] = 0x80 | ((codepoint >> 6) & 0x3f);
743		c[3] = 0x80 | (codepoint & 0x3f);
744
745		in_left  -= 4;
746		out_left -= 4;
747		uc       += 4;
748		c        += 4;
749	}
750
751	if (in_left == 1) {
752		errno = EINVAL;
753		goto error;
754	}
755
756	if (in_left > 1) {
757		errno = E2BIG;
758		goto error;
759	}
760
761	*inbytesleft = in_left;
762	*outbytesleft = out_left;
763	*inbuf  = (char *)uc;
764	*outbuf = (char *)c;
765
766	return 0;
767
768error:
769	*inbytesleft = in_left;
770	*outbytesleft = out_left;
771	*inbuf  = (char *)uc;
772	*outbuf = (char *)c;
773	return -1;
774}
775
776