• Home
  • History
  • Annotate
  • Line#
  • Navigate
  • Raw
  • Download
  • only in /netgear-WNDR4500v2-V1.0.0.60_1.0.38/ap/gpl/timemachine/netatalk-2.2.5/libatalk/unicode/
1/*
2   Unix SMB/CIFS implementation.
3   minimal iconv implementation
4   Copyright (C) Andrew Tridgell 2001
5   Copyright (C) Jelmer Vernooij 2002,2003
6
7   This program is free software; you can redistribute it and/or modify
8   it under the terms of the GNU General Public License as published by
9   the Free Software Foundation; either version 2 of the License, or
10   (at your option) any later version.
11
12   This program is distributed in the hope that it will be useful,
13   but WITHOUT ANY WARRANTY; without even the implied warranty of
14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   GNU General Public License for more details.
16
17   You should have received a copy of the GNU General Public License
18   along with this program; if not, write to the Free Software
19   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20
21   From samba 3.0 beta and GNU libiconv-1.8
22   It's bad but most of the time we can't use libc iconv service:
23   - it doesn't round trip for most encoding
24   - it doesn't know about Apple extension
25*/
26
27#ifdef HAVE_CONFIG_H
28#include "config.h"
29#endif /* HAVE_CONFIG_H */
30#include <stdlib.h>
31#include <errno.h>
32
33#include <netatalk/endian.h>
34#include <atalk/unicode.h>
35#include <atalk/logger.h>
36#include <atalk/unicode.h>
37#include "byteorder.h"
38
/* Given a trailing UTF-8 byte, get the contribution from it to
 * the Unicode scalar value for a particular bit shift amount.
 *
 * Only the low six payload bits of the byte are used.  Both arguments
 * are parenthesized in the expansion so that expression arguments
 * (e.g. "a | b") are evaluated as a unit.
 */
#define GETUCVAL(utf8_trailbyte,shift)  ((unsigned int) (((utf8_trailbyte) & 0x3F) << (shift)))
43
/* Given a unicode scalar, get a trail UTF-8 byte for a particular bit shift amount.
 *
 * The result is 0x80 combined with six bits of the scalar taken at the given
 * shift.  Both arguments are parenthesized in the expansion so that expression
 * arguments (e.g. "a | b") are shifted as a unit.
 */
#define GETUTF8TRAILBYTE(uc,shift)      ((char)( 0x80 | (((uc) >> (shift)) & 0x3F) ) )
46
47
48
/* Forward declarations of the two converters referenced by the charset tables. */
static size_t utf8_pull(void *cd, char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft);
static size_t utf8_push(void *cd, char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft);
51
52struct charset_functions charset_utf8 =
53{
54	"UTF8",
55	0x08000103,
56	utf8_pull,
57	utf8_push,
58	CHARSET_VOLUME | CHARSET_MULTIBYTE | CHARSET_PRECOMPOSED,
59	NULL,
60	NULL, NULL
61};
62
63struct charset_functions charset_utf8_mac =
64{
65	"UTF8-MAC",
66	0x08000103,
67	utf8_pull,
68	utf8_push,
69	CHARSET_VOLUME | CHARSET_CLIENT | CHARSET_MULTIBYTE | CHARSET_DECOMPOSED,
70	NULL,
71	NULL, NULL
72};
73
74/* ------------------- Convert from UTF-8 to UTF-16 -------------------*/
75static size_t utf8_pull(void *cd _U_, char **inbuf, size_t *inbytesleft,
76			 char **outbuf, size_t *outbytesleft)
77{
78	ucs2_t uc = 0;
79	unsigned int codepoint;
80	int len;
81
82	while (*inbytesleft >= 1 && *outbytesleft >= 2) {
83		unsigned char *c = (unsigned char *)*inbuf;
84		len = 1;
85
86		/* Arrange conditionals in the order of most frequent occurrence
87		 * for users of Latin-based chars */
88		if ((c[0] & 0x80) == 0) {
89			uc = c[0];
90		} else if ((c[0] & 0xe0) == 0xc0) {
91			if (*inbytesleft < 2) {
92				LOG(log_debug, logtype_default, "short utf8 char");
93				goto badseq;
94			}
95			uc = (ucs2_t) (((c[0] & 0x1f) << 6) | GETUCVAL(c[1],0)) ;
96			len = 2;
97		} else if ((c[0] & 0xf0) == 0xe0) {
98			if (*inbytesleft < 3) {
99				LOG(log_debug, logtype_default, "short utf8 char");
100				goto badseq;
101			}
102			uc = (ucs2_t) (((c[0] & 0x0f) << 12) | GETUCVAL(c[1],6) | GETUCVAL(c[2],0)) ;
103			len = 3;
104		} else if ((c[0] & 0xf8) == 0xf0) {
105			/* 4 bytes, which happens for surrogate pairs only */
106			if (*inbytesleft < 4) {
107				LOG(log_debug, logtype_default, "short utf8 char");
108				goto badseq;
109			}
110			if (*outbytesleft < 4) {
111				LOG(log_debug, logtype_default, "short ucs-2 write");
112				errno = E2BIG;
113				return -1;
114			}
115			codepoint = ((c[0] & 0x07) << 18) | GETUCVAL(c[1],12) |
116				GETUCVAL(c[2],6) |  GETUCVAL(c[3],0);
117			SSVAL(*outbuf,0,(((codepoint - 0x10000) >> 10) + 0xD800)); /* hi  */
118			SSVAL(*outbuf,2,(0xDC00 + (codepoint & 0x03FF)));          /* low */
119			len = 4;
120			(*inbuf)  += 4;
121			(*inbytesleft)  -= 4;
122			(*outbytesleft) -= 4;
123			(*outbuf) += 4;
124			continue;
125		}
126		else {
127			errno = EINVAL;
128			return -1;
129		}
130
131		SSVAL(*outbuf,0,uc);
132		(*inbuf)  += len;
133		(*inbytesleft)  -= len;
134		(*outbytesleft) -= 2;
135		(*outbuf) += 2;
136	}
137
138	if (*inbytesleft > 0) {
139		errno = E2BIG;
140		return -1;
141	}
142
143	return 0;
144
145badseq:
146	errno = EINVAL;
147	return -1;
148}
149
150/* --------------------- Convert from UTF-16 to UTF-8 -----------*/
151static size_t utf8_push(void *cd _U_, char **inbuf, size_t *inbytesleft,
152			 char **outbuf, size_t *outbytesleft)
153{
154	ucs2_t uc=0;
155	ucs2_t hi, low;
156	unsigned int codepoint;
157	int olen, ilen;
158
159	while (*inbytesleft >= 2 && *outbytesleft >= 1) {
160		unsigned char *c = (unsigned char *)*outbuf;
161		uc = SVAL((*inbuf),0);
162		olen=1;
163		ilen=2;
164
165		/* Arrange conditionals in the order of most frequent occurrence for
166		   users of Latin-based chars */
167		if (uc < 0x80) {
168			c[0] = uc;
169		} else if (uc < 0x800) {
170			if (*outbytesleft < 2) {
171				LOG(log_debug, logtype_default, "short utf8 write");
172				goto toobig;
173			}
174			c[1] = GETUTF8TRAILBYTE(uc, 0);
175			c[0] = (char)(0xc0 | ((uc >> 6) & 0x1f));
176			olen = 2;
177		}
178		else if ( uc >= 0x202a && uc <= 0x202e ) {
179			/* ignore bidi hint characters */
180			olen = 0;
181		}
182		/*
183		 * A 2-byte uc value represents a stand-alone Unicode character if
184		 *     0 <= uc < 0xd800 or 0xdfff < uc <= 0xffff.
185		 * If  0xd800 <= uc <= 0xdfff, uc itself does not represent a Unicode character.
186		 * Rather, it is just part of a surrogate pair.  A surrogate pair consists of
187		 * a high surrogate in the range [0xd800 ... 0xdbff] and a low surrogate in the
188		 * range [0xdc00 ... 0xdfff].  Together the pair maps to a single Unicode character
189		 * whose scalar value is 64K or larger.  It is this scalar value that is transformed
190		 * to UTF-8, not the individual surrogates.
191		 *
192		 * See www.unicode.org/faq/utf_bom.html for more info.
193		 */
194
195		else if ( 0xd800 <= uc && uc <= 0xdfff) {
196			/* surrogate - needs 4 bytes from input and 4 bytes for output to UTF-8 */
197			if (*outbytesleft < 4) {
198				LOG(log_debug, logtype_default, "short utf8 write");
199				goto toobig;
200			}
201			if (*inbytesleft < 4) {
202				errno = EINVAL;
203				return -1;
204			}
205			hi =  SVAL((*inbuf),0);
206			low = SVAL((*inbuf),2);
207			if ( 0xd800 <= hi && hi <= 0xdbff && 0xdc00 <= low && low <= 0xdfff) {
208				codepoint = ((hi - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
209				c[3] = GETUTF8TRAILBYTE(codepoint, 0);
210				c[2] = GETUTF8TRAILBYTE(codepoint, 6);
211				c[1] = GETUTF8TRAILBYTE(codepoint, 12);
212				c[0] = (char)(0xf0 | ((codepoint >> 18) & 0x07));
213				ilen = olen = 4;
214			} else { /* invalid values for surrogate */
215				errno = EINVAL;
216				return -1;
217			}
218		} else {
219			if (*outbytesleft < 3) {
220				LOG(log_debug, logtype_default, "short utf8 write");
221				goto toobig;
222			}
223			c[2] = GETUTF8TRAILBYTE(uc, 0);
224			c[1] = GETUTF8TRAILBYTE(uc, 6);
225			c[0] = (char)(0xe0 | ((uc >> 12) & 0x0f));
226			olen = 3;
227		}
228
229		(*inbytesleft)  -= ilen;
230		(*outbytesleft) -= olen;
231		(*inbuf)  += ilen;
232		(*outbuf) += olen;
233	}
234
235	if (*inbytesleft == 1) {
236		errno = EINVAL;
237		return -1;
238	}
239
240	if (*inbytesleft > 1) {
241		errno = E2BIG;
242		return -1;
243	}
244
245	return 0;
246
247toobig:
248	errno = E2BIG;
249	return -1;
250}
251