1#ifndef lint
2static char *rcsid = "$Id: utf8.c,v 1.1 2003/06/04 00:26:44 marka Exp $";
3#endif
4
5/*
6 * Copyright (c) 2000 Japan Network Information Center.  All rights reserved.
7 *
8 * By using this file, you agree to the terms and conditions set forth bellow.
9 *
10 * 			LICENSE TERMS AND CONDITIONS
11 *
12 * The following License Terms and Conditions apply, unless a different
13 * license is obtained from Japan Network Information Center ("JPNIC"),
14 * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
15 * Chiyoda-ku, Tokyo 101-0047, Japan.
16 *
17 * 1. Use, Modification and Redistribution (including distribution of any
18 *    modified or derived work) in source and/or binary forms is permitted
19 *    under this License Terms and Conditions.
20 *
21 * 2. Redistribution of source code must retain the copyright notices as they
22 *    appear in each source code file, this License Terms and Conditions.
23 *
24 * 3. Redistribution in binary form must reproduce the Copyright Notice,
25 *    this License Terms and Conditions, in the documentation and/or other
26 *    materials provided with the distribution.  For the purposes of binary
27 *    distribution the "Copyright Notice" refers to the following language:
28 *    "Copyright (c) 2000-2002 Japan Network Information Center.  All rights reserved."
29 *
30 * 4. The name of JPNIC may not be used to endorse or promote products
31 *    derived from this Software without specific prior written approval of
32 *    JPNIC.
33 *
34 * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
35 *    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
36 *    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
37 *    PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL JPNIC BE LIABLE
38 *    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
39 *    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
40 *    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
41 *    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
42 *    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
43 *    OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
44 *    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
45 */
46
47#include <config.h>
48
49#include <stddef.h>
50
51#include <idn/assert.h>
52#include <idn/logmacro.h>
53#include <idn/utf8.h>
54#include <idn/debug.h>
55
56#define UTF8_WIDTH(c) \
57	(((c) < 0x80) ? 1 : \
58	 ((c) < 0xc0) ? 0 : \
59	 ((c) < 0xe0) ? 2 : \
60	 ((c) < 0xf0) ? 3 : \
61	 ((c) < 0xf8) ? 4 : \
62	 ((c) < 0xfc) ? 5 : \
63	 ((c) < 0xfe) ? 6 : 0)
64
65#define VALID_CONT_BYTE(c)	(0x80 <= (c) && (c) < 0xc0)
66
67int
68idn_utf8_mblen(const char *s) {
69	int c = *(unsigned char *)s;
70
71	assert(s != NULL);
72
73#if 0
74	TRACE(("idn_utf8_mblen(s=<%s>)\n", idn__debug_hexstring(s, 6)));
75#endif
76
77	return UTF8_WIDTH(c);
78}
79
80int
81idn_utf8_getmb(const char *s, size_t len, char *buf) {
82	/* buf must be at least 7-bytes long */
83	const unsigned char *p = (const unsigned char *)s;
84	unsigned char *q = (unsigned char *)buf;
85	int width = UTF8_WIDTH(*p);
86	int w;
87
88	assert(s != NULL);
89
90#if 0
91	TRACE(("idn_utf8_getmb(s=<%s>,len=%d)\n",
92	      idn__debug_hexstring(s, 6), len));
93#endif
94
95	if (width == 0 || len < width)
96		return (0);
97
98	/* Copy the first byte. */
99	*q++ = *p++;
100
101	/* .. and the rest. */
102	w = width;
103	while (--w > 0) {
104		if (!VALID_CONT_BYTE(*p))
105			return (0);
106		*q++ = *p++;
107	}
108	return (width);
109}
110
111int
112idn_utf8_getwc(const char *s, size_t len, unsigned long *vp) {
113	unsigned long v;
114	unsigned long min;
115	const unsigned char *p = (const unsigned char *)s;
116	int c;
117	int width;
118	int rest;
119
120	assert(s != NULL);
121
122#if 0
123	TRACE(("idn_utf8_getwc(s=<%s>,len=%d)\n",
124	      idn__debug_hexstring(s, 10), len));
125#endif
126
127	c = *p++;
128	width = UTF8_WIDTH(c);
129
130	switch (width) {
131	case 0:
132		return (0);
133	case 1:
134		v = c;
135		min = 0;
136		break;
137	case 2:
138		v = c & 0x1f;
139		min = 0x80;
140		break;
141	case 3:
142		v = c & 0xf;
143		min = 0x800;
144		break;
145	case 4:
146		v = c & 0x7;
147		min = 0x10000;
148		break;
149	case 5:
150		v = c & 3;
151		min = 0x200000;
152		break;
153	case 6:
154		v = c & 1;
155		min = 0x4000000;
156		break;
157	default:
158		FATAL(("idn_utf8_getint: internal error\n"));
159		return (0);
160	}
161
162	if (len < width)
163		return (0);
164
165	rest = width - 1;
166	while (rest-- > 0) {
167		if (!VALID_CONT_BYTE(*p))
168			return (0);
169		v = (v << 6) | (*p & 0x3f);
170		p++;
171	}
172
173	if (v < min)
174		return (0);
175
176	*vp = v;
177	return (width);
178}
179
180int
181idn_utf8_putwc(char *s, size_t len, unsigned long v) {
182	unsigned char *p = (unsigned char *)s;
183	int mask;
184	int off;
185	int l;
186
187	assert(s != NULL);
188
189#if 0
190	TRACE(("idn_utf8_putwc(v=%lx)\n", v));
191#endif
192
193	if (v < 0x80) {
194		mask = 0;
195		l = 1;
196	} else if (v < 0x800) {
197		mask = 0xc0;
198		l = 2;
199	} else if (v < 0x10000) {
200		mask = 0xe0;
201		l = 3;
202	} else if (v < 0x200000) {
203		mask = 0xf0;
204		l = 4;
205	} else if (v < 0x4000000) {
206		mask = 0xf8;
207		l = 5;
208	} else if (v < 0x80000000) {
209		mask = 0xfc;
210		l = 6;
211	} else {
212		return (0);
213	}
214
215	if (len < l)
216		return (0);
217
218	off = 6 * (l - 1);
219	*p++ = (v >> off) | mask;
220	mask = 0x80;
221	while (off > 0) {
222		off -= 6;
223		*p++ = ((v >> off) & 0x3f) | mask;
224	}
225	return l;
226}
227
228int
229idn_utf8_isvalidchar(const char *s) {
230	unsigned long dummy;
231
232	TRACE(("idn_utf8_isvalidchar(s=<%s>)\n",
233	      idn__debug_hexstring(s, 6)));
234
235	return (idn_utf8_getwc(s, 6, &dummy) > 0);
236}
237
238int
239idn_utf8_isvalidstring(const char *s) {
240	unsigned long dummy;
241	int width;
242
243	assert(s != NULL);
244
245	TRACE(("idn_utf8_isvalidstring(s=<%s>)\n",
246	      idn__debug_hexstring(s, 20)));
247
248	while (*s != '\0') {
249		width = idn_utf8_getwc(s, 6, &dummy);
250		if (width == 0)
251			return (0);
252		s += width;
253	}
254	return (1);
255}
256
257char *
258idn_utf8_findfirstbyte(const char *s, const char *known_top) {
259	const unsigned char *p = (const unsigned char *)s;
260	const unsigned char *t = (const unsigned char *)known_top;
261
262	assert(s != NULL && known_top != NULL && known_top <= s);
263
264	TRACE(("idn_utf8_findfirstbyte(s=<%s>)\n",
265	      idn__debug_hexstring(s, 8)));
266
267	while (p >= t) {
268		if (!VALID_CONT_BYTE(*p))
269		    break;
270		p--;
271	}
272	if (p < t || UTF8_WIDTH(*p) == 0)
273		return (NULL);
274
275	return ((char *)p);
276}
277