#ifndef lint
/*
 * RCS "$Id$" keyword string recording the revision of this file.
 * Compiled out when 'lint' is defined.  The pointer is const-qualified
 * because it refers to a string literal, which must never be written to.
 */
static const char *rcsid = "$Id: ucs4.c,v 1.1 2003/06/04 00:26:14 marka Exp $";
#endif
4
5/*
6 * Copyright (c) 2001 Japan Network Information Center.  All rights reserved.
7 *
8 * By using this file, you agree to the terms and conditions set forth bellow.
9 *
10 * 			LICENSE TERMS AND CONDITIONS
11 *
12 * The following License Terms and Conditions apply, unless a different
13 * license is obtained from Japan Network Information Center ("JPNIC"),
14 * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
15 * Chiyoda-ku, Tokyo 101-0047, Japan.
16 *
17 * 1. Use, Modification and Redistribution (including distribution of any
18 *    modified or derived work) in source and/or binary forms is permitted
19 *    under this License Terms and Conditions.
20 *
21 * 2. Redistribution of source code must retain the copyright notices as they
22 *    appear in each source code file, this License Terms and Conditions.
23 *
24 * 3. Redistribution in binary form must reproduce the Copyright Notice,
25 *    this License Terms and Conditions, in the documentation and/or other
26 *    materials provided with the distribution.  For the purposes of binary
27 *    distribution the "Copyright Notice" refers to the following language:
28 *    "Copyright (c) 2000-2002 Japan Network Information Center.  All rights reserved."
29 *
30 * 4. The name of JPNIC may not be used to endorse or promote products
31 *    derived from this Software without specific prior written approval of
32 *    JPNIC.
33 *
34 * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
35 *    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
36 *    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
37 *    PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL JPNIC BE LIABLE
38 *    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
39 *    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
40 *    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
41 *    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
42 *    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
43 *    OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
44 *    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
45 */
46
47#include <config.h>
48
49#include <stddef.h>
50#include <stdlib.h>
51#include <string.h>
52
53#include <idn/assert.h>
54#include <idn/result.h>
55#include <idn/logmacro.h>
56#include <idn/util.h>
57#include <idn/ucs4.h>
58#include <idn/debug.h>
59
/*
 * UTF-16 surrogate pair helpers.
 *
 * SURROGATE_H_OFF / SURROGATE_L_OFF are the first high / low surrogate
 * code units; SURROGATE_BASE is the first code point that needs a pair.
 * All of these macros may evaluate their arguments more than once.
 */
#define SURROGATE_BASE		0x10000
#define SURROGATE_H_OFF		0xd800
#define SURROGATE_L_OFF		0xdc00

/* High surrogates occupy [SURROGATE_H_OFF, SURROGATE_L_OFF). */
#define IS_SURROGATE_HIGH(v) \
	((v) >= SURROGATE_H_OFF && (v) < SURROGATE_L_OFF)
/* Low surrogates occupy [SURROGATE_L_OFF, 0xdfff]. */
#define IS_SURROGATE_LOW(v) \
	((v) >= SURROGATE_L_OFF && (v) <= 0xdfff)

/* Split a code point >= SURROGATE_BASE into its two 16-bit halves. */
#define SURROGATE_HIGH(v) \
	((((v) - SURROGATE_BASE) >> 10) + SURROGATE_H_OFF)
#define SURROGATE_LOW(v) \
	(((v) & 0x3ff) + SURROGATE_L_OFF)

/* Rebuild the code point from a high (h) and a low (l) surrogate. */
#define COMBINE_SURROGATE(h, l) \
	(SURROGATE_BASE + (((h) - SURROGATE_H_OFF) << 10) + \
	 ((l) - SURROGATE_L_OFF))
72
/*
 * ASCII-only case conversion.
 * Only 'a'-'z' / 'A'-'Z' are mapped; every other value is returned as is.
 * Beware: the argument is evaluated more than once, so it must be free of
 * side effects.
 */
#define ASCII_TOUPPER(c) \
	(((c) >= 'a' && (c) <= 'z') ? ((c) - ('a' - 'A')) : (c))
#define ASCII_TOLOWER(c) \
	(((c) >= 'A' && (c) <= 'Z') ? ((c) + ('a' - 'A')) : (c))
81
82idn_result_t
83idn_ucs4_ucs4toutf16(const unsigned long *ucs4, unsigned short *utf16,
84		     size_t tolen) {
85	unsigned short *utf16p = utf16;
86	unsigned long v;
87	idn_result_t r;
88
89	TRACE(("idn_ucs4_ucs4toutf16(ucs4=\"%s\", tolen=%d)\n",
90	       idn__debug_ucs4xstring(ucs4, 50), (int)tolen));
91
92	while (*ucs4 != '\0') {
93		v = *ucs4++;
94
95		if (IS_SURROGATE_LOW(v) || IS_SURROGATE_HIGH(v)) {
96			WARNING(("idn_ucs4_ucs4toutf16: UCS4 string contains "
97				 "surrogate pair\n"));
98			r = idn_invalid_encoding;
99			goto ret;
100		} else if (v > 0xffff) {
101			/* Convert to surrogate pair */
102			if (v >= 0x110000) {
103				r = idn_invalid_encoding;
104				goto ret;
105			}
106			if (tolen < 2) {
107				r = idn_buffer_overflow;
108				goto ret;
109			}
110			*utf16p++ = SURROGATE_HIGH(v);
111			*utf16p++ = SURROGATE_LOW(v);
112			tolen -= 2;
113		} else {
114			if (tolen < 1) {
115				r = idn_buffer_overflow;
116				goto ret;
117			}
118			*utf16p++ = v;
119			tolen--;
120		}
121	}
122
123	if (tolen < 1) {
124		r = idn_buffer_overflow;
125		goto ret;
126	}
127	*utf16p = '\0';
128
129	r = idn_success;
130ret:
131	if (r == idn_success) {
132		TRACE(("idn_ucs4_ucs4toutf16(): success (utf16=\"%s\")\n",
133		       idn__debug_utf16xstring(utf16, 50)));
134	} else {
135		TRACE(("idn_ucs4_ucs4toutf16(): %s\n",
136		       idn_result_tostring(r)));
137	}
138	return (r);
139}
140
141idn_result_t
142idn_ucs4_utf16toucs4(const unsigned short *utf16, unsigned long *ucs4,
143		     size_t tolen) {
144	unsigned long *ucs4p = ucs4;
145	unsigned short v0, v1;
146	idn_result_t r;
147
148	TRACE(("idn_ucs4_utf16toucs4(utf16=\"%s\", tolen=%d)\n",
149	       idn__debug_utf16xstring(utf16, 50), (int)tolen));
150
151	while (*utf16 != '\0') {
152		v0 = *utf16;
153
154		if (tolen < 1) {
155			r = idn_buffer_overflow;
156			goto ret;
157		}
158
159		if (IS_SURROGATE_HIGH(v0)) {
160			v1 = *(utf16 + 1);
161			if (!IS_SURROGATE_LOW(v1)) {
162				WARNING(("idn_ucs4_utf16toucs4: "
163					 "corrupted surrogate pair\n"));
164				r = idn_invalid_encoding;
165				goto ret;
166			}
167			*ucs4p++ = COMBINE_SURROGATE(v0, v1);
168			tolen--;
169			utf16 += 2;
170
171		} else {
172			*ucs4p++ = v0;
173			tolen--;
174			utf16++;
175
176		}
177	}
178
179	if (tolen < 1) {
180		r = idn_buffer_overflow;
181		goto ret;
182	}
183	*ucs4p = '\0';
184
185	r = idn_success;
186ret:
187	if (r == idn_success) {
188		TRACE(("idn_ucs4_utf16toucs4(): success (ucs4=\"%s\")\n",
189		       idn__debug_ucs4xstring(ucs4, 50)));
190	} else {
191		TRACE(("idn_ucs4_utf16toucs4(): %s\n",
192		       idn_result_tostring(r)));
193	}
194	return (r);
195}
196
197idn_result_t
198idn_ucs4_utf8toucs4(const char *utf8, unsigned long *ucs4, size_t tolen) {
199	const unsigned char *utf8p = (const unsigned char *)utf8;
200	unsigned long *ucs4p = ucs4;
201	unsigned long v, min;
202	unsigned char c;
203	int width;
204	int i;
205	idn_result_t r;
206
207	TRACE(("idn_ucs4_utf8toucs4(utf8=\"%s\", tolen=%d)\n",
208	       idn__debug_xstring(utf8, 50), (int)tolen));
209
210	while(*utf8p != '\0') {
211		c = *utf8p++;
212		if (c < 0x80) {
213			v = c;
214			min = 0;
215			width = 1;
216		} else if (c < 0xc0) {
217			WARNING(("idn_ucs4_utf8toucs4: invalid character\n"));
218			r = idn_invalid_encoding;
219			goto ret;
220		} else if (c < 0xe0) {
221			v = c & 0x1f;
222			min = 0x80;
223			width = 2;
224		} else if (c < 0xf0) {
225			v = c & 0x0f;
226			min = 0x800;
227			width = 3;
228		} else if (c < 0xf8) {
229			v = c & 0x07;
230			min = 0x10000;
231			width = 4;
232		} else if (c < 0xfc) {
233			v = c & 0x03;
234			min = 0x200000;
235			width = 5;
236		} else if (c < 0xfe) {
237			v = c & 0x01;
238			min = 0x4000000;
239			width = 6;
240		} else {
241			WARNING(("idn_ucs4_utf8toucs4: invalid character\n"));
242			r = idn_invalid_encoding;
243			goto ret;
244		}
245
246		for (i = width - 1; i > 0; i--) {
247			c = *utf8p++;
248			if (c < 0x80 || 0xc0 <= c) {
249				WARNING(("idn_ucs4_utf8toucs4: "
250					 "invalid character\n"));
251				r = idn_invalid_encoding;
252				goto ret;
253			}
254			v = (v << 6) | (c & 0x3f);
255		}
256
257	        if (v < min) {
258			WARNING(("idn_ucs4_utf8toucs4: invalid character\n"));
259			r = idn_invalid_encoding;
260			goto ret;
261		}
262		if (IS_SURROGATE_LOW(v) || IS_SURROGATE_HIGH(v)) {
263			WARNING(("idn_ucs4_utf8toucs4: UTF-8 string contains "
264				 "surrogate pair\n"));
265			r = idn_invalid_encoding;
266			goto ret;
267		}
268		if (tolen < 1) {
269			r = idn_buffer_overflow;
270			goto ret;
271		}
272		tolen--;
273		*ucs4p++ = v;
274	}
275
276	if (tolen < 1) {
277		r = idn_buffer_overflow;
278		goto ret;
279	}
280	*ucs4p = '\0';
281
282	r = idn_success;
283ret:
284	if (r == idn_success) {
285		TRACE(("idn_ucs4_utf8toucs4(): success (ucs4=\"%s\")\n",
286		       idn__debug_ucs4xstring(ucs4, 50)));
287	} else {
288		TRACE(("idn_ucs4_utf8toucs4(): %s\n",
289		       idn_result_tostring(r)));
290	}
291	return (r);
292}
293
294idn_result_t
295idn_ucs4_ucs4toutf8(const unsigned long *ucs4, char *utf8, size_t tolen) {
296	unsigned char *utf8p = (unsigned char *)utf8;
297	unsigned long v;
298	int width;
299	int mask;
300	int offset;
301	idn_result_t r;
302
303	TRACE(("idn_ucs4_ucs4toutf8(ucs4=\"%s\", tolen=%d)\n",
304	       idn__debug_ucs4xstring(ucs4, 50), (int)tolen));
305
306	while (*ucs4 != '\0') {
307		v = *ucs4++;
308		if (IS_SURROGATE_LOW(v) || IS_SURROGATE_HIGH(v)) {
309			WARNING(("idn_ucs4_ucs4toutf8: UCS4 string contains "
310				 "surrogate pair\n"));
311			r = idn_invalid_encoding;
312			goto ret;
313		}
314		if (v < 0x80) {
315			mask = 0;
316			width = 1;
317		} else if (v < 0x800) {
318			mask = 0xc0;
319			width = 2;
320		} else if (v < 0x10000) {
321			mask = 0xe0;
322			width = 3;
323		} else if (v < 0x200000) {
324			mask = 0xf0;
325			width = 4;
326		} else if (v < 0x4000000) {
327			mask = 0xf8;
328			width = 5;
329		} else if (v < 0x80000000) {
330			mask = 0xfc;
331			width = 6;
332		} else {
333			WARNING(("idn_ucs4_ucs4toutf8: invalid character\n"));
334			r = idn_invalid_encoding;
335			goto ret;
336		}
337
338		if (tolen < width) {
339			r = idn_buffer_overflow;
340			goto ret;
341		}
342		offset = 6 * (width - 1);
343		*utf8p++ = (v >> offset) | mask;
344		mask = 0x80;
345		while (offset > 0) {
346			offset -= 6;
347			*utf8p++ = ((v >> offset) & 0x3f) | mask;
348		}
349		tolen -= width;
350	}
351
352	if (tolen < 1) {
353		r = idn_buffer_overflow;
354		goto ret;
355	}
356	*utf8p = '\0';
357
358	r = idn_success;
359ret:
360	if (r == idn_success) {
361		TRACE(("idn_ucs4_ucs4toutf8(): success (utf8=\"%s\")\n",
362		       idn__debug_xstring(utf8, 50)));
363	} else {
364		TRACE(("idn_ucs4_ucs4toutf8(): %s\n",
365		       idn_result_tostring(r)));
366	}
367	return (r);
368}
369
/*
 * Return the number of elements in the 0-terminated UCS-4 string 'ucs4',
 * not counting the terminator.
 */
size_t
idn_ucs4_strlen(const unsigned long *ucs4) {
	const unsigned long *end = ucs4;

	while (*end != '\0')
		end++;

	return ((size_t)(end - ucs4));
}
379
/*
 * Copy the 0-terminated UCS-4 string 'from', terminator included, into
 * 'to' and return 'to'.  No bounds checking is done; 'to' must be large
 * enough.
 */
unsigned long *
idn_ucs4_strcpy(unsigned long *to, const unsigned long *from) {
	size_t i = 0;

	/* Copy element by element; the terminator is copied last. */
	do {
		to[i] = from[i];
	} while (from[i++] != '\0');

	return (to);
}
390
/*
 * Append the 0-terminated UCS-4 string 'from' to the end of the
 * 0-terminated UCS-4 string 'to' and return 'to'.  No bounds checking is
 * done; 'to' must have room for the combined string and its terminator.
 */
unsigned long *
idn_ucs4_strcat(unsigned long *to, const unsigned long *from) {
	unsigned long *tail = to;

	/* Locate the current terminator of 'to'. */
	while (*tail != '\0')
		tail++;

	/* Overwrite it with 'from', whose terminator is copied last. */
	do {
		*tail = *from++;
	} while (*tail++ != '\0');

	return (to);
}
404
/*
 * Compare two 0-terminated UCS-4 strings element by element.
 * Returns 1 if 'str1' sorts after 'str2', -1 if it sorts before, and 0 if
 * both strings are identical.
 */
int
idn_ucs4_strcmp(const unsigned long *str1, const unsigned long *str2) {
	for (;; str1++, str2++) {
		if (*str1 != *str2)
			return ((*str1 > *str2) ? 1 : -1);
		/* Elements are equal; stop once both strings have ended. */
		if (*str1 == '\0')
			return (0);
	}
}
423
/*
 * Compare two 0-terminated UCS-4 strings, treating ASCII letters 'A'-'Z'
 * as equal to 'a'-'z'.  Elements outside that range are compared as they
 * are.
 * Returns 1 if 'str1' sorts after 'str2', -1 if it sorts before, and 0 if
 * the strings match.
 */
int
idn_ucs4_strcasecmp(const unsigned long *str1, const unsigned long *str2) {
	unsigned long lhs, rhs;

	for (;; str1++, str2++) {
		/* The macro evaluates its argument repeatedly; keep it pure. */
		lhs = ASCII_TOLOWER(*str1);
		rhs = ASCII_TOLOWER(*str2);
		if (lhs != rhs)
			return ((lhs > rhs) ? 1 : -1);
		/* Folded elements are equal; stop once both have ended. */
		if (*str1 == '\0')
			return (0);
	}
}
448
449
/*
 * Return a malloc()ed copy of the 0-terminated UCS-4 string 'str',
 * terminator included, or NULL if the allocation fails.  The caller is
 * responsible for free()ing the result.
 */
unsigned long *
idn_ucs4_strdup(const unsigned long *str) {
	/* Size in bytes of the string plus its terminator. */
	size_t nbytes = sizeof(*str) * (idn_ucs4_strlen(str) + 1);
	unsigned long *copy = malloc(nbytes);

	if (copy != NULL)
		memcpy(copy, str, nbytes);

	return copy;
}
462