1#ifndef lint
2static char *rcsid = "$Id: race.c,v 1.1 2003/06/04 00:26:07 marka Exp $";
3#endif
4
5/*
6 * Copyright (c) 2000,2001,2002 Japan Network Information Center.
7 * All rights reserved.
8 *
9 * By using this file, you agree to the terms and conditions set forth bellow.
10 *
11 * 			LICENSE TERMS AND CONDITIONS
12 *
13 * The following License Terms and Conditions apply, unless a different
14 * license is obtained from Japan Network Information Center ("JPNIC"),
15 * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
16 * Chiyoda-ku, Tokyo 101-0047, Japan.
17 *
18 * 1. Use, Modification and Redistribution (including distribution of any
19 *    modified or derived work) in source and/or binary forms is permitted
20 *    under this License Terms and Conditions.
21 *
22 * 2. Redistribution of source code must retain the copyright notices as they
23 *    appear in each source code file, this License Terms and Conditions.
24 *
25 * 3. Redistribution in binary form must reproduce the Copyright Notice,
26 *    this License Terms and Conditions, in the documentation and/or other
27 *    materials provided with the distribution.  For the purposes of binary
28 *    distribution the "Copyright Notice" refers to the following language:
29 *    "Copyright (c) 2000-2002 Japan Network Information Center.  All rights reserved."
30 *
31 * 4. The name of JPNIC may not be used to endorse or promote products
32 *    derived from this Software without specific prior written approval of
33 *    JPNIC.
34 *
35 * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
36 *    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
37 *    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
38 *    PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL JPNIC BE LIABLE
39 *    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
40 *    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
41 *    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
42 *    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
43 *    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
44 *    OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
45 *    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
46 */
47
48#include <config.h>
49
50#include <stddef.h>
51#include <stdlib.h>
52#include <string.h>
53
54#include <idn/result.h>
55#include <idn/assert.h>
56#include <idn/logmacro.h>
57#include <idn/converter.h>
58#include <idn/ucs4.h>
59#include <idn/debug.h>
60#include <idn/race.h>
61#include <idn/util.h>
62
/*
 * ACE prefix placed in front of every RACE-encoded label.
 * It can be overridden at build time.
 */
#ifndef IDN_RACE_PREFIX
#define IDN_RACE_PREFIX		"bq--"
#endif

/*
 * Special octets of the RACE compressed form.
 */
#define RACE_2OCTET_MODE	0xd8	/* leading octet: data is not compressed */
#define RACE_ESCAPE		0xff	/* introduces a two-octet sequence */
#define RACE_ESCAPE_2ND		0x99	/* after RACE_ESCAPE: lower octet 0xff */

#define RACE_BUF_SIZE		128		/* more than enough */

/*
 * UTF-16 surrogate pair helpers.
 * The offsets are defined first so that the macros built on them read
 * top to bottom.
 */
#define SURROGATE_BASE		0x10000
#define SURROGATE_H_OFF		0xd800
#define SURROGATE_L_OFF		0xdc00

/* Range tests for a single 16-bit unit. */
#define IS_SURROGATE_HIGH(v)	((v) >= 0xd800 && (v) <= 0xdbff)
#define IS_SURROGATE_LOW(v)	((v) >= 0xdc00 && (v) <= 0xdfff)

/* Split a code point at or above SURROGATE_BASE into its two halves. */
#define SURROGATE_HIGH(v)	(SURROGATE_H_OFF + (((v) - SURROGATE_BASE) >> 10))
#define SURROGATE_LOW(v)	(SURROGATE_L_OFF + ((v) & 0x3ff))

/* Rebuild the code point from a high/low surrogate pair. */
#define COMBINE_SURROGATE(h, l) \
	(SURROGATE_BASE + (((h) - SURROGATE_H_OFF) << 10) + \
	 ((l) - SURROGATE_L_OFF))
84
/*
 * Compression type, as chosen by get_compress_mode() and consumed by
 * race_compress_encode().
 */
enum {
	compress_one = 0,	/* every character lies in one single row */
	compress_two = 1,	/* row 0 plus exactly one other row */
	compress_none = 2	/* more than one non-zero row: no compression */
};
93
94static idn_result_t	race_decode_decompress(const char *from,
95					       unsigned short *buf,
96					       size_t buflen);
97static idn_result_t	race_compress_encode(const unsigned short *p,
98					     int compress_mode,
99					     char *to, size_t tolen);
100static int		get_compress_mode(unsigned short *p);
101
102idn_result_t
103idn__race_decode(idn_converter_t ctx, void *privdata,
104		 const char *from, unsigned long *to, size_t tolen) {
105	unsigned short *buf = NULL;
106	size_t prefixlen = strlen(IDN_RACE_PREFIX);
107	size_t fromlen;
108	size_t buflen;
109	idn_result_t r;
110
111	assert(ctx != NULL);
112
113	TRACE(("idn__race_decode(from=\"%s\", tolen=%d)\n",
114	       idn__debug_xstring(from, 50), (int)tolen));
115
116	if (!idn__util_asciihaveaceprefix(from, IDN_RACE_PREFIX)) {
117		if (*from == '\0') {
118			r = idn_ucs4_utf8toucs4(from, to, tolen);
119			goto ret;
120		}
121		r = idn_invalid_encoding;
122		goto ret;
123	}
124	from += prefixlen;
125	fromlen = strlen(from);
126
127	/*
128	 * Allocate sufficient buffer.
129	 */
130	buflen = fromlen + 1;
131	buf = malloc(sizeof(*buf) * buflen);
132	if (buf == NULL) {
133		r = idn_nomemory;
134		goto ret;
135	}
136
137	/*
138	 * Decode base32 and decompress.
139	 */
140	r = race_decode_decompress(from, buf, buflen);
141	if (r != idn_success)
142		goto ret;
143
144	/*
145	 * Now 'buf' points the decompressed string, which must contain
146	 * UTF-16 characters.
147	 */
148
149	/*
150	 * Convert to UCS4.
151	 */
152	r = idn_ucs4_utf16toucs4(buf, to, tolen);
153	if (r != idn_success)
154		goto ret;
155
156ret:
157	free(buf);
158	if (r == idn_success) {
159		TRACE(("idn__race_decode(): succcess (to=\"%s\")\n",
160		       idn__debug_ucs4xstring(to, 50)));
161	} else {
162		TRACE(("idn__race_decode(): %s\n", idn_result_tostring(r)));
163	}
164	return (r);
165}
166
167static idn_result_t
168race_decode_decompress(const char *from, unsigned short *buf, size_t buflen)
169{
170	unsigned short *p = buf;
171	unsigned int bitbuf = 0;
172	int bitlen = 0;
173	int i, j;
174	size_t len;
175
176	while (*from != '\0') {
177		int c = *from++;
178		int x;
179
180		if ('a' <= c && c <= 'z')
181			x = c - 'a';
182		else if ('A' <= c && c <= 'Z')
183			x = c - 'A';
184		else if ('2' <= c && c <= '7')
185			x = c - '2' + 26;
186		else
187			return (idn_invalid_encoding);
188
189		bitbuf = (bitbuf << 5) + x;
190		bitlen += 5;
191		if (bitlen >= 8) {
192			*p++ = (bitbuf >> (bitlen - 8)) & 0xff;
193			bitlen -= 8;
194		}
195	}
196	len = p - buf;
197
198	/*
199	 * Now 'buf' holds the decoded string.
200	 */
201
202	/*
203	 * Decompress.
204	 */
205	if (buf[0] == RACE_2OCTET_MODE) {
206		if ((len - 1) % 2 != 0)
207			return (idn_invalid_encoding);
208		for (i = 1, j = 0; i < len; i += 2, j++)
209			buf[j] = (buf[i] << 8) + buf[i + 1];
210		len = j;
211	} else {
212		unsigned short c = buf[0] << 8;	/* higher octet */
213
214		for (i = 1, j = 0; i < len; j++) {
215			if (buf[i] == RACE_ESCAPE) {
216				if (i + 1 >= len)
217					return (idn_invalid_encoding);
218				else if (buf[i + 1] == RACE_ESCAPE_2ND)
219					buf[j] = c | 0xff;
220				else
221					buf[j] = buf[i + 1];
222				i += 2;
223
224			} else if (buf[i] == 0x99 && c == 0x00) {
225				/*
226				 * The RACE specification says this is error.
227				 */
228				return (idn_invalid_encoding);
229
230			} else {
231				buf[j] = c | buf[i++];
232			}
233		}
234		len = j;
235	}
236	buf[len] = '\0';
237
238	return (idn_success);
239}
240
241idn_result_t
242idn__race_encode(idn_converter_t ctx, void *privdata,
243		 const unsigned long *from, char *to, size_t tolen) {
244	char *to_org = to;
245	unsigned short *p, *buf = NULL;
246	size_t prefixlen = strlen(IDN_RACE_PREFIX);
247	size_t buflen;
248	size_t fromlen;
249	idn_result_t r;
250	int compress_mode;
251
252	assert(ctx != NULL);
253
254	TRACE(("idn__race_encode(from=\"%s\", tolen=%d)\n",
255	       idn__debug_ucs4xstring(from, 50), (int)tolen));
256
257	if (*from == '\0') {
258		r = idn_ucs4_ucs4toutf8(from, to, tolen);
259		goto ret;
260	} else if (idn__util_ucs4haveaceprefix(from, IDN_RACE_PREFIX)) {
261		r = idn_prohibited;
262		goto ret;
263	}
264
265	if (tolen < prefixlen) {
266		r  = idn_buffer_overflow;
267		goto ret;
268	}
269	memcpy(to, IDN_RACE_PREFIX, prefixlen);
270	to += prefixlen;
271	tolen -= prefixlen;
272
273	fromlen = idn_ucs4_strlen(from);
274	buflen = fromlen * 2 + 2;
275
276	/*
277	 * Convert to UTF-16.
278	 * Preserve space for a character at the top of the buffer.
279	 */
280	for (;;) {
281		unsigned short *new_buf;
282
283		new_buf = realloc(buf, sizeof(*buf) * buflen);
284		if (new_buf == NULL) {
285			r = idn_nomemory;
286			goto ret;
287		}
288		buf = new_buf;
289
290		r = idn_ucs4_ucs4toutf16(from, buf + 1, buflen - 1);
291		if (r == idn_success)
292			break;
293		else if (r != idn_buffer_overflow)
294			goto ret;
295
296		buflen = fromlen * 2 + 2;
297	}
298	p = buf + 1;
299
300	/*
301	 * Now 'p' contains UTF-16 encoded string.
302	 */
303
304	/*
305	 * Check U+0099.
306	 * RACE doesn't permit U+0099 in an input string.
307	 */
308	for (p = buf + 1; *p != '\0'; p++) {
309		if (*p == 0x0099) {
310			r = idn_invalid_encoding;
311			goto ret;
312		}
313	}
314
315	/*
316	 * Compress, encode in base-32 and output.
317	 */
318	compress_mode = get_compress_mode(buf + 1);
319	r = race_compress_encode(buf, compress_mode, to, tolen);
320
321ret:
322	free(buf);
323	if (r == idn_success) {
324		TRACE(("idn__race_encode(): succcess (to=\"%s\")\n",
325		       idn__debug_xstring(to_org, 50)));
326	} else {
327		TRACE(("idn__race_encode(): %s\n", idn_result_tostring(r)));
328	}
329	return (r);
330}
331
332static idn_result_t
333race_compress_encode(const unsigned short *p, int compress_mode,
334		     char *to, size_t tolen)
335{
336	unsigned long bitbuf = *p++;	/* bit stream buffer */
337	int bitlen = 8;			/* # of bits in 'bitbuf' */
338
339	while (*p != '\0' || bitlen > 0) {
340		unsigned int c = *p;
341
342		if (c == '\0') {
343			/* End of data.  Flush. */
344			bitbuf <<= (5 - bitlen);
345			bitlen = 5;
346		} else if (compress_mode == compress_none) {
347			/* Push 16 bit data. */
348			bitbuf = (bitbuf << 16) | c;
349			bitlen += 16;
350			p++;
351		} else {/* compress_mode == compress_one/compress_two */
352			/* Push 8 or 16 bit data. */
353			if (compress_mode == compress_two &&
354			    (c & 0xff00) == 0) {
355				/* Upper octet is zero (and not U1). */
356				bitbuf = (bitbuf << 16) | 0xff00 | c;
357				bitlen += 16;
358			} else if ((c & 0xff) == 0xff) {
359				/* Lower octet is 0xff. */
360				bitbuf = (bitbuf << 16) |
361					(RACE_ESCAPE << 8) | RACE_ESCAPE_2ND;
362				bitlen += 16;
363			} else {
364				/* Just output lower octet. */
365				bitbuf = (bitbuf << 8) | (c & 0xff);
366				bitlen += 8;
367			}
368			p++;
369		}
370
371		/*
372		 * Output bits in 'bitbuf' in 5-bit unit.
373		 */
374		while (bitlen >= 5) {
375			int x;
376
377			/* Get top 5 bits. */
378			x = (bitbuf >> (bitlen - 5)) & 0x1f;
379			bitlen -= 5;
380
381			/* Encode. */
382			if (x < 26)
383				x += 'a';
384			else
385				x = (x - 26) + '2';
386
387			if (tolen < 1)
388				return (idn_buffer_overflow);
389
390			*to++ = x;
391			tolen--;
392		}
393	}
394
395	if (tolen <= 0)
396		return (idn_buffer_overflow);
397
398	*to = '\0';
399	return (idn_success);
400}
401
402static int
403get_compress_mode(unsigned short *p) {
404	int zero = 0;
405	unsigned int upper = 0;
406	unsigned short *modepos = p - 1;
407
408	while (*p != '\0') {
409		unsigned int hi = *p++ & 0xff00;
410
411		if (hi == 0) {
412			zero++;
413		} else if (hi == upper) {
414			;
415		} else if (upper == 0) {
416			upper = hi;
417		} else {
418			*modepos = RACE_2OCTET_MODE;
419			return (compress_none);
420		}
421	}
422	*modepos = upper >> 8;
423	if (upper > 0 && zero > 0)
424		return (compress_two);
425	else
426		return (compress_one);
427}
428