1#ifndef lint
2static char *rcsid = "$Id: punycode.c,v 1.1 2003/06/04 00:26:06 marka Exp $";
3#endif
4
5/*
6 * Copyright (c) 2001,2002 Japan Network Information Center.
7 * All rights reserved.
8 *
9 * By using this file, you agree to the terms and conditions set forth bellow.
10 *
11 * 			LICENSE TERMS AND CONDITIONS
12 *
13 * The following License Terms and Conditions apply, unless a different
14 * license is obtained from Japan Network Information Center ("JPNIC"),
15 * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
16 * Chiyoda-ku, Tokyo 101-0047, Japan.
17 *
18 * 1. Use, Modification and Redistribution (including distribution of any
19 *    modified or derived work) in source and/or binary forms is permitted
20 *    under this License Terms and Conditions.
21 *
22 * 2. Redistribution of source code must retain the copyright notices as they
23 *    appear in each source code file, this License Terms and Conditions.
24 *
25 * 3. Redistribution in binary form must reproduce the Copyright Notice,
26 *    this License Terms and Conditions, in the documentation and/or other
27 *    materials provided with the distribution.  For the purposes of binary
28 *    distribution the "Copyright Notice" refers to the following language:
29 *    "Copyright (c) 2000-2002 Japan Network Information Center.  All rights reserved."
30 *
31 * 4. The name of JPNIC may not be used to endorse or promote products
32 *    derived from this Software without specific prior written approval of
33 *    JPNIC.
34 *
35 * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
36 *    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
37 *    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
38 *    PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL JPNIC BE LIABLE
39 *    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
40 *    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
41 *    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
42 *    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
43 *    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
44 *    OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
45 *    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
46 */
47
48#include <config.h>
49
50#include <stddef.h>
51#include <stdlib.h>
52#include <string.h>
53
54#include <idn/result.h>
55#include <idn/assert.h>
56#include <idn/logmacro.h>
57#include <idn/converter.h>
58#include <idn/ucs4.h>
59#include <idn/debug.h>
60#include <idn/punycode.h>
61#include <idn/util.h>
62
63/*
64 * Although draft-ietf-idn-punycode-00.txt doesn't specify the ACE
65 * signature, we have to choose one.  In order to prevent the converted
66 * name from beginning with a hyphen, we should choose a prefix rather
67 * than a suffix.
68 */
69#ifndef IDN_PUNYCODE_PREFIX
70#define IDN_PUNYCODE_PREFIX	"xn--"
71#endif
72
73#define INVALID_UCS	0x80000000
74#define MAX_UCS		0x10FFFF
75
76/*
77 * As the draft states, it is possible that `delta' may overflow during
78 * the encoding.  The upper bound of 'delta' is:
79 *   <# of chars. of input string> + <max. difference in code point> *
80 *   <# of chars. of input string + 1>
81 * For this value not to be greater than 0xffffffff (since the calculation
82 * is done using unsigned long, which is at least 32bit long), the maxmum
83 * input string size is about 3850 characters, which is long enough for
84 * a domain label...
85 */
86#define PUNYCODE_MAXINPUT	3800
87
88/*
89 * Parameters.
90 */
91#define PUNYCODE_BASE		36
92#define PUNYCODE_TMIN		1
93#define PUNYCODE_TMAX		26
94#define PUNYCODE_SKEW		38
95#define PUNYCODE_DAMP		700
96#define PUNYCODE_INITIAL_BIAS	72
97#define PUNYCODE_INITIAL_N	0x80
98
99static int		punycode_getwc(const char *s, size_t len,
100				      int bias, unsigned long *vp);
101static int		punycode_putwc(char *s, size_t len,
102				      unsigned long delta, int bias);
103static int		punycode_update_bias(unsigned long delta,
104					    size_t npoints, int first);
105
106idn_result_t
107idn__punycode_decode(idn_converter_t ctx, void *privdata,
108		    const char *from, unsigned long *to, size_t tolen) {
109	unsigned long *to_org = to;
110	unsigned long c, idx;
111	size_t prefixlen = strlen(IDN_PUNYCODE_PREFIX);
112	size_t fromlen;
113	size_t uidx, fidx, ucslen;
114	int first, bias;
115	idn_result_t r;
116
117	assert(ctx != NULL);
118
119	TRACE(("idn__punycode_decode(from=\"%s\", tolen=%d)\n",
120	       idn__debug_xstring(from, 50), (int)tolen));
121
122	if (!idn__util_asciihaveaceprefix(from, IDN_PUNYCODE_PREFIX)) {
123		if (*from == '\0') {
124			r = idn_ucs4_utf8toucs4(from, to, tolen);
125			goto ret;
126		}
127		r = idn_invalid_encoding;
128		goto ret;
129	}
130	from += prefixlen;
131	fromlen = strlen(from);
132
133	/*
134	 * Find the last delimiter, and copy the characters
135	 * before it verbatim.
136	 */
137	ucslen = 0;
138	for (fidx = fromlen; fidx > 0; fidx--) {
139		if (from[fidx - 1] == '-') {
140			if (tolen < fidx) {
141				r = idn_buffer_overflow;
142				goto ret;
143			}
144			for (uidx = 0; uidx < fidx - 1; uidx++) {
145				to[uidx] = from[uidx];
146			}
147			ucslen = uidx;
148			break;
149		}
150	}
151
152	first = 1;
153	bias = PUNYCODE_INITIAL_BIAS;
154	c = PUNYCODE_INITIAL_N;
155	idx = 0;
156	while (fidx < fromlen) {
157		int len;
158		unsigned long delta;
159		int i;
160
161		len = punycode_getwc(from + fidx, fromlen - fidx, bias, &delta);
162		if (len == 0) {
163			r = idn_invalid_encoding;
164			goto ret;
165		}
166		fidx += len;
167
168		bias = punycode_update_bias(delta, ucslen + 1, first);
169		first = 0;
170		idx += delta;
171		c += idx / (ucslen + 1);
172		uidx = idx % (ucslen + 1);
173
174		/* Insert 'c' at uidx. */
175		if (tolen-- <= 0) {
176			r = idn_buffer_overflow;
177			goto ret;
178		}
179		for (i = ucslen; i > uidx; i--)
180			to[i] = to[i - 1];
181		to[uidx] = c;
182
183		ucslen++;
184		idx = uidx + 1;
185	}
186
187	/* Terminate with NUL. */
188	if (tolen <= 0) {
189		r = idn_buffer_overflow;
190		goto ret;
191	}
192	to[ucslen] = '\0';
193	r = idn_success;
194
195ret:
196	if (r == idn_success) {
197		TRACE(("idn__punycode_decode(): succcess (to=\"%s\")\n",
198		       idn__debug_ucs4xstring(to_org, 50)));
199	} else {
200		TRACE(("idn__punycode_decode(): %s\n", idn_result_tostring(r)));
201	}
202	return (r);
203}
204
205idn_result_t
206idn__punycode_encode(idn_converter_t ctx, void *privdata,
207		    const unsigned long *from, char *to, size_t tolen) {
208	char *to_org = to;
209	unsigned long cur_code, next_code, delta;
210	size_t prefixlen = strlen(IDN_PUNYCODE_PREFIX);
211	size_t fromlen;
212	size_t ucsdone;
213	size_t toidx;
214	int uidx, bias, first;
215	idn_result_t r;
216
217	assert(ctx != NULL);
218
219	TRACE(("idn__punycode_encode(from=\"%s\", tolen=%d)\n",
220	       idn__debug_ucs4xstring(from, 50), (int)tolen));
221
222	if (*from == '\0') {
223		r = idn_ucs4_ucs4toutf8(from, to, tolen);
224		goto ret;
225	} else if (idn__util_ucs4haveaceprefix(from, IDN_PUNYCODE_PREFIX)) {
226		r = idn_prohibited;
227		goto ret;
228	}
229
230	if (tolen < prefixlen) {
231		r = idn_buffer_overflow;
232		goto ret;
233	}
234	memcpy(to, IDN_PUNYCODE_PREFIX, prefixlen);
235	to += prefixlen;
236	tolen -= prefixlen;
237
238	fromlen = idn_ucs4_strlen(from);
239
240	/*
241	 * If the input string is too long (actually too long to be sane),
242	 * return failure in order to prevent possible overflow.
243	 */
244	if (fromlen > PUNYCODE_MAXINPUT) {
245		ERROR(("idn__punycode_encode(): "
246		       "the input string is too long to convert Punycode\n",
247		       idn__debug_ucs4xstring(from, 50)));
248		r = idn_failure;
249		goto ret;
250	}
251
252	ucsdone = 0;	/* number of characters processed */
253	toidx = 0;
254
255	/*
256	 * First, pick up basic code points and copy them to 'to'.
257	 */
258	for (uidx = 0; uidx < fromlen; uidx++) {
259		if (from[uidx] < 0x80) {
260			if (toidx >= tolen) {
261				r = idn_buffer_overflow;
262				goto ret;
263			}
264			to[toidx++] = from[uidx];
265			ucsdone++;
266		}
267	}
268
269	/*
270	 * If there are any basic code points, output a delimiter
271	 * (hyphen-minus).
272	 */
273	if (toidx > 0) {
274		if (toidx >= tolen) {
275			r = idn_buffer_overflow;
276			goto ret;
277		}
278		to[toidx++] = '-';
279		to += toidx;
280		tolen -= toidx;
281	}
282
283	/*
284	 * Then encode non-basic characters.
285	 */
286	first = 1;
287	cur_code = PUNYCODE_INITIAL_N;
288	bias = PUNYCODE_INITIAL_BIAS;
289	delta = 0;
290	while (ucsdone < fromlen) {
291		int limit = -1, rest;
292
293		/*
294		 * Find the smallest code point equal to or greater
295		 * than 'cur_code'.  Also remember the index of the
296		 * last occurence of the code point.
297		 */
298		for (next_code = MAX_UCS, uidx = fromlen - 1;
299		     uidx >= 0; uidx--) {
300			if (from[uidx] >= cur_code && from[uidx] < next_code) {
301				next_code = from[uidx];
302				limit = uidx;
303			}
304		}
305		/* There must be such code point. */
306		assert(limit >= 0);
307
308		delta += (next_code - cur_code) * (ucsdone + 1);
309		cur_code = next_code;
310
311		/*
312		 * Scan the input string again, and encode characters
313		 * whose code point is 'cur_code'.  Use 'limit' to avoid
314		 * unnecessary scan.
315		 */
316		for (uidx = 0, rest = ucsdone; uidx <= limit; uidx++) {
317			if (from[uidx] < cur_code) {
318				delta++;
319				rest--;
320			} else if (from[uidx] == cur_code) {
321				int sz = punycode_putwc(to, tolen, delta, bias);
322				if (sz == 0) {
323					r = idn_buffer_overflow;
324					goto ret;
325				}
326				to += sz;
327				tolen -= sz;
328				ucsdone++;
329				bias = punycode_update_bias(delta, ucsdone,
330							   first);
331				delta = 0;
332				first = 0;
333			}
334		}
335		delta += rest + 1;
336		cur_code++;
337	}
338
339	/*
340	 * Terminate with NUL.
341	 */
342	if (tolen <= 0) {
343		r = idn_buffer_overflow;
344		goto ret;
345	}
346	*to = '\0';
347	r = idn_success;
348
349ret:
350	if (r == idn_success) {
351		TRACE(("idn__punycode_encode(): succcess (to=\"%s\")\n",
352		       idn__debug_xstring(to_org, 50)));
353	} else {
354		TRACE(("idn__punycode_encode(): %s\n", idn_result_tostring(r)));
355	}
356	return (r);
357}
358
359static int
360punycode_getwc(const char *s, size_t len, int bias, unsigned long *vp) {
361	size_t orglen = len;
362	unsigned long v = 0, w = 1;
363	int k;
364
365	for (k = PUNYCODE_BASE - bias; len > 0; k += PUNYCODE_BASE) {
366		int c = *s++;
367		int t = (k < PUNYCODE_TMIN) ? PUNYCODE_TMIN :
368			(k > PUNYCODE_TMAX) ? PUNYCODE_TMAX : k;
369
370		len--;
371		if ('a' <= c && c <= 'z')
372			c = c - 'a';
373		else if ('A' <= c && c <= 'Z')
374			c = c - 'A';
375		else if ('0' <= c && c <= '9')
376			c = c - '0' + 26;
377		else
378			c = -1;
379
380		if (c < 0)
381			return (0);	/* invalid character */
382
383		v += c * w;
384
385		if (c < t) {
386			*vp = v;
387			return (orglen - len);
388		}
389
390		w *= (PUNYCODE_BASE - t);
391	}
392
393	return (0);	/* final character missing */
394}
395
396static int
397punycode_putwc(char *s, size_t len, unsigned long delta, int bias) {
398	const char *punycode_base36 = "abcdefghijklmnopqrstuvwxyz0123456789";
399	int k;
400	char *sorg = s;
401
402	for (k = PUNYCODE_BASE - bias; 1; k += PUNYCODE_BASE) {
403		int t = (k < PUNYCODE_TMIN) ? PUNYCODE_TMIN :
404			(k > PUNYCODE_TMAX) ? PUNYCODE_TMAX : k;
405
406		if (delta < t)
407			break;
408		if (len < 1)
409			return (0);
410		*s++ = punycode_base36[t + ((delta - t) % (PUNYCODE_BASE - t))];
411		len--;
412		delta = (delta - t) / (PUNYCODE_BASE - t);
413	}
414	if (len < 1)
415		return (0);
416	*s++ = punycode_base36[delta];
417	return (s - sorg);
418}
419
420static int
421punycode_update_bias(unsigned long delta, size_t npoints, int first) {
422	int k = 0;
423
424	delta /= first ? PUNYCODE_DAMP : 2;
425	delta += delta / npoints;
426
427	while (delta > ((PUNYCODE_BASE - PUNYCODE_TMIN) * PUNYCODE_TMAX) / 2) {
428		delta /= PUNYCODE_BASE - PUNYCODE_TMIN;
429		k++;
430	}
431	return (PUNYCODE_BASE * k +
432		(((PUNYCODE_BASE - PUNYCODE_TMIN + 1) * delta) /
433		 (delta + PUNYCODE_SKEW)));
434}
435