1/*	$NetBSD: punycode.c,v 1.2 2011/02/16 03:46:56 christos Exp $	*/
2
3#ifndef lint
4static char *rcsid = "Id: punycode.c,v 1.1 2003/06/04 00:26:06 marka Exp ";
5#endif
6
7/*
8 * Copyright (c) 2001,2002 Japan Network Information Center.
9 * All rights reserved.
10 *
11 * By using this file, you agree to the terms and conditions set forth bellow.
12 *
13 * 			LICENSE TERMS AND CONDITIONS
14 *
15 * The following License Terms and Conditions apply, unless a different
16 * license is obtained from Japan Network Information Center ("JPNIC"),
17 * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
18 * Chiyoda-ku, Tokyo 101-0047, Japan.
19 *
20 * 1. Use, Modification and Redistribution (including distribution of any
21 *    modified or derived work) in source and/or binary forms is permitted
22 *    under this License Terms and Conditions.
23 *
24 * 2. Redistribution of source code must retain the copyright notices as they
25 *    appear in each source code file, this License Terms and Conditions.
26 *
27 * 3. Redistribution in binary form must reproduce the Copyright Notice,
28 *    this License Terms and Conditions, in the documentation and/or other
29 *    materials provided with the distribution.  For the purposes of binary
30 *    distribution the "Copyright Notice" refers to the following language:
31 *    "Copyright (c) 2000-2002 Japan Network Information Center.  All rights reserved."
32 *
33 * 4. The name of JPNIC may not be used to endorse or promote products
34 *    derived from this Software without specific prior written approval of
35 *    JPNIC.
36 *
37 * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
38 *    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 *    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
40 *    PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL JPNIC BE LIABLE
41 *    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
42 *    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
43 *    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
44 *    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
45 *    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
46 *    OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
47 *    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
48 */
49
50#include <config.h>
51
52#include <stddef.h>
53#include <stdlib.h>
54#include <string.h>
55
56#include <idn/result.h>
57#include <idn/assert.h>
58#include <idn/logmacro.h>
59#include <idn/converter.h>
60#include <idn/ucs4.h>
61#include <idn/debug.h>
62#include <idn/punycode.h>
63#include <idn/util.h>
64
65/*
66 * Although draft-ietf-idn-punycode-00.txt doesn't specify the ACE
67 * signature, we have to choose one.  In order to prevent the converted
68 * name from beginning with a hyphen, we should choose a prefix rather
69 * than a suffix.
70 */
71#ifndef IDN_PUNYCODE_PREFIX
72#define IDN_PUNYCODE_PREFIX	"xn--"
73#endif
74
75#define INVALID_UCS	0x80000000
76#define MAX_UCS		0x10FFFF
77
78/*
79 * As the draft states, it is possible that `delta' may overflow during
80 * the encoding.  The upper bound of 'delta' is:
81 *   <# of chars. of input string> + <max. difference in code point> *
82 *   <# of chars. of input string + 1>
83 * For this value not to be greater than 0xffffffff (since the calculation
84 * is done using unsigned long, which is at least 32bit long), the maxmum
85 * input string size is about 3850 characters, which is long enough for
86 * a domain label...
87 */
88#define PUNYCODE_MAXINPUT	3800
89
90/*
91 * Parameters.
92 */
93#define PUNYCODE_BASE		36
94#define PUNYCODE_TMIN		1
95#define PUNYCODE_TMAX		26
96#define PUNYCODE_SKEW		38
97#define PUNYCODE_DAMP		700
98#define PUNYCODE_INITIAL_BIAS	72
99#define PUNYCODE_INITIAL_N	0x80
100
101static int		punycode_getwc(const char *s, size_t len,
102				      int bias, unsigned long *vp);
103static int		punycode_putwc(char *s, size_t len,
104				      unsigned long delta, int bias);
105static int		punycode_update_bias(unsigned long delta,
106					    size_t npoints, int first);
107
108idn_result_t
109idn__punycode_decode(idn_converter_t ctx, void *privdata,
110		    const char *from, unsigned long *to, size_t tolen) {
111	unsigned long *to_org = to;
112	unsigned long c, idx;
113	size_t prefixlen = strlen(IDN_PUNYCODE_PREFIX);
114	size_t fromlen;
115	size_t uidx, fidx, ucslen;
116	int first, bias;
117	idn_result_t r;
118
119	assert(ctx != NULL);
120
121	TRACE(("idn__punycode_decode(from=\"%s\", tolen=%d)\n",
122	       idn__debug_xstring(from, 50), (int)tolen));
123
124	if (!idn__util_asciihaveaceprefix(from, IDN_PUNYCODE_PREFIX)) {
125		if (*from == '\0') {
126			r = idn_ucs4_utf8toucs4(from, to, tolen);
127			goto ret;
128		}
129		r = idn_invalid_encoding;
130		goto ret;
131	}
132	from += prefixlen;
133	fromlen = strlen(from);
134
135	/*
136	 * Find the last delimiter, and copy the characters
137	 * before it verbatim.
138	 */
139	ucslen = 0;
140	for (fidx = fromlen; fidx > 0; fidx--) {
141		if (from[fidx - 1] == '-') {
142			if (tolen < fidx) {
143				r = idn_buffer_overflow;
144				goto ret;
145			}
146			for (uidx = 0; uidx < fidx - 1; uidx++) {
147				to[uidx] = from[uidx];
148			}
149			ucslen = uidx;
150			break;
151		}
152	}
153
154	first = 1;
155	bias = PUNYCODE_INITIAL_BIAS;
156	c = PUNYCODE_INITIAL_N;
157	idx = 0;
158	while (fidx < fromlen) {
159		int len;
160		unsigned long delta;
161		int i;
162
163		len = punycode_getwc(from + fidx, fromlen - fidx, bias, &delta);
164		if (len == 0) {
165			r = idn_invalid_encoding;
166			goto ret;
167		}
168		fidx += len;
169
170		bias = punycode_update_bias(delta, ucslen + 1, first);
171		first = 0;
172		idx += delta;
173		c += idx / (ucslen + 1);
174		uidx = idx % (ucslen + 1);
175
176		/* Insert 'c' at uidx. */
177		if (tolen-- <= 0) {
178			r = idn_buffer_overflow;
179			goto ret;
180		}
181		for (i = ucslen; i > uidx; i--)
182			to[i] = to[i - 1];
183		to[uidx] = c;
184
185		ucslen++;
186		idx = uidx + 1;
187	}
188
189	/* Terminate with NUL. */
190	if (tolen <= 0) {
191		r = idn_buffer_overflow;
192		goto ret;
193	}
194	to[ucslen] = '\0';
195	r = idn_success;
196
197ret:
198	if (r == idn_success) {
199		TRACE(("idn__punycode_decode(): succcess (to=\"%s\")\n",
200		       idn__debug_ucs4xstring(to_org, 50)));
201	} else {
202		TRACE(("idn__punycode_decode(): %s\n", idn_result_tostring(r)));
203	}
204	return (r);
205}
206
207idn_result_t
208idn__punycode_encode(idn_converter_t ctx, void *privdata,
209		    const unsigned long *from, char *to, size_t tolen) {
210	char *to_org = to;
211	unsigned long cur_code, next_code, delta;
212	size_t prefixlen = strlen(IDN_PUNYCODE_PREFIX);
213	size_t fromlen;
214	size_t ucsdone;
215	size_t toidx;
216	int uidx, bias, first;
217	idn_result_t r;
218
219	assert(ctx != NULL);
220
221	TRACE(("idn__punycode_encode(from=\"%s\", tolen=%d)\n",
222	       idn__debug_ucs4xstring(from, 50), (int)tolen));
223
224	if (*from == '\0') {
225		r = idn_ucs4_ucs4toutf8(from, to, tolen);
226		goto ret;
227	} else if (idn__util_ucs4haveaceprefix(from, IDN_PUNYCODE_PREFIX)) {
228		r = idn_prohibited;
229		goto ret;
230	}
231
232	if (tolen < prefixlen) {
233		r = idn_buffer_overflow;
234		goto ret;
235	}
236	memcpy(to, IDN_PUNYCODE_PREFIX, prefixlen);
237	to += prefixlen;
238	tolen -= prefixlen;
239
240	fromlen = idn_ucs4_strlen(from);
241
242	/*
243	 * If the input string is too long (actually too long to be sane),
244	 * return failure in order to prevent possible overflow.
245	 */
246	if (fromlen > PUNYCODE_MAXINPUT) {
247		ERROR(("idn__punycode_encode(): "
248		       "the input string is too long to convert Punycode\n",
249		       idn__debug_ucs4xstring(from, 50)));
250		r = idn_failure;
251		goto ret;
252	}
253
254	ucsdone = 0;	/* number of characters processed */
255	toidx = 0;
256
257	/*
258	 * First, pick up basic code points and copy them to 'to'.
259	 */
260	for (uidx = 0; uidx < fromlen; uidx++) {
261		if (from[uidx] < 0x80) {
262			if (toidx >= tolen) {
263				r = idn_buffer_overflow;
264				goto ret;
265			}
266			to[toidx++] = from[uidx];
267			ucsdone++;
268		}
269	}
270
271	/*
272	 * If there are any basic code points, output a delimiter
273	 * (hyphen-minus).
274	 */
275	if (toidx > 0) {
276		if (toidx >= tolen) {
277			r = idn_buffer_overflow;
278			goto ret;
279		}
280		to[toidx++] = '-';
281		to += toidx;
282		tolen -= toidx;
283	}
284
285	/*
286	 * Then encode non-basic characters.
287	 */
288	first = 1;
289	cur_code = PUNYCODE_INITIAL_N;
290	bias = PUNYCODE_INITIAL_BIAS;
291	delta = 0;
292	while (ucsdone < fromlen) {
293		int limit = -1, rest;
294
295		/*
296		 * Find the smallest code point equal to or greater
297		 * than 'cur_code'.  Also remember the index of the
298		 * last occurence of the code point.
299		 */
300		for (next_code = MAX_UCS, uidx = fromlen - 1;
301		     uidx >= 0; uidx--) {
302			if (from[uidx] >= cur_code && from[uidx] < next_code) {
303				next_code = from[uidx];
304				limit = uidx;
305			}
306		}
307		/* There must be such code point. */
308		assert(limit >= 0);
309
310		delta += (next_code - cur_code) * (ucsdone + 1);
311		cur_code = next_code;
312
313		/*
314		 * Scan the input string again, and encode characters
315		 * whose code point is 'cur_code'.  Use 'limit' to avoid
316		 * unnecessary scan.
317		 */
318		for (uidx = 0, rest = ucsdone; uidx <= limit; uidx++) {
319			if (from[uidx] < cur_code) {
320				delta++;
321				rest--;
322			} else if (from[uidx] == cur_code) {
323				int sz = punycode_putwc(to, tolen, delta, bias);
324				if (sz == 0) {
325					r = idn_buffer_overflow;
326					goto ret;
327				}
328				to += sz;
329				tolen -= sz;
330				ucsdone++;
331				bias = punycode_update_bias(delta, ucsdone,
332							   first);
333				delta = 0;
334				first = 0;
335			}
336		}
337		delta += rest + 1;
338		cur_code++;
339	}
340
341	/*
342	 * Terminate with NUL.
343	 */
344	if (tolen <= 0) {
345		r = idn_buffer_overflow;
346		goto ret;
347	}
348	*to = '\0';
349	r = idn_success;
350
351ret:
352	if (r == idn_success) {
353		TRACE(("idn__punycode_encode(): succcess (to=\"%s\")\n",
354		       idn__debug_xstring(to_org, 50)));
355	} else {
356		TRACE(("idn__punycode_encode(): %s\n", idn_result_tostring(r)));
357	}
358	return (r);
359}
360
361static int
362punycode_getwc(const char *s, size_t len, int bias, unsigned long *vp) {
363	size_t orglen = len;
364	unsigned long v = 0, w = 1;
365	int k;
366
367	for (k = PUNYCODE_BASE - bias; len > 0; k += PUNYCODE_BASE) {
368		int c = *s++;
369		int t = (k < PUNYCODE_TMIN) ? PUNYCODE_TMIN :
370			(k > PUNYCODE_TMAX) ? PUNYCODE_TMAX : k;
371
372		len--;
373		if ('a' <= c && c <= 'z')
374			c = c - 'a';
375		else if ('A' <= c && c <= 'Z')
376			c = c - 'A';
377		else if ('0' <= c && c <= '9')
378			c = c - '0' + 26;
379		else
380			c = -1;
381
382		if (c < 0)
383			return (0);	/* invalid character */
384
385		v += c * w;
386
387		if (c < t) {
388			*vp = v;
389			return (orglen - len);
390		}
391
392		w *= (PUNYCODE_BASE - t);
393	}
394
395	return (0);	/* final character missing */
396}
397
398static int
399punycode_putwc(char *s, size_t len, unsigned long delta, int bias) {
400	const char *punycode_base36 = "abcdefghijklmnopqrstuvwxyz0123456789";
401	int k;
402	char *sorg = s;
403
404	for (k = PUNYCODE_BASE - bias; 1; k += PUNYCODE_BASE) {
405		int t = (k < PUNYCODE_TMIN) ? PUNYCODE_TMIN :
406			(k > PUNYCODE_TMAX) ? PUNYCODE_TMAX : k;
407
408		if (delta < t)
409			break;
410		if (len < 1)
411			return (0);
412		*s++ = punycode_base36[t + ((delta - t) % (PUNYCODE_BASE - t))];
413		len--;
414		delta = (delta - t) / (PUNYCODE_BASE - t);
415	}
416	if (len < 1)
417		return (0);
418	*s++ = punycode_base36[delta];
419	return (s - sorg);
420}
421
422static int
423punycode_update_bias(unsigned long delta, size_t npoints, int first) {
424	int k = 0;
425
426	delta /= first ? PUNYCODE_DAMP : 2;
427	delta += delta / npoints;
428
429	while (delta > ((PUNYCODE_BASE - PUNYCODE_TMIN) * PUNYCODE_TMAX) / 2) {
430		delta /= PUNYCODE_BASE - PUNYCODE_TMIN;
431		k++;
432	}
433	return (PUNYCODE_BASE * k +
434		(((PUNYCODE_BASE - PUNYCODE_TMIN + 1) * delta) /
435		 (delta + PUNYCODE_SKEW)));
436}
437