1/*	$NetBSD: unormalize.c,v 1.2.6.1 2012/06/06 18:18:08 bouyer Exp $	*/
2
3#ifndef lint
4static char *rcsid = "Id: unormalize.c,v 1.1 2003/06/04 00:26:43 marka Exp ";
5#endif
6
7/*
8 * Copyright (c) 2000,2001,2002 Japan Network Information Center.
9 * All rights reserved.
10 *
11 * By using this file, you agree to the terms and conditions set forth bellow.
12 *
13 * 			LICENSE TERMS AND CONDITIONS
14 *
15 * The following License Terms and Conditions apply, unless a different
16 * license is obtained from Japan Network Information Center ("JPNIC"),
17 * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
18 * Chiyoda-ku, Tokyo 101-0047, Japan.
19 *
20 * 1. Use, Modification and Redistribution (including distribution of any
21 *    modified or derived work) in source and/or binary forms is permitted
22 *    under this License Terms and Conditions.
23 *
24 * 2. Redistribution of source code must retain the copyright notices as they
25 *    appear in each source code file, this License Terms and Conditions.
26 *
27 * 3. Redistribution in binary form must reproduce the Copyright Notice,
28 *    this License Terms and Conditions, in the documentation and/or other
29 *    materials provided with the distribution.  For the purposes of binary
30 *    distribution the "Copyright Notice" refers to the following language:
31 *    "Copyright (c) 2000-2002 Japan Network Information Center.  All rights reserved."
32 *
33 * 4. The name of JPNIC may not be used to endorse or promote products
34 *    derived from this Software without specific prior written approval of
35 *    JPNIC.
36 *
37 * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
38 *    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 *    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
40 *    PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL JPNIC BE LIABLE
41 *    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
42 *    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
43 *    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
44 *    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
45 *    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
46 *    OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
47 *    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
48 */
49
50#include <config.h>
51
52#include <stddef.h>
53#include <stdlib.h>
54#include <string.h>
55
56#include <idn/result.h>
57#include <idn/assert.h>
58#include <idn/logmacro.h>
59#include <idn/ucs4.h>
60#include <idn/unicode.h>
61#include <idn/unormalize.h>
62#include <idn/debug.h>
63
64#if !defined(HAVE_MEMMOVE) && defined(HAVE_BCOPY)
65#define memmove(a,b,c)	bcopy((char *)(b),(char *)(a),(int)(c))
66#endif
67
68#define WORKBUF_SIZE		128
69#define WORKBUF_SIZE_MAX	10000
70
71typedef struct {
72	idn__unicode_version_t version; /* Unicode version */
73	int cur;		/* pointing now processing character */
74	int last;		/* pointing just after the last character */
75	int size;		/* size of UCS and CLASS array */
76	unsigned long *ucs4;	/* UCS-4 characters */
77	int *class;		/* and their canonical classes */
78	unsigned long ucs4_buf[WORKBUF_SIZE];	/* local buffer */
79	int class_buf[WORKBUF_SIZE];		/* ditto */
80} workbuf_t;
81
82static idn_result_t	normalize(idn__unicode_version_t version,
83				  int do_composition, int compat,
84				  const unsigned long *from,
85				  unsigned long *to, size_t tolen);
86static idn_result_t	decompose(workbuf_t *wb, unsigned long c, int compat);
87static void		get_class(workbuf_t *wb);
88static void		reorder(workbuf_t *wb);
89static void		compose(workbuf_t *wb);
90static idn_result_t	flush_before_cur(workbuf_t *wb,
91					 unsigned long **top, size_t *tolenp);
92static void		workbuf_init(workbuf_t *wb);
93static void		workbuf_free(workbuf_t *wb);
94static idn_result_t	workbuf_extend(workbuf_t *wb);
95static idn_result_t	workbuf_append(workbuf_t *wb, unsigned long c);
96static void		workbuf_shift(workbuf_t *wb, int shift);
97static void		workbuf_removevoid(workbuf_t *wb);
98
99idn_result_t
100idn__unormalize_formkc(idn__unicode_version_t version,
101		       const unsigned long *from, unsigned long *to,
102		       size_t tolen) {
103	assert(version != NULL && from != NULL && to != NULL && tolen >= 0);
104	TRACE(("idn__unormalize_formkc(from=\"%s\", tolen=%d)\n",
105	       idn__debug_ucs4xstring(from, 50), tolen));
106	return (normalize(version, 1, 1, from, to, tolen));
107}
108
109static idn_result_t
110normalize(idn__unicode_version_t version, int do_composition, int compat,
111	  const unsigned long *from, unsigned long *to, size_t tolen) {
112	workbuf_t wb;
113	idn_result_t r = idn_success;
114
115	/*
116	 * Initialize working buffer.
117	 */
118	workbuf_init(&wb);
119	wb.version = version;
120
121	while (*from != '\0') {
122		unsigned long c;
123
124		assert(wb.cur == wb.last);
125
126		/*
127		 * Get one character from 'from'.
128		 */
129		c = *from++;
130
131		/*
132		 * Decompose it.
133		 */
134		if ((r = decompose(&wb, c, compat)) != idn_success)
135			goto ret;
136
137		/*
138		 * Get canonical class.
139		 */
140		get_class(&wb);
141
142		/*
143		 * Reorder & compose.
144		 */
145		for (; wb.cur < wb.last; wb.cur++) {
146			if (wb.cur == 0) {
147				continue;
148			} else if (wb.class[wb.cur] > 0) {
149				/*
150				 * This is not a starter. Try reordering.
151				 * Note that characters up to it are
152				 * already in canonical order.
153				 */
154				reorder(&wb);
155				continue;
156			}
157
158			/*
159			 * This is a starter character, and there are
160			 * some characters before it.  Those characters
161			 * have been reordered properly, and
162			 * ready for composition.
163			 */
164			if (do_composition && wb.class[0] == 0)
165				compose(&wb);
166
167			/*
168			 * If CUR points to a starter character,
169			 * then process of characters before CUR are
170			 * already finished, because any further
171			 * reordering/composition for them are blocked
172			 * by the starter CUR points.
173			 */
174			if (wb.cur > 0 && wb.class[wb.cur] == 0) {
175				/* Flush everything before CUR. */
176				r = flush_before_cur(&wb, &to, &tolen);
177				if (r != idn_success)
178					goto ret;
179			}
180		}
181	}
182
183	if (r == idn_success) {
184		if (do_composition && wb.cur > 0 && wb.class[0] == 0) {
185			/*
186			 * There is some characters left in WB.
187			 * They are ordered, but not composed yet.
188			 * Now CUR points just after the last character in WB,
189			 * and since compose() tries to compose characters
190			 * between top and CUR inclusive, we must make CUR
191			 * one character back during compose().
192			 */
193			wb.cur--;
194			compose(&wb);
195			wb.cur++;
196		}
197		/*
198		 * Call this even when WB.CUR == 0, to make TO
199		 * NUL-terminated.
200		 */
201		r = flush_before_cur(&wb, &to, &tolen);
202		if (r != idn_success)
203			goto ret;
204	}
205
206	if (tolen <= 0) {
207		r = idn_buffer_overflow;
208		goto ret;
209	}
210	*to = '\0';
211
212ret:
213	workbuf_free(&wb);
214	return (r);
215}
216
217static idn_result_t
218decompose(workbuf_t *wb, unsigned long c, int compat) {
219	idn_result_t r;
220	int dec_len;
221
222again:
223	r = idn__unicode_decompose(wb->version, compat, wb->ucs4 + wb->last,
224				   wb->size - wb->last, c, &dec_len);
225	switch (r) {
226	case idn_success:
227		wb->last += dec_len;
228		return (idn_success);
229	case idn_notfound:
230		return (workbuf_append(wb, c));
231	case idn_buffer_overflow:
232		if ((r = workbuf_extend(wb)) != idn_success)
233			return (r);
234		if (wb->size > WORKBUF_SIZE_MAX) {
235			WARNING(("idn__unormalize_form*: "
236				"working buffer too large\n"));
237			return (idn_nomemory);
238		}
239		goto again;
240	default:
241		return (r);
242	}
243	/* NOTREACHED */
244}
245
246static void
247get_class(workbuf_t *wb) {
248	int i;
249
250	for (i = wb->cur; i < wb->last; i++)
251		wb->class[i] = idn__unicode_canonicalclass(wb->version,
252							   wb->ucs4[i]);
253}
254
255static void
256reorder(workbuf_t *wb) {
257	unsigned long c;
258	int i;
259	int class;
260
261	assert(wb != NULL);
262
263	i = wb->cur;
264	c = wb->ucs4[i];
265	class = wb->class[i];
266
267	while (i > 0 && wb->class[i - 1] > class) {
268		wb->ucs4[i] = wb->ucs4[i - 1];
269		wb->class[i] =wb->class[i - 1];
270		i--;
271		wb->ucs4[i] = c;
272		wb->class[i] = class;
273	}
274}
275
276static void
277compose(workbuf_t *wb) {
278	int cur;
279	unsigned long *ucs4;
280	int *class;
281	int last_class;
282	int nvoids;
283	int i;
284	idn__unicode_version_t ver;
285
286	assert(wb != NULL && wb->class[0] == 0);
287
288	cur = wb->cur;
289	ucs4 = wb->ucs4;
290	class = wb->class;
291	ver = wb->version;
292
293	/*
294	 * If there are no decomposition sequence that begins with
295	 * the top character, composition is impossible.
296	 */
297	if (!idn__unicode_iscompositecandidate(ver, ucs4[0]))
298		return;
299
300	last_class = 0;
301	nvoids = 0;
302	for (i = 1; i <= cur; i++) {
303		unsigned long c;
304		int cl = class[i];
305
306		if ((last_class < cl || cl == 0) &&
307		    idn__unicode_compose(ver, ucs4[0], ucs4[i],
308					 &c) == idn_success) {
309			/*
310			 * Replace the top character with the composed one.
311			 */
312			ucs4[0] = c;
313			class[0] = idn__unicode_canonicalclass(ver, c);
314
315			class[i] = -1;	/* void this character */
316			nvoids++;
317		} else {
318			last_class = cl;
319		}
320	}
321
322	/* Purge void characters, if any. */
323	if (nvoids > 0)
324		workbuf_removevoid(wb);
325}
326
327static idn_result_t
328flush_before_cur(workbuf_t *wb, unsigned long **top, size_t *tolenp) {
329	if (*tolenp < wb->cur)
330		return (idn_buffer_overflow);
331
332	memcpy(*top, wb->ucs4, sizeof(**top) * wb->cur);
333	*top += wb->cur;
334	*tolenp -= wb->cur;
335	workbuf_shift(wb, wb->cur);
336
337	return (idn_success);
338}
339
340static void
341workbuf_init(workbuf_t *wb) {
342	wb->cur = 0;
343	wb->last = 0;
344	wb->size = WORKBUF_SIZE;
345	wb->ucs4 = wb->ucs4_buf;
346	wb->class = wb->class_buf;
347}
348
349static void
350workbuf_free(workbuf_t *wb) {
351	if (wb->ucs4 != wb->ucs4_buf) {
352		free(wb->ucs4);
353		free(wb->class);
354	}
355}
356
357static idn_result_t
358workbuf_extend(workbuf_t *wb) {
359	int newsize = wb->size * 3;
360
361	if (wb->ucs4 == wb->ucs4_buf) {
362		wb->ucs4 = malloc(sizeof(wb->ucs4[0]) * newsize);
363		wb->class = malloc(sizeof(wb->class[0]) * newsize);
364	} else {
365		wb->ucs4 = realloc(wb->ucs4, sizeof(wb->ucs4[0]) * newsize);
366		wb->class = realloc(wb->class, sizeof(wb->class[0]) * newsize);
367	}
368	if (wb->ucs4 == NULL || wb->class == NULL)
369		return (idn_nomemory);
370	else
371		return (idn_success);
372}
373
374static idn_result_t
375workbuf_append(workbuf_t *wb, unsigned long c) {
376	idn_result_t r;
377
378	if (wb->last >= wb->size && (r = workbuf_extend(wb)) != idn_success)
379		return (r);
380	wb->ucs4[wb->last++] = c;
381	return (idn_success);
382}
383
384static void
385workbuf_shift(workbuf_t *wb, int shift) {
386	int nmove;
387
388	assert(wb != NULL && wb->cur >= shift);
389
390	nmove = wb->last - shift;
391	(void)memmove(&wb->ucs4[0], &wb->ucs4[shift],
392		      nmove * sizeof(wb->ucs4[0]));
393	(void)memmove(&wb->class[0], &wb->class[shift],
394		      nmove * sizeof(wb->class[0]));
395	wb->cur -= shift;
396	wb->last -= shift;
397}
398
399static void
400workbuf_removevoid(workbuf_t *wb) {
401	int i, j;
402	int last = wb->last;
403
404	for (i = j = 0; i < last; i++) {
405		if (wb->class[i] >= 0) {
406			if (j < i) {
407				wb->ucs4[j] = wb->ucs4[i];
408				wb->class[j] = wb->class[i];
409			}
410			j++;
411		}
412	}
413	wb->cur -= last - j;
414	wb->last = j;
415}
416