1#ifndef lint
2static char *rcsid = "$Id: unormalize.c,v 1.1 2003/06/04 00:26:43 marka Exp $";
3#endif
4
5/*
6 * Copyright (c) 2000,2001,2002 Japan Network Information Center.
7 * All rights reserved.
8 *
9 * By using this file, you agree to the terms and conditions set forth bellow.
10 *
11 * 			LICENSE TERMS AND CONDITIONS
12 *
13 * The following License Terms and Conditions apply, unless a different
14 * license is obtained from Japan Network Information Center ("JPNIC"),
15 * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
16 * Chiyoda-ku, Tokyo 101-0047, Japan.
17 *
18 * 1. Use, Modification and Redistribution (including distribution of any
19 *    modified or derived work) in source and/or binary forms is permitted
20 *    under this License Terms and Conditions.
21 *
22 * 2. Redistribution of source code must retain the copyright notices as they
23 *    appear in each source code file, this License Terms and Conditions.
24 *
25 * 3. Redistribution in binary form must reproduce the Copyright Notice,
26 *    this License Terms and Conditions, in the documentation and/or other
27 *    materials provided with the distribution.  For the purposes of binary
28 *    distribution the "Copyright Notice" refers to the following language:
29 *    "Copyright (c) 2000-2002 Japan Network Information Center.  All rights reserved."
30 *
31 * 4. The name of JPNIC may not be used to endorse or promote products
32 *    derived from this Software without specific prior written approval of
33 *    JPNIC.
34 *
35 * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
36 *    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
37 *    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
38 *    PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL JPNIC BE LIABLE
39 *    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
40 *    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
41 *    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
42 *    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
43 *    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
44 *    OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
45 *    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
46 */
47
48#include <config.h>
49
50#include <stddef.h>
51#include <stdlib.h>
52#include <string.h>
53
54#include <idn/result.h>
55#include <idn/assert.h>
56#include <idn/logmacro.h>
57#include <idn/ucs4.h>
58#include <idn/unicode.h>
59#include <idn/unormalize.h>
60#include <idn/debug.h>
61
62#if !defined(HAVE_MEMMOVE) && defined(HAVE_BCOPY)
63#define memmove(a,b,c)	bcopy((char *)(b),(char *)(a),(int)(c))
64#endif
65
66#define WORKBUF_SIZE		128
67#define WORKBUF_SIZE_MAX	10000
68
69typedef struct {
70	idn__unicode_version_t version; /* Unicode version */
71	int cur;		/* pointing now processing character */
72	int last;		/* pointing just after the last character */
73	int size;		/* size of UCS and CLASS array */
74	unsigned long *ucs4;	/* UCS-4 characters */
75	int *class;		/* and their canonical classes */
76	unsigned long ucs4_buf[WORKBUF_SIZE];	/* local buffer */
77	int class_buf[WORKBUF_SIZE];		/* ditto */
78} workbuf_t;
79
80static idn_result_t	normalize(idn__unicode_version_t version,
81				  int do_composition, int compat,
82				  const unsigned long *from,
83				  unsigned long *to, size_t tolen);
84static idn_result_t	decompose(workbuf_t *wb, unsigned long c, int compat);
85static void		get_class(workbuf_t *wb);
86static void		reorder(workbuf_t *wb);
87static void		compose(workbuf_t *wb);
88static idn_result_t	flush_before_cur(workbuf_t *wb,
89					 unsigned long **top, size_t *tolenp);
90static void		workbuf_init(workbuf_t *wb);
91static void		workbuf_free(workbuf_t *wb);
92static idn_result_t	workbuf_extend(workbuf_t *wb);
93static idn_result_t	workbuf_append(workbuf_t *wb, unsigned long c);
94static void		workbuf_shift(workbuf_t *wb, int shift);
95static void		workbuf_removevoid(workbuf_t *wb);
96
97idn_result_t
98idn__unormalize_formkc(idn__unicode_version_t version,
99		       const unsigned long *from, unsigned long *to,
100		       size_t tolen) {
101	assert(version != NULL && from != NULL && to != NULL && tolen >= 0);
102	TRACE(("idn__unormalize_formkc(from=\"%s\", tolen=%d)\n",
103	       idn__debug_ucs4xstring(from, 50), tolen));
104	return (normalize(version, 1, 1, from, to, tolen));
105}
106
107static idn_result_t
108normalize(idn__unicode_version_t version, int do_composition, int compat,
109	  const unsigned long *from, unsigned long *to, size_t tolen) {
110	workbuf_t wb;
111	idn_result_t r = idn_success;
112
113	/*
114	 * Initialize working buffer.
115	 */
116	workbuf_init(&wb);
117	wb.version = version;
118
119	while (*from != '\0') {
120		unsigned long c;
121
122		assert(wb.cur == wb.last);
123
124		/*
125		 * Get one character from 'from'.
126		 */
127		c = *from++;
128
129		/*
130		 * Decompose it.
131		 */
132		if ((r = decompose(&wb, c, compat)) != idn_success)
133			goto ret;
134
135		/*
136		 * Get canonical class.
137		 */
138		get_class(&wb);
139
140		/*
141		 * Reorder & compose.
142		 */
143		for (; wb.cur < wb.last; wb.cur++) {
144			if (wb.cur == 0) {
145				continue;
146			} else if (wb.class[wb.cur] > 0) {
147				/*
148				 * This is not a starter. Try reordering.
149				 * Note that characters up to it are
150				 * already in canonical order.
151				 */
152				reorder(&wb);
153				continue;
154			}
155
156			/*
157			 * This is a starter character, and there are
158			 * some characters before it.  Those characters
159			 * have been reordered properly, and
160			 * ready for composition.
161			 */
162			if (do_composition && wb.class[0] == 0)
163				compose(&wb);
164
165			/*
166			 * If CUR points to a starter character,
167			 * then process of characters before CUR are
168			 * already finished, because any further
169			 * reordering/composition for them are blocked
170			 * by the starter CUR points.
171			 */
172			if (wb.cur > 0 && wb.class[wb.cur] == 0) {
173				/* Flush everything before CUR. */
174				r = flush_before_cur(&wb, &to, &tolen);
175				if (r != idn_success)
176					goto ret;
177			}
178		}
179	}
180
181	if (r == idn_success) {
182		if (do_composition && wb.cur > 0 && wb.class[0] == 0) {
183			/*
184			 * There is some characters left in WB.
185			 * They are ordered, but not composed yet.
186			 * Now CUR points just after the last character in WB,
187			 * and since compose() tries to compose characters
188			 * between top and CUR inclusive, we must make CUR
189			 * one character back during compose().
190			 */
191			wb.cur--;
192			compose(&wb);
193			wb.cur++;
194		}
195		/*
196		 * Call this even when WB.CUR == 0, to make TO
197		 * NUL-terminated.
198		 */
199		r = flush_before_cur(&wb, &to, &tolen);
200		if (r != idn_success)
201			goto ret;
202	}
203
204	if (tolen <= 0) {
205		r = idn_buffer_overflow;
206		goto ret;
207	}
208	*to = '\0';
209
210ret:
211	workbuf_free(&wb);
212	return (r);
213}
214
215static idn_result_t
216decompose(workbuf_t *wb, unsigned long c, int compat) {
217	idn_result_t r;
218	int dec_len;
219
220again:
221	r = idn__unicode_decompose(wb->version, compat, wb->ucs4 + wb->last,
222				   wb->size - wb->last, c, &dec_len);
223	switch (r) {
224	case idn_success:
225		wb->last += dec_len;
226		return (idn_success);
227	case idn_notfound:
228		return (workbuf_append(wb, c));
229	case idn_buffer_overflow:
230		if ((r = workbuf_extend(wb)) != idn_success)
231			return (r);
232		if (wb->size > WORKBUF_SIZE_MAX) {
233			WARNING(("idn__unormalize_form*: "
234				"working buffer too large\n"));
235			return (idn_nomemory);
236		}
237		goto again;
238	default:
239		return (r);
240	}
241	/* NOTREACHED */
242}
243
244static void
245get_class(workbuf_t *wb) {
246	int i;
247
248	for (i = wb->cur; i < wb->last; i++)
249		wb->class[i] = idn__unicode_canonicalclass(wb->version,
250							   wb->ucs4[i]);
251}
252
253static void
254reorder(workbuf_t *wb) {
255	unsigned long c;
256	int i;
257	int class;
258
259	assert(wb != NULL);
260
261	i = wb->cur;
262	c = wb->ucs4[i];
263	class = wb->class[i];
264
265	while (i > 0 && wb->class[i - 1] > class) {
266		wb->ucs4[i] = wb->ucs4[i - 1];
267		wb->class[i] =wb->class[i - 1];
268		i--;
269		wb->ucs4[i] = c;
270		wb->class[i] = class;
271	}
272}
273
274static void
275compose(workbuf_t *wb) {
276	int cur;
277	unsigned long *ucs4;
278	int *class;
279	int last_class;
280	int nvoids;
281	int i;
282	idn__unicode_version_t ver;
283
284	assert(wb != NULL && wb->class[0] == 0);
285
286	cur = wb->cur;
287	ucs4 = wb->ucs4;
288	class = wb->class;
289	ver = wb->version;
290
291	/*
292	 * If there are no decomposition sequence that begins with
293	 * the top character, composition is impossible.
294	 */
295	if (!idn__unicode_iscompositecandidate(ver, ucs4[0]))
296		return;
297
298	last_class = 0;
299	nvoids = 0;
300	for (i = 1; i <= cur; i++) {
301		unsigned long c;
302		int cl = class[i];
303
304		if ((last_class < cl || cl == 0) &&
305		    idn__unicode_compose(ver, ucs4[0], ucs4[i],
306					 &c) == idn_success) {
307			/*
308			 * Replace the top character with the composed one.
309			 */
310			ucs4[0] = c;
311			class[0] = idn__unicode_canonicalclass(ver, c);
312
313			class[i] = -1;	/* void this character */
314			nvoids++;
315		} else {
316			last_class = cl;
317		}
318	}
319
320	/* Purge void characters, if any. */
321	if (nvoids > 0)
322		workbuf_removevoid(wb);
323}
324
325static idn_result_t
326flush_before_cur(workbuf_t *wb, unsigned long **top, size_t *tolenp) {
327	if (*tolenp < wb->cur)
328		return (idn_buffer_overflow);
329
330	memcpy(*top, wb->ucs4, sizeof(**top) * wb->cur);
331	*top += wb->cur;
332	*tolenp -= wb->cur;
333	workbuf_shift(wb, wb->cur);
334
335	return (idn_success);
336}
337
338static void
339workbuf_init(workbuf_t *wb) {
340	wb->cur = 0;
341	wb->last = 0;
342	wb->size = WORKBUF_SIZE;
343	wb->ucs4 = wb->ucs4_buf;
344	wb->class = wb->class_buf;
345}
346
347static void
348workbuf_free(workbuf_t *wb) {
349	if (wb->ucs4 != wb->ucs4_buf) {
350		free(wb->ucs4);
351		free(wb->class);
352	}
353}
354
355static idn_result_t
356workbuf_extend(workbuf_t *wb) {
357	int newsize = wb->size * 3;
358
359	if (wb->ucs4 == wb->ucs4_buf) {
360		wb->ucs4 = malloc(sizeof(wb->ucs4[0]) * newsize);
361		wb->class = malloc(sizeof(wb->class[0]) * newsize);
362	} else {
363		wb->ucs4 = realloc(wb->ucs4, sizeof(wb->ucs4[0]) * newsize);
364		wb->class = realloc(wb->class, sizeof(wb->class[0]) * newsize);
365	}
366	if (wb->ucs4 == NULL || wb->class == NULL)
367		return (idn_nomemory);
368	else
369		return (idn_success);
370}
371
372static idn_result_t
373workbuf_append(workbuf_t *wb, unsigned long c) {
374	idn_result_t r;
375
376	if (wb->last >= wb->size && (r = workbuf_extend(wb)) != idn_success)
377		return (r);
378	wb->ucs4[wb->last++] = c;
379	return (idn_success);
380}
381
382static void
383workbuf_shift(workbuf_t *wb, int shift) {
384	int nmove;
385
386	assert(wb != NULL && wb->cur >= shift);
387
388	nmove = wb->last - shift;
389	(void)memmove(&wb->ucs4[0], &wb->ucs4[shift],
390		      nmove * sizeof(wb->ucs4[0]));
391	(void)memmove(&wb->class[0], &wb->class[shift],
392		      nmove * sizeof(wb->class[0]));
393	wb->cur -= shift;
394	wb->last -= shift;
395}
396
397static void
398workbuf_removevoid(workbuf_t *wb) {
399	int i, j;
400	int last = wb->last;
401
402	for (i = j = 0; i < last; i++) {
403		if (wb->class[i] >= 0) {
404			if (j < i) {
405				wb->ucs4[j] = wb->ucs4[i];
406				wb->class[j] = wb->class[i];
407			}
408			j++;
409		}
410	}
411	wb->cur -= last - j;
412	wb->last = j;
413}
414