unicode.c revision 1.2.6.1
1/*	$NetBSD: unicode.c,v 1.2.6.1 2012/06/06 18:18:07 bouyer Exp $	*/
2
3#ifndef lint
4static char *rcsid = "Id: unicode.c,v 1.1 2003/06/04 00:26:16 marka Exp ";
5#endif
6
7/*
8 * Copyright (c) 2000,2001,2002 Japan Network Information Center.
9 * All rights reserved.
10 *
11 * By using this file, you agree to the terms and conditions set forth bellow.
12 *
13 * 			LICENSE TERMS AND CONDITIONS
14 *
15 * The following License Terms and Conditions apply, unless a different
16 * license is obtained from Japan Network Information Center ("JPNIC"),
17 * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
18 * Chiyoda-ku, Tokyo 101-0047, Japan.
19 *
20 * 1. Use, Modification and Redistribution (including distribution of any
21 *    modified or derived work) in source and/or binary forms is permitted
22 *    under this License Terms and Conditions.
23 *
24 * 2. Redistribution of source code must retain the copyright notices as they
25 *    appear in each source code file, this License Terms and Conditions.
26 *
27 * 3. Redistribution in binary form must reproduce the Copyright Notice,
28 *    this License Terms and Conditions, in the documentation and/or other
29 *    materials provided with the distribution.  For the purposes of binary
30 *    distribution the "Copyright Notice" refers to the following language:
31 *    "Copyright (c) 2000-2002 Japan Network Information Center.  All rights reserved."
32 *
33 * 4. The name of JPNIC may not be used to endorse or promote products
34 *    derived from this Software without specific prior written approval of
35 *    JPNIC.
36 *
37 * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
38 *    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 *    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
40 *    PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL JPNIC BE LIABLE
41 *    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
42 *    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
43 *    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
44 *    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
45 *    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
46 *    OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
47 *    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
48 */
49
50#include <config.h>
51
52#include <stddef.h>
53#include <stdlib.h>
54#include <string.h>
55
56#include <idn/result.h>
57#include <idn/logmacro.h>
58#include <idn/assert.h>
59#include <idn/unicode.h>
60
/* Version string used by idn__unicode_create() when the caller passes NULL. */
#define UNICODE_CURRENT	"3.2.0"

/* Largest code point value the routines in this file accept. */
#define UCS_MAX		0x10FFFF
/* Flag bit set on the last element of a decomposition sequence. */
#define END_BIT		0x80000000
65
/*
 * Constants for arithmetic Hangul decomposition/composition.
 *
 *   SBase/SLast  first precomposed syllable / one past the last one
 *   LBase        first leading consonant
 *   VBase        first vowel
 *   TBase        trailing consonant base; offset 0 from it stands for
 *                "no trailing consonant"
 *   xCount       number of values in each of the three ranges
 */
#define SBase		0xAC00
#define LBase		0x1100
#define VBase		0x1161
#define TBase		0x11A7
#define LCount		19
#define VCount		21
#define TCount		28
#define SLast		(SBase + LCount * VCount * TCount)
77
/*
 * Symbol composition macros.  compose_sym() forwards to compose_symX()
 * so that its arguments are macro-expanded before '##' pastes them
 * together into a single identifier.
 */
#define compose_symX(head, tail)	head ## tail
#define compose_sym(head, tail)		compose_symX(head, tail)
83
/*
 * One entry of a composition table.  The entries that share a first
 * character are looked up by their second character 'c2' and yield
 * the composed character 'comp'.
 */
struct composition {
	unsigned long c2;	/* second character of the pair */
	unsigned long comp;	/* result of composing the pair */
};
88
89#include "unicodedata_320.c"
90#define VERSION v320
91#include "unicode_template.c"
92#undef VERSION
93
94typedef int	(*unicode_canonclassproc)(unsigned long v);
95typedef int	(*unicode_decomposeproc)(unsigned long c,
96					 const unsigned long **seqp);
97typedef int	(*unicode_composeproc)(unsigned long c,
98				       const struct composition **compp);
99
100static struct idn__unicode_ops {
101	char *version;
102	unicode_canonclassproc canonclass_proc;
103	unicode_decomposeproc decompose_proc;
104	unicode_composeproc compose_proc;
105} unicode_versions[] = {
106#define MAKE_UNICODE_HANDLE(version, suffix) \
107	{ version, \
108	  compose_sym(canonclass_, suffix), \
109	  compose_sym(decompose_, suffix), \
110	  compose_sym(compose_, suffix) }
111	MAKE_UNICODE_HANDLE("3.2.0", v320),
112	{ NULL },
113#undef MAKE_UNICODE_HANDLE
114};
115
116idn_result_t
117idn__unicode_create(const char *version,
118		    idn__unicode_version_t *versionp) {
119	idn__unicode_version_t v;
120
121	assert(versionp != NULL);
122	TRACE(("idn__unicode_create(version=%-.50s)\n",
123	       version == NULL ? "<NULL>" : version));
124
125	if (version == NULL)
126		version = UNICODE_CURRENT;
127
128	for (v = unicode_versions; v->version != NULL; v++) {
129		if (strcmp(v->version, version) == 0) {
130			*versionp = v;
131			return (idn_success);
132		}
133	}
134	return (idn_notfound);
135}
136
137void
138idn__unicode_destroy(idn__unicode_version_t version) {
139	assert(version != NULL);
140	TRACE(("idn__unicode_destroy()\n"));
141	/* Nothing to do */
142}
143
144int
145idn__unicode_canonicalclass(idn__unicode_version_t version, unsigned long c) {
146	if (c > UCS_MAX)
147		return (0);
148
149	return (*version->canonclass_proc)(c);
150}
151
152idn_result_t
153idn__unicode_decompose(idn__unicode_version_t version,
154		       int compat, unsigned long *v, size_t vlen,
155		       unsigned long c, int *decomp_lenp) {
156	unsigned long *vorg = v;
157	int seqidx;
158	const unsigned long *seq;
159
160	assert(v != NULL && vlen >= 0 && decomp_lenp != NULL);
161
162	if (c > UCS_MAX)
163		return (idn_notfound);
164
165	/*
166	 * First, check for Hangul.
167	 */
168	if (SBase <= c && c < SLast) {
169		int idx, t_offset, v_offset, l_offset;
170
171		idx = c - SBase;
172		t_offset = idx % TCount;
173		idx /= TCount;
174		v_offset = idx % VCount;
175		l_offset = idx / VCount;
176		if ((t_offset == 0 && vlen < 2) || (t_offset > 0 && vlen < 3))
177			return (idn_buffer_overflow);
178		*v++ = LBase + l_offset;
179		*v++ = VBase + v_offset;
180		if (t_offset > 0)
181			*v++ = TBase + t_offset;
182		*decomp_lenp = v - vorg;
183		return (idn_success);
184	}
185
186	/*
187	 * Look up decomposition table.  If no decomposition is defined
188	 * or if it is a compatibility decomosition when canonical
189	 * decomposition requested, return 'idn_notfound'.
190	 */
191	seqidx = (*version->decompose_proc)(c, &seq);
192	if (seqidx == 0 || (compat == 0 && (seqidx & DECOMP_COMPAT) != 0))
193		return (idn_notfound);
194
195	/*
196	 * Copy the decomposed sequence.  The end of the sequence are
197	 * marked with END_BIT.
198	 */
199	do {
200		unsigned long c;
201		int dlen;
202		idn_result_t r;
203
204		c = *seq & ~END_BIT;
205
206		/* Decompose recursively. */
207		r = idn__unicode_decompose(version, compat, v, vlen, c, &dlen);
208		if (r == idn_success) {
209			v += dlen;
210			vlen -= dlen;
211		} else if (r == idn_notfound) {
212			if (vlen < 1)
213				return (idn_buffer_overflow);
214			*v++ = c;
215			vlen--;
216		} else {
217			return (r);
218		}
219
220	} while ((*seq++ & END_BIT) == 0);
221
222	*decomp_lenp = v - vorg;
223
224	return (idn_success);
225}
226
227int
228idn__unicode_iscompositecandidate(idn__unicode_version_t version,
229				  unsigned long c) {
230	const struct composition *dummy;
231
232	if (c > UCS_MAX)
233		return (0);
234
235	/* Check for Hangul */
236	if ((LBase <= c && c < LBase + LCount) || (SBase <= c && c < SLast))
237		return (1);
238
239	/*
240	 * Look up composition table.  If there are no composition
241	 * that begins with the given character, it is not a
242	 * composition candidate.
243	 */
244	if ((*version->compose_proc)(c, &dummy) == 0)
245		return (0);
246	else
247		return (1);
248}
249
250idn_result_t
251idn__unicode_compose(idn__unicode_version_t version, unsigned long c1,
252		     unsigned long c2, unsigned long *compp) {
253	int n;
254	int lo, hi;
255	const struct composition *cseq;
256
257	assert(compp != NULL);
258
259	if (c1 > UCS_MAX || c2 > UCS_MAX)
260		return (idn_notfound);
261
262	/*
263	 * Check for Hangul.
264	 */
265	if (LBase <= c1 && c1 < LBase + LCount &&
266	    VBase <= c2 && c2 < VBase + VCount) {
267		/*
268		 * Hangul L and V.
269		 */
270		*compp = SBase +
271			((c1 - LBase) * VCount + (c2 - VBase)) * TCount;
272		return (idn_success);
273	} else if (SBase <= c1 && c1 < SLast &&
274		   TBase <= c2 && c2 < TBase + TCount &&
275		   (c1 - SBase) % TCount == 0) {
276		/*
277		 * Hangul LV and T.
278		 */
279		*compp = c1 + (c2 - TBase);
280		return (idn_success);
281	}
282
283	/*
284	 * Look up composition table.  If the result is 0, no composition
285	 * is defined.  Otherwise, upper 16bits of the result contains
286	 * the number of composition that begins with 'c1', and the lower
287	 * 16bits is the offset in 'compose_seq'.
288	 */
289	if ((n = (*version->compose_proc)(c1, &cseq)) == 0)
290		return (idn_notfound);
291
292	/*
293	 * The composite sequences are sorted by the 2nd character 'c2'.
294	 * So we can use binary search.
295	 */
296	lo = 0;
297	hi = n - 1;
298	while (lo <= hi) {
299		int mid = (lo + hi) / 2;
300
301		if (cseq[mid].c2 < c2) {
302			lo = mid + 1;
303		} else if (cseq[mid].c2 > c2) {
304			hi = mid - 1;
305		} else {
306			*compp = cseq[mid].comp;
307			return (idn_success);
308		}
309	}
310	return (idn_notfound);
311}
312