1226031Sstas/*
2226031Sstas * Copyright (c) 2004 Kungliga Tekniska H��gskolan
3226031Sstas * (Royal Institute of Technology, Stockholm, Sweden).
4226031Sstas * All rights reserved.
5226031Sstas *
6226031Sstas * Redistribution and use in source and binary forms, with or without
7226031Sstas * modification, are permitted provided that the following conditions
8226031Sstas * are met:
9226031Sstas *
10226031Sstas * 1. Redistributions of source code must retain the above copyright
11226031Sstas *    notice, this list of conditions and the following disclaimer.
12226031Sstas *
13226031Sstas * 2. Redistributions in binary form must reproduce the above copyright
14226031Sstas *    notice, this list of conditions and the following disclaimer in the
15226031Sstas *    documentation and/or other materials provided with the distribution.
16226031Sstas *
17226031Sstas * 3. Neither the name of the Institute nor the names of its contributors
18226031Sstas *    may be used to endorse or promote products derived from this software
19226031Sstas *    without specific prior written permission.
20226031Sstas *
21226031Sstas * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
22226031Sstas * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23226031Sstas * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24226031Sstas * ARE DISCLAIMED.  IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
25226031Sstas * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26226031Sstas * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27226031Sstas * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28226031Sstas * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29226031Sstas * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30226031Sstas * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31226031Sstas * SUCH DAMAGE.
32226031Sstas */
33226031Sstas
34226031Sstas#ifdef HAVE_CONFIG_H
35226031Sstas#include <config.h>
36226031Sstas#endif
37226031Sstas#include "windlocl.h"
38226031Sstas
39226031Sstas#include <assert.h>
40226031Sstas#include <stdlib.h>
41226031Sstas#include <errno.h>
42226031Sstas#include <stdio.h>
43226031Sstas
44226031Sstas#include "roken.h"
45226031Sstas
46226031Sstas#include "normalize_table.h"
47226031Sstas
48226031Sstasstatic int
49226031Sstastranslation_cmp(const void *key, const void *data)
50226031Sstas{
51226031Sstas    const struct translation *t1 = (const struct translation *)key;
52226031Sstas    const struct translation *t2 = (const struct translation *)data;
53226031Sstas
54226031Sstas    return t1->key - t2->key;
55226031Sstas}
56226031Sstas
57226031Sstasenum { s_base  = 0xAC00};
58226031Sstasenum { s_count = 11172};
59226031Sstasenum { l_base  = 0x1100};
60226031Sstasenum { l_count = 19};
61226031Sstasenum { v_base  = 0x1161};
62226031Sstasenum { v_count = 21};
63226031Sstasenum { t_base  = 0x11A7};
64226031Sstasenum { t_count = 28};
65226031Sstasenum { n_count = v_count * t_count};
66226031Sstas
67226031Sstasstatic int
68226031Sstashangul_decomp(const uint32_t *in, size_t in_len,
69226031Sstas	      uint32_t *out, size_t *out_len)
70226031Sstas{
71226031Sstas    uint32_t u = *in;
72226031Sstas    unsigned s_index;
73226031Sstas    unsigned l, v, t;
74226031Sstas    unsigned o;
75226031Sstas
76226031Sstas    if (u < s_base || u >= s_base + s_count)
77226031Sstas	return 0;
78226031Sstas    s_index = u - s_base;
79226031Sstas    l = l_base + s_index / n_count;
80226031Sstas    v = v_base + (s_index % n_count) / t_count;
81226031Sstas    t = t_base + s_index % t_count;
82226031Sstas    o = 2;
83226031Sstas    if (t != t_base)
84226031Sstas	++o;
85226031Sstas    if (*out_len < o)
86226031Sstas	return WIND_ERR_OVERRUN;
87226031Sstas    out[0] = l;
88226031Sstas    out[1] = v;
89226031Sstas    if (t != t_base)
90226031Sstas	out[2] = t;
91226031Sstas    *out_len = o;
92226031Sstas    return 1;
93226031Sstas}
94226031Sstas
95226031Sstasstatic uint32_t
96226031Sstashangul_composition(const uint32_t *in, size_t in_len)
97226031Sstas{
98226031Sstas    if (in_len < 2)
99226031Sstas	return 0;
100226031Sstas    if (in[0] >= l_base && in[0] < l_base + l_count) {
101226031Sstas	unsigned l_index = in[0] - l_base;
102226031Sstas	unsigned v_index;
103226031Sstas
104226031Sstas	if (in[1] < v_base || in[1] >= v_base + v_count)
105226031Sstas	    return 0;
106226031Sstas	v_index = in[1] - v_base;
107226031Sstas	return (l_index * v_count + v_index) * t_count + s_base;
108226031Sstas    } else if (in[0] >= s_base && in[0] < s_base + s_count) {
109226031Sstas	unsigned s_index = in[0] - s_base;
110226031Sstas	unsigned t_index;
111226031Sstas
112226031Sstas	if (s_index % t_count != 0)
113226031Sstas	    return 0;
114226031Sstas	if (in[1] < t_base || in[1] >= t_base + t_count)
115226031Sstas	    return 0;
116226031Sstas	t_index = in[1] - t_base;
117226031Sstas	return in[0] + t_index;
118226031Sstas    }
119226031Sstas    return 0;
120226031Sstas}
121226031Sstas
122226031Sstasstatic int
123226031Sstascompat_decomp(const uint32_t *in, size_t in_len,
124226031Sstas	      uint32_t *out, size_t *out_len)
125226031Sstas{
126226031Sstas    unsigned i;
127226031Sstas    unsigned o = 0;
128226031Sstas
129226031Sstas    for (i = 0; i < in_len; ++i) {
130226031Sstas	struct translation ts = {in[i]};
131226031Sstas	size_t sub_len = *out_len - o;
132226031Sstas	int ret;
133226031Sstas
134226031Sstas	ret = hangul_decomp(in + i, in_len - i,
135226031Sstas			    out + o, &sub_len);
136226031Sstas	if (ret) {
137226031Sstas	    if (ret == WIND_ERR_OVERRUN)
138226031Sstas		return ret;
139226031Sstas	    o += sub_len;
140226031Sstas	} else {
141226031Sstas	    void *s = bsearch(&ts,
142226031Sstas			      _wind_normalize_table,
143226031Sstas			      _wind_normalize_table_size,
144226031Sstas			      sizeof(_wind_normalize_table[0]),
145226031Sstas			      translation_cmp);
146226031Sstas	    if (s != NULL) {
147226031Sstas		const struct translation *t = (const struct translation *)s;
148226031Sstas
149226031Sstas		ret = compat_decomp(_wind_normalize_val_table + t->val_offset,
150226031Sstas				    t->val_len,
151226031Sstas				    out + o, &sub_len);
152226031Sstas		if (ret)
153226031Sstas		    return ret;
154226031Sstas		o += sub_len;
155226031Sstas	    } else {
156226031Sstas		if (o >= *out_len)
157226031Sstas		    return WIND_ERR_OVERRUN;
158226031Sstas		out[o++] = in[i];
159226031Sstas
160226031Sstas	    }
161226031Sstas	}
162226031Sstas    }
163226031Sstas    *out_len = o;
164226031Sstas    return 0;
165226031Sstas}
166226031Sstas
167226031Sstasstatic void
168226031Sstasswap_char(uint32_t * a, uint32_t * b)
169226031Sstas{
170226031Sstas    uint32_t t;
171226031Sstas    t = *a;
172226031Sstas    *a = *b;
173226031Sstas    *b = t;
174226031Sstas}
175226031Sstas
176226031Sstas/* Unicode 5.2.0 D109 Canonical Ordering for a sequence of code points
177226031Sstas * that all have Canonical_Combining_Class > 0 */
178226031Sstasstatic void
179226031Sstascanonical_reorder_sequence(uint32_t * a, size_t len)
180226031Sstas{
181226031Sstas    size_t i, j;
182226031Sstas
183226031Sstas    if (len <= 1)
184226031Sstas	return;
185226031Sstas
186226031Sstas    for (i = 1; i < len; i++) {
187226031Sstas	for (j = i;
188226031Sstas	     j > 0 &&
189226031Sstas		 _wind_combining_class(a[j]) < _wind_combining_class(a[j-1]);
190226031Sstas	     j--)
191226031Sstas	    swap_char(&a[j], &a[j-1]);
192226031Sstas    }
193226031Sstas}
194226031Sstas
195226031Sstasstatic void
196226031Sstascanonical_reorder(uint32_t *tmp, size_t tmp_len)
197226031Sstas{
198226031Sstas    size_t i;
199226031Sstas
200226031Sstas    for (i = 0; i < tmp_len; ++i) {
201226031Sstas	int cc = _wind_combining_class(tmp[i]);
202226031Sstas	if (cc) {
203226031Sstas	    size_t j;
204226031Sstas	    for (j = i + 1;
205226031Sstas		 j < tmp_len && _wind_combining_class(tmp[j]);
206226031Sstas		 ++j)
207226031Sstas		;
208226031Sstas	    canonical_reorder_sequence(&tmp[i], j - i);
209226031Sstas	    i = j;
210226031Sstas	}
211226031Sstas    }
212226031Sstas}
213226031Sstas
214226031Sstasstatic uint32_t
215226031Sstasfind_composition(const uint32_t *in, unsigned in_len)
216226031Sstas{
217226031Sstas    unsigned short canon_index = 0;
218226031Sstas    uint32_t cur;
219226031Sstas    unsigned n = 0;
220226031Sstas
221226031Sstas    cur = hangul_composition(in, in_len);
222226031Sstas    if (cur)
223226031Sstas	return cur;
224226031Sstas
225226031Sstas    do {
226226031Sstas	const struct canon_node *c = &_wind_canon_table[canon_index];
227226031Sstas	unsigned i;
228226031Sstas
229226031Sstas	if (n % 5 == 0) {
230226031Sstas	    cur = *in++;
231226031Sstas	    if (in_len-- == 0)
232226031Sstas		return c->val;
233226031Sstas	}
234226031Sstas
235226031Sstas	i = cur >> 16;
236226031Sstas	if (i < c->next_start || i >= c->next_end)
237226031Sstas	    canon_index = 0;
238226031Sstas	else
239226031Sstas	    canon_index =
240226031Sstas		_wind_canon_next_table[c->next_offset + i - c->next_start];
241226031Sstas	if (canon_index != 0) {
242226031Sstas	    cur = (cur << 4) & 0xFFFFF;
243226031Sstas	    ++n;
244226031Sstas	}
245226031Sstas    } while (canon_index != 0);
246226031Sstas    return 0;
247226031Sstas}
248226031Sstas
249226031Sstasstatic int
250226031Sstascombine(const uint32_t *in, size_t in_len,
251226031Sstas	uint32_t *out, size_t *out_len)
252226031Sstas{
253226031Sstas    unsigned i;
254226031Sstas    int ostarter;
255226031Sstas    unsigned o = 0;
256226031Sstas    int old_cc;
257226031Sstas
258226031Sstas    for (i = 0; i < in_len;) {
259226031Sstas	while (i < in_len && _wind_combining_class(in[i]) != 0) {
260226031Sstas	    out[o++] = in[i++];
261226031Sstas	}
262226031Sstas	if (i < in_len) {
263226031Sstas	    if (o >= *out_len)
264226031Sstas		return WIND_ERR_OVERRUN;
265226031Sstas	    ostarter = o;
266226031Sstas	    out[o++] = in[i++];
267226031Sstas	    old_cc   = -1;
268226031Sstas
269226031Sstas	    while (i < in_len) {
270226031Sstas		uint32_t comb;
271226031Sstas		uint32_t v[2];
272226031Sstas		int cc;
273226031Sstas
274226031Sstas		v[0] = out[ostarter];
275226031Sstas		v[1] = in[i];
276226031Sstas
277226031Sstas		cc = _wind_combining_class(in[i]);
278226031Sstas		if (old_cc != cc && (comb = find_composition(v, 2))) {
279226031Sstas		    out[ostarter] = comb;
280226031Sstas		} else if (cc == 0) {
281226031Sstas		    break;
282226031Sstas		} else {
283226031Sstas		    if (o >= *out_len)
284226031Sstas			return WIND_ERR_OVERRUN;
285226031Sstas		    out[o++] = in[i];
286226031Sstas		    old_cc   = cc;
287226031Sstas		}
288226031Sstas		++i;
289226031Sstas	    }
290226031Sstas	}
291226031Sstas    }
292226031Sstas    *out_len = o;
293226031Sstas    return 0;
294226031Sstas}
295226031Sstas
296226031Sstasint
297226031Sstas_wind_stringprep_normalize(const uint32_t *in, size_t in_len,
298226031Sstas			   uint32_t *out, size_t *out_len)
299226031Sstas{
300226031Sstas    size_t tmp_len;
301226031Sstas    uint32_t *tmp;
302226031Sstas    int ret;
303226031Sstas
304226031Sstas    if (in_len == 0) {
305226031Sstas	*out_len = 0;
306226031Sstas	return 0;
307226031Sstas    }
308226031Sstas
309226031Sstas    tmp_len = in_len * 4;
310226031Sstas    if (tmp_len < MAX_LENGTH_CANON)
311226031Sstas	tmp_len = MAX_LENGTH_CANON;
312226031Sstas    tmp = malloc(tmp_len * sizeof(uint32_t));
313226031Sstas    if (tmp == NULL)
314226031Sstas	return ENOMEM;
315226031Sstas
316226031Sstas    ret = compat_decomp(in, in_len, tmp, &tmp_len);
317226031Sstas    if (ret) {
318226031Sstas	free(tmp);
319226031Sstas	return ret;
320226031Sstas    }
321226031Sstas    canonical_reorder(tmp, tmp_len);
322226031Sstas    ret = combine(tmp, tmp_len, out, out_len);
323226031Sstas    free(tmp);
324226031Sstas    return ret;
325226031Sstas}
326