1/*	$NetBSD$	*/
2
3/*
4 * Copyright (c) 2004 Kungliga Tekniska H��gskolan
5 * (Royal Institute of Technology, Stockholm, Sweden).
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 *
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * 3. Neither the name of the Institute nor the names of its contributors
20 *    may be used to endorse or promote products derived from this software
21 *    without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED.  IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 */
35
36#ifdef HAVE_CONFIG_H
37#include <config.h>
38#endif
39#include "windlocl.h"
40
41#include <assert.h>
42#include <stdlib.h>
43#include <errno.h>
44#include <stdio.h>
45
46#include <krb5/roken.h>
47
48#include "normalize_table.h"
49
50static int
51translation_cmp(const void *key, const void *data)
52{
53    const struct translation *t1 = (const struct translation *)key;
54    const struct translation *t2 = (const struct translation *)data;
55
56    return t1->key - t2->key;
57}
58
59enum { s_base  = 0xAC00};
60enum { s_count = 11172};
61enum { l_base  = 0x1100};
62enum { l_count = 19};
63enum { v_base  = 0x1161};
64enum { v_count = 21};
65enum { t_base  = 0x11A7};
66enum { t_count = 28};
67enum { n_count = v_count * t_count};
68
69static int
70hangul_decomp(const uint32_t *in, size_t in_len,
71	      uint32_t *out, size_t *out_len)
72{
73    uint32_t u = *in;
74    unsigned s_index;
75    unsigned l, v, t;
76    unsigned o;
77
78    if (u < s_base || u >= s_base + s_count)
79	return 0;
80    s_index = u - s_base;
81    l = l_base + s_index / n_count;
82    v = v_base + (s_index % n_count) / t_count;
83    t = t_base + s_index % t_count;
84    o = 2;
85    if (t != t_base)
86	++o;
87    if (*out_len < o)
88	return WIND_ERR_OVERRUN;
89    out[0] = l;
90    out[1] = v;
91    if (t != t_base)
92	out[2] = t;
93    *out_len = o;
94    return 1;
95}
96
97static uint32_t
98hangul_composition(const uint32_t *in, size_t in_len)
99{
100    if (in_len < 2)
101	return 0;
102    if (in[0] >= l_base && in[0] < l_base + l_count) {
103	unsigned l_index = in[0] - l_base;
104	unsigned v_index;
105
106	if (in[1] < v_base || in[1] >= v_base + v_count)
107	    return 0;
108	v_index = in[1] - v_base;
109	return (l_index * v_count + v_index) * t_count + s_base;
110    } else if (in[0] >= s_base && in[0] < s_base + s_count) {
111	unsigned s_index = in[0] - s_base;
112	unsigned t_index;
113
114	if (s_index % t_count != 0)
115	    return 0;
116	if (in[1] < t_base || in[1] >= t_base + t_count)
117	    return 0;
118	t_index = in[1] - t_base;
119	return in[0] + t_index;
120    }
121    return 0;
122}
123
124static int
125compat_decomp(const uint32_t *in, size_t in_len,
126	      uint32_t *out, size_t *out_len)
127{
128    unsigned i;
129    unsigned o = 0;
130
131    for (i = 0; i < in_len; ++i) {
132	struct translation ts = {in[i]};
133	size_t sub_len = *out_len - o;
134	int ret;
135
136	ret = hangul_decomp(in + i, in_len - i,
137			    out + o, &sub_len);
138	if (ret) {
139	    if (ret == WIND_ERR_OVERRUN)
140		return ret;
141	    o += sub_len;
142	} else {
143	    void *s = bsearch(&ts,
144			      _wind_normalize_table,
145			      _wind_normalize_table_size,
146			      sizeof(_wind_normalize_table[0]),
147			      translation_cmp);
148	    if (s != NULL) {
149		const struct translation *t = (const struct translation *)s;
150
151		ret = compat_decomp(_wind_normalize_val_table + t->val_offset,
152				    t->val_len,
153				    out + o, &sub_len);
154		if (ret)
155		    return ret;
156		o += sub_len;
157	    } else {
158		if (o >= *out_len)
159		    return WIND_ERR_OVERRUN;
160		out[o++] = in[i];
161
162	    }
163	}
164    }
165    *out_len = o;
166    return 0;
167}
168
169static void
170swap_char(uint32_t * a, uint32_t * b)
171{
172    uint32_t t;
173    t = *a;
174    *a = *b;
175    *b = t;
176}
177
178/* Unicode 5.2.0 D109 Canonical Ordering for a sequence of code points
179 * that all have Canonical_Combining_Class > 0 */
180static void
181canonical_reorder_sequence(uint32_t * a, size_t len)
182{
183    size_t i, j;
184
185    if (len <= 1)
186	return;
187
188    for (i = 1; i < len; i++) {
189	for (j = i;
190	     j > 0 &&
191		 _wind_combining_class(a[j]) < _wind_combining_class(a[j-1]);
192	     j--)
193	    swap_char(&a[j], &a[j-1]);
194    }
195}
196
197static void
198canonical_reorder(uint32_t *tmp, size_t tmp_len)
199{
200    size_t i;
201
202    for (i = 0; i < tmp_len; ++i) {
203	int cc = _wind_combining_class(tmp[i]);
204	if (cc) {
205	    size_t j;
206	    for (j = i + 1;
207		 j < tmp_len && _wind_combining_class(tmp[j]);
208		 ++j)
209		;
210	    canonical_reorder_sequence(&tmp[i], j - i);
211	    i = j;
212	}
213    }
214}
215
216static uint32_t
217find_composition(const uint32_t *in, unsigned in_len)
218{
219    unsigned short canon_index = 0;
220    uint32_t cur;
221    unsigned n = 0;
222
223    cur = hangul_composition(in, in_len);
224    if (cur)
225	return cur;
226
227    do {
228	const struct canon_node *c = &_wind_canon_table[canon_index];
229	unsigned i;
230
231	if (n % 5 == 0) {
232	    cur = *in++;
233	    if (in_len-- == 0)
234		return c->val;
235	}
236
237	i = cur >> 16;
238	if (i < c->next_start || i >= c->next_end)
239	    canon_index = 0;
240	else
241	    canon_index =
242		_wind_canon_next_table[c->next_offset + i - c->next_start];
243	if (canon_index != 0) {
244	    cur = (cur << 4) & 0xFFFFF;
245	    ++n;
246	}
247    } while (canon_index != 0);
248    return 0;
249}
250
251static int
252combine(const uint32_t *in, size_t in_len,
253	uint32_t *out, size_t *out_len)
254{
255    unsigned i;
256    int ostarter;
257    unsigned o = 0;
258    int old_cc;
259
260    for (i = 0; i < in_len;) {
261	while (i < in_len && _wind_combining_class(in[i]) != 0) {
262	    out[o++] = in[i++];
263	}
264	if (i < in_len) {
265	    if (o >= *out_len)
266		return WIND_ERR_OVERRUN;
267	    ostarter = o;
268	    out[o++] = in[i++];
269	    old_cc   = -1;
270
271	    while (i < in_len) {
272		uint32_t comb;
273		uint32_t v[2];
274		int cc;
275
276		v[0] = out[ostarter];
277		v[1] = in[i];
278
279		cc = _wind_combining_class(in[i]);
280		if (old_cc != cc && (comb = find_composition(v, 2))) {
281		    out[ostarter] = comb;
282		} else if (cc == 0) {
283		    break;
284		} else {
285		    if (o >= *out_len)
286			return WIND_ERR_OVERRUN;
287		    out[o++] = in[i];
288		    old_cc   = cc;
289		}
290		++i;
291	    }
292	}
293    }
294    *out_len = o;
295    return 0;
296}
297
298int
299_wind_stringprep_normalize(const uint32_t *in, size_t in_len,
300			   uint32_t *out, size_t *out_len)
301{
302    size_t tmp_len;
303    uint32_t *tmp;
304    int ret;
305
306    if (in_len == 0) {
307	*out_len = 0;
308	return 0;
309    }
310
311    tmp_len = in_len * 4;
312    if (tmp_len < MAX_LENGTH_CANON)
313	tmp_len = MAX_LENGTH_CANON;
314    tmp = malloc(tmp_len * sizeof(uint32_t));
315    if (tmp == NULL)
316	return ENOMEM;
317
318    ret = compat_decomp(in, in_len, tmp, &tmp_len);
319    if (ret) {
320	free(tmp);
321	return ret;
322    }
323    canonical_reorder(tmp, tmp_len);
324    ret = combine(tmp, tmp_len, out, out_len);
325    free(tmp);
326    return ret;
327}
328