1226031Sstas/* 2226031Sstas * Copyright (c) 2004 Kungliga Tekniska H��gskolan 3226031Sstas * (Royal Institute of Technology, Stockholm, Sweden). 4226031Sstas * All rights reserved. 5226031Sstas * 6226031Sstas * Redistribution and use in source and binary forms, with or without 7226031Sstas * modification, are permitted provided that the following conditions 8226031Sstas * are met: 9226031Sstas * 10226031Sstas * 1. Redistributions of source code must retain the above copyright 11226031Sstas * notice, this list of conditions and the following disclaimer. 12226031Sstas * 13226031Sstas * 2. Redistributions in binary form must reproduce the above copyright 14226031Sstas * notice, this list of conditions and the following disclaimer in the 15226031Sstas * documentation and/or other materials provided with the distribution. 16226031Sstas * 17226031Sstas * 3. Neither the name of the Institute nor the names of its contributors 18226031Sstas * may be used to endorse or promote products derived from this software 19226031Sstas * without specific prior written permission. 20226031Sstas * 21226031Sstas * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND 22226031Sstas * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23226031Sstas * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24226031Sstas * ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE 25226031Sstas * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26226031Sstas * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27226031Sstas * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28226031Sstas * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29226031Sstas * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30226031Sstas * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31226031Sstas * SUCH DAMAGE. 32226031Sstas */ 33226031Sstas 34226031Sstas#ifdef HAVE_CONFIG_H 35226031Sstas#include <config.h> 36226031Sstas#endif 37226031Sstas#include "windlocl.h" 38226031Sstas 39226031Sstas#include <assert.h> 40226031Sstas#include <stdlib.h> 41226031Sstas#include <errno.h> 42226031Sstas#include <stdio.h> 43226031Sstas 44226031Sstas#include "roken.h" 45226031Sstas 46226031Sstas#include "normalize_table.h" 47226031Sstas 48226031Sstasstatic int 49226031Sstastranslation_cmp(const void *key, const void *data) 50226031Sstas{ 51226031Sstas const struct translation *t1 = (const struct translation *)key; 52226031Sstas const struct translation *t2 = (const struct translation *)data; 53226031Sstas 54226031Sstas return t1->key - t2->key; 55226031Sstas} 56226031Sstas 57226031Sstasenum { s_base = 0xAC00}; 58226031Sstasenum { s_count = 11172}; 59226031Sstasenum { l_base = 0x1100}; 60226031Sstasenum { l_count = 19}; 61226031Sstasenum { v_base = 0x1161}; 62226031Sstasenum { v_count = 21}; 63226031Sstasenum { t_base = 0x11A7}; 64226031Sstasenum { t_count = 28}; 65226031Sstasenum { n_count = v_count * t_count}; 66226031Sstas 67226031Sstasstatic int 68226031Sstashangul_decomp(const uint32_t *in, size_t in_len, 69226031Sstas uint32_t *out, size_t *out_len) 70226031Sstas{ 71226031Sstas uint32_t u = *in; 72226031Sstas unsigned s_index; 73226031Sstas unsigned l, v, t; 74226031Sstas unsigned o; 75226031Sstas 76226031Sstas if (u < s_base || u >= s_base + s_count) 77226031Sstas return 0; 78226031Sstas s_index = u - s_base; 79226031Sstas l = l_base + s_index / n_count; 80226031Sstas v = v_base + (s_index % n_count) / t_count; 81226031Sstas t = t_base + s_index % t_count; 82226031Sstas o = 2; 83226031Sstas if (t != t_base) 84226031Sstas ++o; 85226031Sstas if (*out_len < o) 86226031Sstas return WIND_ERR_OVERRUN; 87226031Sstas out[0] = l; 88226031Sstas out[1] = v; 89226031Sstas if (t != t_base) 90226031Sstas out[2] = t; 91226031Sstas *out_len = o; 92226031Sstas return 1; 93226031Sstas} 94226031Sstas 95226031Sstasstatic uint32_t 96226031Sstashangul_composition(const uint32_t *in, size_t in_len) 97226031Sstas{ 98226031Sstas if (in_len < 2) 99226031Sstas return 0; 100226031Sstas if (in[0] >= l_base && in[0] < l_base + l_count) { 101226031Sstas unsigned l_index = in[0] - l_base; 102226031Sstas unsigned v_index; 103226031Sstas 104226031Sstas if (in[1] < v_base || in[1] >= v_base + v_count) 105226031Sstas return 0; 106226031Sstas v_index = in[1] - v_base; 107226031Sstas return (l_index * v_count + v_index) * t_count + s_base; 108226031Sstas } else if (in[0] >= s_base && in[0] < s_base + s_count) { 109226031Sstas unsigned s_index = in[0] - s_base; 110226031Sstas unsigned t_index; 111226031Sstas 112226031Sstas if (s_index % t_count != 0) 113226031Sstas return 0; 114226031Sstas if (in[1] < t_base || in[1] >= t_base + t_count) 115226031Sstas return 0; 116226031Sstas t_index = in[1] - t_base; 117226031Sstas return in[0] + t_index; 118226031Sstas } 119226031Sstas return 0; 120226031Sstas} 121226031Sstas 122226031Sstasstatic int 123226031Sstascompat_decomp(const uint32_t *in, size_t in_len, 124226031Sstas uint32_t *out, size_t *out_len) 125226031Sstas{ 126226031Sstas unsigned i; 127226031Sstas unsigned o = 0; 128226031Sstas 129226031Sstas for (i = 0; i < in_len; ++i) { 130226031Sstas struct translation ts = {in[i]}; 131226031Sstas size_t sub_len = *out_len - o; 132226031Sstas int ret; 133226031Sstas 134226031Sstas ret = hangul_decomp(in + i, in_len - i, 135226031Sstas out + o, &sub_len); 136226031Sstas if (ret) { 137226031Sstas if (ret == WIND_ERR_OVERRUN) 138226031Sstas return ret; 139226031Sstas o += sub_len; 140226031Sstas } else { 141226031Sstas void *s = bsearch(&ts, 142226031Sstas _wind_normalize_table, 143226031Sstas _wind_normalize_table_size, 144226031Sstas sizeof(_wind_normalize_table[0]), 145226031Sstas translation_cmp); 146226031Sstas if (s != NULL) { 147226031Sstas const struct translation *t = (const struct translation *)s; 148226031Sstas 149226031Sstas ret = compat_decomp(_wind_normalize_val_table + t->val_offset, 150226031Sstas t->val_len, 151226031Sstas out + o, &sub_len); 152226031Sstas if (ret) 153226031Sstas return ret; 154226031Sstas o += sub_len; 155226031Sstas } else { 156226031Sstas if (o >= *out_len) 157226031Sstas return WIND_ERR_OVERRUN; 158226031Sstas out[o++] = in[i]; 159226031Sstas 160226031Sstas } 161226031Sstas } 162226031Sstas } 163226031Sstas *out_len = o; 164226031Sstas return 0; 165226031Sstas} 166226031Sstas 167226031Sstasstatic void 168226031Sstasswap_char(uint32_t * a, uint32_t * b) 169226031Sstas{ 170226031Sstas uint32_t t; 171226031Sstas t = *a; 172226031Sstas *a = *b; 173226031Sstas *b = t; 174226031Sstas} 175226031Sstas 176226031Sstas/* Unicode 5.2.0 D109 Canonical Ordering for a sequence of code points 177226031Sstas * that all have Canonical_Combining_Class > 0 */ 178226031Sstasstatic void 179226031Sstascanonical_reorder_sequence(uint32_t * a, size_t len) 180226031Sstas{ 181226031Sstas size_t i, j; 182226031Sstas 183226031Sstas if (len <= 1) 184226031Sstas return; 185226031Sstas 186226031Sstas for (i = 1; i < len; i++) { 187226031Sstas for (j = i; 188226031Sstas j > 0 && 189226031Sstas _wind_combining_class(a[j]) < _wind_combining_class(a[j-1]); 190226031Sstas j--) 191226031Sstas swap_char(&a[j], &a[j-1]); 192226031Sstas } 193226031Sstas} 194226031Sstas 195226031Sstasstatic void 196226031Sstascanonical_reorder(uint32_t *tmp, size_t tmp_len) 197226031Sstas{ 198226031Sstas size_t i; 199226031Sstas 200226031Sstas for (i = 0; i < tmp_len; ++i) { 201226031Sstas int cc = _wind_combining_class(tmp[i]); 202226031Sstas if (cc) { 203226031Sstas size_t j; 204226031Sstas for (j = i + 1; 205226031Sstas j < tmp_len && _wind_combining_class(tmp[j]); 206226031Sstas ++j) 207226031Sstas ; 208226031Sstas canonical_reorder_sequence(&tmp[i], j - i); 209226031Sstas i = j; 210226031Sstas } 211226031Sstas } 212226031Sstas} 213226031Sstas 214226031Sstasstatic uint32_t 215226031Sstasfind_composition(const uint32_t *in, unsigned in_len) 216226031Sstas{ 217226031Sstas unsigned short canon_index = 0; 218226031Sstas uint32_t cur; 219226031Sstas unsigned n = 0; 220226031Sstas 221226031Sstas cur = hangul_composition(in, in_len); 222226031Sstas if (cur) 223226031Sstas return cur; 224226031Sstas 225226031Sstas do { 226226031Sstas const struct canon_node *c = &_wind_canon_table[canon_index]; 227226031Sstas unsigned i; 228226031Sstas 229226031Sstas if (n % 5 == 0) { 230226031Sstas cur = *in++; 231226031Sstas if (in_len-- == 0) 232226031Sstas return c->val; 233226031Sstas } 234226031Sstas 235226031Sstas i = cur >> 16; 236226031Sstas if (i < c->next_start || i >= c->next_end) 237226031Sstas canon_index = 0; 238226031Sstas else 239226031Sstas canon_index = 240226031Sstas _wind_canon_next_table[c->next_offset + i - c->next_start]; 241226031Sstas if (canon_index != 0) { 242226031Sstas cur = (cur << 4) & 0xFFFFF; 243226031Sstas ++n; 244226031Sstas } 245226031Sstas } while (canon_index != 0); 246226031Sstas return 0; 247226031Sstas} 248226031Sstas 249226031Sstasstatic int 250226031Sstascombine(const uint32_t *in, size_t in_len, 251226031Sstas uint32_t *out, size_t *out_len) 252226031Sstas{ 253226031Sstas unsigned i; 254226031Sstas int ostarter; 255226031Sstas unsigned o = 0; 256226031Sstas int old_cc; 257226031Sstas 258226031Sstas for (i = 0; i < in_len;) { 259226031Sstas while (i < in_len && _wind_combining_class(in[i]) != 0) { 260226031Sstas out[o++] = in[i++]; 261226031Sstas } 262226031Sstas if (i < in_len) { 263226031Sstas if (o >= *out_len) 264226031Sstas return WIND_ERR_OVERRUN; 265226031Sstas ostarter = o; 266226031Sstas out[o++] = in[i++]; 267226031Sstas old_cc = -1; 268226031Sstas 269226031Sstas while (i < in_len) { 270226031Sstas uint32_t comb; 271226031Sstas uint32_t v[2]; 272226031Sstas int cc; 273226031Sstas 274226031Sstas v[0] = out[ostarter]; 275226031Sstas v[1] = in[i]; 276226031Sstas 277226031Sstas cc = _wind_combining_class(in[i]); 278226031Sstas if (old_cc != cc && (comb = find_composition(v, 2))) { 279226031Sstas out[ostarter] = comb; 280226031Sstas } else if (cc == 0) { 281226031Sstas break; 282226031Sstas } else { 283226031Sstas if (o >= *out_len) 284226031Sstas return WIND_ERR_OVERRUN; 285226031Sstas out[o++] = in[i]; 286226031Sstas old_cc = cc; 287226031Sstas } 288226031Sstas ++i; 289226031Sstas } 290226031Sstas } 291226031Sstas } 292226031Sstas *out_len = o; 293226031Sstas return 0; 294226031Sstas} 295226031Sstas 296226031Sstasint 297226031Sstas_wind_stringprep_normalize(const uint32_t *in, size_t in_len, 298226031Sstas uint32_t *out, size_t *out_len) 299226031Sstas{ 300226031Sstas size_t tmp_len; 301226031Sstas uint32_t *tmp; 302226031Sstas int ret; 303226031Sstas 304226031Sstas if (in_len == 0) { 305226031Sstas *out_len = 0; 306226031Sstas return 0; 307226031Sstas } 308226031Sstas 309226031Sstas tmp_len = in_len * 4; 310226031Sstas if (tmp_len < MAX_LENGTH_CANON) 311226031Sstas tmp_len = MAX_LENGTH_CANON; 312226031Sstas tmp = malloc(tmp_len * sizeof(uint32_t)); 313226031Sstas if (tmp == NULL) 314226031Sstas return ENOMEM; 315226031Sstas 316226031Sstas ret = compat_decomp(in, in_len, tmp, &tmp_len); 317226031Sstas if (ret) { 318226031Sstas free(tmp); 319226031Sstas return ret; 320226031Sstas } 321226031Sstas canonical_reorder(tmp, tmp_len); 322226031Sstas ret = combine(tmp, tmp_len, out, out_len); 323226031Sstas free(tmp); 324226031Sstas return ret; 325226031Sstas} 326