1#ifndef lint 2static char *rcsid = "$Id: unormalize.c,v 1.1 2003/06/04 00:26:43 marka Exp $"; 3#endif 4 5/* 6 * Copyright (c) 2000,2001,2002 Japan Network Information Center. 7 * All rights reserved. 8 * 9 * By using this file, you agree to the terms and conditions set forth bellow. 10 * 11 * LICENSE TERMS AND CONDITIONS 12 * 13 * The following License Terms and Conditions apply, unless a different 14 * license is obtained from Japan Network Information Center ("JPNIC"), 15 * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda, 16 * Chiyoda-ku, Tokyo 101-0047, Japan. 17 * 18 * 1. Use, Modification and Redistribution (including distribution of any 19 * modified or derived work) in source and/or binary forms is permitted 20 * under this License Terms and Conditions. 21 * 22 * 2. Redistribution of source code must retain the copyright notices as they 23 * appear in each source code file, this License Terms and Conditions. 24 * 25 * 3. Redistribution in binary form must reproduce the Copyright Notice, 26 * this License Terms and Conditions, in the documentation and/or other 27 * materials provided with the distribution. For the purposes of binary 28 * distribution the "Copyright Notice" refers to the following language: 29 * "Copyright (c) 2000-2002 Japan Network Information Center. All rights reserved." 30 * 31 * 4. The name of JPNIC may not be used to endorse or promote products 32 * derived from this Software without specific prior written approval of 33 * JPNIC. 34 * 35 * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC 36 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 37 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 38 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JPNIC BE LIABLE 39 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 40 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 41 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 42 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 43 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 44 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 45 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 46 */ 47 48#include <config.h> 49 50#include <stddef.h> 51#include <stdlib.h> 52#include <string.h> 53 54#include <idn/result.h> 55#include <idn/assert.h> 56#include <idn/logmacro.h> 57#include <idn/ucs4.h> 58#include <idn/unicode.h> 59#include <idn/unormalize.h> 60#include <idn/debug.h> 61 62#if !defined(HAVE_MEMMOVE) && defined(HAVE_BCOPY) 63#define memmove(a,b,c) bcopy((char *)(b),(char *)(a),(int)(c)) 64#endif 65 66#define WORKBUF_SIZE 128 67#define WORKBUF_SIZE_MAX 10000 68 69typedef struct { 70 idn__unicode_version_t version; /* Unicode version */ 71 int cur; /* pointing now processing character */ 72 int last; /* pointing just after the last character */ 73 int size; /* size of UCS and CLASS array */ 74 unsigned long *ucs4; /* UCS-4 characters */ 75 int *class; /* and their canonical classes */ 76 unsigned long ucs4_buf[WORKBUF_SIZE]; /* local buffer */ 77 int class_buf[WORKBUF_SIZE]; /* ditto */ 78} workbuf_t; 79 80static idn_result_t normalize(idn__unicode_version_t version, 81 int do_composition, int compat, 82 const unsigned long *from, 83 unsigned long *to, size_t tolen); 84static idn_result_t decompose(workbuf_t *wb, unsigned long c, int compat); 85static void get_class(workbuf_t *wb); 86static void reorder(workbuf_t *wb); 87static void compose(workbuf_t *wb); 88static idn_result_t flush_before_cur(workbuf_t *wb, 89 unsigned long **top, size_t *tolenp); 90static void workbuf_init(workbuf_t *wb); 91static void workbuf_free(workbuf_t *wb); 92static idn_result_t workbuf_extend(workbuf_t *wb); 93static idn_result_t workbuf_append(workbuf_t *wb, unsigned long c); 94static void workbuf_shift(workbuf_t *wb, int shift); 95static void workbuf_removevoid(workbuf_t *wb); 96 97idn_result_t 98idn__unormalize_formkc(idn__unicode_version_t version, 99 const unsigned long *from, unsigned long *to, 100 size_t tolen) { 101 assert(version != NULL && from != NULL && to != NULL && tolen >= 0); 102 TRACE(("idn__unormalize_formkc(from=\"%s\", tolen=%d)\n", 103 idn__debug_ucs4xstring(from, 50), tolen)); 104 return (normalize(version, 1, 1, from, to, tolen)); 105} 106 107static idn_result_t 108normalize(idn__unicode_version_t version, int do_composition, int compat, 109 const unsigned long *from, unsigned long *to, size_t tolen) { 110 workbuf_t wb; 111 idn_result_t r = idn_success; 112 113 /* 114 * Initialize working buffer. 115 */ 116 workbuf_init(&wb); 117 wb.version = version; 118 119 while (*from != '\0') { 120 unsigned long c; 121 122 assert(wb.cur == wb.last); 123 124 /* 125 * Get one character from 'from'. 126 */ 127 c = *from++; 128 129 /* 130 * Decompose it. 131 */ 132 if ((r = decompose(&wb, c, compat)) != idn_success) 133 goto ret; 134 135 /* 136 * Get canonical class. 137 */ 138 get_class(&wb); 139 140 /* 141 * Reorder & compose. 142 */ 143 for (; wb.cur < wb.last; wb.cur++) { 144 if (wb.cur == 0) { 145 continue; 146 } else if (wb.class[wb.cur] > 0) { 147 /* 148 * This is not a starter. Try reordering. 149 * Note that characters up to it are 150 * already in canonical order. 151 */ 152 reorder(&wb); 153 continue; 154 } 155 156 /* 157 * This is a starter character, and there are 158 * some characters before it. Those characters 159 * have been reordered properly, and 160 * ready for composition. 161 */ 162 if (do_composition && wb.class[0] == 0) 163 compose(&wb); 164 165 /* 166 * If CUR points to a starter character, 167 * then process of characters before CUR are 168 * already finished, because any further 169 * reordering/composition for them are blocked 170 * by the starter CUR points. 171 */ 172 if (wb.cur > 0 && wb.class[wb.cur] == 0) { 173 /* Flush everything before CUR. */ 174 r = flush_before_cur(&wb, &to, &tolen); 175 if (r != idn_success) 176 goto ret; 177 } 178 } 179 } 180 181 if (r == idn_success) { 182 if (do_composition && wb.cur > 0 && wb.class[0] == 0) { 183 /* 184 * There is some characters left in WB. 185 * They are ordered, but not composed yet. 186 * Now CUR points just after the last character in WB, 187 * and since compose() tries to compose characters 188 * between top and CUR inclusive, we must make CUR 189 * one character back during compose(). 190 */ 191 wb.cur--; 192 compose(&wb); 193 wb.cur++; 194 } 195 /* 196 * Call this even when WB.CUR == 0, to make TO 197 * NUL-terminated. 198 */ 199 r = flush_before_cur(&wb, &to, &tolen); 200 if (r != idn_success) 201 goto ret; 202 } 203 204 if (tolen <= 0) { 205 r = idn_buffer_overflow; 206 goto ret; 207 } 208 *to = '\0'; 209 210ret: 211 workbuf_free(&wb); 212 return (r); 213} 214 215static idn_result_t 216decompose(workbuf_t *wb, unsigned long c, int compat) { 217 idn_result_t r; 218 int dec_len; 219 220again: 221 r = idn__unicode_decompose(wb->version, compat, wb->ucs4 + wb->last, 222 wb->size - wb->last, c, &dec_len); 223 switch (r) { 224 case idn_success: 225 wb->last += dec_len; 226 return (idn_success); 227 case idn_notfound: 228 return (workbuf_append(wb, c)); 229 case idn_buffer_overflow: 230 if ((r = workbuf_extend(wb)) != idn_success) 231 return (r); 232 if (wb->size > WORKBUF_SIZE_MAX) { 233 WARNING(("idn__unormalize_form*: " 234 "working buffer too large\n")); 235 return (idn_nomemory); 236 } 237 goto again; 238 default: 239 return (r); 240 } 241 /* NOTREACHED */ 242} 243 244static void 245get_class(workbuf_t *wb) { 246 int i; 247 248 for (i = wb->cur; i < wb->last; i++) 249 wb->class[i] = idn__unicode_canonicalclass(wb->version, 250 wb->ucs4[i]); 251} 252 253static void 254reorder(workbuf_t *wb) { 255 unsigned long c; 256 int i; 257 int class; 258 259 assert(wb != NULL); 260 261 i = wb->cur; 262 c = wb->ucs4[i]; 263 class = wb->class[i]; 264 265 while (i > 0 && wb->class[i - 1] > class) { 266 wb->ucs4[i] = wb->ucs4[i - 1]; 267 wb->class[i] =wb->class[i - 1]; 268 i--; 269 wb->ucs4[i] = c; 270 wb->class[i] = class; 271 } 272} 273 274static void 275compose(workbuf_t *wb) { 276 int cur; 277 unsigned long *ucs4; 278 int *class; 279 int last_class; 280 int nvoids; 281 int i; 282 idn__unicode_version_t ver; 283 284 assert(wb != NULL && wb->class[0] == 0); 285 286 cur = wb->cur; 287 ucs4 = wb->ucs4; 288 class = wb->class; 289 ver = wb->version; 290 291 /* 292 * If there are no decomposition sequence that begins with 293 * the top character, composition is impossible. 294 */ 295 if (!idn__unicode_iscompositecandidate(ver, ucs4[0])) 296 return; 297 298 last_class = 0; 299 nvoids = 0; 300 for (i = 1; i <= cur; i++) { 301 unsigned long c; 302 int cl = class[i]; 303 304 if ((last_class < cl || cl == 0) && 305 idn__unicode_compose(ver, ucs4[0], ucs4[i], 306 &c) == idn_success) { 307 /* 308 * Replace the top character with the composed one. 309 */ 310 ucs4[0] = c; 311 class[0] = idn__unicode_canonicalclass(ver, c); 312 313 class[i] = -1; /* void this character */ 314 nvoids++; 315 } else { 316 last_class = cl; 317 } 318 } 319 320 /* Purge void characters, if any. */ 321 if (nvoids > 0) 322 workbuf_removevoid(wb); 323} 324 325static idn_result_t 326flush_before_cur(workbuf_t *wb, unsigned long **top, size_t *tolenp) { 327 if (*tolenp < wb->cur) 328 return (idn_buffer_overflow); 329 330 memcpy(*top, wb->ucs4, sizeof(**top) * wb->cur); 331 *top += wb->cur; 332 *tolenp -= wb->cur; 333 workbuf_shift(wb, wb->cur); 334 335 return (idn_success); 336} 337 338static void 339workbuf_init(workbuf_t *wb) { 340 wb->cur = 0; 341 wb->last = 0; 342 wb->size = WORKBUF_SIZE; 343 wb->ucs4 = wb->ucs4_buf; 344 wb->class = wb->class_buf; 345} 346 347static void 348workbuf_free(workbuf_t *wb) { 349 if (wb->ucs4 != wb->ucs4_buf) { 350 free(wb->ucs4); 351 free(wb->class); 352 } 353} 354 355static idn_result_t 356workbuf_extend(workbuf_t *wb) { 357 int newsize = wb->size * 3; 358 359 if (wb->ucs4 == wb->ucs4_buf) { 360 wb->ucs4 = malloc(sizeof(wb->ucs4[0]) * newsize); 361 wb->class = malloc(sizeof(wb->class[0]) * newsize); 362 } else { 363 wb->ucs4 = realloc(wb->ucs4, sizeof(wb->ucs4[0]) * newsize); 364 wb->class = realloc(wb->class, sizeof(wb->class[0]) * newsize); 365 } 366 if (wb->ucs4 == NULL || wb->class == NULL) 367 return (idn_nomemory); 368 else 369 return (idn_success); 370} 371 372static idn_result_t 373workbuf_append(workbuf_t *wb, unsigned long c) { 374 idn_result_t r; 375 376 if (wb->last >= wb->size && (r = workbuf_extend(wb)) != idn_success) 377 return (r); 378 wb->ucs4[wb->last++] = c; 379 return (idn_success); 380} 381 382static void 383workbuf_shift(workbuf_t *wb, int shift) { 384 int nmove; 385 386 assert(wb != NULL && wb->cur >= shift); 387 388 nmove = wb->last - shift; 389 (void)memmove(&wb->ucs4[0], &wb->ucs4[shift], 390 nmove * sizeof(wb->ucs4[0])); 391 (void)memmove(&wb->class[0], &wb->class[shift], 392 nmove * sizeof(wb->class[0])); 393 wb->cur -= shift; 394 wb->last -= shift; 395} 396 397static void 398workbuf_removevoid(workbuf_t *wb) { 399 int i, j; 400 int last = wb->last; 401 402 for (i = j = 0; i < last; i++) { 403 if (wb->class[i] >= 0) { 404 if (j < i) { 405 wb->ucs4[j] = wb->ucs4[i]; 406 wb->class[j] = wb->class[i]; 407 } 408 j++; 409 } 410 } 411 wb->cur -= last - j; 412 wb->last = j; 413} 414