1#ifndef lint 2static char *rcsid = "$Id: ucsset.c,v 1.1 2003/06/04 00:26:15 marka Exp $"; 3#endif 4 5/* 6 * Copyright (c) 2001 Japan Network Information Center. All rights reserved. 7 * 8 * By using this file, you agree to the terms and conditions set forth bellow. 9 * 10 * LICENSE TERMS AND CONDITIONS 11 * 12 * The following License Terms and Conditions apply, unless a different 13 * license is obtained from Japan Network Information Center ("JPNIC"), 14 * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda, 15 * Chiyoda-ku, Tokyo 101-0047, Japan. 16 * 17 * 1. Use, Modification and Redistribution (including distribution of any 18 * modified or derived work) in source and/or binary forms is permitted 19 * under this License Terms and Conditions. 20 * 21 * 2. Redistribution of source code must retain the copyright notices as they 22 * appear in each source code file, this License Terms and Conditions. 23 * 24 * 3. Redistribution in binary form must reproduce the Copyright Notice, 25 * this License Terms and Conditions, in the documentation and/or other 26 * materials provided with the distribution. For the purposes of binary 27 * distribution the "Copyright Notice" refers to the following language: 28 * "Copyright (c) 2000-2002 Japan Network Information Center. All rights reserved." 29 * 30 * 4. The name of JPNIC may not be used to endorse or promote products 31 * derived from this Software without specific prior written approval of 32 * JPNIC. 33 * 34 * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC 35 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 36 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 37 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JPNIC BE LIABLE 38 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 39 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 40 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 41 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 42 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 43 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 44 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 45 */ 46 47#include <config.h> 48 49#include <stdlib.h> 50#include <stdio.h> 51#include <string.h> 52 53#include <idn/result.h> 54#include <idn/assert.h> 55#include <idn/logmacro.h> 56#include <idn/ucsset.h> 57 58#define UCS_MAX 0x80000000UL 59 60#define INIT_SIZE 50 61 62/* 63 * Code point range. 64 * 65 * The set of code points is represented by an array of code point ranges. 66 * In the building phase, specified ranges by 'idn_ucsset_add' or 67 * 'idn_ucsset_addrange' are simply appended to the array. 68 * And 'idn_ucsset_fix' sorts the array by the code point value, and also 69 * merges any intersecting ranges. Since the array is sorted, a binary 70 * search can be used for looking up. 71 */ 72typedef struct { 73 unsigned long from; 74 unsigned long to; 75} range_t; 76 77/* 78 * Code point segment. 79 * 80 * To speed up searching further, the entire region of UCS-4 code points 81 * (U+0000 - U+7FFFFFFF) are divided into segments. For each segment, 82 * the first and last element of the range array corresponding to the 83 * segment are computed by 'idn_ucsset_fix'. This narrows down the 84 * (initial) search range. 85 */ 86typedef struct { 87 int range_start; /* index of ucsset.ranges */ 88 int range_end; /* ditto */ 89} segment_t; 90 91/* 92 * Code point to segment index conversion. 93 * 94 * Below is the function that maps a code point to the corresponding segment. 95 * The mapping is non-uniform, so that BMP, the following 16 planes that 96 * comprise Unicode code points together with BMP, and other planes 97 * have different granularity. 98 */ 99#define SEG_THLD1 0x10000 /* BMP */ 100#define SEG_THLD2 0x110000 /* Unicode (BMP+16planes) */ 101#define SEG_SFT1 10 /* BMP: 1K code points/segment */ 102#define SEG_SFT2 14 /* following 16 planes: 16K cp/seg */ 103#define SEG_SFT3 24 /* rest: 16M cp/seg */ 104#define SEG_OFF1 (SEG_THLD1 >> SEG_SFT1) 105#define SEG_OFF2 (((SEG_THLD2 - SEG_THLD1) >> SEG_SFT2) + SEG_OFF1) 106#define SEG_INDEX(v) \ 107 (((v) < SEG_THLD1) ? ((v) >> SEG_SFT1) : \ 108 ((v) < SEG_THLD2) ? ((((v) - SEG_THLD1) >> SEG_SFT2) + SEG_OFF1) : \ 109 ((((v) - SEG_THLD2) >> SEG_SFT3) + SEG_OFF2)) 110#define SEG_LEN (SEG_INDEX(UCS_MAX - 1) + 1) 111 112/* 113 * Representation of set of UCS code points. 114 */ 115typedef struct idn_ucsset { 116 segment_t segments[SEG_LEN]; 117 int fixed; 118 int size; /* allocated size of 'ranges' */ 119 int nranges; /* num of ranges */ 120 range_t *ranges; 121 int refcnt; /* reference count */ 122} ucsset; 123 124static idn_result_t addrange(idn_ucsset_t ctx, unsigned long from, 125 unsigned long to, char *func_name); 126static int comp_range(const void *v1, const void *v2); 127 128idn_result_t 129idn_ucsset_create(idn_ucsset_t *ctx) { 130 idn_ucsset_t bm; 131 132 assert(ctx != NULL); 133 134 TRACE(("idn_ucsset_create()\n")); 135 136 if ((bm = malloc(sizeof(ucsset))) == NULL) { 137 WARNING(("idn_ucsset_create: malloc failed\n")); 138 return idn_nomemory; 139 } 140 bm->size = bm->nranges = 0; 141 bm->ranges = NULL; 142 bm->fixed = 0; 143 bm->refcnt = 1; 144 *ctx = bm; 145 return (idn_success); 146} 147 148void 149idn_ucsset_destroy(idn_ucsset_t ctx) { 150 assert(ctx != NULL && ctx->refcnt > 0); 151 152 TRACE(("idn_ucsset_destroy()\n")); 153 154 if (--ctx->refcnt == 0) { 155 if (ctx->ranges != NULL) 156 free(ctx->ranges); 157 free(ctx); 158 } 159} 160 161void 162idn_ucsset_incrref(idn_ucsset_t ctx) { 163 assert(ctx != NULL && ctx->refcnt > 0); 164 165 TRACE(("idn_ucsset_incrref()\n")); 166 167 ctx->refcnt++; 168} 169 170idn_result_t 171idn_ucsset_add(idn_ucsset_t ctx, unsigned long v) { 172 assert(ctx != NULL && ctx->refcnt > 0); 173 174 TRACE(("idn_ucsset_add(v=U+%lX)\n", v)); 175 176 return (addrange(ctx, v, v, "idn_ucsset_add")); 177} 178 179idn_result_t 180idn_ucsset_addrange(idn_ucsset_t ctx, unsigned long from, 181 unsigned long to) 182{ 183 assert(ctx != NULL && ctx->refcnt > 0); 184 185 TRACE(("idn_ucsset_addrange(from=U+%lX, to=U+%lX)\n", 186 from, to)); 187 188 return (addrange(ctx, from, to, "idn_ucsset_addrange")); 189} 190 191void 192idn_ucsset_fix(idn_ucsset_t ctx) { 193 int nranges; 194 range_t *ranges; 195 segment_t *segments; 196 int i, j; 197 198 assert(ctx != NULL && ctx->refcnt > 0); 199 200 TRACE(("idn_ucsset_fix()\n")); 201 202 nranges = ctx->nranges; 203 ranges = ctx->ranges; 204 segments = ctx->segments; 205 206 if (ctx->fixed) 207 return; 208 209 ctx->fixed = 1; 210 211 /* Initialize segment array */ 212 for (i = 0; i < SEG_LEN; i++) { 213 segments[i].range_start = -1; 214 segments[i].range_end = -1; 215 } 216 217 /* If the set is empty, there's nothing to be done. */ 218 if (nranges == 0) 219 return; 220 221 /* Sort ranges. */ 222 qsort(ranges, nranges, sizeof(range_t), comp_range); 223 224 /* Merge overlapped/continuous ranges. */ 225 for (i = 0, j = 1; j < nranges; j++) { 226 if (ranges[i].to + 1 >= ranges[j].from) { 227 /* can be merged */ 228 if (ranges[i].to < ranges[j].to) { 229 ranges[i].to = ranges[j].to; 230 } 231 } else { 232 i++; 233 if (i < j) 234 ranges[i] = ranges[j]; 235 } 236 } 237 /* 'i' points the last range in the array. */ 238 ctx->nranges = nranges = ++i; 239 240 /* Create segment array. */ 241 for (i = 0; i < nranges; i++) { 242 int fidx = SEG_INDEX(ranges[i].from); 243 int tidx = SEG_INDEX(ranges[i].to); 244 245 for (j = fidx; j <= tidx; j++) { 246 if (segments[j].range_start < 0) 247 segments[j].range_start = i; 248 segments[j].range_end = i; 249 } 250 } 251 252#if 0 253 /* 254 * Does the standard guarantee realloc() always succeeds 255 * when shrinking? 256 */ 257 /* Shrink malloc'ed space if possible. */ 258 ctx->ranges = realloc(ctx->ranges, ctx->nranges * sizeof(range_t)); 259#endif 260} 261 262idn_result_t 263idn_ucsset_lookup(idn_ucsset_t ctx, unsigned long v, int *found) { 264 int idx; 265 segment_t *segments; 266 267 assert(ctx != NULL && ctx->refcnt > 0 && found != NULL); 268 269 TRACE(("idn_ucsset_lookup(v=U+%lX)\n", v)); 270 271 /* Make sure it is fixed. */ 272 if (!ctx->fixed) { 273 WARNING(("idn_ucsset_lookup: not fixed yet\n")); 274 return (idn_failure); 275 } 276 277 /* Check the given code point. */ 278 if (v >= UCS_MAX) 279 return (idn_invalid_codepoint); 280 281 /* Get the segment 'v' belongs to. */ 282 segments = ctx->segments; 283 idx = SEG_INDEX(v); 284 285 /* Do binary search. */ 286 *found = 0; 287 if (segments[idx].range_start >= 0) { 288 int lo = segments[idx].range_start; 289 int hi = segments[idx].range_end; 290 range_t *ranges = ctx->ranges; 291 292 while (lo <= hi) { 293 int mid = (lo + hi) / 2; 294 if (v < ranges[mid].from) { 295 hi = mid - 1; 296 } else if (v > ranges[mid].to) { 297 lo = mid + 1; 298 } else { 299 *found = 1; 300 break; 301 } 302 } 303 } 304 return (idn_success); 305} 306 307static idn_result_t 308addrange(idn_ucsset_t ctx, unsigned long from, unsigned long to, 309 char *func_name) 310{ 311 range_t *newbuf; 312 313 /* Check the given code points. */ 314 if (from > UCS_MAX) { 315 WARNING(("%s: code point out of range (U+%lX)\n", 316 func_name, from)); 317 return (idn_invalid_codepoint); 318 } else if (to > UCS_MAX) { 319 WARNING(("%s: code point out of range (U+%lX)\n", 320 func_name, to)); 321 return (idn_invalid_codepoint); 322 } else if (from > to) { 323 WARNING(("%s: invalid range spec (U+%lX-U+%lX)\n", 324 func_name, from, to)); 325 return (idn_invalid_codepoint); 326 } 327 328 /* Make sure it is not fixed yet. */ 329 if (ctx->fixed) { 330 WARNING(("%s: attempt to add to already fixed object\n", 331 func_name)); 332 return (idn_failure); 333 } 334 335 /* Append the specified range to the 'ranges' array. */ 336 if (ctx->nranges >= ctx->size) { 337 /* Make it bigger. */ 338 if (ctx->size == 0) 339 ctx->size = INIT_SIZE; 340 else 341 ctx->size *= 2; 342 newbuf = realloc(ctx->ranges, ctx->size * sizeof(range_t)); 343 if (newbuf == NULL) 344 return (idn_nomemory); 345 ctx->ranges = newbuf; 346 } 347 ctx->ranges[ctx->nranges].from = from; 348 ctx->ranges[ctx->nranges].to = to; 349 ctx->nranges++; 350 351 return (idn_success); 352} 353 354static int 355comp_range(const void *v1, const void *v2) { 356 /* 357 * Range comparation function suitable for qsort(). 358 */ 359 const range_t *r1 = v1; 360 const range_t *r2 = v2; 361 362 if (r1->from < r2->from) 363 return (-1); 364 else if (r1->from > r2->from) 365 return (1); 366 else 367 return (0); 368} 369