1#ifndef lint 2static char *rcsid = "$Id: ucs4.c,v 1.1 2003/06/04 00:26:14 marka Exp $"; 3#endif 4 5/* 6 * Copyright (c) 2001 Japan Network Information Center. All rights reserved. 7 * 8 * By using this file, you agree to the terms and conditions set forth bellow. 9 * 10 * LICENSE TERMS AND CONDITIONS 11 * 12 * The following License Terms and Conditions apply, unless a different 13 * license is obtained from Japan Network Information Center ("JPNIC"), 14 * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda, 15 * Chiyoda-ku, Tokyo 101-0047, Japan. 16 * 17 * 1. Use, Modification and Redistribution (including distribution of any 18 * modified or derived work) in source and/or binary forms is permitted 19 * under this License Terms and Conditions. 20 * 21 * 2. Redistribution of source code must retain the copyright notices as they 22 * appear in each source code file, this License Terms and Conditions. 23 * 24 * 3. Redistribution in binary form must reproduce the Copyright Notice, 25 * this License Terms and Conditions, in the documentation and/or other 26 * materials provided with the distribution. For the purposes of binary 27 * distribution the "Copyright Notice" refers to the following language: 28 * "Copyright (c) 2000-2002 Japan Network Information Center. All rights reserved." 29 * 30 * 4. The name of JPNIC may not be used to endorse or promote products 31 * derived from this Software without specific prior written approval of 32 * JPNIC. 33 * 34 * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC 35 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 36 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 37 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JPNIC BE LIABLE 38 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 39 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 40 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 41 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 42 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 43 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 44 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 45 */ 46 47#include <config.h> 48 49#include <stddef.h> 50#include <stdlib.h> 51#include <string.h> 52 53#include <idn/assert.h> 54#include <idn/result.h> 55#include <idn/logmacro.h> 56#include <idn/util.h> 57#include <idn/ucs4.h> 58#include <idn/debug.h> 59 60/* 61 * Unicode surrogate pair. 62 */ 63#define IS_SURROGATE_HIGH(v) (0xd800 <= (v) && (v) <= 0xdbff) 64#define IS_SURROGATE_LOW(v) (0xdc00 <= (v) && (v) <= 0xdfff) 65#define SURROGATE_HIGH(v) (SURROGATE_H_OFF + (((v) - 0x10000) >> 10)) 66#define SURROGATE_LOW(v) (SURROGATE_L_OFF + ((v) & 0x3ff)) 67#define SURROGATE_BASE 0x10000 68#define SURROGATE_H_OFF 0xd800 69#define SURROGATE_L_OFF 0xdc00 70#define COMBINE_SURROGATE(h, l) \ 71 (SURROGATE_BASE + (((h)-SURROGATE_H_OFF)<<10) + ((l)-SURROGATE_L_OFF)) 72 73/* 74 * ASCII ctype macros. 75 * Note that these macros evaluate the argument multiple times. Be careful. 76 */ 77#define ASCII_TOUPPER(c) \ 78 (('a' <= (c) && (c) <= 'z') ? ((c) - 'a' + 'A') : (c)) 79#define ASCII_TOLOWER(c) \ 80 (('A' <= (c) && (c) <= 'Z') ? ((c) - 'A' + 'a') : (c)) 81 82idn_result_t 83idn_ucs4_ucs4toutf16(const unsigned long *ucs4, unsigned short *utf16, 84 size_t tolen) { 85 unsigned short *utf16p = utf16; 86 unsigned long v; 87 idn_result_t r; 88 89 TRACE(("idn_ucs4_ucs4toutf16(ucs4=\"%s\", tolen=%d)\n", 90 idn__debug_ucs4xstring(ucs4, 50), (int)tolen)); 91 92 while (*ucs4 != '\0') { 93 v = *ucs4++; 94 95 if (IS_SURROGATE_LOW(v) || IS_SURROGATE_HIGH(v)) { 96 WARNING(("idn_ucs4_ucs4toutf16: UCS4 string contains " 97 "surrogate pair\n")); 98 r = idn_invalid_encoding; 99 goto ret; 100 } else if (v > 0xffff) { 101 /* Convert to surrogate pair */ 102 if (v >= 0x110000) { 103 r = idn_invalid_encoding; 104 goto ret; 105 } 106 if (tolen < 2) { 107 r = idn_buffer_overflow; 108 goto ret; 109 } 110 *utf16p++ = SURROGATE_HIGH(v); 111 *utf16p++ = SURROGATE_LOW(v); 112 tolen -= 2; 113 } else { 114 if (tolen < 1) { 115 r = idn_buffer_overflow; 116 goto ret; 117 } 118 *utf16p++ = v; 119 tolen--; 120 } 121 } 122 123 if (tolen < 1) { 124 r = idn_buffer_overflow; 125 goto ret; 126 } 127 *utf16p = '\0'; 128 129 r = idn_success; 130ret: 131 if (r == idn_success) { 132 TRACE(("idn_ucs4_ucs4toutf16(): success (utf16=\"%s\")\n", 133 idn__debug_utf16xstring(utf16, 50))); 134 } else { 135 TRACE(("idn_ucs4_ucs4toutf16(): %s\n", 136 idn_result_tostring(r))); 137 } 138 return (r); 139} 140 141idn_result_t 142idn_ucs4_utf16toucs4(const unsigned short *utf16, unsigned long *ucs4, 143 size_t tolen) { 144 unsigned long *ucs4p = ucs4; 145 unsigned short v0, v1; 146 idn_result_t r; 147 148 TRACE(("idn_ucs4_utf16toucs4(utf16=\"%s\", tolen=%d)\n", 149 idn__debug_utf16xstring(utf16, 50), (int)tolen)); 150 151 while (*utf16 != '\0') { 152 v0 = *utf16; 153 154 if (tolen < 1) { 155 r = idn_buffer_overflow; 156 goto ret; 157 } 158 159 if (IS_SURROGATE_HIGH(v0)) { 160 v1 = *(utf16 + 1); 161 if (!IS_SURROGATE_LOW(v1)) { 162 WARNING(("idn_ucs4_utf16toucs4: " 163 "corrupted surrogate pair\n")); 164 r = idn_invalid_encoding; 165 goto ret; 166 } 167 *ucs4p++ = COMBINE_SURROGATE(v0, v1); 168 tolen--; 169 utf16 += 2; 170 171 } else { 172 *ucs4p++ = v0; 173 tolen--; 174 utf16++; 175 176 } 177 } 178 179 if (tolen < 1) { 180 r = idn_buffer_overflow; 181 goto ret; 182 } 183 *ucs4p = '\0'; 184 185 r = idn_success; 186ret: 187 if (r == idn_success) { 188 TRACE(("idn_ucs4_utf16toucs4(): success (ucs4=\"%s\")\n", 189 idn__debug_ucs4xstring(ucs4, 50))); 190 } else { 191 TRACE(("idn_ucs4_utf16toucs4(): %s\n", 192 idn_result_tostring(r))); 193 } 194 return (r); 195} 196 197idn_result_t 198idn_ucs4_utf8toucs4(const char *utf8, unsigned long *ucs4, size_t tolen) { 199 const unsigned char *utf8p = (const unsigned char *)utf8; 200 unsigned long *ucs4p = ucs4; 201 unsigned long v, min; 202 unsigned char c; 203 int width; 204 int i; 205 idn_result_t r; 206 207 TRACE(("idn_ucs4_utf8toucs4(utf8=\"%s\", tolen=%d)\n", 208 idn__debug_xstring(utf8, 50), (int)tolen)); 209 210 while(*utf8p != '\0') { 211 c = *utf8p++; 212 if (c < 0x80) { 213 v = c; 214 min = 0; 215 width = 1; 216 } else if (c < 0xc0) { 217 WARNING(("idn_ucs4_utf8toucs4: invalid character\n")); 218 r = idn_invalid_encoding; 219 goto ret; 220 } else if (c < 0xe0) { 221 v = c & 0x1f; 222 min = 0x80; 223 width = 2; 224 } else if (c < 0xf0) { 225 v = c & 0x0f; 226 min = 0x800; 227 width = 3; 228 } else if (c < 0xf8) { 229 v = c & 0x07; 230 min = 0x10000; 231 width = 4; 232 } else if (c < 0xfc) { 233 v = c & 0x03; 234 min = 0x200000; 235 width = 5; 236 } else if (c < 0xfe) { 237 v = c & 0x01; 238 min = 0x4000000; 239 width = 6; 240 } else { 241 WARNING(("idn_ucs4_utf8toucs4: invalid character\n")); 242 r = idn_invalid_encoding; 243 goto ret; 244 } 245 246 for (i = width - 1; i > 0; i--) { 247 c = *utf8p++; 248 if (c < 0x80 || 0xc0 <= c) { 249 WARNING(("idn_ucs4_utf8toucs4: " 250 "invalid character\n")); 251 r = idn_invalid_encoding; 252 goto ret; 253 } 254 v = (v << 6) | (c & 0x3f); 255 } 256 257 if (v < min) { 258 WARNING(("idn_ucs4_utf8toucs4: invalid character\n")); 259 r = idn_invalid_encoding; 260 goto ret; 261 } 262 if (IS_SURROGATE_LOW(v) || IS_SURROGATE_HIGH(v)) { 263 WARNING(("idn_ucs4_utf8toucs4: UTF-8 string contains " 264 "surrogate pair\n")); 265 r = idn_invalid_encoding; 266 goto ret; 267 } 268 if (tolen < 1) { 269 r = idn_buffer_overflow; 270 goto ret; 271 } 272 tolen--; 273 *ucs4p++ = v; 274 } 275 276 if (tolen < 1) { 277 r = idn_buffer_overflow; 278 goto ret; 279 } 280 *ucs4p = '\0'; 281 282 r = idn_success; 283ret: 284 if (r == idn_success) { 285 TRACE(("idn_ucs4_utf8toucs4(): success (ucs4=\"%s\")\n", 286 idn__debug_ucs4xstring(ucs4, 50))); 287 } else { 288 TRACE(("idn_ucs4_utf8toucs4(): %s\n", 289 idn_result_tostring(r))); 290 } 291 return (r); 292} 293 294idn_result_t 295idn_ucs4_ucs4toutf8(const unsigned long *ucs4, char *utf8, size_t tolen) { 296 unsigned char *utf8p = (unsigned char *)utf8; 297 unsigned long v; 298 int width; 299 int mask; 300 int offset; 301 idn_result_t r; 302 303 TRACE(("idn_ucs4_ucs4toutf8(ucs4=\"%s\", tolen=%d)\n", 304 idn__debug_ucs4xstring(ucs4, 50), (int)tolen)); 305 306 while (*ucs4 != '\0') { 307 v = *ucs4++; 308 if (IS_SURROGATE_LOW(v) || IS_SURROGATE_HIGH(v)) { 309 WARNING(("idn_ucs4_ucs4toutf8: UCS4 string contains " 310 "surrogate pair\n")); 311 r = idn_invalid_encoding; 312 goto ret; 313 } 314 if (v < 0x80) { 315 mask = 0; 316 width = 1; 317 } else if (v < 0x800) { 318 mask = 0xc0; 319 width = 2; 320 } else if (v < 0x10000) { 321 mask = 0xe0; 322 width = 3; 323 } else if (v < 0x200000) { 324 mask = 0xf0; 325 width = 4; 326 } else if (v < 0x4000000) { 327 mask = 0xf8; 328 width = 5; 329 } else if (v < 0x80000000) { 330 mask = 0xfc; 331 width = 6; 332 } else { 333 WARNING(("idn_ucs4_ucs4toutf8: invalid character\n")); 334 r = idn_invalid_encoding; 335 goto ret; 336 } 337 338 if (tolen < width) { 339 r = idn_buffer_overflow; 340 goto ret; 341 } 342 offset = 6 * (width - 1); 343 *utf8p++ = (v >> offset) | mask; 344 mask = 0x80; 345 while (offset > 0) { 346 offset -= 6; 347 *utf8p++ = ((v >> offset) & 0x3f) | mask; 348 } 349 tolen -= width; 350 } 351 352 if (tolen < 1) { 353 r = idn_buffer_overflow; 354 goto ret; 355 } 356 *utf8p = '\0'; 357 358 r = idn_success; 359ret: 360 if (r == idn_success) { 361 TRACE(("idn_ucs4_ucs4toutf8(): success (utf8=\"%s\")\n", 362 idn__debug_xstring(utf8, 50))); 363 } else { 364 TRACE(("idn_ucs4_ucs4toutf8(): %s\n", 365 idn_result_tostring(r))); 366 } 367 return (r); 368} 369 370size_t 371idn_ucs4_strlen(const unsigned long *ucs4) { 372 size_t len; 373 374 for (len = 0; *ucs4 != '\0'; ucs4++, len++) 375 /* nothing to do */ ; 376 377 return (len); 378} 379 380unsigned long * 381idn_ucs4_strcpy(unsigned long *to, const unsigned long *from) { 382 unsigned long *result = to; 383 384 while (*from != '\0') 385 *to++ = *from++; 386 *to = '\0'; 387 388 return (result); 389} 390 391unsigned long * 392idn_ucs4_strcat(unsigned long *to, const unsigned long *from) { 393 unsigned long *result = to; 394 395 while (*to != '\0') 396 to++; 397 398 while (*from != '\0') 399 *to++ = *from++; 400 *to = '\0'; 401 402 return (result); 403} 404 405int 406idn_ucs4_strcmp(const unsigned long *str1, const unsigned long *str2) { 407 while (*str1 != '\0') { 408 if (*str1 > *str2) 409 return (1); 410 else if (*str1 < *str2) 411 return (-1); 412 str1++; 413 str2++; 414 } 415 416 if (*str1 > *str2) 417 return (1); 418 else if (*str1 < *str2) 419 return (-1); 420 421 return (0); 422} 423 424int 425idn_ucs4_strcasecmp(const unsigned long *str1, const unsigned long *str2) { 426 unsigned long c1, c2; 427 428 while (*str1 != '\0') { 429 c1 = ASCII_TOLOWER(*str1); 430 c2 = ASCII_TOLOWER(*str2); 431 if (c1 > c2) 432 return (1); 433 else if (c1 < c2) 434 return (-1); 435 str1++; 436 str2++; 437 } 438 439 c1 = ASCII_TOLOWER(*str1); 440 c2 = ASCII_TOLOWER(*str2); 441 if (c1 > c2) 442 return (1); 443 else if (c1 < c2) 444 return (-1); 445 446 return (0); 447} 448 449 450unsigned long * 451idn_ucs4_strdup(const unsigned long *str) { 452 size_t length = idn_ucs4_strlen(str); 453 unsigned long *dupstr; 454 455 dupstr = (unsigned long *)malloc(sizeof(*str) * (length + 1)); 456 if (dupstr == NULL) 457 return NULL; 458 memcpy(dupstr, str, sizeof(*str) * (length + 1)); 459 460 return dupstr; 461} 462