1/* 2 * Copyright (c) 2004, 2006, 2007, 2008 Kungliga Tekniska Högskolan 3 * (Royal Institute of Technology, Stockholm, Sweden). 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the Institute nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 */ 33 34#include <config.h> 35#include "windlocl.h" 36 37static int 38utf8toutf32(const unsigned char **pp, uint32_t *out) 39{ 40 const unsigned char *p = *pp; 41 unsigned c = *p; 42 43 if (c & 0x80) { 44 if ((c & 0xE0) == 0xC0) { 45 const unsigned c2 = *++p; 46 if ((c2 & 0xC0) == 0x80) { 47 *out = ((c & 0x1F) << 6) 48 | (c2 & 0x3F); 49 } else { 50 return WIND_ERR_INVALID_UTF8; 51 } 52 } else if ((c & 0xF0) == 0xE0) { 53 const unsigned c2 = *++p; 54 if ((c2 & 0xC0) == 0x80) { 55 const unsigned c3 = *++p; 56 if ((c3 & 0xC0) == 0x80) { 57 *out = ((c & 0x0F) << 12) 58 | ((c2 & 0x3F) << 6) 59 | (c3 & 0x3F); 60 } else { 61 return WIND_ERR_INVALID_UTF8; 62 } 63 } else { 64 return WIND_ERR_INVALID_UTF8; 65 } 66 } else if ((c & 0xF8) == 0xF0) { 67 const unsigned c2 = *++p; 68 if ((c2 & 0xC0) == 0x80) { 69 const unsigned c3 = *++p; 70 if ((c3 & 0xC0) == 0x80) { 71 const unsigned c4 = *++p; 72 if ((c4 & 0xC0) == 0x80) { 73 *out = ((c & 0x07) << 18) 74 | ((c2 & 0x3F) << 12) 75 | ((c3 & 0x3F) << 6) 76 | (c4 & 0x3F); 77 } else { 78 return WIND_ERR_INVALID_UTF8; 79 } 80 } else { 81 return WIND_ERR_INVALID_UTF8; 82 } 83 } else { 84 return WIND_ERR_INVALID_UTF8; 85 } 86 } else { 87 return WIND_ERR_INVALID_UTF8; 88 } 89 } else { 90 *out = c; 91 } 92 93 *pp = p; 94 95 return 0; 96} 97 98/** 99 * Convert an UTF-8 string to an UCS4 string. 100 * 101 * @param in an UTF-8 string to convert. 102 * @param out the resulting UCS4 strint, must be at least 103 * wind_utf8ucs4_length() long. If out is NULL, the function will 104 * calculate the needed space for the out variable (just like 105 * wind_utf8ucs4_length()). 106 * @param out_len before processing out_len should be the length of 107 * the out variable, after processing it will be the length of the out 108 * string. 109 * 110 * @return returns 0 on success, an wind error code otherwise 111 * @ingroup wind 112 */ 113 114int 115wind_utf8ucs4(const char *in, uint32_t *out, size_t *out_len) 116{ 117 const unsigned char *p; 118 size_t o = 0; 119 int ret; 120 121 for (p = (const unsigned char *)in; *p != '\0'; ++p) { 122 uint32_t u; 123 124 ret = utf8toutf32(&p, &u); 125 if (ret) 126 return ret; 127 128 if (out) { 129 if (o >= *out_len) 130 return WIND_ERR_OVERRUN; 131 out[o] = u; 132 } 133 o++; 134 } 135 *out_len = o; 136 return 0; 137} 138 139/** 140 * Calculate the length of from converting a UTF-8 string to a UCS4 141 * string. 142 * 143 * @param in an UTF-8 string to convert. 144 * @param out_len the length of the resulting UCS4 string. 145 * 146 * @return returns 0 on success, an wind error code otherwise 147 * @ingroup wind 148 */ 149 150int 151wind_utf8ucs4_length(const char *in, size_t *out_len) 152{ 153 return wind_utf8ucs4(in, NULL, out_len); 154} 155 156/** 157 * Convert an UTF-8 string to an UCS4 string. 158 * 159 * @param in an UTF-8 string to convert. 160 * @param out the resulting UCS4 strint, must be free with free(). 161 * @param out_len will be the length of the out string. 162 * 163 * @return returns 0 on success, an wind error code otherwise 164 * @ingroup wind 165 */ 166 167int 168wind_utf8ucs4_copy(const char *in, uint32_t **out, size_t *out_len) 169{ 170 int ret; 171 172 ret = wind_utf8ucs4_length(in, out_len); 173 if (ret) 174 return ret; 175 if (*out_len > UINT_MAX / sizeof((*out)[0])) 176 return ERANGE; 177 if (*out_len == 0) 178 return 0; 179 180 *out = malloc(*out_len * sizeof((*out)[0])); 181 if (*out == NULL) { 182 *out_len = 0; 183 return ENOMEM; 184 } 185 186 ret = wind_utf8ucs4(in, *out, out_len); 187 if (ret) { 188 free(*out); 189 *out = NULL; 190 *out_len = 0; 191 } 192 return ret; 193} 194 195 196static const char first_char[4] = 197 { 0x00, 0xC0, 0xE0, 0xF0 }; 198 199/** 200 * Convert an UCS4 string to a UTF-8 string. 201 * 202 * @param in an UCS4 string to convert. 203 * @param in_len the length input array. 204 205 * @param out the resulting UTF-8 strint, must be at least 206 * wind_ucs4utf8_length() + 1 long (the extra char for the NUL). If 207 * out is NULL, the function will calculate the needed space for the 208 * out variable (just like wind_ucs4utf8_length()). 209 210 * @param out_len before processing out_len should be the length of 211 * the out variable, after processing it will be the length of the out 212 * string. NUL not included. 213 * 214 * @return returns 0 on success, an wind error code otherwise 215 * @ingroup wind 216 */ 217 218int 219wind_ucs4utf8(const uint32_t *in, size_t in_len, char *out, size_t *out_len) 220{ 221 uint32_t ch; 222 size_t i, len, o; 223 224 for (o = 0, i = 0; i < in_len; i++) { 225 ch = in[i]; 226 227 if (ch < 0x80) { 228 len = 1; 229 } else if (ch < 0x800) { 230 len = 2; 231 } else if (ch < 0x10000) { 232 len = 3; 233 } else if (ch <= 0x10FFFF) { 234 len = 4; 235 } else 236 return WIND_ERR_INVALID_UTF32; 237 238 o += len; 239 240 if (out) { 241 if (o >= *out_len) 242 return WIND_ERR_OVERRUN; 243 244 switch(len) { 245 case 4: 246 out[3] = (ch | 0x80) & 0xbf; 247 ch = ch << 6; 248 case 3: 249 out[2] = (ch | 0x80) & 0xbf; 250 ch = ch << 6; 251 case 2: 252 out[1] = (ch | 0x80) & 0xbf; 253 ch = ch << 6; 254 case 1: 255 out[0] = ch | first_char[len - 1]; 256 } 257 out += len; 258 } 259 } 260 if (out) { 261 if (o >= *out_len) 262 return WIND_ERR_OVERRUN; 263 *out = '\0'; 264 } 265 *out_len = o; 266 return 0; 267} 268 269/** 270 * Calculate the length of from converting a UCS4 string to an UTF-8 string. 271 * 272 * @param in an UCS4 string to convert. 273 * @param in_len the length of UCS4 string to convert. 274 * @param out_len the length of the resulting UTF-8 string. 275 * 276 * @return returns 0 on success, an wind error code otherwise 277 * @ingroup wind 278 */ 279 280int 281wind_ucs4utf8_length(const uint32_t *in, size_t in_len, size_t *out_len) 282{ 283 return wind_ucs4utf8(in, in_len, NULL, out_len); 284} 285 286/** 287 * Convert an UCS4 string to a UTF-8 string. 288 * 289 * @param in an UCS4 string to convert. 290 * @param in_len the length input array. 291 * @param out an allocated string, should be released with free(). 292 * @param out_len size of out string, NUL not included in count. 293 * 294 * @return returns 0 on success, an wind error code otherwise 295 * @ingroup wind 296 */ 297 298int 299wind_ucs4utf8_copy(const uint32_t *in, size_t in_len, char **out, size_t *out_len) 300{ 301 size_t size; 302 int ret; 303 304 ret = wind_ucs4utf8_length(in, in_len, &size); 305 if (ret) 306 return ret; 307 308 size += 1; 309 310 *out = malloc(size); 311 if (*out == NULL) 312 return ENOMEM; 313 314 ret = wind_ucs4utf8(in, in_len, *out, &size); 315 if (ret) { 316 free(*out); 317 *out = NULL; 318 return ret; 319 } 320 321 if (out_len) 322 *out_len = size; 323 324 return 0; 325} 326 327 328/** 329 * Read in an UCS2 from a buffer. 330 * 331 * @param ptr The input buffer to read from. 332 * @param len the length of the input buffer. 333 * @param flags Flags to control the behavior of the function. 334 * @param out the output UCS2, the array must be at least out/2 long. 335 * @param out_len the output length 336 * 337 * @return returns 0 on success, an wind error code otherwise. 338 * @ingroup wind 339 */ 340 341int 342wind_ucs2read(const void *ptr, size_t len, unsigned int *flags, 343 uint16_t *out, size_t *out_len) 344{ 345 const unsigned char *p = ptr; 346 int little = ((*flags) & WIND_RW_LE); 347 size_t olen = *out_len; 348 349 /** if len is zero, flags are unchanged */ 350 if (len == 0) { 351 *out_len = 0; 352 return 0; 353 } 354 355 /** if len is odd, WIND_ERR_LENGTH_NOT_MOD2 is returned */ 356 if (len & 1) 357 return WIND_ERR_LENGTH_NOT_MOD2; 358 359 /** 360 * If the flags WIND_RW_BOM is set, check for BOM. If not BOM is 361 * found, check is LE/BE flag is already and use that otherwise 362 * fail with WIND_ERR_NO_BOM. When done, clear WIND_RW_BOM and 363 * the LE/BE flag and set the resulting LE/BE flag. 364 */ 365 if ((*flags) & WIND_RW_BOM) { 366 uint16_t bom = (p[0] << 8) + p[1]; 367 if (bom == 0xfffe || bom == 0xfeff) { 368 little = (bom == 0xfffe); 369 p += 2; 370 len -= 2; 371 } else if (((*flags) & (WIND_RW_LE|WIND_RW_BE)) != 0) { 372 /* little already set */ 373 } else 374 return WIND_ERR_NO_BOM; 375 *flags = ((*flags) & ~(WIND_RW_BOM|WIND_RW_LE|WIND_RW_BE)); 376 *flags |= little ? WIND_RW_LE : WIND_RW_BE; 377 } 378 379 while (len) { 380 if (olen < 1) 381 return WIND_ERR_OVERRUN; 382 if (little) 383 *out = (p[1] << 8) + p[0]; 384 else 385 *out = (p[0] << 8) + p[1]; 386 out++; p += 2; len -= 2; olen--; 387 } 388 *out_len -= olen; 389 return 0; 390} 391 392/** 393 * Write an UCS2 string to a buffer. 394 * 395 * @param in The input UCS2 string. 396 * @param in_len the length of the input buffer. 397 * @param flags Flags to control the behavior of the function. 398 * @param ptr The input buffer to write to, the array must be at least 399 * (in + 1) * 2 bytes long. 400 * @param out_len the output length 401 * 402 * @return returns 0 on success, an wind error code otherwise. 403 * @ingroup wind 404 */ 405 406int 407wind_ucs2write(const uint16_t *in, size_t in_len, unsigned int *flags, 408 void *ptr, size_t *out_len) 409{ 410 unsigned char *p = ptr; 411 size_t len = *out_len; 412 413 /** If in buffer is not of length be mod 2, WIND_ERR_LENGTH_NOT_MOD2 is returned*/ 414 if (len & 1) 415 return WIND_ERR_LENGTH_NOT_MOD2; 416 417 /** On zero input length, flags are preserved */ 418 if (in_len == 0) { 419 *out_len = 0; 420 return 0; 421 } 422 /** If flags have WIND_RW_BOM set, the byte order mark is written 423 * first to the output data */ 424 if ((*flags) & WIND_RW_BOM) { 425 uint16_t bom = 0xfffe; 426 427 if (len < 2) 428 return WIND_ERR_OVERRUN; 429 430 if ((*flags) & WIND_RW_LE) { 431 p[0] = (bom >> 8) & 0xff; 432 p[1] = (bom ) & 0xff; 433 } else { 434 p[1] = (bom ) & 0xff; 435 p[0] = (bom >> 8) & 0xff; 436 } 437 len -= 2; 438 } 439 440 while (in_len) { 441 /** If the output wont fit into out_len, WIND_ERR_OVERRUN is returned */ 442 if (len < 2) 443 return WIND_ERR_OVERRUN; 444 if ((*flags) & WIND_RW_LE) { 445 p[0] = (in[0] >> 8) & 0xff; 446 p[1] = (in[0] ) & 0xff; 447 } else { 448 p[1] = (in[0] ) & 0xff; 449 p[0] = (in[0] >> 8) & 0xff; 450 } 451 len -= 2; 452 in_len--; 453 p += 2; 454 in++; 455 } 456 *out_len -= len; 457 return 0; 458} 459 460 461/** 462 * Convert an UTF-8 string to an UCS2 string. 463 * 464 * @param in an UTF-8 string to convert. 465 * @param out the resulting UCS2 strint, must be at least 466 * wind_utf8ucs2_length() long. If out is NULL, the function will 467 * calculate the needed space for the out variable (just like 468 * wind_utf8ucs2_length()). 469 * @param out_len before processing out_len should be the length of 470 * the out variable, after processing it will be the length of the out 471 * string. 472 * 473 * @return returns 0 on success, an wind error code otherwise 474 * @ingroup wind 475 */ 476 477int 478wind_utf8ucs2(const char *in, uint16_t *out, size_t *out_len) 479{ 480 const unsigned char *p; 481 size_t o = 0; 482 int ret; 483 484 for (p = (const unsigned char *)in; *p != '\0'; ++p) { 485 uint32_t u; 486 487 ret = utf8toutf32(&p, &u); 488 if (ret) 489 return ret; 490 491 if (u & 0xffff0000) 492 return WIND_ERR_NOT_UTF16; 493 494 if (out) { 495 if (o >= *out_len) 496 return WIND_ERR_OVERRUN; 497 out[o] = u; 498 } 499 o++; 500 } 501 *out_len = o; 502 return 0; 503} 504 505/** 506 * Calculate the length of from converting a UTF-8 string to a UCS2 507 * string. 508 * 509 * @param in an UTF-8 string to convert. 510 * @param out_len the length of the resulting UCS4 string. 511 * 512 * @return returns 0 on success, an wind error code otherwise 513 * @ingroup wind 514 */ 515 516int 517wind_utf8ucs2_length(const char *in, size_t *out_len) 518{ 519 return wind_utf8ucs2(in, NULL, out_len); 520} 521 522/** 523 * Convert an UCS2 string to a UTF-8 string. 524 * 525 * @param in an UCS2 string to convert. 526 * @param in_len the length of the in UCS2 string. 527 * @param out the resulting UTF-8 strint, must be at least 528 * wind_ucs2utf8_length() long. If out is NULL, the function will 529 * calculate the needed space for the out variable (just like 530 * wind_ucs2utf8_length()). 531 * @param out_len before processing out_len should be the length of 532 * the out variable, after processing it will be the length of the out 533 * string. 534 * 535 * @return returns 0 on success, an wind error code otherwise 536 * @ingroup wind 537 */ 538 539int 540wind_ucs2utf8(const uint16_t *in, size_t in_len, char *out, size_t *out_len) 541{ 542 uint16_t ch; 543 size_t i, len, o; 544 545 for (o = 0, i = 0; i < in_len; i++) { 546 ch = in[i]; 547 548 if (ch < 0x80) { 549 len = 1; 550 } else if (ch < 0x800) { 551 len = 2; 552 } else 553 len = 3; 554 555 o += len; 556 557 if (out) { 558 if (o >= *out_len) 559 return WIND_ERR_OVERRUN; 560 561 switch(len) { 562 case 3: 563 out[2] = (ch | 0x80) & 0xbf; 564 ch = ch << 6; 565 case 2: 566 out[1] = (ch | 0x80) & 0xbf; 567 ch = ch << 6; 568 case 1: 569 out[0] = ch | first_char[len - 1]; 570 } 571 out += len; 572 } 573 } 574 if (out) { 575 if (o >= *out_len) 576 return WIND_ERR_OVERRUN; 577 *out = '\0'; 578 } 579 *out_len = o; 580 return 0; 581} 582 583/** 584 * Calculate the length of from converting a UCS2 string to an UTF-8 string. 585 * 586 * @param in an UCS2 string to convert. 587 * @param in_len an UCS2 string length to convert. 588 * @param out_len the length of the resulting UTF-8 string. 589 * 590 * @return returns 0 on success, an wind error code otherwise 591 * @ingroup wind 592 */ 593 594int 595wind_ucs2utf8_length(const uint16_t *in, size_t in_len, size_t *out_len) 596{ 597 return wind_ucs2utf8(in, in_len, NULL, out_len); 598} 599