1/* $NetBSD: utf8.c,v 1.3 2023/06/19 21:41:45 christos Exp $ */ 2 3/* 4 * Copyright (c) 2004, 2006, 2007, 2008 Kungliga Tekniska H��gskolan 5 * (Royal Institute of Technology, Stockholm, Sweden). 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * 3. Neither the name of the Institute nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 36#include <config.h> 37#include "windlocl.h" 38 39static int 40utf8toutf32(const unsigned char **pp, uint32_t *out) 41{ 42 const unsigned char *p = *pp; 43 unsigned c = *p; 44 45 if (c & 0x80) { 46 if ((c & 0xE0) == 0xC0) { 47 const unsigned c2 = *++p; 48 if ((c2 & 0xC0) == 0x80) { 49 *out = ((c & 0x1F) << 6) 50 | (c2 & 0x3F); 51 } else { 52 return WIND_ERR_INVALID_UTF8; 53 } 54 } else if ((c & 0xF0) == 0xE0) { 55 const unsigned c2 = *++p; 56 if ((c2 & 0xC0) == 0x80) { 57 const unsigned c3 = *++p; 58 if ((c3 & 0xC0) == 0x80) { 59 *out = ((c & 0x0F) << 12) 60 | ((c2 & 0x3F) << 6) 61 | (c3 & 0x3F); 62 } else { 63 return WIND_ERR_INVALID_UTF8; 64 } 65 } else { 66 return WIND_ERR_INVALID_UTF8; 67 } 68 } else if ((c & 0xF8) == 0xF0) { 69 const unsigned c2 = *++p; 70 if ((c2 & 0xC0) == 0x80) { 71 const unsigned c3 = *++p; 72 if ((c3 & 0xC0) == 0x80) { 73 const unsigned c4 = *++p; 74 if ((c4 & 0xC0) == 0x80) { 75 *out = ((c & 0x07) << 18) 76 | ((c2 & 0x3F) << 12) 77 | ((c3 & 0x3F) << 6) 78 | (c4 & 0x3F); 79 } else { 80 return WIND_ERR_INVALID_UTF8; 81 } 82 } else { 83 return WIND_ERR_INVALID_UTF8; 84 } 85 } else { 86 return WIND_ERR_INVALID_UTF8; 87 } 88 } else { 89 return WIND_ERR_INVALID_UTF8; 90 } 91 } else { 92 *out = c; 93 } 94 95 *pp = p; 96 97 return 0; 98} 99 100/** 101 * Convert an UTF-8 string to an UCS4 string. 102 * 103 * @param in an UTF-8 string to convert. 104 * @param out the resulting UCS4 strint, must be at least 105 * wind_utf8ucs4_length() long. If out is NULL, the function will 106 * calculate the needed space for the out variable (just like 107 * wind_utf8ucs4_length()). 108 * @param out_len before processing out_len should be the length of 109 * the out variable, after processing it will be the length of the out 110 * string. 111 * 112 * @return returns 0 on success, an wind error code otherwise 113 * @ingroup wind 114 */ 115 116int 117wind_utf8ucs4(const char *in, uint32_t *out, size_t *out_len) 118{ 119 const unsigned char *p; 120 size_t o = 0; 121 int ret; 122 123 for (p = (const unsigned char *)in; *p != '\0'; ++p) { 124 uint32_t u; 125 126 ret = utf8toutf32(&p, &u); 127 if (ret) 128 return ret; 129 130 if (out) { 131 if (o >= *out_len) 132 return WIND_ERR_OVERRUN; 133 out[o] = u; 134 } 135 o++; 136 } 137 *out_len = o; 138 return 0; 139} 140 141/** 142 * Calculate the length of from converting a UTF-8 string to a UCS4 143 * string. 144 * 145 * @param in an UTF-8 string to convert. 146 * @param out_len the length of the resulting UCS4 string. 147 * 148 * @return returns 0 on success, an wind error code otherwise 149 * @ingroup wind 150 */ 151 152int 153wind_utf8ucs4_length(const char *in, size_t *out_len) 154{ 155 return wind_utf8ucs4(in, NULL, out_len); 156} 157 158static const char first_char[4] = 159 { 0x00, 0xC0, 0xE0, 0xF0 }; 160 161/** 162 * Convert an UCS4 string to a UTF-8 string. 163 * 164 * @param in an UCS4 string to convert. 165 * @param in_len the length input array. 166 167 * @param out the resulting UTF-8 strint, must be at least 168 * wind_ucs4utf8_length() + 1 long (the extra char for the NUL). If 169 * out is NULL, the function will calculate the needed space for the 170 * out variable (just like wind_ucs4utf8_length()). 171 172 * @param out_len before processing out_len should be the length of 173 * the out variable, after processing it will be the length of the out 174 * string. 175 * 176 * @return returns 0 on success, an wind error code otherwise 177 * @ingroup wind 178 */ 179 180int 181wind_ucs4utf8(const uint32_t *in, size_t in_len, char *out, size_t *out_len) 182{ 183 uint32_t ch; 184 size_t i, len, o; 185 186 for (o = 0, i = 0; i < in_len; i++) { 187 ch = in[i]; 188 189 if (ch < 0x80) { 190 len = 1; 191 } else if (ch < 0x800) { 192 len = 2; 193 } else if (ch < 0x10000) { 194 len = 3; 195 } else if (ch <= 0x10FFFF) { 196 len = 4; 197 } else 198 return WIND_ERR_INVALID_UTF32; 199 200 o += len; 201 202 if (out) { 203 if (o >= *out_len) 204 return WIND_ERR_OVERRUN; 205 206 switch(len) { 207 case 4: 208 out[3] = (ch | 0x80) & 0xbf; 209 ch = ch >> 6; 210 /* FALLTHROUGH */ 211 case 3: 212 out[2] = (ch | 0x80) & 0xbf; 213 ch = ch >> 6; 214 /* FALLTHROUGH */ 215 case 2: 216 out[1] = (ch | 0x80) & 0xbf; 217 ch = ch >> 6; 218 /* FALLTHROUGH */ 219 case 1: 220 out[0] = ch | first_char[len - 1]; 221 /* FALLTHROUGH */ 222 } 223 } 224 out += len; 225 } 226 if (out) { 227 if (o + 1 >= *out_len) 228 return WIND_ERR_OVERRUN; 229 *out = '\0'; 230 } 231 *out_len = o; 232 return 0; 233} 234 235/** 236 * Calculate the length of from converting a UCS4 string to an UTF-8 string. 237 * 238 * @param in an UCS4 string to convert. 239 * @param in_len the length of UCS4 string to convert. 240 * @param out_len the length of the resulting UTF-8 string. 241 * 242 * @return returns 0 on success, an wind error code otherwise 243 * @ingroup wind 244 */ 245 246int 247wind_ucs4utf8_length(const uint32_t *in, size_t in_len, size_t *out_len) 248{ 249 return wind_ucs4utf8(in, in_len, NULL, out_len); 250} 251 252/** 253 * Read in an UCS2 from a buffer. 254 * 255 * @param ptr The input buffer to read from. 256 * @param len the length of the input buffer. 257 * @param flags Flags to control the behavior of the function. 258 * @param out the output UCS2, the array must be at least out/2 long. 259 * @param out_len the output length 260 * 261 * @return returns 0 on success, an wind error code otherwise. 262 * @ingroup wind 263 */ 264 265int 266wind_ucs2read(const void *ptr, size_t len, unsigned int *flags, 267 uint16_t *out, size_t *out_len) 268{ 269 const unsigned char *p = ptr; 270 int little = ((*flags) & WIND_RW_LE); 271 size_t olen = *out_len; 272 273 /** if len is zero, flags are unchanged */ 274 if (len == 0) { 275 *out_len = 0; 276 return 0; 277 } 278 279 /** if len is odd, WIND_ERR_LENGTH_NOT_MOD2 is returned */ 280 if (len & 1) 281 return WIND_ERR_LENGTH_NOT_MOD2; 282 283 /** 284 * If the flags WIND_RW_BOM is set, check for BOM. If not BOM is 285 * found, check is LE/BE flag is already and use that otherwise 286 * fail with WIND_ERR_NO_BOM. When done, clear WIND_RW_BOM and 287 * the LE/BE flag and set the resulting LE/BE flag. 288 */ 289 if ((*flags) & WIND_RW_BOM) { 290 uint16_t bom = (p[0] << 8) + p[1]; 291 if (bom == 0xfffe || bom == 0xfeff) { 292 little = (bom == 0xfffe); 293 p += 2; 294 len -= 2; 295 } else if (((*flags) & (WIND_RW_LE|WIND_RW_BE)) != 0) { 296 /* little already set */ 297 } else 298 return WIND_ERR_NO_BOM; 299 *flags = ((*flags) & ~(WIND_RW_BOM|WIND_RW_LE|WIND_RW_BE)); 300 *flags |= little ? WIND_RW_LE : WIND_RW_BE; 301 } 302 303 while (len) { 304 if (olen < 1) 305 return WIND_ERR_OVERRUN; 306 if (little) 307 *out = (p[1] << 8) + p[0]; 308 else 309 *out = (p[0] << 8) + p[1]; 310 out++; p += 2; len -= 2; olen--; 311 } 312 *out_len -= olen; 313 return 0; 314} 315 316/** 317 * Write an UCS2 string to a buffer. 318 * 319 * @param in The input UCS2 string. 320 * @param in_len the length of the input buffer. 321 * @param flags Flags to control the behavior of the function. 322 * @param ptr The input buffer to write to, the array must be at least 323 * (in + 1) * 2 bytes long. 324 * @param out_len the output length 325 * 326 * @return returns 0 on success, an wind error code otherwise. 327 * @ingroup wind 328 */ 329 330int 331wind_ucs2write(const uint16_t *in, size_t in_len, unsigned int *flags, 332 void *ptr, size_t *out_len) 333{ 334 unsigned char *p = ptr; 335 size_t len = *out_len; 336 337 /** If in buffer is not of length be mod 2, WIND_ERR_LENGTH_NOT_MOD2 is returned*/ 338 if (len & 1) 339 return WIND_ERR_LENGTH_NOT_MOD2; 340 341 /** On zero input length, flags are preserved */ 342 if (in_len == 0) { 343 *out_len = 0; 344 return 0; 345 } 346 /** If flags have WIND_RW_BOM set, the byte order mark is written 347 * first to the output data */ 348 if ((*flags) & WIND_RW_BOM) { 349 uint16_t bom = 0xfffe; 350 351 if (len < 2) 352 return WIND_ERR_OVERRUN; 353 354 if ((*flags) & WIND_RW_LE) { 355 p[0] = (bom ) & 0xff; 356 p[1] = (bom >> 8) & 0xff; 357 } else { 358 p[1] = (bom ) & 0xff; 359 p[0] = (bom >> 8) & 0xff; 360 } 361 len -= 2; 362 } 363 364 while (in_len) { 365 /** If the output wont fit into out_len, WIND_ERR_OVERRUN is returned */ 366 if (len < 2) 367 return WIND_ERR_OVERRUN; 368 if ((*flags) & WIND_RW_LE) { 369 p[0] = (in[0] ) & 0xff; 370 p[1] = (in[0] >> 8) & 0xff; 371 } else { 372 p[1] = (in[0] ) & 0xff; 373 p[0] = (in[0] >> 8) & 0xff; 374 } 375 len -= 2; 376 in_len--; 377 p += 2; 378 in++; 379 } 380 *out_len -= len; 381 return 0; 382} 383 384 385/** 386 * Convert an UTF-8 string to an UCS2 string. 387 * 388 * @param in an UTF-8 string to convert. 389 * @param out the resulting UCS2 strint, must be at least 390 * wind_utf8ucs2_length() long. If out is NULL, the function will 391 * calculate the needed space for the out variable (just like 392 * wind_utf8ucs2_length()). 393 * @param out_len before processing out_len should be the length of 394 * the out variable, after processing it will be the length of the out 395 * string. 396 * 397 * @return returns 0 on success, an wind error code otherwise 398 * @ingroup wind 399 */ 400 401int 402wind_utf8ucs2(const char *in, uint16_t *out, size_t *out_len) 403{ 404 const unsigned char *p; 405 size_t o = 0; 406 int ret; 407 408 for (p = (const unsigned char *)in; *p != '\0'; ++p) { 409 uint32_t u; 410 411 ret = utf8toutf32(&p, &u); 412 if (ret) 413 return ret; 414 415 if (u & 0xffff0000) 416 return WIND_ERR_NOT_UTF16; 417 418 if (out) { 419 if (o >= *out_len) 420 return WIND_ERR_OVERRUN; 421 out[o] = u; 422 } 423 o++; 424 } 425 *out_len = o; 426 return 0; 427} 428 429/** 430 * Calculate the length of from converting a UTF-8 string to a UCS2 431 * string. 432 * 433 * @param in an UTF-8 string to convert. 434 * @param out_len the length of the resulting UCS4 string. 435 * 436 * @return returns 0 on success, an wind error code otherwise 437 * @ingroup wind 438 */ 439 440int 441wind_utf8ucs2_length(const char *in, size_t *out_len) 442{ 443 return wind_utf8ucs2(in, NULL, out_len); 444} 445 446/** 447 * Convert an UCS2 string to a UTF-8 string. 448 * 449 * @param in an UCS2 string to convert. 450 * @param in_len the length of the in UCS2 string. 451 * @param out the resulting UTF-8 strint, must be at least 452 * wind_ucs2utf8_length() long. If out is NULL, the function will 453 * calculate the needed space for the out variable (just like 454 * wind_ucs2utf8_length()). 455 * @param out_len before processing out_len should be the length of 456 * the out variable, after processing it will be the length of the out 457 * string. 458 * 459 * @return returns 0 on success, an wind error code otherwise 460 * @ingroup wind 461 */ 462 463int 464wind_ucs2utf8(const uint16_t *in, size_t in_len, char *out, size_t *out_len) 465{ 466 uint16_t ch; 467 size_t i, len, o; 468 469 for (o = 0, i = 0; i < in_len; i++) { 470 ch = in[i]; 471 472 if (ch < 0x80) { 473 len = 1; 474 } else if (ch < 0x800) { 475 len = 2; 476 } else 477 len = 3; 478 479 o += len; 480 481 if (out) { 482 if (o >= *out_len) 483 return WIND_ERR_OVERRUN; 484 485 switch(len) { 486 case 3: 487 out[2] = (ch | 0x80) & 0xbf; 488 ch = ch >> 6; 489 /* FALLTHROUGH */ 490 case 2: 491 out[1] = (ch | 0x80) & 0xbf; 492 ch = ch >> 6; 493 /* FALLTHROUGH */ 494 case 1: 495 out[0] = ch | first_char[len - 1]; 496 /* FALLTHROUGH */ 497 } 498 out += len; 499 } 500 } 501 if (out) { 502 if (o >= *out_len) 503 return WIND_ERR_OVERRUN; 504 *out = '\0'; 505 } 506 *out_len = o; 507 return 0; 508} 509 510/** 511 * Calculate the length of from converting a UCS2 string to an UTF-8 string. 512 * 513 * @param in an UCS2 string to convert. 514 * @param in_len an UCS2 string length to convert. 515 * @param out_len the length of the resulting UTF-8 string. 516 * 517 * @return returns 0 on success, an wind error code otherwise 518 * @ingroup wind 519 */ 520 521int 522wind_ucs2utf8_length(const uint16_t *in, size_t in_len, size_t *out_len) 523{ 524 return wind_ucs2utf8(in, in_len, NULL, out_len); 525} 526