1226031Sstas/* 2226031Sstas * Copyright (c) 2004, 2006, 2007, 2008 Kungliga Tekniska H��gskolan 3226031Sstas * (Royal Institute of Technology, Stockholm, Sweden). 4226031Sstas * All rights reserved. 5226031Sstas * 6226031Sstas * Redistribution and use in source and binary forms, with or without 7226031Sstas * modification, are permitted provided that the following conditions 8226031Sstas * are met: 9226031Sstas * 10226031Sstas * 1. Redistributions of source code must retain the above copyright 11226031Sstas * notice, this list of conditions and the following disclaimer. 12226031Sstas * 13226031Sstas * 2. Redistributions in binary form must reproduce the above copyright 14226031Sstas * notice, this list of conditions and the following disclaimer in the 15226031Sstas * documentation and/or other materials provided with the distribution. 16226031Sstas * 17226031Sstas * 3. Neither the name of the Institute nor the names of its contributors 18226031Sstas * may be used to endorse or promote products derived from this software 19226031Sstas * without specific prior written permission. 20226031Sstas * 21226031Sstas * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND 22226031Sstas * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23226031Sstas * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24226031Sstas * ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE 25226031Sstas * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26226031Sstas * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27226031Sstas * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28226031Sstas * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29226031Sstas * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30226031Sstas * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31226031Sstas * SUCH DAMAGE. 32226031Sstas */ 33226031Sstas 34226031Sstas#include <config.h> 35226031Sstas#include "windlocl.h" 36226031Sstas 37226031Sstasstatic int 38226031Sstasutf8toutf32(const unsigned char **pp, uint32_t *out) 39226031Sstas{ 40226031Sstas const unsigned char *p = *pp; 41226031Sstas unsigned c = *p; 42226031Sstas 43226031Sstas if (c & 0x80) { 44226031Sstas if ((c & 0xE0) == 0xC0) { 45226031Sstas const unsigned c2 = *++p; 46226031Sstas if ((c2 & 0xC0) == 0x80) { 47226031Sstas *out = ((c & 0x1F) << 6) 48226031Sstas | (c2 & 0x3F); 49226031Sstas } else { 50226031Sstas return WIND_ERR_INVALID_UTF8; 51226031Sstas } 52226031Sstas } else if ((c & 0xF0) == 0xE0) { 53226031Sstas const unsigned c2 = *++p; 54226031Sstas if ((c2 & 0xC0) == 0x80) { 55226031Sstas const unsigned c3 = *++p; 56226031Sstas if ((c3 & 0xC0) == 0x80) { 57226031Sstas *out = ((c & 0x0F) << 12) 58226031Sstas | ((c2 & 0x3F) << 6) 59226031Sstas | (c3 & 0x3F); 60226031Sstas } else { 61226031Sstas return WIND_ERR_INVALID_UTF8; 62226031Sstas } 63226031Sstas } else { 64226031Sstas return WIND_ERR_INVALID_UTF8; 65226031Sstas } 66226031Sstas } else if ((c & 0xF8) == 0xF0) { 67226031Sstas const unsigned c2 = *++p; 68226031Sstas if ((c2 & 0xC0) == 0x80) { 69226031Sstas const unsigned c3 = *++p; 70226031Sstas if ((c3 & 0xC0) == 0x80) { 71226031Sstas const unsigned c4 = *++p; 72226031Sstas if ((c4 & 0xC0) == 0x80) { 73226031Sstas *out = ((c & 0x07) << 18) 74226031Sstas | ((c2 & 0x3F) << 12) 75226031Sstas | ((c3 & 0x3F) << 6) 76226031Sstas | (c4 & 0x3F); 77226031Sstas } else { 78226031Sstas return WIND_ERR_INVALID_UTF8; 79226031Sstas } 80226031Sstas } else { 81226031Sstas return WIND_ERR_INVALID_UTF8; 82226031Sstas } 83226031Sstas } else { 84226031Sstas return WIND_ERR_INVALID_UTF8; 85226031Sstas } 86226031Sstas } else { 87226031Sstas return WIND_ERR_INVALID_UTF8; 88226031Sstas } 89226031Sstas } else { 90226031Sstas *out = c; 91226031Sstas } 92226031Sstas 93226031Sstas *pp = p; 94226031Sstas 95226031Sstas return 0; 96226031Sstas} 97226031Sstas 98226031Sstas/** 99226031Sstas * Convert an UTF-8 string to an UCS4 string. 100226031Sstas * 101226031Sstas * @param in an UTF-8 string to convert. 102226031Sstas * @param out the resulting UCS4 strint, must be at least 103226031Sstas * wind_utf8ucs4_length() long. If out is NULL, the function will 104226031Sstas * calculate the needed space for the out variable (just like 105226031Sstas * wind_utf8ucs4_length()). 106226031Sstas * @param out_len before processing out_len should be the length of 107226031Sstas * the out variable, after processing it will be the length of the out 108226031Sstas * string. 109226031Sstas * 110226031Sstas * @return returns 0 on success, an wind error code otherwise 111226031Sstas * @ingroup wind 112226031Sstas */ 113226031Sstas 114226031Sstasint 115226031Sstaswind_utf8ucs4(const char *in, uint32_t *out, size_t *out_len) 116226031Sstas{ 117226031Sstas const unsigned char *p; 118226031Sstas size_t o = 0; 119226031Sstas int ret; 120226031Sstas 121226031Sstas for (p = (const unsigned char *)in; *p != '\0'; ++p) { 122226031Sstas uint32_t u; 123226031Sstas 124226031Sstas ret = utf8toutf32(&p, &u); 125226031Sstas if (ret) 126226031Sstas return ret; 127226031Sstas 128226031Sstas if (out) { 129226031Sstas if (o >= *out_len) 130226031Sstas return WIND_ERR_OVERRUN; 131226031Sstas out[o] = u; 132226031Sstas } 133226031Sstas o++; 134226031Sstas } 135226031Sstas *out_len = o; 136226031Sstas return 0; 137226031Sstas} 138226031Sstas 139226031Sstas/** 140226031Sstas * Calculate the length of from converting a UTF-8 string to a UCS4 141226031Sstas * string. 142226031Sstas * 143226031Sstas * @param in an UTF-8 string to convert. 144226031Sstas * @param out_len the length of the resulting UCS4 string. 145226031Sstas * 146226031Sstas * @return returns 0 on success, an wind error code otherwise 147226031Sstas * @ingroup wind 148226031Sstas */ 149226031Sstas 150226031Sstasint 151226031Sstaswind_utf8ucs4_length(const char *in, size_t *out_len) 152226031Sstas{ 153226031Sstas return wind_utf8ucs4(in, NULL, out_len); 154226031Sstas} 155226031Sstas 156226031Sstasstatic const char first_char[4] = 157226031Sstas { 0x00, 0xC0, 0xE0, 0xF0 }; 158226031Sstas 159226031Sstas/** 160226031Sstas * Convert an UCS4 string to a UTF-8 string. 161226031Sstas * 162226031Sstas * @param in an UCS4 string to convert. 163226031Sstas * @param in_len the length input array. 164226031Sstas 165226031Sstas * @param out the resulting UTF-8 strint, must be at least 166226031Sstas * wind_ucs4utf8_length() + 1 long (the extra char for the NUL). If 167226031Sstas * out is NULL, the function will calculate the needed space for the 168226031Sstas * out variable (just like wind_ucs4utf8_length()). 169226031Sstas 170226031Sstas * @param out_len before processing out_len should be the length of 171226031Sstas * the out variable, after processing it will be the length of the out 172226031Sstas * string. 173226031Sstas * 174226031Sstas * @return returns 0 on success, an wind error code otherwise 175226031Sstas * @ingroup wind 176226031Sstas */ 177226031Sstas 178226031Sstasint 179226031Sstaswind_ucs4utf8(const uint32_t *in, size_t in_len, char *out, size_t *out_len) 180226031Sstas{ 181226031Sstas uint32_t ch; 182226031Sstas size_t i, len, o; 183226031Sstas 184226031Sstas for (o = 0, i = 0; i < in_len; i++) { 185226031Sstas ch = in[i]; 186226031Sstas 187226031Sstas if (ch < 0x80) { 188226031Sstas len = 1; 189226031Sstas } else if (ch < 0x800) { 190226031Sstas len = 2; 191226031Sstas } else if (ch < 0x10000) { 192226031Sstas len = 3; 193226031Sstas } else if (ch <= 0x10FFFF) { 194226031Sstas len = 4; 195226031Sstas } else 196226031Sstas return WIND_ERR_INVALID_UTF32; 197226031Sstas 198226031Sstas o += len; 199226031Sstas 200226031Sstas if (out) { 201226031Sstas if (o >= *out_len) 202226031Sstas return WIND_ERR_OVERRUN; 203226031Sstas 204226031Sstas switch(len) { 205226031Sstas case 4: 206226031Sstas out[3] = (ch | 0x80) & 0xbf; 207234027Sstas ch = ch >> 6; 208226031Sstas case 3: 209226031Sstas out[2] = (ch | 0x80) & 0xbf; 210234027Sstas ch = ch >> 6; 211226031Sstas case 2: 212226031Sstas out[1] = (ch | 0x80) & 0xbf; 213234027Sstas ch = ch >> 6; 214226031Sstas case 1: 215226031Sstas out[0] = ch | first_char[len - 1]; 216226031Sstas } 217226031Sstas } 218226031Sstas out += len; 219226031Sstas } 220226031Sstas if (out) { 221226031Sstas if (o + 1 >= *out_len) 222226031Sstas return WIND_ERR_OVERRUN; 223226031Sstas *out = '\0'; 224226031Sstas } 225226031Sstas *out_len = o; 226226031Sstas return 0; 227226031Sstas} 228226031Sstas 229226031Sstas/** 230226031Sstas * Calculate the length of from converting a UCS4 string to an UTF-8 string. 231226031Sstas * 232226031Sstas * @param in an UCS4 string to convert. 233226031Sstas * @param in_len the length of UCS4 string to convert. 234226031Sstas * @param out_len the length of the resulting UTF-8 string. 235226031Sstas * 236226031Sstas * @return returns 0 on success, an wind error code otherwise 237226031Sstas * @ingroup wind 238226031Sstas */ 239226031Sstas 240226031Sstasint 241226031Sstaswind_ucs4utf8_length(const uint32_t *in, size_t in_len, size_t *out_len) 242226031Sstas{ 243226031Sstas return wind_ucs4utf8(in, in_len, NULL, out_len); 244226031Sstas} 245226031Sstas 246226031Sstas/** 247226031Sstas * Read in an UCS2 from a buffer. 248226031Sstas * 249226031Sstas * @param ptr The input buffer to read from. 250226031Sstas * @param len the length of the input buffer. 251226031Sstas * @param flags Flags to control the behavior of the function. 252226031Sstas * @param out the output UCS2, the array must be at least out/2 long. 253226031Sstas * @param out_len the output length 254226031Sstas * 255226031Sstas * @return returns 0 on success, an wind error code otherwise. 256226031Sstas * @ingroup wind 257226031Sstas */ 258226031Sstas 259226031Sstasint 260226031Sstaswind_ucs2read(const void *ptr, size_t len, unsigned int *flags, 261226031Sstas uint16_t *out, size_t *out_len) 262226031Sstas{ 263226031Sstas const unsigned char *p = ptr; 264226031Sstas int little = ((*flags) & WIND_RW_LE); 265226031Sstas size_t olen = *out_len; 266226031Sstas 267226031Sstas /** if len is zero, flags are unchanged */ 268226031Sstas if (len == 0) { 269226031Sstas *out_len = 0; 270226031Sstas return 0; 271226031Sstas } 272226031Sstas 273226031Sstas /** if len is odd, WIND_ERR_LENGTH_NOT_MOD2 is returned */ 274226031Sstas if (len & 1) 275226031Sstas return WIND_ERR_LENGTH_NOT_MOD2; 276226031Sstas 277226031Sstas /** 278226031Sstas * If the flags WIND_RW_BOM is set, check for BOM. If not BOM is 279226031Sstas * found, check is LE/BE flag is already and use that otherwise 280226031Sstas * fail with WIND_ERR_NO_BOM. When done, clear WIND_RW_BOM and 281226031Sstas * the LE/BE flag and set the resulting LE/BE flag. 282226031Sstas */ 283226031Sstas if ((*flags) & WIND_RW_BOM) { 284226031Sstas uint16_t bom = (p[0] << 8) + p[1]; 285226031Sstas if (bom == 0xfffe || bom == 0xfeff) { 286226031Sstas little = (bom == 0xfffe); 287226031Sstas p += 2; 288226031Sstas len -= 2; 289226031Sstas } else if (((*flags) & (WIND_RW_LE|WIND_RW_BE)) != 0) { 290226031Sstas /* little already set */ 291226031Sstas } else 292226031Sstas return WIND_ERR_NO_BOM; 293226031Sstas *flags = ((*flags) & ~(WIND_RW_BOM|WIND_RW_LE|WIND_RW_BE)); 294226031Sstas *flags |= little ? WIND_RW_LE : WIND_RW_BE; 295226031Sstas } 296226031Sstas 297226031Sstas while (len) { 298226031Sstas if (olen < 1) 299226031Sstas return WIND_ERR_OVERRUN; 300226031Sstas if (little) 301226031Sstas *out = (p[1] << 8) + p[0]; 302226031Sstas else 303226031Sstas *out = (p[0] << 8) + p[1]; 304226031Sstas out++; p += 2; len -= 2; olen--; 305226031Sstas } 306226031Sstas *out_len -= olen; 307226031Sstas return 0; 308226031Sstas} 309226031Sstas 310226031Sstas/** 311226031Sstas * Write an UCS2 string to a buffer. 312226031Sstas * 313226031Sstas * @param in The input UCS2 string. 314226031Sstas * @param in_len the length of the input buffer. 315226031Sstas * @param flags Flags to control the behavior of the function. 316226031Sstas * @param ptr The input buffer to write to, the array must be at least 317226031Sstas * (in + 1) * 2 bytes long. 318226031Sstas * @param out_len the output length 319226031Sstas * 320226031Sstas * @return returns 0 on success, an wind error code otherwise. 321226031Sstas * @ingroup wind 322226031Sstas */ 323226031Sstas 324226031Sstasint 325226031Sstaswind_ucs2write(const uint16_t *in, size_t in_len, unsigned int *flags, 326226031Sstas void *ptr, size_t *out_len) 327226031Sstas{ 328226031Sstas unsigned char *p = ptr; 329226031Sstas size_t len = *out_len; 330226031Sstas 331226031Sstas /** If in buffer is not of length be mod 2, WIND_ERR_LENGTH_NOT_MOD2 is returned*/ 332226031Sstas if (len & 1) 333226031Sstas return WIND_ERR_LENGTH_NOT_MOD2; 334226031Sstas 335226031Sstas /** On zero input length, flags are preserved */ 336226031Sstas if (in_len == 0) { 337226031Sstas *out_len = 0; 338226031Sstas return 0; 339226031Sstas } 340226031Sstas /** If flags have WIND_RW_BOM set, the byte order mark is written 341226031Sstas * first to the output data */ 342226031Sstas if ((*flags) & WIND_RW_BOM) { 343226031Sstas uint16_t bom = 0xfffe; 344226031Sstas 345226031Sstas if (len < 2) 346226031Sstas return WIND_ERR_OVERRUN; 347226031Sstas 348226031Sstas if ((*flags) & WIND_RW_LE) { 349234027Sstas p[0] = (bom ) & 0xff; 350234027Sstas p[1] = (bom >> 8) & 0xff; 351226031Sstas } else { 352226031Sstas p[1] = (bom ) & 0xff; 353226031Sstas p[0] = (bom >> 8) & 0xff; 354226031Sstas } 355226031Sstas len -= 2; 356226031Sstas } 357226031Sstas 358226031Sstas while (in_len) { 359226031Sstas /** If the output wont fit into out_len, WIND_ERR_OVERRUN is returned */ 360226031Sstas if (len < 2) 361226031Sstas return WIND_ERR_OVERRUN; 362226031Sstas if ((*flags) & WIND_RW_LE) { 363234027Sstas p[0] = (in[0] ) & 0xff; 364234027Sstas p[1] = (in[0] >> 8) & 0xff; 365226031Sstas } else { 366226031Sstas p[1] = (in[0] ) & 0xff; 367226031Sstas p[0] = (in[0] >> 8) & 0xff; 368226031Sstas } 369226031Sstas len -= 2; 370226031Sstas in_len--; 371226031Sstas p += 2; 372226031Sstas in++; 373226031Sstas } 374226031Sstas *out_len -= len; 375226031Sstas return 0; 376226031Sstas} 377226031Sstas 378226031Sstas 379226031Sstas/** 380226031Sstas * Convert an UTF-8 string to an UCS2 string. 381226031Sstas * 382226031Sstas * @param in an UTF-8 string to convert. 383226031Sstas * @param out the resulting UCS2 strint, must be at least 384226031Sstas * wind_utf8ucs2_length() long. If out is NULL, the function will 385226031Sstas * calculate the needed space for the out variable (just like 386226031Sstas * wind_utf8ucs2_length()). 387226031Sstas * @param out_len before processing out_len should be the length of 388226031Sstas * the out variable, after processing it will be the length of the out 389226031Sstas * string. 390226031Sstas * 391226031Sstas * @return returns 0 on success, an wind error code otherwise 392226031Sstas * @ingroup wind 393226031Sstas */ 394226031Sstas 395226031Sstasint 396226031Sstaswind_utf8ucs2(const char *in, uint16_t *out, size_t *out_len) 397226031Sstas{ 398226031Sstas const unsigned char *p; 399226031Sstas size_t o = 0; 400226031Sstas int ret; 401226031Sstas 402226031Sstas for (p = (const unsigned char *)in; *p != '\0'; ++p) { 403226031Sstas uint32_t u; 404226031Sstas 405226031Sstas ret = utf8toutf32(&p, &u); 406226031Sstas if (ret) 407226031Sstas return ret; 408226031Sstas 409226031Sstas if (u & 0xffff0000) 410226031Sstas return WIND_ERR_NOT_UTF16; 411226031Sstas 412226031Sstas if (out) { 413226031Sstas if (o >= *out_len) 414226031Sstas return WIND_ERR_OVERRUN; 415226031Sstas out[o] = u; 416226031Sstas } 417226031Sstas o++; 418226031Sstas } 419226031Sstas *out_len = o; 420226031Sstas return 0; 421226031Sstas} 422226031Sstas 423226031Sstas/** 424226031Sstas * Calculate the length of from converting a UTF-8 string to a UCS2 425226031Sstas * string. 426226031Sstas * 427226031Sstas * @param in an UTF-8 string to convert. 428226031Sstas * @param out_len the length of the resulting UCS4 string. 429226031Sstas * 430226031Sstas * @return returns 0 on success, an wind error code otherwise 431226031Sstas * @ingroup wind 432226031Sstas */ 433226031Sstas 434226031Sstasint 435226031Sstaswind_utf8ucs2_length(const char *in, size_t *out_len) 436226031Sstas{ 437226031Sstas return wind_utf8ucs2(in, NULL, out_len); 438226031Sstas} 439226031Sstas 440226031Sstas/** 441226031Sstas * Convert an UCS2 string to a UTF-8 string. 442226031Sstas * 443226031Sstas * @param in an UCS2 string to convert. 444226031Sstas * @param in_len the length of the in UCS2 string. 445226031Sstas * @param out the resulting UTF-8 strint, must be at least 446226031Sstas * wind_ucs2utf8_length() long. If out is NULL, the function will 447226031Sstas * calculate the needed space for the out variable (just like 448226031Sstas * wind_ucs2utf8_length()). 449226031Sstas * @param out_len before processing out_len should be the length of 450226031Sstas * the out variable, after processing it will be the length of the out 451226031Sstas * string. 452226031Sstas * 453226031Sstas * @return returns 0 on success, an wind error code otherwise 454226031Sstas * @ingroup wind 455226031Sstas */ 456226031Sstas 457226031Sstasint 458226031Sstaswind_ucs2utf8(const uint16_t *in, size_t in_len, char *out, size_t *out_len) 459226031Sstas{ 460226031Sstas uint16_t ch; 461226031Sstas size_t i, len, o; 462226031Sstas 463226031Sstas for (o = 0, i = 0; i < in_len; i++) { 464226031Sstas ch = in[i]; 465226031Sstas 466226031Sstas if (ch < 0x80) { 467226031Sstas len = 1; 468226031Sstas } else if (ch < 0x800) { 469226031Sstas len = 2; 470226031Sstas } else 471226031Sstas len = 3; 472226031Sstas 473226031Sstas o += len; 474226031Sstas 475226031Sstas if (out) { 476226031Sstas if (o >= *out_len) 477226031Sstas return WIND_ERR_OVERRUN; 478226031Sstas 479226031Sstas switch(len) { 480226031Sstas case 3: 481226031Sstas out[2] = (ch | 0x80) & 0xbf; 482234027Sstas ch = ch >> 6; 483226031Sstas case 2: 484226031Sstas out[1] = (ch | 0x80) & 0xbf; 485234027Sstas ch = ch >> 6; 486226031Sstas case 1: 487226031Sstas out[0] = ch | first_char[len - 1]; 488226031Sstas } 489226031Sstas out += len; 490226031Sstas } 491226031Sstas } 492226031Sstas if (out) { 493226031Sstas if (o >= *out_len) 494226031Sstas return WIND_ERR_OVERRUN; 495226031Sstas *out = '\0'; 496226031Sstas } 497226031Sstas *out_len = o; 498226031Sstas return 0; 499226031Sstas} 500226031Sstas 501226031Sstas/** 502226031Sstas * Calculate the length of from converting a UCS2 string to an UTF-8 string. 503226031Sstas * 504226031Sstas * @param in an UCS2 string to convert. 505226031Sstas * @param in_len an UCS2 string length to convert. 506226031Sstas * @param out_len the length of the resulting UTF-8 string. 507226031Sstas * 508226031Sstas * @return returns 0 on success, an wind error code otherwise 509226031Sstas * @ingroup wind 510226031Sstas */ 511226031Sstas 512226031Sstasint 513226031Sstaswind_ucs2utf8_length(const uint16_t *in, size_t in_len, size_t *out_len) 514226031Sstas{ 515226031Sstas return wind_ucs2utf8(in, in_len, NULL, out_len); 516226031Sstas} 517