1289177Speter/* 2289177Speter * Copyright (c) 2009 Public Software Group e. V., Berlin, Germany 3289177Speter * 4289177Speter * Permission is hereby granted, free of charge, to any person obtaining a 5289177Speter * copy of this software and associated documentation files (the "Software"), 6289177Speter * to deal in the Software without restriction, including without limitation 7289177Speter * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8289177Speter * and/or sell copies of the Software, and to permit persons to whom the 9289177Speter * Software is furnished to do so, subject to the following conditions: 10289177Speter * 11289177Speter * The above copyright notice and this permission notice shall be included in 12289177Speter * all copies or substantial portions of the Software. 13289177Speter * 14289177Speter * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15289177Speter * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16289177Speter * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17289177Speter * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18289177Speter * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19289177Speter * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20289177Speter * DEALINGS IN THE SOFTWARE. 21289177Speter */ 22289177Speter 23289177Speter/* 24289177Speter * This library contains derived data from a modified version of the 25289177Speter * Unicode data files. 26289177Speter * 27289177Speter * The original data files are available at 28289177Speter * http://www.unicode.org/Public/UNIDATA/ 29289177Speter * 30289177Speter * Please notice the copyright statement in the file "utf8proc_data.c". 31289177Speter */ 32289177Speter 33289177Speter 34289177Speter/* 35289177Speter * File name: utf8proc.c 36289177Speter * 37289177Speter * Description: 38289177Speter * Implementation of libutf8proc. 39289177Speter */ 40289177Speter 41289177Speter 42289177Speter#include "utf8proc.h" 43289177Speter#include "utf8proc_data.c" 44289177Speter 45289177Speter 46289177SpeterUTF8PROC_DATA 47289177Speterconst int8_t utf8proc_utf8class[256] = { 48289177Speter 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 49289177Speter 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 50289177Speter 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 51289177Speter 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 52289177Speter 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 53289177Speter 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 54289177Speter 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 55289177Speter 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 56289177Speter 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 57289177Speter 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58289177Speter 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59289177Speter 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60289177Speter 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 61289177Speter 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 62289177Speter 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 63289177Speter 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 }; 64289177Speter 65289177Speter#define UTF8PROC_HANGUL_SBASE 0xAC00 66289177Speter#define UTF8PROC_HANGUL_LBASE 0x1100 67289177Speter#define UTF8PROC_HANGUL_VBASE 0x1161 68289177Speter#define UTF8PROC_HANGUL_TBASE 0x11A7 69289177Speter#define UTF8PROC_HANGUL_LCOUNT 19 70289177Speter#define UTF8PROC_HANGUL_VCOUNT 21 71289177Speter#define UTF8PROC_HANGUL_TCOUNT 28 72289177Speter#define UTF8PROC_HANGUL_NCOUNT 588 73289177Speter#define UTF8PROC_HANGUL_SCOUNT 11172 74289177Speter/* END is exclusive */ 75289177Speter#define UTF8PROC_HANGUL_L_START 0x1100 76289177Speter#define UTF8PROC_HANGUL_L_END 0x115A 77289177Speter#define UTF8PROC_HANGUL_L_FILLER 0x115F 78289177Speter#define UTF8PROC_HANGUL_V_START 0x1160 79289177Speter#define UTF8PROC_HANGUL_V_END 0x11A3 80289177Speter#define UTF8PROC_HANGUL_T_START 0x11A8 81289177Speter#define UTF8PROC_HANGUL_T_END 0x11FA 82289177Speter#define UTF8PROC_HANGUL_S_START 0xAC00 83289177Speter#define UTF8PROC_HANGUL_S_END 0xD7A4 84289177Speter 85289177Speter 86289177Speter#define UTF8PROC_BOUNDCLASS_START 0 87289177Speter#define UTF8PROC_BOUNDCLASS_OTHER 1 88289177Speter#define UTF8PROC_BOUNDCLASS_CR 2 89289177Speter#define UTF8PROC_BOUNDCLASS_LF 3 90289177Speter#define UTF8PROC_BOUNDCLASS_CONTROL 4 91289177Speter#define UTF8PROC_BOUNDCLASS_EXTEND 5 92289177Speter#define UTF8PROC_BOUNDCLASS_L 6 93289177Speter#define UTF8PROC_BOUNDCLASS_V 7 94289177Speter#define UTF8PROC_BOUNDCLASS_T 8 95289177Speter#define UTF8PROC_BOUNDCLASS_LV 9 96289177Speter#define UTF8PROC_BOUNDCLASS_LVT 10 97289177Speter 98289177Speter 99289177SpeterUTF8PROC_API 100289177Speterconst char *utf8proc_version(void) { 101289177Speter return "1.1.5"; 102289177Speter} 103289177Speter 104289177Speter/* 105289177Speter * This macro tells translators that string X should be translated, 106289177Speter * but does not look up the translation at run time. This is standard 107289177Speter * GNU gettext notation for annotating compile-time constant strings. 108289177Speter */ 109289177Speter#ifndef N_ 110289177Speter#define N_(x) x 111289177Speter#endif 112289177Speter 113289177SpeterUTF8PROC_API 114289177Speterconst char *utf8proc_errmsg(ssize_t errcode) { 115289177Speter switch (errcode) { 116289177Speter case UTF8PROC_ERROR_NOMEM: 117289177Speter return N_("Memory for processing UTF-8 data could not be allocated."); 118289177Speter case UTF8PROC_ERROR_OVERFLOW: 119289177Speter return N_("UTF-8 string is too long to be processed."); 120289177Speter case UTF8PROC_ERROR_INVALIDUTF8: 121289177Speter return N_("Invalid UTF-8 string"); 122289177Speter case UTF8PROC_ERROR_NOTASSIGNED: 123289177Speter return N_("Unassigned Unicode code point found in UTF-8 string."); 124289177Speter case UTF8PROC_ERROR_INVALIDOPTS: 125289177Speter return N_("Invalid options for UTF-8 processing chosen."); 126289177Speter default: 127289177Speter return N_("An unknown error occured while processing UTF-8 data."); 128289177Speter } 129289177Speter} 130289177Speter 131289177SpeterUTF8PROC_API 132289177Speterssize_t utf8proc_iterate( 133289177Speter const uint8_t *str, ssize_t strlen, int32_t *dst 134289177Speter) { 135289177Speter int length; 136289177Speter int i; 137289177Speter int32_t uc = -1; 138289177Speter *dst = -1; 139289177Speter if (!strlen) return 0; 140289177Speter length = utf8proc_utf8class[str[0]]; 141289177Speter if (!length) return UTF8PROC_ERROR_INVALIDUTF8; 142289177Speter if (strlen >= 0 && length > strlen) return UTF8PROC_ERROR_INVALIDUTF8; 143289177Speter for (i=1; i<length; i++) { 144289177Speter if ((str[i] & 0xC0) != 0x80) return UTF8PROC_ERROR_INVALIDUTF8; 145289177Speter } 146289177Speter switch (length) { 147289177Speter case 1: 148289177Speter uc = str[0]; 149289177Speter break; 150289177Speter case 2: 151289177Speter uc = ((str[0] & 0x1F) << 6) + (str[1] & 0x3F); 152289177Speter if (uc < 0x80) uc = -1; 153289177Speter break; 154289177Speter case 3: 155289177Speter uc = ((str[0] & 0x0F) << 12) + ((str[1] & 0x3F) << 6) 156289177Speter + (str[2] & 0x3F); 157289177Speter if (uc < 0x800 || (uc >= 0xD800 && uc < 0xE000) || 158289177Speter (uc >= 0xFDD0 && uc < 0xFDF0)) uc = -1; 159289177Speter break; 160289177Speter case 4: 161289177Speter uc = ((str[0] & 0x07) << 18) + ((str[1] & 0x3F) << 12) 162289177Speter + ((str[2] & 0x3F) << 6) + (str[3] & 0x3F); 163289177Speter if (uc < 0x10000 || uc >= 0x110000) uc = -1; 164289177Speter break; 165289177Speter } 166289177Speter if (uc < 0 || ((uc & 0xFFFF) >= 0xFFFE)) 167289177Speter return UTF8PROC_ERROR_INVALIDUTF8; 168289177Speter *dst = uc; 169289177Speter return length; 170289177Speter} 171289177Speter 172289177SpeterUTF8PROC_API 173289177Speterbool utf8proc_codepoint_valid(int32_t uc) { 174289177Speter if (uc < 0 || uc >= 0x110000 || 175289177Speter ((uc & 0xFFFF) >= 0xFFFE) || (uc >= 0xD800 && uc < 0xE000) || 176289177Speter (uc >= 0xFDD0 && uc < 0xFDF0)) return false; 177289177Speter else return true; 178289177Speter} 179289177Speter 180289177SpeterUTF8PROC_API 181289177Speterssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst) { 182289177Speter if (uc < 0x00) { 183289177Speter return 0; 184289177Speter } else if (uc < 0x80) { 185289177Speter dst[0] = (uint8_t)uc; 186289177Speter return 1; 187289177Speter } else if (uc < 0x800) { 188289177Speter dst[0] = 0xC0 + (uint8_t)(uc >> 6); 189289177Speter dst[1] = 0x80 + (uc & 0x3F); 190289177Speter return 2; 191289177Speter } else if (uc == 0xFFFF) { 192289177Speter dst[0] = 0xFF; 193289177Speter return 1; 194289177Speter } else if (uc == 0xFFFE) { 195289177Speter dst[0] = 0xFE; 196289177Speter return 1; 197289177Speter } else if (uc < 0x10000) { 198289177Speter dst[0] = 0xE0 + (uint8_t)(uc >> 12); 199289177Speter dst[1] = 0x80 + ((uc >> 6) & 0x3F); 200289177Speter dst[2] = 0x80 + (uc & 0x3F); 201289177Speter return 3; 202289177Speter } else if (uc < 0x110000) { 203289177Speter dst[0] = 0xF0 + (uint8_t)(uc >> 18); 204289177Speter dst[1] = 0x80 + ((uc >> 12) & 0x3F); 205289177Speter dst[2] = 0x80 + ((uc >> 6) & 0x3F); 206289177Speter dst[3] = 0x80 + (uc & 0x3F); 207289177Speter return 4; 208289177Speter } else return 0; 209289177Speter} 210289177Speter 211289177SpeterUTF8PROC_API 212289177Speterconst utf8proc_property_t *utf8proc_get_property(int32_t uc) { 213289177Speter /* ASSERT: uc >= 0 && uc < 0x110000 */ 214289177Speter return utf8proc_properties + ( 215289177Speter utf8proc_stage2table[ 216289177Speter utf8proc_stage1table[uc >> 8] + (uc & 0xFF) 217289177Speter ] 218289177Speter ); 219289177Speter} 220289177Speter 221289177Speter#define utf8proc_decompose_lump(replacement_uc) \ 222289177Speter return utf8proc_decompose_char((replacement_uc), dst, bufsize, \ 223289177Speter options & ~UTF8PROC_LUMP, last_boundclass) 224289177Speter 225289177SpeterUTF8PROC_API 226289177Speterssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize, 227289177Speter int options, int *last_boundclass) { 228289177Speter /* ASSERT: uc >= 0 && uc < 0x110000 */ 229289177Speter const utf8proc_property_t *property; 230289177Speter utf8proc_propval_t category; 231289177Speter int32_t hangul_sindex; 232289177Speter property = utf8proc_get_property(uc); 233289177Speter category = property->category; 234289177Speter hangul_sindex = uc - UTF8PROC_HANGUL_SBASE; 235289177Speter if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) { 236289177Speter if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) { 237289177Speter int32_t hangul_tindex; 238289177Speter if (bufsize >= 1) { 239289177Speter dst[0] = UTF8PROC_HANGUL_LBASE + 240289177Speter hangul_sindex / UTF8PROC_HANGUL_NCOUNT; 241289177Speter if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE + 242289177Speter (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT; 243289177Speter } 244289177Speter hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT; 245289177Speter if (!hangul_tindex) return 2; 246289177Speter if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex; 247289177Speter return 3; 248289177Speter } 249289177Speter } 250289177Speter if (options & UTF8PROC_REJECTNA) { 251289177Speter if (!category) return UTF8PROC_ERROR_NOTASSIGNED; 252289177Speter } 253289177Speter if (options & UTF8PROC_IGNORE) { 254289177Speter if (property->ignorable) return 0; 255289177Speter } 256289177Speter if (options & UTF8PROC_LUMP) { 257289177Speter if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020); 258289177Speter if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8) 259289177Speter utf8proc_decompose_lump(0x0027); 260289177Speter if (category == UTF8PROC_CATEGORY_PD || uc == 0x2212) 261289177Speter utf8proc_decompose_lump(0x002D); 262289177Speter if (uc == 0x2044 || uc == 0x2215) utf8proc_decompose_lump(0x002F); 263289177Speter if (uc == 0x2236) utf8proc_decompose_lump(0x003A); 264289177Speter if (uc == 0x2039 || uc == 0x2329 || uc == 0x3008) 265289177Speter utf8proc_decompose_lump(0x003C); 266289177Speter if (uc == 0x203A || uc == 0x232A || uc == 0x3009) 267289177Speter utf8proc_decompose_lump(0x003E); 268289177Speter if (uc == 0x2216) utf8proc_decompose_lump(0x005C); 269289177Speter if (uc == 0x02C4 || uc == 0x02C6 || uc == 0x2038 || uc == 0x2303) 270289177Speter utf8proc_decompose_lump(0x005E); 271289177Speter if (category == UTF8PROC_CATEGORY_PC || uc == 0x02CD) 272289177Speter utf8proc_decompose_lump(0x005F); 273289177Speter if (uc == 0x02CB) utf8proc_decompose_lump(0x0060); 274289177Speter if (uc == 0x2223) utf8proc_decompose_lump(0x007C); 275289177Speter if (uc == 0x223C) utf8proc_decompose_lump(0x007E); 276289177Speter if ((options & UTF8PROC_NLF2LS) && (options & UTF8PROC_NLF2PS)) { 277289177Speter if (category == UTF8PROC_CATEGORY_ZL || 278289177Speter category == UTF8PROC_CATEGORY_ZP) 279289177Speter utf8proc_decompose_lump(0x000A); 280289177Speter } 281289177Speter } 282289177Speter if (options & UTF8PROC_STRIPMARK) { 283289177Speter if (category == UTF8PROC_CATEGORY_MN || 284289177Speter category == UTF8PROC_CATEGORY_MC || 285289177Speter category == UTF8PROC_CATEGORY_ME) return 0; 286289177Speter } 287289177Speter if (options & UTF8PROC_CASEFOLD) { 288289177Speter if (property->casefold_mapping) { 289289177Speter const int32_t *casefold_entry; 290289177Speter ssize_t written = 0; 291289177Speter for (casefold_entry = property->casefold_mapping; 292289177Speter *casefold_entry >= 0; casefold_entry++) { 293289177Speter written += utf8proc_decompose_char(*casefold_entry, dst+written, 294289177Speter (bufsize > written) ? (bufsize - written) : 0, options, 295289177Speter last_boundclass); 296289177Speter if (written < 0) return UTF8PROC_ERROR_OVERFLOW; 297289177Speter } 298289177Speter return written; 299289177Speter } 300289177Speter } 301289177Speter if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) { 302289177Speter if (property->decomp_mapping && 303289177Speter (!property->decomp_type || (options & UTF8PROC_COMPAT))) { 304289177Speter const int32_t *decomp_entry; 305289177Speter ssize_t written = 0; 306289177Speter for (decomp_entry = property->decomp_mapping; 307289177Speter *decomp_entry >= 0; decomp_entry++) { 308289177Speter written += utf8proc_decompose_char(*decomp_entry, dst+written, 309289177Speter (bufsize > written) ? (bufsize - written) : 0, options, 310289177Speter last_boundclass); 311289177Speter if (written < 0) return UTF8PROC_ERROR_OVERFLOW; 312289177Speter } 313289177Speter return written; 314289177Speter } 315289177Speter } 316289177Speter if (options & UTF8PROC_CHARBOUND) { 317289177Speter bool boundary; 318289177Speter int tbc, lbc; 319289177Speter tbc = 320289177Speter (uc == 0x000D) ? UTF8PROC_BOUNDCLASS_CR : 321289177Speter (uc == 0x000A) ? UTF8PROC_BOUNDCLASS_LF : 322289177Speter ((category == UTF8PROC_CATEGORY_ZL || 323289177Speter category == UTF8PROC_CATEGORY_ZP || 324289177Speter category == UTF8PROC_CATEGORY_CC || 325289177Speter category == UTF8PROC_CATEGORY_CF) && 326289177Speter !(uc == 0x200C || uc == 0x200D)) ? UTF8PROC_BOUNDCLASS_CONTROL : 327289177Speter property->extend ? UTF8PROC_BOUNDCLASS_EXTEND : 328289177Speter ((uc >= UTF8PROC_HANGUL_L_START && uc < UTF8PROC_HANGUL_L_END) || 329289177Speter uc == UTF8PROC_HANGUL_L_FILLER) ? UTF8PROC_BOUNDCLASS_L : 330289177Speter (uc >= UTF8PROC_HANGUL_V_START && uc < UTF8PROC_HANGUL_V_END) ? 331289177Speter UTF8PROC_BOUNDCLASS_V : 332289177Speter (uc >= UTF8PROC_HANGUL_T_START && uc < UTF8PROC_HANGUL_T_END) ? 333289177Speter UTF8PROC_BOUNDCLASS_T : 334289177Speter (uc >= UTF8PROC_HANGUL_S_START && uc < UTF8PROC_HANGUL_S_END) ? ( 335289177Speter ((uc-UTF8PROC_HANGUL_SBASE) % UTF8PROC_HANGUL_TCOUNT == 0) ? 336289177Speter UTF8PROC_BOUNDCLASS_LV : UTF8PROC_BOUNDCLASS_LVT 337289177Speter ) : 338289177Speter UTF8PROC_BOUNDCLASS_OTHER; 339289177Speter lbc = *last_boundclass; 340289177Speter boundary = 341289177Speter (tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false : 342289177Speter (lbc == UTF8PROC_BOUNDCLASS_START) ? true : 343289177Speter (lbc == UTF8PROC_BOUNDCLASS_CR && 344289177Speter tbc == UTF8PROC_BOUNDCLASS_LF) ? false : 345289177Speter (lbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true : 346289177Speter (tbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true : 347289177Speter (lbc == UTF8PROC_BOUNDCLASS_L && 348289177Speter (tbc == UTF8PROC_BOUNDCLASS_L || 349289177Speter tbc == UTF8PROC_BOUNDCLASS_V || 350289177Speter tbc == UTF8PROC_BOUNDCLASS_LV || 351289177Speter tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false : 352289177Speter ((lbc == UTF8PROC_BOUNDCLASS_LV || 353289177Speter lbc == UTF8PROC_BOUNDCLASS_V) && 354289177Speter (tbc == UTF8PROC_BOUNDCLASS_V || 355289177Speter tbc == UTF8PROC_BOUNDCLASS_T)) ? false : 356289177Speter ((lbc == UTF8PROC_BOUNDCLASS_LVT || 357289177Speter lbc == UTF8PROC_BOUNDCLASS_T) && 358289177Speter tbc == UTF8PROC_BOUNDCLASS_T) ? false : 359289177Speter true; 360289177Speter *last_boundclass = tbc; 361289177Speter if (boundary) { 362289177Speter if (bufsize >= 1) dst[0] = 0xFFFF; 363289177Speter if (bufsize >= 2) dst[1] = uc; 364289177Speter return 2; 365289177Speter } 366289177Speter } 367289177Speter if (bufsize >= 1) *dst = uc; 368289177Speter return 1; 369289177Speter} 370289177Speter 371289177SpeterUTF8PROC_API 372289177Speterssize_t utf8proc_decompose( 373289177Speter const uint8_t *str, ssize_t strlen, 374289177Speter int32_t *buffer, ssize_t bufsize, int options 375289177Speter) { 376289177Speter /* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */ 377289177Speter ssize_t wpos = 0; 378289177Speter if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE)) 379289177Speter return UTF8PROC_ERROR_INVALIDOPTS; 380289177Speter if ((options & UTF8PROC_STRIPMARK) && 381289177Speter !(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE)) 382289177Speter return UTF8PROC_ERROR_INVALIDOPTS; 383289177Speter { 384289177Speter int32_t uc; 385289177Speter ssize_t rpos = 0; 386289177Speter ssize_t decomp_result; 387289177Speter int boundclass = UTF8PROC_BOUNDCLASS_START; 388289177Speter while (1) { 389289177Speter if (options & UTF8PROC_NULLTERM) { 390289177Speter rpos += utf8proc_iterate(str + rpos, -1, &uc); 391289177Speter /* checking of return value is not neccessary, 392289177Speter as 'uc' is < 0 in case of error */ 393289177Speter if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8; 394289177Speter if (rpos < 0) return UTF8PROC_ERROR_OVERFLOW; 395289177Speter if (uc == 0) break; 396289177Speter } else { 397289177Speter if (rpos >= strlen) break; 398289177Speter rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc); 399289177Speter if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8; 400289177Speter } 401289177Speter decomp_result = utf8proc_decompose_char( 402289177Speter uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options, 403289177Speter &boundclass 404289177Speter ); 405289177Speter if (decomp_result < 0) return decomp_result; 406289177Speter wpos += decomp_result; 407289177Speter /* prohibiting integer overflows due to too long strings: */ 408289177Speter if (wpos < 0 || wpos > SSIZE_MAX/sizeof(int32_t)/2) 409289177Speter return UTF8PROC_ERROR_OVERFLOW; 410289177Speter } 411289177Speter } 412289177Speter if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) { 413289177Speter ssize_t pos = 0; 414289177Speter while (pos < wpos-1) { 415289177Speter int32_t uc1, uc2; 416289177Speter const utf8proc_property_t *property1, *property2; 417289177Speter uc1 = buffer[pos]; 418289177Speter uc2 = buffer[pos+1]; 419289177Speter property1 = utf8proc_get_property(uc1); 420289177Speter property2 = utf8proc_get_property(uc2); 421289177Speter if (property1->combining_class > property2->combining_class && 422289177Speter property2->combining_class > 0) { 423289177Speter buffer[pos] = uc2; 424289177Speter buffer[pos+1] = uc1; 425289177Speter if (pos > 0) pos--; else pos++; 426289177Speter } else { 427289177Speter pos++; 428289177Speter } 429289177Speter } 430289177Speter } 431289177Speter return wpos; 432289177Speter} 433289177Speter 434289177SpeterUTF8PROC_API 435289177Speterssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options) { 436289177Speter /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored 437289177Speter ASSERT: 'buffer' has one spare byte of free space at the end! */ 438289177Speter if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) { 439289177Speter ssize_t rpos; 440289177Speter ssize_t wpos = 0; 441289177Speter int32_t uc; 442289177Speter for (rpos = 0; rpos < length; rpos++) { 443289177Speter uc = buffer[rpos]; 444289177Speter if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++; 445289177Speter if (uc == 0x000A || uc == 0x000D || uc == 0x0085 || 446289177Speter ((options & UTF8PROC_STRIPCC) && (uc == 0x000B || uc == 0x000C))) { 447289177Speter if (options & UTF8PROC_NLF2LS) { 448289177Speter if (options & UTF8PROC_NLF2PS) { 449289177Speter buffer[wpos++] = 0x000A; 450289177Speter } else { 451289177Speter buffer[wpos++] = 0x2028; 452289177Speter } 453289177Speter } else { 454289177Speter if (options & UTF8PROC_NLF2PS) { 455289177Speter buffer[wpos++] = 0x2029; 456289177Speter } else { 457289177Speter buffer[wpos++] = 0x0020; 458289177Speter } 459289177Speter } 460289177Speter } else if ((options & UTF8PROC_STRIPCC) && 461289177Speter (uc < 0x0020 || (uc >= 0x007F && uc < 0x00A0))) { 462289177Speter if (uc == 0x0009) buffer[wpos++] = 0x0020; 463289177Speter } else { 464289177Speter buffer[wpos++] = uc; 465289177Speter } 466289177Speter } 467289177Speter length = wpos; 468289177Speter } 469289177Speter if (options & UTF8PROC_COMPOSE) { 470289177Speter int32_t *starter = NULL; 471289177Speter int32_t current_char; 472289177Speter const utf8proc_property_t *starter_property = NULL, *current_property; 473289177Speter utf8proc_propval_t max_combining_class = -1; 474289177Speter ssize_t rpos; 475289177Speter ssize_t wpos = 0; 476289177Speter int32_t composition; 477289177Speter for (rpos = 0; rpos < length; rpos++) { 478289177Speter current_char = buffer[rpos]; 479289177Speter current_property = utf8proc_get_property(current_char); 480289177Speter if (starter && current_property->combining_class > max_combining_class) { 481289177Speter /* combination perhaps possible */ 482289177Speter int32_t hangul_lindex; 483289177Speter int32_t hangul_sindex; 484289177Speter hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE; 485289177Speter if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) { 486289177Speter int32_t hangul_vindex; 487289177Speter hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE; 488289177Speter if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) { 489289177Speter *starter = UTF8PROC_HANGUL_SBASE + 490289177Speter (hangul_lindex * UTF8PROC_HANGUL_VCOUNT + hangul_vindex) * 491289177Speter UTF8PROC_HANGUL_TCOUNT; 492289177Speter starter_property = NULL; 493289177Speter continue; 494289177Speter } 495289177Speter } 496289177Speter hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE; 497289177Speter if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT && 498289177Speter (hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) { 499289177Speter int32_t hangul_tindex; 500289177Speter hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE; 501289177Speter if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) { 502289177Speter *starter += hangul_tindex; 503289177Speter starter_property = NULL; 504289177Speter continue; 505289177Speter } 506289177Speter } 507289177Speter if (!starter_property) { 508289177Speter starter_property = utf8proc_get_property(*starter); 509289177Speter } 510289177Speter if (starter_property->comb1st_index >= 0 && 511289177Speter current_property->comb2nd_index >= 0) { 512289177Speter composition = utf8proc_combinations[ 513289177Speter starter_property->comb1st_index + 514289177Speter current_property->comb2nd_index 515289177Speter ]; 516289177Speter if (composition >= 0 && (!(options & UTF8PROC_STABLE) || 517289177Speter !(utf8proc_get_property(composition)->comp_exclusion))) { 518289177Speter *starter = composition; 519289177Speter starter_property = NULL; 520289177Speter continue; 521289177Speter } 522289177Speter } 523289177Speter } 524289177Speter buffer[wpos] = current_char; 525289177Speter if (current_property->combining_class) { 526289177Speter if (current_property->combining_class > max_combining_class) { 527289177Speter max_combining_class = current_property->combining_class; 528289177Speter } 529289177Speter } else { 530289177Speter starter = buffer + wpos; 531289177Speter starter_property = NULL; 532289177Speter max_combining_class = -1; 533289177Speter } 534289177Speter wpos++; 535289177Speter } 536289177Speter length = wpos; 537289177Speter } 538289177Speter { 539289177Speter ssize_t rpos, wpos = 0; 540289177Speter int32_t uc; 541289177Speter for (rpos = 0; rpos < length; rpos++) { 542289177Speter uc = buffer[rpos]; 543289177Speter wpos += utf8proc_encode_char(uc, ((uint8_t *)buffer) + wpos); 544289177Speter } 545289177Speter ((uint8_t *)buffer)[wpos] = 0; 546289177Speter return wpos; 547289177Speter } 548289177Speter} 549289177Speter 550289177SpeterUTF8PROC_API 551289177Speterssize_t utf8proc_map( 552289177Speter const uint8_t *str, ssize_t strlen, uint8_t **dstptr, int options 553289177Speter) { 554289177Speter int32_t *buffer; 555289177Speter ssize_t result; 556289177Speter *dstptr = NULL; 557289177Speter result = utf8proc_decompose(str, strlen, NULL, 0, options); 558289177Speter if (result < 0) return result; 559289177Speter buffer = malloc(result * sizeof(int32_t) + 1); 560289177Speter if (!buffer) return UTF8PROC_ERROR_NOMEM; 561289177Speter result = utf8proc_decompose(str, strlen, buffer, result, options); 562289177Speter if (result < 0) { 563289177Speter free(buffer); 564289177Speter return result; 565289177Speter } 566289177Speter result = utf8proc_reencode(buffer, result, options); 567289177Speter if (result < 0) { 568289177Speter free(buffer); 569289177Speter return result; 570289177Speter } 571289177Speter { 572289177Speter int32_t *newptr; 573289177Speter newptr = realloc(buffer, (size_t)result+1); 574289177Speter if (newptr) buffer = newptr; 575289177Speter } 576289177Speter *dstptr = (uint8_t *)buffer; 577289177Speter return result; 578289177Speter} 579289177Speter 580289177SpeterUTF8PROC_API 581289177Speteruint8_t *utf8proc_NFD(const uint8_t *str) { 582289177Speter uint8_t *retval; 583289177Speter utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | 584289177Speter UTF8PROC_DECOMPOSE); 585289177Speter return retval; 586289177Speter} 587289177Speter 588289177SpeterUTF8PROC_API 589289177Speteruint8_t *utf8proc_NFC(const uint8_t *str) { 590289177Speter uint8_t *retval; 591289177Speter utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | 592289177Speter UTF8PROC_COMPOSE); 593289177Speter return retval; 594289177Speter} 595289177Speter 596289177SpeterUTF8PROC_API 597289177Speteruint8_t *utf8proc_NFKD(const uint8_t *str) { 598289177Speter uint8_t *retval; 599289177Speter utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | 600289177Speter UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT); 601289177Speter return retval; 602289177Speter} 603289177Speter 604289177SpeterUTF8PROC_API 605289177Speteruint8_t *utf8proc_NFKC(const uint8_t *str) { 606289177Speter uint8_t *retval; 607289177Speter utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | 608289177Speter UTF8PROC_COMPOSE | UTF8PROC_COMPAT); 609289177Speter return retval; 610289177Speter} 611289177Speter 612