1362181Sdim/* -*- mode: c; c-basic-offset: 2; tab-width: 2; indent-tabs-mode: nil -*- */ 2289177Speter/* 3362181Sdim * Copyright (c) 2015 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors. 4289177Speter * Copyright (c) 2009 Public Software Group e. V., Berlin, Germany 5289177Speter * 6289177Speter * Permission is hereby granted, free of charge, to any person obtaining a 7289177Speter * copy of this software and associated documentation files (the "Software"), 8289177Speter * to deal in the Software without restriction, including without limitation 9289177Speter * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10289177Speter * and/or sell copies of the Software, and to permit persons to whom the 11289177Speter * Software is furnished to do so, subject to the following conditions: 12289177Speter * 13289177Speter * The above copyright notice and this permission notice shall be included in 14289177Speter * all copies or substantial portions of the Software. 15289177Speter * 16289177Speter * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17289177Speter * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18289177Speter * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19289177Speter * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20289177Speter * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21289177Speter * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22289177Speter * DEALINGS IN THE SOFTWARE. 23289177Speter */ 24289177Speter 25289177Speter/* 26289177Speter * This library contains derived data from a modified version of the 27289177Speter * Unicode data files. 28289177Speter * 29289177Speter * The original data files are available at 30289177Speter * http://www.unicode.org/Public/UNIDATA/ 31289177Speter * 32289177Speter * Please notice the copyright statement in the file "utf8proc_data.c". 33289177Speter */ 34289177Speter 35289177Speter 36289177Speter/* 37289177Speter * File name: utf8proc.c 38289177Speter * 39289177Speter * Description: 40289177Speter * Implementation of libutf8proc. 41289177Speter */ 42289177Speter 43289177Speter 44362181Sdim#include "utf8proc_internal.h" 45289177Speter#include "utf8proc_data.c" 46289177Speter 47289177Speter 48362181SdimUTF8PROC_DLLEXPORT const utf8proc_int8_t utf8proc_utf8class[256] = { 49289177Speter 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 50289177Speter 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 51289177Speter 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 52289177Speter 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 53289177Speter 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 54289177Speter 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 55289177Speter 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 56289177Speter 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 57289177Speter 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58289177Speter 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59289177Speter 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60289177Speter 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 61289177Speter 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 62289177Speter 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 63289177Speter 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 64289177Speter 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 }; 65289177Speter 66289177Speter#define UTF8PROC_HANGUL_SBASE 0xAC00 67289177Speter#define UTF8PROC_HANGUL_LBASE 0x1100 68289177Speter#define UTF8PROC_HANGUL_VBASE 0x1161 69289177Speter#define UTF8PROC_HANGUL_TBASE 0x11A7 70289177Speter#define UTF8PROC_HANGUL_LCOUNT 19 71289177Speter#define UTF8PROC_HANGUL_VCOUNT 21 72289177Speter#define UTF8PROC_HANGUL_TCOUNT 28 73289177Speter#define UTF8PROC_HANGUL_NCOUNT 588 74289177Speter#define UTF8PROC_HANGUL_SCOUNT 11172 75289177Speter/* END is exclusive */ 76289177Speter#define UTF8PROC_HANGUL_L_START 0x1100 77289177Speter#define UTF8PROC_HANGUL_L_END 0x115A 78289177Speter#define UTF8PROC_HANGUL_L_FILLER 0x115F 79289177Speter#define UTF8PROC_HANGUL_V_START 0x1160 80289177Speter#define UTF8PROC_HANGUL_V_END 0x11A3 81289177Speter#define UTF8PROC_HANGUL_T_START 0x11A8 82289177Speter#define UTF8PROC_HANGUL_T_END 0x11FA 83289177Speter#define UTF8PROC_HANGUL_S_START 0xAC00 84289177Speter#define UTF8PROC_HANGUL_S_END 0xD7A4 85289177Speter 86362181Sdim/* Should follow semantic-versioning rules (semver.org) based on API 87362181Sdim compatibility. (Note that the shared-library version number will 88362181Sdim be different, being based on ABI compatibility.): */ 89362181Sdim#define STRINGIZEx(x) #x 90362181Sdim#define STRINGIZE(x) STRINGIZEx(x) 91362181SdimUTF8PROC_DLLEXPORT const char *utf8proc_version(void) { 92362181Sdim return STRINGIZE(UTF8PROC_VERSION_MAJOR) "." STRINGIZE(UTF8PROC_VERSION_MINOR) "." STRINGIZE(UTF8PROC_VERSION_PATCH) ""; 93289177Speter} 94289177Speter 95362181SdimUTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) { 96289177Speter switch (errcode) { 97289177Speter case UTF8PROC_ERROR_NOMEM: 98362181Sdim return "Memory for processing UTF-8 data could not be allocated."; 99289177Speter case UTF8PROC_ERROR_OVERFLOW: 100362181Sdim return "UTF-8 string is too long to be processed."; 101289177Speter case UTF8PROC_ERROR_INVALIDUTF8: 102362181Sdim return "Invalid UTF-8 string"; 103289177Speter case UTF8PROC_ERROR_NOTASSIGNED: 104362181Sdim return "Unassigned Unicode code point found in UTF-8 string."; 105289177Speter case UTF8PROC_ERROR_INVALIDOPTS: 106362181Sdim return "Invalid options for UTF-8 processing chosen."; 107289177Speter default: 108362181Sdim return "An unknown error occurred while processing UTF-8 data."; 109289177Speter } 110289177Speter} 111289177Speter 112362181Sdim#define utf_cont(ch) (((ch) & 0xc0) == 0x80) 113362181SdimUTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate( 114362181Sdim const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *dst 115289177Speter) { 116362181Sdim utf8proc_uint32_t uc; 117362181Sdim const utf8proc_uint8_t *end; 118362181Sdim 119289177Speter *dst = -1; 120289177Speter if (!strlen) return 0; 121362181Sdim end = str + ((strlen < 0) ? 4 : strlen); 122362181Sdim uc = *str++; 123362181Sdim if (uc < 0x80) { 124362181Sdim *dst = uc; 125362181Sdim return 1; 126289177Speter } 127362181Sdim /* Must be between 0xc2 and 0xf4 inclusive to be valid */ 128362181Sdim if ((uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8; 129362181Sdim if (uc < 0xe0) { /* 2-byte sequence */ 130362181Sdim /* Must have valid continuation character */ 131362181Sdim if (str >= end || !utf_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8; 132362181Sdim *dst = ((uc & 0x1f)<<6) | (*str & 0x3f); 133362181Sdim return 2; 134289177Speter } 135362181Sdim if (uc < 0xf0) { /* 3-byte sequence */ 136362181Sdim if ((str + 1 >= end) || !utf_cont(*str) || !utf_cont(str[1])) 137362181Sdim return UTF8PROC_ERROR_INVALIDUTF8; 138362181Sdim /* Check for surrogate chars */ 139362181Sdim if (uc == 0xed && *str > 0x9f) 140362181Sdim return UTF8PROC_ERROR_INVALIDUTF8; 141362181Sdim uc = ((uc & 0xf)<<12) | ((*str & 0x3f)<<6) | (str[1] & 0x3f); 142362181Sdim if (uc < 0x800) 143362181Sdim return UTF8PROC_ERROR_INVALIDUTF8; 144362181Sdim *dst = uc; 145362181Sdim return 3; 146362181Sdim } 147362181Sdim /* 4-byte sequence 148362181Sdim Must have 3 valid continuation characters */ 149362181Sdim if ((str + 2 >= end) || !utf_cont(*str) || !utf_cont(str[1]) || !utf_cont(str[2])) 150362181Sdim return UTF8PROC_ERROR_INVALIDUTF8; 151362181Sdim /* Make sure in correct range (0x10000 - 0x10ffff) */ 152362181Sdim if (uc == 0xf0) { 153362181Sdim if (*str < 0x90) return UTF8PROC_ERROR_INVALIDUTF8; 154362181Sdim } else if (uc == 0xf4) { 155362181Sdim if (*str > 0x8f) return UTF8PROC_ERROR_INVALIDUTF8; 156362181Sdim } 157362181Sdim *dst = ((uc & 7)<<18) | ((*str & 0x3f)<<12) | ((str[1] & 0x3f)<<6) | (str[2] & 0x3f); 158362181Sdim return 4; 159289177Speter} 160289177Speter 161362181SdimUTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t uc) { 162362181Sdim return (((utf8proc_uint32_t)uc)-0xd800 > 0x07ff) && ((utf8proc_uint32_t)uc < 0x110000); 163289177Speter} 164289177Speter 165362181SdimUTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) { 166289177Speter if (uc < 0x00) { 167289177Speter return 0; 168289177Speter } else if (uc < 0x80) { 169362181Sdim dst[0] = (utf8proc_uint8_t) uc; 170289177Speter return 1; 171289177Speter } else if (uc < 0x800) { 172362181Sdim dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6)); 173362181Sdim dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); 174289177Speter return 2; 175362181Sdim /* Note: we allow encoding 0xd800-0xdfff here, so as not to change 176362181Sdim the API, however, these are actually invalid in UTF-8 */ 177289177Speter } else if (uc < 0x10000) { 178362181Sdim dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12)); 179362181Sdim dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); 180362181Sdim dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); 181289177Speter return 3; 182289177Speter } else if (uc < 0x110000) { 183362181Sdim dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18)); 184362181Sdim dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F)); 185362181Sdim dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); 186362181Sdim dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); 187289177Speter return 4; 188289177Speter } else return 0; 189289177Speter} 190289177Speter 191362181Sdim/* internal "unsafe" version that does not check whether uc is in range */ 192362181Sdimstatic utf8proc_ssize_t unsafe_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) { 193362181Sdim if (uc < 0x00) { 194362181Sdim return 0; 195362181Sdim } else if (uc < 0x80) { 196362181Sdim dst[0] = (utf8proc_uint8_t)uc; 197362181Sdim return 1; 198362181Sdim } else if (uc < 0x800) { 199362181Sdim dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6)); 200362181Sdim dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); 201362181Sdim return 2; 202362181Sdim } else if (uc == 0xFFFF) { 203362181Sdim dst[0] = (utf8proc_uint8_t)0xFF; 204362181Sdim return 1; 205362181Sdim } else if (uc == 0xFFFE) { 206362181Sdim dst[0] = (utf8proc_uint8_t)0xFE; 207362181Sdim return 1; 208362181Sdim } else if (uc < 0x10000) { 209362181Sdim dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12)); 210362181Sdim dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); 211362181Sdim dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); 212362181Sdim return 3; 213362181Sdim } else if (uc < 0x110000) { 214362181Sdim dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18)); 215362181Sdim dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F)); 216362181Sdim dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); 217362181Sdim dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); 218362181Sdim return 4; 219362181Sdim } else return 0; 220362181Sdim} 221362181Sdim 222362181Sdim/* internal "unsafe" version that does not check whether uc is in range */ 223362181Sdimstatic const utf8proc_property_t *unsafe_get_property(utf8proc_int32_t uc) { 224289177Speter /* ASSERT: uc >= 0 && uc < 0x110000 */ 225289177Speter return utf8proc_properties + ( 226289177Speter utf8proc_stage2table[ 227289177Speter utf8proc_stage1table[uc >> 8] + (uc & 0xFF) 228289177Speter ] 229289177Speter ); 230289177Speter} 231289177Speter 232362181SdimUTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int32_t uc) { 233362181Sdim return uc < 0 || uc >= 0x110000 ? utf8proc_properties : unsafe_get_property(uc); 234362181Sdim} 235362181Sdim 236362181Sdim/* return whether there is a grapheme break between boundclasses lbc and tbc 237362181Sdim (according to the definition of extended grapheme clusters) 238362181Sdim 239362181Sdim Rule numbering refers to TR29 Version 29 (Unicode 9.0.0): 240362181Sdim http://www.unicode.org/reports/tr29/tr29-29.html 241362181Sdim 242362181Sdim CAVEATS: 243362181Sdim Please note that evaluation of GB10 (grapheme breaks between emoji zwj sequences) 244362181Sdim and GB 12/13 (regional indicator code points) require knowledge of previous characters 245362181Sdim and are thus not handled by this function. This may result in an incorrect break before 246362181Sdim an E_Modifier class codepoint and an incorrectly missing break between two 247362181Sdim REGIONAL_INDICATOR class code points if such support does not exist in the caller. 248362181Sdim 249362181Sdim See the special support in grapheme_break_extended, for required bookkeeping by the caller. 250362181Sdim*/ 251362181Sdimstatic utf8proc_bool grapheme_break_simple(int lbc, int tbc) { 252362181Sdim return 253362181Sdim (lbc == UTF8PROC_BOUNDCLASS_START) ? true : /* GB1 */ 254362181Sdim (lbc == UTF8PROC_BOUNDCLASS_CR && /* GB3 */ 255362181Sdim tbc == UTF8PROC_BOUNDCLASS_LF) ? false : /* --- */ 256362181Sdim (lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : /* GB4 */ 257362181Sdim (tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : /* GB5 */ 258362181Sdim (lbc == UTF8PROC_BOUNDCLASS_L && /* GB6 */ 259362181Sdim (tbc == UTF8PROC_BOUNDCLASS_L || /* --- */ 260362181Sdim tbc == UTF8PROC_BOUNDCLASS_V || /* --- */ 261362181Sdim tbc == UTF8PROC_BOUNDCLASS_LV || /* --- */ 262362181Sdim tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false : /* --- */ 263362181Sdim ((lbc == UTF8PROC_BOUNDCLASS_LV || /* GB7 */ 264362181Sdim lbc == UTF8PROC_BOUNDCLASS_V) && /* --- */ 265362181Sdim (tbc == UTF8PROC_BOUNDCLASS_V || /* --- */ 266362181Sdim tbc == UTF8PROC_BOUNDCLASS_T)) ? false : /* --- */ 267362181Sdim ((lbc == UTF8PROC_BOUNDCLASS_LVT || /* GB8 */ 268362181Sdim lbc == UTF8PROC_BOUNDCLASS_T) && /* --- */ 269362181Sdim tbc == UTF8PROC_BOUNDCLASS_T) ? false : /* --- */ 270362181Sdim (tbc == UTF8PROC_BOUNDCLASS_EXTEND || /* GB9 */ 271362181Sdim tbc == UTF8PROC_BOUNDCLASS_ZWJ || /* --- */ 272362181Sdim tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK || /* GB9a */ 273362181Sdim lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false : /* GB9b */ 274362181Sdim ((lbc == UTF8PROC_BOUNDCLASS_E_BASE || /* GB10 (requires additional handling below) */ 275362181Sdim lbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) && /* ---- */ 276362181Sdim tbc == UTF8PROC_BOUNDCLASS_E_MODIFIER) ? false : /* ---- */ 277362181Sdim (lbc == UTF8PROC_BOUNDCLASS_ZWJ && /* GB11 */ 278362181Sdim (tbc == UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ || /* ---- */ 279362181Sdim tbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ)) ? false : /* ---- */ 280362181Sdim (lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && /* GB12/13 (requires additional handling below) */ 281362181Sdim tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false : /* ---- */ 282362181Sdim true; /* GB999 */ 283362181Sdim} 284362181Sdim 285362181Sdimstatic utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state) 286362181Sdim{ 287362181Sdim utf8proc_bool break_permitted; 288362181Sdim int lbc_override = lbc; 289362181Sdim if (state && *state != UTF8PROC_BOUNDCLASS_START) 290362181Sdim lbc_override = *state; 291362181Sdim break_permitted = grapheme_break_simple(lbc_override, tbc); 292362181Sdim if (state) { 293362181Sdim /* Special support for GB 12/13 made possible by GB999. After two RI 294362181Sdim class codepoints we want to force a break. Do this by resetting the 295362181Sdim second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break 296362181Sdim after that character according to GB999 (unless of course such a break is 297362181Sdim forbidden by a different rule such as GB9). */ 298362181Sdim if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) 299362181Sdim *state = UTF8PROC_BOUNDCLASS_OTHER; 300362181Sdim /* Special support for GB10. Fold any EXTEND codepoints into the previous 301362181Sdim boundclass if we're dealing with an emoji base boundclass. */ 302362181Sdim else if ((*state == UTF8PROC_BOUNDCLASS_E_BASE || 303362181Sdim *state == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) && 304362181Sdim tbc == UTF8PROC_BOUNDCLASS_EXTEND) 305362181Sdim *state = UTF8PROC_BOUNDCLASS_E_BASE; 306362181Sdim else 307362181Sdim *state = tbc; 308362181Sdim } 309362181Sdim return break_permitted; 310362181Sdim} 311362181Sdim 312362181SdimUTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful( 313362181Sdim utf8proc_int32_t c1, utf8proc_int32_t c2, utf8proc_int32_t *state) { 314362181Sdim 315362181Sdim return grapheme_break_extended(utf8proc_get_property(c1)->boundclass, 316362181Sdim utf8proc_get_property(c2)->boundclass, 317362181Sdim state); 318362181Sdim} 319362181Sdim 320362181Sdim 321362181SdimUTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break( 322362181Sdim utf8proc_int32_t c1, utf8proc_int32_t c2) { 323362181Sdim return utf8proc_grapheme_break_stateful(c1, c2, NULL); 324362181Sdim} 325362181Sdim 326362181Sdimstatic utf8proc_int32_t seqindex_decode_entry(const utf8proc_uint16_t **entry) 327362181Sdim{ 328362181Sdim utf8proc_int32_t entry_cp = **entry; 329362181Sdim if ((entry_cp & 0xF800) == 0xD800) { 330362181Sdim *entry = *entry + 1; 331362181Sdim entry_cp = ((entry_cp & 0x03FF) << 10) | (**entry & 0x03FF); 332362181Sdim entry_cp += 0x10000; 333362181Sdim } 334362181Sdim return entry_cp; 335362181Sdim} 336362181Sdim 337362181Sdimstatic utf8proc_int32_t seqindex_decode_index(const utf8proc_uint32_t seqindex) 338362181Sdim{ 339362181Sdim const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex]; 340362181Sdim return seqindex_decode_entry(&entry); 341362181Sdim} 342362181Sdim 343362181Sdimstatic utf8proc_ssize_t seqindex_write_char_decomposed(utf8proc_uint16_t seqindex, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) { 344362181Sdim utf8proc_ssize_t written = 0; 345362181Sdim const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex & 0x1FFF]; 346362181Sdim int len = seqindex >> 13; 347362181Sdim if (len >= 7) { 348362181Sdim len = *entry; 349362181Sdim entry++; 350362181Sdim } 351362181Sdim for (; len >= 0; entry++, len--) { 352362181Sdim utf8proc_int32_t entry_cp = seqindex_decode_entry(&entry); 353362181Sdim 354362181Sdim written += utf8proc_decompose_char(entry_cp, dst+written, 355362181Sdim (bufsize > written) ? (bufsize - written) : 0, options, 356362181Sdim last_boundclass); 357362181Sdim if (written < 0) return UTF8PROC_ERROR_OVERFLOW; 358362181Sdim } 359362181Sdim return written; 360362181Sdim} 361362181Sdim 362362181SdimUTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c) 363362181Sdim{ 364362181Sdim utf8proc_int32_t cl = utf8proc_get_property(c)->lowercase_seqindex; 365362181Sdim return cl != UINT16_MAX ? seqindex_decode_index(cl) : c; 366362181Sdim} 367362181Sdim 368362181SdimUTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c) 369362181Sdim{ 370362181Sdim utf8proc_int32_t cu = utf8proc_get_property(c)->uppercase_seqindex; 371362181Sdim return cu != UINT16_MAX ? seqindex_decode_index(cu) : c; 372362181Sdim} 373362181Sdim 374362181SdimUTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c) 375362181Sdim{ 376362181Sdim utf8proc_int32_t cu = utf8proc_get_property(c)->titlecase_seqindex; 377362181Sdim return cu != UINT16_MAX ? seqindex_decode_index(cu) : c; 378362181Sdim} 379362181Sdim 380362181Sdim/* return a character width analogous to wcwidth (except portable and 381362181Sdim hopefully less buggy than most system wcwidth functions). */ 382362181SdimUTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) { 383362181Sdim return utf8proc_get_property(c)->charwidth; 384362181Sdim} 385362181Sdim 386362181SdimUTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t c) { 387362181Sdim return utf8proc_get_property(c)->category; 388362181Sdim} 389362181Sdim 390362181SdimUTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) { 391362181Sdim static const char s[][3] = {"Cn","Lu","Ll","Lt","Lm","Lo","Mn","Mc","Me","Nd","Nl","No","Pc","Pd","Ps","Pe","Pi","Pf","Po","Sm","Sc","Sk","So","Zs","Zl","Zp","Cc","Cf","Cs","Co"}; 392362181Sdim return s[utf8proc_category(c)]; 393362181Sdim} 394362181Sdim 395289177Speter#define utf8proc_decompose_lump(replacement_uc) \ 396289177Speter return utf8proc_decompose_char((replacement_uc), dst, bufsize, \ 397289177Speter options & ~UTF8PROC_LUMP, last_boundclass) 398289177Speter 399362181SdimUTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) { 400289177Speter const utf8proc_property_t *property; 401289177Speter utf8proc_propval_t category; 402362181Sdim utf8proc_int32_t hangul_sindex; 403362181Sdim if (uc < 0 || uc >= 0x110000) return UTF8PROC_ERROR_NOTASSIGNED; 404362181Sdim property = unsafe_get_property(uc); 405289177Speter category = property->category; 406289177Speter hangul_sindex = uc - UTF8PROC_HANGUL_SBASE; 407289177Speter if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) { 408289177Speter if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) { 409362181Sdim utf8proc_int32_t hangul_tindex; 410289177Speter if (bufsize >= 1) { 411289177Speter dst[0] = UTF8PROC_HANGUL_LBASE + 412289177Speter hangul_sindex / UTF8PROC_HANGUL_NCOUNT; 413289177Speter if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE + 414289177Speter (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT; 415289177Speter } 416289177Speter hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT; 417289177Speter if (!hangul_tindex) return 2; 418289177Speter if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex; 419289177Speter return 3; 420289177Speter } 421289177Speter } 422289177Speter if (options & UTF8PROC_REJECTNA) { 423289177Speter if (!category) return UTF8PROC_ERROR_NOTASSIGNED; 424289177Speter } 425289177Speter if (options & UTF8PROC_IGNORE) { 426289177Speter if (property->ignorable) return 0; 427289177Speter } 428289177Speter if (options & UTF8PROC_LUMP) { 429289177Speter if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020); 430289177Speter if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8) 431289177Speter utf8proc_decompose_lump(0x0027); 432289177Speter if (category == UTF8PROC_CATEGORY_PD || uc == 0x2212) 433289177Speter utf8proc_decompose_lump(0x002D); 434289177Speter if (uc == 0x2044 || uc == 0x2215) utf8proc_decompose_lump(0x002F); 435289177Speter if (uc == 0x2236) utf8proc_decompose_lump(0x003A); 436289177Speter if (uc == 0x2039 || uc == 0x2329 || uc == 0x3008) 437289177Speter utf8proc_decompose_lump(0x003C); 438289177Speter if (uc == 0x203A || uc == 0x232A || uc == 0x3009) 439289177Speter utf8proc_decompose_lump(0x003E); 440289177Speter if (uc == 0x2216) utf8proc_decompose_lump(0x005C); 441289177Speter if (uc == 0x02C4 || uc == 0x02C6 || uc == 0x2038 || uc == 0x2303) 442289177Speter utf8proc_decompose_lump(0x005E); 443289177Speter if (category == UTF8PROC_CATEGORY_PC || uc == 0x02CD) 444289177Speter utf8proc_decompose_lump(0x005F); 445289177Speter if (uc == 0x02CB) utf8proc_decompose_lump(0x0060); 446289177Speter if (uc == 0x2223) utf8proc_decompose_lump(0x007C); 447289177Speter if (uc == 0x223C) utf8proc_decompose_lump(0x007E); 448289177Speter if ((options & UTF8PROC_NLF2LS) && (options & UTF8PROC_NLF2PS)) { 449289177Speter if (category == UTF8PROC_CATEGORY_ZL || 450289177Speter category == UTF8PROC_CATEGORY_ZP) 451289177Speter utf8proc_decompose_lump(0x000A); 452289177Speter } 453289177Speter } 454289177Speter if (options & UTF8PROC_STRIPMARK) { 455289177Speter if (category == UTF8PROC_CATEGORY_MN || 456289177Speter category == UTF8PROC_CATEGORY_MC || 457289177Speter category == UTF8PROC_CATEGORY_ME) return 0; 458289177Speter } 459289177Speter if (options & UTF8PROC_CASEFOLD) { 460362181Sdim if (property->casefold_seqindex != UINT16_MAX) { 461362181Sdim return seqindex_write_char_decomposed(property->casefold_seqindex, dst, bufsize, options, last_boundclass); 462289177Speter } 463289177Speter } 464289177Speter if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) { 465362181Sdim if (property->decomp_seqindex != UINT16_MAX && 466289177Speter (!property->decomp_type || (options & UTF8PROC_COMPAT))) { 467362181Sdim return seqindex_write_char_decomposed(property->decomp_seqindex, dst, bufsize, options, last_boundclass); 468289177Speter } 469289177Speter } 470289177Speter if (options & UTF8PROC_CHARBOUND) { 471362181Sdim utf8proc_bool boundary; 472362181Sdim int tbc = property->boundclass; 473362181Sdim boundary = grapheme_break_extended(*last_boundclass, tbc, last_boundclass); 474289177Speter if (boundary) { 475289177Speter if (bufsize >= 1) dst[0] = 0xFFFF; 476289177Speter if (bufsize >= 2) dst[1] = uc; 477289177Speter return 2; 478289177Speter } 479289177Speter } 480289177Speter if (bufsize >= 1) *dst = uc; 481289177Speter return 1; 482289177Speter} 483289177Speter 484362181SdimUTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose( 485362181Sdim const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, 486362181Sdim utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options 487289177Speter) { 488362181Sdim return utf8proc_decompose_custom(str, strlen, buffer, bufsize, options, NULL, NULL); 489362181Sdim} 490362181Sdim 491362181SdimUTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom( 492362181Sdim const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, 493362181Sdim utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options, 494362181Sdim utf8proc_custom_func custom_func, void *custom_data 495362181Sdim) { 496289177Speter /* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */ 497362181Sdim utf8proc_ssize_t wpos = 0; 498289177Speter if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE)) 499289177Speter return UTF8PROC_ERROR_INVALIDOPTS; 500289177Speter if ((options & UTF8PROC_STRIPMARK) && 501289177Speter !(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE)) 502289177Speter return UTF8PROC_ERROR_INVALIDOPTS; 503289177Speter { 504362181Sdim utf8proc_int32_t uc; 505362181Sdim utf8proc_ssize_t rpos = 0; 506362181Sdim utf8proc_ssize_t decomp_result; 507289177Speter int boundclass = UTF8PROC_BOUNDCLASS_START; 508289177Speter while (1) { 509289177Speter if (options & UTF8PROC_NULLTERM) { 510289177Speter rpos += utf8proc_iterate(str + rpos, -1, &uc); 511362181Sdim /* checking of return value is not necessary, 512289177Speter as 'uc' is < 0 in case of error */ 513289177Speter if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8; 514289177Speter if (rpos < 0) return UTF8PROC_ERROR_OVERFLOW; 515289177Speter if (uc == 0) break; 516289177Speter } else { 517289177Speter if (rpos >= strlen) break; 518289177Speter rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc); 519289177Speter if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8; 520289177Speter } 521362181Sdim if (custom_func != NULL) { 522362181Sdim uc = custom_func(uc, custom_data); /* user-specified custom mapping */ 523362181Sdim } 524289177Speter decomp_result = utf8proc_decompose_char( 525289177Speter uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options, 526289177Speter &boundclass 527289177Speter ); 528289177Speter if (decomp_result < 0) return decomp_result; 529289177Speter wpos += decomp_result; 530289177Speter /* prohibiting integer overflows due to too long strings: */ 531362181Sdim if (wpos < 0 || 532362181Sdim wpos > (utf8proc_ssize_t)(SSIZE_MAX/sizeof(utf8proc_int32_t)/2)) 533289177Speter return UTF8PROC_ERROR_OVERFLOW; 534289177Speter } 535289177Speter } 536289177Speter if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) { 537362181Sdim utf8proc_ssize_t pos = 0; 538289177Speter while (pos < wpos-1) { 539362181Sdim utf8proc_int32_t uc1, uc2; 540289177Speter const utf8proc_property_t *property1, *property2; 541289177Speter uc1 = buffer[pos]; 542289177Speter uc2 = buffer[pos+1]; 543362181Sdim property1 = unsafe_get_property(uc1); 544362181Sdim property2 = unsafe_get_property(uc2); 545289177Speter if (property1->combining_class > property2->combining_class && 546289177Speter property2->combining_class > 0) { 547289177Speter buffer[pos] = uc2; 548289177Speter buffer[pos+1] = uc1; 549289177Speter if (pos > 0) pos--; else pos++; 550289177Speter } else { 551289177Speter pos++; 552289177Speter } 553289177Speter } 554289177Speter } 555289177Speter return wpos; 556289177Speter} 557289177Speter 558362181SdimUTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) { 559362181Sdim /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored */ 560289177Speter if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) { 561362181Sdim utf8proc_ssize_t rpos; 562362181Sdim utf8proc_ssize_t wpos = 0; 563362181Sdim utf8proc_int32_t uc; 564289177Speter for (rpos = 0; rpos < length; rpos++) { 565289177Speter uc = buffer[rpos]; 566289177Speter if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++; 567289177Speter if (uc == 0x000A || uc == 0x000D || uc == 0x0085 || 568289177Speter ((options & UTF8PROC_STRIPCC) && (uc == 0x000B || uc == 0x000C))) { 569289177Speter if (options & UTF8PROC_NLF2LS) { 570289177Speter if (options & UTF8PROC_NLF2PS) { 571289177Speter buffer[wpos++] = 0x000A; 572289177Speter } else { 573289177Speter buffer[wpos++] = 0x2028; 574289177Speter } 575289177Speter } else { 576289177Speter if (options & UTF8PROC_NLF2PS) { 577289177Speter buffer[wpos++] = 0x2029; 578289177Speter } else { 579289177Speter buffer[wpos++] = 0x0020; 580289177Speter } 581289177Speter } 582289177Speter } else if ((options & UTF8PROC_STRIPCC) && 583289177Speter (uc < 0x0020 || (uc >= 0x007F && uc < 0x00A0))) { 584289177Speter if (uc == 0x0009) buffer[wpos++] = 0x0020; 585289177Speter } else { 586289177Speter buffer[wpos++] = uc; 587289177Speter } 588289177Speter } 589289177Speter length = wpos; 590289177Speter } 591289177Speter if (options & UTF8PROC_COMPOSE) { 592362181Sdim utf8proc_int32_t *starter = NULL; 593362181Sdim utf8proc_int32_t current_char; 594289177Speter const utf8proc_property_t *starter_property = NULL, *current_property; 595289177Speter utf8proc_propval_t max_combining_class = -1; 596362181Sdim utf8proc_ssize_t rpos; 597362181Sdim utf8proc_ssize_t wpos = 0; 598362181Sdim utf8proc_int32_t composition; 599289177Speter for (rpos = 0; rpos < length; rpos++) { 600289177Speter current_char = buffer[rpos]; 601362181Sdim current_property = unsafe_get_property(current_char); 602289177Speter if (starter && current_property->combining_class > max_combining_class) { 603289177Speter /* combination perhaps possible */ 604362181Sdim utf8proc_int32_t hangul_lindex; 605362181Sdim utf8proc_int32_t hangul_sindex; 606289177Speter hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE; 607289177Speter if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) { 608362181Sdim utf8proc_int32_t hangul_vindex; 609289177Speter hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE; 610289177Speter if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) { 611289177Speter *starter = UTF8PROC_HANGUL_SBASE + 612289177Speter (hangul_lindex * UTF8PROC_HANGUL_VCOUNT + hangul_vindex) * 613289177Speter UTF8PROC_HANGUL_TCOUNT; 614289177Speter starter_property = NULL; 615289177Speter continue; 616289177Speter } 617289177Speter } 618289177Speter hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE; 619289177Speter if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT && 620289177Speter (hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) { 621362181Sdim utf8proc_int32_t hangul_tindex; 622289177Speter hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE; 623289177Speter if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) { 624289177Speter *starter += hangul_tindex; 625289177Speter starter_property = NULL; 626289177Speter continue; 627289177Speter } 628289177Speter } 629289177Speter if (!starter_property) { 630362181Sdim starter_property = unsafe_get_property(*starter); 631289177Speter } 632362181Sdim if (starter_property->comb_index < 0x8000 && 633362181Sdim current_property->comb_index != UINT16_MAX && 634362181Sdim current_property->comb_index >= 0x8000) { 635362181Sdim int sidx = starter_property->comb_index; 636362181Sdim int idx = (current_property->comb_index & 0x3FFF) - utf8proc_combinations[sidx]; 637362181Sdim if (idx >= 0 && idx <= utf8proc_combinations[sidx + 1] ) { 638362181Sdim idx += sidx + 2; 639362181Sdim if (current_property->comb_index & 0x4000) { 640362181Sdim composition = (utf8proc_combinations[idx] << 16) | utf8proc_combinations[idx+1]; 641362181Sdim } else 642362181Sdim composition = utf8proc_combinations[idx]; 643362181Sdim 644362181Sdim if (composition > 0 && (!(options & UTF8PROC_STABLE) || 645362181Sdim !(unsafe_get_property(composition)->comp_exclusion))) { 646362181Sdim *starter = composition; 647362181Sdim starter_property = NULL; 648362181Sdim continue; 649362181Sdim } 650289177Speter } 651289177Speter } 652289177Speter } 653289177Speter buffer[wpos] = current_char; 654289177Speter if (current_property->combining_class) { 655289177Speter if (current_property->combining_class > max_combining_class) { 656289177Speter max_combining_class = current_property->combining_class; 657289177Speter } 658289177Speter } else { 659289177Speter starter = buffer + wpos; 660289177Speter starter_property = NULL; 661289177Speter max_combining_class = -1; 662289177Speter } 663289177Speter wpos++; 664289177Speter } 665289177Speter length = wpos; 666289177Speter } 667362181Sdim return length; 668362181Sdim} 669362181Sdim 670362181SdimUTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) { 671362181Sdim /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored 672362181Sdim ASSERT: 'buffer' has one spare byte of free space at the end! */ 673362181Sdim length = utf8proc_normalize_utf32(buffer, length, options); 674362181Sdim if (length < 0) return length; 675289177Speter { 676362181Sdim utf8proc_ssize_t rpos, wpos = 0; 677362181Sdim utf8proc_int32_t uc; 678362181Sdim if (options & UTF8PROC_CHARBOUND) { 679362181Sdim for (rpos = 0; rpos < length; rpos++) { 680362181Sdim uc = buffer[rpos]; 681362181Sdim wpos += unsafe_encode_char(uc, ((utf8proc_uint8_t *)buffer) + wpos); 682362181Sdim } 683362181Sdim } else { 684362181Sdim for (rpos = 0; rpos < length; rpos++) { 685362181Sdim uc = buffer[rpos]; 686362181Sdim wpos += utf8proc_encode_char(uc, ((utf8proc_uint8_t *)buffer) + wpos); 687362181Sdim } 688289177Speter } 689362181Sdim ((utf8proc_uint8_t *)buffer)[wpos] = 0; 690289177Speter return wpos; 691289177Speter } 692289177Speter} 693289177Speter 694362181SdimUTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map( 695362181Sdim const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options 696289177Speter) { 697362181Sdim return utf8proc_map_custom(str, strlen, dstptr, options, NULL, NULL); 698362181Sdim} 699362181Sdim 700362181SdimUTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom( 701362181Sdim const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options, 702362181Sdim utf8proc_custom_func custom_func, void *custom_data 703362181Sdim) { 704362181Sdim utf8proc_int32_t *buffer; 705362181Sdim utf8proc_ssize_t result; 706289177Speter *dstptr = NULL; 707362181Sdim result = utf8proc_decompose_custom(str, strlen, NULL, 0, options, custom_func, custom_data); 708289177Speter if (result < 0) return result; 709362181Sdim buffer = (utf8proc_int32_t *) malloc(result * sizeof(utf8proc_int32_t) + 1); 710289177Speter if (!buffer) return UTF8PROC_ERROR_NOMEM; 711362181Sdim result = utf8proc_decompose_custom(str, strlen, buffer, result, options, custom_func, custom_data); 712289177Speter if (result < 0) { 713289177Speter free(buffer); 714289177Speter return result; 715289177Speter } 716289177Speter result = utf8proc_reencode(buffer, result, options); 717289177Speter if (result < 0) { 718289177Speter free(buffer); 719289177Speter return result; 720289177Speter } 721289177Speter { 722362181Sdim utf8proc_int32_t *newptr; 723362181Sdim newptr = (utf8proc_int32_t *) realloc(buffer, (size_t)result+1); 724289177Speter if (newptr) buffer = newptr; 725289177Speter } 726362181Sdim *dstptr = (utf8proc_uint8_t *)buffer; 727289177Speter return result; 728289177Speter} 729289177Speter 730362181SdimUTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str) { 731362181Sdim utf8proc_uint8_t *retval; 732289177Speter utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | 733289177Speter UTF8PROC_DECOMPOSE); 734289177Speter return retval; 735289177Speter} 736289177Speter 737362181SdimUTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str) { 738362181Sdim utf8proc_uint8_t *retval; 739289177Speter utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | 740289177Speter UTF8PROC_COMPOSE); 741289177Speter return retval; 742289177Speter} 743289177Speter 744362181SdimUTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str) { 745362181Sdim utf8proc_uint8_t *retval; 746289177Speter utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | 747289177Speter UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT); 748289177Speter return retval; 749289177Speter} 750289177Speter 751362181SdimUTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str) { 752362181Sdim utf8proc_uint8_t *retval; 753289177Speter utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | 754289177Speter UTF8PROC_COMPOSE | UTF8PROC_COMPAT); 755289177Speter return retval; 756289177Speter} 757