1362181Sdim/* -*- mode: c; c-basic-offset: 2; tab-width: 2; indent-tabs-mode: nil -*- */
2289177Speter/*
3362181Sdim *  Copyright (c) 2015 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors.
4289177Speter *  Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
5289177Speter *
6289177Speter *  Permission is hereby granted, free of charge, to any person obtaining a
7289177Speter *  copy of this software and associated documentation files (the "Software"),
8289177Speter *  to deal in the Software without restriction, including without limitation
9289177Speter *  the rights to use, copy, modify, merge, publish, distribute, sublicense,
10289177Speter *  and/or sell copies of the Software, and to permit persons to whom the
11289177Speter *  Software is furnished to do so, subject to the following conditions:
12289177Speter *
13289177Speter *  The above copyright notice and this permission notice shall be included in
14289177Speter *  all copies or substantial portions of the Software.
15289177Speter *
16289177Speter *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17289177Speter *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18289177Speter *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19289177Speter *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20289177Speter *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21289177Speter *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22289177Speter *  DEALINGS IN THE SOFTWARE.
23289177Speter */
24289177Speter
25289177Speter/*
26289177Speter *  This library contains derived data from a modified version of the
27289177Speter *  Unicode data files.
28289177Speter *
29289177Speter *  The original data files are available at
30289177Speter *  http://www.unicode.org/Public/UNIDATA/
31289177Speter *
32289177Speter *  Please notice the copyright statement in the file "utf8proc_data.c".
33289177Speter */
34289177Speter
35289177Speter
36289177Speter/*
37289177Speter *  File name:    utf8proc.c
38289177Speter *
39289177Speter *  Description:
40289177Speter *  Implementation of libutf8proc.
41289177Speter */
42289177Speter
43289177Speter
44362181Sdim#include "utf8proc_internal.h"
45289177Speter#include "utf8proc_data.c"
46289177Speter
47289177Speter
48362181SdimUTF8PROC_DLLEXPORT const utf8proc_int8_t utf8proc_utf8class[256] = {
49289177Speter  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
50289177Speter  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
51289177Speter  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
52289177Speter  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
53289177Speter  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
54289177Speter  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
55289177Speter  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
56289177Speter  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
57289177Speter  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
58289177Speter  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
59289177Speter  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
60289177Speter  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
61289177Speter  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
62289177Speter  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
63289177Speter  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
64289177Speter  4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 };
65289177Speter
/* Parameters for the arithmetic (de)composition of Hangul syllables.
 *
 * xBASE  - first code point of the precomposed syllables (S) and of the
 *          leading (L), vowel (V) and trailing (T) jamo ranges.
 *          TBASE is one below UTF8PROC_HANGUL_T_START, so a trailing index
 *          of 0 stands for "no trailing jamo".
 * xCOUNT - number of L, V and T values; NCOUNT = VCOUNT * TCOUNT is the
 *          number of syllables per leading jamo and SCOUNT = LCOUNT * NCOUNT
 *          the total number of precomposed syllables. */
#define UTF8PROC_HANGUL_SBASE  0xAC00
#define UTF8PROC_HANGUL_LBASE  0x1100
#define UTF8PROC_HANGUL_VBASE  0x1161
#define UTF8PROC_HANGUL_TBASE  0x11A7
#define UTF8PROC_HANGUL_LCOUNT 19
#define UTF8PROC_HANGUL_VCOUNT 21
#define UTF8PROC_HANGUL_TCOUNT 28
#define UTF8PROC_HANGUL_NCOUNT (UTF8PROC_HANGUL_VCOUNT * UTF8PROC_HANGUL_TCOUNT) /* 588 */
#define UTF8PROC_HANGUL_SCOUNT (UTF8PROC_HANGUL_LCOUNT * UTF8PROC_HANGUL_NCOUNT) /* 11172 */

/* Code point ranges of the Hangul classes; each *_END value is exclusive. */
#define UTF8PROC_HANGUL_L_START  0x1100
#define UTF8PROC_HANGUL_L_END    0x115A
#define UTF8PROC_HANGUL_L_FILLER 0x115F
#define UTF8PROC_HANGUL_V_START  0x1160
#define UTF8PROC_HANGUL_V_END    0x11A3
#define UTF8PROC_HANGUL_T_START  0x11A8
#define UTF8PROC_HANGUL_T_END    0x11FA
#define UTF8PROC_HANGUL_S_START  0xAC00
#define UTF8PROC_HANGUL_S_END    0xD7A4
85289177Speter
86362181Sdim/* Should follow semantic-versioning rules (semver.org) based on API
87362181Sdim   compatibility.  (Note that the shared-library version number will
88362181Sdim   be different, being based on ABI compatibility.): */
89362181Sdim#define STRINGIZEx(x) #x
90362181Sdim#define STRINGIZE(x) STRINGIZEx(x)
91362181SdimUTF8PROC_DLLEXPORT const char *utf8proc_version(void) {
92362181Sdim  return STRINGIZE(UTF8PROC_VERSION_MAJOR) "." STRINGIZE(UTF8PROC_VERSION_MINOR) "." STRINGIZE(UTF8PROC_VERSION_PATCH) "";
93289177Speter}
94289177Speter
95362181SdimUTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
96289177Speter  switch (errcode) {
97289177Speter    case UTF8PROC_ERROR_NOMEM:
98362181Sdim    return "Memory for processing UTF-8 data could not be allocated.";
99289177Speter    case UTF8PROC_ERROR_OVERFLOW:
100362181Sdim    return "UTF-8 string is too long to be processed.";
101289177Speter    case UTF8PROC_ERROR_INVALIDUTF8:
102362181Sdim    return "Invalid UTF-8 string";
103289177Speter    case UTF8PROC_ERROR_NOTASSIGNED:
104362181Sdim    return "Unassigned Unicode code point found in UTF-8 string.";
105289177Speter    case UTF8PROC_ERROR_INVALIDOPTS:
106362181Sdim    return "Invalid options for UTF-8 processing chosen.";
107289177Speter    default:
108362181Sdim    return "An unknown error occurred while processing UTF-8 data.";
109289177Speter  }
110289177Speter}
111289177Speter
112362181Sdim#define utf_cont(ch)  (((ch) & 0xc0) == 0x80)
113362181SdimUTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
114362181Sdim  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *dst
115289177Speter) {
116362181Sdim  utf8proc_uint32_t uc;
117362181Sdim  const utf8proc_uint8_t *end;
118362181Sdim
119289177Speter  *dst = -1;
120289177Speter  if (!strlen) return 0;
121362181Sdim  end = str + ((strlen < 0) ? 4 : strlen);
122362181Sdim  uc = *str++;
123362181Sdim  if (uc < 0x80) {
124362181Sdim    *dst = uc;
125362181Sdim    return 1;
126289177Speter  }
127362181Sdim  /* Must be between 0xc2 and 0xf4 inclusive to be valid */
128362181Sdim  if ((uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8;
129362181Sdim  if (uc < 0xe0) {         /* 2-byte sequence */
130362181Sdim     /* Must have valid continuation character */
131362181Sdim     if (str >= end || !utf_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8;
132362181Sdim     *dst = ((uc & 0x1f)<<6) | (*str & 0x3f);
133362181Sdim     return 2;
134289177Speter  }
135362181Sdim  if (uc < 0xf0) {        /* 3-byte sequence */
136362181Sdim     if ((str + 1 >= end) || !utf_cont(*str) || !utf_cont(str[1]))
137362181Sdim        return UTF8PROC_ERROR_INVALIDUTF8;
138362181Sdim     /* Check for surrogate chars */
139362181Sdim     if (uc == 0xed && *str > 0x9f)
140362181Sdim         return UTF8PROC_ERROR_INVALIDUTF8;
141362181Sdim     uc = ((uc & 0xf)<<12) | ((*str & 0x3f)<<6) | (str[1] & 0x3f);
142362181Sdim     if (uc < 0x800)
143362181Sdim         return UTF8PROC_ERROR_INVALIDUTF8;
144362181Sdim     *dst = uc;
145362181Sdim     return 3;
146362181Sdim  }
147362181Sdim  /* 4-byte sequence
148362181Sdim     Must have 3 valid continuation characters */
149362181Sdim  if ((str + 2 >= end) || !utf_cont(*str) || !utf_cont(str[1]) || !utf_cont(str[2]))
150362181Sdim     return UTF8PROC_ERROR_INVALIDUTF8;
151362181Sdim  /* Make sure in correct range (0x10000 - 0x10ffff) */
152362181Sdim  if (uc == 0xf0) {
153362181Sdim    if (*str < 0x90) return UTF8PROC_ERROR_INVALIDUTF8;
154362181Sdim  } else if (uc == 0xf4) {
155362181Sdim    if (*str > 0x8f) return UTF8PROC_ERROR_INVALIDUTF8;
156362181Sdim  }
157362181Sdim  *dst = ((uc & 7)<<18) | ((*str & 0x3f)<<12) | ((str[1] & 0x3f)<<6) | (str[2] & 0x3f);
158362181Sdim  return 4;
159289177Speter}
160289177Speter
161362181SdimUTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t uc) {
162362181Sdim    return (((utf8proc_uint32_t)uc)-0xd800 > 0x07ff) && ((utf8proc_uint32_t)uc < 0x110000);
163289177Speter}
164289177Speter
165362181SdimUTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
166289177Speter  if (uc < 0x00) {
167289177Speter    return 0;
168289177Speter  } else if (uc < 0x80) {
169362181Sdim    dst[0] = (utf8proc_uint8_t) uc;
170289177Speter    return 1;
171289177Speter  } else if (uc < 0x800) {
172362181Sdim    dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6));
173362181Sdim    dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
174289177Speter    return 2;
175362181Sdim  /* Note: we allow encoding 0xd800-0xdfff here, so as not to change
176362181Sdim     the API, however, these are actually invalid in UTF-8 */
177289177Speter  } else if (uc < 0x10000) {
178362181Sdim    dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12));
179362181Sdim    dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
180362181Sdim    dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
181289177Speter    return 3;
182289177Speter  } else if (uc < 0x110000) {
183362181Sdim    dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18));
184362181Sdim    dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F));
185362181Sdim    dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
186362181Sdim    dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
187289177Speter    return 4;
188289177Speter  } else return 0;
189289177Speter}
190289177Speter
191362181Sdim/* internal "unsafe" version that does not check whether uc is in range */
192362181Sdimstatic utf8proc_ssize_t unsafe_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
193362181Sdim   if (uc < 0x00) {
194362181Sdim      return 0;
195362181Sdim   } else if (uc < 0x80) {
196362181Sdim      dst[0] = (utf8proc_uint8_t)uc;
197362181Sdim      return 1;
198362181Sdim   } else if (uc < 0x800) {
199362181Sdim      dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6));
200362181Sdim      dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
201362181Sdim      return 2;
202362181Sdim   } else if (uc == 0xFFFF) {
203362181Sdim       dst[0] = (utf8proc_uint8_t)0xFF;
204362181Sdim       return 1;
205362181Sdim   } else if (uc == 0xFFFE) {
206362181Sdim       dst[0] = (utf8proc_uint8_t)0xFE;
207362181Sdim       return 1;
208362181Sdim   } else if (uc < 0x10000) {
209362181Sdim      dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12));
210362181Sdim      dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
211362181Sdim      dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
212362181Sdim      return 3;
213362181Sdim   } else if (uc < 0x110000) {
214362181Sdim      dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18));
215362181Sdim      dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F));
216362181Sdim      dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
217362181Sdim      dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
218362181Sdim      return 4;
219362181Sdim   } else return 0;
220362181Sdim}
221362181Sdim
222362181Sdim/* internal "unsafe" version that does not check whether uc is in range */
223362181Sdimstatic const utf8proc_property_t *unsafe_get_property(utf8proc_int32_t uc) {
224289177Speter  /* ASSERT: uc >= 0 && uc < 0x110000 */
225289177Speter  return utf8proc_properties + (
226289177Speter    utf8proc_stage2table[
227289177Speter      utf8proc_stage1table[uc >> 8] + (uc & 0xFF)
228289177Speter    ]
229289177Speter  );
230289177Speter}
231289177Speter
232362181SdimUTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int32_t uc) {
233362181Sdim  return uc < 0 || uc >= 0x110000 ? utf8proc_properties : unsafe_get_property(uc);
234362181Sdim}
235362181Sdim
236362181Sdim/* return whether there is a grapheme break between boundclasses lbc and tbc
237362181Sdim   (according to the definition of extended grapheme clusters)
238362181Sdim
239362181Sdim  Rule numbering refers to TR29 Version 29 (Unicode 9.0.0):
240362181Sdim  http://www.unicode.org/reports/tr29/tr29-29.html
241362181Sdim
242362181Sdim  CAVEATS:
243362181Sdim   Please note that evaluation of GB10 (grapheme breaks between emoji zwj sequences)
244362181Sdim   and GB 12/13 (regional indicator code points) require knowledge of previous characters
245362181Sdim   and are thus not handled by this function. This may result in an incorrect break before
246362181Sdim   an E_Modifier class codepoint and an incorrectly missing break between two
247362181Sdim   REGIONAL_INDICATOR class code points if such support does not exist in the caller.
248362181Sdim
249362181Sdim   See the special support in grapheme_break_extended, for required bookkeeping by the caller.
250362181Sdim*/
251362181Sdimstatic utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
252362181Sdim  return
253362181Sdim    (lbc == UTF8PROC_BOUNDCLASS_START) ? true :       /* GB1 */
254362181Sdim    (lbc == UTF8PROC_BOUNDCLASS_CR &&                 /* GB3 */
255362181Sdim     tbc == UTF8PROC_BOUNDCLASS_LF) ? false :         /* --- */
256362181Sdim    (lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :  /* GB4 */
257362181Sdim    (tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :  /* GB5 */
258362181Sdim    (lbc == UTF8PROC_BOUNDCLASS_L &&                  /* GB6 */
259362181Sdim     (tbc == UTF8PROC_BOUNDCLASS_L ||                 /* --- */
260362181Sdim      tbc == UTF8PROC_BOUNDCLASS_V ||                 /* --- */
261362181Sdim      tbc == UTF8PROC_BOUNDCLASS_LV ||                /* --- */
262362181Sdim      tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :      /* --- */
263362181Sdim    ((lbc == UTF8PROC_BOUNDCLASS_LV ||                /* GB7 */
264362181Sdim      lbc == UTF8PROC_BOUNDCLASS_V) &&                /* --- */
265362181Sdim     (tbc == UTF8PROC_BOUNDCLASS_V ||                 /* --- */
266362181Sdim      tbc == UTF8PROC_BOUNDCLASS_T)) ? false :        /* --- */
267362181Sdim    ((lbc == UTF8PROC_BOUNDCLASS_LVT ||               /* GB8 */
268362181Sdim      lbc == UTF8PROC_BOUNDCLASS_T) &&                /* --- */
269362181Sdim     tbc == UTF8PROC_BOUNDCLASS_T) ? false :          /* --- */
270362181Sdim    (tbc == UTF8PROC_BOUNDCLASS_EXTEND ||             /* GB9 */
271362181Sdim     tbc == UTF8PROC_BOUNDCLASS_ZWJ ||                /* --- */
272362181Sdim     tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK ||        /* GB9a */
273362181Sdim     lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false :    /* GB9b */
274362181Sdim    ((lbc == UTF8PROC_BOUNDCLASS_E_BASE ||            /* GB10 (requires additional handling below) */
275362181Sdim      lbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) &&       /* ---- */
276362181Sdim     tbc == UTF8PROC_BOUNDCLASS_E_MODIFIER) ? false : /* ---- */
277362181Sdim    (lbc == UTF8PROC_BOUNDCLASS_ZWJ &&                         /* GB11 */
278362181Sdim     (tbc == UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ ||             /* ---- */
279362181Sdim      tbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ)) ? false :        /* ---- */
280362181Sdim    (lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR &&          /* GB12/13 (requires additional handling below) */
281362181Sdim     tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false :  /* ---- */
282362181Sdim    true; /* GB999 */
283362181Sdim}
284362181Sdim
285362181Sdimstatic utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state)
286362181Sdim{
287362181Sdim  utf8proc_bool break_permitted;
288362181Sdim  int lbc_override = lbc;
289362181Sdim  if (state && *state != UTF8PROC_BOUNDCLASS_START)
290362181Sdim    lbc_override = *state;
291362181Sdim  break_permitted = grapheme_break_simple(lbc_override, tbc);
292362181Sdim  if (state) {
293362181Sdim    /* Special support for GB 12/13 made possible by GB999. After two RI
294362181Sdim       class codepoints we want to force a break. Do this by resetting the
295362181Sdim       second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break
296362181Sdim       after that character according to GB999 (unless of course such a break is
297362181Sdim       forbidden by a different rule such as GB9). */
298362181Sdim    if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
299362181Sdim      *state = UTF8PROC_BOUNDCLASS_OTHER;
300362181Sdim    /* Special support for GB10. Fold any EXTEND codepoints into the previous
301362181Sdim       boundclass if we're dealing with an emoji base boundclass. */
302362181Sdim    else if ((*state == UTF8PROC_BOUNDCLASS_E_BASE      ||
303362181Sdim              *state == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) &&
304362181Sdim             tbc == UTF8PROC_BOUNDCLASS_EXTEND)
305362181Sdim      *state = UTF8PROC_BOUNDCLASS_E_BASE;
306362181Sdim    else
307362181Sdim      *state = tbc;
308362181Sdim  }
309362181Sdim  return break_permitted;
310362181Sdim}
311362181Sdim
312362181SdimUTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
313362181Sdim    utf8proc_int32_t c1, utf8proc_int32_t c2, utf8proc_int32_t *state) {
314362181Sdim
315362181Sdim  return grapheme_break_extended(utf8proc_get_property(c1)->boundclass,
316362181Sdim                                 utf8proc_get_property(c2)->boundclass,
317362181Sdim                                 state);
318362181Sdim}
319362181Sdim
320362181Sdim
321362181SdimUTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
322362181Sdim    utf8proc_int32_t c1, utf8proc_int32_t c2) {
323362181Sdim  return utf8proc_grapheme_break_stateful(c1, c2, NULL);
324362181Sdim}
325362181Sdim
326362181Sdimstatic utf8proc_int32_t seqindex_decode_entry(const utf8proc_uint16_t **entry)
327362181Sdim{
328362181Sdim  utf8proc_int32_t entry_cp = **entry;
329362181Sdim  if ((entry_cp & 0xF800) == 0xD800) {
330362181Sdim    *entry = *entry + 1;
331362181Sdim    entry_cp = ((entry_cp & 0x03FF) << 10) | (**entry & 0x03FF);
332362181Sdim    entry_cp += 0x10000;
333362181Sdim  }
334362181Sdim  return entry_cp;
335362181Sdim}
336362181Sdim
337362181Sdimstatic utf8proc_int32_t seqindex_decode_index(const utf8proc_uint32_t seqindex)
338362181Sdim{
339362181Sdim  const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex];
340362181Sdim  return seqindex_decode_entry(&entry);
341362181Sdim}
342362181Sdim
343362181Sdimstatic utf8proc_ssize_t seqindex_write_char_decomposed(utf8proc_uint16_t seqindex, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
344362181Sdim  utf8proc_ssize_t written = 0;
345362181Sdim  const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex & 0x1FFF];
346362181Sdim  int len = seqindex >> 13;
347362181Sdim  if (len >= 7) {
348362181Sdim    len = *entry;
349362181Sdim    entry++;
350362181Sdim  }
351362181Sdim  for (; len >= 0; entry++, len--) {
352362181Sdim    utf8proc_int32_t entry_cp = seqindex_decode_entry(&entry);
353362181Sdim
354362181Sdim    written += utf8proc_decompose_char(entry_cp, dst+written,
355362181Sdim      (bufsize > written) ? (bufsize - written) : 0, options,
356362181Sdim    last_boundclass);
357362181Sdim    if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
358362181Sdim  }
359362181Sdim  return written;
360362181Sdim}
361362181Sdim
362362181SdimUTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
363362181Sdim{
364362181Sdim  utf8proc_int32_t cl = utf8proc_get_property(c)->lowercase_seqindex;
365362181Sdim  return cl != UINT16_MAX ? seqindex_decode_index(cl) : c;
366362181Sdim}
367362181Sdim
368362181SdimUTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c)
369362181Sdim{
370362181Sdim  utf8proc_int32_t cu = utf8proc_get_property(c)->uppercase_seqindex;
371362181Sdim  return cu != UINT16_MAX ? seqindex_decode_index(cu) : c;
372362181Sdim}
373362181Sdim
374362181SdimUTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c)
375362181Sdim{
376362181Sdim  utf8proc_int32_t cu = utf8proc_get_property(c)->titlecase_seqindex;
377362181Sdim  return cu != UINT16_MAX ? seqindex_decode_index(cu) : c;
378362181Sdim}
379362181Sdim
380362181Sdim/* return a character width analogous to wcwidth (except portable and
381362181Sdim   hopefully less buggy than most system wcwidth functions). */
382362181SdimUTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) {
383362181Sdim  return utf8proc_get_property(c)->charwidth;
384362181Sdim}
385362181Sdim
386362181SdimUTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t c) {
387362181Sdim  return utf8proc_get_property(c)->category;
388362181Sdim}
389362181Sdim
390362181SdimUTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) {
391362181Sdim  static const char s[][3] = {"Cn","Lu","Ll","Lt","Lm","Lo","Mn","Mc","Me","Nd","Nl","No","Pc","Pd","Ps","Pe","Pi","Pf","Po","Sm","Sc","Sk","So","Zs","Zl","Zp","Cc","Cf","Cs","Co"};
392362181Sdim  return s[utf8proc_category(c)];
393362181Sdim}
394362181Sdim
395289177Speter#define utf8proc_decompose_lump(replacement_uc) \
396289177Speter  return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
397289177Speter  options & ~UTF8PROC_LUMP, last_boundclass)
398289177Speter
399362181SdimUTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
400289177Speter  const utf8proc_property_t *property;
401289177Speter  utf8proc_propval_t category;
402362181Sdim  utf8proc_int32_t hangul_sindex;
403362181Sdim  if (uc < 0 || uc >= 0x110000) return UTF8PROC_ERROR_NOTASSIGNED;
404362181Sdim  property = unsafe_get_property(uc);
405289177Speter  category = property->category;
406289177Speter  hangul_sindex = uc - UTF8PROC_HANGUL_SBASE;
407289177Speter  if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
408289177Speter    if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) {
409362181Sdim      utf8proc_int32_t hangul_tindex;
410289177Speter      if (bufsize >= 1) {
411289177Speter        dst[0] = UTF8PROC_HANGUL_LBASE +
412289177Speter          hangul_sindex / UTF8PROC_HANGUL_NCOUNT;
413289177Speter        if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE +
414289177Speter          (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT;
415289177Speter      }
416289177Speter      hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT;
417289177Speter      if (!hangul_tindex) return 2;
418289177Speter      if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex;
419289177Speter      return 3;
420289177Speter    }
421289177Speter  }
422289177Speter  if (options & UTF8PROC_REJECTNA) {
423289177Speter    if (!category) return UTF8PROC_ERROR_NOTASSIGNED;
424289177Speter  }
425289177Speter  if (options & UTF8PROC_IGNORE) {
426289177Speter    if (property->ignorable) return 0;
427289177Speter  }
428289177Speter  if (options & UTF8PROC_LUMP) {
429289177Speter    if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020);
430289177Speter    if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8)
431289177Speter      utf8proc_decompose_lump(0x0027);
432289177Speter    if (category == UTF8PROC_CATEGORY_PD || uc == 0x2212)
433289177Speter      utf8proc_decompose_lump(0x002D);
434289177Speter    if (uc == 0x2044 || uc == 0x2215) utf8proc_decompose_lump(0x002F);
435289177Speter    if (uc == 0x2236) utf8proc_decompose_lump(0x003A);
436289177Speter    if (uc == 0x2039 || uc == 0x2329 || uc == 0x3008)
437289177Speter      utf8proc_decompose_lump(0x003C);
438289177Speter    if (uc == 0x203A || uc == 0x232A || uc == 0x3009)
439289177Speter      utf8proc_decompose_lump(0x003E);
440289177Speter    if (uc == 0x2216) utf8proc_decompose_lump(0x005C);
441289177Speter    if (uc == 0x02C4 || uc == 0x02C6 || uc == 0x2038 || uc == 0x2303)
442289177Speter      utf8proc_decompose_lump(0x005E);
443289177Speter    if (category == UTF8PROC_CATEGORY_PC || uc == 0x02CD)
444289177Speter      utf8proc_decompose_lump(0x005F);
445289177Speter    if (uc == 0x02CB) utf8proc_decompose_lump(0x0060);
446289177Speter    if (uc == 0x2223) utf8proc_decompose_lump(0x007C);
447289177Speter    if (uc == 0x223C) utf8proc_decompose_lump(0x007E);
448289177Speter    if ((options & UTF8PROC_NLF2LS) && (options & UTF8PROC_NLF2PS)) {
449289177Speter      if (category == UTF8PROC_CATEGORY_ZL ||
450289177Speter          category == UTF8PROC_CATEGORY_ZP)
451289177Speter        utf8proc_decompose_lump(0x000A);
452289177Speter    }
453289177Speter  }
454289177Speter  if (options & UTF8PROC_STRIPMARK) {
455289177Speter    if (category == UTF8PROC_CATEGORY_MN ||
456289177Speter      category == UTF8PROC_CATEGORY_MC ||
457289177Speter      category == UTF8PROC_CATEGORY_ME) return 0;
458289177Speter  }
459289177Speter  if (options & UTF8PROC_CASEFOLD) {
460362181Sdim    if (property->casefold_seqindex != UINT16_MAX) {
461362181Sdim      return seqindex_write_char_decomposed(property->casefold_seqindex, dst, bufsize, options, last_boundclass);
462289177Speter    }
463289177Speter  }
464289177Speter  if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
465362181Sdim    if (property->decomp_seqindex != UINT16_MAX &&
466289177Speter        (!property->decomp_type || (options & UTF8PROC_COMPAT))) {
467362181Sdim      return seqindex_write_char_decomposed(property->decomp_seqindex, dst, bufsize, options, last_boundclass);
468289177Speter    }
469289177Speter  }
470289177Speter  if (options & UTF8PROC_CHARBOUND) {
471362181Sdim    utf8proc_bool boundary;
472362181Sdim    int tbc = property->boundclass;
473362181Sdim    boundary = grapheme_break_extended(*last_boundclass, tbc, last_boundclass);
474289177Speter    if (boundary) {
475289177Speter      if (bufsize >= 1) dst[0] = 0xFFFF;
476289177Speter      if (bufsize >= 2) dst[1] = uc;
477289177Speter      return 2;
478289177Speter    }
479289177Speter  }
480289177Speter  if (bufsize >= 1) *dst = uc;
481289177Speter  return 1;
482289177Speter}
483289177Speter
484362181SdimUTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
485362181Sdim  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
486362181Sdim  utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
487289177Speter) {
488362181Sdim    return utf8proc_decompose_custom(str, strlen, buffer, bufsize, options, NULL, NULL);
489362181Sdim}
490362181Sdim
491362181SdimUTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
492362181Sdim  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
493362181Sdim  utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options,
494362181Sdim  utf8proc_custom_func custom_func, void *custom_data
495362181Sdim) {
496289177Speter  /* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */
497362181Sdim  utf8proc_ssize_t wpos = 0;
498289177Speter  if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE))
499289177Speter    return UTF8PROC_ERROR_INVALIDOPTS;
500289177Speter  if ((options & UTF8PROC_STRIPMARK) &&
501289177Speter      !(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE))
502289177Speter    return UTF8PROC_ERROR_INVALIDOPTS;
503289177Speter  {
504362181Sdim    utf8proc_int32_t uc;
505362181Sdim    utf8proc_ssize_t rpos = 0;
506362181Sdim    utf8proc_ssize_t decomp_result;
507289177Speter    int boundclass = UTF8PROC_BOUNDCLASS_START;
508289177Speter    while (1) {
509289177Speter      if (options & UTF8PROC_NULLTERM) {
510289177Speter        rpos += utf8proc_iterate(str + rpos, -1, &uc);
511362181Sdim        /* checking of return value is not necessary,
512289177Speter           as 'uc' is < 0 in case of error */
513289177Speter        if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
514289177Speter        if (rpos < 0) return UTF8PROC_ERROR_OVERFLOW;
515289177Speter        if (uc == 0) break;
516289177Speter      } else {
517289177Speter        if (rpos >= strlen) break;
518289177Speter        rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc);
519289177Speter        if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
520289177Speter      }
521362181Sdim      if (custom_func != NULL) {
522362181Sdim        uc = custom_func(uc, custom_data);   /* user-specified custom mapping */
523362181Sdim      }
524289177Speter      decomp_result = utf8proc_decompose_char(
525289177Speter        uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options,
526289177Speter        &boundclass
527289177Speter      );
528289177Speter      if (decomp_result < 0) return decomp_result;
529289177Speter      wpos += decomp_result;
530289177Speter      /* prohibiting integer overflows due to too long strings: */
531362181Sdim      if (wpos < 0 ||
532362181Sdim          wpos > (utf8proc_ssize_t)(SSIZE_MAX/sizeof(utf8proc_int32_t)/2))
533289177Speter        return UTF8PROC_ERROR_OVERFLOW;
534289177Speter    }
535289177Speter  }
536289177Speter  if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) {
537362181Sdim    utf8proc_ssize_t pos = 0;
538289177Speter    while (pos < wpos-1) {
539362181Sdim      utf8proc_int32_t uc1, uc2;
540289177Speter      const utf8proc_property_t *property1, *property2;
541289177Speter      uc1 = buffer[pos];
542289177Speter      uc2 = buffer[pos+1];
543362181Sdim      property1 = unsafe_get_property(uc1);
544362181Sdim      property2 = unsafe_get_property(uc2);
545289177Speter      if (property1->combining_class > property2->combining_class &&
546289177Speter          property2->combining_class > 0) {
547289177Speter        buffer[pos] = uc2;
548289177Speter        buffer[pos+1] = uc1;
549289177Speter        if (pos > 0) pos--; else pos++;
550289177Speter      } else {
551289177Speter        pos++;
552289177Speter      }
553289177Speter    }
554289177Speter  }
555289177Speter  return wpos;
556289177Speter}
557289177Speter
558362181SdimUTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
559362181Sdim  /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored */
560289177Speter  if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
561362181Sdim    utf8proc_ssize_t rpos;
562362181Sdim    utf8proc_ssize_t wpos = 0;
563362181Sdim    utf8proc_int32_t uc;
564289177Speter    for (rpos = 0; rpos < length; rpos++) {
565289177Speter      uc = buffer[rpos];
566289177Speter      if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++;
567289177Speter      if (uc == 0x000A || uc == 0x000D || uc == 0x0085 ||
568289177Speter          ((options & UTF8PROC_STRIPCC) && (uc == 0x000B || uc == 0x000C))) {
569289177Speter        if (options & UTF8PROC_NLF2LS) {
570289177Speter          if (options & UTF8PROC_NLF2PS) {
571289177Speter            buffer[wpos++] = 0x000A;
572289177Speter          } else {
573289177Speter            buffer[wpos++] = 0x2028;
574289177Speter          }
575289177Speter        } else {
576289177Speter          if (options & UTF8PROC_NLF2PS) {
577289177Speter            buffer[wpos++] = 0x2029;
578289177Speter          } else {
579289177Speter            buffer[wpos++] = 0x0020;
580289177Speter          }
581289177Speter        }
582289177Speter      } else if ((options & UTF8PROC_STRIPCC) &&
583289177Speter          (uc < 0x0020 || (uc >= 0x007F && uc < 0x00A0))) {
584289177Speter        if (uc == 0x0009) buffer[wpos++] = 0x0020;
585289177Speter      } else {
586289177Speter        buffer[wpos++] = uc;
587289177Speter      }
588289177Speter    }
589289177Speter    length = wpos;
590289177Speter  }
591289177Speter  if (options & UTF8PROC_COMPOSE) {
592362181Sdim    utf8proc_int32_t *starter = NULL;
593362181Sdim    utf8proc_int32_t current_char;
594289177Speter    const utf8proc_property_t *starter_property = NULL, *current_property;
595289177Speter    utf8proc_propval_t max_combining_class = -1;
596362181Sdim    utf8proc_ssize_t rpos;
597362181Sdim    utf8proc_ssize_t wpos = 0;
598362181Sdim    utf8proc_int32_t composition;
599289177Speter    for (rpos = 0; rpos < length; rpos++) {
600289177Speter      current_char = buffer[rpos];
601362181Sdim      current_property = unsafe_get_property(current_char);
602289177Speter      if (starter && current_property->combining_class > max_combining_class) {
603289177Speter        /* combination perhaps possible */
604362181Sdim        utf8proc_int32_t hangul_lindex;
605362181Sdim        utf8proc_int32_t hangul_sindex;
606289177Speter        hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE;
607289177Speter        if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) {
608362181Sdim          utf8proc_int32_t hangul_vindex;
609289177Speter          hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE;
610289177Speter          if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) {
611289177Speter            *starter = UTF8PROC_HANGUL_SBASE +
612289177Speter              (hangul_lindex * UTF8PROC_HANGUL_VCOUNT + hangul_vindex) *
613289177Speter              UTF8PROC_HANGUL_TCOUNT;
614289177Speter            starter_property = NULL;
615289177Speter            continue;
616289177Speter          }
617289177Speter        }
618289177Speter        hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE;
619289177Speter        if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT &&
620289177Speter            (hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) {
621362181Sdim          utf8proc_int32_t hangul_tindex;
622289177Speter          hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE;
623289177Speter          if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) {
624289177Speter            *starter += hangul_tindex;
625289177Speter            starter_property = NULL;
626289177Speter            continue;
627289177Speter          }
628289177Speter        }
629289177Speter        if (!starter_property) {
630362181Sdim          starter_property = unsafe_get_property(*starter);
631289177Speter        }
632362181Sdim        if (starter_property->comb_index < 0x8000 &&
633362181Sdim            current_property->comb_index != UINT16_MAX &&
634362181Sdim            current_property->comb_index >= 0x8000) {
635362181Sdim          int sidx = starter_property->comb_index;
636362181Sdim          int idx = (current_property->comb_index & 0x3FFF) - utf8proc_combinations[sidx];
637362181Sdim          if (idx >= 0 && idx <= utf8proc_combinations[sidx + 1] ) {
638362181Sdim            idx += sidx + 2;
639362181Sdim            if (current_property->comb_index & 0x4000) {
640362181Sdim              composition = (utf8proc_combinations[idx] << 16) | utf8proc_combinations[idx+1];
641362181Sdim            } else
642362181Sdim              composition = utf8proc_combinations[idx];
643362181Sdim
644362181Sdim            if (composition > 0 && (!(options & UTF8PROC_STABLE) ||
645362181Sdim                !(unsafe_get_property(composition)->comp_exclusion))) {
646362181Sdim              *starter = composition;
647362181Sdim              starter_property = NULL;
648362181Sdim              continue;
649362181Sdim            }
650289177Speter          }
651289177Speter        }
652289177Speter      }
653289177Speter      buffer[wpos] = current_char;
654289177Speter      if (current_property->combining_class) {
655289177Speter        if (current_property->combining_class > max_combining_class) {
656289177Speter          max_combining_class = current_property->combining_class;
657289177Speter        }
658289177Speter      } else {
659289177Speter        starter = buffer + wpos;
660289177Speter        starter_property = NULL;
661289177Speter        max_combining_class = -1;
662289177Speter      }
663289177Speter      wpos++;
664289177Speter    }
665289177Speter    length = wpos;
666289177Speter  }
667362181Sdim  return length;
668362181Sdim}
669362181Sdim
670362181SdimUTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
671362181Sdim  /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
672362181Sdim     ASSERT: 'buffer' has one spare byte of free space at the end! */
673362181Sdim  length = utf8proc_normalize_utf32(buffer, length, options);
674362181Sdim  if (length < 0) return length;
675289177Speter  {
676362181Sdim    utf8proc_ssize_t rpos, wpos = 0;
677362181Sdim    utf8proc_int32_t uc;
678362181Sdim    if (options & UTF8PROC_CHARBOUND) {
679362181Sdim        for (rpos = 0; rpos < length; rpos++) {
680362181Sdim            uc = buffer[rpos];
681362181Sdim            wpos += unsafe_encode_char(uc, ((utf8proc_uint8_t *)buffer) + wpos);
682362181Sdim        }
683362181Sdim    } else {
684362181Sdim        for (rpos = 0; rpos < length; rpos++) {
685362181Sdim            uc = buffer[rpos];
686362181Sdim            wpos += utf8proc_encode_char(uc, ((utf8proc_uint8_t *)buffer) + wpos);
687362181Sdim        }
688289177Speter    }
689362181Sdim    ((utf8proc_uint8_t *)buffer)[wpos] = 0;
690289177Speter    return wpos;
691289177Speter  }
692289177Speter}
693289177Speter
694362181SdimUTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
695362181Sdim  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
696289177Speter) {
697362181Sdim    return utf8proc_map_custom(str, strlen, dstptr, options, NULL, NULL);
698362181Sdim}
699362181Sdim
700362181SdimUTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
701362181Sdim  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options,
702362181Sdim  utf8proc_custom_func custom_func, void *custom_data
703362181Sdim) {
704362181Sdim  utf8proc_int32_t *buffer;
705362181Sdim  utf8proc_ssize_t result;
706289177Speter  *dstptr = NULL;
707362181Sdim  result = utf8proc_decompose_custom(str, strlen, NULL, 0, options, custom_func, custom_data);
708289177Speter  if (result < 0) return result;
709362181Sdim  buffer = (utf8proc_int32_t *) malloc(result * sizeof(utf8proc_int32_t) + 1);
710289177Speter  if (!buffer) return UTF8PROC_ERROR_NOMEM;
711362181Sdim  result = utf8proc_decompose_custom(str, strlen, buffer, result, options, custom_func, custom_data);
712289177Speter  if (result < 0) {
713289177Speter    free(buffer);
714289177Speter    return result;
715289177Speter  }
716289177Speter  result = utf8proc_reencode(buffer, result, options);
717289177Speter  if (result < 0) {
718289177Speter    free(buffer);
719289177Speter    return result;
720289177Speter  }
721289177Speter  {
722362181Sdim    utf8proc_int32_t *newptr;
723362181Sdim    newptr = (utf8proc_int32_t *) realloc(buffer, (size_t)result+1);
724289177Speter    if (newptr) buffer = newptr;
725289177Speter  }
726362181Sdim  *dstptr = (utf8proc_uint8_t *)buffer;
727289177Speter  return result;
728289177Speter}
729289177Speter
730362181SdimUTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str) {
731362181Sdim  utf8proc_uint8_t *retval;
732289177Speter  utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
733289177Speter    UTF8PROC_DECOMPOSE);
734289177Speter  return retval;
735289177Speter}
736289177Speter
737362181SdimUTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str) {
738362181Sdim  utf8proc_uint8_t *retval;
739289177Speter  utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
740289177Speter    UTF8PROC_COMPOSE);
741289177Speter  return retval;
742289177Speter}
743289177Speter
744362181SdimUTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str) {
745362181Sdim  utf8proc_uint8_t *retval;
746289177Speter  utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
747289177Speter    UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
748289177Speter  return retval;
749289177Speter}
750289177Speter
751362181SdimUTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str) {
752362181Sdim  utf8proc_uint8_t *retval;
753289177Speter  utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
754289177Speter    UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
755289177Speter  return retval;
756289177Speter}
757