1289177Speter/*
2289177Speter *  Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
3289177Speter *
4289177Speter *  Permission is hereby granted, free of charge, to any person obtaining a
5289177Speter *  copy of this software and associated documentation files (the "Software"),
6289177Speter *  to deal in the Software without restriction, including without limitation
7289177Speter *  the rights to use, copy, modify, merge, publish, distribute, sublicense,
8289177Speter *  and/or sell copies of the Software, and to permit persons to whom the
9289177Speter *  Software is furnished to do so, subject to the following conditions:
10289177Speter *
11289177Speter *  The above copyright notice and this permission notice shall be included in
12289177Speter *  all copies or substantial portions of the Software.
13289177Speter *
14289177Speter *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15289177Speter *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16289177Speter *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17289177Speter *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18289177Speter *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19289177Speter *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20289177Speter *  DEALINGS IN THE SOFTWARE.
21289177Speter */
22289177Speter
23289177Speter/*
24289177Speter *  This library contains derived data from a modified version of the
25289177Speter *  Unicode data files.
26289177Speter *
27289177Speter *  The original data files are available at
28289177Speter *  http://www.unicode.org/Public/UNIDATA/
29289177Speter *
30289177Speter *  Please notice the copyright statement in the file "utf8proc_data.c".
31289177Speter */
32289177Speter
33289177Speter
34289177Speter/*
35289177Speter *  File name:    utf8proc.c
36289177Speter *
37289177Speter *  Description:
38289177Speter *  Implementation of libutf8proc.
39289177Speter */
40289177Speter
41289177Speter
42289177Speter#include "utf8proc.h"
43289177Speter#include "utf8proc_data.c"
44289177Speter
45289177Speter
46289177SpeterUTF8PROC_DATA
47289177Speterconst int8_t utf8proc_utf8class[256] = {
48289177Speter  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
49289177Speter  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
50289177Speter  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
51289177Speter  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
52289177Speter  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
53289177Speter  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
54289177Speter  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
55289177Speter  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
56289177Speter  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
57289177Speter  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
58289177Speter  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
59289177Speter  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
60289177Speter  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
61289177Speter  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
62289177Speter  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
63289177Speter  4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 };
64289177Speter
/*
 * Constants for the arithmetic Hangul syllable (de)composition:
 * first code points of the syllable (S), leading consonant (L),
 * vowel (V) and trailing consonant (T) ranges, and the number of
 * entries in each.  TBASE is one below the first trailing consonant,
 * so a T index of 0 means "no trailing consonant".
 */
#define UTF8PROC_HANGUL_SBASE 0xAC00
#define UTF8PROC_HANGUL_LBASE 0x1100
#define UTF8PROC_HANGUL_VBASE 0x1161
#define UTF8PROC_HANGUL_TBASE 0x11A7
#define UTF8PROC_HANGUL_LCOUNT 19
#define UTF8PROC_HANGUL_VCOUNT 21
#define UTF8PROC_HANGUL_TCOUNT 28
/* syllables per leading consonant: 21 * 28 = 588 */
#define UTF8PROC_HANGUL_NCOUNT \
  (UTF8PROC_HANGUL_VCOUNT * UTF8PROC_HANGUL_TCOUNT)
/* total number of precomposed syllables: 19 * 588 = 11172 */
#define UTF8PROC_HANGUL_SCOUNT \
  (UTF8PROC_HANGUL_LCOUNT * UTF8PROC_HANGUL_NCOUNT)

/*
 * Code point ranges used for grapheme boundary classification.
 * Every *_END value is exclusive.
 */
#define UTF8PROC_HANGUL_L_START  UTF8PROC_HANGUL_LBASE
#define UTF8PROC_HANGUL_L_END    0x115A
#define UTF8PROC_HANGUL_L_FILLER 0x115F
#define UTF8PROC_HANGUL_V_START  0x1160
#define UTF8PROC_HANGUL_V_END    0x11A3
#define UTF8PROC_HANGUL_T_START  0x11A8
#define UTF8PROC_HANGUL_T_END    0x11FA
#define UTF8PROC_HANGUL_S_START  UTF8PROC_HANGUL_SBASE
/* 0xAC00 + 11172 = 0xD7A4 */
#define UTF8PROC_HANGUL_S_END \
  (UTF8PROC_HANGUL_SBASE + UTF8PROC_HANGUL_SCOUNT)
84289177Speter
85289177Speter
/*
 * Classes used by the grapheme cluster boundary detection in
 * utf8proc_decompose_char().  START is the state before any
 * character has been seen.
 */
enum {
  UTF8PROC_BOUNDCLASS_START   =  0,
  UTF8PROC_BOUNDCLASS_OTHER   =  1,
  UTF8PROC_BOUNDCLASS_CR      =  2,
  UTF8PROC_BOUNDCLASS_LF      =  3,
  UTF8PROC_BOUNDCLASS_CONTROL =  4,
  UTF8PROC_BOUNDCLASS_EXTEND  =  5,
  UTF8PROC_BOUNDCLASS_L       =  6,
  UTF8PROC_BOUNDCLASS_V       =  7,
  UTF8PROC_BOUNDCLASS_T       =  8,
  UTF8PROC_BOUNDCLASS_LV      =  9,
  UTF8PROC_BOUNDCLASS_LVT     = 10
};
97289177Speter
98289177Speter
99289177SpeterUTF8PROC_API
100289177Speterconst char *utf8proc_version(void) {
101289177Speter  return "1.1.5";
102289177Speter}
103289177Speter
/*
 * N_(X) marks the compile-time constant string X as translatable
 * without looking up a translation at run time (standard GNU gettext
 * notation).  A definition supplied by the including code wins.
 */
#if !defined(N_)
#define N_(x) x
#endif
112289177Speter
113289177SpeterUTF8PROC_API
114289177Speterconst char *utf8proc_errmsg(ssize_t errcode) {
115289177Speter  switch (errcode) {
116289177Speter    case UTF8PROC_ERROR_NOMEM:
117289177Speter    return N_("Memory for processing UTF-8 data could not be allocated.");
118289177Speter    case UTF8PROC_ERROR_OVERFLOW:
119289177Speter    return N_("UTF-8 string is too long to be processed.");
120289177Speter    case UTF8PROC_ERROR_INVALIDUTF8:
121289177Speter    return N_("Invalid UTF-8 string");
122289177Speter    case UTF8PROC_ERROR_NOTASSIGNED:
123289177Speter    return N_("Unassigned Unicode code point found in UTF-8 string.");
124289177Speter    case UTF8PROC_ERROR_INVALIDOPTS:
125289177Speter    return N_("Invalid options for UTF-8 processing chosen.");
126289177Speter    default:
127289177Speter    return N_("An unknown error occured while processing UTF-8 data.");
128289177Speter  }
129289177Speter}
130289177Speter
131289177SpeterUTF8PROC_API
132289177Speterssize_t utf8proc_iterate(
133289177Speter  const uint8_t *str, ssize_t strlen, int32_t *dst
134289177Speter) {
135289177Speter  int length;
136289177Speter  int i;
137289177Speter  int32_t uc = -1;
138289177Speter  *dst = -1;
139289177Speter  if (!strlen) return 0;
140289177Speter  length = utf8proc_utf8class[str[0]];
141289177Speter  if (!length) return UTF8PROC_ERROR_INVALIDUTF8;
142289177Speter  if (strlen >= 0 && length > strlen) return UTF8PROC_ERROR_INVALIDUTF8;
143289177Speter  for (i=1; i<length; i++) {
144289177Speter    if ((str[i] & 0xC0) != 0x80) return UTF8PROC_ERROR_INVALIDUTF8;
145289177Speter  }
146289177Speter  switch (length) {
147289177Speter    case 1:
148289177Speter    uc = str[0];
149289177Speter    break;
150289177Speter    case 2:
151289177Speter    uc = ((str[0] & 0x1F) <<  6) + (str[1] & 0x3F);
152289177Speter    if (uc < 0x80) uc = -1;
153289177Speter    break;
154289177Speter    case 3:
155289177Speter    uc = ((str[0] & 0x0F) << 12) + ((str[1] & 0x3F) <<  6)
156289177Speter      + (str[2] & 0x3F);
157289177Speter    if (uc < 0x800 || (uc >= 0xD800 && uc < 0xE000) ||
158289177Speter      (uc >= 0xFDD0 && uc < 0xFDF0)) uc = -1;
159289177Speter    break;
160289177Speter    case 4:
161289177Speter    uc = ((str[0] & 0x07) << 18) + ((str[1] & 0x3F) << 12)
162289177Speter      + ((str[2] & 0x3F) <<  6) + (str[3] & 0x3F);
163289177Speter    if (uc < 0x10000 || uc >= 0x110000) uc = -1;
164289177Speter    break;
165289177Speter  }
166289177Speter  if (uc < 0 || ((uc & 0xFFFF) >= 0xFFFE))
167289177Speter    return UTF8PROC_ERROR_INVALIDUTF8;
168289177Speter  *dst = uc;
169289177Speter  return length;
170289177Speter}
171289177Speter
172289177SpeterUTF8PROC_API
173289177Speterbool utf8proc_codepoint_valid(int32_t uc) {
174289177Speter  if (uc < 0 || uc >= 0x110000 ||
175289177Speter    ((uc & 0xFFFF) >= 0xFFFE) || (uc >= 0xD800 && uc < 0xE000) ||
176289177Speter    (uc >= 0xFDD0 && uc < 0xFDF0)) return false;
177289177Speter  else return true;
178289177Speter}
179289177Speter
180289177SpeterUTF8PROC_API
181289177Speterssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst) {
182289177Speter  if (uc < 0x00) {
183289177Speter    return 0;
184289177Speter  } else if (uc < 0x80) {
185289177Speter    dst[0] = (uint8_t)uc;
186289177Speter    return 1;
187289177Speter  } else if (uc < 0x800) {
188289177Speter    dst[0] = 0xC0 + (uint8_t)(uc >> 6);
189289177Speter    dst[1] = 0x80 + (uc & 0x3F);
190289177Speter    return 2;
191289177Speter  } else if (uc == 0xFFFF) {
192289177Speter    dst[0] = 0xFF;
193289177Speter    return 1;
194289177Speter  } else if (uc == 0xFFFE) {
195289177Speter    dst[0] = 0xFE;
196289177Speter    return 1;
197289177Speter  } else if (uc < 0x10000) {
198289177Speter    dst[0] = 0xE0 + (uint8_t)(uc >> 12);
199289177Speter    dst[1] = 0x80 + ((uc >> 6) & 0x3F);
200289177Speter    dst[2] = 0x80 + (uc & 0x3F);
201289177Speter    return 3;
202289177Speter  } else if (uc < 0x110000) {
203289177Speter    dst[0] = 0xF0 + (uint8_t)(uc >> 18);
204289177Speter    dst[1] = 0x80 + ((uc >> 12) & 0x3F);
205289177Speter    dst[2] = 0x80 + ((uc >> 6) & 0x3F);
206289177Speter    dst[3] = 0x80 + (uc & 0x3F);
207289177Speter    return 4;
208289177Speter  } else return 0;
209289177Speter}
210289177Speter
211289177SpeterUTF8PROC_API
212289177Speterconst utf8proc_property_t *utf8proc_get_property(int32_t uc) {
213289177Speter  /* ASSERT: uc >= 0 && uc < 0x110000 */
214289177Speter  return utf8proc_properties + (
215289177Speter    utf8proc_stage2table[
216289177Speter      utf8proc_stage1table[uc >> 8] + (uc & 0xFF)
217289177Speter    ]
218289177Speter  );
219289177Speter}
220289177Speter
221289177Speter#define utf8proc_decompose_lump(replacement_uc) \
222289177Speter  return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
223289177Speter  options & ~UTF8PROC_LUMP, last_boundclass)
224289177Speter
225289177SpeterUTF8PROC_API
226289177Speterssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize,
227289177Speter    int options, int *last_boundclass) {
228289177Speter  /* ASSERT: uc >= 0 && uc < 0x110000 */
229289177Speter  const utf8proc_property_t *property;
230289177Speter  utf8proc_propval_t category;
231289177Speter  int32_t hangul_sindex;
232289177Speter  property = utf8proc_get_property(uc);
233289177Speter  category = property->category;
234289177Speter  hangul_sindex = uc - UTF8PROC_HANGUL_SBASE;
235289177Speter  if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
236289177Speter    if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) {
237289177Speter      int32_t hangul_tindex;
238289177Speter      if (bufsize >= 1) {
239289177Speter        dst[0] = UTF8PROC_HANGUL_LBASE +
240289177Speter          hangul_sindex / UTF8PROC_HANGUL_NCOUNT;
241289177Speter        if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE +
242289177Speter          (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT;
243289177Speter      }
244289177Speter      hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT;
245289177Speter      if (!hangul_tindex) return 2;
246289177Speter      if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex;
247289177Speter      return 3;
248289177Speter    }
249289177Speter  }
250289177Speter  if (options & UTF8PROC_REJECTNA) {
251289177Speter    if (!category) return UTF8PROC_ERROR_NOTASSIGNED;
252289177Speter  }
253289177Speter  if (options & UTF8PROC_IGNORE) {
254289177Speter    if (property->ignorable) return 0;
255289177Speter  }
256289177Speter  if (options & UTF8PROC_LUMP) {
257289177Speter    if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020);
258289177Speter    if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8)
259289177Speter      utf8proc_decompose_lump(0x0027);
260289177Speter    if (category == UTF8PROC_CATEGORY_PD || uc == 0x2212)
261289177Speter      utf8proc_decompose_lump(0x002D);
262289177Speter    if (uc == 0x2044 || uc == 0x2215) utf8proc_decompose_lump(0x002F);
263289177Speter    if (uc == 0x2236) utf8proc_decompose_lump(0x003A);
264289177Speter    if (uc == 0x2039 || uc == 0x2329 || uc == 0x3008)
265289177Speter      utf8proc_decompose_lump(0x003C);
266289177Speter    if (uc == 0x203A || uc == 0x232A || uc == 0x3009)
267289177Speter      utf8proc_decompose_lump(0x003E);
268289177Speter    if (uc == 0x2216) utf8proc_decompose_lump(0x005C);
269289177Speter    if (uc == 0x02C4 || uc == 0x02C6 || uc == 0x2038 || uc == 0x2303)
270289177Speter      utf8proc_decompose_lump(0x005E);
271289177Speter    if (category == UTF8PROC_CATEGORY_PC || uc == 0x02CD)
272289177Speter      utf8proc_decompose_lump(0x005F);
273289177Speter    if (uc == 0x02CB) utf8proc_decompose_lump(0x0060);
274289177Speter    if (uc == 0x2223) utf8proc_decompose_lump(0x007C);
275289177Speter    if (uc == 0x223C) utf8proc_decompose_lump(0x007E);
276289177Speter    if ((options & UTF8PROC_NLF2LS) && (options & UTF8PROC_NLF2PS)) {
277289177Speter      if (category == UTF8PROC_CATEGORY_ZL ||
278289177Speter          category == UTF8PROC_CATEGORY_ZP)
279289177Speter        utf8proc_decompose_lump(0x000A);
280289177Speter    }
281289177Speter  }
282289177Speter  if (options & UTF8PROC_STRIPMARK) {
283289177Speter    if (category == UTF8PROC_CATEGORY_MN ||
284289177Speter      category == UTF8PROC_CATEGORY_MC ||
285289177Speter      category == UTF8PROC_CATEGORY_ME) return 0;
286289177Speter  }
287289177Speter  if (options & UTF8PROC_CASEFOLD) {
288289177Speter    if (property->casefold_mapping) {
289289177Speter      const int32_t *casefold_entry;
290289177Speter      ssize_t written = 0;
291289177Speter      for (casefold_entry = property->casefold_mapping;
292289177Speter          *casefold_entry >= 0; casefold_entry++) {
293289177Speter        written += utf8proc_decompose_char(*casefold_entry, dst+written,
294289177Speter          (bufsize > written) ? (bufsize - written) : 0, options,
295289177Speter          last_boundclass);
296289177Speter        if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
297289177Speter      }
298289177Speter      return written;
299289177Speter    }
300289177Speter  }
301289177Speter  if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
302289177Speter    if (property->decomp_mapping &&
303289177Speter        (!property->decomp_type || (options & UTF8PROC_COMPAT))) {
304289177Speter      const int32_t *decomp_entry;
305289177Speter      ssize_t written = 0;
306289177Speter      for (decomp_entry = property->decomp_mapping;
307289177Speter          *decomp_entry >= 0; decomp_entry++) {
308289177Speter        written += utf8proc_decompose_char(*decomp_entry, dst+written,
309289177Speter          (bufsize > written) ? (bufsize - written) : 0, options,
310289177Speter        last_boundclass);
311289177Speter        if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
312289177Speter      }
313289177Speter      return written;
314289177Speter    }
315289177Speter  }
316289177Speter  if (options & UTF8PROC_CHARBOUND) {
317289177Speter    bool boundary;
318289177Speter    int tbc, lbc;
319289177Speter    tbc =
320289177Speter      (uc == 0x000D) ? UTF8PROC_BOUNDCLASS_CR :
321289177Speter      (uc == 0x000A) ? UTF8PROC_BOUNDCLASS_LF :
322289177Speter      ((category == UTF8PROC_CATEGORY_ZL ||
323289177Speter        category == UTF8PROC_CATEGORY_ZP ||
324289177Speter        category == UTF8PROC_CATEGORY_CC ||
325289177Speter        category == UTF8PROC_CATEGORY_CF) &&
326289177Speter        !(uc == 0x200C || uc == 0x200D)) ? UTF8PROC_BOUNDCLASS_CONTROL :
327289177Speter      property->extend ? UTF8PROC_BOUNDCLASS_EXTEND :
328289177Speter      ((uc >= UTF8PROC_HANGUL_L_START && uc < UTF8PROC_HANGUL_L_END) ||
329289177Speter        uc == UTF8PROC_HANGUL_L_FILLER) ? UTF8PROC_BOUNDCLASS_L :
330289177Speter      (uc >= UTF8PROC_HANGUL_V_START && uc < UTF8PROC_HANGUL_V_END) ?
331289177Speter        UTF8PROC_BOUNDCLASS_V :
332289177Speter      (uc >= UTF8PROC_HANGUL_T_START && uc < UTF8PROC_HANGUL_T_END) ?
333289177Speter        UTF8PROC_BOUNDCLASS_T :
334289177Speter      (uc >= UTF8PROC_HANGUL_S_START && uc < UTF8PROC_HANGUL_S_END) ? (
335289177Speter        ((uc-UTF8PROC_HANGUL_SBASE) % UTF8PROC_HANGUL_TCOUNT == 0) ?
336289177Speter          UTF8PROC_BOUNDCLASS_LV : UTF8PROC_BOUNDCLASS_LVT
337289177Speter      ) :
338289177Speter      UTF8PROC_BOUNDCLASS_OTHER;
339289177Speter    lbc = *last_boundclass;
340289177Speter    boundary =
341289177Speter      (tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false :
342289177Speter      (lbc == UTF8PROC_BOUNDCLASS_START) ? true :
343289177Speter      (lbc == UTF8PROC_BOUNDCLASS_CR &&
344289177Speter       tbc == UTF8PROC_BOUNDCLASS_LF) ? false :
345289177Speter      (lbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :
346289177Speter      (tbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :
347289177Speter      (lbc == UTF8PROC_BOUNDCLASS_L &&
348289177Speter       (tbc == UTF8PROC_BOUNDCLASS_L ||
349289177Speter        tbc == UTF8PROC_BOUNDCLASS_V ||
350289177Speter        tbc == UTF8PROC_BOUNDCLASS_LV ||
351289177Speter        tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :
352289177Speter      ((lbc == UTF8PROC_BOUNDCLASS_LV ||
353289177Speter        lbc == UTF8PROC_BOUNDCLASS_V) &&
354289177Speter       (tbc == UTF8PROC_BOUNDCLASS_V ||
355289177Speter        tbc == UTF8PROC_BOUNDCLASS_T)) ? false :
356289177Speter      ((lbc == UTF8PROC_BOUNDCLASS_LVT ||
357289177Speter        lbc == UTF8PROC_BOUNDCLASS_T) &&
358289177Speter       tbc == UTF8PROC_BOUNDCLASS_T) ? false :
359289177Speter       true;
360289177Speter    *last_boundclass = tbc;
361289177Speter    if (boundary) {
362289177Speter      if (bufsize >= 1) dst[0] = 0xFFFF;
363289177Speter      if (bufsize >= 2) dst[1] = uc;
364289177Speter      return 2;
365289177Speter    }
366289177Speter  }
367289177Speter  if (bufsize >= 1) *dst = uc;
368289177Speter  return 1;
369289177Speter}
370289177Speter
371289177SpeterUTF8PROC_API
372289177Speterssize_t utf8proc_decompose(
373289177Speter  const uint8_t *str, ssize_t strlen,
374289177Speter  int32_t *buffer, ssize_t bufsize, int options
375289177Speter) {
376289177Speter  /* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */
377289177Speter  ssize_t wpos = 0;
378289177Speter  if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE))
379289177Speter    return UTF8PROC_ERROR_INVALIDOPTS;
380289177Speter  if ((options & UTF8PROC_STRIPMARK) &&
381289177Speter      !(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE))
382289177Speter    return UTF8PROC_ERROR_INVALIDOPTS;
383289177Speter  {
384289177Speter    int32_t uc;
385289177Speter    ssize_t rpos = 0;
386289177Speter    ssize_t decomp_result;
387289177Speter    int boundclass = UTF8PROC_BOUNDCLASS_START;
388289177Speter    while (1) {
389289177Speter      if (options & UTF8PROC_NULLTERM) {
390289177Speter        rpos += utf8proc_iterate(str + rpos, -1, &uc);
391289177Speter        /* checking of return value is not neccessary,
392289177Speter           as 'uc' is < 0 in case of error */
393289177Speter        if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
394289177Speter        if (rpos < 0) return UTF8PROC_ERROR_OVERFLOW;
395289177Speter        if (uc == 0) break;
396289177Speter      } else {
397289177Speter        if (rpos >= strlen) break;
398289177Speter        rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc);
399289177Speter        if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
400289177Speter      }
401289177Speter      decomp_result = utf8proc_decompose_char(
402289177Speter        uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options,
403289177Speter        &boundclass
404289177Speter      );
405289177Speter      if (decomp_result < 0) return decomp_result;
406289177Speter      wpos += decomp_result;
407289177Speter      /* prohibiting integer overflows due to too long strings: */
408289177Speter      if (wpos < 0 || wpos > SSIZE_MAX/sizeof(int32_t)/2)
409289177Speter        return UTF8PROC_ERROR_OVERFLOW;
410289177Speter    }
411289177Speter  }
412289177Speter  if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) {
413289177Speter    ssize_t pos = 0;
414289177Speter    while (pos < wpos-1) {
415289177Speter      int32_t uc1, uc2;
416289177Speter      const utf8proc_property_t *property1, *property2;
417289177Speter      uc1 = buffer[pos];
418289177Speter      uc2 = buffer[pos+1];
419289177Speter      property1 = utf8proc_get_property(uc1);
420289177Speter      property2 = utf8proc_get_property(uc2);
421289177Speter      if (property1->combining_class > property2->combining_class &&
422289177Speter          property2->combining_class > 0) {
423289177Speter        buffer[pos] = uc2;
424289177Speter        buffer[pos+1] = uc1;
425289177Speter        if (pos > 0) pos--; else pos++;
426289177Speter      } else {
427289177Speter        pos++;
428289177Speter      }
429289177Speter    }
430289177Speter  }
431289177Speter  return wpos;
432289177Speter}
433289177Speter
434289177SpeterUTF8PROC_API
435289177Speterssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options) {
436289177Speter  /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
437289177Speter     ASSERT: 'buffer' has one spare byte of free space at the end! */
438289177Speter  if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
439289177Speter    ssize_t rpos;
440289177Speter    ssize_t wpos = 0;
441289177Speter    int32_t uc;
442289177Speter    for (rpos = 0; rpos < length; rpos++) {
443289177Speter      uc = buffer[rpos];
444289177Speter      if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++;
445289177Speter      if (uc == 0x000A || uc == 0x000D || uc == 0x0085 ||
446289177Speter          ((options & UTF8PROC_STRIPCC) && (uc == 0x000B || uc == 0x000C))) {
447289177Speter        if (options & UTF8PROC_NLF2LS) {
448289177Speter          if (options & UTF8PROC_NLF2PS) {
449289177Speter            buffer[wpos++] = 0x000A;
450289177Speter          } else {
451289177Speter            buffer[wpos++] = 0x2028;
452289177Speter          }
453289177Speter        } else {
454289177Speter          if (options & UTF8PROC_NLF2PS) {
455289177Speter            buffer[wpos++] = 0x2029;
456289177Speter          } else {
457289177Speter            buffer[wpos++] = 0x0020;
458289177Speter          }
459289177Speter        }
460289177Speter      } else if ((options & UTF8PROC_STRIPCC) &&
461289177Speter          (uc < 0x0020 || (uc >= 0x007F && uc < 0x00A0))) {
462289177Speter        if (uc == 0x0009) buffer[wpos++] = 0x0020;
463289177Speter      } else {
464289177Speter        buffer[wpos++] = uc;
465289177Speter      }
466289177Speter    }
467289177Speter    length = wpos;
468289177Speter  }
469289177Speter  if (options & UTF8PROC_COMPOSE) {
470289177Speter    int32_t *starter = NULL;
471289177Speter    int32_t current_char;
472289177Speter    const utf8proc_property_t *starter_property = NULL, *current_property;
473289177Speter    utf8proc_propval_t max_combining_class = -1;
474289177Speter    ssize_t rpos;
475289177Speter    ssize_t wpos = 0;
476289177Speter    int32_t composition;
477289177Speter    for (rpos = 0; rpos < length; rpos++) {
478289177Speter      current_char = buffer[rpos];
479289177Speter      current_property = utf8proc_get_property(current_char);
480289177Speter      if (starter && current_property->combining_class > max_combining_class) {
481289177Speter        /* combination perhaps possible */
482289177Speter        int32_t hangul_lindex;
483289177Speter        int32_t hangul_sindex;
484289177Speter        hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE;
485289177Speter        if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) {
486289177Speter          int32_t hangul_vindex;
487289177Speter          hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE;
488289177Speter          if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) {
489289177Speter            *starter = UTF8PROC_HANGUL_SBASE +
490289177Speter              (hangul_lindex * UTF8PROC_HANGUL_VCOUNT + hangul_vindex) *
491289177Speter              UTF8PROC_HANGUL_TCOUNT;
492289177Speter            starter_property = NULL;
493289177Speter            continue;
494289177Speter          }
495289177Speter        }
496289177Speter        hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE;
497289177Speter        if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT &&
498289177Speter            (hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) {
499289177Speter          int32_t hangul_tindex;
500289177Speter          hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE;
501289177Speter          if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) {
502289177Speter            *starter += hangul_tindex;
503289177Speter            starter_property = NULL;
504289177Speter            continue;
505289177Speter          }
506289177Speter        }
507289177Speter        if (!starter_property) {
508289177Speter          starter_property = utf8proc_get_property(*starter);
509289177Speter        }
510289177Speter        if (starter_property->comb1st_index >= 0 &&
511289177Speter            current_property->comb2nd_index >= 0) {
512289177Speter          composition = utf8proc_combinations[
513289177Speter            starter_property->comb1st_index +
514289177Speter            current_property->comb2nd_index
515289177Speter          ];
516289177Speter          if (composition >= 0 && (!(options & UTF8PROC_STABLE) ||
517289177Speter              !(utf8proc_get_property(composition)->comp_exclusion))) {
518289177Speter            *starter = composition;
519289177Speter            starter_property = NULL;
520289177Speter            continue;
521289177Speter          }
522289177Speter        }
523289177Speter      }
524289177Speter      buffer[wpos] = current_char;
525289177Speter      if (current_property->combining_class) {
526289177Speter        if (current_property->combining_class > max_combining_class) {
527289177Speter          max_combining_class = current_property->combining_class;
528289177Speter        }
529289177Speter      } else {
530289177Speter        starter = buffer + wpos;
531289177Speter        starter_property = NULL;
532289177Speter        max_combining_class = -1;
533289177Speter      }
534289177Speter      wpos++;
535289177Speter    }
536289177Speter    length = wpos;
537289177Speter  }
538289177Speter  {
539289177Speter    ssize_t rpos, wpos = 0;
540289177Speter    int32_t uc;
541289177Speter    for (rpos = 0; rpos < length; rpos++) {
542289177Speter      uc = buffer[rpos];
543289177Speter      wpos += utf8proc_encode_char(uc, ((uint8_t *)buffer) + wpos);
544289177Speter    }
545289177Speter    ((uint8_t *)buffer)[wpos] = 0;
546289177Speter    return wpos;
547289177Speter  }
548289177Speter}
549289177Speter
550289177SpeterUTF8PROC_API
551289177Speterssize_t utf8proc_map(
552289177Speter  const uint8_t *str, ssize_t strlen, uint8_t **dstptr, int options
553289177Speter) {
554289177Speter  int32_t *buffer;
555289177Speter  ssize_t result;
556289177Speter  *dstptr = NULL;
557289177Speter  result = utf8proc_decompose(str, strlen, NULL, 0, options);
558289177Speter  if (result < 0) return result;
559289177Speter  buffer = malloc(result * sizeof(int32_t) + 1);
560289177Speter  if (!buffer) return UTF8PROC_ERROR_NOMEM;
561289177Speter  result = utf8proc_decompose(str, strlen, buffer, result, options);
562289177Speter  if (result < 0) {
563289177Speter    free(buffer);
564289177Speter    return result;
565289177Speter  }
566289177Speter  result = utf8proc_reencode(buffer, result, options);
567289177Speter  if (result < 0) {
568289177Speter    free(buffer);
569289177Speter    return result;
570289177Speter  }
571289177Speter  {
572289177Speter    int32_t *newptr;
573289177Speter    newptr = realloc(buffer, (size_t)result+1);
574289177Speter    if (newptr) buffer = newptr;
575289177Speter  }
576289177Speter  *dstptr = (uint8_t *)buffer;
577289177Speter  return result;
578289177Speter}
579289177Speter
580289177SpeterUTF8PROC_API
581289177Speteruint8_t *utf8proc_NFD(const uint8_t *str) {
582289177Speter  uint8_t *retval;
583289177Speter  utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
584289177Speter    UTF8PROC_DECOMPOSE);
585289177Speter  return retval;
586289177Speter}
587289177Speter
588289177SpeterUTF8PROC_API
589289177Speteruint8_t *utf8proc_NFC(const uint8_t *str) {
590289177Speter  uint8_t *retval;
591289177Speter  utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
592289177Speter    UTF8PROC_COMPOSE);
593289177Speter  return retval;
594289177Speter}
595289177Speter
596289177SpeterUTF8PROC_API
597289177Speteruint8_t *utf8proc_NFKD(const uint8_t *str) {
598289177Speter  uint8_t *retval;
599289177Speter  utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
600289177Speter    UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
601289177Speter  return retval;
602289177Speter}
603289177Speter
604289177SpeterUTF8PROC_API
605289177Speteruint8_t *utf8proc_NFKC(const uint8_t *str) {
606289177Speter  uint8_t *retval;
607289177Speter  utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
608289177Speter    UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
609289177Speter  return retval;
610289177Speter}
611289177Speter
612