Deleted Added
full compact
utf8proc.c (302408) utf8proc.c (362181)
1/* -*- mode: c; c-basic-offset: 2; tab-width: 2; indent-tabs-mode: nil -*- */
1/*
2/*
3 * Copyright (c) 2015 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors.
2 * Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:

--- 24 unchanged lines hidden (view full) ---

34/*
35 * File name: utf8proc.c
36 *
37 * Description:
38 * Implementation of libutf8proc.
39 */
40
41
4 * Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:

--- 24 unchanged lines hidden (view full) ---

36/*
37 * File name: utf8proc.c
38 *
39 * Description:
40 * Implementation of libutf8proc.
41 */
42
43
42#include "utf8proc.h"
44#include "utf8proc_internal.h"
43#include "utf8proc_data.c"
44
45
45#include "utf8proc_data.c"
46
47
46UTF8PROC_DATA
47const int8_t utf8proc_utf8class[256] = {
48UTF8PROC_DLLEXPORT const utf8proc_int8_t utf8proc_utf8class[256] = {
48 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
49 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
50 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
51 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
52 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
53 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
54 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
55 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

--- 21 unchanged lines hidden (view full) ---

77#define UTF8PROC_HANGUL_L_FILLER 0x115F
78#define UTF8PROC_HANGUL_V_START 0x1160
79#define UTF8PROC_HANGUL_V_END 0x11A3
80#define UTF8PROC_HANGUL_T_START 0x11A8
81#define UTF8PROC_HANGUL_T_END 0x11FA
82#define UTF8PROC_HANGUL_S_START 0xAC00
83#define UTF8PROC_HANGUL_S_END 0xD7A4
84
49 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
50 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
51 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
52 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
53 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
54 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
55 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
56 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

--- 21 unchanged lines hidden (view full) ---

78#define UTF8PROC_HANGUL_L_FILLER 0x115F
79#define UTF8PROC_HANGUL_V_START 0x1160
80#define UTF8PROC_HANGUL_V_END 0x11A3
81#define UTF8PROC_HANGUL_T_START 0x11A8
82#define UTF8PROC_HANGUL_T_END 0x11FA
83#define UTF8PROC_HANGUL_S_START 0xAC00
84#define UTF8PROC_HANGUL_S_END 0xD7A4
85
85
86#define UTF8PROC_BOUNDCLASS_START 0
87#define UTF8PROC_BOUNDCLASS_OTHER 1
88#define UTF8PROC_BOUNDCLASS_CR 2
89#define UTF8PROC_BOUNDCLASS_LF 3
90#define UTF8PROC_BOUNDCLASS_CONTROL 4
91#define UTF8PROC_BOUNDCLASS_EXTEND 5
92#define UTF8PROC_BOUNDCLASS_L 6
93#define UTF8PROC_BOUNDCLASS_V 7
94#define UTF8PROC_BOUNDCLASS_T 8
95#define UTF8PROC_BOUNDCLASS_LV 9
96#define UTF8PROC_BOUNDCLASS_LVT 10
97
98
99UTF8PROC_API
100const char *utf8proc_version(void) {
101 return "1.1.5";
86/* Should follow semantic-versioning rules (semver.org) based on API
87 compatibility. (Note that the shared-library version number will
88 be different, being based on ABI compatibility.): */
89#define STRINGIZEx(x) #x
90#define STRINGIZE(x) STRINGIZEx(x)
91UTF8PROC_DLLEXPORT const char *utf8proc_version(void) {
92 return STRINGIZE(UTF8PROC_VERSION_MAJOR) "." STRINGIZE(UTF8PROC_VERSION_MINOR) "." STRINGIZE(UTF8PROC_VERSION_PATCH) "";
102}
103
93}
94
104/*
105 * This macro tells translators that string X should be translated,
106 * but does not look up the translation at run time. This is standard
107 * GNU gettext notation for annotating compile-time constant strings.
108 */
109#ifndef N_
110#define N_(x) x
111#endif
112
113UTF8PROC_API
114const char *utf8proc_errmsg(ssize_t errcode) {
95UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
115 switch (errcode) {
116 case UTF8PROC_ERROR_NOMEM:
96 switch (errcode) {
97 case UTF8PROC_ERROR_NOMEM:
117 return N_("Memory for processing UTF-8 data could not be allocated.");
98 return "Memory for processing UTF-8 data could not be allocated.";
118 case UTF8PROC_ERROR_OVERFLOW:
99 case UTF8PROC_ERROR_OVERFLOW:
119 return N_("UTF-8 string is too long to be processed.");
100 return "UTF-8 string is too long to be processed.";
120 case UTF8PROC_ERROR_INVALIDUTF8:
101 case UTF8PROC_ERROR_INVALIDUTF8:
121 return N_("Invalid UTF-8 string");
102 return "Invalid UTF-8 string";
122 case UTF8PROC_ERROR_NOTASSIGNED:
103 case UTF8PROC_ERROR_NOTASSIGNED:
123 return N_("Unassigned Unicode code point found in UTF-8 string.");
104 return "Unassigned Unicode code point found in UTF-8 string.";
124 case UTF8PROC_ERROR_INVALIDOPTS:
105 case UTF8PROC_ERROR_INVALIDOPTS:
125 return N_("Invalid options for UTF-8 processing chosen.");
106 return "Invalid options for UTF-8 processing chosen.";
126 default:
107 default:
127 return N_("An unknown error occured while processing UTF-8 data.");
108 return "An unknown error occurred while processing UTF-8 data.";
128 }
129}
130
109 }
110}
111
131UTF8PROC_API
132ssize_t utf8proc_iterate(
133 const uint8_t *str, ssize_t strlen, int32_t *dst
112#define utf_cont(ch) (((ch) & 0xc0) == 0x80)
113UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
114 const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *dst
134) {
115) {
135 int length;
136 int i;
137 int32_t uc = -1;
116 utf8proc_uint32_t uc;
117 const utf8proc_uint8_t *end;
118
138 *dst = -1;
139 if (!strlen) return 0;
119 *dst = -1;
120 if (!strlen) return 0;
140 length = utf8proc_utf8class[str[0]];
141 if (!length) return UTF8PROC_ERROR_INVALIDUTF8;
142 if (strlen >= 0 && length > strlen) return UTF8PROC_ERROR_INVALIDUTF8;
143 for (i=1; i<length; i++) {
144 if ((str[i] & 0xC0) != 0x80) return UTF8PROC_ERROR_INVALIDUTF8;
121 end = str + ((strlen < 0) ? 4 : strlen);
122 uc = *str++;
123 if (uc < 0x80) {
124 *dst = uc;
125 return 1;
145 }
126 }
146 switch (length) {
147 case 1:
148 uc = str[0];
149 break;
150 case 2:
151 uc = ((str[0] & 0x1F) << 6) + (str[1] & 0x3F);
152 if (uc < 0x80) uc = -1;
153 break;
154 case 3:
155 uc = ((str[0] & 0x0F) << 12) + ((str[1] & 0x3F) << 6)
156 + (str[2] & 0x3F);
157 if (uc < 0x800 || (uc >= 0xD800 && uc < 0xE000) ||
158 (uc >= 0xFDD0 && uc < 0xFDF0)) uc = -1;
159 break;
160 case 4:
161 uc = ((str[0] & 0x07) << 18) + ((str[1] & 0x3F) << 12)
162 + ((str[2] & 0x3F) << 6) + (str[3] & 0x3F);
163 if (uc < 0x10000 || uc >= 0x110000) uc = -1;
164 break;
127 /* Must be between 0xc2 and 0xf4 inclusive to be valid */
128 if ((uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8;
129 if (uc < 0xe0) { /* 2-byte sequence */
130 /* Must have valid continuation character */
131 if (str >= end || !utf_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8;
132 *dst = ((uc & 0x1f)<<6) | (*str & 0x3f);
133 return 2;
165 }
134 }
166 if (uc < 0 || ((uc & 0xFFFF) >= 0xFFFE))
167 return UTF8PROC_ERROR_INVALIDUTF8;
168 *dst = uc;
169 return length;
135 if (uc < 0xf0) { /* 3-byte sequence */
136 if ((str + 1 >= end) || !utf_cont(*str) || !utf_cont(str[1]))
137 return UTF8PROC_ERROR_INVALIDUTF8;
138 /* Check for surrogate chars */
139 if (uc == 0xed && *str > 0x9f)
140 return UTF8PROC_ERROR_INVALIDUTF8;
141 uc = ((uc & 0xf)<<12) | ((*str & 0x3f)<<6) | (str[1] & 0x3f);
142 if (uc < 0x800)
143 return UTF8PROC_ERROR_INVALIDUTF8;
144 *dst = uc;
145 return 3;
146 }
147 /* 4-byte sequence
148 Must have 3 valid continuation characters */
149 if ((str + 2 >= end) || !utf_cont(*str) || !utf_cont(str[1]) || !utf_cont(str[2]))
150 return UTF8PROC_ERROR_INVALIDUTF8;
151 /* Make sure in correct range (0x10000 - 0x10ffff) */
152 if (uc == 0xf0) {
153 if (*str < 0x90) return UTF8PROC_ERROR_INVALIDUTF8;
154 } else if (uc == 0xf4) {
155 if (*str > 0x8f) return UTF8PROC_ERROR_INVALIDUTF8;
156 }
157 *dst = ((uc & 7)<<18) | ((*str & 0x3f)<<12) | ((str[1] & 0x3f)<<6) | (str[2] & 0x3f);
158 return 4;
170}
171
159}
160
172UTF8PROC_API
173bool utf8proc_codepoint_valid(int32_t uc) {
174 if (uc < 0 || uc >= 0x110000 ||
175 ((uc & 0xFFFF) >= 0xFFFE) || (uc >= 0xD800 && uc < 0xE000) ||
176 (uc >= 0xFDD0 && uc < 0xFDF0)) return false;
177 else return true;
161UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t uc) {
162 return (((utf8proc_uint32_t)uc)-0xd800 > 0x07ff) && ((utf8proc_uint32_t)uc < 0x110000);
178}
179
163}
164
180UTF8PROC_API
181ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst) {
165UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
182 if (uc < 0x00) {
183 return 0;
184 } else if (uc < 0x80) {
166 if (uc < 0x00) {
167 return 0;
168 } else if (uc < 0x80) {
185 dst[0] = (uint8_t)uc;
169 dst[0] = (utf8proc_uint8_t) uc;
186 return 1;
187 } else if (uc < 0x800) {
170 return 1;
171 } else if (uc < 0x800) {
188 dst[0] = 0xC0 + (uint8_t)(uc >> 6);
189 dst[1] = 0x80 + (uc & 0x3F);
172 dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6));
173 dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
190 return 2;
174 return 2;
191 } else if (uc == 0xFFFF) {
192 dst[0] = 0xFF;
193 return 1;
194 } else if (uc == 0xFFFE) {
195 dst[0] = 0xFE;
196 return 1;
175 /* Note: we allow encoding 0xd800-0xdfff here, so as not to change
176 the API, however, these are actually invalid in UTF-8 */
197 } else if (uc < 0x10000) {
177 } else if (uc < 0x10000) {
198 dst[0] = 0xE0 + (uint8_t)(uc >> 12);
199 dst[1] = 0x80 + ((uc >> 6) & 0x3F);
200 dst[2] = 0x80 + (uc & 0x3F);
178 dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12));
179 dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
180 dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
201 return 3;
202 } else if (uc < 0x110000) {
181 return 3;
182 } else if (uc < 0x110000) {
203 dst[0] = 0xF0 + (uint8_t)(uc >> 18);
204 dst[1] = 0x80 + ((uc >> 12) & 0x3F);
205 dst[2] = 0x80 + ((uc >> 6) & 0x3F);
206 dst[3] = 0x80 + (uc & 0x3F);
183 dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18));
184 dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F));
185 dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
186 dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
207 return 4;
208 } else return 0;
209}
210
187 return 4;
188 } else return 0;
189}
190
211UTF8PROC_API
212const utf8proc_property_t *utf8proc_get_property(int32_t uc) {
191/* internal "unsafe" version that does not check whether uc is in range */
192static utf8proc_ssize_t unsafe_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
193 if (uc < 0x00) {
194 return 0;
195 } else if (uc < 0x80) {
196 dst[0] = (utf8proc_uint8_t)uc;
197 return 1;
198 } else if (uc < 0x800) {
199 dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6));
200 dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
201 return 2;
202 } else if (uc == 0xFFFF) {
203 dst[0] = (utf8proc_uint8_t)0xFF;
204 return 1;
205 } else if (uc == 0xFFFE) {
206 dst[0] = (utf8proc_uint8_t)0xFE;
207 return 1;
208 } else if (uc < 0x10000) {
209 dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12));
210 dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
211 dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
212 return 3;
213 } else if (uc < 0x110000) {
214 dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18));
215 dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F));
216 dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
217 dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
218 return 4;
219 } else return 0;
220}
221
222/* internal "unsafe" version that does not check whether uc is in range */
223static const utf8proc_property_t *unsafe_get_property(utf8proc_int32_t uc) {
213 /* ASSERT: uc >= 0 && uc < 0x110000 */
214 return utf8proc_properties + (
215 utf8proc_stage2table[
216 utf8proc_stage1table[uc >> 8] + (uc & 0xFF)
217 ]
218 );
219}
220
224 /* ASSERT: uc >= 0 && uc < 0x110000 */
225 return utf8proc_properties + (
226 utf8proc_stage2table[
227 utf8proc_stage1table[uc >> 8] + (uc & 0xFF)
228 ]
229 );
230}
231
232UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int32_t uc) {
233 return uc < 0 || uc >= 0x110000 ? utf8proc_properties : unsafe_get_property(uc);
234}
235
236/* return whether there is a grapheme break between boundclasses lbc and tbc
237 (according to the definition of extended grapheme clusters)
238
239 Rule numbering refers to TR29 Version 29 (Unicode 9.0.0):
240 http://www.unicode.org/reports/tr29/tr29-29.html
241
242 CAVEATS:
243 Please note that evaluation of GB10 (grapheme breaks between emoji zwj sequences)
244 and GB 12/13 (regional indicator code points) require knowledge of previous characters
245 and are thus not handled by this function. This may result in an incorrect break before
246 an E_Modifier class codepoint and an incorrectly missing break between two
247 REGIONAL_INDICATOR class code points if such support does not exist in the caller.
248
249 See the special support in grapheme_break_extended, for required bookkeeping by the caller.
250*/
251static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
252 return
253 (lbc == UTF8PROC_BOUNDCLASS_START) ? true : /* GB1 */
254 (lbc == UTF8PROC_BOUNDCLASS_CR && /* GB3 */
255 tbc == UTF8PROC_BOUNDCLASS_LF) ? false : /* --- */
256 (lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : /* GB4 */
257 (tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : /* GB5 */
258 (lbc == UTF8PROC_BOUNDCLASS_L && /* GB6 */
259 (tbc == UTF8PROC_BOUNDCLASS_L || /* --- */
260 tbc == UTF8PROC_BOUNDCLASS_V || /* --- */
261 tbc == UTF8PROC_BOUNDCLASS_LV || /* --- */
262 tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false : /* --- */
263 ((lbc == UTF8PROC_BOUNDCLASS_LV || /* GB7 */
264 lbc == UTF8PROC_BOUNDCLASS_V) && /* --- */
265 (tbc == UTF8PROC_BOUNDCLASS_V || /* --- */
266 tbc == UTF8PROC_BOUNDCLASS_T)) ? false : /* --- */
267 ((lbc == UTF8PROC_BOUNDCLASS_LVT || /* GB8 */
268 lbc == UTF8PROC_BOUNDCLASS_T) && /* --- */
269 tbc == UTF8PROC_BOUNDCLASS_T) ? false : /* --- */
270 (tbc == UTF8PROC_BOUNDCLASS_EXTEND || /* GB9 */
271 tbc == UTF8PROC_BOUNDCLASS_ZWJ || /* --- */
272 tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK || /* GB9a */
273 lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false : /* GB9b */
274 ((lbc == UTF8PROC_BOUNDCLASS_E_BASE || /* GB10 (requires additional handling below) */
275 lbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) && /* ---- */
276 tbc == UTF8PROC_BOUNDCLASS_E_MODIFIER) ? false : /* ---- */
277 (lbc == UTF8PROC_BOUNDCLASS_ZWJ && /* GB11 */
278 (tbc == UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ || /* ---- */
279 tbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ)) ? false : /* ---- */
280 (lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && /* GB12/13 (requires additional handling below) */
281 tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false : /* ---- */
282 true; /* GB999 */
283}
284
285static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state)
286{
287 utf8proc_bool break_permitted;
288 int lbc_override = lbc;
289 if (state && *state != UTF8PROC_BOUNDCLASS_START)
290 lbc_override = *state;
291 break_permitted = grapheme_break_simple(lbc_override, tbc);
292 if (state) {
293 /* Special support for GB 12/13 made possible by GB999. After two RI
294 class codepoints we want to force a break. Do this by resetting the
295 second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break
296 after that character according to GB999 (unless of course such a break is
297 forbidden by a different rule such as GB9). */
298 if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
299 *state = UTF8PROC_BOUNDCLASS_OTHER;
300 /* Special support for GB10. Fold any EXTEND codepoints into the previous
301 boundclass if we're dealing with an emoji base boundclass. */
302 else if ((*state == UTF8PROC_BOUNDCLASS_E_BASE ||
303 *state == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) &&
304 tbc == UTF8PROC_BOUNDCLASS_EXTEND)
305 *state = UTF8PROC_BOUNDCLASS_E_BASE;
306 else
307 *state = tbc;
308 }
309 return break_permitted;
310}
311
312UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
313 utf8proc_int32_t c1, utf8proc_int32_t c2, utf8proc_int32_t *state) {
314
315 return grapheme_break_extended(utf8proc_get_property(c1)->boundclass,
316 utf8proc_get_property(c2)->boundclass,
317 state);
318}
319
320
321UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
322 utf8proc_int32_t c1, utf8proc_int32_t c2) {
323 return utf8proc_grapheme_break_stateful(c1, c2, NULL);
324}
325
326static utf8proc_int32_t seqindex_decode_entry(const utf8proc_uint16_t **entry)
327{
328 utf8proc_int32_t entry_cp = **entry;
329 if ((entry_cp & 0xF800) == 0xD800) {
330 *entry = *entry + 1;
331 entry_cp = ((entry_cp & 0x03FF) << 10) | (**entry & 0x03FF);
332 entry_cp += 0x10000;
333 }
334 return entry_cp;
335}
336
337static utf8proc_int32_t seqindex_decode_index(const utf8proc_uint32_t seqindex)
338{
339 const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex];
340 return seqindex_decode_entry(&entry);
341}
342
343static utf8proc_ssize_t seqindex_write_char_decomposed(utf8proc_uint16_t seqindex, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
344 utf8proc_ssize_t written = 0;
345 const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex & 0x1FFF];
346 int len = seqindex >> 13;
347 if (len >= 7) {
348 len = *entry;
349 entry++;
350 }
351 for (; len >= 0; entry++, len--) {
352 utf8proc_int32_t entry_cp = seqindex_decode_entry(&entry);
353
354 written += utf8proc_decompose_char(entry_cp, dst+written,
355 (bufsize > written) ? (bufsize - written) : 0, options,
356 last_boundclass);
357 if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
358 }
359 return written;
360}
361
362UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
363{
364 utf8proc_int32_t cl = utf8proc_get_property(c)->lowercase_seqindex;
365 return cl != UINT16_MAX ? seqindex_decode_index(cl) : c;
366}
367
368UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c)
369{
370 utf8proc_int32_t cu = utf8proc_get_property(c)->uppercase_seqindex;
371 return cu != UINT16_MAX ? seqindex_decode_index(cu) : c;
372}
373
374UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c)
375{
376 utf8proc_int32_t cu = utf8proc_get_property(c)->titlecase_seqindex;
377 return cu != UINT16_MAX ? seqindex_decode_index(cu) : c;
378}
379
380/* return a character width analogous to wcwidth (except portable and
381 hopefully less buggy than most system wcwidth functions). */
382UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) {
383 return utf8proc_get_property(c)->charwidth;
384}
385
386UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t c) {
387 return utf8proc_get_property(c)->category;
388}
389
390UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) {
391 static const char s[][3] = {"Cn","Lu","Ll","Lt","Lm","Lo","Mn","Mc","Me","Nd","Nl","No","Pc","Pd","Ps","Pe","Pi","Pf","Po","Sm","Sc","Sk","So","Zs","Zl","Zp","Cc","Cf","Cs","Co"};
392 return s[utf8proc_category(c)];
393}
394
221#define utf8proc_decompose_lump(replacement_uc) \
222 return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
223 options & ~UTF8PROC_LUMP, last_boundclass)
224
395#define utf8proc_decompose_lump(replacement_uc) \
396 return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
397 options & ~UTF8PROC_LUMP, last_boundclass)
398
225UTF8PROC_API
226ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize,
227 int options, int *last_boundclass) {
228 /* ASSERT: uc >= 0 && uc < 0x110000 */
399UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
229 const utf8proc_property_t *property;
230 utf8proc_propval_t category;
400 const utf8proc_property_t *property;
401 utf8proc_propval_t category;
231 int32_t hangul_sindex;
232 property = utf8proc_get_property(uc);
402 utf8proc_int32_t hangul_sindex;
403 if (uc < 0 || uc >= 0x110000) return UTF8PROC_ERROR_NOTASSIGNED;
404 property = unsafe_get_property(uc);
233 category = property->category;
234 hangul_sindex = uc - UTF8PROC_HANGUL_SBASE;
235 if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
236 if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) {
405 category = property->category;
406 hangul_sindex = uc - UTF8PROC_HANGUL_SBASE;
407 if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
408 if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) {
237 int32_t hangul_tindex;
409 utf8proc_int32_t hangul_tindex;
238 if (bufsize >= 1) {
239 dst[0] = UTF8PROC_HANGUL_LBASE +
240 hangul_sindex / UTF8PROC_HANGUL_NCOUNT;
241 if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE +
242 (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT;
243 }
244 hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT;
245 if (!hangul_tindex) return 2;

--- 34 unchanged lines hidden (view full) ---

280 }
281 }
282 if (options & UTF8PROC_STRIPMARK) {
283 if (category == UTF8PROC_CATEGORY_MN ||
284 category == UTF8PROC_CATEGORY_MC ||
285 category == UTF8PROC_CATEGORY_ME) return 0;
286 }
287 if (options & UTF8PROC_CASEFOLD) {
410 if (bufsize >= 1) {
411 dst[0] = UTF8PROC_HANGUL_LBASE +
412 hangul_sindex / UTF8PROC_HANGUL_NCOUNT;
413 if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE +
414 (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT;
415 }
416 hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT;
417 if (!hangul_tindex) return 2;

--- 34 unchanged lines hidden (view full) ---

452 }
453 }
454 if (options & UTF8PROC_STRIPMARK) {
455 if (category == UTF8PROC_CATEGORY_MN ||
456 category == UTF8PROC_CATEGORY_MC ||
457 category == UTF8PROC_CATEGORY_ME) return 0;
458 }
459 if (options & UTF8PROC_CASEFOLD) {
288 if (property->casefold_mapping) {
289 const int32_t *casefold_entry;
290 ssize_t written = 0;
291 for (casefold_entry = property->casefold_mapping;
292 *casefold_entry >= 0; casefold_entry++) {
293 written += utf8proc_decompose_char(*casefold_entry, dst+written,
294 (bufsize > written) ? (bufsize - written) : 0, options,
295 last_boundclass);
296 if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
297 }
298 return written;
460 if (property->casefold_seqindex != UINT16_MAX) {
461 return seqindex_write_char_decomposed(property->casefold_seqindex, dst, bufsize, options, last_boundclass);
299 }
300 }
301 if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
462 }
463 }
464 if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
302 if (property->decomp_mapping &&
465 if (property->decomp_seqindex != UINT16_MAX &&
303 (!property->decomp_type || (options & UTF8PROC_COMPAT))) {
466 (!property->decomp_type || (options & UTF8PROC_COMPAT))) {
304 const int32_t *decomp_entry;
305 ssize_t written = 0;
306 for (decomp_entry = property->decomp_mapping;
307 *decomp_entry >= 0; decomp_entry++) {
308 written += utf8proc_decompose_char(*decomp_entry, dst+written,
309 (bufsize > written) ? (bufsize - written) : 0, options,
310 last_boundclass);
311 if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
312 }
313 return written;
467 return seqindex_write_char_decomposed(property->decomp_seqindex, dst, bufsize, options, last_boundclass);
314 }
315 }
316 if (options & UTF8PROC_CHARBOUND) {
468 }
469 }
470 if (options & UTF8PROC_CHARBOUND) {
317 bool boundary;
318 int tbc, lbc;
319 tbc =
320 (uc == 0x000D) ? UTF8PROC_BOUNDCLASS_CR :
321 (uc == 0x000A) ? UTF8PROC_BOUNDCLASS_LF :
322 ((category == UTF8PROC_CATEGORY_ZL ||
323 category == UTF8PROC_CATEGORY_ZP ||
324 category == UTF8PROC_CATEGORY_CC ||
325 category == UTF8PROC_CATEGORY_CF) &&
326 !(uc == 0x200C || uc == 0x200D)) ? UTF8PROC_BOUNDCLASS_CONTROL :
327 property->extend ? UTF8PROC_BOUNDCLASS_EXTEND :
328 ((uc >= UTF8PROC_HANGUL_L_START && uc < UTF8PROC_HANGUL_L_END) ||
329 uc == UTF8PROC_HANGUL_L_FILLER) ? UTF8PROC_BOUNDCLASS_L :
330 (uc >= UTF8PROC_HANGUL_V_START && uc < UTF8PROC_HANGUL_V_END) ?
331 UTF8PROC_BOUNDCLASS_V :
332 (uc >= UTF8PROC_HANGUL_T_START && uc < UTF8PROC_HANGUL_T_END) ?
333 UTF8PROC_BOUNDCLASS_T :
334 (uc >= UTF8PROC_HANGUL_S_START && uc < UTF8PROC_HANGUL_S_END) ? (
335 ((uc-UTF8PROC_HANGUL_SBASE) % UTF8PROC_HANGUL_TCOUNT == 0) ?
336 UTF8PROC_BOUNDCLASS_LV : UTF8PROC_BOUNDCLASS_LVT
337 ) :
338 UTF8PROC_BOUNDCLASS_OTHER;
339 lbc = *last_boundclass;
340 boundary =
341 (tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false :
342 (lbc == UTF8PROC_BOUNDCLASS_START) ? true :
343 (lbc == UTF8PROC_BOUNDCLASS_CR &&
344 tbc == UTF8PROC_BOUNDCLASS_LF) ? false :
345 (lbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :
346 (tbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :
347 (lbc == UTF8PROC_BOUNDCLASS_L &&
348 (tbc == UTF8PROC_BOUNDCLASS_L ||
349 tbc == UTF8PROC_BOUNDCLASS_V ||
350 tbc == UTF8PROC_BOUNDCLASS_LV ||
351 tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :
352 ((lbc == UTF8PROC_BOUNDCLASS_LV ||
353 lbc == UTF8PROC_BOUNDCLASS_V) &&
354 (tbc == UTF8PROC_BOUNDCLASS_V ||
355 tbc == UTF8PROC_BOUNDCLASS_T)) ? false :
356 ((lbc == UTF8PROC_BOUNDCLASS_LVT ||
357 lbc == UTF8PROC_BOUNDCLASS_T) &&
358 tbc == UTF8PROC_BOUNDCLASS_T) ? false :
359 true;
360 *last_boundclass = tbc;
471 utf8proc_bool boundary;
472 int tbc = property->boundclass;
473 boundary = grapheme_break_extended(*last_boundclass, tbc, last_boundclass);
361 if (boundary) {
362 if (bufsize >= 1) dst[0] = 0xFFFF;
363 if (bufsize >= 2) dst[1] = uc;
364 return 2;
365 }
366 }
367 if (bufsize >= 1) *dst = uc;
368 return 1;
369}
370
474 if (boundary) {
475 if (bufsize >= 1) dst[0] = 0xFFFF;
476 if (bufsize >= 2) dst[1] = uc;
477 return 2;
478 }
479 }
480 if (bufsize >= 1) *dst = uc;
481 return 1;
482}
483
371UTF8PROC_API
372ssize_t utf8proc_decompose(
373 const uint8_t *str, ssize_t strlen,
374 int32_t *buffer, ssize_t bufsize, int options
484UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
485 const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
486 utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
375) {
487) {
488 return utf8proc_decompose_custom(str, strlen, buffer, bufsize, options, NULL, NULL);
489}
490
491UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
492 const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
493 utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options,
494 utf8proc_custom_func custom_func, void *custom_data
495) {
376 /* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */
496 /* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */
377 ssize_t wpos = 0;
497 utf8proc_ssize_t wpos = 0;
378 if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE))
379 return UTF8PROC_ERROR_INVALIDOPTS;
380 if ((options & UTF8PROC_STRIPMARK) &&
381 !(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE))
382 return UTF8PROC_ERROR_INVALIDOPTS;
383 {
498 if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE))
499 return UTF8PROC_ERROR_INVALIDOPTS;
500 if ((options & UTF8PROC_STRIPMARK) &&
501 !(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE))
502 return UTF8PROC_ERROR_INVALIDOPTS;
503 {
384 int32_t uc;
385 ssize_t rpos = 0;
386 ssize_t decomp_result;
504 utf8proc_int32_t uc;
505 utf8proc_ssize_t rpos = 0;
506 utf8proc_ssize_t decomp_result;
387 int boundclass = UTF8PROC_BOUNDCLASS_START;
388 while (1) {
389 if (options & UTF8PROC_NULLTERM) {
390 rpos += utf8proc_iterate(str + rpos, -1, &uc);
507 int boundclass = UTF8PROC_BOUNDCLASS_START;
508 while (1) {
509 if (options & UTF8PROC_NULLTERM) {
510 rpos += utf8proc_iterate(str + rpos, -1, &uc);
391 /* checking of return value is not neccessary,
511 /* checking of return value is not necessary,
392 as 'uc' is < 0 in case of error */
393 if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
394 if (rpos < 0) return UTF8PROC_ERROR_OVERFLOW;
395 if (uc == 0) break;
396 } else {
397 if (rpos >= strlen) break;
398 rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc);
399 if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
400 }
512 as 'uc' is < 0 in case of error */
513 if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
514 if (rpos < 0) return UTF8PROC_ERROR_OVERFLOW;
515 if (uc == 0) break;
516 } else {
517 if (rpos >= strlen) break;
518 rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc);
519 if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
520 }
521 if (custom_func != NULL) {
522 uc = custom_func(uc, custom_data); /* user-specified custom mapping */
523 }
401 decomp_result = utf8proc_decompose_char(
402 uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options,
403 &boundclass
404 );
405 if (decomp_result < 0) return decomp_result;
406 wpos += decomp_result;
407 /* prohibiting integer overflows due to too long strings: */
524 decomp_result = utf8proc_decompose_char(
525 uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options,
526 &boundclass
527 );
528 if (decomp_result < 0) return decomp_result;
529 wpos += decomp_result;
530 /* prohibiting integer overflows due to too long strings: */
408 if (wpos < 0 || wpos > SSIZE_MAX/sizeof(int32_t)/2)
531 if (wpos < 0 ||
532 wpos > (utf8proc_ssize_t)(SSIZE_MAX/sizeof(utf8proc_int32_t)/2))
409 return UTF8PROC_ERROR_OVERFLOW;
410 }
411 }
412 if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) {
533 return UTF8PROC_ERROR_OVERFLOW;
534 }
535 }
536 if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) {
413 ssize_t pos = 0;
537 utf8proc_ssize_t pos = 0;
414 while (pos < wpos-1) {
538 while (pos < wpos-1) {
415 int32_t uc1, uc2;
539 utf8proc_int32_t uc1, uc2;
416 const utf8proc_property_t *property1, *property2;
417 uc1 = buffer[pos];
418 uc2 = buffer[pos+1];
540 const utf8proc_property_t *property1, *property2;
541 uc1 = buffer[pos];
542 uc2 = buffer[pos+1];
419 property1 = utf8proc_get_property(uc1);
420 property2 = utf8proc_get_property(uc2);
543 property1 = unsafe_get_property(uc1);
544 property2 = unsafe_get_property(uc2);
421 if (property1->combining_class > property2->combining_class &&
422 property2->combining_class > 0) {
423 buffer[pos] = uc2;
424 buffer[pos+1] = uc1;
425 if (pos > 0) pos--; else pos++;
426 } else {
427 pos++;
428 }
429 }
430 }
431 return wpos;
432}
433
545 if (property1->combining_class > property2->combining_class &&
546 property2->combining_class > 0) {
547 buffer[pos] = uc2;
548 buffer[pos+1] = uc1;
549 if (pos > 0) pos--; else pos++;
550 } else {
551 pos++;
552 }
553 }
554 }
555 return wpos;
556}
557
434UTF8PROC_API
435ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options) {
436 /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
437 ASSERT: 'buffer' has one spare byte of free space at the end! */
558UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
559 /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored */
438 if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
560 if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
439 ssize_t rpos;
440 ssize_t wpos = 0;
441 int32_t uc;
561 utf8proc_ssize_t rpos;
562 utf8proc_ssize_t wpos = 0;
563 utf8proc_int32_t uc;
442 for (rpos = 0; rpos < length; rpos++) {
443 uc = buffer[rpos];
444 if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++;
445 if (uc == 0x000A || uc == 0x000D || uc == 0x0085 ||
446 ((options & UTF8PROC_STRIPCC) && (uc == 0x000B || uc == 0x000C))) {
447 if (options & UTF8PROC_NLF2LS) {
448 if (options & UTF8PROC_NLF2PS) {
449 buffer[wpos++] = 0x000A;

--- 12 unchanged lines hidden (view full) ---

462 if (uc == 0x0009) buffer[wpos++] = 0x0020;
463 } else {
464 buffer[wpos++] = uc;
465 }
466 }
467 length = wpos;
468 }
469 if (options & UTF8PROC_COMPOSE) {
564 for (rpos = 0; rpos < length; rpos++) {
565 uc = buffer[rpos];
566 if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++;
567 if (uc == 0x000A || uc == 0x000D || uc == 0x0085 ||
568 ((options & UTF8PROC_STRIPCC) && (uc == 0x000B || uc == 0x000C))) {
569 if (options & UTF8PROC_NLF2LS) {
570 if (options & UTF8PROC_NLF2PS) {
571 buffer[wpos++] = 0x000A;

--- 12 unchanged lines hidden (view full) ---

584 if (uc == 0x0009) buffer[wpos++] = 0x0020;
585 } else {
586 buffer[wpos++] = uc;
587 }
588 }
589 length = wpos;
590 }
591 if (options & UTF8PROC_COMPOSE) {
470 int32_t *starter = NULL;
471 int32_t current_char;
592 utf8proc_int32_t *starter = NULL;
593 utf8proc_int32_t current_char;
472 const utf8proc_property_t *starter_property = NULL, *current_property;
473 utf8proc_propval_t max_combining_class = -1;
594 const utf8proc_property_t *starter_property = NULL, *current_property;
595 utf8proc_propval_t max_combining_class = -1;
474 ssize_t rpos;
475 ssize_t wpos = 0;
476 int32_t composition;
596 utf8proc_ssize_t rpos;
597 utf8proc_ssize_t wpos = 0;
598 utf8proc_int32_t composition;
477 for (rpos = 0; rpos < length; rpos++) {
478 current_char = buffer[rpos];
599 for (rpos = 0; rpos < length; rpos++) {
600 current_char = buffer[rpos];
479 current_property = utf8proc_get_property(current_char);
601 current_property = unsafe_get_property(current_char);
480 if (starter && current_property->combining_class > max_combining_class) {
481 /* combination perhaps possible */
602 if (starter && current_property->combining_class > max_combining_class) {
603 /* combination perhaps possible */
482 int32_t hangul_lindex;
483 int32_t hangul_sindex;
604 utf8proc_int32_t hangul_lindex;
605 utf8proc_int32_t hangul_sindex;
484 hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE;
485 if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) {
606 hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE;
607 if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) {
486 int32_t hangul_vindex;
608 utf8proc_int32_t hangul_vindex;
487 hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE;
488 if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) {
489 *starter = UTF8PROC_HANGUL_SBASE +
490 (hangul_lindex * UTF8PROC_HANGUL_VCOUNT + hangul_vindex) *
491 UTF8PROC_HANGUL_TCOUNT;
492 starter_property = NULL;
493 continue;
494 }
495 }
496 hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE;
497 if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT &&
498 (hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) {
609 hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE;
610 if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) {
611 *starter = UTF8PROC_HANGUL_SBASE +
612 (hangul_lindex * UTF8PROC_HANGUL_VCOUNT + hangul_vindex) *
613 UTF8PROC_HANGUL_TCOUNT;
614 starter_property = NULL;
615 continue;
616 }
617 }
618 hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE;
619 if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT &&
620 (hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) {
499 int32_t hangul_tindex;
621 utf8proc_int32_t hangul_tindex;
500 hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE;
501 if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) {
502 *starter += hangul_tindex;
503 starter_property = NULL;
504 continue;
505 }
506 }
507 if (!starter_property) {
622 hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE;
623 if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) {
624 *starter += hangul_tindex;
625 starter_property = NULL;
626 continue;
627 }
628 }
629 if (!starter_property) {
508 starter_property = utf8proc_get_property(*starter);
630 starter_property = unsafe_get_property(*starter);
509 }
631 }
510 if (starter_property->comb1st_index >= 0 &&
511 current_property->comb2nd_index >= 0) {
512 composition = utf8proc_combinations[
513 starter_property->comb1st_index +
514 current_property->comb2nd_index
515 ];
516 if (composition >= 0 && (!(options & UTF8PROC_STABLE) ||
517 !(utf8proc_get_property(composition)->comp_exclusion))) {
518 *starter = composition;
519 starter_property = NULL;
520 continue;
632 if (starter_property->comb_index < 0x8000 &&
633 current_property->comb_index != UINT16_MAX &&
634 current_property->comb_index >= 0x8000) {
635 int sidx = starter_property->comb_index;
636 int idx = (current_property->comb_index & 0x3FFF) - utf8proc_combinations[sidx];
637 if (idx >= 0 && idx <= utf8proc_combinations[sidx + 1] ) {
638 idx += sidx + 2;
639 if (current_property->comb_index & 0x4000) {
640 composition = (utf8proc_combinations[idx] << 16) | utf8proc_combinations[idx+1];
641 } else
642 composition = utf8proc_combinations[idx];
643
644 if (composition > 0 && (!(options & UTF8PROC_STABLE) ||
645 !(unsafe_get_property(composition)->comp_exclusion))) {
646 *starter = composition;
647 starter_property = NULL;
648 continue;
649 }
521 }
522 }
523 }
524 buffer[wpos] = current_char;
525 if (current_property->combining_class) {
526 if (current_property->combining_class > max_combining_class) {
527 max_combining_class = current_property->combining_class;
528 }
529 } else {
530 starter = buffer + wpos;
531 starter_property = NULL;
532 max_combining_class = -1;
533 }
534 wpos++;
535 }
536 length = wpos;
537 }
650 }
651 }
652 }
653 buffer[wpos] = current_char;
654 if (current_property->combining_class) {
655 if (current_property->combining_class > max_combining_class) {
656 max_combining_class = current_property->combining_class;
657 }
658 } else {
659 starter = buffer + wpos;
660 starter_property = NULL;
661 max_combining_class = -1;
662 }
663 wpos++;
664 }
665 length = wpos;
666 }
667 return length;
668}
669
670UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
671 /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
672 ASSERT: 'buffer' has one spare byte of free space at the end! */
673 length = utf8proc_normalize_utf32(buffer, length, options);
674 if (length < 0) return length;
538 {
675 {
539 ssize_t rpos, wpos = 0;
540 int32_t uc;
541 for (rpos = 0; rpos < length; rpos++) {
542 uc = buffer[rpos];
543 wpos += utf8proc_encode_char(uc, ((uint8_t *)buffer) + wpos);
676 utf8proc_ssize_t rpos, wpos = 0;
677 utf8proc_int32_t uc;
678 if (options & UTF8PROC_CHARBOUND) {
679 for (rpos = 0; rpos < length; rpos++) {
680 uc = buffer[rpos];
681 wpos += unsafe_encode_char(uc, ((utf8proc_uint8_t *)buffer) + wpos);
682 }
683 } else {
684 for (rpos = 0; rpos < length; rpos++) {
685 uc = buffer[rpos];
686 wpos += utf8proc_encode_char(uc, ((utf8proc_uint8_t *)buffer) + wpos);
687 }
544 }
688 }
545 ((uint8_t *)buffer)[wpos] = 0;
689 ((utf8proc_uint8_t *)buffer)[wpos] = 0;
546 return wpos;
547 }
548}
549
690 return wpos;
691 }
692}
693
550UTF8PROC_API
551ssize_t utf8proc_map(
552 const uint8_t *str, ssize_t strlen, uint8_t **dstptr, int options
694UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
695 const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
553) {
696) {
554 int32_t *buffer;
555 ssize_t result;
697 return utf8proc_map_custom(str, strlen, dstptr, options, NULL, NULL);
698}
699
700UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
701 const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options,
702 utf8proc_custom_func custom_func, void *custom_data
703) {
704 utf8proc_int32_t *buffer;
705 utf8proc_ssize_t result;
556 *dstptr = NULL;
706 *dstptr = NULL;
557 result = utf8proc_decompose(str, strlen, NULL, 0, options);
707 result = utf8proc_decompose_custom(str, strlen, NULL, 0, options, custom_func, custom_data);
558 if (result < 0) return result;
708 if (result < 0) return result;
559 buffer = malloc(result * sizeof(int32_t) + 1);
709 buffer = (utf8proc_int32_t *) malloc(result * sizeof(utf8proc_int32_t) + 1);
560 if (!buffer) return UTF8PROC_ERROR_NOMEM;
710 if (!buffer) return UTF8PROC_ERROR_NOMEM;
561 result = utf8proc_decompose(str, strlen, buffer, result, options);
711 result = utf8proc_decompose_custom(str, strlen, buffer, result, options, custom_func, custom_data);
562 if (result < 0) {
563 free(buffer);
564 return result;
565 }
566 result = utf8proc_reencode(buffer, result, options);
567 if (result < 0) {
568 free(buffer);
569 return result;
570 }
571 {
712 if (result < 0) {
713 free(buffer);
714 return result;
715 }
716 result = utf8proc_reencode(buffer, result, options);
717 if (result < 0) {
718 free(buffer);
719 return result;
720 }
721 {
572 int32_t *newptr;
573 newptr = realloc(buffer, (size_t)result+1);
722 utf8proc_int32_t *newptr;
723 newptr = (utf8proc_int32_t *) realloc(buffer, (size_t)result+1);
574 if (newptr) buffer = newptr;
575 }
724 if (newptr) buffer = newptr;
725 }
576 *dstptr = (uint8_t *)buffer;
726 *dstptr = (utf8proc_uint8_t *)buffer;
577 return result;
578}
579
727 return result;
728}
729
580UTF8PROC_API
581uint8_t *utf8proc_NFD(const uint8_t *str) {
582 uint8_t *retval;
730UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str) {
731 utf8proc_uint8_t *retval;
583 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
584 UTF8PROC_DECOMPOSE);
585 return retval;
586}
587
732 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
733 UTF8PROC_DECOMPOSE);
734 return retval;
735}
736
588UTF8PROC_API
589uint8_t *utf8proc_NFC(const uint8_t *str) {
590 uint8_t *retval;
737UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str) {
738 utf8proc_uint8_t *retval;
591 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
592 UTF8PROC_COMPOSE);
593 return retval;
594}
595
739 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
740 UTF8PROC_COMPOSE);
741 return retval;
742}
743
596UTF8PROC_API
597uint8_t *utf8proc_NFKD(const uint8_t *str) {
598 uint8_t *retval;
744UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str) {
745 utf8proc_uint8_t *retval;
599 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
600 UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
601 return retval;
602}
603
746 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
747 UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
748 return retval;
749}
750
604UTF8PROC_API
605uint8_t *utf8proc_NFKC(const uint8_t *str) {
606 uint8_t *retval;
751UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str) {
752 utf8proc_uint8_t *retval;
607 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
608 UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
609 return retval;
610}
753 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
754 UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
755 return retval;
756}
611