Cross Reference: /freebsd-11-stable/contrib/subversion/subversion/libsvn

Deleted Added

sdiff udiff text old ( 302408 ) new ( 362181 )

full compact

utf8proc.c (302408)	utf8proc.c (362181)
	1/* -- mode: c; c-basic-offset: 2; tab-width: 2; indent-tabs-mode: nil -- */
1/*	2/*
	3 * Copyright (c) 2015 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors.
2 * Copyright (c) 2009 Public Software Group e. V., Berlin, Germany 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: --- 24 unchanged lines hidden (view full) --- 34/* 35 * File name: utf8proc.c 36 * 37 * Description: 38 * Implementation of libutf8proc. 39 */ 40 41	4 * Copyright (c) 2009 Public Software Group e. V., Berlin, Germany 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: --- 24 unchanged lines hidden (view full) --- 36/* 37 * File name: utf8proc.c 38 * 39 * Description: 40 * Implementation of libutf8proc. 41 */ 42 43
42#include "utf8proc.h"	44#include "utf8proc_internal.h"
43#include "utf8proc_data.c" 44 45	45#include "utf8proc_data.c" 46 47
46UTF8PROC_DATA 47const int8_t utf8proc_utf8class[256] = {	48UTF8PROC_DLLEXPORT const utf8proc_int8_t utf8proc_utf8class[256] = {
48 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 49 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 50 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 51 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 52 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 53 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 54 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 55 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, --- 21 unchanged lines hidden (view full) --- 77#define UTF8PROC_HANGUL_L_FILLER 0x115F 78#define UTF8PROC_HANGUL_V_START 0x1160 79#define UTF8PROC_HANGUL_V_END 0x11A3 80#define UTF8PROC_HANGUL_T_START 0x11A8 81#define UTF8PROC_HANGUL_T_END 0x11FA 82#define UTF8PROC_HANGUL_S_START 0xAC00 83#define UTF8PROC_HANGUL_S_END 0xD7A4 84	49 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 50 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 51 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 52 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 53 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 54 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 55 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 56 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, --- 21 unchanged lines hidden (view full) --- 78#define UTF8PROC_HANGUL_L_FILLER 0x115F 79#define UTF8PROC_HANGUL_V_START 0x1160 80#define UTF8PROC_HANGUL_V_END 0x11A3 81#define UTF8PROC_HANGUL_T_START 0x11A8 82#define UTF8PROC_HANGUL_T_END 0x11FA 83#define UTF8PROC_HANGUL_S_START 0xAC00 84#define UTF8PROC_HANGUL_S_END 0xD7A4 85
85 86#define UTF8PROC_BOUNDCLASS_START 0 87#define UTF8PROC_BOUNDCLASS_OTHER 1 88#define UTF8PROC_BOUNDCLASS_CR 2 89#define UTF8PROC_BOUNDCLASS_LF 3 90#define UTF8PROC_BOUNDCLASS_CONTROL 4 91#define UTF8PROC_BOUNDCLASS_EXTEND 5 92#define UTF8PROC_BOUNDCLASS_L 6 93#define UTF8PROC_BOUNDCLASS_V 7 94#define UTF8PROC_BOUNDCLASS_T 8 95#define UTF8PROC_BOUNDCLASS_LV 9 96#define UTF8PROC_BOUNDCLASS_LVT 10 97 98 99UTF8PROC_API 100const char utf8proc_version(void) { 101* return "1.1.5";	86/* Should follow semantic-versioning rules (semver.org) based on API 87 compatibility. (Note that the shared-library version number will 88 be different, being based on ABI compatibility.): / 89#define STRINGIZEx(x) #x 90#define STRINGIZE(x) STRINGIZEx(x) 91UTF8PROC_DLLEXPORT const char utf8proc_version(void) { 92 return STRINGIZE(UTF8PROC_VERSION_MAJOR) "." STRINGIZE(UTF8PROC_VERSION_MINOR) "." STRINGIZE(UTF8PROC_VERSION_PATCH) "";
102} 103	93} 94
104/* 105 * This macro tells translators that string X should be translated, 106 * but does not look up the translation at run time. This is standard 107 * GNU gettext notation for annotating compile-time constant strings. 108 / 109#ifndef N_ 110#define N_(x) x 111#endif 112* 113UTF8PROC_API 114const char *utf8proc_errmsg(ssize_t errcode) {	95UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
115 switch (errcode) { 116 case UTF8PROC_ERROR_NOMEM:	96 switch (errcode) { 97 case UTF8PROC_ERROR_NOMEM:
117 return N_("Memory for processing UTF-8 data could not be allocated.");	98 return "Memory for processing UTF-8 data could not be allocated.";
118 case UTF8PROC_ERROR_OVERFLOW:	99 case UTF8PROC_ERROR_OVERFLOW:
119 return N_("UTF-8 string is too long to be processed.");	100 return "UTF-8 string is too long to be processed.";
120 case UTF8PROC_ERROR_INVALIDUTF8:	101 case UTF8PROC_ERROR_INVALIDUTF8:
121 return N_("Invalid UTF-8 string");	102 return "Invalid UTF-8 string";
122 case UTF8PROC_ERROR_NOTASSIGNED:	103 case UTF8PROC_ERROR_NOTASSIGNED:
123 return N_("Unassigned Unicode code point found in UTF-8 string.");	104 return "Unassigned Unicode code point found in UTF-8 string.";
124 case UTF8PROC_ERROR_INVALIDOPTS:	105 case UTF8PROC_ERROR_INVALIDOPTS:
125 return N_("Invalid options for UTF-8 processing chosen.");	106 return "Invalid options for UTF-8 processing chosen.";
126 default:	107 default:
127 return N_("An unknown error occured while processing UTF-8 data.");	108 return "An unknown error occurred while processing UTF-8 data.";
128 } 129} 130	109 } 110} 111
131UTF8PROC_API 132ssize_t utf8proc_iterate( 133 const uint8_t str, ssize_t strlen, int32_t dst	112#define utf_cont(ch) (((ch) & 0xc0) == 0x80) 113UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate( 114 const utf8proc_uint8_t str, utf8proc_ssize_t strlen, utf8proc_int32_t dst
134) {	115) {
135 int length; 136 int i; 137 int32_t uc = -1;	116 utf8proc_uint32_t uc; 117 const utf8proc_uint8_t end; 118*
138 dst = -1; 139* if (!strlen) return 0;	119 dst = -1; 120* if (!strlen) return 0;
140 length = utf8proc_utf8class[str[0]]; 141 if (!length) return UTF8PROC_ERROR_INVALIDUTF8; 142 if (strlen >= 0 && length > strlen) return UTF8PROC_ERROR_INVALIDUTF8; 143 for (i=1; i<length; i++) { 144 if ((str[i] & 0xC0) != 0x80) return UTF8PROC_ERROR_INVALIDUTF8;	121 end = str + ((strlen < 0) ? 4 : strlen); 122 uc = str++; 123* if (uc < 0x80) { 124 dst = uc; 125* return 1;
145 }	126 }
146 switch (length) { 147 case 1: 148 uc = str[0]; 149 break; 150 case 2: 151 uc = ((str[0] & 0x1F) << 6) + (str[1] & 0x3F); 152 if (uc < 0x80) uc = -1; 153 break; 154 case 3: 155 uc = ((str[0] & 0x0F) << 12) + ((str[1] & 0x3F) << 6) 156 + (str[2] & 0x3F); 157 if (uc < 0x800 \|\| (uc >= 0xD800 && uc < 0xE000) \|\| 158 (uc >= 0xFDD0 && uc < 0xFDF0)) uc = -1; 159 break; 160 case 4: 161 uc = ((str[0] & 0x07) << 18) + ((str[1] & 0x3F) << 12) 162 + ((str[2] & 0x3F) << 6) + (str[3] & 0x3F); 163 if (uc < 0x10000 \|\| uc >= 0x110000) uc = -1; 164 break;	127 /* Must be between 0xc2 and 0xf4 inclusive to be valid / 128* if ((uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8; 129 if (uc < 0xe0) { /* 2-byte sequence / 130* /* Must have valid continuation character / 131* if (str >= end \|\| !utf_cont(str)) return UTF8PROC_ERROR_INVALIDUTF8; 132* dst = ((uc & 0x1f)<<6) \| (str & 0x3f); 133 return 2;
165 }	134 }
166 if (uc < 0 \|\| ((uc & 0xFFFF) >= 0xFFFE)) 167 return UTF8PROC_ERROR_INVALIDUTF8; 168 dst = uc; 169* return length;	135 if (uc < 0xf0) { /* 3-byte sequence / 136* if ((str + 1 >= end) \|\| !utf_cont(str) \|\| !utf_cont(str[1])) 137* return UTF8PROC_ERROR_INVALIDUTF8; 138 /* Check for surrogate chars / 139* if (uc == 0xed && str > 0x9f) 140* return UTF8PROC_ERROR_INVALIDUTF8; 141 uc = ((uc & 0xf)<<12) \| ((str & 0x3f)<<6) \| (str[1] & 0x3f); 142* if (uc < 0x800) 143 return UTF8PROC_ERROR_INVALIDUTF8; 144 dst = uc; 145* return 3; 146 } 147 /* 4-byte sequence 148 Must have 3 valid continuation characters / 149* if ((str + 2 >= end) \|\| !utf_cont(str) \|\| !utf_cont(str[1]) \|\| !utf_cont(str[2])) 150* return UTF8PROC_ERROR_INVALIDUTF8; 151 /* Make sure in correct range (0x10000 - 0x10ffff) / 152* if (uc == 0xf0) { 153 if (str < 0x90) return UTF8PROC_ERROR_INVALIDUTF8; 154* } else if (uc == 0xf4) { 155 if (str > 0x8f) return UTF8PROC_ERROR_INVALIDUTF8; 156* } 157 dst = ((uc & 7)<<18) \| ((str & 0x3f)<<12) \| ((str[1] & 0x3f)<<6) \| (str[2] & 0x3f); 158 return 4;
170} 171	159} 160
172UTF8PROC_API 173bool utf8proc_codepoint_valid(int32_t uc) { 174 if (uc < 0 \|\| uc >= 0x110000 \|\| 175 ((uc & 0xFFFF) >= 0xFFFE) \|\| (uc >= 0xD800 && uc < 0xE000) \|\| 176 (uc >= 0xFDD0 && uc < 0xFDF0)) return false; 177 else return true;	161UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t uc) { 162 return (((utf8proc_uint32_t)uc)-0xd800 > 0x07ff) && ((utf8proc_uint32_t)uc < 0x110000);
178} 179	163} 164
180UTF8PROC_API 181ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst) {	165UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
182 if (uc < 0x00) { 183 return 0; 184 } else if (uc < 0x80) {	166 if (uc < 0x00) { 167 return 0; 168 } else if (uc < 0x80) {
185 dst[0] = (uint8_t)uc;	169 dst[0] = (utf8proc_uint8_t) uc;
186 return 1; 187 } else if (uc < 0x800) {	170 return 1; 171 } else if (uc < 0x800) {
188 dst[0] = 0xC0 + (uint8_t)(uc >> 6); 189 dst[1] = 0x80 + (uc & 0x3F);	172 dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6)); 173 dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
190 return 2;	174 return 2;
191 } else if (uc == 0xFFFF) { 192 dst[0] = 0xFF; 193 return 1; 194 } else if (uc == 0xFFFE) { 195 dst[0] = 0xFE; 196 return 1;	175 /* Note: we allow encoding 0xd800-0xdfff here, so as not to change 176 the API, however, these are actually invalid in UTF-8 */
197 } else if (uc < 0x10000) {	177 } else if (uc < 0x10000) {
198 dst[0] = 0xE0 + (uint8_t)(uc >> 12); 199 dst[1] = 0x80 + ((uc >> 6) & 0x3F); 200 dst[2] = 0x80 + (uc & 0x3F);	178 dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12)); 179 dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); 180 dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
201 return 3; 202 } else if (uc < 0x110000) {	181 return 3; 182 } else if (uc < 0x110000) {
203 dst[0] = 0xF0 + (uint8_t)(uc >> 18); 204 dst[1] = 0x80 + ((uc >> 12) & 0x3F); 205 dst[2] = 0x80 + ((uc >> 6) & 0x3F); 206 dst[3] = 0x80 + (uc & 0x3F);	183 dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18)); 184 dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F)); 185 dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); 186 dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
207 return 4; 208 } else return 0; 209} 210	187 return 4; 188 } else return 0; 189} 190
211UTF8PROC_API 212const utf8proc_property_t *utf8proc_get_property(int32_t uc) {	191/* internal "unsafe" version that does not check whether uc is in range / 192static utf8proc_ssize_t unsafe_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t dst) { 193 if (uc < 0x00) { 194 return 0; 195 } else if (uc < 0x80) { 196 dst[0] = (utf8proc_uint8_t)uc; 197 return 1; 198 } else if (uc < 0x800) { 199 dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6)); 200 dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); 201 return 2; 202 } else if (uc == 0xFFFF) { 203 dst[0] = (utf8proc_uint8_t)0xFF; 204 return 1; 205 } else if (uc == 0xFFFE) { 206 dst[0] = (utf8proc_uint8_t)0xFE; 207 return 1; 208 } else if (uc < 0x10000) { 209 dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12)); 210 dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); 211 dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); 212 return 3; 213 } else if (uc < 0x110000) { 214 dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18)); 215 dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F)); 216 dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); 217 dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); 218 return 4; 219 } else return 0; 220} 221 222/* internal "unsafe" version that does not check whether uc is in range / 223static const utf8proc_property_t unsafe_get_property(utf8proc_int32_t uc) {
213 /* ASSERT: uc >= 0 && uc < 0x110000 / 214* return utf8proc_properties + ( 215 utf8proc_stage2table[ 216 utf8proc_stage1table[uc >> 8] + (uc & 0xFF) 217 ] 218 ); 219} 220	224 /* ASSERT: uc >= 0 && uc < 0x110000 / 225* return utf8proc_properties + ( 226 utf8proc_stage2table[ 227 utf8proc_stage1table[uc >> 8] + (uc & 0xFF) 228 ] 229 ); 230} 231
	232UTF8PROC_DLLEXPORT const utf8proc_property_t utf8proc_get_property(utf8proc_int32_t uc) { 233* return uc < 0 \|\| uc >= 0x110000 ? utf8proc_properties : unsafe_get_property(uc); 234} 235 236/* return whether there is a grapheme break between boundclasses lbc and tbc 237 (according to the definition of extended grapheme clusters) 238 239 Rule numbering refers to TR29 Version 29 (Unicode 9.0.0): 240 http://www.unicode.org/reports/tr29/tr29-29.html 241 242 CAVEATS: 243 Please note that evaluation of GB10 (grapheme breaks between emoji zwj sequences) 244 and GB 12/13 (regional indicator code points) require knowledge of previous characters 245 and are thus not handled by this function. This may result in an incorrect break before 246 an E_Modifier class codepoint and an incorrectly missing break between two 247 REGIONAL_INDICATOR class code points if such support does not exist in the caller. 248 249 See the special support in grapheme_break_extended, for required bookkeeping by the caller. 250/ 251static utf8proc_bool grapheme_break_simple(int lbc, int tbc) { 252* return 253 (lbc == UTF8PROC_BOUNDCLASS_START) ? true : /* GB1 / 254* (lbc == UTF8PROC_BOUNDCLASS_CR && /* GB3 / 255* tbc == UTF8PROC_BOUNDCLASS_LF) ? false : /* --- / 256* (lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : /* GB4 / 257* (tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : /* GB5 / 258* (lbc == UTF8PROC_BOUNDCLASS_L && /* GB6 / 259* (tbc == UTF8PROC_BOUNDCLASS_L \|\| /* --- / 260* tbc == UTF8PROC_BOUNDCLASS_V \|\| /* --- / 261* tbc == UTF8PROC_BOUNDCLASS_LV \|\| /* --- / 262* tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false : /* --- / 263* ((lbc == UTF8PROC_BOUNDCLASS_LV \|\| /* GB7 / 264* lbc == UTF8PROC_BOUNDCLASS_V) && /* --- / 265* (tbc == UTF8PROC_BOUNDCLASS_V \|\| /* --- / 266* tbc == UTF8PROC_BOUNDCLASS_T)) ? false : /* --- / 267* ((lbc == UTF8PROC_BOUNDCLASS_LVT \|\| /* GB8 / 268* lbc == UTF8PROC_BOUNDCLASS_T) && /* --- / 269* tbc == UTF8PROC_BOUNDCLASS_T) ? false : /* --- / 270* (tbc == UTF8PROC_BOUNDCLASS_EXTEND \|\| /* GB9 / 271* tbc == UTF8PROC_BOUNDCLASS_ZWJ \|\| /* --- / 272* tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK \|\| /* GB9a / 273* lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false : /* GB9b / 274* ((lbc == UTF8PROC_BOUNDCLASS_E_BASE \|\| /* GB10 (requires additional handling below) / 275* lbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) && /* ---- / 276* tbc == UTF8PROC_BOUNDCLASS_E_MODIFIER) ? false : /* ---- / 277* (lbc == UTF8PROC_BOUNDCLASS_ZWJ && /* GB11 / 278* (tbc == UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ \|\| /* ---- / 279* tbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ)) ? false : /* ---- / 280* (lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && /* GB12/13 (requires additional handling below) / 281* tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false : /* ---- / 282* true; /* GB999 / 283} 284* 285static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t state) 286{ 287* utf8proc_bool break_permitted; 288 int lbc_override = lbc; 289 if (state && state != UTF8PROC_BOUNDCLASS_START) 290* lbc_override = state; 291* break_permitted = grapheme_break_simple(lbc_override, tbc); 292 if (state) { 293 /* Special support for GB 12/13 made possible by GB999. After two RI 294 class codepoints we want to force a break. Do this by resetting the 295 second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break 296 after that character according to GB999 (unless of course such a break is 297 forbidden by a different rule such as GB9). / 298* if (state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) 299* state = UTF8PROC_BOUNDCLASS_OTHER; 300* /* Special support for GB10. Fold any EXTEND codepoints into the previous 301 boundclass if we're dealing with an emoji base boundclass. / 302* else if ((state == UTF8PROC_BOUNDCLASS_E_BASE \|\| 303* state == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) && 304* tbc == UTF8PROC_BOUNDCLASS_EXTEND) 305 state = UTF8PROC_BOUNDCLASS_E_BASE; 306* else 307 state = tbc; 308* } 309 return break_permitted; 310} 311 312UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful( 313 utf8proc_int32_t c1, utf8proc_int32_t c2, utf8proc_int32_t state) { 314* 315 return grapheme_break_extended(utf8proc_get_property(c1)->boundclass, 316 utf8proc_get_property(c2)->boundclass, 317 state); 318} 319 320 321UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break( 322 utf8proc_int32_t c1, utf8proc_int32_t c2) { 323 return utf8proc_grapheme_break_stateful(c1, c2, NULL); 324} 325 326static utf8proc_int32_t seqindex_decode_entry(const utf8proc_uint16_t *entry) 327{ 328* utf8proc_int32_t entry_cp = *entry; 329* if ((entry_cp & 0xF800) == 0xD800) { 330 entry = entry + 1; 331 entry_cp = ((entry_cp & 0x03FF) << 10) \| (*entry & 0x03FF); 332* entry_cp += 0x10000; 333 } 334 return entry_cp; 335} 336 337static utf8proc_int32_t seqindex_decode_index(const utf8proc_uint32_t seqindex) 338{ 339 const utf8proc_uint16_t entry = &utf8proc_sequences[seqindex]; 340* return seqindex_decode_entry(&entry); 341} 342 343static utf8proc_ssize_t seqindex_write_char_decomposed(utf8proc_uint16_t seqindex, utf8proc_int32_t dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int last_boundclass) { 344 utf8proc_ssize_t written = 0; 345 const utf8proc_uint16_t entry = &utf8proc_sequences[seqindex & 0x1FFF]; 346* int len = seqindex >> 13; 347 if (len >= 7) { 348 len = entry; 349* entry++; 350 } 351 for (; len >= 0; entry++, len--) { 352 utf8proc_int32_t entry_cp = seqindex_decode_entry(&entry); 353 354 written += utf8proc_decompose_char(entry_cp, dst+written, 355 (bufsize > written) ? (bufsize - written) : 0, options, 356 last_boundclass); 357 if (written < 0) return UTF8PROC_ERROR_OVERFLOW; 358 } 359 return written; 360} 361 362UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c) 363{ 364 utf8proc_int32_t cl = utf8proc_get_property(c)->lowercase_seqindex; 365 return cl != UINT16_MAX ? seqindex_decode_index(cl) : c; 366} 367 368UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c) 369{ 370 utf8proc_int32_t cu = utf8proc_get_property(c)->uppercase_seqindex; 371 return cu != UINT16_MAX ? seqindex_decode_index(cu) : c; 372} 373 374UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c) 375{ 376 utf8proc_int32_t cu = utf8proc_get_property(c)->titlecase_seqindex; 377 return cu != UINT16_MAX ? seqindex_decode_index(cu) : c; 378} 379 380/* return a character width analogous to wcwidth (except portable and 381 hopefully less buggy than most system wcwidth functions). / 382UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) { 383* return utf8proc_get_property(c)->charwidth; 384} 385 386UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t c) { 387 return utf8proc_get_property(c)->category; 388} 389 390UTF8PROC_DLLEXPORT const char utf8proc_category_string(utf8proc_int32_t c) { 391* static const char s[][3] = {"Cn","Lu","Ll","Lt","Lm","Lo","Mn","Mc","Me","Nd","Nl","No","Pc","Pd","Ps","Pe","Pi","Pf","Po","Sm","Sc","Sk","So","Zs","Zl","Zp","Cc","Cf","Cs","Co"}; 392 return s[utf8proc_category(c)]; 393} 394
221#define utf8proc_decompose_lump(replacement_uc) \ 222 return utf8proc_decompose_char((replacement_uc), dst, bufsize, \ 223 options & ~UTF8PROC_LUMP, last_boundclass) 224	395#define utf8proc_decompose_lump(replacement_uc) \ 396 return utf8proc_decompose_char((replacement_uc), dst, bufsize, \ 397 options & ~UTF8PROC_LUMP, last_boundclass) 398
225UTF8PROC_API 226ssize_t utf8proc_decompose_char(int32_t uc, int32_t dst, ssize_t bufsize, 227* int options, int last_boundclass) { 228* /* ASSERT: uc >= 0 && uc < 0x110000 */	399UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, utf8proc_int32_t dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int last_boundclass) {
229 const utf8proc_property_t property; 230* utf8proc_propval_t category;	400 const utf8proc_property_t property; 401* utf8proc_propval_t category;
231 int32_t hangul_sindex; 232 property = utf8proc_get_property(uc);	402 utf8proc_int32_t hangul_sindex; 403 if (uc < 0 \|\| uc >= 0x110000) return UTF8PROC_ERROR_NOTASSIGNED; 404 property = unsafe_get_property(uc);
233 category = property->category; 234 hangul_sindex = uc - UTF8PROC_HANGUL_SBASE; 235 if (options & (UTF8PROC_COMPOSE\|UTF8PROC_DECOMPOSE)) { 236 if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) {	405 category = property->category; 406 hangul_sindex = uc - UTF8PROC_HANGUL_SBASE; 407 if (options & (UTF8PROC_COMPOSE\|UTF8PROC_DECOMPOSE)) { 408 if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) {
237 int32_t hangul_tindex;	409 utf8proc_int32_t hangul_tindex;
238 if (bufsize >= 1) { 239 dst[0] = UTF8PROC_HANGUL_LBASE + 240 hangul_sindex / UTF8PROC_HANGUL_NCOUNT; 241 if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE + 242 (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT; 243 } 244 hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT; 245 if (!hangul_tindex) return 2; --- 34 unchanged lines hidden (view full) --- 280 } 281 } 282 if (options & UTF8PROC_STRIPMARK) { 283 if (category == UTF8PROC_CATEGORY_MN \|\| 284 category == UTF8PROC_CATEGORY_MC \|\| 285 category == UTF8PROC_CATEGORY_ME) return 0; 286 } 287 if (options & UTF8PROC_CASEFOLD) {	410 if (bufsize >= 1) { 411 dst[0] = UTF8PROC_HANGUL_LBASE + 412 hangul_sindex / UTF8PROC_HANGUL_NCOUNT; 413 if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE + 414 (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT; 415 } 416 hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT; 417 if (!hangul_tindex) return 2; --- 34 unchanged lines hidden (view full) --- 452 } 453 } 454 if (options & UTF8PROC_STRIPMARK) { 455 if (category == UTF8PROC_CATEGORY_MN \|\| 456 category == UTF8PROC_CATEGORY_MC \|\| 457 category == UTF8PROC_CATEGORY_ME) return 0; 458 } 459 if (options & UTF8PROC_CASEFOLD) {
288 if (property->casefold_mapping) { 289 const int32_t casefold_entry; 290* ssize_t written = 0; 291 for (casefold_entry = property->casefold_mapping; 292 casefold_entry >= 0; casefold_entry++) { 293* written += utf8proc_decompose_char(casefold_entry, dst+written, 294* (bufsize > written) ? (bufsize - written) : 0, options, 295 last_boundclass); 296 if (written < 0) return UTF8PROC_ERROR_OVERFLOW; 297 } 298 return written;	460 if (property->casefold_seqindex != UINT16_MAX) { 461 return seqindex_write_char_decomposed(property->casefold_seqindex, dst, bufsize, options, last_boundclass);
299 } 300 } 301 if (options & (UTF8PROC_COMPOSE\|UTF8PROC_DECOMPOSE)) {	462 } 463 } 464 if (options & (UTF8PROC_COMPOSE\|UTF8PROC_DECOMPOSE)) {
302 if (property->decomp_mapping &&	465 if (property->decomp_seqindex != UINT16_MAX &&
303 (!property->decomp_type \|\| (options & UTF8PROC_COMPAT))) {	466 (!property->decomp_type \|\| (options & UTF8PROC_COMPAT))) {
304 const int32_t decomp_entry; 305* ssize_t written = 0; 306 for (decomp_entry = property->decomp_mapping; 307 decomp_entry >= 0; decomp_entry++) { 308* written += utf8proc_decompose_char(decomp_entry, dst+written, 309* (bufsize > written) ? (bufsize - written) : 0, options, 310 last_boundclass); 311 if (written < 0) return UTF8PROC_ERROR_OVERFLOW; 312 } 313 return written;	467 return seqindex_write_char_decomposed(property->decomp_seqindex, dst, bufsize, options, last_boundclass);
314 } 315 } 316 if (options & UTF8PROC_CHARBOUND) {	468 } 469 } 470 if (options & UTF8PROC_CHARBOUND) {
317 bool boundary; 318 int tbc, lbc; 319 tbc = 320 (uc == 0x000D) ? UTF8PROC_BOUNDCLASS_CR : 321 (uc == 0x000A) ? UTF8PROC_BOUNDCLASS_LF : 322 ((category == UTF8PROC_CATEGORY_ZL \|\| 323 category == UTF8PROC_CATEGORY_ZP \|\| 324 category == UTF8PROC_CATEGORY_CC \|\| 325 category == UTF8PROC_CATEGORY_CF) && 326 !(uc == 0x200C \|\| uc == 0x200D)) ? UTF8PROC_BOUNDCLASS_CONTROL : 327 property->extend ? UTF8PROC_BOUNDCLASS_EXTEND : 328 ((uc >= UTF8PROC_HANGUL_L_START && uc < UTF8PROC_HANGUL_L_END) \|\| 329 uc == UTF8PROC_HANGUL_L_FILLER) ? UTF8PROC_BOUNDCLASS_L : 330 (uc >= UTF8PROC_HANGUL_V_START && uc < UTF8PROC_HANGUL_V_END) ? 331 UTF8PROC_BOUNDCLASS_V : 332 (uc >= UTF8PROC_HANGUL_T_START && uc < UTF8PROC_HANGUL_T_END) ? 333 UTF8PROC_BOUNDCLASS_T : 334 (uc >= UTF8PROC_HANGUL_S_START && uc < UTF8PROC_HANGUL_S_END) ? ( 335 ((uc-UTF8PROC_HANGUL_SBASE) % UTF8PROC_HANGUL_TCOUNT == 0) ? 336 UTF8PROC_BOUNDCLASS_LV : UTF8PROC_BOUNDCLASS_LVT 337 ) : 338 UTF8PROC_BOUNDCLASS_OTHER; 339 lbc = last_boundclass; 340* boundary = 341 (tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false : 342 (lbc == UTF8PROC_BOUNDCLASS_START) ? true : 343 (lbc == UTF8PROC_BOUNDCLASS_CR && 344 tbc == UTF8PROC_BOUNDCLASS_LF) ? false : 345 (lbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true : 346 (tbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true : 347 (lbc == UTF8PROC_BOUNDCLASS_L && 348 (tbc == UTF8PROC_BOUNDCLASS_L \|\| 349 tbc == UTF8PROC_BOUNDCLASS_V \|\| 350 tbc == UTF8PROC_BOUNDCLASS_LV \|\| 351 tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false : 352 ((lbc == UTF8PROC_BOUNDCLASS_LV \|\| 353 lbc == UTF8PROC_BOUNDCLASS_V) && 354 (tbc == UTF8PROC_BOUNDCLASS_V \|\| 355 tbc == UTF8PROC_BOUNDCLASS_T)) ? false : 356 ((lbc == UTF8PROC_BOUNDCLASS_LVT \|\| 357 lbc == UTF8PROC_BOUNDCLASS_T) && 358 tbc == UTF8PROC_BOUNDCLASS_T) ? false : 359 true; 360 *last_boundclass = tbc;	471 utf8proc_bool boundary; 472 int tbc = property->boundclass; 473 boundary = grapheme_break_extended(*last_boundclass, tbc, last_boundclass);
361 if (boundary) { 362 if (bufsize >= 1) dst[0] = 0xFFFF; 363 if (bufsize >= 2) dst[1] = uc; 364 return 2; 365 } 366 } 367 if (bufsize >= 1) dst = uc; 368* return 1; 369} 370	474 if (boundary) { 475 if (bufsize >= 1) dst[0] = 0xFFFF; 476 if (bufsize >= 2) dst[1] = uc; 477 return 2; 478 } 479 } 480 if (bufsize >= 1) dst = uc; 481* return 1; 482} 483
371UTF8PROC_API 372ssize_t utf8proc_decompose( 373 const uint8_t str, ssize_t strlen, 374* int32_t *buffer, ssize_t bufsize, int options	484UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose( 485 const utf8proc_uint8_t str, utf8proc_ssize_t strlen, 486* utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
375) {	487) {
	488 return utf8proc_decompose_custom(str, strlen, buffer, bufsize, options, NULL, NULL); 489} 490 491UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom( 492 const utf8proc_uint8_t str, utf8proc_ssize_t strlen, 493* utf8proc_int32_t buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options, 494* utf8proc_custom_func custom_func, void custom_data 495*) {
376 /* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */	496 /* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */
377 ssize_t wpos = 0;	497 utf8proc_ssize_t wpos = 0;
378 if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE)) 379 return UTF8PROC_ERROR_INVALIDOPTS; 380 if ((options & UTF8PROC_STRIPMARK) && 381 !(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE)) 382 return UTF8PROC_ERROR_INVALIDOPTS; 383 {	498 if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE)) 499 return UTF8PROC_ERROR_INVALIDOPTS; 500 if ((options & UTF8PROC_STRIPMARK) && 501 !(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE)) 502 return UTF8PROC_ERROR_INVALIDOPTS; 503 {
384 int32_t uc; 385 ssize_t rpos = 0; 386 ssize_t decomp_result;	504 utf8proc_int32_t uc; 505 utf8proc_ssize_t rpos = 0; 506 utf8proc_ssize_t decomp_result;
387 int boundclass = UTF8PROC_BOUNDCLASS_START; 388 while (1) { 389 if (options & UTF8PROC_NULLTERM) { 390 rpos += utf8proc_iterate(str + rpos, -1, &uc);	507 int boundclass = UTF8PROC_BOUNDCLASS_START; 508 while (1) { 509 if (options & UTF8PROC_NULLTERM) { 510 rpos += utf8proc_iterate(str + rpos, -1, &uc);
391 /* checking of return value is not neccessary,	511 /* checking of return value is not necessary,
392 as 'uc' is < 0 in case of error / 393* if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8; 394 if (rpos < 0) return UTF8PROC_ERROR_OVERFLOW; 395 if (uc == 0) break; 396 } else { 397 if (rpos >= strlen) break; 398 rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc); 399 if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8; 400 }	512 as 'uc' is < 0 in case of error / 513* if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8; 514 if (rpos < 0) return UTF8PROC_ERROR_OVERFLOW; 515 if (uc == 0) break; 516 } else { 517 if (rpos >= strlen) break; 518 rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc); 519 if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8; 520 }
	521 if (custom_func != NULL) { 522 uc = custom_func(uc, custom_data); /* user-specified custom mapping / 523* }
401 decomp_result = utf8proc_decompose_char( 402 uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options, 403 &boundclass 404 ); 405 if (decomp_result < 0) return decomp_result; 406 wpos += decomp_result; 407 /* prohibiting integer overflows due to too long strings: */	524 decomp_result = utf8proc_decompose_char( 525 uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options, 526 &boundclass 527 ); 528 if (decomp_result < 0) return decomp_result; 529 wpos += decomp_result; 530 /* prohibiting integer overflows due to too long strings: */
408 if (wpos < 0 \|\| wpos > SSIZE_MAX/sizeof(int32_t)/2)	531 if (wpos < 0 \|\| 532 wpos > (utf8proc_ssize_t)(SSIZE_MAX/sizeof(utf8proc_int32_t)/2))
409 return UTF8PROC_ERROR_OVERFLOW; 410 } 411 } 412 if ((options & (UTF8PROC_COMPOSE\|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) {	533 return UTF8PROC_ERROR_OVERFLOW; 534 } 535 } 536 if ((options & (UTF8PROC_COMPOSE\|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) {
413 ssize_t pos = 0;	537 utf8proc_ssize_t pos = 0;
414 while (pos < wpos-1) {	538 while (pos < wpos-1) {
415 int32_t uc1, uc2;	539 utf8proc_int32_t uc1, uc2;
416 const utf8proc_property_t property1, property2; 417 uc1 = buffer[pos]; 418 uc2 = buffer[pos+1];	540 const utf8proc_property_t property1, property2; 541 uc1 = buffer[pos]; 542 uc2 = buffer[pos+1];
419 property1 = utf8proc_get_property(uc1); 420 property2 = utf8proc_get_property(uc2);	543 property1 = unsafe_get_property(uc1); 544 property2 = unsafe_get_property(uc2);
421 if (property1->combining_class > property2->combining_class && 422 property2->combining_class > 0) { 423 buffer[pos] = uc2; 424 buffer[pos+1] = uc1; 425 if (pos > 0) pos--; else pos++; 426 } else { 427 pos++; 428 } 429 } 430 } 431 return wpos; 432} 433	545 if (property1->combining_class > property2->combining_class && 546 property2->combining_class > 0) { 547 buffer[pos] = uc2; 548 buffer[pos+1] = uc1; 549 if (pos > 0) pos--; else pos++; 550 } else { 551 pos++; 552 } 553 } 554 } 555 return wpos; 556} 557
434UTF8PROC_API 435ssize_t utf8proc_reencode(int32_t buffer, ssize_t length, int options) { 436* /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored 437 ASSERT: 'buffer' has one spare byte of free space at the end! */	558UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t buffer, utf8proc_ssize_t length, utf8proc_option_t options) { 559* /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored */
438 if (options & (UTF8PROC_NLF2LS \| UTF8PROC_NLF2PS \| UTF8PROC_STRIPCC)) {	560 if (options & (UTF8PROC_NLF2LS \| UTF8PROC_NLF2PS \| UTF8PROC_STRIPCC)) {
439 ssize_t rpos; 440 ssize_t wpos = 0; 441 int32_t uc;	561 utf8proc_ssize_t rpos; 562 utf8proc_ssize_t wpos = 0; 563 utf8proc_int32_t uc;
442 for (rpos = 0; rpos < length; rpos++) { 443 uc = buffer[rpos]; 444 if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++; 445 if (uc == 0x000A \|\| uc == 0x000D \|\| uc == 0x0085 \|\| 446 ((options & UTF8PROC_STRIPCC) && (uc == 0x000B \|\| uc == 0x000C))) { 447 if (options & UTF8PROC_NLF2LS) { 448 if (options & UTF8PROC_NLF2PS) { 449 buffer[wpos++] = 0x000A; --- 12 unchanged lines hidden (view full) --- 462 if (uc == 0x0009) buffer[wpos++] = 0x0020; 463 } else { 464 buffer[wpos++] = uc; 465 } 466 } 467 length = wpos; 468 } 469 if (options & UTF8PROC_COMPOSE) {	564 for (rpos = 0; rpos < length; rpos++) { 565 uc = buffer[rpos]; 566 if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++; 567 if (uc == 0x000A \|\| uc == 0x000D \|\| uc == 0x0085 \|\| 568 ((options & UTF8PROC_STRIPCC) && (uc == 0x000B \|\| uc == 0x000C))) { 569 if (options & UTF8PROC_NLF2LS) { 570 if (options & UTF8PROC_NLF2PS) { 571 buffer[wpos++] = 0x000A; --- 12 unchanged lines hidden (view full) --- 584 if (uc == 0x0009) buffer[wpos++] = 0x0020; 585 } else { 586 buffer[wpos++] = uc; 587 } 588 } 589 length = wpos; 590 } 591 if (options & UTF8PROC_COMPOSE) {
470 int32_t starter = NULL; 471* int32_t current_char;	592 utf8proc_int32_t starter = NULL; 593* utf8proc_int32_t current_char;
472 const utf8proc_property_t starter_property = NULL, current_property; 473 utf8proc_propval_t max_combining_class = -1;	594 const utf8proc_property_t starter_property = NULL, current_property; 595 utf8proc_propval_t max_combining_class = -1;
474 ssize_t rpos; 475 ssize_t wpos = 0; 476 int32_t composition;	596 utf8proc_ssize_t rpos; 597 utf8proc_ssize_t wpos = 0; 598 utf8proc_int32_t composition;
477 for (rpos = 0; rpos < length; rpos++) { 478 current_char = buffer[rpos];	599 for (rpos = 0; rpos < length; rpos++) { 600 current_char = buffer[rpos];
479 current_property = utf8proc_get_property(current_char);	601 current_property = unsafe_get_property(current_char);
480 if (starter && current_property->combining_class > max_combining_class) { 481 /* combination perhaps possible */	602 if (starter && current_property->combining_class > max_combining_class) { 603 /* combination perhaps possible */
482 int32_t hangul_lindex; 483 int32_t hangul_sindex;	604 utf8proc_int32_t hangul_lindex; 605 utf8proc_int32_t hangul_sindex;
484 hangul_lindex = starter - UTF8PROC_HANGUL_LBASE; 485* if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) {	606 hangul_lindex = starter - UTF8PROC_HANGUL_LBASE; 607* if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) {
486 int32_t hangul_vindex;	608 utf8proc_int32_t hangul_vindex;
487 hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE; 488 if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) { 489 starter = UTF8PROC_HANGUL_SBASE + 490* (hangul_lindex * UTF8PROC_HANGUL_VCOUNT + hangul_vindex) * 491 UTF8PROC_HANGUL_TCOUNT; 492 starter_property = NULL; 493 continue; 494 } 495 } 496 hangul_sindex = starter - UTF8PROC_HANGUL_SBASE; 497* if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT && 498 (hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) {	609 hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE; 610 if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) { 611 starter = UTF8PROC_HANGUL_SBASE + 612* (hangul_lindex * UTF8PROC_HANGUL_VCOUNT + hangul_vindex) * 613 UTF8PROC_HANGUL_TCOUNT; 614 starter_property = NULL; 615 continue; 616 } 617 } 618 hangul_sindex = starter - UTF8PROC_HANGUL_SBASE; 619* if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT && 620 (hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) {
499 int32_t hangul_tindex;	621 utf8proc_int32_t hangul_tindex;
500 hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE; 501 if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) { 502 starter += hangul_tindex; 503* starter_property = NULL; 504 continue; 505 } 506 } 507 if (!starter_property) {	622 hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE; 623 if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) { 624 starter += hangul_tindex; 625* starter_property = NULL; 626 continue; 627 } 628 } 629 if (!starter_property) {
508 starter_property = utf8proc_get_property(*starter);	630 starter_property = unsafe_get_property(*starter);
509 }	631 }
510 if (starter_property->comb1st_index >= 0 && 511 current_property->comb2nd_index >= 0) { 512 composition = utf8proc_combinations[ 513 starter_property->comb1st_index + 514 current_property->comb2nd_index 515 ]; 516 if (composition >= 0 && (!(options & UTF8PROC_STABLE) \|\| 517 !(utf8proc_get_property(composition)->comp_exclusion))) { 518 starter = composition; 519* starter_property = NULL; 520 continue;	632 if (starter_property->comb_index < 0x8000 && 633 current_property->comb_index != UINT16_MAX && 634 current_property->comb_index >= 0x8000) { 635 int sidx = starter_property->comb_index; 636 int idx = (current_property->comb_index & 0x3FFF) - utf8proc_combinations[sidx]; 637 if (idx >= 0 && idx <= utf8proc_combinations[sidx + 1] ) { 638 idx += sidx + 2; 639 if (current_property->comb_index & 0x4000) { 640 composition = (utf8proc_combinations[idx] << 16) \| utf8proc_combinations[idx+1]; 641 } else 642 composition = utf8proc_combinations[idx]; 643 644 if (composition > 0 && (!(options & UTF8PROC_STABLE) \|\| 645 !(unsafe_get_property(composition)->comp_exclusion))) { 646 starter = composition; 647* starter_property = NULL; 648 continue; 649 }
521 } 522 } 523 } 524 buffer[wpos] = current_char; 525 if (current_property->combining_class) { 526 if (current_property->combining_class > max_combining_class) { 527 max_combining_class = current_property->combining_class; 528 } 529 } else { 530 starter = buffer + wpos; 531 starter_property = NULL; 532 max_combining_class = -1; 533 } 534 wpos++; 535 } 536 length = wpos; 537 }	650 } 651 } 652 } 653 buffer[wpos] = current_char; 654 if (current_property->combining_class) { 655 if (current_property->combining_class > max_combining_class) { 656 max_combining_class = current_property->combining_class; 657 } 658 } else { 659 starter = buffer + wpos; 660 starter_property = NULL; 661 max_combining_class = -1; 662 } 663 wpos++; 664 } 665 length = wpos; 666 }
	667 return length; 668} 669 670UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t buffer, utf8proc_ssize_t length, utf8proc_option_t options) { 671* /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored 672 ASSERT: 'buffer' has one spare byte of free space at the end! / 673* length = utf8proc_normalize_utf32(buffer, length, options); 674 if (length < 0) return length;
538 {	675 {
539 ssize_t rpos, wpos = 0; 540 int32_t uc; 541 for (rpos = 0; rpos < length; rpos++) { 542 uc = buffer[rpos]; 543 wpos += utf8proc_encode_char(uc, ((uint8_t *)buffer) + wpos);	676 utf8proc_ssize_t rpos, wpos = 0; 677 utf8proc_int32_t uc; 678 if (options & UTF8PROC_CHARBOUND) { 679 for (rpos = 0; rpos < length; rpos++) { 680 uc = buffer[rpos]; 681 wpos += unsafe_encode_char(uc, ((utf8proc_uint8_t )buffer) + wpos); 682* } 683 } else { 684 for (rpos = 0; rpos < length; rpos++) { 685 uc = buffer[rpos]; 686 wpos += utf8proc_encode_char(uc, ((utf8proc_uint8_t )buffer) + wpos); 687* }
544 }	688 }
545 ((uint8_t *)buffer)[wpos] = 0;	689 ((utf8proc_uint8_t *)buffer)[wpos] = 0;
546 return wpos; 547 } 548} 549	690 return wpos; 691 } 692} 693
550UTF8PROC_API 551ssize_t utf8proc_map( 552 const uint8_t str, ssize_t strlen, uint8_t *dstptr, int options	694UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map( 695 const utf8proc_uint8_t str, utf8proc_ssize_t strlen, utf8proc_uint8_t *dstptr, utf8proc_option_t options
553) {	696) {
554 int32_t buffer; 555* ssize_t result;	697 return utf8proc_map_custom(str, strlen, dstptr, options, NULL, NULL); 698} 699 700UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom( 701 const utf8proc_uint8_t str, utf8proc_ssize_t strlen, utf8proc_uint8_t dstptr, utf8proc_option_t options, 702* utf8proc_custom_func custom_func, void custom_data 703) { 704* utf8proc_int32_t buffer; 705* utf8proc_ssize_t result;
556 *dstptr = NULL;	706 *dstptr = NULL;
557 result = utf8proc_decompose(str, strlen, NULL, 0, options);	707 result = utf8proc_decompose_custom(str, strlen, NULL, 0, options, custom_func, custom_data);
558 if (result < 0) return result;	708 if (result < 0) return result;
559 buffer = malloc(result * sizeof(int32_t) + 1);	709 buffer = (utf8proc_int32_t ) malloc(result sizeof(utf8proc_int32_t) + 1);
560 if (!buffer) return UTF8PROC_ERROR_NOMEM;	710 if (!buffer) return UTF8PROC_ERROR_NOMEM;
561 result = utf8proc_decompose(str, strlen, buffer, result, options);	711 result = utf8proc_decompose_custom(str, strlen, buffer, result, options, custom_func, custom_data);
562 if (result < 0) { 563 free(buffer); 564 return result; 565 } 566 result = utf8proc_reencode(buffer, result, options); 567 if (result < 0) { 568 free(buffer); 569 return result; 570 } 571 {	712 if (result < 0) { 713 free(buffer); 714 return result; 715 } 716 result = utf8proc_reencode(buffer, result, options); 717 if (result < 0) { 718 free(buffer); 719 return result; 720 } 721 {
572 int32_t newptr; 573* newptr = realloc(buffer, (size_t)result+1);	722 utf8proc_int32_t newptr; 723* newptr = (utf8proc_int32_t *) realloc(buffer, (size_t)result+1);
574 if (newptr) buffer = newptr; 575 }	724 if (newptr) buffer = newptr; 725 }
576 dstptr = (uint8_t )buffer;	726 dstptr = (utf8proc_uint8_t )buffer;
577 return result; 578} 579	727 return result; 728} 729
580UTF8PROC_API 581uint8_t utf8proc_NFD(const uint8_t str) { 582 uint8_t *retval;	730UTF8PROC_DLLEXPORT utf8proc_uint8_t utf8proc_NFD(const utf8proc_uint8_t str) { 731 utf8proc_uint8_t *retval;
583 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM \| UTF8PROC_STABLE \| 584 UTF8PROC_DECOMPOSE); 585 return retval; 586} 587	732 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM \| UTF8PROC_STABLE \| 733 UTF8PROC_DECOMPOSE); 734 return retval; 735} 736
588UTF8PROC_API 589uint8_t utf8proc_NFC(const uint8_t str) { 590 uint8_t *retval;	737UTF8PROC_DLLEXPORT utf8proc_uint8_t utf8proc_NFC(const utf8proc_uint8_t str) { 738 utf8proc_uint8_t *retval;
591 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM \| UTF8PROC_STABLE \| 592 UTF8PROC_COMPOSE); 593 return retval; 594} 595	739 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM \| UTF8PROC_STABLE \| 740 UTF8PROC_COMPOSE); 741 return retval; 742} 743
596UTF8PROC_API 597uint8_t utf8proc_NFKD(const uint8_t str) { 598 uint8_t *retval;	744UTF8PROC_DLLEXPORT utf8proc_uint8_t utf8proc_NFKD(const utf8proc_uint8_t str) { 745 utf8proc_uint8_t *retval;
599 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM \| UTF8PROC_STABLE \| 600 UTF8PROC_DECOMPOSE \| UTF8PROC_COMPAT); 601 return retval; 602} 603	746 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM \| UTF8PROC_STABLE \| 747 UTF8PROC_DECOMPOSE \| UTF8PROC_COMPAT); 748 return retval; 749} 750
604UTF8PROC_API 605uint8_t utf8proc_NFKC(const uint8_t str) { 606 uint8_t *retval;	751UTF8PROC_DLLEXPORT utf8proc_uint8_t utf8proc_NFKC(const utf8proc_uint8_t str) { 752 utf8proc_uint8_t *retval;
607 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM \| UTF8PROC_STABLE \| 608 UTF8PROC_COMPOSE \| UTF8PROC_COMPAT); 609 return retval; 610}	753 utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM \| UTF8PROC_STABLE \| 754 UTF8PROC_COMPOSE \| UTF8PROC_COMPAT); 755 return retval; 756}
611