1289177Speter/* 2289177Speter * utf8proc.c: Wrappers for the utf8proc library 3289177Speter * 4289177Speter * ==================================================================== 5289177Speter * Licensed to the Apache Software Foundation (ASF) under one 6289177Speter * or more contributor license agreements. See the NOTICE file 7289177Speter * distributed with this work for additional information 8289177Speter * regarding copyright ownership. The ASF licenses this file 9289177Speter * to you under the Apache License, Version 2.0 (the 10289177Speter * "License"); you may not use this file except in compliance 11289177Speter * with the License. You may obtain a copy of the License at 12289177Speter * 13289177Speter * http://www.apache.org/licenses/LICENSE-2.0 14289177Speter * 15289177Speter * Unless required by applicable law or agreed to in writing, 16289177Speter * software distributed under the License is distributed on an 17289177Speter * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 18289177Speter * KIND, either express or implied. See the License for the 19289177Speter * specific language governing permissions and limitations 20289177Speter * under the License. 21289177Speter * ==================================================================== 22289177Speter */ 23289177Speter 24289177Speter 25289177Speter 26289177Speter#include <apr_fnmatch.h> 27289177Speter 28289177Speter#include "private/svn_string_private.h" 29289177Speter#include "private/svn_utf_private.h" 30289177Speter#include "svn_private_config.h" 31289177Speter 32362181Sdim#if SVN_INTERNAL_UTF8PROC 33289177Speter#define UTF8PROC_INLINE 34289177Speter/* Somehow utf8proc thinks it is nice to use strlen as an argument name, 35289177Speter while this function is already defined via apr.h */ 36289177Speter#define strlen svn__strlen_var 37289177Speter#include "utf8proc/utf8proc.c" 38289177Speter#undef strlen 39362181Sdim#else 40362181Sdim#include <utf8proc.h> 41362181Sdim#endif 42289177Speter 43289177Speter 44289177Speter 45289177Speterconst char * 46289177Spetersvn_utf__utf8proc_compiled_version(void) 47289177Speter{ 48289177Speter static const char utf8proc_version[] = 49289177Speter APR_STRINGIFY(UTF8PROC_VERSION_MAJOR) "." 50289177Speter APR_STRINGIFY(UTF8PROC_VERSION_MINOR) "." 51289177Speter APR_STRINGIFY(UTF8PROC_VERSION_PATCH); 52289177Speter return utf8proc_version; 53289177Speter} 54289177Speter 55289177Speterconst char * 56289177Spetersvn_utf__utf8proc_runtime_version(void) 57289177Speter{ 58289177Speter /* Unused static function warning removal hack. */ 59362181Sdim SVN_UNUSED(utf8proc_grapheme_break); 60362181Sdim SVN_UNUSED(utf8proc_tolower); 61362181Sdim SVN_UNUSED(utf8proc_toupper); 62362181Sdim#if UTF8PROC_VERSION_MAJOR >= 2 63362181Sdim SVN_UNUSED(utf8proc_totitle); 64362181Sdim#endif 65362181Sdim SVN_UNUSED(utf8proc_charwidth); 66362181Sdim SVN_UNUSED(utf8proc_category_string); 67289177Speter SVN_UNUSED(utf8proc_NFD); 68289177Speter SVN_UNUSED(utf8proc_NFC); 69289177Speter SVN_UNUSED(utf8proc_NFKD); 70289177Speter SVN_UNUSED(utf8proc_NFKC); 71289177Speter 72289177Speter return utf8proc_version(); 73289177Speter} 74289177Speter 75289177Speter 76289177Speter 77289177Speter/* Fill the given BUFFER with decomposed UCS-4 representation of the 78289177Speter * UTF-8 STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING 79289177Speter * is NUL-terminated; otherwise look only at the first LENGTH bytes in 80289177Speter * STRING. Upon return, BUFFER->data points at an array of UCS-4 81289177Speter * characters, and return the length of the array. TRANSFORM_FLAGS 82289177Speter * define exactly how the decomposition is performed. 83289177Speter * 84289177Speter * A negative return value is an utf8proc error code and may indicate 85289177Speter * that STRING contains invalid UTF-8 or was so long that an overflow 86289177Speter * occurred. 87289177Speter */ 88362181Sdimstatic apr_ssize_t 89289177Speterunicode_decomposition(int transform_flags, 90289177Speter const char *string, apr_size_t length, 91289177Speter svn_membuf_t *buffer) 92289177Speter{ 93289177Speter const int nullterm = (length == SVN_UTF__UNKNOWN_LENGTH 94289177Speter ? UTF8PROC_NULLTERM : 0); 95289177Speter 96289177Speter for (;;) 97289177Speter { 98289177Speter apr_int32_t *const ucs4buf = buffer->data; 99362181Sdim const apr_ssize_t ucs4len = buffer->size / sizeof(*ucs4buf); 100362181Sdim const apr_ssize_t result = 101289177Speter utf8proc_decompose((const void*) string, length, ucs4buf, ucs4len, 102289177Speter UTF8PROC_DECOMPOSE | UTF8PROC_STABLE 103289177Speter | transform_flags | nullterm); 104289177Speter 105289177Speter if (result < 0 || result <= ucs4len) 106289177Speter return result; 107289177Speter 108289177Speter /* Increase the decomposition buffer size and retry */ 109289177Speter svn_membuf__ensure(buffer, result * sizeof(*ucs4buf)); 110289177Speter } 111289177Speter} 112289177Speter 113289177Speter/* Fill the given BUFFER with an NFD UCS-4 representation of the UTF-8 114289177Speter * STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING is 115289177Speter * NUL-terminated; otherwise look only at the first LENGTH bytes in 116289177Speter * STRING. Upon return, BUFFER->data points at an array of UCS-4 117289177Speter * characters and *RESULT_LENGTH contains the length of the array. 118289177Speter * 119289177Speter * A returned error may indicate that STRING contains invalid UTF-8 or 120289177Speter * invalid Unicode codepoints. Any error message comes from utf8proc. 121289177Speter */ 122289177Speterstatic svn_error_t * 123289177Speterdecompose_normalized(apr_size_t *result_length, 124289177Speter const char *string, apr_size_t length, 125289177Speter svn_membuf_t *buffer) 126289177Speter{ 127362181Sdim apr_ssize_t result = unicode_decomposition(0, string, length, buffer); 128289177Speter if (result < 0) 129289177Speter return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL, 130289177Speter gettext(utf8proc_errmsg(result))); 131289177Speter *result_length = result; 132289177Speter return SVN_NO_ERROR; 133289177Speter} 134289177Speter 135289177Speter/* Fill the given BUFFER with an NFC UTF-8 representation of the UTF-8 136289177Speter * STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING is 137289177Speter * NUL-terminated; otherwise look only at the first LENGTH bytes in 138289177Speter * STRING. Upon return, BUFFER->data points at a NUL-terminated string 139289177Speter * of UTF-8 characters. 140289177Speter * 141362181Sdim * If CASEFOLD is non-zero, perform Unicode case folding, e.g., for 142362181Sdim * case-insensitive string comparison. If STRIPMARK is non-zero, strip 143362181Sdim * all diacritical marks (e.g., accents) from the string. 144362181Sdim * 145289177Speter * A returned error may indicate that STRING contains invalid UTF-8 or 146289177Speter * invalid Unicode codepoints. Any error message comes from utf8proc. 147289177Speter */ 148289177Speterstatic svn_error_t * 149289177Speternormalize_cstring(apr_size_t *result_length, 150289177Speter const char *string, apr_size_t length, 151362181Sdim svn_boolean_t casefold, 152362181Sdim svn_boolean_t stripmark, 153289177Speter svn_membuf_t *buffer) 154289177Speter{ 155362181Sdim int flags = 0; 156362181Sdim apr_ssize_t result; 157362181Sdim 158362181Sdim if (casefold) 159362181Sdim flags |= UTF8PROC_CASEFOLD; 160362181Sdim 161362181Sdim if (stripmark) 162362181Sdim flags |= UTF8PROC_STRIPMARK; 163362181Sdim 164362181Sdim result = unicode_decomposition(flags, string, length, buffer); 165289177Speter if (result >= 0) 166289177Speter { 167289177Speter svn_membuf__resize(buffer, result * sizeof(apr_int32_t) + 1); 168289177Speter result = utf8proc_reencode(buffer->data, result, 169289177Speter UTF8PROC_COMPOSE | UTF8PROC_STABLE); 170289177Speter } 171289177Speter if (result < 0) 172289177Speter return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL, 173289177Speter gettext(utf8proc_errmsg(result))); 174289177Speter *result_length = result; 175289177Speter return SVN_NO_ERROR; 176289177Speter} 177289177Speter 178289177Speter/* Compare two arrays of UCS-4 codes, BUFA of length LENA and BUFB of 179289177Speter * length LENB. Return 0 if they're equal, a negative value if BUFA is 180289177Speter * less than BUFB, otherwise a positive value. 181289177Speter * 182289177Speter * Yes, this is strcmp for known-length UCS-4 strings. 183289177Speter */ 184289177Speterstatic int 185289177Speterucs4cmp(const apr_int32_t *bufa, apr_size_t lena, 186289177Speter const apr_int32_t *bufb, apr_size_t lenb) 187289177Speter{ 188289177Speter const apr_size_t len = (lena < lenb ? lena : lenb); 189289177Speter apr_size_t i; 190289177Speter 191289177Speter for (i = 0; i < len; ++i) 192289177Speter { 193289177Speter const int diff = bufa[i] - bufb[i]; 194289177Speter if (diff) 195289177Speter return diff; 196289177Speter } 197289177Speter return (lena == lenb ? 0 : (lena < lenb ? -1 : 1)); 198289177Speter} 199289177Speter 200289177Spetersvn_error_t * 201289177Spetersvn_utf__normcmp(int *result, 202289177Speter const char *str1, apr_size_t len1, 203289177Speter const char *str2, apr_size_t len2, 204289177Speter svn_membuf_t *buf1, svn_membuf_t *buf2) 205289177Speter{ 206289177Speter apr_size_t buflen1; 207289177Speter apr_size_t buflen2; 208289177Speter 209289177Speter /* Shortcut-circuit the decision if at least one of the strings is empty. */ 210289177Speter const svn_boolean_t empty1 = 211289177Speter (0 == len1 || (len1 == SVN_UTF__UNKNOWN_LENGTH && !*str1)); 212289177Speter const svn_boolean_t empty2 = 213289177Speter (0 == len2 || (len2 == SVN_UTF__UNKNOWN_LENGTH && !*str2)); 214289177Speter if (empty1 || empty2) 215289177Speter { 216289177Speter *result = (empty1 == empty2 ? 0 : (empty1 ? -1 : 1)); 217289177Speter return SVN_NO_ERROR; 218289177Speter } 219289177Speter 220289177Speter SVN_ERR(decompose_normalized(&buflen1, str1, len1, buf1)); 221289177Speter SVN_ERR(decompose_normalized(&buflen2, str2, len2, buf2)); 222289177Speter *result = ucs4cmp(buf1->data, buflen1, buf2->data, buflen2); 223289177Speter return SVN_NO_ERROR; 224289177Speter} 225289177Speter 226289177Spetersvn_error_t* 227289177Spetersvn_utf__normalize(const char **result, 228289177Speter const char *str, apr_size_t len, 229289177Speter svn_membuf_t *buf) 230289177Speter{ 231289177Speter apr_size_t result_length; 232362181Sdim SVN_ERR(normalize_cstring(&result_length, str, len, FALSE, FALSE, buf)); 233289177Speter *result = (const char*)(buf->data); 234289177Speter return SVN_NO_ERROR; 235289177Speter} 236289177Speter 237362181Sdimsvn_error_t * 238362181Sdimsvn_utf__xfrm(const char **result, 239362181Sdim const char *str, apr_size_t len, 240362181Sdim svn_boolean_t case_insensitive, 241362181Sdim svn_boolean_t accent_insensitive, 242362181Sdim svn_membuf_t *buf) 243362181Sdim{ 244362181Sdim apr_size_t result_length; 245362181Sdim SVN_ERR(normalize_cstring(&result_length, str, len, 246362181Sdim case_insensitive, accent_insensitive, buf)); 247362181Sdim *result = (const char*)(buf->data); 248362181Sdim return SVN_NO_ERROR; 249362181Sdim} 250362181Sdim 251362181Sdimsvn_boolean_t 252362181Sdimsvn_utf__fuzzy_glob_match(const char *str, 253362181Sdim const apr_array_header_t *patterns, 254362181Sdim svn_membuf_t *buf) 255362181Sdim{ 256362181Sdim const char *normalized; 257362181Sdim svn_error_t *err; 258362181Sdim int i; 259362181Sdim 260362181Sdim /* Try to normalize case and accents in STR. 261362181Sdim * 262362181Sdim * If that should fail for some reason, consider STR a mismatch. */ 263362181Sdim err = svn_utf__xfrm(&normalized, str, strlen(str), TRUE, TRUE, buf); 264362181Sdim if (err) 265362181Sdim { 266362181Sdim svn_error_clear(err); 267362181Sdim return FALSE; 268362181Sdim } 269362181Sdim 270362181Sdim /* Now see whether it matches any/all of the patterns. */ 271362181Sdim for (i = 0; i < patterns->nelts; ++i) 272362181Sdim { 273362181Sdim const char *pattern = APR_ARRAY_IDX(patterns, i, const char *); 274362181Sdim if (apr_fnmatch(pattern, normalized, 0) == APR_SUCCESS) 275362181Sdim return TRUE; 276362181Sdim } 277362181Sdim 278362181Sdim return FALSE; 279362181Sdim} 280362181Sdim 281289177Speter/* Decode a single UCS-4 code point to UTF-8, appending the result to BUFFER. 282289177Speter * Assume BUFFER is already filled to *LENGTH and return the new size there. 283289177Speter * This function does *not* nul-terminate the stringbuf! 284289177Speter * 285289177Speter * A returned error indicates that the codepoint is invalid. 286289177Speter */ 287289177Speterstatic svn_error_t * 288289177Speterencode_ucs4(svn_membuf_t *buffer, apr_int32_t ucs4chr, apr_size_t *length) 289289177Speter{ 290289177Speter apr_size_t utf8len; 291289177Speter 292289177Speter if (buffer->size - *length < 4) 293289177Speter svn_membuf__resize(buffer, buffer->size + 4); 294289177Speter 295362181Sdim utf8len = utf8proc_encode_char(ucs4chr, ((apr_byte_t*)buffer->data + *length)); 296289177Speter if (!utf8len) 297289177Speter return svn_error_createf(SVN_ERR_UTF8PROC_ERROR, NULL, 298289177Speter _("Invalid Unicode character U+%04lX"), 299289177Speter (long)ucs4chr); 300289177Speter *length += utf8len; 301289177Speter return SVN_NO_ERROR; 302289177Speter} 303289177Speter 304289177Spetersvn_error_t * 305289177Spetersvn_utf__encode_ucs4_string(svn_membuf_t *buffer, 306289177Speter const apr_int32_t *ucs4str, 307289177Speter apr_size_t length, 308289177Speter apr_size_t *result_length) 309289177Speter{ 310289177Speter *result_length = 0; 311289177Speter while (length-- > 0) 312289177Speter SVN_ERR(encode_ucs4(buffer, *ucs4str++, result_length)); 313289177Speter svn_membuf__resize(buffer, *result_length + 1); 314289177Speter ((char*)buffer->data)[*result_length] = '\0'; 315289177Speter return SVN_NO_ERROR; 316289177Speter} 317289177Speter 318289177Speter 319289177Spetersvn_error_t * 320289177Spetersvn_utf__glob(svn_boolean_t *match, 321289177Speter const char *pattern, apr_size_t pattern_len, 322289177Speter const char *string, apr_size_t string_len, 323289177Speter const char *escape, apr_size_t escape_len, 324289177Speter svn_boolean_t sql_like, 325289177Speter svn_membuf_t *pattern_buf, 326289177Speter svn_membuf_t *string_buf, 327289177Speter svn_membuf_t *temp_buf) 328289177Speter{ 329289177Speter apr_size_t patternbuf_len; 330289177Speter apr_size_t tempbuf_len; 331289177Speter 332289177Speter /* If we're in GLOB mode, we don't do custom escape chars. */ 333289177Speter if (escape && !sql_like) 334289177Speter return svn_error_create(SVN_ERR_UTF8_GLOB, NULL, 335289177Speter _("Cannot use a custom escape token" 336289177Speter " in glob matching mode")); 337289177Speter 338289177Speter /* Convert the patern to NFD UTF-8. We can't use the UCS-4 result 339289177Speter because apr_fnmatch can't handle it.*/ 340289177Speter SVN_ERR(decompose_normalized(&tempbuf_len, pattern, pattern_len, temp_buf)); 341289177Speter if (!sql_like) 342289177Speter SVN_ERR(svn_utf__encode_ucs4_string(pattern_buf, temp_buf->data, 343289177Speter tempbuf_len, &patternbuf_len)); 344289177Speter else 345289177Speter { 346289177Speter /* Convert a LIKE pattern to a GLOB pattern that apr_fnmatch can use. */ 347289177Speter const apr_int32_t *like = temp_buf->data; 348289177Speter apr_int32_t ucs4esc; 349289177Speter svn_boolean_t escaped; 350289177Speter apr_size_t i; 351289177Speter 352289177Speter if (!escape) 353289177Speter ucs4esc = -1; /* Definitely an invalid UCS-4 character. */ 354289177Speter else 355289177Speter { 356289177Speter const int nullterm = (escape_len == SVN_UTF__UNKNOWN_LENGTH 357289177Speter ? UTF8PROC_NULLTERM : 0); 358362181Sdim apr_ssize_t result = 359289177Speter utf8proc_decompose((const void*) escape, escape_len, &ucs4esc, 1, 360289177Speter UTF8PROC_DECOMPOSE | UTF8PROC_STABLE | nullterm); 361289177Speter if (result < 0) 362289177Speter return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL, 363289177Speter gettext(utf8proc_errmsg(result))); 364289177Speter if (result == 0 || result > 1) 365289177Speter return svn_error_create(SVN_ERR_UTF8_GLOB, NULL, 366289177Speter _("Escape token must be one character")); 367289177Speter if ((ucs4esc & 0xFF) != ucs4esc) 368289177Speter return svn_error_createf(SVN_ERR_UTF8_GLOB, NULL, 369289177Speter _("Invalid escape character U+%04lX"), 370289177Speter (long)ucs4esc); 371289177Speter } 372289177Speter 373289177Speter patternbuf_len = 0; 374289177Speter svn_membuf__ensure(pattern_buf, tempbuf_len + 1); 375289177Speter for (i = 0, escaped = FALSE; i < tempbuf_len; ++i, ++like) 376289177Speter { 377289177Speter if (*like == ucs4esc && !escaped) 378289177Speter { 379289177Speter svn_membuf__resize(pattern_buf, patternbuf_len + 1); 380289177Speter ((char*)pattern_buf->data)[patternbuf_len++] = '\\'; 381289177Speter escaped = TRUE; 382289177Speter } 383289177Speter else if (escaped) 384289177Speter { 385289177Speter SVN_ERR(encode_ucs4(pattern_buf, *like, &patternbuf_len)); 386289177Speter escaped = FALSE; 387289177Speter } 388289177Speter else 389289177Speter { 390289177Speter if ((*like == '[' || *like == '\\') && !escaped) 391289177Speter { 392289177Speter /* Escape brackets and backslashes which are always 393289177Speter literals in LIKE patterns. */ 394289177Speter svn_membuf__resize(pattern_buf, patternbuf_len + 1); 395289177Speter ((char*)pattern_buf->data)[patternbuf_len++] = '\\'; 396289177Speter escaped = TRUE; 397289177Speter --i; --like; 398289177Speter continue; 399289177Speter } 400289177Speter 401289177Speter /* Replace LIKE wildcards with their GLOB equivalents. */ 402289177Speter if (*like == '%' || *like == '_') 403289177Speter { 404289177Speter const char wildcard = (*like == '%' ? '*' : '?'); 405289177Speter svn_membuf__resize(pattern_buf, patternbuf_len + 1); 406289177Speter ((char*)pattern_buf->data)[patternbuf_len++] = wildcard; 407289177Speter } 408289177Speter else 409289177Speter SVN_ERR(encode_ucs4(pattern_buf, *like, &patternbuf_len)); 410289177Speter } 411289177Speter } 412289177Speter svn_membuf__resize(pattern_buf, patternbuf_len + 1); 413289177Speter ((char*)pattern_buf->data)[patternbuf_len] = '\0'; 414289177Speter } 415289177Speter 416289177Speter /* Now normalize the string */ 417289177Speter SVN_ERR(decompose_normalized(&tempbuf_len, string, string_len, temp_buf)); 418289177Speter SVN_ERR(svn_utf__encode_ucs4_string(string_buf, temp_buf->data, 419289177Speter tempbuf_len, &tempbuf_len)); 420289177Speter 421289177Speter *match = !apr_fnmatch(pattern_buf->data, string_buf->data, 0); 422289177Speter return SVN_NO_ERROR; 423289177Speter} 424289177Speter 425289177Spetersvn_boolean_t 426289177Spetersvn_utf__is_normalized(const char *string, apr_pool_t *scratch_pool) 427289177Speter{ 428289177Speter svn_error_t *err; 429289177Speter svn_membuf_t buffer; 430289177Speter apr_size_t result_length; 431289177Speter const apr_size_t length = strlen(string); 432289177Speter svn_membuf__create(&buffer, length * sizeof(apr_int32_t), scratch_pool); 433362181Sdim err = normalize_cstring(&result_length, string, length, 434362181Sdim FALSE, FALSE, &buffer); 435289177Speter if (err) 436289177Speter { 437289177Speter svn_error_clear(err); 438289177Speter return FALSE; 439289177Speter } 440289177Speter return (length == result_length && 0 == strcmp(string, buffer.data)); 441289177Speter} 442289177Speter 443289177Speterconst char * 444289177Spetersvn_utf__fuzzy_escape(const char *src, apr_size_t length, apr_pool_t *pool) 445289177Speter{ 446289177Speter /* Hexadecimal digits for code conversion. */ 447289177Speter static const char digits[] = "0123456789ABCDEF"; 448289177Speter 449289177Speter /* Flags used for Unicode decomposition. */ 450289177Speter static const int decomp_flags = ( 451289177Speter UTF8PROC_COMPAT | UTF8PROC_STABLE | UTF8PROC_LUMP 452289177Speter | UTF8PROC_NLF2LF | UTF8PROC_STRIPCC | UTF8PROC_STRIPMARK); 453289177Speter 454289177Speter svn_stringbuf_t *result; 455289177Speter svn_membuf_t buffer; 456362181Sdim apr_ssize_t decomp_length; 457362181Sdim apr_ssize_t len; 458289177Speter 459289177Speter /* Decompose to a non-reversible compatibility format. */ 460289177Speter svn_membuf__create(&buffer, length * sizeof(apr_int32_t), pool); 461289177Speter decomp_length = unicode_decomposition(decomp_flags, src, length, &buffer); 462289177Speter if (decomp_length < 0) 463289177Speter { 464289177Speter svn_membuf_t part; 465289177Speter apr_size_t done, prev; 466289177Speter 467289177Speter /* The only other error we can receive here indicates an integer 468289177Speter overflow due to the length of the input string. Not very 469289177Speter likely, but we certainly shouldn't continue in that case. */ 470289177Speter SVN_ERR_ASSERT_NO_RETURN(decomp_length == UTF8PROC_ERROR_INVALIDUTF8); 471289177Speter 472289177Speter /* Break the decomposition into parts that are valid UTF-8, and 473289177Speter bytes that are not. Represent the invalid bytes in the target 474289177Speter erray by their negative value. This works because utf8proc 475289177Speter will not generate Unicode code points with values larger than 476289177Speter U+10FFFF. */ 477289177Speter svn_membuf__create(&part, sizeof(apr_int32_t), pool); 478289177Speter decomp_length = 0; 479289177Speter done = prev = 0; 480289177Speter while (done < length) 481289177Speter { 482289177Speter apr_int32_t uc; 483289177Speter 484289177Speter while (done < length) 485289177Speter { 486362181Sdim len = utf8proc_iterate((apr_byte_t*)src + done, length - done, &uc); 487289177Speter if (len < 0) 488289177Speter break; 489289177Speter done += len; 490289177Speter } 491289177Speter 492289177Speter /* Decompose the valid part */ 493289177Speter if (done > prev) 494289177Speter { 495289177Speter len = unicode_decomposition( 496289177Speter decomp_flags, src + prev, done - prev, &part); 497289177Speter SVN_ERR_ASSERT_NO_RETURN(len > 0); 498289177Speter svn_membuf__resize( 499289177Speter &buffer, (decomp_length + len) * sizeof(apr_int32_t)); 500289177Speter memcpy((apr_int32_t*)buffer.data + decomp_length, 501289177Speter part.data, len * sizeof(apr_int32_t)); 502289177Speter decomp_length += len; 503289177Speter prev = done; 504289177Speter } 505289177Speter 506289177Speter /* What follows could be a valid UTF-8 sequence, but not 507289177Speter a valid Unicode character. */ 508289177Speter if (done < length) 509289177Speter { 510289177Speter const char *last; 511289177Speter 512289177Speter /* Determine the length of the UTF-8 sequence */ 513289177Speter const char *const p = src + done; 514362181Sdim len = utf8proc_utf8class[(apr_byte_t)*p]; 515289177Speter 516289177Speter /* Check if the multi-byte sequence is valid UTF-8. */ 517289177Speter if (len > 1 && len <= (apr_ssize_t)(length - done)) 518289177Speter last = svn_utf__last_valid(p, len); 519289177Speter else 520289177Speter last = NULL; 521289177Speter 522289177Speter /* Might not be a valid UTF-8 sequence at all */ 523289177Speter if (!last || (last && last - p < len)) 524289177Speter { 525289177Speter uc = -((apr_int32_t)(*p & 0xff)); 526289177Speter len = 1; 527289177Speter } 528289177Speter else 529289177Speter { 530289177Speter switch (len) 531289177Speter { 532289177Speter /* Decode the UTF-8 sequence without validation. */ 533289177Speter case 2: 534289177Speter uc = ((p[0] & 0x1f) << 6) + (p[1] & 0x3f); 535289177Speter break; 536289177Speter case 3: 537289177Speter uc = (((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) 538289177Speter + (p[2] & 0x3f)); 539289177Speter break; 540289177Speter case 4: 541289177Speter uc = (((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12) 542289177Speter + ((p[2] & 0x3f) << 6) + (p[3] & 0x3f)); 543289177Speter break; 544289177Speter default: 545289177Speter SVN_ERR_ASSERT_NO_RETURN( 546289177Speter !"Unexpected invalid UTF-8 byte"); 547289177Speter } 548289177Speter 549289177Speter } 550289177Speter 551289177Speter svn_membuf__resize( 552289177Speter &buffer, (decomp_length + 1) * sizeof(apr_int32_t)); 553289177Speter ((apr_int32_t*)buffer.data)[decomp_length++] = uc; 554289177Speter done += len; 555289177Speter prev = done; 556289177Speter } 557289177Speter } 558289177Speter } 559289177Speter 560289177Speter /* Scan the result and deleting any combining diacriticals and 561289177Speter inserting placeholders where any non-ascii characters remain. */ 562289177Speter result = svn_stringbuf_create_ensure(decomp_length, pool); 563289177Speter for (len = 0; len < decomp_length; ++len) 564289177Speter { 565289177Speter const apr_int32_t cp = ((apr_int32_t*)buffer.data)[len]; 566289177Speter if (cp > 0 && cp < 127) 567289177Speter svn_stringbuf_appendbyte(result, (char)cp); 568289177Speter else if (cp == 0) 569289177Speter svn_stringbuf_appendcstr(result, "\\0"); 570289177Speter else if (cp < 0) 571289177Speter { 572289177Speter const apr_int32_t rcp = ((-cp) & 0xff); 573289177Speter svn_stringbuf_appendcstr(result, "?\\"); 574289177Speter svn_stringbuf_appendbyte(result, digits[(rcp & 0x00f0) >> 4]); 575289177Speter svn_stringbuf_appendbyte(result, digits[(rcp & 0x000f)]); 576289177Speter } 577289177Speter else 578289177Speter { 579289177Speter if (utf8proc_codepoint_valid(cp)) 580289177Speter { 581289177Speter const utf8proc_property_t *prop = utf8proc_get_property(cp); 582289177Speter if (prop->combining_class != 0) 583289177Speter continue; /* Combining mark; ignore */ 584289177Speter svn_stringbuf_appendcstr(result, "{U+"); 585289177Speter } 586289177Speter else 587289177Speter svn_stringbuf_appendcstr(result, "{U?"); 588289177Speter if (cp > 0xffff) 589289177Speter { 590289177Speter svn_stringbuf_appendbyte(result, digits[(cp & 0xf00000) >> 20]); 591289177Speter svn_stringbuf_appendbyte(result, digits[(cp & 0x0f0000) >> 16]); 592289177Speter } 593289177Speter svn_stringbuf_appendbyte(result, digits[(cp & 0xf000) >> 12]); 594289177Speter svn_stringbuf_appendbyte(result, digits[(cp & 0x0f00) >> 8]); 595289177Speter svn_stringbuf_appendbyte(result, digits[(cp & 0x00f0) >> 4]); 596289177Speter svn_stringbuf_appendbyte(result, digits[(cp & 0x000f)]); 597289177Speter svn_stringbuf_appendbyte(result, '}'); 598289177Speter } 599289177Speter } 600289177Speter 601289177Speter return result->data; 602289177Speter} 603