1/* 2 * utf8proc.c: Wrappers for the utf8proc library 3 * 4 * ==================================================================== 5 * Licensed to the Apache Software Foundation (ASF) under one 6 * or more contributor license agreements. See the NOTICE file 7 * distributed with this work for additional information 8 * regarding copyright ownership. The ASF licenses this file 9 * to you under the Apache License, Version 2.0 (the 10 * "License"); you may not use this file except in compliance 11 * with the License. You may obtain a copy of the License at 12 * 13 * http://www.apache.org/licenses/LICENSE-2.0 14 * 15 * Unless required by applicable law or agreed to in writing, 16 * software distributed under the License is distributed on an 17 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 18 * KIND, either express or implied. See the License for the 19 * specific language governing permissions and limitations 20 * under the License. 21 * ==================================================================== 22 */ 23 24 25 26#include <apr_fnmatch.h> 27 28#include "private/svn_string_private.h" 29#include "private/svn_utf_private.h" 30#include "svn_private_config.h" 31 32#if SVN_INTERNAL_UTF8PROC 33#define UTF8PROC_INLINE 34/* Somehow utf8proc thinks it is nice to use strlen as an argument name, 35 while this function is already defined via apr.h */ 36#define strlen svn__strlen_var 37#include "utf8proc/utf8proc.c" 38#undef strlen 39#else 40#include <utf8proc.h> 41#endif 42 43 44 45const char * 46svn_utf__utf8proc_compiled_version(void) 47{ 48 static const char utf8proc_version[] = 49 APR_STRINGIFY(UTF8PROC_VERSION_MAJOR) "." 50 APR_STRINGIFY(UTF8PROC_VERSION_MINOR) "." 51 APR_STRINGIFY(UTF8PROC_VERSION_PATCH); 52 return utf8proc_version; 53} 54 55const char * 56svn_utf__utf8proc_runtime_version(void) 57{ 58 /* Unused static function warning removal hack. */ 59 SVN_UNUSED(utf8proc_grapheme_break); 60 SVN_UNUSED(utf8proc_tolower); 61 SVN_UNUSED(utf8proc_toupper); 62#if UTF8PROC_VERSION_MAJOR >= 2 63 SVN_UNUSED(utf8proc_totitle); 64#endif 65 SVN_UNUSED(utf8proc_charwidth); 66 SVN_UNUSED(utf8proc_category_string); 67 SVN_UNUSED(utf8proc_NFD); 68 SVN_UNUSED(utf8proc_NFC); 69 SVN_UNUSED(utf8proc_NFKD); 70 SVN_UNUSED(utf8proc_NFKC); 71 72 return utf8proc_version(); 73} 74 75 76 77/* Fill the given BUFFER with decomposed UCS-4 representation of the 78 * UTF-8 STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING 79 * is NUL-terminated; otherwise look only at the first LENGTH bytes in 80 * STRING. Upon return, BUFFER->data points at an array of UCS-4 81 * characters, and return the length of the array. TRANSFORM_FLAGS 82 * define exactly how the decomposition is performed. 83 * 84 * A negative return value is an utf8proc error code and may indicate 85 * that STRING contains invalid UTF-8 or was so long that an overflow 86 * occurred. 87 */ 88static apr_ssize_t 89unicode_decomposition(int transform_flags, 90 const char *string, apr_size_t length, 91 svn_membuf_t *buffer) 92{ 93 const int nullterm = (length == SVN_UTF__UNKNOWN_LENGTH 94 ? UTF8PROC_NULLTERM : 0); 95 96 for (;;) 97 { 98 apr_int32_t *const ucs4buf = buffer->data; 99 const apr_ssize_t ucs4len = buffer->size / sizeof(*ucs4buf); 100 const apr_ssize_t result = 101 utf8proc_decompose((const void*) string, length, ucs4buf, ucs4len, 102 UTF8PROC_DECOMPOSE | UTF8PROC_STABLE 103 | transform_flags | nullterm); 104 105 if (result < 0 || result <= ucs4len) 106 return result; 107 108 /* Increase the decomposition buffer size and retry */ 109 svn_membuf__ensure(buffer, result * sizeof(*ucs4buf)); 110 } 111} 112 113/* Fill the given BUFFER with an NFD UCS-4 representation of the UTF-8 114 * STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING is 115 * NUL-terminated; otherwise look only at the first LENGTH bytes in 116 * STRING. Upon return, BUFFER->data points at an array of UCS-4 117 * characters and *RESULT_LENGTH contains the length of the array. 118 * 119 * A returned error may indicate that STRING contains invalid UTF-8 or 120 * invalid Unicode codepoints. Any error message comes from utf8proc. 121 */ 122static svn_error_t * 123decompose_normalized(apr_size_t *result_length, 124 const char *string, apr_size_t length, 125 svn_membuf_t *buffer) 126{ 127 apr_ssize_t result = unicode_decomposition(0, string, length, buffer); 128 if (result < 0) 129 return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL, 130 gettext(utf8proc_errmsg(result))); 131 *result_length = result; 132 return SVN_NO_ERROR; 133} 134 135/* Fill the given BUFFER with an NFC UTF-8 representation of the UTF-8 136 * STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING is 137 * NUL-terminated; otherwise look only at the first LENGTH bytes in 138 * STRING. Upon return, BUFFER->data points at a NUL-terminated string 139 * of UTF-8 characters. 140 * 141 * If CASEFOLD is non-zero, perform Unicode case folding, e.g., for 142 * case-insensitive string comparison. If STRIPMARK is non-zero, strip 143 * all diacritical marks (e.g., accents) from the string. 144 * 145 * A returned error may indicate that STRING contains invalid UTF-8 or 146 * invalid Unicode codepoints. Any error message comes from utf8proc. 147 */ 148static svn_error_t * 149normalize_cstring(apr_size_t *result_length, 150 const char *string, apr_size_t length, 151 svn_boolean_t casefold, 152 svn_boolean_t stripmark, 153 svn_membuf_t *buffer) 154{ 155 int flags = 0; 156 apr_ssize_t result; 157 158 if (casefold) 159 flags |= UTF8PROC_CASEFOLD; 160 161 if (stripmark) 162 flags |= UTF8PROC_STRIPMARK; 163 164 result = unicode_decomposition(flags, string, length, buffer); 165 if (result >= 0) 166 { 167 svn_membuf__resize(buffer, result * sizeof(apr_int32_t) + 1); 168 result = utf8proc_reencode(buffer->data, result, 169 UTF8PROC_COMPOSE | UTF8PROC_STABLE); 170 } 171 if (result < 0) 172 return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL, 173 gettext(utf8proc_errmsg(result))); 174 *result_length = result; 175 return SVN_NO_ERROR; 176} 177 178/* Compare two arrays of UCS-4 codes, BUFA of length LENA and BUFB of 179 * length LENB. Return 0 if they're equal, a negative value if BUFA is 180 * less than BUFB, otherwise a positive value. 181 * 182 * Yes, this is strcmp for known-length UCS-4 strings. 183 */ 184static int 185ucs4cmp(const apr_int32_t *bufa, apr_size_t lena, 186 const apr_int32_t *bufb, apr_size_t lenb) 187{ 188 const apr_size_t len = (lena < lenb ? lena : lenb); 189 apr_size_t i; 190 191 for (i = 0; i < len; ++i) 192 { 193 const int diff = bufa[i] - bufb[i]; 194 if (diff) 195 return diff; 196 } 197 return (lena == lenb ? 0 : (lena < lenb ? -1 : 1)); 198} 199 200svn_error_t * 201svn_utf__normcmp(int *result, 202 const char *str1, apr_size_t len1, 203 const char *str2, apr_size_t len2, 204 svn_membuf_t *buf1, svn_membuf_t *buf2) 205{ 206 apr_size_t buflen1; 207 apr_size_t buflen2; 208 209 /* Shortcut-circuit the decision if at least one of the strings is empty. */ 210 const svn_boolean_t empty1 = 211 (0 == len1 || (len1 == SVN_UTF__UNKNOWN_LENGTH && !*str1)); 212 const svn_boolean_t empty2 = 213 (0 == len2 || (len2 == SVN_UTF__UNKNOWN_LENGTH && !*str2)); 214 if (empty1 || empty2) 215 { 216 *result = (empty1 == empty2 ? 0 : (empty1 ? -1 : 1)); 217 return SVN_NO_ERROR; 218 } 219 220 SVN_ERR(decompose_normalized(&buflen1, str1, len1, buf1)); 221 SVN_ERR(decompose_normalized(&buflen2, str2, len2, buf2)); 222 *result = ucs4cmp(buf1->data, buflen1, buf2->data, buflen2); 223 return SVN_NO_ERROR; 224} 225 226svn_error_t* 227svn_utf__normalize(const char **result, 228 const char *str, apr_size_t len, 229 svn_membuf_t *buf) 230{ 231 apr_size_t result_length; 232 SVN_ERR(normalize_cstring(&result_length, str, len, FALSE, FALSE, buf)); 233 *result = (const char*)(buf->data); 234 return SVN_NO_ERROR; 235} 236 237svn_error_t * 238svn_utf__xfrm(const char **result, 239 const char *str, apr_size_t len, 240 svn_boolean_t case_insensitive, 241 svn_boolean_t accent_insensitive, 242 svn_membuf_t *buf) 243{ 244 apr_size_t result_length; 245 SVN_ERR(normalize_cstring(&result_length, str, len, 246 case_insensitive, accent_insensitive, buf)); 247 *result = (const char*)(buf->data); 248 return SVN_NO_ERROR; 249} 250 251svn_boolean_t 252svn_utf__fuzzy_glob_match(const char *str, 253 const apr_array_header_t *patterns, 254 svn_membuf_t *buf) 255{ 256 const char *normalized; 257 svn_error_t *err; 258 int i; 259 260 /* Try to normalize case and accents in STR. 261 * 262 * If that should fail for some reason, consider STR a mismatch. */ 263 err = svn_utf__xfrm(&normalized, str, strlen(str), TRUE, TRUE, buf); 264 if (err) 265 { 266 svn_error_clear(err); 267 return FALSE; 268 } 269 270 /* Now see whether it matches any/all of the patterns. */ 271 for (i = 0; i < patterns->nelts; ++i) 272 { 273 const char *pattern = APR_ARRAY_IDX(patterns, i, const char *); 274 if (apr_fnmatch(pattern, normalized, 0) == APR_SUCCESS) 275 return TRUE; 276 } 277 278 return FALSE; 279} 280 281/* Decode a single UCS-4 code point to UTF-8, appending the result to BUFFER. 282 * Assume BUFFER is already filled to *LENGTH and return the new size there. 283 * This function does *not* nul-terminate the stringbuf! 284 * 285 * A returned error indicates that the codepoint is invalid. 286 */ 287static svn_error_t * 288encode_ucs4(svn_membuf_t *buffer, apr_int32_t ucs4chr, apr_size_t *length) 289{ 290 apr_size_t utf8len; 291 292 if (buffer->size - *length < 4) 293 svn_membuf__resize(buffer, buffer->size + 4); 294 295 utf8len = utf8proc_encode_char(ucs4chr, ((apr_byte_t*)buffer->data + *length)); 296 if (!utf8len) 297 return svn_error_createf(SVN_ERR_UTF8PROC_ERROR, NULL, 298 _("Invalid Unicode character U+%04lX"), 299 (long)ucs4chr); 300 *length += utf8len; 301 return SVN_NO_ERROR; 302} 303 304svn_error_t * 305svn_utf__encode_ucs4_string(svn_membuf_t *buffer, 306 const apr_int32_t *ucs4str, 307 apr_size_t length, 308 apr_size_t *result_length) 309{ 310 *result_length = 0; 311 while (length-- > 0) 312 SVN_ERR(encode_ucs4(buffer, *ucs4str++, result_length)); 313 svn_membuf__resize(buffer, *result_length + 1); 314 ((char*)buffer->data)[*result_length] = '\0'; 315 return SVN_NO_ERROR; 316} 317 318 319svn_error_t * 320svn_utf__glob(svn_boolean_t *match, 321 const char *pattern, apr_size_t pattern_len, 322 const char *string, apr_size_t string_len, 323 const char *escape, apr_size_t escape_len, 324 svn_boolean_t sql_like, 325 svn_membuf_t *pattern_buf, 326 svn_membuf_t *string_buf, 327 svn_membuf_t *temp_buf) 328{ 329 apr_size_t patternbuf_len; 330 apr_size_t tempbuf_len; 331 332 /* If we're in GLOB mode, we don't do custom escape chars. */ 333 if (escape && !sql_like) 334 return svn_error_create(SVN_ERR_UTF8_GLOB, NULL, 335 _("Cannot use a custom escape token" 336 " in glob matching mode")); 337 338 /* Convert the patern to NFD UTF-8. We can't use the UCS-4 result 339 because apr_fnmatch can't handle it.*/ 340 SVN_ERR(decompose_normalized(&tempbuf_len, pattern, pattern_len, temp_buf)); 341 if (!sql_like) 342 SVN_ERR(svn_utf__encode_ucs4_string(pattern_buf, temp_buf->data, 343 tempbuf_len, &patternbuf_len)); 344 else 345 { 346 /* Convert a LIKE pattern to a GLOB pattern that apr_fnmatch can use. */ 347 const apr_int32_t *like = temp_buf->data; 348 apr_int32_t ucs4esc; 349 svn_boolean_t escaped; 350 apr_size_t i; 351 352 if (!escape) 353 ucs4esc = -1; /* Definitely an invalid UCS-4 character. */ 354 else 355 { 356 const int nullterm = (escape_len == SVN_UTF__UNKNOWN_LENGTH 357 ? UTF8PROC_NULLTERM : 0); 358 apr_ssize_t result = 359 utf8proc_decompose((const void*) escape, escape_len, &ucs4esc, 1, 360 UTF8PROC_DECOMPOSE | UTF8PROC_STABLE | nullterm); 361 if (result < 0) 362 return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL, 363 gettext(utf8proc_errmsg(result))); 364 if (result == 0 || result > 1) 365 return svn_error_create(SVN_ERR_UTF8_GLOB, NULL, 366 _("Escape token must be one character")); 367 if ((ucs4esc & 0xFF) != ucs4esc) 368 return svn_error_createf(SVN_ERR_UTF8_GLOB, NULL, 369 _("Invalid escape character U+%04lX"), 370 (long)ucs4esc); 371 } 372 373 patternbuf_len = 0; 374 svn_membuf__ensure(pattern_buf, tempbuf_len + 1); 375 for (i = 0, escaped = FALSE; i < tempbuf_len; ++i, ++like) 376 { 377 if (*like == ucs4esc && !escaped) 378 { 379 svn_membuf__resize(pattern_buf, patternbuf_len + 1); 380 ((char*)pattern_buf->data)[patternbuf_len++] = '\\'; 381 escaped = TRUE; 382 } 383 else if (escaped) 384 { 385 SVN_ERR(encode_ucs4(pattern_buf, *like, &patternbuf_len)); 386 escaped = FALSE; 387 } 388 else 389 { 390 if ((*like == '[' || *like == '\\') && !escaped) 391 { 392 /* Escape brackets and backslashes which are always 393 literals in LIKE patterns. */ 394 svn_membuf__resize(pattern_buf, patternbuf_len + 1); 395 ((char*)pattern_buf->data)[patternbuf_len++] = '\\'; 396 escaped = TRUE; 397 --i; --like; 398 continue; 399 } 400 401 /* Replace LIKE wildcards with their GLOB equivalents. */ 402 if (*like == '%' || *like == '_') 403 { 404 const char wildcard = (*like == '%' ? '*' : '?'); 405 svn_membuf__resize(pattern_buf, patternbuf_len + 1); 406 ((char*)pattern_buf->data)[patternbuf_len++] = wildcard; 407 } 408 else 409 SVN_ERR(encode_ucs4(pattern_buf, *like, &patternbuf_len)); 410 } 411 } 412 svn_membuf__resize(pattern_buf, patternbuf_len + 1); 413 ((char*)pattern_buf->data)[patternbuf_len] = '\0'; 414 } 415 416 /* Now normalize the string */ 417 SVN_ERR(decompose_normalized(&tempbuf_len, string, string_len, temp_buf)); 418 SVN_ERR(svn_utf__encode_ucs4_string(string_buf, temp_buf->data, 419 tempbuf_len, &tempbuf_len)); 420 421 *match = !apr_fnmatch(pattern_buf->data, string_buf->data, 0); 422 return SVN_NO_ERROR; 423} 424 425svn_boolean_t 426svn_utf__is_normalized(const char *string, apr_pool_t *scratch_pool) 427{ 428 svn_error_t *err; 429 svn_membuf_t buffer; 430 apr_size_t result_length; 431 const apr_size_t length = strlen(string); 432 svn_membuf__create(&buffer, length * sizeof(apr_int32_t), scratch_pool); 433 err = normalize_cstring(&result_length, string, length, 434 FALSE, FALSE, &buffer); 435 if (err) 436 { 437 svn_error_clear(err); 438 return FALSE; 439 } 440 return (length == result_length && 0 == strcmp(string, buffer.data)); 441} 442 443const char * 444svn_utf__fuzzy_escape(const char *src, apr_size_t length, apr_pool_t *pool) 445{ 446 /* Hexadecimal digits for code conversion. */ 447 static const char digits[] = "0123456789ABCDEF"; 448 449 /* Flags used for Unicode decomposition. */ 450 static const int decomp_flags = ( 451 UTF8PROC_COMPAT | UTF8PROC_STABLE | UTF8PROC_LUMP 452 | UTF8PROC_NLF2LF | UTF8PROC_STRIPCC | UTF8PROC_STRIPMARK); 453 454 svn_stringbuf_t *result; 455 svn_membuf_t buffer; 456 apr_ssize_t decomp_length; 457 apr_ssize_t len; 458 459 /* Decompose to a non-reversible compatibility format. */ 460 svn_membuf__create(&buffer, length * sizeof(apr_int32_t), pool); 461 decomp_length = unicode_decomposition(decomp_flags, src, length, &buffer); 462 if (decomp_length < 0) 463 { 464 svn_membuf_t part; 465 apr_size_t done, prev; 466 467 /* The only other error we can receive here indicates an integer 468 overflow due to the length of the input string. Not very 469 likely, but we certainly shouldn't continue in that case. */ 470 SVN_ERR_ASSERT_NO_RETURN(decomp_length == UTF8PROC_ERROR_INVALIDUTF8); 471 472 /* Break the decomposition into parts that are valid UTF-8, and 473 bytes that are not. Represent the invalid bytes in the target 474 erray by their negative value. This works because utf8proc 475 will not generate Unicode code points with values larger than 476 U+10FFFF. */ 477 svn_membuf__create(&part, sizeof(apr_int32_t), pool); 478 decomp_length = 0; 479 done = prev = 0; 480 while (done < length) 481 { 482 apr_int32_t uc; 483 484 while (done < length) 485 { 486 len = utf8proc_iterate((apr_byte_t*)src + done, length - done, &uc); 487 if (len < 0) 488 break; 489 done += len; 490 } 491 492 /* Decompose the valid part */ 493 if (done > prev) 494 { 495 len = unicode_decomposition( 496 decomp_flags, src + prev, done - prev, &part); 497 SVN_ERR_ASSERT_NO_RETURN(len > 0); 498 svn_membuf__resize( 499 &buffer, (decomp_length + len) * sizeof(apr_int32_t)); 500 memcpy((apr_int32_t*)buffer.data + decomp_length, 501 part.data, len * sizeof(apr_int32_t)); 502 decomp_length += len; 503 prev = done; 504 } 505 506 /* What follows could be a valid UTF-8 sequence, but not 507 a valid Unicode character. */ 508 if (done < length) 509 { 510 const char *last; 511 512 /* Determine the length of the UTF-8 sequence */ 513 const char *const p = src + done; 514 len = utf8proc_utf8class[(apr_byte_t)*p]; 515 516 /* Check if the multi-byte sequence is valid UTF-8. */ 517 if (len > 1 && len <= (apr_ssize_t)(length - done)) 518 last = svn_utf__last_valid(p, len); 519 else 520 last = NULL; 521 522 /* Might not be a valid UTF-8 sequence at all */ 523 if (!last || (last && last - p < len)) 524 { 525 uc = -((apr_int32_t)(*p & 0xff)); 526 len = 1; 527 } 528 else 529 { 530 switch (len) 531 { 532 /* Decode the UTF-8 sequence without validation. */ 533 case 2: 534 uc = ((p[0] & 0x1f) << 6) + (p[1] & 0x3f); 535 break; 536 case 3: 537 uc = (((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) 538 + (p[2] & 0x3f)); 539 break; 540 case 4: 541 uc = (((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12) 542 + ((p[2] & 0x3f) << 6) + (p[3] & 0x3f)); 543 break; 544 default: 545 SVN_ERR_ASSERT_NO_RETURN( 546 !"Unexpected invalid UTF-8 byte"); 547 } 548 549 } 550 551 svn_membuf__resize( 552 &buffer, (decomp_length + 1) * sizeof(apr_int32_t)); 553 ((apr_int32_t*)buffer.data)[decomp_length++] = uc; 554 done += len; 555 prev = done; 556 } 557 } 558 } 559 560 /* Scan the result and deleting any combining diacriticals and 561 inserting placeholders where any non-ascii characters remain. */ 562 result = svn_stringbuf_create_ensure(decomp_length, pool); 563 for (len = 0; len < decomp_length; ++len) 564 { 565 const apr_int32_t cp = ((apr_int32_t*)buffer.data)[len]; 566 if (cp > 0 && cp < 127) 567 svn_stringbuf_appendbyte(result, (char)cp); 568 else if (cp == 0) 569 svn_stringbuf_appendcstr(result, "\\0"); 570 else if (cp < 0) 571 { 572 const apr_int32_t rcp = ((-cp) & 0xff); 573 svn_stringbuf_appendcstr(result, "?\\"); 574 svn_stringbuf_appendbyte(result, digits[(rcp & 0x00f0) >> 4]); 575 svn_stringbuf_appendbyte(result, digits[(rcp & 0x000f)]); 576 } 577 else 578 { 579 if (utf8proc_codepoint_valid(cp)) 580 { 581 const utf8proc_property_t *prop = utf8proc_get_property(cp); 582 if (prop->combining_class != 0) 583 continue; /* Combining mark; ignore */ 584 svn_stringbuf_appendcstr(result, "{U+"); 585 } 586 else 587 svn_stringbuf_appendcstr(result, "{U?"); 588 if (cp > 0xffff) 589 { 590 svn_stringbuf_appendbyte(result, digits[(cp & 0xf00000) >> 20]); 591 svn_stringbuf_appendbyte(result, digits[(cp & 0x0f0000) >> 16]); 592 } 593 svn_stringbuf_appendbyte(result, digits[(cp & 0xf000) >> 12]); 594 svn_stringbuf_appendbyte(result, digits[(cp & 0x0f00) >> 8]); 595 svn_stringbuf_appendbyte(result, digits[(cp & 0x00f0) >> 4]); 596 svn_stringbuf_appendbyte(result, digits[(cp & 0x000f)]); 597 svn_stringbuf_appendbyte(result, '}'); 598 } 599 } 600 601 return result->data; 602} 603