/* utf8proc.c revision 289180 */
1/* 2 * utf8proc.c: Wrappers for the utf8proc library 3 * 4 * ==================================================================== 5 * Licensed to the Apache Software Foundation (ASF) under one 6 * or more contributor license agreements. See the NOTICE file 7 * distributed with this work for additional information 8 * regarding copyright ownership. The ASF licenses this file 9 * to you under the Apache License, Version 2.0 (the 10 * "License"); you may not use this file except in compliance 11 * with the License. You may obtain a copy of the License at 12 * 13 * http://www.apache.org/licenses/LICENSE-2.0 14 * 15 * Unless required by applicable law or agreed to in writing, 16 * software distributed under the License is distributed on an 17 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 18 * KIND, either express or implied. See the License for the 19 * specific language governing permissions and limitations 20 * under the License. 21 * ==================================================================== 22 */ 23 24 25 26#include <apr_fnmatch.h> 27 28#include "private/svn_string_private.h" 29#include "private/svn_utf_private.h" 30#include "svn_private_config.h" 31 32#define UTF8PROC_INLINE 33/* Somehow utf8proc thinks it is nice to use strlen as an argument name, 34 while this function is already defined via apr.h */ 35#define strlen svn__strlen_var 36#include "utf8proc/utf8proc.c" 37#undef strlen 38 39 40 41const char * 42svn_utf__utf8proc_compiled_version(void) 43{ 44 static const char utf8proc_version[] = 45 APR_STRINGIFY(UTF8PROC_VERSION_MAJOR) "." 46 APR_STRINGIFY(UTF8PROC_VERSION_MINOR) "." 47 APR_STRINGIFY(UTF8PROC_VERSION_PATCH); 48 return utf8proc_version; 49} 50 51const char * 52svn_utf__utf8proc_runtime_version(void) 53{ 54 /* Unused static function warning removal hack. 
*/ 55 SVN_UNUSED(utf8proc_NFD); 56 SVN_UNUSED(utf8proc_NFC); 57 SVN_UNUSED(utf8proc_NFKD); 58 SVN_UNUSED(utf8proc_NFKC); 59 60 return utf8proc_version(); 61} 62 63 64 65/* Fill the given BUFFER with decomposed UCS-4 representation of the 66 * UTF-8 STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING 67 * is NUL-terminated; otherwise look only at the first LENGTH bytes in 68 * STRING. Upon return, BUFFER->data points at an array of UCS-4 69 * characters, and return the length of the array. TRANSFORM_FLAGS 70 * define exactly how the decomposition is performed. 71 * 72 * A negative return value is an utf8proc error code and may indicate 73 * that STRING contains invalid UTF-8 or was so long that an overflow 74 * occurred. 75 */ 76static ssize_t 77unicode_decomposition(int transform_flags, 78 const char *string, apr_size_t length, 79 svn_membuf_t *buffer) 80{ 81 const int nullterm = (length == SVN_UTF__UNKNOWN_LENGTH 82 ? UTF8PROC_NULLTERM : 0); 83 84 for (;;) 85 { 86 apr_int32_t *const ucs4buf = buffer->data; 87 const ssize_t ucs4len = buffer->size / sizeof(*ucs4buf); 88 const ssize_t result = 89 utf8proc_decompose((const void*) string, length, ucs4buf, ucs4len, 90 UTF8PROC_DECOMPOSE | UTF8PROC_STABLE 91 | transform_flags | nullterm); 92 93 if (result < 0 || result <= ucs4len) 94 return result; 95 96 /* Increase the decomposition buffer size and retry */ 97 svn_membuf__ensure(buffer, result * sizeof(*ucs4buf)); 98 } 99} 100 101/* Fill the given BUFFER with an NFD UCS-4 representation of the UTF-8 102 * STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING is 103 * NUL-terminated; otherwise look only at the first LENGTH bytes in 104 * STRING. Upon return, BUFFER->data points at an array of UCS-4 105 * characters and *RESULT_LENGTH contains the length of the array. 106 * 107 * A returned error may indicate that STRING contains invalid UTF-8 or 108 * invalid Unicode codepoints. Any error message comes from utf8proc. 
109 */ 110static svn_error_t * 111decompose_normalized(apr_size_t *result_length, 112 const char *string, apr_size_t length, 113 svn_membuf_t *buffer) 114{ 115 ssize_t result = unicode_decomposition(0, string, length, buffer); 116 if (result < 0) 117 return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL, 118 gettext(utf8proc_errmsg(result))); 119 *result_length = result; 120 return SVN_NO_ERROR; 121} 122 123/* Fill the given BUFFER with an NFC UTF-8 representation of the UTF-8 124 * STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING is 125 * NUL-terminated; otherwise look only at the first LENGTH bytes in 126 * STRING. Upon return, BUFFER->data points at a NUL-terminated string 127 * of UTF-8 characters. 128 * 129 * A returned error may indicate that STRING contains invalid UTF-8 or 130 * invalid Unicode codepoints. Any error message comes from utf8proc. 131 */ 132static svn_error_t * 133normalize_cstring(apr_size_t *result_length, 134 const char *string, apr_size_t length, 135 svn_membuf_t *buffer) 136{ 137 ssize_t result = unicode_decomposition(0, string, length, buffer); 138 if (result >= 0) 139 { 140 svn_membuf__resize(buffer, result * sizeof(apr_int32_t) + 1); 141 result = utf8proc_reencode(buffer->data, result, 142 UTF8PROC_COMPOSE | UTF8PROC_STABLE); 143 } 144 if (result < 0) 145 return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL, 146 gettext(utf8proc_errmsg(result))); 147 *result_length = result; 148 return SVN_NO_ERROR; 149} 150 151/* Compare two arrays of UCS-4 codes, BUFA of length LENA and BUFB of 152 * length LENB. Return 0 if they're equal, a negative value if BUFA is 153 * less than BUFB, otherwise a positive value. 154 * 155 * Yes, this is strcmp for known-length UCS-4 strings. 156 */ 157static int 158ucs4cmp(const apr_int32_t *bufa, apr_size_t lena, 159 const apr_int32_t *bufb, apr_size_t lenb) 160{ 161 const apr_size_t len = (lena < lenb ? 
lena : lenb); 162 apr_size_t i; 163 164 for (i = 0; i < len; ++i) 165 { 166 const int diff = bufa[i] - bufb[i]; 167 if (diff) 168 return diff; 169 } 170 return (lena == lenb ? 0 : (lena < lenb ? -1 : 1)); 171} 172 173svn_error_t * 174svn_utf__normcmp(int *result, 175 const char *str1, apr_size_t len1, 176 const char *str2, apr_size_t len2, 177 svn_membuf_t *buf1, svn_membuf_t *buf2) 178{ 179 apr_size_t buflen1; 180 apr_size_t buflen2; 181 182 /* Shortcut-circuit the decision if at least one of the strings is empty. */ 183 const svn_boolean_t empty1 = 184 (0 == len1 || (len1 == SVN_UTF__UNKNOWN_LENGTH && !*str1)); 185 const svn_boolean_t empty2 = 186 (0 == len2 || (len2 == SVN_UTF__UNKNOWN_LENGTH && !*str2)); 187 if (empty1 || empty2) 188 { 189 *result = (empty1 == empty2 ? 0 : (empty1 ? -1 : 1)); 190 return SVN_NO_ERROR; 191 } 192 193 SVN_ERR(decompose_normalized(&buflen1, str1, len1, buf1)); 194 SVN_ERR(decompose_normalized(&buflen2, str2, len2, buf2)); 195 *result = ucs4cmp(buf1->data, buflen1, buf2->data, buflen2); 196 return SVN_NO_ERROR; 197} 198 199svn_error_t* 200svn_utf__normalize(const char **result, 201 const char *str, apr_size_t len, 202 svn_membuf_t *buf) 203{ 204 apr_size_t result_length; 205 SVN_ERR(normalize_cstring(&result_length, str, len, buf)); 206 *result = (const char*)(buf->data); 207 return SVN_NO_ERROR; 208} 209 210/* Decode a single UCS-4 code point to UTF-8, appending the result to BUFFER. 211 * Assume BUFFER is already filled to *LENGTH and return the new size there. 212 * This function does *not* nul-terminate the stringbuf! 213 * 214 * A returned error indicates that the codepoint is invalid. 
215 */ 216static svn_error_t * 217encode_ucs4(svn_membuf_t *buffer, apr_int32_t ucs4chr, apr_size_t *length) 218{ 219 apr_size_t utf8len; 220 221 if (buffer->size - *length < 4) 222 svn_membuf__resize(buffer, buffer->size + 4); 223 224 utf8len = utf8proc_encode_char(ucs4chr, ((uint8_t*)buffer->data + *length)); 225 if (!utf8len) 226 return svn_error_createf(SVN_ERR_UTF8PROC_ERROR, NULL, 227 _("Invalid Unicode character U+%04lX"), 228 (long)ucs4chr); 229 *length += utf8len; 230 return SVN_NO_ERROR; 231} 232 233svn_error_t * 234svn_utf__encode_ucs4_string(svn_membuf_t *buffer, 235 const apr_int32_t *ucs4str, 236 apr_size_t length, 237 apr_size_t *result_length) 238{ 239 *result_length = 0; 240 while (length-- > 0) 241 SVN_ERR(encode_ucs4(buffer, *ucs4str++, result_length)); 242 svn_membuf__resize(buffer, *result_length + 1); 243 ((char*)buffer->data)[*result_length] = '\0'; 244 return SVN_NO_ERROR; 245} 246 247 248svn_error_t * 249svn_utf__glob(svn_boolean_t *match, 250 const char *pattern, apr_size_t pattern_len, 251 const char *string, apr_size_t string_len, 252 const char *escape, apr_size_t escape_len, 253 svn_boolean_t sql_like, 254 svn_membuf_t *pattern_buf, 255 svn_membuf_t *string_buf, 256 svn_membuf_t *temp_buf) 257{ 258 apr_size_t patternbuf_len; 259 apr_size_t tempbuf_len; 260 261 /* If we're in GLOB mode, we don't do custom escape chars. */ 262 if (escape && !sql_like) 263 return svn_error_create(SVN_ERR_UTF8_GLOB, NULL, 264 _("Cannot use a custom escape token" 265 " in glob matching mode")); 266 267 /* Convert the patern to NFD UTF-8. We can't use the UCS-4 result 268 because apr_fnmatch can't handle it.*/ 269 SVN_ERR(decompose_normalized(&tempbuf_len, pattern, pattern_len, temp_buf)); 270 if (!sql_like) 271 SVN_ERR(svn_utf__encode_ucs4_string(pattern_buf, temp_buf->data, 272 tempbuf_len, &patternbuf_len)); 273 else 274 { 275 /* Convert a LIKE pattern to a GLOB pattern that apr_fnmatch can use. 
*/ 276 const apr_int32_t *like = temp_buf->data; 277 apr_int32_t ucs4esc; 278 svn_boolean_t escaped; 279 apr_size_t i; 280 281 if (!escape) 282 ucs4esc = -1; /* Definitely an invalid UCS-4 character. */ 283 else 284 { 285 const int nullterm = (escape_len == SVN_UTF__UNKNOWN_LENGTH 286 ? UTF8PROC_NULLTERM : 0); 287 ssize_t result = 288 utf8proc_decompose((const void*) escape, escape_len, &ucs4esc, 1, 289 UTF8PROC_DECOMPOSE | UTF8PROC_STABLE | nullterm); 290 if (result < 0) 291 return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL, 292 gettext(utf8proc_errmsg(result))); 293 if (result == 0 || result > 1) 294 return svn_error_create(SVN_ERR_UTF8_GLOB, NULL, 295 _("Escape token must be one character")); 296 if ((ucs4esc & 0xFF) != ucs4esc) 297 return svn_error_createf(SVN_ERR_UTF8_GLOB, NULL, 298 _("Invalid escape character U+%04lX"), 299 (long)ucs4esc); 300 } 301 302 patternbuf_len = 0; 303 svn_membuf__ensure(pattern_buf, tempbuf_len + 1); 304 for (i = 0, escaped = FALSE; i < tempbuf_len; ++i, ++like) 305 { 306 if (*like == ucs4esc && !escaped) 307 { 308 svn_membuf__resize(pattern_buf, patternbuf_len + 1); 309 ((char*)pattern_buf->data)[patternbuf_len++] = '\\'; 310 escaped = TRUE; 311 } 312 else if (escaped) 313 { 314 SVN_ERR(encode_ucs4(pattern_buf, *like, &patternbuf_len)); 315 escaped = FALSE; 316 } 317 else 318 { 319 if ((*like == '[' || *like == '\\') && !escaped) 320 { 321 /* Escape brackets and backslashes which are always 322 literals in LIKE patterns. */ 323 svn_membuf__resize(pattern_buf, patternbuf_len + 1); 324 ((char*)pattern_buf->data)[patternbuf_len++] = '\\'; 325 escaped = TRUE; 326 --i; --like; 327 continue; 328 } 329 330 /* Replace LIKE wildcards with their GLOB equivalents. */ 331 if (*like == '%' || *like == '_') 332 { 333 const char wildcard = (*like == '%' ? 
'*' : '?'); 334 svn_membuf__resize(pattern_buf, patternbuf_len + 1); 335 ((char*)pattern_buf->data)[patternbuf_len++] = wildcard; 336 } 337 else 338 SVN_ERR(encode_ucs4(pattern_buf, *like, &patternbuf_len)); 339 } 340 } 341 svn_membuf__resize(pattern_buf, patternbuf_len + 1); 342 ((char*)pattern_buf->data)[patternbuf_len] = '\0'; 343 } 344 345 /* Now normalize the string */ 346 SVN_ERR(decompose_normalized(&tempbuf_len, string, string_len, temp_buf)); 347 SVN_ERR(svn_utf__encode_ucs4_string(string_buf, temp_buf->data, 348 tempbuf_len, &tempbuf_len)); 349 350 *match = !apr_fnmatch(pattern_buf->data, string_buf->data, 0); 351 return SVN_NO_ERROR; 352} 353 354svn_boolean_t 355svn_utf__is_normalized(const char *string, apr_pool_t *scratch_pool) 356{ 357 svn_error_t *err; 358 svn_membuf_t buffer; 359 apr_size_t result_length; 360 const apr_size_t length = strlen(string); 361 svn_membuf__create(&buffer, length * sizeof(apr_int32_t), scratch_pool); 362 err = normalize_cstring(&result_length, string, length, &buffer); 363 if (err) 364 { 365 svn_error_clear(err); 366 return FALSE; 367 } 368 return (length == result_length && 0 == strcmp(string, buffer.data)); 369} 370 371const char * 372svn_utf__fuzzy_escape(const char *src, apr_size_t length, apr_pool_t *pool) 373{ 374 /* Hexadecimal digits for code conversion. */ 375 static const char digits[] = "0123456789ABCDEF"; 376 377 /* Flags used for Unicode decomposition. */ 378 static const int decomp_flags = ( 379 UTF8PROC_COMPAT | UTF8PROC_STABLE | UTF8PROC_LUMP 380 | UTF8PROC_NLF2LF | UTF8PROC_STRIPCC | UTF8PROC_STRIPMARK); 381 382 svn_stringbuf_t *result; 383 svn_membuf_t buffer; 384 ssize_t decomp_length; 385 ssize_t len; 386 387 /* Decompose to a non-reversible compatibility format. 
*/ 388 svn_membuf__create(&buffer, length * sizeof(apr_int32_t), pool); 389 decomp_length = unicode_decomposition(decomp_flags, src, length, &buffer); 390 if (decomp_length < 0) 391 { 392 svn_membuf_t part; 393 apr_size_t done, prev; 394 395 /* The only other error we can receive here indicates an integer 396 overflow due to the length of the input string. Not very 397 likely, but we certainly shouldn't continue in that case. */ 398 SVN_ERR_ASSERT_NO_RETURN(decomp_length == UTF8PROC_ERROR_INVALIDUTF8); 399 400 /* Break the decomposition into parts that are valid UTF-8, and 401 bytes that are not. Represent the invalid bytes in the target 402 erray by their negative value. This works because utf8proc 403 will not generate Unicode code points with values larger than 404 U+10FFFF. */ 405 svn_membuf__create(&part, sizeof(apr_int32_t), pool); 406 decomp_length = 0; 407 done = prev = 0; 408 while (done < length) 409 { 410 apr_int32_t uc; 411 412 while (done < length) 413 { 414 len = utf8proc_iterate((uint8_t*)src + done, length - done, &uc); 415 if (len < 0) 416 break; 417 done += len; 418 } 419 420 /* Decompose the valid part */ 421 if (done > prev) 422 { 423 len = unicode_decomposition( 424 decomp_flags, src + prev, done - prev, &part); 425 SVN_ERR_ASSERT_NO_RETURN(len > 0); 426 svn_membuf__resize( 427 &buffer, (decomp_length + len) * sizeof(apr_int32_t)); 428 memcpy((apr_int32_t*)buffer.data + decomp_length, 429 part.data, len * sizeof(apr_int32_t)); 430 decomp_length += len; 431 prev = done; 432 } 433 434 /* What follows could be a valid UTF-8 sequence, but not 435 a valid Unicode character. */ 436 if (done < length) 437 { 438 const char *last; 439 440 /* Determine the length of the UTF-8 sequence */ 441 const char *const p = src + done; 442 len = utf8proc_utf8class[(uint8_t)*p]; 443 444 /* Check if the multi-byte sequence is valid UTF-8. 
*/ 445 if (len > 1 && len <= (apr_ssize_t)(length - done)) 446 last = svn_utf__last_valid(p, len); 447 else 448 last = NULL; 449 450 /* Might not be a valid UTF-8 sequence at all */ 451 if (!last || (last && last - p < len)) 452 { 453 uc = -((apr_int32_t)(*p & 0xff)); 454 len = 1; 455 } 456 else 457 { 458 switch (len) 459 { 460 /* Decode the UTF-8 sequence without validation. */ 461 case 2: 462 uc = ((p[0] & 0x1f) << 6) + (p[1] & 0x3f); 463 break; 464 case 3: 465 uc = (((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) 466 + (p[2] & 0x3f)); 467 break; 468 case 4: 469 uc = (((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12) 470 + ((p[2] & 0x3f) << 6) + (p[3] & 0x3f)); 471 break; 472 default: 473 SVN_ERR_ASSERT_NO_RETURN( 474 !"Unexpected invalid UTF-8 byte"); 475 } 476 477 } 478 479 svn_membuf__resize( 480 &buffer, (decomp_length + 1) * sizeof(apr_int32_t)); 481 ((apr_int32_t*)buffer.data)[decomp_length++] = uc; 482 done += len; 483 prev = done; 484 } 485 } 486 } 487 488 /* Scan the result and deleting any combining diacriticals and 489 inserting placeholders where any non-ascii characters remain. 
*/ 490 result = svn_stringbuf_create_ensure(decomp_length, pool); 491 for (len = 0; len < decomp_length; ++len) 492 { 493 const apr_int32_t cp = ((apr_int32_t*)buffer.data)[len]; 494 if (cp > 0 && cp < 127) 495 svn_stringbuf_appendbyte(result, (char)cp); 496 else if (cp == 0) 497 svn_stringbuf_appendcstr(result, "\\0"); 498 else if (cp < 0) 499 { 500 const apr_int32_t rcp = ((-cp) & 0xff); 501 svn_stringbuf_appendcstr(result, "?\\"); 502 svn_stringbuf_appendbyte(result, digits[(rcp & 0x00f0) >> 4]); 503 svn_stringbuf_appendbyte(result, digits[(rcp & 0x000f)]); 504 } 505 else 506 { 507 if (utf8proc_codepoint_valid(cp)) 508 { 509 const utf8proc_property_t *prop = utf8proc_get_property(cp); 510 if (prop->combining_class != 0) 511 continue; /* Combining mark; ignore */ 512 svn_stringbuf_appendcstr(result, "{U+"); 513 } 514 else 515 svn_stringbuf_appendcstr(result, "{U?"); 516 if (cp > 0xffff) 517 { 518 svn_stringbuf_appendbyte(result, digits[(cp & 0xf00000) >> 20]); 519 svn_stringbuf_appendbyte(result, digits[(cp & 0x0f0000) >> 16]); 520 } 521 svn_stringbuf_appendbyte(result, digits[(cp & 0xf000) >> 12]); 522 svn_stringbuf_appendbyte(result, digits[(cp & 0x0f00) >> 8]); 523 svn_stringbuf_appendbyte(result, digits[(cp & 0x00f0) >> 4]); 524 svn_stringbuf_appendbyte(result, digits[(cp & 0x000f)]); 525 svn_stringbuf_appendbyte(result, '}'); 526 } 527 } 528 529 return result->data; 530} 531