1/** 2 * @copyright 3 * ==================================================================== 4 * Licensed to the Apache Software Foundation (ASF) under one 5 * or more contributor license agreements. See the NOTICE file 6 * distributed with this work for additional information 7 * regarding copyright ownership. The ASF licenses this file 8 * to you under the Apache License, Version 2.0 (the 9 * "License"); you may not use this file except in compliance 10 * with the License. You may obtain a copy of the License at 11 * 12 * http://www.apache.org/licenses/LICENSE-2.0 13 * 14 * Unless required by applicable law or agreed to in writing, 15 * software distributed under the License is distributed on an 16 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 17 * KIND, either express or implied. See the License for the 18 * specific language governing permissions and limitations 19 * under the License. 20 * ==================================================================== 21 * @endcopyright 22 * 23 * @file svn_utf_private.h 24 * @brief UTF validation and normalization routines 25 */ 26 27#ifndef SVN_UTF_PRIVATE_H 28#define SVN_UTF_PRIVATE_H 29 30#include <apr.h> 31#include <apr_pools.h> 32 33#include "svn_types.h" 34#include "svn_string.h" 35#include "svn_string_private.h" 36 37#ifdef __cplusplus 38extern "C" { 39#endif /* __cplusplus */ 40 41 42/* Return TRUE if the string SRC of length LEN is a valid UTF-8 encoding 43 * according to the rules laid down by the Unicode 4.0 standard, FALSE 44 * otherwise. This function is faster than svn_utf__last_valid(). 45 */ 46svn_boolean_t 47svn_utf__is_valid(const char *src, apr_size_t len); 48 49/* As for svn_utf__is_valid but SRC is NULL terminated. */ 50svn_boolean_t 51svn_utf__cstring_is_valid(const char *src); 52 53/* Return a pointer to the first character after the last valid UTF-8 54 * potentially multi-byte character in the string SRC of length LEN. 55 * Validity of bytes from SRC to SRC+LEN-1, inclusively, is checked. 56 * If SRC is a valid UTF-8, the return value will point to the byte SRC+LEN, 57 * otherwise it will point to the start of the first invalid character. 58 * In either case all the characters between SRC and the return pointer - 1, 59 * inclusively, are valid UTF-8. 60 * 61 * See also svn_utf__is_valid(). 62 */ 63const char * 64svn_utf__last_valid(const char *src, apr_size_t len); 65 66/* As for svn_utf__last_valid but uses a different implementation without 67 lookup tables. It avoids the table memory use (about 400 bytes) but the 68 function is longer (about 200 bytes extra) and likely to be slower when 69 the string is valid. If the string is invalid this function may be 70 faster since it returns immediately rather than continuing to the end of 71 the string. The main reason this function exists is to test the table 72 driven implementation. */ 73const char * 74svn_utf__last_valid2(const char *src, apr_size_t len); 75 76/* Copy LENGTH bytes of SRC, converting characters as follows: 77 - Pass characters from the ASCII subset to the result 78 - Strip all combining marks from the string 79 - Represent other valid Unicode chars as {U+XXXX} 80 - Replace invalid Unicode chars with {U?XXXX} 81 - Represent chars that are not valid UTF-8 as ?\XX 82 - Replace codes outside the Unicode range with a sequence of ?\XX 83 - Represent the null byte as \0 84 Allocate the result in POOL. */ 85const char * 86svn_utf__fuzzy_escape(const char *src, apr_size_t length, apr_pool_t *pool); 87 88const char * 89svn_utf__cstring_from_utf8_fuzzy(const char *src, 90 apr_pool_t *pool, 91 svn_error_t *(*convert_from_utf8) 92 (const char **, 93 const char *, 94 apr_pool_t *)); 95 96 97#if defined(WIN32) 98/* On Windows: Convert the UTF-8 string SRC to UTF-16. 99 If PREFIX is not NULL, prepend it to the converted result. 100 The result, if not empty, will be allocated in RESULT_POOL. */ 101svn_error_t * 102svn_utf__win32_utf8_to_utf16(const WCHAR **result, 103 const char *src, 104 const WCHAR *prefix, 105 apr_pool_t *result_pool); 106 107/* On Windows: Convert the UTF-16 string SRC to UTF-8. 108 If PREFIX is not NULL, prepend it to the converted result. 109 The result, if not empty, will be allocated in RESULT_POOL. */ 110svn_error_t * 111svn_utf__win32_utf16_to_utf8(const char **result, 112 const WCHAR *src, 113 const char *prefix, 114 apr_pool_t *result_pool); 115#endif /* WIN32*/ 116 117 118/* A constant used for many length parameters in the utf8proc wrappers 119 * to indicate that the length of a string is unknonw. */ 120#define SVN_UTF__UNKNOWN_LENGTH ((apr_size_t) -1) 121 122 123/* Compare two UTF-8 strings, ignoring normalization, using buffers 124 * BUF1 and BUF2 for temporary storage. If either of LEN1 or LEN2 is 125 * SVN_UTF__UNKNOWN_LENGTH, assume the associated string is 126 * null-terminated; otherwise, consider the string only up to the 127 * given length. 128 * 129 * Return compare value in *RESULT. 130 */ 131svn_error_t * 132svn_utf__normcmp(int *result, 133 const char *str1, apr_size_t len1, 134 const char *str2, apr_size_t len2, 135 svn_membuf_t *buf1, svn_membuf_t *buf2); 136 137/* Normalize the UTF-8 string STR to form C, using BUF for temporary 138 * storage. If LEN is SVN_UTF__UNKNOWN_LENGTH, assume STR is 139 * null-terminated; otherwise, consider the string only up to the 140 * given length. 141 * 142 * Return the normalized string in *RESULT, which shares storage with 143 * BUF and is valid only until the next time BUF is modified. 144 * 145 * A returned error may indicate that STRING contains invalid UTF-8 or 146 * invalid Unicode codepoints. 147 */ 148svn_error_t* 149svn_utf__normalize(const char **result, 150 const char *str, apr_size_t len, 151 svn_membuf_t *buf); 152 153/* Transform the UTF-8 string to a shape suitable for comparison with 154 * strcmp(). The tranformation is defined by CASE_INSENSITIVE and 155 * ACCENT_INSENSITIVE arguments. If CASE_INSENSITIVE is non-zero, 156 * remove case distinctions from the string. If ACCENT_INSENSITIVE 157 * is non-zero, remove diacritical marks from the string. 158 * 159 * Use BUF as a temporary storage. If LEN is SVN_UTF__UNKNOWN_LENGTH, 160 * assume STR is null-terminated; otherwise, consider the string only 161 * up to the given length. Place the tranformed string in *RESULT, which 162 * shares storage with BUF and is valid only until the next time BUF is 163 * modified. 164 * 165 * A returned error may indicate that STRING contains invalid UTF-8 or 166 * invalid Unicode codepoints. 167 */ 168svn_error_t * 169svn_utf__xfrm(const char **result, 170 const char *str, apr_size_t len, 171 svn_boolean_t case_insensitive, 172 svn_boolean_t accent_insensitive, 173 svn_membuf_t *buf); 174 175/* Return TRUE if S matches any of the const char * glob patterns in 176 * PATTERNS. 177 * 178 * S will internally be normalized to lower-case and accents removed 179 * using svn_utf__xfrm. To get a match, the PATTERNS must have been 180 * normalized accordingly before calling this function. 181 */ 182svn_boolean_t 183svn_utf__fuzzy_glob_match(const char *str, 184 const apr_array_header_t *patterns, 185 svn_membuf_t *buf); 186 187/* Check if STRING is a valid, NFC-normalized UTF-8 string. Note that 188 * a FALSE return value may indicate that STRING is not valid UTF-8 at 189 * all. 190 * 191 * Use SCRATCH_POOL for temporary allocations. 192 */ 193svn_boolean_t 194svn_utf__is_normalized(const char *string, apr_pool_t *scratch_pool); 195 196/* Encode an UCS-4 string to UTF-8, placing the result into BUFFER. 197 * While utf8proc does have a similar function, it does more checking 198 * and processing than we want here; this function does not attempt 199 * any normalizations but just encodes the individual code points. 200 * The encoded string will always be NUL-terminated. 201 * 202 * Return the length of the result (excluding the NUL terminator) in 203 * *result_length. 204 * 205 * A returned error indicates that a codepoint is invalid. 206 */ 207svn_error_t * 208svn_utf__encode_ucs4_string(svn_membuf_t *buffer, 209 const apr_int32_t *ucs4str, 210 apr_size_t length, 211 apr_size_t *result_length); 212 213/* Pattern matching similar to the SQLite LIKE and GLOB 214 * operators. PATTERN, KEY and ESCAPE must all point to UTF-8 215 * strings. Furthermore, ESCAPE, if provided, must be a character from 216 * the ASCII subset. 217 * 218 * If any of PATTERN_LEN, STRING_LEN or ESCAPE_LEN are 219 * SVN_UTF__UNKNOWN_LENGTH, assume the associated string is 220 * null-terminated; otherwise, consider the string only up to the 221 * given length. 222 * 223 * Use buffers PATTERN_BUF, STRING_BUF and TEMP_BUF for temporary storage. 224 * 225 * If SQL_LIKE is true, interpret PATTERN as a pattern used by the SQL 226 * LIKE operator and notice ESCAPE. Otherwise it's a Unix fileglob 227 * pattern, and ESCAPE must be NULL. 228 * 229 * Set *MATCH to the result of the comparison. 230*/ 231svn_error_t * 232svn_utf__glob(svn_boolean_t *match, 233 const char *pattern, apr_size_t pattern_len, 234 const char *string, apr_size_t string_len, 235 const char *escape, apr_size_t escape_len, 236 svn_boolean_t sql_like, 237 svn_membuf_t *pattern_buf, 238 svn_membuf_t *string_buf, 239 svn_membuf_t *temp_buf); 240 241/* Return the compiled version of the wrapped utf8proc library. */ 242const char * 243svn_utf__utf8proc_compiled_version(void); 244 245/* Return the runtime version of the wrapped utf8proc library. */ 246const char * 247svn_utf__utf8proc_runtime_version(void); 248 249/* Convert an UTF-16 (or UCS-2) string to UTF-8, returning the pointer 250 * in RESULT. If BIG_ENDIAN is set, then UTF16STR is big-endian; 251 * otherwise, it's little-endian. 252 * 253 * If UTF16LEN is SVN_UTF__UNKNOWN_LENGTH, then UTF16STR must be 254 * terminated with a zero; otherwise, it is the number of 16-bit codes 255 * to convert, and the source string may contain NUL values. 256 * 257 * Allocate RESULT in RESULT_POOL and use SCRATCH_POOL for 258 * intermediate allocation. 259 * 260 * This function combines UTF-16 surrogate pairs into single code 261 * points, but will leave single lead or trail surrogates unchanged. 262 */ 263svn_error_t * 264svn_utf__utf16_to_utf8(const svn_string_t **result, 265 const apr_uint16_t *utf16str, 266 apr_size_t utf16len, 267 svn_boolean_t big_endian, 268 apr_pool_t *result_pool, 269 apr_pool_t *scratch_pool); 270 271/* Convert an UTF-32 string to UTF-8, returning the pointer in 272 * RESULT. If BIG_ENDIAN is set, then UTF32STR is big-endian; 273 * otherwise, it's little-endian. 274 * 275 * If UTF32LEN is SVN_UTF__UNKNOWN_LENGTH, then UTF32STR must be 276 * terminated with a zero; otherwise, it is the number of 32-bit codes 277 * to convert, and the source string may contain NUL values. 278 * 279 * Allocate RESULT in RESULT_POOL and use SCRATCH_POOL for 280 * intermediate allocation. 281 */ 282svn_error_t * 283svn_utf__utf32_to_utf8(const svn_string_t **result, 284 const apr_int32_t *utf32str, 285 apr_size_t utf32len, 286 svn_boolean_t big_endian, 287 apr_pool_t *result_pool, 288 apr_pool_t *scratch_pool); 289 290 291#ifdef __cplusplus 292} 293#endif /* __cplusplus */ 294 295#endif /* SVN_UTF_PRIVATE_H */ 296