utf8proc.c revision 289180
1/*
2 * utf8proc.c:  Wrappers for the utf8proc library
3 *
4 * ====================================================================
5 *    Licensed to the Apache Software Foundation (ASF) under one
6 *    or more contributor license agreements.  See the NOTICE file
7 *    distributed with this work for additional information
8 *    regarding copyright ownership.  The ASF licenses this file
9 *    to you under the Apache License, Version 2.0 (the
10 *    "License"); you may not use this file except in compliance
11 *    with the License.  You may obtain a copy of the License at
12 *
13 *      http://www.apache.org/licenses/LICENSE-2.0
14 *
15 *    Unless required by applicable law or agreed to in writing,
16 *    software distributed under the License is distributed on an
17 *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
18 *    KIND, either express or implied.  See the License for the
19 *    specific language governing permissions and limitations
20 *    under the License.
21 * ====================================================================
22 */
23
24
25
26#include <apr_fnmatch.h>
27
28#include "private/svn_string_private.h"
29#include "private/svn_utf_private.h"
30#include "svn_private_config.h"
31
32#define UTF8PROC_INLINE
33/* Somehow utf8proc thinks it is nice to use strlen as an argument name,
34   while this function is already defined via apr.h */
35#define strlen svn__strlen_var
36#include "utf8proc/utf8proc.c"
37#undef strlen
38
39
40
41const char *
42svn_utf__utf8proc_compiled_version(void)
43{
44  static const char utf8proc_version[] =
45                                  APR_STRINGIFY(UTF8PROC_VERSION_MAJOR) "."
46                                  APR_STRINGIFY(UTF8PROC_VERSION_MINOR) "."
47                                  APR_STRINGIFY(UTF8PROC_VERSION_PATCH);
48  return utf8proc_version;
49}
50
51const char *
52svn_utf__utf8proc_runtime_version(void)
53{
54  /* Unused static function warning removal hack. */
55  SVN_UNUSED(utf8proc_NFD);
56  SVN_UNUSED(utf8proc_NFC);
57  SVN_UNUSED(utf8proc_NFKD);
58  SVN_UNUSED(utf8proc_NFKC);
59
60  return utf8proc_version();
61}
62
63
64
65/* Fill the given BUFFER with decomposed UCS-4 representation of the
66 * UTF-8 STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING
67 * is NUL-terminated; otherwise look only at the first LENGTH bytes in
68 * STRING. Upon return, BUFFER->data points at an array of UCS-4
69 * characters, and return the length of the array. TRANSFORM_FLAGS
70 * define exactly how the decomposition is performed.
71 *
72 * A negative return value is an utf8proc error code and may indicate
73 * that STRING contains invalid UTF-8 or was so long that an overflow
74 * occurred.
75 */
76static ssize_t
77unicode_decomposition(int transform_flags,
78                      const char *string, apr_size_t length,
79                      svn_membuf_t *buffer)
80{
81  const int nullterm = (length == SVN_UTF__UNKNOWN_LENGTH
82                        ? UTF8PROC_NULLTERM : 0);
83
84  for (;;)
85    {
86      apr_int32_t *const ucs4buf = buffer->data;
87      const ssize_t ucs4len = buffer->size / sizeof(*ucs4buf);
88      const ssize_t result =
89        utf8proc_decompose((const void*) string, length, ucs4buf, ucs4len,
90                           UTF8PROC_DECOMPOSE | UTF8PROC_STABLE
91                           | transform_flags | nullterm);
92
93      if (result < 0 || result <= ucs4len)
94        return result;
95
96      /* Increase the decomposition buffer size and retry */
97      svn_membuf__ensure(buffer, result * sizeof(*ucs4buf));
98    }
99}
100
101/* Fill the given BUFFER with an NFD UCS-4 representation of the UTF-8
102 * STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING is
103 * NUL-terminated; otherwise look only at the first LENGTH bytes in
104 * STRING. Upon return, BUFFER->data points at an array of UCS-4
105 * characters and *RESULT_LENGTH contains the length of the array.
106 *
107 * A returned error may indicate that STRING contains invalid UTF-8 or
108 * invalid Unicode codepoints. Any error message comes from utf8proc.
109 */
110static svn_error_t *
111decompose_normalized(apr_size_t *result_length,
112                     const char *string, apr_size_t length,
113                     svn_membuf_t *buffer)
114{
115  ssize_t result = unicode_decomposition(0, string, length, buffer);
116  if (result < 0)
117    return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
118                            gettext(utf8proc_errmsg(result)));
119  *result_length = result;
120  return SVN_NO_ERROR;
121}
122
123/* Fill the given BUFFER with an NFC UTF-8 representation of the UTF-8
124 * STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING is
125 * NUL-terminated; otherwise look only at the first LENGTH bytes in
126 * STRING. Upon return, BUFFER->data points at a NUL-terminated string
127 * of UTF-8 characters.
128 *
129 * A returned error may indicate that STRING contains invalid UTF-8 or
130 * invalid Unicode codepoints. Any error message comes from utf8proc.
131 */
132static svn_error_t *
133normalize_cstring(apr_size_t *result_length,
134                  const char *string, apr_size_t length,
135                  svn_membuf_t *buffer)
136{
137  ssize_t result = unicode_decomposition(0, string, length, buffer);
138  if (result >= 0)
139    {
140      svn_membuf__resize(buffer, result * sizeof(apr_int32_t) + 1);
141      result = utf8proc_reencode(buffer->data, result,
142                                 UTF8PROC_COMPOSE | UTF8PROC_STABLE);
143    }
144  if (result < 0)
145    return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
146                            gettext(utf8proc_errmsg(result)));
147  *result_length = result;
148  return SVN_NO_ERROR;
149}
150
151/* Compare two arrays of UCS-4 codes, BUFA of length LENA and BUFB of
152 * length LENB. Return 0 if they're equal, a negative value if BUFA is
153 * less than BUFB, otherwise a positive value.
154 *
155 * Yes, this is strcmp for known-length UCS-4 strings.
156 */
157static int
158ucs4cmp(const apr_int32_t *bufa, apr_size_t lena,
159        const apr_int32_t *bufb, apr_size_t lenb)
160{
161  const apr_size_t len = (lena < lenb ? lena : lenb);
162  apr_size_t i;
163
164  for (i = 0; i < len; ++i)
165    {
166      const int diff = bufa[i] - bufb[i];
167      if (diff)
168        return diff;
169    }
170  return (lena == lenb ? 0 : (lena < lenb ? -1 : 1));
171}
172
173svn_error_t *
174svn_utf__normcmp(int *result,
175                 const char *str1, apr_size_t len1,
176                 const char *str2, apr_size_t len2,
177                 svn_membuf_t *buf1, svn_membuf_t *buf2)
178{
179  apr_size_t buflen1;
180  apr_size_t buflen2;
181
182  /* Shortcut-circuit the decision if at least one of the strings is empty. */
183  const svn_boolean_t empty1 =
184    (0 == len1 || (len1 == SVN_UTF__UNKNOWN_LENGTH && !*str1));
185  const svn_boolean_t empty2 =
186    (0 == len2 || (len2 == SVN_UTF__UNKNOWN_LENGTH && !*str2));
187  if (empty1 || empty2)
188    {
189      *result = (empty1 == empty2 ? 0 : (empty1 ? -1 : 1));
190      return SVN_NO_ERROR;
191    }
192
193  SVN_ERR(decompose_normalized(&buflen1, str1, len1, buf1));
194  SVN_ERR(decompose_normalized(&buflen2, str2, len2, buf2));
195  *result = ucs4cmp(buf1->data, buflen1, buf2->data, buflen2);
196  return SVN_NO_ERROR;
197}
198
199svn_error_t*
200svn_utf__normalize(const char **result,
201                   const char *str, apr_size_t len,
202                   svn_membuf_t *buf)
203{
204  apr_size_t result_length;
205  SVN_ERR(normalize_cstring(&result_length, str, len, buf));
206  *result = (const char*)(buf->data);
207  return SVN_NO_ERROR;
208}
209
210/* Decode a single UCS-4 code point to UTF-8, appending the result to BUFFER.
211 * Assume BUFFER is already filled to *LENGTH and return the new size there.
212 * This function does *not* nul-terminate the stringbuf!
213 *
214 * A returned error indicates that the codepoint is invalid.
215 */
216static svn_error_t *
217encode_ucs4(svn_membuf_t *buffer, apr_int32_t ucs4chr, apr_size_t *length)
218{
219  apr_size_t utf8len;
220
221  if (buffer->size - *length < 4)
222    svn_membuf__resize(buffer, buffer->size + 4);
223
224  utf8len = utf8proc_encode_char(ucs4chr, ((uint8_t*)buffer->data + *length));
225  if (!utf8len)
226    return svn_error_createf(SVN_ERR_UTF8PROC_ERROR, NULL,
227                             _("Invalid Unicode character U+%04lX"),
228                             (long)ucs4chr);
229  *length += utf8len;
230  return SVN_NO_ERROR;
231}
232
233svn_error_t *
234svn_utf__encode_ucs4_string(svn_membuf_t *buffer,
235                            const apr_int32_t *ucs4str,
236                            apr_size_t length,
237                            apr_size_t *result_length)
238{
239  *result_length = 0;
240  while (length-- > 0)
241    SVN_ERR(encode_ucs4(buffer, *ucs4str++, result_length));
242  svn_membuf__resize(buffer, *result_length + 1);
243  ((char*)buffer->data)[*result_length] = '\0';
244  return SVN_NO_ERROR;
245}
246
247
248svn_error_t *
249svn_utf__glob(svn_boolean_t *match,
250              const char *pattern, apr_size_t pattern_len,
251              const char *string, apr_size_t string_len,
252              const char *escape, apr_size_t escape_len,
253              svn_boolean_t sql_like,
254              svn_membuf_t *pattern_buf,
255              svn_membuf_t *string_buf,
256              svn_membuf_t *temp_buf)
257{
258  apr_size_t patternbuf_len;
259  apr_size_t tempbuf_len;
260
261  /* If we're in GLOB mode, we don't do custom escape chars. */
262  if (escape && !sql_like)
263    return svn_error_create(SVN_ERR_UTF8_GLOB, NULL,
264                            _("Cannot use a custom escape token"
265                              " in glob matching mode"));
266
267  /* Convert the patern to NFD UTF-8. We can't use the UCS-4 result
268     because apr_fnmatch can't handle it.*/
269  SVN_ERR(decompose_normalized(&tempbuf_len, pattern, pattern_len, temp_buf));
270  if (!sql_like)
271    SVN_ERR(svn_utf__encode_ucs4_string(pattern_buf, temp_buf->data,
272                                        tempbuf_len, &patternbuf_len));
273  else
274    {
275      /* Convert a LIKE pattern to a GLOB pattern that apr_fnmatch can use. */
276      const apr_int32_t *like = temp_buf->data;
277      apr_int32_t ucs4esc;
278      svn_boolean_t escaped;
279      apr_size_t i;
280
281      if (!escape)
282        ucs4esc = -1;           /* Definitely an invalid UCS-4 character. */
283      else
284        {
285          const int nullterm = (escape_len == SVN_UTF__UNKNOWN_LENGTH
286                                ? UTF8PROC_NULLTERM : 0);
287          ssize_t result =
288            utf8proc_decompose((const void*) escape, escape_len, &ucs4esc, 1,
289                               UTF8PROC_DECOMPOSE | UTF8PROC_STABLE | nullterm);
290          if (result < 0)
291            return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
292                                    gettext(utf8proc_errmsg(result)));
293          if (result == 0 || result > 1)
294            return svn_error_create(SVN_ERR_UTF8_GLOB, NULL,
295                                    _("Escape token must be one character"));
296          if ((ucs4esc & 0xFF) != ucs4esc)
297            return svn_error_createf(SVN_ERR_UTF8_GLOB, NULL,
298                                     _("Invalid escape character U+%04lX"),
299                                     (long)ucs4esc);
300        }
301
302      patternbuf_len = 0;
303      svn_membuf__ensure(pattern_buf, tempbuf_len + 1);
304      for (i = 0, escaped = FALSE; i < tempbuf_len; ++i, ++like)
305        {
306          if (*like == ucs4esc && !escaped)
307            {
308              svn_membuf__resize(pattern_buf, patternbuf_len + 1);
309              ((char*)pattern_buf->data)[patternbuf_len++] = '\\';
310              escaped = TRUE;
311            }
312          else if (escaped)
313            {
314              SVN_ERR(encode_ucs4(pattern_buf, *like, &patternbuf_len));
315              escaped = FALSE;
316            }
317          else
318            {
319              if ((*like == '[' || *like == '\\') && !escaped)
320                {
321                  /* Escape brackets and backslashes which are always
322                     literals in LIKE patterns. */
323                  svn_membuf__resize(pattern_buf, patternbuf_len + 1);
324                  ((char*)pattern_buf->data)[patternbuf_len++] = '\\';
325                  escaped = TRUE;
326                  --i; --like;
327                  continue;
328                }
329
330              /* Replace LIKE wildcards with their GLOB equivalents. */
331              if (*like == '%' || *like == '_')
332                {
333                  const char wildcard = (*like == '%' ? '*' : '?');
334                  svn_membuf__resize(pattern_buf, patternbuf_len + 1);
335                  ((char*)pattern_buf->data)[patternbuf_len++] = wildcard;
336                }
337              else
338                SVN_ERR(encode_ucs4(pattern_buf, *like, &patternbuf_len));
339            }
340        }
341      svn_membuf__resize(pattern_buf, patternbuf_len + 1);
342      ((char*)pattern_buf->data)[patternbuf_len] = '\0';
343    }
344
345  /* Now normalize the string */
346  SVN_ERR(decompose_normalized(&tempbuf_len, string, string_len, temp_buf));
347  SVN_ERR(svn_utf__encode_ucs4_string(string_buf, temp_buf->data,
348                                      tempbuf_len, &tempbuf_len));
349
350  *match = !apr_fnmatch(pattern_buf->data, string_buf->data, 0);
351  return SVN_NO_ERROR;
352}
353
354svn_boolean_t
355svn_utf__is_normalized(const char *string, apr_pool_t *scratch_pool)
356{
357  svn_error_t *err;
358  svn_membuf_t buffer;
359  apr_size_t result_length;
360  const apr_size_t length = strlen(string);
361  svn_membuf__create(&buffer, length * sizeof(apr_int32_t), scratch_pool);
362  err = normalize_cstring(&result_length, string, length, &buffer);
363  if (err)
364    {
365      svn_error_clear(err);
366      return FALSE;
367    }
368  return (length == result_length && 0 == strcmp(string, buffer.data));
369}
370
371const char *
372svn_utf__fuzzy_escape(const char *src, apr_size_t length, apr_pool_t *pool)
373{
374  /* Hexadecimal digits for code conversion. */
375  static const char digits[] = "0123456789ABCDEF";
376
377  /* Flags used for Unicode decomposition. */
378  static const int decomp_flags = (
379      UTF8PROC_COMPAT | UTF8PROC_STABLE | UTF8PROC_LUMP
380      | UTF8PROC_NLF2LF | UTF8PROC_STRIPCC | UTF8PROC_STRIPMARK);
381
382  svn_stringbuf_t *result;
383  svn_membuf_t buffer;
384  ssize_t decomp_length;
385  ssize_t len;
386
387  /* Decompose to a non-reversible compatibility format. */
388  svn_membuf__create(&buffer, length * sizeof(apr_int32_t), pool);
389  decomp_length = unicode_decomposition(decomp_flags, src, length, &buffer);
390  if (decomp_length < 0)
391    {
392      svn_membuf_t part;
393      apr_size_t done, prev;
394
395      /* The only other error we can receive here indicates an integer
396         overflow due to the length of the input string. Not very
397         likely, but we certainly shouldn't continue in that case. */
398      SVN_ERR_ASSERT_NO_RETURN(decomp_length == UTF8PROC_ERROR_INVALIDUTF8);
399
400      /* Break the decomposition into parts that are valid UTF-8, and
401         bytes that are not. Represent the invalid bytes in the target
402         erray by their negative value. This works because utf8proc
403         will not generate Unicode code points with values larger than
404         U+10FFFF. */
405      svn_membuf__create(&part, sizeof(apr_int32_t), pool);
406      decomp_length = 0;
407      done = prev = 0;
408      while (done < length)
409        {
410          apr_int32_t uc;
411
412          while (done < length)
413            {
414              len = utf8proc_iterate((uint8_t*)src + done, length - done, &uc);
415              if (len < 0)
416                break;
417              done += len;
418            }
419
420          /* Decompose the valid part */
421          if (done > prev)
422            {
423              len = unicode_decomposition(
424                  decomp_flags, src + prev, done - prev, &part);
425              SVN_ERR_ASSERT_NO_RETURN(len > 0);
426              svn_membuf__resize(
427                  &buffer, (decomp_length + len) * sizeof(apr_int32_t));
428              memcpy((apr_int32_t*)buffer.data + decomp_length,
429                     part.data, len * sizeof(apr_int32_t));
430              decomp_length += len;
431              prev = done;
432            }
433
434          /* What follows could be a valid UTF-8 sequence, but not
435             a valid Unicode character. */
436          if (done < length)
437            {
438              const char *last;
439
440              /* Determine the length of the UTF-8 sequence */
441              const char *const p = src + done;
442              len = utf8proc_utf8class[(uint8_t)*p];
443
444              /* Check if the multi-byte sequence is valid UTF-8. */
445              if (len > 1 && len <= (apr_ssize_t)(length - done))
446                last = svn_utf__last_valid(p, len);
447              else
448                last = NULL;
449
450              /* Might not be a valid UTF-8 sequence at all */
451              if (!last || (last && last - p < len))
452                {
453                  uc = -((apr_int32_t)(*p & 0xff));
454                  len = 1;
455                }
456              else
457                {
458                  switch (len)
459                    {
460                      /* Decode the UTF-8 sequence without validation. */
461                    case 2:
462                      uc = ((p[0] & 0x1f) <<  6) + (p[1] & 0x3f);
463                      break;
464                    case 3:
465                      uc = (((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) <<  6)
466                            + (p[2] & 0x3f));
467                      break;
468                    case 4:
469                      uc = (((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12)
470                            + ((p[2] & 0x3f) <<  6) + (p[3] & 0x3f));
471                      break;
472                    default:
473                      SVN_ERR_ASSERT_NO_RETURN(
474                          !"Unexpected invalid UTF-8 byte");
475                    }
476
477                }
478
479              svn_membuf__resize(
480                  &buffer, (decomp_length + 1) * sizeof(apr_int32_t));
481              ((apr_int32_t*)buffer.data)[decomp_length++] = uc;
482              done += len;
483              prev = done;
484            }
485        }
486    }
487
488  /* Scan the result and deleting any combining diacriticals and
489     inserting placeholders where any non-ascii characters remain.  */
490  result = svn_stringbuf_create_ensure(decomp_length, pool);
491  for (len = 0; len < decomp_length; ++len)
492    {
493      const apr_int32_t cp = ((apr_int32_t*)buffer.data)[len];
494      if (cp > 0 && cp < 127)
495        svn_stringbuf_appendbyte(result, (char)cp);
496      else if (cp == 0)
497        svn_stringbuf_appendcstr(result, "\\0");
498      else if (cp < 0)
499        {
500          const apr_int32_t rcp = ((-cp) & 0xff);
501          svn_stringbuf_appendcstr(result, "?\\");
502          svn_stringbuf_appendbyte(result, digits[(rcp & 0x00f0) >> 4]);
503          svn_stringbuf_appendbyte(result, digits[(rcp & 0x000f)]);
504        }
505      else
506        {
507          if (utf8proc_codepoint_valid(cp))
508            {
509              const utf8proc_property_t *prop = utf8proc_get_property(cp);
510              if (prop->combining_class != 0)
511                continue;           /* Combining mark; ignore */
512              svn_stringbuf_appendcstr(result, "{U+");
513            }
514          else
515            svn_stringbuf_appendcstr(result, "{U?");
516          if (cp > 0xffff)
517            {
518              svn_stringbuf_appendbyte(result, digits[(cp & 0xf00000) >> 20]);
519              svn_stringbuf_appendbyte(result, digits[(cp & 0x0f0000) >> 16]);
520            }
521          svn_stringbuf_appendbyte(result, digits[(cp & 0xf000) >> 12]);
522          svn_stringbuf_appendbyte(result, digits[(cp & 0x0f00) >> 8]);
523          svn_stringbuf_appendbyte(result, digits[(cp & 0x00f0) >> 4]);
524          svn_stringbuf_appendbyte(result, digits[(cp & 0x000f)]);
525          svn_stringbuf_appendbyte(result, '}');
526        }
527    }
528
529  return result->data;
530}
531