1/*
2 * utf8proc.c:  Wrappers for the utf8proc library
3 *
4 * ====================================================================
5 *    Licensed to the Apache Software Foundation (ASF) under one
6 *    or more contributor license agreements.  See the NOTICE file
7 *    distributed with this work for additional information
8 *    regarding copyright ownership.  The ASF licenses this file
9 *    to you under the Apache License, Version 2.0 (the
10 *    "License"); you may not use this file except in compliance
11 *    with the License.  You may obtain a copy of the License at
12 *
13 *      http://www.apache.org/licenses/LICENSE-2.0
14 *
15 *    Unless required by applicable law or agreed to in writing,
16 *    software distributed under the License is distributed on an
17 *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
18 *    KIND, either express or implied.  See the License for the
19 *    specific language governing permissions and limitations
20 *    under the License.
21 * ====================================================================
22 */
23
24
25
26#include <apr_fnmatch.h>
27
28#include "private/svn_string_private.h"
29#include "private/svn_utf_private.h"
30#include "svn_private_config.h"
31
32#if SVN_INTERNAL_UTF8PROC
33#define UTF8PROC_INLINE
34/* Somehow utf8proc thinks it is nice to use strlen as an argument name,
35   while this function is already defined via apr.h */
36#define strlen svn__strlen_var
37#include "utf8proc/utf8proc.c"
38#undef strlen
39#else
40#include <utf8proc.h>
41#endif
42
43
44
45const char *
46svn_utf__utf8proc_compiled_version(void)
47{
48  static const char utf8proc_version[] =
49                                  APR_STRINGIFY(UTF8PROC_VERSION_MAJOR) "."
50                                  APR_STRINGIFY(UTF8PROC_VERSION_MINOR) "."
51                                  APR_STRINGIFY(UTF8PROC_VERSION_PATCH);
52  return utf8proc_version;
53}
54
55const char *
56svn_utf__utf8proc_runtime_version(void)
57{
58  /* Unused static function warning removal hack. */
59  SVN_UNUSED(utf8proc_grapheme_break);
60  SVN_UNUSED(utf8proc_tolower);
61  SVN_UNUSED(utf8proc_toupper);
62#if UTF8PROC_VERSION_MAJOR >= 2
63  SVN_UNUSED(utf8proc_totitle);
64#endif
65  SVN_UNUSED(utf8proc_charwidth);
66  SVN_UNUSED(utf8proc_category_string);
67  SVN_UNUSED(utf8proc_NFD);
68  SVN_UNUSED(utf8proc_NFC);
69  SVN_UNUSED(utf8proc_NFKD);
70  SVN_UNUSED(utf8proc_NFKC);
71
72  return utf8proc_version();
73}
74
75
76
77/* Fill the given BUFFER with decomposed UCS-4 representation of the
78 * UTF-8 STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING
79 * is NUL-terminated; otherwise look only at the first LENGTH bytes in
80 * STRING. Upon return, BUFFER->data points at an array of UCS-4
81 * characters, and return the length of the array. TRANSFORM_FLAGS
82 * define exactly how the decomposition is performed.
83 *
84 * A negative return value is an utf8proc error code and may indicate
85 * that STRING contains invalid UTF-8 or was so long that an overflow
86 * occurred.
87 */
88static apr_ssize_t
89unicode_decomposition(int transform_flags,
90                      const char *string, apr_size_t length,
91                      svn_membuf_t *buffer)
92{
93  const int nullterm = (length == SVN_UTF__UNKNOWN_LENGTH
94                        ? UTF8PROC_NULLTERM : 0);
95
96  for (;;)
97    {
98      apr_int32_t *const ucs4buf = buffer->data;
99      const apr_ssize_t ucs4len = buffer->size / sizeof(*ucs4buf);
100      const apr_ssize_t result =
101        utf8proc_decompose((const void*) string, length, ucs4buf, ucs4len,
102                           UTF8PROC_DECOMPOSE | UTF8PROC_STABLE
103                           | transform_flags | nullterm);
104
105      if (result < 0 || result <= ucs4len)
106        return result;
107
108      /* Increase the decomposition buffer size and retry */
109      svn_membuf__ensure(buffer, result * sizeof(*ucs4buf));
110    }
111}
112
113/* Fill the given BUFFER with an NFD UCS-4 representation of the UTF-8
114 * STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING is
115 * NUL-terminated; otherwise look only at the first LENGTH bytes in
116 * STRING. Upon return, BUFFER->data points at an array of UCS-4
117 * characters and *RESULT_LENGTH contains the length of the array.
118 *
119 * A returned error may indicate that STRING contains invalid UTF-8 or
120 * invalid Unicode codepoints. Any error message comes from utf8proc.
121 */
122static svn_error_t *
123decompose_normalized(apr_size_t *result_length,
124                     const char *string, apr_size_t length,
125                     svn_membuf_t *buffer)
126{
127  apr_ssize_t result = unicode_decomposition(0, string, length, buffer);
128  if (result < 0)
129    return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
130                            gettext(utf8proc_errmsg(result)));
131  *result_length = result;
132  return SVN_NO_ERROR;
133}
134
135/* Fill the given BUFFER with an NFC UTF-8 representation of the UTF-8
136 * STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING is
137 * NUL-terminated; otherwise look only at the first LENGTH bytes in
138 * STRING. Upon return, BUFFER->data points at a NUL-terminated string
139 * of UTF-8 characters.
140 *
141 * If CASEFOLD is non-zero, perform Unicode case folding, e.g., for
142 * case-insensitive string comparison. If STRIPMARK is non-zero, strip
143 * all diacritical marks (e.g., accents) from the string.
144 *
145 * A returned error may indicate that STRING contains invalid UTF-8 or
146 * invalid Unicode codepoints. Any error message comes from utf8proc.
147 */
148static svn_error_t *
149normalize_cstring(apr_size_t *result_length,
150                  const char *string, apr_size_t length,
151                  svn_boolean_t casefold,
152                  svn_boolean_t stripmark,
153                  svn_membuf_t *buffer)
154{
155  int flags = 0;
156  apr_ssize_t result;
157
158  if (casefold)
159    flags |= UTF8PROC_CASEFOLD;
160
161  if (stripmark)
162    flags |= UTF8PROC_STRIPMARK;
163
164  result = unicode_decomposition(flags, string, length, buffer);
165  if (result >= 0)
166    {
167      svn_membuf__resize(buffer, result * sizeof(apr_int32_t) + 1);
168      result = utf8proc_reencode(buffer->data, result,
169                                 UTF8PROC_COMPOSE | UTF8PROC_STABLE);
170    }
171  if (result < 0)
172    return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
173                            gettext(utf8proc_errmsg(result)));
174  *result_length = result;
175  return SVN_NO_ERROR;
176}
177
178/* Compare two arrays of UCS-4 codes, BUFA of length LENA and BUFB of
179 * length LENB. Return 0 if they're equal, a negative value if BUFA is
180 * less than BUFB, otherwise a positive value.
181 *
182 * Yes, this is strcmp for known-length UCS-4 strings.
183 */
184static int
185ucs4cmp(const apr_int32_t *bufa, apr_size_t lena,
186        const apr_int32_t *bufb, apr_size_t lenb)
187{
188  const apr_size_t len = (lena < lenb ? lena : lenb);
189  apr_size_t i;
190
191  for (i = 0; i < len; ++i)
192    {
193      const int diff = bufa[i] - bufb[i];
194      if (diff)
195        return diff;
196    }
197  return (lena == lenb ? 0 : (lena < lenb ? -1 : 1));
198}
199
200svn_error_t *
201svn_utf__normcmp(int *result,
202                 const char *str1, apr_size_t len1,
203                 const char *str2, apr_size_t len2,
204                 svn_membuf_t *buf1, svn_membuf_t *buf2)
205{
206  apr_size_t buflen1;
207  apr_size_t buflen2;
208
209  /* Shortcut-circuit the decision if at least one of the strings is empty. */
210  const svn_boolean_t empty1 =
211    (0 == len1 || (len1 == SVN_UTF__UNKNOWN_LENGTH && !*str1));
212  const svn_boolean_t empty2 =
213    (0 == len2 || (len2 == SVN_UTF__UNKNOWN_LENGTH && !*str2));
214  if (empty1 || empty2)
215    {
216      *result = (empty1 == empty2 ? 0 : (empty1 ? -1 : 1));
217      return SVN_NO_ERROR;
218    }
219
220  SVN_ERR(decompose_normalized(&buflen1, str1, len1, buf1));
221  SVN_ERR(decompose_normalized(&buflen2, str2, len2, buf2));
222  *result = ucs4cmp(buf1->data, buflen1, buf2->data, buflen2);
223  return SVN_NO_ERROR;
224}
225
226svn_error_t*
227svn_utf__normalize(const char **result,
228                   const char *str, apr_size_t len,
229                   svn_membuf_t *buf)
230{
231  apr_size_t result_length;
232  SVN_ERR(normalize_cstring(&result_length, str, len, FALSE, FALSE, buf));
233  *result = (const char*)(buf->data);
234  return SVN_NO_ERROR;
235}
236
237svn_error_t *
238svn_utf__xfrm(const char **result,
239              const char *str, apr_size_t len,
240              svn_boolean_t case_insensitive,
241              svn_boolean_t accent_insensitive,
242              svn_membuf_t *buf)
243{
244  apr_size_t result_length;
245  SVN_ERR(normalize_cstring(&result_length, str, len,
246                            case_insensitive, accent_insensitive, buf));
247  *result = (const char*)(buf->data);
248  return SVN_NO_ERROR;
249}
250
251svn_boolean_t
252svn_utf__fuzzy_glob_match(const char *str,
253                          const apr_array_header_t *patterns,
254                          svn_membuf_t *buf)
255{
256  const char *normalized;
257  svn_error_t *err;
258  int i;
259
260  /* Try to normalize case and accents in STR.
261   *
262   * If that should fail for some reason, consider STR a mismatch. */
263  err = svn_utf__xfrm(&normalized, str, strlen(str), TRUE, TRUE, buf);
264  if (err)
265    {
266      svn_error_clear(err);
267      return FALSE;
268    }
269
270  /* Now see whether it matches any/all of the patterns. */
271  for (i = 0; i < patterns->nelts; ++i)
272    {
273      const char *pattern = APR_ARRAY_IDX(patterns, i, const char *);
274      if (apr_fnmatch(pattern, normalized, 0) == APR_SUCCESS)
275        return TRUE;
276    }
277
278  return FALSE;
279}
280
281/* Decode a single UCS-4 code point to UTF-8, appending the result to BUFFER.
282 * Assume BUFFER is already filled to *LENGTH and return the new size there.
283 * This function does *not* nul-terminate the stringbuf!
284 *
285 * A returned error indicates that the codepoint is invalid.
286 */
287static svn_error_t *
288encode_ucs4(svn_membuf_t *buffer, apr_int32_t ucs4chr, apr_size_t *length)
289{
290  apr_size_t utf8len;
291
292  if (buffer->size - *length < 4)
293    svn_membuf__resize(buffer, buffer->size + 4);
294
295  utf8len = utf8proc_encode_char(ucs4chr, ((apr_byte_t*)buffer->data + *length));
296  if (!utf8len)
297    return svn_error_createf(SVN_ERR_UTF8PROC_ERROR, NULL,
298                             _("Invalid Unicode character U+%04lX"),
299                             (long)ucs4chr);
300  *length += utf8len;
301  return SVN_NO_ERROR;
302}
303
304svn_error_t *
305svn_utf__encode_ucs4_string(svn_membuf_t *buffer,
306                            const apr_int32_t *ucs4str,
307                            apr_size_t length,
308                            apr_size_t *result_length)
309{
310  *result_length = 0;
311  while (length-- > 0)
312    SVN_ERR(encode_ucs4(buffer, *ucs4str++, result_length));
313  svn_membuf__resize(buffer, *result_length + 1);
314  ((char*)buffer->data)[*result_length] = '\0';
315  return SVN_NO_ERROR;
316}
317
318
319svn_error_t *
320svn_utf__glob(svn_boolean_t *match,
321              const char *pattern, apr_size_t pattern_len,
322              const char *string, apr_size_t string_len,
323              const char *escape, apr_size_t escape_len,
324              svn_boolean_t sql_like,
325              svn_membuf_t *pattern_buf,
326              svn_membuf_t *string_buf,
327              svn_membuf_t *temp_buf)
328{
329  apr_size_t patternbuf_len;
330  apr_size_t tempbuf_len;
331
332  /* If we're in GLOB mode, we don't do custom escape chars. */
333  if (escape && !sql_like)
334    return svn_error_create(SVN_ERR_UTF8_GLOB, NULL,
335                            _("Cannot use a custom escape token"
336                              " in glob matching mode"));
337
338  /* Convert the patern to NFD UTF-8. We can't use the UCS-4 result
339     because apr_fnmatch can't handle it.*/
340  SVN_ERR(decompose_normalized(&tempbuf_len, pattern, pattern_len, temp_buf));
341  if (!sql_like)
342    SVN_ERR(svn_utf__encode_ucs4_string(pattern_buf, temp_buf->data,
343                                        tempbuf_len, &patternbuf_len));
344  else
345    {
346      /* Convert a LIKE pattern to a GLOB pattern that apr_fnmatch can use. */
347      const apr_int32_t *like = temp_buf->data;
348      apr_int32_t ucs4esc;
349      svn_boolean_t escaped;
350      apr_size_t i;
351
352      if (!escape)
353        ucs4esc = -1;           /* Definitely an invalid UCS-4 character. */
354      else
355        {
356          const int nullterm = (escape_len == SVN_UTF__UNKNOWN_LENGTH
357                                ? UTF8PROC_NULLTERM : 0);
358          apr_ssize_t result =
359            utf8proc_decompose((const void*) escape, escape_len, &ucs4esc, 1,
360                               UTF8PROC_DECOMPOSE | UTF8PROC_STABLE | nullterm);
361          if (result < 0)
362            return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
363                                    gettext(utf8proc_errmsg(result)));
364          if (result == 0 || result > 1)
365            return svn_error_create(SVN_ERR_UTF8_GLOB, NULL,
366                                    _("Escape token must be one character"));
367          if ((ucs4esc & 0xFF) != ucs4esc)
368            return svn_error_createf(SVN_ERR_UTF8_GLOB, NULL,
369                                     _("Invalid escape character U+%04lX"),
370                                     (long)ucs4esc);
371        }
372
373      patternbuf_len = 0;
374      svn_membuf__ensure(pattern_buf, tempbuf_len + 1);
375      for (i = 0, escaped = FALSE; i < tempbuf_len; ++i, ++like)
376        {
377          if (*like == ucs4esc && !escaped)
378            {
379              svn_membuf__resize(pattern_buf, patternbuf_len + 1);
380              ((char*)pattern_buf->data)[patternbuf_len++] = '\\';
381              escaped = TRUE;
382            }
383          else if (escaped)
384            {
385              SVN_ERR(encode_ucs4(pattern_buf, *like, &patternbuf_len));
386              escaped = FALSE;
387            }
388          else
389            {
390              if ((*like == '[' || *like == '\\') && !escaped)
391                {
392                  /* Escape brackets and backslashes which are always
393                     literals in LIKE patterns. */
394                  svn_membuf__resize(pattern_buf, patternbuf_len + 1);
395                  ((char*)pattern_buf->data)[patternbuf_len++] = '\\';
396                  escaped = TRUE;
397                  --i; --like;
398                  continue;
399                }
400
401              /* Replace LIKE wildcards with their GLOB equivalents. */
402              if (*like == '%' || *like == '_')
403                {
404                  const char wildcard = (*like == '%' ? '*' : '?');
405                  svn_membuf__resize(pattern_buf, patternbuf_len + 1);
406                  ((char*)pattern_buf->data)[patternbuf_len++] = wildcard;
407                }
408              else
409                SVN_ERR(encode_ucs4(pattern_buf, *like, &patternbuf_len));
410            }
411        }
412      svn_membuf__resize(pattern_buf, patternbuf_len + 1);
413      ((char*)pattern_buf->data)[patternbuf_len] = '\0';
414    }
415
416  /* Now normalize the string */
417  SVN_ERR(decompose_normalized(&tempbuf_len, string, string_len, temp_buf));
418  SVN_ERR(svn_utf__encode_ucs4_string(string_buf, temp_buf->data,
419                                      tempbuf_len, &tempbuf_len));
420
421  *match = !apr_fnmatch(pattern_buf->data, string_buf->data, 0);
422  return SVN_NO_ERROR;
423}
424
425svn_boolean_t
426svn_utf__is_normalized(const char *string, apr_pool_t *scratch_pool)
427{
428  svn_error_t *err;
429  svn_membuf_t buffer;
430  apr_size_t result_length;
431  const apr_size_t length = strlen(string);
432  svn_membuf__create(&buffer, length * sizeof(apr_int32_t), scratch_pool);
433  err = normalize_cstring(&result_length, string, length,
434                          FALSE, FALSE, &buffer);
435  if (err)
436    {
437      svn_error_clear(err);
438      return FALSE;
439    }
440  return (length == result_length && 0 == strcmp(string, buffer.data));
441}
442
443const char *
444svn_utf__fuzzy_escape(const char *src, apr_size_t length, apr_pool_t *pool)
445{
446  /* Hexadecimal digits for code conversion. */
447  static const char digits[] = "0123456789ABCDEF";
448
449  /* Flags used for Unicode decomposition. */
450  static const int decomp_flags = (
451      UTF8PROC_COMPAT | UTF8PROC_STABLE | UTF8PROC_LUMP
452      | UTF8PROC_NLF2LF | UTF8PROC_STRIPCC | UTF8PROC_STRIPMARK);
453
454  svn_stringbuf_t *result;
455  svn_membuf_t buffer;
456  apr_ssize_t decomp_length;
457  apr_ssize_t len;
458
459  /* Decompose to a non-reversible compatibility format. */
460  svn_membuf__create(&buffer, length * sizeof(apr_int32_t), pool);
461  decomp_length = unicode_decomposition(decomp_flags, src, length, &buffer);
462  if (decomp_length < 0)
463    {
464      svn_membuf_t part;
465      apr_size_t done, prev;
466
467      /* The only other error we can receive here indicates an integer
468         overflow due to the length of the input string. Not very
469         likely, but we certainly shouldn't continue in that case. */
470      SVN_ERR_ASSERT_NO_RETURN(decomp_length == UTF8PROC_ERROR_INVALIDUTF8);
471
472      /* Break the decomposition into parts that are valid UTF-8, and
473         bytes that are not. Represent the invalid bytes in the target
474         erray by their negative value. This works because utf8proc
475         will not generate Unicode code points with values larger than
476         U+10FFFF. */
477      svn_membuf__create(&part, sizeof(apr_int32_t), pool);
478      decomp_length = 0;
479      done = prev = 0;
480      while (done < length)
481        {
482          apr_int32_t uc;
483
484          while (done < length)
485            {
486              len = utf8proc_iterate((apr_byte_t*)src + done, length - done, &uc);
487              if (len < 0)
488                break;
489              done += len;
490            }
491
492          /* Decompose the valid part */
493          if (done > prev)
494            {
495              len = unicode_decomposition(
496                  decomp_flags, src + prev, done - prev, &part);
497              SVN_ERR_ASSERT_NO_RETURN(len > 0);
498              svn_membuf__resize(
499                  &buffer, (decomp_length + len) * sizeof(apr_int32_t));
500              memcpy((apr_int32_t*)buffer.data + decomp_length,
501                     part.data, len * sizeof(apr_int32_t));
502              decomp_length += len;
503              prev = done;
504            }
505
506          /* What follows could be a valid UTF-8 sequence, but not
507             a valid Unicode character. */
508          if (done < length)
509            {
510              const char *last;
511
512              /* Determine the length of the UTF-8 sequence */
513              const char *const p = src + done;
514              len = utf8proc_utf8class[(apr_byte_t)*p];
515
516              /* Check if the multi-byte sequence is valid UTF-8. */
517              if (len > 1 && len <= (apr_ssize_t)(length - done))
518                last = svn_utf__last_valid(p, len);
519              else
520                last = NULL;
521
522              /* Might not be a valid UTF-8 sequence at all */
523              if (!last || (last && last - p < len))
524                {
525                  uc = -((apr_int32_t)(*p & 0xff));
526                  len = 1;
527                }
528              else
529                {
530                  switch (len)
531                    {
532                      /* Decode the UTF-8 sequence without validation. */
533                    case 2:
534                      uc = ((p[0] & 0x1f) <<  6) + (p[1] & 0x3f);
535                      break;
536                    case 3:
537                      uc = (((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) <<  6)
538                            + (p[2] & 0x3f));
539                      break;
540                    case 4:
541                      uc = (((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12)
542                            + ((p[2] & 0x3f) <<  6) + (p[3] & 0x3f));
543                      break;
544                    default:
545                      SVN_ERR_ASSERT_NO_RETURN(
546                          !"Unexpected invalid UTF-8 byte");
547                    }
548
549                }
550
551              svn_membuf__resize(
552                  &buffer, (decomp_length + 1) * sizeof(apr_int32_t));
553              ((apr_int32_t*)buffer.data)[decomp_length++] = uc;
554              done += len;
555              prev = done;
556            }
557        }
558    }
559
560  /* Scan the result and deleting any combining diacriticals and
561     inserting placeholders where any non-ascii characters remain.  */
562  result = svn_stringbuf_create_ensure(decomp_length, pool);
563  for (len = 0; len < decomp_length; ++len)
564    {
565      const apr_int32_t cp = ((apr_int32_t*)buffer.data)[len];
566      if (cp > 0 && cp < 127)
567        svn_stringbuf_appendbyte(result, (char)cp);
568      else if (cp == 0)
569        svn_stringbuf_appendcstr(result, "\\0");
570      else if (cp < 0)
571        {
572          const apr_int32_t rcp = ((-cp) & 0xff);
573          svn_stringbuf_appendcstr(result, "?\\");
574          svn_stringbuf_appendbyte(result, digits[(rcp & 0x00f0) >> 4]);
575          svn_stringbuf_appendbyte(result, digits[(rcp & 0x000f)]);
576        }
577      else
578        {
579          if (utf8proc_codepoint_valid(cp))
580            {
581              const utf8proc_property_t *prop = utf8proc_get_property(cp);
582              if (prop->combining_class != 0)
583                continue;           /* Combining mark; ignore */
584              svn_stringbuf_appendcstr(result, "{U+");
585            }
586          else
587            svn_stringbuf_appendcstr(result, "{U?");
588          if (cp > 0xffff)
589            {
590              svn_stringbuf_appendbyte(result, digits[(cp & 0xf00000) >> 20]);
591              svn_stringbuf_appendbyte(result, digits[(cp & 0x0f0000) >> 16]);
592            }
593          svn_stringbuf_appendbyte(result, digits[(cp & 0xf000) >> 12]);
594          svn_stringbuf_appendbyte(result, digits[(cp & 0x0f00) >> 8]);
595          svn_stringbuf_appendbyte(result, digits[(cp & 0x00f0) >> 4]);
596          svn_stringbuf_appendbyte(result, digits[(cp & 0x000f)]);
597          svn_stringbuf_appendbyte(result, '}');
598        }
599    }
600
601  return result->data;
602}
603