1289177Speter/*
2289177Speter * utf8proc.c:  Wrappers for the utf8proc library
3289177Speter *
4289177Speter * ====================================================================
5289177Speter *    Licensed to the Apache Software Foundation (ASF) under one
6289177Speter *    or more contributor license agreements.  See the NOTICE file
7289177Speter *    distributed with this work for additional information
8289177Speter *    regarding copyright ownership.  The ASF licenses this file
9289177Speter *    to you under the Apache License, Version 2.0 (the
10289177Speter *    "License"); you may not use this file except in compliance
11289177Speter *    with the License.  You may obtain a copy of the License at
12289177Speter *
13289177Speter *      http://www.apache.org/licenses/LICENSE-2.0
14289177Speter *
15289177Speter *    Unless required by applicable law or agreed to in writing,
16289177Speter *    software distributed under the License is distributed on an
17289177Speter *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
18289177Speter *    KIND, either express or implied.  See the License for the
19289177Speter *    specific language governing permissions and limitations
20289177Speter *    under the License.
21289177Speter * ====================================================================
22289177Speter */
23289177Speter
24289177Speter
25289177Speter
26289177Speter#include <apr_fnmatch.h>
27289177Speter
28289177Speter#include "private/svn_string_private.h"
29289177Speter#include "private/svn_utf_private.h"
30289177Speter#include "svn_private_config.h"
31289177Speter
32362181Sdim#if SVN_INTERNAL_UTF8PROC
33289177Speter#define UTF8PROC_INLINE
34289177Speter/* Somehow utf8proc thinks it is nice to use strlen as an argument name,
35289177Speter   while this function is already defined via apr.h */
36289177Speter#define strlen svn__strlen_var
37289177Speter#include "utf8proc/utf8proc.c"
38289177Speter#undef strlen
39362181Sdim#else
40362181Sdim#include <utf8proc.h>
41362181Sdim#endif
42289177Speter
43289177Speter
44289177Speter
45289177Speterconst char *
46289177Spetersvn_utf__utf8proc_compiled_version(void)
47289177Speter{
48289177Speter  static const char utf8proc_version[] =
49289177Speter                                  APR_STRINGIFY(UTF8PROC_VERSION_MAJOR) "."
50289177Speter                                  APR_STRINGIFY(UTF8PROC_VERSION_MINOR) "."
51289177Speter                                  APR_STRINGIFY(UTF8PROC_VERSION_PATCH);
52289177Speter  return utf8proc_version;
53289177Speter}
54289177Speter
55289177Speterconst char *
56289177Spetersvn_utf__utf8proc_runtime_version(void)
57289177Speter{
58289177Speter  /* Unused static function warning removal hack. */
59362181Sdim  SVN_UNUSED(utf8proc_grapheme_break);
60362181Sdim  SVN_UNUSED(utf8proc_tolower);
61362181Sdim  SVN_UNUSED(utf8proc_toupper);
62362181Sdim#if UTF8PROC_VERSION_MAJOR >= 2
63362181Sdim  SVN_UNUSED(utf8proc_totitle);
64362181Sdim#endif
65362181Sdim  SVN_UNUSED(utf8proc_charwidth);
66362181Sdim  SVN_UNUSED(utf8proc_category_string);
67289177Speter  SVN_UNUSED(utf8proc_NFD);
68289177Speter  SVN_UNUSED(utf8proc_NFC);
69289177Speter  SVN_UNUSED(utf8proc_NFKD);
70289177Speter  SVN_UNUSED(utf8proc_NFKC);
71289177Speter
72289177Speter  return utf8proc_version();
73289177Speter}
74289177Speter
75289177Speter
76289177Speter
77289177Speter/* Fill the given BUFFER with decomposed UCS-4 representation of the
78289177Speter * UTF-8 STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING
79289177Speter * is NUL-terminated; otherwise look only at the first LENGTH bytes in
80289177Speter * STRING. Upon return, BUFFER->data points at an array of UCS-4
81289177Speter * characters, and return the length of the array. TRANSFORM_FLAGS
82289177Speter * define exactly how the decomposition is performed.
83289177Speter *
84289177Speter * A negative return value is an utf8proc error code and may indicate
85289177Speter * that STRING contains invalid UTF-8 or was so long that an overflow
86289177Speter * occurred.
87289177Speter */
88362181Sdimstatic apr_ssize_t
89289177Speterunicode_decomposition(int transform_flags,
90289177Speter                      const char *string, apr_size_t length,
91289177Speter                      svn_membuf_t *buffer)
92289177Speter{
93289177Speter  const int nullterm = (length == SVN_UTF__UNKNOWN_LENGTH
94289177Speter                        ? UTF8PROC_NULLTERM : 0);
95289177Speter
96289177Speter  for (;;)
97289177Speter    {
98289177Speter      apr_int32_t *const ucs4buf = buffer->data;
99362181Sdim      const apr_ssize_t ucs4len = buffer->size / sizeof(*ucs4buf);
100362181Sdim      const apr_ssize_t result =
101289177Speter        utf8proc_decompose((const void*) string, length, ucs4buf, ucs4len,
102289177Speter                           UTF8PROC_DECOMPOSE | UTF8PROC_STABLE
103289177Speter                           | transform_flags | nullterm);
104289177Speter
105289177Speter      if (result < 0 || result <= ucs4len)
106289177Speter        return result;
107289177Speter
108289177Speter      /* Increase the decomposition buffer size and retry */
109289177Speter      svn_membuf__ensure(buffer, result * sizeof(*ucs4buf));
110289177Speter    }
111289177Speter}
112289177Speter
113289177Speter/* Fill the given BUFFER with an NFD UCS-4 representation of the UTF-8
114289177Speter * STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING is
115289177Speter * NUL-terminated; otherwise look only at the first LENGTH bytes in
116289177Speter * STRING. Upon return, BUFFER->data points at an array of UCS-4
117289177Speter * characters and *RESULT_LENGTH contains the length of the array.
118289177Speter *
119289177Speter * A returned error may indicate that STRING contains invalid UTF-8 or
120289177Speter * invalid Unicode codepoints. Any error message comes from utf8proc.
121289177Speter */
122289177Speterstatic svn_error_t *
123289177Speterdecompose_normalized(apr_size_t *result_length,
124289177Speter                     const char *string, apr_size_t length,
125289177Speter                     svn_membuf_t *buffer)
126289177Speter{
127362181Sdim  apr_ssize_t result = unicode_decomposition(0, string, length, buffer);
128289177Speter  if (result < 0)
129289177Speter    return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
130289177Speter                            gettext(utf8proc_errmsg(result)));
131289177Speter  *result_length = result;
132289177Speter  return SVN_NO_ERROR;
133289177Speter}
134289177Speter
135289177Speter/* Fill the given BUFFER with an NFC UTF-8 representation of the UTF-8
136289177Speter * STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING is
137289177Speter * NUL-terminated; otherwise look only at the first LENGTH bytes in
138289177Speter * STRING. Upon return, BUFFER->data points at a NUL-terminated string
139289177Speter * of UTF-8 characters.
140289177Speter *
141362181Sdim * If CASEFOLD is non-zero, perform Unicode case folding, e.g., for
142362181Sdim * case-insensitive string comparison. If STRIPMARK is non-zero, strip
143362181Sdim * all diacritical marks (e.g., accents) from the string.
144362181Sdim *
145289177Speter * A returned error may indicate that STRING contains invalid UTF-8 or
146289177Speter * invalid Unicode codepoints. Any error message comes from utf8proc.
147289177Speter */
148289177Speterstatic svn_error_t *
149289177Speternormalize_cstring(apr_size_t *result_length,
150289177Speter                  const char *string, apr_size_t length,
151362181Sdim                  svn_boolean_t casefold,
152362181Sdim                  svn_boolean_t stripmark,
153289177Speter                  svn_membuf_t *buffer)
154289177Speter{
155362181Sdim  int flags = 0;
156362181Sdim  apr_ssize_t result;
157362181Sdim
158362181Sdim  if (casefold)
159362181Sdim    flags |= UTF8PROC_CASEFOLD;
160362181Sdim
161362181Sdim  if (stripmark)
162362181Sdim    flags |= UTF8PROC_STRIPMARK;
163362181Sdim
164362181Sdim  result = unicode_decomposition(flags, string, length, buffer);
165289177Speter  if (result >= 0)
166289177Speter    {
167289177Speter      svn_membuf__resize(buffer, result * sizeof(apr_int32_t) + 1);
168289177Speter      result = utf8proc_reencode(buffer->data, result,
169289177Speter                                 UTF8PROC_COMPOSE | UTF8PROC_STABLE);
170289177Speter    }
171289177Speter  if (result < 0)
172289177Speter    return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
173289177Speter                            gettext(utf8proc_errmsg(result)));
174289177Speter  *result_length = result;
175289177Speter  return SVN_NO_ERROR;
176289177Speter}
177289177Speter
178289177Speter/* Compare two arrays of UCS-4 codes, BUFA of length LENA and BUFB of
179289177Speter * length LENB. Return 0 if they're equal, a negative value if BUFA is
180289177Speter * less than BUFB, otherwise a positive value.
181289177Speter *
182289177Speter * Yes, this is strcmp for known-length UCS-4 strings.
183289177Speter */
184289177Speterstatic int
185289177Speterucs4cmp(const apr_int32_t *bufa, apr_size_t lena,
186289177Speter        const apr_int32_t *bufb, apr_size_t lenb)
187289177Speter{
188289177Speter  const apr_size_t len = (lena < lenb ? lena : lenb);
189289177Speter  apr_size_t i;
190289177Speter
191289177Speter  for (i = 0; i < len; ++i)
192289177Speter    {
193289177Speter      const int diff = bufa[i] - bufb[i];
194289177Speter      if (diff)
195289177Speter        return diff;
196289177Speter    }
197289177Speter  return (lena == lenb ? 0 : (lena < lenb ? -1 : 1));
198289177Speter}
199289177Speter
200289177Spetersvn_error_t *
201289177Spetersvn_utf__normcmp(int *result,
202289177Speter                 const char *str1, apr_size_t len1,
203289177Speter                 const char *str2, apr_size_t len2,
204289177Speter                 svn_membuf_t *buf1, svn_membuf_t *buf2)
205289177Speter{
206289177Speter  apr_size_t buflen1;
207289177Speter  apr_size_t buflen2;
208289177Speter
209289177Speter  /* Shortcut-circuit the decision if at least one of the strings is empty. */
210289177Speter  const svn_boolean_t empty1 =
211289177Speter    (0 == len1 || (len1 == SVN_UTF__UNKNOWN_LENGTH && !*str1));
212289177Speter  const svn_boolean_t empty2 =
213289177Speter    (0 == len2 || (len2 == SVN_UTF__UNKNOWN_LENGTH && !*str2));
214289177Speter  if (empty1 || empty2)
215289177Speter    {
216289177Speter      *result = (empty1 == empty2 ? 0 : (empty1 ? -1 : 1));
217289177Speter      return SVN_NO_ERROR;
218289177Speter    }
219289177Speter
220289177Speter  SVN_ERR(decompose_normalized(&buflen1, str1, len1, buf1));
221289177Speter  SVN_ERR(decompose_normalized(&buflen2, str2, len2, buf2));
222289177Speter  *result = ucs4cmp(buf1->data, buflen1, buf2->data, buflen2);
223289177Speter  return SVN_NO_ERROR;
224289177Speter}
225289177Speter
226289177Spetersvn_error_t*
227289177Spetersvn_utf__normalize(const char **result,
228289177Speter                   const char *str, apr_size_t len,
229289177Speter                   svn_membuf_t *buf)
230289177Speter{
231289177Speter  apr_size_t result_length;
232362181Sdim  SVN_ERR(normalize_cstring(&result_length, str, len, FALSE, FALSE, buf));
233289177Speter  *result = (const char*)(buf->data);
234289177Speter  return SVN_NO_ERROR;
235289177Speter}
236289177Speter
237362181Sdimsvn_error_t *
238362181Sdimsvn_utf__xfrm(const char **result,
239362181Sdim              const char *str, apr_size_t len,
240362181Sdim              svn_boolean_t case_insensitive,
241362181Sdim              svn_boolean_t accent_insensitive,
242362181Sdim              svn_membuf_t *buf)
243362181Sdim{
244362181Sdim  apr_size_t result_length;
245362181Sdim  SVN_ERR(normalize_cstring(&result_length, str, len,
246362181Sdim                            case_insensitive, accent_insensitive, buf));
247362181Sdim  *result = (const char*)(buf->data);
248362181Sdim  return SVN_NO_ERROR;
249362181Sdim}
250362181Sdim
251362181Sdimsvn_boolean_t
252362181Sdimsvn_utf__fuzzy_glob_match(const char *str,
253362181Sdim                          const apr_array_header_t *patterns,
254362181Sdim                          svn_membuf_t *buf)
255362181Sdim{
256362181Sdim  const char *normalized;
257362181Sdim  svn_error_t *err;
258362181Sdim  int i;
259362181Sdim
260362181Sdim  /* Try to normalize case and accents in STR.
261362181Sdim   *
262362181Sdim   * If that should fail for some reason, consider STR a mismatch. */
263362181Sdim  err = svn_utf__xfrm(&normalized, str, strlen(str), TRUE, TRUE, buf);
264362181Sdim  if (err)
265362181Sdim    {
266362181Sdim      svn_error_clear(err);
267362181Sdim      return FALSE;
268362181Sdim    }
269362181Sdim
270362181Sdim  /* Now see whether it matches any/all of the patterns. */
271362181Sdim  for (i = 0; i < patterns->nelts; ++i)
272362181Sdim    {
273362181Sdim      const char *pattern = APR_ARRAY_IDX(patterns, i, const char *);
274362181Sdim      if (apr_fnmatch(pattern, normalized, 0) == APR_SUCCESS)
275362181Sdim        return TRUE;
276362181Sdim    }
277362181Sdim
278362181Sdim  return FALSE;
279362181Sdim}
280362181Sdim
281289177Speter/* Decode a single UCS-4 code point to UTF-8, appending the result to BUFFER.
282289177Speter * Assume BUFFER is already filled to *LENGTH and return the new size there.
283289177Speter * This function does *not* nul-terminate the stringbuf!
284289177Speter *
285289177Speter * A returned error indicates that the codepoint is invalid.
286289177Speter */
287289177Speterstatic svn_error_t *
288289177Speterencode_ucs4(svn_membuf_t *buffer, apr_int32_t ucs4chr, apr_size_t *length)
289289177Speter{
290289177Speter  apr_size_t utf8len;
291289177Speter
292289177Speter  if (buffer->size - *length < 4)
293289177Speter    svn_membuf__resize(buffer, buffer->size + 4);
294289177Speter
295362181Sdim  utf8len = utf8proc_encode_char(ucs4chr, ((apr_byte_t*)buffer->data + *length));
296289177Speter  if (!utf8len)
297289177Speter    return svn_error_createf(SVN_ERR_UTF8PROC_ERROR, NULL,
298289177Speter                             _("Invalid Unicode character U+%04lX"),
299289177Speter                             (long)ucs4chr);
300289177Speter  *length += utf8len;
301289177Speter  return SVN_NO_ERROR;
302289177Speter}
303289177Speter
304289177Spetersvn_error_t *
305289177Spetersvn_utf__encode_ucs4_string(svn_membuf_t *buffer,
306289177Speter                            const apr_int32_t *ucs4str,
307289177Speter                            apr_size_t length,
308289177Speter                            apr_size_t *result_length)
309289177Speter{
310289177Speter  *result_length = 0;
311289177Speter  while (length-- > 0)
312289177Speter    SVN_ERR(encode_ucs4(buffer, *ucs4str++, result_length));
313289177Speter  svn_membuf__resize(buffer, *result_length + 1);
314289177Speter  ((char*)buffer->data)[*result_length] = '\0';
315289177Speter  return SVN_NO_ERROR;
316289177Speter}
317289177Speter
318289177Speter
319289177Spetersvn_error_t *
320289177Spetersvn_utf__glob(svn_boolean_t *match,
321289177Speter              const char *pattern, apr_size_t pattern_len,
322289177Speter              const char *string, apr_size_t string_len,
323289177Speter              const char *escape, apr_size_t escape_len,
324289177Speter              svn_boolean_t sql_like,
325289177Speter              svn_membuf_t *pattern_buf,
326289177Speter              svn_membuf_t *string_buf,
327289177Speter              svn_membuf_t *temp_buf)
328289177Speter{
329289177Speter  apr_size_t patternbuf_len;
330289177Speter  apr_size_t tempbuf_len;
331289177Speter
332289177Speter  /* If we're in GLOB mode, we don't do custom escape chars. */
333289177Speter  if (escape && !sql_like)
334289177Speter    return svn_error_create(SVN_ERR_UTF8_GLOB, NULL,
335289177Speter                            _("Cannot use a custom escape token"
336289177Speter                              " in glob matching mode"));
337289177Speter
338289177Speter  /* Convert the patern to NFD UTF-8. We can't use the UCS-4 result
339289177Speter     because apr_fnmatch can't handle it.*/
340289177Speter  SVN_ERR(decompose_normalized(&tempbuf_len, pattern, pattern_len, temp_buf));
341289177Speter  if (!sql_like)
342289177Speter    SVN_ERR(svn_utf__encode_ucs4_string(pattern_buf, temp_buf->data,
343289177Speter                                        tempbuf_len, &patternbuf_len));
344289177Speter  else
345289177Speter    {
346289177Speter      /* Convert a LIKE pattern to a GLOB pattern that apr_fnmatch can use. */
347289177Speter      const apr_int32_t *like = temp_buf->data;
348289177Speter      apr_int32_t ucs4esc;
349289177Speter      svn_boolean_t escaped;
350289177Speter      apr_size_t i;
351289177Speter
352289177Speter      if (!escape)
353289177Speter        ucs4esc = -1;           /* Definitely an invalid UCS-4 character. */
354289177Speter      else
355289177Speter        {
356289177Speter          const int nullterm = (escape_len == SVN_UTF__UNKNOWN_LENGTH
357289177Speter                                ? UTF8PROC_NULLTERM : 0);
358362181Sdim          apr_ssize_t result =
359289177Speter            utf8proc_decompose((const void*) escape, escape_len, &ucs4esc, 1,
360289177Speter                               UTF8PROC_DECOMPOSE | UTF8PROC_STABLE | nullterm);
361289177Speter          if (result < 0)
362289177Speter            return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
363289177Speter                                    gettext(utf8proc_errmsg(result)));
364289177Speter          if (result == 0 || result > 1)
365289177Speter            return svn_error_create(SVN_ERR_UTF8_GLOB, NULL,
366289177Speter                                    _("Escape token must be one character"));
367289177Speter          if ((ucs4esc & 0xFF) != ucs4esc)
368289177Speter            return svn_error_createf(SVN_ERR_UTF8_GLOB, NULL,
369289177Speter                                     _("Invalid escape character U+%04lX"),
370289177Speter                                     (long)ucs4esc);
371289177Speter        }
372289177Speter
373289177Speter      patternbuf_len = 0;
374289177Speter      svn_membuf__ensure(pattern_buf, tempbuf_len + 1);
375289177Speter      for (i = 0, escaped = FALSE; i < tempbuf_len; ++i, ++like)
376289177Speter        {
377289177Speter          if (*like == ucs4esc && !escaped)
378289177Speter            {
379289177Speter              svn_membuf__resize(pattern_buf, patternbuf_len + 1);
380289177Speter              ((char*)pattern_buf->data)[patternbuf_len++] = '\\';
381289177Speter              escaped = TRUE;
382289177Speter            }
383289177Speter          else if (escaped)
384289177Speter            {
385289177Speter              SVN_ERR(encode_ucs4(pattern_buf, *like, &patternbuf_len));
386289177Speter              escaped = FALSE;
387289177Speter            }
388289177Speter          else
389289177Speter            {
390289177Speter              if ((*like == '[' || *like == '\\') && !escaped)
391289177Speter                {
392289177Speter                  /* Escape brackets and backslashes which are always
393289177Speter                     literals in LIKE patterns. */
394289177Speter                  svn_membuf__resize(pattern_buf, patternbuf_len + 1);
395289177Speter                  ((char*)pattern_buf->data)[patternbuf_len++] = '\\';
396289177Speter                  escaped = TRUE;
397289177Speter                  --i; --like;
398289177Speter                  continue;
399289177Speter                }
400289177Speter
401289177Speter              /* Replace LIKE wildcards with their GLOB equivalents. */
402289177Speter              if (*like == '%' || *like == '_')
403289177Speter                {
404289177Speter                  const char wildcard = (*like == '%' ? '*' : '?');
405289177Speter                  svn_membuf__resize(pattern_buf, patternbuf_len + 1);
406289177Speter                  ((char*)pattern_buf->data)[patternbuf_len++] = wildcard;
407289177Speter                }
408289177Speter              else
409289177Speter                SVN_ERR(encode_ucs4(pattern_buf, *like, &patternbuf_len));
410289177Speter            }
411289177Speter        }
412289177Speter      svn_membuf__resize(pattern_buf, patternbuf_len + 1);
413289177Speter      ((char*)pattern_buf->data)[patternbuf_len] = '\0';
414289177Speter    }
415289177Speter
416289177Speter  /* Now normalize the string */
417289177Speter  SVN_ERR(decompose_normalized(&tempbuf_len, string, string_len, temp_buf));
418289177Speter  SVN_ERR(svn_utf__encode_ucs4_string(string_buf, temp_buf->data,
419289177Speter                                      tempbuf_len, &tempbuf_len));
420289177Speter
421289177Speter  *match = !apr_fnmatch(pattern_buf->data, string_buf->data, 0);
422289177Speter  return SVN_NO_ERROR;
423289177Speter}
424289177Speter
425289177Spetersvn_boolean_t
426289177Spetersvn_utf__is_normalized(const char *string, apr_pool_t *scratch_pool)
427289177Speter{
428289177Speter  svn_error_t *err;
429289177Speter  svn_membuf_t buffer;
430289177Speter  apr_size_t result_length;
431289177Speter  const apr_size_t length = strlen(string);
432289177Speter  svn_membuf__create(&buffer, length * sizeof(apr_int32_t), scratch_pool);
433362181Sdim  err = normalize_cstring(&result_length, string, length,
434362181Sdim                          FALSE, FALSE, &buffer);
435289177Speter  if (err)
436289177Speter    {
437289177Speter      svn_error_clear(err);
438289177Speter      return FALSE;
439289177Speter    }
440289177Speter  return (length == result_length && 0 == strcmp(string, buffer.data));
441289177Speter}
442289177Speter
443289177Speterconst char *
444289177Spetersvn_utf__fuzzy_escape(const char *src, apr_size_t length, apr_pool_t *pool)
445289177Speter{
446289177Speter  /* Hexadecimal digits for code conversion. */
447289177Speter  static const char digits[] = "0123456789ABCDEF";
448289177Speter
449289177Speter  /* Flags used for Unicode decomposition. */
450289177Speter  static const int decomp_flags = (
451289177Speter      UTF8PROC_COMPAT | UTF8PROC_STABLE | UTF8PROC_LUMP
452289177Speter      | UTF8PROC_NLF2LF | UTF8PROC_STRIPCC | UTF8PROC_STRIPMARK);
453289177Speter
454289177Speter  svn_stringbuf_t *result;
455289177Speter  svn_membuf_t buffer;
456362181Sdim  apr_ssize_t decomp_length;
457362181Sdim  apr_ssize_t len;
458289177Speter
459289177Speter  /* Decompose to a non-reversible compatibility format. */
460289177Speter  svn_membuf__create(&buffer, length * sizeof(apr_int32_t), pool);
461289177Speter  decomp_length = unicode_decomposition(decomp_flags, src, length, &buffer);
462289177Speter  if (decomp_length < 0)
463289177Speter    {
464289177Speter      svn_membuf_t part;
465289177Speter      apr_size_t done, prev;
466289177Speter
467289177Speter      /* The only other error we can receive here indicates an integer
468289177Speter         overflow due to the length of the input string. Not very
469289177Speter         likely, but we certainly shouldn't continue in that case. */
470289177Speter      SVN_ERR_ASSERT_NO_RETURN(decomp_length == UTF8PROC_ERROR_INVALIDUTF8);
471289177Speter
472289177Speter      /* Break the decomposition into parts that are valid UTF-8, and
473289177Speter         bytes that are not. Represent the invalid bytes in the target
474289177Speter         erray by their negative value. This works because utf8proc
475289177Speter         will not generate Unicode code points with values larger than
476289177Speter         U+10FFFF. */
477289177Speter      svn_membuf__create(&part, sizeof(apr_int32_t), pool);
478289177Speter      decomp_length = 0;
479289177Speter      done = prev = 0;
480289177Speter      while (done < length)
481289177Speter        {
482289177Speter          apr_int32_t uc;
483289177Speter
484289177Speter          while (done < length)
485289177Speter            {
486362181Sdim              len = utf8proc_iterate((apr_byte_t*)src + done, length - done, &uc);
487289177Speter              if (len < 0)
488289177Speter                break;
489289177Speter              done += len;
490289177Speter            }
491289177Speter
492289177Speter          /* Decompose the valid part */
493289177Speter          if (done > prev)
494289177Speter            {
495289177Speter              len = unicode_decomposition(
496289177Speter                  decomp_flags, src + prev, done - prev, &part);
497289177Speter              SVN_ERR_ASSERT_NO_RETURN(len > 0);
498289177Speter              svn_membuf__resize(
499289177Speter                  &buffer, (decomp_length + len) * sizeof(apr_int32_t));
500289177Speter              memcpy((apr_int32_t*)buffer.data + decomp_length,
501289177Speter                     part.data, len * sizeof(apr_int32_t));
502289177Speter              decomp_length += len;
503289177Speter              prev = done;
504289177Speter            }
505289177Speter
506289177Speter          /* What follows could be a valid UTF-8 sequence, but not
507289177Speter             a valid Unicode character. */
508289177Speter          if (done < length)
509289177Speter            {
510289177Speter              const char *last;
511289177Speter
512289177Speter              /* Determine the length of the UTF-8 sequence */
513289177Speter              const char *const p = src + done;
514362181Sdim              len = utf8proc_utf8class[(apr_byte_t)*p];
515289177Speter
516289177Speter              /* Check if the multi-byte sequence is valid UTF-8. */
517289177Speter              if (len > 1 && len <= (apr_ssize_t)(length - done))
518289177Speter                last = svn_utf__last_valid(p, len);
519289177Speter              else
520289177Speter                last = NULL;
521289177Speter
522289177Speter              /* Might not be a valid UTF-8 sequence at all */
523289177Speter              if (!last || (last && last - p < len))
524289177Speter                {
525289177Speter                  uc = -((apr_int32_t)(*p & 0xff));
526289177Speter                  len = 1;
527289177Speter                }
528289177Speter              else
529289177Speter                {
530289177Speter                  switch (len)
531289177Speter                    {
532289177Speter                      /* Decode the UTF-8 sequence without validation. */
533289177Speter                    case 2:
534289177Speter                      uc = ((p[0] & 0x1f) <<  6) + (p[1] & 0x3f);
535289177Speter                      break;
536289177Speter                    case 3:
537289177Speter                      uc = (((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) <<  6)
538289177Speter                            + (p[2] & 0x3f));
539289177Speter                      break;
540289177Speter                    case 4:
541289177Speter                      uc = (((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12)
542289177Speter                            + ((p[2] & 0x3f) <<  6) + (p[3] & 0x3f));
543289177Speter                      break;
544289177Speter                    default:
545289177Speter                      SVN_ERR_ASSERT_NO_RETURN(
546289177Speter                          !"Unexpected invalid UTF-8 byte");
547289177Speter                    }
548289177Speter
549289177Speter                }
550289177Speter
551289177Speter              svn_membuf__resize(
552289177Speter                  &buffer, (decomp_length + 1) * sizeof(apr_int32_t));
553289177Speter              ((apr_int32_t*)buffer.data)[decomp_length++] = uc;
554289177Speter              done += len;
555289177Speter              prev = done;
556289177Speter            }
557289177Speter        }
558289177Speter    }
559289177Speter
560289177Speter  /* Scan the result and deleting any combining diacriticals and
561289177Speter     inserting placeholders where any non-ascii characters remain.  */
562289177Speter  result = svn_stringbuf_create_ensure(decomp_length, pool);
563289177Speter  for (len = 0; len < decomp_length; ++len)
564289177Speter    {
565289177Speter      const apr_int32_t cp = ((apr_int32_t*)buffer.data)[len];
566289177Speter      if (cp > 0 && cp < 127)
567289177Speter        svn_stringbuf_appendbyte(result, (char)cp);
568289177Speter      else if (cp == 0)
569289177Speter        svn_stringbuf_appendcstr(result, "\\0");
570289177Speter      else if (cp < 0)
571289177Speter        {
572289177Speter          const apr_int32_t rcp = ((-cp) & 0xff);
573289177Speter          svn_stringbuf_appendcstr(result, "?\\");
574289177Speter          svn_stringbuf_appendbyte(result, digits[(rcp & 0x00f0) >> 4]);
575289177Speter          svn_stringbuf_appendbyte(result, digits[(rcp & 0x000f)]);
576289177Speter        }
577289177Speter      else
578289177Speter        {
579289177Speter          if (utf8proc_codepoint_valid(cp))
580289177Speter            {
581289177Speter              const utf8proc_property_t *prop = utf8proc_get_property(cp);
582289177Speter              if (prop->combining_class != 0)
583289177Speter                continue;           /* Combining mark; ignore */
584289177Speter              svn_stringbuf_appendcstr(result, "{U+");
585289177Speter            }
586289177Speter          else
587289177Speter            svn_stringbuf_appendcstr(result, "{U?");
588289177Speter          if (cp > 0xffff)
589289177Speter            {
590289177Speter              svn_stringbuf_appendbyte(result, digits[(cp & 0xf00000) >> 20]);
591289177Speter              svn_stringbuf_appendbyte(result, digits[(cp & 0x0f0000) >> 16]);
592289177Speter            }
593289177Speter          svn_stringbuf_appendbyte(result, digits[(cp & 0xf000) >> 12]);
594289177Speter          svn_stringbuf_appendbyte(result, digits[(cp & 0x0f00) >> 8]);
595289177Speter          svn_stringbuf_appendbyte(result, digits[(cp & 0x00f0) >> 4]);
596289177Speter          svn_stringbuf_appendbyte(result, digits[(cp & 0x000f)]);
597289177Speter          svn_stringbuf_appendbyte(result, '}');
598289177Speter        }
599289177Speter    }
600289177Speter
601289177Speter  return result->data;
602289177Speter}
603