1289177Speter/*
2289177Speter * utf8proc.c:  Wrappers for the utf8proc library
3289177Speter *
4289177Speter * ====================================================================
5289177Speter *    Licensed to the Apache Software Foundation (ASF) under one
6289177Speter *    or more contributor license agreements.  See the NOTICE file
7289177Speter *    distributed with this work for additional information
8289177Speter *    regarding copyright ownership.  The ASF licenses this file
9289177Speter *    to you under the Apache License, Version 2.0 (the
10289177Speter *    "License"); you may not use this file except in compliance
11289177Speter *    with the License.  You may obtain a copy of the License at
12289177Speter *
13289177Speter *      http://www.apache.org/licenses/LICENSE-2.0
14289177Speter *
15289177Speter *    Unless required by applicable law or agreed to in writing,
16289177Speter *    software distributed under the License is distributed on an
17289177Speter *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
18289177Speter *    KIND, either express or implied.  See the License for the
19289177Speter *    specific language governing permissions and limitations
20289177Speter *    under the License.
21289177Speter * ====================================================================
22289177Speter */
23289177Speter
24289177Speter
25289177Speter
26289177Speter#include <apr_fnmatch.h>
27289177Speter
28289177Speter#include "private/svn_string_private.h"
29289177Speter#include "private/svn_utf_private.h"
30289177Speter#include "svn_private_config.h"
31289177Speter
32289177Speter#define UTF8PROC_INLINE
33289177Speter/* Somehow utf8proc thinks it is nice to use strlen as an argument name,
34289177Speter   while this function is already defined via apr.h */
35289177Speter#define strlen svn__strlen_var
36289177Speter#include "utf8proc/utf8proc.c"
37289177Speter#undef strlen
38289177Speter
39289177Speter
40289177Speter
41289177Speterconst char *
42289177Spetersvn_utf__utf8proc_compiled_version(void)
43289177Speter{
44289177Speter  static const char utf8proc_version[] =
45289177Speter                                  APR_STRINGIFY(UTF8PROC_VERSION_MAJOR) "."
46289177Speter                                  APR_STRINGIFY(UTF8PROC_VERSION_MINOR) "."
47289177Speter                                  APR_STRINGIFY(UTF8PROC_VERSION_PATCH);
48289177Speter  return utf8proc_version;
49289177Speter}
50289177Speter
51289177Speterconst char *
52289177Spetersvn_utf__utf8proc_runtime_version(void)
53289177Speter{
54289177Speter  /* Unused static function warning removal hack. */
55289177Speter  SVN_UNUSED(utf8proc_NFD);
56289177Speter  SVN_UNUSED(utf8proc_NFC);
57289177Speter  SVN_UNUSED(utf8proc_NFKD);
58289177Speter  SVN_UNUSED(utf8proc_NFKC);
59289177Speter
60289177Speter  return utf8proc_version();
61289177Speter}
62289177Speter
63289177Speter
64289177Speter
65289177Speter/* Fill the given BUFFER with decomposed UCS-4 representation of the
66289177Speter * UTF-8 STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING
67289177Speter * is NUL-terminated; otherwise look only at the first LENGTH bytes in
68289177Speter * STRING. Upon return, BUFFER->data points at an array of UCS-4
69289177Speter * characters, and return the length of the array. TRANSFORM_FLAGS
70289177Speter * define exactly how the decomposition is performed.
71289177Speter *
72289177Speter * A negative return value is an utf8proc error code and may indicate
73289177Speter * that STRING contains invalid UTF-8 or was so long that an overflow
74289177Speter * occurred.
75289177Speter */
76289177Speterstatic ssize_t
77289177Speterunicode_decomposition(int transform_flags,
78289177Speter                      const char *string, apr_size_t length,
79289177Speter                      svn_membuf_t *buffer)
80289177Speter{
81289177Speter  const int nullterm = (length == SVN_UTF__UNKNOWN_LENGTH
82289177Speter                        ? UTF8PROC_NULLTERM : 0);
83289177Speter
84289177Speter  for (;;)
85289177Speter    {
86289177Speter      apr_int32_t *const ucs4buf = buffer->data;
87289177Speter      const ssize_t ucs4len = buffer->size / sizeof(*ucs4buf);
88289177Speter      const ssize_t result =
89289177Speter        utf8proc_decompose((const void*) string, length, ucs4buf, ucs4len,
90289177Speter                           UTF8PROC_DECOMPOSE | UTF8PROC_STABLE
91289177Speter                           | transform_flags | nullterm);
92289177Speter
93289177Speter      if (result < 0 || result <= ucs4len)
94289177Speter        return result;
95289177Speter
96289177Speter      /* Increase the decomposition buffer size and retry */
97289177Speter      svn_membuf__ensure(buffer, result * sizeof(*ucs4buf));
98289177Speter    }
99289177Speter}
100289177Speter
101289177Speter/* Fill the given BUFFER with an NFD UCS-4 representation of the UTF-8
102289177Speter * STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING is
103289177Speter * NUL-terminated; otherwise look only at the first LENGTH bytes in
104289177Speter * STRING. Upon return, BUFFER->data points at an array of UCS-4
105289177Speter * characters and *RESULT_LENGTH contains the length of the array.
106289177Speter *
107289177Speter * A returned error may indicate that STRING contains invalid UTF-8 or
108289177Speter * invalid Unicode codepoints. Any error message comes from utf8proc.
109289177Speter */
110289177Speterstatic svn_error_t *
111289177Speterdecompose_normalized(apr_size_t *result_length,
112289177Speter                     const char *string, apr_size_t length,
113289177Speter                     svn_membuf_t *buffer)
114289177Speter{
115289177Speter  ssize_t result = unicode_decomposition(0, string, length, buffer);
116289177Speter  if (result < 0)
117289177Speter    return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
118289177Speter                            gettext(utf8proc_errmsg(result)));
119289177Speter  *result_length = result;
120289177Speter  return SVN_NO_ERROR;
121289177Speter}
122289177Speter
123289177Speter/* Fill the given BUFFER with an NFC UTF-8 representation of the UTF-8
124289177Speter * STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING is
125289177Speter * NUL-terminated; otherwise look only at the first LENGTH bytes in
126289177Speter * STRING. Upon return, BUFFER->data points at a NUL-terminated string
127289177Speter * of UTF-8 characters.
128289177Speter *
129289177Speter * A returned error may indicate that STRING contains invalid UTF-8 or
130289177Speter * invalid Unicode codepoints. Any error message comes from utf8proc.
131289177Speter */
132289177Speterstatic svn_error_t *
133289177Speternormalize_cstring(apr_size_t *result_length,
134289177Speter                  const char *string, apr_size_t length,
135289177Speter                  svn_membuf_t *buffer)
136289177Speter{
137289177Speter  ssize_t result = unicode_decomposition(0, string, length, buffer);
138289177Speter  if (result >= 0)
139289177Speter    {
140289177Speter      svn_membuf__resize(buffer, result * sizeof(apr_int32_t) + 1);
141289177Speter      result = utf8proc_reencode(buffer->data, result,
142289177Speter                                 UTF8PROC_COMPOSE | UTF8PROC_STABLE);
143289177Speter    }
144289177Speter  if (result < 0)
145289177Speter    return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
146289177Speter                            gettext(utf8proc_errmsg(result)));
147289177Speter  *result_length = result;
148289177Speter  return SVN_NO_ERROR;
149289177Speter}
150289177Speter
151289177Speter/* Compare two arrays of UCS-4 codes, BUFA of length LENA and BUFB of
152289177Speter * length LENB. Return 0 if they're equal, a negative value if BUFA is
153289177Speter * less than BUFB, otherwise a positive value.
154289177Speter *
155289177Speter * Yes, this is strcmp for known-length UCS-4 strings.
156289177Speter */
157289177Speterstatic int
158289177Speterucs4cmp(const apr_int32_t *bufa, apr_size_t lena,
159289177Speter        const apr_int32_t *bufb, apr_size_t lenb)
160289177Speter{
161289177Speter  const apr_size_t len = (lena < lenb ? lena : lenb);
162289177Speter  apr_size_t i;
163289177Speter
164289177Speter  for (i = 0; i < len; ++i)
165289177Speter    {
166289177Speter      const int diff = bufa[i] - bufb[i];
167289177Speter      if (diff)
168289177Speter        return diff;
169289177Speter    }
170289177Speter  return (lena == lenb ? 0 : (lena < lenb ? -1 : 1));
171289177Speter}
172289177Speter
173289177Spetersvn_error_t *
174289177Spetersvn_utf__normcmp(int *result,
175289177Speter                 const char *str1, apr_size_t len1,
176289177Speter                 const char *str2, apr_size_t len2,
177289177Speter                 svn_membuf_t *buf1, svn_membuf_t *buf2)
178289177Speter{
179289177Speter  apr_size_t buflen1;
180289177Speter  apr_size_t buflen2;
181289177Speter
182289177Speter  /* Shortcut-circuit the decision if at least one of the strings is empty. */
183289177Speter  const svn_boolean_t empty1 =
184289177Speter    (0 == len1 || (len1 == SVN_UTF__UNKNOWN_LENGTH && !*str1));
185289177Speter  const svn_boolean_t empty2 =
186289177Speter    (0 == len2 || (len2 == SVN_UTF__UNKNOWN_LENGTH && !*str2));
187289177Speter  if (empty1 || empty2)
188289177Speter    {
189289177Speter      *result = (empty1 == empty2 ? 0 : (empty1 ? -1 : 1));
190289177Speter      return SVN_NO_ERROR;
191289177Speter    }
192289177Speter
193289177Speter  SVN_ERR(decompose_normalized(&buflen1, str1, len1, buf1));
194289177Speter  SVN_ERR(decompose_normalized(&buflen2, str2, len2, buf2));
195289177Speter  *result = ucs4cmp(buf1->data, buflen1, buf2->data, buflen2);
196289177Speter  return SVN_NO_ERROR;
197289177Speter}
198289177Speter
199289177Spetersvn_error_t*
200289177Spetersvn_utf__normalize(const char **result,
201289177Speter                   const char *str, apr_size_t len,
202289177Speter                   svn_membuf_t *buf)
203289177Speter{
204289177Speter  apr_size_t result_length;
205289177Speter  SVN_ERR(normalize_cstring(&result_length, str, len, buf));
206289177Speter  *result = (const char*)(buf->data);
207289177Speter  return SVN_NO_ERROR;
208289177Speter}
209289177Speter
210289177Speter/* Decode a single UCS-4 code point to UTF-8, appending the result to BUFFER.
211289177Speter * Assume BUFFER is already filled to *LENGTH and return the new size there.
212289177Speter * This function does *not* nul-terminate the stringbuf!
213289177Speter *
214289177Speter * A returned error indicates that the codepoint is invalid.
215289177Speter */
216289177Speterstatic svn_error_t *
217289177Speterencode_ucs4(svn_membuf_t *buffer, apr_int32_t ucs4chr, apr_size_t *length)
218289177Speter{
219289177Speter  apr_size_t utf8len;
220289177Speter
221289177Speter  if (buffer->size - *length < 4)
222289177Speter    svn_membuf__resize(buffer, buffer->size + 4);
223289177Speter
224289177Speter  utf8len = utf8proc_encode_char(ucs4chr, ((uint8_t*)buffer->data + *length));
225289177Speter  if (!utf8len)
226289177Speter    return svn_error_createf(SVN_ERR_UTF8PROC_ERROR, NULL,
227289177Speter                             _("Invalid Unicode character U+%04lX"),
228289177Speter                             (long)ucs4chr);
229289177Speter  *length += utf8len;
230289177Speter  return SVN_NO_ERROR;
231289177Speter}
232289177Speter
233289177Spetersvn_error_t *
234289177Spetersvn_utf__encode_ucs4_string(svn_membuf_t *buffer,
235289177Speter                            const apr_int32_t *ucs4str,
236289177Speter                            apr_size_t length,
237289177Speter                            apr_size_t *result_length)
238289177Speter{
239289177Speter  *result_length = 0;
240289177Speter  while (length-- > 0)
241289177Speter    SVN_ERR(encode_ucs4(buffer, *ucs4str++, result_length));
242289177Speter  svn_membuf__resize(buffer, *result_length + 1);
243289177Speter  ((char*)buffer->data)[*result_length] = '\0';
244289177Speter  return SVN_NO_ERROR;
245289177Speter}
246289177Speter
247289177Speter
248289177Spetersvn_error_t *
249289177Spetersvn_utf__glob(svn_boolean_t *match,
250289177Speter              const char *pattern, apr_size_t pattern_len,
251289177Speter              const char *string, apr_size_t string_len,
252289177Speter              const char *escape, apr_size_t escape_len,
253289177Speter              svn_boolean_t sql_like,
254289177Speter              svn_membuf_t *pattern_buf,
255289177Speter              svn_membuf_t *string_buf,
256289177Speter              svn_membuf_t *temp_buf)
257289177Speter{
258289177Speter  apr_size_t patternbuf_len;
259289177Speter  apr_size_t tempbuf_len;
260289177Speter
261289177Speter  /* If we're in GLOB mode, we don't do custom escape chars. */
262289177Speter  if (escape && !sql_like)
263289177Speter    return svn_error_create(SVN_ERR_UTF8_GLOB, NULL,
264289177Speter                            _("Cannot use a custom escape token"
265289177Speter                              " in glob matching mode"));
266289177Speter
267289177Speter  /* Convert the patern to NFD UTF-8. We can't use the UCS-4 result
268289177Speter     because apr_fnmatch can't handle it.*/
269289177Speter  SVN_ERR(decompose_normalized(&tempbuf_len, pattern, pattern_len, temp_buf));
270289177Speter  if (!sql_like)
271289177Speter    SVN_ERR(svn_utf__encode_ucs4_string(pattern_buf, temp_buf->data,
272289177Speter                                        tempbuf_len, &patternbuf_len));
273289177Speter  else
274289177Speter    {
275289177Speter      /* Convert a LIKE pattern to a GLOB pattern that apr_fnmatch can use. */
276289177Speter      const apr_int32_t *like = temp_buf->data;
277289177Speter      apr_int32_t ucs4esc;
278289177Speter      svn_boolean_t escaped;
279289177Speter      apr_size_t i;
280289177Speter
281289177Speter      if (!escape)
282289177Speter        ucs4esc = -1;           /* Definitely an invalid UCS-4 character. */
283289177Speter      else
284289177Speter        {
285289177Speter          const int nullterm = (escape_len == SVN_UTF__UNKNOWN_LENGTH
286289177Speter                                ? UTF8PROC_NULLTERM : 0);
287289177Speter          ssize_t result =
288289177Speter            utf8proc_decompose((const void*) escape, escape_len, &ucs4esc, 1,
289289177Speter                               UTF8PROC_DECOMPOSE | UTF8PROC_STABLE | nullterm);
290289177Speter          if (result < 0)
291289177Speter            return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
292289177Speter                                    gettext(utf8proc_errmsg(result)));
293289177Speter          if (result == 0 || result > 1)
294289177Speter            return svn_error_create(SVN_ERR_UTF8_GLOB, NULL,
295289177Speter                                    _("Escape token must be one character"));
296289177Speter          if ((ucs4esc & 0xFF) != ucs4esc)
297289177Speter            return svn_error_createf(SVN_ERR_UTF8_GLOB, NULL,
298289177Speter                                     _("Invalid escape character U+%04lX"),
299289177Speter                                     (long)ucs4esc);
300289177Speter        }
301289177Speter
302289177Speter      patternbuf_len = 0;
303289177Speter      svn_membuf__ensure(pattern_buf, tempbuf_len + 1);
304289177Speter      for (i = 0, escaped = FALSE; i < tempbuf_len; ++i, ++like)
305289177Speter        {
306289177Speter          if (*like == ucs4esc && !escaped)
307289177Speter            {
308289177Speter              svn_membuf__resize(pattern_buf, patternbuf_len + 1);
309289177Speter              ((char*)pattern_buf->data)[patternbuf_len++] = '\\';
310289177Speter              escaped = TRUE;
311289177Speter            }
312289177Speter          else if (escaped)
313289177Speter            {
314289177Speter              SVN_ERR(encode_ucs4(pattern_buf, *like, &patternbuf_len));
315289177Speter              escaped = FALSE;
316289177Speter            }
317289177Speter          else
318289177Speter            {
319289177Speter              if ((*like == '[' || *like == '\\') && !escaped)
320289177Speter                {
321289177Speter                  /* Escape brackets and backslashes which are always
322289177Speter                     literals in LIKE patterns. */
323289177Speter                  svn_membuf__resize(pattern_buf, patternbuf_len + 1);
324289177Speter                  ((char*)pattern_buf->data)[patternbuf_len++] = '\\';
325289177Speter                  escaped = TRUE;
326289177Speter                  --i; --like;
327289177Speter                  continue;
328289177Speter                }
329289177Speter
330289177Speter              /* Replace LIKE wildcards with their GLOB equivalents. */
331289177Speter              if (*like == '%' || *like == '_')
332289177Speter                {
333289177Speter                  const char wildcard = (*like == '%' ? '*' : '?');
334289177Speter                  svn_membuf__resize(pattern_buf, patternbuf_len + 1);
335289177Speter                  ((char*)pattern_buf->data)[patternbuf_len++] = wildcard;
336289177Speter                }
337289177Speter              else
338289177Speter                SVN_ERR(encode_ucs4(pattern_buf, *like, &patternbuf_len));
339289177Speter            }
340289177Speter        }
341289177Speter      svn_membuf__resize(pattern_buf, patternbuf_len + 1);
342289177Speter      ((char*)pattern_buf->data)[patternbuf_len] = '\0';
343289177Speter    }
344289177Speter
345289177Speter  /* Now normalize the string */
346289177Speter  SVN_ERR(decompose_normalized(&tempbuf_len, string, string_len, temp_buf));
347289177Speter  SVN_ERR(svn_utf__encode_ucs4_string(string_buf, temp_buf->data,
348289177Speter                                      tempbuf_len, &tempbuf_len));
349289177Speter
350289177Speter  *match = !apr_fnmatch(pattern_buf->data, string_buf->data, 0);
351289177Speter  return SVN_NO_ERROR;
352289177Speter}
353289177Speter
354289177Spetersvn_boolean_t
355289177Spetersvn_utf__is_normalized(const char *string, apr_pool_t *scratch_pool)
356289177Speter{
357289177Speter  svn_error_t *err;
358289177Speter  svn_membuf_t buffer;
359289177Speter  apr_size_t result_length;
360289177Speter  const apr_size_t length = strlen(string);
361289177Speter  svn_membuf__create(&buffer, length * sizeof(apr_int32_t), scratch_pool);
362289177Speter  err = normalize_cstring(&result_length, string, length, &buffer);
363289177Speter  if (err)
364289177Speter    {
365289177Speter      svn_error_clear(err);
366289177Speter      return FALSE;
367289177Speter    }
368289177Speter  return (length == result_length && 0 == strcmp(string, buffer.data));
369289177Speter}
370289177Speter
371289177Speterconst char *
372289177Spetersvn_utf__fuzzy_escape(const char *src, apr_size_t length, apr_pool_t *pool)
373289177Speter{
374289177Speter  /* Hexadecimal digits for code conversion. */
375289177Speter  static const char digits[] = "0123456789ABCDEF";
376289177Speter
377289177Speter  /* Flags used for Unicode decomposition. */
378289177Speter  static const int decomp_flags = (
379289177Speter      UTF8PROC_COMPAT | UTF8PROC_STABLE | UTF8PROC_LUMP
380289177Speter      | UTF8PROC_NLF2LF | UTF8PROC_STRIPCC | UTF8PROC_STRIPMARK);
381289177Speter
382289177Speter  svn_stringbuf_t *result;
383289177Speter  svn_membuf_t buffer;
384289177Speter  ssize_t decomp_length;
385289177Speter  ssize_t len;
386289177Speter
387289177Speter  /* Decompose to a non-reversible compatibility format. */
388289177Speter  svn_membuf__create(&buffer, length * sizeof(apr_int32_t), pool);
389289177Speter  decomp_length = unicode_decomposition(decomp_flags, src, length, &buffer);
390289177Speter  if (decomp_length < 0)
391289177Speter    {
392289177Speter      svn_membuf_t part;
393289177Speter      apr_size_t done, prev;
394289177Speter
395289177Speter      /* The only other error we can receive here indicates an integer
396289177Speter         overflow due to the length of the input string. Not very
397289177Speter         likely, but we certainly shouldn't continue in that case. */
398289177Speter      SVN_ERR_ASSERT_NO_RETURN(decomp_length == UTF8PROC_ERROR_INVALIDUTF8);
399289177Speter
400289177Speter      /* Break the decomposition into parts that are valid UTF-8, and
401289177Speter         bytes that are not. Represent the invalid bytes in the target
402289177Speter         erray by their negative value. This works because utf8proc
403289177Speter         will not generate Unicode code points with values larger than
404289177Speter         U+10FFFF. */
405289177Speter      svn_membuf__create(&part, sizeof(apr_int32_t), pool);
406289177Speter      decomp_length = 0;
407289177Speter      done = prev = 0;
408289177Speter      while (done < length)
409289177Speter        {
410289177Speter          apr_int32_t uc;
411289177Speter
412289177Speter          while (done < length)
413289177Speter            {
414289177Speter              len = utf8proc_iterate((uint8_t*)src + done, length - done, &uc);
415289177Speter              if (len < 0)
416289177Speter                break;
417289177Speter              done += len;
418289177Speter            }
419289177Speter
420289177Speter          /* Decompose the valid part */
421289177Speter          if (done > prev)
422289177Speter            {
423289177Speter              len = unicode_decomposition(
424289177Speter                  decomp_flags, src + prev, done - prev, &part);
425289177Speter              SVN_ERR_ASSERT_NO_RETURN(len > 0);
426289177Speter              svn_membuf__resize(
427289177Speter                  &buffer, (decomp_length + len) * sizeof(apr_int32_t));
428289177Speter              memcpy((apr_int32_t*)buffer.data + decomp_length,
429289177Speter                     part.data, len * sizeof(apr_int32_t));
430289177Speter              decomp_length += len;
431289177Speter              prev = done;
432289177Speter            }
433289177Speter
434289177Speter          /* What follows could be a valid UTF-8 sequence, but not
435289177Speter             a valid Unicode character. */
436289177Speter          if (done < length)
437289177Speter            {
438289177Speter              const char *last;
439289177Speter
440289177Speter              /* Determine the length of the UTF-8 sequence */
441289177Speter              const char *const p = src + done;
442289177Speter              len = utf8proc_utf8class[(uint8_t)*p];
443289177Speter
444289177Speter              /* Check if the multi-byte sequence is valid UTF-8. */
445289177Speter              if (len > 1 && len <= (apr_ssize_t)(length - done))
446289177Speter                last = svn_utf__last_valid(p, len);
447289177Speter              else
448289177Speter                last = NULL;
449289177Speter
450289177Speter              /* Might not be a valid UTF-8 sequence at all */
451289177Speter              if (!last || (last && last - p < len))
452289177Speter                {
453289177Speter                  uc = -((apr_int32_t)(*p & 0xff));
454289177Speter                  len = 1;
455289177Speter                }
456289177Speter              else
457289177Speter                {
458289177Speter                  switch (len)
459289177Speter                    {
460289177Speter                      /* Decode the UTF-8 sequence without validation. */
461289177Speter                    case 2:
462289177Speter                      uc = ((p[0] & 0x1f) <<  6) + (p[1] & 0x3f);
463289177Speter                      break;
464289177Speter                    case 3:
465289177Speter                      uc = (((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) <<  6)
466289177Speter                            + (p[2] & 0x3f));
467289177Speter                      break;
468289177Speter                    case 4:
469289177Speter                      uc = (((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12)
470289177Speter                            + ((p[2] & 0x3f) <<  6) + (p[3] & 0x3f));
471289177Speter                      break;
472289177Speter                    default:
473289177Speter                      SVN_ERR_ASSERT_NO_RETURN(
474289177Speter                          !"Unexpected invalid UTF-8 byte");
475289177Speter                    }
476289177Speter
477289177Speter                }
478289177Speter
479289177Speter              svn_membuf__resize(
480289177Speter                  &buffer, (decomp_length + 1) * sizeof(apr_int32_t));
481289177Speter              ((apr_int32_t*)buffer.data)[decomp_length++] = uc;
482289177Speter              done += len;
483289177Speter              prev = done;
484289177Speter            }
485289177Speter        }
486289177Speter    }
487289177Speter
488289177Speter  /* Scan the result and deleting any combining diacriticals and
489289177Speter     inserting placeholders where any non-ascii characters remain.  */
490289177Speter  result = svn_stringbuf_create_ensure(decomp_length, pool);
491289177Speter  for (len = 0; len < decomp_length; ++len)
492289177Speter    {
493289177Speter      const apr_int32_t cp = ((apr_int32_t*)buffer.data)[len];
494289177Speter      if (cp > 0 && cp < 127)
495289177Speter        svn_stringbuf_appendbyte(result, (char)cp);
496289177Speter      else if (cp == 0)
497289177Speter        svn_stringbuf_appendcstr(result, "\\0");
498289177Speter      else if (cp < 0)
499289177Speter        {
500289177Speter          const apr_int32_t rcp = ((-cp) & 0xff);
501289177Speter          svn_stringbuf_appendcstr(result, "?\\");
502289177Speter          svn_stringbuf_appendbyte(result, digits[(rcp & 0x00f0) >> 4]);
503289177Speter          svn_stringbuf_appendbyte(result, digits[(rcp & 0x000f)]);
504289177Speter        }
505289177Speter      else
506289177Speter        {
507289177Speter          if (utf8proc_codepoint_valid(cp))
508289177Speter            {
509289177Speter              const utf8proc_property_t *prop = utf8proc_get_property(cp);
510289177Speter              if (prop->combining_class != 0)
511289177Speter                continue;           /* Combining mark; ignore */
512289177Speter              svn_stringbuf_appendcstr(result, "{U+");
513289177Speter            }
514289177Speter          else
515289177Speter            svn_stringbuf_appendcstr(result, "{U?");
516289177Speter          if (cp > 0xffff)
517289177Speter            {
518289177Speter              svn_stringbuf_appendbyte(result, digits[(cp & 0xf00000) >> 20]);
519289177Speter              svn_stringbuf_appendbyte(result, digits[(cp & 0x0f0000) >> 16]);
520289177Speter            }
521289177Speter          svn_stringbuf_appendbyte(result, digits[(cp & 0xf000) >> 12]);
522289177Speter          svn_stringbuf_appendbyte(result, digits[(cp & 0x0f00) >> 8]);
523289177Speter          svn_stringbuf_appendbyte(result, digits[(cp & 0x00f0) >> 4]);
524289177Speter          svn_stringbuf_appendbyte(result, digits[(cp & 0x000f)]);
525289177Speter          svn_stringbuf_appendbyte(result, '}');
526289177Speter        }
527289177Speter    }
528289177Speter
529289177Speter  return result->data;
530289177Speter}
531