1/*
2 * utf.c:  UTF-8 conversion routines
3 *
4 * ====================================================================
5 *    Licensed to the Apache Software Foundation (ASF) under one
6 *    or more contributor license agreements.  See the NOTICE file
7 *    distributed with this work for additional information
8 *    regarding copyright ownership.  The ASF licenses this file
9 *    to you under the Apache License, Version 2.0 (the
10 *    "License"); you may not use this file except in compliance
11 *    with the License.  You may obtain a copy of the License at
12 *
13 *      http://www.apache.org/licenses/LICENSE-2.0
14 *
15 *    Unless required by applicable law or agreed to in writing,
16 *    software distributed under the License is distributed on an
17 *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
18 *    KIND, either express or implied.  See the License for the
19 *    specific language governing permissions and limitations
20 *    under the License.
21 * ====================================================================
22 */
23
24
25
26#include <stdlib.h>
27#include <string.h>
28#include <assert.h>
29
30#include <apr_strings.h>
31#include <apr_lib.h>
32#include <apr_xlate.h>
33#include <apr_atomic.h>
34
35#include "svn_hash.h"
36#include "svn_string.h"
37#include "svn_error.h"
38#include "svn_pools.h"
39#include "svn_ctype.h"
40#include "svn_utf.h"
41#include "svn_private_config.h"
42#include "win32_xlate.h"
43
44#include "private/svn_utf_private.h"
45#include "private/svn_dep_compat.h"
46#include "private/svn_string_private.h"
47#include "private/svn_mutex.h"
48
49
50
51/* Use these static strings to maximize performance on standard conversions.
52 * Any strings on other locations are still valid, however.
53 */
54static const char *SVN_UTF_NTOU_XLATE_HANDLE = "svn-utf-ntou-xlate-handle";
55static const char *SVN_UTF_UTON_XLATE_HANDLE = "svn-utf-uton-xlate-handle";
56
57static const char *SVN_APR_UTF8_CHARSET = "UTF-8";
58
59static svn_mutex__t *xlate_handle_mutex = NULL;
60static svn_boolean_t assume_native_charset_is_utf8 = FALSE;
61
62#if defined(WIN32)
63typedef svn_subr__win32_xlate_t xlate_handle_t;
64#else
65typedef apr_xlate_t xlate_handle_t;
66#endif
67
/* The xlate handle cache is a global hash table with linked lists of xlate
 * handles.  In multi-threaded environments, a thread "borrows" an xlate
 * handle from the cache during a translation and puts it back afterwards.
 * This avoids holding a global lock for all translations.
 * If there is no handle for a particular key when needed, a new
 * handle is created and put in the cache after use.
 * This means that there will be at most N handles open for a key, where N
 * is the number of simultaneous handles in use for that key. */
76
77typedef struct xlate_handle_node_t {
78  xlate_handle_t *handle;
79  /* FALSE if the handle is not valid, since its pool is being
80     destroyed. */
81  svn_boolean_t valid;
82  /* The name of a char encoding or APR_LOCALE_CHARSET. */
83  const char *frompage, *topage;
84  struct xlate_handle_node_t *next;
85} xlate_handle_node_t;
86
87/* This maps const char * userdata_key strings to xlate_handle_node_t **
88   handles to the first entry in the linked list of xlate handles.  We don't
89   store the pointer to the list head directly in the hash table, since we
90   remove/insert entries at the head in the list in the code below, and
91   we can't use apr_hash_set() in each character translation because that
92   function allocates memory in each call where the value is non-NULL.
93   Since these allocations take place in a global pool, this would be a
94   memory leak. */
95static apr_hash_t *xlate_handle_hash = NULL;
96
97/* "1st level cache" to standard conversion maps. We may access these
98 * using atomic xchange ops, i.e. without further thread synchronization.
99 * If the respective item is NULL, fallback to hash lookup.
100 */
101static void * volatile xlat_ntou_static_handle = NULL;
102static void * volatile xlat_uton_static_handle = NULL;
103
104/* Clean up the xlate handle cache. */
105static apr_status_t
106xlate_cleanup(void *arg)
107{
108  /* We set the cache variables to NULL so that translation works in other
109     cleanup functions, even if it isn't cached then. */
110  xlate_handle_hash = NULL;
111
112  /* ensure no stale objects get accessed */
113  xlat_ntou_static_handle = NULL;
114  xlat_uton_static_handle = NULL;
115
116  return APR_SUCCESS;
117}
118
119/* Set the handle of ARG to NULL. */
120static apr_status_t
121xlate_handle_node_cleanup(void *arg)
122{
123  xlate_handle_node_t *node = arg;
124
125  node->valid = FALSE;
126  return APR_SUCCESS;
127}
128
129void
130svn_utf_initialize2(svn_boolean_t assume_native_utf8,
131                    apr_pool_t *pool)
132{
133  if (!xlate_handle_hash)
134    {
135      /* We create our own subpool, which we protect with the mutex.
136         We can't use the pool passed to us by the caller, since we will
137         use it for xlate handle allocations, possibly in multiple threads,
138         and pool allocation is not thread-safe. */
139      apr_pool_t *subpool = svn_pool_create(pool);
140      svn_mutex__t *mutex;
141      svn_error_t *err = svn_mutex__init(&mutex, TRUE, subpool);
142      if (err)
143        {
144          svn_error_clear(err);
145          return;
146        }
147
148      xlate_handle_mutex = mutex;
149      xlate_handle_hash = apr_hash_make(subpool);
150
151      apr_pool_cleanup_register(subpool, NULL, xlate_cleanup,
152                                apr_pool_cleanup_null);
153    }
154
155    if (!assume_native_charset_is_utf8)
156      assume_native_charset_is_utf8 = assume_native_utf8;
157}
158
159/* Return a unique string key based on TOPAGE and FROMPAGE.  TOPAGE and
160 * FROMPAGE can be any valid arguments of the same name to
161 * apr_xlate_open().  Allocate the returned string in POOL. */
162static const char*
163get_xlate_key(const char *topage,
164              const char *frompage,
165              apr_pool_t *pool)
166{
167  /* In the cases of SVN_APR_LOCALE_CHARSET and SVN_APR_DEFAULT_CHARSET
168   * topage/frompage is really an int, not a valid string.  So generate a
169   * unique key accordingly. */
170  if (frompage == SVN_APR_LOCALE_CHARSET)
171    frompage = "APR_LOCALE_CHARSET";
172  else if (frompage == SVN_APR_DEFAULT_CHARSET)
173    frompage = "APR_DEFAULT_CHARSET";
174
175  if (topage == SVN_APR_LOCALE_CHARSET)
176    topage = "APR_LOCALE_CHARSET";
177  else if (topage == SVN_APR_DEFAULT_CHARSET)
178    topage = "APR_DEFAULT_CHARSET";
179
180  return apr_pstrcat(pool, "svn-utf-", frompage, "to", topage,
181                     "-xlate-handle", SVN_VA_NULL);
182}
183
184/* Atomically replace the content in *MEM with NEW_VALUE and return
185 * the previous content of *MEM. If atomicy cannot be guaranteed,
186 * *MEM will not be modified and NEW_VALUE is simply returned to
187 * the caller.
188 */
189static APR_INLINE void*
190atomic_swap(void * volatile * mem, void *new_value)
191{
192#if APR_HAS_THREADS
193   return svn_atomic_xchgptr(mem, new_value);
194#else
195   /* no threads - no sync. necessary */
196   void *old_value = (void*)*mem;
197   *mem = new_value;
198   return old_value;
199#endif
200}
201
202/* Set *RET to a newly created handle node for converting from FROMPAGE
203   to TOPAGE, If apr_xlate_open() returns APR_EINVAL or APR_ENOTIMPL, set
204   (*RET)->handle to NULL.  If fail for any other reason, return the error.
205   Allocate *RET and its xlate handle in POOL. */
206static svn_error_t *
207xlate_alloc_handle(xlate_handle_node_t **ret,
208                   const char *topage, const char *frompage,
209                   apr_pool_t *pool)
210{
211  apr_status_t apr_err;
212  xlate_handle_t *handle;
213  const char *name;
214
215  /* The error handling doesn't support the following cases, since we don't
216     use them currently.  Catch this here. */
217  SVN_ERR_ASSERT(frompage != SVN_APR_DEFAULT_CHARSET
218                 && topage != SVN_APR_DEFAULT_CHARSET
219                 && (frompage != SVN_APR_LOCALE_CHARSET
220                     || topage != SVN_APR_LOCALE_CHARSET));
221
222  /* Try to create a handle. */
223#if defined(WIN32)
224  apr_err = svn_subr__win32_xlate_open(&handle, topage,
225                                       frompage, pool);
226  name = "win32-xlate: ";
227#else
228  apr_err = apr_xlate_open(&handle, topage, frompage, pool);
229  name = "APR: ";
230#endif
231
232  if (APR_STATUS_IS_EINVAL(apr_err) || APR_STATUS_IS_ENOTIMPL(apr_err))
233    handle = NULL;
234  else if (apr_err != APR_SUCCESS)
235    {
236      const char *errstr;
237      char apr_strerr[512];
238
239      /* Can't use svn_error_wrap_apr here because it calls functions in
240         this file, leading to infinite recursion. */
241      if (frompage == SVN_APR_LOCALE_CHARSET)
242        errstr = apr_psprintf(pool,
243                              _("Can't create a character converter from "
244                                "native encoding to '%s'"), topage);
245      else if (topage == SVN_APR_LOCALE_CHARSET)
246        errstr = apr_psprintf(pool,
247                              _("Can't create a character converter from "
248                                "'%s' to native encoding"), frompage);
249      else
250        errstr = apr_psprintf(pool,
251                              _("Can't create a character converter from "
252                                "'%s' to '%s'"), frompage, topage);
253
254      /* Just put the error on the stack, since svn_error_create duplicates it
255         later.  APR_STRERR will be in the local encoding, not in UTF-8, though.
256       */
257      svn_strerror(apr_err, apr_strerr, sizeof(apr_strerr));
258      return svn_error_createf(SVN_ERR_PLUGIN_LOAD_FAILURE,
259                               svn_error_create(apr_err, NULL, apr_strerr),
260                               "%s%s", name, errstr);
261    }
262
263  /* Allocate and initialize the node. */
264  *ret = apr_palloc(pool, sizeof(xlate_handle_node_t));
265  (*ret)->handle = handle;
266  (*ret)->valid = TRUE;
267  (*ret)->frompage = ((frompage != SVN_APR_LOCALE_CHARSET)
268                      ? apr_pstrdup(pool, frompage) : frompage);
269  (*ret)->topage = ((topage != SVN_APR_LOCALE_CHARSET)
270                    ? apr_pstrdup(pool, topage) : topage);
271  (*ret)->next = NULL;
272
273  /* If we are called from inside a pool cleanup handler, the just created
274     xlate handle will be closed when that handler returns by a newly
275     registered cleanup handler, however, the handle is still cached by us.
276     To prevent this, we register a cleanup handler that will reset the valid
277     flag of our node, so we don't use an invalid handle. */
278  if (handle)
279    apr_pool_cleanup_register(pool, *ret, xlate_handle_node_cleanup,
280                              apr_pool_cleanup_null);
281
282  return SVN_NO_ERROR;
283}
284
285/* Extend xlate_alloc_handle by using USERDATA_KEY as a key in our
286   global hash map, if available.
287
288   Allocate *RET and its xlate handle in POOL if svn_utf_initialize()
289   hasn't been called or USERDATA_KEY is NULL.  Else, allocate them
290   in the pool of xlate_handle_hash.
291
292   Note: this function is not thread-safe. Call get_xlate_handle_node
293   instead. */
294static svn_error_t *
295get_xlate_handle_node_internal(xlate_handle_node_t **ret,
296                               const char *topage, const char *frompage,
297                               const char *userdata_key, apr_pool_t *pool)
298{
299  /* If we already have a handle, just return it. */
300  if (userdata_key && xlate_handle_hash)
301    {
302      xlate_handle_node_t *old_node = NULL;
303
304      /* 2nd level: hash lookup */
305      xlate_handle_node_t **old_node_p = svn_hash_gets(xlate_handle_hash,
306                                                       userdata_key);
307      if (old_node_p)
308        old_node = *old_node_p;
309      if (old_node)
310        {
311          /* Ensure that the handle is still valid. */
312          if (old_node->valid)
313            {
314              /* Remove from the list. */
315              *old_node_p = old_node->next;
316              old_node->next = NULL;
317              *ret = old_node;
318              return SVN_NO_ERROR;
319            }
320        }
321    }
322
323  /* Note that we still have the mutex locked (if it is initialized), so we
324     can use the global pool for creating the new xlate handle. */
325
326  /* Use the correct pool for creating the handle. */
327  pool = apr_hash_pool_get(xlate_handle_hash);
328
329  return xlate_alloc_handle(ret, topage, frompage, pool);
330}
331
332/* Set *RET to a handle node for converting from FROMPAGE to TOPAGE,
333   creating the handle node if it doesn't exist in USERDATA_KEY.
334   If a node is not cached and apr_xlate_open() returns APR_EINVAL or
335   APR_ENOTIMPL, set (*RET)->handle to NULL.  If fail for any other
336   reason, return the error.
337
338   Allocate *RET and its xlate handle in POOL if svn_utf_initialize()
339   hasn't been called or USERDATA_KEY is NULL.  Else, allocate them
340   in the pool of xlate_handle_hash. */
341static svn_error_t *
342get_xlate_handle_node(xlate_handle_node_t **ret,
343                      const char *topage, const char *frompage,
344                      const char *userdata_key, apr_pool_t *pool)
345{
346  xlate_handle_node_t *old_node = NULL;
347
348  /* If we already have a handle, just return it. */
349  if (userdata_key)
350    {
351      if (xlate_handle_hash)
352        {
353          /* 1st level: global, static items */
354          if (userdata_key == SVN_UTF_NTOU_XLATE_HANDLE)
355            old_node = atomic_swap(&xlat_ntou_static_handle, NULL);
356          else if (userdata_key == SVN_UTF_UTON_XLATE_HANDLE)
357            old_node = atomic_swap(&xlat_uton_static_handle, NULL);
358
359          if (old_node && old_node->valid)
360            {
361              *ret = old_node;
362              return SVN_NO_ERROR;
363            }
364        }
365      else
366        {
367          void *p;
368          /* We fall back on a per-pool cache instead. */
369          apr_pool_userdata_get(&p, userdata_key, pool);
370          old_node = p;
371          /* Ensure that the handle is still valid. */
372          if (old_node && old_node->valid)
373            {
374              *ret = old_node;
375              return SVN_NO_ERROR;
376            }
377
378          return xlate_alloc_handle(ret, topage, frompage, pool);
379        }
380    }
381
382  SVN_MUTEX__WITH_LOCK(xlate_handle_mutex,
383                       get_xlate_handle_node_internal(ret,
384                                                      topage,
385                                                      frompage,
386                                                      userdata_key,
387                                                      pool));
388
389  return SVN_NO_ERROR;
390}
391
392/* Put back NODE into the xlate handle cache for use by other calls.
393
394   Note: this function is not thread-safe. Call put_xlate_handle_node
395   instead. */
396static svn_error_t *
397put_xlate_handle_node_internal(xlate_handle_node_t *node,
398                               const char *userdata_key)
399{
400  xlate_handle_node_t **node_p = svn_hash_gets(xlate_handle_hash, userdata_key);
401  if (node_p == NULL)
402    {
403      userdata_key = apr_pstrdup(apr_hash_pool_get(xlate_handle_hash),
404                                  userdata_key);
405      node_p = apr_palloc(apr_hash_pool_get(xlate_handle_hash),
406                          sizeof(*node_p));
407      *node_p = NULL;
408      svn_hash_sets(xlate_handle_hash, userdata_key, node_p);
409    }
410  node->next = *node_p;
411  *node_p = node;
412
413  return SVN_NO_ERROR;
414}
415
416/* Put back NODE into the xlate handle cache for use by other calls.
417   If there is no global cache, store the handle in POOL.
418   Ignore errors related to locking/unlocking the mutex. */
419static svn_error_t *
420put_xlate_handle_node(xlate_handle_node_t *node,
421                      const char *userdata_key,
422                      apr_pool_t *pool)
423{
424  assert(node->next == NULL);
425  if (!userdata_key)
426    return SVN_NO_ERROR;
427
428  /* push previous global node to the hash */
429  if (xlate_handle_hash)
430    {
431      /* 1st level: global, static items */
432      if (userdata_key == SVN_UTF_NTOU_XLATE_HANDLE)
433        node = atomic_swap(&xlat_ntou_static_handle, node);
434      else if (userdata_key == SVN_UTF_UTON_XLATE_HANDLE)
435        node = atomic_swap(&xlat_uton_static_handle, node);
436      if (node == NULL)
437        return SVN_NO_ERROR;
438
439      SVN_MUTEX__WITH_LOCK(xlate_handle_mutex,
440                           put_xlate_handle_node_internal(node,
441                                                          userdata_key));
442    }
443  else
444    {
445      /* Store it in the per-pool cache. */
446      apr_pool_userdata_set(node, userdata_key, apr_pool_cleanup_null, pool);
447    }
448
449  return SVN_NO_ERROR;
450}
451
452/* Return the apr_xlate handle for converting native characters to UTF-8. */
453static svn_error_t *
454get_ntou_xlate_handle_node(xlate_handle_node_t **ret, apr_pool_t *pool)
455{
456  return get_xlate_handle_node(ret, SVN_APR_UTF8_CHARSET,
457                               assume_native_charset_is_utf8
458                                 ? SVN_APR_UTF8_CHARSET
459                                 : SVN_APR_LOCALE_CHARSET,
460                               SVN_UTF_NTOU_XLATE_HANDLE, pool);
461}
462
463
464/* Return the apr_xlate handle for converting UTF-8 to native characters.
465   Create one if it doesn't exist.  If unable to find a handle, or
466   unable to create one because apr_xlate_open returned APR_EINVAL, then
467   set *RET to null and return SVN_NO_ERROR; if fail for some other
468   reason, return error. */
469static svn_error_t *
470get_uton_xlate_handle_node(xlate_handle_node_t **ret, apr_pool_t *pool)
471{
472  return get_xlate_handle_node(ret,
473                               assume_native_charset_is_utf8
474                                 ? SVN_APR_UTF8_CHARSET
475                                 : SVN_APR_LOCALE_CHARSET,
476                               SVN_APR_UTF8_CHARSET,
477                               SVN_UTF_UTON_XLATE_HANDLE, pool);
478}
479
480
481/* Convert SRC_LENGTH bytes of SRC_DATA in NODE->handle, store the result
482   in *DEST, which is allocated in POOL. */
483static svn_error_t *
484convert_to_stringbuf(xlate_handle_node_t *node,
485                     const char *src_data,
486                     apr_size_t src_length,
487                     svn_stringbuf_t **dest,
488                     apr_pool_t *pool)
489{
490#ifdef WIN32
491  apr_status_t apr_err;
492
493  apr_err = svn_subr__win32_xlate_to_stringbuf(node->handle, src_data,
494                                               src_length, dest, pool);
495#else
496  apr_size_t buflen = src_length * 2;
497  apr_status_t apr_err;
498  apr_size_t srclen = src_length;
499  apr_size_t destlen = buflen;
500
501  /* Initialize *DEST to an empty stringbuf.
502     A 1:2 ratio of input bytes to output bytes (as assigned above)
503     should be enough for most translations, and if it turns out not
504     to be enough, we'll grow the buffer again, sizing it based on a
505     1:3 ratio of the remainder of the string. */
506  *dest = svn_stringbuf_create_ensure(buflen + 1, pool);
507
508  /* Not only does it not make sense to convert an empty string, but
509     apr-iconv is quite unreasonable about not allowing that. */
510  if (src_length == 0)
511    return SVN_NO_ERROR;
512
513  do
514    {
515      /* Set up state variables for xlate. */
516      destlen = buflen - (*dest)->len;
517
518      /* Attempt the conversion. */
519      apr_err = apr_xlate_conv_buffer(node->handle,
520                                      src_data + (src_length - srclen),
521                                      &srclen,
522                                      (*dest)->data + (*dest)->len,
523                                      &destlen);
524
525      /* Now, update the *DEST->len to track the amount of output data
526         churned out so far from this loop. */
527      (*dest)->len += ((buflen - (*dest)->len) - destlen);
528      buflen += srclen * 3; /* 3 is middle ground, 2 wasn't enough
529                               for all characters in the buffer, 4 is
530                               maximum character size (currently) */
531
532
533    } while (apr_err == APR_SUCCESS && srclen != 0);
534#endif
535
536  /* If we exited the loop with an error, return the error. */
537  if (apr_err)
538    {
539      const char *errstr;
540      svn_error_t *err;
541
542      /* Can't use svn_error_wrap_apr here because it calls functions in
543         this file, leading to infinite recursion. */
544      if (node->frompage == SVN_APR_LOCALE_CHARSET)
545        errstr = apr_psprintf
546          (pool, _("Can't convert string from native encoding to '%s':"),
547           node->topage);
548      else if (node->topage == SVN_APR_LOCALE_CHARSET)
549        errstr = apr_psprintf
550          (pool, _("Can't convert string from '%s' to native encoding:"),
551           node->frompage);
552      else
553        errstr = apr_psprintf
554          (pool, _("Can't convert string from '%s' to '%s':"),
555           node->frompage, node->topage);
556
557      err = svn_error_create(
558          apr_err, NULL, svn_utf__fuzzy_escape(src_data, src_length, pool));
559      return svn_error_create(apr_err, err, errstr);
560    }
561  /* Else, exited due to success.  Trim the result buffer down to the
562     right length. */
563  (*dest)->data[(*dest)->len] = '\0';
564
565  return SVN_NO_ERROR;
566}
567
568
569/* Return APR_EINVAL if the first LEN bytes of DATA contain anything
570   other than seven-bit, non-control (except for whitespace) ASCII
571   characters, finding the error pool from POOL.  Otherwise, return
572   SVN_NO_ERROR. */
573static svn_error_t *
574check_non_ascii(const char *data, apr_size_t len, apr_pool_t *pool)
575{
576  const char *data_start = data;
577
578  for (; len > 0; --len, data++)
579    {
580      if ((! svn_ctype_isascii(*data))
581          || ((! svn_ctype_isspace(*data))
582              && svn_ctype_iscntrl(*data)))
583        {
584          /* Show the printable part of the data, followed by the
585             decimal code of the questionable character.  Because if a
586             user ever gets this error, she's going to have to spend
587             time tracking down the non-ASCII data, so we want to help
588             as much as possible.  And yes, we just call the unsafe
589             data "non-ASCII", even though the actual constraint is
590             somewhat more complex than that. */
591
592          if (data - data_start)
593            {
594              const char *error_data
595                = apr_pstrndup(pool, data_start, (data - data_start));
596
597              return svn_error_createf
598                (APR_EINVAL, NULL,
599                 _("Safe data '%s' was followed by non-ASCII byte %d: "
600                   "unable to convert to/from UTF-8"),
601                 error_data, *((const unsigned char *) data));
602            }
603          else
604            {
605              return svn_error_createf
606                (APR_EINVAL, NULL,
607                 _("Non-ASCII character (code %d) detected, "
608                   "and unable to convert to/from UTF-8"),
609                 *((const unsigned char *) data));
610            }
611        }
612    }
613
614  return SVN_NO_ERROR;
615}
616
617/* Construct an error with code APR_EINVAL and with a suitable message
618 * to describe the invalid UTF-8 sequence DATA of length LEN (which
619 * may have embedded NULLs).  We can't simply print the data, almost
620 * by definition we don't really know how it is encoded.
621 */
622static svn_error_t *
623invalid_utf8(const char *data, apr_size_t len, apr_pool_t *pool)
624{
625  const char *last = svn_utf__last_valid(data, len);
626  const char *valid_txt = "", *invalid_txt = "";
627  apr_size_t i;
628  size_t valid, invalid;
629
630  /* We will display at most 24 valid octets (this may split a leading
631     multi-byte character) as that should fit on one 80 character line. */
632  valid = last - data;
633  if (valid > 24)
634    valid = 24;
635  for (i = 0; i < valid; ++i)
636    valid_txt = apr_pstrcat(pool, valid_txt,
637                            apr_psprintf(pool, " %02x",
638                                         (unsigned char)last[i-valid]),
639                                         SVN_VA_NULL);
640
641  /* 4 invalid octets will guarantee that the faulty octet is displayed */
642  invalid = data + len - last;
643  if (invalid > 4)
644    invalid = 4;
645  for (i = 0; i < invalid; ++i)
646    invalid_txt = apr_pstrcat(pool, invalid_txt,
647                              apr_psprintf(pool, " %02x",
648                                           (unsigned char)last[i]),
649                                           SVN_VA_NULL);
650
651  return svn_error_createf(APR_EINVAL, NULL,
652                           _("Valid UTF-8 data\n(hex:%s)\n"
653                             "followed by invalid UTF-8 sequence\n(hex:%s)"),
654                           valid_txt, invalid_txt);
655}
656
657/* Verify that the sequence DATA of length LEN is valid UTF-8.
658   If it is not, return an error with code APR_EINVAL. */
659static svn_error_t *
660check_utf8(const char *data, apr_size_t len, apr_pool_t *pool)
661{
662  if (! svn_utf__is_valid(data, len))
663    return invalid_utf8(data, len, pool);
664  return SVN_NO_ERROR;
665}
666
667/* Verify that the NULL terminated sequence DATA is valid UTF-8.
668   If it is not, return an error with code APR_EINVAL. */
669static svn_error_t *
670check_cstring_utf8(const char *data, apr_pool_t *pool)
671{
672
673  if (! svn_utf__cstring_is_valid(data))
674    return invalid_utf8(data, strlen(data), pool);
675  return SVN_NO_ERROR;
676}
677
678
679svn_error_t *
680svn_utf_stringbuf_to_utf8(svn_stringbuf_t **dest,
681                          const svn_stringbuf_t *src,
682                          apr_pool_t *pool)
683{
684  xlate_handle_node_t *node;
685  svn_error_t *err;
686
687  SVN_ERR(get_ntou_xlate_handle_node(&node, pool));
688
689  if (node->handle)
690    {
691      err = convert_to_stringbuf(node, src->data, src->len, dest, pool);
692      if (! err)
693        err = check_utf8((*dest)->data, (*dest)->len, pool);
694    }
695  else
696    {
697      err = check_non_ascii(src->data, src->len, pool);
698      if (! err)
699        *dest = svn_stringbuf_dup(src, pool);
700    }
701
702  return svn_error_compose_create(err,
703                                  put_xlate_handle_node
704                                     (node,
705                                      SVN_UTF_NTOU_XLATE_HANDLE,
706                                      pool));
707}
708
709
710svn_error_t *
711svn_utf_string_to_utf8(const svn_string_t **dest,
712                       const svn_string_t *src,
713                       apr_pool_t *pool)
714{
715  svn_stringbuf_t *destbuf;
716  xlate_handle_node_t *node;
717  svn_error_t *err;
718
719  SVN_ERR(get_ntou_xlate_handle_node(&node, pool));
720
721  if (node->handle)
722    {
723      err = convert_to_stringbuf(node, src->data, src->len, &destbuf, pool);
724      if (! err)
725        err = check_utf8(destbuf->data, destbuf->len, pool);
726      if (! err)
727        *dest = svn_stringbuf__morph_into_string(destbuf);
728    }
729  else
730    {
731      err = check_non_ascii(src->data, src->len, pool);
732      if (! err)
733        *dest = svn_string_dup(src, pool);
734    }
735
736  return svn_error_compose_create(err,
737                                  put_xlate_handle_node
738                                     (node,
739                                      SVN_UTF_NTOU_XLATE_HANDLE,
740                                      pool));
741}
742
743
744/* Common implementation for svn_utf_cstring_to_utf8,
745   svn_utf_cstring_to_utf8_ex, svn_utf_cstring_from_utf8 and
746   svn_utf_cstring_from_utf8_ex. Convert SRC to DEST using NODE->handle as
747   the translator and allocating from POOL. */
748static svn_error_t *
749convert_cstring(const char **dest,
750                const char *src,
751                xlate_handle_node_t *node,
752                apr_pool_t *pool)
753{
754  if (node->handle)
755    {
756      svn_stringbuf_t *destbuf;
757      SVN_ERR(convert_to_stringbuf(node, src, strlen(src),
758                                   &destbuf, pool));
759      *dest = destbuf->data;
760    }
761  else
762    {
763      apr_size_t len = strlen(src);
764      SVN_ERR(check_non_ascii(src, len, pool));
765      *dest = apr_pstrmemdup(pool, src, len);
766    }
767  return SVN_NO_ERROR;
768}
769
770
771svn_error_t *
772svn_utf_cstring_to_utf8(const char **dest,
773                        const char *src,
774                        apr_pool_t *pool)
775{
776  xlate_handle_node_t *node;
777  svn_error_t *err;
778
779  SVN_ERR(get_ntou_xlate_handle_node(&node, pool));
780  err = convert_cstring(dest, src, node, pool);
781  SVN_ERR(svn_error_compose_create(err,
782                                   put_xlate_handle_node
783                                      (node,
784                                       SVN_UTF_NTOU_XLATE_HANDLE,
785                                       pool)));
786  return check_cstring_utf8(*dest, pool);
787}
788
789
790svn_error_t *
791svn_utf_cstring_to_utf8_ex2(const char **dest,
792                            const char *src,
793                            const char *frompage,
794                            apr_pool_t *pool)
795{
796  xlate_handle_node_t *node;
797  svn_error_t *err;
798  const char *convset_key = get_xlate_key(SVN_APR_UTF8_CHARSET, frompage,
799                                          pool);
800
801  SVN_ERR(get_xlate_handle_node(&node, SVN_APR_UTF8_CHARSET, frompage,
802                                convset_key, pool));
803  err = convert_cstring(dest, src, node, pool);
804  SVN_ERR(svn_error_compose_create(err,
805                                   put_xlate_handle_node
806                                      (node,
807                                       SVN_UTF_NTOU_XLATE_HANDLE,
808                                       pool)));
809
810  return check_cstring_utf8(*dest, pool);
811}
812
813
814svn_error_t *
815svn_utf_cstring_to_utf8_ex(const char **dest,
816                           const char *src,
817                           const char *frompage,
818                           const char *convset_key,
819                           apr_pool_t *pool)
820{
821  return svn_utf_cstring_to_utf8_ex2(dest, src, frompage, pool);
822}
823
824
825svn_error_t *
826svn_utf_stringbuf_from_utf8(svn_stringbuf_t **dest,
827                            const svn_stringbuf_t *src,
828                            apr_pool_t *pool)
829{
830  xlate_handle_node_t *node;
831  svn_error_t *err;
832
833  SVN_ERR(get_uton_xlate_handle_node(&node, pool));
834
835  if (node->handle)
836    {
837      err = check_utf8(src->data, src->len, pool);
838      if (! err)
839        err = convert_to_stringbuf(node, src->data, src->len, dest, pool);
840    }
841  else
842    {
843      err = check_non_ascii(src->data, src->len, pool);
844      if (! err)
845        *dest = svn_stringbuf_dup(src, pool);
846    }
847
848  err = svn_error_compose_create(
849          err,
850          put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool));
851
852  return err;
853}
854
855
856svn_error_t *
857svn_utf_string_from_utf8(const svn_string_t **dest,
858                         const svn_string_t *src,
859                         apr_pool_t *pool)
860{
861  xlate_handle_node_t *node;
862  svn_error_t *err;
863
864  SVN_ERR(get_uton_xlate_handle_node(&node, pool));
865
866  if (node->handle)
867    {
868      err = check_utf8(src->data, src->len, pool);
869      if (! err)
870        {
871          svn_stringbuf_t *dbuf;
872
873          err = convert_to_stringbuf(node, src->data, src->len,
874                                     &dbuf, pool);
875
876          if (! err)
877            *dest = svn_stringbuf__morph_into_string(dbuf);
878        }
879    }
880  else
881    {
882      err = check_non_ascii(src->data, src->len, pool);
883      if (! err)
884        *dest = svn_string_dup(src, pool);
885    }
886
887  err = svn_error_compose_create(
888          err,
889          put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool));
890
891  return err;
892}
893
894
895svn_error_t *
896svn_utf_cstring_from_utf8(const char **dest,
897                          const char *src,
898                          apr_pool_t *pool)
899{
900  xlate_handle_node_t *node;
901  svn_error_t *err;
902
903  SVN_ERR(check_cstring_utf8(src, pool));
904
905  SVN_ERR(get_uton_xlate_handle_node(&node, pool));
906  err = convert_cstring(dest, src, node, pool);
907  err = svn_error_compose_create(
908          err,
909          put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool));
910
911  return err;
912}
913
914
915svn_error_t *
916svn_utf_cstring_from_utf8_ex2(const char **dest,
917                              const char *src,
918                              const char *topage,
919                              apr_pool_t *pool)
920{
921  xlate_handle_node_t *node;
922  svn_error_t *err;
923  const char *convset_key = get_xlate_key(topage, SVN_APR_UTF8_CHARSET,
924                                          pool);
925
926  SVN_ERR(check_cstring_utf8(src, pool));
927
928  SVN_ERR(get_xlate_handle_node(&node, topage, SVN_APR_UTF8_CHARSET,
929                                convset_key, pool));
930  err = convert_cstring(dest, src, node, pool);
931  err = svn_error_compose_create(
932          err,
933          put_xlate_handle_node(node, convset_key, pool));
934
935  return err;
936}
937
938const char *
939svn_utf__cstring_from_utf8_fuzzy(const char *src,
940                                 apr_pool_t *pool,
941                                 svn_error_t *(*convert_from_utf8)
942                                 (const char **, const char *, apr_pool_t *))
943{
944  const char *escaped, *converted;
945  svn_error_t *err;
946
947  escaped = svn_utf__fuzzy_escape(src, strlen(src), pool);
948
949  /* Okay, now we have a *new* UTF-8 string, one that's guaranteed to
950     contain only 7-bit bytes :-).  Recode to native... */
951  err = convert_from_utf8(((const char **) &converted), escaped, pool);
952
953  if (err)
954    {
955      svn_error_clear(err);
956      return escaped;
957    }
958  else
959    return converted;
960
961  /* ### Check the client locale, maybe we can avoid that second
962   * conversion!  See Ulrich Drepper's patch at
963   * https://issues.apache.org/jira/browse/SVN-807.
964   */
965}
966
967
968const char *
969svn_utf_cstring_from_utf8_fuzzy(const char *src,
970                                apr_pool_t *pool)
971{
972  return svn_utf__cstring_from_utf8_fuzzy(src, pool,
973                                          svn_utf_cstring_from_utf8);
974}
975
976
977svn_error_t *
978svn_utf_cstring_from_utf8_stringbuf(const char **dest,
979                                    const svn_stringbuf_t *src,
980                                    apr_pool_t *pool)
981{
982  svn_stringbuf_t *destbuf;
983
984  SVN_ERR(svn_utf_stringbuf_from_utf8(&destbuf, src, pool));
985  *dest = destbuf->data;
986
987  return SVN_NO_ERROR;
988}
989
990
991svn_error_t *
992svn_utf_cstring_from_utf8_string(const char **dest,
993                                 const svn_string_t *src,
994                                 apr_pool_t *pool)
995{
996  xlate_handle_node_t *node;
997  svn_error_t *err;
998
999  SVN_ERR(get_uton_xlate_handle_node(&node, pool));
1000
1001  if (node->handle)
1002    {
1003      err = check_utf8(src->data, src->len, pool);
1004      if (! err)
1005        {
1006          svn_stringbuf_t *dbuf;
1007
1008          err = convert_to_stringbuf(node, src->data, src->len,
1009                                     &dbuf, pool);
1010          if (! err)
1011            *dest = dbuf->data;
1012        }
1013    }
1014  else
1015    {
1016      err = check_non_ascii(src->data, src->len, pool);
1017      if (! err)
1018        *dest = apr_pstrmemdup(pool, src->data, src->len);
1019    }
1020
1021  err = svn_error_compose_create(
1022          err,
1023          put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool));
1024
1025  return err;
1026}
1027
1028
1029/* Insert the given UCS-4 VALUE into BUF at the given OFFSET. */
1030static void
1031membuf_insert_ucs4(svn_membuf_t *buf, apr_size_t offset, apr_int32_t value)
1032{
1033  svn_membuf__resize(buf, (offset + 1) * sizeof(value));
1034  ((apr_int32_t*)buf->data)[offset] = value;
1035}
1036
/* TODO: Use compiler intrinsics for byte swaps. */

/* Byte-order swaps for 16-bit (SWAP_SHORT) and 32-bit (SWAP_LONG) values.
 *
 * The masks are unsigned on purpose: masking converts the operand to
 * unsigned int before it is shifted, so moving a byte >= 0x80 into the
 * top byte (the "<< 24" in SWAP_LONG) is well defined.  With signed
 * masks that shift overflowed 'int', which is undefined behaviour.
 *
 * Both macros evaluate X more than once, so X must be free of side
 * effects.  The result has type unsigned int.
 */
#define SWAP_SHORT(x)  ((((x) & 0xffu) << 8) | (((x) >> 8) & 0xffu))
#define SWAP_LONG(x)   ((((x) & 0xffu) << 24) | (((x) & 0xff00u) << 8)    \
                        | (((x) >> 8) & 0xff00u) | (((x) >> 24) & 0xffu))

/* UTF-16 surrogate classification: lead (high) surrogates occupy
   0xD800..0xDBFF, trail (low) surrogates occupy 0xDC00..0xDFFF. */
#define IS_UTF16_LEAD_SURROGATE(c)   ((c) >= 0xd800 && (c) <= 0xdbff)
#define IS_UTF16_TRAIL_SURROGATE(c)  ((c) >= 0xdc00 && (c) <= 0xdfff)
1044
1045svn_error_t *
1046svn_utf__utf16_to_utf8(const svn_string_t **result,
1047                       const apr_uint16_t *utf16str,
1048                       apr_size_t utf16len,
1049                       svn_boolean_t big_endian,
1050                       apr_pool_t *result_pool,
1051                       apr_pool_t *scratch_pool)
1052{
1053  static const apr_uint16_t endiancheck = 0xa55a;
1054  const svn_boolean_t arch_big_endian =
1055    (((const char*)&endiancheck)[sizeof(endiancheck) - 1] == '\x5a');
1056  const svn_boolean_t swap_order = (!big_endian != !arch_big_endian);
1057
1058  apr_uint16_t lead_surrogate;
1059  apr_size_t length;
1060  apr_size_t offset;
1061  svn_membuf_t ucs4buf;
1062  svn_membuf_t resultbuf;
1063  svn_string_t *res;
1064
1065  if (utf16len == SVN_UTF__UNKNOWN_LENGTH)
1066    {
1067      const apr_uint16_t *endp = utf16str;
1068      while (*endp++)
1069        ;
1070      utf16len = (endp - utf16str);
1071    }
1072
1073  svn_membuf__create(&ucs4buf, utf16len * sizeof(apr_int32_t), scratch_pool);
1074
1075  for (lead_surrogate = 0, length = 0, offset = 0;
1076       offset < utf16len; ++offset)
1077    {
1078      const apr_uint16_t code =
1079        (swap_order ? SWAP_SHORT(utf16str[offset]) : utf16str[offset]);
1080
1081      if (lead_surrogate)
1082        {
1083          if (IS_UTF16_TRAIL_SURROGATE(code))
1084            {
1085              /* Combine the lead and trail currogates into a 32-bit code. */
1086              membuf_insert_ucs4(&ucs4buf, length++,
1087                                 (0x010000
1088                                  + (((lead_surrogate & 0x03ff) << 10)
1089                                     | (code & 0x03ff))));
1090              lead_surrogate = 0;
1091              continue;
1092            }
1093          else
1094            {
1095              /* If we didn't find a surrogate pair, just dump the
1096                 lead surrogate into the stream. */
1097              membuf_insert_ucs4(&ucs4buf, length++, lead_surrogate);
1098              lead_surrogate = 0;
1099            }
1100        }
1101
1102      if ((offset + 1) < utf16len && IS_UTF16_LEAD_SURROGATE(code))
1103        {
1104          /* Store a lead surrogate that is followed by at least one
1105             code for the next iteration. */
1106          lead_surrogate = code;
1107          continue;
1108        }
1109      else
1110        membuf_insert_ucs4(&ucs4buf, length++, code);
1111    }
1112
1113  /* Convert the UCS-4 buffer to UTF-8, assuming an average of 2 bytes
1114     per code point for encoding. The buffer will grow as
1115     necessary. */
1116  svn_membuf__create(&resultbuf, length * 2, result_pool);
1117  SVN_ERR(svn_utf__encode_ucs4_string(
1118              &resultbuf, ucs4buf.data, length, &length));
1119
1120  res = apr_palloc(result_pool, sizeof(*res));
1121  res->data = resultbuf.data;
1122  res->len = length;
1123  *result = res;
1124  return SVN_NO_ERROR;
1125}
1126
1127
1128svn_error_t *
1129svn_utf__utf32_to_utf8(const svn_string_t **result,
1130                       const apr_int32_t *utf32str,
1131                       apr_size_t utf32len,
1132                       svn_boolean_t big_endian,
1133                       apr_pool_t *result_pool,
1134                       apr_pool_t *scratch_pool)
1135{
1136  static const apr_int32_t endiancheck = 0xa5cbbc5a;
1137  const svn_boolean_t arch_big_endian =
1138    (((const char*)&endiancheck)[sizeof(endiancheck) - 1] == '\x5a');
1139  const svn_boolean_t swap_order = (!big_endian != !arch_big_endian);
1140
1141  apr_size_t length;
1142  svn_membuf_t resultbuf;
1143  svn_string_t *res;
1144
1145  if (utf32len == SVN_UTF__UNKNOWN_LENGTH)
1146    {
1147      const apr_int32_t *endp = utf32str;
1148      while (*endp++)
1149        ;
1150      utf32len = (endp - utf32str);
1151    }
1152
1153  if (swap_order)
1154    {
1155      apr_size_t offset;
1156      svn_membuf_t ucs4buf;
1157
1158      svn_membuf__create(&ucs4buf, utf32len * sizeof(apr_int32_t),
1159                         scratch_pool);
1160
1161      for (offset = 0; offset < utf32len; ++offset)
1162        {
1163          const apr_int32_t code = SWAP_LONG(utf32str[offset]);
1164          membuf_insert_ucs4(&ucs4buf, offset, code);
1165        }
1166      utf32str = ucs4buf.data;
1167    }
1168
1169  /* Convert the UCS-4 buffer to UTF-8, assuming an average of 2 bytes
1170     per code point for encoding. The buffer will grow as
1171     necessary. */
1172  svn_membuf__create(&resultbuf, utf32len * 2, result_pool);
1173  SVN_ERR(svn_utf__encode_ucs4_string(
1174              &resultbuf, utf32str, utf32len, &length));
1175
1176  res = apr_palloc(result_pool, sizeof(*res));
1177  res->data = resultbuf.data;
1178  res->len = length;
1179  *result = res;
1180  return SVN_NO_ERROR;
1181}
1182
1183
1184#ifdef WIN32
1185
1186
1187svn_error_t *
1188svn_utf__win32_utf8_to_utf16(const WCHAR **result,
1189                             const char *src,
1190                             const WCHAR *prefix,
1191                             apr_pool_t *result_pool)
1192{
1193  const int utf8_count = strlen(src);
1194  const int prefix_len = (prefix ? lstrlenW(prefix) : 0);
1195  WCHAR *wide_str;
1196  int wide_count;
1197
1198  if (0 == prefix_len + utf8_count)
1199    {
1200      *result = L"";
1201      return SVN_NO_ERROR;
1202    }
1203
1204  wide_count = MultiByteToWideChar(CP_UTF8, 0, src, utf8_count, NULL, 0);
1205  if (wide_count == 0)
1206    return svn_error_wrap_apr(apr_get_os_error(),
1207                              _("Conversion to UTF-16 failed"));
1208
1209  wide_str = apr_palloc(result_pool,
1210                        (prefix_len + wide_count + 1) * sizeof(*wide_str));
1211  if (prefix_len)
1212    memcpy(wide_str, prefix, prefix_len * sizeof(*wide_str));
1213  if (0 == MultiByteToWideChar(CP_UTF8, 0, src, utf8_count,
1214                               wide_str + prefix_len, wide_count))
1215    return svn_error_wrap_apr(apr_get_os_error(),
1216                              _("Conversion to UTF-16 failed"));
1217
1218  wide_str[prefix_len + wide_count] = 0;
1219  *result = wide_str;
1220
1221  return SVN_NO_ERROR;
1222}
1223
1224svn_error_t *
1225svn_utf__win32_utf16_to_utf8(const char **result,
1226                             const WCHAR *src,
1227                             const char *prefix,
1228                             apr_pool_t *result_pool)
1229{
1230  const int wide_count = lstrlenW(src);
1231  const int prefix_len = (prefix ? strlen(prefix) : 0);
1232  char *utf8_str;
1233  int utf8_count;
1234
1235  if (0 == prefix_len + wide_count)
1236    {
1237      *result = "";
1238      return SVN_NO_ERROR;
1239    }
1240
1241  utf8_count = WideCharToMultiByte(CP_UTF8, 0, src, wide_count,
1242                                   NULL, 0, NULL, FALSE);
1243  if (utf8_count == 0)
1244    return svn_error_wrap_apr(apr_get_os_error(),
1245                              _("Conversion from UTF-16 failed"));
1246
1247  utf8_str = apr_palloc(result_pool,
1248                        (prefix_len + utf8_count + 1) * sizeof(*utf8_str));
1249  if (prefix_len)
1250    memcpy(utf8_str, prefix, prefix_len * sizeof(*utf8_str));
1251  if (0 == WideCharToMultiByte(CP_UTF8, 0, src, wide_count,
1252                               utf8_str + prefix_len, utf8_count,
1253                               NULL, FALSE))
1254    return svn_error_wrap_apr(apr_get_os_error(),
1255                              _("Conversion from UTF-16 failed"));
1256
1257  utf8_str[prefix_len + utf8_count] = 0;
1258  *result = utf8_str;
1259
1260  return SVN_NO_ERROR;
1261}
1262
1263#endif /* WIN32 */
1264