1251881Speter/*
2251881Speter * utf.c:  UTF-8 conversion routines
3251881Speter *
4251881Speter * ====================================================================
5251881Speter *    Licensed to the Apache Software Foundation (ASF) under one
6251881Speter *    or more contributor license agreements.  See the NOTICE file
7251881Speter *    distributed with this work for additional information
8251881Speter *    regarding copyright ownership.  The ASF licenses this file
9251881Speter *    to you under the Apache License, Version 2.0 (the
10251881Speter *    "License"); you may not use this file except in compliance
11251881Speter *    with the License.  You may obtain a copy of the License at
12251881Speter *
13251881Speter *      http://www.apache.org/licenses/LICENSE-2.0
14251881Speter *
15251881Speter *    Unless required by applicable law or agreed to in writing,
16251881Speter *    software distributed under the License is distributed on an
17251881Speter *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
18251881Speter *    KIND, either express or implied.  See the License for the
19251881Speter *    specific language governing permissions and limitations
20251881Speter *    under the License.
21251881Speter * ====================================================================
22251881Speter */
23251881Speter
24251881Speter
25251881Speter
26251881Speter#include <stdlib.h>
27251881Speter#include <string.h>
28251881Speter#include <assert.h>
29251881Speter
30251881Speter#include <apr_strings.h>
31251881Speter#include <apr_lib.h>
32251881Speter#include <apr_xlate.h>
33251881Speter#include <apr_atomic.h>
34251881Speter
35251881Speter#include "svn_hash.h"
36251881Speter#include "svn_string.h"
37251881Speter#include "svn_error.h"
38251881Speter#include "svn_pools.h"
39251881Speter#include "svn_ctype.h"
40251881Speter#include "svn_utf.h"
41251881Speter#include "svn_private_config.h"
42251881Speter#include "win32_xlate.h"
43251881Speter
44251881Speter#include "private/svn_utf_private.h"
45251881Speter#include "private/svn_dep_compat.h"
46251881Speter#include "private/svn_string_private.h"
47251881Speter#include "private/svn_mutex.h"
48251881Speter
49251881Speter
50251881Speter
51251881Speter/* Use these static strings to maximize performance on standard conversions.
52251881Speter * Any strings on other locations are still valid, however.
53251881Speter */
54251881Speterstatic const char *SVN_UTF_NTOU_XLATE_HANDLE = "svn-utf-ntou-xlate-handle";
55251881Speterstatic const char *SVN_UTF_UTON_XLATE_HANDLE = "svn-utf-uton-xlate-handle";
56251881Speter
57251881Speterstatic const char *SVN_APR_UTF8_CHARSET = "UTF-8";
58251881Speter
59251881Speterstatic svn_mutex__t *xlate_handle_mutex = NULL;
60251881Speterstatic svn_boolean_t assume_native_charset_is_utf8 = FALSE;
61251881Speter
62299742Sdim#if defined(WIN32)
63299742Sdimtypedef svn_subr__win32_xlate_t xlate_handle_t;
64299742Sdim#else
65299742Sdimtypedef apr_xlate_t xlate_handle_t;
66299742Sdim#endif
67299742Sdim
68251881Speter/* The xlate handle cache is a global hash table with linked lists of xlate
69251881Speter * handles.  In multi-threaded environments, a thread "borrows" an xlate
70251881Speter * handle from the cache during a translation and puts it back afterwards.
71251881Speter * This avoids holding a global lock for all translations.
72251881Speter * If there is no handle for a particular key when needed, a new is
73251881Speter * handle is created and put in the cache after use.
74251881Speter * This means that there will be at most N handles open for a key, where N
75251881Speter * is the number of simultanous handles in use for that key. */
76251881Speter
77251881Spetertypedef struct xlate_handle_node_t {
78299742Sdim  xlate_handle_t *handle;
79251881Speter  /* FALSE if the handle is not valid, since its pool is being
80251881Speter     destroyed. */
81251881Speter  svn_boolean_t valid;
82251881Speter  /* The name of a char encoding or APR_LOCALE_CHARSET. */
83251881Speter  const char *frompage, *topage;
84251881Speter  struct xlate_handle_node_t *next;
85251881Speter} xlate_handle_node_t;
86251881Speter
87251881Speter/* This maps const char * userdata_key strings to xlate_handle_node_t **
88251881Speter   handles to the first entry in the linked list of xlate handles.  We don't
89251881Speter   store the pointer to the list head directly in the hash table, since we
90251881Speter   remove/insert entries at the head in the list in the code below, and
91251881Speter   we can't use apr_hash_set() in each character translation because that
92251881Speter   function allocates memory in each call where the value is non-NULL.
93251881Speter   Since these allocations take place in a global pool, this would be a
94251881Speter   memory leak. */
95251881Speterstatic apr_hash_t *xlate_handle_hash = NULL;
96251881Speter
97251881Speter/* "1st level cache" to standard conversion maps. We may access these
98251881Speter * using atomic xchange ops, i.e. without further thread synchronization.
99251881Speter * If the respective item is NULL, fallback to hash lookup.
100251881Speter */
101251881Speterstatic void * volatile xlat_ntou_static_handle = NULL;
102251881Speterstatic void * volatile xlat_uton_static_handle = NULL;
103251881Speter
104251881Speter/* Clean up the xlate handle cache. */
105251881Speterstatic apr_status_t
106251881Speterxlate_cleanup(void *arg)
107251881Speter{
108251881Speter  /* We set the cache variables to NULL so that translation works in other
109251881Speter     cleanup functions, even if it isn't cached then. */
110251881Speter  xlate_handle_hash = NULL;
111251881Speter
112251881Speter  /* ensure no stale objects get accessed */
113251881Speter  xlat_ntou_static_handle = NULL;
114251881Speter  xlat_uton_static_handle = NULL;
115251881Speter
116251881Speter  return APR_SUCCESS;
117251881Speter}
118251881Speter
119251881Speter/* Set the handle of ARG to NULL. */
120251881Speterstatic apr_status_t
121251881Speterxlate_handle_node_cleanup(void *arg)
122251881Speter{
123251881Speter  xlate_handle_node_t *node = arg;
124251881Speter
125251881Speter  node->valid = FALSE;
126251881Speter  return APR_SUCCESS;
127251881Speter}
128251881Speter
129251881Spetervoid
130251881Spetersvn_utf_initialize2(svn_boolean_t assume_native_utf8,
131251881Speter                    apr_pool_t *pool)
132251881Speter{
133251881Speter  if (!xlate_handle_hash)
134251881Speter    {
135251881Speter      /* We create our own subpool, which we protect with the mutex.
136251881Speter         We can't use the pool passed to us by the caller, since we will
137251881Speter         use it for xlate handle allocations, possibly in multiple threads,
138251881Speter         and pool allocation is not thread-safe. */
139251881Speter      apr_pool_t *subpool = svn_pool_create(pool);
140251881Speter      svn_mutex__t *mutex;
141251881Speter      svn_error_t *err = svn_mutex__init(&mutex, TRUE, subpool);
142251881Speter      if (err)
143251881Speter        {
144251881Speter          svn_error_clear(err);
145251881Speter          return;
146251881Speter        }
147251881Speter
148251881Speter      xlate_handle_mutex = mutex;
149251881Speter      xlate_handle_hash = apr_hash_make(subpool);
150251881Speter
151251881Speter      apr_pool_cleanup_register(subpool, NULL, xlate_cleanup,
152251881Speter                                apr_pool_cleanup_null);
153251881Speter    }
154251881Speter
155251881Speter    if (!assume_native_charset_is_utf8)
156251881Speter      assume_native_charset_is_utf8 = assume_native_utf8;
157251881Speter}
158251881Speter
159251881Speter/* Return a unique string key based on TOPAGE and FROMPAGE.  TOPAGE and
160251881Speter * FROMPAGE can be any valid arguments of the same name to
161251881Speter * apr_xlate_open().  Allocate the returned string in POOL. */
162251881Speterstatic const char*
163251881Speterget_xlate_key(const char *topage,
164251881Speter              const char *frompage,
165251881Speter              apr_pool_t *pool)
166251881Speter{
167251881Speter  /* In the cases of SVN_APR_LOCALE_CHARSET and SVN_APR_DEFAULT_CHARSET
168251881Speter   * topage/frompage is really an int, not a valid string.  So generate a
169251881Speter   * unique key accordingly. */
170251881Speter  if (frompage == SVN_APR_LOCALE_CHARSET)
171251881Speter    frompage = "APR_LOCALE_CHARSET";
172251881Speter  else if (frompage == SVN_APR_DEFAULT_CHARSET)
173251881Speter    frompage = "APR_DEFAULT_CHARSET";
174251881Speter
175251881Speter  if (topage == SVN_APR_LOCALE_CHARSET)
176251881Speter    topage = "APR_LOCALE_CHARSET";
177251881Speter  else if (topage == SVN_APR_DEFAULT_CHARSET)
178251881Speter    topage = "APR_DEFAULT_CHARSET";
179251881Speter
180251881Speter  return apr_pstrcat(pool, "svn-utf-", frompage, "to", topage,
181299742Sdim                     "-xlate-handle", SVN_VA_NULL);
182251881Speter}
183251881Speter
184251881Speter/* Atomically replace the content in *MEM with NEW_VALUE and return
185251881Speter * the previous content of *MEM. If atomicy cannot be guaranteed,
186251881Speter * *MEM will not be modified and NEW_VALUE is simply returned to
187251881Speter * the caller.
188251881Speter */
189251881Speterstatic APR_INLINE void*
190251881Speteratomic_swap(void * volatile * mem, void *new_value)
191251881Speter{
192251881Speter#if APR_HAS_THREADS
193251881Speter  /* Cast is necessary because of APR bug:
194251881Speter     https://issues.apache.org/bugzilla/show_bug.cgi?id=50731 */
195251881Speter   return apr_atomic_xchgptr((volatile void **)mem, new_value);
196251881Speter#else
197251881Speter   /* no threads - no sync. necessary */
198251881Speter   void *old_value = (void*)*mem;
199251881Speter   *mem = new_value;
200251881Speter   return old_value;
201251881Speter#endif
202251881Speter}
203251881Speter
204251881Speter/* Set *RET to a newly created handle node for converting from FROMPAGE
205251881Speter   to TOPAGE, If apr_xlate_open() returns APR_EINVAL or APR_ENOTIMPL, set
206251881Speter   (*RET)->handle to NULL.  If fail for any other reason, return the error.
207251881Speter   Allocate *RET and its xlate handle in POOL. */
208251881Speterstatic svn_error_t *
209251881Speterxlate_alloc_handle(xlate_handle_node_t **ret,
210251881Speter                   const char *topage, const char *frompage,
211251881Speter                   apr_pool_t *pool)
212251881Speter{
213251881Speter  apr_status_t apr_err;
214299742Sdim  xlate_handle_t *handle;
215262253Speter  const char *name;
216251881Speter
217251881Speter  /* The error handling doesn't support the following cases, since we don't
218251881Speter     use them currently.  Catch this here. */
219251881Speter  SVN_ERR_ASSERT(frompage != SVN_APR_DEFAULT_CHARSET
220251881Speter                 && topage != SVN_APR_DEFAULT_CHARSET
221251881Speter                 && (frompage != SVN_APR_LOCALE_CHARSET
222251881Speter                     || topage != SVN_APR_LOCALE_CHARSET));
223251881Speter
224251881Speter  /* Try to create a handle. */
225251881Speter#if defined(WIN32)
226299742Sdim  apr_err = svn_subr__win32_xlate_open(&handle, topage,
227251881Speter                                       frompage, pool);
228262253Speter  name = "win32-xlate: ";
229251881Speter#else
230251881Speter  apr_err = apr_xlate_open(&handle, topage, frompage, pool);
231262253Speter  name = "APR: ";
232251881Speter#endif
233251881Speter
234251881Speter  if (APR_STATUS_IS_EINVAL(apr_err) || APR_STATUS_IS_ENOTIMPL(apr_err))
235251881Speter    handle = NULL;
236251881Speter  else if (apr_err != APR_SUCCESS)
237251881Speter    {
238251881Speter      const char *errstr;
239253734Speter      char apr_strerr[512];
240253734Speter
241251881Speter      /* Can't use svn_error_wrap_apr here because it calls functions in
242251881Speter         this file, leading to infinite recursion. */
243251881Speter      if (frompage == SVN_APR_LOCALE_CHARSET)
244251881Speter        errstr = apr_psprintf(pool,
245251881Speter                              _("Can't create a character converter from "
246251881Speter                                "native encoding to '%s'"), topage);
247251881Speter      else if (topage == SVN_APR_LOCALE_CHARSET)
248251881Speter        errstr = apr_psprintf(pool,
249251881Speter                              _("Can't create a character converter from "
250251881Speter                                "'%s' to native encoding"), frompage);
251251881Speter      else
252251881Speter        errstr = apr_psprintf(pool,
253251881Speter                              _("Can't create a character converter from "
254251881Speter                                "'%s' to '%s'"), frompage, topage);
255251881Speter
256253734Speter      /* Just put the error on the stack, since svn_error_create duplicates it
257253734Speter         later.  APR_STRERR will be in the local encoding, not in UTF-8, though.
258253734Speter       */
259253734Speter      svn_strerror(apr_err, apr_strerr, sizeof(apr_strerr));
260299742Sdim      return svn_error_createf(SVN_ERR_PLUGIN_LOAD_FAILURE,
261262253Speter                               svn_error_create(apr_err, NULL, apr_strerr),
262262253Speter                               "%s%s", name, errstr);
263251881Speter    }
264251881Speter
265251881Speter  /* Allocate and initialize the node. */
266251881Speter  *ret = apr_palloc(pool, sizeof(xlate_handle_node_t));
267251881Speter  (*ret)->handle = handle;
268251881Speter  (*ret)->valid = TRUE;
269251881Speter  (*ret)->frompage = ((frompage != SVN_APR_LOCALE_CHARSET)
270251881Speter                      ? apr_pstrdup(pool, frompage) : frompage);
271251881Speter  (*ret)->topage = ((topage != SVN_APR_LOCALE_CHARSET)
272251881Speter                    ? apr_pstrdup(pool, topage) : topage);
273251881Speter  (*ret)->next = NULL;
274251881Speter
275251881Speter  /* If we are called from inside a pool cleanup handler, the just created
276251881Speter     xlate handle will be closed when that handler returns by a newly
277251881Speter     registered cleanup handler, however, the handle is still cached by us.
278251881Speter     To prevent this, we register a cleanup handler that will reset the valid
279251881Speter     flag of our node, so we don't use an invalid handle. */
280251881Speter  if (handle)
281251881Speter    apr_pool_cleanup_register(pool, *ret, xlate_handle_node_cleanup,
282251881Speter                              apr_pool_cleanup_null);
283251881Speter
284251881Speter  return SVN_NO_ERROR;
285251881Speter}
286251881Speter
287251881Speter/* Extend xlate_alloc_handle by using USERDATA_KEY as a key in our
288251881Speter   global hash map, if available.
289251881Speter
290251881Speter   Allocate *RET and its xlate handle in POOL if svn_utf_initialize()
291251881Speter   hasn't been called or USERDATA_KEY is NULL.  Else, allocate them
292251881Speter   in the pool of xlate_handle_hash.
293251881Speter
294251881Speter   Note: this function is not thread-safe. Call get_xlate_handle_node
295251881Speter   instead. */
296251881Speterstatic svn_error_t *
297251881Speterget_xlate_handle_node_internal(xlate_handle_node_t **ret,
298251881Speter                               const char *topage, const char *frompage,
299251881Speter                               const char *userdata_key, apr_pool_t *pool)
300251881Speter{
301251881Speter  /* If we already have a handle, just return it. */
302251881Speter  if (userdata_key && xlate_handle_hash)
303251881Speter    {
304251881Speter      xlate_handle_node_t *old_node = NULL;
305251881Speter
306251881Speter      /* 2nd level: hash lookup */
307251881Speter      xlate_handle_node_t **old_node_p = svn_hash_gets(xlate_handle_hash,
308251881Speter                                                       userdata_key);
309251881Speter      if (old_node_p)
310251881Speter        old_node = *old_node_p;
311251881Speter      if (old_node)
312251881Speter        {
313251881Speter          /* Ensure that the handle is still valid. */
314251881Speter          if (old_node->valid)
315251881Speter            {
316251881Speter              /* Remove from the list. */
317251881Speter              *old_node_p = old_node->next;
318251881Speter              old_node->next = NULL;
319251881Speter              *ret = old_node;
320251881Speter              return SVN_NO_ERROR;
321251881Speter            }
322251881Speter        }
323251881Speter    }
324251881Speter
325251881Speter  /* Note that we still have the mutex locked (if it is initialized), so we
326251881Speter     can use the global pool for creating the new xlate handle. */
327251881Speter
328251881Speter  /* Use the correct pool for creating the handle. */
329251881Speter  pool = apr_hash_pool_get(xlate_handle_hash);
330251881Speter
331251881Speter  return xlate_alloc_handle(ret, topage, frompage, pool);
332251881Speter}
333251881Speter
334251881Speter/* Set *RET to a handle node for converting from FROMPAGE to TOPAGE,
335251881Speter   creating the handle node if it doesn't exist in USERDATA_KEY.
336251881Speter   If a node is not cached and apr_xlate_open() returns APR_EINVAL or
337251881Speter   APR_ENOTIMPL, set (*RET)->handle to NULL.  If fail for any other
338251881Speter   reason, return the error.
339251881Speter
340251881Speter   Allocate *RET and its xlate handle in POOL if svn_utf_initialize()
341251881Speter   hasn't been called or USERDATA_KEY is NULL.  Else, allocate them
342251881Speter   in the pool of xlate_handle_hash. */
343251881Speterstatic svn_error_t *
344251881Speterget_xlate_handle_node(xlate_handle_node_t **ret,
345251881Speter                      const char *topage, const char *frompage,
346251881Speter                      const char *userdata_key, apr_pool_t *pool)
347251881Speter{
348251881Speter  xlate_handle_node_t *old_node = NULL;
349251881Speter
350251881Speter  /* If we already have a handle, just return it. */
351251881Speter  if (userdata_key)
352251881Speter    {
353251881Speter      if (xlate_handle_hash)
354251881Speter        {
355251881Speter          /* 1st level: global, static items */
356251881Speter          if (userdata_key == SVN_UTF_NTOU_XLATE_HANDLE)
357251881Speter            old_node = atomic_swap(&xlat_ntou_static_handle, NULL);
358251881Speter          else if (userdata_key == SVN_UTF_UTON_XLATE_HANDLE)
359251881Speter            old_node = atomic_swap(&xlat_uton_static_handle, NULL);
360251881Speter
361251881Speter          if (old_node && old_node->valid)
362251881Speter            {
363251881Speter              *ret = old_node;
364251881Speter              return SVN_NO_ERROR;
365251881Speter            }
366251881Speter        }
367251881Speter      else
368251881Speter        {
369251881Speter          void *p;
370251881Speter          /* We fall back on a per-pool cache instead. */
371251881Speter          apr_pool_userdata_get(&p, userdata_key, pool);
372251881Speter          old_node = p;
373251881Speter          /* Ensure that the handle is still valid. */
374251881Speter          if (old_node && old_node->valid)
375251881Speter            {
376251881Speter              *ret = old_node;
377251881Speter              return SVN_NO_ERROR;
378251881Speter            }
379251881Speter
380251881Speter          return xlate_alloc_handle(ret, topage, frompage, pool);
381251881Speter        }
382251881Speter    }
383251881Speter
384251881Speter  SVN_MUTEX__WITH_LOCK(xlate_handle_mutex,
385251881Speter                       get_xlate_handle_node_internal(ret,
386251881Speter                                                      topage,
387251881Speter                                                      frompage,
388251881Speter                                                      userdata_key,
389251881Speter                                                      pool));
390251881Speter
391251881Speter  return SVN_NO_ERROR;
392251881Speter}
393251881Speter
394251881Speter/* Put back NODE into the xlate handle cache for use by other calls.
395251881Speter
396251881Speter   Note: this function is not thread-safe. Call put_xlate_handle_node
397251881Speter   instead. */
398251881Speterstatic svn_error_t *
399251881Speterput_xlate_handle_node_internal(xlate_handle_node_t *node,
400251881Speter                               const char *userdata_key)
401251881Speter{
402251881Speter  xlate_handle_node_t **node_p = svn_hash_gets(xlate_handle_hash, userdata_key);
403251881Speter  if (node_p == NULL)
404251881Speter    {
405251881Speter      userdata_key = apr_pstrdup(apr_hash_pool_get(xlate_handle_hash),
406251881Speter                                  userdata_key);
407251881Speter      node_p = apr_palloc(apr_hash_pool_get(xlate_handle_hash),
408251881Speter                          sizeof(*node_p));
409251881Speter      *node_p = NULL;
410251881Speter      svn_hash_sets(xlate_handle_hash, userdata_key, node_p);
411251881Speter    }
412251881Speter  node->next = *node_p;
413251881Speter  *node_p = node;
414251881Speter
415251881Speter  return SVN_NO_ERROR;
416251881Speter}
417251881Speter
418251881Speter/* Put back NODE into the xlate handle cache for use by other calls.
419251881Speter   If there is no global cache, store the handle in POOL.
420251881Speter   Ignore errors related to locking/unlocking the mutex. */
421251881Speterstatic svn_error_t *
422251881Speterput_xlate_handle_node(xlate_handle_node_t *node,
423251881Speter                      const char *userdata_key,
424251881Speter                      apr_pool_t *pool)
425251881Speter{
426251881Speter  assert(node->next == NULL);
427251881Speter  if (!userdata_key)
428251881Speter    return SVN_NO_ERROR;
429251881Speter
430251881Speter  /* push previous global node to the hash */
431251881Speter  if (xlate_handle_hash)
432251881Speter    {
433251881Speter      /* 1st level: global, static items */
434251881Speter      if (userdata_key == SVN_UTF_NTOU_XLATE_HANDLE)
435251881Speter        node = atomic_swap(&xlat_ntou_static_handle, node);
436251881Speter      else if (userdata_key == SVN_UTF_UTON_XLATE_HANDLE)
437251881Speter        node = atomic_swap(&xlat_uton_static_handle, node);
438251881Speter      if (node == NULL)
439251881Speter        return SVN_NO_ERROR;
440251881Speter
441251881Speter      SVN_MUTEX__WITH_LOCK(xlate_handle_mutex,
442251881Speter                           put_xlate_handle_node_internal(node,
443251881Speter                                                          userdata_key));
444251881Speter    }
445251881Speter  else
446251881Speter    {
447251881Speter      /* Store it in the per-pool cache. */
448251881Speter      apr_pool_userdata_set(node, userdata_key, apr_pool_cleanup_null, pool);
449251881Speter    }
450251881Speter
451251881Speter  return SVN_NO_ERROR;
452251881Speter}
453251881Speter
454251881Speter/* Return the apr_xlate handle for converting native characters to UTF-8. */
455251881Speterstatic svn_error_t *
456251881Speterget_ntou_xlate_handle_node(xlate_handle_node_t **ret, apr_pool_t *pool)
457251881Speter{
458251881Speter  return get_xlate_handle_node(ret, SVN_APR_UTF8_CHARSET,
459251881Speter                               assume_native_charset_is_utf8
460251881Speter                                 ? SVN_APR_UTF8_CHARSET
461251881Speter                                 : SVN_APR_LOCALE_CHARSET,
462251881Speter                               SVN_UTF_NTOU_XLATE_HANDLE, pool);
463251881Speter}
464251881Speter
465251881Speter
466251881Speter/* Return the apr_xlate handle for converting UTF-8 to native characters.
467251881Speter   Create one if it doesn't exist.  If unable to find a handle, or
468251881Speter   unable to create one because apr_xlate_open returned APR_EINVAL, then
469251881Speter   set *RET to null and return SVN_NO_ERROR; if fail for some other
470251881Speter   reason, return error. */
471251881Speterstatic svn_error_t *
472251881Speterget_uton_xlate_handle_node(xlate_handle_node_t **ret, apr_pool_t *pool)
473251881Speter{
474251881Speter  return get_xlate_handle_node(ret,
475251881Speter                               assume_native_charset_is_utf8
476251881Speter                                 ? SVN_APR_UTF8_CHARSET
477251881Speter                                 : SVN_APR_LOCALE_CHARSET,
478251881Speter                               SVN_APR_UTF8_CHARSET,
479251881Speter                               SVN_UTF_UTON_XLATE_HANDLE, pool);
480251881Speter}
481251881Speter
482251881Speter
483251881Speter/* Convert SRC_LENGTH bytes of SRC_DATA in NODE->handle, store the result
484251881Speter   in *DEST, which is allocated in POOL. */
485251881Speterstatic svn_error_t *
486251881Speterconvert_to_stringbuf(xlate_handle_node_t *node,
487251881Speter                     const char *src_data,
488251881Speter                     apr_size_t src_length,
489251881Speter                     svn_stringbuf_t **dest,
490251881Speter                     apr_pool_t *pool)
491251881Speter{
492251881Speter#ifdef WIN32
493251881Speter  apr_status_t apr_err;
494251881Speter
495299742Sdim  apr_err = svn_subr__win32_xlate_to_stringbuf(node->handle, src_data,
496299742Sdim                                               src_length, dest, pool);
497251881Speter#else
498251881Speter  apr_size_t buflen = src_length * 2;
499251881Speter  apr_status_t apr_err;
500251881Speter  apr_size_t srclen = src_length;
501251881Speter  apr_size_t destlen = buflen;
502251881Speter
503251881Speter  /* Initialize *DEST to an empty stringbuf.
504251881Speter     A 1:2 ratio of input bytes to output bytes (as assigned above)
505251881Speter     should be enough for most translations, and if it turns out not
506251881Speter     to be enough, we'll grow the buffer again, sizing it based on a
507251881Speter     1:3 ratio of the remainder of the string. */
508251881Speter  *dest = svn_stringbuf_create_ensure(buflen + 1, pool);
509251881Speter
510251881Speter  /* Not only does it not make sense to convert an empty string, but
511251881Speter     apr-iconv is quite unreasonable about not allowing that. */
512251881Speter  if (src_length == 0)
513251881Speter    return SVN_NO_ERROR;
514251881Speter
515251881Speter  do
516251881Speter    {
517251881Speter      /* Set up state variables for xlate. */
518251881Speter      destlen = buflen - (*dest)->len;
519251881Speter
520251881Speter      /* Attempt the conversion. */
521251881Speter      apr_err = apr_xlate_conv_buffer(node->handle,
522251881Speter                                      src_data + (src_length - srclen),
523251881Speter                                      &srclen,
524251881Speter                                      (*dest)->data + (*dest)->len,
525251881Speter                                      &destlen);
526251881Speter
527251881Speter      /* Now, update the *DEST->len to track the amount of output data
528251881Speter         churned out so far from this loop. */
529251881Speter      (*dest)->len += ((buflen - (*dest)->len) - destlen);
530251881Speter      buflen += srclen * 3; /* 3 is middle ground, 2 wasn't enough
531251881Speter                               for all characters in the buffer, 4 is
532251881Speter                               maximum character size (currently) */
533251881Speter
534251881Speter
535251881Speter    } while (apr_err == APR_SUCCESS && srclen != 0);
536251881Speter#endif
537251881Speter
538251881Speter  /* If we exited the loop with an error, return the error. */
539251881Speter  if (apr_err)
540251881Speter    {
541251881Speter      const char *errstr;
542251881Speter      svn_error_t *err;
543251881Speter
544251881Speter      /* Can't use svn_error_wrap_apr here because it calls functions in
545251881Speter         this file, leading to infinite recursion. */
546251881Speter      if (node->frompage == SVN_APR_LOCALE_CHARSET)
547251881Speter        errstr = apr_psprintf
548251881Speter          (pool, _("Can't convert string from native encoding to '%s':"),
549251881Speter           node->topage);
550251881Speter      else if (node->topage == SVN_APR_LOCALE_CHARSET)
551251881Speter        errstr = apr_psprintf
552251881Speter          (pool, _("Can't convert string from '%s' to native encoding:"),
553251881Speter           node->frompage);
554251881Speter      else
555251881Speter        errstr = apr_psprintf
556251881Speter          (pool, _("Can't convert string from '%s' to '%s':"),
557251881Speter           node->frompage, node->topage);
558251881Speter
559299742Sdim      err = svn_error_create(
560299742Sdim          apr_err, NULL, svn_utf__fuzzy_escape(src_data, src_length, pool));
561251881Speter      return svn_error_create(apr_err, err, errstr);
562251881Speter    }
563251881Speter  /* Else, exited due to success.  Trim the result buffer down to the
564251881Speter     right length. */
565251881Speter  (*dest)->data[(*dest)->len] = '\0';
566251881Speter
567251881Speter  return SVN_NO_ERROR;
568251881Speter}
569251881Speter
570251881Speter
571251881Speter/* Return APR_EINVAL if the first LEN bytes of DATA contain anything
572251881Speter   other than seven-bit, non-control (except for whitespace) ASCII
573251881Speter   characters, finding the error pool from POOL.  Otherwise, return
574251881Speter   SVN_NO_ERROR. */
575251881Speterstatic svn_error_t *
576251881Spetercheck_non_ascii(const char *data, apr_size_t len, apr_pool_t *pool)
577251881Speter{
578251881Speter  const char *data_start = data;
579251881Speter
580251881Speter  for (; len > 0; --len, data++)
581251881Speter    {
582251881Speter      if ((! svn_ctype_isascii(*data))
583251881Speter          || ((! svn_ctype_isspace(*data))
584251881Speter              && svn_ctype_iscntrl(*data)))
585251881Speter        {
586251881Speter          /* Show the printable part of the data, followed by the
587251881Speter             decimal code of the questionable character.  Because if a
588251881Speter             user ever gets this error, she's going to have to spend
589251881Speter             time tracking down the non-ASCII data, so we want to help
590251881Speter             as much as possible.  And yes, we just call the unsafe
591251881Speter             data "non-ASCII", even though the actual constraint is
592251881Speter             somewhat more complex than that. */
593251881Speter
594251881Speter          if (data - data_start)
595251881Speter            {
596251881Speter              const char *error_data
597251881Speter                = apr_pstrndup(pool, data_start, (data - data_start));
598251881Speter
599251881Speter              return svn_error_createf
600251881Speter                (APR_EINVAL, NULL,
601251881Speter                 _("Safe data '%s' was followed by non-ASCII byte %d: "
602251881Speter                   "unable to convert to/from UTF-8"),
603251881Speter                 error_data, *((const unsigned char *) data));
604251881Speter            }
605251881Speter          else
606251881Speter            {
607251881Speter              return svn_error_createf
608251881Speter                (APR_EINVAL, NULL,
609251881Speter                 _("Non-ASCII character (code %d) detected, "
610251881Speter                   "and unable to convert to/from UTF-8"),
611251881Speter                 *((const unsigned char *) data));
612251881Speter            }
613251881Speter        }
614251881Speter    }
615251881Speter
616251881Speter  return SVN_NO_ERROR;
617251881Speter}
618251881Speter
619251881Speter/* Construct an error with code APR_EINVAL and with a suitable message
620251881Speter * to describe the invalid UTF-8 sequence DATA of length LEN (which
621251881Speter * may have embedded NULLs).  We can't simply print the data, almost
622251881Speter * by definition we don't really know how it is encoded.
623251881Speter */
624251881Speterstatic svn_error_t *
625251881Speterinvalid_utf8(const char *data, apr_size_t len, apr_pool_t *pool)
626251881Speter{
627251881Speter  const char *last = svn_utf__last_valid(data, len);
628251881Speter  const char *valid_txt = "", *invalid_txt = "";
629251881Speter  apr_size_t i;
630251881Speter  size_t valid, invalid;
631251881Speter
632251881Speter  /* We will display at most 24 valid octets (this may split a leading
633251881Speter     multi-byte character) as that should fit on one 80 character line. */
634251881Speter  valid = last - data;
635251881Speter  if (valid > 24)
636251881Speter    valid = 24;
637251881Speter  for (i = 0; i < valid; ++i)
638251881Speter    valid_txt = apr_pstrcat(pool, valid_txt,
639251881Speter                            apr_psprintf(pool, " %02x",
640251881Speter                                         (unsigned char)last[i-valid]),
641299742Sdim                                         SVN_VA_NULL);
642251881Speter
643251881Speter  /* 4 invalid octets will guarantee that the faulty octet is displayed */
644251881Speter  invalid = data + len - last;
645251881Speter  if (invalid > 4)
646251881Speter    invalid = 4;
647251881Speter  for (i = 0; i < invalid; ++i)
648251881Speter    invalid_txt = apr_pstrcat(pool, invalid_txt,
649251881Speter                              apr_psprintf(pool, " %02x",
650251881Speter                                           (unsigned char)last[i]),
651299742Sdim                                           SVN_VA_NULL);
652251881Speter
653251881Speter  return svn_error_createf(APR_EINVAL, NULL,
654251881Speter                           _("Valid UTF-8 data\n(hex:%s)\n"
655251881Speter                             "followed by invalid UTF-8 sequence\n(hex:%s)"),
656251881Speter                           valid_txt, invalid_txt);
657251881Speter}
658251881Speter
659251881Speter/* Verify that the sequence DATA of length LEN is valid UTF-8.
660251881Speter   If it is not, return an error with code APR_EINVAL. */
661251881Speterstatic svn_error_t *
662251881Spetercheck_utf8(const char *data, apr_size_t len, apr_pool_t *pool)
663251881Speter{
664251881Speter  if (! svn_utf__is_valid(data, len))
665251881Speter    return invalid_utf8(data, len, pool);
666251881Speter  return SVN_NO_ERROR;
667251881Speter}
668251881Speter
669251881Speter/* Verify that the NULL terminated sequence DATA is valid UTF-8.
670251881Speter   If it is not, return an error with code APR_EINVAL. */
671251881Speterstatic svn_error_t *
672251881Spetercheck_cstring_utf8(const char *data, apr_pool_t *pool)
673251881Speter{
674251881Speter
675251881Speter  if (! svn_utf__cstring_is_valid(data))
676251881Speter    return invalid_utf8(data, strlen(data), pool);
677251881Speter  return SVN_NO_ERROR;
678251881Speter}
679251881Speter
680251881Speter
681251881Spetersvn_error_t *
682251881Spetersvn_utf_stringbuf_to_utf8(svn_stringbuf_t **dest,
683251881Speter                          const svn_stringbuf_t *src,
684251881Speter                          apr_pool_t *pool)
685251881Speter{
686251881Speter  xlate_handle_node_t *node;
687251881Speter  svn_error_t *err;
688251881Speter
689251881Speter  SVN_ERR(get_ntou_xlate_handle_node(&node, pool));
690251881Speter
691251881Speter  if (node->handle)
692251881Speter    {
693251881Speter      err = convert_to_stringbuf(node, src->data, src->len, dest, pool);
694251881Speter      if (! err)
695251881Speter        err = check_utf8((*dest)->data, (*dest)->len, pool);
696251881Speter    }
697251881Speter  else
698251881Speter    {
699251881Speter      err = check_non_ascii(src->data, src->len, pool);
700251881Speter      if (! err)
701251881Speter        *dest = svn_stringbuf_dup(src, pool);
702251881Speter    }
703251881Speter
704251881Speter  return svn_error_compose_create(err,
705251881Speter                                  put_xlate_handle_node
706251881Speter                                     (node,
707251881Speter                                      SVN_UTF_NTOU_XLATE_HANDLE,
708251881Speter                                      pool));
709251881Speter}
710251881Speter
711251881Speter
712251881Spetersvn_error_t *
713251881Spetersvn_utf_string_to_utf8(const svn_string_t **dest,
714251881Speter                       const svn_string_t *src,
715251881Speter                       apr_pool_t *pool)
716251881Speter{
717251881Speter  svn_stringbuf_t *destbuf;
718251881Speter  xlate_handle_node_t *node;
719251881Speter  svn_error_t *err;
720251881Speter
721251881Speter  SVN_ERR(get_ntou_xlate_handle_node(&node, pool));
722251881Speter
723251881Speter  if (node->handle)
724251881Speter    {
725251881Speter      err = convert_to_stringbuf(node, src->data, src->len, &destbuf, pool);
726251881Speter      if (! err)
727251881Speter        err = check_utf8(destbuf->data, destbuf->len, pool);
728251881Speter      if (! err)
729251881Speter        *dest = svn_stringbuf__morph_into_string(destbuf);
730251881Speter    }
731251881Speter  else
732251881Speter    {
733251881Speter      err = check_non_ascii(src->data, src->len, pool);
734251881Speter      if (! err)
735251881Speter        *dest = svn_string_dup(src, pool);
736251881Speter    }
737251881Speter
738251881Speter  return svn_error_compose_create(err,
739251881Speter                                  put_xlate_handle_node
740251881Speter                                     (node,
741251881Speter                                      SVN_UTF_NTOU_XLATE_HANDLE,
742251881Speter                                      pool));
743251881Speter}
744251881Speter
745251881Speter
746251881Speter/* Common implementation for svn_utf_cstring_to_utf8,
747251881Speter   svn_utf_cstring_to_utf8_ex, svn_utf_cstring_from_utf8 and
748251881Speter   svn_utf_cstring_from_utf8_ex. Convert SRC to DEST using NODE->handle as
749251881Speter   the translator and allocating from POOL. */
750251881Speterstatic svn_error_t *
751251881Speterconvert_cstring(const char **dest,
752251881Speter                const char *src,
753251881Speter                xlate_handle_node_t *node,
754251881Speter                apr_pool_t *pool)
755251881Speter{
756251881Speter  if (node->handle)
757251881Speter    {
758251881Speter      svn_stringbuf_t *destbuf;
759251881Speter      SVN_ERR(convert_to_stringbuf(node, src, strlen(src),
760251881Speter                                   &destbuf, pool));
761251881Speter      *dest = destbuf->data;
762251881Speter    }
763251881Speter  else
764251881Speter    {
765251881Speter      apr_size_t len = strlen(src);
766251881Speter      SVN_ERR(check_non_ascii(src, len, pool));
767251881Speter      *dest = apr_pstrmemdup(pool, src, len);
768251881Speter    }
769251881Speter  return SVN_NO_ERROR;
770251881Speter}
771251881Speter
772251881Speter
773251881Spetersvn_error_t *
774251881Spetersvn_utf_cstring_to_utf8(const char **dest,
775251881Speter                        const char *src,
776251881Speter                        apr_pool_t *pool)
777251881Speter{
778251881Speter  xlate_handle_node_t *node;
779251881Speter  svn_error_t *err;
780251881Speter
781251881Speter  SVN_ERR(get_ntou_xlate_handle_node(&node, pool));
782251881Speter  err = convert_cstring(dest, src, node, pool);
783251881Speter  SVN_ERR(svn_error_compose_create(err,
784251881Speter                                   put_xlate_handle_node
785251881Speter                                      (node,
786251881Speter                                       SVN_UTF_NTOU_XLATE_HANDLE,
787251881Speter                                       pool)));
788251881Speter  return check_cstring_utf8(*dest, pool);
789251881Speter}
790251881Speter
791251881Speter
792251881Spetersvn_error_t *
793251881Spetersvn_utf_cstring_to_utf8_ex2(const char **dest,
794251881Speter                            const char *src,
795251881Speter                            const char *frompage,
796251881Speter                            apr_pool_t *pool)
797251881Speter{
798251881Speter  xlate_handle_node_t *node;
799251881Speter  svn_error_t *err;
800251881Speter  const char *convset_key = get_xlate_key(SVN_APR_UTF8_CHARSET, frompage,
801251881Speter                                          pool);
802251881Speter
803251881Speter  SVN_ERR(get_xlate_handle_node(&node, SVN_APR_UTF8_CHARSET, frompage,
804251881Speter                                convset_key, pool));
805251881Speter  err = convert_cstring(dest, src, node, pool);
806251881Speter  SVN_ERR(svn_error_compose_create(err,
807251881Speter                                   put_xlate_handle_node
808251881Speter                                      (node,
809251881Speter                                       SVN_UTF_NTOU_XLATE_HANDLE,
810251881Speter                                       pool)));
811251881Speter
812251881Speter  return check_cstring_utf8(*dest, pool);
813251881Speter}
814251881Speter
815251881Speter
816251881Spetersvn_error_t *
817251881Spetersvn_utf_cstring_to_utf8_ex(const char **dest,
818251881Speter                           const char *src,
819251881Speter                           const char *frompage,
820251881Speter                           const char *convset_key,
821251881Speter                           apr_pool_t *pool)
822251881Speter{
823251881Speter  return svn_utf_cstring_to_utf8_ex2(dest, src, frompage, pool);
824251881Speter}
825251881Speter
826251881Speter
827251881Spetersvn_error_t *
828251881Spetersvn_utf_stringbuf_from_utf8(svn_stringbuf_t **dest,
829251881Speter                            const svn_stringbuf_t *src,
830251881Speter                            apr_pool_t *pool)
831251881Speter{
832251881Speter  xlate_handle_node_t *node;
833251881Speter  svn_error_t *err;
834251881Speter
835251881Speter  SVN_ERR(get_uton_xlate_handle_node(&node, pool));
836251881Speter
837251881Speter  if (node->handle)
838251881Speter    {
839251881Speter      err = check_utf8(src->data, src->len, pool);
840251881Speter      if (! err)
841251881Speter        err = convert_to_stringbuf(node, src->data, src->len, dest, pool);
842251881Speter    }
843251881Speter  else
844251881Speter    {
845251881Speter      err = check_non_ascii(src->data, src->len, pool);
846251881Speter      if (! err)
847251881Speter        *dest = svn_stringbuf_dup(src, pool);
848251881Speter    }
849251881Speter
850251881Speter  err = svn_error_compose_create(
851251881Speter          err,
852251881Speter          put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool));
853251881Speter
854251881Speter  return err;
855251881Speter}
856251881Speter
857251881Speter
858251881Spetersvn_error_t *
859251881Spetersvn_utf_string_from_utf8(const svn_string_t **dest,
860251881Speter                         const svn_string_t *src,
861251881Speter                         apr_pool_t *pool)
862251881Speter{
863251881Speter  svn_stringbuf_t *dbuf;
864251881Speter  xlate_handle_node_t *node;
865251881Speter  svn_error_t *err;
866251881Speter
867251881Speter  SVN_ERR(get_uton_xlate_handle_node(&node, pool));
868251881Speter
869251881Speter  if (node->handle)
870251881Speter    {
871251881Speter      err = check_utf8(src->data, src->len, pool);
872251881Speter      if (! err)
873251881Speter        err = convert_to_stringbuf(node, src->data, src->len,
874251881Speter                                   &dbuf, pool);
875251881Speter      if (! err)
876251881Speter        *dest = svn_stringbuf__morph_into_string(dbuf);
877251881Speter    }
878251881Speter  else
879251881Speter    {
880251881Speter      err = check_non_ascii(src->data, src->len, pool);
881251881Speter      if (! err)
882251881Speter        *dest = svn_string_dup(src, pool);
883251881Speter    }
884251881Speter
885251881Speter  err = svn_error_compose_create(
886251881Speter          err,
887251881Speter          put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool));
888251881Speter
889251881Speter  return err;
890251881Speter}
891251881Speter
892251881Speter
893251881Spetersvn_error_t *
894251881Spetersvn_utf_cstring_from_utf8(const char **dest,
895251881Speter                          const char *src,
896251881Speter                          apr_pool_t *pool)
897251881Speter{
898251881Speter  xlate_handle_node_t *node;
899251881Speter  svn_error_t *err;
900251881Speter
901251881Speter  SVN_ERR(check_cstring_utf8(src, pool));
902251881Speter
903251881Speter  SVN_ERR(get_uton_xlate_handle_node(&node, pool));
904251881Speter  err = convert_cstring(dest, src, node, pool);
905251881Speter  err = svn_error_compose_create(
906251881Speter          err,
907251881Speter          put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool));
908251881Speter
909251881Speter  return err;
910251881Speter}
911251881Speter
912251881Speter
913251881Spetersvn_error_t *
914251881Spetersvn_utf_cstring_from_utf8_ex2(const char **dest,
915251881Speter                              const char *src,
916251881Speter                              const char *topage,
917251881Speter                              apr_pool_t *pool)
918251881Speter{
919251881Speter  xlate_handle_node_t *node;
920251881Speter  svn_error_t *err;
921251881Speter  const char *convset_key = get_xlate_key(topage, SVN_APR_UTF8_CHARSET,
922251881Speter                                          pool);
923251881Speter
924251881Speter  SVN_ERR(check_cstring_utf8(src, pool));
925251881Speter
926251881Speter  SVN_ERR(get_xlate_handle_node(&node, topage, SVN_APR_UTF8_CHARSET,
927251881Speter                                convset_key, pool));
928251881Speter  err = convert_cstring(dest, src, node, pool);
929251881Speter  err = svn_error_compose_create(
930251881Speter          err,
931251881Speter          put_xlate_handle_node(node, convset_key, pool));
932251881Speter
933251881Speter  return err;
934251881Speter}
935251881Speter
936251881Speterconst char *
937251881Spetersvn_utf__cstring_from_utf8_fuzzy(const char *src,
938251881Speter                                 apr_pool_t *pool,
939251881Speter                                 svn_error_t *(*convert_from_utf8)
940251881Speter                                 (const char **, const char *, apr_pool_t *))
941251881Speter{
942251881Speter  const char *escaped, *converted;
943251881Speter  svn_error_t *err;
944251881Speter
945299742Sdim  escaped = svn_utf__fuzzy_escape(src, strlen(src), pool);
946251881Speter
947251881Speter  /* Okay, now we have a *new* UTF-8 string, one that's guaranteed to
948251881Speter     contain only 7-bit bytes :-).  Recode to native... */
949251881Speter  err = convert_from_utf8(((const char **) &converted), escaped, pool);
950251881Speter
951251881Speter  if (err)
952251881Speter    {
953251881Speter      svn_error_clear(err);
954251881Speter      return escaped;
955251881Speter    }
956251881Speter  else
957251881Speter    return converted;
958251881Speter
959251881Speter  /* ### Check the client locale, maybe we can avoid that second
960251881Speter   * conversion!  See Ulrich Drepper's patch at
961251881Speter   * http://subversion.tigris.org/issues/show_bug.cgi?id=807.
962251881Speter   */
963251881Speter}
964251881Speter
965251881Speter
966251881Speterconst char *
967251881Spetersvn_utf_cstring_from_utf8_fuzzy(const char *src,
968251881Speter                                apr_pool_t *pool)
969251881Speter{
970251881Speter  return svn_utf__cstring_from_utf8_fuzzy(src, pool,
971251881Speter                                          svn_utf_cstring_from_utf8);
972251881Speter}
973251881Speter
974251881Speter
975251881Spetersvn_error_t *
976251881Spetersvn_utf_cstring_from_utf8_stringbuf(const char **dest,
977251881Speter                                    const svn_stringbuf_t *src,
978251881Speter                                    apr_pool_t *pool)
979251881Speter{
980251881Speter  svn_stringbuf_t *destbuf;
981251881Speter
982251881Speter  SVN_ERR(svn_utf_stringbuf_from_utf8(&destbuf, src, pool));
983251881Speter  *dest = destbuf->data;
984251881Speter
985251881Speter  return SVN_NO_ERROR;
986251881Speter}
987251881Speter
988251881Speter
989251881Spetersvn_error_t *
990251881Spetersvn_utf_cstring_from_utf8_string(const char **dest,
991251881Speter                                 const svn_string_t *src,
992251881Speter                                 apr_pool_t *pool)
993251881Speter{
994251881Speter  svn_stringbuf_t *dbuf;
995251881Speter  xlate_handle_node_t *node;
996251881Speter  svn_error_t *err;
997251881Speter
998251881Speter  SVN_ERR(get_uton_xlate_handle_node(&node, pool));
999251881Speter
1000251881Speter  if (node->handle)
1001251881Speter    {
1002251881Speter      err = check_utf8(src->data, src->len, pool);
1003251881Speter      if (! err)
1004251881Speter        err = convert_to_stringbuf(node, src->data, src->len,
1005251881Speter                                   &dbuf, pool);
1006251881Speter      if (! err)
1007251881Speter        *dest = dbuf->data;
1008251881Speter    }
1009251881Speter  else
1010251881Speter    {
1011251881Speter      err = check_non_ascii(src->data, src->len, pool);
1012251881Speter      if (! err)
1013251881Speter        *dest = apr_pstrmemdup(pool, src->data, src->len);
1014251881Speter    }
1015251881Speter
1016251881Speter  err = svn_error_compose_create(
1017251881Speter          err,
1018251881Speter          put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool));
1019251881Speter
1020251881Speter  return err;
1021251881Speter}
1022299742Sdim
1023299742Sdim
1024299742Sdim/* Insert the given UCS-4 VALUE into BUF at the given OFFSET. */
1025299742Sdimstatic void
1026299742Sdimmembuf_insert_ucs4(svn_membuf_t *buf, apr_size_t offset, apr_int32_t value)
1027299742Sdim{
1028299742Sdim  svn_membuf__resize(buf, (offset + 1) * sizeof(value));
1029299742Sdim  ((apr_int32_t*)buf->data)[offset] = value;
1030299742Sdim}
1031299742Sdim
/* TODO: Use compiler intrinsics for byte swaps. */

/* Swap the two low-order bytes of X.  The operand is expected to be an
   unsigned 16-bit value; the result has the (promoted) type of X. */
#define SWAP_SHORT(x)  ((((x) & 0xff) << 8) | (((x) >> 8) & 0xff))

/* Reverse the four low-order bytes of X.  X is converted to unsigned long
   first, so the shifts are well defined even when X is a signed 32-bit
   value with the top bit set or with a low byte >= 0x80 (shifting such a
   byte into the sign bit of a signed int is undefined behaviour).  The
   result is an unsigned long in the range 0 .. 0xffffffff. */
#define SWAP_LONG(x)   (((((unsigned long)(x)) & 0xff) << 24)       \
                        | ((((unsigned long)(x)) & 0xff00) << 8)    \
                        | ((((unsigned long)(x)) >> 8) & 0xff00)    \
                        | ((((unsigned long)(x)) >> 24) & 0xff))

/* True if C is a UTF-16 lead (high) surrogate, U+D800 .. U+DBFF. */
#define IS_UTF16_LEAD_SURROGATE(c)   ((c) >= 0xd800 && (c) <= 0xdbff)
/* True if C is a UTF-16 trail (low) surrogate, U+DC00 .. U+DFFF. */
#define IS_UTF16_TRAIL_SURROGATE(c)  ((c) >= 0xdc00 && (c) <= 0xdfff)
1039299742Sdim
1040299742Sdimsvn_error_t *
1041299742Sdimsvn_utf__utf16_to_utf8(const svn_string_t **result,
1042299742Sdim                       const apr_uint16_t *utf16str,
1043299742Sdim                       apr_size_t utf16len,
1044299742Sdim                       svn_boolean_t big_endian,
1045299742Sdim                       apr_pool_t *result_pool,
1046299742Sdim                       apr_pool_t *scratch_pool)
1047299742Sdim{
1048299742Sdim  static const apr_uint16_t endiancheck = 0xa55a;
1049299742Sdim  const svn_boolean_t arch_big_endian =
1050299742Sdim    (((const char*)&endiancheck)[sizeof(endiancheck) - 1] == '\x5a');
1051299742Sdim  const svn_boolean_t swap_order = (!big_endian != !arch_big_endian);
1052299742Sdim
1053299742Sdim  apr_uint16_t lead_surrogate;
1054299742Sdim  apr_size_t length;
1055299742Sdim  apr_size_t offset;
1056299742Sdim  svn_membuf_t ucs4buf;
1057299742Sdim  svn_membuf_t resultbuf;
1058299742Sdim  svn_string_t *res;
1059299742Sdim
1060299742Sdim  if (utf16len == SVN_UTF__UNKNOWN_LENGTH)
1061299742Sdim    {
1062299742Sdim      const apr_uint16_t *endp = utf16str;
1063299742Sdim      while (*endp++)
1064299742Sdim        ;
1065299742Sdim      utf16len = (endp - utf16str);
1066299742Sdim    }
1067299742Sdim
1068299742Sdim  svn_membuf__create(&ucs4buf, utf16len * sizeof(apr_int32_t), scratch_pool);
1069299742Sdim
1070299742Sdim  for (lead_surrogate = 0, length = 0, offset = 0;
1071299742Sdim       offset < utf16len; ++offset)
1072299742Sdim    {
1073299742Sdim      const apr_uint16_t code =
1074299742Sdim        (swap_order ? SWAP_SHORT(utf16str[offset]) : utf16str[offset]);
1075299742Sdim
1076299742Sdim      if (lead_surrogate)
1077299742Sdim        {
1078299742Sdim          if (IS_UTF16_TRAIL_SURROGATE(code))
1079299742Sdim            {
1080299742Sdim              /* Combine the lead and trail currogates into a 32-bit code. */
1081299742Sdim              membuf_insert_ucs4(&ucs4buf, length++,
1082299742Sdim                                 (0x010000
1083299742Sdim                                  + (((lead_surrogate & 0x03ff) << 10)
1084299742Sdim                                     | (code & 0x03ff))));
1085299742Sdim              lead_surrogate = 0;
1086299742Sdim              continue;
1087299742Sdim            }
1088299742Sdim          else
1089299742Sdim            {
1090299742Sdim              /* If we didn't find a surrogate pair, just dump the
1091299742Sdim                 lead surrogate into the stream. */
1092299742Sdim              membuf_insert_ucs4(&ucs4buf, length++, lead_surrogate);
1093299742Sdim              lead_surrogate = 0;
1094299742Sdim            }
1095299742Sdim        }
1096299742Sdim
1097299742Sdim      if ((offset + 1) < utf16len && IS_UTF16_LEAD_SURROGATE(code))
1098299742Sdim        {
1099299742Sdim          /* Store a lead surrogate that is followed by at least one
1100299742Sdim             code for the next iteration. */
1101299742Sdim          lead_surrogate = code;
1102299742Sdim          continue;
1103299742Sdim        }
1104299742Sdim      else
1105299742Sdim        membuf_insert_ucs4(&ucs4buf, length++, code);
1106299742Sdim    }
1107299742Sdim
1108299742Sdim  /* Convert the UCS-4 buffer to UTF-8, assuming an average of 2 bytes
1109299742Sdim     per code point for encoding. The buffer will grow as
1110299742Sdim     necessary. */
1111299742Sdim  svn_membuf__create(&resultbuf, length * 2, result_pool);
1112299742Sdim  SVN_ERR(svn_utf__encode_ucs4_string(
1113299742Sdim              &resultbuf, ucs4buf.data, length, &length));
1114299742Sdim
1115299742Sdim  res = apr_palloc(result_pool, sizeof(*res));
1116299742Sdim  res->data = resultbuf.data;
1117299742Sdim  res->len = length;
1118299742Sdim  *result = res;
1119299742Sdim  return SVN_NO_ERROR;
1120299742Sdim}
1121299742Sdim
1122299742Sdim
1123299742Sdimsvn_error_t *
1124299742Sdimsvn_utf__utf32_to_utf8(const svn_string_t **result,
1125299742Sdim                       const apr_int32_t *utf32str,
1126299742Sdim                       apr_size_t utf32len,
1127299742Sdim                       svn_boolean_t big_endian,
1128299742Sdim                       apr_pool_t *result_pool,
1129299742Sdim                       apr_pool_t *scratch_pool)
1130299742Sdim{
1131299742Sdim  static const apr_int32_t endiancheck = 0xa5cbbc5a;
1132299742Sdim  const svn_boolean_t arch_big_endian =
1133299742Sdim    (((const char*)&endiancheck)[sizeof(endiancheck) - 1] == '\x5a');
1134299742Sdim  const svn_boolean_t swap_order = (!big_endian != !arch_big_endian);
1135299742Sdim
1136299742Sdim  apr_size_t length;
1137299742Sdim  svn_membuf_t resultbuf;
1138299742Sdim  svn_string_t *res;
1139299742Sdim
1140299742Sdim  if (utf32len == SVN_UTF__UNKNOWN_LENGTH)
1141299742Sdim    {
1142299742Sdim      const apr_int32_t *endp = utf32str;
1143299742Sdim      while (*endp++)
1144299742Sdim        ;
1145299742Sdim      utf32len = (endp - utf32str);
1146299742Sdim    }
1147299742Sdim
1148299742Sdim  if (swap_order)
1149299742Sdim    {
1150299742Sdim      apr_size_t offset;
1151299742Sdim      svn_membuf_t ucs4buf;
1152299742Sdim
1153299742Sdim      svn_membuf__create(&ucs4buf, utf32len * sizeof(apr_int32_t),
1154299742Sdim                         scratch_pool);
1155299742Sdim
1156299742Sdim      for (offset = 0; offset < utf32len; ++offset)
1157299742Sdim        {
1158299742Sdim          const apr_int32_t code = SWAP_LONG(utf32str[offset]);
1159299742Sdim          membuf_insert_ucs4(&ucs4buf, offset, code);
1160299742Sdim        }
1161299742Sdim      utf32str = ucs4buf.data;
1162299742Sdim    }
1163299742Sdim
1164299742Sdim  /* Convert the UCS-4 buffer to UTF-8, assuming an average of 2 bytes
1165299742Sdim     per code point for encoding. The buffer will grow as
1166299742Sdim     necessary. */
1167299742Sdim  svn_membuf__create(&resultbuf, utf32len * 2, result_pool);
1168299742Sdim  SVN_ERR(svn_utf__encode_ucs4_string(
1169299742Sdim              &resultbuf, utf32str, utf32len, &length));
1170299742Sdim
1171299742Sdim  res = apr_palloc(result_pool, sizeof(*res));
1172299742Sdim  res->data = resultbuf.data;
1173299742Sdim  res->len = length;
1174299742Sdim  *result = res;
1175299742Sdim  return SVN_NO_ERROR;
1176299742Sdim}
1177299742Sdim
1178299742Sdim
1179299742Sdim#ifdef WIN32
1180299742Sdim
1181299742Sdim
1182299742Sdimsvn_error_t *
1183299742Sdimsvn_utf__win32_utf8_to_utf16(const WCHAR **result,
1184299742Sdim                             const char *src,
1185299742Sdim                             const WCHAR *prefix,
1186299742Sdim                             apr_pool_t *result_pool)
1187299742Sdim{
1188299742Sdim  const int utf8_count = strlen(src);
1189299742Sdim  const int prefix_len = (prefix ? lstrlenW(prefix) : 0);
1190299742Sdim  WCHAR *wide_str;
1191299742Sdim  int wide_count;
1192299742Sdim
1193299742Sdim  if (0 == prefix_len + utf8_count)
1194299742Sdim    {
1195299742Sdim      *result = L"";
1196299742Sdim      return SVN_NO_ERROR;
1197299742Sdim    }
1198299742Sdim
1199299742Sdim  wide_count = MultiByteToWideChar(CP_UTF8, 0, src, utf8_count, NULL, 0);
1200299742Sdim  if (wide_count == 0)
1201299742Sdim    return svn_error_wrap_apr(apr_get_os_error(),
1202299742Sdim                              _("Conversion to UTF-16 failed"));
1203299742Sdim
1204299742Sdim  wide_str = apr_palloc(result_pool,
1205299742Sdim                        (prefix_len + wide_count + 1) * sizeof(*wide_str));
1206299742Sdim  if (prefix_len)
1207299742Sdim    memcpy(wide_str, prefix, prefix_len * sizeof(*wide_str));
1208299742Sdim  if (0 == MultiByteToWideChar(CP_UTF8, 0, src, utf8_count,
1209299742Sdim                               wide_str + prefix_len, wide_count))
1210299742Sdim    return svn_error_wrap_apr(apr_get_os_error(),
1211299742Sdim                              _("Conversion to UTF-16 failed"));
1212299742Sdim
1213299742Sdim  wide_str[prefix_len + wide_count] = 0;
1214299742Sdim  *result = wide_str;
1215299742Sdim
1216299742Sdim  return SVN_NO_ERROR;
1217299742Sdim}
1218299742Sdim
1219299742Sdimsvn_error_t *
1220299742Sdimsvn_utf__win32_utf16_to_utf8(const char **result,
1221299742Sdim                             const WCHAR *src,
1222299742Sdim                             const char *prefix,
1223299742Sdim                             apr_pool_t *result_pool)
1224299742Sdim{
1225299742Sdim  const int wide_count = lstrlenW(src);
1226299742Sdim  const int prefix_len = (prefix ? strlen(prefix) : 0);
1227299742Sdim  char *utf8_str;
1228299742Sdim  int utf8_count;
1229299742Sdim
1230299742Sdim  if (0 == prefix_len + wide_count)
1231299742Sdim    {
1232299742Sdim      *result = "";
1233299742Sdim      return SVN_NO_ERROR;
1234299742Sdim    }
1235299742Sdim
1236299742Sdim  utf8_count = WideCharToMultiByte(CP_UTF8, 0, src, wide_count,
1237299742Sdim                                   NULL, 0, NULL, FALSE);
1238299742Sdim  if (utf8_count == 0)
1239299742Sdim    return svn_error_wrap_apr(apr_get_os_error(),
1240299742Sdim                              _("Conversion from UTF-16 failed"));
1241299742Sdim
1242299742Sdim  utf8_str = apr_palloc(result_pool,
1243299742Sdim                        (prefix_len + utf8_count + 1) * sizeof(*utf8_str));
1244299742Sdim  if (prefix_len)
1245299742Sdim    memcpy(utf8_str, prefix, prefix_len * sizeof(*utf8_str));
1246299742Sdim  if (0 == WideCharToMultiByte(CP_UTF8, 0, src, wide_count,
1247299742Sdim                               utf8_str + prefix_len, utf8_count,
1248299742Sdim                               NULL, FALSE))
1249299742Sdim    return svn_error_wrap_apr(apr_get_os_error(),
1250299742Sdim                              _("Conversion from UTF-16 failed"));
1251299742Sdim
1252299742Sdim  utf8_str[prefix_len + utf8_count] = 0;
1253299742Sdim  *result = utf8_str;
1254299742Sdim
1255299742Sdim  return SVN_NO_ERROR;
1256299742Sdim}
1257299742Sdim
1258299742Sdim#endif /* WIN32 */
1259