1251881Speter/* 2251881Speter * utf.c: UTF-8 conversion routines 3251881Speter * 4251881Speter * ==================================================================== 5251881Speter * Licensed to the Apache Software Foundation (ASF) under one 6251881Speter * or more contributor license agreements. See the NOTICE file 7251881Speter * distributed with this work for additional information 8251881Speter * regarding copyright ownership. The ASF licenses this file 9251881Speter * to you under the Apache License, Version 2.0 (the 10251881Speter * "License"); you may not use this file except in compliance 11251881Speter * with the License. You may obtain a copy of the License at 12251881Speter * 13251881Speter * http://www.apache.org/licenses/LICENSE-2.0 14251881Speter * 15251881Speter * Unless required by applicable law or agreed to in writing, 16251881Speter * software distributed under the License is distributed on an 17251881Speter * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 18251881Speter * KIND, either express or implied. See the License for the 19251881Speter * specific language governing permissions and limitations 20251881Speter * under the License. 21251881Speter * ==================================================================== 22251881Speter */ 23251881Speter 24251881Speter 25251881Speter 26251881Speter#include <stdlib.h> 27251881Speter#include <string.h> 28251881Speter#include <assert.h> 29251881Speter 30251881Speter#include <apr_strings.h> 31251881Speter#include <apr_lib.h> 32251881Speter#include <apr_xlate.h> 33251881Speter#include <apr_atomic.h> 34251881Speter 35251881Speter#include "svn_hash.h" 36251881Speter#include "svn_string.h" 37251881Speter#include "svn_error.h" 38251881Speter#include "svn_pools.h" 39251881Speter#include "svn_ctype.h" 40251881Speter#include "svn_utf.h" 41251881Speter#include "svn_private_config.h" 42251881Speter#include "win32_xlate.h" 43251881Speter 44251881Speter#include "private/svn_utf_private.h" 45251881Speter#include "private/svn_dep_compat.h" 46251881Speter#include "private/svn_string_private.h" 47251881Speter#include "private/svn_mutex.h" 48251881Speter 49251881Speter 50251881Speter 51251881Speter/* Use these static strings to maximize performance on standard conversions. 52251881Speter * Any strings on other locations are still valid, however. 53251881Speter */ 54251881Speterstatic const char *SVN_UTF_NTOU_XLATE_HANDLE = "svn-utf-ntou-xlate-handle"; 55251881Speterstatic const char *SVN_UTF_UTON_XLATE_HANDLE = "svn-utf-uton-xlate-handle"; 56251881Speter 57251881Speterstatic const char *SVN_APR_UTF8_CHARSET = "UTF-8"; 58251881Speter 59251881Speterstatic svn_mutex__t *xlate_handle_mutex = NULL; 60251881Speterstatic svn_boolean_t assume_native_charset_is_utf8 = FALSE; 61251881Speter 62299742Sdim#if defined(WIN32) 63299742Sdimtypedef svn_subr__win32_xlate_t xlate_handle_t; 64299742Sdim#else 65299742Sdimtypedef apr_xlate_t xlate_handle_t; 66299742Sdim#endif 67299742Sdim 68251881Speter/* The xlate handle cache is a global hash table with linked lists of xlate 69251881Speter * handles. In multi-threaded environments, a thread "borrows" an xlate 70251881Speter * handle from the cache during a translation and puts it back afterwards. 71251881Speter * This avoids holding a global lock for all translations. 72251881Speter * If there is no handle for a particular key when needed, a new is 73251881Speter * handle is created and put in the cache after use. 74251881Speter * This means that there will be at most N handles open for a key, where N 75251881Speter * is the number of simultanous handles in use for that key. */ 76251881Speter 77251881Spetertypedef struct xlate_handle_node_t { 78299742Sdim xlate_handle_t *handle; 79251881Speter /* FALSE if the handle is not valid, since its pool is being 80251881Speter destroyed. */ 81251881Speter svn_boolean_t valid; 82251881Speter /* The name of a char encoding or APR_LOCALE_CHARSET. */ 83251881Speter const char *frompage, *topage; 84251881Speter struct xlate_handle_node_t *next; 85251881Speter} xlate_handle_node_t; 86251881Speter 87251881Speter/* This maps const char * userdata_key strings to xlate_handle_node_t ** 88251881Speter handles to the first entry in the linked list of xlate handles. We don't 89251881Speter store the pointer to the list head directly in the hash table, since we 90251881Speter remove/insert entries at the head in the list in the code below, and 91251881Speter we can't use apr_hash_set() in each character translation because that 92251881Speter function allocates memory in each call where the value is non-NULL. 93251881Speter Since these allocations take place in a global pool, this would be a 94251881Speter memory leak. */ 95251881Speterstatic apr_hash_t *xlate_handle_hash = NULL; 96251881Speter 97251881Speter/* "1st level cache" to standard conversion maps. We may access these 98251881Speter * using atomic xchange ops, i.e. without further thread synchronization. 99251881Speter * If the respective item is NULL, fallback to hash lookup. 100251881Speter */ 101251881Speterstatic void * volatile xlat_ntou_static_handle = NULL; 102251881Speterstatic void * volatile xlat_uton_static_handle = NULL; 103251881Speter 104251881Speter/* Clean up the xlate handle cache. */ 105251881Speterstatic apr_status_t 106251881Speterxlate_cleanup(void *arg) 107251881Speter{ 108251881Speter /* We set the cache variables to NULL so that translation works in other 109251881Speter cleanup functions, even if it isn't cached then. */ 110251881Speter xlate_handle_hash = NULL; 111251881Speter 112251881Speter /* ensure no stale objects get accessed */ 113251881Speter xlat_ntou_static_handle = NULL; 114251881Speter xlat_uton_static_handle = NULL; 115251881Speter 116251881Speter return APR_SUCCESS; 117251881Speter} 118251881Speter 119251881Speter/* Set the handle of ARG to NULL. */ 120251881Speterstatic apr_status_t 121251881Speterxlate_handle_node_cleanup(void *arg) 122251881Speter{ 123251881Speter xlate_handle_node_t *node = arg; 124251881Speter 125251881Speter node->valid = FALSE; 126251881Speter return APR_SUCCESS; 127251881Speter} 128251881Speter 129251881Spetervoid 130251881Spetersvn_utf_initialize2(svn_boolean_t assume_native_utf8, 131251881Speter apr_pool_t *pool) 132251881Speter{ 133251881Speter if (!xlate_handle_hash) 134251881Speter { 135251881Speter /* We create our own subpool, which we protect with the mutex. 136251881Speter We can't use the pool passed to us by the caller, since we will 137251881Speter use it for xlate handle allocations, possibly in multiple threads, 138251881Speter and pool allocation is not thread-safe. */ 139251881Speter apr_pool_t *subpool = svn_pool_create(pool); 140251881Speter svn_mutex__t *mutex; 141251881Speter svn_error_t *err = svn_mutex__init(&mutex, TRUE, subpool); 142251881Speter if (err) 143251881Speter { 144251881Speter svn_error_clear(err); 145251881Speter return; 146251881Speter } 147251881Speter 148251881Speter xlate_handle_mutex = mutex; 149251881Speter xlate_handle_hash = apr_hash_make(subpool); 150251881Speter 151251881Speter apr_pool_cleanup_register(subpool, NULL, xlate_cleanup, 152251881Speter apr_pool_cleanup_null); 153251881Speter } 154251881Speter 155251881Speter if (!assume_native_charset_is_utf8) 156251881Speter assume_native_charset_is_utf8 = assume_native_utf8; 157251881Speter} 158251881Speter 159251881Speter/* Return a unique string key based on TOPAGE and FROMPAGE. TOPAGE and 160251881Speter * FROMPAGE can be any valid arguments of the same name to 161251881Speter * apr_xlate_open(). Allocate the returned string in POOL. */ 162251881Speterstatic const char* 163251881Speterget_xlate_key(const char *topage, 164251881Speter const char *frompage, 165251881Speter apr_pool_t *pool) 166251881Speter{ 167251881Speter /* In the cases of SVN_APR_LOCALE_CHARSET and SVN_APR_DEFAULT_CHARSET 168251881Speter * topage/frompage is really an int, not a valid string. So generate a 169251881Speter * unique key accordingly. */ 170251881Speter if (frompage == SVN_APR_LOCALE_CHARSET) 171251881Speter frompage = "APR_LOCALE_CHARSET"; 172251881Speter else if (frompage == SVN_APR_DEFAULT_CHARSET) 173251881Speter frompage = "APR_DEFAULT_CHARSET"; 174251881Speter 175251881Speter if (topage == SVN_APR_LOCALE_CHARSET) 176251881Speter topage = "APR_LOCALE_CHARSET"; 177251881Speter else if (topage == SVN_APR_DEFAULT_CHARSET) 178251881Speter topage = "APR_DEFAULT_CHARSET"; 179251881Speter 180251881Speter return apr_pstrcat(pool, "svn-utf-", frompage, "to", topage, 181299742Sdim "-xlate-handle", SVN_VA_NULL); 182251881Speter} 183251881Speter 184251881Speter/* Atomically replace the content in *MEM with NEW_VALUE and return 185251881Speter * the previous content of *MEM. If atomicy cannot be guaranteed, 186251881Speter * *MEM will not be modified and NEW_VALUE is simply returned to 187251881Speter * the caller. 188251881Speter */ 189251881Speterstatic APR_INLINE void* 190251881Speteratomic_swap(void * volatile * mem, void *new_value) 191251881Speter{ 192251881Speter#if APR_HAS_THREADS 193251881Speter /* Cast is necessary because of APR bug: 194251881Speter https://issues.apache.org/bugzilla/show_bug.cgi?id=50731 */ 195251881Speter return apr_atomic_xchgptr((volatile void **)mem, new_value); 196251881Speter#else 197251881Speter /* no threads - no sync. necessary */ 198251881Speter void *old_value = (void*)*mem; 199251881Speter *mem = new_value; 200251881Speter return old_value; 201251881Speter#endif 202251881Speter} 203251881Speter 204251881Speter/* Set *RET to a newly created handle node for converting from FROMPAGE 205251881Speter to TOPAGE, If apr_xlate_open() returns APR_EINVAL or APR_ENOTIMPL, set 206251881Speter (*RET)->handle to NULL. If fail for any other reason, return the error. 207251881Speter Allocate *RET and its xlate handle in POOL. */ 208251881Speterstatic svn_error_t * 209251881Speterxlate_alloc_handle(xlate_handle_node_t **ret, 210251881Speter const char *topage, const char *frompage, 211251881Speter apr_pool_t *pool) 212251881Speter{ 213251881Speter apr_status_t apr_err; 214299742Sdim xlate_handle_t *handle; 215262253Speter const char *name; 216251881Speter 217251881Speter /* The error handling doesn't support the following cases, since we don't 218251881Speter use them currently. Catch this here. */ 219251881Speter SVN_ERR_ASSERT(frompage != SVN_APR_DEFAULT_CHARSET 220251881Speter && topage != SVN_APR_DEFAULT_CHARSET 221251881Speter && (frompage != SVN_APR_LOCALE_CHARSET 222251881Speter || topage != SVN_APR_LOCALE_CHARSET)); 223251881Speter 224251881Speter /* Try to create a handle. */ 225251881Speter#if defined(WIN32) 226299742Sdim apr_err = svn_subr__win32_xlate_open(&handle, topage, 227251881Speter frompage, pool); 228262253Speter name = "win32-xlate: "; 229251881Speter#else 230251881Speter apr_err = apr_xlate_open(&handle, topage, frompage, pool); 231262253Speter name = "APR: "; 232251881Speter#endif 233251881Speter 234251881Speter if (APR_STATUS_IS_EINVAL(apr_err) || APR_STATUS_IS_ENOTIMPL(apr_err)) 235251881Speter handle = NULL; 236251881Speter else if (apr_err != APR_SUCCESS) 237251881Speter { 238251881Speter const char *errstr; 239253734Speter char apr_strerr[512]; 240253734Speter 241251881Speter /* Can't use svn_error_wrap_apr here because it calls functions in 242251881Speter this file, leading to infinite recursion. */ 243251881Speter if (frompage == SVN_APR_LOCALE_CHARSET) 244251881Speter errstr = apr_psprintf(pool, 245251881Speter _("Can't create a character converter from " 246251881Speter "native encoding to '%s'"), topage); 247251881Speter else if (topage == SVN_APR_LOCALE_CHARSET) 248251881Speter errstr = apr_psprintf(pool, 249251881Speter _("Can't create a character converter from " 250251881Speter "'%s' to native encoding"), frompage); 251251881Speter else 252251881Speter errstr = apr_psprintf(pool, 253251881Speter _("Can't create a character converter from " 254251881Speter "'%s' to '%s'"), frompage, topage); 255251881Speter 256253734Speter /* Just put the error on the stack, since svn_error_create duplicates it 257253734Speter later. APR_STRERR will be in the local encoding, not in UTF-8, though. 258253734Speter */ 259253734Speter svn_strerror(apr_err, apr_strerr, sizeof(apr_strerr)); 260299742Sdim return svn_error_createf(SVN_ERR_PLUGIN_LOAD_FAILURE, 261262253Speter svn_error_create(apr_err, NULL, apr_strerr), 262262253Speter "%s%s", name, errstr); 263251881Speter } 264251881Speter 265251881Speter /* Allocate and initialize the node. */ 266251881Speter *ret = apr_palloc(pool, sizeof(xlate_handle_node_t)); 267251881Speter (*ret)->handle = handle; 268251881Speter (*ret)->valid = TRUE; 269251881Speter (*ret)->frompage = ((frompage != SVN_APR_LOCALE_CHARSET) 270251881Speter ? apr_pstrdup(pool, frompage) : frompage); 271251881Speter (*ret)->topage = ((topage != SVN_APR_LOCALE_CHARSET) 272251881Speter ? apr_pstrdup(pool, topage) : topage); 273251881Speter (*ret)->next = NULL; 274251881Speter 275251881Speter /* If we are called from inside a pool cleanup handler, the just created 276251881Speter xlate handle will be closed when that handler returns by a newly 277251881Speter registered cleanup handler, however, the handle is still cached by us. 278251881Speter To prevent this, we register a cleanup handler that will reset the valid 279251881Speter flag of our node, so we don't use an invalid handle. */ 280251881Speter if (handle) 281251881Speter apr_pool_cleanup_register(pool, *ret, xlate_handle_node_cleanup, 282251881Speter apr_pool_cleanup_null); 283251881Speter 284251881Speter return SVN_NO_ERROR; 285251881Speter} 286251881Speter 287251881Speter/* Extend xlate_alloc_handle by using USERDATA_KEY as a key in our 288251881Speter global hash map, if available. 289251881Speter 290251881Speter Allocate *RET and its xlate handle in POOL if svn_utf_initialize() 291251881Speter hasn't been called or USERDATA_KEY is NULL. Else, allocate them 292251881Speter in the pool of xlate_handle_hash. 293251881Speter 294251881Speter Note: this function is not thread-safe. Call get_xlate_handle_node 295251881Speter instead. */ 296251881Speterstatic svn_error_t * 297251881Speterget_xlate_handle_node_internal(xlate_handle_node_t **ret, 298251881Speter const char *topage, const char *frompage, 299251881Speter const char *userdata_key, apr_pool_t *pool) 300251881Speter{ 301251881Speter /* If we already have a handle, just return it. */ 302251881Speter if (userdata_key && xlate_handle_hash) 303251881Speter { 304251881Speter xlate_handle_node_t *old_node = NULL; 305251881Speter 306251881Speter /* 2nd level: hash lookup */ 307251881Speter xlate_handle_node_t **old_node_p = svn_hash_gets(xlate_handle_hash, 308251881Speter userdata_key); 309251881Speter if (old_node_p) 310251881Speter old_node = *old_node_p; 311251881Speter if (old_node) 312251881Speter { 313251881Speter /* Ensure that the handle is still valid. */ 314251881Speter if (old_node->valid) 315251881Speter { 316251881Speter /* Remove from the list. */ 317251881Speter *old_node_p = old_node->next; 318251881Speter old_node->next = NULL; 319251881Speter *ret = old_node; 320251881Speter return SVN_NO_ERROR; 321251881Speter } 322251881Speter } 323251881Speter } 324251881Speter 325251881Speter /* Note that we still have the mutex locked (if it is initialized), so we 326251881Speter can use the global pool for creating the new xlate handle. */ 327251881Speter 328251881Speter /* Use the correct pool for creating the handle. */ 329251881Speter pool = apr_hash_pool_get(xlate_handle_hash); 330251881Speter 331251881Speter return xlate_alloc_handle(ret, topage, frompage, pool); 332251881Speter} 333251881Speter 334251881Speter/* Set *RET to a handle node for converting from FROMPAGE to TOPAGE, 335251881Speter creating the handle node if it doesn't exist in USERDATA_KEY. 336251881Speter If a node is not cached and apr_xlate_open() returns APR_EINVAL or 337251881Speter APR_ENOTIMPL, set (*RET)->handle to NULL. If fail for any other 338251881Speter reason, return the error. 339251881Speter 340251881Speter Allocate *RET and its xlate handle in POOL if svn_utf_initialize() 341251881Speter hasn't been called or USERDATA_KEY is NULL. Else, allocate them 342251881Speter in the pool of xlate_handle_hash. */ 343251881Speterstatic svn_error_t * 344251881Speterget_xlate_handle_node(xlate_handle_node_t **ret, 345251881Speter const char *topage, const char *frompage, 346251881Speter const char *userdata_key, apr_pool_t *pool) 347251881Speter{ 348251881Speter xlate_handle_node_t *old_node = NULL; 349251881Speter 350251881Speter /* If we already have a handle, just return it. */ 351251881Speter if (userdata_key) 352251881Speter { 353251881Speter if (xlate_handle_hash) 354251881Speter { 355251881Speter /* 1st level: global, static items */ 356251881Speter if (userdata_key == SVN_UTF_NTOU_XLATE_HANDLE) 357251881Speter old_node = atomic_swap(&xlat_ntou_static_handle, NULL); 358251881Speter else if (userdata_key == SVN_UTF_UTON_XLATE_HANDLE) 359251881Speter old_node = atomic_swap(&xlat_uton_static_handle, NULL); 360251881Speter 361251881Speter if (old_node && old_node->valid) 362251881Speter { 363251881Speter *ret = old_node; 364251881Speter return SVN_NO_ERROR; 365251881Speter } 366251881Speter } 367251881Speter else 368251881Speter { 369251881Speter void *p; 370251881Speter /* We fall back on a per-pool cache instead. */ 371251881Speter apr_pool_userdata_get(&p, userdata_key, pool); 372251881Speter old_node = p; 373251881Speter /* Ensure that the handle is still valid. */ 374251881Speter if (old_node && old_node->valid) 375251881Speter { 376251881Speter *ret = old_node; 377251881Speter return SVN_NO_ERROR; 378251881Speter } 379251881Speter 380251881Speter return xlate_alloc_handle(ret, topage, frompage, pool); 381251881Speter } 382251881Speter } 383251881Speter 384251881Speter SVN_MUTEX__WITH_LOCK(xlate_handle_mutex, 385251881Speter get_xlate_handle_node_internal(ret, 386251881Speter topage, 387251881Speter frompage, 388251881Speter userdata_key, 389251881Speter pool)); 390251881Speter 391251881Speter return SVN_NO_ERROR; 392251881Speter} 393251881Speter 394251881Speter/* Put back NODE into the xlate handle cache for use by other calls. 395251881Speter 396251881Speter Note: this function is not thread-safe. Call put_xlate_handle_node 397251881Speter instead. */ 398251881Speterstatic svn_error_t * 399251881Speterput_xlate_handle_node_internal(xlate_handle_node_t *node, 400251881Speter const char *userdata_key) 401251881Speter{ 402251881Speter xlate_handle_node_t **node_p = svn_hash_gets(xlate_handle_hash, userdata_key); 403251881Speter if (node_p == NULL) 404251881Speter { 405251881Speter userdata_key = apr_pstrdup(apr_hash_pool_get(xlate_handle_hash), 406251881Speter userdata_key); 407251881Speter node_p = apr_palloc(apr_hash_pool_get(xlate_handle_hash), 408251881Speter sizeof(*node_p)); 409251881Speter *node_p = NULL; 410251881Speter svn_hash_sets(xlate_handle_hash, userdata_key, node_p); 411251881Speter } 412251881Speter node->next = *node_p; 413251881Speter *node_p = node; 414251881Speter 415251881Speter return SVN_NO_ERROR; 416251881Speter} 417251881Speter 418251881Speter/* Put back NODE into the xlate handle cache for use by other calls. 419251881Speter If there is no global cache, store the handle in POOL. 420251881Speter Ignore errors related to locking/unlocking the mutex. */ 421251881Speterstatic svn_error_t * 422251881Speterput_xlate_handle_node(xlate_handle_node_t *node, 423251881Speter const char *userdata_key, 424251881Speter apr_pool_t *pool) 425251881Speter{ 426251881Speter assert(node->next == NULL); 427251881Speter if (!userdata_key) 428251881Speter return SVN_NO_ERROR; 429251881Speter 430251881Speter /* push previous global node to the hash */ 431251881Speter if (xlate_handle_hash) 432251881Speter { 433251881Speter /* 1st level: global, static items */ 434251881Speter if (userdata_key == SVN_UTF_NTOU_XLATE_HANDLE) 435251881Speter node = atomic_swap(&xlat_ntou_static_handle, node); 436251881Speter else if (userdata_key == SVN_UTF_UTON_XLATE_HANDLE) 437251881Speter node = atomic_swap(&xlat_uton_static_handle, node); 438251881Speter if (node == NULL) 439251881Speter return SVN_NO_ERROR; 440251881Speter 441251881Speter SVN_MUTEX__WITH_LOCK(xlate_handle_mutex, 442251881Speter put_xlate_handle_node_internal(node, 443251881Speter userdata_key)); 444251881Speter } 445251881Speter else 446251881Speter { 447251881Speter /* Store it in the per-pool cache. */ 448251881Speter apr_pool_userdata_set(node, userdata_key, apr_pool_cleanup_null, pool); 449251881Speter } 450251881Speter 451251881Speter return SVN_NO_ERROR; 452251881Speter} 453251881Speter 454251881Speter/* Return the apr_xlate handle for converting native characters to UTF-8. */ 455251881Speterstatic svn_error_t * 456251881Speterget_ntou_xlate_handle_node(xlate_handle_node_t **ret, apr_pool_t *pool) 457251881Speter{ 458251881Speter return get_xlate_handle_node(ret, SVN_APR_UTF8_CHARSET, 459251881Speter assume_native_charset_is_utf8 460251881Speter ? SVN_APR_UTF8_CHARSET 461251881Speter : SVN_APR_LOCALE_CHARSET, 462251881Speter SVN_UTF_NTOU_XLATE_HANDLE, pool); 463251881Speter} 464251881Speter 465251881Speter 466251881Speter/* Return the apr_xlate handle for converting UTF-8 to native characters. 467251881Speter Create one if it doesn't exist. If unable to find a handle, or 468251881Speter unable to create one because apr_xlate_open returned APR_EINVAL, then 469251881Speter set *RET to null and return SVN_NO_ERROR; if fail for some other 470251881Speter reason, return error. */ 471251881Speterstatic svn_error_t * 472251881Speterget_uton_xlate_handle_node(xlate_handle_node_t **ret, apr_pool_t *pool) 473251881Speter{ 474251881Speter return get_xlate_handle_node(ret, 475251881Speter assume_native_charset_is_utf8 476251881Speter ? SVN_APR_UTF8_CHARSET 477251881Speter : SVN_APR_LOCALE_CHARSET, 478251881Speter SVN_APR_UTF8_CHARSET, 479251881Speter SVN_UTF_UTON_XLATE_HANDLE, pool); 480251881Speter} 481251881Speter 482251881Speter 483251881Speter/* Convert SRC_LENGTH bytes of SRC_DATA in NODE->handle, store the result 484251881Speter in *DEST, which is allocated in POOL. */ 485251881Speterstatic svn_error_t * 486251881Speterconvert_to_stringbuf(xlate_handle_node_t *node, 487251881Speter const char *src_data, 488251881Speter apr_size_t src_length, 489251881Speter svn_stringbuf_t **dest, 490251881Speter apr_pool_t *pool) 491251881Speter{ 492251881Speter#ifdef WIN32 493251881Speter apr_status_t apr_err; 494251881Speter 495299742Sdim apr_err = svn_subr__win32_xlate_to_stringbuf(node->handle, src_data, 496299742Sdim src_length, dest, pool); 497251881Speter#else 498251881Speter apr_size_t buflen = src_length * 2; 499251881Speter apr_status_t apr_err; 500251881Speter apr_size_t srclen = src_length; 501251881Speter apr_size_t destlen = buflen; 502251881Speter 503251881Speter /* Initialize *DEST to an empty stringbuf. 504251881Speter A 1:2 ratio of input bytes to output bytes (as assigned above) 505251881Speter should be enough for most translations, and if it turns out not 506251881Speter to be enough, we'll grow the buffer again, sizing it based on a 507251881Speter 1:3 ratio of the remainder of the string. */ 508251881Speter *dest = svn_stringbuf_create_ensure(buflen + 1, pool); 509251881Speter 510251881Speter /* Not only does it not make sense to convert an empty string, but 511251881Speter apr-iconv is quite unreasonable about not allowing that. */ 512251881Speter if (src_length == 0) 513251881Speter return SVN_NO_ERROR; 514251881Speter 515251881Speter do 516251881Speter { 517251881Speter /* Set up state variables for xlate. */ 518251881Speter destlen = buflen - (*dest)->len; 519251881Speter 520251881Speter /* Attempt the conversion. */ 521251881Speter apr_err = apr_xlate_conv_buffer(node->handle, 522251881Speter src_data + (src_length - srclen), 523251881Speter &srclen, 524251881Speter (*dest)->data + (*dest)->len, 525251881Speter &destlen); 526251881Speter 527251881Speter /* Now, update the *DEST->len to track the amount of output data 528251881Speter churned out so far from this loop. */ 529251881Speter (*dest)->len += ((buflen - (*dest)->len) - destlen); 530251881Speter buflen += srclen * 3; /* 3 is middle ground, 2 wasn't enough 531251881Speter for all characters in the buffer, 4 is 532251881Speter maximum character size (currently) */ 533251881Speter 534251881Speter 535251881Speter } while (apr_err == APR_SUCCESS && srclen != 0); 536251881Speter#endif 537251881Speter 538251881Speter /* If we exited the loop with an error, return the error. */ 539251881Speter if (apr_err) 540251881Speter { 541251881Speter const char *errstr; 542251881Speter svn_error_t *err; 543251881Speter 544251881Speter /* Can't use svn_error_wrap_apr here because it calls functions in 545251881Speter this file, leading to infinite recursion. */ 546251881Speter if (node->frompage == SVN_APR_LOCALE_CHARSET) 547251881Speter errstr = apr_psprintf 548251881Speter (pool, _("Can't convert string from native encoding to '%s':"), 549251881Speter node->topage); 550251881Speter else if (node->topage == SVN_APR_LOCALE_CHARSET) 551251881Speter errstr = apr_psprintf 552251881Speter (pool, _("Can't convert string from '%s' to native encoding:"), 553251881Speter node->frompage); 554251881Speter else 555251881Speter errstr = apr_psprintf 556251881Speter (pool, _("Can't convert string from '%s' to '%s':"), 557251881Speter node->frompage, node->topage); 558251881Speter 559299742Sdim err = svn_error_create( 560299742Sdim apr_err, NULL, svn_utf__fuzzy_escape(src_data, src_length, pool)); 561251881Speter return svn_error_create(apr_err, err, errstr); 562251881Speter } 563251881Speter /* Else, exited due to success. Trim the result buffer down to the 564251881Speter right length. */ 565251881Speter (*dest)->data[(*dest)->len] = '\0'; 566251881Speter 567251881Speter return SVN_NO_ERROR; 568251881Speter} 569251881Speter 570251881Speter 571251881Speter/* Return APR_EINVAL if the first LEN bytes of DATA contain anything 572251881Speter other than seven-bit, non-control (except for whitespace) ASCII 573251881Speter characters, finding the error pool from POOL. Otherwise, return 574251881Speter SVN_NO_ERROR. */ 575251881Speterstatic svn_error_t * 576251881Spetercheck_non_ascii(const char *data, apr_size_t len, apr_pool_t *pool) 577251881Speter{ 578251881Speter const char *data_start = data; 579251881Speter 580251881Speter for (; len > 0; --len, data++) 581251881Speter { 582251881Speter if ((! svn_ctype_isascii(*data)) 583251881Speter || ((! svn_ctype_isspace(*data)) 584251881Speter && svn_ctype_iscntrl(*data))) 585251881Speter { 586251881Speter /* Show the printable part of the data, followed by the 587251881Speter decimal code of the questionable character. Because if a 588251881Speter user ever gets this error, she's going to have to spend 589251881Speter time tracking down the non-ASCII data, so we want to help 590251881Speter as much as possible. And yes, we just call the unsafe 591251881Speter data "non-ASCII", even though the actual constraint is 592251881Speter somewhat more complex than that. */ 593251881Speter 594251881Speter if (data - data_start) 595251881Speter { 596251881Speter const char *error_data 597251881Speter = apr_pstrndup(pool, data_start, (data - data_start)); 598251881Speter 599251881Speter return svn_error_createf 600251881Speter (APR_EINVAL, NULL, 601251881Speter _("Safe data '%s' was followed by non-ASCII byte %d: " 602251881Speter "unable to convert to/from UTF-8"), 603251881Speter error_data, *((const unsigned char *) data)); 604251881Speter } 605251881Speter else 606251881Speter { 607251881Speter return svn_error_createf 608251881Speter (APR_EINVAL, NULL, 609251881Speter _("Non-ASCII character (code %d) detected, " 610251881Speter "and unable to convert to/from UTF-8"), 611251881Speter *((const unsigned char *) data)); 612251881Speter } 613251881Speter } 614251881Speter } 615251881Speter 616251881Speter return SVN_NO_ERROR; 617251881Speter} 618251881Speter 619251881Speter/* Construct an error with code APR_EINVAL and with a suitable message 620251881Speter * to describe the invalid UTF-8 sequence DATA of length LEN (which 621251881Speter * may have embedded NULLs). We can't simply print the data, almost 622251881Speter * by definition we don't really know how it is encoded. 623251881Speter */ 624251881Speterstatic svn_error_t * 625251881Speterinvalid_utf8(const char *data, apr_size_t len, apr_pool_t *pool) 626251881Speter{ 627251881Speter const char *last = svn_utf__last_valid(data, len); 628251881Speter const char *valid_txt = "", *invalid_txt = ""; 629251881Speter apr_size_t i; 630251881Speter size_t valid, invalid; 631251881Speter 632251881Speter /* We will display at most 24 valid octets (this may split a leading 633251881Speter multi-byte character) as that should fit on one 80 character line. */ 634251881Speter valid = last - data; 635251881Speter if (valid > 24) 636251881Speter valid = 24; 637251881Speter for (i = 0; i < valid; ++i) 638251881Speter valid_txt = apr_pstrcat(pool, valid_txt, 639251881Speter apr_psprintf(pool, " %02x", 640251881Speter (unsigned char)last[i-valid]), 641299742Sdim SVN_VA_NULL); 642251881Speter 643251881Speter /* 4 invalid octets will guarantee that the faulty octet is displayed */ 644251881Speter invalid = data + len - last; 645251881Speter if (invalid > 4) 646251881Speter invalid = 4; 647251881Speter for (i = 0; i < invalid; ++i) 648251881Speter invalid_txt = apr_pstrcat(pool, invalid_txt, 649251881Speter apr_psprintf(pool, " %02x", 650251881Speter (unsigned char)last[i]), 651299742Sdim SVN_VA_NULL); 652251881Speter 653251881Speter return svn_error_createf(APR_EINVAL, NULL, 654251881Speter _("Valid UTF-8 data\n(hex:%s)\n" 655251881Speter "followed by invalid UTF-8 sequence\n(hex:%s)"), 656251881Speter valid_txt, invalid_txt); 657251881Speter} 658251881Speter 659251881Speter/* Verify that the sequence DATA of length LEN is valid UTF-8. 660251881Speter If it is not, return an error with code APR_EINVAL. */ 661251881Speterstatic svn_error_t * 662251881Spetercheck_utf8(const char *data, apr_size_t len, apr_pool_t *pool) 663251881Speter{ 664251881Speter if (! svn_utf__is_valid(data, len)) 665251881Speter return invalid_utf8(data, len, pool); 666251881Speter return SVN_NO_ERROR; 667251881Speter} 668251881Speter 669251881Speter/* Verify that the NULL terminated sequence DATA is valid UTF-8. 670251881Speter If it is not, return an error with code APR_EINVAL. */ 671251881Speterstatic svn_error_t * 672251881Spetercheck_cstring_utf8(const char *data, apr_pool_t *pool) 673251881Speter{ 674251881Speter 675251881Speter if (! svn_utf__cstring_is_valid(data)) 676251881Speter return invalid_utf8(data, strlen(data), pool); 677251881Speter return SVN_NO_ERROR; 678251881Speter} 679251881Speter 680251881Speter 681251881Spetersvn_error_t * 682251881Spetersvn_utf_stringbuf_to_utf8(svn_stringbuf_t **dest, 683251881Speter const svn_stringbuf_t *src, 684251881Speter apr_pool_t *pool) 685251881Speter{ 686251881Speter xlate_handle_node_t *node; 687251881Speter svn_error_t *err; 688251881Speter 689251881Speter SVN_ERR(get_ntou_xlate_handle_node(&node, pool)); 690251881Speter 691251881Speter if (node->handle) 692251881Speter { 693251881Speter err = convert_to_stringbuf(node, src->data, src->len, dest, pool); 694251881Speter if (! err) 695251881Speter err = check_utf8((*dest)->data, (*dest)->len, pool); 696251881Speter } 697251881Speter else 698251881Speter { 699251881Speter err = check_non_ascii(src->data, src->len, pool); 700251881Speter if (! err) 701251881Speter *dest = svn_stringbuf_dup(src, pool); 702251881Speter } 703251881Speter 704251881Speter return svn_error_compose_create(err, 705251881Speter put_xlate_handle_node 706251881Speter (node, 707251881Speter SVN_UTF_NTOU_XLATE_HANDLE, 708251881Speter pool)); 709251881Speter} 710251881Speter 711251881Speter 712251881Spetersvn_error_t * 713251881Spetersvn_utf_string_to_utf8(const svn_string_t **dest, 714251881Speter const svn_string_t *src, 715251881Speter apr_pool_t *pool) 716251881Speter{ 717251881Speter svn_stringbuf_t *destbuf; 718251881Speter xlate_handle_node_t *node; 719251881Speter svn_error_t *err; 720251881Speter 721251881Speter SVN_ERR(get_ntou_xlate_handle_node(&node, pool)); 722251881Speter 723251881Speter if (node->handle) 724251881Speter { 725251881Speter err = convert_to_stringbuf(node, src->data, src->len, &destbuf, pool); 726251881Speter if (! err) 727251881Speter err = check_utf8(destbuf->data, destbuf->len, pool); 728251881Speter if (! err) 729251881Speter *dest = svn_stringbuf__morph_into_string(destbuf); 730251881Speter } 731251881Speter else 732251881Speter { 733251881Speter err = check_non_ascii(src->data, src->len, pool); 734251881Speter if (! err) 735251881Speter *dest = svn_string_dup(src, pool); 736251881Speter } 737251881Speter 738251881Speter return svn_error_compose_create(err, 739251881Speter put_xlate_handle_node 740251881Speter (node, 741251881Speter SVN_UTF_NTOU_XLATE_HANDLE, 742251881Speter pool)); 743251881Speter} 744251881Speter 745251881Speter 746251881Speter/* Common implementation for svn_utf_cstring_to_utf8, 747251881Speter svn_utf_cstring_to_utf8_ex, svn_utf_cstring_from_utf8 and 748251881Speter svn_utf_cstring_from_utf8_ex. Convert SRC to DEST using NODE->handle as 749251881Speter the translator and allocating from POOL. */ 750251881Speterstatic svn_error_t * 751251881Speterconvert_cstring(const char **dest, 752251881Speter const char *src, 753251881Speter xlate_handle_node_t *node, 754251881Speter apr_pool_t *pool) 755251881Speter{ 756251881Speter if (node->handle) 757251881Speter { 758251881Speter svn_stringbuf_t *destbuf; 759251881Speter SVN_ERR(convert_to_stringbuf(node, src, strlen(src), 760251881Speter &destbuf, pool)); 761251881Speter *dest = destbuf->data; 762251881Speter } 763251881Speter else 764251881Speter { 765251881Speter apr_size_t len = strlen(src); 766251881Speter SVN_ERR(check_non_ascii(src, len, pool)); 767251881Speter *dest = apr_pstrmemdup(pool, src, len); 768251881Speter } 769251881Speter return SVN_NO_ERROR; 770251881Speter} 771251881Speter 772251881Speter 773251881Spetersvn_error_t * 774251881Spetersvn_utf_cstring_to_utf8(const char **dest, 775251881Speter const char *src, 776251881Speter apr_pool_t *pool) 777251881Speter{ 778251881Speter xlate_handle_node_t *node; 779251881Speter svn_error_t *err; 780251881Speter 781251881Speter SVN_ERR(get_ntou_xlate_handle_node(&node, pool)); 782251881Speter err = convert_cstring(dest, src, node, pool); 783251881Speter SVN_ERR(svn_error_compose_create(err, 784251881Speter put_xlate_handle_node 785251881Speter (node, 786251881Speter SVN_UTF_NTOU_XLATE_HANDLE, 787251881Speter pool))); 788251881Speter return check_cstring_utf8(*dest, pool); 789251881Speter} 790251881Speter 791251881Speter 792251881Spetersvn_error_t * 793251881Spetersvn_utf_cstring_to_utf8_ex2(const char **dest, 794251881Speter const char *src, 795251881Speter const char *frompage, 796251881Speter apr_pool_t *pool) 797251881Speter{ 798251881Speter xlate_handle_node_t *node; 799251881Speter svn_error_t *err; 800251881Speter const char *convset_key = get_xlate_key(SVN_APR_UTF8_CHARSET, frompage, 801251881Speter pool); 802251881Speter 803251881Speter SVN_ERR(get_xlate_handle_node(&node, SVN_APR_UTF8_CHARSET, frompage, 804251881Speter convset_key, pool)); 805251881Speter err = convert_cstring(dest, src, node, pool); 806251881Speter SVN_ERR(svn_error_compose_create(err, 807251881Speter put_xlate_handle_node 808251881Speter (node, 809251881Speter SVN_UTF_NTOU_XLATE_HANDLE, 810251881Speter pool))); 811251881Speter 812251881Speter return check_cstring_utf8(*dest, pool); 813251881Speter} 814251881Speter 815251881Speter 816251881Spetersvn_error_t * 817251881Spetersvn_utf_cstring_to_utf8_ex(const char **dest, 818251881Speter const char *src, 819251881Speter const char *frompage, 820251881Speter const char *convset_key, 821251881Speter apr_pool_t *pool) 822251881Speter{ 823251881Speter return svn_utf_cstring_to_utf8_ex2(dest, src, frompage, pool); 824251881Speter} 825251881Speter 826251881Speter 827251881Spetersvn_error_t * 828251881Spetersvn_utf_stringbuf_from_utf8(svn_stringbuf_t **dest, 829251881Speter const svn_stringbuf_t *src, 830251881Speter apr_pool_t *pool) 831251881Speter{ 832251881Speter xlate_handle_node_t *node; 833251881Speter svn_error_t *err; 834251881Speter 835251881Speter SVN_ERR(get_uton_xlate_handle_node(&node, pool)); 836251881Speter 837251881Speter if (node->handle) 838251881Speter { 839251881Speter err = check_utf8(src->data, src->len, pool); 840251881Speter if (! err) 841251881Speter err = convert_to_stringbuf(node, src->data, src->len, dest, pool); 842251881Speter } 843251881Speter else 844251881Speter { 845251881Speter err = check_non_ascii(src->data, src->len, pool); 846251881Speter if (! err) 847251881Speter *dest = svn_stringbuf_dup(src, pool); 848251881Speter } 849251881Speter 850251881Speter err = svn_error_compose_create( 851251881Speter err, 852251881Speter put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool)); 853251881Speter 854251881Speter return err; 855251881Speter} 856251881Speter 857251881Speter 858251881Spetersvn_error_t * 859251881Spetersvn_utf_string_from_utf8(const svn_string_t **dest, 860251881Speter const svn_string_t *src, 861251881Speter apr_pool_t *pool) 862251881Speter{ 863251881Speter svn_stringbuf_t *dbuf; 864251881Speter xlate_handle_node_t *node; 865251881Speter svn_error_t *err; 866251881Speter 867251881Speter SVN_ERR(get_uton_xlate_handle_node(&node, pool)); 868251881Speter 869251881Speter if (node->handle) 870251881Speter { 871251881Speter err = check_utf8(src->data, src->len, pool); 872251881Speter if (! err) 873251881Speter err = convert_to_stringbuf(node, src->data, src->len, 874251881Speter &dbuf, pool); 875251881Speter if (! err) 876251881Speter *dest = svn_stringbuf__morph_into_string(dbuf); 877251881Speter } 878251881Speter else 879251881Speter { 880251881Speter err = check_non_ascii(src->data, src->len, pool); 881251881Speter if (! err) 882251881Speter *dest = svn_string_dup(src, pool); 883251881Speter } 884251881Speter 885251881Speter err = svn_error_compose_create( 886251881Speter err, 887251881Speter put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool)); 888251881Speter 889251881Speter return err; 890251881Speter} 891251881Speter 892251881Speter 893251881Spetersvn_error_t * 894251881Spetersvn_utf_cstring_from_utf8(const char **dest, 895251881Speter const char *src, 896251881Speter apr_pool_t *pool) 897251881Speter{ 898251881Speter xlate_handle_node_t *node; 899251881Speter svn_error_t *err; 900251881Speter 901251881Speter SVN_ERR(check_cstring_utf8(src, pool)); 902251881Speter 903251881Speter SVN_ERR(get_uton_xlate_handle_node(&node, pool)); 904251881Speter err = convert_cstring(dest, src, node, pool); 905251881Speter err = svn_error_compose_create( 906251881Speter err, 907251881Speter put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool)); 908251881Speter 909251881Speter return err; 910251881Speter} 911251881Speter 912251881Speter 913251881Spetersvn_error_t * 914251881Spetersvn_utf_cstring_from_utf8_ex2(const char **dest, 915251881Speter const char *src, 916251881Speter const char *topage, 917251881Speter apr_pool_t *pool) 918251881Speter{ 919251881Speter xlate_handle_node_t *node; 920251881Speter svn_error_t *err; 921251881Speter const char *convset_key = get_xlate_key(topage, SVN_APR_UTF8_CHARSET, 922251881Speter pool); 923251881Speter 924251881Speter SVN_ERR(check_cstring_utf8(src, pool)); 925251881Speter 926251881Speter SVN_ERR(get_xlate_handle_node(&node, topage, SVN_APR_UTF8_CHARSET, 927251881Speter convset_key, pool)); 928251881Speter err = convert_cstring(dest, src, node, pool); 929251881Speter err = svn_error_compose_create( 930251881Speter err, 931251881Speter put_xlate_handle_node(node, convset_key, pool)); 932251881Speter 933251881Speter return err; 934251881Speter} 935251881Speter 936251881Speterconst char * 937251881Spetersvn_utf__cstring_from_utf8_fuzzy(const char *src, 938251881Speter apr_pool_t *pool, 939251881Speter svn_error_t *(*convert_from_utf8) 940251881Speter (const char **, const char *, apr_pool_t *)) 941251881Speter{ 942251881Speter const char *escaped, *converted; 943251881Speter svn_error_t *err; 944251881Speter 945299742Sdim escaped = svn_utf__fuzzy_escape(src, strlen(src), pool); 946251881Speter 947251881Speter /* Okay, now we have a *new* UTF-8 string, one that's guaranteed to 948251881Speter contain only 7-bit bytes :-). Recode to native... */ 949251881Speter err = convert_from_utf8(((const char **) &converted), escaped, pool); 950251881Speter 951251881Speter if (err) 952251881Speter { 953251881Speter svn_error_clear(err); 954251881Speter return escaped; 955251881Speter } 956251881Speter else 957251881Speter return converted; 958251881Speter 959251881Speter /* ### Check the client locale, maybe we can avoid that second 960251881Speter * conversion! See Ulrich Drepper's patch at 961251881Speter * http://subversion.tigris.org/issues/show_bug.cgi?id=807. 962251881Speter */ 963251881Speter} 964251881Speter 965251881Speter 966251881Speterconst char * 967251881Spetersvn_utf_cstring_from_utf8_fuzzy(const char *src, 968251881Speter apr_pool_t *pool) 969251881Speter{ 970251881Speter return svn_utf__cstring_from_utf8_fuzzy(src, pool, 971251881Speter svn_utf_cstring_from_utf8); 972251881Speter} 973251881Speter 974251881Speter 975251881Spetersvn_error_t * 976251881Spetersvn_utf_cstring_from_utf8_stringbuf(const char **dest, 977251881Speter const svn_stringbuf_t *src, 978251881Speter apr_pool_t *pool) 979251881Speter{ 980251881Speter svn_stringbuf_t *destbuf; 981251881Speter 982251881Speter SVN_ERR(svn_utf_stringbuf_from_utf8(&destbuf, src, pool)); 983251881Speter *dest = destbuf->data; 984251881Speter 985251881Speter return SVN_NO_ERROR; 986251881Speter} 987251881Speter 988251881Speter 989251881Spetersvn_error_t * 990251881Spetersvn_utf_cstring_from_utf8_string(const char **dest, 991251881Speter const svn_string_t *src, 992251881Speter apr_pool_t *pool) 993251881Speter{ 994251881Speter svn_stringbuf_t *dbuf; 995251881Speter xlate_handle_node_t *node; 996251881Speter svn_error_t *err; 997251881Speter 998251881Speter SVN_ERR(get_uton_xlate_handle_node(&node, pool)); 999251881Speter 1000251881Speter if (node->handle) 1001251881Speter { 1002251881Speter err = check_utf8(src->data, src->len, pool); 1003251881Speter if (! err) 1004251881Speter err = convert_to_stringbuf(node, src->data, src->len, 1005251881Speter &dbuf, pool); 1006251881Speter if (! err) 1007251881Speter *dest = dbuf->data; 1008251881Speter } 1009251881Speter else 1010251881Speter { 1011251881Speter err = check_non_ascii(src->data, src->len, pool); 1012251881Speter if (! err) 1013251881Speter *dest = apr_pstrmemdup(pool, src->data, src->len); 1014251881Speter } 1015251881Speter 1016251881Speter err = svn_error_compose_create( 1017251881Speter err, 1018251881Speter put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool)); 1019251881Speter 1020251881Speter return err; 1021251881Speter} 1022299742Sdim 1023299742Sdim 1024299742Sdim/* Insert the given UCS-4 VALUE into BUF at the given OFFSET. */ 1025299742Sdimstatic void 1026299742Sdimmembuf_insert_ucs4(svn_membuf_t *buf, apr_size_t offset, apr_int32_t value) 1027299742Sdim{ 1028299742Sdim svn_membuf__resize(buf, (offset + 1) * sizeof(value)); 1029299742Sdim ((apr_int32_t*)buf->data)[offset] = value; 1030299742Sdim} 1031299742Sdim 1032299742Sdim/* TODO: Use compiler intrinsics for byte swaps. */ 1033299742Sdim#define SWAP_SHORT(x) ((((x) & 0xff) << 8) | (((x) >> 8) & 0xff)) 1034299742Sdim#define SWAP_LONG(x) ((((x) & 0xff) << 24) | (((x) & 0xff00) << 8) \ 1035299742Sdim | (((x) >> 8) & 0xff00) | (((x) >> 24) & 0xff)) 1036299742Sdim 1037299742Sdim#define IS_UTF16_LEAD_SURROGATE(c) ((c) >= 0xd800 && (c) <= 0xdbff) 1038299742Sdim#define IS_UTF16_TRAIL_SURROGATE(c) ((c) >= 0xdc00 && (c) <= 0xdfff) 1039299742Sdim 1040299742Sdimsvn_error_t * 1041299742Sdimsvn_utf__utf16_to_utf8(const svn_string_t **result, 1042299742Sdim const apr_uint16_t *utf16str, 1043299742Sdim apr_size_t utf16len, 1044299742Sdim svn_boolean_t big_endian, 1045299742Sdim apr_pool_t *result_pool, 1046299742Sdim apr_pool_t *scratch_pool) 1047299742Sdim{ 1048299742Sdim static const apr_uint16_t endiancheck = 0xa55a; 1049299742Sdim const svn_boolean_t arch_big_endian = 1050299742Sdim (((const char*)&endiancheck)[sizeof(endiancheck) - 1] == '\x5a'); 1051299742Sdim const svn_boolean_t swap_order = (!big_endian != !arch_big_endian); 1052299742Sdim 1053299742Sdim apr_uint16_t lead_surrogate; 1054299742Sdim apr_size_t length; 1055299742Sdim apr_size_t offset; 1056299742Sdim svn_membuf_t ucs4buf; 1057299742Sdim svn_membuf_t resultbuf; 1058299742Sdim svn_string_t *res; 1059299742Sdim 1060299742Sdim if (utf16len == SVN_UTF__UNKNOWN_LENGTH) 1061299742Sdim { 1062299742Sdim const apr_uint16_t *endp = utf16str; 1063299742Sdim while (*endp++) 1064299742Sdim ; 1065299742Sdim utf16len = (endp - utf16str); 1066299742Sdim } 1067299742Sdim 1068299742Sdim svn_membuf__create(&ucs4buf, utf16len * sizeof(apr_int32_t), scratch_pool); 1069299742Sdim 1070299742Sdim for (lead_surrogate = 0, length = 0, offset = 0; 1071299742Sdim offset < utf16len; ++offset) 1072299742Sdim { 1073299742Sdim const apr_uint16_t code = 1074299742Sdim (swap_order ? SWAP_SHORT(utf16str[offset]) : utf16str[offset]); 1075299742Sdim 1076299742Sdim if (lead_surrogate) 1077299742Sdim { 1078299742Sdim if (IS_UTF16_TRAIL_SURROGATE(code)) 1079299742Sdim { 1080299742Sdim /* Combine the lead and trail currogates into a 32-bit code. */ 1081299742Sdim membuf_insert_ucs4(&ucs4buf, length++, 1082299742Sdim (0x010000 1083299742Sdim + (((lead_surrogate & 0x03ff) << 10) 1084299742Sdim | (code & 0x03ff)))); 1085299742Sdim lead_surrogate = 0; 1086299742Sdim continue; 1087299742Sdim } 1088299742Sdim else 1089299742Sdim { 1090299742Sdim /* If we didn't find a surrogate pair, just dump the 1091299742Sdim lead surrogate into the stream. */ 1092299742Sdim membuf_insert_ucs4(&ucs4buf, length++, lead_surrogate); 1093299742Sdim lead_surrogate = 0; 1094299742Sdim } 1095299742Sdim } 1096299742Sdim 1097299742Sdim if ((offset + 1) < utf16len && IS_UTF16_LEAD_SURROGATE(code)) 1098299742Sdim { 1099299742Sdim /* Store a lead surrogate that is followed by at least one 1100299742Sdim code for the next iteration. */ 1101299742Sdim lead_surrogate = code; 1102299742Sdim continue; 1103299742Sdim } 1104299742Sdim else 1105299742Sdim membuf_insert_ucs4(&ucs4buf, length++, code); 1106299742Sdim } 1107299742Sdim 1108299742Sdim /* Convert the UCS-4 buffer to UTF-8, assuming an average of 2 bytes 1109299742Sdim per code point for encoding. The buffer will grow as 1110299742Sdim necessary. */ 1111299742Sdim svn_membuf__create(&resultbuf, length * 2, result_pool); 1112299742Sdim SVN_ERR(svn_utf__encode_ucs4_string( 1113299742Sdim &resultbuf, ucs4buf.data, length, &length)); 1114299742Sdim 1115299742Sdim res = apr_palloc(result_pool, sizeof(*res)); 1116299742Sdim res->data = resultbuf.data; 1117299742Sdim res->len = length; 1118299742Sdim *result = res; 1119299742Sdim return SVN_NO_ERROR; 1120299742Sdim} 1121299742Sdim 1122299742Sdim 1123299742Sdimsvn_error_t * 1124299742Sdimsvn_utf__utf32_to_utf8(const svn_string_t **result, 1125299742Sdim const apr_int32_t *utf32str, 1126299742Sdim apr_size_t utf32len, 1127299742Sdim svn_boolean_t big_endian, 1128299742Sdim apr_pool_t *result_pool, 1129299742Sdim apr_pool_t *scratch_pool) 1130299742Sdim{ 1131299742Sdim static const apr_int32_t endiancheck = 0xa5cbbc5a; 1132299742Sdim const svn_boolean_t arch_big_endian = 1133299742Sdim (((const char*)&endiancheck)[sizeof(endiancheck) - 1] == '\x5a'); 1134299742Sdim const svn_boolean_t swap_order = (!big_endian != !arch_big_endian); 1135299742Sdim 1136299742Sdim apr_size_t length; 1137299742Sdim svn_membuf_t resultbuf; 1138299742Sdim svn_string_t *res; 1139299742Sdim 1140299742Sdim if (utf32len == SVN_UTF__UNKNOWN_LENGTH) 1141299742Sdim { 1142299742Sdim const apr_int32_t *endp = utf32str; 1143299742Sdim while (*endp++) 1144299742Sdim ; 1145299742Sdim utf32len = (endp - utf32str); 1146299742Sdim } 1147299742Sdim 1148299742Sdim if (swap_order) 1149299742Sdim { 1150299742Sdim apr_size_t offset; 1151299742Sdim svn_membuf_t ucs4buf; 1152299742Sdim 1153299742Sdim svn_membuf__create(&ucs4buf, utf32len * sizeof(apr_int32_t), 1154299742Sdim scratch_pool); 1155299742Sdim 1156299742Sdim for (offset = 0; offset < utf32len; ++offset) 1157299742Sdim { 1158299742Sdim const apr_int32_t code = SWAP_LONG(utf32str[offset]); 1159299742Sdim membuf_insert_ucs4(&ucs4buf, offset, code); 1160299742Sdim } 1161299742Sdim utf32str = ucs4buf.data; 1162299742Sdim } 1163299742Sdim 1164299742Sdim /* Convert the UCS-4 buffer to UTF-8, assuming an average of 2 bytes 1165299742Sdim per code point for encoding. The buffer will grow as 1166299742Sdim necessary. */ 1167299742Sdim svn_membuf__create(&resultbuf, utf32len * 2, result_pool); 1168299742Sdim SVN_ERR(svn_utf__encode_ucs4_string( 1169299742Sdim &resultbuf, utf32str, utf32len, &length)); 1170299742Sdim 1171299742Sdim res = apr_palloc(result_pool, sizeof(*res)); 1172299742Sdim res->data = resultbuf.data; 1173299742Sdim res->len = length; 1174299742Sdim *result = res; 1175299742Sdim return SVN_NO_ERROR; 1176299742Sdim} 1177299742Sdim 1178299742Sdim 1179299742Sdim#ifdef WIN32 1180299742Sdim 1181299742Sdim 1182299742Sdimsvn_error_t * 1183299742Sdimsvn_utf__win32_utf8_to_utf16(const WCHAR **result, 1184299742Sdim const char *src, 1185299742Sdim const WCHAR *prefix, 1186299742Sdim apr_pool_t *result_pool) 1187299742Sdim{ 1188299742Sdim const int utf8_count = strlen(src); 1189299742Sdim const int prefix_len = (prefix ? lstrlenW(prefix) : 0); 1190299742Sdim WCHAR *wide_str; 1191299742Sdim int wide_count; 1192299742Sdim 1193299742Sdim if (0 == prefix_len + utf8_count) 1194299742Sdim { 1195299742Sdim *result = L""; 1196299742Sdim return SVN_NO_ERROR; 1197299742Sdim } 1198299742Sdim 1199299742Sdim wide_count = MultiByteToWideChar(CP_UTF8, 0, src, utf8_count, NULL, 0); 1200299742Sdim if (wide_count == 0) 1201299742Sdim return svn_error_wrap_apr(apr_get_os_error(), 1202299742Sdim _("Conversion to UTF-16 failed")); 1203299742Sdim 1204299742Sdim wide_str = apr_palloc(result_pool, 1205299742Sdim (prefix_len + wide_count + 1) * sizeof(*wide_str)); 1206299742Sdim if (prefix_len) 1207299742Sdim memcpy(wide_str, prefix, prefix_len * sizeof(*wide_str)); 1208299742Sdim if (0 == MultiByteToWideChar(CP_UTF8, 0, src, utf8_count, 1209299742Sdim wide_str + prefix_len, wide_count)) 1210299742Sdim return svn_error_wrap_apr(apr_get_os_error(), 1211299742Sdim _("Conversion to UTF-16 failed")); 1212299742Sdim 1213299742Sdim wide_str[prefix_len + wide_count] = 0; 1214299742Sdim *result = wide_str; 1215299742Sdim 1216299742Sdim return SVN_NO_ERROR; 1217299742Sdim} 1218299742Sdim 1219299742Sdimsvn_error_t * 1220299742Sdimsvn_utf__win32_utf16_to_utf8(const char **result, 1221299742Sdim const WCHAR *src, 1222299742Sdim const char *prefix, 1223299742Sdim apr_pool_t *result_pool) 1224299742Sdim{ 1225299742Sdim const int wide_count = lstrlenW(src); 1226299742Sdim const int prefix_len = (prefix ? strlen(prefix) : 0); 1227299742Sdim char *utf8_str; 1228299742Sdim int utf8_count; 1229299742Sdim 1230299742Sdim if (0 == prefix_len + wide_count) 1231299742Sdim { 1232299742Sdim *result = ""; 1233299742Sdim return SVN_NO_ERROR; 1234299742Sdim } 1235299742Sdim 1236299742Sdim utf8_count = WideCharToMultiByte(CP_UTF8, 0, src, wide_count, 1237299742Sdim NULL, 0, NULL, FALSE); 1238299742Sdim if (utf8_count == 0) 1239299742Sdim return svn_error_wrap_apr(apr_get_os_error(), 1240299742Sdim _("Conversion from UTF-16 failed")); 1241299742Sdim 1242299742Sdim utf8_str = apr_palloc(result_pool, 1243299742Sdim (prefix_len + utf8_count + 1) * sizeof(*utf8_str)); 1244299742Sdim if (prefix_len) 1245299742Sdim memcpy(utf8_str, prefix, prefix_len * sizeof(*utf8_str)); 1246299742Sdim if (0 == WideCharToMultiByte(CP_UTF8, 0, src, wide_count, 1247299742Sdim utf8_str + prefix_len, utf8_count, 1248299742Sdim NULL, FALSE)) 1249299742Sdim return svn_error_wrap_apr(apr_get_os_error(), 1250299742Sdim _("Conversion from UTF-16 failed")); 1251299742Sdim 1252299742Sdim utf8_str[prefix_len + utf8_count] = 0; 1253299742Sdim *result = utf8_str; 1254299742Sdim 1255299742Sdim return SVN_NO_ERROR; 1256299742Sdim} 1257299742Sdim 1258299742Sdim#endif /* WIN32 */ 1259