archive_string.c revision 248616
1228753Smm/*- 2232153Smm * Copyright (c) 2003-2011 Tim Kientzle 3232153Smm * Copyright (c) 2011-2012 Michihiro NAKAJIMA 4228753Smm * All rights reserved. 5228753Smm * 6228753Smm * Redistribution and use in source and binary forms, with or without 7228753Smm * modification, are permitted provided that the following conditions 8228753Smm * are met: 9228753Smm * 1. Redistributions of source code must retain the above copyright 10228753Smm * notice, this list of conditions and the following disclaimer. 11228753Smm * 2. Redistributions in binary form must reproduce the above copyright 12228753Smm * notice, this list of conditions and the following disclaimer in the 13228753Smm * documentation and/or other materials provided with the distribution. 14228753Smm * 15228753Smm * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR 16228753Smm * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 17228753Smm * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 18228753Smm * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT, 19228753Smm * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 20228753Smm * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 21228753Smm * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 22228753Smm * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23228753Smm * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 24228753Smm * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25228753Smm */ 26228753Smm 27228753Smm#include "archive_platform.h" 28228763Smm__FBSDID("$FreeBSD: head/contrib/libarchive/libarchive/archive_string.c 248616 2013-03-22 13:36:03Z mm $"); 29228753Smm 30228753Smm/* 31228753Smm * Basic resizable string support, to simplify manipulating arbitrary-sized 32228753Smm * strings while minimizing heap activity. 
33232153Smm * 34232153Smm * In particular, the buffer used by a string object is only grown, it 35232153Smm * never shrinks, so you can clear and reuse the same string object 36232153Smm * without incurring additional memory allocations. 37228753Smm */ 38228753Smm 39232153Smm#ifdef HAVE_ERRNO_H 40232153Smm#include <errno.h> 41232153Smm#endif 42232153Smm#ifdef HAVE_ICONV_H 43232153Smm#include <iconv.h> 44232153Smm#endif 45232153Smm#ifdef HAVE_LANGINFO_H 46232153Smm#include <langinfo.h> 47232153Smm#endif 48232153Smm#ifdef HAVE_LOCALCHARSET_H 49232153Smm#include <localcharset.h> 50232153Smm#endif 51228753Smm#ifdef HAVE_STDLIB_H 52228753Smm#include <stdlib.h> 53228753Smm#endif 54228753Smm#ifdef HAVE_STRING_H 55228753Smm#include <string.h> 56228753Smm#endif 57228753Smm#ifdef HAVE_WCHAR_H 58228753Smm#include <wchar.h> 59228753Smm#endif 60228753Smm#if defined(_WIN32) && !defined(__CYGWIN__) 61228753Smm#include <windows.h> 62232153Smm#include <locale.h> 63228753Smm#endif 64228753Smm 65232153Smm#include "archive_endian.h" 66228753Smm#include "archive_private.h" 67228753Smm#include "archive_string.h" 68232153Smm#include "archive_string_composition.h" 69228753Smm 70232153Smm#if !defined(HAVE_WMEMCPY) && !defined(wmemcpy) 71232153Smm#define wmemcpy(a,b,i) (wchar_t *)memcpy((a), (b), (i) * sizeof(wchar_t)) 72232153Smm#endif 73232153Smm 74232153Smmstruct archive_string_conv { 75232153Smm struct archive_string_conv *next; 76232153Smm char *from_charset; 77232153Smm char *to_charset; 78232153Smm unsigned from_cp; 79232153Smm unsigned to_cp; 80232153Smm /* Set 1 if from_charset and to_charset are the same. */ 81232153Smm int same; 82232153Smm int flag; 83232153Smm#define SCONV_TO_CHARSET 1 /* MBS is being converted to specified 84232153Smm * charset. */ 85232153Smm#define SCONV_FROM_CHARSET (1<<1) /* MBS is being converted from 86232153Smm * specified charset. */ 87232153Smm#define SCONV_BEST_EFFORT (1<<2) /* Copy at least ASCII code. 
*/ 88232153Smm#define SCONV_WIN_CP (1<<3) /* Use Windows API for converting 89232153Smm * MBS. */ 90232153Smm#define SCONV_UTF8_LIBARCHIVE_2 (1<<4) /* Incorrect UTF-8 made by libarchive 91232153Smm * 2.x in the wrong assumption. */ 92232153Smm#define SCONV_NORMALIZATION_C (1<<6) /* Need normalization to be Form C. 93232153Smm * Before UTF-8 characters are actually 94232153Smm * processed. */ 95232153Smm#define SCONV_NORMALIZATION_D (1<<7) /* Need normalization to be Form D. 96232153Smm * Before UTF-8 characters are actually 97232153Smm * processed. 98232153Smm * Currently this only for MAC OS X. */ 99232153Smm#define SCONV_TO_UTF8 (1<<8) /* "to charset" side is UTF-8. */ 100232153Smm#define SCONV_FROM_UTF8 (1<<9) /* "from charset" side is UTF-8. */ 101232153Smm#define SCONV_TO_UTF16BE (1<<10) /* "to charset" side is UTF-16BE. */ 102232153Smm#define SCONV_FROM_UTF16BE (1<<11) /* "from charset" side is UTF-16BE. */ 103232153Smm#define SCONV_TO_UTF16LE (1<<12) /* "to charset" side is UTF-16LE. */ 104232153Smm#define SCONV_FROM_UTF16LE (1<<13) /* "from charset" side is UTF-16LE. */ 105232153Smm#define SCONV_TO_UTF16 (SCONV_TO_UTF16BE | SCONV_TO_UTF16LE) 106232153Smm#define SCONV_FROM_UTF16 (SCONV_FROM_UTF16BE | SCONV_FROM_UTF16LE) 107232153Smm 108232153Smm#if HAVE_ICONV 109232153Smm iconv_t cd; 110232153Smm iconv_t cd_w;/* Use at archive_mstring on 111232153Smm * Windows. */ 112232153Smm#endif 113232153Smm /* A temporary buffer for normalization. */ 114232153Smm struct archive_string utftmp; 115232153Smm int (*converter[2])(struct archive_string *, const void *, size_t, 116232153Smm struct archive_string_conv *); 117232153Smm int nconverter; 118232153Smm}; 119232153Smm 120232153Smm#define CP_C_LOCALE 0 /* "C" locale only for this file. 
*/ 121232153Smm#define CP_UTF16LE 1200 122232153Smm#define CP_UTF16BE 1201 123232153Smm 124232153Smm#define IS_HIGH_SURROGATE_LA(uc) ((uc) >= 0xD800 && (uc) <= 0xDBFF) 125232153Smm#define IS_LOW_SURROGATE_LA(uc) ((uc) >= 0xDC00 && (uc) <= 0xDFFF) 126232153Smm#define IS_SURROGATE_PAIR_LA(uc) ((uc) >= 0xD800 && (uc) <= 0xDFFF) 127232153Smm#define UNICODE_MAX 0x10FFFF 128232153Smm#define UNICODE_R_CHAR 0xFFFD /* Replacement character. */ 129232153Smm/* Set U+FFFD(Replacement character) in UTF-8. */ 130232153Smm#define UTF8_SET_R_CHAR(outp) do { \ 131232153Smm (outp)[0] = 0xef; \ 132232153Smm (outp)[1] = 0xbf; \ 133232153Smm (outp)[2] = 0xbd; \ 134232153Smm} while (0) 135232153Smm#define UTF8_R_CHAR_SIZE 3 136232153Smm 137232153Smmstatic struct archive_string_conv *find_sconv_object(struct archive *, 138232153Smm const char *, const char *); 139232153Smmstatic void add_sconv_object(struct archive *, struct archive_string_conv *); 140232153Smmstatic struct archive_string_conv *create_sconv_object(const char *, 141232153Smm const char *, unsigned, int); 142232153Smmstatic void free_sconv_object(struct archive_string_conv *); 143232153Smmstatic struct archive_string_conv *get_sconv_object(struct archive *, 144232153Smm const char *, const char *, int); 145232153Smmstatic unsigned make_codepage_from_charset(const char *); 146232153Smmstatic unsigned get_current_codepage(void); 147232153Smmstatic unsigned get_current_oemcp(void); 148232153Smmstatic size_t mbsnbytes(const void *, size_t); 149232153Smmstatic size_t utf16nbytes(const void *, size_t); 150232153Smm#if defined(_WIN32) && !defined(__CYGWIN__) 151232153Smmstatic int archive_wstring_append_from_mbs_in_codepage( 152232153Smm struct archive_wstring *, const char *, size_t, 153232153Smm struct archive_string_conv *); 154232153Smmstatic int archive_string_append_from_wcs_in_codepage(struct archive_string *, 155232153Smm const wchar_t *, size_t, struct archive_string_conv *); 156232153Smmstatic int is_big_endian(void); 
157232153Smmstatic int strncat_in_codepage(struct archive_string *, const void *, 158232153Smm size_t, struct archive_string_conv *); 159238856Smmstatic int win_strncat_from_utf16be(struct archive_string *, const void *, 160232153Smm size_t, struct archive_string_conv *); 161238856Smmstatic int win_strncat_from_utf16le(struct archive_string *, const void *, 162232153Smm size_t, struct archive_string_conv *); 163238856Smmstatic int win_strncat_to_utf16be(struct archive_string *, const void *, 164232153Smm size_t, struct archive_string_conv *); 165238856Smmstatic int win_strncat_to_utf16le(struct archive_string *, const void *, 166232153Smm size_t, struct archive_string_conv *); 167238856Smm#endif 168238856Smmstatic int best_effort_strncat_from_utf16be(struct archive_string *, 169238856Smm const void *, size_t, struct archive_string_conv *); 170238856Smmstatic int best_effort_strncat_from_utf16le(struct archive_string *, 171238856Smm const void *, size_t, struct archive_string_conv *); 172238856Smmstatic int best_effort_strncat_to_utf16be(struct archive_string *, 173238856Smm const void *, size_t, struct archive_string_conv *); 174238856Smmstatic int best_effort_strncat_to_utf16le(struct archive_string *, 175238856Smm const void *, size_t, struct archive_string_conv *); 176232153Smm#if defined(HAVE_ICONV) 177232153Smmstatic int iconv_strncat_in_locale(struct archive_string *, const void *, 178232153Smm size_t, struct archive_string_conv *); 179232153Smm#endif 180238856Smmstatic int best_effort_strncat_in_locale(struct archive_string *, 181238856Smm const void *, size_t, struct archive_string_conv *); 182232153Smmstatic int _utf8_to_unicode(uint32_t *, const char *, size_t); 183232153Smmstatic int utf8_to_unicode(uint32_t *, const char *, size_t); 184232153Smmstatic inline uint32_t combine_surrogate_pair(uint32_t, uint32_t); 185232153Smmstatic int cesu8_to_unicode(uint32_t *, const char *, size_t); 186232153Smmstatic size_t unicode_to_utf8(char *, size_t, uint32_t); 
187232153Smmstatic int utf16_to_unicode(uint32_t *, const char *, size_t, int); 188232153Smmstatic size_t unicode_to_utf16be(char *, size_t, uint32_t); 189232153Smmstatic size_t unicode_to_utf16le(char *, size_t, uint32_t); 190232153Smmstatic int strncat_from_utf8_libarchive2(struct archive_string *, 191232153Smm const void *, size_t, struct archive_string_conv *); 192232153Smmstatic int strncat_from_utf8_to_utf8(struct archive_string *, const void *, 193232153Smm size_t, struct archive_string_conv *); 194232153Smmstatic int archive_string_normalize_C(struct archive_string *, const void *, 195232153Smm size_t, struct archive_string_conv *); 196232153Smmstatic int archive_string_normalize_D(struct archive_string *, const void *, 197232153Smm size_t, struct archive_string_conv *); 198232153Smmstatic int archive_string_append_unicode(struct archive_string *, 199232153Smm const void *, size_t, struct archive_string_conv *); 200232153Smm 201232153Smmstatic struct archive_string * 202232153Smmarchive_string_append(struct archive_string *as, const char *p, size_t s) 203228753Smm{ 204232153Smm if (archive_string_ensure(as, as->length + s + 1) == NULL) 205232153Smm return (NULL); 206228753Smm memcpy(as->s + as->length, p, s); 207228753Smm as->length += s; 208232153Smm as->s[as->length] = 0; 209228753Smm return (as); 210228753Smm} 211228753Smm 212232153Smmstatic struct archive_wstring * 213232153Smmarchive_wstring_append(struct archive_wstring *as, const wchar_t *p, size_t s) 214232153Smm{ 215232153Smm if (archive_wstring_ensure(as, as->length + s + 1) == NULL) 216232153Smm return (NULL); 217232153Smm wmemcpy(as->s + as->length, p, s); 218232153Smm as->length += s; 219232153Smm as->s[as->length] = 0; 220232153Smm return (as); 221232153Smm} 222232153Smm 223228753Smmvoid 224232153Smmarchive_string_concat(struct archive_string *dest, struct archive_string *src) 225228753Smm{ 226232153Smm if (archive_string_append(dest, src->s, src->length) == NULL) 227232153Smm 
__archive_errx(1, "Out of memory"); 228228753Smm} 229228753Smm 230228753Smmvoid 231238856Smmarchive_wstring_concat(struct archive_wstring *dest, 232238856Smm struct archive_wstring *src) 233228753Smm{ 234232153Smm if (archive_wstring_append(dest, src->s, src->length) == NULL) 235232153Smm __archive_errx(1, "Out of memory"); 236228753Smm} 237228753Smm 238228753Smmvoid 239232153Smmarchive_string_free(struct archive_string *as) 240228753Smm{ 241228753Smm as->length = 0; 242228753Smm as->buffer_length = 0; 243232153Smm free(as->s); 244232153Smm as->s = NULL; 245228753Smm} 246228753Smm 247232153Smmvoid 248232153Smmarchive_wstring_free(struct archive_wstring *as) 249232153Smm{ 250232153Smm as->length = 0; 251232153Smm as->buffer_length = 0; 252232153Smm free(as->s); 253232153Smm as->s = NULL; 254232153Smm} 255232153Smm 256232153Smmstruct archive_wstring * 257232153Smmarchive_wstring_ensure(struct archive_wstring *as, size_t s) 258232153Smm{ 259232153Smm return (struct archive_wstring *) 260232153Smm archive_string_ensure((struct archive_string *)as, 261232153Smm s * sizeof(wchar_t)); 262232153Smm} 263232153Smm 264228753Smm/* Returns NULL on any allocation failure. */ 265228753Smmstruct archive_string * 266232153Smmarchive_string_ensure(struct archive_string *as, size_t s) 267228753Smm{ 268232153Smm char *p; 269232153Smm size_t new_length; 270232153Smm 271228753Smm /* If buffer is already big enough, don't reallocate. */ 272228753Smm if (as->s && (s <= as->buffer_length)) 273228753Smm return (as); 274228753Smm 275228753Smm /* 276228753Smm * Growing the buffer at least exponentially ensures that 277228753Smm * append operations are always linear in the number of 278228753Smm * characters appended. Using a smaller growth rate for 279228753Smm * larger buffers reduces memory waste somewhat at the cost of 280228753Smm * a larger constant factor. 281228753Smm */ 282228753Smm if (as->buffer_length < 32) 283228753Smm /* Start with a minimum 32-character buffer. 
*/ 284232153Smm new_length = 32; 285228753Smm else if (as->buffer_length < 8192) 286228753Smm /* Buffers under 8k are doubled for speed. */ 287232153Smm new_length = as->buffer_length + as->buffer_length; 288228753Smm else { 289228753Smm /* Buffers 8k and over grow by at least 25% each time. */ 290232153Smm new_length = as->buffer_length + as->buffer_length / 4; 291232153Smm /* Be safe: If size wraps, fail. */ 292232153Smm if (new_length < as->buffer_length) { 293232153Smm /* On failure, wipe the string and return NULL. */ 294232153Smm archive_string_free(as); 295232153Smm errno = ENOMEM;/* Make sure errno has ENOMEM. */ 296228753Smm return (NULL); 297228753Smm } 298228753Smm } 299228753Smm /* 300228753Smm * The computation above is a lower limit to how much we'll 301228753Smm * grow the buffer. In any case, we have to grow it enough to 302228753Smm * hold the request. 303228753Smm */ 304232153Smm if (new_length < s) 305232153Smm new_length = s; 306228753Smm /* Now we can reallocate the buffer. */ 307232153Smm p = (char *)realloc(as->s, new_length); 308232153Smm if (p == NULL) { 309232153Smm /* On failure, wipe the string and return NULL. */ 310232153Smm archive_string_free(as); 311232153Smm errno = ENOMEM;/* Make sure errno has ENOMEM. */ 312228753Smm return (NULL); 313232153Smm } 314232153Smm 315232153Smm as->s = p; 316232153Smm as->buffer_length = new_length; 317228753Smm return (as); 318228753Smm} 319228753Smm 320232153Smm/* 321232153Smm * TODO: See if there's a way to avoid scanning 322232153Smm * the source string twice. Then test to see 323232153Smm * if it actually helps (remember that we're almost 324232153Smm * always called with pretty short arguments, so 325232153Smm * such an optimization might not help). 
326232153Smm */ 327228753Smmstruct archive_string * 328232153Smmarchive_strncat(struct archive_string *as, const void *_p, size_t n) 329228753Smm{ 330228753Smm size_t s; 331228753Smm const char *p, *pp; 332228753Smm 333228753Smm p = (const char *)_p; 334228753Smm 335228753Smm /* Like strlen(p), except won't examine positions beyond p[n]. */ 336228753Smm s = 0; 337228753Smm pp = p; 338228753Smm while (s < n && *pp) { 339228753Smm pp++; 340228753Smm s++; 341228753Smm } 342232153Smm if ((as = archive_string_append(as, p, s)) == NULL) 343232153Smm __archive_errx(1, "Out of memory"); 344232153Smm return (as); 345228753Smm} 346228753Smm 347232153Smmstruct archive_wstring * 348232153Smmarchive_wstrncat(struct archive_wstring *as, const wchar_t *p, size_t n) 349232153Smm{ 350232153Smm size_t s; 351232153Smm const wchar_t *pp; 352232153Smm 353232153Smm /* Like strlen(p), except won't examine positions beyond p[n]. */ 354232153Smm s = 0; 355232153Smm pp = p; 356232153Smm while (s < n && *pp) { 357232153Smm pp++; 358232153Smm s++; 359232153Smm } 360232153Smm if ((as = archive_wstring_append(as, p, s)) == NULL) 361232153Smm __archive_errx(1, "Out of memory"); 362232153Smm return (as); 363232153Smm} 364232153Smm 365228753Smmstruct archive_string * 366232153Smmarchive_strcat(struct archive_string *as, const void *p) 367228753Smm{ 368232153Smm /* strcat is just strncat without an effective limit. 369232153Smm * Assert that we'll never get called with a source 370232153Smm * string over 16MB. 371232153Smm * TODO: Review all uses of strcat in the source 372232153Smm * and try to replace them with strncat(). 373232153Smm */ 374232153Smm return archive_strncat(as, p, 0x1000000); 375228753Smm} 376228753Smm 377232153Smmstruct archive_wstring * 378232153Smmarchive_wstrcat(struct archive_wstring *as, const wchar_t *p) 379232153Smm{ 380232153Smm /* Ditto. 
*/ 381232153Smm return archive_wstrncat(as, p, 0x1000000); 382232153Smm} 383232153Smm 384232153Smmstruct archive_string * 385232153Smmarchive_strappend_char(struct archive_string *as, char c) 386232153Smm{ 387232153Smm if ((as = archive_string_append(as, &c, 1)) == NULL) 388232153Smm __archive_errx(1, "Out of memory"); 389232153Smm return (as); 390232153Smm} 391232153Smm 392232153Smmstruct archive_wstring * 393232153Smmarchive_wstrappend_wchar(struct archive_wstring *as, wchar_t c) 394232153Smm{ 395232153Smm if ((as = archive_wstring_append(as, &c, 1)) == NULL) 396232153Smm __archive_errx(1, "Out of memory"); 397232153Smm return (as); 398232153Smm} 399232153Smm 400228753Smm/* 401232153Smm * Get the "current character set" name to use with iconv. 402232153Smm * On FreeBSD, the empty character set name "" chooses 403232153Smm * the correct character encoding for the current locale, 404232153Smm * so this isn't necessary. 405232153Smm * But iconv on Mac OS 10.6 doesn't seem to handle this correctly; 406232153Smm * on that system, we have to explicitly call nl_langinfo() 407232153Smm * to get the right name. Not sure about other platforms. 408232153Smm * 409232153Smm * NOTE: GNU libiconv does not recognize the character-set name 410232153Smm * which some platform nl_langinfo(CODESET) returns, so we should 411232153Smm * use locale_charset() instead of nl_langinfo(CODESET) for GNU libiconv. 
412228753Smm */ 413232153Smmstatic const char * 414232153Smmdefault_iconv_charset(const char *charset) { 415232153Smm if (charset != NULL && charset[0] != '\0') 416232153Smm return charset; 417232153Smm#if HAVE_LOCALE_CHARSET && !defined(__APPLE__) 418232153Smm /* locale_charset() is broken on Mac OS */ 419232153Smm return locale_charset(); 420232153Smm#elif HAVE_NL_LANGINFO 421232153Smm return nl_langinfo(CODESET); 422232153Smm#else 423232153Smm return ""; 424232153Smm#endif 425232153Smm} 426232153Smm 427232153Smm#if defined(_WIN32) && !defined(__CYGWIN__) 428232153Smm 429232153Smm/* 430232153Smm * Convert MBS to WCS. 431232153Smm * Note: returns -1 if conversion fails. 432232153Smm */ 433232153Smmint 434232153Smmarchive_wstring_append_from_mbs(struct archive_wstring *dest, 435232153Smm const char *p, size_t len) 436228753Smm{ 437238856Smm return archive_wstring_append_from_mbs_in_codepage(dest, p, len, NULL); 438232153Smm} 439232153Smm 440232153Smmstatic int 441232153Smmarchive_wstring_append_from_mbs_in_codepage(struct archive_wstring *dest, 442232153Smm const char *s, size_t length, struct archive_string_conv *sc) 443232153Smm{ 444232153Smm int count, ret = 0; 445232153Smm UINT from_cp; 446232153Smm 447232153Smm if (sc != NULL) 448232153Smm from_cp = sc->from_cp; 449232153Smm else 450232153Smm from_cp = get_current_codepage(); 451232153Smm 452232153Smm if (from_cp == CP_C_LOCALE) { 453232153Smm /* 454232153Smm * "C" locale special process. 
455232153Smm */ 456232153Smm wchar_t *ws; 457232153Smm const unsigned char *mp; 458232153Smm 459232153Smm if (NULL == archive_wstring_ensure(dest, 460232153Smm dest->length + length + 1)) 461232153Smm return (-1); 462232153Smm 463232153Smm ws = dest->s + dest->length; 464232153Smm mp = (const unsigned char *)s; 465232153Smm count = 0; 466232153Smm while (count < (int)length && *mp) { 467232153Smm *ws++ = (wchar_t)*mp++; 468232153Smm count++; 469232153Smm } 470238856Smm } else if (sc != NULL && 471238856Smm (sc->flag & (SCONV_NORMALIZATION_C | SCONV_NORMALIZATION_D))) { 472232153Smm /* 473232153Smm * Normalize UTF-8 and UTF-16BE and convert it directly 474232153Smm * to UTF-16 as wchar_t. 475232153Smm */ 476232153Smm struct archive_string u16; 477232153Smm int saved_flag = sc->flag;/* save current flag. */ 478232153Smm 479232153Smm if (is_big_endian()) 480232153Smm sc->flag |= SCONV_TO_UTF16BE; 481232153Smm else 482232153Smm sc->flag |= SCONV_TO_UTF16LE; 483232153Smm 484232153Smm if (sc->flag & SCONV_FROM_UTF16) { 485232153Smm /* 486232153Smm * UTF-16BE/LE NFD ===> UTF-16 NFC 487238856Smm * UTF-16BE/LE NFC ===> UTF-16 NFD 488232153Smm */ 489248616Smm count = (int)utf16nbytes(s, length); 490232153Smm } else { 491232153Smm /* 492232153Smm * UTF-8 NFD ===> UTF-16 NFC 493238856Smm * UTF-8 NFC ===> UTF-16 NFD 494232153Smm */ 495248616Smm count = (int)mbsnbytes(s, length); 496232153Smm } 497232153Smm u16.s = (char *)dest->s; 498232153Smm u16.length = dest->length << 1;; 499232153Smm u16.buffer_length = dest->buffer_length; 500238856Smm if (sc->flag & SCONV_NORMALIZATION_C) 501238856Smm ret = archive_string_normalize_C(&u16, s, count, sc); 502238856Smm else 503238856Smm ret = archive_string_normalize_D(&u16, s, count, sc); 504232153Smm dest->s = (wchar_t *)u16.s; 505232153Smm dest->length = u16.length >> 1; 506232153Smm dest->buffer_length = u16.buffer_length; 507232153Smm sc->flag = saved_flag;/* restore the saved flag. 
*/ 508232153Smm return (ret); 509232153Smm } else if (sc != NULL && (sc->flag & SCONV_FROM_UTF16)) { 510248616Smm count = (int)utf16nbytes(s, length); 511232153Smm count >>= 1; /* to be WCS length */ 512232153Smm /* Allocate memory for WCS. */ 513232153Smm if (NULL == archive_wstring_ensure(dest, 514232153Smm dest->length + count + 1)) 515232153Smm return (-1); 516238856Smm wmemcpy(dest->s + dest->length, (const wchar_t *)s, count); 517232153Smm if ((sc->flag & SCONV_FROM_UTF16BE) && !is_big_endian()) { 518232153Smm uint16_t *u16 = (uint16_t *)(dest->s + dest->length); 519232153Smm int b; 520232153Smm for (b = 0; b < count; b++) { 521232153Smm uint16_t val = archive_le16dec(u16+b); 522232153Smm archive_be16enc(u16+b, val); 523232153Smm } 524232153Smm } else if ((sc->flag & SCONV_FROM_UTF16LE) && is_big_endian()) { 525232153Smm uint16_t *u16 = (uint16_t *)(dest->s + dest->length); 526232153Smm int b; 527232153Smm for (b = 0; b < count; b++) { 528232153Smm uint16_t val = archive_be16dec(u16+b); 529232153Smm archive_le16enc(u16+b, val); 530232153Smm } 531232153Smm } 532232153Smm } else { 533232153Smm DWORD mbflag; 534238856Smm size_t buffsize; 535232153Smm 536232153Smm if (sc == NULL) 537232153Smm mbflag = 0; 538232153Smm else if (sc->flag & SCONV_FROM_CHARSET) { 539232153Smm /* Do not trust the length which comes from 540232153Smm * an archive file. */ 541232153Smm length = mbsnbytes(s, length); 542232153Smm mbflag = 0; 543232153Smm } else 544232153Smm mbflag = MB_PRECOMPOSED; 545232153Smm 546238856Smm buffsize = dest->length + length + 1; 547238856Smm do { 548238856Smm /* Allocate memory for WCS. */ 549238856Smm if (NULL == archive_wstring_ensure(dest, buffsize)) 550232153Smm return (-1); 551238856Smm /* Convert MBS to WCS. 
*/ 552238856Smm count = MultiByteToWideChar(from_cp, 553248616Smm mbflag, s, (int)length, dest->s + dest->length, 554248616Smm (int)(dest->buffer_length >> 1) -1); 555238856Smm if (count == 0 && 556238856Smm GetLastError() == ERROR_INSUFFICIENT_BUFFER) { 557238856Smm /* Expand the WCS buffer. */ 558238856Smm buffsize = dest->buffer_length << 1; 559238856Smm continue; 560232153Smm } 561238856Smm if (count == 0 && length != 0) 562238856Smm ret = -1; 563238856Smm } while (0); 564232153Smm } 565232153Smm dest->length += count; 566232153Smm dest->s[dest->length] = L'\0'; 567232153Smm return (ret); 568232153Smm} 569232153Smm 570232153Smm#else 571232153Smm 572232153Smm/* 573232153Smm * Convert MBS to WCS. 574232153Smm * Note: returns -1 if conversion fails. 575232153Smm */ 576232153Smmint 577232153Smmarchive_wstring_append_from_mbs(struct archive_wstring *dest, 578232153Smm const char *p, size_t len) 579232153Smm{ 580232153Smm size_t r; 581238856Smm int ret_val = 0; 582232153Smm /* 583232153Smm * No single byte will be more than one wide character, 584232153Smm * so this length estimate will always be big enough. 585232153Smm */ 586232153Smm size_t wcs_length = len; 587232153Smm size_t mbs_length = len; 588232153Smm const char *mbs = p; 589232153Smm wchar_t *wcs; 590232153Smm#if HAVE_MBRTOWC 591232153Smm mbstate_t shift_state; 592232153Smm 593232153Smm memset(&shift_state, 0, sizeof(shift_state)); 594232153Smm#endif 595232153Smm if (NULL == archive_wstring_ensure(dest, dest->length + wcs_length + 1)) 596238856Smm return (-1); 597232153Smm wcs = dest->s + dest->length; 598232153Smm /* 599232153Smm * We cannot use mbsrtowcs/mbstowcs here because those may convert 600232153Smm * extra MBS when strlen(p) > len and one wide character consis of 601232153Smm * multi bytes. 
602232153Smm */ 603238856Smm while (*mbs && mbs_length > 0) { 604238856Smm if (wcs_length == 0) { 605238856Smm dest->length = wcs - dest->s; 606238856Smm dest->s[dest->length] = L'\0'; 607238856Smm wcs_length = mbs_length; 608238856Smm if (NULL == archive_wstring_ensure(dest, 609238856Smm dest->length + wcs_length + 1)) 610238856Smm return (-1); 611238856Smm wcs = dest->s + dest->length; 612238856Smm } 613232153Smm#if HAVE_MBRTOWC 614232153Smm r = mbrtowc(wcs, mbs, wcs_length, &shift_state); 615232153Smm#else 616232153Smm r = mbtowc(wcs, mbs, wcs_length); 617232153Smm#endif 618232153Smm if (r == (size_t)-1 || r == (size_t)-2) { 619238856Smm ret_val = -1; 620238856Smm if (errno == EILSEQ) { 621238856Smm ++mbs; 622238856Smm --mbs_length; 623238856Smm continue; 624238856Smm } else 625238856Smm break; 626232153Smm } 627232153Smm if (r == 0 || r > mbs_length) 628232153Smm break; 629232153Smm wcs++; 630232153Smm wcs_length--; 631232153Smm mbs += r; 632232153Smm mbs_length -= r; 633232153Smm } 634232153Smm dest->length = wcs - dest->s; 635232153Smm dest->s[dest->length] = L'\0'; 636238856Smm return (ret_val); 637232153Smm} 638232153Smm 639232153Smm#endif 640232153Smm 641232153Smm#if defined(_WIN32) && !defined(__CYGWIN__) 642232153Smm 643232153Smm/* 644232153Smm * WCS ==> MBS. 645232153Smm * Note: returns -1 if conversion fails. 646232153Smm * 647232153Smm * Win32 builds use WideCharToMultiByte from the Windows API. 648232153Smm * (Maybe Cygwin should too? WideCharToMultiByte will know a 649232153Smm * lot more about local character encodings than the wcrtomb() 650232153Smm * wrapper is going to know.) 
651232153Smm */ 652232153Smmint 653232153Smmarchive_string_append_from_wcs(struct archive_string *as, 654232153Smm const wchar_t *w, size_t len) 655232153Smm{ 656238856Smm return archive_string_append_from_wcs_in_codepage(as, w, len, NULL); 657232153Smm} 658232153Smm 659232153Smmstatic int 660232153Smmarchive_string_append_from_wcs_in_codepage(struct archive_string *as, 661232153Smm const wchar_t *ws, size_t len, struct archive_string_conv *sc) 662232153Smm{ 663232153Smm BOOL defchar_used, *dp; 664232153Smm int count, ret = 0; 665232153Smm UINT to_cp; 666232153Smm int wslen = (int)len; 667232153Smm 668232153Smm if (sc != NULL) 669232153Smm to_cp = sc->to_cp; 670232153Smm else 671232153Smm to_cp = get_current_codepage(); 672232153Smm 673232153Smm if (to_cp == CP_C_LOCALE) { 674232153Smm /* 675232153Smm * "C" locale special process. 676232153Smm */ 677232153Smm const wchar_t *wp = ws; 678232153Smm char *p; 679232153Smm 680232153Smm if (NULL == archive_string_ensure(as, 681232153Smm as->length + wslen +1)) 682232153Smm return (-1); 683232153Smm p = as->s + as->length; 684232153Smm count = 0; 685232153Smm defchar_used = 0; 686232153Smm while (count < wslen && *wp) { 687232153Smm if (*wp > 255) { 688232153Smm *p++ = '?'; 689232153Smm wp++; 690232153Smm defchar_used = 1; 691232153Smm } else 692232153Smm *p++ = (char)*wp++; 693232153Smm count++; 694232153Smm } 695232153Smm } else if (sc != NULL && (sc->flag & SCONV_TO_UTF16)) { 696232153Smm uint16_t *u16; 697232153Smm 698232153Smm if (NULL == 699232153Smm archive_string_ensure(as, as->length + len * 2 + 2)) 700232153Smm return (-1); 701232153Smm u16 = (uint16_t *)(as->s + as->length); 702232153Smm count = 0; 703232153Smm defchar_used = 0; 704232153Smm if (sc->flag & SCONV_TO_UTF16BE) { 705232153Smm while (count < (int)len && *ws) { 706232153Smm archive_be16enc(u16+count, *ws); 707232153Smm ws++; 708232153Smm count++; 709232153Smm } 710232153Smm } else { 711232153Smm while (count < (int)len && *ws) { 712232153Smm 
archive_le16enc(u16+count, *ws); 713232153Smm ws++; 714232153Smm count++; 715232153Smm } 716232153Smm } 717232153Smm count <<= 1; /* to be byte size */ 718232153Smm } else { 719232153Smm /* Make sure the MBS buffer has plenty to set. */ 720232153Smm if (NULL == 721232153Smm archive_string_ensure(as, as->length + len * 2 + 1)) 722232153Smm return (-1); 723232153Smm do { 724232153Smm defchar_used = 0; 725232153Smm if (to_cp == CP_UTF8 || sc == NULL) 726232153Smm dp = NULL; 727232153Smm else 728232153Smm dp = &defchar_used; 729232153Smm count = WideCharToMultiByte(to_cp, 0, ws, wslen, 730248616Smm as->s + as->length, (int)as->buffer_length-1, NULL, dp); 731232153Smm if (count == 0 && 732232153Smm GetLastError() == ERROR_INSUFFICIENT_BUFFER) { 733232153Smm /* Expand the MBS buffer and retry. */ 734232153Smm if (NULL == archive_string_ensure(as, 735232153Smm as->buffer_length + len)) 736232153Smm return (-1); 737232153Smm continue; 738232153Smm } 739232153Smm if (count == 0) 740232153Smm ret = -1; 741232153Smm } while (0); 742232153Smm } 743232153Smm as->length += count; 744232153Smm as->s[as->length] = '\0'; 745232153Smm return (defchar_used?-1:ret); 746232153Smm} 747232153Smm 748232153Smm#elif defined(HAVE_WCTOMB) || defined(HAVE_WCRTOMB) 749232153Smm 750232153Smm/* 751232153Smm * Translates a wide character string into current locale character set 752232153Smm * and appends to the archive_string. Note: returns -1 if conversion 753232153Smm * fails. 754232153Smm */ 755232153Smmint 756232153Smmarchive_string_append_from_wcs(struct archive_string *as, 757232153Smm const wchar_t *w, size_t len) 758232153Smm{ 759232153Smm /* We cannot use the standard wcstombs() here because it 760232153Smm * cannot tell us how big the output buffer should be. So 761232153Smm * I've built a loop around wcrtomb() or wctomb() that 762232153Smm * converts a character at a time and resizes the string as 763232153Smm * needed. 
We prefer wcrtomb() when it's available because 764232153Smm * it's thread-safe. */ 765232153Smm int n, ret_val = 0; 766228753Smm char *p; 767232153Smm char *end; 768232153Smm#if HAVE_WCRTOMB 769232153Smm mbstate_t shift_state; 770228753Smm 771232153Smm memset(&shift_state, 0, sizeof(shift_state)); 772232153Smm#else 773232153Smm /* Clear the shift state before starting. */ 774232153Smm wctomb(NULL, L'\0'); 775232153Smm#endif 776228753Smm /* 777232153Smm * Allocate buffer for MBS. 778232153Smm * We need this allocation here since it is possible that 779232153Smm * as->s is still NULL. 780228753Smm */ 781232153Smm if (archive_string_ensure(as, as->length + len + 1) == NULL) 782238856Smm return (-1); 783232153Smm 784232153Smm p = as->s + as->length; 785232153Smm end = as->s + as->buffer_length - MB_CUR_MAX -1; 786232153Smm while (*w != L'\0' && len > 0) { 787232153Smm if (p >= end) { 788232153Smm as->length = p - as->s; 789232153Smm as->s[as->length] = '\0'; 790232153Smm /* Re-allocate buffer for MBS. */ 791232153Smm if (archive_string_ensure(as, 792232153Smm as->length + len * 2 + 1) == NULL) 793238856Smm return (-1); 794232153Smm p = as->s + as->length; 795232153Smm end = as->s + as->buffer_length - MB_CUR_MAX -1; 796228753Smm } 797232153Smm#if HAVE_WCRTOMB 798232153Smm n = wcrtomb(p, *w++, &shift_state); 799232153Smm#else 800232153Smm n = wctomb(p, *w++); 801232153Smm#endif 802232153Smm if (n == -1) { 803232153Smm if (errno == EILSEQ) { 804232153Smm /* Skip an illegal wide char. */ 805232153Smm *p++ = '?'; 806232153Smm ret_val = -1; 807232153Smm } else { 808232153Smm ret_val = -1; 809232153Smm break; 810232153Smm } 811232153Smm } else 812232153Smm p += n; 813232153Smm len--; 814232153Smm } 815232153Smm as->length = p - as->s; 816232153Smm as->s[as->length] = '\0'; 817232153Smm return (ret_val); 818232153Smm} 819232153Smm 820232153Smm#else /* HAVE_WCTOMB || HAVE_WCRTOMB */ 821232153Smm 822232153Smm/* 823232153Smm * TODO: Test if __STDC_ISO_10646__ is defined. 
824232153Smm * Non-Windows uses ISO C wcrtomb() or wctomb() to perform the conversion 825232153Smm * one character at a time. If a non-Windows platform doesn't have 826232153Smm * either of these, fall back to the built-in UTF8 conversion. 827232153Smm */ 828232153Smmint 829232153Smmarchive_string_append_from_wcs(struct archive_string *as, 830232153Smm const wchar_t *w, size_t len) 831232153Smm{ 832232153Smm (void)as;/* UNUSED */ 833232153Smm (void)w;/* UNUSED */ 834232153Smm (void)len;/* UNUSED */ 835238856Smm errno = ENOSYS; 836232153Smm return (-1); 837232153Smm} 838232153Smm 839232153Smm#endif /* HAVE_WCTOMB || HAVE_WCRTOMB */ 840232153Smm 841232153Smm/* 842232153Smm * Find a string conversion object by a pair of 'from' charset name 843232153Smm * and 'to' charset name from an archive object. 844232153Smm * Return NULL if not found. 845232153Smm */ 846232153Smmstatic struct archive_string_conv * 847232153Smmfind_sconv_object(struct archive *a, const char *fc, const char *tc) 848232153Smm{ 849232153Smm struct archive_string_conv *sc; 850232153Smm 851232153Smm if (a == NULL) 852232153Smm return (NULL); 853232153Smm 854232153Smm for (sc = a->sconv; sc != NULL; sc = sc->next) { 855232153Smm if (strcmp(sc->from_charset, fc) == 0 && 856232153Smm strcmp(sc->to_charset, tc) == 0) 857232153Smm break; 858232153Smm } 859232153Smm return (sc); 860232153Smm} 861232153Smm 862232153Smm/* 863232153Smm * Register a string object to an archive object. 864232153Smm */ 865232153Smmstatic void 866232153Smmadd_sconv_object(struct archive *a, struct archive_string_conv *sc) 867232153Smm{ 868232153Smm struct archive_string_conv **psc; 869232153Smm 870232153Smm /* Add a new sconv to sconv list. 
*/ 871232153Smm psc = &(a->sconv); 872232153Smm while (*psc != NULL) 873232153Smm psc = &((*psc)->next); 874232153Smm *psc = sc; 875232153Smm} 876232153Smm 877232153Smmstatic void 878232153Smmadd_converter(struct archive_string_conv *sc, int (*converter) 879232153Smm (struct archive_string *, const void *, size_t, 880232153Smm struct archive_string_conv *)) 881232153Smm{ 882232153Smm if (sc == NULL || sc->nconverter >= 2) 883232153Smm __archive_errx(1, "Programing error"); 884232153Smm sc->converter[sc->nconverter++] = converter; 885232153Smm} 886232153Smm 887232153Smmstatic void 888232153Smmsetup_converter(struct archive_string_conv *sc) 889232153Smm{ 890232153Smm 891232153Smm /* Reset. */ 892232153Smm sc->nconverter = 0; 893232153Smm 894232153Smm /* 895232153Smm * Perform special sequence for the incorrect UTF-8 filenames 896232153Smm * made by libarchive2.x. 897232153Smm */ 898232153Smm if (sc->flag & SCONV_UTF8_LIBARCHIVE_2) { 899232153Smm add_converter(sc, strncat_from_utf8_libarchive2); 900232153Smm return; 901232153Smm } 902232153Smm 903232153Smm /* 904232153Smm * Convert a string to UTF-16BE/LE. 905232153Smm */ 906232153Smm if (sc->flag & SCONV_TO_UTF16) { 907232153Smm /* 908232153Smm * If the current locale is UTF-8, we can translate 909232153Smm * a UTF-8 string into a UTF-16BE string. 
910232153Smm */ 911232153Smm if (sc->flag & SCONV_FROM_UTF8) { 912232153Smm add_converter(sc, archive_string_append_unicode); 913232153Smm return; 914228753Smm } 915232153Smm 916232153Smm#if defined(_WIN32) && !defined(__CYGWIN__) 917232153Smm if (sc->flag & SCONV_WIN_CP) { 918232153Smm if (sc->flag & SCONV_TO_UTF16BE) 919232153Smm add_converter(sc, win_strncat_to_utf16be); 920232153Smm else 921232153Smm add_converter(sc, win_strncat_to_utf16le); 922232153Smm return; 923232153Smm } 924232153Smm#endif 925232153Smm 926232153Smm#if defined(HAVE_ICONV) 927232153Smm if (sc->cd != (iconv_t)-1) { 928232153Smm add_converter(sc, iconv_strncat_in_locale); 929232153Smm return; 930232153Smm } 931232153Smm#endif 932232153Smm 933232153Smm if (sc->flag & SCONV_BEST_EFFORT) { 934232153Smm if (sc->flag & SCONV_TO_UTF16BE) 935238856Smm add_converter(sc, 936238856Smm best_effort_strncat_to_utf16be); 937232153Smm else 938238856Smm add_converter(sc, 939238856Smm best_effort_strncat_to_utf16le); 940232153Smm } else 941232153Smm /* Make sure we have no converter. */ 942232153Smm sc->nconverter = 0; 943232153Smm return; 944232153Smm } 945232153Smm 946232153Smm /* 947232153Smm * Convert a string from UTF-16BE/LE. 948232153Smm */ 949232153Smm if (sc->flag & SCONV_FROM_UTF16) { 950232153Smm /* 951232153Smm * At least we should normalize a UTF-16BE string. 952232153Smm */ 953232153Smm if (sc->flag & SCONV_NORMALIZATION_D) 954232153Smm add_converter(sc,archive_string_normalize_D); 955238856Smm else if (sc->flag & SCONV_NORMALIZATION_C) 956232153Smm add_converter(sc, archive_string_normalize_C); 957232153Smm 958232153Smm if (sc->flag & SCONV_TO_UTF8) { 959232153Smm /* 960232153Smm * If the current locale is UTF-8, we can translate 961232153Smm * a UTF-16BE/LE string into a UTF-8 string directly. 
962232153Smm */ 963232153Smm if (!(sc->flag & 964232153Smm (SCONV_NORMALIZATION_D |SCONV_NORMALIZATION_C))) 965232153Smm add_converter(sc, 966232153Smm archive_string_append_unicode); 967232153Smm return; 968232153Smm } 969232153Smm 970232153Smm#if defined(_WIN32) && !defined(__CYGWIN__) 971232153Smm if (sc->flag & SCONV_WIN_CP) { 972232153Smm if (sc->flag & SCONV_FROM_UTF16BE) 973232153Smm add_converter(sc, win_strncat_from_utf16be); 974232153Smm else 975232153Smm add_converter(sc, win_strncat_from_utf16le); 976232153Smm return; 977232153Smm } 978232153Smm#endif 979232153Smm 980232153Smm#if defined(HAVE_ICONV) 981232153Smm if (sc->cd != (iconv_t)-1) { 982232153Smm add_converter(sc, iconv_strncat_in_locale); 983232153Smm return; 984232153Smm } 985232153Smm#endif 986232153Smm 987232153Smm if ((sc->flag & (SCONV_BEST_EFFORT | SCONV_FROM_UTF16BE)) 988232153Smm == (SCONV_BEST_EFFORT | SCONV_FROM_UTF16BE)) 989232153Smm add_converter(sc, best_effort_strncat_from_utf16be); 990232153Smm else if ((sc->flag & (SCONV_BEST_EFFORT | SCONV_FROM_UTF16LE)) 991232153Smm == (SCONV_BEST_EFFORT | SCONV_FROM_UTF16LE)) 992232153Smm add_converter(sc, best_effort_strncat_from_utf16le); 993232153Smm else 994232153Smm /* Make sure we have no converter. */ 995232153Smm sc->nconverter = 0; 996232153Smm return; 997232153Smm } 998232153Smm 999232153Smm if (sc->flag & SCONV_FROM_UTF8) { 1000232153Smm /* 1001232153Smm * At least we should normalize a UTF-8 string. 1002232153Smm */ 1003232153Smm if (sc->flag & SCONV_NORMALIZATION_D) 1004232153Smm add_converter(sc,archive_string_normalize_D); 1005238856Smm else if (sc->flag & SCONV_NORMALIZATION_C) 1006232153Smm add_converter(sc, archive_string_normalize_C); 1007232153Smm 1008232153Smm /* 1009232153Smm * Copy UTF-8 string with a check of CESU-8. 1010232153Smm * Apparently, iconv does not check surrogate pairs in UTF-8 1011232153Smm * when both from-charset and to-charset are UTF-8, and then 1012232153Smm * we use our UTF-8 copy code. 
1013232153Smm */ 1014232153Smm if (sc->flag & SCONV_TO_UTF8) { 1015232153Smm /* 1016232153Smm * If the current locale is UTF-8, we can translate 1017232153Smm * a UTF-16BE string into a UTF-8 string directly. 1018232153Smm */ 1019232153Smm if (!(sc->flag & 1020232153Smm (SCONV_NORMALIZATION_D |SCONV_NORMALIZATION_C))) 1021232153Smm add_converter(sc, strncat_from_utf8_to_utf8); 1022232153Smm return; 1023232153Smm } 1024232153Smm } 1025232153Smm 1026232153Smm#if defined(_WIN32) && !defined(__CYGWIN__) 1027232153Smm /* 1028232153Smm * On Windows we can use Windows API for a string conversion. 1029232153Smm */ 1030232153Smm if (sc->flag & SCONV_WIN_CP) { 1031232153Smm add_converter(sc, strncat_in_codepage); 1032232153Smm return; 1033232153Smm } 1034232153Smm#endif 1035232153Smm 1036232153Smm#if HAVE_ICONV 1037232153Smm if (sc->cd != (iconv_t)-1) { 1038232153Smm add_converter(sc, iconv_strncat_in_locale); 1039238856Smm /* 1040238856Smm * iconv generally does not support UTF-8-MAC and so 1041238856Smm * we have to the output of iconv from NFC to NFD if 1042238856Smm * need. 1043238856Smm */ 1044238856Smm if ((sc->flag & SCONV_FROM_CHARSET) && 1045238856Smm (sc->flag & SCONV_TO_UTF8)) { 1046238856Smm if (sc->flag & SCONV_NORMALIZATION_D) 1047238856Smm add_converter(sc, archive_string_normalize_D); 1048238856Smm } 1049232153Smm return; 1050232153Smm } 1051232153Smm#endif 1052232153Smm 1053232153Smm /* 1054232153Smm * Try conversion in the best effort or no conversion. 1055232153Smm */ 1056232153Smm if ((sc->flag & SCONV_BEST_EFFORT) || sc->same) 1057232153Smm add_converter(sc, best_effort_strncat_in_locale); 1058232153Smm else 1059232153Smm /* Make sure we have no converter. */ 1060232153Smm sc->nconverter = 0; 1061232153Smm} 1062232153Smm 1063232153Smm/* 1064232153Smm * Return canonicalized charset-name but this supports just UTF-8, UTF-16BE 1065232153Smm * and CP932 which are referenced in create_sconv_object(). 
1066232153Smm */ 1067232153Smmstatic const char * 1068232153Smmcanonical_charset_name(const char *charset) 1069232153Smm{ 1070232153Smm char cs[16]; 1071232153Smm char *p; 1072232153Smm const char *s; 1073232153Smm 1074232153Smm if (charset == NULL || charset[0] == '\0' 1075232153Smm || strlen(charset) > 15) 1076232153Smm return (charset); 1077232153Smm 1078232153Smm /* Copy name to uppercase. */ 1079232153Smm p = cs; 1080232153Smm s = charset; 1081232153Smm while (*s) { 1082232153Smm char c = *s++; 1083232153Smm if (c >= 'a' && c <= 'z') 1084232153Smm c -= 'a' - 'A'; 1085232153Smm *p++ = c; 1086232153Smm } 1087232153Smm *p++ = '\0'; 1088232153Smm 1089232153Smm if (strcmp(cs, "UTF-8") == 0 || 1090232153Smm strcmp(cs, "UTF8") == 0) 1091232153Smm return ("UTF-8"); 1092232153Smm if (strcmp(cs, "UTF-16BE") == 0 || 1093232153Smm strcmp(cs, "UTF16BE") == 0) 1094232153Smm return ("UTF-16BE"); 1095232153Smm if (strcmp(cs, "UTF-16LE") == 0 || 1096232153Smm strcmp(cs, "UTF16LE") == 0) 1097232153Smm return ("UTF-16LE"); 1098232153Smm if (strcmp(cs, "CP932") == 0) 1099232153Smm return ("CP932"); 1100232153Smm return (charset); 1101232153Smm} 1102232153Smm 1103232153Smm/* 1104232153Smm * Create a string conversion object. 
1105232153Smm */ 1106232153Smmstatic struct archive_string_conv * 1107232153Smmcreate_sconv_object(const char *fc, const char *tc, 1108232153Smm unsigned current_codepage, int flag) 1109232153Smm{ 1110232153Smm struct archive_string_conv *sc; 1111232153Smm 1112232153Smm sc = calloc(1, sizeof(*sc)); 1113232153Smm if (sc == NULL) 1114232153Smm return (NULL); 1115232153Smm sc->next = NULL; 1116232153Smm sc->from_charset = strdup(fc); 1117232153Smm if (sc->from_charset == NULL) { 1118232153Smm free(sc); 1119232153Smm return (NULL); 1120232153Smm } 1121232153Smm sc->to_charset = strdup(tc); 1122232153Smm if (sc->to_charset == NULL) { 1123248616Smm free(sc->from_charset); 1124232153Smm free(sc); 1125232153Smm return (NULL); 1126232153Smm } 1127232153Smm archive_string_init(&sc->utftmp); 1128232153Smm 1129232153Smm if (flag & SCONV_TO_CHARSET) { 1130232153Smm /* 1131232153Smm * Convert characters from the current locale charset to 1132232153Smm * a specified charset. 1133232153Smm */ 1134232153Smm sc->from_cp = current_codepage; 1135232153Smm sc->to_cp = make_codepage_from_charset(tc); 1136232153Smm#if defined(_WIN32) && !defined(__CYGWIN__) 1137232153Smm if (IsValidCodePage(sc->to_cp)) 1138232153Smm flag |= SCONV_WIN_CP; 1139232153Smm#endif 1140232153Smm } else if (flag & SCONV_FROM_CHARSET) { 1141232153Smm /* 1142232153Smm * Convert characters from a specified charset to 1143232153Smm * the current locale charset. 1144232153Smm */ 1145232153Smm sc->to_cp = current_codepage; 1146232153Smm sc->from_cp = make_codepage_from_charset(fc); 1147232153Smm#if defined(_WIN32) && !defined(__CYGWIN__) 1148232153Smm if (IsValidCodePage(sc->from_cp)) 1149232153Smm flag |= SCONV_WIN_CP; 1150232153Smm#endif 1151232153Smm } 1152232153Smm 1153232153Smm /* 1154232153Smm * Check if "from charset" and "to charset" are the same. 
1155232153Smm */ 1156232153Smm if (strcmp(fc, tc) == 0 || 1157232153Smm (sc->from_cp != (unsigned)-1 && sc->from_cp == sc->to_cp)) 1158232153Smm sc->same = 1; 1159232153Smm else 1160232153Smm sc->same = 0; 1161232153Smm 1162232153Smm /* 1163232153Smm * Mark if "from charset" or "to charset" are UTF-8 or UTF-16BE/LE. 1164232153Smm */ 1165232153Smm if (strcmp(tc, "UTF-8") == 0) 1166232153Smm flag |= SCONV_TO_UTF8; 1167232153Smm else if (strcmp(tc, "UTF-16BE") == 0) 1168232153Smm flag |= SCONV_TO_UTF16BE; 1169232153Smm else if (strcmp(tc, "UTF-16LE") == 0) 1170232153Smm flag |= SCONV_TO_UTF16LE; 1171232153Smm if (strcmp(fc, "UTF-8") == 0) 1172232153Smm flag |= SCONV_FROM_UTF8; 1173232153Smm else if (strcmp(fc, "UTF-16BE") == 0) 1174232153Smm flag |= SCONV_FROM_UTF16BE; 1175232153Smm else if (strcmp(fc, "UTF-16LE") == 0) 1176232153Smm flag |= SCONV_FROM_UTF16LE; 1177232153Smm#if defined(_WIN32) && !defined(__CYGWIN__) 1178232153Smm if (sc->to_cp == CP_UTF8) 1179232153Smm flag |= SCONV_TO_UTF8; 1180232153Smm else if (sc->to_cp == CP_UTF16BE) 1181232153Smm flag |= SCONV_TO_UTF16BE | SCONV_WIN_CP; 1182232153Smm else if (sc->to_cp == CP_UTF16LE) 1183232153Smm flag |= SCONV_TO_UTF16LE | SCONV_WIN_CP; 1184232153Smm if (sc->from_cp == CP_UTF8) 1185232153Smm flag |= SCONV_FROM_UTF8; 1186232153Smm else if (sc->from_cp == CP_UTF16BE) 1187232153Smm flag |= SCONV_FROM_UTF16BE | SCONV_WIN_CP; 1188232153Smm else if (sc->from_cp == CP_UTF16LE) 1189232153Smm flag |= SCONV_FROM_UTF16LE | SCONV_WIN_CP; 1190232153Smm#endif 1191232153Smm 1192232153Smm /* 1193232153Smm * Set a flag for Unicode NFD. Usually iconv cannot correctly 1194232153Smm * handle it. So we have to translate NFD characters to NFC ones 1195232153Smm * ourselves before iconv handles. Another reason is to prevent 1196232153Smm * that the same sight of two filenames, one is NFC and other 1197232153Smm * is NFD, would be in its directory. 
1198232153Smm * On Mac OS X, although its filesystem layer automatically 1199232153Smm * convert filenames to NFD, it would be useful for filename 1200232153Smm * comparing to find out the same filenames that we normalize 1201232153Smm * that to be NFD ourselves. 1202232153Smm */ 1203232153Smm if ((flag & SCONV_FROM_CHARSET) && 1204232153Smm (flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8))) { 1205232153Smm#if defined(__APPLE__) 1206238856Smm if (flag & SCONV_TO_UTF8) 1207238856Smm flag |= SCONV_NORMALIZATION_D; 1208238856Smm else 1209232153Smm#endif 1210232153Smm flag |= SCONV_NORMALIZATION_C; 1211232153Smm } 1212238856Smm#if defined(__APPLE__) 1213238856Smm /* 1214238856Smm * In case writing an archive file, make sure that a filename 1215238856Smm * going to be passed to iconv is a Unicode NFC string since 1216238856Smm * a filename in HFS Plus filesystem is a Unicode NFD one and 1217238856Smm * iconv cannot handle it with "UTF-8" charset. It is simpler 1218238856Smm * than a use of "UTF-8-MAC" charset. 1219238856Smm */ 1220238856Smm if ((flag & SCONV_TO_CHARSET) && 1221238856Smm (flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8)) && 1222238856Smm !(flag & (SCONV_TO_UTF16 | SCONV_TO_UTF8))) 1223238856Smm flag |= SCONV_NORMALIZATION_C; 1224238856Smm /* 1225238856Smm * In case reading an archive file. make sure that a filename 1226238856Smm * will be passed to users is a Unicode NFD string in order to 1227238856Smm * correctly compare the filename with other one which comes 1228238856Smm * from HFS Plus filesystem. 1229238856Smm */ 1230238856Smm if ((flag & SCONV_FROM_CHARSET) && 1231238856Smm !(flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8)) && 1232238856Smm (flag & SCONV_TO_UTF8)) 1233238856Smm flag |= SCONV_NORMALIZATION_D; 1234238856Smm#endif 1235232153Smm 1236232153Smm#if defined(HAVE_ICONV) 1237232153Smm sc->cd_w = (iconv_t)-1; 1238232153Smm /* 1239232153Smm * Create an iconv object. 
1240232153Smm */ 1241232153Smm if (((flag & (SCONV_TO_UTF8 | SCONV_TO_UTF16)) && 1242232153Smm (flag & (SCONV_FROM_UTF8 | SCONV_FROM_UTF16))) || 1243232153Smm (flag & SCONV_WIN_CP)) { 1244232153Smm /* This case we won't use iconv. */ 1245232153Smm sc->cd = (iconv_t)-1; 1246232153Smm } else { 1247232153Smm sc->cd = iconv_open(tc, fc); 1248232153Smm if (sc->cd == (iconv_t)-1 && (sc->flag & SCONV_BEST_EFFORT)) { 1249232153Smm /* 1250232153Smm * Unfortunaly, all of iconv implements do support 1251232153Smm * "CP932" character-set, so we should use "SJIS" 1252232153Smm * instead if iconv_open failed. 1253232153Smm */ 1254232153Smm if (strcmp(tc, "CP932") == 0) 1255232153Smm sc->cd = iconv_open("SJIS", fc); 1256232153Smm else if (strcmp(fc, "CP932") == 0) 1257232153Smm sc->cd = iconv_open(tc, "SJIS"); 1258232153Smm } 1259232153Smm#if defined(_WIN32) && !defined(__CYGWIN__) 1260232153Smm /* 1261232153Smm * archive_mstring on Windows directly convert multi-bytes 1262232153Smm * into archive_wstring in order not to depend on locale 1263232153Smm * so that you can do a I18N programing. This will be 1264232153Smm * used only in archive_mstring_copy_mbs_len_l so far. 1265232153Smm */ 1266232153Smm if (flag & SCONV_FROM_CHARSET) { 1267232153Smm sc->cd_w = iconv_open("UTF-8", fc); 1268232153Smm if (sc->cd_w == (iconv_t)-1 && 1269232153Smm (sc->flag & SCONV_BEST_EFFORT)) { 1270232153Smm if (strcmp(fc, "CP932") == 0) 1271232153Smm sc->cd_w = iconv_open("UTF-8", "SJIS"); 1272232153Smm } 1273232153Smm } 1274232153Smm#endif /* _WIN32 && !__CYGWIN__ */ 1275232153Smm } 1276232153Smm#endif /* HAVE_ICONV */ 1277232153Smm 1278232153Smm sc->flag = flag; 1279232153Smm 1280232153Smm /* 1281238856Smm * Set up converters. 1282232153Smm */ 1283232153Smm setup_converter(sc); 1284232153Smm 1285232153Smm return (sc); 1286232153Smm} 1287232153Smm 1288232153Smm/* 1289232153Smm * Free a string conversion object. 
1290232153Smm */ 1291232153Smmstatic void 1292232153Smmfree_sconv_object(struct archive_string_conv *sc) 1293232153Smm{ 1294232153Smm free(sc->from_charset); 1295232153Smm free(sc->to_charset); 1296232153Smm archive_string_free(&sc->utftmp); 1297232153Smm#if HAVE_ICONV 1298232153Smm if (sc->cd != (iconv_t)-1) 1299232153Smm iconv_close(sc->cd); 1300232153Smm if (sc->cd_w != (iconv_t)-1) 1301232153Smm iconv_close(sc->cd_w); 1302232153Smm#endif 1303232153Smm free(sc); 1304232153Smm} 1305232153Smm 1306232153Smm#if defined(_WIN32) && !defined(__CYGWIN__) 1307232153Smmstatic unsigned 1308232153Smmmy_atoi(const char *p) 1309232153Smm{ 1310232153Smm unsigned cp; 1311232153Smm 1312232153Smm cp = 0; 1313232153Smm while (*p) { 1314232153Smm if (*p >= '0' && *p <= '9') 1315232153Smm cp = cp * 10 + (*p - '0'); 1316232153Smm else 1317232153Smm return (-1); 1318232153Smm p++; 1319232153Smm } 1320232153Smm return (cp); 1321232153Smm} 1322232153Smm 1323232153Smm/* 1324232153Smm * Translate Charset name (as used by iconv) into CodePage (as used by Windows) 1325232153Smm * Return -1 if failed. 1326232153Smm * 1327232153Smm * Note: This translation code may be insufficient. 1328232153Smm */ 1329232153Smmstatic struct charset { 1330232153Smm const char *name; 1331232153Smm unsigned cp; 1332232153Smm} charsets[] = { 1333232153Smm /* MUST BE SORTED! 
*/ 1334232153Smm {"ASCII", 1252}, 1335232153Smm {"ASMO-708", 708}, 1336232153Smm {"BIG5", 950}, 1337232153Smm {"CHINESE", 936}, 1338232153Smm {"CP367", 1252}, 1339232153Smm {"CP819", 1252}, 1340232153Smm {"CP1025", 21025}, 1341232153Smm {"DOS-720", 720}, 1342232153Smm {"DOS-862", 862}, 1343232153Smm {"EUC-CN", 51936}, 1344232153Smm {"EUC-JP", 51932}, 1345232153Smm {"EUC-KR", 949}, 1346232153Smm {"EUCCN", 51936}, 1347232153Smm {"EUCJP", 51932}, 1348232153Smm {"EUCKR", 949}, 1349232153Smm {"GB18030", 54936}, 1350232153Smm {"GB2312", 936}, 1351232153Smm {"HEBREW", 1255}, 1352232153Smm {"HZ-GB-2312", 52936}, 1353232153Smm {"IBM273", 20273}, 1354232153Smm {"IBM277", 20277}, 1355232153Smm {"IBM278", 20278}, 1356232153Smm {"IBM280", 20280}, 1357232153Smm {"IBM284", 20284}, 1358232153Smm {"IBM285", 20285}, 1359232153Smm {"IBM290", 20290}, 1360232153Smm {"IBM297", 20297}, 1361232153Smm {"IBM367", 1252}, 1362232153Smm {"IBM420", 20420}, 1363232153Smm {"IBM423", 20423}, 1364232153Smm {"IBM424", 20424}, 1365232153Smm {"IBM819", 1252}, 1366232153Smm {"IBM871", 20871}, 1367232153Smm {"IBM880", 20880}, 1368232153Smm {"IBM905", 20905}, 1369232153Smm {"IBM924", 20924}, 1370232153Smm {"ISO-8859-1", 28591}, 1371232153Smm {"ISO-8859-13", 28603}, 1372232153Smm {"ISO-8859-15", 28605}, 1373232153Smm {"ISO-8859-2", 28592}, 1374232153Smm {"ISO-8859-3", 28593}, 1375232153Smm {"ISO-8859-4", 28594}, 1376232153Smm {"ISO-8859-5", 28595}, 1377232153Smm {"ISO-8859-6", 28596}, 1378232153Smm {"ISO-8859-7", 28597}, 1379232153Smm {"ISO-8859-8", 28598}, 1380232153Smm {"ISO-8859-9", 28599}, 1381232153Smm {"ISO8859-1", 28591}, 1382232153Smm {"ISO8859-13", 28603}, 1383232153Smm {"ISO8859-15", 28605}, 1384232153Smm {"ISO8859-2", 28592}, 1385232153Smm {"ISO8859-3", 28593}, 1386232153Smm {"ISO8859-4", 28594}, 1387232153Smm {"ISO8859-5", 28595}, 1388232153Smm {"ISO8859-6", 28596}, 1389232153Smm {"ISO8859-7", 28597}, 1390232153Smm {"ISO8859-8", 28598}, 1391232153Smm {"ISO8859-9", 28599}, 1392232153Smm 
{"JOHAB", 1361}, 1393232153Smm {"KOI8-R", 20866}, 1394232153Smm {"KOI8-U", 21866}, 1395232153Smm {"KS_C_5601-1987", 949}, 1396232153Smm {"LATIN1", 1252}, 1397232153Smm {"LATIN2", 28592}, 1398232153Smm {"MACINTOSH", 10000}, 1399232153Smm {"SHIFT-JIS", 932}, 1400232153Smm {"SHIFT_JIS", 932}, 1401232153Smm {"SJIS", 932}, 1402232153Smm {"US", 1252}, 1403232153Smm {"US-ASCII", 1252}, 1404232153Smm {"UTF-16", 1200}, 1405232153Smm {"UTF-16BE", 1201}, 1406232153Smm {"UTF-16LE", 1200}, 1407232153Smm {"UTF-8", CP_UTF8}, 1408232153Smm {"X-EUROPA", 29001}, 1409232153Smm {"X-MAC-ARABIC", 10004}, 1410232153Smm {"X-MAC-CE", 10029}, 1411232153Smm {"X-MAC-CHINESEIMP", 10008}, 1412232153Smm {"X-MAC-CHINESETRAD", 10002}, 1413232153Smm {"X-MAC-CROATIAN", 10082}, 1414232153Smm {"X-MAC-CYRILLIC", 10007}, 1415232153Smm {"X-MAC-GREEK", 10006}, 1416232153Smm {"X-MAC-HEBREW", 10005}, 1417232153Smm {"X-MAC-ICELANDIC", 10079}, 1418232153Smm {"X-MAC-JAPANESE", 10001}, 1419232153Smm {"X-MAC-KOREAN", 10003}, 1420232153Smm {"X-MAC-ROMANIAN", 10010}, 1421232153Smm {"X-MAC-THAI", 10021}, 1422232153Smm {"X-MAC-TURKISH", 10081}, 1423232153Smm {"X-MAC-UKRAINIAN", 10017}, 1424232153Smm}; 1425232153Smmstatic unsigned 1426232153Smmmake_codepage_from_charset(const char *charset) 1427232153Smm{ 1428232153Smm char cs[16]; 1429232153Smm char *p; 1430232153Smm unsigned cp; 1431232153Smm int a, b; 1432232153Smm 1433232153Smm if (charset == NULL || strlen(charset) > 15) 1434232153Smm return -1; 1435232153Smm 1436232153Smm /* Copy name to uppercase. */ 1437232153Smm p = cs; 1438232153Smm while (*charset) { 1439232153Smm char c = *charset++; 1440232153Smm if (c >= 'a' && c <= 'z') 1441232153Smm c -= 'a' - 'A'; 1442232153Smm *p++ = c; 1443232153Smm } 1444232153Smm *p++ = '\0'; 1445232153Smm cp = -1; 1446232153Smm 1447232153Smm /* Look it up in the table first, so that we can easily 1448232153Smm * override CP367, which we map to 1252 instead of 367. 
*/ 1449232153Smm a = 0; 1450232153Smm b = sizeof(charsets)/sizeof(charsets[0]); 1451232153Smm while (b > a) { 1452232153Smm int c = (b + a) / 2; 1453232153Smm int r = strcmp(charsets[c].name, cs); 1454232153Smm if (r < 0) 1455232153Smm a = c + 1; 1456232153Smm else if (r > 0) 1457232153Smm b = c; 1458232153Smm else 1459232153Smm return charsets[c].cp; 1460232153Smm } 1461232153Smm 1462232153Smm /* If it's not in the table, try to parse it. */ 1463232153Smm switch (*cs) { 1464232153Smm case 'C': 1465232153Smm if (cs[1] == 'P' && cs[2] >= '0' && cs[2] <= '9') { 1466232153Smm cp = my_atoi(cs + 2); 1467232153Smm } else if (strcmp(cs, "CP_ACP") == 0) 1468232153Smm cp = get_current_codepage(); 1469232153Smm else if (strcmp(cs, "CP_OEMCP") == 0) 1470232153Smm cp = get_current_oemcp(); 1471232153Smm break; 1472232153Smm case 'I': 1473232153Smm if (cs[1] == 'B' && cs[2] == 'M' && 1474232153Smm cs[3] >= '0' && cs[3] <= '9') { 1475232153Smm cp = my_atoi(cs + 3); 1476232153Smm } 1477232153Smm break; 1478232153Smm case 'W': 1479232153Smm if (strncmp(cs, "WINDOWS-", 8) == 0) { 1480232153Smm cp = my_atoi(cs + 8); 1481232153Smm if (cp != 874 && (cp < 1250 || cp > 1258)) 1482232153Smm cp = -1;/* This may invalid code. */ 1483232153Smm } 1484232153Smm break; 1485232153Smm } 1486232153Smm return (cp); 1487232153Smm} 1488232153Smm 1489232153Smm/* 1490232153Smm * Return ANSI Code Page of current locale set by setlocale(). 
1491232153Smm */ 1492232153Smmstatic unsigned 1493232153Smmget_current_codepage(void) 1494232153Smm{ 1495232153Smm char *locale, *p; 1496232153Smm unsigned cp; 1497232153Smm 1498232153Smm locale = setlocale(LC_CTYPE, NULL); 1499232153Smm if (locale == NULL) 1500232153Smm return (GetACP()); 1501232153Smm if (locale[0] == 'C' && locale[1] == '\0') 1502232153Smm return (CP_C_LOCALE); 1503232153Smm p = strrchr(locale, '.'); 1504232153Smm if (p == NULL) 1505232153Smm return (GetACP()); 1506232153Smm cp = my_atoi(p+1); 1507232153Smm if (cp <= 0) 1508232153Smm return (GetACP()); 1509232153Smm return (cp); 1510232153Smm} 1511232153Smm 1512232153Smm/* 1513232153Smm * Translation table between Locale Name and ACP/OEMCP. 1514232153Smm */ 1515232153Smmstatic struct { 1516232153Smm unsigned acp; 1517232153Smm unsigned ocp; 1518232153Smm const char *locale; 1519232153Smm} acp_ocp_map[] = { 1520232153Smm { 950, 950, "Chinese_Taiwan" }, 1521232153Smm { 936, 936, "Chinese_People's Republic of China" }, 1522232153Smm { 950, 950, "Chinese_Taiwan" }, 1523232153Smm { 1250, 852, "Czech_Czech Republic" }, 1524232153Smm { 1252, 850, "Danish_Denmark" }, 1525232153Smm { 1252, 850, "Dutch_Netherlands" }, 1526232153Smm { 1252, 850, "Dutch_Belgium" }, 1527232153Smm { 1252, 437, "English_United States" }, 1528232153Smm { 1252, 850, "English_Australia" }, 1529232153Smm { 1252, 850, "English_Canada" }, 1530232153Smm { 1252, 850, "English_New Zealand" }, 1531232153Smm { 1252, 850, "English_United Kingdom" }, 1532232153Smm { 1252, 437, "English_United States" }, 1533232153Smm { 1252, 850, "Finnish_Finland" }, 1534232153Smm { 1252, 850, "French_France" }, 1535232153Smm { 1252, 850, "French_Belgium" }, 1536232153Smm { 1252, 850, "French_Canada" }, 1537232153Smm { 1252, 850, "French_Switzerland" }, 1538232153Smm { 1252, 850, "German_Germany" }, 1539232153Smm { 1252, 850, "German_Austria" }, 1540232153Smm { 1252, 850, "German_Switzerland" }, 1541232153Smm { 1253, 737, "Greek_Greece" }, 1542232153Smm { 
1250, 852, "Hungarian_Hungary" }, 1543232153Smm { 1252, 850, "Icelandic_Iceland" }, 1544232153Smm { 1252, 850, "Italian_Italy" }, 1545232153Smm { 1252, 850, "Italian_Switzerland" }, 1546232153Smm { 932, 932, "Japanese_Japan" }, 1547232153Smm { 949, 949, "Korean_Korea" }, 1548232153Smm { 1252, 850, "Norwegian (BokmOl)_Norway" }, 1549232153Smm { 1252, 850, "Norwegian (BokmOl)_Norway" }, 1550232153Smm { 1252, 850, "Norwegian-Nynorsk_Norway" }, 1551232153Smm { 1250, 852, "Polish_Poland" }, 1552232153Smm { 1252, 850, "Portuguese_Portugal" }, 1553232153Smm { 1252, 850, "Portuguese_Brazil" }, 1554232153Smm { 1251, 866, "Russian_Russia" }, 1555232153Smm { 1250, 852, "Slovak_Slovakia" }, 1556232153Smm { 1252, 850, "Spanish_Spain" }, 1557232153Smm { 1252, 850, "Spanish_Mexico" }, 1558232153Smm { 1252, 850, "Spanish_Spain" }, 1559232153Smm { 1252, 850, "Swedish_Sweden" }, 1560232153Smm { 1254, 857, "Turkish_Turkey" }, 1561232153Smm { 0, 0, NULL} 1562232153Smm}; 1563232153Smm 1564232153Smm/* 1565232153Smm * Return OEM Code Page of current locale set by setlocale(). 1566232153Smm */ 1567232153Smmstatic unsigned 1568232153Smmget_current_oemcp(void) 1569232153Smm{ 1570232153Smm int i; 1571232153Smm char *locale, *p; 1572232153Smm size_t len; 1573232153Smm 1574232153Smm locale = setlocale(LC_CTYPE, NULL); 1575232153Smm if (locale == NULL) 1576232153Smm return (GetOEMCP()); 1577232153Smm if (locale[0] == 'C' && locale[1] == '\0') 1578232153Smm return (CP_C_LOCALE); 1579232153Smm 1580232153Smm p = strrchr(locale, '.'); 1581232153Smm if (p == NULL) 1582232153Smm return (GetOEMCP()); 1583232153Smm len = p - locale; 1584232153Smm for (i = 0; acp_ocp_map[i].acp; i++) { 1585232153Smm if (strncmp(acp_ocp_map[i].locale, locale, len) == 0) 1586232153Smm return (acp_ocp_map[i].ocp); 1587232153Smm } 1588232153Smm return (GetOEMCP()); 1589232153Smm} 1590232153Smm#else 1591232153Smm 1592232153Smm/* 1593232153Smm * POSIX platform does not use CodePage. 
1594232153Smm */ 1595232153Smm 1596232153Smmstatic unsigned 1597232153Smmget_current_codepage(void) 1598232153Smm{ 1599232153Smm return (-1);/* Unknown */ 1600232153Smm} 1601232153Smmstatic unsigned 1602232153Smmmake_codepage_from_charset(const char *charset) 1603232153Smm{ 1604232153Smm (void)charset; /* UNUSED */ 1605232153Smm return (-1);/* Unknown */ 1606232153Smm} 1607232153Smmstatic unsigned 1608232153Smmget_current_oemcp(void) 1609232153Smm{ 1610232153Smm return (-1);/* Unknown */ 1611232153Smm} 1612232153Smm 1613232153Smm#endif /* defined(_WIN32) && !defined(__CYGWIN__) */ 1614232153Smm 1615232153Smm/* 1616232153Smm * Return a string conversion object. 1617232153Smm */ 1618232153Smmstatic struct archive_string_conv * 1619232153Smmget_sconv_object(struct archive *a, const char *fc, const char *tc, int flag) 1620232153Smm{ 1621232153Smm struct archive_string_conv *sc; 1622232153Smm unsigned current_codepage; 1623232153Smm 1624232153Smm /* Check if we have made the sconv object. */ 1625232153Smm sc = find_sconv_object(a, fc, tc); 1626232153Smm if (sc != NULL) 1627232153Smm return (sc); 1628232153Smm 1629232153Smm if (a == NULL) 1630232153Smm current_codepage = get_current_codepage(); 1631232153Smm else 1632232153Smm current_codepage = a->current_codepage; 1633232153Smm 1634232153Smm sc = create_sconv_object(canonical_charset_name(fc), 1635232153Smm canonical_charset_name(tc), current_codepage, flag); 1636232153Smm if (sc == NULL) { 1637232153Smm if (a != NULL) 1638232153Smm archive_set_error(a, ENOMEM, 1639232153Smm "Could not allocate memory for " 1640232153Smm "a string conversion object"); 1641232153Smm return (NULL); 1642232153Smm } 1643232153Smm 1644232153Smm /* 1645232153Smm * If there is no converter for current string conversion object, 1646232153Smm * we cannot handle this conversion. 
1647232153Smm */ 1648232153Smm if (sc->nconverter == 0) { 1649232153Smm if (a != NULL) { 1650232153Smm#if HAVE_ICONV 1651232153Smm archive_set_error(a, ARCHIVE_ERRNO_MISC, 1652232153Smm "iconv_open failed : Cannot handle ``%s''", 1653232153Smm (flag & SCONV_TO_CHARSET)?tc:fc); 1654232153Smm#else 1655232153Smm archive_set_error(a, ARCHIVE_ERRNO_MISC, 1656232153Smm "A character-set conversion not fully supported " 1657232153Smm "on this platform"); 1658232153Smm#endif 1659232153Smm } 1660232153Smm /* Failed; free a sconv object. */ 1661232153Smm free_sconv_object(sc); 1662232153Smm return (NULL); 1663232153Smm } 1664232153Smm 1665232153Smm /* 1666232153Smm * Success! 1667232153Smm */ 1668232153Smm if (a != NULL) 1669232153Smm add_sconv_object(a, sc); 1670232153Smm return (sc); 1671232153Smm} 1672232153Smm 1673232153Smmstatic const char * 1674232153Smmget_current_charset(struct archive *a) 1675232153Smm{ 1676232153Smm const char *cur_charset; 1677232153Smm 1678232153Smm if (a == NULL) 1679232153Smm cur_charset = default_iconv_charset(""); 1680232153Smm else { 1681232153Smm cur_charset = default_iconv_charset(a->current_code); 1682232153Smm if (a->current_code == NULL) { 1683232153Smm a->current_code = strdup(cur_charset); 1684232153Smm a->current_codepage = get_current_codepage(); 1685232153Smm a->current_oemcp = get_current_oemcp(); 1686232153Smm } 1687232153Smm } 1688232153Smm return (cur_charset); 1689232153Smm} 1690232153Smm 1691232153Smm/* 1692232153Smm * Make and Return a string conversion object. 1693232153Smm * Return NULL if the platform does not support the specified conversion 1694232153Smm * and best_effort is 0. 1695232153Smm * If best_effort is set, A string conversion object must be returned 1696232153Smm * unless memory allocation for the object fails, but the conversion 1697232153Smm * might fail when non-ASCII code is found. 
1698232153Smm */ 1699232153Smmstruct archive_string_conv * 1700232153Smmarchive_string_conversion_to_charset(struct archive *a, const char *charset, 1701232153Smm int best_effort) 1702232153Smm{ 1703232153Smm int flag = SCONV_TO_CHARSET; 1704232153Smm 1705232153Smm if (best_effort) 1706232153Smm flag |= SCONV_BEST_EFFORT; 1707232153Smm return (get_sconv_object(a, get_current_charset(a), charset, flag)); 1708232153Smm} 1709232153Smm 1710232153Smmstruct archive_string_conv * 1711232153Smmarchive_string_conversion_from_charset(struct archive *a, const char *charset, 1712232153Smm int best_effort) 1713232153Smm{ 1714232153Smm int flag = SCONV_FROM_CHARSET; 1715232153Smm 1716232153Smm if (best_effort) 1717232153Smm flag |= SCONV_BEST_EFFORT; 1718232153Smm return (get_sconv_object(a, charset, get_current_charset(a), flag)); 1719232153Smm} 1720232153Smm 1721232153Smm/* 1722232153Smm * archive_string_default_conversion_*_archive() are provided for Windows 1723232153Smm * platform because other archiver application use CP_OEMCP for 1724232153Smm * MultiByteToWideChar() and WideCharToMultiByte() for the filenames 1725232153Smm * in tar or zip files. But mbstowcs/wcstombs(CRT) usually use CP_ACP 1726232153Smm * unless you use setlocale(LC_ALL, ".OCP")(specify CP_OEMCP). 1727232153Smm * So we should make a string conversion between CP_ACP and CP_OEMCP 1728232153Smm * for compatibillty. 1729232153Smm */ 1730232153Smm#if defined(_WIN32) && !defined(__CYGWIN__) 1731232153Smmstruct archive_string_conv * 1732232153Smmarchive_string_default_conversion_for_read(struct archive *a) 1733232153Smm{ 1734232153Smm const char *cur_charset = get_current_charset(a); 1735232153Smm char oemcp[16]; 1736232153Smm 1737232153Smm /* NOTE: a check of cur_charset is unneeded but we need 1738232153Smm * that get_current_charset() has been surely called at 1739232153Smm * this time whatever C compiler optimized. 
*/ 1740232153Smm if (cur_charset != NULL && 1741232153Smm (a->current_codepage == CP_C_LOCALE || 1742232153Smm a->current_codepage == a->current_oemcp)) 1743232153Smm return (NULL);/* no conversion. */ 1744232153Smm 1745232153Smm _snprintf(oemcp, sizeof(oemcp)-1, "CP%d", a->current_oemcp); 1746232153Smm /* Make sure a null termination must be set. */ 1747232153Smm oemcp[sizeof(oemcp)-1] = '\0'; 1748232153Smm return (get_sconv_object(a, oemcp, cur_charset, 1749232153Smm SCONV_FROM_CHARSET)); 1750232153Smm} 1751232153Smm 1752232153Smmstruct archive_string_conv * 1753232153Smmarchive_string_default_conversion_for_write(struct archive *a) 1754232153Smm{ 1755232153Smm const char *cur_charset = get_current_charset(a); 1756232153Smm char oemcp[16]; 1757232153Smm 1758232153Smm /* NOTE: a check of cur_charset is unneeded but we need 1759232153Smm * that get_current_charset() has been surely called at 1760232153Smm * this time whatever C compiler optimized. */ 1761232153Smm if (cur_charset != NULL && 1762232153Smm (a->current_codepage == CP_C_LOCALE || 1763232153Smm a->current_codepage == a->current_oemcp)) 1764232153Smm return (NULL);/* no conversion. */ 1765232153Smm 1766232153Smm _snprintf(oemcp, sizeof(oemcp)-1, "CP%d", a->current_oemcp); 1767232153Smm /* Make sure a null termination must be set. 
*/ 1768232153Smm oemcp[sizeof(oemcp)-1] = '\0'; 1769232153Smm return (get_sconv_object(a, cur_charset, oemcp, 1770232153Smm SCONV_TO_CHARSET)); 1771232153Smm} 1772232153Smm#else 1773232153Smmstruct archive_string_conv * 1774232153Smmarchive_string_default_conversion_for_read(struct archive *a) 1775232153Smm{ 1776232153Smm (void)a; /* UNUSED */ 1777232153Smm return (NULL); 1778232153Smm} 1779232153Smm 1780232153Smmstruct archive_string_conv * 1781232153Smmarchive_string_default_conversion_for_write(struct archive *a) 1782232153Smm{ 1783232153Smm (void)a; /* UNUSED */ 1784232153Smm return (NULL); 1785232153Smm} 1786232153Smm#endif 1787232153Smm 1788232153Smm/* 1789232153Smm * Dispose of all character conversion objects in the archive object. 1790232153Smm */ 1791232153Smmvoid 1792232153Smmarchive_string_conversion_free(struct archive *a) 1793232153Smm{ 1794232153Smm struct archive_string_conv *sc; 1795232153Smm struct archive_string_conv *sc_next; 1796232153Smm 1797232153Smm for (sc = a->sconv; sc != NULL; sc = sc_next) { 1798232153Smm sc_next = sc->next; 1799232153Smm free_sconv_object(sc); 1800232153Smm } 1801232153Smm a->sconv = NULL; 1802232153Smm free(a->current_code); 1803232153Smm a->current_code = NULL; 1804232153Smm} 1805232153Smm 1806232153Smm/* 1807232153Smm * Return a conversion charset name. 1808232153Smm */ 1809232153Smmconst char * 1810232153Smmarchive_string_conversion_charset_name(struct archive_string_conv *sc) 1811232153Smm{ 1812232153Smm if (sc->flag & SCONV_TO_CHARSET) 1813232153Smm return (sc->to_charset); 1814232153Smm else 1815232153Smm return (sc->from_charset); 1816232153Smm} 1817232153Smm 1818232153Smm/* 1819232153Smm * Change the behavior of a string conversion. 
1820232153Smm */ 1821232153Smmvoid 1822232153Smmarchive_string_conversion_set_opt(struct archive_string_conv *sc, int opt) 1823232153Smm{ 1824232153Smm switch (opt) { 1825232153Smm /* 1826232153Smm * A filename in UTF-8 was made with libarchive 2.x in a wrong 1827232153Smm * assumption that wchar_t was Unicode. 1828232153Smm * This option enables simulating the assumption in order to read 1829232153Smm * that filname correctly. 1830232153Smm */ 1831232153Smm case SCONV_SET_OPT_UTF8_LIBARCHIVE2X: 1832232153Smm#if (defined(_WIN32) && !defined(__CYGWIN__)) \ 1833232153Smm || defined(__STDC_ISO_10646__) || defined(__APPLE__) 1834232153Smm /* 1835232153Smm * Nothing to do for it since wchar_t on these platforms 1836232153Smm * is really Unicode. 1837232153Smm */ 1838232153Smm (void)sc; /* UNUSED */ 1839232153Smm#else 1840232153Smm if ((sc->flag & SCONV_UTF8_LIBARCHIVE_2) == 0) { 1841232153Smm sc->flag |= SCONV_UTF8_LIBARCHIVE_2; 1842238856Smm /* Set up string converters. */ 1843232153Smm setup_converter(sc); 1844232153Smm } 1845232153Smm#endif 1846232153Smm break; 1847238856Smm case SCONV_SET_OPT_NORMALIZATION_C: 1848238856Smm if ((sc->flag & SCONV_NORMALIZATION_C) == 0) { 1849238856Smm sc->flag |= SCONV_NORMALIZATION_C; 1850238856Smm sc->flag &= ~SCONV_NORMALIZATION_D; 1851238856Smm /* Set up string converters. */ 1852238856Smm setup_converter(sc); 1853238856Smm } 1854238856Smm break; 1855238856Smm case SCONV_SET_OPT_NORMALIZATION_D: 1856238856Smm#if defined(HAVE_ICONV) 1857238856Smm /* 1858238856Smm * If iconv will take the string, do not change the 1859238856Smm * setting of the normalization. 
1860238856Smm */ 1861238856Smm if (!(sc->flag & SCONV_WIN_CP) && 1862238856Smm (sc->flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8)) && 1863238856Smm !(sc->flag & (SCONV_TO_UTF16 | SCONV_TO_UTF8))) 1864238856Smm break; 1865238856Smm#endif 1866238856Smm if ((sc->flag & SCONV_NORMALIZATION_D) == 0) { 1867238856Smm sc->flag |= SCONV_NORMALIZATION_D; 1868238856Smm sc->flag &= ~SCONV_NORMALIZATION_C; 1869238856Smm /* Set up string converters. */ 1870238856Smm setup_converter(sc); 1871238856Smm } 1872238856Smm break; 1873232153Smm default: 1874232153Smm break; 1875232153Smm } 1876232153Smm} 1877232153Smm 1878232153Smm/* 1879232153Smm * 1880232153Smm * Copy one archive_string to another in locale conversion. 1881232153Smm * 1882238856Smm * archive_strncat_l(); 1883238856Smm * archive_strncpy_l(); 1884232153Smm * 1885232153Smm */ 1886232153Smm 1887232153Smmstatic size_t 1888232153Smmmbsnbytes(const void *_p, size_t n) 1889232153Smm{ 1890232153Smm size_t s; 1891232153Smm const char *p, *pp; 1892232153Smm 1893232153Smm if (_p == NULL) 1894232153Smm return (0); 1895232153Smm p = (const char *)_p; 1896232153Smm 1897232153Smm /* Like strlen(p), except won't examine positions beyond p[n]. */ 1898232153Smm s = 0; 1899232153Smm pp = p; 1900232153Smm while (s < n && *pp) { 1901232153Smm pp++; 1902232153Smm s++; 1903232153Smm } 1904232153Smm return (s); 1905232153Smm} 1906232153Smm 1907232153Smmstatic size_t 1908232153Smmutf16nbytes(const void *_p, size_t n) 1909232153Smm{ 1910232153Smm size_t s; 1911232153Smm const char *p, *pp; 1912232153Smm 1913232153Smm if (_p == NULL) 1914232153Smm return (0); 1915232153Smm p = (const char *)_p; 1916232153Smm 1917232153Smm /* Like strlen(p), except won't examine positions beyond p[n]. 
*/ 1918232153Smm s = 0; 1919232153Smm pp = p; 1920232153Smm n >>= 1; 1921232153Smm while (s < n && (pp[0] || pp[1])) { 1922232153Smm pp += 2; 1923232153Smm s++; 1924232153Smm } 1925232153Smm return (s<<1); 1926232153Smm} 1927232153Smm 1928232153Smmint 1929238856Smmarchive_strncpy_l(struct archive_string *as, const void *_p, size_t n, 1930232153Smm struct archive_string_conv *sc) 1931232153Smm{ 1932232153Smm as->length = 0; 1933238856Smm return (archive_strncat_l(as, _p, n, sc)); 1934232153Smm} 1935232153Smm 1936232153Smmint 1937238856Smmarchive_strncat_l(struct archive_string *as, const void *_p, size_t n, 1938232153Smm struct archive_string_conv *sc) 1939232153Smm{ 1940232153Smm const void *s; 1941232153Smm size_t length; 1942232153Smm int i, r = 0, r2; 1943232153Smm 1944232153Smm /* We must allocate memory even if there is no data for conversion 1945232153Smm * or copy. This simulates archive_string_append behavior. */ 1946232153Smm if (_p == NULL || n == 0) { 1947232153Smm int tn = 1; 1948232153Smm if (sc != NULL && (sc->flag & SCONV_TO_UTF16)) 1949232153Smm tn = 2; 1950232153Smm if (archive_string_ensure(as, as->length + tn) == NULL) 1951232153Smm return (-1); 1952232153Smm as->s[as->length] = 0; 1953232153Smm if (tn == 2) 1954232153Smm as->s[as->length+1] = 0; 1955232153Smm return (0); 1956232153Smm } 1957232153Smm 1958232153Smm /* 1959232153Smm * If sc is NULL, we just make a copy. 
1960232153Smm */ 1961232153Smm if (sc == NULL) { 1962232153Smm length = mbsnbytes(_p, n); 1963232153Smm if (archive_string_append(as, _p, length) == NULL) 1964232153Smm return (-1);/* No memory */ 1965232153Smm return (0); 1966232153Smm } 1967232153Smm 1968232153Smm if (sc->flag & SCONV_FROM_UTF16) 1969232153Smm length = utf16nbytes(_p, n); 1970232153Smm else 1971232153Smm length = mbsnbytes(_p, n); 1972232153Smm s = _p; 1973232153Smm i = 0; 1974232153Smm if (sc->nconverter > 1) { 1975232153Smm sc->utftmp.length = 0; 1976232153Smm r2 = sc->converter[0](&(sc->utftmp), s, length, sc); 1977232153Smm if (r2 != 0 && errno == ENOMEM) 1978232153Smm return (r2); 1979232153Smm if (r > r2) 1980232153Smm r = r2; 1981232153Smm s = sc->utftmp.s; 1982232153Smm length = sc->utftmp.length; 1983232153Smm ++i; 1984232153Smm } 1985232153Smm r2 = sc->converter[i](as, s, length, sc); 1986232153Smm if (r > r2) 1987232153Smm r = r2; 1988232153Smm return (r); 1989232153Smm} 1990232153Smm 1991232153Smm#if HAVE_ICONV 1992232153Smm 1993232153Smm/* 1994232153Smm * Return -1 if conversion failes. 
1995232153Smm */ 1996232153Smmstatic int 1997232153Smmiconv_strncat_in_locale(struct archive_string *as, const void *_p, 1998232153Smm size_t length, struct archive_string_conv *sc) 1999232153Smm{ 2000248616Smm ICONV_CONST char *itp; 2001232153Smm size_t remaining; 2002232153Smm iconv_t cd; 2003232153Smm char *outp; 2004232153Smm size_t avail, bs; 2005232153Smm int return_value = 0; /* success */ 2006232153Smm int to_size, from_size; 2007232153Smm 2008232153Smm if (sc->flag & SCONV_TO_UTF16) 2009232153Smm to_size = 2; 2010232153Smm else 2011232153Smm to_size = 1; 2012232153Smm if (sc->flag & SCONV_FROM_UTF16) 2013232153Smm from_size = 2; 2014232153Smm else 2015232153Smm from_size = 1; 2016232153Smm 2017232153Smm if (archive_string_ensure(as, as->length + length*2+to_size) == NULL) 2018232153Smm return (-1); 2019232153Smm 2020232153Smm cd = sc->cd; 2021248616Smm itp = (char *)(uintptr_t)_p; 2022232153Smm remaining = length; 2023232153Smm outp = as->s + as->length; 2024232153Smm avail = as->buffer_length - as->length - to_size; 2025232153Smm while (remaining >= (size_t)from_size) { 2026248616Smm size_t result = iconv(cd, &itp, &remaining, &outp, &avail); 2027232153Smm 2028232153Smm if (result != (size_t)-1) 2029232153Smm break; /* Conversion completed. */ 2030232153Smm 2031232153Smm if (errno == EILSEQ || errno == EINVAL) { 2032232153Smm /* 2033232153Smm * If an output charset is UTF-8 or UTF-16BE/LE, 2034232153Smm * unknown character should be U+FFFD 2035232153Smm * (replacement character). 
2036232153Smm */ 2037232153Smm if (sc->flag & (SCONV_TO_UTF8 | SCONV_TO_UTF16)) { 2038232153Smm size_t rbytes; 2039232153Smm if (sc->flag & SCONV_TO_UTF8) 2040232153Smm rbytes = UTF8_R_CHAR_SIZE; 2041232153Smm else 2042232153Smm rbytes = 2; 2043232153Smm 2044232153Smm if (avail < rbytes) { 2045232153Smm as->length = outp - as->s; 2046232153Smm bs = as->buffer_length + 2047232153Smm (remaining * to_size) + rbytes; 2048232153Smm if (NULL == 2049232153Smm archive_string_ensure(as, bs)) 2050232153Smm return (-1); 2051232153Smm outp = as->s + as->length; 2052232153Smm avail = as->buffer_length 2053232153Smm - as->length - to_size; 2054232153Smm } 2055232153Smm if (sc->flag & SCONV_TO_UTF8) 2056232153Smm UTF8_SET_R_CHAR(outp); 2057232153Smm else if (sc->flag & SCONV_TO_UTF16BE) 2058232153Smm archive_be16enc(outp, UNICODE_R_CHAR); 2059232153Smm else 2060232153Smm archive_le16enc(outp, UNICODE_R_CHAR); 2061232153Smm outp += rbytes; 2062232153Smm avail -= rbytes; 2063232153Smm } else { 2064232153Smm /* Skip the illegal input bytes. */ 2065232153Smm *outp++ = '?'; 2066232153Smm avail--; 2067232153Smm } 2068248616Smm itp += from_size; 2069232153Smm remaining -= from_size; 2070232153Smm return_value = -1; /* failure */ 2071228753Smm } else { 2072232153Smm /* E2BIG no output buffer, 2073232153Smm * Increase an output buffer. 
*/ 2074232153Smm as->length = outp - as->s; 2075232153Smm bs = as->buffer_length + remaining * 2; 2076232153Smm if (NULL == archive_string_ensure(as, bs)) 2077232153Smm return (-1); 2078232153Smm outp = as->s + as->length; 2079232153Smm avail = as->buffer_length - as->length - to_size; 2080228753Smm } 2081228753Smm } 2082232153Smm as->length = outp - as->s; 2083232153Smm as->s[as->length] = 0; 2084232153Smm if (to_size == 2) 2085232153Smm as->s[as->length+1] = 0; 2086232153Smm return (return_value); 2087228753Smm} 2088228753Smm 2089232153Smm#endif /* HAVE_ICONV */ 2090232153Smm 2091232153Smm 2092232153Smm#if defined(_WIN32) && !defined(__CYGWIN__) 2093232153Smm 2094232153Smm/* 2095232153Smm * Translate a string from a some CodePage to an another CodePage by 2096232153Smm * Windows APIs, and copy the result. Return -1 if conversion failes. 2097232153Smm */ 2098228753Smmstatic int 2099232153Smmstrncat_in_codepage(struct archive_string *as, 2100232153Smm const void *_p, size_t length, struct archive_string_conv *sc) 2101228753Smm{ 2102232153Smm const char *s = (const char *)_p; 2103232153Smm struct archive_wstring aws; 2104232153Smm size_t l; 2105232153Smm int r, saved_flag; 2106228753Smm 2107232153Smm archive_string_init(&aws); 2108232153Smm saved_flag = sc->flag; 2109232153Smm sc->flag &= ~(SCONV_NORMALIZATION_D | SCONV_NORMALIZATION_C); 2110232153Smm r = archive_wstring_append_from_mbs_in_codepage(&aws, s, length, sc); 2111232153Smm sc->flag = saved_flag; 2112232153Smm if (r != 0) { 2113232153Smm archive_wstring_free(&aws); 2114232153Smm if (errno != ENOMEM) 2115232153Smm archive_string_append(as, s, length); 2116232153Smm return (-1); 2117232153Smm } 2118232153Smm 2119232153Smm l = as->length; 2120232153Smm r = archive_string_append_from_wcs_in_codepage( 2121232153Smm as, aws.s, aws.length, sc); 2122232153Smm if (r != 0 && errno != ENOMEM && l == as->length) 2123232153Smm archive_string_append(as, s, length); 2124232153Smm archive_wstring_free(&aws); 2125232153Smm 
return (r); 2126232153Smm} 2127232153Smm 2128232153Smm/* 2129232153Smm * Test whether MBS ==> WCS is okay. 2130232153Smm */ 2131232153Smmstatic int 2132232153Smminvalid_mbs(const void *_p, size_t n, struct archive_string_conv *sc) 2133232153Smm{ 2134232153Smm const char *p = (const char *)_p; 2135232153Smm unsigned codepage; 2136232153Smm DWORD mbflag = MB_ERR_INVALID_CHARS; 2137232153Smm 2138232153Smm if (sc->flag & SCONV_FROM_CHARSET) 2139232153Smm codepage = sc->to_cp; 2140232153Smm else 2141232153Smm codepage = sc->from_cp; 2142232153Smm 2143232153Smm if (codepage == CP_C_LOCALE) 2144232153Smm return (0); 2145232153Smm if (codepage != CP_UTF8) 2146232153Smm mbflag |= MB_PRECOMPOSED; 2147232153Smm 2148248616Smm if (MultiByteToWideChar(codepage, mbflag, p, (int)n, NULL, 0) == 0) 2149232153Smm return (-1); /* Invalid */ 2150232153Smm return (0); /* Okay */ 2151232153Smm} 2152232153Smm 2153232153Smm#else 2154232153Smm 2155232153Smm/* 2156232153Smm * Test whether MBS ==> WCS is okay. 2157232153Smm */ 2158232153Smmstatic int 2159232153Smminvalid_mbs(const void *_p, size_t n, struct archive_string_conv *sc) 2160232153Smm{ 2161232153Smm const char *p = (const char *)_p; 2162232153Smm size_t r; 2163232153Smm 2164232153Smm#if HAVE_MBRTOWC 2165232153Smm mbstate_t shift_state; 2166232153Smm 2167232153Smm memset(&shift_state, 0, sizeof(shift_state)); 2168232153Smm#else 2169232153Smm /* Clear the shift state before starting. */ 2170232153Smm mbtowc(NULL, NULL, 0); 2171232153Smm#endif 2172232153Smm while (n) { 2173232153Smm wchar_t wc; 2174232153Smm 2175232153Smm#if HAVE_MBRTOWC 2176232153Smm r = mbrtowc(&wc, p, n, &shift_state); 2177232153Smm#else 2178232153Smm r = mbtowc(&wc, p, n); 2179232153Smm#endif 2180232153Smm if (r == (size_t)-1 || r == (size_t)-2) 2181232153Smm return (-1);/* Invalid. 
*/ 2182232153Smm if (r == 0) 2183232153Smm break; 2184232153Smm p += r; 2185232153Smm n -= r; 2186232153Smm } 2187238856Smm (void)sc; /* UNUSED */ 2188232153Smm return (0); /* All Okey. */ 2189232153Smm} 2190232153Smm 2191232153Smm#endif /* defined(_WIN32) && !defined(__CYGWIN__) */ 2192232153Smm 2193232153Smm/* 2194232153Smm * Basically returns -1 because we cannot make a conversion of charset 2195232153Smm * without iconv but in some cases this would return 0. 2196232153Smm * Returns 0 if all copied characters are ASCII. 2197232153Smm * Returns 0 if both from-locale and to-locale are the same and those 2198232153Smm * can be WCS with no error. 2199232153Smm */ 2200232153Smmstatic int 2201232153Smmbest_effort_strncat_in_locale(struct archive_string *as, const void *_p, 2202232153Smm size_t length, struct archive_string_conv *sc) 2203232153Smm{ 2204232153Smm size_t remaining; 2205248616Smm char *otp; 2206248616Smm const uint8_t *itp; 2207232153Smm size_t avail; 2208232153Smm int return_value = 0; /* success */ 2209232153Smm 2210232153Smm /* 2211232153Smm * If both from-locale and to-locale is the same, this makes a copy. 2212232153Smm * And then this checks all copied MBS can be WCS if so returns 0. 2213232153Smm */ 2214232153Smm if (sc->same) { 2215232153Smm if (archive_string_append(as, _p, length) == NULL) 2216232153Smm return (-1);/* No memory */ 2217232153Smm return (invalid_mbs(_p, length, sc)); 2218232153Smm } 2219232153Smm 2220232153Smm /* 2221232153Smm * If a character is ASCII, this just copies it. If not, this 2222232153Smm * assigns '?' charater instead but in UTF-8 locale this assigns 2223232153Smm * byte sequence 0xEF 0xBD 0xBD, which are code point U+FFFD, 2224232153Smm * a Replacement Character in Unicode. 
2225232153Smm */ 2226232153Smm if (archive_string_ensure(as, as->length + length + 1) == NULL) 2227232153Smm return (-1); 2228232153Smm 2229232153Smm remaining = length; 2230248616Smm itp = (const uint8_t *)_p; 2231248616Smm otp = as->s + as->length; 2232232153Smm avail = as->buffer_length - as->length -1; 2233248616Smm while (*itp && remaining > 0) { 2234248616Smm if (*itp > 127 && (sc->flag & SCONV_TO_UTF8)) { 2235232153Smm if (avail < UTF8_R_CHAR_SIZE) { 2236248616Smm as->length = otp - as->s; 2237232153Smm if (NULL == archive_string_ensure(as, 2238232153Smm as->buffer_length + remaining + 2239232153Smm UTF8_R_CHAR_SIZE)) 2240232153Smm return (-1); 2241248616Smm otp = as->s + as->length; 2242232153Smm avail = as->buffer_length - as->length -1; 2243232153Smm } 2244232153Smm /* 2245232153Smm * When coping a string in UTF-8, unknown character 2246232153Smm * should be U+FFFD (replacement character). 2247232153Smm */ 2248248616Smm UTF8_SET_R_CHAR(otp); 2249248616Smm otp += UTF8_R_CHAR_SIZE; 2250232153Smm avail -= UTF8_R_CHAR_SIZE; 2251248616Smm itp++; 2252232153Smm remaining--; 2253232153Smm return_value = -1; 2254248616Smm } else if (*itp > 127) { 2255248616Smm *otp++ = '?'; 2256248616Smm itp++; 2257232153Smm remaining--; 2258232153Smm return_value = -1; 2259232153Smm } else { 2260248616Smm *otp++ = (char)*itp++; 2261232153Smm remaining--; 2262232153Smm } 2263232153Smm } 2264248616Smm as->length = otp - as->s; 2265232153Smm as->s[as->length] = '\0'; 2266232153Smm return (return_value); 2267232153Smm} 2268232153Smm 2269232153Smm 2270232153Smm/* 2271232153Smm * Unicode conversion functions. 2272232153Smm * - UTF-8 <===> UTF-8 in removing surrogate pairs. 2273232153Smm * - UTF-8 NFD ===> UTF-8 NFC in removing surrogate pairs. 2274232153Smm * - UTF-8 made by libarchive 2.x ===> UTF-8. 2275232153Smm * - UTF-16BE <===> UTF-8. 2276232153Smm * 2277232153Smm */ 2278232153Smm 2279232153Smm/* 2280232153Smm * Utility to convert a single UTF-8 sequence. 
2281232153Smm * 2282232153Smm * Usually return used bytes, return used byte in negative value when 2283232153Smm * a unicode character is replaced with U+FFFD. 2284232153Smm * See also http://unicode.org/review/pr-121.html Public Review Issue #121 2285232153Smm * Recommended Practice for Replacement Characters. 2286232153Smm */ 2287232153Smmstatic int 2288232153Smm_utf8_to_unicode(uint32_t *pwc, const char *s, size_t n) 2289232153Smm{ 2290232153Smm static const char utf8_count[256] = { 2291232153Smm 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 00 - 0F */ 2292232153Smm 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 10 - 1F */ 2293232153Smm 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 20 - 2F */ 2294232153Smm 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 30 - 3F */ 2295232153Smm 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 40 - 4F */ 2296232153Smm 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 50 - 5F */ 2297232153Smm 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 60 - 6F */ 2298232153Smm 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 70 - 7F */ 2299232153Smm 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* 80 - 8F */ 2300232153Smm 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* 90 - 9F */ 2301232153Smm 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* A0 - AF */ 2302232153Smm 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* B0 - BF */ 2303232153Smm 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,/* C0 - CF */ 2304232153Smm 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,/* D0 - DF */ 2305232153Smm 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,/* E0 - EF */ 2306232153Smm 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0 - FF */ 2307232153Smm }; 2308232153Smm int ch, i; 2309232153Smm int cnt; 2310232153Smm uint32_t wc; 2311232153Smm 2312232153Smm /* Sanity check. */ 2313232153Smm if (n == 0) 2314232153Smm return (0); 2315232153Smm /* 2316228753Smm * Decode 1-4 bytes depending on the value of the first byte. 
2317228753Smm */ 2318232153Smm ch = (unsigned char)*s; 2319232153Smm if (ch == 0) 2320228753Smm return (0); /* Standard: return 0 for end-of-string. */ 2321232153Smm cnt = utf8_count[ch]; 2322232153Smm 2323232153Smm /* Invalide sequence or there are not plenty bytes. */ 2324232153Smm if ((int)n < cnt) { 2325248616Smm cnt = (int)n; 2326232153Smm for (i = 1; i < cnt; i++) { 2327232153Smm if ((s[i] & 0xc0) != 0x80) { 2328232153Smm cnt = i; 2329232153Smm break; 2330232153Smm } 2331232153Smm } 2332232153Smm goto invalid_sequence; 2333228753Smm } 2334232153Smm 2335232153Smm /* Make a Unicode code point from a single UTF-8 sequence. */ 2336232153Smm switch (cnt) { 2337232153Smm case 1: /* 1 byte sequence. */ 2338232153Smm *pwc = ch & 0x7f; 2339232153Smm return (cnt); 2340232153Smm case 2: /* 2 bytes sequence. */ 2341232153Smm if ((s[1] & 0xc0) != 0x80) { 2342232153Smm cnt = 1; 2343232153Smm goto invalid_sequence; 2344232153Smm } 2345232153Smm *pwc = ((ch & 0x1f) << 6) | (s[1] & 0x3f); 2346232153Smm return (cnt); 2347232153Smm case 3: /* 3 bytes sequence. */ 2348232153Smm if ((s[1] & 0xc0) != 0x80) { 2349232153Smm cnt = 1; 2350232153Smm goto invalid_sequence; 2351232153Smm } 2352232153Smm if ((s[2] & 0xc0) != 0x80) { 2353232153Smm cnt = 2; 2354232153Smm goto invalid_sequence; 2355232153Smm } 2356232153Smm wc = ((ch & 0x0f) << 12) 2357228753Smm | ((s[1] & 0x3f) << 6) 2358228753Smm | (s[2] & 0x3f); 2359232153Smm if (wc < 0x800) 2360232153Smm goto invalid_sequence;/* Overlong sequence. */ 2361232153Smm break; 2362232153Smm case 4: /* 4 bytes sequence. 
*/ 2363232153Smm if ((s[1] & 0xc0) != 0x80) { 2364232153Smm cnt = 1; 2365232153Smm goto invalid_sequence; 2366232153Smm } 2367232153Smm if ((s[2] & 0xc0) != 0x80) { 2368232153Smm cnt = 2; 2369232153Smm goto invalid_sequence; 2370232153Smm } 2371232153Smm if ((s[3] & 0xc0) != 0x80) { 2372232153Smm cnt = 3; 2373232153Smm goto invalid_sequence; 2374232153Smm } 2375232153Smm wc = ((ch & 0x07) << 18) 2376228753Smm | ((s[1] & 0x3f) << 12) 2377228753Smm | ((s[2] & 0x3f) << 6) 2378228753Smm | (s[3] & 0x3f); 2379232153Smm if (wc < 0x10000) 2380232153Smm goto invalid_sequence;/* Overlong sequence. */ 2381232153Smm break; 2382232153Smm default: /* Others are all invalid sequence. */ 2383232153Smm if (ch == 0xc0 || ch == 0xc1) 2384232153Smm cnt = 2; 2385232153Smm else if (ch >= 0xf5 && ch <= 0xf7) 2386232153Smm cnt = 4; 2387232153Smm else if (ch >= 0xf8 && ch <= 0xfb) 2388232153Smm cnt = 5; 2389232153Smm else if (ch == 0xfc || ch == 0xfd) 2390232153Smm cnt = 6; 2391232153Smm else 2392232153Smm cnt = 1; 2393232153Smm if ((int)n < cnt) 2394248616Smm cnt = (int)n; 2395232153Smm for (i = 1; i < cnt; i++) { 2396232153Smm if ((s[i] & 0xc0) != 0x80) { 2397232153Smm cnt = i; 2398232153Smm break; 2399232153Smm } 2400232153Smm } 2401232153Smm goto invalid_sequence; 2402232153Smm } 2403232153Smm 2404232153Smm /* The code point larger than 0x10FFFF is not leagal 2405232153Smm * Unicode values. */ 2406232153Smm if (wc > UNICODE_MAX) 2407232153Smm goto invalid_sequence; 2408232153Smm /* Correctly gets a Unicode, returns used bytes. */ 2409232153Smm *pwc = wc; 2410232153Smm return (cnt); 2411232153Smminvalid_sequence: 2412232153Smm *pwc = UNICODE_R_CHAR;/* set the Replacement Character instead. 
*/ 2413232153Smm return (cnt * -1); 2414232153Smm} 2415232153Smm 2416232153Smmstatic int 2417232153Smmutf8_to_unicode(uint32_t *pwc, const char *s, size_t n) 2418232153Smm{ 2419232153Smm int cnt; 2420232153Smm 2421232153Smm cnt = _utf8_to_unicode(pwc, s, n); 2422232153Smm /* Any of Surrogate pair is not leagal Unicode values. */ 2423232153Smm if (cnt == 3 && IS_SURROGATE_PAIR_LA(*pwc)) 2424232153Smm return (-3); 2425232153Smm return (cnt); 2426232153Smm} 2427232153Smm 2428232153Smmstatic inline uint32_t 2429232153Smmcombine_surrogate_pair(uint32_t uc, uint32_t uc2) 2430232153Smm{ 2431232153Smm uc -= 0xD800; 2432232153Smm uc *= 0x400; 2433232153Smm uc += uc2 - 0xDC00; 2434232153Smm uc += 0x10000; 2435232153Smm return (uc); 2436232153Smm} 2437232153Smm 2438232153Smm/* 2439232153Smm * Convert a single UTF-8/CESU-8 sequence to a Unicode code point in 2440232153Smm * removing surrogate pairs. 2441232153Smm * 2442232153Smm * CESU-8: The Compatibility Encoding Scheme for UTF-16. 2443232153Smm * 2444232153Smm * Usually return used bytes, return used byte in negative value when 2445232153Smm * a unicode character is replaced with U+FFFD. 2446232153Smm */ 2447232153Smmstatic int 2448232153Smmcesu8_to_unicode(uint32_t *pwc, const char *s, size_t n) 2449232153Smm{ 2450248616Smm uint32_t wc = 0; 2451232153Smm int cnt; 2452232153Smm 2453232153Smm cnt = _utf8_to_unicode(&wc, s, n); 2454232153Smm if (cnt == 3 && IS_HIGH_SURROGATE_LA(wc)) { 2455248616Smm uint32_t wc2 = 0; 2456232153Smm if (n - 3 < 3) { 2457232153Smm /* Invalid byte sequence. */ 2458232153Smm goto invalid_sequence; 2459232153Smm } 2460232153Smm cnt = _utf8_to_unicode(&wc2, s+3, n-3); 2461232153Smm if (cnt != 3 || !IS_LOW_SURROGATE_LA(wc2)) { 2462232153Smm /* Invalid byte sequence. */ 2463232153Smm goto invalid_sequence; 2464232153Smm } 2465232153Smm wc = combine_surrogate_pair(wc, wc2); 2466232153Smm cnt = 6; 2467232153Smm } else if (cnt == 3 && IS_LOW_SURROGATE_LA(wc)) { 2468232153Smm /* Invalid byte sequence. 
*/ 2469232153Smm goto invalid_sequence; 2470232153Smm } 2471232153Smm *pwc = wc; 2472232153Smm return (cnt); 2473232153Smminvalid_sequence: 2474232153Smm *pwc = UNICODE_R_CHAR;/* set the Replacement Character instead. */ 2475232153Smm if (cnt > 0) 2476232153Smm cnt *= -1; 2477232153Smm return (cnt); 2478232153Smm} 2479232153Smm 2480232153Smm/* 2481232153Smm * Convert a Unicode code point to a single UTF-8 sequence. 2482232153Smm * 2483232153Smm * NOTE:This function does not check if the Unicode is leagal or not. 2484232153Smm * Please you definitely check it before calling this. 2485232153Smm */ 2486232153Smmstatic size_t 2487232153Smmunicode_to_utf8(char *p, size_t remaining, uint32_t uc) 2488232153Smm{ 2489232153Smm char *_p = p; 2490232153Smm 2491232153Smm /* Translate code point to UTF8 */ 2492232153Smm if (uc <= 0x7f) { 2493232153Smm if (remaining == 0) 2494232153Smm return (0); 2495232153Smm *p++ = (char)uc; 2496232153Smm } else if (uc <= 0x7ff) { 2497232153Smm if (remaining < 2) 2498232153Smm return (0); 2499232153Smm *p++ = 0xc0 | ((uc >> 6) & 0x1f); 2500232153Smm *p++ = 0x80 | (uc & 0x3f); 2501232153Smm } else if (uc <= 0xffff) { 2502232153Smm if (remaining < 3) 2503232153Smm return (0); 2504232153Smm *p++ = 0xe0 | ((uc >> 12) & 0x0f); 2505232153Smm *p++ = 0x80 | ((uc >> 6) & 0x3f); 2506232153Smm *p++ = 0x80 | (uc & 0x3f); 2507232153Smm } else if (uc <= UNICODE_MAX) { 2508232153Smm if (remaining < 4) 2509232153Smm return (0); 2510232153Smm *p++ = 0xf0 | ((uc >> 18) & 0x07); 2511232153Smm *p++ = 0x80 | ((uc >> 12) & 0x3f); 2512232153Smm *p++ = 0x80 | ((uc >> 6) & 0x3f); 2513232153Smm *p++ = 0x80 | (uc & 0x3f); 2514232153Smm } else { 2515232153Smm /* 2516232153Smm * Undescribed code point should be U+FFFD 2517232153Smm * (replacement character). 
2518232153Smm */ 2519232153Smm if (remaining < UTF8_R_CHAR_SIZE) 2520232153Smm return (0); 2521232153Smm UTF8_SET_R_CHAR(p); 2522232153Smm p += UTF8_R_CHAR_SIZE; 2523232153Smm } 2524232153Smm return (p - _p); 2525232153Smm} 2526232153Smm 2527232153Smmstatic int 2528232153Smmutf16be_to_unicode(uint32_t *pwc, const char *s, size_t n) 2529232153Smm{ 2530232153Smm return (utf16_to_unicode(pwc, s, n, 1)); 2531232153Smm} 2532232153Smm 2533232153Smmstatic int 2534232153Smmutf16le_to_unicode(uint32_t *pwc, const char *s, size_t n) 2535232153Smm{ 2536232153Smm return (utf16_to_unicode(pwc, s, n, 0)); 2537232153Smm} 2538232153Smm 2539232153Smmstatic int 2540232153Smmutf16_to_unicode(uint32_t *pwc, const char *s, size_t n, int be) 2541232153Smm{ 2542232153Smm const char *utf16 = s; 2543232153Smm unsigned uc; 2544232153Smm 2545232153Smm if (n == 0) 2546232153Smm return (0); 2547232153Smm if (n == 1) { 2548232153Smm /* set the Replacement Character instead. */ 2549232153Smm *pwc = UNICODE_R_CHAR; 2550232153Smm return (-1); 2551232153Smm } 2552232153Smm 2553232153Smm if (be) 2554232153Smm uc = archive_be16dec(utf16); 2555232153Smm else 2556232153Smm uc = archive_le16dec(utf16); 2557232153Smm utf16 += 2; 2558232153Smm 2559232153Smm /* If this is a surrogate pair, assemble the full code point.*/ 2560232153Smm if (IS_HIGH_SURROGATE_LA(uc)) { 2561232153Smm unsigned uc2; 2562232153Smm 2563232153Smm if (n >= 4) { 2564232153Smm if (be) 2565232153Smm uc2 = archive_be16dec(utf16); 2566232153Smm else 2567232153Smm uc2 = archive_le16dec(utf16); 2568232153Smm } else 2569232153Smm uc2 = 0; 2570232153Smm if (IS_LOW_SURROGATE_LA(uc2)) { 2571232153Smm uc = combine_surrogate_pair(uc, uc2); 2572232153Smm utf16 += 2; 2573232153Smm } else { 2574232153Smm /* Undescribed code point should be U+FFFD 2575232153Smm * (replacement character). 
*/ 2576232153Smm *pwc = UNICODE_R_CHAR; 2577232153Smm return (-2); 2578232153Smm } 2579232153Smm } 2580232153Smm 2581232153Smm /* 2582232153Smm * Surrogate pair values(0xd800 through 0xdfff) are only 2583232153Smm * used by UTF-16, so, after above culculation, the code 2584232153Smm * must not be surrogate values, and Unicode has no codes 2585232153Smm * larger than 0x10ffff. Thus, those are not leagal Unicode 2586232153Smm * values. 2587232153Smm */ 2588232153Smm if (IS_SURROGATE_PAIR_LA(uc) || uc > UNICODE_MAX) { 2589232153Smm /* Undescribed code point should be U+FFFD 2590232153Smm * (replacement character). */ 2591232153Smm *pwc = UNICODE_R_CHAR; 2592232153Smm return (((int)(utf16 - s)) * -1); 2593232153Smm } 2594232153Smm *pwc = uc; 2595232153Smm return ((int)(utf16 - s)); 2596232153Smm} 2597232153Smm 2598232153Smmstatic size_t 2599232153Smmunicode_to_utf16be(char *p, size_t remaining, uint32_t uc) 2600232153Smm{ 2601232153Smm char *utf16 = p; 2602232153Smm 2603232153Smm if (uc > 0xffff) { 2604232153Smm /* We have a code point that won't fit into a 2605232153Smm * wchar_t; convert it to a surrogate pair. */ 2606232153Smm if (remaining < 4) 2607232153Smm return (0); 2608232153Smm uc -= 0x10000; 2609232153Smm archive_be16enc(utf16, ((uc >> 10) & 0x3ff) + 0xD800); 2610232153Smm archive_be16enc(utf16+2, (uc & 0x3ff) + 0xDC00); 2611228753Smm return (4); 2612232153Smm } else { 2613232153Smm if (remaining < 2) 2614232153Smm return (0); 2615232153Smm archive_be16enc(utf16, uc); 2616232153Smm return (2); 2617232153Smm } 2618228753Smm} 2619228753Smm 2620232153Smmstatic size_t 2621232153Smmunicode_to_utf16le(char *p, size_t remaining, uint32_t uc) 2622232153Smm{ 2623232153Smm char *utf16 = p; 2624232153Smm 2625232153Smm if (uc > 0xffff) { 2626232153Smm /* We have a code point that won't fit into a 2627232153Smm * wchar_t; convert it to a surrogate pair. 
*/ 2628232153Smm if (remaining < 4) 2629232153Smm return (0); 2630232153Smm uc -= 0x10000; 2631232153Smm archive_le16enc(utf16, ((uc >> 10) & 0x3ff) + 0xD800); 2632232153Smm archive_le16enc(utf16+2, (uc & 0x3ff) + 0xDC00); 2633232153Smm return (4); 2634232153Smm } else { 2635232153Smm if (remaining < 2) 2636232153Smm return (0); 2637232153Smm archive_le16enc(utf16, uc); 2638232153Smm return (2); 2639232153Smm } 2640232153Smm} 2641232153Smm 2642228753Smm/* 2643232153Smm * Copy UTF-8 string in checking surrogate pair. 2644232153Smm * If any surrogate pair are found, it would be canonicalized. 2645228753Smm */ 2646232153Smmstatic int 2647238856Smmstrncat_from_utf8_to_utf8(struct archive_string *as, const void *_p, 2648238856Smm size_t len, struct archive_string_conv *sc) 2649228753Smm{ 2650232153Smm const char *s; 2651232153Smm char *p, *endp; 2652232153Smm int n, ret = 0; 2653228753Smm 2654232153Smm (void)sc; /* UNUSED */ 2655232153Smm 2656232153Smm if (archive_string_ensure(as, as->length + len + 1) == NULL) 2657232153Smm return (-1); 2658232153Smm 2659232153Smm s = (const char *)_p; 2660232153Smm p = as->s + as->length; 2661232153Smm endp = as->s + as->buffer_length -1; 2662232153Smm do { 2663232153Smm uint32_t uc; 2664232153Smm const char *ss = s; 2665232153Smm size_t w; 2666232153Smm 2667232153Smm /* 2668232153Smm * Forward byte sequence until a conversion of that is needed. 
2669232153Smm */ 2670232153Smm while ((n = utf8_to_unicode(&uc, s, len)) > 0) { 2671232153Smm s += n; 2672232153Smm len -= n; 2673232153Smm } 2674232153Smm if (ss < s) { 2675232153Smm if (p + (s - ss) > endp) { 2676232153Smm as->length = p - as->s; 2677232153Smm if (archive_string_ensure(as, 2678232153Smm as->buffer_length + len + 1) == NULL) 2679232153Smm return (-1); 2680232153Smm p = as->s + as->length; 2681232153Smm endp = as->s + as->buffer_length -1; 2682232153Smm } 2683232153Smm 2684232153Smm memcpy(p, ss, s - ss); 2685232153Smm p += s - ss; 2686232153Smm } 2687232153Smm 2688232153Smm /* 2689232153Smm * If n is negative, current byte sequence needs a replacement. 2690232153Smm */ 2691228753Smm if (n < 0) { 2692232153Smm if (n == -3 && IS_SURROGATE_PAIR_LA(uc)) { 2693232153Smm /* Current byte sequence may be CESU-8. */ 2694232153Smm n = cesu8_to_unicode(&uc, s, len); 2695232153Smm } 2696228753Smm if (n < 0) { 2697232153Smm ret = -1; 2698232153Smm n *= -1;/* Use a replaced unicode character. */ 2699228753Smm } 2700232153Smm 2701232153Smm /* Rebuild UTF-8 byte sequence. 
*/ 2702232153Smm while ((w = unicode_to_utf8(p, endp - p, uc)) == 0) { 2703232153Smm as->length = p - as->s; 2704232153Smm if (archive_string_ensure(as, 2705232153Smm as->buffer_length + len + 1) == NULL) 2706232153Smm return (-1); 2707232153Smm p = as->s + as->length; 2708232153Smm endp = as->s + as->buffer_length -1; 2709228753Smm } 2710232153Smm p += w; 2711232153Smm s += n; 2712232153Smm len -= n; 2713228753Smm } 2714232153Smm } while (n > 0); 2715232153Smm as->length = p - as->s; 2716232153Smm as->s[as->length] = '\0'; 2717232153Smm return (ret); 2718232153Smm} 2719232153Smm 2720232153Smmstatic int 2721232153Smmarchive_string_append_unicode(struct archive_string *as, const void *_p, 2722232153Smm size_t len, struct archive_string_conv *sc) 2723232153Smm{ 2724232153Smm const char *s; 2725232153Smm char *p, *endp; 2726232153Smm uint32_t uc; 2727232153Smm size_t w; 2728232153Smm int n, ret = 0, ts, tm; 2729232153Smm int (*parse)(uint32_t *, const char *, size_t); 2730232153Smm size_t (*unparse)(char *, size_t, uint32_t); 2731232153Smm 2732232153Smm if (sc->flag & SCONV_TO_UTF16BE) { 2733232153Smm unparse = unicode_to_utf16be; 2734232153Smm ts = 2; 2735232153Smm } else if (sc->flag & SCONV_TO_UTF16LE) { 2736232153Smm unparse = unicode_to_utf16le; 2737232153Smm ts = 2; 2738232153Smm } else if (sc->flag & SCONV_TO_UTF8) { 2739232153Smm unparse = unicode_to_utf8; 2740232153Smm ts = 1; 2741232153Smm } else { 2742232153Smm /* 2743232153Smm * This case is going to be converted to another 2744232153Smm * character-set through iconv. 
2745232153Smm */ 2746232153Smm if (sc->flag & SCONV_FROM_UTF16BE) { 2747232153Smm unparse = unicode_to_utf16be; 2748232153Smm ts = 2; 2749232153Smm } else if (sc->flag & SCONV_FROM_UTF16LE) { 2750232153Smm unparse = unicode_to_utf16le; 2751232153Smm ts = 2; 2752232153Smm } else { 2753232153Smm unparse = unicode_to_utf8; 2754232153Smm ts = 1; 2755232153Smm } 2756228753Smm } 2757232153Smm 2758232153Smm if (sc->flag & SCONV_FROM_UTF16BE) { 2759232153Smm parse = utf16be_to_unicode; 2760232153Smm tm = 1; 2761232153Smm } else if (sc->flag & SCONV_FROM_UTF16LE) { 2762232153Smm parse = utf16le_to_unicode; 2763232153Smm tm = 1; 2764232153Smm } else { 2765232153Smm parse = cesu8_to_unicode; 2766232153Smm tm = ts; 2767232153Smm } 2768232153Smm 2769232153Smm if (archive_string_ensure(as, as->length + len * tm + ts) == NULL) 2770232153Smm return (-1); 2771232153Smm 2772232153Smm s = (const char *)_p; 2773232153Smm p = as->s + as->length; 2774232153Smm endp = as->s + as->buffer_length - ts; 2775232153Smm while ((n = parse(&uc, s, len)) != 0) { 2776232153Smm if (n < 0) { 2777232153Smm /* Use a replaced unicode character. */ 2778232153Smm n *= -1; 2779232153Smm ret = -1; 2780232153Smm } 2781232153Smm s += n; 2782232153Smm len -= n; 2783232153Smm while ((w = unparse(p, endp - p, uc)) == 0) { 2784232153Smm /* There is not enough output buffer so 2785232153Smm * we have to expand it. 
*/ 2786232153Smm as->length = p - as->s; 2787232153Smm if (archive_string_ensure(as, 2788232153Smm as->buffer_length + len * tm + ts) == NULL) 2789232153Smm return (-1); 2790232153Smm p = as->s + as->length; 2791232153Smm endp = as->s + as->buffer_length - ts; 2792232153Smm } 2793232153Smm p += w; 2794232153Smm } 2795232153Smm as->length = p - as->s; 2796232153Smm as->s[as->length] = '\0'; 2797232153Smm if (ts == 2) 2798232153Smm as->s[as->length+1] = '\0'; 2799232153Smm return (ret); 2800228753Smm} 2801228753Smm 2802232153Smm/* 2803232153Smm * Following Constants for Hangul compositions this information comes from 2804232153Smm * Unicode Standard Annex #15 http://unicode.org/reports/tr15/ 2805232153Smm */ 2806232153Smm#define HC_SBASE 0xAC00 2807232153Smm#define HC_LBASE 0x1100 2808232153Smm#define HC_VBASE 0x1161 2809232153Smm#define HC_TBASE 0x11A7 2810232153Smm#define HC_LCOUNT 19 2811232153Smm#define HC_VCOUNT 21 2812232153Smm#define HC_TCOUNT 28 2813232153Smm#define HC_NCOUNT (HC_VCOUNT * HC_TCOUNT) 2814232153Smm#define HC_SCOUNT (HC_LCOUNT * HC_NCOUNT) 2815228753Smm 2816232153Smmstatic uint32_t 2817232153Smmget_nfc(uint32_t uc, uint32_t uc2) 2818232153Smm{ 2819232153Smm int t, b; 2820232153Smm 2821232153Smm t = 0; 2822232153Smm b = sizeof(u_composition_table)/sizeof(u_composition_table[0]) -1; 2823232153Smm while (b >= t) { 2824232153Smm int m = (t + b) / 2; 2825232153Smm if (u_composition_table[m].cp1 < uc) 2826232153Smm t = m + 1; 2827232153Smm else if (u_composition_table[m].cp1 > uc) 2828232153Smm b = m - 1; 2829232153Smm else if (u_composition_table[m].cp2 < uc2) 2830232153Smm t = m + 1; 2831232153Smm else if (u_composition_table[m].cp2 > uc2) 2832232153Smm b = m - 1; 2833232153Smm else 2834232153Smm return (u_composition_table[m].nfc); 2835232153Smm } 2836232153Smm return (0); 2837232153Smm} 2838232153Smm 2839232153Smm#define FDC_MAX 10 /* The maximum number of Following Decomposable 2840232153Smm * Characters. 
			 */

/*
 * The helper macros below are not self-contained: they read and write
 * local variables of the function that expands them (uc, uc2, ucptr,
 * uc2ptr, n, n2, as, p, endp, len, tm, ts, w, unparse), and
 * EXPAND_BUFFER() executes `return (-1)' from that function when the
 * allocation fails.
 */

/*
 * Update first code point.
 * The new value no longer matches the input bytes, so ucptr is cleared.
 */
#define UPDATE_UC(new_uc)	do {		\
	uc = new_uc;				\
	ucptr = NULL;				\
} while (0)

/*
 * Replace first code point with second code point.
 */
#define REPLACE_UC_WITH_UC2() do {		\
	uc = uc2;				\
	ucptr = uc2ptr;				\
	n = n2;					\
} while (0)

/*
 * Grow the output string and re-derive p and endp from the (possibly
 * moved) buffer.
 */
#define EXPAND_BUFFER() do {			\
	as->length = p - as->s;			\
	if (archive_string_ensure(as,		\
	    as->buffer_length + len * tm + ts) == NULL)\
		return (-1);			\
	p = as->s + as->length;			\
	endp = as->s + as->buffer_length - ts;	\
} while (0)

/*
 * Encode `uc' at `p' with unparse(), expanding the buffer for as long
 * as unparse() returns 0, then advance `p' past the written bytes.
 */
#define UNPARSE(p, endp, uc)	do {		\
	while ((w = unparse(p, (endp) - (p), uc)) == 0) {\
		EXPAND_BUFFER();		\
	}					\
	p += w;					\
} while (0)

/*
 * Write first code point.
 * If the code point has not been changed from its original code,
 * this just copies it from its original buffer pointer.
 * If not, this re-encodes it with unparse() and copies the result.
 */
/*
 * WRITE_UC() uses the expanding function's locals ucptr, n, p, endp and
 * uc.  The switch copies exactly n bytes for n in 1..4 by falling
 * through its cases; other values of n copy nothing.
 */
#define WRITE_UC()	do {			\
	if (ucptr) {				\
		if (p + n > endp)		\
			EXPAND_BUFFER();	\
		switch (n) {			\
		case 4:				\
			*p++ = *ucptr++;	\
			/* FALL THROUGH */	\
		case 3:				\
			*p++ = *ucptr++;	\
			/* FALL THROUGH */	\
		case 2:				\
			*p++ = *ucptr++;	\
			/* FALL THROUGH */	\
		case 1:				\
			*p++ = *ucptr;		\
			break;			\
		}				\
		ucptr = NULL;			\
	} else {				\
		UNPARSE(p, endp, uc);		\
	}					\
} while (0)

/*
 * Collect following decomposable code points.
 *
 * Starting at index `start', parses code points into ucx[] (and their
 * CCC() values into ccx[]) until parse() fails or reaches the end of
 * input, the combining-class test below stops the run, or FDC_MAX
 * entries are held.  Uses the expanding function's locals nx, cx, cl,
 * s, len, ucx, ccx, ucx_size and ret; ret is set to -1 when the
 * FDC_MAX limit is reached.
 */
#define COLLECT_CPS(start)	do {		\
	int _i;					\
	for (_i = start; _i < FDC_MAX ; _i++) {	\
		nx = parse(&ucx[_i], s, len);	\
		if (nx <= 0)			\
			break;			\
		cx = CCC(ucx[_i]);		\
		if (cl >= cx && cl != 228 && cx != 228)\
			break;			\
		s += nx;			\
		len -= nx;			\
		cl = cx;			\
		ccx[_i] = cx;			\
	}					\
	if (_i >= FDC_MAX) {			\
		ret = -1;			\
		ucx_size = FDC_MAX;		\
	} else					\
		ucx_size = _i;			\
} while (0)

/*
 * Normalize UTF-8/UTF-16BE characters to Form C and copy the result.
 *
 * TODO: Convert composition exclusions, which are never converted
 * from NFC, NFD, NFKC and NFKD, to Form C.
 */
/*
 * archive_string_normalize_C():
 *   as   destination string; the result is appended and terminated
 *        (with two NUL bytes when the output is UTF-16).
 *   _p   input bytes, decoded per the SCONV_FROM_* flags in sc->flag.
 *   len  number of input bytes.
 *   sc   supplies the flags that select the decoder and the encoder.
 *
 * Returns 0 on success, -1 on allocation failure, on malformed input
 * (which is replaced in the output), or when more than FDC_MAX
 * following combining code points were encountered.
 */
static int
archive_string_normalize_C(struct archive_string *as, const void *_p,
    size_t len, struct archive_string_conv *sc)
{
	const char *s = (const char *)_p;
	char *p, *endp;
	uint32_t uc, uc2;
	size_t w;
	int always_replace, n, n2, ret = 0, spair, ts, tm;
	int (*parse)(uint32_t *, const char *, size_t);
	size_t (*unparse)(char *, size_t, uint32_t);

	/*
	 * always_replace stays 1 when the output encoding differs from
	 * the input encoding; input bytes can then never be copied
	 * verbatim and every code point must go through unparse().
	 */
	always_replace = 1;
	ts = 1;/* text size. */
	if (sc->flag & SCONV_TO_UTF16BE) {
		unparse = unicode_to_utf16be;
		ts = 2;
		if (sc->flag & SCONV_FROM_UTF16BE)
			always_replace = 0;
	} else if (sc->flag & SCONV_TO_UTF16LE) {
		unparse = unicode_to_utf16le;
		ts = 2;
		if (sc->flag & SCONV_FROM_UTF16LE)
			always_replace = 0;
	} else if (sc->flag & SCONV_TO_UTF8) {
		unparse = unicode_to_utf8;
		if (sc->flag & SCONV_FROM_UTF8)
			always_replace = 0;
	} else {
		/*
		 * This case is going to be converted to another
		 * character-set through iconv.
		 */
		always_replace = 0;
		if (sc->flag & SCONV_FROM_UTF16BE) {
			unparse = unicode_to_utf16be;
			ts = 2;
		} else if (sc->flag & SCONV_FROM_UTF16LE) {
			unparse = unicode_to_utf16le;
			ts = 2;
		} else {
			unparse = unicode_to_utf8;
		}
	}

	if (sc->flag & SCONV_FROM_UTF16BE) {
		parse = utf16be_to_unicode;
		tm = 1;
		spair = 4;/* surrogate pair size in UTF-16. */
	} else if (sc->flag & SCONV_FROM_UTF16LE) {
		parse = utf16le_to_unicode;
		tm = 1;
		spair = 4;/* surrogate pair size in UTF-16. */
	} else {
		parse = cesu8_to_unicode;
		tm = ts;
		spair = 6;/* surrogate pair size in UTF-8. */
	}

	if (archive_string_ensure(as, as->length + len * tm + ts) == NULL)
		return (-1);

	p = as->s + as->length;
	endp = as->s + as->buffer_length - ts;
	while ((n = parse(&uc, s, len)) != 0) {
		/* ucptr/uc2ptr point at the input bytes of uc/uc2 while
		 * those bytes may still be copied verbatim, else NULL. */
		const char *ucptr, *uc2ptr;

		if (n < 0) {
			/* Use a replaced unicode character. */
			UNPARSE(p, endp, uc);
			s += n*-1;
			len -= n*-1;
			ret = -1;
			continue;
		} else if (n == spair || always_replace)
			/* uc is converted from a surrogate pair.
			 * this should be treated as a changed code. */
			ucptr = NULL;
		else
			ucptr = s;
		s += n;
		len -= n;

		/* Read second code point. */
		while ((n2 = parse(&uc2, s, len)) > 0) {
			uint32_t ucx[FDC_MAX];
			int ccx[FDC_MAX];
			int cl, cx, i, nx, ucx_size;
			int LIndex,SIndex;
			uint32_t nfc;

			if (n2 == spair || always_replace)
				/* uc2 is converted from a surrogate pair.
				 * this should be treated as a changed code. */
				uc2ptr = NULL;
			else
				uc2ptr = s;
			s += n2;
			len -= n2;

			/*
			 * If current second code point is out of decomposable
			 * code points, finding compositions is unneeded.
			 */
			if (!IS_DECOMPOSABLE_BLOCK(uc2)) {
				WRITE_UC();
				REPLACE_UC_WITH_UC2();
				continue;
			}

			/*
			 * Try to combine current code points.
			 */
			/*
			 * We have to combine Hangul characters according to
			 * http://unicode.org/reports/tr15/#Hangul
			 */
			if (0 <= (LIndex = uc - HC_LBASE) &&
			    LIndex < HC_LCOUNT) {
				/*
				 * Hangul Composition.
				 * 1. Two current code points are L and V.
				 */
				int VIndex = uc2 - HC_VBASE;
				if (0 <= VIndex && VIndex < HC_VCOUNT) {
					/* Make syllable of form LV. */
					UPDATE_UC(HC_SBASE +
					    (LIndex * HC_VCOUNT + VIndex) *
					     HC_TCOUNT);
				} else {
					WRITE_UC();
					REPLACE_UC_WITH_UC2();
				}
				continue;
			} else if (0 <= (SIndex = uc - HC_SBASE) &&
			    SIndex < HC_SCOUNT && (SIndex % HC_TCOUNT) == 0) {
				/*
				 * Hangul Composition.
				 * 2. Two current code points are LV and T.
				 */
				int TIndex = uc2 - HC_TBASE;
				if (0 < TIndex && TIndex < HC_TCOUNT) {
					/* Make syllable of form LVT. */
					UPDATE_UC(uc + TIndex);
				} else {
					WRITE_UC();
					REPLACE_UC_WITH_UC2();
				}
				continue;
			} else if ((nfc = get_nfc(uc, uc2)) != 0) {
				/* A composition to current code points
				 * is found. */
				UPDATE_UC(nfc);
				continue;
			} else if ((cl = CCC(uc2)) == 0) {
				/* Clearly 'uc2' the second code point is not
				 * a decomposable code. */
				WRITE_UC();
				REPLACE_UC_WITH_UC2();
				continue;
			}

			/*
			 * Collect following decomposable code points.
			 */
			cx = 0;
			ucx[0] = uc2;
			ccx[0] = cl;
			COLLECT_CPS(1);

			/*
			 * Find a composed code in the collected code points.
			 */
			i = 1;
			while (i < ucx_size) {
				int j;

				if ((nfc = get_nfc(uc, ucx[i])) == 0) {
					i++;
					continue;
				}

				/*
				 * nfc is composed of uc and ucx[i].
				 */
				UPDATE_UC(nfc);

				/*
				 * Remove ucx[i] by shifting
				 * following code points.
				 */
				for (j = i; j+1 < ucx_size; j++) {
					ucx[j] = ucx[j+1];
					ccx[j] = ccx[j+1];
				}
				ucx_size --;

				/*
				 * Collect following code points blocked
				 * by ucx[i] the removed code point.
				 */
				if (ucx_size > 0 && i == ucx_size &&
				    nx > 0 && cx == cl) {
					cl =  ccx[ucx_size-1];
					COLLECT_CPS(ucx_size);
				}
				/*
				 * Restart finding a composed code with
				 * the updated uc from the top of the
				 * collected code points.
				 */
				i = 0;
			}

			/*
			 * Apparently the current code points are not
			 * decomposed characters or already composed.
			 */
			WRITE_UC();
			for (i = 0; i < ucx_size; i++)
				UNPARSE(p, endp, ucx[i]);

			/*
			 * Flush out remaining canonical combining characters.
			 */
			if (nx > 0 && cx == cl && len > 0) {
				while ((nx = parse(&ucx[0], s, len))
				    > 0) {
					cx = CCC(ucx[0]);
					if (cl > cx)
						break;
					s += nx;
					len -= nx;
					cl = cx;
					UNPARSE(p, endp, ucx[0]);
				}
			}
			break;
		}
		if (n2 < 0) {
			WRITE_UC();
			/* Use a replaced unicode character. */
			UNPARSE(p, endp, uc2);
			s += n2*-1;
			len -= n2*-1;
			ret = -1;
			continue;
		} else if (n2 == 0) {
			/* End of input: flush the pending code point. */
			WRITE_UC();
			break;
		}
	}
	as->length = p - as->s;
	as->s[as->length] = '\0';
	if (ts == 2)
		as->s[as->length+1] = '\0';
	return (ret);
}

/*
 * Look up the decomposition of `uc' in u_decomposition_table with a
 * binary search keyed on the nfc field (the search only works if the
 * table is ordered by that field).
 *
 * On a hit, stores the two resulting code points in *cp1 and *cp2 and
 * returns 1.  Returns 0, leaving *cp1 and *cp2 untouched, when `uc' is
 * in one of the excluded ranges below or has no table entry.
 */
static int
get_nfd(uint32_t *cp1, uint32_t *cp2, uint32_t uc)
{
	int t, b;

	/*
	 * These are not converted to NFD on Mac OS.
	 */
	if ((uc >= 0x2000 && uc <= 0x2FFF) ||
	    (uc >= 0xF900 && uc <= 0xFAFF) ||
	    (uc >= 0x2F800 && uc <= 0x2FAFF))
		return (0);
	/*
	 * Those code points are not converted to NFD on Mac OS.
	 * I do not know the reason because it is undocumented.
	 *   NFC        NFD
	 *   1109A  ==> 11099 110BA
	 *   1109C  ==> 1109B 110BA
	 *   110AB  ==> 110A5 110BA
	 */
	if (uc == 0x1109A || uc == 0x1109C || uc == 0x110AB)
		return (0);

	t = 0;
	b = sizeof(u_decomposition_table)/sizeof(u_decomposition_table[0]) -1;
	while (b >= t) {
		int m = (t + b) / 2;
		if (u_decomposition_table[m].nfc < uc)
			t = m + 1;
		else if (u_decomposition_table[m].nfc > uc)
			b = m - 1;
		else {
			*cp1 = u_decomposition_table[m].cp1;
			*cp2 = u_decomposition_table[m].cp2;
			return (1);
		}
	}
	return (0);
}

/*
 * Replace the first code point with `cp'.  ucptr is cleared so that
 * WRITE_UC() re-encodes the value instead of copying input bytes.
 */
#define REPLACE_UC_WITH(cp) do {		\
	uc = cp;				\
	ucptr = NULL;				\
} while (0)

/*
 * Normalize UTF-8 characters to Form D and copy the result.
 *
 * The decoder and encoder are selected from sc->flag exactly as in
 * archive_string_normalize_C(), so UTF-16BE/LE input and output are
 * handled as well.
 *
 * Returns 0 on success, -1 on allocation failure or when malformed
 * input had to be replaced.
 */
static int
archive_string_normalize_D(struct archive_string *as, const void *_p,
    size_t len, struct archive_string_conv *sc)
{
	const char *s = (const char *)_p;
	char *p, *endp;
	uint32_t uc, uc2;
	size_t w;
	int always_replace, n, n2, ret = 0, spair, ts, tm;
	int (*parse)(uint32_t *, const char *, size_t);
	size_t (*unparse)(char *, size_t, uint32_t);

	always_replace = 1;
	ts = 1;/* text size. */
	if (sc->flag & SCONV_TO_UTF16BE) {
		unparse = unicode_to_utf16be;
		ts = 2;
		if (sc->flag & SCONV_FROM_UTF16BE)
			always_replace = 0;
	} else if (sc->flag & SCONV_TO_UTF16LE) {
		unparse = unicode_to_utf16le;
		ts = 2;
		if (sc->flag & SCONV_FROM_UTF16LE)
			always_replace = 0;
	} else if (sc->flag & SCONV_TO_UTF8) {
		unparse = unicode_to_utf8;
		if (sc->flag & SCONV_FROM_UTF8)
			always_replace = 0;
	} else {
		/*
		 * This case is going to be converted to another
		 * character-set through iconv.
		 */
		always_replace = 0;
		if (sc->flag & SCONV_FROM_UTF16BE) {
			unparse = unicode_to_utf16be;
			ts = 2;
		} else if (sc->flag & SCONV_FROM_UTF16LE) {
			unparse = unicode_to_utf16le;
			ts = 2;
		} else {
			unparse = unicode_to_utf8;
		}
	}

	if (sc->flag & SCONV_FROM_UTF16BE) {
		parse = utf16be_to_unicode;
		tm = 1;
		spair = 4;/* surrogate pair size in UTF-16. */
	} else if (sc->flag & SCONV_FROM_UTF16LE) {
		parse = utf16le_to_unicode;
		tm = 1;
		spair = 4;/* surrogate pair size in UTF-16. */
	} else {
		parse = cesu8_to_unicode;
		tm = ts;
		spair = 6;/* surrogate pair size in UTF-8. */
	}

	if (archive_string_ensure(as, as->length + len * tm + ts) == NULL)
		return (-1);

	p = as->s + as->length;
	endp = as->s + as->buffer_length - ts;
	while ((n = parse(&uc, s, len)) != 0) {
		const char *ucptr;
		uint32_t cp1, cp2;
		int SIndex;
		/* Pending code points that follow uc, kept ordered by
		 * their CCC() value. */
		struct {
			uint32_t uc;
			int ccc;
		} fdc[FDC_MAX];
		int fdi, fdj;
		int ccc;

		/*
		 * Re-entered by the goto at the bottom of the loop with
		 * uc/n describing a code point that has been parsed but
		 * not yet consumed from `s'.
		 */
check_first_code:
		if (n < 0) {
			/* Use a replaced unicode character. */
			UNPARSE(p, endp, uc);
			s += n*-1;
			len -= n*-1;
			ret = -1;
			continue;
		} else if (n == spair || always_replace)
			/* uc is converted from a surrogate pair.
			 * this should be treated as a changed code. */
			ucptr = NULL;
		else
			ucptr = s;
		s += n;
		len -= n;

		/* Hangul Decomposition. */
		if ((SIndex = uc - HC_SBASE) >= 0 && SIndex < HC_SCOUNT) {
			int L = HC_LBASE + SIndex / HC_NCOUNT;
			int V = HC_VBASE + (SIndex % HC_NCOUNT) / HC_TCOUNT;
			int T = HC_TBASE + SIndex % HC_TCOUNT;

			REPLACE_UC_WITH(L);
			WRITE_UC();
			REPLACE_UC_WITH(V);
			WRITE_UC();
			/* T == HC_TBASE means the syllable has no
			 * trailing consonant. */
			if (T != HC_TBASE) {
				REPLACE_UC_WITH(T);
				WRITE_UC();
			}
			continue;
		}
		if (IS_DECOMPOSABLE_BLOCK(uc) && CCC(uc) != 0) {
			WRITE_UC();
			continue;
		}

		/*
		 * Decompose uc repeatedly: each step keeps cp1 as the new
		 * uc and pushes cp2 onto the front of fdc[].
		 */
		fdi = 0;
		while (get_nfd(&cp1, &cp2, uc) && fdi < FDC_MAX) {
			int k;

			for (k = fdi; k > 0; k--)
				fdc[k] = fdc[k-1];
			fdc[0].ccc = CCC(cp2);
			fdc[0].uc = cp2;
			fdi++;
			REPLACE_UC_WITH(cp1);
		}

		/* Read following code points. */
		while ((n2 = parse(&uc2, s, len)) > 0 &&
		    (ccc = CCC(uc2)) != 0 && fdi < FDC_MAX) {
			int j, k;

			s += n2;
			len -= n2;
			/* Insert uc2 before the first entry whose
			 * combining class is greater than its own. */
			for (j = 0; j < fdi; j++) {
				if (fdc[j].ccc > ccc)
					break;
			}
			if (j < fdi) {
				for (k = fdi; k > j; k--)
					fdc[k] = fdc[k-1];
				fdc[j].ccc = ccc;
				fdc[j].uc = uc2;
			} else {
				fdc[fdi].ccc = ccc;
				fdc[fdi].uc = uc2;
			}
			fdi++;
		}

		WRITE_UC();
		for (fdj = 0; fdj < fdi; fdj++) {
			REPLACE_UC_WITH(fdc[fdj].uc);
			WRITE_UC();
		}

		if (n2 == 0)
			break;
		/* uc2 was parsed but not consumed; make it the next
		 * first code point. */
		REPLACE_UC_WITH(uc2);
		n = n2;
		goto check_first_code;
	}
	as->length = p - as->s;
	as->s[as->length] = '\0';
	if (ts == 2)
		as->s[as->length+1] = '\0';
	return (ret);
}

/*
 * libarchive 2.x made incorrect UTF-8 strings in the wrong assumption
 * that WCS is Unicode. It is true for several platforms but some are false.
 * And then people who did not use UTF-8 locale on the non Unicode WCS
 * platform and made a tar file with libarchive(mostly bsdtar) 2.x. Those
 * now cannot get right filename from libarchive 3.x and later since we
 * fixed the wrong assumption and it is incompatible to older its versions.
 * So we provide special option, "compat-2x.x", for resolving it.
 * That option enable the string conversion of libarchive 2.x.
3418228753Smm * 3419232153Smm * Translates the wrong UTF-8 string made by libarchive 2.x into current 3420232153Smm * locale character set and appends to the archive_string. 3421232153Smm * Note: returns -1 if conversion fails. 3422228753Smm */ 3423232153Smmstatic int 3424232153Smmstrncat_from_utf8_libarchive2(struct archive_string *as, 3425232153Smm const void *_p, size_t len, struct archive_string_conv *sc) 3426228753Smm{ 3427232153Smm const char *s; 3428228753Smm int n; 3429228753Smm char *p; 3430232153Smm char *end; 3431232153Smm uint32_t unicode; 3432228753Smm#if HAVE_WCRTOMB 3433228753Smm mbstate_t shift_state; 3434228753Smm 3435228753Smm memset(&shift_state, 0, sizeof(shift_state)); 3436228753Smm#else 3437228753Smm /* Clear the shift state before starting. */ 3438228753Smm wctomb(NULL, L'\0'); 3439228753Smm#endif 3440232153Smm (void)sc; /* UNUSED */ 3441228753Smm /* 3442232153Smm * Allocate buffer for MBS. 3443232153Smm * We need this allocation here since it is possible that 3444232153Smm * as->s is still NULL. 3445228753Smm */ 3446232153Smm if (archive_string_ensure(as, as->length + len + 1) == NULL) 3447232153Smm return (-1); 3448232153Smm 3449232153Smm s = (const char *)_p; 3450232153Smm p = as->s + as->length; 3451232153Smm end = as->s + as->buffer_length - MB_CUR_MAX -1; 3452232153Smm while ((n = _utf8_to_unicode(&unicode, s, len)) != 0) { 3453232153Smm wchar_t wc; 3454232153Smm 3455232153Smm if (p >= end) { 3456232153Smm as->length = p - as->s; 3457232153Smm /* Re-allocate buffer for MBS. */ 3458232153Smm if (archive_string_ensure(as, 3459232153Smm as->length + len * 2 + 1) == NULL) 3460232153Smm return (-1); 3461232153Smm p = as->s + as->length; 3462232153Smm end = as->s + as->buffer_length - MB_CUR_MAX -1; 3463228753Smm } 3464232153Smm 3465232153Smm /* 3466232153Smm * As libarchie 2.x, translates the UTF-8 characters into 3467232153Smm * wide-characters in the assumption that WCS is Unicode. 
3468232153Smm */ 3469232153Smm if (n < 0) { 3470232153Smm n *= -1; 3471232153Smm wc = L'?'; 3472232153Smm } else 3473232153Smm wc = (wchar_t)unicode; 3474232153Smm 3475232153Smm s += n; 3476232153Smm len -= n; 3477232153Smm /* 3478232153Smm * Translates the wide-character into the current locale MBS. 3479232153Smm */ 3480228753Smm#if HAVE_WCRTOMB 3481248616Smm n = (int)wcrtomb(p, wc, &shift_state); 3482228753Smm#else 3483248616Smm n = (int)wctomb(p, wc); 3484228753Smm#endif 3485228753Smm if (n == -1) 3486232153Smm return (-1); 3487228753Smm p += n; 3488228753Smm } 3489232153Smm as->length = p - as->s; 3490232153Smm as->s[as->length] = '\0'; 3491232153Smm return (0); 3492232153Smm} 3493232153Smm 3494232153Smm 3495232153Smm/* 3496232153Smm * Conversion functions between current locale dependent MBS and UTF-16BE. 3497232153Smm * strncat_from_utf16be() : UTF-16BE --> MBS 3498232153Smm * strncat_to_utf16be() : MBS --> UTF16BE 3499232153Smm */ 3500232153Smm 3501232153Smm#if defined(_WIN32) && !defined(__CYGWIN__) 3502232153Smm 3503232153Smm/* 3504232153Smm * Convert a UTF-16BE/LE string to current locale and copy the result. 3505232153Smm * Return -1 if conversion failes. 3506232153Smm */ 3507232153Smmstatic int 3508232153Smmwin_strncat_from_utf16(struct archive_string *as, const void *_p, size_t bytes, 3509232153Smm struct archive_string_conv *sc, int be) 3510232153Smm{ 3511232153Smm struct archive_string tmp; 3512232153Smm const char *u16; 3513232153Smm int ll; 3514232153Smm BOOL defchar; 3515232153Smm char *mbs; 3516232153Smm size_t mbs_size, b; 3517232153Smm int ret = 0; 3518232153Smm 3519232153Smm bytes &= ~1; 3520232153Smm if (archive_string_ensure(as, as->length + bytes +1) == NULL) 3521232153Smm return (-1); 3522232153Smm 3523232153Smm mbs = as->s + as->length; 3524232153Smm mbs_size = as->buffer_length - as->length -1; 3525232153Smm 3526232153Smm if (sc->to_cp == CP_C_LOCALE) { 3527232153Smm /* 3528232153Smm * "C" locale special process. 
3529232153Smm */ 3530232153Smm u16 = _p; 3531232153Smm ll = 0; 3532232153Smm for (b = 0; b < bytes; b += 2) { 3533232153Smm uint16_t val; 3534232153Smm if (be) 3535232153Smm val = archive_be16dec(u16+b); 3536232153Smm else 3537232153Smm val = archive_le16dec(u16+b); 3538232153Smm if (val > 255) { 3539232153Smm *mbs++ = '?'; 3540232153Smm ret = -1; 3541232153Smm } else 3542232153Smm *mbs++ = (char)(val&0xff); 3543232153Smm ll++; 3544232153Smm } 3545232153Smm as->length += ll; 3546232153Smm as->s[as->length] = '\0'; 3547232153Smm return (ret); 3548232153Smm } 3549232153Smm 3550232153Smm archive_string_init(&tmp); 3551232153Smm if (be) { 3552232153Smm if (is_big_endian()) { 3553232153Smm u16 = _p; 3554232153Smm } else { 3555232153Smm if (archive_string_ensure(&tmp, bytes+2) == NULL) 3556232153Smm return (-1); 3557232153Smm memcpy(tmp.s, _p, bytes); 3558232153Smm for (b = 0; b < bytes; b += 2) { 3559232153Smm uint16_t val = archive_be16dec(tmp.s+b); 3560232153Smm archive_le16enc(tmp.s+b, val); 3561232153Smm } 3562232153Smm u16 = tmp.s; 3563232153Smm } 3564232153Smm } else { 3565232153Smm if (!is_big_endian()) { 3566232153Smm u16 = _p; 3567232153Smm } else { 3568232153Smm if (archive_string_ensure(&tmp, bytes+2) == NULL) 3569232153Smm return (-1); 3570232153Smm memcpy(tmp.s, _p, bytes); 3571232153Smm for (b = 0; b < bytes; b += 2) { 3572232153Smm uint16_t val = archive_le16dec(tmp.s+b); 3573232153Smm archive_be16enc(tmp.s+b, val); 3574232153Smm } 3575232153Smm u16 = tmp.s; 3576232153Smm } 3577232153Smm } 3578232153Smm 3579232153Smm do { 3580232153Smm defchar = 0; 3581232153Smm ll = WideCharToMultiByte(sc->to_cp, 0, 3582248616Smm (LPCWSTR)u16, (int)bytes>>1, mbs, (int)mbs_size, 3583232153Smm NULL, &defchar); 3584232153Smm if (ll == 0 && 3585232153Smm GetLastError() == ERROR_INSUFFICIENT_BUFFER) { 3586232153Smm /* Need more buffer for MBS. 
*/ 3587232153Smm ll = WideCharToMultiByte(sc->to_cp, 0, 3588248616Smm (LPCWSTR)u16, (int)bytes, NULL, 0, NULL, NULL); 3589232153Smm if (archive_string_ensure(as, ll +1) == NULL) 3590232153Smm return (-1); 3591232153Smm mbs = as->s + as->length; 3592232153Smm mbs_size = as->buffer_length - as->length -1; 3593232153Smm continue; 3594232153Smm } 3595232153Smm } while (0); 3596232153Smm archive_string_free(&tmp); 3597232153Smm as->length += ll; 3598232153Smm as->s[as->length] = '\0'; 3599232153Smm if (ll == 0 || defchar) 3600232153Smm ret = -1; 3601232153Smm return (ret); 3602232153Smm} 3603232153Smm 3604232153Smmstatic int 3605238856Smmwin_strncat_from_utf16be(struct archive_string *as, const void *_p, 3606238856Smm size_t bytes, struct archive_string_conv *sc) 3607232153Smm{ 3608232153Smm return (win_strncat_from_utf16(as, _p, bytes, sc, 1)); 3609232153Smm} 3610232153Smm 3611232153Smmstatic int 3612238856Smmwin_strncat_from_utf16le(struct archive_string *as, const void *_p, 3613238856Smm size_t bytes, struct archive_string_conv *sc) 3614232153Smm{ 3615232153Smm return (win_strncat_from_utf16(as, _p, bytes, sc, 0)); 3616232153Smm} 3617232153Smm 3618232153Smmstatic int 3619232153Smmis_big_endian(void) 3620232153Smm{ 3621232153Smm uint16_t d = 1; 3622232153Smm 3623232153Smm return (archive_be16dec(&d) == 1); 3624232153Smm} 3625232153Smm 3626232153Smm/* 3627232153Smm * Convert a current locale string to UTF-16BE/LE and copy the result. 3628232153Smm * Return -1 if conversion failes. 
3629232153Smm */ 3630232153Smmstatic int 3631238856Smmwin_strncat_to_utf16(struct archive_string *as16, const void *_p, 3632238856Smm size_t length, struct archive_string_conv *sc, int bigendian) 3633232153Smm{ 3634232153Smm const char *s = (const char *)_p; 3635232153Smm char *u16; 3636232153Smm size_t count, avail; 3637232153Smm 3638232153Smm if (archive_string_ensure(as16, 3639232153Smm as16->length + (length + 1) * 2) == NULL) 3640232153Smm return (-1); 3641232153Smm 3642232153Smm u16 = as16->s + as16->length; 3643232153Smm avail = as16->buffer_length - 2; 3644232153Smm if (sc->from_cp == CP_C_LOCALE) { 3645232153Smm /* 3646232153Smm * "C" locale special process. 3647232153Smm */ 3648232153Smm count = 0; 3649232153Smm while (count < length && *s) { 3650232153Smm if (bigendian) 3651232153Smm archive_be16enc(u16, *s); 3652232153Smm else 3653232153Smm archive_le16enc(u16, *s); 3654232153Smm u16 += 2; 3655232153Smm s++; 3656232153Smm count++; 3657232153Smm } 3658232153Smm as16->length += count << 1; 3659232153Smm as16->s[as16->length] = 0; 3660232153Smm as16->s[as16->length+1] = 0; 3661232153Smm return (0); 3662232153Smm } 3663232153Smm do { 3664232153Smm count = MultiByteToWideChar(sc->from_cp, 3665248616Smm MB_PRECOMPOSED, s, (int)length, (LPWSTR)u16, (int)avail>>1); 3666232153Smm if (count == 0 && 3667232153Smm GetLastError() == ERROR_INSUFFICIENT_BUFFER) { 3668232153Smm /* Need more buffer for UTF-16 string */ 3669232153Smm count = MultiByteToWideChar(sc->from_cp, 3670248616Smm MB_PRECOMPOSED, s, (int)length, NULL, 0); 3671232153Smm if (archive_string_ensure(as16, (count +1) * 2) 3672232153Smm == NULL) 3673232153Smm return (-1); 3674232153Smm u16 = as16->s + as16->length; 3675232153Smm avail = as16->buffer_length - 2; 3676232153Smm continue; 3677232153Smm } 3678232153Smm } while (0); 3679232153Smm as16->length += count * 2; 3680232153Smm as16->s[as16->length] = 0; 3681232153Smm as16->s[as16->length+1] = 0; 3682232153Smm if (count == 0) 3683232153Smm return 
(-1); 3684232153Smm 3685232153Smm if (is_big_endian()) { 3686232153Smm if (!bigendian) { 3687232153Smm while (count > 0) { 3688232153Smm uint16_t v = archive_be16dec(u16); 3689232153Smm archive_le16enc(u16, v); 3690232153Smm u16 += 2; 3691232153Smm count--; 3692232153Smm } 3693232153Smm } 3694232153Smm } else { 3695232153Smm if (bigendian) { 3696232153Smm while (count > 0) { 3697232153Smm uint16_t v = archive_le16dec(u16); 3698232153Smm archive_be16enc(u16, v); 3699232153Smm u16 += 2; 3700232153Smm count--; 3701232153Smm } 3702232153Smm } 3703232153Smm } 3704232153Smm return (0); 3705232153Smm} 3706232153Smm 3707232153Smmstatic int 3708238856Smmwin_strncat_to_utf16be(struct archive_string *as16, const void *_p, 3709238856Smm size_t length, struct archive_string_conv *sc) 3710232153Smm{ 3711232153Smm return (win_strncat_to_utf16(as16, _p, length, sc, 1)); 3712232153Smm} 3713232153Smm 3714232153Smmstatic int 3715238856Smmwin_strncat_to_utf16le(struct archive_string *as16, const void *_p, 3716238856Smm size_t length, struct archive_string_conv *sc) 3717232153Smm{ 3718232153Smm return (win_strncat_to_utf16(as16, _p, length, sc, 0)); 3719232153Smm} 3720232153Smm 3721232153Smm#endif /* _WIN32 && !__CYGWIN__ */ 3722232153Smm 3723232153Smm/* 3724232153Smm * Do the best effort for conversions. 3725232153Smm * We cannot handle UTF-16BE character-set without such iconv, 3726232153Smm * but there is a chance if a string consists just ASCII code or 3727232153Smm * a current locale is UTF-8. 3728232153Smm */ 3729232153Smm 3730232153Smm/* 3731232153Smm * Convert a UTF-16BE string to current locale and copy the result. 3732232153Smm * Return -1 if conversion failes. 
3733232153Smm */ 3734232153Smmstatic int 3735232153Smmbest_effort_strncat_from_utf16(struct archive_string *as, const void *_p, 3736232153Smm size_t bytes, struct archive_string_conv *sc, int be) 3737232153Smm{ 3738232153Smm const char *utf16 = (const char *)_p; 3739232153Smm char *mbs; 3740232153Smm uint32_t uc; 3741232153Smm int n, ret; 3742232153Smm 3743232153Smm (void)sc; /* UNUSED */ 3744232153Smm /* 3745232153Smm * Other case, we should do the best effort. 3746232153Smm * If all character are ASCII(<0x7f), we can convert it. 3747232153Smm * if not , we set a alternative character and return -1. 3748232153Smm */ 3749232153Smm ret = 0; 3750232153Smm if (archive_string_ensure(as, as->length + bytes +1) == NULL) 3751232153Smm return (-1); 3752232153Smm mbs = as->s + as->length; 3753232153Smm 3754232153Smm while ((n = utf16_to_unicode(&uc, utf16, bytes, be)) != 0) { 3755232153Smm if (n < 0) { 3756232153Smm n *= -1; 3757232153Smm ret = -1; 3758232153Smm } 3759232153Smm bytes -= n; 3760232153Smm utf16 += n; 3761232153Smm 3762232153Smm if (uc > 127) { 3763232153Smm /* We cannot handle it. 
*/ 3764232153Smm *mbs++ = '?'; 3765232153Smm ret = -1; 3766232153Smm } else 3767232153Smm *mbs++ = (char)uc; 3768232153Smm } 3769232153Smm as->length = mbs - as->s; 3770232153Smm as->s[as->length] = '\0'; 3771232153Smm return (ret); 3772232153Smm} 3773232153Smm 3774232153Smmstatic int 3775232153Smmbest_effort_strncat_from_utf16be(struct archive_string *as, const void *_p, 3776232153Smm size_t bytes, struct archive_string_conv *sc) 3777232153Smm{ 3778232153Smm return (best_effort_strncat_from_utf16(as, _p, bytes, sc, 1)); 3779232153Smm} 3780232153Smm 3781232153Smmstatic int 3782232153Smmbest_effort_strncat_from_utf16le(struct archive_string *as, const void *_p, 3783232153Smm size_t bytes, struct archive_string_conv *sc) 3784232153Smm{ 3785232153Smm return (best_effort_strncat_from_utf16(as, _p, bytes, sc, 0)); 3786232153Smm} 3787232153Smm 3788232153Smm/* 3789232153Smm * Convert a current locale string to UTF-16BE/LE and copy the result. 3790232153Smm * Return -1 if conversion failes. 3791232153Smm */ 3792232153Smmstatic int 3793232153Smmbest_effort_strncat_to_utf16(struct archive_string *as16, const void *_p, 3794232153Smm size_t length, struct archive_string_conv *sc, int bigendian) 3795232153Smm{ 3796232153Smm const char *s = (const char *)_p; 3797232153Smm char *utf16; 3798232153Smm size_t remaining; 3799232153Smm int ret; 3800232153Smm 3801232153Smm (void)sc; /* UNUSED */ 3802232153Smm /* 3803232153Smm * Other case, we should do the best effort. 3804232153Smm * If all character are ASCII(<0x7f), we can convert it. 3805232153Smm * if not , we set a alternative character and return -1. 
3806232153Smm */ 3807232153Smm ret = 0; 3808232153Smm remaining = length; 3809232153Smm 3810232153Smm if (archive_string_ensure(as16, 3811232153Smm as16->length + (length + 1) * 2) == NULL) 3812232153Smm return (-1); 3813232153Smm 3814232153Smm utf16 = as16->s + as16->length; 3815232153Smm while (remaining--) { 3816232153Smm unsigned c = *s++; 3817232153Smm if (c > 127) { 3818232153Smm /* We cannot handle it. */ 3819232153Smm c = UNICODE_R_CHAR; 3820232153Smm ret = -1; 3821232153Smm } 3822232153Smm if (bigendian) 3823232153Smm archive_be16enc(utf16, c); 3824232153Smm else 3825232153Smm archive_le16enc(utf16, c); 3826232153Smm utf16 += 2; 3827232153Smm } 3828232153Smm as16->length = utf16 - as16->s; 3829232153Smm as16->s[as16->length] = 0; 3830232153Smm as16->s[as16->length+1] = 0; 3831232153Smm return (ret); 3832232153Smm} 3833232153Smm 3834232153Smmstatic int 3835232153Smmbest_effort_strncat_to_utf16be(struct archive_string *as16, const void *_p, 3836232153Smm size_t length, struct archive_string_conv *sc) 3837232153Smm{ 3838232153Smm return (best_effort_strncat_to_utf16(as16, _p, length, sc, 1)); 3839232153Smm} 3840232153Smm 3841232153Smmstatic int 3842232153Smmbest_effort_strncat_to_utf16le(struct archive_string *as16, const void *_p, 3843232153Smm size_t length, struct archive_string_conv *sc) 3844232153Smm{ 3845232153Smm return (best_effort_strncat_to_utf16(as16, _p, length, sc, 0)); 3846232153Smm} 3847232153Smm 3848232153Smm 3849232153Smm/* 3850232153Smm * Multistring operations. 
3851232153Smm */ 3852232153Smm 3853232153Smmvoid 3854232153Smmarchive_mstring_clean(struct archive_mstring *aes) 3855232153Smm{ 3856232153Smm archive_wstring_free(&(aes->aes_wcs)); 3857232153Smm archive_string_free(&(aes->aes_mbs)); 3858232153Smm archive_string_free(&(aes->aes_utf8)); 3859232153Smm archive_string_free(&(aes->aes_mbs_in_locale)); 3860232153Smm aes->aes_set = 0; 3861232153Smm} 3862232153Smm 3863232153Smmvoid 3864232153Smmarchive_mstring_copy(struct archive_mstring *dest, struct archive_mstring *src) 3865232153Smm{ 3866232153Smm dest->aes_set = src->aes_set; 3867232153Smm archive_string_copy(&(dest->aes_mbs), &(src->aes_mbs)); 3868232153Smm archive_string_copy(&(dest->aes_utf8), &(src->aes_utf8)); 3869232153Smm archive_wstring_copy(&(dest->aes_wcs), &(src->aes_wcs)); 3870232153Smm} 3871232153Smm 3872232153Smmint 3873232153Smmarchive_mstring_get_utf8(struct archive *a, struct archive_mstring *aes, 3874232153Smm const char **p) 3875232153Smm{ 3876232153Smm struct archive_string_conv *sc; 3877232153Smm int r; 3878232153Smm 3879232153Smm /* If we already have a UTF8 form, return that immediately. */ 3880232153Smm if (aes->aes_set & AES_SET_UTF8) { 3881232153Smm *p = aes->aes_utf8.s; 3882232153Smm return (0); 3883232153Smm } 3884232153Smm 3885232153Smm *p = NULL; 3886232153Smm if (aes->aes_set & AES_SET_MBS) { 3887232153Smm sc = archive_string_conversion_to_charset(a, "UTF-8", 1); 3888232153Smm if (sc == NULL) 3889232153Smm return (-1);/* Couldn't allocate memory for sc. */ 3890238856Smm r = archive_strncpy_l(&(aes->aes_mbs), aes->aes_mbs.s, 3891232153Smm aes->aes_mbs.length, sc); 3892232153Smm if (a == NULL) 3893232153Smm free_sconv_object(sc); 3894232153Smm if (r == 0) { 3895232153Smm aes->aes_set |= AES_SET_UTF8; 3896232153Smm *p = aes->aes_utf8.s; 3897232153Smm return (0);/* success. */ 3898232153Smm } else 3899232153Smm return (-1);/* failure. */ 3900232153Smm } 3901232153Smm return (0);/* success. 
*/ 3902232153Smm} 3903232153Smm 3904232153Smmint 3905232153Smmarchive_mstring_get_mbs(struct archive *a, struct archive_mstring *aes, 3906232153Smm const char **p) 3907232153Smm{ 3908232153Smm int r, ret = 0; 3909232153Smm 3910232153Smm (void)a; /* UNUSED */ 3911232153Smm /* If we already have an MBS form, return that immediately. */ 3912232153Smm if (aes->aes_set & AES_SET_MBS) { 3913232153Smm *p = aes->aes_mbs.s; 3914232153Smm return (ret); 3915232153Smm } 3916232153Smm 3917232153Smm *p = NULL; 3918232153Smm /* If there's a WCS form, try converting with the native locale. */ 3919232153Smm if (aes->aes_set & AES_SET_WCS) { 3920232153Smm archive_string_empty(&(aes->aes_mbs)); 3921232153Smm r = archive_string_append_from_wcs(&(aes->aes_mbs), 3922232153Smm aes->aes_wcs.s, aes->aes_wcs.length); 3923232153Smm *p = aes->aes_mbs.s; 3924232153Smm if (r == 0) { 3925232153Smm aes->aes_set |= AES_SET_MBS; 3926232153Smm return (ret); 3927232153Smm } else 3928232153Smm ret = -1; 3929232153Smm } 3930232153Smm 3931232153Smm /* 3932232153Smm * Only a UTF-8 form cannot avail because its conversion already 3933232153Smm * failed at archive_mstring_update_utf8(). 3934232153Smm */ 3935232153Smm return (ret); 3936232153Smm} 3937232153Smm 3938232153Smmint 3939232153Smmarchive_mstring_get_wcs(struct archive *a, struct archive_mstring *aes, 3940232153Smm const wchar_t **wp) 3941232153Smm{ 3942232153Smm int r, ret = 0; 3943232153Smm 3944232153Smm (void)a;/* UNUSED */ 3945232153Smm /* Return WCS form if we already have it. */ 3946232153Smm if (aes->aes_set & AES_SET_WCS) { 3947232153Smm *wp = aes->aes_wcs.s; 3948232153Smm return (ret); 3949232153Smm } 3950232153Smm 3951232153Smm *wp = NULL; 3952232153Smm /* Try converting MBS to WCS using native locale. 
*/ 3953232153Smm if (aes->aes_set & AES_SET_MBS) { 3954232153Smm archive_wstring_empty(&(aes->aes_wcs)); 3955232153Smm r = archive_wstring_append_from_mbs(&(aes->aes_wcs), 3956232153Smm aes->aes_mbs.s, aes->aes_mbs.length); 3957232153Smm if (r == 0) { 3958232153Smm aes->aes_set |= AES_SET_WCS; 3959232153Smm *wp = aes->aes_wcs.s; 3960232153Smm } else 3961232153Smm ret = -1;/* failure. */ 3962232153Smm } 3963232153Smm return (ret); 3964232153Smm} 3965232153Smm 3966232153Smmint 3967232153Smmarchive_mstring_get_mbs_l(struct archive_mstring *aes, 3968232153Smm const char **p, size_t *length, struct archive_string_conv *sc) 3969232153Smm{ 3970232153Smm int r, ret = 0; 3971232153Smm 3972232153Smm#if defined(_WIN32) && !defined(__CYGWIN__) 3973232153Smm /* 3974232153Smm * Internationalization programing on Windows must use Wide 3975232153Smm * characters because Windows platform cannot make locale UTF-8. 3976232153Smm */ 3977232153Smm if (sc != NULL && (aes->aes_set & AES_SET_WCS) != 0) { 3978232153Smm archive_string_empty(&(aes->aes_mbs_in_locale)); 3979232153Smm r = archive_string_append_from_wcs_in_codepage( 3980232153Smm &(aes->aes_mbs_in_locale), aes->aes_wcs.s, 3981232153Smm aes->aes_wcs.length, sc); 3982232153Smm if (r == 0) { 3983232153Smm *p = aes->aes_mbs_in_locale.s; 3984232153Smm if (length != NULL) 3985232153Smm *length = aes->aes_mbs_in_locale.length; 3986232153Smm return (0); 3987232153Smm } else if (errno == ENOMEM) 3988232153Smm return (-1); 3989232153Smm else 3990232153Smm ret = -1; 3991232153Smm } 3992228753Smm#endif 3993232153Smm 3994232153Smm /* If there is not an MBS form but is a WCS form, try converting 3995232153Smm * with the native locale to be used for translating it to specified 3996232153Smm * character-set. 
*/ 3997232153Smm if ((aes->aes_set & AES_SET_MBS) == 0 && 3998232153Smm (aes->aes_set & AES_SET_WCS) != 0) { 3999232153Smm archive_string_empty(&(aes->aes_mbs)); 4000232153Smm r = archive_string_append_from_wcs(&(aes->aes_mbs), 4001232153Smm aes->aes_wcs.s, aes->aes_wcs.length); 4002232153Smm if (r == 0) 4003232153Smm aes->aes_set |= AES_SET_MBS; 4004232153Smm else if (errno == ENOMEM) 4005232153Smm return (-1); 4006232153Smm else 4007232153Smm ret = -1; 4008232153Smm } 4009232153Smm /* If we already have an MBS form, use it to be translated to 4010232153Smm * specified character-set. */ 4011232153Smm if (aes->aes_set & AES_SET_MBS) { 4012232153Smm if (sc == NULL) { 4013232153Smm /* Conversion is unneeded. */ 4014232153Smm *p = aes->aes_mbs.s; 4015232153Smm if (length != NULL) 4016232153Smm *length = aes->aes_mbs.length; 4017232153Smm return (0); 4018232153Smm } 4019238856Smm ret = archive_strncpy_l(&(aes->aes_mbs_in_locale), 4020232153Smm aes->aes_mbs.s, aes->aes_mbs.length, sc); 4021232153Smm *p = aes->aes_mbs_in_locale.s; 4022232153Smm if (length != NULL) 4023232153Smm *length = aes->aes_mbs_in_locale.length; 4024232153Smm } else { 4025232153Smm *p = NULL; 4026232153Smm if (length != NULL) 4027232153Smm *length = 0; 4028232153Smm } 4029232153Smm return (ret); 4030228753Smm} 4031228753Smm 4032232153Smmint 4033232153Smmarchive_mstring_copy_mbs(struct archive_mstring *aes, const char *mbs) 4034232153Smm{ 4035232153Smm if (mbs == NULL) { 4036232153Smm aes->aes_set = 0; 4037232153Smm return (0); 4038232153Smm } 4039232153Smm return (archive_mstring_copy_mbs_len(aes, mbs, strlen(mbs))); 4040232153Smm} 4041232153Smm 4042232153Smmint 4043232153Smmarchive_mstring_copy_mbs_len(struct archive_mstring *aes, const char *mbs, 4044232153Smm size_t len) 4045232153Smm{ 4046232153Smm if (mbs == NULL) { 4047232153Smm aes->aes_set = 0; 4048232153Smm return (0); 4049232153Smm } 4050232153Smm aes->aes_set = AES_SET_MBS; /* Only MBS form is set now. 
*/ 4051232153Smm archive_strncpy(&(aes->aes_mbs), mbs, len); 4052232153Smm archive_string_empty(&(aes->aes_utf8)); 4053232153Smm archive_wstring_empty(&(aes->aes_wcs)); 4054232153Smm return (0); 4055232153Smm} 4056232153Smm 4057232153Smmint 4058232153Smmarchive_mstring_copy_wcs(struct archive_mstring *aes, const wchar_t *wcs) 4059232153Smm{ 4060238856Smm return archive_mstring_copy_wcs_len(aes, wcs, 4061238856Smm wcs == NULL ? 0 : wcslen(wcs)); 4062232153Smm} 4063232153Smm 4064232153Smmint 4065232153Smmarchive_mstring_copy_wcs_len(struct archive_mstring *aes, const wchar_t *wcs, 4066232153Smm size_t len) 4067232153Smm{ 4068232153Smm if (wcs == NULL) { 4069232153Smm aes->aes_set = 0; 4070232153Smm } 4071232153Smm aes->aes_set = AES_SET_WCS; /* Only WCS form set. */ 4072232153Smm archive_string_empty(&(aes->aes_mbs)); 4073232153Smm archive_string_empty(&(aes->aes_utf8)); 4074232153Smm archive_wstrncpy(&(aes->aes_wcs), wcs, len); 4075232153Smm return (0); 4076232153Smm} 4077232153Smm 4078232153Smmint 4079232153Smmarchive_mstring_copy_mbs_len_l(struct archive_mstring *aes, 4080232153Smm const char *mbs, size_t len, struct archive_string_conv *sc) 4081232153Smm{ 4082232153Smm int r; 4083232153Smm 4084232153Smm if (mbs == NULL) { 4085232153Smm aes->aes_set = 0; 4086232153Smm return (0); 4087232153Smm } 4088232153Smm archive_string_empty(&(aes->aes_mbs)); 4089232153Smm archive_wstring_empty(&(aes->aes_wcs)); 4090232153Smm archive_string_empty(&(aes->aes_utf8)); 4091232153Smm#if defined(_WIN32) && !defined(__CYGWIN__) 4092232153Smm /* 4093232153Smm * Internationalization programing on Windows must use Wide 4094232153Smm * characters because Windows platform cannot make locale UTF-8. 
4095232153Smm */ 4096232153Smm if (sc == NULL) { 4097232153Smm if (archive_string_append(&(aes->aes_mbs), 4098232153Smm mbs, mbsnbytes(mbs, len)) == NULL) { 4099232153Smm aes->aes_set = 0; 4100232153Smm r = -1; 4101232153Smm } else { 4102232153Smm aes->aes_set = AES_SET_MBS; 4103232153Smm r = 0; 4104232153Smm } 4105232153Smm#if defined(HAVE_ICONV) 4106232153Smm } else if (sc != NULL && sc->cd_w != (iconv_t)-1) { 4107232153Smm /* 4108232153Smm * This case happens only when MultiByteToWideChar() cannot 4109232153Smm * handle sc->from_cp, and we have to iconv in order to 4110232153Smm * translate character-set to wchar_t,UTF-16. 4111232153Smm */ 4112232153Smm iconv_t cd = sc->cd; 4113232153Smm unsigned from_cp; 4114232153Smm int flag; 4115232153Smm 4116232153Smm /* 4117232153Smm * Translate multi-bytes from some character-set to UTF-8. 4118232153Smm */ 4119232153Smm sc->cd = sc->cd_w; 4120238856Smm r = archive_strncpy_l(&(aes->aes_utf8), mbs, len, sc); 4121232153Smm sc->cd = cd; 4122232153Smm if (r != 0) { 4123232153Smm aes->aes_set = 0; 4124232153Smm return (r); 4125232153Smm } 4126232153Smm aes->aes_set = AES_SET_UTF8; 4127232153Smm 4128232153Smm /* 4129232153Smm * Append the UTF-8 string into wstring. 
4130232153Smm */ 4131232153Smm flag = sc->flag; 4132232153Smm sc->flag &= ~(SCONV_NORMALIZATION_C 4133232153Smm | SCONV_TO_UTF16| SCONV_FROM_UTF16); 4134232153Smm from_cp = sc->from_cp; 4135232153Smm sc->from_cp = CP_UTF8; 4136232153Smm r = archive_wstring_append_from_mbs_in_codepage(&(aes->aes_wcs), 4137232153Smm aes->aes_utf8.s, aes->aes_utf8.length, sc); 4138232153Smm sc->flag = flag; 4139232153Smm sc->from_cp = from_cp; 4140232153Smm if (r == 0) 4141232153Smm aes->aes_set |= AES_SET_WCS; 4142232153Smm#endif 4143232153Smm } else { 4144232153Smm r = archive_wstring_append_from_mbs_in_codepage( 4145232153Smm &(aes->aes_wcs), mbs, len, sc); 4146232153Smm if (r == 0) 4147232153Smm aes->aes_set = AES_SET_WCS; 4148232153Smm else 4149232153Smm aes->aes_set = 0; 4150232153Smm } 4151232153Smm#else 4152238856Smm r = archive_strncpy_l(&(aes->aes_mbs), mbs, len, sc); 4153232153Smm if (r == 0) 4154232153Smm aes->aes_set = AES_SET_MBS; /* Only MBS form is set now. */ 4155232153Smm else 4156232153Smm aes->aes_set = 0; 4157232153Smm#endif 4158232153Smm return (r); 4159232153Smm} 4160232153Smm 4161232153Smm/* 4162232153Smm * The 'update' form tries to proactively update all forms of 4163232153Smm * this string (WCS and MBS) and returns an error if any of 4164232153Smm * them fail. This is used by the 'pax' handler, for instance, 4165232153Smm * to detect and report character-conversion failures early while 4166232153Smm * still allowing clients to get potentially useful values from 4167232153Smm * the more tolerant lazy conversions. (get_mbs and get_wcs will 4168232153Smm * strive to give the user something useful, so you can get hopefully 4169232153Smm * usable values even if some of the character conversions are failing.) 
4170232153Smm */ 4171232153Smmint 4172232153Smmarchive_mstring_update_utf8(struct archive *a, struct archive_mstring *aes, 4173232153Smm const char *utf8) 4174232153Smm{ 4175232153Smm struct archive_string_conv *sc; 4176232153Smm int r; 4177232153Smm 4178232153Smm if (utf8 == NULL) { 4179232153Smm aes->aes_set = 0; 4180232153Smm return (0); /* Succeeded in clearing everything. */ 4181232153Smm } 4182232153Smm 4183232153Smm /* Save the UTF8 string. */ 4184232153Smm archive_strcpy(&(aes->aes_utf8), utf8); 4185232153Smm 4186232153Smm /* Empty the mbs and wcs strings. */ 4187232153Smm archive_string_empty(&(aes->aes_mbs)); 4188232153Smm archive_wstring_empty(&(aes->aes_wcs)); 4189232153Smm 4190232153Smm aes->aes_set = AES_SET_UTF8; /* Only UTF8 is set now. */ 4191232153Smm 4192232153Smm /* Try converting UTF-8 to MBS, return false on failure. */ 4193232153Smm sc = archive_string_conversion_from_charset(a, "UTF-8", 1); 4194232153Smm if (sc == NULL) 4195232153Smm return (-1);/* Couldn't allocate memory for sc. */ 4196238856Smm r = archive_strcpy_l(&(aes->aes_mbs), utf8, sc); 4197232153Smm if (a == NULL) 4198232153Smm free_sconv_object(sc); 4199232153Smm if (r != 0) 4200232153Smm return (-1); 4201232153Smm aes->aes_set = AES_SET_UTF8 | AES_SET_MBS; /* Both UTF8 and MBS set. */ 4202232153Smm 4203232153Smm /* Try converting MBS to WCS, return false on failure. */ 4204232153Smm if (archive_wstring_append_from_mbs(&(aes->aes_wcs), aes->aes_mbs.s, 4205232153Smm aes->aes_mbs.length)) 4206232153Smm return (-1); 4207232153Smm aes->aes_set = AES_SET_UTF8 | AES_SET_WCS | AES_SET_MBS; 4208232153Smm 4209232153Smm /* All conversions succeeded. */ 4210232153Smm return (0); 4211232153Smm} 4212