/* archive_string.c revision 313570 */
1228753Smm/*- 2232153Smm * Copyright (c) 2003-2011 Tim Kientzle 3232153Smm * Copyright (c) 2011-2012 Michihiro NAKAJIMA 4228753Smm * All rights reserved. 5228753Smm * 6228753Smm * Redistribution and use in source and binary forms, with or without 7228753Smm * modification, are permitted provided that the following conditions 8228753Smm * are met: 9228753Smm * 1. Redistributions of source code must retain the above copyright 10228753Smm * notice, this list of conditions and the following disclaimer. 11228753Smm * 2. Redistributions in binary form must reproduce the above copyright 12228753Smm * notice, this list of conditions and the following disclaimer in the 13228753Smm * documentation and/or other materials provided with the distribution. 14228753Smm * 15228753Smm * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR 16228753Smm * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 17228753Smm * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 18228753Smm * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT, 19228753Smm * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 20228753Smm * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 21228753Smm * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 22228753Smm * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23228753Smm * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 24228753Smm * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25228753Smm */ 26228753Smm 27228753Smm#include "archive_platform.h" 28228763Smm__FBSDID("$FreeBSD: stable/11/contrib/libarchive/libarchive/archive_string.c 313570 2017-02-11 00:54:16Z mm $"); 29228753Smm 30228753Smm/* 31228753Smm * Basic resizable string support, to simplify manipulating arbitrary-sized 32228753Smm * strings while minimizing heap activity. 
 *
 * In particular, the buffer used by a string object is only grown, it
 * never shrinks, so you can clear and reuse the same string object
 * without incurring additional memory allocations.
 */

#ifdef HAVE_ERRNO_H
#include <errno.h>
#endif
#ifdef HAVE_ICONV_H
#include <iconv.h>
#endif
#ifdef HAVE_LANGINFO_H
#include <langinfo.h>
#endif
#ifdef HAVE_LOCALCHARSET_H
#include <localcharset.h>
#endif
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#ifdef HAVE_WCHAR_H
#include <wchar.h>
#endif
#if defined(_WIN32) && !defined(__CYGWIN__)
#include <windows.h>
#include <locale.h>
#endif

#include "archive_endian.h"
#include "archive_private.h"
#include "archive_string.h"
#include "archive_string_composition.h"

/* Fallbacks for platforms without the wide-character memory functions. */
#if !defined(HAVE_WMEMCPY) && !defined(wmemcpy)
#define wmemcpy(a,b,i)  (wchar_t *)memcpy((a), (b), (i) * sizeof(wchar_t))
#endif

#if !defined(HAVE_WMEMMOVE) && !defined(wmemmove)
#define wmemmove(a,b,i)  (wchar_t *)memmove((a), (b), (i) * sizeof(wchar_t))
#endif

/*
 * A string conversion object: describes one "from charset" -> "to charset"
 * conversion and the chain of converter functions that implements it.
 * Objects are kept in a singly linked list hanging off an archive object
 * and are looked up by their pair of charset names.
 */
struct archive_string_conv {
	/* Next conversion object in the owning archive's list. */
	struct archive_string_conv	*next;
	/* Charset names; the pair is the lookup key for this object. */
	char				*from_charset;
	char				*to_charset;
	/* Code pages matching the two charsets; consumed by the
	 * Windows API based converters. */
	unsigned			 from_cp;
	unsigned			 to_cp;
	/* Set 1 if from_charset and to_charset are the same. */
	int				 same;
	/* Bitwise OR of the SCONV_* values below.  Bit 5 is not assigned. */
	int				 flag;
#define SCONV_TO_CHARSET	1	/* MBS is being converted to specified
					 * charset. */
#define SCONV_FROM_CHARSET	(1<<1)	/* MBS is being converted from
					 * specified charset. */
#define SCONV_BEST_EFFORT 	(1<<2)	/* Copy at least ASCII code. */
#define SCONV_WIN_CP	 	(1<<3)	/* Use Windows API for converting
					 * MBS. */
#define SCONV_UTF8_LIBARCHIVE_2 (1<<4)	/* Incorrect UTF-8 made by libarchive
					 * 2.x in the wrong assumption. */
#define SCONV_NORMALIZATION_C	(1<<6)	/* Need normalization to be Form C.
					 * Before UTF-8 characters are actually
					 * processed. */
#define SCONV_NORMALIZATION_D	(1<<7)	/* Need normalization to be Form D.
					 * Before UTF-8 characters are actually
					 * processed.
					 * Currently this is only for Mac OS X. */
#define SCONV_TO_UTF8		(1<<8)	/* "to charset" side is UTF-8. */
#define SCONV_FROM_UTF8		(1<<9)	/* "from charset" side is UTF-8. */
#define SCONV_TO_UTF16BE 	(1<<10)	/* "to charset" side is UTF-16BE. */
#define SCONV_FROM_UTF16BE 	(1<<11)	/* "from charset" side is UTF-16BE. */
#define SCONV_TO_UTF16LE 	(1<<12)	/* "to charset" side is UTF-16LE. */
#define SCONV_FROM_UTF16LE 	(1<<13)	/* "from charset" side is UTF-16LE. */
/* Masks matching either byte order of UTF-16. */
#define SCONV_TO_UTF16		(SCONV_TO_UTF16BE | SCONV_TO_UTF16LE)
#define SCONV_FROM_UTF16	(SCONV_FROM_UTF16BE | SCONV_FROM_UTF16LE)

#if HAVE_ICONV
	/* iconv conversion descriptors. */
	iconv_t				 cd;
	iconv_t				 cd_w;/* Use at archive_mstring on
				 	       * Windows. */
#endif
	/* A temporary buffer for normalization. */
	struct archive_string		 utftmp;
	/* Converter functions registered for this object (at most two)
	 * and how many of the slots are in use. */
	int (*converter[2])(struct archive_string *, const void *, size_t,
	    struct archive_string_conv *);
	int				 nconverter;
};

#define CP_C_LOCALE	0	/* "C" locale only for this file.
*/ 125232153Smm#define CP_UTF16LE 1200 126232153Smm#define CP_UTF16BE 1201 127232153Smm 128232153Smm#define IS_HIGH_SURROGATE_LA(uc) ((uc) >= 0xD800 && (uc) <= 0xDBFF) 129232153Smm#define IS_LOW_SURROGATE_LA(uc) ((uc) >= 0xDC00 && (uc) <= 0xDFFF) 130232153Smm#define IS_SURROGATE_PAIR_LA(uc) ((uc) >= 0xD800 && (uc) <= 0xDFFF) 131232153Smm#define UNICODE_MAX 0x10FFFF 132232153Smm#define UNICODE_R_CHAR 0xFFFD /* Replacement character. */ 133232153Smm/* Set U+FFFD(Replacement character) in UTF-8. */ 134299529Smmstatic const char utf8_replacement_char[] = {0xef, 0xbf, 0xbd}; 135232153Smm 136232153Smmstatic struct archive_string_conv *find_sconv_object(struct archive *, 137232153Smm const char *, const char *); 138232153Smmstatic void add_sconv_object(struct archive *, struct archive_string_conv *); 139232153Smmstatic struct archive_string_conv *create_sconv_object(const char *, 140232153Smm const char *, unsigned, int); 141232153Smmstatic void free_sconv_object(struct archive_string_conv *); 142232153Smmstatic struct archive_string_conv *get_sconv_object(struct archive *, 143232153Smm const char *, const char *, int); 144232153Smmstatic unsigned make_codepage_from_charset(const char *); 145232153Smmstatic unsigned get_current_codepage(void); 146232153Smmstatic unsigned get_current_oemcp(void); 147232153Smmstatic size_t mbsnbytes(const void *, size_t); 148232153Smmstatic size_t utf16nbytes(const void *, size_t); 149232153Smm#if defined(_WIN32) && !defined(__CYGWIN__) 150232153Smmstatic int archive_wstring_append_from_mbs_in_codepage( 151232153Smm struct archive_wstring *, const char *, size_t, 152232153Smm struct archive_string_conv *); 153232153Smmstatic int archive_string_append_from_wcs_in_codepage(struct archive_string *, 154232153Smm const wchar_t *, size_t, struct archive_string_conv *); 155232153Smmstatic int is_big_endian(void); 156232153Smmstatic int strncat_in_codepage(struct archive_string *, const void *, 157232153Smm size_t, struct archive_string_conv *); 
158238856Smmstatic int win_strncat_from_utf16be(struct archive_string *, const void *, 159232153Smm size_t, struct archive_string_conv *); 160238856Smmstatic int win_strncat_from_utf16le(struct archive_string *, const void *, 161232153Smm size_t, struct archive_string_conv *); 162238856Smmstatic int win_strncat_to_utf16be(struct archive_string *, const void *, 163232153Smm size_t, struct archive_string_conv *); 164238856Smmstatic int win_strncat_to_utf16le(struct archive_string *, const void *, 165232153Smm size_t, struct archive_string_conv *); 166238856Smm#endif 167238856Smmstatic int best_effort_strncat_from_utf16be(struct archive_string *, 168238856Smm const void *, size_t, struct archive_string_conv *); 169238856Smmstatic int best_effort_strncat_from_utf16le(struct archive_string *, 170238856Smm const void *, size_t, struct archive_string_conv *); 171238856Smmstatic int best_effort_strncat_to_utf16be(struct archive_string *, 172238856Smm const void *, size_t, struct archive_string_conv *); 173238856Smmstatic int best_effort_strncat_to_utf16le(struct archive_string *, 174238856Smm const void *, size_t, struct archive_string_conv *); 175232153Smm#if defined(HAVE_ICONV) 176232153Smmstatic int iconv_strncat_in_locale(struct archive_string *, const void *, 177232153Smm size_t, struct archive_string_conv *); 178232153Smm#endif 179238856Smmstatic int best_effort_strncat_in_locale(struct archive_string *, 180238856Smm const void *, size_t, struct archive_string_conv *); 181232153Smmstatic int _utf8_to_unicode(uint32_t *, const char *, size_t); 182232153Smmstatic int utf8_to_unicode(uint32_t *, const char *, size_t); 183232153Smmstatic inline uint32_t combine_surrogate_pair(uint32_t, uint32_t); 184232153Smmstatic int cesu8_to_unicode(uint32_t *, const char *, size_t); 185232153Smmstatic size_t unicode_to_utf8(char *, size_t, uint32_t); 186232153Smmstatic int utf16_to_unicode(uint32_t *, const char *, size_t, int); 187232153Smmstatic size_t unicode_to_utf16be(char *, 
size_t, uint32_t); 188232153Smmstatic size_t unicode_to_utf16le(char *, size_t, uint32_t); 189232153Smmstatic int strncat_from_utf8_libarchive2(struct archive_string *, 190232153Smm const void *, size_t, struct archive_string_conv *); 191232153Smmstatic int strncat_from_utf8_to_utf8(struct archive_string *, const void *, 192232153Smm size_t, struct archive_string_conv *); 193232153Smmstatic int archive_string_normalize_C(struct archive_string *, const void *, 194232153Smm size_t, struct archive_string_conv *); 195232153Smmstatic int archive_string_normalize_D(struct archive_string *, const void *, 196232153Smm size_t, struct archive_string_conv *); 197232153Smmstatic int archive_string_append_unicode(struct archive_string *, 198232153Smm const void *, size_t, struct archive_string_conv *); 199232153Smm 200232153Smmstatic struct archive_string * 201232153Smmarchive_string_append(struct archive_string *as, const char *p, size_t s) 202228753Smm{ 203232153Smm if (archive_string_ensure(as, as->length + s + 1) == NULL) 204232153Smm return (NULL); 205299529Smm memmove(as->s + as->length, p, s); 206228753Smm as->length += s; 207232153Smm as->s[as->length] = 0; 208228753Smm return (as); 209228753Smm} 210228753Smm 211232153Smmstatic struct archive_wstring * 212232153Smmarchive_wstring_append(struct archive_wstring *as, const wchar_t *p, size_t s) 213232153Smm{ 214232153Smm if (archive_wstring_ensure(as, as->length + s + 1) == NULL) 215232153Smm return (NULL); 216299529Smm wmemmove(as->s + as->length, p, s); 217232153Smm as->length += s; 218232153Smm as->s[as->length] = 0; 219232153Smm return (as); 220232153Smm} 221232153Smm 222313570Smmstruct archive_string * 223313570Smmarchive_array_append(struct archive_string *as, const char *p, size_t s) 224313570Smm{ 225313570Smm return archive_string_append(as, p, s); 226313570Smm} 227313570Smm 228228753Smmvoid 229232153Smmarchive_string_concat(struct archive_string *dest, struct archive_string *src) 230228753Smm{ 231232153Smm if 
(archive_string_append(dest, src->s, src->length) == NULL) 232232153Smm __archive_errx(1, "Out of memory"); 233228753Smm} 234228753Smm 235228753Smmvoid 236238856Smmarchive_wstring_concat(struct archive_wstring *dest, 237238856Smm struct archive_wstring *src) 238228753Smm{ 239232153Smm if (archive_wstring_append(dest, src->s, src->length) == NULL) 240232153Smm __archive_errx(1, "Out of memory"); 241228753Smm} 242228753Smm 243228753Smmvoid 244232153Smmarchive_string_free(struct archive_string *as) 245228753Smm{ 246228753Smm as->length = 0; 247228753Smm as->buffer_length = 0; 248232153Smm free(as->s); 249232153Smm as->s = NULL; 250228753Smm} 251228753Smm 252232153Smmvoid 253232153Smmarchive_wstring_free(struct archive_wstring *as) 254232153Smm{ 255232153Smm as->length = 0; 256232153Smm as->buffer_length = 0; 257232153Smm free(as->s); 258232153Smm as->s = NULL; 259232153Smm} 260232153Smm 261232153Smmstruct archive_wstring * 262232153Smmarchive_wstring_ensure(struct archive_wstring *as, size_t s) 263232153Smm{ 264232153Smm return (struct archive_wstring *) 265232153Smm archive_string_ensure((struct archive_string *)as, 266232153Smm s * sizeof(wchar_t)); 267232153Smm} 268232153Smm 269228753Smm/* Returns NULL on any allocation failure. */ 270228753Smmstruct archive_string * 271232153Smmarchive_string_ensure(struct archive_string *as, size_t s) 272228753Smm{ 273232153Smm char *p; 274232153Smm size_t new_length; 275232153Smm 276228753Smm /* If buffer is already big enough, don't reallocate. */ 277228753Smm if (as->s && (s <= as->buffer_length)) 278228753Smm return (as); 279228753Smm 280228753Smm /* 281228753Smm * Growing the buffer at least exponentially ensures that 282228753Smm * append operations are always linear in the number of 283228753Smm * characters appended. Using a smaller growth rate for 284228753Smm * larger buffers reduces memory waste somewhat at the cost of 285228753Smm * a larger constant factor. 
286228753Smm */ 287228753Smm if (as->buffer_length < 32) 288228753Smm /* Start with a minimum 32-character buffer. */ 289232153Smm new_length = 32; 290228753Smm else if (as->buffer_length < 8192) 291228753Smm /* Buffers under 8k are doubled for speed. */ 292232153Smm new_length = as->buffer_length + as->buffer_length; 293228753Smm else { 294228753Smm /* Buffers 8k and over grow by at least 25% each time. */ 295232153Smm new_length = as->buffer_length + as->buffer_length / 4; 296232153Smm /* Be safe: If size wraps, fail. */ 297232153Smm if (new_length < as->buffer_length) { 298232153Smm /* On failure, wipe the string and return NULL. */ 299232153Smm archive_string_free(as); 300232153Smm errno = ENOMEM;/* Make sure errno has ENOMEM. */ 301228753Smm return (NULL); 302228753Smm } 303228753Smm } 304228753Smm /* 305228753Smm * The computation above is a lower limit to how much we'll 306228753Smm * grow the buffer. In any case, we have to grow it enough to 307228753Smm * hold the request. 308228753Smm */ 309232153Smm if (new_length < s) 310232153Smm new_length = s; 311228753Smm /* Now we can reallocate the buffer. */ 312232153Smm p = (char *)realloc(as->s, new_length); 313232153Smm if (p == NULL) { 314232153Smm /* On failure, wipe the string and return NULL. */ 315232153Smm archive_string_free(as); 316232153Smm errno = ENOMEM;/* Make sure errno has ENOMEM. */ 317228753Smm return (NULL); 318232153Smm } 319232153Smm 320232153Smm as->s = p; 321232153Smm as->buffer_length = new_length; 322228753Smm return (as); 323228753Smm} 324228753Smm 325232153Smm/* 326232153Smm * TODO: See if there's a way to avoid scanning 327232153Smm * the source string twice. Then test to see 328232153Smm * if it actually helps (remember that we're almost 329232153Smm * always called with pretty short arguments, so 330232153Smm * such an optimization might not help). 
331232153Smm */ 332228753Smmstruct archive_string * 333232153Smmarchive_strncat(struct archive_string *as, const void *_p, size_t n) 334228753Smm{ 335228753Smm size_t s; 336228753Smm const char *p, *pp; 337228753Smm 338228753Smm p = (const char *)_p; 339228753Smm 340228753Smm /* Like strlen(p), except won't examine positions beyond p[n]. */ 341228753Smm s = 0; 342228753Smm pp = p; 343228753Smm while (s < n && *pp) { 344228753Smm pp++; 345228753Smm s++; 346228753Smm } 347232153Smm if ((as = archive_string_append(as, p, s)) == NULL) 348232153Smm __archive_errx(1, "Out of memory"); 349232153Smm return (as); 350228753Smm} 351228753Smm 352232153Smmstruct archive_wstring * 353232153Smmarchive_wstrncat(struct archive_wstring *as, const wchar_t *p, size_t n) 354232153Smm{ 355232153Smm size_t s; 356232153Smm const wchar_t *pp; 357232153Smm 358232153Smm /* Like strlen(p), except won't examine positions beyond p[n]. */ 359232153Smm s = 0; 360232153Smm pp = p; 361232153Smm while (s < n && *pp) { 362232153Smm pp++; 363232153Smm s++; 364232153Smm } 365232153Smm if ((as = archive_wstring_append(as, p, s)) == NULL) 366232153Smm __archive_errx(1, "Out of memory"); 367232153Smm return (as); 368232153Smm} 369232153Smm 370228753Smmstruct archive_string * 371232153Smmarchive_strcat(struct archive_string *as, const void *p) 372228753Smm{ 373232153Smm /* strcat is just strncat without an effective limit. 374232153Smm * Assert that we'll never get called with a source 375232153Smm * string over 16MB. 376232153Smm * TODO: Review all uses of strcat in the source 377232153Smm * and try to replace them with strncat(). 378232153Smm */ 379232153Smm return archive_strncat(as, p, 0x1000000); 380228753Smm} 381228753Smm 382232153Smmstruct archive_wstring * 383232153Smmarchive_wstrcat(struct archive_wstring *as, const wchar_t *p) 384232153Smm{ 385232153Smm /* Ditto. 
*/ 386232153Smm return archive_wstrncat(as, p, 0x1000000); 387232153Smm} 388232153Smm 389232153Smmstruct archive_string * 390232153Smmarchive_strappend_char(struct archive_string *as, char c) 391232153Smm{ 392232153Smm if ((as = archive_string_append(as, &c, 1)) == NULL) 393232153Smm __archive_errx(1, "Out of memory"); 394232153Smm return (as); 395232153Smm} 396232153Smm 397232153Smmstruct archive_wstring * 398232153Smmarchive_wstrappend_wchar(struct archive_wstring *as, wchar_t c) 399232153Smm{ 400232153Smm if ((as = archive_wstring_append(as, &c, 1)) == NULL) 401232153Smm __archive_errx(1, "Out of memory"); 402232153Smm return (as); 403232153Smm} 404232153Smm 405228753Smm/* 406232153Smm * Get the "current character set" name to use with iconv. 407232153Smm * On FreeBSD, the empty character set name "" chooses 408232153Smm * the correct character encoding for the current locale, 409232153Smm * so this isn't necessary. 410232153Smm * But iconv on Mac OS 10.6 doesn't seem to handle this correctly; 411232153Smm * on that system, we have to explicitly call nl_langinfo() 412232153Smm * to get the right name. Not sure about other platforms. 413232153Smm * 414232153Smm * NOTE: GNU libiconv does not recognize the character-set name 415232153Smm * which some platform nl_langinfo(CODESET) returns, so we should 416232153Smm * use locale_charset() instead of nl_langinfo(CODESET) for GNU libiconv. 
417228753Smm */ 418232153Smmstatic const char * 419232153Smmdefault_iconv_charset(const char *charset) { 420232153Smm if (charset != NULL && charset[0] != '\0') 421232153Smm return charset; 422232153Smm#if HAVE_LOCALE_CHARSET && !defined(__APPLE__) 423232153Smm /* locale_charset() is broken on Mac OS */ 424232153Smm return locale_charset(); 425232153Smm#elif HAVE_NL_LANGINFO 426232153Smm return nl_langinfo(CODESET); 427232153Smm#else 428232153Smm return ""; 429232153Smm#endif 430232153Smm} 431232153Smm 432232153Smm#if defined(_WIN32) && !defined(__CYGWIN__) 433232153Smm 434232153Smm/* 435232153Smm * Convert MBS to WCS. 436232153Smm * Note: returns -1 if conversion fails. 437232153Smm */ 438232153Smmint 439232153Smmarchive_wstring_append_from_mbs(struct archive_wstring *dest, 440232153Smm const char *p, size_t len) 441228753Smm{ 442238856Smm return archive_wstring_append_from_mbs_in_codepage(dest, p, len, NULL); 443232153Smm} 444232153Smm 445232153Smmstatic int 446232153Smmarchive_wstring_append_from_mbs_in_codepage(struct archive_wstring *dest, 447232153Smm const char *s, size_t length, struct archive_string_conv *sc) 448232153Smm{ 449232153Smm int count, ret = 0; 450232153Smm UINT from_cp; 451232153Smm 452232153Smm if (sc != NULL) 453232153Smm from_cp = sc->from_cp; 454232153Smm else 455232153Smm from_cp = get_current_codepage(); 456232153Smm 457232153Smm if (from_cp == CP_C_LOCALE) { 458232153Smm /* 459232153Smm * "C" locale special process. 
460232153Smm */ 461232153Smm wchar_t *ws; 462232153Smm const unsigned char *mp; 463232153Smm 464232153Smm if (NULL == archive_wstring_ensure(dest, 465232153Smm dest->length + length + 1)) 466232153Smm return (-1); 467232153Smm 468232153Smm ws = dest->s + dest->length; 469232153Smm mp = (const unsigned char *)s; 470232153Smm count = 0; 471232153Smm while (count < (int)length && *mp) { 472232153Smm *ws++ = (wchar_t)*mp++; 473232153Smm count++; 474232153Smm } 475238856Smm } else if (sc != NULL && 476238856Smm (sc->flag & (SCONV_NORMALIZATION_C | SCONV_NORMALIZATION_D))) { 477232153Smm /* 478232153Smm * Normalize UTF-8 and UTF-16BE and convert it directly 479232153Smm * to UTF-16 as wchar_t. 480232153Smm */ 481232153Smm struct archive_string u16; 482232153Smm int saved_flag = sc->flag;/* save current flag. */ 483232153Smm 484232153Smm if (is_big_endian()) 485232153Smm sc->flag |= SCONV_TO_UTF16BE; 486232153Smm else 487232153Smm sc->flag |= SCONV_TO_UTF16LE; 488232153Smm 489232153Smm if (sc->flag & SCONV_FROM_UTF16) { 490232153Smm /* 491232153Smm * UTF-16BE/LE NFD ===> UTF-16 NFC 492238856Smm * UTF-16BE/LE NFC ===> UTF-16 NFD 493232153Smm */ 494248616Smm count = (int)utf16nbytes(s, length); 495232153Smm } else { 496232153Smm /* 497232153Smm * UTF-8 NFD ===> UTF-16 NFC 498238856Smm * UTF-8 NFC ===> UTF-16 NFD 499232153Smm */ 500248616Smm count = (int)mbsnbytes(s, length); 501232153Smm } 502232153Smm u16.s = (char *)dest->s; 503232153Smm u16.length = dest->length << 1;; 504232153Smm u16.buffer_length = dest->buffer_length; 505238856Smm if (sc->flag & SCONV_NORMALIZATION_C) 506238856Smm ret = archive_string_normalize_C(&u16, s, count, sc); 507238856Smm else 508238856Smm ret = archive_string_normalize_D(&u16, s, count, sc); 509232153Smm dest->s = (wchar_t *)u16.s; 510232153Smm dest->length = u16.length >> 1; 511232153Smm dest->buffer_length = u16.buffer_length; 512232153Smm sc->flag = saved_flag;/* restore the saved flag. 
*/ 513232153Smm return (ret); 514232153Smm } else if (sc != NULL && (sc->flag & SCONV_FROM_UTF16)) { 515248616Smm count = (int)utf16nbytes(s, length); 516232153Smm count >>= 1; /* to be WCS length */ 517232153Smm /* Allocate memory for WCS. */ 518232153Smm if (NULL == archive_wstring_ensure(dest, 519232153Smm dest->length + count + 1)) 520232153Smm return (-1); 521238856Smm wmemcpy(dest->s + dest->length, (const wchar_t *)s, count); 522232153Smm if ((sc->flag & SCONV_FROM_UTF16BE) && !is_big_endian()) { 523232153Smm uint16_t *u16 = (uint16_t *)(dest->s + dest->length); 524232153Smm int b; 525232153Smm for (b = 0; b < count; b++) { 526232153Smm uint16_t val = archive_le16dec(u16+b); 527232153Smm archive_be16enc(u16+b, val); 528232153Smm } 529232153Smm } else if ((sc->flag & SCONV_FROM_UTF16LE) && is_big_endian()) { 530232153Smm uint16_t *u16 = (uint16_t *)(dest->s + dest->length); 531232153Smm int b; 532232153Smm for (b = 0; b < count; b++) { 533232153Smm uint16_t val = archive_be16dec(u16+b); 534232153Smm archive_le16enc(u16+b, val); 535232153Smm } 536232153Smm } 537232153Smm } else { 538232153Smm DWORD mbflag; 539238856Smm size_t buffsize; 540232153Smm 541232153Smm if (sc == NULL) 542232153Smm mbflag = 0; 543232153Smm else if (sc->flag & SCONV_FROM_CHARSET) { 544232153Smm /* Do not trust the length which comes from 545232153Smm * an archive file. */ 546232153Smm length = mbsnbytes(s, length); 547232153Smm mbflag = 0; 548232153Smm } else 549232153Smm mbflag = MB_PRECOMPOSED; 550232153Smm 551238856Smm buffsize = dest->length + length + 1; 552238856Smm do { 553238856Smm /* Allocate memory for WCS. */ 554238856Smm if (NULL == archive_wstring_ensure(dest, buffsize)) 555232153Smm return (-1); 556238856Smm /* Convert MBS to WCS. 
*/ 557238856Smm count = MultiByteToWideChar(from_cp, 558248616Smm mbflag, s, (int)length, dest->s + dest->length, 559248616Smm (int)(dest->buffer_length >> 1) -1); 560238856Smm if (count == 0 && 561238856Smm GetLastError() == ERROR_INSUFFICIENT_BUFFER) { 562238856Smm /* Expand the WCS buffer. */ 563238856Smm buffsize = dest->buffer_length << 1; 564238856Smm continue; 565232153Smm } 566238856Smm if (count == 0 && length != 0) 567238856Smm ret = -1; 568302294Smm break; 569302294Smm } while (1); 570232153Smm } 571232153Smm dest->length += count; 572232153Smm dest->s[dest->length] = L'\0'; 573232153Smm return (ret); 574232153Smm} 575232153Smm 576232153Smm#else 577232153Smm 578232153Smm/* 579232153Smm * Convert MBS to WCS. 580232153Smm * Note: returns -1 if conversion fails. 581232153Smm */ 582232153Smmint 583232153Smmarchive_wstring_append_from_mbs(struct archive_wstring *dest, 584232153Smm const char *p, size_t len) 585232153Smm{ 586232153Smm size_t r; 587238856Smm int ret_val = 0; 588232153Smm /* 589232153Smm * No single byte will be more than one wide character, 590232153Smm * so this length estimate will always be big enough. 591232153Smm */ 592232153Smm size_t wcs_length = len; 593232153Smm size_t mbs_length = len; 594232153Smm const char *mbs = p; 595232153Smm wchar_t *wcs; 596232153Smm#if HAVE_MBRTOWC 597232153Smm mbstate_t shift_state; 598232153Smm 599232153Smm memset(&shift_state, 0, sizeof(shift_state)); 600232153Smm#endif 601232153Smm if (NULL == archive_wstring_ensure(dest, dest->length + wcs_length + 1)) 602238856Smm return (-1); 603232153Smm wcs = dest->s + dest->length; 604232153Smm /* 605232153Smm * We cannot use mbsrtowcs/mbstowcs here because those may convert 606313570Smm * extra MBS when strlen(p) > len and one wide character consists of 607232153Smm * multi bytes. 
608232153Smm */ 609238856Smm while (*mbs && mbs_length > 0) { 610238856Smm if (wcs_length == 0) { 611238856Smm dest->length = wcs - dest->s; 612238856Smm dest->s[dest->length] = L'\0'; 613238856Smm wcs_length = mbs_length; 614238856Smm if (NULL == archive_wstring_ensure(dest, 615238856Smm dest->length + wcs_length + 1)) 616238856Smm return (-1); 617238856Smm wcs = dest->s + dest->length; 618238856Smm } 619232153Smm#if HAVE_MBRTOWC 620232153Smm r = mbrtowc(wcs, mbs, wcs_length, &shift_state); 621232153Smm#else 622232153Smm r = mbtowc(wcs, mbs, wcs_length); 623232153Smm#endif 624232153Smm if (r == (size_t)-1 || r == (size_t)-2) { 625238856Smm ret_val = -1; 626238856Smm if (errno == EILSEQ) { 627238856Smm ++mbs; 628238856Smm --mbs_length; 629238856Smm continue; 630238856Smm } else 631238856Smm break; 632232153Smm } 633232153Smm if (r == 0 || r > mbs_length) 634232153Smm break; 635232153Smm wcs++; 636232153Smm wcs_length--; 637232153Smm mbs += r; 638232153Smm mbs_length -= r; 639232153Smm } 640232153Smm dest->length = wcs - dest->s; 641232153Smm dest->s[dest->length] = L'\0'; 642238856Smm return (ret_val); 643232153Smm} 644232153Smm 645232153Smm#endif 646232153Smm 647232153Smm#if defined(_WIN32) && !defined(__CYGWIN__) 648232153Smm 649232153Smm/* 650232153Smm * WCS ==> MBS. 651232153Smm * Note: returns -1 if conversion fails. 652232153Smm * 653232153Smm * Win32 builds use WideCharToMultiByte from the Windows API. 654232153Smm * (Maybe Cygwin should too? WideCharToMultiByte will know a 655232153Smm * lot more about local character encodings than the wcrtomb() 656232153Smm * wrapper is going to know.) 
657232153Smm */ 658232153Smmint 659232153Smmarchive_string_append_from_wcs(struct archive_string *as, 660232153Smm const wchar_t *w, size_t len) 661232153Smm{ 662238856Smm return archive_string_append_from_wcs_in_codepage(as, w, len, NULL); 663232153Smm} 664232153Smm 665232153Smmstatic int 666232153Smmarchive_string_append_from_wcs_in_codepage(struct archive_string *as, 667232153Smm const wchar_t *ws, size_t len, struct archive_string_conv *sc) 668232153Smm{ 669232153Smm BOOL defchar_used, *dp; 670232153Smm int count, ret = 0; 671232153Smm UINT to_cp; 672232153Smm int wslen = (int)len; 673232153Smm 674232153Smm if (sc != NULL) 675232153Smm to_cp = sc->to_cp; 676232153Smm else 677232153Smm to_cp = get_current_codepage(); 678232153Smm 679232153Smm if (to_cp == CP_C_LOCALE) { 680232153Smm /* 681232153Smm * "C" locale special process. 682232153Smm */ 683232153Smm const wchar_t *wp = ws; 684232153Smm char *p; 685232153Smm 686232153Smm if (NULL == archive_string_ensure(as, 687232153Smm as->length + wslen +1)) 688232153Smm return (-1); 689232153Smm p = as->s + as->length; 690232153Smm count = 0; 691232153Smm defchar_used = 0; 692232153Smm while (count < wslen && *wp) { 693232153Smm if (*wp > 255) { 694232153Smm *p++ = '?'; 695232153Smm wp++; 696232153Smm defchar_used = 1; 697232153Smm } else 698232153Smm *p++ = (char)*wp++; 699232153Smm count++; 700232153Smm } 701232153Smm } else if (sc != NULL && (sc->flag & SCONV_TO_UTF16)) { 702232153Smm uint16_t *u16; 703232153Smm 704232153Smm if (NULL == 705232153Smm archive_string_ensure(as, as->length + len * 2 + 2)) 706232153Smm return (-1); 707232153Smm u16 = (uint16_t *)(as->s + as->length); 708232153Smm count = 0; 709232153Smm defchar_used = 0; 710232153Smm if (sc->flag & SCONV_TO_UTF16BE) { 711232153Smm while (count < (int)len && *ws) { 712232153Smm archive_be16enc(u16+count, *ws); 713232153Smm ws++; 714232153Smm count++; 715232153Smm } 716232153Smm } else { 717232153Smm while (count < (int)len && *ws) { 718232153Smm 
archive_le16enc(u16+count, *ws); 719232153Smm ws++; 720232153Smm count++; 721232153Smm } 722232153Smm } 723232153Smm count <<= 1; /* to be byte size */ 724232153Smm } else { 725232153Smm /* Make sure the MBS buffer has plenty to set. */ 726232153Smm if (NULL == 727232153Smm archive_string_ensure(as, as->length + len * 2 + 1)) 728232153Smm return (-1); 729232153Smm do { 730232153Smm defchar_used = 0; 731232153Smm if (to_cp == CP_UTF8 || sc == NULL) 732232153Smm dp = NULL; 733232153Smm else 734232153Smm dp = &defchar_used; 735232153Smm count = WideCharToMultiByte(to_cp, 0, ws, wslen, 736248616Smm as->s + as->length, (int)as->buffer_length-1, NULL, dp); 737232153Smm if (count == 0 && 738232153Smm GetLastError() == ERROR_INSUFFICIENT_BUFFER) { 739232153Smm /* Expand the MBS buffer and retry. */ 740232153Smm if (NULL == archive_string_ensure(as, 741232153Smm as->buffer_length + len)) 742232153Smm return (-1); 743232153Smm continue; 744232153Smm } 745232153Smm if (count == 0) 746232153Smm ret = -1; 747299529Smm break; 748299529Smm } while (1); 749232153Smm } 750232153Smm as->length += count; 751232153Smm as->s[as->length] = '\0'; 752232153Smm return (defchar_used?-1:ret); 753232153Smm} 754232153Smm 755232153Smm#elif defined(HAVE_WCTOMB) || defined(HAVE_WCRTOMB) 756232153Smm 757232153Smm/* 758232153Smm * Translates a wide character string into current locale character set 759232153Smm * and appends to the archive_string. Note: returns -1 if conversion 760232153Smm * fails. 761232153Smm */ 762232153Smmint 763232153Smmarchive_string_append_from_wcs(struct archive_string *as, 764232153Smm const wchar_t *w, size_t len) 765232153Smm{ 766232153Smm /* We cannot use the standard wcstombs() here because it 767232153Smm * cannot tell us how big the output buffer should be. So 768232153Smm * I've built a loop around wcrtomb() or wctomb() that 769232153Smm * converts a character at a time and resizes the string as 770232153Smm * needed. 
We prefer wcrtomb() when it's available because 771232153Smm * it's thread-safe. */ 772232153Smm int n, ret_val = 0; 773228753Smm char *p; 774232153Smm char *end; 775232153Smm#if HAVE_WCRTOMB 776232153Smm mbstate_t shift_state; 777228753Smm 778232153Smm memset(&shift_state, 0, sizeof(shift_state)); 779232153Smm#else 780232153Smm /* Clear the shift state before starting. */ 781232153Smm wctomb(NULL, L'\0'); 782232153Smm#endif 783228753Smm /* 784232153Smm * Allocate buffer for MBS. 785232153Smm * We need this allocation here since it is possible that 786232153Smm * as->s is still NULL. 787228753Smm */ 788232153Smm if (archive_string_ensure(as, as->length + len + 1) == NULL) 789238856Smm return (-1); 790232153Smm 791232153Smm p = as->s + as->length; 792232153Smm end = as->s + as->buffer_length - MB_CUR_MAX -1; 793232153Smm while (*w != L'\0' && len > 0) { 794232153Smm if (p >= end) { 795232153Smm as->length = p - as->s; 796232153Smm as->s[as->length] = '\0'; 797232153Smm /* Re-allocate buffer for MBS. */ 798232153Smm if (archive_string_ensure(as, 799232153Smm as->length + len * 2 + 1) == NULL) 800238856Smm return (-1); 801232153Smm p = as->s + as->length; 802232153Smm end = as->s + as->buffer_length - MB_CUR_MAX -1; 803228753Smm } 804232153Smm#if HAVE_WCRTOMB 805232153Smm n = wcrtomb(p, *w++, &shift_state); 806232153Smm#else 807232153Smm n = wctomb(p, *w++); 808232153Smm#endif 809232153Smm if (n == -1) { 810232153Smm if (errno == EILSEQ) { 811232153Smm /* Skip an illegal wide char. */ 812232153Smm *p++ = '?'; 813232153Smm ret_val = -1; 814232153Smm } else { 815232153Smm ret_val = -1; 816232153Smm break; 817232153Smm } 818232153Smm } else 819232153Smm p += n; 820232153Smm len--; 821232153Smm } 822232153Smm as->length = p - as->s; 823232153Smm as->s[as->length] = '\0'; 824232153Smm return (ret_val); 825232153Smm} 826232153Smm 827232153Smm#else /* HAVE_WCTOMB || HAVE_WCRTOMB */ 828232153Smm 829232153Smm/* 830232153Smm * TODO: Test if __STDC_ISO_10646__ is defined. 
831232153Smm * Non-Windows uses ISO C wcrtomb() or wctomb() to perform the conversion 832232153Smm * one character at a time. If a non-Windows platform doesn't have 833232153Smm * either of these, fall back to the built-in UTF8 conversion. 834232153Smm */ 835232153Smmint 836232153Smmarchive_string_append_from_wcs(struct archive_string *as, 837232153Smm const wchar_t *w, size_t len) 838232153Smm{ 839232153Smm (void)as;/* UNUSED */ 840232153Smm (void)w;/* UNUSED */ 841232153Smm (void)len;/* UNUSED */ 842238856Smm errno = ENOSYS; 843232153Smm return (-1); 844232153Smm} 845232153Smm 846232153Smm#endif /* HAVE_WCTOMB || HAVE_WCRTOMB */ 847232153Smm 848232153Smm/* 849232153Smm * Find a string conversion object by a pair of 'from' charset name 850232153Smm * and 'to' charset name from an archive object. 851232153Smm * Return NULL if not found. 852232153Smm */ 853232153Smmstatic struct archive_string_conv * 854232153Smmfind_sconv_object(struct archive *a, const char *fc, const char *tc) 855232153Smm{ 856232153Smm struct archive_string_conv *sc; 857232153Smm 858232153Smm if (a == NULL) 859232153Smm return (NULL); 860232153Smm 861232153Smm for (sc = a->sconv; sc != NULL; sc = sc->next) { 862232153Smm if (strcmp(sc->from_charset, fc) == 0 && 863232153Smm strcmp(sc->to_charset, tc) == 0) 864232153Smm break; 865232153Smm } 866232153Smm return (sc); 867232153Smm} 868232153Smm 869232153Smm/* 870232153Smm * Register a string object to an archive object. 871232153Smm */ 872232153Smmstatic void 873232153Smmadd_sconv_object(struct archive *a, struct archive_string_conv *sc) 874232153Smm{ 875232153Smm struct archive_string_conv **psc; 876232153Smm 877232153Smm /* Add a new sconv to sconv list. 
*/ 878232153Smm psc = &(a->sconv); 879232153Smm while (*psc != NULL) 880232153Smm psc = &((*psc)->next); 881232153Smm *psc = sc; 882232153Smm} 883232153Smm 884232153Smmstatic void 885232153Smmadd_converter(struct archive_string_conv *sc, int (*converter) 886232153Smm (struct archive_string *, const void *, size_t, 887232153Smm struct archive_string_conv *)) 888232153Smm{ 889232153Smm if (sc == NULL || sc->nconverter >= 2) 890232153Smm __archive_errx(1, "Programing error"); 891232153Smm sc->converter[sc->nconverter++] = converter; 892232153Smm} 893232153Smm 894232153Smmstatic void 895232153Smmsetup_converter(struct archive_string_conv *sc) 896232153Smm{ 897232153Smm 898232153Smm /* Reset. */ 899232153Smm sc->nconverter = 0; 900232153Smm 901232153Smm /* 902232153Smm * Perform special sequence for the incorrect UTF-8 filenames 903232153Smm * made by libarchive2.x. 904232153Smm */ 905232153Smm if (sc->flag & SCONV_UTF8_LIBARCHIVE_2) { 906232153Smm add_converter(sc, strncat_from_utf8_libarchive2); 907232153Smm return; 908232153Smm } 909232153Smm 910232153Smm /* 911232153Smm * Convert a string to UTF-16BE/LE. 912232153Smm */ 913232153Smm if (sc->flag & SCONV_TO_UTF16) { 914232153Smm /* 915232153Smm * If the current locale is UTF-8, we can translate 916232153Smm * a UTF-8 string into a UTF-16BE string. 
917232153Smm */ 918232153Smm if (sc->flag & SCONV_FROM_UTF8) { 919232153Smm add_converter(sc, archive_string_append_unicode); 920232153Smm return; 921228753Smm } 922232153Smm 923232153Smm#if defined(_WIN32) && !defined(__CYGWIN__) 924232153Smm if (sc->flag & SCONV_WIN_CP) { 925232153Smm if (sc->flag & SCONV_TO_UTF16BE) 926232153Smm add_converter(sc, win_strncat_to_utf16be); 927232153Smm else 928232153Smm add_converter(sc, win_strncat_to_utf16le); 929232153Smm return; 930232153Smm } 931232153Smm#endif 932232153Smm 933232153Smm#if defined(HAVE_ICONV) 934232153Smm if (sc->cd != (iconv_t)-1) { 935232153Smm add_converter(sc, iconv_strncat_in_locale); 936232153Smm return; 937232153Smm } 938232153Smm#endif 939232153Smm 940232153Smm if (sc->flag & SCONV_BEST_EFFORT) { 941232153Smm if (sc->flag & SCONV_TO_UTF16BE) 942238856Smm add_converter(sc, 943238856Smm best_effort_strncat_to_utf16be); 944232153Smm else 945238856Smm add_converter(sc, 946238856Smm best_effort_strncat_to_utf16le); 947232153Smm } else 948232153Smm /* Make sure we have no converter. */ 949232153Smm sc->nconverter = 0; 950232153Smm return; 951232153Smm } 952232153Smm 953232153Smm /* 954232153Smm * Convert a string from UTF-16BE/LE. 955232153Smm */ 956232153Smm if (sc->flag & SCONV_FROM_UTF16) { 957232153Smm /* 958232153Smm * At least we should normalize a UTF-16BE string. 959232153Smm */ 960232153Smm if (sc->flag & SCONV_NORMALIZATION_D) 961232153Smm add_converter(sc,archive_string_normalize_D); 962238856Smm else if (sc->flag & SCONV_NORMALIZATION_C) 963232153Smm add_converter(sc, archive_string_normalize_C); 964232153Smm 965232153Smm if (sc->flag & SCONV_TO_UTF8) { 966232153Smm /* 967232153Smm * If the current locale is UTF-8, we can translate 968232153Smm * a UTF-16BE/LE string into a UTF-8 string directly. 
969232153Smm */ 970232153Smm if (!(sc->flag & 971232153Smm (SCONV_NORMALIZATION_D |SCONV_NORMALIZATION_C))) 972232153Smm add_converter(sc, 973232153Smm archive_string_append_unicode); 974232153Smm return; 975232153Smm } 976232153Smm 977232153Smm#if defined(_WIN32) && !defined(__CYGWIN__) 978232153Smm if (sc->flag & SCONV_WIN_CP) { 979232153Smm if (sc->flag & SCONV_FROM_UTF16BE) 980232153Smm add_converter(sc, win_strncat_from_utf16be); 981232153Smm else 982232153Smm add_converter(sc, win_strncat_from_utf16le); 983232153Smm return; 984232153Smm } 985232153Smm#endif 986232153Smm 987232153Smm#if defined(HAVE_ICONV) 988232153Smm if (sc->cd != (iconv_t)-1) { 989232153Smm add_converter(sc, iconv_strncat_in_locale); 990232153Smm return; 991232153Smm } 992232153Smm#endif 993232153Smm 994232153Smm if ((sc->flag & (SCONV_BEST_EFFORT | SCONV_FROM_UTF16BE)) 995232153Smm == (SCONV_BEST_EFFORT | SCONV_FROM_UTF16BE)) 996232153Smm add_converter(sc, best_effort_strncat_from_utf16be); 997232153Smm else if ((sc->flag & (SCONV_BEST_EFFORT | SCONV_FROM_UTF16LE)) 998232153Smm == (SCONV_BEST_EFFORT | SCONV_FROM_UTF16LE)) 999232153Smm add_converter(sc, best_effort_strncat_from_utf16le); 1000232153Smm else 1001232153Smm /* Make sure we have no converter. */ 1002232153Smm sc->nconverter = 0; 1003232153Smm return; 1004232153Smm } 1005232153Smm 1006232153Smm if (sc->flag & SCONV_FROM_UTF8) { 1007232153Smm /* 1008232153Smm * At least we should normalize a UTF-8 string. 1009232153Smm */ 1010232153Smm if (sc->flag & SCONV_NORMALIZATION_D) 1011232153Smm add_converter(sc,archive_string_normalize_D); 1012238856Smm else if (sc->flag & SCONV_NORMALIZATION_C) 1013232153Smm add_converter(sc, archive_string_normalize_C); 1014232153Smm 1015232153Smm /* 1016232153Smm * Copy UTF-8 string with a check of CESU-8. 1017232153Smm * Apparently, iconv does not check surrogate pairs in UTF-8 1018232153Smm * when both from-charset and to-charset are UTF-8, and then 1019232153Smm * we use our UTF-8 copy code. 
1020232153Smm */ 1021232153Smm if (sc->flag & SCONV_TO_UTF8) { 1022232153Smm /* 1023232153Smm * If the current locale is UTF-8, we can translate 1024232153Smm * a UTF-16BE string into a UTF-8 string directly. 1025232153Smm */ 1026232153Smm if (!(sc->flag & 1027232153Smm (SCONV_NORMALIZATION_D |SCONV_NORMALIZATION_C))) 1028232153Smm add_converter(sc, strncat_from_utf8_to_utf8); 1029232153Smm return; 1030232153Smm } 1031232153Smm } 1032232153Smm 1033232153Smm#if defined(_WIN32) && !defined(__CYGWIN__) 1034232153Smm /* 1035232153Smm * On Windows we can use Windows API for a string conversion. 1036232153Smm */ 1037232153Smm if (sc->flag & SCONV_WIN_CP) { 1038232153Smm add_converter(sc, strncat_in_codepage); 1039232153Smm return; 1040232153Smm } 1041232153Smm#endif 1042232153Smm 1043232153Smm#if HAVE_ICONV 1044232153Smm if (sc->cd != (iconv_t)-1) { 1045232153Smm add_converter(sc, iconv_strncat_in_locale); 1046238856Smm /* 1047238856Smm * iconv generally does not support UTF-8-MAC and so 1048238856Smm * we have to the output of iconv from NFC to NFD if 1049238856Smm * need. 1050238856Smm */ 1051238856Smm if ((sc->flag & SCONV_FROM_CHARSET) && 1052238856Smm (sc->flag & SCONV_TO_UTF8)) { 1053238856Smm if (sc->flag & SCONV_NORMALIZATION_D) 1054238856Smm add_converter(sc, archive_string_normalize_D); 1055238856Smm } 1056232153Smm return; 1057232153Smm } 1058232153Smm#endif 1059232153Smm 1060232153Smm /* 1061232153Smm * Try conversion in the best effort or no conversion. 1062232153Smm */ 1063232153Smm if ((sc->flag & SCONV_BEST_EFFORT) || sc->same) 1064232153Smm add_converter(sc, best_effort_strncat_in_locale); 1065232153Smm else 1066232153Smm /* Make sure we have no converter. */ 1067232153Smm sc->nconverter = 0; 1068232153Smm} 1069232153Smm 1070232153Smm/* 1071232153Smm * Return canonicalized charset-name but this supports just UTF-8, UTF-16BE 1072232153Smm * and CP932 which are referenced in create_sconv_object(). 
/*
 * Return the canonical spelling of a charset name.  Only the names that
 * create_sconv_object() compares against are recognized: UTF-8,
 * UTF-16BE, UTF-16LE (each with or without the hyphen) and CP932,
 * matched without regard to ASCII letter case.
 *
 * Returns a pointer to a string literal for a recognized name; otherwise
 * returns `charset' itself unchanged (this includes NULL, the empty
 * string, and any name longer than 15 characters).
 */
static const char *
canonical_charset_name(const char *charset)
{
	char cs[16];
	char *p;
	const char *s;

	/* Names that cannot fit in cs[] cannot be one of ours. */
	if (charset == NULL || charset[0] == '\0'
	    || strlen(charset) > 15)
		return (charset);

	/* Copy name to uppercase (ASCII only, independent of the locale). */
	p = cs;
	s = charset;
	while (*s) {
		char c = *s++;
		if (c >= 'a' && c <= 'z')
			c -= 'a' - 'A';
		*p++ = c;
	}
	*p++ = '\0';

	if (strcmp(cs, "UTF-8") == 0 ||
	    strcmp(cs, "UTF8") == 0)
		return ("UTF-8");
	if (strcmp(cs, "UTF-16BE") == 0 ||
	    strcmp(cs, "UTF16BE") == 0)
		return ("UTF-16BE");
	if (strcmp(cs, "UTF-16LE") == 0 ||
	    strcmp(cs, "UTF16LE") == 0)
		return ("UTF-16LE");
	if (strcmp(cs, "CP932") == 0)
		return ("CP932");
	return (charset);
}
1112232153Smm */ 1113232153Smmstatic struct archive_string_conv * 1114232153Smmcreate_sconv_object(const char *fc, const char *tc, 1115232153Smm unsigned current_codepage, int flag) 1116232153Smm{ 1117232153Smm struct archive_string_conv *sc; 1118232153Smm 1119232153Smm sc = calloc(1, sizeof(*sc)); 1120232153Smm if (sc == NULL) 1121232153Smm return (NULL); 1122232153Smm sc->next = NULL; 1123232153Smm sc->from_charset = strdup(fc); 1124232153Smm if (sc->from_charset == NULL) { 1125232153Smm free(sc); 1126232153Smm return (NULL); 1127232153Smm } 1128232153Smm sc->to_charset = strdup(tc); 1129232153Smm if (sc->to_charset == NULL) { 1130248616Smm free(sc->from_charset); 1131232153Smm free(sc); 1132232153Smm return (NULL); 1133232153Smm } 1134232153Smm archive_string_init(&sc->utftmp); 1135232153Smm 1136232153Smm if (flag & SCONV_TO_CHARSET) { 1137232153Smm /* 1138232153Smm * Convert characters from the current locale charset to 1139232153Smm * a specified charset. 1140232153Smm */ 1141232153Smm sc->from_cp = current_codepage; 1142232153Smm sc->to_cp = make_codepage_from_charset(tc); 1143232153Smm#if defined(_WIN32) && !defined(__CYGWIN__) 1144232153Smm if (IsValidCodePage(sc->to_cp)) 1145232153Smm flag |= SCONV_WIN_CP; 1146232153Smm#endif 1147232153Smm } else if (flag & SCONV_FROM_CHARSET) { 1148232153Smm /* 1149232153Smm * Convert characters from a specified charset to 1150232153Smm * the current locale charset. 1151232153Smm */ 1152232153Smm sc->to_cp = current_codepage; 1153232153Smm sc->from_cp = make_codepage_from_charset(fc); 1154232153Smm#if defined(_WIN32) && !defined(__CYGWIN__) 1155232153Smm if (IsValidCodePage(sc->from_cp)) 1156232153Smm flag |= SCONV_WIN_CP; 1157232153Smm#endif 1158232153Smm } 1159232153Smm 1160232153Smm /* 1161232153Smm * Check if "from charset" and "to charset" are the same. 
1162232153Smm */ 1163232153Smm if (strcmp(fc, tc) == 0 || 1164232153Smm (sc->from_cp != (unsigned)-1 && sc->from_cp == sc->to_cp)) 1165232153Smm sc->same = 1; 1166232153Smm else 1167232153Smm sc->same = 0; 1168232153Smm 1169232153Smm /* 1170232153Smm * Mark if "from charset" or "to charset" are UTF-8 or UTF-16BE/LE. 1171232153Smm */ 1172232153Smm if (strcmp(tc, "UTF-8") == 0) 1173232153Smm flag |= SCONV_TO_UTF8; 1174232153Smm else if (strcmp(tc, "UTF-16BE") == 0) 1175232153Smm flag |= SCONV_TO_UTF16BE; 1176232153Smm else if (strcmp(tc, "UTF-16LE") == 0) 1177232153Smm flag |= SCONV_TO_UTF16LE; 1178232153Smm if (strcmp(fc, "UTF-8") == 0) 1179232153Smm flag |= SCONV_FROM_UTF8; 1180232153Smm else if (strcmp(fc, "UTF-16BE") == 0) 1181232153Smm flag |= SCONV_FROM_UTF16BE; 1182232153Smm else if (strcmp(fc, "UTF-16LE") == 0) 1183232153Smm flag |= SCONV_FROM_UTF16LE; 1184232153Smm#if defined(_WIN32) && !defined(__CYGWIN__) 1185232153Smm if (sc->to_cp == CP_UTF8) 1186232153Smm flag |= SCONV_TO_UTF8; 1187232153Smm else if (sc->to_cp == CP_UTF16BE) 1188232153Smm flag |= SCONV_TO_UTF16BE | SCONV_WIN_CP; 1189232153Smm else if (sc->to_cp == CP_UTF16LE) 1190232153Smm flag |= SCONV_TO_UTF16LE | SCONV_WIN_CP; 1191232153Smm if (sc->from_cp == CP_UTF8) 1192232153Smm flag |= SCONV_FROM_UTF8; 1193232153Smm else if (sc->from_cp == CP_UTF16BE) 1194232153Smm flag |= SCONV_FROM_UTF16BE | SCONV_WIN_CP; 1195232153Smm else if (sc->from_cp == CP_UTF16LE) 1196232153Smm flag |= SCONV_FROM_UTF16LE | SCONV_WIN_CP; 1197232153Smm#endif 1198232153Smm 1199232153Smm /* 1200232153Smm * Set a flag for Unicode NFD. Usually iconv cannot correctly 1201232153Smm * handle it. So we have to translate NFD characters to NFC ones 1202232153Smm * ourselves before iconv handles. Another reason is to prevent 1203232153Smm * that the same sight of two filenames, one is NFC and other 1204232153Smm * is NFD, would be in its directory. 
1205232153Smm * On Mac OS X, although its filesystem layer automatically 1206232153Smm * convert filenames to NFD, it would be useful for filename 1207232153Smm * comparing to find out the same filenames that we normalize 1208232153Smm * that to be NFD ourselves. 1209232153Smm */ 1210232153Smm if ((flag & SCONV_FROM_CHARSET) && 1211232153Smm (flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8))) { 1212232153Smm#if defined(__APPLE__) 1213238856Smm if (flag & SCONV_TO_UTF8) 1214238856Smm flag |= SCONV_NORMALIZATION_D; 1215238856Smm else 1216232153Smm#endif 1217232153Smm flag |= SCONV_NORMALIZATION_C; 1218232153Smm } 1219238856Smm#if defined(__APPLE__) 1220238856Smm /* 1221238856Smm * In case writing an archive file, make sure that a filename 1222238856Smm * going to be passed to iconv is a Unicode NFC string since 1223238856Smm * a filename in HFS Plus filesystem is a Unicode NFD one and 1224238856Smm * iconv cannot handle it with "UTF-8" charset. It is simpler 1225238856Smm * than a use of "UTF-8-MAC" charset. 1226238856Smm */ 1227238856Smm if ((flag & SCONV_TO_CHARSET) && 1228238856Smm (flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8)) && 1229238856Smm !(flag & (SCONV_TO_UTF16 | SCONV_TO_UTF8))) 1230238856Smm flag |= SCONV_NORMALIZATION_C; 1231238856Smm /* 1232238856Smm * In case reading an archive file. make sure that a filename 1233238856Smm * will be passed to users is a Unicode NFD string in order to 1234238856Smm * correctly compare the filename with other one which comes 1235238856Smm * from HFS Plus filesystem. 1236238856Smm */ 1237238856Smm if ((flag & SCONV_FROM_CHARSET) && 1238238856Smm !(flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8)) && 1239238856Smm (flag & SCONV_TO_UTF8)) 1240238856Smm flag |= SCONV_NORMALIZATION_D; 1241238856Smm#endif 1242232153Smm 1243232153Smm#if defined(HAVE_ICONV) 1244232153Smm sc->cd_w = (iconv_t)-1; 1245232153Smm /* 1246232153Smm * Create an iconv object. 
1247232153Smm */ 1248232153Smm if (((flag & (SCONV_TO_UTF8 | SCONV_TO_UTF16)) && 1249232153Smm (flag & (SCONV_FROM_UTF8 | SCONV_FROM_UTF16))) || 1250232153Smm (flag & SCONV_WIN_CP)) { 1251232153Smm /* This case we won't use iconv. */ 1252232153Smm sc->cd = (iconv_t)-1; 1253232153Smm } else { 1254232153Smm sc->cd = iconv_open(tc, fc); 1255232153Smm if (sc->cd == (iconv_t)-1 && (sc->flag & SCONV_BEST_EFFORT)) { 1256232153Smm /* 1257313570Smm * Unfortunately, all of iconv implements do support 1258232153Smm * "CP932" character-set, so we should use "SJIS" 1259232153Smm * instead if iconv_open failed. 1260232153Smm */ 1261232153Smm if (strcmp(tc, "CP932") == 0) 1262232153Smm sc->cd = iconv_open("SJIS", fc); 1263232153Smm else if (strcmp(fc, "CP932") == 0) 1264232153Smm sc->cd = iconv_open(tc, "SJIS"); 1265232153Smm } 1266232153Smm#if defined(_WIN32) && !defined(__CYGWIN__) 1267232153Smm /* 1268232153Smm * archive_mstring on Windows directly convert multi-bytes 1269232153Smm * into archive_wstring in order not to depend on locale 1270313570Smm * so that you can do a I18N programming. This will be 1271232153Smm * used only in archive_mstring_copy_mbs_len_l so far. 1272232153Smm */ 1273232153Smm if (flag & SCONV_FROM_CHARSET) { 1274232153Smm sc->cd_w = iconv_open("UTF-8", fc); 1275232153Smm if (sc->cd_w == (iconv_t)-1 && 1276232153Smm (sc->flag & SCONV_BEST_EFFORT)) { 1277232153Smm if (strcmp(fc, "CP932") == 0) 1278232153Smm sc->cd_w = iconv_open("UTF-8", "SJIS"); 1279232153Smm } 1280232153Smm } 1281232153Smm#endif /* _WIN32 && !__CYGWIN__ */ 1282232153Smm } 1283232153Smm#endif /* HAVE_ICONV */ 1284232153Smm 1285232153Smm sc->flag = flag; 1286232153Smm 1287232153Smm /* 1288238856Smm * Set up converters. 1289232153Smm */ 1290232153Smm setup_converter(sc); 1291232153Smm 1292232153Smm return (sc); 1293232153Smm} 1294232153Smm 1295232153Smm/* 1296232153Smm * Free a string conversion object. 
1297232153Smm */ 1298232153Smmstatic void 1299232153Smmfree_sconv_object(struct archive_string_conv *sc) 1300232153Smm{ 1301232153Smm free(sc->from_charset); 1302232153Smm free(sc->to_charset); 1303232153Smm archive_string_free(&sc->utftmp); 1304232153Smm#if HAVE_ICONV 1305232153Smm if (sc->cd != (iconv_t)-1) 1306232153Smm iconv_close(sc->cd); 1307232153Smm if (sc->cd_w != (iconv_t)-1) 1308232153Smm iconv_close(sc->cd_w); 1309232153Smm#endif 1310232153Smm free(sc); 1311232153Smm} 1312232153Smm 1313232153Smm#if defined(_WIN32) && !defined(__CYGWIN__) 1314232153Smmstatic unsigned 1315232153Smmmy_atoi(const char *p) 1316232153Smm{ 1317232153Smm unsigned cp; 1318232153Smm 1319232153Smm cp = 0; 1320232153Smm while (*p) { 1321232153Smm if (*p >= '0' && *p <= '9') 1322232153Smm cp = cp * 10 + (*p - '0'); 1323232153Smm else 1324232153Smm return (-1); 1325232153Smm p++; 1326232153Smm } 1327232153Smm return (cp); 1328232153Smm} 1329232153Smm 1330232153Smm/* 1331232153Smm * Translate Charset name (as used by iconv) into CodePage (as used by Windows) 1332232153Smm * Return -1 if failed. 1333232153Smm * 1334232153Smm * Note: This translation code may be insufficient. 1335232153Smm */ 1336232153Smmstatic struct charset { 1337232153Smm const char *name; 1338232153Smm unsigned cp; 1339232153Smm} charsets[] = { 1340232153Smm /* MUST BE SORTED! 
*/ 1341232153Smm {"ASCII", 1252}, 1342232153Smm {"ASMO-708", 708}, 1343232153Smm {"BIG5", 950}, 1344232153Smm {"CHINESE", 936}, 1345232153Smm {"CP367", 1252}, 1346232153Smm {"CP819", 1252}, 1347232153Smm {"CP1025", 21025}, 1348232153Smm {"DOS-720", 720}, 1349232153Smm {"DOS-862", 862}, 1350232153Smm {"EUC-CN", 51936}, 1351232153Smm {"EUC-JP", 51932}, 1352232153Smm {"EUC-KR", 949}, 1353232153Smm {"EUCCN", 51936}, 1354232153Smm {"EUCJP", 51932}, 1355232153Smm {"EUCKR", 949}, 1356232153Smm {"GB18030", 54936}, 1357232153Smm {"GB2312", 936}, 1358232153Smm {"HEBREW", 1255}, 1359232153Smm {"HZ-GB-2312", 52936}, 1360232153Smm {"IBM273", 20273}, 1361232153Smm {"IBM277", 20277}, 1362232153Smm {"IBM278", 20278}, 1363232153Smm {"IBM280", 20280}, 1364232153Smm {"IBM284", 20284}, 1365232153Smm {"IBM285", 20285}, 1366232153Smm {"IBM290", 20290}, 1367232153Smm {"IBM297", 20297}, 1368232153Smm {"IBM367", 1252}, 1369232153Smm {"IBM420", 20420}, 1370232153Smm {"IBM423", 20423}, 1371232153Smm {"IBM424", 20424}, 1372232153Smm {"IBM819", 1252}, 1373232153Smm {"IBM871", 20871}, 1374232153Smm {"IBM880", 20880}, 1375232153Smm {"IBM905", 20905}, 1376232153Smm {"IBM924", 20924}, 1377232153Smm {"ISO-8859-1", 28591}, 1378232153Smm {"ISO-8859-13", 28603}, 1379232153Smm {"ISO-8859-15", 28605}, 1380232153Smm {"ISO-8859-2", 28592}, 1381232153Smm {"ISO-8859-3", 28593}, 1382232153Smm {"ISO-8859-4", 28594}, 1383232153Smm {"ISO-8859-5", 28595}, 1384232153Smm {"ISO-8859-6", 28596}, 1385232153Smm {"ISO-8859-7", 28597}, 1386232153Smm {"ISO-8859-8", 28598}, 1387232153Smm {"ISO-8859-9", 28599}, 1388232153Smm {"ISO8859-1", 28591}, 1389232153Smm {"ISO8859-13", 28603}, 1390232153Smm {"ISO8859-15", 28605}, 1391232153Smm {"ISO8859-2", 28592}, 1392232153Smm {"ISO8859-3", 28593}, 1393232153Smm {"ISO8859-4", 28594}, 1394232153Smm {"ISO8859-5", 28595}, 1395232153Smm {"ISO8859-6", 28596}, 1396232153Smm {"ISO8859-7", 28597}, 1397232153Smm {"ISO8859-8", 28598}, 1398232153Smm {"ISO8859-9", 28599}, 1399232153Smm 
{"JOHAB", 1361}, 1400232153Smm {"KOI8-R", 20866}, 1401232153Smm {"KOI8-U", 21866}, 1402232153Smm {"KS_C_5601-1987", 949}, 1403232153Smm {"LATIN1", 1252}, 1404232153Smm {"LATIN2", 28592}, 1405232153Smm {"MACINTOSH", 10000}, 1406232153Smm {"SHIFT-JIS", 932}, 1407232153Smm {"SHIFT_JIS", 932}, 1408232153Smm {"SJIS", 932}, 1409232153Smm {"US", 1252}, 1410232153Smm {"US-ASCII", 1252}, 1411232153Smm {"UTF-16", 1200}, 1412232153Smm {"UTF-16BE", 1201}, 1413232153Smm {"UTF-16LE", 1200}, 1414232153Smm {"UTF-8", CP_UTF8}, 1415232153Smm {"X-EUROPA", 29001}, 1416232153Smm {"X-MAC-ARABIC", 10004}, 1417232153Smm {"X-MAC-CE", 10029}, 1418232153Smm {"X-MAC-CHINESEIMP", 10008}, 1419232153Smm {"X-MAC-CHINESETRAD", 10002}, 1420232153Smm {"X-MAC-CROATIAN", 10082}, 1421232153Smm {"X-MAC-CYRILLIC", 10007}, 1422232153Smm {"X-MAC-GREEK", 10006}, 1423232153Smm {"X-MAC-HEBREW", 10005}, 1424232153Smm {"X-MAC-ICELANDIC", 10079}, 1425232153Smm {"X-MAC-JAPANESE", 10001}, 1426232153Smm {"X-MAC-KOREAN", 10003}, 1427232153Smm {"X-MAC-ROMANIAN", 10010}, 1428232153Smm {"X-MAC-THAI", 10021}, 1429232153Smm {"X-MAC-TURKISH", 10081}, 1430232153Smm {"X-MAC-UKRAINIAN", 10017}, 1431232153Smm}; 1432232153Smmstatic unsigned 1433232153Smmmake_codepage_from_charset(const char *charset) 1434232153Smm{ 1435232153Smm char cs[16]; 1436232153Smm char *p; 1437232153Smm unsigned cp; 1438232153Smm int a, b; 1439232153Smm 1440232153Smm if (charset == NULL || strlen(charset) > 15) 1441232153Smm return -1; 1442232153Smm 1443232153Smm /* Copy name to uppercase. */ 1444232153Smm p = cs; 1445232153Smm while (*charset) { 1446232153Smm char c = *charset++; 1447232153Smm if (c >= 'a' && c <= 'z') 1448232153Smm c -= 'a' - 'A'; 1449232153Smm *p++ = c; 1450232153Smm } 1451232153Smm *p++ = '\0'; 1452232153Smm cp = -1; 1453232153Smm 1454232153Smm /* Look it up in the table first, so that we can easily 1455232153Smm * override CP367, which we map to 1252 instead of 367. 
*/ 1456232153Smm a = 0; 1457232153Smm b = sizeof(charsets)/sizeof(charsets[0]); 1458232153Smm while (b > a) { 1459232153Smm int c = (b + a) / 2; 1460232153Smm int r = strcmp(charsets[c].name, cs); 1461232153Smm if (r < 0) 1462232153Smm a = c + 1; 1463232153Smm else if (r > 0) 1464232153Smm b = c; 1465232153Smm else 1466232153Smm return charsets[c].cp; 1467232153Smm } 1468232153Smm 1469232153Smm /* If it's not in the table, try to parse it. */ 1470232153Smm switch (*cs) { 1471232153Smm case 'C': 1472232153Smm if (cs[1] == 'P' && cs[2] >= '0' && cs[2] <= '9') { 1473232153Smm cp = my_atoi(cs + 2); 1474232153Smm } else if (strcmp(cs, "CP_ACP") == 0) 1475232153Smm cp = get_current_codepage(); 1476232153Smm else if (strcmp(cs, "CP_OEMCP") == 0) 1477232153Smm cp = get_current_oemcp(); 1478232153Smm break; 1479232153Smm case 'I': 1480232153Smm if (cs[1] == 'B' && cs[2] == 'M' && 1481232153Smm cs[3] >= '0' && cs[3] <= '9') { 1482232153Smm cp = my_atoi(cs + 3); 1483232153Smm } 1484232153Smm break; 1485232153Smm case 'W': 1486232153Smm if (strncmp(cs, "WINDOWS-", 8) == 0) { 1487232153Smm cp = my_atoi(cs + 8); 1488232153Smm if (cp != 874 && (cp < 1250 || cp > 1258)) 1489232153Smm cp = -1;/* This may invalid code. */ 1490232153Smm } 1491232153Smm break; 1492232153Smm } 1493232153Smm return (cp); 1494232153Smm} 1495232153Smm 1496232153Smm/* 1497232153Smm * Return ANSI Code Page of current locale set by setlocale(). 
1498232153Smm */ 1499232153Smmstatic unsigned 1500232153Smmget_current_codepage(void) 1501232153Smm{ 1502232153Smm char *locale, *p; 1503232153Smm unsigned cp; 1504232153Smm 1505232153Smm locale = setlocale(LC_CTYPE, NULL); 1506232153Smm if (locale == NULL) 1507232153Smm return (GetACP()); 1508232153Smm if (locale[0] == 'C' && locale[1] == '\0') 1509232153Smm return (CP_C_LOCALE); 1510232153Smm p = strrchr(locale, '.'); 1511232153Smm if (p == NULL) 1512232153Smm return (GetACP()); 1513232153Smm cp = my_atoi(p+1); 1514232153Smm if (cp <= 0) 1515232153Smm return (GetACP()); 1516232153Smm return (cp); 1517232153Smm} 1518232153Smm 1519232153Smm/* 1520232153Smm * Translation table between Locale Name and ACP/OEMCP. 1521232153Smm */ 1522232153Smmstatic struct { 1523232153Smm unsigned acp; 1524232153Smm unsigned ocp; 1525232153Smm const char *locale; 1526232153Smm} acp_ocp_map[] = { 1527232153Smm { 950, 950, "Chinese_Taiwan" }, 1528232153Smm { 936, 936, "Chinese_People's Republic of China" }, 1529232153Smm { 950, 950, "Chinese_Taiwan" }, 1530232153Smm { 1250, 852, "Czech_Czech Republic" }, 1531232153Smm { 1252, 850, "Danish_Denmark" }, 1532232153Smm { 1252, 850, "Dutch_Netherlands" }, 1533232153Smm { 1252, 850, "Dutch_Belgium" }, 1534232153Smm { 1252, 437, "English_United States" }, 1535232153Smm { 1252, 850, "English_Australia" }, 1536232153Smm { 1252, 850, "English_Canada" }, 1537232153Smm { 1252, 850, "English_New Zealand" }, 1538232153Smm { 1252, 850, "English_United Kingdom" }, 1539232153Smm { 1252, 437, "English_United States" }, 1540232153Smm { 1252, 850, "Finnish_Finland" }, 1541232153Smm { 1252, 850, "French_France" }, 1542232153Smm { 1252, 850, "French_Belgium" }, 1543232153Smm { 1252, 850, "French_Canada" }, 1544232153Smm { 1252, 850, "French_Switzerland" }, 1545232153Smm { 1252, 850, "German_Germany" }, 1546232153Smm { 1252, 850, "German_Austria" }, 1547232153Smm { 1252, 850, "German_Switzerland" }, 1548232153Smm { 1253, 737, "Greek_Greece" }, 1549232153Smm { 
1250, 852, "Hungarian_Hungary" }, 1550232153Smm { 1252, 850, "Icelandic_Iceland" }, 1551232153Smm { 1252, 850, "Italian_Italy" }, 1552232153Smm { 1252, 850, "Italian_Switzerland" }, 1553232153Smm { 932, 932, "Japanese_Japan" }, 1554232153Smm { 949, 949, "Korean_Korea" }, 1555232153Smm { 1252, 850, "Norwegian (BokmOl)_Norway" }, 1556232153Smm { 1252, 850, "Norwegian (BokmOl)_Norway" }, 1557232153Smm { 1252, 850, "Norwegian-Nynorsk_Norway" }, 1558232153Smm { 1250, 852, "Polish_Poland" }, 1559232153Smm { 1252, 850, "Portuguese_Portugal" }, 1560232153Smm { 1252, 850, "Portuguese_Brazil" }, 1561232153Smm { 1251, 866, "Russian_Russia" }, 1562232153Smm { 1250, 852, "Slovak_Slovakia" }, 1563232153Smm { 1252, 850, "Spanish_Spain" }, 1564232153Smm { 1252, 850, "Spanish_Mexico" }, 1565232153Smm { 1252, 850, "Spanish_Spain" }, 1566232153Smm { 1252, 850, "Swedish_Sweden" }, 1567232153Smm { 1254, 857, "Turkish_Turkey" }, 1568232153Smm { 0, 0, NULL} 1569232153Smm}; 1570232153Smm 1571232153Smm/* 1572232153Smm * Return OEM Code Page of current locale set by setlocale(). 1573232153Smm */ 1574232153Smmstatic unsigned 1575232153Smmget_current_oemcp(void) 1576232153Smm{ 1577232153Smm int i; 1578232153Smm char *locale, *p; 1579232153Smm size_t len; 1580232153Smm 1581232153Smm locale = setlocale(LC_CTYPE, NULL); 1582232153Smm if (locale == NULL) 1583232153Smm return (GetOEMCP()); 1584232153Smm if (locale[0] == 'C' && locale[1] == '\0') 1585232153Smm return (CP_C_LOCALE); 1586232153Smm 1587232153Smm p = strrchr(locale, '.'); 1588232153Smm if (p == NULL) 1589232153Smm return (GetOEMCP()); 1590232153Smm len = p - locale; 1591232153Smm for (i = 0; acp_ocp_map[i].acp; i++) { 1592232153Smm if (strncmp(acp_ocp_map[i].locale, locale, len) == 0) 1593232153Smm return (acp_ocp_map[i].ocp); 1594232153Smm } 1595232153Smm return (GetOEMCP()); 1596232153Smm} 1597232153Smm#else 1598232153Smm 1599232153Smm/* 1600232153Smm * POSIX platform does not use CodePage. 
1601232153Smm */ 1602232153Smm 1603232153Smmstatic unsigned 1604232153Smmget_current_codepage(void) 1605232153Smm{ 1606232153Smm return (-1);/* Unknown */ 1607232153Smm} 1608232153Smmstatic unsigned 1609232153Smmmake_codepage_from_charset(const char *charset) 1610232153Smm{ 1611232153Smm (void)charset; /* UNUSED */ 1612232153Smm return (-1);/* Unknown */ 1613232153Smm} 1614232153Smmstatic unsigned 1615232153Smmget_current_oemcp(void) 1616232153Smm{ 1617232153Smm return (-1);/* Unknown */ 1618232153Smm} 1619232153Smm 1620232153Smm#endif /* defined(_WIN32) && !defined(__CYGWIN__) */ 1621232153Smm 1622232153Smm/* 1623232153Smm * Return a string conversion object. 1624232153Smm */ 1625232153Smmstatic struct archive_string_conv * 1626232153Smmget_sconv_object(struct archive *a, const char *fc, const char *tc, int flag) 1627232153Smm{ 1628232153Smm struct archive_string_conv *sc; 1629232153Smm unsigned current_codepage; 1630232153Smm 1631232153Smm /* Check if we have made the sconv object. */ 1632232153Smm sc = find_sconv_object(a, fc, tc); 1633232153Smm if (sc != NULL) 1634232153Smm return (sc); 1635232153Smm 1636232153Smm if (a == NULL) 1637232153Smm current_codepage = get_current_codepage(); 1638232153Smm else 1639232153Smm current_codepage = a->current_codepage; 1640232153Smm 1641232153Smm sc = create_sconv_object(canonical_charset_name(fc), 1642232153Smm canonical_charset_name(tc), current_codepage, flag); 1643232153Smm if (sc == NULL) { 1644232153Smm if (a != NULL) 1645232153Smm archive_set_error(a, ENOMEM, 1646232153Smm "Could not allocate memory for " 1647232153Smm "a string conversion object"); 1648232153Smm return (NULL); 1649232153Smm } 1650232153Smm 1651232153Smm /* 1652232153Smm * If there is no converter for current string conversion object, 1653232153Smm * we cannot handle this conversion. 
1654232153Smm */ 1655232153Smm if (sc->nconverter == 0) { 1656232153Smm if (a != NULL) { 1657232153Smm#if HAVE_ICONV 1658232153Smm archive_set_error(a, ARCHIVE_ERRNO_MISC, 1659232153Smm "iconv_open failed : Cannot handle ``%s''", 1660232153Smm (flag & SCONV_TO_CHARSET)?tc:fc); 1661232153Smm#else 1662232153Smm archive_set_error(a, ARCHIVE_ERRNO_MISC, 1663232153Smm "A character-set conversion not fully supported " 1664232153Smm "on this platform"); 1665232153Smm#endif 1666232153Smm } 1667232153Smm /* Failed; free a sconv object. */ 1668232153Smm free_sconv_object(sc); 1669232153Smm return (NULL); 1670232153Smm } 1671232153Smm 1672232153Smm /* 1673232153Smm * Success! 1674232153Smm */ 1675232153Smm if (a != NULL) 1676232153Smm add_sconv_object(a, sc); 1677232153Smm return (sc); 1678232153Smm} 1679232153Smm 1680232153Smmstatic const char * 1681232153Smmget_current_charset(struct archive *a) 1682232153Smm{ 1683232153Smm const char *cur_charset; 1684232153Smm 1685232153Smm if (a == NULL) 1686232153Smm cur_charset = default_iconv_charset(""); 1687232153Smm else { 1688232153Smm cur_charset = default_iconv_charset(a->current_code); 1689232153Smm if (a->current_code == NULL) { 1690232153Smm a->current_code = strdup(cur_charset); 1691232153Smm a->current_codepage = get_current_codepage(); 1692232153Smm a->current_oemcp = get_current_oemcp(); 1693232153Smm } 1694232153Smm } 1695232153Smm return (cur_charset); 1696232153Smm} 1697232153Smm 1698232153Smm/* 1699232153Smm * Make and Return a string conversion object. 1700232153Smm * Return NULL if the platform does not support the specified conversion 1701232153Smm * and best_effort is 0. 1702232153Smm * If best_effort is set, A string conversion object must be returned 1703232153Smm * unless memory allocation for the object fails, but the conversion 1704232153Smm * might fail when non-ASCII code is found. 
1705232153Smm */ 1706232153Smmstruct archive_string_conv * 1707232153Smmarchive_string_conversion_to_charset(struct archive *a, const char *charset, 1708232153Smm int best_effort) 1709232153Smm{ 1710232153Smm int flag = SCONV_TO_CHARSET; 1711232153Smm 1712232153Smm if (best_effort) 1713232153Smm flag |= SCONV_BEST_EFFORT; 1714232153Smm return (get_sconv_object(a, get_current_charset(a), charset, flag)); 1715232153Smm} 1716232153Smm 1717232153Smmstruct archive_string_conv * 1718232153Smmarchive_string_conversion_from_charset(struct archive *a, const char *charset, 1719232153Smm int best_effort) 1720232153Smm{ 1721232153Smm int flag = SCONV_FROM_CHARSET; 1722232153Smm 1723232153Smm if (best_effort) 1724232153Smm flag |= SCONV_BEST_EFFORT; 1725232153Smm return (get_sconv_object(a, charset, get_current_charset(a), flag)); 1726232153Smm} 1727232153Smm 1728232153Smm/* 1729232153Smm * archive_string_default_conversion_*_archive() are provided for Windows 1730232153Smm * platform because other archiver application use CP_OEMCP for 1731232153Smm * MultiByteToWideChar() and WideCharToMultiByte() for the filenames 1732232153Smm * in tar or zip files. But mbstowcs/wcstombs(CRT) usually use CP_ACP 1733232153Smm * unless you use setlocale(LC_ALL, ".OCP")(specify CP_OEMCP). 1734232153Smm * So we should make a string conversion between CP_ACP and CP_OEMCP 1735313570Smm * for compatibility. 1736232153Smm */ 1737232153Smm#if defined(_WIN32) && !defined(__CYGWIN__) 1738232153Smmstruct archive_string_conv * 1739232153Smmarchive_string_default_conversion_for_read(struct archive *a) 1740232153Smm{ 1741232153Smm const char *cur_charset = get_current_charset(a); 1742232153Smm char oemcp[16]; 1743232153Smm 1744232153Smm /* NOTE: a check of cur_charset is unneeded but we need 1745232153Smm * that get_current_charset() has been surely called at 1746232153Smm * this time whatever C compiler optimized. 
*/ 1747232153Smm if (cur_charset != NULL && 1748232153Smm (a->current_codepage == CP_C_LOCALE || 1749232153Smm a->current_codepage == a->current_oemcp)) 1750232153Smm return (NULL);/* no conversion. */ 1751232153Smm 1752232153Smm _snprintf(oemcp, sizeof(oemcp)-1, "CP%d", a->current_oemcp); 1753232153Smm /* Make sure a null termination must be set. */ 1754232153Smm oemcp[sizeof(oemcp)-1] = '\0'; 1755232153Smm return (get_sconv_object(a, oemcp, cur_charset, 1756232153Smm SCONV_FROM_CHARSET)); 1757232153Smm} 1758232153Smm 1759232153Smmstruct archive_string_conv * 1760232153Smmarchive_string_default_conversion_for_write(struct archive *a) 1761232153Smm{ 1762232153Smm const char *cur_charset = get_current_charset(a); 1763232153Smm char oemcp[16]; 1764232153Smm 1765232153Smm /* NOTE: a check of cur_charset is unneeded but we need 1766232153Smm * that get_current_charset() has been surely called at 1767232153Smm * this time whatever C compiler optimized. */ 1768232153Smm if (cur_charset != NULL && 1769232153Smm (a->current_codepage == CP_C_LOCALE || 1770232153Smm a->current_codepage == a->current_oemcp)) 1771232153Smm return (NULL);/* no conversion. */ 1772232153Smm 1773232153Smm _snprintf(oemcp, sizeof(oemcp)-1, "CP%d", a->current_oemcp); 1774232153Smm /* Make sure a null termination must be set. 
*/ 1775232153Smm oemcp[sizeof(oemcp)-1] = '\0'; 1776232153Smm return (get_sconv_object(a, cur_charset, oemcp, 1777232153Smm SCONV_TO_CHARSET)); 1778232153Smm} 1779232153Smm#else 1780232153Smmstruct archive_string_conv * 1781232153Smmarchive_string_default_conversion_for_read(struct archive *a) 1782232153Smm{ 1783232153Smm (void)a; /* UNUSED */ 1784232153Smm return (NULL); 1785232153Smm} 1786232153Smm 1787232153Smmstruct archive_string_conv * 1788232153Smmarchive_string_default_conversion_for_write(struct archive *a) 1789232153Smm{ 1790232153Smm (void)a; /* UNUSED */ 1791232153Smm return (NULL); 1792232153Smm} 1793232153Smm#endif 1794232153Smm 1795232153Smm/* 1796232153Smm * Dispose of all character conversion objects in the archive object. 1797232153Smm */ 1798232153Smmvoid 1799232153Smmarchive_string_conversion_free(struct archive *a) 1800232153Smm{ 1801232153Smm struct archive_string_conv *sc; 1802232153Smm struct archive_string_conv *sc_next; 1803232153Smm 1804232153Smm for (sc = a->sconv; sc != NULL; sc = sc_next) { 1805232153Smm sc_next = sc->next; 1806232153Smm free_sconv_object(sc); 1807232153Smm } 1808232153Smm a->sconv = NULL; 1809232153Smm free(a->current_code); 1810232153Smm a->current_code = NULL; 1811232153Smm} 1812232153Smm 1813232153Smm/* 1814232153Smm * Return a conversion charset name. 1815232153Smm */ 1816232153Smmconst char * 1817232153Smmarchive_string_conversion_charset_name(struct archive_string_conv *sc) 1818232153Smm{ 1819232153Smm if (sc->flag & SCONV_TO_CHARSET) 1820232153Smm return (sc->to_charset); 1821232153Smm else 1822232153Smm return (sc->from_charset); 1823232153Smm} 1824232153Smm 1825232153Smm/* 1826232153Smm * Change the behavior of a string conversion. 
1827232153Smm */ 1828232153Smmvoid 1829232153Smmarchive_string_conversion_set_opt(struct archive_string_conv *sc, int opt) 1830232153Smm{ 1831232153Smm switch (opt) { 1832232153Smm /* 1833232153Smm * A filename in UTF-8 was made with libarchive 2.x in a wrong 1834232153Smm * assumption that wchar_t was Unicode. 1835232153Smm * This option enables simulating the assumption in order to read 1836311041Smm * that filename correctly. 1837232153Smm */ 1838232153Smm case SCONV_SET_OPT_UTF8_LIBARCHIVE2X: 1839232153Smm#if (defined(_WIN32) && !defined(__CYGWIN__)) \ 1840232153Smm || defined(__STDC_ISO_10646__) || defined(__APPLE__) 1841232153Smm /* 1842232153Smm * Nothing to do for it since wchar_t on these platforms 1843232153Smm * is really Unicode. 1844232153Smm */ 1845232153Smm (void)sc; /* UNUSED */ 1846232153Smm#else 1847232153Smm if ((sc->flag & SCONV_UTF8_LIBARCHIVE_2) == 0) { 1848232153Smm sc->flag |= SCONV_UTF8_LIBARCHIVE_2; 1849238856Smm /* Set up string converters. */ 1850232153Smm setup_converter(sc); 1851232153Smm } 1852232153Smm#endif 1853232153Smm break; 1854238856Smm case SCONV_SET_OPT_NORMALIZATION_C: 1855238856Smm if ((sc->flag & SCONV_NORMALIZATION_C) == 0) { 1856238856Smm sc->flag |= SCONV_NORMALIZATION_C; 1857238856Smm sc->flag &= ~SCONV_NORMALIZATION_D; 1858238856Smm /* Set up string converters. */ 1859238856Smm setup_converter(sc); 1860238856Smm } 1861238856Smm break; 1862238856Smm case SCONV_SET_OPT_NORMALIZATION_D: 1863238856Smm#if defined(HAVE_ICONV) 1864238856Smm /* 1865238856Smm * If iconv will take the string, do not change the 1866238856Smm * setting of the normalization. 
1867238856Smm */ 1868238856Smm if (!(sc->flag & SCONV_WIN_CP) && 1869238856Smm (sc->flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8)) && 1870238856Smm !(sc->flag & (SCONV_TO_UTF16 | SCONV_TO_UTF8))) 1871238856Smm break; 1872238856Smm#endif 1873238856Smm if ((sc->flag & SCONV_NORMALIZATION_D) == 0) { 1874238856Smm sc->flag |= SCONV_NORMALIZATION_D; 1875238856Smm sc->flag &= ~SCONV_NORMALIZATION_C; 1876238856Smm /* Set up string converters. */ 1877238856Smm setup_converter(sc); 1878238856Smm } 1879238856Smm break; 1880232153Smm default: 1881232153Smm break; 1882232153Smm } 1883232153Smm} 1884232153Smm 1885232153Smm/* 1886232153Smm * 1887232153Smm * Copy one archive_string to another in locale conversion. 1888232153Smm * 1889238856Smm * archive_strncat_l(); 1890238856Smm * archive_strncpy_l(); 1891232153Smm * 1892232153Smm */ 1893232153Smm 1894232153Smmstatic size_t 1895232153Smmmbsnbytes(const void *_p, size_t n) 1896232153Smm{ 1897232153Smm size_t s; 1898232153Smm const char *p, *pp; 1899232153Smm 1900232153Smm if (_p == NULL) 1901232153Smm return (0); 1902232153Smm p = (const char *)_p; 1903232153Smm 1904232153Smm /* Like strlen(p), except won't examine positions beyond p[n]. */ 1905232153Smm s = 0; 1906232153Smm pp = p; 1907232153Smm while (s < n && *pp) { 1908232153Smm pp++; 1909232153Smm s++; 1910232153Smm } 1911232153Smm return (s); 1912232153Smm} 1913232153Smm 1914232153Smmstatic size_t 1915232153Smmutf16nbytes(const void *_p, size_t n) 1916232153Smm{ 1917232153Smm size_t s; 1918232153Smm const char *p, *pp; 1919232153Smm 1920232153Smm if (_p == NULL) 1921232153Smm return (0); 1922232153Smm p = (const char *)_p; 1923232153Smm 1924232153Smm /* Like strlen(p), except won't examine positions beyond p[n]. 
*/ 1925232153Smm s = 0; 1926232153Smm pp = p; 1927232153Smm n >>= 1; 1928232153Smm while (s < n && (pp[0] || pp[1])) { 1929232153Smm pp += 2; 1930232153Smm s++; 1931232153Smm } 1932232153Smm return (s<<1); 1933232153Smm} 1934232153Smm 1935232153Smmint 1936238856Smmarchive_strncpy_l(struct archive_string *as, const void *_p, size_t n, 1937232153Smm struct archive_string_conv *sc) 1938232153Smm{ 1939232153Smm as->length = 0; 1940238856Smm return (archive_strncat_l(as, _p, n, sc)); 1941232153Smm} 1942232153Smm 1943232153Smmint 1944238856Smmarchive_strncat_l(struct archive_string *as, const void *_p, size_t n, 1945232153Smm struct archive_string_conv *sc) 1946232153Smm{ 1947232153Smm const void *s; 1948311041Smm size_t length = 0; 1949232153Smm int i, r = 0, r2; 1950232153Smm 1951311041Smm if (_p != NULL && n > 0) { 1952311041Smm if (sc != NULL && (sc->flag & SCONV_FROM_UTF16)) 1953311041Smm length = utf16nbytes(_p, n); 1954311041Smm else 1955311041Smm length = mbsnbytes(_p, n); 1956311041Smm } 1957311041Smm 1958232153Smm /* We must allocate memory even if there is no data for conversion 1959232153Smm * or copy. This simulates archive_string_append behavior. */ 1960311041Smm if (length == 0) { 1961232153Smm int tn = 1; 1962232153Smm if (sc != NULL && (sc->flag & SCONV_TO_UTF16)) 1963232153Smm tn = 2; 1964232153Smm if (archive_string_ensure(as, as->length + tn) == NULL) 1965232153Smm return (-1); 1966232153Smm as->s[as->length] = 0; 1967232153Smm if (tn == 2) 1968232153Smm as->s[as->length+1] = 0; 1969232153Smm return (0); 1970232153Smm } 1971232153Smm 1972232153Smm /* 1973232153Smm * If sc is NULL, we just make a copy. 
1974232153Smm */ 1975232153Smm if (sc == NULL) { 1976232153Smm if (archive_string_append(as, _p, length) == NULL) 1977232153Smm return (-1);/* No memory */ 1978232153Smm return (0); 1979232153Smm } 1980232153Smm 1981232153Smm s = _p; 1982232153Smm i = 0; 1983232153Smm if (sc->nconverter > 1) { 1984232153Smm sc->utftmp.length = 0; 1985232153Smm r2 = sc->converter[0](&(sc->utftmp), s, length, sc); 1986232153Smm if (r2 != 0 && errno == ENOMEM) 1987232153Smm return (r2); 1988232153Smm if (r > r2) 1989232153Smm r = r2; 1990232153Smm s = sc->utftmp.s; 1991232153Smm length = sc->utftmp.length; 1992232153Smm ++i; 1993232153Smm } 1994232153Smm r2 = sc->converter[i](as, s, length, sc); 1995232153Smm if (r > r2) 1996232153Smm r = r2; 1997232153Smm return (r); 1998232153Smm} 1999232153Smm 2000232153Smm#if HAVE_ICONV 2001232153Smm 2002232153Smm/* 2003311041Smm * Return -1 if conversion fails. 2004232153Smm */ 2005232153Smmstatic int 2006232153Smmiconv_strncat_in_locale(struct archive_string *as, const void *_p, 2007232153Smm size_t length, struct archive_string_conv *sc) 2008232153Smm{ 2009248616Smm ICONV_CONST char *itp; 2010232153Smm size_t remaining; 2011232153Smm iconv_t cd; 2012232153Smm char *outp; 2013232153Smm size_t avail, bs; 2014232153Smm int return_value = 0; /* success */ 2015232153Smm int to_size, from_size; 2016232153Smm 2017232153Smm if (sc->flag & SCONV_TO_UTF16) 2018232153Smm to_size = 2; 2019232153Smm else 2020232153Smm to_size = 1; 2021232153Smm if (sc->flag & SCONV_FROM_UTF16) 2022232153Smm from_size = 2; 2023232153Smm else 2024232153Smm from_size = 1; 2025232153Smm 2026232153Smm if (archive_string_ensure(as, as->length + length*2+to_size) == NULL) 2027232153Smm return (-1); 2028232153Smm 2029232153Smm cd = sc->cd; 2030248616Smm itp = (char *)(uintptr_t)_p; 2031232153Smm remaining = length; 2032232153Smm outp = as->s + as->length; 2033232153Smm avail = as->buffer_length - as->length - to_size; 2034232153Smm while (remaining >= (size_t)from_size) { 
2035248616Smm size_t result = iconv(cd, &itp, &remaining, &outp, &avail); 2036232153Smm 2037232153Smm if (result != (size_t)-1) 2038232153Smm break; /* Conversion completed. */ 2039232153Smm 2040232153Smm if (errno == EILSEQ || errno == EINVAL) { 2041232153Smm /* 2042232153Smm * If an output charset is UTF-8 or UTF-16BE/LE, 2043232153Smm * unknown character should be U+FFFD 2044232153Smm * (replacement character). 2045232153Smm */ 2046232153Smm if (sc->flag & (SCONV_TO_UTF8 | SCONV_TO_UTF16)) { 2047232153Smm size_t rbytes; 2048232153Smm if (sc->flag & SCONV_TO_UTF8) 2049299529Smm rbytes = sizeof(utf8_replacement_char); 2050232153Smm else 2051232153Smm rbytes = 2; 2052232153Smm 2053232153Smm if (avail < rbytes) { 2054232153Smm as->length = outp - as->s; 2055232153Smm bs = as->buffer_length + 2056232153Smm (remaining * to_size) + rbytes; 2057232153Smm if (NULL == 2058232153Smm archive_string_ensure(as, bs)) 2059232153Smm return (-1); 2060232153Smm outp = as->s + as->length; 2061232153Smm avail = as->buffer_length 2062232153Smm - as->length - to_size; 2063232153Smm } 2064232153Smm if (sc->flag & SCONV_TO_UTF8) 2065299529Smm memcpy(outp, utf8_replacement_char, sizeof(utf8_replacement_char)); 2066232153Smm else if (sc->flag & SCONV_TO_UTF16BE) 2067232153Smm archive_be16enc(outp, UNICODE_R_CHAR); 2068232153Smm else 2069232153Smm archive_le16enc(outp, UNICODE_R_CHAR); 2070232153Smm outp += rbytes; 2071232153Smm avail -= rbytes; 2072232153Smm } else { 2073232153Smm /* Skip the illegal input bytes. */ 2074232153Smm *outp++ = '?'; 2075232153Smm avail--; 2076232153Smm } 2077248616Smm itp += from_size; 2078232153Smm remaining -= from_size; 2079232153Smm return_value = -1; /* failure */ 2080228753Smm } else { 2081232153Smm /* E2BIG no output buffer, 2082232153Smm * Increase an output buffer. 
*/ 2083232153Smm as->length = outp - as->s; 2084232153Smm bs = as->buffer_length + remaining * 2; 2085232153Smm if (NULL == archive_string_ensure(as, bs)) 2086232153Smm return (-1); 2087232153Smm outp = as->s + as->length; 2088232153Smm avail = as->buffer_length - as->length - to_size; 2089228753Smm } 2090228753Smm } 2091232153Smm as->length = outp - as->s; 2092232153Smm as->s[as->length] = 0; 2093232153Smm if (to_size == 2) 2094232153Smm as->s[as->length+1] = 0; 2095232153Smm return (return_value); 2096228753Smm} 2097228753Smm 2098232153Smm#endif /* HAVE_ICONV */ 2099232153Smm 2100232153Smm 2101232153Smm#if defined(_WIN32) && !defined(__CYGWIN__) 2102232153Smm 2103232153Smm/* 2104232153Smm * Translate a string from a some CodePage to an another CodePage by 2105311041Smm * Windows APIs, and copy the result. Return -1 if conversion fails. 2106232153Smm */ 2107228753Smmstatic int 2108232153Smmstrncat_in_codepage(struct archive_string *as, 2109232153Smm const void *_p, size_t length, struct archive_string_conv *sc) 2110228753Smm{ 2111232153Smm const char *s = (const char *)_p; 2112232153Smm struct archive_wstring aws; 2113232153Smm size_t l; 2114232153Smm int r, saved_flag; 2115228753Smm 2116232153Smm archive_string_init(&aws); 2117232153Smm saved_flag = sc->flag; 2118232153Smm sc->flag &= ~(SCONV_NORMALIZATION_D | SCONV_NORMALIZATION_C); 2119232153Smm r = archive_wstring_append_from_mbs_in_codepage(&aws, s, length, sc); 2120232153Smm sc->flag = saved_flag; 2121232153Smm if (r != 0) { 2122232153Smm archive_wstring_free(&aws); 2123232153Smm if (errno != ENOMEM) 2124232153Smm archive_string_append(as, s, length); 2125232153Smm return (-1); 2126232153Smm } 2127232153Smm 2128232153Smm l = as->length; 2129232153Smm r = archive_string_append_from_wcs_in_codepage( 2130232153Smm as, aws.s, aws.length, sc); 2131232153Smm if (r != 0 && errno != ENOMEM && l == as->length) 2132232153Smm archive_string_append(as, s, length); 2133232153Smm archive_wstring_free(&aws); 2134232153Smm 
return (r); 2135232153Smm} 2136232153Smm 2137232153Smm/* 2138232153Smm * Test whether MBS ==> WCS is okay. 2139232153Smm */ 2140232153Smmstatic int 2141232153Smminvalid_mbs(const void *_p, size_t n, struct archive_string_conv *sc) 2142232153Smm{ 2143232153Smm const char *p = (const char *)_p; 2144232153Smm unsigned codepage; 2145232153Smm DWORD mbflag = MB_ERR_INVALID_CHARS; 2146232153Smm 2147232153Smm if (sc->flag & SCONV_FROM_CHARSET) 2148232153Smm codepage = sc->to_cp; 2149232153Smm else 2150232153Smm codepage = sc->from_cp; 2151232153Smm 2152232153Smm if (codepage == CP_C_LOCALE) 2153232153Smm return (0); 2154232153Smm if (codepage != CP_UTF8) 2155232153Smm mbflag |= MB_PRECOMPOSED; 2156232153Smm 2157248616Smm if (MultiByteToWideChar(codepage, mbflag, p, (int)n, NULL, 0) == 0) 2158232153Smm return (-1); /* Invalid */ 2159232153Smm return (0); /* Okay */ 2160232153Smm} 2161232153Smm 2162232153Smm#else 2163232153Smm 2164232153Smm/* 2165232153Smm * Test whether MBS ==> WCS is okay. 2166232153Smm */ 2167232153Smmstatic int 2168232153Smminvalid_mbs(const void *_p, size_t n, struct archive_string_conv *sc) 2169232153Smm{ 2170232153Smm const char *p = (const char *)_p; 2171232153Smm size_t r; 2172232153Smm 2173232153Smm#if HAVE_MBRTOWC 2174232153Smm mbstate_t shift_state; 2175232153Smm 2176232153Smm memset(&shift_state, 0, sizeof(shift_state)); 2177232153Smm#else 2178232153Smm /* Clear the shift state before starting. */ 2179232153Smm mbtowc(NULL, NULL, 0); 2180232153Smm#endif 2181232153Smm while (n) { 2182232153Smm wchar_t wc; 2183232153Smm 2184232153Smm#if HAVE_MBRTOWC 2185232153Smm r = mbrtowc(&wc, p, n, &shift_state); 2186232153Smm#else 2187232153Smm r = mbtowc(&wc, p, n); 2188232153Smm#endif 2189232153Smm if (r == (size_t)-1 || r == (size_t)-2) 2190232153Smm return (-1);/* Invalid. 
*/ 2191232153Smm if (r == 0) 2192232153Smm break; 2193232153Smm p += r; 2194232153Smm n -= r; 2195232153Smm } 2196238856Smm (void)sc; /* UNUSED */ 2197232153Smm return (0); /* All Okey. */ 2198232153Smm} 2199232153Smm 2200232153Smm#endif /* defined(_WIN32) && !defined(__CYGWIN__) */ 2201232153Smm 2202232153Smm/* 2203232153Smm * Basically returns -1 because we cannot make a conversion of charset 2204232153Smm * without iconv but in some cases this would return 0. 2205232153Smm * Returns 0 if all copied characters are ASCII. 2206232153Smm * Returns 0 if both from-locale and to-locale are the same and those 2207232153Smm * can be WCS with no error. 2208232153Smm */ 2209232153Smmstatic int 2210232153Smmbest_effort_strncat_in_locale(struct archive_string *as, const void *_p, 2211232153Smm size_t length, struct archive_string_conv *sc) 2212232153Smm{ 2213232153Smm size_t remaining; 2214248616Smm const uint8_t *itp; 2215232153Smm int return_value = 0; /* success */ 2216232153Smm 2217232153Smm /* 2218232153Smm * If both from-locale and to-locale is the same, this makes a copy. 2219232153Smm * And then this checks all copied MBS can be WCS if so returns 0. 2220232153Smm */ 2221232153Smm if (sc->same) { 2222232153Smm if (archive_string_append(as, _p, length) == NULL) 2223232153Smm return (-1);/* No memory */ 2224232153Smm return (invalid_mbs(_p, length, sc)); 2225232153Smm } 2226232153Smm 2227232153Smm /* 2228232153Smm * If a character is ASCII, this just copies it. If not, this 2229313570Smm * assigns '?' character instead but in UTF-8 locale this assigns 2230232153Smm * byte sequence 0xEF 0xBD 0xBD, which are code point U+FFFD, 2231232153Smm * a Replacement Character in Unicode. 
2232232153Smm */ 2233232153Smm 2234232153Smm remaining = length; 2235248616Smm itp = (const uint8_t *)_p; 2236248616Smm while (*itp && remaining > 0) { 2237299529Smm if (*itp > 127) { 2238299529Smm // Non-ASCII: Substitute with suitable replacement 2239299529Smm if (sc->flag & SCONV_TO_UTF8) { 2240299529Smm if (archive_string_append(as, utf8_replacement_char, sizeof(utf8_replacement_char)) == NULL) { 2241299529Smm __archive_errx(1, "Out of memory"); 2242299529Smm } 2243299529Smm } else { 2244299529Smm archive_strappend_char(as, '?'); 2245232153Smm } 2246232153Smm return_value = -1; 2247232153Smm } else { 2248299529Smm archive_strappend_char(as, *itp); 2249232153Smm } 2250299529Smm ++itp; 2251232153Smm } 2252232153Smm return (return_value); 2253232153Smm} 2254232153Smm 2255232153Smm 2256232153Smm/* 2257232153Smm * Unicode conversion functions. 2258232153Smm * - UTF-8 <===> UTF-8 in removing surrogate pairs. 2259232153Smm * - UTF-8 NFD ===> UTF-8 NFC in removing surrogate pairs. 2260232153Smm * - UTF-8 made by libarchive 2.x ===> UTF-8. 2261232153Smm * - UTF-16BE <===> UTF-8. 2262232153Smm * 2263232153Smm */ 2264232153Smm 2265232153Smm/* 2266232153Smm * Utility to convert a single UTF-8 sequence. 2267232153Smm * 2268232153Smm * Usually return used bytes, return used byte in negative value when 2269232153Smm * a unicode character is replaced with U+FFFD. 2270232153Smm * See also http://unicode.org/review/pr-121.html Public Review Issue #121 2271232153Smm * Recommended Practice for Replacement Characters. 
2272232153Smm */ 2273232153Smmstatic int 2274232153Smm_utf8_to_unicode(uint32_t *pwc, const char *s, size_t n) 2275232153Smm{ 2276232153Smm static const char utf8_count[256] = { 2277232153Smm 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 00 - 0F */ 2278232153Smm 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 10 - 1F */ 2279232153Smm 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 20 - 2F */ 2280232153Smm 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 30 - 3F */ 2281232153Smm 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 40 - 4F */ 2282232153Smm 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 50 - 5F */ 2283232153Smm 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 60 - 6F */ 2284232153Smm 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 70 - 7F */ 2285232153Smm 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* 80 - 8F */ 2286232153Smm 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* 90 - 9F */ 2287232153Smm 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* A0 - AF */ 2288232153Smm 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* B0 - BF */ 2289232153Smm 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,/* C0 - CF */ 2290232153Smm 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,/* D0 - DF */ 2291232153Smm 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,/* E0 - EF */ 2292232153Smm 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0 - FF */ 2293232153Smm }; 2294232153Smm int ch, i; 2295232153Smm int cnt; 2296232153Smm uint32_t wc; 2297232153Smm 2298232153Smm /* Sanity check. */ 2299232153Smm if (n == 0) 2300232153Smm return (0); 2301232153Smm /* 2302228753Smm * Decode 1-4 bytes depending on the value of the first byte. 2303228753Smm */ 2304232153Smm ch = (unsigned char)*s; 2305232153Smm if (ch == 0) 2306228753Smm return (0); /* Standard: return 0 for end-of-string. */ 2307232153Smm cnt = utf8_count[ch]; 2308232153Smm 2309311041Smm /* Invalid sequence or there are not plenty bytes. 
*/ 2310232153Smm if ((int)n < cnt) { 2311248616Smm cnt = (int)n; 2312232153Smm for (i = 1; i < cnt; i++) { 2313232153Smm if ((s[i] & 0xc0) != 0x80) { 2314232153Smm cnt = i; 2315232153Smm break; 2316232153Smm } 2317232153Smm } 2318232153Smm goto invalid_sequence; 2319228753Smm } 2320232153Smm 2321232153Smm /* Make a Unicode code point from a single UTF-8 sequence. */ 2322232153Smm switch (cnt) { 2323232153Smm case 1: /* 1 byte sequence. */ 2324232153Smm *pwc = ch & 0x7f; 2325232153Smm return (cnt); 2326232153Smm case 2: /* 2 bytes sequence. */ 2327232153Smm if ((s[1] & 0xc0) != 0x80) { 2328232153Smm cnt = 1; 2329232153Smm goto invalid_sequence; 2330232153Smm } 2331232153Smm *pwc = ((ch & 0x1f) << 6) | (s[1] & 0x3f); 2332232153Smm return (cnt); 2333232153Smm case 3: /* 3 bytes sequence. */ 2334232153Smm if ((s[1] & 0xc0) != 0x80) { 2335232153Smm cnt = 1; 2336232153Smm goto invalid_sequence; 2337232153Smm } 2338232153Smm if ((s[2] & 0xc0) != 0x80) { 2339232153Smm cnt = 2; 2340232153Smm goto invalid_sequence; 2341232153Smm } 2342232153Smm wc = ((ch & 0x0f) << 12) 2343228753Smm | ((s[1] & 0x3f) << 6) 2344228753Smm | (s[2] & 0x3f); 2345232153Smm if (wc < 0x800) 2346232153Smm goto invalid_sequence;/* Overlong sequence. */ 2347232153Smm break; 2348232153Smm case 4: /* 4 bytes sequence. */ 2349232153Smm if ((s[1] & 0xc0) != 0x80) { 2350232153Smm cnt = 1; 2351232153Smm goto invalid_sequence; 2352232153Smm } 2353232153Smm if ((s[2] & 0xc0) != 0x80) { 2354232153Smm cnt = 2; 2355232153Smm goto invalid_sequence; 2356232153Smm } 2357232153Smm if ((s[3] & 0xc0) != 0x80) { 2358232153Smm cnt = 3; 2359232153Smm goto invalid_sequence; 2360232153Smm } 2361232153Smm wc = ((ch & 0x07) << 18) 2362228753Smm | ((s[1] & 0x3f) << 12) 2363228753Smm | ((s[2] & 0x3f) << 6) 2364228753Smm | (s[3] & 0x3f); 2365232153Smm if (wc < 0x10000) 2366232153Smm goto invalid_sequence;/* Overlong sequence. */ 2367232153Smm break; 2368232153Smm default: /* Others are all invalid sequence. 
*/ 2369232153Smm if (ch == 0xc0 || ch == 0xc1) 2370232153Smm cnt = 2; 2371232153Smm else if (ch >= 0xf5 && ch <= 0xf7) 2372232153Smm cnt = 4; 2373232153Smm else if (ch >= 0xf8 && ch <= 0xfb) 2374232153Smm cnt = 5; 2375232153Smm else if (ch == 0xfc || ch == 0xfd) 2376232153Smm cnt = 6; 2377232153Smm else 2378232153Smm cnt = 1; 2379232153Smm if ((int)n < cnt) 2380248616Smm cnt = (int)n; 2381232153Smm for (i = 1; i < cnt; i++) { 2382232153Smm if ((s[i] & 0xc0) != 0x80) { 2383232153Smm cnt = i; 2384232153Smm break; 2385232153Smm } 2386232153Smm } 2387232153Smm goto invalid_sequence; 2388232153Smm } 2389232153Smm 2390311041Smm /* The code point larger than 0x10FFFF is not legal 2391232153Smm * Unicode values. */ 2392232153Smm if (wc > UNICODE_MAX) 2393232153Smm goto invalid_sequence; 2394232153Smm /* Correctly gets a Unicode, returns used bytes. */ 2395232153Smm *pwc = wc; 2396232153Smm return (cnt); 2397232153Smminvalid_sequence: 2398232153Smm *pwc = UNICODE_R_CHAR;/* set the Replacement Character instead. */ 2399232153Smm return (cnt * -1); 2400232153Smm} 2401232153Smm 2402232153Smmstatic int 2403232153Smmutf8_to_unicode(uint32_t *pwc, const char *s, size_t n) 2404232153Smm{ 2405232153Smm int cnt; 2406232153Smm 2407232153Smm cnt = _utf8_to_unicode(pwc, s, n); 2408311041Smm /* Any of Surrogate pair is not legal Unicode values. */ 2409232153Smm if (cnt == 3 && IS_SURROGATE_PAIR_LA(*pwc)) 2410232153Smm return (-3); 2411232153Smm return (cnt); 2412232153Smm} 2413232153Smm 2414232153Smmstatic inline uint32_t 2415232153Smmcombine_surrogate_pair(uint32_t uc, uint32_t uc2) 2416232153Smm{ 2417232153Smm uc -= 0xD800; 2418232153Smm uc *= 0x400; 2419232153Smm uc += uc2 - 0xDC00; 2420232153Smm uc += 0x10000; 2421232153Smm return (uc); 2422232153Smm} 2423232153Smm 2424232153Smm/* 2425232153Smm * Convert a single UTF-8/CESU-8 sequence to a Unicode code point in 2426232153Smm * removing surrogate pairs. 
2427232153Smm * 2428232153Smm * CESU-8: The Compatibility Encoding Scheme for UTF-16. 2429232153Smm * 2430232153Smm * Usually return used bytes, return used byte in negative value when 2431232153Smm * a unicode character is replaced with U+FFFD. 2432232153Smm */ 2433232153Smmstatic int 2434232153Smmcesu8_to_unicode(uint32_t *pwc, const char *s, size_t n) 2435232153Smm{ 2436248616Smm uint32_t wc = 0; 2437232153Smm int cnt; 2438232153Smm 2439232153Smm cnt = _utf8_to_unicode(&wc, s, n); 2440232153Smm if (cnt == 3 && IS_HIGH_SURROGATE_LA(wc)) { 2441248616Smm uint32_t wc2 = 0; 2442232153Smm if (n - 3 < 3) { 2443232153Smm /* Invalid byte sequence. */ 2444232153Smm goto invalid_sequence; 2445232153Smm } 2446232153Smm cnt = _utf8_to_unicode(&wc2, s+3, n-3); 2447232153Smm if (cnt != 3 || !IS_LOW_SURROGATE_LA(wc2)) { 2448232153Smm /* Invalid byte sequence. */ 2449232153Smm goto invalid_sequence; 2450232153Smm } 2451232153Smm wc = combine_surrogate_pair(wc, wc2); 2452232153Smm cnt = 6; 2453232153Smm } else if (cnt == 3 && IS_LOW_SURROGATE_LA(wc)) { 2454232153Smm /* Invalid byte sequence. */ 2455232153Smm goto invalid_sequence; 2456232153Smm } 2457232153Smm *pwc = wc; 2458232153Smm return (cnt); 2459232153Smminvalid_sequence: 2460232153Smm *pwc = UNICODE_R_CHAR;/* set the Replacement Character instead. */ 2461232153Smm if (cnt > 0) 2462232153Smm cnt *= -1; 2463232153Smm return (cnt); 2464232153Smm} 2465232153Smm 2466232153Smm/* 2467232153Smm * Convert a Unicode code point to a single UTF-8 sequence. 2468232153Smm * 2469311041Smm * NOTE:This function does not check if the Unicode is legal or not. 2470232153Smm * Please you definitely check it before calling this. 
2471232153Smm */ 2472232153Smmstatic size_t 2473232153Smmunicode_to_utf8(char *p, size_t remaining, uint32_t uc) 2474232153Smm{ 2475232153Smm char *_p = p; 2476232153Smm 2477299529Smm /* Invalid Unicode char maps to Replacement character */ 2478299529Smm if (uc > UNICODE_MAX) 2479299529Smm uc = UNICODE_R_CHAR; 2480232153Smm /* Translate code point to UTF8 */ 2481232153Smm if (uc <= 0x7f) { 2482232153Smm if (remaining == 0) 2483232153Smm return (0); 2484232153Smm *p++ = (char)uc; 2485232153Smm } else if (uc <= 0x7ff) { 2486232153Smm if (remaining < 2) 2487232153Smm return (0); 2488232153Smm *p++ = 0xc0 | ((uc >> 6) & 0x1f); 2489232153Smm *p++ = 0x80 | (uc & 0x3f); 2490232153Smm } else if (uc <= 0xffff) { 2491232153Smm if (remaining < 3) 2492232153Smm return (0); 2493232153Smm *p++ = 0xe0 | ((uc >> 12) & 0x0f); 2494232153Smm *p++ = 0x80 | ((uc >> 6) & 0x3f); 2495232153Smm *p++ = 0x80 | (uc & 0x3f); 2496299529Smm } else { 2497232153Smm if (remaining < 4) 2498232153Smm return (0); 2499232153Smm *p++ = 0xf0 | ((uc >> 18) & 0x07); 2500232153Smm *p++ = 0x80 | ((uc >> 12) & 0x3f); 2501232153Smm *p++ = 0x80 | ((uc >> 6) & 0x3f); 2502232153Smm *p++ = 0x80 | (uc & 0x3f); 2503232153Smm } 2504232153Smm return (p - _p); 2505232153Smm} 2506232153Smm 2507232153Smmstatic int 2508232153Smmutf16be_to_unicode(uint32_t *pwc, const char *s, size_t n) 2509232153Smm{ 2510232153Smm return (utf16_to_unicode(pwc, s, n, 1)); 2511232153Smm} 2512232153Smm 2513232153Smmstatic int 2514232153Smmutf16le_to_unicode(uint32_t *pwc, const char *s, size_t n) 2515232153Smm{ 2516232153Smm return (utf16_to_unicode(pwc, s, n, 0)); 2517232153Smm} 2518232153Smm 2519232153Smmstatic int 2520232153Smmutf16_to_unicode(uint32_t *pwc, const char *s, size_t n, int be) 2521232153Smm{ 2522232153Smm const char *utf16 = s; 2523232153Smm unsigned uc; 2524232153Smm 2525232153Smm if (n == 0) 2526232153Smm return (0); 2527232153Smm if (n == 1) { 2528232153Smm /* set the Replacement Character instead. 
*/ 2529232153Smm *pwc = UNICODE_R_CHAR; 2530232153Smm return (-1); 2531232153Smm } 2532232153Smm 2533232153Smm if (be) 2534232153Smm uc = archive_be16dec(utf16); 2535232153Smm else 2536232153Smm uc = archive_le16dec(utf16); 2537232153Smm utf16 += 2; 2538232153Smm 2539232153Smm /* If this is a surrogate pair, assemble the full code point.*/ 2540232153Smm if (IS_HIGH_SURROGATE_LA(uc)) { 2541232153Smm unsigned uc2; 2542232153Smm 2543232153Smm if (n >= 4) { 2544232153Smm if (be) 2545232153Smm uc2 = archive_be16dec(utf16); 2546232153Smm else 2547232153Smm uc2 = archive_le16dec(utf16); 2548232153Smm } else 2549232153Smm uc2 = 0; 2550232153Smm if (IS_LOW_SURROGATE_LA(uc2)) { 2551232153Smm uc = combine_surrogate_pair(uc, uc2); 2552232153Smm utf16 += 2; 2553232153Smm } else { 2554232153Smm /* Undescribed code point should be U+FFFD 2555232153Smm * (replacement character). */ 2556232153Smm *pwc = UNICODE_R_CHAR; 2557232153Smm return (-2); 2558232153Smm } 2559232153Smm } 2560232153Smm 2561232153Smm /* 2562232153Smm * Surrogate pair values(0xd800 through 0xdfff) are only 2563313570Smm * used by UTF-16, so, after above calculation, the code 2564232153Smm * must not be surrogate values, and Unicode has no codes 2565311041Smm * larger than 0x10ffff. Thus, those are not legal Unicode 2566232153Smm * values. 2567232153Smm */ 2568232153Smm if (IS_SURROGATE_PAIR_LA(uc) || uc > UNICODE_MAX) { 2569232153Smm /* Undescribed code point should be U+FFFD 2570232153Smm * (replacement character). 
*/ 2571232153Smm *pwc = UNICODE_R_CHAR; 2572232153Smm return (((int)(utf16 - s)) * -1); 2573232153Smm } 2574232153Smm *pwc = uc; 2575232153Smm return ((int)(utf16 - s)); 2576232153Smm} 2577232153Smm 2578232153Smmstatic size_t 2579232153Smmunicode_to_utf16be(char *p, size_t remaining, uint32_t uc) 2580232153Smm{ 2581232153Smm char *utf16 = p; 2582232153Smm 2583232153Smm if (uc > 0xffff) { 2584232153Smm /* We have a code point that won't fit into a 2585232153Smm * wchar_t; convert it to a surrogate pair. */ 2586232153Smm if (remaining < 4) 2587232153Smm return (0); 2588232153Smm uc -= 0x10000; 2589232153Smm archive_be16enc(utf16, ((uc >> 10) & 0x3ff) + 0xD800); 2590232153Smm archive_be16enc(utf16+2, (uc & 0x3ff) + 0xDC00); 2591228753Smm return (4); 2592232153Smm } else { 2593232153Smm if (remaining < 2) 2594232153Smm return (0); 2595232153Smm archive_be16enc(utf16, uc); 2596232153Smm return (2); 2597232153Smm } 2598228753Smm} 2599228753Smm 2600232153Smmstatic size_t 2601232153Smmunicode_to_utf16le(char *p, size_t remaining, uint32_t uc) 2602232153Smm{ 2603232153Smm char *utf16 = p; 2604232153Smm 2605232153Smm if (uc > 0xffff) { 2606232153Smm /* We have a code point that won't fit into a 2607232153Smm * wchar_t; convert it to a surrogate pair. */ 2608232153Smm if (remaining < 4) 2609232153Smm return (0); 2610232153Smm uc -= 0x10000; 2611232153Smm archive_le16enc(utf16, ((uc >> 10) & 0x3ff) + 0xD800); 2612232153Smm archive_le16enc(utf16+2, (uc & 0x3ff) + 0xDC00); 2613232153Smm return (4); 2614232153Smm } else { 2615232153Smm if (remaining < 2) 2616232153Smm return (0); 2617232153Smm archive_le16enc(utf16, uc); 2618232153Smm return (2); 2619232153Smm } 2620232153Smm} 2621232153Smm 2622228753Smm/* 2623232153Smm * Copy UTF-8 string in checking surrogate pair. 2624232153Smm * If any surrogate pair are found, it would be canonicalized. 
2625228753Smm */ 2626232153Smmstatic int 2627238856Smmstrncat_from_utf8_to_utf8(struct archive_string *as, const void *_p, 2628238856Smm size_t len, struct archive_string_conv *sc) 2629228753Smm{ 2630232153Smm const char *s; 2631232153Smm char *p, *endp; 2632232153Smm int n, ret = 0; 2633228753Smm 2634232153Smm (void)sc; /* UNUSED */ 2635232153Smm 2636232153Smm if (archive_string_ensure(as, as->length + len + 1) == NULL) 2637232153Smm return (-1); 2638232153Smm 2639232153Smm s = (const char *)_p; 2640232153Smm p = as->s + as->length; 2641232153Smm endp = as->s + as->buffer_length -1; 2642232153Smm do { 2643232153Smm uint32_t uc; 2644232153Smm const char *ss = s; 2645232153Smm size_t w; 2646232153Smm 2647232153Smm /* 2648232153Smm * Forward byte sequence until a conversion of that is needed. 2649232153Smm */ 2650232153Smm while ((n = utf8_to_unicode(&uc, s, len)) > 0) { 2651232153Smm s += n; 2652232153Smm len -= n; 2653232153Smm } 2654232153Smm if (ss < s) { 2655232153Smm if (p + (s - ss) > endp) { 2656232153Smm as->length = p - as->s; 2657232153Smm if (archive_string_ensure(as, 2658232153Smm as->buffer_length + len + 1) == NULL) 2659232153Smm return (-1); 2660232153Smm p = as->s + as->length; 2661232153Smm endp = as->s + as->buffer_length -1; 2662232153Smm } 2663232153Smm 2664232153Smm memcpy(p, ss, s - ss); 2665232153Smm p += s - ss; 2666232153Smm } 2667232153Smm 2668232153Smm /* 2669232153Smm * If n is negative, current byte sequence needs a replacement. 2670232153Smm */ 2671228753Smm if (n < 0) { 2672232153Smm if (n == -3 && IS_SURROGATE_PAIR_LA(uc)) { 2673232153Smm /* Current byte sequence may be CESU-8. */ 2674232153Smm n = cesu8_to_unicode(&uc, s, len); 2675232153Smm } 2676228753Smm if (n < 0) { 2677232153Smm ret = -1; 2678232153Smm n *= -1;/* Use a replaced unicode character. */ 2679228753Smm } 2680232153Smm 2681232153Smm /* Rebuild UTF-8 byte sequence. 
*/ 2682232153Smm while ((w = unicode_to_utf8(p, endp - p, uc)) == 0) { 2683232153Smm as->length = p - as->s; 2684232153Smm if (archive_string_ensure(as, 2685232153Smm as->buffer_length + len + 1) == NULL) 2686232153Smm return (-1); 2687232153Smm p = as->s + as->length; 2688232153Smm endp = as->s + as->buffer_length -1; 2689228753Smm } 2690232153Smm p += w; 2691232153Smm s += n; 2692232153Smm len -= n; 2693228753Smm } 2694232153Smm } while (n > 0); 2695232153Smm as->length = p - as->s; 2696232153Smm as->s[as->length] = '\0'; 2697232153Smm return (ret); 2698232153Smm} 2699232153Smm 2700232153Smmstatic int 2701232153Smmarchive_string_append_unicode(struct archive_string *as, const void *_p, 2702232153Smm size_t len, struct archive_string_conv *sc) 2703232153Smm{ 2704232153Smm const char *s; 2705232153Smm char *p, *endp; 2706232153Smm uint32_t uc; 2707232153Smm size_t w; 2708232153Smm int n, ret = 0, ts, tm; 2709232153Smm int (*parse)(uint32_t *, const char *, size_t); 2710232153Smm size_t (*unparse)(char *, size_t, uint32_t); 2711232153Smm 2712232153Smm if (sc->flag & SCONV_TO_UTF16BE) { 2713232153Smm unparse = unicode_to_utf16be; 2714232153Smm ts = 2; 2715232153Smm } else if (sc->flag & SCONV_TO_UTF16LE) { 2716232153Smm unparse = unicode_to_utf16le; 2717232153Smm ts = 2; 2718232153Smm } else if (sc->flag & SCONV_TO_UTF8) { 2719232153Smm unparse = unicode_to_utf8; 2720232153Smm ts = 1; 2721232153Smm } else { 2722232153Smm /* 2723232153Smm * This case is going to be converted to another 2724232153Smm * character-set through iconv. 
2725232153Smm */ 2726232153Smm if (sc->flag & SCONV_FROM_UTF16BE) { 2727232153Smm unparse = unicode_to_utf16be; 2728232153Smm ts = 2; 2729232153Smm } else if (sc->flag & SCONV_FROM_UTF16LE) { 2730232153Smm unparse = unicode_to_utf16le; 2731232153Smm ts = 2; 2732232153Smm } else { 2733232153Smm unparse = unicode_to_utf8; 2734232153Smm ts = 1; 2735232153Smm } 2736228753Smm } 2737232153Smm 2738232153Smm if (sc->flag & SCONV_FROM_UTF16BE) { 2739232153Smm parse = utf16be_to_unicode; 2740232153Smm tm = 1; 2741232153Smm } else if (sc->flag & SCONV_FROM_UTF16LE) { 2742232153Smm parse = utf16le_to_unicode; 2743232153Smm tm = 1; 2744232153Smm } else { 2745232153Smm parse = cesu8_to_unicode; 2746232153Smm tm = ts; 2747232153Smm } 2748232153Smm 2749232153Smm if (archive_string_ensure(as, as->length + len * tm + ts) == NULL) 2750232153Smm return (-1); 2751232153Smm 2752232153Smm s = (const char *)_p; 2753232153Smm p = as->s + as->length; 2754232153Smm endp = as->s + as->buffer_length - ts; 2755232153Smm while ((n = parse(&uc, s, len)) != 0) { 2756232153Smm if (n < 0) { 2757232153Smm /* Use a replaced unicode character. */ 2758232153Smm n *= -1; 2759232153Smm ret = -1; 2760232153Smm } 2761232153Smm s += n; 2762232153Smm len -= n; 2763232153Smm while ((w = unparse(p, endp - p, uc)) == 0) { 2764232153Smm /* There is not enough output buffer so 2765232153Smm * we have to expand it. 
*/ 2766232153Smm as->length = p - as->s; 2767232153Smm if (archive_string_ensure(as, 2768232153Smm as->buffer_length + len * tm + ts) == NULL) 2769232153Smm return (-1); 2770232153Smm p = as->s + as->length; 2771232153Smm endp = as->s + as->buffer_length - ts; 2772232153Smm } 2773232153Smm p += w; 2774232153Smm } 2775232153Smm as->length = p - as->s; 2776232153Smm as->s[as->length] = '\0'; 2777232153Smm if (ts == 2) 2778232153Smm as->s[as->length+1] = '\0'; 2779232153Smm return (ret); 2780228753Smm} 2781228753Smm 2782232153Smm/* 2783232153Smm * Following Constants for Hangul compositions this information comes from 2784232153Smm * Unicode Standard Annex #15 http://unicode.org/reports/tr15/ 2785232153Smm */ 2786232153Smm#define HC_SBASE 0xAC00 2787232153Smm#define HC_LBASE 0x1100 2788232153Smm#define HC_VBASE 0x1161 2789232153Smm#define HC_TBASE 0x11A7 2790232153Smm#define HC_LCOUNT 19 2791232153Smm#define HC_VCOUNT 21 2792232153Smm#define HC_TCOUNT 28 2793232153Smm#define HC_NCOUNT (HC_VCOUNT * HC_TCOUNT) 2794232153Smm#define HC_SCOUNT (HC_LCOUNT * HC_NCOUNT) 2795228753Smm 2796232153Smmstatic uint32_t 2797232153Smmget_nfc(uint32_t uc, uint32_t uc2) 2798232153Smm{ 2799232153Smm int t, b; 2800232153Smm 2801232153Smm t = 0; 2802232153Smm b = sizeof(u_composition_table)/sizeof(u_composition_table[0]) -1; 2803232153Smm while (b >= t) { 2804232153Smm int m = (t + b) / 2; 2805232153Smm if (u_composition_table[m].cp1 < uc) 2806232153Smm t = m + 1; 2807232153Smm else if (u_composition_table[m].cp1 > uc) 2808232153Smm b = m - 1; 2809232153Smm else if (u_composition_table[m].cp2 < uc2) 2810232153Smm t = m + 1; 2811232153Smm else if (u_composition_table[m].cp2 > uc2) 2812232153Smm b = m - 1; 2813232153Smm else 2814232153Smm return (u_composition_table[m].nfc); 2815232153Smm } 2816232153Smm return (0); 2817232153Smm} 2818232153Smm 2819232153Smm#define FDC_MAX 10 /* The maximum number of Following Decomposable 2820232153Smm * Characters. 
/*
 * Helper macros shared by the normalization routines.  They operate on the
 * caller's local variables (as, s, len, p, endp, w, n, n2, uc, uc2, ucptr,
 * uc2ptr, ts, tm, parse, unparse, ret, ...) and may `return (-1)' from the
 * enclosing function when the buffer cannot be expanded.
 */

/*
 * Update first code point.
 * Clearing ucptr marks the code point as changed from its original bytes.
 */
#define UPDATE_UC(new_uc)	do {		\
	uc = (new_uc);				\
	ucptr = NULL;				\
} while (0)

/*
 * Replace first code point with second code point.
 */
#define REPLACE_UC_WITH_UC2() do {		\
	uc = uc2;				\
	ucptr = uc2ptr;				\
	n = n2;					\
} while (0)

/*
 * Grow the output buffer and re-derive p/endp from the (possibly moved)
 * buffer.  The request is always larger than the current buffer_length.
 */
#define EXPAND_BUFFER() do {			\
	as->length = p - as->s;			\
	if (archive_string_ensure(as,		\
	    as->buffer_length + len * tm + ts) == NULL)\
		return (-1);			\
	p = as->s + as->length;			\
	endp = as->s + as->buffer_length - ts;	\
} while (0)

/*
 * Encode one code point at p, expanding the buffer until it fits.
 */
#define UNPARSE(p, endp, uc)	do {		\
	while ((w = unparse((p), (endp) - (p), (uc))) == 0) {\
		EXPAND_BUFFER();		\
	}					\
	(p) += w;				\
} while (0)

/*
 * Write first code point.
 * If the code point has not been changed from its original code,
 * this just copies its n bytes from the original buffer pointer.
 * If not, this encodes it with unparse() and copies the result.
 *
 * The buffer is expanded repeatedly until the n bytes fit: a single
 * EXPAND_BUFFER() only asks for len * tm + ts additional bytes, which
 * can be fewer than n once the input is nearly exhausted.
 */
#define WRITE_UC()	do {			\
	if (ucptr) {				\
		while (endp - p < n)		\
			EXPAND_BUFFER();	\
		switch (n) {			\
		case 4:				\
			*p++ = *ucptr++;	\
			/* FALL THROUGH */	\
		case 3:				\
			*p++ = *ucptr++;	\
			/* FALL THROUGH */	\
		case 2:				\
			*p++ = *ucptr++;	\
			/* FALL THROUGH */	\
		case 1:				\
			*p++ = *ucptr;		\
			break;			\
		}				\
		ucptr = NULL;			\
	} else {				\
		UNPARSE(p, endp, uc);		\
	}					\
} while (0)

/*
 * Collect following decomposable code points into ucx[]/ccx[], starting
 * at index `start'.  Collection stops at the first code point that fails
 * to parse or whose combining class is blocked by the previous one.
 * If FDC_MAX entries are filled, ret is set to -1.
 */
#define COLLECT_CPS(start)	do {		\
	int _i;					\
	for (_i = (start); _i < FDC_MAX ; _i++) {\
		nx = parse(&ucx[_i], s, len);	\
		if (nx <= 0)			\
			break;			\
		cx = CCC(ucx[_i]);		\
		if (cl >= cx && cl != 228 && cx != 228)\
			break;			\
		s += nx;			\
		len -= nx;			\
		cl = cx;			\
		ccx[_i] = cx;			\
	}					\
	if (_i >= FDC_MAX) {			\
		ret = -1;			\
		ucx_size = FDC_MAX;		\
	} else					\
		ucx_size = _i;			\
} while (0)

/*
 * Normalize UTF-8/UTF-16BE characters to Form C and copy the result.
 *
 * TODO: Convert composition exclusions, which are never converted
 * from NFC,NFD,NFKC and NFKD, to Form C.
 */
2914228753Smm */ 2915232153Smmstatic int 2916232153Smmarchive_string_normalize_C(struct archive_string *as, const void *_p, 2917232153Smm size_t len, struct archive_string_conv *sc) 2918228753Smm{ 2919232153Smm const char *s = (const char *)_p; 2920232153Smm char *p, *endp; 2921232153Smm uint32_t uc, uc2; 2922232153Smm size_t w; 2923232153Smm int always_replace, n, n2, ret = 0, spair, ts, tm; 2924232153Smm int (*parse)(uint32_t *, const char *, size_t); 2925232153Smm size_t (*unparse)(char *, size_t, uint32_t); 2926228753Smm 2927232153Smm always_replace = 1; 2928232153Smm ts = 1;/* text size. */ 2929232153Smm if (sc->flag & SCONV_TO_UTF16BE) { 2930232153Smm unparse = unicode_to_utf16be; 2931232153Smm ts = 2; 2932232153Smm if (sc->flag & SCONV_FROM_UTF16BE) 2933232153Smm always_replace = 0; 2934232153Smm } else if (sc->flag & SCONV_TO_UTF16LE) { 2935232153Smm unparse = unicode_to_utf16le; 2936232153Smm ts = 2; 2937232153Smm if (sc->flag & SCONV_FROM_UTF16LE) 2938232153Smm always_replace = 0; 2939232153Smm } else if (sc->flag & SCONV_TO_UTF8) { 2940232153Smm unparse = unicode_to_utf8; 2941232153Smm if (sc->flag & SCONV_FROM_UTF8) 2942232153Smm always_replace = 0; 2943232153Smm } else { 2944232153Smm /* 2945232153Smm * This case is going to be converted to another 2946232153Smm * character-set through iconv. 2947232153Smm */ 2948232153Smm always_replace = 0; 2949232153Smm if (sc->flag & SCONV_FROM_UTF16BE) { 2950232153Smm unparse = unicode_to_utf16be; 2951232153Smm ts = 2; 2952232153Smm } else if (sc->flag & SCONV_FROM_UTF16LE) { 2953232153Smm unparse = unicode_to_utf16le; 2954232153Smm ts = 2; 2955232153Smm } else { 2956232153Smm unparse = unicode_to_utf8; 2957232153Smm } 2958232153Smm } 2959232153Smm 2960232153Smm if (sc->flag & SCONV_FROM_UTF16BE) { 2961232153Smm parse = utf16be_to_unicode; 2962232153Smm tm = 1; 2963232153Smm spair = 4;/* surrogate pair size in UTF-16. 
*/ 2964232153Smm } else if (sc->flag & SCONV_FROM_UTF16LE) { 2965232153Smm parse = utf16le_to_unicode; 2966232153Smm tm = 1; 2967232153Smm spair = 4;/* surrogate pair size in UTF-16. */ 2968232153Smm } else { 2969232153Smm parse = cesu8_to_unicode; 2970232153Smm tm = ts; 2971232153Smm spair = 6;/* surrogate pair size in UTF-8. */ 2972232153Smm } 2973232153Smm 2974232153Smm if (archive_string_ensure(as, as->length + len * tm + ts) == NULL) 2975232153Smm return (-1); 2976232153Smm 2977232153Smm p = as->s + as->length; 2978232153Smm endp = as->s + as->buffer_length - ts; 2979232153Smm while ((n = parse(&uc, s, len)) != 0) { 2980232153Smm const char *ucptr, *uc2ptr; 2981232153Smm 2982232153Smm if (n < 0) { 2983232153Smm /* Use a replaced unicode character. */ 2984232153Smm UNPARSE(p, endp, uc); 2985232153Smm s += n*-1; 2986232153Smm len -= n*-1; 2987232153Smm ret = -1; 2988232153Smm continue; 2989232153Smm } else if (n == spair || always_replace) 2990232153Smm /* uc is converted from a surrogate pair. 2991232153Smm * this should be treated as a changed code. */ 2992232153Smm ucptr = NULL; 2993232153Smm else 2994232153Smm ucptr = s; 2995232153Smm s += n; 2996232153Smm len -= n; 2997232153Smm 2998232153Smm /* Read second code point. */ 2999232153Smm while ((n2 = parse(&uc2, s, len)) > 0) { 3000232153Smm uint32_t ucx[FDC_MAX]; 3001232153Smm int ccx[FDC_MAX]; 3002232153Smm int cl, cx, i, nx, ucx_size; 3003232153Smm int LIndex,SIndex; 3004232153Smm uint32_t nfc; 3005232153Smm 3006232153Smm if (n2 == spair || always_replace) 3007232153Smm /* uc2 is converted from a surrogate pair. 3008232153Smm * this should be treated as a changed code. */ 3009232153Smm uc2ptr = NULL; 3010232153Smm else 3011232153Smm uc2ptr = s; 3012232153Smm s += n2; 3013232153Smm len -= n2; 3014232153Smm 3015232153Smm /* 3016232153Smm * If current second code point is out of decomposable 3017232153Smm * code points, finding compositions is unneeded. 
3018232153Smm */ 3019232153Smm if (!IS_DECOMPOSABLE_BLOCK(uc2)) { 3020232153Smm WRITE_UC(); 3021232153Smm REPLACE_UC_WITH_UC2(); 3022232153Smm continue; 3023232153Smm } 3024232153Smm 3025232153Smm /* 3026232153Smm * Try to combine current code points. 3027232153Smm */ 3028232153Smm /* 3029232153Smm * We have to combine Hangul characters according to 3030232153Smm * http://uniicode.org/reports/tr15/#Hangul 3031232153Smm */ 3032232153Smm if (0 <= (LIndex = uc - HC_LBASE) && 3033232153Smm LIndex < HC_LCOUNT) { 3034232153Smm /* 3035232153Smm * Hangul Composition. 3036232153Smm * 1. Two current code points are L and V. 3037232153Smm */ 3038232153Smm int VIndex = uc2 - HC_VBASE; 3039232153Smm if (0 <= VIndex && VIndex < HC_VCOUNT) { 3040232153Smm /* Make syllable of form LV. */ 3041232153Smm UPDATE_UC(HC_SBASE + 3042232153Smm (LIndex * HC_VCOUNT + VIndex) * 3043232153Smm HC_TCOUNT); 3044232153Smm } else { 3045232153Smm WRITE_UC(); 3046232153Smm REPLACE_UC_WITH_UC2(); 3047232153Smm } 3048232153Smm continue; 3049232153Smm } else if (0 <= (SIndex = uc - HC_SBASE) && 3050232153Smm SIndex < HC_SCOUNT && (SIndex % HC_TCOUNT) == 0) { 3051232153Smm /* 3052232153Smm * Hangul Composition. 3053232153Smm * 2. Two current code points are LV and T. 3054232153Smm */ 3055232153Smm int TIndex = uc2 - HC_TBASE; 3056232153Smm if (0 < TIndex && TIndex < HC_TCOUNT) { 3057232153Smm /* Make syllable of form LVT. */ 3058232153Smm UPDATE_UC(uc + TIndex); 3059232153Smm } else { 3060232153Smm WRITE_UC(); 3061232153Smm REPLACE_UC_WITH_UC2(); 3062232153Smm } 3063232153Smm continue; 3064232153Smm } else if ((nfc = get_nfc(uc, uc2)) != 0) { 3065232153Smm /* A composition to current code points 3066232153Smm * is found. */ 3067232153Smm UPDATE_UC(nfc); 3068232153Smm continue; 3069232153Smm } else if ((cl = CCC(uc2)) == 0) { 3070232153Smm /* Clearly 'uc2' the second code point is not 3071232153Smm * a decomposable code. 
*/ 3072232153Smm WRITE_UC(); 3073232153Smm REPLACE_UC_WITH_UC2(); 3074232153Smm continue; 3075232153Smm } 3076232153Smm 3077232153Smm /* 3078232153Smm * Collect following decomposable code points. 3079232153Smm */ 3080232153Smm cx = 0; 3081232153Smm ucx[0] = uc2; 3082232153Smm ccx[0] = cl; 3083232153Smm COLLECT_CPS(1); 3084232153Smm 3085232153Smm /* 3086232153Smm * Find a composed code in the collected code points. 3087232153Smm */ 3088232153Smm i = 1; 3089232153Smm while (i < ucx_size) { 3090232153Smm int j; 3091232153Smm 3092232153Smm if ((nfc = get_nfc(uc, ucx[i])) == 0) { 3093232153Smm i++; 3094232153Smm continue; 3095232153Smm } 3096232153Smm 3097232153Smm /* 3098232153Smm * nfc is composed of uc and ucx[i]. 3099232153Smm */ 3100232153Smm UPDATE_UC(nfc); 3101232153Smm 3102232153Smm /* 3103232153Smm * Remove ucx[i] by shifting 3104232153Smm * following code points. 3105232153Smm */ 3106232153Smm for (j = i; j+1 < ucx_size; j++) { 3107232153Smm ucx[j] = ucx[j+1]; 3108232153Smm ccx[j] = ccx[j+1]; 3109232153Smm } 3110232153Smm ucx_size --; 3111232153Smm 3112232153Smm /* 3113232153Smm * Collect following code points blocked 3114232153Smm * by ucx[i] the removed code point. 3115232153Smm */ 3116232153Smm if (ucx_size > 0 && i == ucx_size && 3117232153Smm nx > 0 && cx == cl) { 3118232153Smm cl = ccx[ucx_size-1]; 3119232153Smm COLLECT_CPS(ucx_size); 3120232153Smm } 3121232153Smm /* 3122232153Smm * Restart finding a composed code with 3123232153Smm * the updated uc from the top of the 3124232153Smm * collected code points. 3125232153Smm */ 3126232153Smm i = 0; 3127232153Smm } 3128232153Smm 3129232153Smm /* 3130232153Smm * Apparently the current code points are not 3131232153Smm * decomposed characters or already composed. 3132232153Smm */ 3133232153Smm WRITE_UC(); 3134232153Smm for (i = 0; i < ucx_size; i++) 3135232153Smm UNPARSE(p, endp, ucx[i]); 3136232153Smm 3137232153Smm /* 3138232153Smm * Flush out remaining canonical combining characters. 
3139232153Smm */ 3140232153Smm if (nx > 0 && cx == cl && len > 0) { 3141232153Smm while ((nx = parse(&ucx[0], s, len)) 3142232153Smm > 0) { 3143232153Smm cx = CCC(ucx[0]); 3144232153Smm if (cl > cx) 3145232153Smm break; 3146232153Smm s += nx; 3147232153Smm len -= nx; 3148232153Smm cl = cx; 3149232153Smm UNPARSE(p, endp, ucx[0]); 3150232153Smm } 3151232153Smm } 3152232153Smm break; 3153232153Smm } 3154232153Smm if (n2 < 0) { 3155232153Smm WRITE_UC(); 3156232153Smm /* Use a replaced unicode character. */ 3157232153Smm UNPARSE(p, endp, uc2); 3158232153Smm s += n2*-1; 3159232153Smm len -= n2*-1; 3160232153Smm ret = -1; 3161232153Smm continue; 3162232153Smm } else if (n2 == 0) { 3163232153Smm WRITE_UC(); 3164232153Smm break; 3165232153Smm } 3166232153Smm } 3167232153Smm as->length = p - as->s; 3168232153Smm as->s[as->length] = '\0'; 3169232153Smm if (ts == 2) 3170232153Smm as->s[as->length+1] = '\0'; 3171232153Smm return (ret); 3172232153Smm} 3173232153Smm 3174238856Smmstatic int 3175238856Smmget_nfd(uint32_t *cp1, uint32_t *cp2, uint32_t uc) 3176238856Smm{ 3177238856Smm int t, b; 3178232153Smm 3179238856Smm /* 3180238856Smm * These are not converted to NFD on Mac OS. 3181238856Smm */ 3182238856Smm if ((uc >= 0x2000 && uc <= 0x2FFF) || 3183238856Smm (uc >= 0xF900 && uc <= 0xFAFF) || 3184238856Smm (uc >= 0x2F800 && uc <= 0x2FAFF)) 3185238856Smm return (0); 3186238856Smm /* 3187238856Smm * Those code points are not converted to NFD on Mac OS. 3188238856Smm * I do not know the reason because it is undocumented. 
3189238856Smm * NFC NFD 3190238856Smm * 1109A ==> 11099 110BA 3191238856Smm * 1109C ==> 1109B 110BA 3192238856Smm * 110AB ==> 110A5 110BA 3193238856Smm */ 3194238856Smm if (uc == 0x1109A || uc == 0x1109C || uc == 0x110AB) 3195238856Smm return (0); 3196238856Smm 3197238856Smm t = 0; 3198238856Smm b = sizeof(u_decomposition_table)/sizeof(u_decomposition_table[0]) -1; 3199238856Smm while (b >= t) { 3200238856Smm int m = (t + b) / 2; 3201238856Smm if (u_decomposition_table[m].nfc < uc) 3202238856Smm t = m + 1; 3203238856Smm else if (u_decomposition_table[m].nfc > uc) 3204238856Smm b = m - 1; 3205238856Smm else { 3206238856Smm *cp1 = u_decomposition_table[m].cp1; 3207238856Smm *cp2 = u_decomposition_table[m].cp2; 3208238856Smm return (1); 3209238856Smm } 3210238856Smm } 3211238856Smm return (0); 3212238856Smm} 3213238856Smm 3214238856Smm#define REPLACE_UC_WITH(cp) do { \ 3215238856Smm uc = cp; \ 3216238856Smm ucptr = NULL; \ 3217238856Smm} while (0) 3218238856Smm 3219232153Smm/* 3220232153Smm * Normalize UTF-8 characters to Form D and copy the result. 3221232153Smm */ 3222232153Smmstatic int 3223232153Smmarchive_string_normalize_D(struct archive_string *as, const void *_p, 3224232153Smm size_t len, struct archive_string_conv *sc) 3225232153Smm{ 3226238856Smm const char *s = (const char *)_p; 3227238856Smm char *p, *endp; 3228238856Smm uint32_t uc, uc2; 3229238856Smm size_t w; 3230238856Smm int always_replace, n, n2, ret = 0, spair, ts, tm; 3231238856Smm int (*parse)(uint32_t *, const char *, size_t); 3232238856Smm size_t (*unparse)(char *, size_t, uint32_t); 3233232153Smm 3234238856Smm always_replace = 1; 3235238856Smm ts = 1;/* text size. 
*/ 3236238856Smm if (sc->flag & SCONV_TO_UTF16BE) { 3237238856Smm unparse = unicode_to_utf16be; 3238238856Smm ts = 2; 3239238856Smm if (sc->flag & SCONV_FROM_UTF16BE) 3240238856Smm always_replace = 0; 3241238856Smm } else if (sc->flag & SCONV_TO_UTF16LE) { 3242238856Smm unparse = unicode_to_utf16le; 3243238856Smm ts = 2; 3244238856Smm if (sc->flag & SCONV_FROM_UTF16LE) 3245238856Smm always_replace = 0; 3246238856Smm } else if (sc->flag & SCONV_TO_UTF8) { 3247238856Smm unparse = unicode_to_utf8; 3248238856Smm if (sc->flag & SCONV_FROM_UTF8) 3249238856Smm always_replace = 0; 3250238856Smm } else { 3251238856Smm /* 3252238856Smm * This case is going to be converted to another 3253238856Smm * character-set through iconv. 3254238856Smm */ 3255238856Smm always_replace = 0; 3256238856Smm if (sc->flag & SCONV_FROM_UTF16BE) { 3257238856Smm unparse = unicode_to_utf16be; 3258238856Smm ts = 2; 3259238856Smm } else if (sc->flag & SCONV_FROM_UTF16LE) { 3260238856Smm unparse = unicode_to_utf16le; 3261238856Smm ts = 2; 3262238856Smm } else { 3263238856Smm unparse = unicode_to_utf8; 3264238856Smm } 3265228753Smm } 3266232153Smm 3267238856Smm if (sc->flag & SCONV_FROM_UTF16BE) { 3268238856Smm parse = utf16be_to_unicode; 3269238856Smm tm = 1; 3270238856Smm spair = 4;/* surrogate pair size in UTF-16. */ 3271238856Smm } else if (sc->flag & SCONV_FROM_UTF16LE) { 3272238856Smm parse = utf16le_to_unicode; 3273238856Smm tm = 1; 3274238856Smm spair = 4;/* surrogate pair size in UTF-16. */ 3275238856Smm } else { 3276238856Smm parse = cesu8_to_unicode; 3277238856Smm tm = ts; 3278238856Smm spair = 6;/* surrogate pair size in UTF-8. 
*/ 3279238856Smm } 3280238856Smm 3281238856Smm if (archive_string_ensure(as, as->length + len * tm + ts) == NULL) 3282232153Smm return (-1); 3283232153Smm 3284238856Smm p = as->s + as->length; 3285238856Smm endp = as->s + as->buffer_length - ts; 3286238856Smm while ((n = parse(&uc, s, len)) != 0) { 3287238856Smm const char *ucptr; 3288238856Smm uint32_t cp1, cp2; 3289238856Smm int SIndex; 3290238856Smm struct { 3291238856Smm uint32_t uc; 3292238856Smm int ccc; 3293238856Smm } fdc[FDC_MAX]; 3294238856Smm int fdi, fdj; 3295238856Smm int ccc; 3296232153Smm 3297238856Smmcheck_first_code: 3298238856Smm if (n < 0) { 3299238856Smm /* Use a replaced unicode character. */ 3300238856Smm UNPARSE(p, endp, uc); 3301238856Smm s += n*-1; 3302238856Smm len -= n*-1; 3303238856Smm ret = -1; 3304238856Smm continue; 3305238856Smm } else if (n == spair || always_replace) 3306238856Smm /* uc is converted from a surrogate pair. 3307238856Smm * this should be treated as a changed code. */ 3308238856Smm ucptr = NULL; 3309238856Smm else 3310238856Smm ucptr = s; 3311238856Smm s += n; 3312238856Smm len -= n; 3313232153Smm 3314238856Smm /* Hangul Decomposition. 
*/ 3315238856Smm if ((SIndex = uc - HC_SBASE) >= 0 && SIndex < HC_SCOUNT) { 3316238856Smm int L = HC_LBASE + SIndex / HC_NCOUNT; 3317238856Smm int V = HC_VBASE + (SIndex % HC_NCOUNT) / HC_TCOUNT; 3318238856Smm int T = HC_TBASE + SIndex % HC_TCOUNT; 3319232153Smm 3320238856Smm REPLACE_UC_WITH(L); 3321238856Smm WRITE_UC(); 3322238856Smm REPLACE_UC_WITH(V); 3323238856Smm WRITE_UC(); 3324238856Smm if (T != HC_TBASE) { 3325238856Smm REPLACE_UC_WITH(T); 3326238856Smm WRITE_UC(); 3327238856Smm } 3328238856Smm continue; 3329238856Smm } 3330238856Smm if (IS_DECOMPOSABLE_BLOCK(uc) && CCC(uc) != 0) { 3331238856Smm WRITE_UC(); 3332238856Smm continue; 3333238856Smm } 3334232153Smm 3335238856Smm fdi = 0; 3336238856Smm while (get_nfd(&cp1, &cp2, uc) && fdi < FDC_MAX) { 3337238856Smm int k; 3338238856Smm 3339238856Smm for (k = fdi; k > 0; k--) 3340238856Smm fdc[k] = fdc[k-1]; 3341238856Smm fdc[0].ccc = CCC(cp2); 3342238856Smm fdc[0].uc = cp2; 3343238856Smm fdi++; 3344238856Smm REPLACE_UC_WITH(cp1); 3345238856Smm } 3346238856Smm 3347238856Smm /* Read following code points. 
*/ 3348238856Smm while ((n2 = parse(&uc2, s, len)) > 0 && 3349238856Smm (ccc = CCC(uc2)) != 0 && fdi < FDC_MAX) { 3350238856Smm int j, k; 3351238856Smm 3352238856Smm s += n2; 3353238856Smm len -= n2; 3354238856Smm for (j = 0; j < fdi; j++) { 3355238856Smm if (fdc[j].ccc > ccc) 3356238856Smm break; 3357238856Smm } 3358238856Smm if (j < fdi) { 3359238856Smm for (k = fdi; k > j; k--) 3360238856Smm fdc[k] = fdc[k-1]; 3361238856Smm fdc[j].ccc = ccc; 3362238856Smm fdc[j].uc = uc2; 3363238856Smm } else { 3364238856Smm fdc[fdi].ccc = ccc; 3365238856Smm fdc[fdi].uc = uc2; 3366238856Smm } 3367238856Smm fdi++; 3368238856Smm } 3369238856Smm 3370238856Smm WRITE_UC(); 3371238856Smm for (fdj = 0; fdj < fdi; fdj++) { 3372238856Smm REPLACE_UC_WITH(fdc[fdj].uc); 3373238856Smm WRITE_UC(); 3374238856Smm } 3375238856Smm 3376238856Smm if (n2 == 0) 3377238856Smm break; 3378238856Smm REPLACE_UC_WITH(uc2); 3379238856Smm n = n2; 3380238856Smm goto check_first_code; 3381232153Smm } 3382238856Smm as->length = p - as->s; 3383238856Smm as->s[as->length] = '\0'; 3384238856Smm if (ts == 2) 3385238856Smm as->s[as->length+1] = '\0'; 3386232153Smm return (ret); 3387228753Smm} 3388228753Smm 3389228753Smm/* 3390232153Smm * libarchive 2.x made incorrect UTF-8 strings in the wrong assumption 3391232153Smm * that WCS is Unicode. It is true for several platforms but some are false. 3392232153Smm * And then people who did not use UTF-8 locale on the non Unicode WCS 3393232153Smm * platform and made a tar file with libarchive(mostly bsdtar) 2.x. Those 3394232153Smm * now cannot get right filename from libarchive 3.x and later since we 3395232153Smm * fixed the wrong assumption and it is incompatible to older its versions. 3396232153Smm * So we provide special option, "compat-2x.x", for resolving it. 3397232153Smm * That option enable the string conversion of libarchive 2.x. 
3398228753Smm * 3399232153Smm * Translates the wrong UTF-8 string made by libarchive 2.x into current 3400232153Smm * locale character set and appends to the archive_string. 3401232153Smm * Note: returns -1 if conversion fails. 3402228753Smm */ 3403232153Smmstatic int 3404232153Smmstrncat_from_utf8_libarchive2(struct archive_string *as, 3405232153Smm const void *_p, size_t len, struct archive_string_conv *sc) 3406228753Smm{ 3407232153Smm const char *s; 3408228753Smm int n; 3409228753Smm char *p; 3410232153Smm char *end; 3411232153Smm uint32_t unicode; 3412228753Smm#if HAVE_WCRTOMB 3413228753Smm mbstate_t shift_state; 3414228753Smm 3415228753Smm memset(&shift_state, 0, sizeof(shift_state)); 3416228753Smm#else 3417228753Smm /* Clear the shift state before starting. */ 3418228753Smm wctomb(NULL, L'\0'); 3419228753Smm#endif 3420232153Smm (void)sc; /* UNUSED */ 3421228753Smm /* 3422232153Smm * Allocate buffer for MBS. 3423232153Smm * We need this allocation here since it is possible that 3424232153Smm * as->s is still NULL. 3425228753Smm */ 3426232153Smm if (archive_string_ensure(as, as->length + len + 1) == NULL) 3427232153Smm return (-1); 3428232153Smm 3429232153Smm s = (const char *)_p; 3430232153Smm p = as->s + as->length; 3431232153Smm end = as->s + as->buffer_length - MB_CUR_MAX -1; 3432232153Smm while ((n = _utf8_to_unicode(&unicode, s, len)) != 0) { 3433232153Smm wchar_t wc; 3434232153Smm 3435232153Smm if (p >= end) { 3436232153Smm as->length = p - as->s; 3437232153Smm /* Re-allocate buffer for MBS. */ 3438232153Smm if (archive_string_ensure(as, 3439232153Smm as->length + len * 2 + 1) == NULL) 3440232153Smm return (-1); 3441232153Smm p = as->s + as->length; 3442232153Smm end = as->s + as->buffer_length - MB_CUR_MAX -1; 3443228753Smm } 3444232153Smm 3445232153Smm /* 3446313570Smm * As libarchive 2.x, translates the UTF-8 characters into 3447232153Smm * wide-characters in the assumption that WCS is Unicode. 
3448232153Smm */ 3449232153Smm if (n < 0) { 3450232153Smm n *= -1; 3451232153Smm wc = L'?'; 3452232153Smm } else 3453232153Smm wc = (wchar_t)unicode; 3454232153Smm 3455232153Smm s += n; 3456232153Smm len -= n; 3457232153Smm /* 3458232153Smm * Translates the wide-character into the current locale MBS. 3459232153Smm */ 3460228753Smm#if HAVE_WCRTOMB 3461248616Smm n = (int)wcrtomb(p, wc, &shift_state); 3462228753Smm#else 3463248616Smm n = (int)wctomb(p, wc); 3464228753Smm#endif 3465228753Smm if (n == -1) 3466232153Smm return (-1); 3467228753Smm p += n; 3468228753Smm } 3469232153Smm as->length = p - as->s; 3470232153Smm as->s[as->length] = '\0'; 3471232153Smm return (0); 3472232153Smm} 3473232153Smm 3474232153Smm 3475232153Smm/* 3476232153Smm * Conversion functions between current locale dependent MBS and UTF-16BE. 3477232153Smm * strncat_from_utf16be() : UTF-16BE --> MBS 3478232153Smm * strncat_to_utf16be() : MBS --> UTF16BE 3479232153Smm */ 3480232153Smm 3481232153Smm#if defined(_WIN32) && !defined(__CYGWIN__) 3482232153Smm 3483232153Smm/* 3484232153Smm * Convert a UTF-16BE/LE string to current locale and copy the result. 3485311041Smm * Return -1 if conversion fails. 3486232153Smm */ 3487232153Smmstatic int 3488232153Smmwin_strncat_from_utf16(struct archive_string *as, const void *_p, size_t bytes, 3489232153Smm struct archive_string_conv *sc, int be) 3490232153Smm{ 3491232153Smm struct archive_string tmp; 3492232153Smm const char *u16; 3493232153Smm int ll; 3494232153Smm BOOL defchar; 3495232153Smm char *mbs; 3496232153Smm size_t mbs_size, b; 3497232153Smm int ret = 0; 3498232153Smm 3499232153Smm bytes &= ~1; 3500232153Smm if (archive_string_ensure(as, as->length + bytes +1) == NULL) 3501232153Smm return (-1); 3502232153Smm 3503232153Smm mbs = as->s + as->length; 3504232153Smm mbs_size = as->buffer_length - as->length -1; 3505232153Smm 3506232153Smm if (sc->to_cp == CP_C_LOCALE) { 3507232153Smm /* 3508232153Smm * "C" locale special process. 
3509232153Smm */ 3510232153Smm u16 = _p; 3511232153Smm ll = 0; 3512232153Smm for (b = 0; b < bytes; b += 2) { 3513232153Smm uint16_t val; 3514232153Smm if (be) 3515232153Smm val = archive_be16dec(u16+b); 3516232153Smm else 3517232153Smm val = archive_le16dec(u16+b); 3518232153Smm if (val > 255) { 3519232153Smm *mbs++ = '?'; 3520232153Smm ret = -1; 3521232153Smm } else 3522232153Smm *mbs++ = (char)(val&0xff); 3523232153Smm ll++; 3524232153Smm } 3525232153Smm as->length += ll; 3526232153Smm as->s[as->length] = '\0'; 3527232153Smm return (ret); 3528232153Smm } 3529232153Smm 3530232153Smm archive_string_init(&tmp); 3531232153Smm if (be) { 3532232153Smm if (is_big_endian()) { 3533232153Smm u16 = _p; 3534232153Smm } else { 3535232153Smm if (archive_string_ensure(&tmp, bytes+2) == NULL) 3536232153Smm return (-1); 3537232153Smm memcpy(tmp.s, _p, bytes); 3538232153Smm for (b = 0; b < bytes; b += 2) { 3539232153Smm uint16_t val = archive_be16dec(tmp.s+b); 3540232153Smm archive_le16enc(tmp.s+b, val); 3541232153Smm } 3542232153Smm u16 = tmp.s; 3543232153Smm } 3544232153Smm } else { 3545232153Smm if (!is_big_endian()) { 3546232153Smm u16 = _p; 3547232153Smm } else { 3548232153Smm if (archive_string_ensure(&tmp, bytes+2) == NULL) 3549232153Smm return (-1); 3550232153Smm memcpy(tmp.s, _p, bytes); 3551232153Smm for (b = 0; b < bytes; b += 2) { 3552232153Smm uint16_t val = archive_le16dec(tmp.s+b); 3553232153Smm archive_be16enc(tmp.s+b, val); 3554232153Smm } 3555232153Smm u16 = tmp.s; 3556232153Smm } 3557232153Smm } 3558232153Smm 3559232153Smm do { 3560232153Smm defchar = 0; 3561232153Smm ll = WideCharToMultiByte(sc->to_cp, 0, 3562248616Smm (LPCWSTR)u16, (int)bytes>>1, mbs, (int)mbs_size, 3563232153Smm NULL, &defchar); 3564302294Smm /* Exit loop if we succeeded */ 3565302294Smm if (ll != 0 || 3566302294Smm GetLastError() != ERROR_INSUFFICIENT_BUFFER) { 3567302294Smm break; 3568232153Smm } 3569302294Smm /* Else expand buffer and loop to try again. 
*/ 3570302294Smm ll = WideCharToMultiByte(sc->to_cp, 0, 3571302294Smm (LPCWSTR)u16, (int)bytes, NULL, 0, NULL, NULL); 3572302294Smm if (archive_string_ensure(as, ll +1) == NULL) 3573302294Smm return (-1); 3574302294Smm mbs = as->s + as->length; 3575302294Smm mbs_size = as->buffer_length - as->length -1; 3576302294Smm } while (1); 3577232153Smm archive_string_free(&tmp); 3578232153Smm as->length += ll; 3579232153Smm as->s[as->length] = '\0'; 3580232153Smm if (ll == 0 || defchar) 3581232153Smm ret = -1; 3582232153Smm return (ret); 3583232153Smm} 3584232153Smm 3585232153Smmstatic int 3586238856Smmwin_strncat_from_utf16be(struct archive_string *as, const void *_p, 3587238856Smm size_t bytes, struct archive_string_conv *sc) 3588232153Smm{ 3589232153Smm return (win_strncat_from_utf16(as, _p, bytes, sc, 1)); 3590232153Smm} 3591232153Smm 3592232153Smmstatic int 3593238856Smmwin_strncat_from_utf16le(struct archive_string *as, const void *_p, 3594238856Smm size_t bytes, struct archive_string_conv *sc) 3595232153Smm{ 3596232153Smm return (win_strncat_from_utf16(as, _p, bytes, sc, 0)); 3597232153Smm} 3598232153Smm 3599232153Smmstatic int 3600232153Smmis_big_endian(void) 3601232153Smm{ 3602232153Smm uint16_t d = 1; 3603232153Smm 3604232153Smm return (archive_be16dec(&d) == 1); 3605232153Smm} 3606232153Smm 3607232153Smm/* 3608232153Smm * Convert a current locale string to UTF-16BE/LE and copy the result. 3609311041Smm * Return -1 if conversion fails. 
3610232153Smm */ 3611232153Smmstatic int 3612238856Smmwin_strncat_to_utf16(struct archive_string *as16, const void *_p, 3613238856Smm size_t length, struct archive_string_conv *sc, int bigendian) 3614232153Smm{ 3615232153Smm const char *s = (const char *)_p; 3616232153Smm char *u16; 3617232153Smm size_t count, avail; 3618232153Smm 3619232153Smm if (archive_string_ensure(as16, 3620232153Smm as16->length + (length + 1) * 2) == NULL) 3621232153Smm return (-1); 3622232153Smm 3623232153Smm u16 = as16->s + as16->length; 3624232153Smm avail = as16->buffer_length - 2; 3625232153Smm if (sc->from_cp == CP_C_LOCALE) { 3626232153Smm /* 3627232153Smm * "C" locale special process. 3628232153Smm */ 3629232153Smm count = 0; 3630232153Smm while (count < length && *s) { 3631232153Smm if (bigendian) 3632232153Smm archive_be16enc(u16, *s); 3633232153Smm else 3634232153Smm archive_le16enc(u16, *s); 3635232153Smm u16 += 2; 3636232153Smm s++; 3637232153Smm count++; 3638232153Smm } 3639232153Smm as16->length += count << 1; 3640232153Smm as16->s[as16->length] = 0; 3641232153Smm as16->s[as16->length+1] = 0; 3642232153Smm return (0); 3643232153Smm } 3644232153Smm do { 3645232153Smm count = MultiByteToWideChar(sc->from_cp, 3646248616Smm MB_PRECOMPOSED, s, (int)length, (LPWSTR)u16, (int)avail>>1); 3647302294Smm /* Exit loop if we succeeded */ 3648302294Smm if (count != 0 || 3649302294Smm GetLastError() != ERROR_INSUFFICIENT_BUFFER) { 3650302294Smm break; 3651232153Smm } 3652302294Smm /* Expand buffer and try again */ 3653302294Smm count = MultiByteToWideChar(sc->from_cp, 3654302294Smm MB_PRECOMPOSED, s, (int)length, NULL, 0); 3655302294Smm if (archive_string_ensure(as16, (count +1) * 2) 3656302294Smm == NULL) 3657302294Smm return (-1); 3658302294Smm u16 = as16->s + as16->length; 3659302294Smm avail = as16->buffer_length - 2; 3660302294Smm } while (1); 3661232153Smm as16->length += count * 2; 3662232153Smm as16->s[as16->length] = 0; 3663232153Smm as16->s[as16->length+1] = 0; 3664232153Smm if 
(count == 0) 3665232153Smm return (-1); 3666232153Smm 3667232153Smm if (is_big_endian()) { 3668232153Smm if (!bigendian) { 3669232153Smm while (count > 0) { 3670232153Smm uint16_t v = archive_be16dec(u16); 3671232153Smm archive_le16enc(u16, v); 3672232153Smm u16 += 2; 3673232153Smm count--; 3674232153Smm } 3675232153Smm } 3676232153Smm } else { 3677232153Smm if (bigendian) { 3678232153Smm while (count > 0) { 3679232153Smm uint16_t v = archive_le16dec(u16); 3680232153Smm archive_be16enc(u16, v); 3681232153Smm u16 += 2; 3682232153Smm count--; 3683232153Smm } 3684232153Smm } 3685232153Smm } 3686232153Smm return (0); 3687232153Smm} 3688232153Smm 3689232153Smmstatic int 3690238856Smmwin_strncat_to_utf16be(struct archive_string *as16, const void *_p, 3691238856Smm size_t length, struct archive_string_conv *sc) 3692232153Smm{ 3693232153Smm return (win_strncat_to_utf16(as16, _p, length, sc, 1)); 3694232153Smm} 3695232153Smm 3696232153Smmstatic int 3697238856Smmwin_strncat_to_utf16le(struct archive_string *as16, const void *_p, 3698238856Smm size_t length, struct archive_string_conv *sc) 3699232153Smm{ 3700232153Smm return (win_strncat_to_utf16(as16, _p, length, sc, 0)); 3701232153Smm} 3702232153Smm 3703232153Smm#endif /* _WIN32 && !__CYGWIN__ */ 3704232153Smm 3705232153Smm/* 3706232153Smm * Do the best effort for conversions. 3707232153Smm * We cannot handle UTF-16BE character-set without such iconv, 3708232153Smm * but there is a chance if a string consists just ASCII code or 3709232153Smm * a current locale is UTF-8. 3710232153Smm */ 3711232153Smm 3712232153Smm/* 3713232153Smm * Convert a UTF-16BE string to current locale and copy the result. 3714311041Smm * Return -1 if conversion fails. 
3715232153Smm */ 3716232153Smmstatic int 3717232153Smmbest_effort_strncat_from_utf16(struct archive_string *as, const void *_p, 3718232153Smm size_t bytes, struct archive_string_conv *sc, int be) 3719232153Smm{ 3720232153Smm const char *utf16 = (const char *)_p; 3721232153Smm char *mbs; 3722232153Smm uint32_t uc; 3723232153Smm int n, ret; 3724232153Smm 3725232153Smm (void)sc; /* UNUSED */ 3726232153Smm /* 3727232153Smm * Other case, we should do the best effort. 3728232153Smm * If all character are ASCII(<0x7f), we can convert it. 3729232153Smm * if not , we set a alternative character and return -1. 3730232153Smm */ 3731232153Smm ret = 0; 3732232153Smm if (archive_string_ensure(as, as->length + bytes +1) == NULL) 3733232153Smm return (-1); 3734232153Smm mbs = as->s + as->length; 3735232153Smm 3736232153Smm while ((n = utf16_to_unicode(&uc, utf16, bytes, be)) != 0) { 3737232153Smm if (n < 0) { 3738232153Smm n *= -1; 3739232153Smm ret = -1; 3740232153Smm } 3741232153Smm bytes -= n; 3742232153Smm utf16 += n; 3743232153Smm 3744232153Smm if (uc > 127) { 3745232153Smm /* We cannot handle it. 
*/ 3746232153Smm *mbs++ = '?'; 3747232153Smm ret = -1; 3748232153Smm } else 3749232153Smm *mbs++ = (char)uc; 3750232153Smm } 3751232153Smm as->length = mbs - as->s; 3752232153Smm as->s[as->length] = '\0'; 3753232153Smm return (ret); 3754232153Smm} 3755232153Smm 3756232153Smmstatic int 3757232153Smmbest_effort_strncat_from_utf16be(struct archive_string *as, const void *_p, 3758232153Smm size_t bytes, struct archive_string_conv *sc) 3759232153Smm{ 3760232153Smm return (best_effort_strncat_from_utf16(as, _p, bytes, sc, 1)); 3761232153Smm} 3762232153Smm 3763232153Smmstatic int 3764232153Smmbest_effort_strncat_from_utf16le(struct archive_string *as, const void *_p, 3765232153Smm size_t bytes, struct archive_string_conv *sc) 3766232153Smm{ 3767232153Smm return (best_effort_strncat_from_utf16(as, _p, bytes, sc, 0)); 3768232153Smm} 3769232153Smm 3770232153Smm/* 3771232153Smm * Convert a current locale string to UTF-16BE/LE and copy the result. 3772311041Smm * Return -1 if conversion fails. 3773232153Smm */ 3774232153Smmstatic int 3775232153Smmbest_effort_strncat_to_utf16(struct archive_string *as16, const void *_p, 3776232153Smm size_t length, struct archive_string_conv *sc, int bigendian) 3777232153Smm{ 3778232153Smm const char *s = (const char *)_p; 3779232153Smm char *utf16; 3780232153Smm size_t remaining; 3781232153Smm int ret; 3782232153Smm 3783232153Smm (void)sc; /* UNUSED */ 3784232153Smm /* 3785232153Smm * Other case, we should do the best effort. 3786232153Smm * If all character are ASCII(<0x7f), we can convert it. 3787232153Smm * if not , we set a alternative character and return -1. 
3788232153Smm */ 3789232153Smm ret = 0; 3790232153Smm remaining = length; 3791232153Smm 3792232153Smm if (archive_string_ensure(as16, 3793232153Smm as16->length + (length + 1) * 2) == NULL) 3794232153Smm return (-1); 3795232153Smm 3796232153Smm utf16 = as16->s + as16->length; 3797232153Smm while (remaining--) { 3798232153Smm unsigned c = *s++; 3799232153Smm if (c > 127) { 3800232153Smm /* We cannot handle it. */ 3801232153Smm c = UNICODE_R_CHAR; 3802232153Smm ret = -1; 3803232153Smm } 3804232153Smm if (bigendian) 3805232153Smm archive_be16enc(utf16, c); 3806232153Smm else 3807232153Smm archive_le16enc(utf16, c); 3808232153Smm utf16 += 2; 3809232153Smm } 3810232153Smm as16->length = utf16 - as16->s; 3811232153Smm as16->s[as16->length] = 0; 3812232153Smm as16->s[as16->length+1] = 0; 3813232153Smm return (ret); 3814232153Smm} 3815232153Smm 3816232153Smmstatic int 3817232153Smmbest_effort_strncat_to_utf16be(struct archive_string *as16, const void *_p, 3818232153Smm size_t length, struct archive_string_conv *sc) 3819232153Smm{ 3820232153Smm return (best_effort_strncat_to_utf16(as16, _p, length, sc, 1)); 3821232153Smm} 3822232153Smm 3823232153Smmstatic int 3824232153Smmbest_effort_strncat_to_utf16le(struct archive_string *as16, const void *_p, 3825232153Smm size_t length, struct archive_string_conv *sc) 3826232153Smm{ 3827232153Smm return (best_effort_strncat_to_utf16(as16, _p, length, sc, 0)); 3828232153Smm} 3829232153Smm 3830232153Smm 3831232153Smm/* 3832232153Smm * Multistring operations. 
3833232153Smm */ 3834232153Smm 3835232153Smmvoid 3836232153Smmarchive_mstring_clean(struct archive_mstring *aes) 3837232153Smm{ 3838232153Smm archive_wstring_free(&(aes->aes_wcs)); 3839232153Smm archive_string_free(&(aes->aes_mbs)); 3840232153Smm archive_string_free(&(aes->aes_utf8)); 3841232153Smm archive_string_free(&(aes->aes_mbs_in_locale)); 3842232153Smm aes->aes_set = 0; 3843232153Smm} 3844232153Smm 3845232153Smmvoid 3846232153Smmarchive_mstring_copy(struct archive_mstring *dest, struct archive_mstring *src) 3847232153Smm{ 3848232153Smm dest->aes_set = src->aes_set; 3849232153Smm archive_string_copy(&(dest->aes_mbs), &(src->aes_mbs)); 3850232153Smm archive_string_copy(&(dest->aes_utf8), &(src->aes_utf8)); 3851232153Smm archive_wstring_copy(&(dest->aes_wcs), &(src->aes_wcs)); 3852232153Smm} 3853232153Smm 3854232153Smmint 3855232153Smmarchive_mstring_get_utf8(struct archive *a, struct archive_mstring *aes, 3856232153Smm const char **p) 3857232153Smm{ 3858232153Smm struct archive_string_conv *sc; 3859232153Smm int r; 3860232153Smm 3861232153Smm /* If we already have a UTF8 form, return that immediately. */ 3862232153Smm if (aes->aes_set & AES_SET_UTF8) { 3863232153Smm *p = aes->aes_utf8.s; 3864232153Smm return (0); 3865232153Smm } 3866232153Smm 3867232153Smm *p = NULL; 3868232153Smm if (aes->aes_set & AES_SET_MBS) { 3869232153Smm sc = archive_string_conversion_to_charset(a, "UTF-8", 1); 3870232153Smm if (sc == NULL) 3871232153Smm return (-1);/* Couldn't allocate memory for sc. */ 3872299529Smm r = archive_strncpy_l(&(aes->aes_utf8), aes->aes_mbs.s, 3873232153Smm aes->aes_mbs.length, sc); 3874232153Smm if (a == NULL) 3875232153Smm free_sconv_object(sc); 3876232153Smm if (r == 0) { 3877232153Smm aes->aes_set |= AES_SET_UTF8; 3878232153Smm *p = aes->aes_utf8.s; 3879232153Smm return (0);/* success. */ 3880232153Smm } else 3881232153Smm return (-1);/* failure. */ 3882232153Smm } 3883232153Smm return (0);/* success. 
*/ 3884232153Smm} 3885232153Smm 3886232153Smmint 3887232153Smmarchive_mstring_get_mbs(struct archive *a, struct archive_mstring *aes, 3888232153Smm const char **p) 3889232153Smm{ 3890232153Smm int r, ret = 0; 3891232153Smm 3892232153Smm (void)a; /* UNUSED */ 3893232153Smm /* If we already have an MBS form, return that immediately. */ 3894232153Smm if (aes->aes_set & AES_SET_MBS) { 3895232153Smm *p = aes->aes_mbs.s; 3896232153Smm return (ret); 3897232153Smm } 3898232153Smm 3899232153Smm *p = NULL; 3900232153Smm /* If there's a WCS form, try converting with the native locale. */ 3901232153Smm if (aes->aes_set & AES_SET_WCS) { 3902232153Smm archive_string_empty(&(aes->aes_mbs)); 3903232153Smm r = archive_string_append_from_wcs(&(aes->aes_mbs), 3904232153Smm aes->aes_wcs.s, aes->aes_wcs.length); 3905232153Smm *p = aes->aes_mbs.s; 3906232153Smm if (r == 0) { 3907232153Smm aes->aes_set |= AES_SET_MBS; 3908232153Smm return (ret); 3909232153Smm } else 3910232153Smm ret = -1; 3911232153Smm } 3912232153Smm 3913232153Smm /* 3914232153Smm * Only a UTF-8 form cannot avail because its conversion already 3915232153Smm * failed at archive_mstring_update_utf8(). 3916232153Smm */ 3917232153Smm return (ret); 3918232153Smm} 3919232153Smm 3920232153Smmint 3921232153Smmarchive_mstring_get_wcs(struct archive *a, struct archive_mstring *aes, 3922232153Smm const wchar_t **wp) 3923232153Smm{ 3924232153Smm int r, ret = 0; 3925232153Smm 3926232153Smm (void)a;/* UNUSED */ 3927232153Smm /* Return WCS form if we already have it. */ 3928232153Smm if (aes->aes_set & AES_SET_WCS) { 3929232153Smm *wp = aes->aes_wcs.s; 3930232153Smm return (ret); 3931232153Smm } 3932232153Smm 3933232153Smm *wp = NULL; 3934232153Smm /* Try converting MBS to WCS using native locale. 
*/ 3935232153Smm if (aes->aes_set & AES_SET_MBS) { 3936232153Smm archive_wstring_empty(&(aes->aes_wcs)); 3937232153Smm r = archive_wstring_append_from_mbs(&(aes->aes_wcs), 3938232153Smm aes->aes_mbs.s, aes->aes_mbs.length); 3939232153Smm if (r == 0) { 3940232153Smm aes->aes_set |= AES_SET_WCS; 3941232153Smm *wp = aes->aes_wcs.s; 3942232153Smm } else 3943232153Smm ret = -1;/* failure. */ 3944232153Smm } 3945232153Smm return (ret); 3946232153Smm} 3947232153Smm 3948232153Smmint 3949232153Smmarchive_mstring_get_mbs_l(struct archive_mstring *aes, 3950232153Smm const char **p, size_t *length, struct archive_string_conv *sc) 3951232153Smm{ 3952232153Smm int r, ret = 0; 3953232153Smm 3954232153Smm#if defined(_WIN32) && !defined(__CYGWIN__) 3955232153Smm /* 3956313570Smm * Internationalization programming on Windows must use Wide 3957232153Smm * characters because Windows platform cannot make locale UTF-8. 3958232153Smm */ 3959232153Smm if (sc != NULL && (aes->aes_set & AES_SET_WCS) != 0) { 3960232153Smm archive_string_empty(&(aes->aes_mbs_in_locale)); 3961232153Smm r = archive_string_append_from_wcs_in_codepage( 3962232153Smm &(aes->aes_mbs_in_locale), aes->aes_wcs.s, 3963232153Smm aes->aes_wcs.length, sc); 3964232153Smm if (r == 0) { 3965232153Smm *p = aes->aes_mbs_in_locale.s; 3966232153Smm if (length != NULL) 3967232153Smm *length = aes->aes_mbs_in_locale.length; 3968232153Smm return (0); 3969232153Smm } else if (errno == ENOMEM) 3970232153Smm return (-1); 3971232153Smm else 3972232153Smm ret = -1; 3973232153Smm } 3974228753Smm#endif 3975232153Smm 3976232153Smm /* If there is not an MBS form but is a WCS form, try converting 3977232153Smm * with the native locale to be used for translating it to specified 3978232153Smm * character-set. 
*/ 3979232153Smm if ((aes->aes_set & AES_SET_MBS) == 0 && 3980232153Smm (aes->aes_set & AES_SET_WCS) != 0) { 3981232153Smm archive_string_empty(&(aes->aes_mbs)); 3982232153Smm r = archive_string_append_from_wcs(&(aes->aes_mbs), 3983232153Smm aes->aes_wcs.s, aes->aes_wcs.length); 3984232153Smm if (r == 0) 3985232153Smm aes->aes_set |= AES_SET_MBS; 3986232153Smm else if (errno == ENOMEM) 3987232153Smm return (-1); 3988232153Smm else 3989232153Smm ret = -1; 3990232153Smm } 3991232153Smm /* If we already have an MBS form, use it to be translated to 3992232153Smm * specified character-set. */ 3993232153Smm if (aes->aes_set & AES_SET_MBS) { 3994232153Smm if (sc == NULL) { 3995232153Smm /* Conversion is unneeded. */ 3996232153Smm *p = aes->aes_mbs.s; 3997232153Smm if (length != NULL) 3998232153Smm *length = aes->aes_mbs.length; 3999232153Smm return (0); 4000232153Smm } 4001238856Smm ret = archive_strncpy_l(&(aes->aes_mbs_in_locale), 4002232153Smm aes->aes_mbs.s, aes->aes_mbs.length, sc); 4003232153Smm *p = aes->aes_mbs_in_locale.s; 4004232153Smm if (length != NULL) 4005232153Smm *length = aes->aes_mbs_in_locale.length; 4006232153Smm } else { 4007232153Smm *p = NULL; 4008232153Smm if (length != NULL) 4009232153Smm *length = 0; 4010232153Smm } 4011232153Smm return (ret); 4012228753Smm} 4013228753Smm 4014232153Smmint 4015232153Smmarchive_mstring_copy_mbs(struct archive_mstring *aes, const char *mbs) 4016232153Smm{ 4017232153Smm if (mbs == NULL) { 4018232153Smm aes->aes_set = 0; 4019232153Smm return (0); 4020232153Smm } 4021232153Smm return (archive_mstring_copy_mbs_len(aes, mbs, strlen(mbs))); 4022232153Smm} 4023232153Smm 4024232153Smmint 4025232153Smmarchive_mstring_copy_mbs_len(struct archive_mstring *aes, const char *mbs, 4026232153Smm size_t len) 4027232153Smm{ 4028232153Smm if (mbs == NULL) { 4029232153Smm aes->aes_set = 0; 4030232153Smm return (0); 4031232153Smm } 4032232153Smm aes->aes_set = AES_SET_MBS; /* Only MBS form is set now. 
*/ 4033232153Smm archive_strncpy(&(aes->aes_mbs), mbs, len); 4034232153Smm archive_string_empty(&(aes->aes_utf8)); 4035232153Smm archive_wstring_empty(&(aes->aes_wcs)); 4036232153Smm return (0); 4037232153Smm} 4038232153Smm 4039232153Smmint 4040232153Smmarchive_mstring_copy_wcs(struct archive_mstring *aes, const wchar_t *wcs) 4041232153Smm{ 4042238856Smm return archive_mstring_copy_wcs_len(aes, wcs, 4043238856Smm wcs == NULL ? 0 : wcslen(wcs)); 4044232153Smm} 4045232153Smm 4046232153Smmint 4047299529Smmarchive_mstring_copy_utf8(struct archive_mstring *aes, const char *utf8) 4048299529Smm{ 4049299529Smm if (utf8 == NULL) { 4050299529Smm aes->aes_set = 0; 4051299529Smm } 4052299529Smm aes->aes_set = AES_SET_UTF8; 4053299529Smm archive_string_empty(&(aes->aes_mbs)); 4054299529Smm archive_string_empty(&(aes->aes_wcs)); 4055299529Smm archive_strncpy(&(aes->aes_utf8), utf8, strlen(utf8)); 4056299529Smm return (int)strlen(utf8); 4057299529Smm} 4058299529Smm 4059299529Smmint 4060232153Smmarchive_mstring_copy_wcs_len(struct archive_mstring *aes, const wchar_t *wcs, 4061232153Smm size_t len) 4062232153Smm{ 4063232153Smm if (wcs == NULL) { 4064232153Smm aes->aes_set = 0; 4065232153Smm } 4066232153Smm aes->aes_set = AES_SET_WCS; /* Only WCS form set. 
*/ 4067232153Smm archive_string_empty(&(aes->aes_mbs)); 4068232153Smm archive_string_empty(&(aes->aes_utf8)); 4069232153Smm archive_wstrncpy(&(aes->aes_wcs), wcs, len); 4070232153Smm return (0); 4071232153Smm} 4072232153Smm 4073232153Smmint 4074232153Smmarchive_mstring_copy_mbs_len_l(struct archive_mstring *aes, 4075232153Smm const char *mbs, size_t len, struct archive_string_conv *sc) 4076232153Smm{ 4077232153Smm int r; 4078232153Smm 4079232153Smm if (mbs == NULL) { 4080232153Smm aes->aes_set = 0; 4081232153Smm return (0); 4082232153Smm } 4083232153Smm archive_string_empty(&(aes->aes_mbs)); 4084232153Smm archive_wstring_empty(&(aes->aes_wcs)); 4085232153Smm archive_string_empty(&(aes->aes_utf8)); 4086232153Smm#if defined(_WIN32) && !defined(__CYGWIN__) 4087232153Smm /* 4088313570Smm * Internationalization programming on Windows must use Wide 4089232153Smm * characters because Windows platform cannot make locale UTF-8. 4090232153Smm */ 4091232153Smm if (sc == NULL) { 4092232153Smm if (archive_string_append(&(aes->aes_mbs), 4093232153Smm mbs, mbsnbytes(mbs, len)) == NULL) { 4094232153Smm aes->aes_set = 0; 4095232153Smm r = -1; 4096232153Smm } else { 4097232153Smm aes->aes_set = AES_SET_MBS; 4098232153Smm r = 0; 4099232153Smm } 4100232153Smm#if defined(HAVE_ICONV) 4101232153Smm } else if (sc != NULL && sc->cd_w != (iconv_t)-1) { 4102232153Smm /* 4103232153Smm * This case happens only when MultiByteToWideChar() cannot 4104232153Smm * handle sc->from_cp, and we have to iconv in order to 4105232153Smm * translate character-set to wchar_t,UTF-16. 4106232153Smm */ 4107232153Smm iconv_t cd = sc->cd; 4108232153Smm unsigned from_cp; 4109232153Smm int flag; 4110232153Smm 4111232153Smm /* 4112232153Smm * Translate multi-bytes from some character-set to UTF-8. 
4113232153Smm */ 4114232153Smm sc->cd = sc->cd_w; 4115238856Smm r = archive_strncpy_l(&(aes->aes_utf8), mbs, len, sc); 4116232153Smm sc->cd = cd; 4117232153Smm if (r != 0) { 4118232153Smm aes->aes_set = 0; 4119232153Smm return (r); 4120232153Smm } 4121232153Smm aes->aes_set = AES_SET_UTF8; 4122232153Smm 4123232153Smm /* 4124232153Smm * Append the UTF-8 string into wstring. 4125232153Smm */ 4126232153Smm flag = sc->flag; 4127232153Smm sc->flag &= ~(SCONV_NORMALIZATION_C 4128232153Smm | SCONV_TO_UTF16| SCONV_FROM_UTF16); 4129232153Smm from_cp = sc->from_cp; 4130232153Smm sc->from_cp = CP_UTF8; 4131232153Smm r = archive_wstring_append_from_mbs_in_codepage(&(aes->aes_wcs), 4132232153Smm aes->aes_utf8.s, aes->aes_utf8.length, sc); 4133232153Smm sc->flag = flag; 4134232153Smm sc->from_cp = from_cp; 4135232153Smm if (r == 0) 4136232153Smm aes->aes_set |= AES_SET_WCS; 4137232153Smm#endif 4138232153Smm } else { 4139232153Smm r = archive_wstring_append_from_mbs_in_codepage( 4140232153Smm &(aes->aes_wcs), mbs, len, sc); 4141232153Smm if (r == 0) 4142232153Smm aes->aes_set = AES_SET_WCS; 4143232153Smm else 4144232153Smm aes->aes_set = 0; 4145232153Smm } 4146232153Smm#else 4147238856Smm r = archive_strncpy_l(&(aes->aes_mbs), mbs, len, sc); 4148232153Smm if (r == 0) 4149232153Smm aes->aes_set = AES_SET_MBS; /* Only MBS form is set now. */ 4150232153Smm else 4151232153Smm aes->aes_set = 0; 4152232153Smm#endif 4153232153Smm return (r); 4154232153Smm} 4155232153Smm 4156232153Smm/* 4157232153Smm * The 'update' form tries to proactively update all forms of 4158232153Smm * this string (WCS and MBS) and returns an error if any of 4159232153Smm * them fail. This is used by the 'pax' handler, for instance, 4160232153Smm * to detect and report character-conversion failures early while 4161232153Smm * still allowing clients to get potentially useful values from 4162232153Smm * the more tolerant lazy conversions. 
(get_mbs and get_wcs will
 * strive to give the user something useful, so you can get hopefully
 * usable values even if some of the character conversions are failing.)
 */
int
archive_mstring_update_utf8(struct archive *a, struct archive_mstring *aes,
    const char *utf8)
{
	struct archive_string_conv *conv;
	int status;

	/* A NULL string clears every form; that counts as success. */
	if (utf8 == NULL) {
		aes->aes_set = 0;
		return (0);
	}

	/* Drop the stale MBS and WCS forms, then store the new UTF-8 text. */
	archive_string_empty(&(aes->aes_mbs));
	archive_wstring_empty(&(aes->aes_wcs));
	archive_strcpy(&(aes->aes_utf8), utf8);
	aes->aes_set = AES_SET_UTF8;	/* UTF8 is the only valid form so far. */

	/* Step 1: UTF-8 -> MBS. */
	conv = archive_string_conversion_from_charset(a, "UTF-8", 1);
	if (conv == NULL)
		return (-1);	/* No conversion object could be obtained. */
	status = archive_strcpy_l(&(aes->aes_mbs), utf8, conv);
	/* Without an archive object the converter is released here. */
	if (a == NULL)
		free_sconv_object(conv);
	if (status != 0)
		return (-1);
	aes->aes_set |= AES_SET_MBS;	/* UTF8 and MBS are valid now. */

	/* Step 2: MBS -> WCS. */
	status = archive_wstring_append_from_mbs(&(aes->aes_wcs),
	    aes->aes_mbs.s, aes->aes_mbs.length);
	if (status != 0)
		return (-1);
	aes->aes_set |= AES_SET_WCS;	/* All three forms are valid. */

	return (0);
}