1228753Smm/*- 2228753Smm * Copyright (c) 2003-2007 Tim Kientzle 3228753Smm * All rights reserved. 4228753Smm * 5228753Smm * Redistribution and use in source and binary forms, with or without 6228753Smm * modification, are permitted provided that the following conditions 7228753Smm * are met: 8228753Smm * 1. Redistributions of source code must retain the above copyright 9228753Smm * notice, this list of conditions and the following disclaimer. 10228753Smm * 2. Redistributions in binary form must reproduce the above copyright 11228753Smm * notice, this list of conditions and the following disclaimer in the 12228753Smm * documentation and/or other materials provided with the distribution. 13228753Smm * 14228753Smm * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR 15228753Smm * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 16228753Smm * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 17228753Smm * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT, 18228753Smm * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 19228753Smm * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 20228753Smm * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 21228753Smm * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22228753Smm * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 23228753Smm * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24228753Smm */ 25228753Smm 26228753Smm#include "archive_platform.h" 27229592Smm__FBSDID("$FreeBSD$"); 28228753Smm 29228753Smm/* 30228753Smm * Basic resizable string support, to simplify manipulating arbitrary-sized 31228753Smm * strings while minimizing heap activity. 32228753Smm */ 33228753Smm 34228753Smm#ifdef HAVE_STDLIB_H 35228753Smm#include <stdlib.h> 36228753Smm#endif 37228753Smm#ifdef HAVE_STRING_H 38228753Smm#include <string.h> 39228753Smm#endif 40228753Smm#ifdef HAVE_WCHAR_H 41228753Smm#include <wchar.h> 42228753Smm#endif 43228753Smm#if defined(_WIN32) && !defined(__CYGWIN__) 44228753Smm#include <windows.h> 45228753Smm#endif 46228753Smm 47228753Smm#include "archive_private.h" 48228753Smm#include "archive_string.h" 49228753Smm 50228753Smmstruct archive_string * 51228753Smm__archive_string_append(struct archive_string *as, const char *p, size_t s) 52228753Smm{ 53228753Smm if (__archive_string_ensure(as, as->length + s + 1) == NULL) 54228753Smm __archive_errx(1, "Out of memory"); 55228753Smm memcpy(as->s + as->length, p, s); 56228753Smm as->s[as->length + s] = 0; 57228753Smm as->length += s; 58228753Smm return (as); 59228753Smm} 60228753Smm 61228753Smmvoid 62228753Smm__archive_string_copy(struct archive_string *dest, struct archive_string *src) 63228753Smm{ 64228753Smm if (src->length == 0) 65228753Smm dest->length = 0; 66228753Smm else { 67228753Smm if (__archive_string_ensure(dest, src->length + 1) == NULL) 68228753Smm __archive_errx(1, "Out of memory"); 69228753Smm memcpy(dest->s, src->s, src->length); 70228753Smm dest->length = src->length; 71228753Smm dest->s[dest->length] = 0; 72228753Smm } 73228753Smm} 74228753Smm 75228753Smmvoid 76228753Smm__archive_string_concat(struct archive_string *dest, struct archive_string *src) 77228753Smm{ 78228753Smm if (src->length > 0) { 79228753Smm if (__archive_string_ensure(dest, dest->length + src->length + 1) == NULL) 80228753Smm __archive_errx(1, "Out of memory"); 81228753Smm memcpy(dest->s + dest->length, src->s, src->length); 82228753Smm dest->length += src->length; 83228753Smm dest->s[dest->length] = 0; 84228753Smm } 85228753Smm} 86228753Smm 87228753Smmvoid 88228753Smm__archive_string_free(struct archive_string *as) 89228753Smm{ 90228753Smm as->length = 0; 91228753Smm as->buffer_length = 0; 92228753Smm if (as->s != NULL) { 93228753Smm free(as->s); 94228753Smm as->s = NULL; 95228753Smm } 96228753Smm} 97228753Smm 98228753Smm/* Returns NULL on any allocation failure. */ 99228753Smmstruct archive_string * 100228753Smm__archive_string_ensure(struct archive_string *as, size_t s) 101228753Smm{ 102228753Smm /* If buffer is already big enough, don't reallocate. */ 103228753Smm if (as->s && (s <= as->buffer_length)) 104228753Smm return (as); 105228753Smm 106228753Smm /* 107228753Smm * Growing the buffer at least exponentially ensures that 108228753Smm * append operations are always linear in the number of 109228753Smm * characters appended. Using a smaller growth rate for 110228753Smm * larger buffers reduces memory waste somewhat at the cost of 111228753Smm * a larger constant factor. 112228753Smm */ 113228753Smm if (as->buffer_length < 32) 114228753Smm /* Start with a minimum 32-character buffer. */ 115228753Smm as->buffer_length = 32; 116228753Smm else if (as->buffer_length < 8192) 117228753Smm /* Buffers under 8k are doubled for speed. */ 118228753Smm as->buffer_length += as->buffer_length; 119228753Smm else { 120228753Smm /* Buffers 8k and over grow by at least 25% each time. */ 121228753Smm size_t old_length = as->buffer_length; 122228753Smm as->buffer_length += as->buffer_length / 4; 123228753Smm /* Be safe: If size wraps, release buffer and return NULL. */ 124228753Smm if (as->buffer_length < old_length) { 125228753Smm free(as->s); 126228753Smm as->s = NULL; 127228753Smm return (NULL); 128228753Smm } 129228753Smm } 130228753Smm /* 131228753Smm * The computation above is a lower limit to how much we'll 132228753Smm * grow the buffer. In any case, we have to grow it enough to 133228753Smm * hold the request. 134228753Smm */ 135228753Smm if (as->buffer_length < s) 136228753Smm as->buffer_length = s; 137228753Smm /* Now we can reallocate the buffer. */ 138228753Smm as->s = (char *)realloc(as->s, as->buffer_length); 139228753Smm if (as->s == NULL) 140228753Smm return (NULL); 141228753Smm return (as); 142228753Smm} 143228753Smm 144228753Smmstruct archive_string * 145228753Smm__archive_strncat(struct archive_string *as, const void *_p, size_t n) 146228753Smm{ 147228753Smm size_t s; 148228753Smm const char *p, *pp; 149228753Smm 150228753Smm p = (const char *)_p; 151228753Smm 152228753Smm /* Like strlen(p), except won't examine positions beyond p[n]. */ 153228753Smm s = 0; 154228753Smm pp = p; 155228753Smm while (s < n && *pp) { 156228753Smm pp++; 157228753Smm s++; 158228753Smm } 159228753Smm return (__archive_string_append(as, p, s)); 160228753Smm} 161228753Smm 162228753Smmstruct archive_string * 163228753Smm__archive_strappend_char(struct archive_string *as, char c) 164228753Smm{ 165228753Smm return (__archive_string_append(as, &c, 1)); 166228753Smm} 167228753Smm 168228753Smm/* 169228753Smm * Translates a wide character string into UTF-8 and appends 170228753Smm * to the archive_string. Note: returns NULL if conversion fails, 171228753Smm * but still leaves a best-effort conversion in the argument as. 172228753Smm */ 173228753Smmstruct archive_string * 174228753Smm__archive_strappend_w_utf8(struct archive_string *as, const wchar_t *w) 175228753Smm{ 176228753Smm char *p; 177228753Smm unsigned wc; 178228753Smm char buff[256]; 179228753Smm struct archive_string *return_val = as; 180228753Smm 181228753Smm /* 182228753Smm * Convert one wide char at a time into 'buff', whenever that 183228753Smm * fills, append it to the string. 184228753Smm */ 185228753Smm p = buff; 186228753Smm while (*w != L'\0') { 187228753Smm /* Flush the buffer when we have <=16 bytes free. */ 188228753Smm /* (No encoding has a single character >16 bytes.) */ 189228753Smm if ((size_t)(p - buff) >= (size_t)(sizeof(buff) - 16)) { 190228753Smm *p = '\0'; 191228753Smm archive_strcat(as, buff); 192228753Smm p = buff; 193228753Smm } 194228753Smm wc = *w++; 195228753Smm /* If this is a surrogate pair, assemble the full code point.*/ 196228753Smm /* Note: wc must not be wchar_t here, because the full code 197228753Smm * point can be more than 16 bits! */ 198228753Smm if (wc >= 0xD800 && wc <= 0xDBff 199228753Smm && *w >= 0xDC00 && *w <= 0xDFFF) { 200228753Smm wc -= 0xD800; 201228753Smm wc *= 0x400; 202228753Smm wc += (*w - 0xDC00); 203228753Smm wc += 0x10000; 204228753Smm ++w; 205228753Smm } 206228753Smm /* Translate code point to UTF8 */ 207228753Smm if (wc <= 0x7f) { 208228753Smm *p++ = (char)wc; 209228753Smm } else if (wc <= 0x7ff) { 210228753Smm *p++ = 0xc0 | ((wc >> 6) & 0x1f); 211228753Smm *p++ = 0x80 | (wc & 0x3f); 212228753Smm } else if (wc <= 0xffff) { 213228753Smm *p++ = 0xe0 | ((wc >> 12) & 0x0f); 214228753Smm *p++ = 0x80 | ((wc >> 6) & 0x3f); 215228753Smm *p++ = 0x80 | (wc & 0x3f); 216228753Smm } else if (wc <= 0x1fffff) { 217228753Smm *p++ = 0xf0 | ((wc >> 18) & 0x07); 218228753Smm *p++ = 0x80 | ((wc >> 12) & 0x3f); 219228753Smm *p++ = 0x80 | ((wc >> 6) & 0x3f); 220228753Smm *p++ = 0x80 | (wc & 0x3f); 221228753Smm } else { 222228753Smm /* Unicode has no codes larger than 0x1fffff. */ 223228753Smm /* TODO: use \uXXXX escape here instead of ? */ 224228753Smm *p++ = '?'; 225228753Smm return_val = NULL; 226228753Smm } 227228753Smm } 228228753Smm *p = '\0'; 229228753Smm archive_strcat(as, buff); 230228753Smm return (return_val); 231228753Smm} 232228753Smm 233228753Smmstatic int 234228753Smmutf8_to_unicode(int *pwc, const char *s, size_t n) 235228753Smm{ 236228753Smm int ch; 237228753Smm 238228753Smm /* 239228753Smm * Decode 1-4 bytes depending on the value of the first byte. 240228753Smm */ 241228753Smm ch = (unsigned char)*s; 242228753Smm if (ch == 0) { 243228753Smm return (0); /* Standard: return 0 for end-of-string. */ 244228753Smm } 245228753Smm if ((ch & 0x80) == 0) { 246228753Smm *pwc = ch & 0x7f; 247228753Smm return (1); 248228753Smm } 249228753Smm if ((ch & 0xe0) == 0xc0) { 250228753Smm if (n < 2) 251228753Smm return (-1); 252228753Smm if ((s[1] & 0xc0) != 0x80) return (-1); 253228753Smm *pwc = ((ch & 0x1f) << 6) | (s[1] & 0x3f); 254228753Smm return (2); 255228753Smm } 256228753Smm if ((ch & 0xf0) == 0xe0) { 257228753Smm if (n < 3) 258228753Smm return (-1); 259228753Smm if ((s[1] & 0xc0) != 0x80) return (-1); 260228753Smm if ((s[2] & 0xc0) != 0x80) return (-1); 261228753Smm *pwc = ((ch & 0x0f) << 12) 262228753Smm | ((s[1] & 0x3f) << 6) 263228753Smm | (s[2] & 0x3f); 264228753Smm return (3); 265228753Smm } 266228753Smm if ((ch & 0xf8) == 0xf0) { 267228753Smm if (n < 4) 268228753Smm return (-1); 269228753Smm if ((s[1] & 0xc0) != 0x80) return (-1); 270228753Smm if ((s[2] & 0xc0) != 0x80) return (-1); 271228753Smm if ((s[3] & 0xc0) != 0x80) return (-1); 272228753Smm *pwc = ((ch & 0x07) << 18) 273228753Smm | ((s[1] & 0x3f) << 12) 274228753Smm | ((s[2] & 0x3f) << 6) 275228753Smm | (s[3] & 0x3f); 276228753Smm return (4); 277228753Smm } 278228753Smm /* Invalid first byte. */ 279228753Smm return (-1); 280228753Smm} 281228753Smm 282228753Smm/* 283228753Smm * Return a wide-character Unicode string by converting this archive_string 284228753Smm * from UTF-8. We assume that systems with 16-bit wchar_t always use 285228753Smm * UTF16 and systems with 32-bit wchar_t can accept UCS4. 286228753Smm */ 287228753Smmwchar_t * 288228753Smm__archive_string_utf8_w(struct archive_string *as) 289228753Smm{ 290228753Smm wchar_t *ws, *dest; 291228753Smm int wc, wc2;/* Must be large enough for a 21-bit Unicode code point. */ 292228753Smm const char *src; 293228753Smm int n; 294228753Smm 295228753Smm ws = (wchar_t *)malloc((as->length + 1) * sizeof(wchar_t)); 296228753Smm if (ws == NULL) 297228753Smm __archive_errx(1, "Out of memory"); 298228753Smm dest = ws; 299228753Smm src = as->s; 300228753Smm while (*src != '\0') { 301228753Smm n = utf8_to_unicode(&wc, src, 8); 302228753Smm if (n == 0) 303228753Smm break; 304228753Smm if (n < 0) { 305228753Smm free(ws); 306228753Smm return (NULL); 307228753Smm } 308228753Smm src += n; 309228753Smm if (wc >= 0xDC00 && wc <= 0xDBFF) { 310228753Smm /* This is a leading surrogate; some idiot 311228753Smm * has translated UTF16 to UTF8 without combining 312228753Smm * surrogates; rebuild the full code point before 313228753Smm * continuing. */ 314228753Smm n = utf8_to_unicode(&wc2, src, 8); 315228753Smm if (n < 0) { 316228753Smm free(ws); 317228753Smm return (NULL); 318228753Smm } 319228753Smm if (n == 0) /* Ignore the leading surrogate */ 320228753Smm break; 321228753Smm if (wc2 < 0xDC00 || wc2 > 0xDFFF) { 322228753Smm /* If the second character isn't a 323228753Smm * trailing surrogate, then someone 324228753Smm * has really screwed up and this is 325228753Smm * invalid. */ 326228753Smm free(ws); 327228753Smm return (NULL); 328228753Smm } else { 329228753Smm src += n; 330228753Smm wc -= 0xD800; 331228753Smm wc *= 0x400; 332228753Smm wc += wc2 - 0xDC00; 333228753Smm wc += 0x10000; 334228753Smm } 335228753Smm } 336228753Smm if ((sizeof(wchar_t) < 4) && (wc > 0xffff)) { 337228753Smm /* We have a code point that won't fit into a 338228753Smm * wchar_t; convert it to a surrogate pair. */ 339228753Smm wc -= 0x10000; 340228753Smm *dest++ = ((wc >> 10) & 0x3ff) + 0xD800; 341228753Smm *dest++ = (wc & 0x3ff) + 0xDC00; 342228753Smm } else 343228753Smm *dest++ = wc; 344228753Smm } 345228753Smm *dest = L'\0'; 346228753Smm return (ws); 347228753Smm} 348228753Smm 349228753Smm#if defined(_WIN32) && !defined(__CYGWIN__) 350228753Smm 351228753Smm/* 352228753Smm * Translates a wide character string into current locale character set 353228753Smm * and appends to the archive_string. Note: returns NULL if conversion 354228753Smm * fails. 355228753Smm * 356228753Smm * Win32 builds use WideCharToMultiByte from the Windows API. 357228753Smm * (Maybe Cygwin should too? WideCharToMultiByte will know a 358228753Smm * lot more about local character encodings than the wcrtomb() 359228753Smm * wrapper is going to know.) 360228753Smm */ 361228753Smmstruct archive_string * 362228753Smm__archive_strappend_w_mbs(struct archive_string *as, const wchar_t *w) 363228753Smm{ 364228753Smm char *p; 365228753Smm int l, wl; 366228753Smm BOOL useDefaultChar = FALSE; 367228753Smm 368228753Smm wl = (int)wcslen(w); 369228753Smm l = wl * 4 + 4; 370228753Smm p = malloc(l); 371228753Smm if (p == NULL) 372228753Smm __archive_errx(1, "Out of memory"); 373228753Smm /* To check a useDefaultChar is to simulate error handling of 374228753Smm * the my_wcstombs() which is running on non Windows system with 375228753Smm * wctomb(). 376228753Smm * And to set NULL for last argument is necessary when a codepage 377228753Smm * is not CP_ACP(current locale). 378228753Smm */ 379228753Smm l = WideCharToMultiByte(CP_ACP, 0, w, wl, p, l, NULL, &useDefaultChar); 380228753Smm if (l == 0) { 381228753Smm free(p); 382228753Smm return (NULL); 383228753Smm } 384228753Smm __archive_string_append(as, p, l); 385228753Smm free(p); 386228753Smm return (as); 387228753Smm} 388228753Smm 389228753Smm#else 390228753Smm 391228753Smm/* 392228753Smm * Translates a wide character string into current locale character set 393228753Smm * and appends to the archive_string. Note: returns NULL if conversion 394228753Smm * fails. 395228753Smm * 396228753Smm * Non-Windows uses ISO C wcrtomb() or wctomb() to perform the conversion 397228753Smm * one character at a time. If a non-Windows platform doesn't have 398228753Smm * either of these, fall back to the built-in UTF8 conversion. 399228753Smm */ 400228753Smmstruct archive_string * 401228753Smm__archive_strappend_w_mbs(struct archive_string *as, const wchar_t *w) 402228753Smm{ 403228753Smm#if !defined(HAVE_WCTOMB) && !defined(HAVE_WCRTOMB) 404228753Smm /* If there's no built-in locale support, fall back to UTF8 always. */ 405228753Smm return __archive_strappend_w_utf8(as, w); 406228753Smm#else 407228753Smm /* We cannot use the standard wcstombs() here because it 408228753Smm * cannot tell us how big the output buffer should be. So 409228753Smm * I've built a loop around wcrtomb() or wctomb() that 410228753Smm * converts a character at a time and resizes the string as 411228753Smm * needed. We prefer wcrtomb() when it's available because 412228753Smm * it's thread-safe. */ 413228753Smm int n; 414228753Smm char *p; 415228753Smm char buff[256]; 416228753Smm#if HAVE_WCRTOMB 417228753Smm mbstate_t shift_state; 418228753Smm 419228753Smm memset(&shift_state, 0, sizeof(shift_state)); 420228753Smm#else 421228753Smm /* Clear the shift state before starting. */ 422228753Smm wctomb(NULL, L'\0'); 423228753Smm#endif 424228753Smm 425228753Smm /* 426228753Smm * Convert one wide char at a time into 'buff', whenever that 427228753Smm * fills, append it to the string. 428228753Smm */ 429228753Smm p = buff; 430228753Smm while (*w != L'\0') { 431228753Smm /* Flush the buffer when we have <=16 bytes free. */ 432228753Smm /* (No encoding has a single character >16 bytes.) */ 433228753Smm if ((size_t)(p - buff) >= (size_t)(sizeof(buff) - MB_CUR_MAX)) { 434228753Smm *p = '\0'; 435228753Smm archive_strcat(as, buff); 436228753Smm p = buff; 437228753Smm } 438228753Smm#if HAVE_WCRTOMB 439228753Smm n = wcrtomb(p, *w++, &shift_state); 440228753Smm#else 441228753Smm n = wctomb(p, *w++); 442228753Smm#endif 443228753Smm if (n == -1) 444228753Smm return (NULL); 445228753Smm p += n; 446228753Smm } 447228753Smm *p = '\0'; 448228753Smm archive_strcat(as, buff); 449228753Smm return (as); 450228753Smm#endif 451228753Smm} 452228753Smm 453228753Smm#endif /* _WIN32 && ! __CYGWIN__ */ 454