1286432Sbapt/* 2286432Sbapt * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 3286432Sbapt * Copyright 2012 Garrett D'Amore <garrett@damore.org> All rights reserved. 4286432Sbapt * Copyright 2015 John Marino <draco@marino.st> 5286432Sbapt * 6286432Sbapt * This source code is derived from the illumos localedef command, and 7286432Sbapt * provided under BSD-style license terms by Nexenta Systems, Inc. 8286432Sbapt * 9286432Sbapt * Redistribution and use in source and binary forms, with or without 10286432Sbapt * modification, are permitted provided that the following conditions 11286432Sbapt * are met: 12286432Sbapt * 13286432Sbapt * 1. Redistributions of source code must retain the above copyright 14286432Sbapt * notice, this list of conditions and the following disclaimer. 15286432Sbapt * 2. Redistributions in binary form must reproduce the above copyright 16286432Sbapt * notice, this list of conditions and the following disclaimer in the 17286432Sbapt * documentation and/or other materials provided with the distribution. 18286432Sbapt * 19286432Sbapt * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20286432Sbapt * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21286432Sbapt * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22286432Sbapt * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 23286432Sbapt * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24286432Sbapt * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25286432Sbapt * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26286432Sbapt * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27286432Sbapt * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28286432Sbapt * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29286432Sbapt * POSSIBILITY OF SUCH DAMAGE. 30286432Sbapt */ 31286432Sbapt 32286432Sbapt/* 33286432Sbapt * The functions in this file convert from the standard multibyte forms 34286432Sbapt * to the wide character forms used internally by libc. Unfortunately, 35286432Sbapt * this approach means that we need a method for each and every encoding. 36286432Sbapt */ 37286432Sbapt#include <sys/cdefs.h> 38286432Sbapt__FBSDID("$FreeBSD: stable/11/usr.bin/localedef/wide.c 321121 2017-07-18 08:35:22Z ngie $"); 39286432Sbapt 40290490Sbapt#include <ctype.h> 41286432Sbapt#include <stdlib.h> 42286432Sbapt#include <wchar.h> 43286432Sbapt#include <string.h> 44286432Sbapt#include <sys/types.h> 45286432Sbapt#include "localedef.h" 46286432Sbapt 47286432Sbaptstatic int towide_none(wchar_t *, const char *, unsigned); 48286432Sbaptstatic int towide_utf8(wchar_t *, const char *, unsigned); 49286432Sbaptstatic int towide_big5(wchar_t *, const char *, unsigned); 50286432Sbaptstatic int towide_gbk(wchar_t *, const char *, unsigned); 51286432Sbaptstatic int towide_gb2312(wchar_t *, const char *, unsigned); 52286432Sbaptstatic int towide_gb18030(wchar_t *, const char *, unsigned); 53286432Sbaptstatic int towide_mskanji(wchar_t *, const char *, unsigned); 54286432Sbaptstatic int towide_euccn(wchar_t *, const char *, unsigned); 55286432Sbaptstatic int towide_eucjp(wchar_t *, const char *, unsigned); 56286432Sbaptstatic int towide_euckr(wchar_t *, const char *, unsigned); 57286432Sbaptstatic int towide_euctw(wchar_t *, const char *, unsigned); 58286432Sbapt 59286432Sbaptstatic int tomb_none(char *, wchar_t); 60286432Sbaptstatic int tomb_utf8(char *, wchar_t); 61286432Sbaptstatic int tomb_mbs(char *, wchar_t); 62286432Sbapt 63286432Sbaptstatic int (*_towide)(wchar_t *, const char *, unsigned) = towide_none; 64286432Sbaptstatic int (*_tomb)(char *, wchar_t) = tomb_none; 65290233Sbaptstatic char _encoding_buffer[20] = {'N','O','N','E'}; 66290233Sbaptstatic const char *_encoding = _encoding_buffer; 67286432Sbaptstatic int _nbits = 7; 68286432Sbapt 69286432Sbapt/* 70286432Sbapt * Table of supported encodings. We only bother to list the multibyte 71286432Sbapt * encodings here, because single byte locales are handed by "NONE". 72286432Sbapt */ 73286432Sbaptstatic struct { 74286432Sbapt const char *name; 75286432Sbapt /* the name that the underlying libc implemenation uses */ 76286432Sbapt const char *cname; 77286432Sbapt /* the maximum number of bits required for priorities */ 78286432Sbapt int nbits; 79286432Sbapt int (*towide)(wchar_t *, const char *, unsigned); 80286432Sbapt int (*tomb)(char *, wchar_t); 81286432Sbapt} mb_encodings[] = { 82286432Sbapt /* 83286432Sbapt * UTF8 values max out at 0x1fffff (although in theory there could 84286432Sbapt * be later extensions, but it won't happen.) This means we only need 85286432Sbapt * 21 bits to be able to encode the entire range of priorities. 86286432Sbapt */ 87286432Sbapt { "UTF-8", "UTF-8", 21, towide_utf8, tomb_utf8 }, 88286432Sbapt { "UTF8", "UTF-8", 21, towide_utf8, tomb_utf8 }, 89286432Sbapt { "utf8", "UTF-8", 21, towide_utf8, tomb_utf8 }, 90286432Sbapt { "utf-8", "UTF-8", 21, towide_utf8, tomb_utf8 }, 91286432Sbapt 92286432Sbapt { "EUC-CN", "EUC-CN", 16, towide_euccn, tomb_mbs }, 93286432Sbapt { "eucCN", "EUC-CN", 16, towide_euccn, tomb_mbs }, 94286432Sbapt /* 95298746Spfg * Because the 3-byte form of EUC-JP use the same leading byte, 96286432Sbapt * only 17 bits required to provide unique priorities. (The low 97286432Sbapt * bit of that first byte is set.) By setting this value low, 98286432Sbapt * we can get by with only 3 bytes in the strxfrm expansion. 99286432Sbapt */ 100286432Sbapt { "EUC-JP", "EUC-JP", 17, towide_eucjp, tomb_mbs }, 101286432Sbapt { "eucJP", "EUC-JP", 17, towide_eucjp, tomb_mbs }, 102286432Sbapt 103286432Sbapt { "EUC-KR", "EUC-KR", 16, towide_euckr, tomb_mbs }, 104286432Sbapt { "eucKR", "EUC-KR", 16, towide_euckr, tomb_mbs }, 105286432Sbapt /* 106286432Sbapt * EUC-TW uses 2 bytes most of the time, but 4 bytes if the 107286432Sbapt * high order byte is 0x8E. However, with 4 byte encodings, 108286432Sbapt * the third byte will be A0-B0. So we only need to consider 109286432Sbapt * the lower order 24 bits for collation. 110286432Sbapt */ 111286432Sbapt { "EUC-TW", "EUC-TW", 24, towide_euctw, tomb_mbs }, 112286432Sbapt { "eucTW", "EUC-TW", 24, towide_euctw, tomb_mbs }, 113286432Sbapt 114286432Sbapt { "MS_Kanji", "MSKanji", 16, towide_mskanji, tomb_mbs }, 115286432Sbapt { "MSKanji", "MSKanji", 16, towide_mskanji, tomb_mbs }, 116286432Sbapt { "PCK", "MSKanji", 16, towide_mskanji, tomb_mbs }, 117286432Sbapt { "SJIS", "MSKanji", 16, towide_mskanji, tomb_mbs }, 118286432Sbapt { "Shift_JIS", "MSKanji", 16, towide_mskanji, tomb_mbs }, 119286432Sbapt 120286432Sbapt { "BIG5", "BIG5", 16, towide_big5, tomb_mbs }, 121286432Sbapt { "big5", "BIG5", 16, towide_big5, tomb_mbs }, 122286432Sbapt { "Big5", "BIG5", 16, towide_big5, tomb_mbs }, 123286432Sbapt 124286432Sbapt { "GBK", "GBK", 16, towide_gbk, tomb_mbs }, 125286432Sbapt 126286432Sbapt /* 127286432Sbapt * GB18030 can get away with just 31 bits. This is because the 128286432Sbapt * high order bit is always set for 4 byte values, and the 129286432Sbapt * at least one of the other bits in that 4 byte value will 130286432Sbapt * be non-zero. 131286432Sbapt */ 132286432Sbapt { "GB18030", "GB18030", 31, towide_gb18030, tomb_mbs }, 133286432Sbapt 134286432Sbapt /* 135286432Sbapt * This should probably be an aliase for euc-cn, or vice versa. 136286432Sbapt */ 137286432Sbapt { "GB2312", "GB2312", 16, towide_gb2312, tomb_mbs }, 138286432Sbapt 139286432Sbapt { NULL, NULL, 0, 0, 0 }, 140286432Sbapt}; 141286432Sbapt 142286432Sbaptstatic char * 143286432Sbaptshow_mb(const char *mb) 144286432Sbapt{ 145286432Sbapt static char buf[64]; 146286432Sbapt 147286432Sbapt /* ASCII stuff we just print */ 148286432Sbapt if (isascii(*mb) && isgraph(*mb)) { 149286432Sbapt buf[0] = *mb; 150286432Sbapt buf[1] = 0; 151286432Sbapt return (buf); 152286432Sbapt } 153286432Sbapt buf[0] = 0; 154286432Sbapt while (*mb != 0) { 155286432Sbapt char scr[8]; 156286432Sbapt (void) snprintf(scr, sizeof (scr), "\\x%02x", *mb); 157286432Sbapt (void) strlcat(buf, scr, sizeof (buf)); 158286432Sbapt mb++; 159286432Sbapt } 160286432Sbapt return (buf); 161286432Sbapt} 162286432Sbapt 163286432Sbaptstatic char *widemsg; 164286432Sbapt 165286432Sbaptvoid 166286432Sbaptwerr(const char *fmt, ...) 167286432Sbapt{ 168286432Sbapt char *msg; 169286432Sbapt 170286432Sbapt va_list va; 171286432Sbapt va_start(va, fmt); 172286432Sbapt (void) vasprintf(&msg, fmt, va); 173286432Sbapt va_end(va); 174286432Sbapt 175286432Sbapt free(widemsg); 176286432Sbapt widemsg = msg; 177286432Sbapt} 178286432Sbapt 179286432Sbapt/* 180286432Sbapt * This is used for 8-bit encodings. 181286432Sbapt */ 182286432Sbaptint 183286432Sbapttowide_none(wchar_t *c, const char *mb, unsigned n __unused) 184286432Sbapt{ 185286432Sbapt if (mb_cur_max != 1) { 186286432Sbapt werr("invalid or unsupported multibyte locale"); 187286432Sbapt return (-1); 188286432Sbapt } 189286432Sbapt *c = (uint8_t)*mb; 190286432Sbapt return (1); 191286432Sbapt} 192286432Sbapt 193286432Sbaptint 194286432Sbapttomb_none(char *mb, wchar_t wc) 195286432Sbapt{ 196286432Sbapt if (mb_cur_max != 1) { 197286432Sbapt werr("invalid or unsupported multibyte locale"); 198286432Sbapt return (-1); 199286432Sbapt } 200286432Sbapt *(uint8_t *)mb = (wc & 0xff); 201286432Sbapt mb[1] = 0; 202286432Sbapt return (1); 203286432Sbapt} 204286432Sbapt 205286432Sbapt/* 206286432Sbapt * UTF-8 stores wide characters in UTF-32 form. 207286432Sbapt */ 208286432Sbaptint 209286432Sbapttowide_utf8(wchar_t *wc, const char *mb, unsigned n) 210286432Sbapt{ 211286432Sbapt wchar_t c; 212286432Sbapt int nb; 213290498Sbapt wchar_t lv; /* lowest legal value */ 214286432Sbapt int i; 215286432Sbapt const uint8_t *s = (const uint8_t *)mb; 216286432Sbapt 217286432Sbapt c = *s; 218286432Sbapt 219286432Sbapt if ((c & 0x80) == 0) { 220286432Sbapt /* 7-bit ASCII */ 221286432Sbapt *wc = c; 222286432Sbapt return (1); 223286432Sbapt } else if ((c & 0xe0) == 0xc0) { 224286432Sbapt /* u80-u7ff - two bytes encoded */ 225286432Sbapt nb = 2; 226286432Sbapt lv = 0x80; 227286432Sbapt c &= ~0xe0; 228286432Sbapt } else if ((c & 0xf0) == 0xe0) { 229286432Sbapt /* u800-uffff - three bytes encoded */ 230286432Sbapt nb = 3; 231286432Sbapt lv = 0x800; 232286432Sbapt c &= ~0xf0; 233286432Sbapt } else if ((c & 0xf8) == 0xf0) { 234286432Sbapt /* u1000-u1fffff - four bytes encoded */ 235286432Sbapt nb = 4; 236286432Sbapt lv = 0x1000; 237286432Sbapt c &= ~0xf8; 238286432Sbapt } else { 239286432Sbapt /* 5 and 6 byte encodings are not legal unicode */ 240286432Sbapt werr("utf8 encoding too large (%s)", show_mb(mb)); 241286432Sbapt return (-1); 242286432Sbapt } 243286432Sbapt if (nb > (int)n) { 244286432Sbapt werr("incomplete utf8 sequence (%s)", show_mb(mb)); 245286432Sbapt return (-1); 246286432Sbapt } 247286432Sbapt 248286432Sbapt for (i = 1; i < nb; i++) { 249286432Sbapt if (((s[i]) & 0xc0) != 0x80) { 250286432Sbapt werr("illegal utf8 byte (%x)", s[i]); 251286432Sbapt return (-1); 252286432Sbapt } 253286432Sbapt c <<= 6; 254286432Sbapt c |= (s[i] & 0x3f); 255286432Sbapt } 256286432Sbapt 257286432Sbapt if (c < lv) { 258286432Sbapt werr("illegal redundant utf8 encoding (%s)", show_mb(mb)); 259286432Sbapt return (-1); 260286432Sbapt } 261286432Sbapt *wc = c; 262286432Sbapt return (nb); 263286432Sbapt} 264286432Sbapt 265286432Sbaptint 266286432Sbapttomb_utf8(char *mb, wchar_t wc) 267286432Sbapt{ 268286432Sbapt uint8_t *s = (uint8_t *)mb; 269286432Sbapt uint8_t msk; 270286432Sbapt int cnt; 271286432Sbapt int i; 272286432Sbapt 273286432Sbapt if (wc <= 0x7f) { 274286432Sbapt s[0] = wc & 0x7f; 275286432Sbapt s[1] = 0; 276286432Sbapt return (1); 277286432Sbapt } 278286432Sbapt if (wc <= 0x7ff) { 279286432Sbapt cnt = 2; 280286432Sbapt msk = 0xc0; 281286432Sbapt } else if (wc <= 0xffff) { 282286432Sbapt cnt = 3; 283286432Sbapt msk = 0xe0; 284286432Sbapt } else if (wc <= 0x1fffff) { 285286432Sbapt cnt = 4; 286286432Sbapt msk = 0xf0; 287286432Sbapt } else { 288286432Sbapt werr("illegal uf8 char (%x)", wc); 289286432Sbapt return (-1); 290286432Sbapt } 291286432Sbapt for (i = cnt - 1; i; i--) { 292286432Sbapt s[i] = (wc & 0x3f) | 0x80; 293286432Sbapt wc >>= 6; 294286432Sbapt } 295286432Sbapt s[0] = (msk) | wc; 296286432Sbapt s[cnt] = 0; 297286432Sbapt return (cnt); 298286432Sbapt} 299286432Sbapt 300286432Sbapt/* 301286432Sbapt * Several encodings share a simplistic dual byte encoding. In these 302286432Sbapt * forms, they all indicate that a two byte sequence is to be used if 303286432Sbapt * the first byte has its high bit set. They all store this simple 304286432Sbapt * encoding as a 16-bit value, although a great many of the possible 305286432Sbapt * code points are not used in most character sets. This gives a possible 306286432Sbapt * set of just over 32,000 valid code points. 307286432Sbapt * 308286432Sbapt * 0x00 - 0x7f - 1 byte encoding 309286432Sbapt * 0x80 - 0x7fff - illegal 310286432Sbapt * 0x8000 - 0xffff - 2 byte encoding 311286432Sbapt */ 312286432Sbapt 313286432Sbaptstatic int 314286432Sbapttowide_dbcs(wchar_t *wc, const char *mb, unsigned n) 315286432Sbapt{ 316286432Sbapt wchar_t c; 317286432Sbapt 318290559Sbapt c = *(const uint8_t *)mb; 319286432Sbapt 320286432Sbapt if ((c & 0x80) == 0) { 321286432Sbapt /* 7-bit */ 322286432Sbapt *wc = c; 323286432Sbapt return (1); 324286432Sbapt } 325286432Sbapt if (n < 2) { 326286432Sbapt werr("incomplete character sequence (%s)", show_mb(mb)); 327286432Sbapt return (-1); 328286432Sbapt } 329286432Sbapt 330286432Sbapt /* Store both bytes as a single 16-bit wide. */ 331286432Sbapt c <<= 8; 332286432Sbapt c |= (uint8_t)(mb[1]); 333286432Sbapt *wc = c; 334286432Sbapt return (2); 335286432Sbapt} 336286432Sbapt 337286432Sbapt/* 338286432Sbapt * Most multibyte locales just convert the wide character to the multibyte 339286432Sbapt * form by stripping leading null bytes, and writing the 32-bit quantity 340286432Sbapt * in big-endian order. 341286432Sbapt */ 342286432Sbaptint 343286432Sbapttomb_mbs(char *mb, wchar_t wc) 344286432Sbapt{ 345286432Sbapt uint8_t *s = (uint8_t *)mb; 346286432Sbapt int n = 0, c; 347286432Sbapt 348286432Sbapt if ((wc & 0xff000000U) != 0) { 349286432Sbapt n = 4; 350286432Sbapt } else if ((wc & 0x00ff0000U) != 0) { 351286432Sbapt n = 3; 352286432Sbapt } else if ((wc & 0x0000ff00U) != 0) { 353286432Sbapt n = 2; 354286432Sbapt } else { 355286432Sbapt n = 1; 356286432Sbapt } 357286432Sbapt c = n; 358286432Sbapt while (n) { 359286432Sbapt n--; 360286432Sbapt s[n] = wc & 0xff; 361286432Sbapt wc >>= 8; 362286432Sbapt } 363286432Sbapt /* ensure null termination */ 364286432Sbapt s[c] = 0; 365286432Sbapt return (c); 366286432Sbapt} 367286432Sbapt 368286432Sbapt 369286432Sbapt/* 370286432Sbapt * big5 is a simple dual byte character set. 371286432Sbapt */ 372286432Sbaptint 373286432Sbapttowide_big5(wchar_t *wc, const char *mb, unsigned n) 374286432Sbapt{ 375286432Sbapt return (towide_dbcs(wc, mb, n)); 376286432Sbapt} 377286432Sbapt 378286432Sbapt/* 379286432Sbapt * GBK encodes wides in the same way that big5 does, the high order 380286432Sbapt * bit of the first byte indicates a double byte character. 381286432Sbapt */ 382286432Sbaptint 383286432Sbapttowide_gbk(wchar_t *wc, const char *mb, unsigned n) 384286432Sbapt{ 385286432Sbapt return (towide_dbcs(wc, mb, n)); 386286432Sbapt} 387286432Sbapt 388286432Sbapt/* 389286432Sbapt * GB2312 is another DBCS. Its cleaner than others in that the second 390286432Sbapt * byte does not encode ASCII, but it supports characters. 391286432Sbapt */ 392286432Sbaptint 393286432Sbapttowide_gb2312(wchar_t *wc, const char *mb, unsigned n) 394286432Sbapt{ 395286432Sbapt return (towide_dbcs(wc, mb, n)); 396286432Sbapt} 397286432Sbapt 398286432Sbapt/* 399286432Sbapt * GB18030. This encodes as 8, 16, or 32-bits. 400286432Sbapt * 7-bit values are in 1 byte, 4 byte sequences are used when 401286432Sbapt * the second byte encodes 0x30-39 and all other sequences are 2 bytes. 402286432Sbapt */ 403286432Sbaptint 404286432Sbapttowide_gb18030(wchar_t *wc, const char *mb, unsigned n) 405286432Sbapt{ 406286432Sbapt wchar_t c; 407286432Sbapt 408290559Sbapt c = *(const uint8_t *)mb; 409286432Sbapt 410286432Sbapt if ((c & 0x80) == 0) { 411286432Sbapt /* 7-bit */ 412286432Sbapt *wc = c; 413286432Sbapt return (1); 414286432Sbapt } 415286432Sbapt if (n < 2) { 416286432Sbapt werr("incomplete character sequence (%s)", show_mb(mb)); 417286432Sbapt return (-1); 418286432Sbapt } 419286432Sbapt 420286432Sbapt /* pull in the second byte */ 421286432Sbapt c <<= 8; 422286432Sbapt c |= (uint8_t)(mb[1]); 423286432Sbapt 424286432Sbapt if (((c & 0xff) >= 0x30) && ((c & 0xff) <= 0x39)) { 425286432Sbapt if (n < 4) { 426286432Sbapt werr("incomplete 4-byte character sequence (%s)", 427286432Sbapt show_mb(mb)); 428286432Sbapt return (-1); 429286432Sbapt } 430286432Sbapt c <<= 8; 431286432Sbapt c |= (uint8_t)(mb[2]); 432286432Sbapt c <<= 8; 433286432Sbapt c |= (uint8_t)(mb[3]); 434286432Sbapt *wc = c; 435286432Sbapt return (4); 436286432Sbapt } 437286432Sbapt 438286432Sbapt *wc = c; 439286432Sbapt return (2); 440286432Sbapt} 441286432Sbapt 442286432Sbapt/* 443286432Sbapt * MS-Kanji (aka SJIS) is almost a clean DBCS like the others, but it 444286432Sbapt * also has a range of single byte characters above 0x80. (0xa1-0xdf). 445286432Sbapt */ 446286432Sbaptint 447286432Sbapttowide_mskanji(wchar_t *wc, const char *mb, unsigned n) 448286432Sbapt{ 449286432Sbapt wchar_t c; 450286432Sbapt 451290559Sbapt c = *(const uint8_t *)mb; 452286432Sbapt 453286432Sbapt if ((c < 0x80) || ((c > 0xa0) && (c < 0xe0))) { 454286432Sbapt /* 7-bit */ 455286432Sbapt *wc = c; 456286432Sbapt return (1); 457286432Sbapt } 458286432Sbapt 459286432Sbapt if (n < 2) { 460286432Sbapt werr("incomplete character sequence (%s)", show_mb(mb)); 461286432Sbapt return (-1); 462286432Sbapt } 463286432Sbapt 464286432Sbapt /* Store both bytes as a single 16-bit wide. */ 465286432Sbapt c <<= 8; 466286432Sbapt c |= (uint8_t)(mb[1]); 467286432Sbapt *wc = c; 468286432Sbapt return (2); 469286432Sbapt} 470286432Sbapt 471286432Sbapt/* 472286432Sbapt * EUC forms. EUC encodings are "variable". FreeBSD carries some additional 473286432Sbapt * variable data to encode these, but we're going to treat each as independent 474286432Sbapt * instead. Its the only way we can sensibly move forward. 475286432Sbapt * 476286432Sbapt * Note that the way in which the different EUC forms vary is how wide 477286432Sbapt * CS2 and CS3 are and what the first byte of them is. 478286432Sbapt */ 479286432Sbaptstatic int 480286432Sbapttowide_euc_impl(wchar_t *wc, const char *mb, unsigned n, 481286432Sbapt uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width) 482286432Sbapt{ 483286432Sbapt int i; 484286432Sbapt int width = 2; 485286432Sbapt wchar_t c; 486286432Sbapt 487290559Sbapt c = *(const uint8_t *)mb; 488286432Sbapt 489286432Sbapt /* 490286432Sbapt * All variations of EUC encode 7-bit ASCII as one byte, and use 491286432Sbapt * additional bytes for more than that. 492286432Sbapt */ 493286432Sbapt if ((c & 0x80) == 0) { 494286432Sbapt /* 7-bit */ 495286432Sbapt *wc = c; 496286432Sbapt return (1); 497286432Sbapt } 498286432Sbapt 499286432Sbapt /* 500286432Sbapt * All EUC variants reserve 0xa1-0xff to identify CS1, which 501286432Sbapt * is always two bytes wide. Note that unused CS will be zero, 502286432Sbapt * and that cannot be true because we know that the high order 503286432Sbapt * bit must be set. 504286432Sbapt */ 505286432Sbapt if (c >= 0xa1) { 506286432Sbapt width = 2; 507286432Sbapt } else if (c == cs2) { 508286432Sbapt width = cs2width; 509286432Sbapt } else if (c == cs3) { 510286432Sbapt width = cs3width; 511286432Sbapt } 512286432Sbapt 513286432Sbapt if ((int)n < width) { 514286432Sbapt werr("incomplete character sequence (%s)", show_mb(mb)); 515286432Sbapt return (-1); 516286432Sbapt } 517286432Sbapt 518286432Sbapt for (i = 1; i < width; i++) { 519286432Sbapt /* pull in the next byte */ 520286432Sbapt c <<= 8; 521286432Sbapt c |= (uint8_t)(mb[i]); 522286432Sbapt } 523286432Sbapt 524286432Sbapt *wc = c; 525286432Sbapt return (width); 526286432Sbapt} 527286432Sbapt 528286432Sbapt/* 529286432Sbapt * EUC-CN encodes as follows: 530286432Sbapt * 531286432Sbapt * Code set 0 (ASCII): 0x21-0x7E 532286432Sbapt * Code set 1 (CNS 11643-1992 Plane 1): 0xA1A1-0xFEFE 533286432Sbapt * Code set 2: unused 534286432Sbapt * Code set 3: unused 535286432Sbapt */ 536286432Sbaptint 537286432Sbapttowide_euccn(wchar_t *wc, const char *mb, unsigned n) 538286432Sbapt{ 539286432Sbapt return (towide_euc_impl(wc, mb, n, 0x8e, 4, 0, 0)); 540286432Sbapt} 541286432Sbapt 542286432Sbapt/* 543286432Sbapt * EUC-JP encodes as follows: 544286432Sbapt * 545286432Sbapt * Code set 0 (ASCII or JIS X 0201-1976 Roman): 0x21-0x7E 546286432Sbapt * Code set 1 (JIS X 0208): 0xA1A1-0xFEFE 547286432Sbapt * Code set 2 (half-width katakana): 0x8EA1-0x8EDF 548286432Sbapt * Code set 3 (JIS X 0212-1990): 0x8FA1A1-0x8FFEFE 549286432Sbapt */ 550286432Sbaptint 551286432Sbapttowide_eucjp(wchar_t *wc, const char *mb, unsigned n) 552286432Sbapt{ 553286432Sbapt return (towide_euc_impl(wc, mb, n, 0x8e, 2, 0x8f, 3)); 554286432Sbapt} 555286432Sbapt 556286432Sbapt/* 557286432Sbapt * EUC-KR encodes as follows: 558286432Sbapt * 559286432Sbapt * Code set 0 (ASCII or KS C 5636-1993): 0x21-0x7E 560286432Sbapt * Code set 1 (KS C 5601-1992): 0xA1A1-0xFEFE 561286432Sbapt * Code set 2: unused 562286432Sbapt * Code set 3: unused 563286432Sbapt */ 564286432Sbaptint 565286432Sbapttowide_euckr(wchar_t *wc, const char *mb, unsigned n) 566286432Sbapt{ 567286432Sbapt return (towide_euc_impl(wc, mb, n, 0, 0, 0, 0)); 568286432Sbapt} 569286432Sbapt 570286432Sbapt/* 571286432Sbapt * EUC-TW encodes as follows: 572286432Sbapt * 573286432Sbapt * Code set 0 (ASCII): 0x21-0x7E 574286432Sbapt * Code set 1 (CNS 11643-1992 Plane 1): 0xA1A1-0xFEFE 575286432Sbapt * Code set 2 (CNS 11643-1992 Planes 1-16): 0x8EA1A1A1-0x8EB0FEFE 576286432Sbapt * Code set 3: unused 577286432Sbapt */ 578286432Sbaptint 579286432Sbapttowide_euctw(wchar_t *wc, const char *mb, unsigned n) 580286432Sbapt{ 581286432Sbapt return (towide_euc_impl(wc, mb, n, 0x8e, 4, 0, 0)); 582286432Sbapt} 583286432Sbapt 584286432Sbapt/* 585286432Sbapt * Public entry points. 586286432Sbapt */ 587286432Sbapt 588286432Sbaptint 589286432Sbaptto_wide(wchar_t *wc, const char *mb) 590286432Sbapt{ 591286432Sbapt /* this won't fail hard */ 592286432Sbapt return (_towide(wc, mb, strlen(mb))); 593286432Sbapt} 594286432Sbapt 595286432Sbaptint 596286432Sbaptto_mb(char *mb, wchar_t wc) 597286432Sbapt{ 598286432Sbapt int rv; 599286432Sbapt 600286432Sbapt if ((rv = _tomb(mb, wc)) < 0) { 601321121Sngie warn("%s", widemsg); 602286432Sbapt free(widemsg); 603286432Sbapt widemsg = NULL; 604286432Sbapt } 605286432Sbapt return (rv); 606286432Sbapt} 607286432Sbapt 608286432Sbaptchar * 609286432Sbaptto_mb_string(const wchar_t *wcs) 610286432Sbapt{ 611286432Sbapt char *mbs; 612286432Sbapt char *ptr; 613286432Sbapt int len; 614286432Sbapt 615286432Sbapt mbs = malloc((wcslen(wcs) * mb_cur_max) + 1); 616286432Sbapt if (mbs == NULL) { 617321121Sngie warn("out of memory"); 618286432Sbapt return (NULL); 619286432Sbapt } 620286432Sbapt ptr = mbs; 621286432Sbapt while (*wcs) { 622286432Sbapt if ((len = to_mb(ptr, *wcs)) < 0) { 623286432Sbapt INTERR; 624286432Sbapt free(mbs); 625286432Sbapt return (NULL); 626286432Sbapt } 627286432Sbapt wcs++; 628286432Sbapt ptr += len; 629286432Sbapt } 630286432Sbapt *ptr = 0; 631286432Sbapt return (mbs); 632286432Sbapt} 633286432Sbapt 634286432Sbaptvoid 635286432Sbaptset_wide_encoding(const char *encoding) 636286432Sbapt{ 637286432Sbapt int i; 638286432Sbapt 639286432Sbapt _towide = towide_none; 640286432Sbapt _tomb = tomb_none; 641286432Sbapt _nbits = 8; 642286432Sbapt 643290485Sbapt snprintf(_encoding_buffer, sizeof(_encoding_buffer), "NONE:%s", 644290485Sbapt encoding); 645286432Sbapt for (i = 0; mb_encodings[i].name; i++) { 646286432Sbapt if (strcasecmp(encoding, mb_encodings[i].name) == 0) { 647286432Sbapt _towide = mb_encodings[i].towide; 648286432Sbapt _tomb = mb_encodings[i].tomb; 649286432Sbapt _encoding = mb_encodings[i].cname; 650286432Sbapt _nbits = mb_encodings[i].nbits; 651286432Sbapt break; 652286432Sbapt } 653286432Sbapt } 654286432Sbapt} 655286432Sbapt 656286432Sbaptconst char * 657286432Sbaptget_wide_encoding(void) 658286432Sbapt{ 659286432Sbapt return (_encoding); 660286432Sbapt} 661286432Sbapt 662286432Sbaptint 663286432Sbaptmax_wide(void) 664286432Sbapt{ 665286432Sbapt return ((int)((1U << _nbits) - 1)); 666286432Sbapt} 667