conv.c revision 1.9
1/* $NetBSD: conv.c,v 1.9 2017/11/21 02:11:44 rin Exp $ */ 2/*- 3 * Copyright (c) 1993, 1994 4 * The Regents of the University of California. All rights reserved. 5 * Copyright (c) 1993, 1994, 1995, 1996 6 * Keith Bostic. All rights reserved. 7 * 8 * See the LICENSE file for redistribution information. 9 */ 10 11#include "config.h" 12 13#include <sys/cdefs.h> 14#if 0 15#ifndef lint 16static const char sccsid[] = "Id: conv.c,v 1.27 2001/08/18 21:41:41 skimo Exp (Berkeley) Date: 2001/08/18 21:41:41 "; 17#endif /* not lint */ 18#else 19__RCSID("$NetBSD: conv.c,v 1.9 2017/11/21 02:11:44 rin Exp $"); 20#endif 21 22#include <sys/types.h> 23#include <sys/queue.h> 24#include <sys/time.h> 25 26#include <bitstring.h> 27#include <errno.h> 28#include <limits.h> 29#include <stdio.h> 30#include <stdlib.h> 31#include <string.h> 32#include <unistd.h> 33 34#include "common.h" 35 36#if defined(USE_WIDECHAR) && defined(USE_ICONV) 37#include <langinfo.h> 38#include <iconv.h> 39 40#define LANGCODESET nl_langinfo(CODESET) 41#else 42typedef int iconv_t; 43 44#define LANGCODESET "" 45#endif 46 47#include <locale.h> 48 49#ifdef USE_WIDECHAR 50static int 51raw2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw, size_t *tolen, 52 const CHAR_T **dst) 53{ 54 int i; 55 CHAR_T **tostr = (CHAR_T **)(void *)&cw->bp1; 56 size_t *blen = &cw->blen1; 57 58 BINC_RETW(NULL, *tostr, *blen, len); 59 60 *tolen = len; 61 for (i = 0; i < len; ++i) { 62 CHAR_T w = (u_char)str[i]; 63 memcpy((*tostr) + i, &w, sizeof(**tostr)); 64 } 65 66 *dst = cw->bp1; 67 68 return 0; 69} 70 71#ifndef ERROR_ON_CONVERT 72#define HANDLE_ICONV_ERROR(o, i, ol, il) do { \ 73 *o++ = *i++; \ 74 ol--; il--; \ 75 } while (/*CONSTCOND*/0) 76#define HANDLE_MBR_ERROR(n, mbs, d, s) do { \ 77 d = s; \ 78 MEMSET(&mbs, 0, 1); \ 79 n = 1; \ 80 } while (/*CONSTCOND*/0) 81#else 82#define HANDLE_ICONV_ERROR goto err 83#define HANDLE_MBR_ERROR goto err 84#endif 85 86#define CONV_BUFFER_SIZE 512 87/* fill the buffer with codeset encoding of string pointed to by str 88 * left has the number of bytes left in str and is adjusted 89 * len contains the number of bytes put in the buffer 90 */ 91#ifdef USE_ICONV 92#define CONVERT(str, left, src, len) \ 93 do { \ 94 size_t outleft; \ 95 char *bp = buffer; \ 96 outleft = CONV_BUFFER_SIZE; \ 97 errno = 0; \ 98 if (iconv(id, (const char **)&str, &left, &bp, &outleft) \ 99 == (size_t)-1 && errno != E2BIG) \ 100 HANDLE_ICONV_ERROR(bp, str, outleft, left); \ 101 if ((len = CONV_BUFFER_SIZE - outleft) == 0) { \ 102 error = -left; \ 103 goto err; \ 104 } \ 105 src = buffer; \ 106 } while (0) 107#else 108#define CONVERT(str, left, src, len) 109#endif 110 111static int 112default_char2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw, 113 size_t *tolen, const CHAR_T **dst, const char *enc) 114{ 115 int j; 116 size_t i = 0; 117 CHAR_T **tostr = (CHAR_T **)(void *)&cw->bp1; 118 size_t *blen = &cw->blen1; 119 mbstate_t mbs; 120 size_t n; 121 ssize_t nlen = len; 122 const char *src = (const char *)str; 123 iconv_t id = (iconv_t)-1; 124 char buffer[CONV_BUFFER_SIZE]; 125 size_t left = len; 126 int error = 1; 127 128 MEMSET(&mbs, 0, 1); 129 BINC_RETW(NULL, *tostr, *blen, nlen); 130 131#ifdef USE_ICONV 132 if (strcmp(nl_langinfo(CODESET), enc)) { 133 id = iconv_open(nl_langinfo(CODESET), enc); 134 if (id == (iconv_t)-1) 135 goto err; 136 CONVERT(str, left, src, len); 137 } 138#endif 139 140 for (i = 0, j = 0; j < len; ) { 141 CHAR_T w; 142 n = mbrtowc(&w, src + j, len - j, &mbs); 143 memcpy((*tostr) + i, &w, sizeof(**tostr)); 144 /* NULL character converted */ 145 if (n == (size_t)-2) error = -(len - j); 146 if (n == (size_t)-1 || n == (size_t)-2) { 147 HANDLE_MBR_ERROR(n, mbs, w, src[j]); 148 memcpy((*tostr) + i, &w, sizeof(**tostr)); 149 } 150 if (n == 0) n = 1; 151 j += n; 152 if (++i >= *blen) { 153 nlen += 256; 154 BINC_GOTOW(NULL, *tostr, *blen, nlen); 155 } 156 if (id != (iconv_t)-1 && j == len && left) { 157 CONVERT(str, left, src, len); 158 j = 0; 159 } 160 } 161 *tolen = i; 162 163 if (id != (iconv_t)-1) 164 iconv_close(id); 165 166 *dst = cw->bp1; 167 168 return 0; 169err: 170alloc_err: 171 *tolen = i; 172 if (id != (iconv_t)-1) 173 iconv_close(id); 174 *dst = cw->bp1; 175 176 return error; 177} 178 179static int 180fe_char2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw, 181 size_t *tolen, const CHAR_T **dst) 182{ 183 return default_char2int(sp, str, len, cw, tolen, dst, O_STR(sp, O_FILEENCODING)); 184} 185 186static int 187ie_char2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw, 188 size_t *tolen, const CHAR_T **dst) 189{ 190 return default_char2int(sp, str, len, cw, tolen, dst, O_STR(sp, O_INPUTENCODING)); 191} 192 193static int 194cs_char2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw, 195 size_t *tolen, const CHAR_T **dst) 196{ 197 return default_char2int(sp, str, len, cw, tolen, dst, LANGCODESET); 198} 199 200static int 201CHAR_T_int2char(SCR *sp, const CHAR_T * str, ssize_t len, CONVWIN *cw, 202 size_t *tolen, const char **dst) 203{ 204 *tolen = len * sizeof(CHAR_T); 205 *dst = (const char *)(const void *)str; 206 207 return 0; 208} 209 210static int 211CHAR_T_char2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw, 212 size_t *tolen, const CHAR_T **dst) 213{ 214 *tolen = len / sizeof(CHAR_T); 215 *dst = (const CHAR_T*) str; 216 217 return 0; 218} 219 220static int 221int2raw(SCR *sp, const CHAR_T * str, ssize_t len, CONVWIN *cw, size_t *tolen, 222 const char **dst) 223{ 224 int i; 225 char **tostr = (char **)(void *)&cw->bp1; 226 size_t *blen = &cw->blen1; 227 228 BINC_RETC(NULL, *tostr, *blen, len); 229 230 *tolen = len; 231 for (i = 0; i < len; ++i) { 232 CHAR_T w; 233 memcpy(&w, str + i, sizeof(w)); 234 (*tostr)[i] = w; 235 } 236 237 *dst = cw->bp1; 238 239 return 0; 240} 241 242static int 243default_int2char(SCR *sp, const CHAR_T * str, ssize_t len, CONVWIN *cw, 244 size_t *tolen, const char **pdst, const char *enc) 245{ 246 size_t i, j = 0; 247 int offset = 0; 248 char **tostr = (char **)(void *)&cw->bp1; 249 size_t *blen = &cw->blen1; 250 mbstate_t mbs; 251 size_t n; 252 ssize_t nlen = len + MB_CUR_MAX; 253 char *dst; 254 size_t buflen; 255 char buffer[CONV_BUFFER_SIZE]; 256 iconv_t id = (iconv_t)-1; 257 258/* convert first len bytes of buffer and append it to cw->bp 259 * len is adjusted => 0 260 * offset contains the offset in cw->bp and is adjusted 261 * cw->bp is grown as required 262 */ 263#ifdef USE_ICONV 264#define CONVERT2(_buffer, lenp, cw, offset) \ 265 do { \ 266 const char *bp = _buffer; \ 267 size_t ret; \ 268 do { \ 269 size_t outleft = cw->blen1 - offset; \ 270 char *obp = (char *)cw->bp1 + offset; \ 271 if (cw->blen1 < offset + MB_CUR_MAX) { \ 272 nlen += 256; \ 273 BINC_GOTOC(NULL, cw->bp1, cw->blen1, nlen); \ 274 } \ 275 errno = 0; \ 276 ret = iconv(id, &bp, lenp, &obp, &outleft); \ 277 if (ret == (size_t)-1 && errno != E2BIG) \ 278 HANDLE_ICONV_ERROR(obp, bp, outleft, len); \ 279 offset = cw->blen1 - outleft; \ 280 } while (ret != 0); \ 281 } while (0) 282#else 283#define CONVERT2(_buffer, lenp, cw, offset) 284#endif 285 286 287 MEMSET(&mbs, 0, 1); 288 BINC_RETC(NULL, *tostr, *blen, nlen); 289 dst = *tostr; buflen = *blen; 290 291#ifdef USE_ICONV 292 if (strcmp(nl_langinfo(CODESET), enc)) { 293 id = iconv_open(enc, nl_langinfo(CODESET)); 294 if (id == (iconv_t)-1) 295 goto err; 296 dst = buffer; buflen = CONV_BUFFER_SIZE; 297 } 298#endif 299 300 for (i = 0, j = 0; i < (size_t)len; ++i) { 301 CHAR_T w; 302 memcpy(&w, str + i, sizeof(w)); 303 n = wcrtomb(dst + j, w, &mbs); 304 if (n == (size_t)-1) 305 HANDLE_MBR_ERROR(n, mbs, dst[j], w); 306 j += n; 307 if (buflen < j + MB_CUR_MAX) { 308 if (id != (iconv_t)-1) { 309 CONVERT2(buffer, &j, cw, offset); 310 } else { 311 nlen += 256; 312 BINC_RETC(NULL, *tostr, *blen, nlen); 313 dst = *tostr; buflen = *blen; 314 } 315 } 316 } 317 318 n = wcrtomb(dst + j, L'\0', &mbs); 319 j += n - 1; /* don't count NUL at the end */ 320 *tolen = j; 321 322 if (id != (iconv_t)-1) { 323 CONVERT2(buffer, &j, cw, offset); 324 CONVERT2(NULL, NULL, cw, offset); /* back to the initial state */ 325 *tolen = offset; 326 iconv_close(id); 327 } 328 329 *pdst = cw->bp1; 330 331 return 0; 332err: 333alloc_err: 334 *tolen = j; 335 if (id != (iconv_t)-1) { 336 iconv_close(id); 337 } 338 *pdst = cw->bp1; 339 340 return 1; 341} 342 343static int 344fe_int2char(SCR *sp, const CHAR_T * str, ssize_t len, CONVWIN *cw, 345 size_t *tolen, const char **dst) 346{ 347 return default_int2char(sp, str, len, cw, tolen, dst, O_STR(sp, O_FILEENCODING)); 348} 349 350static int 351cs_int2char(SCR *sp, const CHAR_T * str, ssize_t len, CONVWIN *cw, 352 size_t *tolen, const char **dst) 353{ 354 return default_int2char(sp, str, len, cw, tolen, dst, LANGCODESET); 355} 356 357#endif 358 359 360void 361conv_init (SCR *orig, SCR *sp) 362{ 363 if (orig != NULL) 364 MEMCPY(&sp->conv, &orig->conv, 1); 365 else { 366 setlocale(LC_ALL, ""); 367#ifdef USE_WIDECHAR 368 sp->conv.sys2int = cs_char2int; 369 sp->conv.int2sys = cs_int2char; 370 sp->conv.file2int = fe_char2int; 371 sp->conv.int2file = fe_int2char; 372 sp->conv.input2int = ie_char2int; 373#ifdef USE_ICONV 374 o_set(sp, O_FILEENCODING, OS_STRDUP, nl_langinfo(CODESET), 0); 375 o_set(sp, O_INPUTENCODING, OS_STRDUP, nl_langinfo(CODESET), 0); 376#endif 377#endif 378 } 379} 380 381int 382conv_enc (SCR *sp, int option, const char *enc) 383{ 384#if defined(USE_WIDECHAR) && defined(USE_ICONV) 385 iconv_t id; 386 char2wchar_t *c2w; 387 wchar2char_t *w2c; 388 389 switch (option) { 390 case O_FILEENCODING: 391 c2w = &sp->conv.file2int; 392 w2c = &sp->conv.int2file; 393 break; 394 case O_INPUTENCODING: 395 c2w = &sp->conv.input2int; 396 w2c = NULL; 397 break; 398 default: 399 c2w = NULL; 400 w2c = NULL; 401 break; 402 } 403 404 if (!*enc) { 405 if (c2w) *c2w = raw2int; 406 if (w2c) *w2c = int2raw; 407 return 0; 408 } 409 410 if (!strcmp(enc, "WCHAR_T")) { 411 if (c2w) *c2w = CHAR_T_char2int; 412 if (w2c) *w2c = CHAR_T_int2char; 413 return 0; 414 } 415 416 id = iconv_open(enc, nl_langinfo(CODESET)); 417 if (id == (iconv_t)-1) 418 goto err; 419 iconv_close(id); 420 id = iconv_open(nl_langinfo(CODESET), enc); 421 if (id == (iconv_t)-1) 422 goto err; 423 iconv_close(id); 424 425 switch (option) { 426 case O_FILEENCODING: 427 *c2w = fe_char2int; 428 *w2c = fe_int2char; 429 break; 430 case O_INPUTENCODING: 431 *c2w = ie_char2int; 432 break; 433 } 434 435 F_CLR(sp, SC_CONV_ERROR); 436 F_SET(sp, SC_SCR_REFORMAT); 437 438 return 0; 439err: 440 switch (option) { 441 case O_FILEENCODING: 442 msgq(sp, M_ERR, 443 "321|File encoding conversion not supported"); 444 break; 445 case O_INPUTENCODING: 446 msgq(sp, M_ERR, 447 "322|Input encoding conversion not supported"); 448 break; 449 } 450#endif 451 return 1; 452} 453 454