conv.c revision 1.9
1/*	$NetBSD: conv.c,v 1.9 2017/11/21 02:11:44 rin Exp $ */
2/*-
3 * Copyright (c) 1993, 1994
4 *	The Regents of the University of California.  All rights reserved.
5 * Copyright (c) 1993, 1994, 1995, 1996
6 *	Keith Bostic.  All rights reserved.
7 *
8 * See the LICENSE file for redistribution information.
9 */
10
11#include "config.h"
12
13#include <sys/cdefs.h>
14#if 0
15#ifndef lint
16static const char sccsid[] = "Id: conv.c,v 1.27 2001/08/18 21:41:41 skimo Exp  (Berkeley) Date: 2001/08/18 21:41:41 ";
17#endif /* not lint */
18#else
19__RCSID("$NetBSD: conv.c,v 1.9 2017/11/21 02:11:44 rin Exp $");
20#endif
21
22#include <sys/types.h>
23#include <sys/queue.h>
24#include <sys/time.h>
25
26#include <bitstring.h>
27#include <errno.h>
28#include <limits.h>
29#include <stdio.h>
30#include <stdlib.h>
31#include <string.h>
32#include <unistd.h>
33
34#include "common.h"
35
36#if defined(USE_WIDECHAR) && defined(USE_ICONV)
37#include <langinfo.h>
38#include <iconv.h>
39
40#define LANGCODESET	nl_langinfo(CODESET)
41#else
42typedef int	iconv_t;
43
44#define LANGCODESET	""
45#endif
46
47#include <locale.h>
48
49#ifdef USE_WIDECHAR
50static int
51raw2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw, size_t *tolen,
52	const CHAR_T **dst)
53{
54    int i;
55    CHAR_T **tostr = (CHAR_T **)(void *)&cw->bp1;
56    size_t  *blen = &cw->blen1;
57
58    BINC_RETW(NULL, *tostr, *blen, len);
59
60    *tolen = len;
61    for (i = 0; i < len; ++i) {
62	CHAR_T w = (u_char)str[i];
63	memcpy((*tostr) + i, &w, sizeof(**tostr));
64    }
65
66    *dst = cw->bp1;
67
68    return 0;
69}
70
/*
 * Conversion error policy.
 *
 * Default: pass the offending unit through unconverted and keep going.
 *   HANDLE_ICONV_ERROR(o, i, ol, il)
 *	copy one byte from input cursor `i' to output cursor `o', advance
 *	both, and decrement the output (`ol') and input (`il') byte counts.
 *   HANDLE_MBR_ERROR(n, mbs, d, s)
 *	store the source unit `s' into destination `d', reset the shift
 *	state `mbs', and report one unit consumed/produced through `n'.
 *
 * ERROR_ON_CONVERT: abandon the conversion by jumping to the `err' label
 * of the enclosing function.  These variants must be function-like too:
 * every call site passes arguments, and an object-like definition would
 * expand to the ill-formed `goto err(...)'.
 *
 * Arguments are evaluated as lvalues and are parenthesized so that an
 * expression such as `*p' can be passed for a counter.
 */
#ifndef ERROR_ON_CONVERT
#define HANDLE_ICONV_ERROR(o, i, ol, il) do {				\
		*(o)++ = *(i)++;					\
		(ol)--; (il)--;						\
	} while (/*CONSTCOND*/0)
#define HANDLE_MBR_ERROR(n, mbs, d, s) do {				\
		(d) = (s);						\
		MEMSET(&(mbs), 0, 1); 					\
		(n) = 1; 						\
	} while (/*CONSTCOND*/0)
#else
#define HANDLE_ICONV_ERROR(o, i, ol, il)	goto err
#define	HANDLE_MBR_ERROR(n, mbs, d, s)		goto err
#endif
85
/* Size in bytes of the on-stack staging buffer used around iconv(). */
#define CONV_BUFFER_SIZE    512

/*
 * CONVERT(str, left, src, len) --
 *	Refill the caller's `buffer' with the next chunk of `str' run
 *	through the iconv descriptor `id'.
 *
 *	str	input cursor; advanced past the bytes consumed
 *	left	bytes remaining at str; reduced accordingly
 *	src	set to `buffer' on success
 *	len	set to the number of bytes placed in `buffer'
 *
 * An iconv() failure other than E2BIG is handed to HANDLE_ICONV_ERROR.
 * If nothing at all ends up in the buffer, `error' is set to -left and
 * control transfers to the caller's `err' label.
 *
 * The macro relies on the caller having `buffer', `id', `error' and an
 * `err' label in scope.  Without USE_ICONV it expands to nothing.
 */
#ifdef USE_ICONV
#define CONVERT(str, left, src, len)				    	\
    do {								\
	char *cv_out = buffer;						\
	size_t cv_room = CONV_BUFFER_SIZE;				\
	size_t cv_rc;							\
	errno = 0;							\
	cv_rc = iconv(id, (const char **)&str, &left,			\
	    &cv_out, &cv_room);						\
	if (cv_rc == (size_t)-1 && errno != E2BIG)			\
		HANDLE_ICONV_ERROR(cv_out, str, cv_room, left);		\
	len = CONV_BUFFER_SIZE - cv_room;				\
	if (len == 0) {							\
	    error = -left;						\
	    goto err;							\
	}				    				\
	src = buffer;							\
    } while (0)
#else
#define CONVERT(str, left, src, len)
#endif
110
111static int
112default_char2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw,
113		size_t *tolen, const CHAR_T **dst, const char *enc)
114{
115    int j;
116    size_t i = 0;
117    CHAR_T **tostr = (CHAR_T **)(void *)&cw->bp1;
118    size_t  *blen = &cw->blen1;
119    mbstate_t mbs;
120    size_t   n;
121    ssize_t  nlen = len;
122    const char *src = (const char *)str;
123    iconv_t	id = (iconv_t)-1;
124    char	buffer[CONV_BUFFER_SIZE];
125    size_t	left = len;
126    int		error = 1;
127
128    MEMSET(&mbs, 0, 1);
129    BINC_RETW(NULL, *tostr, *blen, nlen);
130
131#ifdef USE_ICONV
132    if (strcmp(nl_langinfo(CODESET), enc)) {
133	id = iconv_open(nl_langinfo(CODESET), enc);
134	if (id == (iconv_t)-1)
135	    goto err;
136	CONVERT(str, left, src, len);
137    }
138#endif
139
140    for (i = 0, j = 0; j < len; ) {
141	CHAR_T w;
142	n = mbrtowc(&w, src + j, len - j, &mbs);
143	memcpy((*tostr) + i, &w, sizeof(**tostr));
144	/* NULL character converted */
145	if (n == (size_t)-2) error = -(len - j);
146	if (n == (size_t)-1 || n == (size_t)-2) {
147	    HANDLE_MBR_ERROR(n, mbs, w, src[j]);
148	    memcpy((*tostr) + i, &w, sizeof(**tostr));
149	}
150	if (n == 0) n = 1;
151	j += n;
152	if (++i >= *blen) {
153	    nlen += 256;
154	    BINC_GOTOW(NULL, *tostr, *blen, nlen);
155	}
156	if (id != (iconv_t)-1 && j == len && left) {
157	    CONVERT(str, left, src, len);
158	    j = 0;
159	}
160    }
161    *tolen = i;
162
163    if (id != (iconv_t)-1)
164	iconv_close(id);
165
166    *dst = cw->bp1;
167
168    return 0;
169err:
170alloc_err:
171    *tolen = i;
172    if (id != (iconv_t)-1)
173	iconv_close(id);
174    *dst = cw->bp1;
175
176    return error;
177}
178
179static int
180fe_char2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw,
181	    size_t *tolen, const CHAR_T **dst)
182{
183    return default_char2int(sp, str, len, cw, tolen, dst, O_STR(sp, O_FILEENCODING));
184}
185
186static int
187ie_char2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw,
188	    size_t *tolen, const CHAR_T **dst)
189{
190    return default_char2int(sp, str, len, cw, tolen, dst, O_STR(sp, O_INPUTENCODING));
191}
192
193static int
194cs_char2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw,
195	    size_t *tolen, const CHAR_T **dst)
196{
197    return default_char2int(sp, str, len, cw, tolen, dst, LANGCODESET);
198}
199
200static int
201CHAR_T_int2char(SCR *sp, const CHAR_T * str, ssize_t len, CONVWIN *cw,
202	size_t *tolen, const char **dst)
203{
204    *tolen = len * sizeof(CHAR_T);
205    *dst = (const char *)(const void *)str;
206
207    return 0;
208}
209
210static int
211CHAR_T_char2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw,
212	size_t *tolen, const CHAR_T **dst)
213{
214    *tolen = len / sizeof(CHAR_T);
215    *dst = (const CHAR_T*) str;
216
217    return 0;
218}
219
220static int
221int2raw(SCR *sp, const CHAR_T * str, ssize_t len, CONVWIN *cw, size_t *tolen,
222	const char **dst)
223{
224    int i;
225    char **tostr = (char **)(void *)&cw->bp1;
226    size_t  *blen = &cw->blen1;
227
228    BINC_RETC(NULL, *tostr, *blen, len);
229
230    *tolen = len;
231    for (i = 0; i < len; ++i) {
232	CHAR_T w;
233	memcpy(&w, str + i, sizeof(w));
234	(*tostr)[i] = w;
235    }
236
237    *dst = cw->bp1;
238
239    return 0;
240}
241
242static int
243default_int2char(SCR *sp, const CHAR_T * str, ssize_t len, CONVWIN *cw,
244		size_t *tolen, const char **pdst, const char *enc)
245{
246    size_t i, j = 0;
247    int offset = 0;
248    char **tostr = (char **)(void *)&cw->bp1;
249    size_t  *blen = &cw->blen1;
250    mbstate_t mbs;
251    size_t n;
252    ssize_t  nlen = len + MB_CUR_MAX;
253    char *dst;
254    size_t buflen;
255    char	buffer[CONV_BUFFER_SIZE];
256    iconv_t	id = (iconv_t)-1;
257
258/* convert first len bytes of buffer and append it to cw->bp
259 * len is adjusted => 0
260 * offset contains the offset in cw->bp and is adjusted
261 * cw->bp is grown as required
262 */
263#ifdef USE_ICONV
264#define CONVERT2(_buffer, lenp, cw, offset)				\
265    do {								\
266	const char *bp = _buffer;					\
267	size_t ret;							\
268	do {								\
269	    size_t outleft = cw->blen1 - offset;			\
270	    char *obp = (char *)cw->bp1 + offset;		    	\
271	    if (cw->blen1 < offset + MB_CUR_MAX) {		    	\
272		nlen += 256;						\
273		BINC_GOTOC(NULL, cw->bp1, cw->blen1, nlen);		\
274	    }						    		\
275	    errno = 0;						    	\
276	    ret = iconv(id, &bp, lenp, &obp, &outleft);			\
277	    if (ret == (size_t)-1 && errno != E2BIG) 			\
278		    HANDLE_ICONV_ERROR(obp, bp, outleft, len);		\
279	    offset = cw->blen1 - outleft;			        \
280	} while (ret != 0);					        \
281    } while (0)
282#else
283#define CONVERT2(_buffer, lenp, cw, offset)
284#endif
285
286
287    MEMSET(&mbs, 0, 1);
288    BINC_RETC(NULL, *tostr, *blen, nlen);
289    dst = *tostr; buflen = *blen;
290
291#ifdef USE_ICONV
292    if (strcmp(nl_langinfo(CODESET), enc)) {
293	id = iconv_open(enc, nl_langinfo(CODESET));
294	if (id == (iconv_t)-1)
295	    goto err;
296	dst = buffer; buflen = CONV_BUFFER_SIZE;
297    }
298#endif
299
300    for (i = 0, j = 0; i < (size_t)len; ++i) {
301	CHAR_T w;
302	memcpy(&w, str + i, sizeof(w));
303	n = wcrtomb(dst + j, w, &mbs);
304	if (n == (size_t)-1)
305	   HANDLE_MBR_ERROR(n, mbs, dst[j], w);
306	j += n;
307	if (buflen < j + MB_CUR_MAX) {
308	    if (id != (iconv_t)-1) {
309		CONVERT2(buffer, &j, cw, offset);
310	    } else {
311		nlen += 256;
312		BINC_RETC(NULL, *tostr, *blen, nlen);
313		dst = *tostr; buflen = *blen;
314	    }
315	}
316    }
317
318    n = wcrtomb(dst + j, L'\0', &mbs);
319    j += n - 1;				/* don't count NUL at the end */
320    *tolen = j;
321
322    if (id != (iconv_t)-1) {
323	CONVERT2(buffer, &j, cw, offset);
324	CONVERT2(NULL, NULL, cw, offset);  /* back to the initial state */
325	*tolen = offset;
326	iconv_close(id);
327    }
328
329    *pdst = cw->bp1;
330
331    return 0;
332err:
333alloc_err:
334    *tolen = j;
335    if (id != (iconv_t)-1) {
336	iconv_close(id);
337    }
338    *pdst = cw->bp1;
339
340    return 1;
341}
342
343static int
344fe_int2char(SCR *sp, const CHAR_T * str, ssize_t len, CONVWIN *cw,
345	    size_t *tolen, const char **dst)
346{
347    return default_int2char(sp, str, len, cw, tolen, dst, O_STR(sp, O_FILEENCODING));
348}
349
350static int
351cs_int2char(SCR *sp, const CHAR_T * str, ssize_t len, CONVWIN *cw,
352	    size_t *tolen, const char **dst)
353{
354    return default_int2char(sp, str, len, cw, tolen, dst, LANGCODESET);
355}
356
357#endif
358
359
360void
361conv_init (SCR *orig, SCR *sp)
362{
363    if (orig != NULL)
364	MEMCPY(&sp->conv, &orig->conv, 1);
365    else {
366	setlocale(LC_ALL, "");
367#ifdef USE_WIDECHAR
368	sp->conv.sys2int = cs_char2int;
369	sp->conv.int2sys = cs_int2char;
370	sp->conv.file2int = fe_char2int;
371	sp->conv.int2file = fe_int2char;
372	sp->conv.input2int = ie_char2int;
373#ifdef USE_ICONV
374	o_set(sp, O_FILEENCODING, OS_STRDUP, nl_langinfo(CODESET), 0);
375	o_set(sp, O_INPUTENCODING, OS_STRDUP, nl_langinfo(CODESET), 0);
376#endif
377#endif
378    }
379}
380
381int
382conv_enc (SCR *sp, int option, const char *enc)
383{
384#if defined(USE_WIDECHAR) && defined(USE_ICONV)
385    iconv_t id;
386    char2wchar_t    *c2w;
387    wchar2char_t    *w2c;
388
389    switch (option) {
390    case O_FILEENCODING:
391	c2w = &sp->conv.file2int;
392	w2c = &sp->conv.int2file;
393	break;
394    case O_INPUTENCODING:
395	c2w = &sp->conv.input2int;
396	w2c = NULL;
397	break;
398    default:
399	c2w = NULL;
400	w2c = NULL;
401	break;
402    }
403
404    if (!*enc) {
405	if (c2w) *c2w = raw2int;
406	if (w2c) *w2c = int2raw;
407	return 0;
408    }
409
410    if (!strcmp(enc, "WCHAR_T")) {
411	if (c2w) *c2w = CHAR_T_char2int;
412	if (w2c) *w2c = CHAR_T_int2char;
413	return 0;
414    }
415
416    id = iconv_open(enc, nl_langinfo(CODESET));
417    if (id == (iconv_t)-1)
418	goto err;
419    iconv_close(id);
420    id = iconv_open(nl_langinfo(CODESET), enc);
421    if (id == (iconv_t)-1)
422	goto err;
423    iconv_close(id);
424
425    switch (option) {
426    case O_FILEENCODING:
427	*c2w = fe_char2int;
428	*w2c = fe_int2char;
429	break;
430    case O_INPUTENCODING:
431	*c2w = ie_char2int;
432	break;
433    }
434
435    F_CLR(sp, SC_CONV_ERROR);
436    F_SET(sp, SC_SCR_REFORMAT);
437
438    return 0;
439err:
440    switch (option) {
441    case O_FILEENCODING:
442	msgq(sp, M_ERR,
443	    "321|File encoding conversion not supported");
444	break;
445    case O_INPUTENCODING:
446	msgq(sp, M_ERR,
447	    "322|Input encoding conversion not supported");
448	break;
449    }
450#endif
451    return 1;
452}
453
454