/* conv.c revision 1.5 */
/*	$NetBSD: conv.c,v 1.5 2017/11/06 03:02:22 rin Exp $ */
/*-
 * Copyright (c) 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 * Copyright (c) 1993, 1994, 1995, 1996
 *	Keith Bostic.  All rights reserved.
 *
 * See the LICENSE file for redistribution information.
 */
10
11#include "config.h"
12
13#include <sys/cdefs.h>
14#if 0
15#ifndef lint
16static const char sccsid[] = "Id: conv.c,v 1.27 2001/08/18 21:41:41 skimo Exp  (Berkeley) Date: 2001/08/18 21:41:41 ";
17#endif /* not lint */
18#else
19__RCSID("$NetBSD: conv.c,v 1.5 2017/11/06 03:02:22 rin Exp $");
20#endif
21
22#include <sys/types.h>
23#include <sys/queue.h>
24#include <sys/time.h>
25
26#include <bitstring.h>
27#include <errno.h>
28#include <limits.h>
29#include <stdio.h>
30#include <stdlib.h>
31#include <string.h>
32#include <unistd.h>
33
34#include "common.h"
35
36#ifdef USE_ICONV
37#include <langinfo.h>
38#include <iconv.h>
39
40#define LANGCODESET	nl_langinfo(CODESET)
41#else
42typedef int	iconv_t;
43
44#define LANGCODESET	""
45#endif
46
47#include <locale.h>
48
49#ifdef USE_WIDECHAR
50static int
51raw2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw, size_t *tolen,
52	const CHAR_T **dst)
53{
54    int i;
55    CHAR_T **tostr = (CHAR_T **)(void *)&cw->bp1;
56    size_t  *blen = &cw->blen1;
57
58    BINC_RETW(NULL, *tostr, *blen, len);
59
60    *tolen = len;
61    for (i = 0; i < len; ++i) {
62	CHAR_T w = (u_char)str[i];
63	memcpy((*tostr) + i, &w, sizeof(**tostr));
64    }
65
66    *dst = cw->bp1;
67
68    return 0;
69}
70
/*
 * Conversion error policy.
 *
 * Without ERROR_ON_CONVERT (the lenient default) an unconvertible unit
 * is passed through as-is and the conversion carries on:
 *
 *	HANDLE_ICONV_ERROR(o, i, ol, il)
 *		copy one byte from input cursor i to output cursor o,
 *		advance both, and decrement the remaining counts ol/il.
 *	HANDLE_MBR_ERROR(n, mbs, d, s)
 *		store s into d, reset the shift state mbs and pretend
 *		that one unit (n = 1) was converted.
 *
 * With ERROR_ON_CONVERT both handlers jump to the enclosing function's
 * `err' label.  They are function-like in both configurations because
 * every use passes arguments; an object-like definition would expand
 * to the invalid `goto err(...)'.
 */
#ifndef ERROR_ON_CONVERT
#define HANDLE_ICONV_ERROR(o, i, ol, il) do {				\
		*o++ = *i++;						\
		ol--; il--;						\
	} while (/*CONSTCOND*/0)
#define HANDLE_MBR_ERROR(n, mbs, d, s) do {				\
		d = s;							\
		MEMSET(&mbs, 0, 1); 					\
		n = 1; 							\
	} while (/*CONSTCOND*/0)
#else
#define HANDLE_ICONV_ERROR(o, i, ol, il)	goto err
#define	HANDLE_MBR_ERROR(n, mbs, d, s)		goto err
#endif

/* Size in bytes of the on-stack staging buffer used around iconv(3). */
#define CONV_BUFFER_SIZE    512

/*
 * CONVERT(str, left, src, len) --
 *	Run the next piece of str through iconv(3) into the caller's
 *	local `buffer'.
 *
 *	str	input cursor; advanced past the bytes consumed
 *	left	bytes remaining at str; decreased accordingly
 *	src	set to `buffer', i.e. where the converted bytes start
 *	len	set to the number of bytes placed in `buffer'
 *
 * The macro relies on these names in the expanding function: `id' (the
 * iconv descriptor), `buffer' (CONV_BUFFER_SIZE bytes), `error' and the
 * label `err'.  E2BIG merely means the buffer filled up and is not
 * treated as a failure; any other iconv failure goes through
 * HANDLE_ICONV_ERROR.  If nothing at all was produced, `error' is set
 * to the negated count of unconsumed bytes and control jumps to `err'.
 *
 * Without USE_ICONV the macro expands to nothing.
 */
#ifdef USE_ICONV
#define CONVERT(str, left, src, len)				    	\
    do {								\
	size_t outleft;							\
	char *bp = buffer;						\
	outleft = CONV_BUFFER_SIZE;					\
	errno = 0;							\
	if (iconv(id, (const char **)&str, &left, &bp, &outleft) 	\
	    == (size_t)-1 && errno != E2BIG)				\
		HANDLE_ICONV_ERROR(bp, str, outleft, left);		\
	if ((len = CONV_BUFFER_SIZE - outleft) == 0) {			\
	    error = -left;						\
	    goto err;							\
	}				    				\
	src = buffer;							\
    } while (0)
#else
#define CONVERT(str, left, src, len)
#endif
110
111static int
112default_char2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw,
113		size_t *tolen, const CHAR_T **dst, const char *enc)
114{
115    int j;
116    size_t i = 0;
117    CHAR_T **tostr = (CHAR_T **)(void *)&cw->bp1;
118    size_t  *blen = &cw->blen1;
119    mbstate_t mbs;
120    size_t   n;
121    ssize_t  nlen = len;
122    const char *src = (const char *)str;
123    iconv_t	id = (iconv_t)-1;
124    char	buffer[CONV_BUFFER_SIZE];
125    size_t	left = len;
126    int		error = 1;
127
128    MEMSET(&mbs, 0, 1);
129    BINC_RETW(NULL, *tostr, *blen, nlen);
130
131#ifdef USE_ICONV
132    if (strcmp(nl_langinfo(CODESET), enc)) {
133	id = iconv_open(nl_langinfo(CODESET), enc);
134	if (id == (iconv_t)-1)
135	    goto err;
136	CONVERT(str, left, src, len);
137    }
138#endif
139
140    for (i = 0, j = 0; j < len; ) {
141	CHAR_T w;
142	n = mbrtowc(&w, src + j, len - j, &mbs);
143	memcpy((*tostr) + i, &w, sizeof(**tostr));
144	/* NULL character converted */
145	if (n == (size_t)-2) error = -(len - j);
146	if (n == (size_t)-1 || n == (size_t)-2) {
147	    HANDLE_MBR_ERROR(n, mbs, w, src[j]);
148	    memcpy((*tostr) + i, &w, sizeof(**tostr));
149	}
150	if (n == 0) n = 1;
151	j += n;
152	if (++i >= *blen) {
153	    nlen += 256;
154	    BINC_RETW(NULL, *tostr, *blen, nlen);
155	}
156	if (id != (iconv_t)-1 && j == len && left) {
157	    CONVERT(str, left, src, len);
158	    j = 0;
159	}
160    }
161    *tolen = i;
162
163    if (id != (iconv_t)-1)
164	iconv_close(id);
165
166    *dst = cw->bp1;
167
168    return 0;
169err:
170    *tolen = i;
171    if (id != (iconv_t)-1)
172	iconv_close(id);
173    *dst = cw->bp1;
174
175    return error;
176}
177
178static int
179fe_char2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw,
180	    size_t *tolen, const CHAR_T **dst)
181{
182    return default_char2int(sp, str, len, cw, tolen, dst, O_STR(sp, O_FILEENCODING));
183}
184
185static int
186ie_char2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw,
187	    size_t *tolen, const CHAR_T **dst)
188{
189    return default_char2int(sp, str, len, cw, tolen, dst, O_STR(sp, O_INPUTENCODING));
190}
191
192static int
193cs_char2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw,
194	    size_t *tolen, const CHAR_T **dst)
195{
196    return default_char2int(sp, str, len, cw, tolen, dst, LANGCODESET);
197}
198
199static int
200CHAR_T_int2char(SCR *sp, const CHAR_T * str, ssize_t len, CONVWIN *cw,
201	size_t *tolen, const char **dst)
202{
203    *tolen = len * sizeof(CHAR_T);
204    *dst = (const char *)(const void *)str;
205
206    return 0;
207}
208
209static int
210CHAR_T_char2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw,
211	size_t *tolen, const CHAR_T **dst)
212{
213    *tolen = len / sizeof(CHAR_T);
214    *dst = (const CHAR_T*) str;
215
216    return 0;
217}
218
219static int
220int2raw(SCR *sp, const CHAR_T * str, ssize_t len, CONVWIN *cw, size_t *tolen,
221	const char **dst)
222{
223    int i;
224    char **tostr = (char **)(void *)&cw->bp1;
225    size_t  *blen = &cw->blen1;
226
227    BINC_RETC(NULL, *tostr, *blen, len);
228
229    *tolen = len;
230    for (i = 0; i < len; ++i) {
231	CHAR_T w;
232	memcpy(&w, str + i, sizeof(w));
233	(*tostr)[i] = w;
234    }
235
236    *dst = cw->bp1;
237
238    return 0;
239}
240
241static int
242default_int2char(SCR *sp, const CHAR_T * str, ssize_t len, CONVWIN *cw,
243		size_t *tolen, const char **pdst, const char *enc)
244{
245    size_t i, j;
246    int offset = 0;
247    char **tostr = (char **)(void *)&cw->bp1;
248    size_t  *blen = &cw->blen1;
249    mbstate_t mbs;
250    size_t n;
251    ssize_t  nlen = len + MB_CUR_MAX;
252    char *dst;
253    size_t buflen;
254    char	buffer[CONV_BUFFER_SIZE];
255    iconv_t	id = (iconv_t)-1;
256
257/* convert first len bytes of buffer and append it to cw->bp
258 * len is adjusted => 0
259 * offset contains the offset in cw->bp and is adjusted
260 * cw->bp is grown as required
261 */
262#ifdef USE_ICONV
263#define CONVERT2(len, cw, offset)					\
264    do {								\
265	const char *bp = buffer;					\
266	while (len != 0) {						\
267	    size_t outleft = cw->blen1 - offset;			\
268	    char *obp = (char *)cw->bp1 + offset;		    	\
269	    if (cw->blen1 < offset + MB_CUR_MAX) {		    	\
270		nlen += 256;						\
271		BINC_RETC(NULL, cw->bp1, cw->blen1, nlen);		\
272	    }						    		\
273	    errno = 0;						    	\
274	    if (iconv(id, &bp, &len, &obp, &outleft) == (size_t)-1 &&	\
275		errno != E2BIG) 					\
276		    HANDLE_ICONV_ERROR(obp, bp, outleft, len);		\
277	    offset = cw->blen1 - outleft;			        \
278	}							        \
279    } while (0)
280#else
281#define CONVERT2(len, cw, offset)
282#endif
283
284
285    MEMSET(&mbs, 0, 1);
286    BINC_RETC(NULL, *tostr, *blen, nlen);
287    dst = *tostr; buflen = *blen;
288
289#ifdef USE_ICONV
290    if (strcmp(nl_langinfo(CODESET), enc)) {
291	id = iconv_open(enc, nl_langinfo(CODESET));
292	if (id == (iconv_t)-1)
293	    goto err;
294	dst = buffer; buflen = CONV_BUFFER_SIZE;
295    }
296#endif
297
298    for (i = 0, j = 0; i < (size_t)len; ++i) {
299	CHAR_T w;
300	memcpy(&w, str + i, sizeof(w));
301	n = wcrtomb(dst + j, w, &mbs);
302	if (n == (size_t)-1)
303	   HANDLE_MBR_ERROR(n, mbs, dst[j], w);
304	j += n;
305	if (buflen < j + MB_CUR_MAX) {
306	    if (id != (iconv_t)-1) {
307		CONVERT2(j, cw, offset);
308	    } else {
309		nlen += 256;
310		BINC_RETC(NULL, *tostr, *blen, nlen);
311		dst = *tostr; buflen = *blen;
312	    }
313	}
314    }
315
316    n = wcrtomb(dst + j, L'\0', &mbs);
317    j += n - 1;				/* don't count NUL at the end */
318    *tolen = j;
319
320    if (id != (iconv_t)-1) {
321	CONVERT2(j, cw, offset);
322	*tolen = offset;
323    }
324
325    *pdst = cw->bp1;
326
327    return 0;
328err:
329    *tolen = j;
330
331    *pdst = cw->bp1;
332
333    return 1;
334}
335
336static int
337fe_int2char(SCR *sp, const CHAR_T * str, ssize_t len, CONVWIN *cw,
338	    size_t *tolen, const char **dst)
339{
340    return default_int2char(sp, str, len, cw, tolen, dst, O_STR(sp, O_FILEENCODING));
341}
342
343static int
344cs_int2char(SCR *sp, const CHAR_T * str, ssize_t len, CONVWIN *cw,
345	    size_t *tolen, const char **dst)
346{
347    return default_int2char(sp, str, len, cw, tolen, dst, LANGCODESET);
348}
349
350#endif
351
352
353void
354conv_init (SCR *orig, SCR *sp)
355{
356    if (orig != NULL)
357	MEMCPY(&sp->conv, &orig->conv, 1);
358    else {
359	setlocale(LC_ALL, "");
360#ifdef USE_WIDECHAR
361	sp->conv.sys2int = cs_char2int;
362	sp->conv.int2sys = cs_int2char;
363	sp->conv.file2int = fe_char2int;
364	sp->conv.int2file = fe_int2char;
365	sp->conv.input2int = ie_char2int;
366#endif
367#ifdef USE_ICONV
368	o_set(sp, O_FILEENCODING, OS_STRDUP, nl_langinfo(CODESET), 0);
369	o_set(sp, O_INPUTENCODING, OS_STRDUP, nl_langinfo(CODESET), 0);
370#endif
371    }
372}
373
374int
375conv_enc (SCR *sp, int option, const char *enc)
376{
377#if defined(USE_WIDECHAR) && defined(USE_ICONV)
378    iconv_t id;
379    char2wchar_t    *c2w;
380    wchar2char_t    *w2c;
381
382    switch (option) {
383    case O_FILEENCODING:
384	c2w = &sp->conv.file2int;
385	w2c = &sp->conv.int2file;
386	break;
387    case O_INPUTENCODING:
388	c2w = &sp->conv.input2int;
389	w2c = NULL;
390	break;
391    default:
392	c2w = NULL;
393	w2c = NULL;
394	break;
395    }
396
397    if (!*enc) {
398	if (c2w) *c2w = raw2int;
399	if (w2c) *w2c = int2raw;
400	return 0;
401    }
402
403    if (!strcmp(enc, "WCHAR_T")) {
404	if (c2w) *c2w = CHAR_T_char2int;
405	if (w2c) *w2c = CHAR_T_int2char;
406	return 0;
407    }
408
409    id = iconv_open(enc, nl_langinfo(CODESET));
410    if (id == (iconv_t)-1)
411	goto err;
412    iconv_close(id);
413    id = iconv_open(nl_langinfo(CODESET), enc);
414    if (id == (iconv_t)-1)
415	goto err;
416    iconv_close(id);
417
418    switch (option) {
419    case O_FILEENCODING:
420	*c2w = fe_char2int;
421	*w2c = fe_int2char;
422	break;
423    case O_INPUTENCODING:
424	*c2w = ie_char2int;
425	break;
426    }
427
428    F_CLR(sp, SC_CONV_ERROR);
429    F_SET(sp, SC_SCR_REFORMAT);
430
431    return 0;
432err:
433    switch (option) {
434    case O_FILEENCODING:
435	msgq(sp, M_ERR,
436	    "321|File encoding conversion not supported");
437	break;
438    case O_INPUTENCODING:
439	msgq(sp, M_ERR,
440	    "322|Input encoding conversion not supported");
441	break;
442    }
443#endif
444    return 1;
445}
446
447