1/*	$NetBSD: chartype.c,v 1.4 2010/04/15 00:55:57 christos Exp $	*/
2
3/*-
4 * Copyright (c) 2009 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 * 3. All advertising materials mentioning features or use of this software
16 *    must display the following acknowledgement:
17 *        This product includes software developed by the NetBSD
18 *        Foundation, Inc. and its contributors.
19 * 4. Neither the name of The NetBSD Foundation nor the names of its
20 *    contributors may be used to endorse or promote products derived
21 *    from this software without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
24 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
25 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
26 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
27 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
28 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
29 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
30 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
31 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
32 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33 * POSSIBILITY OF SUCH DAMAGE.
34 */
35
36/*
37 * chartype.c: character classification and meta information
38 */
39#include "config.h"
40#if !defined(lint) && !defined(SCCSID)
41__RCSID("$NetBSD: chartype.c,v 1.4 2010/04/15 00:55:57 christos Exp $");
42#endif /* not lint && not SCCSID */
43#include "el.h"
44#include <stdlib.h>
45
46#define CT_BUFSIZ 1024
47
48#ifdef WIDECHAR
49protected void
50ct_conv_buff_resize(ct_buffer_t *conv, size_t mincsize, size_t minwsize)
51{
52	void *p;
53	if (mincsize > conv->csize) {
54		conv->csize = mincsize;
55		p = el_realloc(conv->cbuff, conv->csize);
56		if (p == NULL) {
57			conv->csize = 0;
58			el_free(conv->cbuff);
59			conv->cbuff = NULL;
60		} else
61			conv->cbuff = p;
62	}
63
64	if (minwsize > conv->wsize) {
65		conv->wsize = minwsize;
66		p = el_realloc(conv->wbuff, conv->wsize);
67		if (p == NULL) {
68			conv->wsize = 0;
69			el_free(conv->wbuff);
70			conv->wbuff = NULL;
71		} else
72			conv->wbuff = p;
73	}
74}
75
76
77public char *
78ct_encode_string(const Char *s, ct_buffer_t *conv)
79{
80	char *dst;
81	ssize_t used = 0;
82
83	if (!s)
84		return NULL;
85	if (!conv->cbuff)
86		ct_conv_buff_resize(conv, CT_BUFSIZ, 0);
87	if (!conv->cbuff)
88		return NULL;
89
90	dst = conv->cbuff;
91	while (*s) {
92		used = ct_encode_char(dst, (int)(conv->csize -
93		    (dst - conv->cbuff)), *s);
94		if (used == -1) { /* failed to encode, need more buffer space */
95			used = dst - conv->cbuff;
96			ct_conv_buff_resize(conv, conv->csize + CT_BUFSIZ, 0);
97			if (!conv->cbuff)
98				return NULL;
99			dst = conv->cbuff + used;
100			/* don't increment s here - we want to retry it! */
101		}
102		else
103			++s;
104		dst += used;
105	}
106	if (dst >= (conv->cbuff + conv->csize)) {
107		used = dst - conv->cbuff;
108		ct_conv_buff_resize(conv, conv->csize + 1, 0);
109		if (!conv->cbuff)
110			return NULL;
111		dst = conv->cbuff + used;
112	}
113	*dst = '\0';
114	return conv->cbuff;
115}
116
117public Char *
118ct_decode_string(const char *s, ct_buffer_t *conv)
119{
120	size_t len = 0;
121
122	if (!s)
123		return NULL;
124	if (!conv->wbuff)
125		ct_conv_buff_resize(conv, 0, CT_BUFSIZ);
126	if (!conv->wbuff)
127		return NULL;
128
129	len = ct_mbstowcs(0, s, 0);
130	if (len > conv->wsize)
131		ct_conv_buff_resize(conv, 0, len + 1);
132	if (!conv->wbuff)
133		return NULL;
134	ct_mbstowcs(conv->wbuff, s, conv->wsize);
135	return conv->wbuff;
136}
137
138
139protected Char **
140ct_decode_argv(int argc, const char *argv[], ct_buffer_t *conv)
141{
142	size_t bufspace;
143	int i;
144	Char *p;
145	Char **wargv;
146	ssize_t bytes;
147
148	/* Make sure we have enough space in the conversion buffer to store all
149	 * the argv strings. */
150	for (i = 0, bufspace = 0; i < argc; ++i)
151		bufspace += argv[i] ? strlen(argv[i]) + 1 : 0;
152	ct_conv_buff_resize(conv, 0, bufspace);
153	if (!conv->wsize)
154		return NULL;
155
156	wargv = el_malloc(argc * sizeof(*wargv));
157
158	for (i = 0, p = conv->wbuff; i < argc; ++i) {
159		if (!argv[i]) {   /* don't pass null pointers to mbstowcs */
160			wargv[i] = NULL;
161			continue;
162		} else {
163			wargv[i] = p;
164			bytes = mbstowcs(p, argv[i], bufspace);
165		}
166		if (bytes == -1) {
167			el_free(wargv);
168			return NULL;
169		} else
170			bytes++;  /* include '\0' in the count */
171		bufspace -= bytes;
172		p += bytes;
173	}
174
175	return wargv;
176}
177
178
179protected size_t
180ct_enc_width(Char c)
181{
182	/* UTF-8 encoding specific values */
183	if (c < 0x80)
184		return 1;
185	else if (c < 0x0800)
186		return 2;
187	else if (c < 0x10000)
188		return 3;
189	else if (c < 0x110000)
190		return 4;
191	else
192		return 0; /* not a valid codepoint */
193}
194
195protected ssize_t
196ct_encode_char(char *dst, size_t len, Char c)
197{
198	ssize_t l = 0;
199	if (len < ct_enc_width(c))
200		return -1;
201	l = ct_wctomb(dst, c);
202
203	if (l < 0) {
204		ct_wctomb_reset;
205		l = 0;
206	}
207	return l;
208}
209#endif
210
211protected const Char *
212ct_visual_string(const Char *s)
213{
214	static Char *buff = NULL;
215	static size_t buffsize = 0;
216	void *p;
217	Char *dst;
218	ssize_t used = 0;
219
220	if (!s)
221		return NULL;
222	if (!buff) {
223	    buffsize = CT_BUFSIZ;
224	    buff = el_malloc(buffsize * sizeof(*buff));
225	}
226	dst = buff;
227	while (*s) {
228		used = ct_visual_char(dst, buffsize - (dst - buff), *s);
229		if (used == -1) { /* failed to encode, need more buffer space */
230			used = dst - buff;
231			buffsize += CT_BUFSIZ;
232			p = el_realloc(buff, buffsize * sizeof(*buff));
233			if (p == NULL)
234				goto out;
235			buff = p;
236			dst = buff + used;
237			/* don't increment s here - we want to retry it! */
238		}
239		else
240		    ++s;
241		dst += used;
242	}
243	if (dst >= (buff + buffsize)) { /* sigh */
244		buffsize += 1;
245		p = el_realloc(buff, buffsize * sizeof(*buff));
246		if (p == NULL)
247			goto out;
248		buff = p;
249		dst = buff + buffsize - 1;
250	}
251	*dst = 0;
252	return buff;
253out:
254	el_free(buff);
255	buffsize = 0;
256	return NULL;
257}
258
259
260
261protected int
262ct_visual_width(Char c)
263{
264	int t = ct_chr_class(c);
265	switch (t) {
266	case CHTYPE_ASCIICTL:
267		return 2; /* ^@ ^? etc. */
268	case CHTYPE_TAB:
269		return 1; /* Hmm, this really need to be handled outside! */
270	case CHTYPE_NL:
271		return 0; /* Should this be 1 instead? */
272#ifdef WIDECHAR
273	case CHTYPE_PRINT:
274		return wcwidth(c);
275	case CHTYPE_NONPRINT:
276		if (c > 0xffff) /* prefer standard 4-byte display over 5-byte */
277			return 8; /* \U+12345 */
278		else
279			return 7; /* \U+1234 */
280#else
281	case CHTYPE_PRINT:
282		return 1;
283	case CHTYPE_NONPRINT:
284		return 4; /* \123 */
285#endif
286	default:
287		return 0; /* should not happen */
288	}
289}
290
291
292protected ssize_t
293ct_visual_char(Char *dst, size_t len, Char c)
294{
295	int t = ct_chr_class(c);
296	switch (t) {
297	case CHTYPE_TAB:
298	case CHTYPE_NL:
299	case CHTYPE_ASCIICTL:
300		if (len < 2)
301			return -1;   /* insufficient space */
302		*dst++ = '^';
303		if (c == '\177')
304			*dst = '?'; /* DEL -> ^? */
305		else
306			*dst = c | 0100;    /* uncontrolify it */
307		return 2;
308	case CHTYPE_PRINT:
309		if (len < 1)
310			return -1;  /* insufficient space */
311		*dst = c;
312		return 1;
313	case CHTYPE_NONPRINT:
314		/* we only use single-width glyphs for display,
315		 * so this is right */
316		if ((ssize_t)len < ct_visual_width(c))
317			return -1;   /* insufficient space */
318#ifdef WIDECHAR
319		*dst++ = '\\';
320		*dst++ = 'U';
321		*dst++ = '+';
322#define tohexdigit(v) "0123456789ABCDEF"[v]
323		if (c > 0xffff) /* prefer standard 4-byte display over 5-byte */
324			*dst++ = tohexdigit(((unsigned int) c >> 16) & 0xf);
325		*dst++ = tohexdigit(((unsigned int) c >> 12) & 0xf);
326		*dst++ = tohexdigit(((unsigned int) c >>  8) & 0xf);
327		*dst++ = tohexdigit(((unsigned int) c >>  4) & 0xf);
328		*dst   = tohexdigit(((unsigned int) c      ) & 0xf);
329		return (c > 0xffff) ? 8 : 7;
330#else
331		*dst++ = '\\';
332#define tooctaldigit(v) ((v) + '0')
333		*dst++ = tooctaldigit(((unsigned int) c >> 6) & 0x7);
334		*dst++ = tooctaldigit(((unsigned int) c >> 3) & 0x7);
335		*dst++ = tooctaldigit(((unsigned int) c     ) & 0x7);
336#endif
337		/*FALLTHROUGH*/
338	/* these two should be handled outside this function */
339	default:            /* we should never hit the default */
340		return 0;
341	}
342}
343
344
345
346
347protected int
348ct_chr_class(Char c)
349{
350	if (c == '\t')
351		return CHTYPE_TAB;
352	else if (c == '\n')
353		return CHTYPE_NL;
354	else if (IsASCII(c) && Iscntrl(c))
355		return CHTYPE_ASCIICTL;
356	else if (Isprint(c))
357		return CHTYPE_PRINT;
358	else
359		return CHTYPE_NONPRINT;
360}
361