chartype.c revision 296175
1/*	$NetBSD: chartype.c,v 1.23 2016/02/28 23:02:24 christos Exp $	*/
2
3/*-
4 * Copyright (c) 2009 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/*
30 * chartype.c: character classification and meta information
31 */
32#include "config.h"
33#if !defined(lint) && !defined(SCCSID)
34__RCSID("$NetBSD: chartype.c,v 1.23 2016/02/28 23:02:24 christos Exp $");
35#endif /* not lint && not SCCSID */
36#include <sys/cdefs.h>
37__FBSDID("$FreeBSD: head/lib/libedit/chartype.c 296175 2016-02-29 00:15:25Z pfg $");
38
39#include <ctype.h>
40#include <stdlib.h>
41#include <string.h>
42
43#include "el.h"
44
45#define CT_BUFSIZ ((size_t)1024)
46
47#ifdef WIDECHAR
48protected int
49ct_conv_cbuff_resize(ct_buffer_t *conv, size_t csize)
50{
51	void *p;
52
53	if (csize <= conv->csize)
54		return 0;
55
56	conv->csize = csize;
57
58	p = el_realloc(conv->cbuff, conv->csize * sizeof(*conv->cbuff));
59	if (p == NULL) {
60		conv->csize = 0;
61		el_free(conv->cbuff);
62		conv->cbuff = NULL;
63		return -1;
64	}
65	conv->cbuff = p;
66	return 0;
67}
68
69protected int
70ct_conv_wbuff_resize(ct_buffer_t *conv, size_t wsize)
71{
72	void *p;
73
74	if (wsize <= conv->wsize)
75		return 0;
76
77	conv->wsize = wsize;
78
79	p = el_realloc(conv->wbuff, conv->wsize * sizeof(*conv->wbuff));
80	if (p == NULL) {
81		conv->wsize = 0;
82		el_free(conv->wbuff);
83		conv->wbuff = NULL;
84		return -1;
85	}
86	conv->wbuff = p;
87	return 0;
88}
89
90
91public char *
92ct_encode_string(const Char *s, ct_buffer_t *conv)
93{
94	char *dst;
95	ssize_t used;
96
97	if (!s)
98		return NULL;
99
100	dst = conv->cbuff;
101	for (;;) {
102		used = (ssize_t)(dst - conv->cbuff);
103		if ((conv->csize - (size_t)used) < 5) {
104			if (ct_conv_cbuff_resize(conv,
105			    conv->csize + CT_BUFSIZ) == -1)
106				return NULL;
107			dst = conv->cbuff + used;
108		}
109		if (!*s)
110			break;
111		used = ct_encode_char(dst, (size_t)5, *s);
112		if (used == -1) /* failed to encode, need more buffer space */
113			abort();
114		++s;
115		dst += used;
116	}
117	*dst = '\0';
118	return conv->cbuff;
119}
120
121public Char *
122ct_decode_string(const char *s, ct_buffer_t *conv)
123{
124	size_t len;
125
126	if (!s)
127		return NULL;
128
129	len = ct_mbstowcs(NULL, s, (size_t)0);
130	if (len == (size_t)-1)
131		return NULL;
132
133	if (conv->wsize < ++len)
134		if (ct_conv_wbuff_resize(conv, len + CT_BUFSIZ) == -1)
135			return NULL;
136
137	ct_mbstowcs(conv->wbuff, s, conv->wsize);
138	return conv->wbuff;
139}
140
141
142protected Char **
143ct_decode_argv(int argc, const char *argv[], ct_buffer_t *conv)
144{
145	size_t bufspace;
146	int i;
147	Char *p;
148	Char **wargv;
149	ssize_t bytes;
150
151	/* Make sure we have enough space in the conversion buffer to store all
152	 * the argv strings. */
153	for (i = 0, bufspace = 0; i < argc; ++i)
154		bufspace += argv[i] ? strlen(argv[i]) + 1 : 0;
155	if (conv->wsize < ++bufspace)
156		if (ct_conv_wbuff_resize(conv, bufspace + CT_BUFSIZ) == -1)
157			return NULL;
158
159	wargv = el_malloc((size_t)argc * sizeof(*wargv));
160
161	for (i = 0, p = conv->wbuff; i < argc; ++i) {
162		if (!argv[i]) {   /* don't pass null pointers to mbstowcs */
163			wargv[i] = NULL;
164			continue;
165		} else {
166			wargv[i] = p;
167			bytes = (ssize_t)mbstowcs(p, argv[i], bufspace);
168		}
169		if (bytes == -1) {
170			el_free(wargv);
171			return NULL;
172		} else
173			bytes++;  /* include '\0' in the count */
174		bufspace -= (size_t)bytes;
175		p += bytes;
176	}
177
178	return wargv;
179}
180
181
182protected size_t
183ct_enc_width(Char c)
184{
185	/* UTF-8 encoding specific values */
186	if (c < 0x80)
187		return 1;
188	else if (c < 0x0800)
189		return 2;
190	else if (c < 0x10000)
191		return 3;
192	else if (c < 0x110000)
193		return 4;
194	else
195		return 0; /* not a valid codepoint */
196}
197
198protected ssize_t
199ct_encode_char(char *dst, size_t len, Char c)
200{
201	ssize_t l = 0;
202	if (len < ct_enc_width(c))
203		return -1;
204	l = ct_wctomb(dst, c);
205
206	if (l < 0) {
207		ct_wctomb_reset;
208		l = 0;
209	}
210	return l;
211}
212
/*
 * Convert the next multibyte character in s (at most n bytes) to a wide
 * character stored in *wc, using a fresh, zeroed conversion state on
 * every call.  The return value is whatever mbrtowc() returns.
 */
size_t
ct_mbrtowc(wchar_t *wc, const char *s, size_t n)
{
	mbstate_t state;
	size_t consumed;

	/* Starting from a blank state only works because UTF-8 is stateless. */
	memset(&state, 0, sizeof(state));
	consumed = mbrtowc(wc, s, n, &state);
	return consumed;
}
221
222#else
223
/*
 * Single-byte stand-in for mbrtowc() used when WIDECHAR is not defined.
 *
 * Returns 0 when s is NULL or points at a NUL byte, (size_t)-2 when no
 * input bytes are available (n == 0), and 1 otherwise.  When wc is not
 * NULL it receives the byte at *s.
 */
size_t
ct_mbrtowc(wchar_t *wc, const char *s, size_t n)
{
	if (s == NULL)
		return 0;
	if (n == 0)
		return (size_t)-2;
	if (wc != NULL)
		*wc = *s;
	return *s != '\0';
}
234#endif
235
236protected const Char *
237ct_visual_string(const Char *s)
238{
239	static Char *buff = NULL;
240	static size_t buffsize = 0;
241	void *p;
242	Char *dst;
243	ssize_t used = 0;
244
245	if (!s)
246		return NULL;
247	if (!buff) {
248	    buffsize = CT_BUFSIZ;
249	    buff = el_malloc(buffsize * sizeof(*buff));
250	}
251	dst = buff;
252	while (*s) {
253		used = ct_visual_char(dst, buffsize - (size_t)(dst - buff), *s);
254		if (used == -1) { /* failed to encode, need more buffer space */
255			used = dst - buff;
256			buffsize += CT_BUFSIZ;
257			p = el_realloc(buff, buffsize * sizeof(*buff));
258			if (p == NULL)
259				goto out;
260			buff = p;
261			dst = buff + used;
262			/* don't increment s here - we want to retry it! */
263		}
264		else
265		    ++s;
266		dst += used;
267	}
268	if (dst >= (buff + buffsize)) { /* sigh */
269		buffsize += 1;
270		p = el_realloc(buff, buffsize * sizeof(*buff));
271		if (p == NULL)
272			goto out;
273		buff = p;
274		dst = buff + buffsize - 1;
275	}
276	*dst = 0;
277	return buff;
278out:
279	el_free(buff);
280	buffsize = 0;
281	return NULL;
282}
283
284
285
286protected int
287ct_visual_width(Char c)
288{
289	int t = ct_chr_class(c);
290	switch (t) {
291	case CHTYPE_ASCIICTL:
292		return 2; /* ^@ ^? etc. */
293	case CHTYPE_TAB:
294		return 1; /* Hmm, this really need to be handled outside! */
295	case CHTYPE_NL:
296		return 0; /* Should this be 1 instead? */
297#ifdef WIDECHAR
298	case CHTYPE_PRINT:
299		return wcwidth(c);
300	case CHTYPE_NONPRINT:
301		if (c > 0xffff) /* prefer standard 4-byte display over 5-byte */
302			return 8; /* \U+12345 */
303		else
304			return 7; /* \U+1234 */
305#else
306	case CHTYPE_PRINT:
307		return 1;
308	case CHTYPE_NONPRINT:
309		return 4; /* \123 */
310#endif
311	default:
312		return 0; /* should not happen */
313	}
314}
315
316
317protected ssize_t
318ct_visual_char(Char *dst, size_t len, Char c)
319{
320	int t = ct_chr_class(c);
321	switch (t) {
322	case CHTYPE_TAB:
323	case CHTYPE_NL:
324	case CHTYPE_ASCIICTL:
325		if (len < 2)
326			return -1;   /* insufficient space */
327		*dst++ = '^';
328		if (c == '\177')
329			*dst = '?'; /* DEL -> ^? */
330		else
331			*dst = c | 0100;    /* uncontrolify it */
332		return 2;
333	case CHTYPE_PRINT:
334		if (len < 1)
335			return -1;  /* insufficient space */
336		*dst = c;
337		return 1;
338	case CHTYPE_NONPRINT:
339		/* we only use single-width glyphs for display,
340		 * so this is right */
341		if ((ssize_t)len < ct_visual_width(c))
342			return -1;   /* insufficient space */
343#ifdef WIDECHAR
344		*dst++ = '\\';
345		*dst++ = 'U';
346		*dst++ = '+';
347#define tohexdigit(v) "0123456789ABCDEF"[v]
348		if (c > 0xffff) /* prefer standard 4-byte display over 5-byte */
349			*dst++ = tohexdigit(((unsigned int) c >> 16) & 0xf);
350		*dst++ = tohexdigit(((unsigned int) c >> 12) & 0xf);
351		*dst++ = tohexdigit(((unsigned int) c >>  8) & 0xf);
352		*dst++ = tohexdigit(((unsigned int) c >>  4) & 0xf);
353		*dst   = tohexdigit(((unsigned int) c      ) & 0xf);
354		return c > 0xffff ? 8 : 7;
355#else
356		*dst++ = '\\';
357#define tooctaldigit(v) (Char)((v) + '0')
358		*dst++ = tooctaldigit(((unsigned int) c >> 6) & 0x7);
359		*dst++ = tooctaldigit(((unsigned int) c >> 3) & 0x7);
360		*dst++ = tooctaldigit(((unsigned int) c     ) & 0x7);
361#endif
362		/*FALLTHROUGH*/
363	/* these two should be handled outside this function */
364	default:            /* we should never hit the default */
365		return 0;
366	}
367}
368
369
370
371
372protected int
373ct_chr_class(Char c)
374{
375	if (c == '\t')
376		return CHTYPE_TAB;
377	else if (c == '\n')
378		return CHTYPE_NL;
379	else if (IsASCII(c) && Iscntrl(c))
380		return CHTYPE_ASCIICTL;
381	else if (Isprint(c))
382		return CHTYPE_PRINT;
383	else
384		return CHTYPE_NONPRINT;
385}
386