utf8.c revision 323129
1/* $OpenBSD: utf8.c,v 1.3 2016/05/30 12:57:21 schwarze Exp $ */
2/*
3 * Copyright (c) 2016 Ingo Schwarze <schwarze@openbsd.org>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
18/*
19 * Utility functions for multibyte-character handling,
20 * in particular to sanitize untrusted strings for terminal output.
21 */
22
23#include "includes.h"
24
25#include <sys/types.h>
26#ifdef HAVE_LANGINFO_H
27# include <langinfo.h>
28#endif
29#include <limits.h>
30#include <stdarg.h>
31#include <stdio.h>
32#include <stdlib.h>
33#include <string.h>
34#if defined(HAVE_STRNVIS) && defined(HAVE_VIS_H) && !defined(BROKEN_STRNVIS)
35# include <vis.h>
36#endif
37#ifdef HAVE_WCHAR_H
38# include <wchar.h>
39#endif
40
41#include "utf8.h"
42
/* Forward declarations of the file-local helpers defined below. */
static int	 dangerous_locale(void);
static int	 grow_dst(char **dst, size_t *sz, size_t maxsz, char **dp,
		    size_t need);
static int	 vasnmprintf(char **str, size_t maxsz, int *wp,
		    const char *fmt, va_list ap);
46
47
/*
 * For US-ASCII and UTF-8 encodings, we can safely recover from
 * encoding errors and from non-printable characters.  For any
 * other encodings, err on the side of caution and abort parsing:
 * For state-dependent encodings, recovery is impossible.
 * For arbitrary encodings, replacement of non-printable
 * characters would be non-trivial and too fragile.
 *
 * The name reported for the plain ASCII codeset is not the same in
 * every C library, so each known spelling is accepted as safe.
 *
 * Returns 0 if the codeset of the current locale is ASCII or UTF-8,
 * and 1 otherwise.
 */

static int
dangerous_locale(void)
{
	static const char *const safe_codesets[] = {
		"UTF-8",
		"US-ASCII",		/* BSD spelling of ASCII */
		"ANSI_X3.4-1968",	/* glibc spelling of ASCII */
		"646",			/* ISO 646 spelling of ASCII */
		"ASCII"			/* musl spelling of ASCII */
	};
	const char	*loc;
	size_t		 i;

	loc = nl_langinfo(CODESET);
	for (i = 0; i < sizeof(safe_codesets) / sizeof(safe_codesets[0]); i++)
		if (strcmp(loc, safe_codesets[i]) == 0)
			return 0;
	return 1;
}
64
/*
 * Make sure that need bytes can be stored at *dp inside the buffer
 * *dst of *sz bytes while leaving at least one further byte free
 * (the caller stores the terminating '\0' there).
 *
 * If there is not enough room, the buffer is enlarged by 128 bytes,
 * but never beyond maxsz, and *dst, *sz and *dp are updated to refer
 * to the reallocated buffer.  No check is made that need fits after
 * growing; the caller has to make sure maxsz permits it.
 *
 * Returns 0 on success and -1 if realloc() fails, in which case
 * *dst, *sz and *dp are left untouched.
 */
static int
grow_dst(char **dst, size_t *sz, size_t maxsz, char **dp, size_t need)
{
	char	*tp;
	size_t	 used;
	size_t	 tsz;

	/*
	 * Compare byte offsets instead of pointers: *dp + need may lie
	 * more than one byte past the end of the allocation, and merely
	 * forming such a pointer is undefined behaviour.
	 */
	used = (size_t)(*dp - *dst);
	if (used < *sz && need < *sz - used)
		return 0;

	tsz = *sz + 128;
	if (tsz > maxsz)
		tsz = maxsz;
	if ((tp = realloc(*dst, tsz)) == NULL)
		return -1;

	/* Re-base the write position onto the (possibly moved) buffer. */
	*dp = tp + used;
	*dst = tp;
	*sz = tsz;
	return 0;
}
83
84/*
85 * The following two functions limit the number of bytes written,
86 * including the terminating '\0', to sz.  Unless wp is NULL,
87 * they limit the number of display columns occupied to *wp.
88 * Whichever is reached first terminates the output string.
89 * To stay close to the standard interfaces, they return the number of
90 * non-NUL bytes that would have been written if both were unlimited.
91 * If wp is NULL, newline, carriage return, and tab are allowed;
92 * otherwise, the actual number of columns occupied by what was
93 * written is returned in *wp.
94 */
95
96static int
97vasnmprintf(char **str, size_t maxsz, int *wp, const char *fmt, va_list ap)
98{
99	char	*src;	/* Source string returned from vasprintf. */
100	char	*sp;	/* Pointer into src. */
101	char	*dst;	/* Destination string to be returned. */
102	char	*dp;	/* Pointer into dst. */
103	char	*tp;	/* Temporary pointer for dst. */
104	size_t	 sz;	/* Number of bytes allocated for dst. */
105	wchar_t	 wc;	/* Wide character at sp. */
106	int	 len;	/* Number of bytes in the character at sp. */
107	int	 ret;	/* Number of bytes needed to format src. */
108	int	 width;	/* Display width of the character wc. */
109	int	 total_width, max_width, print;
110
111	src = NULL;
112	if ((ret = vasprintf(&src, fmt, ap)) <= 0)
113		goto fail;
114
115	sz = strlen(src) + 1;
116	if ((dst = malloc(sz)) == NULL) {
117		free(src);
118		goto fail;
119	}
120
121	if (maxsz > INT_MAX)
122		maxsz = INT_MAX;
123
124	sp = src;
125	dp = dst;
126	ret = 0;
127	print = 1;
128	total_width = 0;
129	max_width = wp == NULL ? INT_MAX : *wp;
130	while (*sp != '\0') {
131		if ((len = mbtowc(&wc, sp, MB_CUR_MAX)) == -1) {
132			(void)mbtowc(NULL, NULL, MB_CUR_MAX);
133			if (dangerous_locale()) {
134				ret = -1;
135				break;
136			}
137			len = 1;
138			width = -1;
139		} else if (wp == NULL &&
140		    (wc == L'\n' || wc == L'\r' || wc == L'\t')) {
141			/*
142			 * Don't use width uninitialized; the actual
143			 * value doesn't matter because total_width
144			 * is only returned for wp != NULL.
145			 */
146			width = 0;
147		} else if ((width = wcwidth(wc)) == -1 &&
148		    dangerous_locale()) {
149			ret = -1;
150			break;
151		}
152
153		/* Valid, printable character. */
154
155		if (width >= 0) {
156			if (print && (dp - dst >= (int)maxsz - len ||
157			    total_width > max_width - width))
158				print = 0;
159			if (print) {
160				if (grow_dst(&dst, &sz, maxsz,
161				    &dp, len) == -1) {
162					ret = -1;
163					break;
164				}
165				total_width += width;
166				memcpy(dp, sp, len);
167				dp += len;
168			}
169			sp += len;
170			if (ret >= 0)
171				ret += len;
172			continue;
173		}
174
175		/* Escaping required. */
176
177		while (len > 0) {
178			if (print && (dp - dst >= (int)maxsz - 4 ||
179			    total_width > max_width - 4))
180				print = 0;
181			if (print) {
182				if (grow_dst(&dst, &sz, maxsz,
183				    &dp, 4) == -1) {
184					ret = -1;
185					break;
186				}
187				tp = vis(dp, *sp, VIS_OCTAL | VIS_ALL, 0);
188				width = tp - dp;
189				total_width += width;
190				dp = tp;
191			} else
192				width = 4;
193			len--;
194			sp++;
195			if (ret >= 0)
196				ret += width;
197		}
198		if (len > 0)
199			break;
200	}
201	free(src);
202	*dp = '\0';
203	*str = dst;
204	if (wp != NULL)
205		*wp = total_width;
206
207	/*
208	 * If the string was truncated by the width limit but
209	 * would have fit into the size limit, the only sane way
210	 * to report the problem is using the return value, such
211	 * that the usual idiom "if (ret < 0 || ret >= sz) error"
212	 * works as expected.
213	 */
214
215	if (ret < (int)maxsz && !print)
216		ret = -1;
217	return ret;
218
219fail:
220	if (wp != NULL)
221		*wp = 0;
222	if (ret == 0) {
223		*str = src;
224		return 0;
225	} else {
226		*str = NULL;
227		return -1;
228	}
229}
230
/*
 * Sanitizing counterpart of snprintf(): format into the caller's
 * buffer str of sz bytes, optionally limited to *wp display columns.
 *
 * The result is copied with strlcpy(), so at most sz bytes including
 * the terminating '\0' are stored.  If vasnmprintf() hands back no
 * buffer, str is set to the empty string, provided it has room for
 * at least the terminator.  wp is passed through to vasnmprintf().
 *
 * Returns the value returned by vasnmprintf().
 */
int
snmprintf(char *str, size_t sz, int *wp, const char *fmt, ...)
{
	va_list	 ap;
	char	*cp = NULL;
	int	 ret;

	va_start(ap, fmt);
	ret = vasnmprintf(&cp, sz, wp, fmt, ap);
	va_end(ap);
	if (cp != NULL) {
		(void)strlcpy(str, cp, sz);
		free(cp);
	} else if (sz > 0) {
		/* Never store the terminator into a zero-sized buffer. */
		*str = '\0';
	}
	return ret;
}
248
249/*
250 * To stay close to the standard interfaces, the following functions
251 * return the number of non-NUL bytes written.
252 */
253
/*
 * Sanitizing counterpart of vfprintf(): format fmt/ap without a
 * column limit and write the result to stream with fputs().
 *
 * Returns the byte count reported by vasnmprintf(), or -1 if
 * formatting fails or fputs() reports EOF.
 */
int
vfmprintf(FILE *stream, const char *fmt, va_list ap)
{
	char	*str = NULL;
	int	 ret;

	if ((ret = vasnmprintf(&str, INT_MAX, NULL, fmt, ap)) < 0) {
		/*
		 * vasnmprintf() may hand back a partially filled
		 * buffer even when it fails, so release it here.
		 */
		free(str);
		return -1;
	}
	if (fputs(str, stream) == EOF)
		ret = -1;
	free(str);
	return ret;
}
267
/*
 * Variadic front end to vfmprintf(): collect the arguments following
 * fmt and hand them over together with stream.
 *
 * Returns whatever vfmprintf() returns.
 */
int
fmprintf(FILE *stream, const char *fmt, ...)
{
	va_list	 args;
	int	 nwritten;

	va_start(args, fmt);
	nwritten = vfmprintf(stream, fmt, args);
	va_end(args);

	return nwritten;
}
279
/*
 * Variadic front end to vfmprintf() that always writes to stdout.
 *
 * Returns whatever vfmprintf() returns.
 */
int
mprintf(const char *fmt, ...)
{
	va_list	 args;
	int	 nwritten;

	va_start(args, fmt);
	nwritten = vfmprintf(stdout, fmt, args);
	va_end(args);

	return nwritten;
}
291