1/*-
2 * Copyright (c) 2011, 2012
3 *	Zhihao Yuan.  All rights reserved.
4 *
5 * See the LICENSE file for redistribution information.
6 */
7
8#ifndef lint
9static const char sccsid[] = "$Id: encoding.c,v 1.4 2011/12/13 19:40:52 zy Exp $";
10#endif /* not lint */
11
12#include <sys/types.h>
13
/*
 * Forward declarations for the routines defined in this file.  __P() is
 * kept so the declarations match the PUBLIC: lines on each definition.
 */
int	looks_utf8 __P((const char *ibuf, size_t nbytes));
int	looks_utf16 __P((const char *ibuf, size_t nbytes));
int	decode_utf8 __P((const char *ibuf));
int	decode_utf16 __P((const char *ibuf, int bigend));
18
#define F 0   /* character never appears in text */
#define T 1   /* character appears in plain ASCII text */
#define I 2   /* character appears in ISO-8859 text */
#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */

/*
 * text_chars --
 *  Classification of every possible byte value, indexed by the byte
 *  itself.  Each entry is one of F, T, I or X as defined above.  The
 *  table is read-only.
 */
static const char text_chars[256] = {
	/*                  BEL BS HT LF    FF CR    */
	F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
	/*                              ESC          */
	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
	/*            NEL                            */
	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
};

/*
 * looks_utf8 --
 *  Decide whether the first nbytes bytes of ibuf look like UTF-8.
 *  Returns:
 *
 *     -1: invalid UTF-8
 *      0: uses odd control characters, so doesn't look like text
 *      1: 7-bit text
 *      2: definitely UTF-8 text (valid high-bit set bytes)
 *
 *  -1 is returned as soon as a malformed byte is seen, so it takes
 *  precedence over 0.  A multi-byte sequence that is cut off by the end
 *  of the buffer is tolerated (the buffer may be a prefix of a longer
 *  text) but does not count towards result 2.
 *
 *  Lead bytes are restricted as in RFC 3629 (C0, C1 and F5..FF are
 *  rejected).  Overlong forms and surrogates that are only detectable
 *  from the second byte are not checked.  A leading BOM gets no special
 *  treatment: its bytes are scanned like any other three-byte sequence.
 *
 * PUBLIC: int looks_utf8 __P((const char *, size_t));
 */
int
looks_utf8(const char *ibuf, size_t nbytes)
{
	const unsigned char *buf = (const unsigned char *)ibuf;
	size_t i;
	int n;
	int gotone = 0, ctrl = 0;

	for (i = 0; i < nbytes; i++) {
		if ((buf[i] & 0x80) == 0) {	   /* 0xxxxxxx is plain ASCII */
			/*
			 * Even if the whole file is valid UTF-8 sequences,
			 * still reject it if it uses weird control characters.
			 */
			if (text_chars[buf[i]] != T)
				ctrl = 1;
		} else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
			return -1;
		} else {			   /* 11xxxxxx begins UTF-8 */
			int following;

			if ((buf[i] & 0x20) == 0) {		/* 110xxxxx */
				if (buf[i] < 0xC2)		/* C0, C1 */
					return -1;
				following = 1;
			} else if ((buf[i] & 0x10) == 0) {	/* 1110xxxx */
				following = 2;
			} else if ((buf[i] & 0x08) == 0) {	/* 11110xxx */
				if (buf[i] > 0xF4)		/* F5, F6, F7 */
					return -1;
				following = 3;
			} else {
				return -1;			/* F8~FF */
			}

			for (n = 0; n < following; n++) {
				i++;
				if (i >= nbytes)
					goto done;

				/*
				 * A trail byte must be exactly 10xxxxxx.
				 * Testing bit 6 alone would let 7-bit bytes
				 * such as ' ' or '1' pass as continuations.
				 */
				if ((buf[i] & 0xC0) != 0x80)
					return -1;
			}

			gotone = 1;
		}
	}
done:
	return ctrl ? 0 : (gotone ? 2 : 1);
}
109
110/*
111 * looks_utf16 --
112 *  Decide whether some text looks like UTF-16. Returns:
113 *
114 *      0: invalid UTF-16
115 *      1: Little-endian UTF-16
116 *      2: Big-endian UTF-16
117 *
118 * PUBLIC: int looks_utf16 __P((const char *, size_t));
119 */
120int
121looks_utf16(const char *ibuf, size_t nbytes)
122{
123	const u_char *buf = (u_char *)ibuf;
124	int bigend;
125	size_t i;
126	unsigned int c;
127	int bom;
128	int following = 0;
129
130	if (nbytes < 2)
131		return 0;
132
133	bom = buf[0] << 8 ^ buf[1];
134	if (bom == 0xFFFE)
135		bigend = 0;
136	else if (bom == 0xFEFF)
137		bigend = 1;
138	else
139		return 0;
140
141	for (i = 2; i + 1 < nbytes; i += 2) {
142		if (bigend)
143			c = buf[i] << 8 ^ buf[i + 1];
144		else
145			c = buf[i] ^ buf[i + 1] << 8;
146
147		if (!following)
148			if (c < 0xD800 || c > 0xDFFF)
149				if (c < 128 && text_chars[c] != T)
150					return 0;
151				else
152					following = 0;
153			else if (c > 0xDBFF)
154				return 0;
155			else {
156				following = 1;
157				continue;
158			}
159		else if (c < 0xDC00 || c > 0xDFFF)
160			return 0;
161	}
162
163	return 1 + bigend;
164}
165
/*
 * F, T, I and X were defined above for the text_chars table; undefine
 * them here so these one-letter names are not visible to the code that
 * follows.
 */
#undef F
#undef T
#undef I
#undef X
170
/*
 * decode_utf8 --
 *  Decode the UTF-8 sequence starting at ibuf into a Unicode code point.
 *  Returns -1 if the first byte cannot start a sequence (a continuation
 *  byte 10xxxxxx, or F8..FF).
 *
 *  Based on RFC 3629, but without error detection: the trailing bytes
 *  are neither bounds-checked nor validated, so the caller must supply
 *  a complete sequence.
 *
 * PUBLIC: int decode_utf8 __P((const char *));
 */
int
decode_utf8(const char *ibuf)
{
	const unsigned char *p = (const unsigned char *)ibuf;
	const int lead = p[0];

	if (lead < 0x80)			/* 0xxxxxxx: ASCII */
		return lead;
	if (lead < 0xC0)			/* 10xxxxxx: not a leader */
		return -1;

	/*
	 * Trail bytes have their marker bit flipped with XOR rather than
	 * masked off, so malformed input decodes to the same value it
	 * always has.
	 */
	if (lead < 0xE0)			/* 110xxxxx + 1 */
		return (lead & 0x1F) << 6 ^ (p[1] ^ 0x80);
	if (lead < 0xF0)			/* 1110xxxx + 2 */
		return (lead & 0x0F) << 12 ^ (p[1] ^ 0x80) << 6
		    ^ (p[2] ^ 0x80);
	if (lead < 0xF8)			/* 11110xxx + 3 */
		return (lead & 0x07) << 18 ^ (p[1] ^ 0x80) << 12
		    ^ (p[2] ^ 0x80) << 6 ^ (p[3] ^ 0x80);

	return -1;				/* F8~FF */
}
199
/*
 * decode_utf16 --
 *  Decode the UTF-16 character starting at ibuf into a Unicode code
 *  point.  bigend selects the byte order: non-zero for big-endian,
 *  zero for little-endian.
 *  Returns -1 if the first 16-bit unit is a low surrogate (DC00..DFFF).
 *
 *  No error detection on the second unit of a surrogate pair: it is
 *  read and combined without being checked.
 *
 * PUBLIC: int decode_utf16 __P((const char *, int));
 */
int
decode_utf16(const char *ibuf, int bigend)
{
	const unsigned char *p = (const unsigned char *)ibuf;
	/* Offsets of the high- and low-order byte inside each 16-bit unit. */
	const int hi = bigend ? 0 : 1;
	const int lo = 1 - hi;
	unsigned int lead, trail;

	lead = (unsigned int)p[hi] << 8 ^ p[lo];

	if (lead < 0xD800 || lead > 0xDFFF)	/* BMP character */
		return (int)lead;
	if (lead > 0xDBFF)			/* low surrogate first */
		return -1;

	trail = (unsigned int)p[2 + hi] << 8 ^ p[2 + lo];
	return (int)(((lead ^ 0xD800) << 10 ^ (trail ^ 0xDC00)) + 0x10000);
}
231