Deleted Added
full compact
encoding.c (254225) encoding.c (281373)
1/*-
2 * Copyright (c) 2011, 2012
3 * Zhihao Yuan. All rights reserved.
4 *
5 * See the LICENSE file for redistribution information.
6 */
7
8#ifndef lint
9static const char sccsid[] = "$Id: encoding.c,v 1.4 2011/12/13 19:40:52 zy Exp $";
10#endif /* not lint */
11
12#include <sys/types.h>
13
1/*-
2 * Copyright (c) 2011, 2012
3 * Zhihao Yuan. All rights reserved.
4 *
5 * See the LICENSE file for redistribution information.
6 */
7
8#ifndef lint
9static const char sccsid[] = "$Id: encoding.c,v 1.4 2011/12/13 19:40:52 zy Exp $";
10#endif /* not lint */
11
12#include <sys/types.h>
13
14int looks_utf8 __P((const char *, size_t));
15int looks_utf16 __P((const char *, size_t));
16int decode_utf8 __P((const char *));
17int decode_utf16 __P((const char *, int));
14int looks_utf8(const char *, size_t);
15int looks_utf16(const char *, size_t);
16int decode_utf8(const char *);
17int decode_utf16(const char *, int);
18
19#define F 0 /* character never appears in text */
20#define T 1 /* character appears in plain ASCII text */
21#define I 2 /* character appears in ISO-8859 text */
22#define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
23
24static char text_chars[256] = {
25 /* BEL BS HT LF FF CR */

--- 23 unchanged lines hidden (view full) ---

49 *
50 * -1: invalid UTF-8
51 * 0: uses odd control characters, so doesn't look like text
52 * 1: 7-bit text
53 * 2: definitely UTF-8 text (valid high-bit set bytes)
54 *
55 * Based on RFC 3629. UTF-8 with BOM is not accepted.
56 *
18
19#define F 0 /* character never appears in text */
20#define T 1 /* character appears in plain ASCII text */
21#define I 2 /* character appears in ISO-8859 text */
22#define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
23
24static char text_chars[256] = {
25 /* BEL BS HT LF FF CR */

--- 23 unchanged lines hidden (view full) ---

49 *
50 * -1: invalid UTF-8
51 * 0: uses odd control characters, so doesn't look like text
52 * 1: 7-bit text
53 * 2: definitely UTF-8 text (valid high-bit set bytes)
54 *
55 * Based on RFC 3629. UTF-8 with BOM is not accepted.
56 *
57 * PUBLIC: int looks_utf8 __P((const char *, size_t));
57 * PUBLIC: int looks_utf8(const char *, size_t);
58 */
59int
60looks_utf8(const char *ibuf, size_t nbytes)
61{
62 const u_char *buf = (u_char *)ibuf;
63 size_t i;
64 int n;
65 int gotone = 0, ctrl = 0;

--- 44 unchanged lines hidden (view full) ---

110/*
111 * looks_utf16 --
112 * Decide whether some text looks like UTF-16. Returns:
113 *
114 * 0: invalid UTF-16
115 * 1: Little-endian UTF-16
116 * 2: Big-endian UTF-16
117 *
58 */
59int
60looks_utf8(const char *ibuf, size_t nbytes)
61{
62 const u_char *buf = (u_char *)ibuf;
63 size_t i;
64 int n;
65 int gotone = 0, ctrl = 0;

--- 44 unchanged lines hidden (view full) ---

110/*
111 * looks_utf16 --
112 * Decide whether some text looks like UTF-16. Returns:
113 *
114 * 0: invalid UTF-16
115 * 1: Little-endian UTF-16
116 * 2: Big-endian UTF-16
117 *
118 * PUBLIC: int looks_utf16 __P((const char *, size_t));
118 * PUBLIC: int looks_utf16(const char *, size_t);
119 */
120int
121looks_utf16(const char *ibuf, size_t nbytes)
122{
123 const u_char *buf = (u_char *)ibuf;
124 int bigend;
125 size_t i;
126 unsigned int c;

--- 43 unchanged lines hidden (view full) ---

170
171/*
172 * decode_utf8 --
173 * Decode a UTF-8 character from byte string to Unicode.
174 * Returns -1 if the first byte is not a UTF-8 leader.
175 *
176 * Based on RFC 3629, but without error detection.
177 *
119 */
120int
121looks_utf16(const char *ibuf, size_t nbytes)
122{
123 const u_char *buf = (u_char *)ibuf;
124 int bigend;
125 size_t i;
126 unsigned int c;

--- 43 unchanged lines hidden (view full) ---

170
171/*
172 * decode_utf8 --
173 * Decode a UTF-8 character from byte string to Unicode.
174 * Returns -1 if the first byte is not a UTF-8 leader.
175 *
176 * Based on RFC 3629, but without error detection.
177 *
178 * PUBLIC: int decode_utf8 __P((const char *));
178 * PUBLIC: int decode_utf8(const char *);
179 */
179 */
180int decode_utf8(const char *ibuf) {
180int
181decode_utf8(const char *ibuf)
182{
181 const u_char *buf = (u_char *)ibuf;
182 int u = -1;
183
184 if ((buf[0] & 0x80) == 0)
185 u = buf[0];
186 else if ((buf[0] & 0x40) == 0);
187 else {
188 if ((buf[0] & 0x20) == 0)
189 u = (buf[0] ^ 0xC0) << 6 ^ (buf[1] ^ 0x80);
190 else if ((buf[0] & 0x10) == 0)
191 u = (buf[0] ^ 0xE0) << 12 ^ (buf[1] ^ 0x80) << 6
192 ^ (buf[2] ^ 0x80);
193 else if (((buf[0] & 0x08) == 0))
194 u = (buf[0] ^ 0xF0) << 18 ^ (buf[1] ^ 0x80) << 12
195 ^ (buf[2] ^ 0x80) << 6 ^ (buf[3] ^ 0x80);
196 }
183 const u_char *buf = (u_char *)ibuf;
184 int u = -1;
185
186 if ((buf[0] & 0x80) == 0)
187 u = buf[0];
188 else if ((buf[0] & 0x40) == 0);
189 else {
190 if ((buf[0] & 0x20) == 0)
191 u = (buf[0] ^ 0xC0) << 6 ^ (buf[1] ^ 0x80);
192 else if ((buf[0] & 0x10) == 0)
193 u = (buf[0] ^ 0xE0) << 12 ^ (buf[1] ^ 0x80) << 6
194 ^ (buf[2] ^ 0x80);
195 else if (((buf[0] & 0x08) == 0))
196 u = (buf[0] ^ 0xF0) << 18 ^ (buf[1] ^ 0x80) << 12
197 ^ (buf[2] ^ 0x80) << 6 ^ (buf[3] ^ 0x80);
198 }
199
197 return u;
198}
199
200/*
201 * decode_utf16 --
202 * Decode a UTF-16 character from byte string to Unicode.
203 * Returns -1 if the first unsigned integer is invalid.
204 *
205 * No error detection on supplementary bytes.
206 *
200 return u;
201}
202
203/*
204 * decode_utf16 --
205 * Decode a UTF-16 character from byte string to Unicode.
206 * Returns -1 if the first unsigned integer is invalid.
207 *
208 * No error detection on supplementary bytes.
209 *
207 * PUBLIC: int decode_utf16 __P((const char *, int));
210 * PUBLIC: int decode_utf16(const char *, int);
208 */
211 */
209int decode_utf16(const char* ibuf, int bigend) {
212int
213decode_utf16(const char* ibuf, int bigend)
214{
210 const u_char *buf = (u_char *)ibuf;
211 int u = -1;
212 unsigned int w1, w2;
213
214 if (bigend)
215 w1 = buf[0] << 8 ^ buf[1];
216 else
217 w1 = buf[0] ^ buf[1] << 8;
218
219 if (w1 < 0xD800 || w1 > 0xDFFF)
220 u = w1;
221 else if (w1 > 0xDBFF);
222 else {
223 if (bigend)
224 w2 = buf[2] << 8 ^ buf[3];
225 else
226 w2 = buf[2] ^ buf[3] << 8;
227 u = ((w1 ^ 0xD800) << 10 ^ (w2 ^ 0xDC00)) + 0x10000;
228 }
215 const u_char *buf = (u_char *)ibuf;
216 int u = -1;
217 unsigned int w1, w2;
218
219 if (bigend)
220 w1 = buf[0] << 8 ^ buf[1];
221 else
222 w1 = buf[0] ^ buf[1] << 8;
223
224 if (w1 < 0xD800 || w1 > 0xDFFF)
225 u = w1;
226 else if (w1 > 0xDBFF);
227 else {
228 if (bigend)
229 w2 = buf[2] << 8 ^ buf[3];
230 else
231 w2 = buf[2] ^ buf[3] << 8;
232 u = ((w1 ^ 0xD800) << 10 ^ (w2 ^ 0xDC00)) + 0x10000;
233 }
234
229 return u;
230}
235 return u;
236}