encoding.c (254225) | encoding.c (281373) |
---|---|
1/*- 2 * Copyright (c) 2011, 2012 3 * Zhihao Yuan. All rights reserved. 4 * 5 * See the LICENSE file for redistribution information. 6 */ 7 8#ifndef lint 9static const char sccsid[] = "$Id: encoding.c,v 1.4 2011/12/13 19:40:52 zy Exp $"; 10#endif /* not lint */ 11 12#include <sys/types.h> 13 | 1/*- 2 * Copyright (c) 2011, 2012 3 * Zhihao Yuan. All rights reserved. 4 * 5 * See the LICENSE file for redistribution information. 6 */ 7 8#ifndef lint 9static const char sccsid[] = "$Id: encoding.c,v 1.4 2011/12/13 19:40:52 zy Exp $"; 10#endif /* not lint */ 11 12#include <sys/types.h> 13 |
14int looks_utf8 __P((const char *, size_t)); 15int looks_utf16 __P((const char *, size_t)); 16int decode_utf8 __P((const char *)); 17int decode_utf16 __P((const char *, int)); | 14int looks_utf8(const char *, size_t); 15int looks_utf16(const char *, size_t); 16int decode_utf8(const char *); 17int decode_utf16(const char *, int); |
18 19#define F 0 /* character never appears in text */ 20#define T 1 /* character appears in plain ASCII text */ 21#define I 2 /* character appears in ISO-8859 text */ 22#define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */ 23 24static char text_chars[256] = { 25 /* BEL BS HT LF FF CR */ --- 23 unchanged lines hidden (view full) --- 49 * 50 * -1: invalid UTF-8 51 * 0: uses odd control characters, so doesn't look like text 52 * 1: 7-bit text 53 * 2: definitely UTF-8 text (valid high-bit set bytes) 54 * 55 * Based on RFC 3629. UTF-8 with BOM is not accepted. 56 * | 18 19#define F 0 /* character never appears in text */ 20#define T 1 /* character appears in plain ASCII text */ 21#define I 2 /* character appears in ISO-8859 text */ 22#define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */ 23 24static char text_chars[256] = { 25 /* BEL BS HT LF FF CR */ --- 23 unchanged lines hidden (view full) --- 49 * 50 * -1: invalid UTF-8 51 * 0: uses odd control characters, so doesn't look like text 52 * 1: 7-bit text 53 * 2: definitely UTF-8 text (valid high-bit set bytes) 54 * 55 * Based on RFC 3629. UTF-8 with BOM is not accepted. 56 * |
57 * PUBLIC: int looks_utf8 __P((const char *, size_t)); | 57 * PUBLIC: int looks_utf8(const char *, size_t); |
58 */ 59int 60looks_utf8(const char *ibuf, size_t nbytes) 61{ 62 const u_char *buf = (u_char *)ibuf; 63 size_t i; 64 int n; 65 int gotone = 0, ctrl = 0; --- 44 unchanged lines hidden (view full) --- 110/* 111 * looks_utf16 -- 112 * Decide whether some text looks like UTF-16. Returns: 113 * 114 * 0: invalid UTF-16 115 * 1: Little-endian UTF-16 116 * 2: Big-endian UTF-16 117 * | 58 */ 59int 60looks_utf8(const char *ibuf, size_t nbytes) 61{ 62 const u_char *buf = (u_char *)ibuf; 63 size_t i; 64 int n; 65 int gotone = 0, ctrl = 0; --- 44 unchanged lines hidden (view full) --- 110/* 111 * looks_utf16 -- 112 * Decide whether some text looks like UTF-16. Returns: 113 * 114 * 0: invalid UTF-16 115 * 1: Little-endian UTF-16 116 * 2: Big-endian UTF-16 117 * |
118 * PUBLIC: int looks_utf16 __P((const char *, size_t)); | 118 * PUBLIC: int looks_utf16(const char *, size_t); |
119 */ 120int 121looks_utf16(const char *ibuf, size_t nbytes) 122{ 123 const u_char *buf = (u_char *)ibuf; 124 int bigend; 125 size_t i; 126 unsigned int c; --- 43 unchanged lines hidden (view full) --- 170 171/* 172 * decode_utf8 -- 173 * Decode a UTF-8 character from byte string to Unicode. 174 * Returns -1 if the first byte is a not UTF-8 leader. 175 * 176 * Based on RFC 3629, but without error detection. 177 * | 119 */ 120int 121looks_utf16(const char *ibuf, size_t nbytes) 122{ 123 const u_char *buf = (u_char *)ibuf; 124 int bigend; 125 size_t i; 126 unsigned int c; --- 43 unchanged lines hidden (view full) --- 170 171/* 172 * decode_utf8 -- 173 * Decode a UTF-8 character from byte string to Unicode. 174 * Returns -1 if the first byte is a not UTF-8 leader. 175 * 176 * Based on RFC 3629, but without error detection. 177 * |
178 * PUBLIC: int decode_utf8 __P((const char *)); | 178 * PUBLIC: int decode_utf8(const char *); |
179 */ | 179 */ |
int
decode_utf8(const char *ibuf)
{
	const unsigned char *buf = (const unsigned char *)ibuf;

	/* 0xxxxxxx: a single 7-bit ASCII byte. */
	if ((buf[0] & 0x80) == 0)
		return buf[0];

	/* 10xxxxxx: a continuation byte cannot start a character. */
	if ((buf[0] & 0x40) == 0)
		return -1;

	/*
	 * Every trailing byte must look like 10xxxxxx.  The checks are
	 * chained with && so that decoding stops at the first bad byte;
	 * in particular a sequence truncated by a NUL terminator is never
	 * read past that terminator.  Malformed sequences yield -1.
	 */
	if ((buf[0] & 0x20) == 0) {
		/* 110xxxxx 10xxxxxx */
		if ((buf[1] & 0xC0) != 0x80)
			return -1;
		return (buf[0] & 0x1F) << 6 | (buf[1] & 0x3F);
	}

	if ((buf[0] & 0x10) == 0) {
		/* 1110xxxx 10xxxxxx 10xxxxxx */
		if ((buf[1] & 0xC0) != 0x80 || (buf[2] & 0xC0) != 0x80)
			return -1;
		return (buf[0] & 0x0F) << 12 | (buf[1] & 0x3F) << 6
		    | (buf[2] & 0x3F);
	}

	if ((buf[0] & 0x08) == 0) {
		/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		if ((buf[1] & 0xC0) != 0x80 || (buf[2] & 0xC0) != 0x80 ||
		    (buf[3] & 0xC0) != 0x80)
			return -1;
		return (buf[0] & 0x07) << 18 | (buf[1] & 0x3F) << 12
		    | (buf[2] & 0x3F) << 6 | (buf[3] & 0x3F);
	}

	/* 11111xxx: not a leader byte in RFC 3629 UTF-8. */
	return -1;
}

/*
 * decode_utf16 --
 *	Decode a UTF-16 character from byte string to Unicode.
 *	ibuf points at the first byte of a 16-bit unit; bigend is
 *	non-zero for big-endian byte order and zero for little-endian.
 *
 *	Returns the decoded code point, or -1 if the first unit is a
 *	low surrogate (0xDC00-0xDFFF) or if a high surrogate is not
 *	followed by a low surrogate.
 *
 *	Two bytes are read for a character outside the surrogate range,
 *	four bytes when the first unit is a high surrogate.
 *
 * PUBLIC: int decode_utf16(const char *, int);
 */
int
decode_utf16(const char *ibuf, int bigend)
{
	const unsigned char *buf = (const unsigned char *)ibuf;
	unsigned int w1, w2;

	if (bigend)
		w1 = buf[0] << 8 | buf[1];
	else
		w1 = buf[0] | buf[1] << 8;

	/* Anything outside the surrogate range stands for itself. */
	if (w1 < 0xD800 || w1 > 0xDFFF)
		return (int)w1;

	/* A low surrogate cannot come first. */
	if (w1 > 0xDBFF)
		return -1;

	if (bigend)
		w2 = buf[2] << 8 | buf[3];
	else
		w2 = buf[2] | buf[3] << 8;

	/* The second half of the pair has to be a low surrogate. */
	if (w2 < 0xDC00 || w2 > 0xDFFF)
		return -1;

	/* Ten payload bits from each half, offset past the BMP. */
	return (int)(((w1 & 0x3FF) << 10 | (w2 & 0x3FF)) + 0x10000);
}