/*-
 * Copyright (c) 2011, 2012
 *	Zhihao Yuan. All rights reserved.
 *
 * See the LICENSE file for redistribution information.
 */

#ifndef lint
static const char sccsid[] = "$Id: encoding.c,v 1.4 2011/12/13 19:40:52 zy Exp $";
#endif /* not lint */

#include <sys/types.h>

int looks_utf8(const char *ibuf, size_t nbytes);
int looks_utf16(const char *ibuf, size_t nbytes);
int decode_utf8(const char *ibuf);
int decode_utf16(const char *ibuf, int bigend);

#define F 0   /* character never appears in text */
#define T 1   /* character appears in plain ASCII text */
#define I 2   /* character appears in ISO-8859 text */
#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */

/* Classification of every byte value, indexed by the byte itself. */
static const char text_chars[256] = {
	/*                  BEL BS HT LF    FF CR    */
	F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
	/*                              ESC          */
	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
	/*            NEL                            */
	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
};

/*
 * looks_utf8 --
 *	Decide whether some text looks like UTF-8. Returns:
 *
 *	-1: invalid UTF-8
 *	 0: uses odd control characters, so doesn't look like text
 *	 1: 7-bit text
 *	 2: definitely UTF-8 text (valid high-bit set bytes)
 *
 *	Based on RFC 3629. UTF-8 with BOM is not accepted.
 *
 *	Every byte after a leader must be a continuation byte (10xxxxxx);
 *	overlong forms, encoded surrogates and values above U+10FFFF are
 *	rejected.  A multi-byte sequence cut off by the end of the buffer is
 *	tolerated but does not count as evidence of UTF-8.
 *
 * PUBLIC: int looks_utf8 __P((const char *, size_t));
 */
int
looks_utf8(const char *ibuf, size_t nbytes)
{
	const unsigned char *buf = (const unsigned char *)ibuf;
	size_t i;
	int n;
	int gotone = 0, ctrl = 0;

	for (i = 0; i < nbytes; i++) {
		unsigned char lead = buf[i];
		/* Permitted range of the next continuation byte. */
		unsigned char lo = 0x80, hi = 0xBF;
		int following;

		if ((lead & 0x80) == 0) {	/* 0xxxxxxx is plain ASCII */
			/*
			 * Even if the whole file is valid UTF-8 sequences,
			 * still reject it if it uses weird control characters.
			 */
			if (text_chars[lead] != T)
				ctrl = 1;
			continue;
		}
		if ((lead & 0x40) == 0)		/* 10xxxxxx never 1st byte */
			return -1;

		/* 11xxxxxx begins a multi-byte sequence. */
		if ((lead & 0x20) == 0) {		/* 110xxxxx */
			if (lead < 0xC2)		/* C0, C1: overlong */
				return -1;
			following = 1;
		} else if ((lead & 0x10) == 0) {	/* 1110xxxx */
			following = 2;
			if (lead == 0xE0)		/* overlong forms */
				lo = 0xA0;
			else if (lead == 0xED)		/* UTF-16 surrogates */
				hi = 0x9F;
		} else if ((lead & 0x08) == 0) {	/* 11110xxx */
			if (lead > 0xF4)		/* F5, F6, F7 */
				return -1;
			following = 3;
			if (lead == 0xF0)		/* overlong forms */
				lo = 0x90;
			else if (lead == 0xF4)		/* above U+10FFFF */
				hi = 0x8F;
		} else
			return -1;			/* F8~FF */

		for (n = 0; n < following; n++) {
			i++;
			if (i >= nbytes)
				goto done;

			/* Must be 10xxxxxx, within the permitted range. */
			if (buf[i] < lo || buf[i] > hi)
				return -1;

			/* Only the first continuation byte is restricted. */
			lo = 0x80;
			hi = 0xBF;
		}

		gotone = 1;
	}
done:
	return ctrl ? 0 : (gotone ? 2 : 1);
}

/*
 * looks_utf16 --
 *	Decide whether some text looks like UTF-16. Returns:
 *
 *	 0: invalid UTF-16
 *	 1: Little-endian UTF-16
 *	 2: Big-endian UTF-16
 *
 *	The buffer must start with a byte order mark; text without one is
 *	reported as invalid.  A trailing odd byte and a high surrogate cut
 *	off by the end of the buffer are ignored.
 *
 * PUBLIC: int looks_utf16 __P((const char *, size_t));
 */
int
looks_utf16(const char *ibuf, size_t nbytes)
{
	const unsigned char *buf = (const unsigned char *)ibuf;
	int bigend;
	size_t i;
	unsigned int c;
	int bom;
	int following = 0;	/* set while a low surrogate is expected */

	if (nbytes < 2)
		return 0;

	bom = (buf[0] << 8) ^ buf[1];
	if (bom == 0xFFFE)
		bigend = 0;
	else if (bom == 0xFEFF)
		bigend = 1;
	else
		return 0;

	for (i = 2; i + 1 < nbytes; i += 2) {
		if (bigend)
			c = (buf[i] << 8) ^ buf[i + 1];
		else
			c = buf[i] ^ (buf[i + 1] << 8);

		if (following) {
			/* A high surrogate must be followed by a low one. */
			if (c < 0xDC00 || c > 0xDFFF)
				return 0;
			/* Pair complete: go back to expecting any unit. */
			following = 0;
			continue;
		}

		if (c >= 0xD800 && c <= 0xDBFF) {	/* high surrogate */
			following = 1;
			continue;
		}
		if (c >= 0xDC00 && c <= 0xDFFF)		/* stray low surrogate */
			return 0;

		/* Odd control characters mean this is not text. */
		if (c < 128 && text_chars[c] != T)
			return 0;
	}

	return 1 + bigend;
}

#undef F
#undef T
#undef I
#undef X

/*
 * decode_utf8 --
 *	Decode a UTF-8 character from byte string to Unicode.
 *	Returns -1 if the first byte is not a UTF-8 leader.
 *
 *	Based on RFC 3629, but without error detection: the continuation
 *	bytes are read and combined without being validated, so the caller
 *	must supply a complete sequence.
 *
 * PUBLIC: int decode_utf8 __P((const char *));
 */
int
decode_utf8(const char *ibuf)
{
	const unsigned char *buf = (const unsigned char *)ibuf;

	if ((buf[0] & 0x80) == 0)		/* 0xxxxxxx: ASCII */
		return buf[0];
	if ((buf[0] & 0x40) == 0)		/* 10xxxxxx: not a leader */
		return -1;

	if ((buf[0] & 0x20) == 0)		/* 110xxxxx: 2 bytes */
		return ((buf[0] ^ 0xC0) << 6) ^ (buf[1] ^ 0x80);
	if ((buf[0] & 0x10) == 0)		/* 1110xxxx: 3 bytes */
		return ((buf[0] ^ 0xE0) << 12) ^ ((buf[1] ^ 0x80) << 6)
		    ^ (buf[2] ^ 0x80);
	if ((buf[0] & 0x08) == 0)		/* 11110xxx: 4 bytes */
		return ((buf[0] ^ 0xF0) << 18) ^ ((buf[1] ^ 0x80) << 12)
		    ^ ((buf[2] ^ 0x80) << 6) ^ (buf[3] ^ 0x80);

	return -1;				/* F8~FF */
}

/*
 * decode_utf16 --
 *	Decode a UTF-16 character from byte string to Unicode.
 *	Returns -1 if the first unsigned integer is invalid
 *	(that is, a low surrogate).
 *
 *	No error detection on supplementary bytes: when the first unit is
 *	a high surrogate the next two bytes are combined unchecked.
 *
 * PUBLIC: int decode_utf16 __P((const char *, int));
 */
int
decode_utf16(const char *ibuf, int bigend)
{
	const unsigned char *buf = (const unsigned char *)ibuf;
	unsigned int w1, w2;

	if (bigend)
		w1 = (buf[0] << 8) ^ buf[1];
	else
		w1 = buf[0] ^ (buf[1] << 8);

	if (w1 < 0xD800 || w1 > 0xDFFF)		/* BMP character */
		return w1;
	if (w1 > 0xDBFF)			/* stray low surrogate */
		return -1;

	if (bigend)
		w2 = (buf[2] << 8) ^ buf[3];
	else
		w2 = buf[2] ^ (buf[3] << 8);

	/* Combine the 10 payload bits of each half of the pair. */
	return (((w1 ^ 0xD800) << 10) ^ (w2 ^ 0xDC00)) + 0x10000;
}