1/* utf8.c -- convert characters to/from UTF-8 2 3 (c) 1998-2006 (W3C) MIT, ERCIM, Keio University 4 See tidy.h for the copyright notice. 5 6 CVS Info : 7 8 $Author: iccir $ 9 $Date: 2007/01/30 23:46:52 $ 10 $Revision: 1.3 $ 11 12 Uses public interfaces to abstract input source and output 13 sink, which may be user supplied or either FILE* or memory 14 based Tidy implementations. Encoding support is uniform 15 regardless of I/O mechanism. 16 17 Note, UTF-8 encoding, by itself, does not affect the actual 18 "codepoints" of the underlying character encoding. In the 19 cases of ASCII, Latin1, Unicode (16-bit, BMP), these all 20 refer to ISO-10646 "codepoints". For anything else, they 21 refer to some other "codepoint" set. 22 23 Put another way, UTF-8 is a variable length method to 24 represent any non-negative integer value. The glyph 25 that a integer value represents is unchanged and defined 26 externally (e.g. by ISO-10646, Big5, Win1252, MacRoman, 27 Latin2-9, and so on). 28 29 Put still another way, UTF-8 is more of a _transfer_ encoding 30 than a _character_ encoding, per se. 31*/ 32 33#include "tidy.h" 34#include "forward.h" 35#include "utf8.h" 36 37/* 38UTF-8 encoding/decoding functions 39Return # of bytes in UTF-8 sequence; result < 0 if illegal sequence 40 41Also see below for UTF-16 encoding/decoding functions 42 43References : 44 451) UCS Transformation Format 8 (UTF-8): 46ISO/IEC 10646-1:1996 Amendment 2 or ISO/IEC 10646-1:2000 Annex D 47<http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335> 48<http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-8.html> 49 50Table 4 - Mapping from UCS-4 to UTF-8 51 522) Unicode standards: 53<http://www.unicode.org/unicode/standard/standard.html> 54 553) Legal UTF-8 byte sequences: 56<http://www.unicode.org/unicode/uni2errata/UTF-8_Corrigendum.html> 57 58Code point 1st byte 2nd byte 3rd byte 4th byte 59---------- -------- -------- -------- -------- 60U+0000..U+007F 00..7F 61U+0080..U+07FF C2..DF 80..BF 62U+0800..U+0FFF E0 A0..BF 80..BF 63U+1000..U+FFFF E1..EF 80..BF 80..BF 64U+10000..U+3FFFF F0 90..BF 80..BF 80..BF 65U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF 66U+100000..U+10FFFF F4 80..8F 80..BF 80..BF 67 68The definition of UTF-8 in Annex D of ISO/IEC 10646-1:2000 also 69allows for the use of five- and six-byte sequences to encode 70characters that are outside the range of the Unicode character 71set; those five- and six-byte sequences are illegal for the use 72of UTF-8 as a transformation of Unicode characters. ISO/IEC 10646 73does not allow mapping of unpaired surrogates, nor U+FFFE and U+FFFF 74(but it does allow other noncharacters). 75 764) RFC 2279: UTF-8, a transformation format of ISO 10646: 77<http://www.ietf.org/rfc/rfc2279.txt> 78 795) UTF-8 and Unicode FAQ: 80<http://www.cl.cam.ac.uk/~mgk25/unicode.html> 81 826) Markus Kuhn's UTF-8 decoder stress test file: 83<http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt> 84 857) UTF-8 Demo: 86<http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt> 87 888) UTF-8 Sampler: 89<http://www.columbia.edu/kermit/utf8.html> 90 919) Transformation Format for 16 Planes of Group 00 (UTF-16): 92ISO/IEC 10646-1:1996 Amendment 1 or ISO/IEC 10646-1:2000 Annex C 93<http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n2005/n2005.pdf> 94<http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-16.html> 95 9610) RFC 2781: UTF-16, an encoding of ISO 10646: 97<http://www.ietf.org/rfc/rfc2781.txt> 98 9911) UTF-16 invalid surrogate pairs: 100<http://www.unicode.org/unicode/faq/utf_bom.html#16> 101 102UTF-16 UTF-8 UCS-4 103D83F DFF* F0 9F BF B* 0001FFF* 104D87F DFF* F0 AF BF B* 0002FFF* 105D8BF DFF* F0 BF BF B* 0003FFF* 106D8FF DFF* F1 8F BF B* 0004FFF* 107D93F DFF* F1 9F BF B* 0005FFF* 108D97F DFF* F1 AF BF B* 0006FFF* 109 ... 110DBBF DFF* F3 BF BF B* 000FFFF* 111DBFF DFF* F4 8F BF B* 0010FFF* 112 113* = E or F 114 1151010 A 1161011 B 1171100 C 1181101 D 1191110 E 1201111 F 121 122*/ 123 124#define kNumUTF8Sequences 7 125#define kMaxUTF8Bytes 4 126 127#define kUTF8ByteSwapNotAChar 0xFFFE 128#define kUTF8NotAChar 0xFFFF 129 130#define kMaxUTF8FromUCS4 0x10FFFF 131 132#define kUTF16SurrogatesBegin 0x10000 133#define kMaxUTF16FromUCS4 0x10FFFF 134 135/* UTF-16 surrogate pair areas */ 136#define kUTF16LowSurrogateBegin 0xD800 137#define kUTF16LowSurrogateEnd 0xDBFF 138#define kUTF16HighSurrogateBegin 0xDC00 139#define kUTF16HighSurrogateEnd 0xDFFF 140 141 142/* offsets into validUTF8 table below */ 143static const int offsetUTF8Sequences[kMaxUTF8Bytes + 1] = 144{ 145 0, /* 1 byte */ 146 1, /* 2 bytes */ 147 2, /* 3 bytes */ 148 4, /* 4 bytes */ 149 kNumUTF8Sequences /* must be last */ 150}; 151 152static const struct validUTF8Sequence 153{ 154 uint lowChar; 155 uint highChar; 156 int numBytes; 157 byte validBytes[8]; 158} validUTF8[kNumUTF8Sequences] = 159{ 160/* low high #bytes byte 1 byte 2 byte 3 byte 4 */ 161 {0x0000, 0x007F, 1, {0x00, 0x7F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}}, 162 {0x0080, 0x07FF, 2, {0xC2, 0xDF, 0x80, 0xBF, 0x00, 0x00, 0x00, 0x00}}, 163 {0x0800, 0x0FFF, 3, {0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF, 0x00, 0x00}}, 164 {0x1000, 0xFFFF, 3, {0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF, 0x00, 0x00}}, 165 {0x10000, 0x3FFFF, 4, {0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}}, 166 {0x40000, 0xFFFFF, 4, {0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}}, 167 {0x100000, 0x10FFFF, 4, {0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}} 168}; 169 170int TY_(DecodeUTF8BytesToChar)( uint* c, uint firstByte, ctmbstr successorBytes, 171 TidyInputSource* inp, int* count ) 172{ 173 byte tempbuf[10]; 174 byte *buf = &tempbuf[0]; 175 uint ch = 0, n = 0; 176 int i, bytes = 0; 177 Bool hasError = no; 178 179 if ( successorBytes ) 180 buf = (byte*) successorBytes; 181 182 /* special check if we have been passed an EOF char */ 183 if ( firstByte == EndOfStream ) 184 { 185 /* at present */ 186 *c = firstByte; 187 *count = 1; 188 return 0; 189 } 190 191 ch = firstByte; /* first byte is passed in separately */ 192 193 if (ch <= 0x7F) /* 0XXX XXXX one byte */ 194 { 195 n = ch; 196 bytes = 1; 197 } 198 else if ((ch & 0xE0) == 0xC0) /* 110X XXXX two bytes */ 199 { 200 n = ch & 31; 201 bytes = 2; 202 } 203 else if ((ch & 0xF0) == 0xE0) /* 1110 XXXX three bytes */ 204 { 205 n = ch & 15; 206 bytes = 3; 207 } 208 else if ((ch & 0xF8) == 0xF0) /* 1111 0XXX four bytes */ 209 { 210 n = ch & 7; 211 bytes = 4; 212 } 213 else if ((ch & 0xFC) == 0xF8) /* 1111 10XX five bytes */ 214 { 215 n = ch & 3; 216 bytes = 5; 217 hasError = yes; 218 } 219 else if ((ch & 0xFE) == 0xFC) /* 1111 110X six bytes */ 220 { 221 n = ch & 1; 222 bytes = 6; 223 hasError = yes; 224 } 225 else 226 { 227 /* not a valid first byte of a UTF-8 sequence */ 228 n = ch; 229 bytes = 1; 230 hasError = yes; 231 } 232 233 /* successor bytes should have the form 10XX XXXX */ 234 235 /* If caller supplied buffer, use it. Else see if caller 236 ** supplied an input source, use that. 237 */ 238 if ( successorBytes ) 239 { 240 for ( i=0; i < bytes-1; ++i ) 241 { 242 if ( !buf[i] || (buf[i] & 0xC0) != 0x80 ) 243 { 244 hasError = yes; 245 bytes = i; 246 break; 247 } 248 n = (n << 6) | (buf[i] & 0x3F); 249 } 250 } 251 else if ( inp ) 252 { 253 for ( i=0; i < bytes-1 && !inp->eof(inp->sourceData); ++i ) 254 { 255 int b = inp->getByte( inp->sourceData ); 256 buf[i] = (tmbchar) b; 257 258 /* End of data or illegal successor byte value */ 259 if ( b == EOF || (buf[i] & 0xC0) != 0x80 ) 260 { 261 hasError = yes; 262 bytes = i; 263 if ( b != EOF ) 264 inp->ungetByte( inp->sourceData, buf[i] ); 265 break; 266 } 267 n = (n << 6) | (buf[i] & 0x3F); 268 } 269 } 270 else if ( bytes > 1 ) 271 { 272 hasError = yes; 273 bytes = 1; 274 } 275 276 if (!hasError && ((n == kUTF8ByteSwapNotAChar) || (n == kUTF8NotAChar))) 277 hasError = yes; 278 279 if (!hasError && (n > kMaxUTF8FromUCS4)) 280 hasError = yes; 281 282#if 0 /* Breaks Big5 D8 - DF */ 283 if (!hasError && (n >= kUTF16LowSurrogateBegin) && (n <= kUTF16HighSurrogateEnd)) 284 /* unpaired surrogates not allowed */ 285 hasError = yes; 286#endif 287 288 if (!hasError) 289 { 290 int lo, hi; 291 292 lo = offsetUTF8Sequences[bytes - 1]; 293 hi = offsetUTF8Sequences[bytes] - 1; 294 295 /* check for overlong sequences */ 296 if ((n < validUTF8[lo].lowChar) || (n > validUTF8[hi].highChar)) 297 hasError = yes; 298 else 299 { 300 hasError = yes; /* assume error until proven otherwise */ 301 302 for (i = lo; i <= hi; i++) 303 { 304 int tempCount; 305 byte theByte; 306 307 for (tempCount = 0; tempCount < bytes; tempCount++) 308 { 309 if (!tempCount) 310 theByte = (tmbchar) firstByte; 311 else 312 theByte = buf[tempCount - 1]; 313 314 if ( theByte >= validUTF8[i].validBytes[(tempCount * 2)] && 315 theByte <= validUTF8[i].validBytes[(tempCount * 2) + 1] ) 316 hasError = no; 317 if (hasError) 318 break; 319 } 320 } 321 } 322 } 323 324#if 1 && defined(_DEBUG) 325 if ( hasError ) 326 { 327 /* debug */ 328 fprintf( stderr, "UTF-8 decoding error of %d bytes : ", bytes ); 329 fprintf( stderr, "0x%02x ", firstByte ); 330 for (i = 1; i < bytes; i++) 331 fprintf( stderr, "0x%02x ", buf[i - 1] ); 332 fprintf( stderr, " = U+%04ulx\n", n ); 333 } 334#endif 335 336 *count = bytes; 337 *c = n; 338 if ( hasError ) 339 return -1; 340 return 0; 341} 342 343int TY_(EncodeCharToUTF8Bytes)( uint c, tmbstr encodebuf, 344 TidyOutputSink* outp, int* count ) 345{ 346 byte tempbuf[10] = {0}; 347 byte* buf = &tempbuf[0]; 348 int bytes = 0; 349 Bool hasError = no; 350 351 if ( encodebuf ) 352 buf = (byte*) encodebuf; 353 354 if (c <= 0x7F) /* 0XXX XXXX one byte */ 355 { 356 buf[0] = (tmbchar) c; 357 bytes = 1; 358 } 359 else if (c <= 0x7FF) /* 110X XXXX two bytes */ 360 { 361 buf[0] = (tmbchar) ( 0xC0 | (c >> 6) ); 362 buf[1] = (tmbchar) ( 0x80 | (c & 0x3F) ); 363 bytes = 2; 364 } 365 else if (c <= 0xFFFF) /* 1110 XXXX three bytes */ 366 { 367 buf[0] = (tmbchar) (0xE0 | (c >> 12)); 368 buf[1] = (tmbchar) (0x80 | ((c >> 6) & 0x3F)); 369 buf[2] = (tmbchar) (0x80 | (c & 0x3F)); 370 bytes = 3; 371 if ( c == kUTF8ByteSwapNotAChar || c == kUTF8NotAChar ) 372 hasError = yes; 373#if 0 /* Breaks Big5 D8 - DF */ 374 else if ( c >= kUTF16LowSurrogateBegin && c <= kUTF16HighSurrogateEnd ) 375 /* unpaired surrogates not allowed */ 376 hasError = yes; 377#endif 378 } 379 else if (c <= 0x1FFFFF) /* 1111 0XXX four bytes */ 380 { 381 buf[0] = (tmbchar) (0xF0 | (c >> 18)); 382 buf[1] = (tmbchar) (0x80 | ((c >> 12) & 0x3F)); 383 buf[2] = (tmbchar) (0x80 | ((c >> 6) & 0x3F)); 384 buf[3] = (tmbchar) (0x80 | (c & 0x3F)); 385 bytes = 4; 386 if (c > kMaxUTF8FromUCS4) 387 hasError = yes; 388 } 389 else if (c <= 0x3FFFFFF) /* 1111 10XX five bytes */ 390 { 391 buf[0] = (tmbchar) (0xF8 | (c >> 24)); 392 buf[1] = (tmbchar) (0x80 | (c >> 18)); 393 buf[2] = (tmbchar) (0x80 | ((c >> 12) & 0x3F)); 394 buf[3] = (tmbchar) (0x80 | ((c >> 6) & 0x3F)); 395 buf[4] = (tmbchar) (0x80 | (c & 0x3F)); 396 bytes = 5; 397 hasError = yes; 398 } 399 else if (c <= 0x7FFFFFFF) /* 1111 110X six bytes */ 400 { 401 buf[0] = (tmbchar) (0xFC | (c >> 30)); 402 buf[1] = (tmbchar) (0x80 | ((c >> 24) & 0x3F)); 403 buf[2] = (tmbchar) (0x80 | ((c >> 18) & 0x3F)); 404 buf[3] = (tmbchar) (0x80 | ((c >> 12) & 0x3F)); 405 buf[4] = (tmbchar) (0x80 | ((c >> 6) & 0x3F)); 406 buf[5] = (tmbchar) (0x80 | (c & 0x3F)); 407 bytes = 6; 408 hasError = yes; 409 } 410 else 411 hasError = yes; 412 413 /* don't output invalid UTF-8 byte sequence to a stream */ 414 if ( !hasError && outp != NULL ) 415 { 416 int ix; 417 for ( ix=0; ix < bytes; ++ix ) 418 outp->putByte( outp->sinkData, buf[ix] ); 419 } 420 421#if 1 && defined(_DEBUG) 422 if ( hasError ) 423 { 424 int i; 425 fprintf( stderr, "UTF-8 encoding error for U+%x : ", c ); 426 for (i = 0; i < bytes; i++) 427 fprintf( stderr, "0x%02x ", buf[i] ); 428 fprintf( stderr, "\n" ); 429 } 430#endif 431 432 *count = bytes; 433 if (hasError) 434 return -1; 435 return 0; 436} 437 438 439/* return one less than the number of bytes used by the UTF-8 byte sequence */ 440/* str points to the UTF-8 byte sequence */ 441/* the Unicode char is returned in *ch */ 442uint TY_(GetUTF8)( ctmbstr str, uint *ch ) 443{ 444 uint n; 445 int bytes; 446 447 int err; 448 449 bytes = 0; 450 451 /* first byte "str[0]" is passed in separately from the */ 452 /* rest of the UTF-8 byte sequence starting at "str[1]" */ 453 err = TY_(DecodeUTF8BytesToChar)( &n, str[0], str+1, NULL, &bytes ); 454 if (err) 455 { 456#if 1 && defined(_DEBUG) 457 fprintf(stderr, "pprint UTF-8 decoding error for U+%x : ", n); 458#endif 459 n = 0xFFFD; /* replacement char */ 460 } 461 462 *ch = n; 463 return bytes - 1; 464} 465 466/* store char c as UTF-8 encoded byte stream */ 467tmbstr TY_(PutUTF8)( tmbstr buf, uint c ) 468{ 469 int err, count = 0; 470 471 err = TY_(EncodeCharToUTF8Bytes)( c, buf, NULL, &count ); 472 if (err) 473 { 474#if 1 && defined(_DEBUG) 475 fprintf(stderr, "pprint UTF-8 encoding error for U+%x : ", c); 476#endif 477 /* replacement char 0xFFFD encoded as UTF-8 */ 478 buf[0] = (byte) 0xEF; 479 buf[1] = (byte) 0xBF; 480 buf[2] = (byte) 0xBD; 481 count = 3; 482 } 483 484 buf += count; 485 return buf; 486} 487 488Bool TY_(IsValidUTF16FromUCS4)( tchar ucs4 ) 489{ 490 return ( ucs4 <= kMaxUTF16FromUCS4 ); 491} 492 493Bool TY_(IsHighSurrogate)( tchar ch ) 494{ 495 return ( ch >= kUTF16HighSurrogateBegin && ch <= kUTF16HighSurrogateEnd ); 496} 497Bool TY_(IsLowSurrogate)( tchar ch ) 498{ 499 return ( ch >= kUTF16LowSurrogateBegin && ch <= kUTF16LowSurrogateEnd ); 500} 501 502tchar TY_(CombineSurrogatePair)( tchar high, tchar low ) 503{ 504 assert( TY_(IsHighSurrogate)(high) && TY_(IsLowSurrogate)(low) ); 505 return ( ((low - kUTF16LowSurrogateBegin) * 0x400) + 506 high - kUTF16HighSurrogateBegin + 0x10000 ); 507} 508 509Bool TY_(SplitSurrogatePair)( tchar utf16, tchar* low, tchar* high ) 510{ 511 Bool status = ( TY_(IsValidCombinedChar)( utf16 ) && high && low ); 512 if ( status ) 513 { 514 *low = (utf16 - kUTF16SurrogatesBegin) / 0x400 + kUTF16LowSurrogateBegin; 515 *high = (utf16 - kUTF16SurrogatesBegin) % 0x400 + kUTF16HighSurrogateBegin; 516 } 517 return status; 518} 519 520Bool TY_(IsValidCombinedChar)( tchar ch ) 521{ 522 return ( ch >= kUTF16SurrogatesBegin && 523 (ch & 0x0000FFFE) != 0x0000FFFE && 524 (ch & 0x0000FFFF) != 0x0000FFFF ); 525} 526 527Bool TY_(IsCombinedChar)( tchar ch ) 528{ 529 return ( ch >= kUTF16SurrogatesBegin ); 530} 531 532/* 533 * local variables: 534 * mode: c 535 * indent-tabs-mode: nil 536 * c-basic-offset: 4 537 * eval: (c-set-offset 'substatement-open 0) 538 * end: 539 */ 540