1/* streamio.c -- handles character stream I/O 2 3 (c) 1998-2006 (W3C) MIT, ERCIM, Keio University 4 See tidy.h for the copyright notice. 5 6 CVS Info : 7 8 $Author: iccir $ 9 $Date: 2007/02/03 02:31:30 $ 10 $Revision: 1.6 $ 11 12 Wrapper around Tidy input source and output sink 13 that calls appropriate interfaces, and applies 14 necessary char encoding transformations: to/from 15 ISO-10646 and/or UTF-8. 16 17*/ 18 19#include <stdio.h> 20#include <errno.h> 21 22#include "streamio.h" 23#include "tidy-int.h" 24#include "lexer.h" 25#include "message.h" 26#include "utf8.h" 27#include "tmbstr.h" 28 29#ifdef TIDY_WIN32_MLANG_SUPPORT 30#include "win32tc.h" 31#endif 32 33/************************ 34** Forward Declarations 35************************/ 36 37static uint ReadCharFromStream( StreamIn* in ); 38 39static uint ReadByte( StreamIn* in ); 40static void UngetByte( StreamIn* in, uint byteValue ); 41 42static void PutByte( uint byteValue, StreamOut* out ); 43 44static void EncodeWin1252( uint c, StreamOut* out ); 45static void EncodeMacRoman( uint c, StreamOut* out ); 46static void EncodeIbm858( uint c, StreamOut* out ); 47static void EncodeLatin0( uint c, StreamOut* out ); 48 49static uint DecodeIbm850(uint c); 50static uint DecodeLatin0(uint c); 51 52static uint PopChar( StreamIn *in ); 53 54/****************************** 55** Static (duration) Globals 56******************************/ 57 58static StreamOut stderrStreamOut = 59{ 60 ASCII, 61 FSM_ASCII, 62 DEFAULT_NL_CONFIG, 63#ifdef TIDY_WIN32_MLANG_SUPPORT 64 (ulong)NULL, 65#endif 66 FileIO, 67 { 0, TY_(filesink_putByte) } 68}; 69 70static StreamOut stdoutStreamOut = 71{ 72 ASCII, 73 FSM_ASCII, 74 DEFAULT_NL_CONFIG, 75#ifdef TIDY_WIN32_MLANG_SUPPORT 76 (ulong)NULL, 77#endif 78 FileIO, 79 { 0, TY_(filesink_putByte) } 80}; 81 82StreamOut* TY_(StdErrOutput)(void) 83{ 84 if ( stderrStreamOut.sink.sinkData == 0 ) 85 stderrStreamOut.sink.sinkData = stderr; 86 return &stderrStreamOut; 87} 88 89#if 0 90StreamOut* TY_(StdOutOutput)(void) 91{ 92 if ( stdoutStreamOut.sink.sinkData == 0 ) 93 stdoutStreamOut.sink.sinkData = stdout; 94 return &stdoutStreamOut; 95} 96#endif 97 98void TY_(ReleaseStreamOut)( StreamOut* out ) 99{ 100 if ( out && out != &stderrStreamOut && out != &stdoutStreamOut ) 101 { 102 if ( out->iotype == FileIO ) 103 fclose( (FILE*) out->sink.sinkData ); 104 MemFree( out ); 105 } 106} 107 108 109/************************ 110** Source 111************************/ 112 113StreamIn* TY_(initStreamIn)( TidyDocImpl* doc, int encoding ) 114{ 115 StreamIn *in = (StreamIn*) MemAlloc( sizeof(StreamIn) ); 116 117 ClearMemory( in, sizeof(StreamIn) ); 118 in->curline = 1; 119 in->curcol = 1; 120 in->encoding = encoding; 121 in->state = FSM_ASCII; 122 in->doc = doc; 123 in->bufsize = CHARBUF_SIZE; 124 in->charbuf = (tchar*)MemAlloc(sizeof(tchar) * in->bufsize); 125#ifdef TIDY_STORE_ORIGINAL_TEXT 126 in->otextbuf = NULL; 127 in->otextlen = 0; 128 in->otextsize = 0; 129#endif 130 return in; 131} 132 133void TY_(freeStreamIn)(StreamIn* in) 134{ 135#ifdef TIDY_STORE_ORIGINAL_TEXT 136 if (in->otextbuf) 137 MemFree(in->otextbuf); 138#endif 139 MemFree(in->charbuf); 140 MemFree(in); 141} 142 143StreamIn* TY_(FileInput)( TidyDocImpl* doc, FILE *fp, int encoding ) 144{ 145 StreamIn *in = TY_(initStreamIn)( doc, encoding ); 146 if ( TY_(initFileSource)( &in->source, fp ) != 0 ) 147 { 148 TY_(freeStreamIn)( in ); 149 return NULL; 150 } 151 in->iotype = FileIO; 152 return in; 153} 154 155StreamIn* TY_(BufferInput)( TidyDocImpl* doc, TidyBuffer* buf, int encoding ) 156{ 157 StreamIn *in = TY_(initStreamIn)( doc, encoding ); 158 tidyInitInputBuffer( &in->source, buf ); 159 in->iotype = BufferIO; 160 return in; 161} 162 163StreamIn* TY_(UserInput)( TidyDocImpl* doc, TidyInputSource* source, int encoding ) 164{ 165 StreamIn *in = TY_(initStreamIn)( doc, encoding ); 166 memcpy( &in->source, source, sizeof(TidyInputSource) ); 167 in->iotype = UserIO; 168 return in; 169} 170 171int TY_(ReadBOMEncoding)(StreamIn *in) 172{ 173 uint c, c1; 174#if SUPPORT_UTF16_ENCODINGS 175 uint bom; 176#endif 177 178 c = ReadByte(in); 179 if (c == EndOfStream) 180 return -1; 181 182 c1 = ReadByte( in ); 183 if (c1 == EndOfStream) 184 { 185 UngetByte(in, c); 186 return -1; 187 } 188 189 /* todo: dont warn about mismatch for auto input encoding */ 190 /* todo: let the user override the encoding found here */ 191 192#if SUPPORT_UTF16_ENCODINGS 193 bom = (c << 8) + c1; 194 195 if ( bom == UNICODE_BOM_BE ) 196 { 197 /* big-endian UTF-16 */ 198 if ( in->encoding != UTF16 && in->encoding != UTF16BE ) 199 TY_(ReportEncodingWarning)(in->doc, ENCODING_MISMATCH, UTF16BE); 200 201 return UTF16BE; /* return decoded BOM */ 202 } 203 else if (bom == UNICODE_BOM_LE) 204 { 205 /* little-endian UTF-16 */ 206 if (in->encoding != UTF16 && in->encoding != UTF16LE) 207 TY_(ReportEncodingWarning)(in->doc, ENCODING_MISMATCH, UTF16LE); 208 209 return UTF16LE; /* return decoded BOM */ 210 } 211 else 212#endif /* SUPPORT_UTF16_ENCODINGS */ 213 { 214 uint c2 = ReadByte(in); 215 216 if (c2 == EndOfStream) 217 { 218 UngetByte(in, c1); 219 UngetByte(in, c); 220 return -1; 221 } 222 223 if (((c << 16) + (c1 << 8) + c2) == UNICODE_BOM_UTF8) 224 { 225 /* UTF-8 */ 226 if (in->encoding != UTF8) 227 TY_(ReportEncodingWarning)(in->doc, ENCODING_MISMATCH, UTF8); 228 229 return UTF8; 230 } 231 else 232 UngetByte( in, c2 ); 233 } 234 235 UngetByte(in, c1); 236 UngetByte(in, c); 237 238 return -1; 239} 240 241#ifdef TIDY_STORE_ORIGINAL_TEXT 242void TY_(AddByteToOriginalText)(StreamIn *in, tmbchar c) 243{ 244 if (in->otextlen + 1 >= in->otextsize) 245 { 246 size_t size = in->otextsize ? 1 : 2; 247 in->otextbuf = MemRealloc(in->otextbuf, in->otextsize + size); 248 in->otextsize += size; 249 } 250 in->otextbuf[in->otextlen++] = c; 251 in->otextbuf[in->otextlen ] = 0; 252} 253 254void TY_(AddCharToOriginalText)(StreamIn *in, tchar c) 255{ 256 int i, err, count = 0; 257 tmbchar buf[10] = {0}; 258 259 err = TY_(EncodeCharToUTF8Bytes)(c, buf, NULL, &count); 260 261 if (err) 262 { 263 /* replacement character 0xFFFD encoded as UTF-8 */ 264 buf[0] = (byte) 0xEF; 265 buf[1] = (byte) 0xBF; 266 buf[2] = (byte) 0xBD; 267 count = 3; 268 } 269 270 for (i = 0; i < count; ++i) 271 AddByteToOriginalText(in, buf[i]); 272} 273#endif 274 275 276uint TY_(ReadChar)( StreamIn *in ) 277{ 278 uint c = EndOfStream; 279 uint tabsize = cfg( in->doc, TidyTabSize ); 280#ifdef TIDY_STORE_ORIGINAL_TEXT 281 Bool added = no; 282#endif 283 284/* Apple Inc. Changes: 285 2005-01-18 swilkin Change to deal with possible '\0' char or other char that should be discarded following '\r' 286*/ 287#ifdef TIDY_APPLE_CHANGES 288 if ( !in->pushed ) 289 { 290#else 291 if ( in->pushed ) 292 return PopChar( in ); 293#endif 294 in->lastcol = in->curcol; 295 296 if ( in->tabs > 0 ) 297 { 298 in->curcol++; 299 in->tabs--; 300 return ' '; 301 } 302#ifdef TIDY_APPLE_CHANGES 303 } 304#endif 305 306 for (;;) 307 { 308#ifdef TIDY_APPLE_CHANGES 309 if ( in->pushed ) 310 c = PopChar(in); 311 else 312#endif 313 c = ReadCharFromStream(in); 314 315 if ( EndOfStream == c ) 316 return EndOfStream; 317 318 if (c == '\n') 319 { 320#ifdef TIDY_STORE_ORIGINAL_TEXT 321 added = yes; 322 AddCharToOriginalText(in, (tchar)c); 323#endif 324 in->curcol = 1; 325 in->curline++; 326 break; 327 } 328 329 if (c == '\t') 330 { 331#ifdef TIDY_STORE_ORIGINAL_TEXT 332 added = yes; 333 AddCharToOriginalText(in, (tchar)c); 334#endif 335 in->tabs = tabsize > 0 ? 336 tabsize - ((in->curcol - 1) % tabsize) - 1 337 : 0; 338 in->curcol++; 339 c = ' '; 340 break; 341 } 342 343 /* #427663 - map '\r' to '\n' - Andy Quick 11 Aug 00 */ 344 if (c == '\r') 345 { 346#ifdef TIDY_STORE_ORIGINAL_TEXT 347 added = yes; 348 AddCharToOriginalText(in, (tchar)c); 349#endif 350 c = ReadCharFromStream(in); 351 if (c != '\n') 352 { 353 TY_(UngetChar)( c, in ); 354 c = '\n'; 355 } 356 else 357 { 358#ifdef TIDY_STORE_ORIGINAL_TEXT 359 AddCharToOriginalText(in, (tchar)c); 360#endif 361 } 362 in->curcol = 1; 363 in->curline++; 364 break; 365 } 366 367#ifndef NO_NATIVE_ISO2022_SUPPORT 368 /* strip control characters, except for Esc */ 369 if (c == '\033') 370 break; 371#endif 372 373 /* Form Feed is allowed in HTML */ 374 if ( c == '\015' && !cfgBool(in->doc, TidyXmlTags) ) 375 break; 376 377 if ( c < 32 ) 378 continue; /* discard control char */ 379 380 /* watch out for chars that have already been decoded such as */ 381 /* IS02022, UTF-8 etc, that don't require further decoding */ 382 383 if ( 384 in->encoding == RAW 385#ifndef NO_NATIVE_ISO2022_SUPPORT 386 || in->encoding == ISO2022 387#endif 388 || in->encoding == UTF8 389 390#if SUPPORT_ASIAN_ENCODINGS 391 || in->encoding == SHIFTJIS /* #431953 - RJ */ 392 || in->encoding == BIG5 /* #431953 - RJ */ 393#endif 394 ) 395 { 396 in->curcol++; 397 break; 398 } 399 400#if SUPPORT_UTF16_ENCODINGS 401 /* handle surrogate pairs */ 402 if ( in->encoding == UTF16LE || 403 in->encoding == UTF16 || 404 in->encoding == UTF16BE ) 405 { 406 if ( !TY_(IsValidUTF16FromUCS4)(c) ) 407 { 408 /* invalid UTF-16 value */ 409 TY_(ReportEncodingError)(in->doc, INVALID_UTF16, c, yes); 410 c = 0; 411 } 412 else if ( TY_(IsLowSurrogate)(c) ) 413 { 414 uint n = c; 415 uint m = ReadCharFromStream( in ); 416 if ( m == EndOfStream ) 417 return EndOfStream; 418 419 c = 0; 420 if ( TY_(IsHighSurrogate)(m) ) 421 { 422 n = TY_(CombineSurrogatePair)( m, n ); 423 if ( TY_(IsValidCombinedChar)(n) ) 424 c = n; 425 } 426 /* not a valid pair */ 427 if ( 0 == c ) 428 TY_(ReportEncodingError)( in->doc, INVALID_UTF16, c, yes ); 429 } 430 } 431#endif 432 433 /* Do first: acts on range 128 - 255 */ 434 switch ( in->encoding ) 435 { 436 case MACROMAN: 437 c = TY_(DecodeMacRoman)( c ); 438 break; 439 case IBM858: 440 c = DecodeIbm850( c ); 441 break; 442 case LATIN0: 443 c = DecodeLatin0( c ); 444 break; 445 } 446 447 /* produced e.g. as a side-effect of smart quotes in Word */ 448 /* but can't happen if using MACROMAN encoding */ 449 if ( 127 < c && c < 160 ) 450 { 451 uint c1 = 0, replMode = DISCARDED_CHAR; 452 Bool isVendorChar = ( in->encoding == WIN1252 || 453 in->encoding == MACROMAN ); 454 Bool isWinChar = ( in->encoding == WIN1252 || 455 TY_(ReplacementCharEncoding) == WIN1252 ); 456 Bool isMacChar = ( in->encoding == MACROMAN || 457 TY_(ReplacementCharEncoding) == MACROMAN ); 458 459 /* set error position just before offending character */ 460 if (in->doc->lexer) 461 { 462 in->doc->lexer->lines = in->curline; 463 in->doc->lexer->columns = in->curcol; 464 } 465 466 if ( isWinChar ) 467 c1 = TY_(DecodeWin1252)( c ); 468 else if ( isMacChar ) 469 c1 = TY_(DecodeMacRoman)( c ); 470 if ( c1 ) 471 replMode = REPLACED_CHAR; 472 473 if ( c1 == 0 && isVendorChar ) 474 TY_(ReportEncodingError)(in->doc, VENDOR_SPECIFIC_CHARS, c, replMode == DISCARDED_CHAR); 475 else if ( ! isVendorChar ) 476 TY_(ReportEncodingError)(in->doc, INVALID_SGML_CHARS, c, replMode == DISCARDED_CHAR); 477 478 c = c1; 479 } 480 481 if ( c == 0 ) 482 continue; /* illegal char is discarded */ 483 484 in->curcol++; 485 break; 486 } 487 488#ifdef TIDY_STORE_ORIGINAL_TEXT 489 if (!added) 490 AddCharToOriginalText(in, (tchar)c); 491#endif 492 493 return c; 494} 495 496static uint PopChar( StreamIn *in ) 497{ 498 uint c = EndOfStream; 499 if ( in->pushed ) 500 { 501 assert( in->bufpos > 0 ); 502 c = in->charbuf[ --in->bufpos ]; 503 if ( in->bufpos == 0 ) 504 in->pushed = no; 505 506 if ( c == '\n' ) 507 { 508 in->curcol = 1; 509 in->curline++; 510 return c; 511 } 512 in->curcol++; 513 } 514 return c; 515} 516 517void TY_(UngetChar)( uint c, StreamIn *in ) 518{ 519 if (c == EndOfStream) 520 { 521 /* fprintf(stderr, "Attempt to UngetChar EOF\n"); */ 522 return; 523 } 524 525 in->pushed = yes; 526 527 if (in->bufpos + 1 >= in->bufsize) 528 in->charbuf = (tchar*)MemRealloc(in->charbuf, sizeof(tchar) * ++(in->bufsize)); 529 530 in->charbuf[(in->bufpos)++] = c; 531 532 if (c == '\n') 533 --(in->curline); 534 535 in->curcol = in->lastcol; 536} 537 538 539 540/************************ 541** Sink 542************************/ 543 544static StreamOut* initStreamOut( int encoding, uint nl ) 545{ 546 StreamOut* out = (StreamOut*) MemAlloc( sizeof(StreamOut) ); 547 ClearMemory( out, sizeof(StreamOut) ); 548 out->encoding = encoding; 549 out->state = FSM_ASCII; 550 out->nl = nl; 551 return out; 552} 553 554StreamOut* TY_(FileOutput)( FILE* fp, int encoding, uint nl ) 555{ 556 StreamOut* out = initStreamOut( encoding, nl ); 557 TY_(initFileSink)( &out->sink, fp ); 558 out->iotype = FileIO; 559 return out; 560} 561StreamOut* TY_(BufferOutput)( TidyBuffer* buf, int encoding, uint nl ) 562{ 563 StreamOut* out = initStreamOut( encoding, nl ); 564 tidyInitOutputBuffer( &out->sink, buf ); 565 out->iotype = BufferIO; 566 return out; 567} 568StreamOut* TY_(UserOutput)( TidyOutputSink* sink, int encoding, uint nl ) 569{ 570 StreamOut* out = initStreamOut( encoding, nl ); 571 memcpy( &out->sink, sink, sizeof(TidyOutputSink) ); 572 out->iotype = UserIO; 573 return out; 574} 575 576void TY_(WriteChar)( uint c, StreamOut* out ) 577{ 578 /* Translate outgoing newlines */ 579 if ( LF == c ) 580 { 581 if ( out->nl == TidyCRLF ) 582 TY_(WriteChar)( CR, out ); 583 else if ( out->nl == TidyCR ) 584 c = CR; 585 } 586 587 if (out->encoding == MACROMAN) 588 { 589 EncodeMacRoman( c, out ); 590 } 591 else if (out->encoding == WIN1252) 592 { 593 EncodeWin1252( c, out ); 594 } 595 else if (out->encoding == IBM858) 596 { 597 EncodeIbm858( c, out ); 598 } 599 else if (out->encoding == LATIN0) 600 { 601 EncodeLatin0( c, out ); 602 } 603 604 else if (out->encoding == UTF8) 605 { 606 int count = 0; 607 608 TY_(EncodeCharToUTF8Bytes)( c, NULL, &out->sink, &count ); 609 if (count <= 0) 610 { 611 /* TY_(ReportEncodingError)(in->lexer, INVALID_UTF8 | REPLACED_CHAR, c); */ 612 /* replacement char 0xFFFD encoded as UTF-8 */ 613 PutByte(0xEF, out); PutByte(0xBF, out); PutByte(0xBF, out); 614 } 615 } 616#ifndef NO_NATIVE_ISO2022_SUPPORT 617 else if (out->encoding == ISO2022) 618 { 619 if (c == 0x1b) /* ESC */ 620 out->state = FSM_ESC; 621 else 622 { 623 switch (out->state) 624 { 625 case FSM_ESC: 626 if (c == '$') 627 out->state = FSM_ESCD; 628 else if (c == '(') 629 out->state = FSM_ESCP; 630 else 631 out->state = FSM_ASCII; 632 break; 633 634 case FSM_ESCD: 635 if (c == '(') 636 out->state = FSM_ESCDP; 637 else 638 out->state = FSM_NONASCII; 639 break; 640 641 case FSM_ESCDP: 642 out->state = FSM_NONASCII; 643 break; 644 645 case FSM_ESCP: 646 out->state = FSM_ASCII; 647 break; 648 649 case FSM_NONASCII: 650 c &= 0x7F; 651 break; 652 } 653 } 654 655 PutByte(c, out); 656 } 657#endif /* NO_NATIVE_ISO2022_SUPPORT */ 658 659#if SUPPORT_UTF16_ENCODINGS 660 else if ( out->encoding == UTF16LE || 661 out->encoding == UTF16BE || 662 out->encoding == UTF16 ) 663 { 664 int i, numChars = 1; 665 uint theChars[2]; 666 667 if ( !TY_(IsValidUTF16FromUCS4)(c) ) 668 { 669 /* invalid UTF-16 value */ 670 /* TY_(ReportEncodingError)(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c); */ 671 c = 0; 672 numChars = 0; 673 } 674 else if ( TY_(IsCombinedChar)(c) ) 675 { 676 /* output both, unless something goes wrong */ 677 numChars = 2; 678 if ( !TY_(SplitSurrogatePair)(c, &theChars[0], &theChars[1]) ) 679 { 680 /* TY_(ReportEncodingError)(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c); */ 681 c = 0; 682 numChars = 0; 683 } 684 } 685 else 686 { 687 /* just put the char out */ 688 theChars[0] = c; 689 } 690 691 for (i = 0; i < numChars; i++) 692 { 693 c = theChars[i]; 694 695 if (out->encoding == UTF16LE) 696 { 697 uint ch = c & 0xFF; PutByte(ch, out); 698 ch = (c >> 8) & 0xFF; PutByte(ch, out); 699 } 700 701 else if (out->encoding == UTF16BE || out->encoding == UTF16) 702 { 703 uint ch = (c >> 8) & 0xFF; PutByte(ch, out); 704 ch = c & 0xFF; PutByte(ch, out); 705 } 706 } 707 } 708#endif 709 710#if SUPPORT_ASIAN_ENCODINGS 711 else if (out->encoding == BIG5 || out->encoding == SHIFTJIS) 712 { 713 if (c < 128) 714 PutByte(c, out); 715 else 716 { 717 uint ch = (c >> 8) & 0xFF; PutByte(ch, out); 718 ch = c & 0xFF; PutByte(ch, out); 719 } 720 } 721#endif 722 723 else 724 PutByte( c, out ); 725} 726 727 728 729/**************************** 730** Miscellaneous / Helpers 731****************************/ 732 733/* char encoding used when replacing illegal SGML chars, 734** regardless of specified encoding. Set at compile time 735** to either Windows or Mac. 736*/ 737const int TY_(ReplacementCharEncoding) = DFLT_REPL_CHARENC; 738 739 740/* Mapping for Windows Western character set CP 1252 741** (chars 128-159/U+0080-U+009F) to Unicode. 742*/ 743static const uint Win2Unicode[32] = 744{ 745 0x20AC, 0x0000, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, 746 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x0000, 0x017D, 0x0000, 747 0x0000, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 748 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x0000, 0x017E, 0x0178 749}; 750 751/* Function for conversion from Windows-1252 to Unicode */ 752uint TY_(DecodeWin1252)(uint c) 753{ 754 if (127 < c && c < 160) 755 c = Win2Unicode[c - 128]; 756 757 return c; 758} 759 760static void EncodeWin1252( uint c, StreamOut* out ) 761{ 762 if (c < 128 || (c > 159 && c < 256)) 763 PutByte(c, out); 764 else 765 { 766 int i; 767 768 for (i = 128; i < 160; i++) 769 if (Win2Unicode[i - 128] == c) 770 { 771 PutByte(i, out); 772 break; 773 } 774 } 775} 776 777/* 778 John Love-Jensen contributed this table for mapping MacRoman 779 character set to Unicode 780*/ 781 782/* modified to only need chars 128-255/U+0080-U+00FF - Terry Teague 19 Aug 01 */ 783static const uint Mac2Unicode[128] = 784{ 785 /* x7F = DEL */ 786 787 0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1, 788 0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8, 789 790 0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3, 791 0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC, 792 793 0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF, 794 0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8, 795 796 0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211, 797 /* =BD U+2126 OHM SIGN */ 798 0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8, 799 800 0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB, 801 0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153, 802 803 0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA, 804 /* =DB U+00A4 CURRENCY SIGN */ 805 0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02, 806 807 0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1, 808 0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4, 809 /* xF0 = Apple Logo */ 810 /* =F0 U+2665 BLACK HEART SUIT */ 811 0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC, 812 0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7 813}; 814 815/* Function to convert from MacRoman to Unicode */ 816uint TY_(DecodeMacRoman)(uint c) 817{ 818 if (127 < c) 819 c = Mac2Unicode[c - 128]; 820 return c; 821} 822 823static void EncodeMacRoman( uint c, StreamOut* out ) 824{ 825 if (c < 128) 826 PutByte(c, out); 827 else 828 { 829 /* For mac users, map Unicode back to MacRoman. */ 830 int i; 831 for (i = 128; i < 256; i++) 832 { 833 if (Mac2Unicode[i - 128] == c) 834 { 835 PutByte(i, out); 836 break; 837 } 838 } 839 } 840} 841 842/* Mapping for OS/2 Western character set CP 850 843** (chars 128-255) to Unicode. 844*/ 845static const uint IBM2Unicode[128] = 846{ 847 0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7, 848 0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5, 849 0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9, 850 0x00FF, 0x00D6, 0x00DC, 0x00F8, 0x00A3, 0x00D8, 0x00D7, 0x0192, 851 0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA, 852 0x00BF, 0x00AE, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB, 853 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00C1, 0x00C2, 0x00C0, 854 0x00A9, 0x2563, 0x2551, 0x2557, 0x255D, 0x00A2, 0x00A5, 0x2510, 855 0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x00E3, 0x00C3, 856 0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4, 857 0x00f0, 0x00d0, 0x00ca, 0x00cb, 0x00c8, 0x20AC, 0x00cd, 0x00ce, 858 0x00cf, 0x2518, 0x250c, 0x2588, 0x2584, 0x00a6, 0x00cc, 0x2580, 859 0x00d3, 0x00df, 0x00d4, 0x00d2, 0x00f5, 0x00d5, 0x00b5, 0x00fe, 860 0x00de, 0x00da, 0x00db, 0x00d9, 0x00fd, 0x00dd, 0x00af, 0x00b4, 861 0x00ad, 0x00b1, 0x2017, 0x00be, 0x00b6, 0x00a7, 0x00f7, 0x00b8, 862 0x00b0, 0x00a8, 0x00b7, 0x00b9, 0x00b3, 0x00b2, 0x25a0, 0x00a0 863}; 864 865/* Function for conversion from OS/2-850 to Unicode */ 866static uint DecodeIbm850(uint c) 867{ 868 if (127 < c && c < 256) 869 c = IBM2Unicode[c - 128]; 870 871 return c; 872} 873 874/* For OS/2,Java users, map Unicode back to IBM858 (IBM850+Euro). */ 875static void EncodeIbm858( uint c, StreamOut* out ) 876{ 877 if (c < 128) 878 PutByte(c, out); 879 else 880 { 881 int i; 882 for (i = 128; i < 256; i++) 883 { 884 if (IBM2Unicode[i - 128] == c) 885 { 886 PutByte(i, out); 887 break; 888 } 889 } 890 } 891} 892 893 894/* Convert from Latin0 (aka Latin9, ISO-8859-15) to Unicode */ 895static uint DecodeLatin0(uint c) 896{ 897 if (159 < c && c < 191) 898 { 899 switch (c) 900 { 901 case 0xA4: c = 0x20AC; break; 902 case 0xA6: c = 0x0160; break; 903 case 0xA8: c = 0x0161; break; 904 case 0xB4: c = 0x017D; break; 905 case 0xB8: c = 0x017E; break; 906 case 0xBC: c = 0x0152; break; 907 case 0xBD: c = 0x0153; break; 908 case 0xBE: c = 0x0178; break; 909 } 910 } 911 return c; 912} 913 914/* Map Unicode back to ISO-8859-15. */ 915static void EncodeLatin0( uint c, StreamOut* out ) 916{ 917 switch (c) 918 { 919 case 0x20AC: c = 0xA4; break; 920 case 0x0160: c = 0xA6; break; 921 case 0x0161: c = 0xA8; break; 922 case 0x017D: c = 0xB4; break; 923 case 0x017E: c = 0xB8; break; 924 case 0x0152: c = 0xBC; break; 925 case 0x0153: c = 0xBD; break; 926 case 0x0178: c = 0xBE; break; 927 } 928 PutByte(c, out); 929} 930 931/* 932 Table to map symbol font characters to Unicode; undefined 933 characters are mapped to 0x0000 and characters without any 934 Unicode equivalent are mapped to '?'. Is this appropriate? 935*/ 936 937static const uint Symbol2Unicode[] = 938{ 939 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 940 0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F, 941 942 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 943 0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F, 944 945 0x0020, 0x0021, 0x2200, 0x0023, 0x2203, 0x0025, 0x0026, 0x220D, 946 0x0028, 0x0029, 0x2217, 0x002B, 0x002C, 0x2212, 0x002E, 0x002F, 947 948 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 949 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F, 950 951 0x2245, 0x0391, 0x0392, 0x03A7, 0x0394, 0x0395, 0x03A6, 0x0393, 952 0x0397, 0x0399, 0x03D1, 0x039A, 0x039B, 0x039C, 0x039D, 0x039F, 953 954 0x03A0, 0x0398, 0x03A1, 0x03A3, 0x03A4, 0x03A5, 0x03C2, 0x03A9, 955 0x039E, 0x03A8, 0x0396, 0x005B, 0x2234, 0x005D, 0x22A5, 0x005F, 956 957 0x00AF, 0x03B1, 0x03B2, 0x03C7, 0x03B4, 0x03B5, 0x03C6, 0x03B3, 958 0x03B7, 0x03B9, 0x03D5, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BF, 959 960 0x03C0, 0x03B8, 0x03C1, 0x03C3, 0x03C4, 0x03C5, 0x03D6, 0x03C9, 961 0x03BE, 0x03C8, 0x03B6, 0x007B, 0x007C, 0x007D, 0x223C, 0x003F, 962 963 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 964 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 965 966 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 967 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 968 969 0x00A0, 0x03D2, 0x2032, 0x2264, 0x2044, 0x221E, 0x0192, 0x2663, 970 0x2666, 0x2665, 0x2660, 0x2194, 0x2190, 0x2191, 0x2192, 0x2193, 971 972 0x00B0, 0x00B1, 0x2033, 0x2265, 0x00D7, 0x221D, 0x2202, 0x00B7, 973 0x00F7, 0x2260, 0x2261, 0x2248, 0x2026, 0x003F, 0x003F, 0x21B5, 974 975 0x2135, 0x2111, 0x211C, 0x2118, 0x2297, 0x2295, 0x2205, 0x2229, 976 0x222A, 0x2283, 0x2287, 0x2284, 0x2282, 0x2286, 0x2208, 0x2209, 977 978 0x2220, 0x2207, 0x00AE, 0x00A9, 0x2122, 0x220F, 0x221A, 0x22C5, 979 0x00AC, 0x2227, 0x2228, 0x21D4, 0x21D0, 0x21D1, 0x21D2, 0x21D3, 980 981 0x25CA, 0x2329, 0x00AE, 0x00A9, 0x2122, 0x2211, 0x003F, 0x003F, 982 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 983 984 0x20AC, 0x232A, 0x222B, 0x2320, 0x003F, 0x2321, 0x003F, 0x003F, 985 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F 986}; 987 988#if 0 989/* Function to convert from Symbol Font chars to Unicode */ 990uint DecodeSymbolFont(uint c) 991{ 992 if (c > 255) 993 return c; 994 995 /* todo: add some error message */ 996 997 return Symbol2Unicode[c]; 998} 999#endif 1000 1001 1002/* Facilitates user defined source by providing 1003** an entry point to marshal pointers-to-functions. 1004** Needed by .NET and possibly other language bindings. 1005*/ 1006Bool TIDY_CALL tidyInitSource( TidyInputSource* source, 1007 void* srcData, 1008 TidyGetByteFunc gbFunc, 1009 TidyUngetByteFunc ugbFunc, 1010 TidyEOFFunc endFunc ) 1011{ 1012 Bool status = ( source && srcData && gbFunc && ugbFunc && endFunc ); 1013 1014 if ( status ) 1015 { 1016 source->sourceData = srcData; 1017 source->getByte = gbFunc; 1018 source->ungetByte = ugbFunc; 1019 source->eof = endFunc; 1020 } 1021 1022 return status; 1023} 1024 1025Bool TIDY_CALL tidyInitSink( TidyOutputSink* sink, 1026 void* snkData, 1027 TidyPutByteFunc pbFunc ) 1028{ 1029 Bool status = ( sink && snkData && pbFunc ); 1030 if ( status ) 1031 { 1032 sink->sinkData = snkData; 1033 sink->putByte = pbFunc; 1034 } 1035 return status; 1036} 1037 1038/* GetByte must return a byte value in a signed 1039** integer so that a negative value can signal EOF 1040** without interfering w/ 0-255 legitimate byte values. 1041*/ 1042uint TIDY_CALL tidyGetByte( TidyInputSource* source ) 1043{ 1044 int bv = source->getByte( source->sourceData ); 1045 return (uint) bv; 1046} 1047Bool TIDY_CALL tidyIsEOF( TidyInputSource* source ) 1048{ 1049 return source->eof( source->sourceData ); 1050} 1051void TIDY_CALL tidyUngetByte( TidyInputSource* source, uint ch ) 1052{ 1053 source->ungetByte( source->sourceData, (byte) ch ); 1054} 1055void TIDY_CALL tidyPutByte( TidyOutputSink* sink, uint ch ) 1056{ 1057 sink->putByte( sink->sinkData, (byte) ch ); 1058} 1059 1060static uint ReadByte( StreamIn* in ) 1061{ 1062 return tidyGetByte( &in->source ); 1063} 1064Bool TY_(IsEOF)( StreamIn* in ) 1065{ 1066 return tidyIsEOF( &in->source ); 1067} 1068static void UngetByte( StreamIn* in, uint byteValue ) 1069{ 1070 tidyUngetByte( &in->source, byteValue ); 1071} 1072static void PutByte( uint byteValue, StreamOut* out ) 1073{ 1074 tidyPutByte( &out->sink, byteValue ); 1075} 1076 1077#if 0 1078static void UngetRawBytesToStream( StreamIn *in, byte* buf, int *count ) 1079{ 1080 int i; 1081 1082 for (i = 0; i < *count; i++) 1083 { 1084 /* should never get here; testing for 0xFF, a valid char, is not a good idea */ 1085 if ( in && TY_(IsEOF)(in) ) 1086 { 1087 /* fprintf(stderr,"Attempt to unget EOF in UngetRawBytesToStream\n"); */ 1088 *count = -i; 1089 return; 1090 } 1091 1092 in->source.ungetByte( in->source.sourceData, buf[i] ); 1093 } 1094} 1095 1096/* 1097 Read raw bytes from stream, return <= 0 if EOF; or if 1098 "unget" is true, Unget the bytes to re-synchronize the input stream 1099 Normally UTF-8 successor bytes are read using this routine. 1100*/ 1101static void ReadRawBytesFromStream( StreamIn *in, byte* buf, int *count ) 1102{ 1103 int ix; 1104 for ( ix=0; ix < *count; ++ix ) 1105 { 1106 if ( in->rawPushed ) 1107 { 1108 buf[ix] = in->rawBytebuf[ --in->rawBufpos ]; 1109 if ( in->rawBufpos == 0 ) 1110 in->rawPushed = no; 1111 } 1112 else 1113 { 1114 if ( in->source.eof(in->source.sourceData) ) 1115 { 1116 *count = -i; 1117 break; 1118 } 1119 buf[ix] = in->source.getByte( in->source.sourceData ); 1120 } 1121 } 1122} 1123#endif /* 0 */ 1124 1125/* read char from stream */ 1126static uint ReadCharFromStream( StreamIn* in ) 1127{ 1128 uint c, n; 1129#ifdef TIDY_WIN32_MLANG_SUPPORT 1130 uint bytesRead = 0; 1131#endif 1132 1133 if ( TY_(IsEOF)(in) ) 1134 return EndOfStream; 1135 1136 c = ReadByte( in ); 1137 1138 if (c == EndOfStream) 1139 return c; 1140 1141#ifndef NO_NATIVE_ISO2022_SUPPORT 1142 /* 1143 A document in ISO-2022 based encoding uses some ESC sequences 1144 called "designator" to switch character sets. The designators 1145 defined and used in ISO-2022-JP are: 1146 1147 "ESC" + "(" + ? for ISO646 variants 1148 1149 "ESC" + "$" + ? and 1150 "ESC" + "$" + "(" + ? for multibyte character sets 1151 1152 Where ? stands for a single character used to indicate the 1153 character set for multibyte characters. 1154 1155 Tidy handles this by preserving the escape sequence and 1156 setting the top bit of each byte for non-ascii chars. This 1157 bit is then cleared on output. The input stream keeps track 1158 of the state to determine when to set/clear the bit. 1159 */ 1160 1161 if (in->encoding == ISO2022) 1162 { 1163 if (c == 0x1b) /* ESC */ 1164 { 1165 in->state = FSM_ESC; 1166 return c; 1167 } 1168 1169 switch (in->state) 1170 { 1171 case FSM_ESC: 1172 if (c == '$') 1173 in->state = FSM_ESCD; 1174 else if (c == '(') 1175 in->state = FSM_ESCP; 1176 else 1177 in->state = FSM_ASCII; 1178 break; 1179 1180 case FSM_ESCD: 1181 if (c == '(') 1182 in->state = FSM_ESCDP; 1183 else 1184 in->state = FSM_NONASCII; 1185 break; 1186 1187 case FSM_ESCDP: 1188 in->state = FSM_NONASCII; 1189 break; 1190 1191 case FSM_ESCP: 1192 in->state = FSM_ASCII; 1193 break; 1194 1195 case FSM_NONASCII: 1196 c |= 0x80; 1197 break; 1198 } 1199 1200 return c; 1201 } 1202#endif /* #ifndef NO_NATIVE_ISO2022_SUPPORT */ 1203 1204#if SUPPORT_UTF16_ENCODINGS 1205 if ( in->encoding == UTF16LE ) 1206 { 1207 uint c1 = ReadByte( in ); 1208 if ( EndOfStream == c1 ) 1209 return EndOfStream; 1210 n = (c1 << 8) + c; 1211 return n; 1212 } 1213 1214 if ((in->encoding == UTF16) || (in->encoding == UTF16BE)) /* UTF-16 is big-endian by default */ 1215 { 1216 uint c1 = ReadByte( in ); 1217 if ( EndOfStream == c1 ) 1218 return EndOfStream; 1219 n = (c << 8) + c1; 1220 return n; 1221 } 1222#endif 1223 1224 if ( in->encoding == UTF8 ) 1225 { 1226 /* deal with UTF-8 encoded char */ 1227 1228 int err, count = 0; 1229 1230 /* first byte "c" is passed in separately */ 1231 err = TY_(DecodeUTF8BytesToChar)( &n, c, NULL, &in->source, &count ); 1232 if (!err && (n == (uint)EndOfStream) && (count == 1)) /* EOF */ 1233 return EndOfStream; 1234 else if (err) 1235 { 1236 /* set error position just before offending character */ 1237 in->doc->lexer->lines = in->curline; 1238 in->doc->lexer->columns = in->curcol; 1239 1240 TY_(ReportEncodingError)(in->doc, INVALID_UTF8, n, no); 1241 n = 0xFFFD; /* replacement char */ 1242 } 1243 1244 return n; 1245 } 1246 1247#if SUPPORT_ASIAN_ENCODINGS 1248 /* 1249 This section is suitable for any "multibyte" variable-width 1250 character encoding in which a one-byte code is less than 1251 128, and the first byte of a two-byte code is greater or 1252 equal to 128. Note that Big5 and ShiftJIS fit into this 1253 kind, even though their second byte may be less than 128 1254 */ 1255 if ((in->encoding == BIG5) || (in->encoding == SHIFTJIS)) 1256 { 1257 if (c < 128) 1258 return c; 1259 else if ((in->encoding == SHIFTJIS) && (c >= 0xa1 && c <= 0xdf)) /* 461643 - fix suggested by Rick Cameron 14 Sep 01 */ 1260 { 1261 /* 1262 Rick Cameron pointed out that for Shift_JIS, the values from 1263 0xa1 through 0xdf represent singe-byte characters 1264 (U+FF61 to U+FF9F - half-shift Katakana) 1265 */ 1266 return c; 1267 } 1268 else 1269 { 1270 uint c1 = ReadByte( in ); 1271 if ( EndOfStream == c1 ) 1272 return EndOfStream; 1273 n = (c << 8) + c1; 1274 return n; 1275 } 1276 } 1277#endif 1278 1279#ifdef TIDY_WIN32_MLANG_SUPPORT 1280 else if (in->encoding > WIN32MLANG) 1281 { 1282 assert( in->mlang != 0 ); 1283 return TY_(Win32MLangGetChar)((byte)c, in, &bytesRead); 1284 } 1285#endif 1286 1287 else 1288 n = c; 1289 1290 return n; 1291} 1292 1293/* Output a Byte Order Mark if required */ 1294void TY_(outBOM)( StreamOut *out ) 1295{ 1296 if ( out->encoding == UTF8 1297#if SUPPORT_UTF16_ENCODINGS 1298 || out->encoding == UTF16LE 1299 || out->encoding == UTF16BE 1300 || out->encoding == UTF16 1301#endif 1302 ) 1303 { 1304 /* this will take care of encoding the BOM correctly */ 1305 TY_(WriteChar)( UNICODE_BOM, out ); 1306 } 1307} 1308 1309/* this is in intermediate fix for various problems in the */ 1310/* long term code and data in charsets.c should be used */ 1311static struct _enc2iana 1312{ 1313 uint id; 1314 ctmbstr name; 1315 ctmbstr tidyOptName; 1316} const enc2iana[] = 1317{ 1318 { ASCII, "us-ascii", "ascii" }, 1319 { LATIN0, "iso-8859-15", "latin0" }, 1320 { LATIN1, "iso-8859-1", "latin1" }, 1321 { UTF8, "utf-8", "utf8" }, 1322 { MACROMAN, "macintosh", "mac" }, 1323 { WIN1252, "windows-1252", "win1252" }, 1324 { IBM858, "ibm00858", "ibm858" }, 1325#if SUPPORT_UTF16_ENCODINGS 1326 { UTF16LE, "utf-16", "utf16le" }, 1327 { UTF16BE, "utf-16", "utf16be" }, 1328 { UTF16, "utf-16", "utf16" }, 1329#endif 1330#if SUPPORT_ASIAN_ENCODINGS 1331 { BIG5, "big5", "big5" }, 1332 { SHIFTJIS, "shift_jis", "shiftjis"}, 1333#endif 1334#ifndef NO_NATIVE_ISO2022_SUPPORT 1335 { ISO2022, NULL, "iso2022" }, 1336#endif 1337 { RAW, NULL, "raw" } 1338}; 1339 1340ctmbstr TY_(GetEncodingNameFromTidyId)(uint id) 1341{ 1342 uint i; 1343 1344 for (i = 0; enc2iana[i].name; ++i) 1345 if (enc2iana[i].id == id) 1346 return enc2iana[i].name; 1347 1348 return NULL; 1349} 1350 1351ctmbstr TY_(GetEncodingOptNameFromTidyId)(uint id) 1352{ 1353 uint i; 1354 1355 for (i = 0; i < sizeof(enc2iana)/sizeof(enc2iana[0]); ++i) 1356 if (enc2iana[i].id == id) 1357 return enc2iana[i].tidyOptName; 1358 1359 return NULL; 1360} 1361 1362int TY_(GetCharEncodingFromOptName)( ctmbstr charenc ) 1363{ 1364 uint i; 1365 1366 for (i = 0; i < sizeof(enc2iana)/sizeof(enc2iana[0]); ++i) 1367 if (TY_(tmbstrcasecmp)(charenc, enc2iana[i].tidyOptName) == 0 ) 1368 return enc2iana[i].id; 1369 1370 return -1; 1371} 1372 1373/* 1374 * local variables: 1375 * mode: c 1376 * indent-tabs-mode: nil 1377 * c-basic-offset: 4 1378 * eval: (c-set-offset 'substatement-open 0) 1379 * end: 1380 */ 1381