1/* streamio.c -- handles character stream I/O
2
3  (c) 1998-2006 (W3C) MIT, ERCIM, Keio University
4  See tidy.h for the copyright notice.
5
6  CVS Info :
7
8    $Author: iccir $
9    $Date: 2007/02/03 02:31:30 $
10    $Revision: 1.6 $
11
12  Wrapper around Tidy input source and output sink
13  that calls appropriate interfaces, and applies
14  necessary char encoding transformations: to/from
15  ISO-10646 and/or UTF-8.
16
17*/
18
19#include <stdio.h>
20#include <errno.h>
21
22#include "streamio.h"
23#include "tidy-int.h"
24#include "lexer.h"
25#include "message.h"
26#include "utf8.h"
27#include "tmbstr.h"
28
29#ifdef TIDY_WIN32_MLANG_SUPPORT
30#include "win32tc.h"
31#endif
32
33/************************
34** Forward Declarations
35************************/
36
37static uint ReadCharFromStream( StreamIn* in );
38
39static uint ReadByte( StreamIn* in );
40static void UngetByte( StreamIn* in, uint byteValue );
41
42static void PutByte( uint byteValue, StreamOut* out );
43
44static void EncodeWin1252( uint c, StreamOut* out );
45static void EncodeMacRoman( uint c, StreamOut* out );
46static void EncodeIbm858( uint c, StreamOut* out );
47static void EncodeLatin0( uint c, StreamOut* out );
48
49static uint DecodeIbm850(uint c);
50static uint DecodeLatin0(uint c);
51
52static uint PopChar( StreamIn *in );
53
54/******************************
55** Static (duration) Globals
56******************************/
57
/* Statically allocated output stream used for diagnostics.
** The sink is created with a null data pointer (the leading 0 in the
** last initializer); TY_(StdErrOutput) attaches stderr to it on first
** use.  TY_(ReleaseStreamOut) recognises this object by address and
** never frees it.
** NOTE(review): the initializer is positional; the field order is
** presumably encoding, state, nl, [MLang handle,] iotype, sink --
** confirm against the StreamOut declaration.
*/
static StreamOut stderrStreamOut =
{
    ASCII,
    FSM_ASCII,
    DEFAULT_NL_CONFIG,
#ifdef TIDY_WIN32_MLANG_SUPPORT
    (ulong)NULL,
#endif
    FileIO,
    { 0, TY_(filesink_putByte) }
};
69
/* Statically allocated output stream intended for stdout, initialised
** the same way as stderrStreamOut.  Its accessor TY_(StdOutOutput) is
** currently compiled out, but TY_(ReleaseStreamOut) still compares
** against this object's address so that it is never freed.
*/
static StreamOut stdoutStreamOut =
{
    ASCII,
    FSM_ASCII,
    DEFAULT_NL_CONFIG,
#ifdef TIDY_WIN32_MLANG_SUPPORT
    (ulong)NULL,
#endif
    FileIO,
    { 0, TY_(filesink_putByte) }
};
81
82StreamOut* TY_(StdErrOutput)(void)
83{
84  if ( stderrStreamOut.sink.sinkData == 0 )
85      stderrStreamOut.sink.sinkData = stderr;
86  return &stderrStreamOut;
87}
88
#if 0
/* Disabled: stdout counterpart of TY_(StdErrOutput).  If enabled it
** would attach stdout to stdoutStreamOut's sink on first use and
** return the shared object.
*/
StreamOut* TY_(StdOutOutput)(void)
{
  if ( stdoutStreamOut.sink.sinkData == 0 )
      stdoutStreamOut.sink.sinkData = stdout;
  return &stdoutStreamOut;
}
#endif
97
98void  TY_(ReleaseStreamOut)( StreamOut* out )
99{
100    if ( out && out != &stderrStreamOut && out != &stdoutStreamOut )
101    {
102        if ( out->iotype == FileIO )
103            fclose( (FILE*) out->sink.sinkData );
104        MemFree( out );
105    }
106}
107
108
109/************************
110** Source
111************************/
112
113StreamIn* TY_(initStreamIn)( TidyDocImpl* doc, int encoding )
114{
115    StreamIn *in = (StreamIn*) MemAlloc( sizeof(StreamIn) );
116
117    ClearMemory( in, sizeof(StreamIn) );
118    in->curline = 1;
119    in->curcol = 1;
120    in->encoding = encoding;
121    in->state = FSM_ASCII;
122    in->doc = doc;
123    in->bufsize = CHARBUF_SIZE;
124    in->charbuf = (tchar*)MemAlloc(sizeof(tchar) * in->bufsize);
125#ifdef TIDY_STORE_ORIGINAL_TEXT
126    in->otextbuf = NULL;
127    in->otextlen = 0;
128    in->otextsize = 0;
129#endif
130    return in;
131}
132
133void TY_(freeStreamIn)(StreamIn* in)
134{
135#ifdef TIDY_STORE_ORIGINAL_TEXT
136    if (in->otextbuf)
137        MemFree(in->otextbuf);
138#endif
139    MemFree(in->charbuf);
140    MemFree(in);
141}
142
143StreamIn* TY_(FileInput)( TidyDocImpl* doc, FILE *fp, int encoding )
144{
145    StreamIn *in = TY_(initStreamIn)( doc, encoding );
146    if ( TY_(initFileSource)( &in->source, fp ) != 0 )
147    {
148        TY_(freeStreamIn)( in );
149        return NULL;
150    }
151    in->iotype = FileIO;
152    return in;
153}
154
155StreamIn* TY_(BufferInput)( TidyDocImpl* doc, TidyBuffer* buf, int encoding )
156{
157    StreamIn *in = TY_(initStreamIn)( doc, encoding );
158    tidyInitInputBuffer( &in->source, buf );
159    in->iotype = BufferIO;
160    return in;
161}
162
163StreamIn* TY_(UserInput)( TidyDocImpl* doc, TidyInputSource* source, int encoding )
164{
165    StreamIn *in = TY_(initStreamIn)( doc, encoding );
166    memcpy( &in->source, source, sizeof(TidyInputSource) );
167    in->iotype = UserIO;
168    return in;
169}
170
171int TY_(ReadBOMEncoding)(StreamIn *in)
172{
173    uint c, c1;
174#if SUPPORT_UTF16_ENCODINGS
175    uint bom;
176#endif
177
178    c = ReadByte(in);
179    if (c == EndOfStream)
180        return -1;
181
182    c1 = ReadByte( in );
183    if (c1 == EndOfStream)
184    {
185        UngetByte(in, c);
186        return -1;
187    }
188
189    /* todo: dont warn about mismatch for auto input encoding */
190    /* todo: let the user override the encoding found here */
191
192#if SUPPORT_UTF16_ENCODINGS
193    bom = (c << 8) + c1;
194
195    if ( bom == UNICODE_BOM_BE )
196    {
197        /* big-endian UTF-16 */
198        if ( in->encoding != UTF16 && in->encoding != UTF16BE )
199            TY_(ReportEncodingWarning)(in->doc, ENCODING_MISMATCH, UTF16BE);
200
201        return UTF16BE; /* return decoded BOM */
202    }
203    else if (bom == UNICODE_BOM_LE)
204    {
205        /* little-endian UTF-16 */
206        if (in->encoding != UTF16 && in->encoding != UTF16LE)
207            TY_(ReportEncodingWarning)(in->doc, ENCODING_MISMATCH, UTF16LE);
208
209        return UTF16LE; /* return decoded BOM */
210    }
211    else
212#endif /* SUPPORT_UTF16_ENCODINGS */
213    {
214        uint c2 = ReadByte(in);
215
216        if (c2 == EndOfStream)
217        {
218            UngetByte(in, c1);
219            UngetByte(in, c);
220            return -1;
221        }
222
223        if (((c << 16) + (c1 << 8) + c2) == UNICODE_BOM_UTF8)
224        {
225            /* UTF-8 */
226            if (in->encoding != UTF8)
227                TY_(ReportEncodingWarning)(in->doc, ENCODING_MISMATCH, UTF8);
228
229            return UTF8;
230        }
231        else
232            UngetByte( in, c2 );
233    }
234
235    UngetByte(in, c1);
236    UngetByte(in, c);
237
238    return -1;
239}
240
241#ifdef TIDY_STORE_ORIGINAL_TEXT
242void TY_(AddByteToOriginalText)(StreamIn *in, tmbchar c)
243{
244    if (in->otextlen + 1 >= in->otextsize)
245    {
246        size_t size = in->otextsize ? 1 : 2;
247        in->otextbuf = MemRealloc(in->otextbuf, in->otextsize + size);
248        in->otextsize += size;
249    }
250    in->otextbuf[in->otextlen++] = c;
251    in->otextbuf[in->otextlen  ] = 0;
252}
253
254void TY_(AddCharToOriginalText)(StreamIn *in, tchar c)
255{
256    int i, err, count = 0;
257    tmbchar buf[10] = {0};
258
259    err = TY_(EncodeCharToUTF8Bytes)(c, buf, NULL, &count);
260
261    if (err)
262    {
263        /* replacement character 0xFFFD encoded as UTF-8 */
264        buf[0] = (byte) 0xEF;
265        buf[1] = (byte) 0xBF;
266        buf[2] = (byte) 0xBD;
267        count = 3;
268    }
269
270    for (i = 0; i < count; ++i)
271        AddByteToOriginalText(in, buf[i]);
272}
273#endif
274
275
276uint TY_(ReadChar)( StreamIn *in )
277{
278    uint c = EndOfStream;
279    uint tabsize = cfg( in->doc, TidyTabSize );
280#ifdef TIDY_STORE_ORIGINAL_TEXT
281    Bool added = no;
282#endif
283
284/* Apple Inc. Changes:
285   2005-01-18 swilkin Change to deal with possible '\0' char or other char that should be discarded following '\r'
286*/
287#ifdef TIDY_APPLE_CHANGES
288    if ( !in->pushed )
289    {
290#else
291    if ( in->pushed )
292        return PopChar( in );
293#endif
294    in->lastcol = in->curcol;
295
296    if ( in->tabs > 0 )
297    {
298        in->curcol++;
299        in->tabs--;
300        return ' ';
301    }
302#ifdef TIDY_APPLE_CHANGES
303    }
304#endif
305
306    for (;;)
307    {
308#ifdef TIDY_APPLE_CHANGES
309        if ( in->pushed )
310            c = PopChar(in);
311        else
312#endif
313        c = ReadCharFromStream(in);
314
315        if ( EndOfStream == c )
316            return EndOfStream;
317
318        if (c == '\n')
319        {
320#ifdef TIDY_STORE_ORIGINAL_TEXT
321            added = yes;
322            AddCharToOriginalText(in, (tchar)c);
323#endif
324            in->curcol = 1;
325            in->curline++;
326            break;
327        }
328
329        if (c == '\t')
330        {
331#ifdef TIDY_STORE_ORIGINAL_TEXT
332            added = yes;
333            AddCharToOriginalText(in, (tchar)c);
334#endif
335            in->tabs = tabsize > 0 ?
336                tabsize - ((in->curcol - 1) % tabsize) - 1
337                : 0;
338            in->curcol++;
339            c = ' ';
340            break;
341        }
342
343        /* #427663 - map '\r' to '\n' - Andy Quick 11 Aug 00 */
344        if (c == '\r')
345        {
346#ifdef TIDY_STORE_ORIGINAL_TEXT
347            added = yes;
348            AddCharToOriginalText(in, (tchar)c);
349#endif
350            c = ReadCharFromStream(in);
351            if (c != '\n')
352            {
353                TY_(UngetChar)( c, in );
354                c = '\n';
355            }
356            else
357            {
358#ifdef TIDY_STORE_ORIGINAL_TEXT
359                AddCharToOriginalText(in, (tchar)c);
360#endif
361            }
362            in->curcol = 1;
363            in->curline++;
364            break;
365        }
366
367#ifndef NO_NATIVE_ISO2022_SUPPORT
368        /* strip control characters, except for Esc */
369        if (c == '\033')
370            break;
371#endif
372
373        /* Form Feed is allowed in HTML */
374        if ( c == '\015' && !cfgBool(in->doc, TidyXmlTags) )
375            break;
376
377        if ( c < 32 )
378            continue; /* discard control char */
379
380        /* watch out for chars that have already been decoded such as */
381        /* IS02022, UTF-8 etc, that don't require further decoding */
382
383        if (
384            in->encoding == RAW
385#ifndef NO_NATIVE_ISO2022_SUPPORT
386         || in->encoding == ISO2022
387#endif
388         || in->encoding == UTF8
389
390#if SUPPORT_ASIAN_ENCODINGS
391         || in->encoding == SHIFTJIS /* #431953 - RJ */
392         || in->encoding == BIG5     /* #431953 - RJ */
393#endif
394           )
395        {
396            in->curcol++;
397            break;
398        }
399
400#if SUPPORT_UTF16_ENCODINGS
401        /* handle surrogate pairs */
402        if ( in->encoding == UTF16LE ||
403             in->encoding == UTF16   ||
404             in->encoding == UTF16BE )
405        {
406            if ( !TY_(IsValidUTF16FromUCS4)(c) )
407            {
408                /* invalid UTF-16 value */
409                TY_(ReportEncodingError)(in->doc, INVALID_UTF16, c, yes);
410                c = 0;
411            }
412            else if ( TY_(IsLowSurrogate)(c) )
413            {
414                uint n = c;
415                uint m = ReadCharFromStream( in );
416                if ( m == EndOfStream )
417                   return EndOfStream;
418
419                c = 0;
420                if ( TY_(IsHighSurrogate)(m) )
421                {
422                    n = TY_(CombineSurrogatePair)( m, n );
423                    if ( TY_(IsValidCombinedChar)(n) )
424                        c = n;
425                }
426                /* not a valid pair */
427                if ( 0 == c )
428                    TY_(ReportEncodingError)( in->doc, INVALID_UTF16, c, yes );
429            }
430        }
431#endif
432
433        /* Do first: acts on range 128 - 255 */
434        switch ( in->encoding )
435        {
436        case MACROMAN:
437            c = TY_(DecodeMacRoman)( c );
438            break;
439        case IBM858:
440            c = DecodeIbm850( c );
441            break;
442        case LATIN0:
443            c = DecodeLatin0( c );
444            break;
445        }
446
447        /* produced e.g. as a side-effect of smart quotes in Word */
448        /* but can't happen if using MACROMAN encoding */
449        if ( 127 < c && c < 160 )
450        {
451            uint c1 = 0, replMode = DISCARDED_CHAR;
452            Bool isVendorChar = ( in->encoding == WIN1252 ||
453                                  in->encoding == MACROMAN );
454            Bool isWinChar    = ( in->encoding == WIN1252 ||
455                                  TY_(ReplacementCharEncoding) == WIN1252 );
456            Bool isMacChar    = ( in->encoding == MACROMAN ||
457                                  TY_(ReplacementCharEncoding) == MACROMAN );
458
459            /* set error position just before offending character */
460            if (in->doc->lexer)
461            {
462                in->doc->lexer->lines = in->curline;
463                in->doc->lexer->columns = in->curcol;
464            }
465
466            if ( isWinChar )
467                c1 = TY_(DecodeWin1252)( c );
468            else if ( isMacChar )
469                c1 = TY_(DecodeMacRoman)( c );
470            if ( c1 )
471                replMode = REPLACED_CHAR;
472
473            if ( c1 == 0 && isVendorChar )
474                TY_(ReportEncodingError)(in->doc, VENDOR_SPECIFIC_CHARS, c, replMode == DISCARDED_CHAR);
475            else if ( ! isVendorChar )
476                TY_(ReportEncodingError)(in->doc, INVALID_SGML_CHARS, c, replMode == DISCARDED_CHAR);
477
478            c = c1;
479        }
480
481        if ( c == 0 )
482            continue; /* illegal char is discarded */
483
484        in->curcol++;
485        break;
486    }
487
488#ifdef TIDY_STORE_ORIGINAL_TEXT
489    if (!added)
490        AddCharToOriginalText(in, (tchar)c);
491#endif
492
493    return c;
494}
495
496static uint PopChar( StreamIn *in )
497{
498    uint c = EndOfStream;
499    if ( in->pushed )
500    {
501        assert( in->bufpos > 0 );
502        c = in->charbuf[ --in->bufpos ];
503        if ( in->bufpos == 0 )
504            in->pushed = no;
505
506        if ( c == '\n' )
507        {
508            in->curcol = 1;
509            in->curline++;
510            return c;
511        }
512        in->curcol++;
513    }
514    return c;
515}
516
517void TY_(UngetChar)( uint c, StreamIn *in )
518{
519    if (c == EndOfStream)
520    {
521        /* fprintf(stderr, "Attempt to UngetChar EOF\n"); */
522        return;
523    }
524
525    in->pushed = yes;
526
527    if (in->bufpos + 1 >= in->bufsize)
528        in->charbuf = (tchar*)MemRealloc(in->charbuf, sizeof(tchar) * ++(in->bufsize));
529
530    in->charbuf[(in->bufpos)++] = c;
531
532    if (c == '\n')
533        --(in->curline);
534
535    in->curcol = in->lastcol;
536}
537
538
539
540/************************
541** Sink
542************************/
543
544static StreamOut* initStreamOut( int encoding, uint nl )
545{
546    StreamOut* out = (StreamOut*) MemAlloc( sizeof(StreamOut) );
547    ClearMemory( out, sizeof(StreamOut) );
548    out->encoding = encoding;
549    out->state = FSM_ASCII;
550    out->nl = nl;
551    return out;
552}
553
554StreamOut* TY_(FileOutput)( FILE* fp, int encoding, uint nl )
555{
556    StreamOut* out = initStreamOut( encoding, nl );
557    TY_(initFileSink)( &out->sink, fp );
558    out->iotype = FileIO;
559    return out;
560}
561StreamOut* TY_(BufferOutput)( TidyBuffer* buf, int encoding, uint nl )
562{
563    StreamOut* out = initStreamOut( encoding, nl );
564    tidyInitOutputBuffer( &out->sink, buf );
565    out->iotype = BufferIO;
566    return out;
567}
568StreamOut* TY_(UserOutput)( TidyOutputSink* sink, int encoding, uint nl )
569{
570    StreamOut* out = initStreamOut( encoding, nl );
571    memcpy( &out->sink, sink, sizeof(TidyOutputSink) );
572    out->iotype = UserIO;
573    return out;
574}
575
576void TY_(WriteChar)( uint c, StreamOut* out )
577{
578    /* Translate outgoing newlines */
579    if ( LF == c )
580    {
581      if ( out->nl == TidyCRLF )
582          TY_(WriteChar)( CR, out );
583      else if ( out->nl == TidyCR )
584          c = CR;
585    }
586
587    if (out->encoding == MACROMAN)
588    {
589        EncodeMacRoman( c, out );
590    }
591    else if (out->encoding == WIN1252)
592    {
593        EncodeWin1252( c, out );
594    }
595    else if (out->encoding == IBM858)
596    {
597        EncodeIbm858( c, out );
598    }
599    else if (out->encoding == LATIN0)
600    {
601        EncodeLatin0( c, out );
602    }
603
604    else if (out->encoding == UTF8)
605    {
606        int count = 0;
607
608        TY_(EncodeCharToUTF8Bytes)( c, NULL, &out->sink, &count );
609        if (count <= 0)
610        {
611          /* TY_(ReportEncodingError)(in->lexer, INVALID_UTF8 | REPLACED_CHAR, c); */
612            /* replacement char 0xFFFD encoded as UTF-8 */
613            PutByte(0xEF, out); PutByte(0xBF, out); PutByte(0xBF, out);
614        }
615    }
616#ifndef NO_NATIVE_ISO2022_SUPPORT
617    else if (out->encoding == ISO2022)
618    {
619        if (c == 0x1b)  /* ESC */
620            out->state = FSM_ESC;
621        else
622        {
623            switch (out->state)
624            {
625            case FSM_ESC:
626                if (c == '$')
627                    out->state = FSM_ESCD;
628                else if (c == '(')
629                    out->state = FSM_ESCP;
630                else
631                    out->state = FSM_ASCII;
632                break;
633
634            case FSM_ESCD:
635                if (c == '(')
636                    out->state = FSM_ESCDP;
637                else
638                    out->state = FSM_NONASCII;
639                break;
640
641            case FSM_ESCDP:
642                out->state = FSM_NONASCII;
643                break;
644
645            case FSM_ESCP:
646                out->state = FSM_ASCII;
647                break;
648
649            case FSM_NONASCII:
650                c &= 0x7F;
651                break;
652            }
653        }
654
655        PutByte(c, out);
656    }
657#endif /* NO_NATIVE_ISO2022_SUPPORT */
658
659#if SUPPORT_UTF16_ENCODINGS
660    else if ( out->encoding == UTF16LE ||
661              out->encoding == UTF16BE ||
662              out->encoding == UTF16 )
663    {
664        int i, numChars = 1;
665        uint theChars[2];
666
667        if ( !TY_(IsValidUTF16FromUCS4)(c) )
668        {
669            /* invalid UTF-16 value */
670            /* TY_(ReportEncodingError)(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c); */
671            c = 0;
672            numChars = 0;
673        }
674        else if ( TY_(IsCombinedChar)(c) )
675        {
676            /* output both, unless something goes wrong */
677            numChars = 2;
678            if ( !TY_(SplitSurrogatePair)(c, &theChars[0], &theChars[1]) )
679            {
680                /* TY_(ReportEncodingError)(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c); */
681                c = 0;
682                numChars = 0;
683            }
684        }
685        else
686        {
687            /* just put the char out */
688            theChars[0] = c;
689        }
690
691        for (i = 0; i < numChars; i++)
692        {
693            c = theChars[i];
694
695            if (out->encoding == UTF16LE)
696            {
697                uint ch = c & 0xFF; PutByte(ch, out);
698                ch = (c >> 8) & 0xFF; PutByte(ch, out);
699            }
700
701            else if (out->encoding == UTF16BE || out->encoding == UTF16)
702            {
703                uint ch = (c >> 8) & 0xFF; PutByte(ch, out);
704                ch = c & 0xFF; PutByte(ch, out);
705            }
706        }
707    }
708#endif
709
710#if SUPPORT_ASIAN_ENCODINGS
711    else if (out->encoding == BIG5 || out->encoding == SHIFTJIS)
712    {
713        if (c < 128)
714            PutByte(c, out);
715        else
716        {
717            uint ch = (c >> 8) & 0xFF; PutByte(ch, out);
718            ch = c & 0xFF; PutByte(ch, out);
719        }
720    }
721#endif
722
723    else
724        PutByte( c, out );
725}
726
727
728
729/****************************
730** Miscellaneous / Helpers
731****************************/
732
/* Character encoding used when replacing illegal SGML chars
** (the 128-159 range, see TY_(ReadChar)), regardless of the input
** encoding specified.  Fixed at compile time through
** DFLT_REPL_CHARENC; TY_(ReadChar) compares it against WIN1252 and
** MACROMAN.
*/
const int TY_(ReplacementCharEncoding) = DFLT_REPL_CHARENC;
738
739
/* Mapping for Windows Western character set CP 1252
** (chars 128-159/U+0080-U+009F) to Unicode.
** Indexed by (byte - 128).  Entries of 0x0000 mark byte values with
** no assigned character (0x81, 0x8D, 0x8F, 0x90 and 0x9D).
*/
static const uint Win2Unicode[32] =
{
    0x20AC, 0x0000, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x0000, 0x017D, 0x0000,
    0x0000, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x0000, 0x017E, 0x0178
};
750
751/* Function for conversion from Windows-1252 to Unicode */
752uint TY_(DecodeWin1252)(uint c)
753{
754    if (127 < c && c < 160)
755        c = Win2Unicode[c - 128];
756
757    return c;
758}
759
760static void EncodeWin1252( uint c, StreamOut* out )
761{
762    if (c < 128 || (c > 159 && c < 256))
763        PutByte(c, out);
764    else
765    {
766        int i;
767
768        for (i = 128; i < 160; i++)
769            if (Win2Unicode[i - 128] == c)
770            {
771                PutByte(i, out);
772                break;
773            }
774    }
775}
776
/*
   John Love-Jensen contributed this table for mapping MacRoman
   character set to Unicode
*/

/* modified to only need chars 128-255/U+0080-U+00FF - Terry Teague 19 Aug 01 */
/* Indexed by (byte - 128); each row of the initializer covers eight
   consecutive byte values, starting at 0x80. */
static const uint Mac2Unicode[128] =
{
    /* x7F = DEL */

    0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1,
    0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8,

    0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3,
    0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC,

    0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF,
    0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8,

    0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211,
                                            /* =BD U+2126 OHM SIGN */
    0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8,

    0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB,
    0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153,

    0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA,
                            /* =DB U+00A4 CURRENCY SIGN */
    0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02,

    0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1,
    0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4,
    /* xF0 = Apple Logo */
    /* =F0 U+2665 BLACK HEART SUIT */
    0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC,
    0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7
};
814
815/* Function to convert from MacRoman to Unicode */
816uint TY_(DecodeMacRoman)(uint c)
817{
818    if (127 < c)
819        c = Mac2Unicode[c - 128];
820    return c;
821}
822
823static void EncodeMacRoman( uint c, StreamOut* out )
824{
825        if (c < 128)
826            PutByte(c, out);
827        else
828        {
829            /* For mac users, map Unicode back to MacRoman. */
830            int i;
831            for (i = 128; i < 256; i++)
832            {
833                if (Mac2Unicode[i - 128] == c)
834                {
835                    PutByte(i, out);
836                    break;
837                }
838            }
839        }
840}
841
/* Mapping for OS/2 Western character set CP 850
** (chars 128-255) to Unicode.
** Indexed by (byte - 128).  Note that byte 0xD5 maps to U+20AC (the
** Euro sign), i.e. the table is the CP 858 variant ("IBM850+Euro")
** that EncodeIbm858 refers to.
*/
static const uint IBM2Unicode[128] =
{
    0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7,
    0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5,
    0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9,
    0x00FF, 0x00D6, 0x00DC, 0x00F8, 0x00A3, 0x00D8, 0x00D7, 0x0192,
    0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA,
    0x00BF, 0x00AE, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB,
    0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00C1, 0x00C2, 0x00C0,
    0x00A9, 0x2563, 0x2551, 0x2557, 0x255D, 0x00A2, 0x00A5, 0x2510,
    0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x00E3, 0x00C3,
    0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4,
    0x00f0, 0x00d0, 0x00ca, 0x00cb, 0x00c8, 0x20AC, 0x00cd, 0x00ce,
    0x00cf, 0x2518, 0x250c, 0x2588, 0x2584, 0x00a6, 0x00cc, 0x2580,
    0x00d3, 0x00df, 0x00d4, 0x00d2, 0x00f5, 0x00d5, 0x00b5, 0x00fe,
    0x00de, 0x00da, 0x00db, 0x00d9, 0x00fd, 0x00dd, 0x00af, 0x00b4,
    0x00ad, 0x00b1, 0x2017, 0x00be, 0x00b6, 0x00a7, 0x00f7, 0x00b8,
    0x00b0, 0x00a8, 0x00b7, 0x00b9, 0x00b3, 0x00b2, 0x25a0, 0x00a0
};
864
865/* Function for conversion from OS/2-850 to Unicode */
866static uint DecodeIbm850(uint c)
867{
868    if (127 < c && c < 256)
869        c = IBM2Unicode[c - 128];
870
871    return c;
872}
873
874/* For OS/2,Java users, map Unicode back to IBM858 (IBM850+Euro). */
875static void EncodeIbm858( uint c, StreamOut* out )
876{
877    if (c < 128)
878        PutByte(c, out);
879    else
880    {
881        int i;
882        for (i = 128; i < 256; i++)
883        {
884            if (IBM2Unicode[i - 128] == c)
885            {
886                PutByte(i, out);
887                break;
888            }
889        }
890    }
891}
892
893
894/* Convert from Latin0 (aka Latin9, ISO-8859-15) to Unicode */
895static uint DecodeLatin0(uint c)
896{
897    if (159 < c && c < 191)
898    {
899        switch (c)
900        {
901        case 0xA4: c = 0x20AC; break;
902        case 0xA6: c = 0x0160; break;
903        case 0xA8: c = 0x0161; break;
904        case 0xB4: c = 0x017D; break;
905        case 0xB8: c = 0x017E; break;
906        case 0xBC: c = 0x0152; break;
907        case 0xBD: c = 0x0153; break;
908        case 0xBE: c = 0x0178; break;
909        }
910    }
911    return c;
912}
913
914/* Map Unicode back to ISO-8859-15. */
915static void EncodeLatin0( uint c, StreamOut* out )
916{
917    switch (c)
918    {
919    case 0x20AC: c = 0xA4; break;
920    case 0x0160: c = 0xA6; break;
921    case 0x0161: c = 0xA8; break;
922    case 0x017D: c = 0xB4; break;
923    case 0x017E: c = 0xB8; break;
924    case 0x0152: c = 0xBC; break;
925    case 0x0153: c = 0xBD; break;
926    case 0x0178: c = 0xBE; break;
927    }
928    PutByte(c, out);
929}
930
/*
   Table to map symbol font characters to Unicode; undefined
   characters are mapped to 0x0000 and characters without any
   Unicode equivalent are mapped to '?' (0x003F). Is this appropriate?

   The table has 256 entries, indexed directly by the symbol font
   character code.  Its only user in this part of the file,
   DecodeSymbolFont, is compiled out with #if 0.
*/

static const uint Symbol2Unicode[] =
{
    0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
    0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F,

    0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
    0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F,

    0x0020, 0x0021, 0x2200, 0x0023, 0x2203, 0x0025, 0x0026, 0x220D,
    0x0028, 0x0029, 0x2217, 0x002B, 0x002C, 0x2212, 0x002E, 0x002F,

    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
    0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,

    0x2245, 0x0391, 0x0392, 0x03A7, 0x0394, 0x0395, 0x03A6, 0x0393,
    0x0397, 0x0399, 0x03D1, 0x039A, 0x039B, 0x039C, 0x039D, 0x039F,

    0x03A0, 0x0398, 0x03A1, 0x03A3, 0x03A4, 0x03A5, 0x03C2, 0x03A9,
    0x039E, 0x03A8, 0x0396, 0x005B, 0x2234, 0x005D, 0x22A5, 0x005F,

    0x00AF, 0x03B1, 0x03B2, 0x03C7, 0x03B4, 0x03B5, 0x03C6, 0x03B3,
    0x03B7, 0x03B9, 0x03D5, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BF,

    0x03C0, 0x03B8, 0x03C1, 0x03C3, 0x03C4, 0x03C5, 0x03D6, 0x03C9,
    0x03BE, 0x03C8, 0x03B6, 0x007B, 0x007C, 0x007D, 0x223C, 0x003F,

    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,

    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,

    0x00A0, 0x03D2, 0x2032, 0x2264, 0x2044, 0x221E, 0x0192, 0x2663,
    0x2666, 0x2665, 0x2660, 0x2194, 0x2190, 0x2191, 0x2192, 0x2193,

    0x00B0, 0x00B1, 0x2033, 0x2265, 0x00D7, 0x221D, 0x2202, 0x00B7,
    0x00F7, 0x2260, 0x2261, 0x2248, 0x2026, 0x003F, 0x003F, 0x21B5,

    0x2135, 0x2111, 0x211C, 0x2118, 0x2297, 0x2295, 0x2205, 0x2229,
    0x222A, 0x2283, 0x2287, 0x2284, 0x2282, 0x2286, 0x2208, 0x2209,

    0x2220, 0x2207, 0x00AE, 0x00A9, 0x2122, 0x220F, 0x221A, 0x22C5,
    0x00AC, 0x2227, 0x2228, 0x21D4, 0x21D0, 0x21D1, 0x21D2, 0x21D3,

    0x25CA, 0x2329, 0x00AE, 0x00A9, 0x2122, 0x2211, 0x003F, 0x003F,
    0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F,

    0x20AC, 0x232A, 0x222B, 0x2320, 0x003F, 0x2321, 0x003F, 0x003F,
    0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F
};
987
#if 0
/* Function to convert from Symbol Font chars to Unicode.
** Disabled.  Values above 255 are returned unchanged; everything else
** is looked up directly in Symbol2Unicode.
*/
uint DecodeSymbolFont(uint c)
{
    if (c > 255)
        return c;

    /* todo: add some error message */

    return Symbol2Unicode[c];
}
#endif
1000
1001
1002/* Facilitates user defined source by providing
1003** an entry point to marshal pointers-to-functions.
1004** Needed by .NET and possibly other language bindings.
1005*/
1006Bool TIDY_CALL tidyInitSource( TidyInputSource*  source,
1007                               void*             srcData,
1008                               TidyGetByteFunc   gbFunc,
1009                               TidyUngetByteFunc ugbFunc,
1010                               TidyEOFFunc       endFunc )
1011{
1012  Bool status = ( source && srcData && gbFunc && ugbFunc && endFunc );
1013
1014  if ( status )
1015  {
1016    source->sourceData = srcData;
1017    source->getByte    = gbFunc;
1018    source->ungetByte  = ugbFunc;
1019    source->eof        = endFunc;
1020  }
1021
1022  return status;
1023}
1024
1025Bool TIDY_CALL tidyInitSink( TidyOutputSink* sink,
1026                             void*           snkData,
1027                             TidyPutByteFunc pbFunc )
1028{
1029  Bool status = ( sink && snkData && pbFunc );
1030  if ( status )
1031  {
1032    sink->sinkData = snkData;
1033    sink->putByte  = pbFunc;
1034  }
1035  return status;
1036}
1037
1038/* GetByte must return a byte value in a signed
1039** integer so that a negative value can signal EOF
1040** without interfering w/ 0-255 legitimate byte values.
1041*/
1042uint TIDY_CALL tidyGetByte( TidyInputSource* source )
1043{
1044  int bv = source->getByte( source->sourceData );
1045  return (uint) bv;
1046}
1047Bool TIDY_CALL tidyIsEOF( TidyInputSource* source )
1048{
1049  return source->eof( source->sourceData );
1050}
1051void TIDY_CALL tidyUngetByte( TidyInputSource* source, uint ch )
1052{
1053    source->ungetByte( source->sourceData, (byte) ch );
1054}
1055void TIDY_CALL tidyPutByte( TidyOutputSink* sink, uint ch )
1056{
1057    sink->putByte( sink->sinkData, (byte) ch );
1058}
1059
1060static uint ReadByte( StreamIn* in )
1061{
1062    return tidyGetByte( &in->source );
1063}
1064Bool TY_(IsEOF)( StreamIn* in )
1065{
1066    return tidyIsEOF( &in->source );
1067}
1068static void UngetByte( StreamIn* in, uint byteValue )
1069{
1070    tidyUngetByte( &in->source, byteValue );
1071}
1072static void PutByte( uint byteValue, StreamOut* out )
1073{
1074    tidyPutByte( &out->sink, byteValue );
1075}
1076
1077#if 0
/* Disabled code.  Push *count bytes from `buf` back onto the input
** source, in array order.  If the stream reports EOF at byte i,
** *count is set to -i and the function returns early.
** NOTE(review): `in` is checked for NULL only in the EOF test; the
** ungetByte call below dereferences it unconditionally.
*/
static void UngetRawBytesToStream( StreamIn *in, byte* buf, int *count )
{
    int i;

    for (i = 0; i < *count; i++)
    {
        /* should never get here; testing for 0xFF, a valid char, is not a good idea */
        if ( in && TY_(IsEOF)(in) )
        {
            /* fprintf(stderr,"Attempt to unget EOF in UngetRawBytesToStream\n"); */
            *count = -i;
            return;
        }

        in->source.ungetByte( in->source.sourceData, buf[i] );
    }
}
1095
1096/*
1097   Read raw bytes from stream, return <= 0 if EOF; or if
1098   "unget" is true, Unget the bytes to re-synchronize the input stream
1099   Normally UTF-8 successor bytes are read using this routine.
1100*/
1101static void ReadRawBytesFromStream( StreamIn *in, byte* buf, int *count )
1102{
1103    int ix;
1104    for ( ix=0; ix < *count; ++ix )
1105    {
1106        if ( in->rawPushed )
1107        {
1108            buf[ix] = in->rawBytebuf[ --in->rawBufpos ];
1109            if ( in->rawBufpos == 0 )
1110                in->rawPushed = no;
1111        }
1112        else
1113        {
1114            if ( in->source.eof(in->source.sourceData) )
1115            {
1116                *count = -i;
1117                break;
1118            }
1119            buf[ix] = in->source.getByte( in->source.sourceData );
1120        }
1121    }
1122}
1123#endif /* 0 */
1124
1125/* read char from stream */
1126static uint ReadCharFromStream( StreamIn* in )
1127{
1128    uint c, n;
1129#ifdef TIDY_WIN32_MLANG_SUPPORT
1130    uint bytesRead = 0;
1131#endif
1132
1133    if ( TY_(IsEOF)(in) )
1134        return EndOfStream;
1135
1136    c = ReadByte( in );
1137
1138    if (c == EndOfStream)
1139        return c;
1140
1141#ifndef NO_NATIVE_ISO2022_SUPPORT
1142    /*
1143       A document in ISO-2022 based encoding uses some ESC sequences
1144       called "designator" to switch character sets. The designators
1145       defined and used in ISO-2022-JP are:
1146
1147        "ESC" + "(" + ?     for ISO646 variants
1148
1149        "ESC" + "$" + ?     and
1150        "ESC" + "$" + "(" + ?   for multibyte character sets
1151
1152       Where ? stands for a single character used to indicate the
1153       character set for multibyte characters.
1154
1155       Tidy handles this by preserving the escape sequence and
1156       setting the top bit of each byte for non-ascii chars. This
1157       bit is then cleared on output. The input stream keeps track
1158       of the state to determine when to set/clear the bit.
1159    */
1160
1161    if (in->encoding == ISO2022)
1162    {
1163        if (c == 0x1b)  /* ESC */
1164        {
1165            in->state = FSM_ESC;
1166            return c;
1167        }
1168
1169        switch (in->state)
1170        {
1171        case FSM_ESC:
1172            if (c == '$')
1173                in->state = FSM_ESCD;
1174            else if (c == '(')
1175                in->state = FSM_ESCP;
1176            else
1177                in->state = FSM_ASCII;
1178            break;
1179
1180        case FSM_ESCD:
1181            if (c == '(')
1182                in->state = FSM_ESCDP;
1183            else
1184                in->state = FSM_NONASCII;
1185            break;
1186
1187        case FSM_ESCDP:
1188            in->state = FSM_NONASCII;
1189            break;
1190
1191        case FSM_ESCP:
1192            in->state = FSM_ASCII;
1193            break;
1194
1195        case FSM_NONASCII:
1196            c |= 0x80;
1197            break;
1198        }
1199
1200        return c;
1201    }
1202#endif /* #ifndef NO_NATIVE_ISO2022_SUPPORT */
1203
1204#if SUPPORT_UTF16_ENCODINGS
1205    if ( in->encoding == UTF16LE )
1206    {
1207        uint c1 = ReadByte( in );
1208        if ( EndOfStream == c1 )
1209            return EndOfStream;
1210        n = (c1 << 8) + c;
1211        return n;
1212    }
1213
1214    if ((in->encoding == UTF16) || (in->encoding == UTF16BE)) /* UTF-16 is big-endian by default */
1215    {
1216        uint c1 = ReadByte( in );
1217        if ( EndOfStream == c1 )
1218            return EndOfStream;
1219        n = (c << 8) + c1;
1220        return n;
1221    }
1222#endif
1223
1224    if ( in->encoding == UTF8 )
1225    {
1226        /* deal with UTF-8 encoded char */
1227
1228        int err, count = 0;
1229
1230        /* first byte "c" is passed in separately */
1231        err = TY_(DecodeUTF8BytesToChar)( &n, c, NULL, &in->source, &count );
1232        if (!err && (n == (uint)EndOfStream) && (count == 1)) /* EOF */
1233            return EndOfStream;
1234        else if (err)
1235        {
1236            /* set error position just before offending character */
1237            in->doc->lexer->lines = in->curline;
1238            in->doc->lexer->columns = in->curcol;
1239
1240            TY_(ReportEncodingError)(in->doc, INVALID_UTF8, n, no);
1241            n = 0xFFFD; /* replacement char */
1242        }
1243
1244        return n;
1245    }
1246
1247#if SUPPORT_ASIAN_ENCODINGS
1248    /*
1249       This section is suitable for any "multibyte" variable-width
1250       character encoding in which a one-byte code is less than
1251       128, and the first byte of a two-byte code is greater or
1252       equal to 128. Note that Big5 and ShiftJIS fit into this
1253       kind, even though their second byte may be less than 128
1254    */
1255    if ((in->encoding == BIG5) || (in->encoding == SHIFTJIS))
1256    {
1257        if (c < 128)
1258            return c;
1259        else if ((in->encoding == SHIFTJIS) && (c >= 0xa1 && c <= 0xdf)) /* 461643 - fix suggested by Rick Cameron 14 Sep 01 */
1260        {
1261            /*
1262              Rick Cameron pointed out that for Shift_JIS, the values from
1263              0xa1 through 0xdf represent singe-byte characters
1264              (U+FF61 to U+FF9F - half-shift Katakana)
1265            */
1266            return c;
1267        }
1268        else
1269        {
1270            uint c1 = ReadByte( in );
1271            if ( EndOfStream == c1 )
1272                return EndOfStream;
1273            n = (c << 8) + c1;
1274            return n;
1275        }
1276    }
1277#endif
1278
1279#ifdef TIDY_WIN32_MLANG_SUPPORT
1280    else if (in->encoding > WIN32MLANG)
1281    {
1282        assert( in->mlang != 0 );
1283        return TY_(Win32MLangGetChar)((byte)c, in, &bytesRead);
1284    }
1285#endif
1286
1287    else
1288        n = c;
1289
1290    return n;
1291}
1292
1293/* Output a Byte Order Mark if required */
1294void TY_(outBOM)( StreamOut *out )
1295{
1296    if ( out->encoding == UTF8
1297#if SUPPORT_UTF16_ENCODINGS
1298         || out->encoding == UTF16LE
1299         || out->encoding == UTF16BE
1300         || out->encoding == UTF16
1301#endif
1302       )
1303    {
1304        /* this will take care of encoding the BOM correctly */
1305        TY_(WriteChar)( UNICODE_BOM, out );
1306    }
1307}
1308
1309/* this is in intermediate fix for various problems in the */
1310/* long term code and data in charsets.c should be used    */
1311static struct _enc2iana
1312{
1313    uint id;
1314    ctmbstr name;
1315    ctmbstr tidyOptName;
1316} const enc2iana[] =
1317{
1318  { ASCII,    "us-ascii",     "ascii"   },
1319  { LATIN0,   "iso-8859-15",  "latin0"  },
1320  { LATIN1,   "iso-8859-1",   "latin1"  },
1321  { UTF8,     "utf-8",        "utf8"   },
1322  { MACROMAN, "macintosh",    "mac"     },
1323  { WIN1252,  "windows-1252", "win1252" },
1324  { IBM858,   "ibm00858",     "ibm858"  },
1325#if SUPPORT_UTF16_ENCODINGS
1326  { UTF16LE,  "utf-16",       "utf16le" },
1327  { UTF16BE,  "utf-16",       "utf16be" },
1328  { UTF16,    "utf-16",       "utf16"   },
1329#endif
1330#if SUPPORT_ASIAN_ENCODINGS
1331  { BIG5,     "big5",         "big5"    },
1332  { SHIFTJIS, "shift_jis",    "shiftjis"},
1333#endif
1334#ifndef NO_NATIVE_ISO2022_SUPPORT
1335  { ISO2022,  NULL,           "iso2022" },
1336#endif
1337  { RAW,      NULL,           "raw"     }
1338};
1339
1340ctmbstr TY_(GetEncodingNameFromTidyId)(uint id)
1341{
1342    uint i;
1343
1344    for (i = 0; enc2iana[i].name; ++i)
1345        if (enc2iana[i].id == id)
1346            return enc2iana[i].name;
1347
1348    return NULL;
1349}
1350
1351ctmbstr TY_(GetEncodingOptNameFromTidyId)(uint id)
1352{
1353    uint i;
1354
1355    for (i = 0; i < sizeof(enc2iana)/sizeof(enc2iana[0]); ++i)
1356        if (enc2iana[i].id == id)
1357            return enc2iana[i].tidyOptName;
1358
1359    return NULL;
1360}
1361
1362int TY_(GetCharEncodingFromOptName)( ctmbstr charenc )
1363{
1364    uint i;
1365
1366    for (i = 0; i < sizeof(enc2iana)/sizeof(enc2iana[0]); ++i)
1367        if (TY_(tmbstrcasecmp)(charenc, enc2iana[i].tidyOptName) == 0 )
1368            return enc2iana[i].id;
1369
1370    return -1;
1371}
1372
1373/*
1374 * local variables:
1375 * mode: c
1376 * indent-tabs-mode: nil
1377 * c-basic-offset: 4
1378 * eval: (c-set-offset 'substatement-open 0)
1379 * end:
1380 */
1381