1#ifndef __STREAMIO_H__
2#define __STREAMIO_H__
3
/* streamio.h -- handles character stream I/O

  (c) 1998-2006 (W3C) MIT, ERCIM, Keio University
  See tidy.h for the copyright notice.

  CVS Info :

    $Author: iccir $
    $Date: 2007/01/30 23:46:52 $
    $Revision: 1.3 $

  Wrapper around Tidy input source and output sink
  that calls appropriate interfaces, and applies
  necessary char encoding transformations: to/from
  ISO-10646 and/or UTF-8.

*/
21
22#include "forward.h"
23#include "buffio.h"
24#include "fileio.h"
25
26#ifdef __cplusplus
27extern "C"
28{
29#endif
/* Kind of I/O backend attached to a stream.  Stored in the `iotype`
** member of both struct _StreamIn and struct _StreamOut.
**
** NOTE(review): the enumerator names suggest they pair with the
** File / Buffer / User stream constructors declared in this header;
** confirm against the implementation.
*/
typedef enum
{
  FileIO,       /* value 0 */
  BufferIO,     /* value 1 */
  UserIO        /* value 2 */
} IOType;
36
/* states for ISO 2022

 A document in an ISO-2022 based encoding uses ESC sequences called
 "designators" to switch character sets. The designators defined and
 used in ISO-2022-JP are:

    "ESC" + "(" + ?     for ISO646 variants

    "ESC" + "$" + ?     and
    "ESC" + "$" + "(" + ?   for multibyte character sets

 NOTE(review): the per-state remarks below are inferred from the state
 names and the designator list above; the transitions themselves live
 in the implementation and should be confirmed there.
*/
typedef enum
{
  FSM_ASCII,      /* presumably: plain single-byte text */
  FSM_ESC,        /* presumably: "ESC" seen */
  FSM_ESCD,       /* presumably: "ESC" + "$" seen */
  FSM_ESCDP,      /* presumably: "ESC" + "$" + "(" seen */
  FSM_ESCP,       /* presumably: "ESC" + "(" seen */
  FSM_NONASCII    /* presumably: inside a multibyte character set */
} ISO2022State;
57
/************************
** Source
************************/
61
/* NOTE(review): presumably the initial capacity (in tchar units) of
** StreamIn.charbuf -- confirm where the buffer is allocated.
*/
#define CHARBUF_SIZE 5
63
64/* non-raw input is cleaned up*/
65struct _StreamIn
66{
67    ISO2022State    state;     /* FSM for ISO2022 */
68    Bool   pushed;
69    tchar* charbuf;
70    uint   bufpos;
71    uint   bufsize;
72    int    tabs;
73    int    lastcol;
74    int    curcol;
75    int    curline;
76    int    encoding;
77    IOType iotype;
78
79    TidyInputSource source;
80
81#ifdef TIDY_WIN32_MLANG_SUPPORT
82    ulong  mlang;
83#endif
84
85#ifdef TIDY_STORE_ORIGINAL_TEXT
86    tmbstr otextbuf;
87    size_t otextsize;
88    uint   otextlen;
89#endif
90
91    /* Pointer back to document for error reporting */
92    TidyDocImpl* doc;
93};
94
95StreamIn* TY_(initStreamIn)( TidyDocImpl* doc, int encoding );
96void TY_(freeStreamIn)(StreamIn* in);
97
98StreamIn* TY_(FileInput)( TidyDocImpl* doc, FILE* fp, int encoding );
99StreamIn* TY_(BufferInput)( TidyDocImpl* doc, TidyBuffer* content, int encoding );
100StreamIn* TY_(UserInput)( TidyDocImpl* doc, TidyInputSource* source, int encoding );
101
102int       TY_(ReadBOMEncoding)(StreamIn *in);
103uint      TY_(ReadChar)( StreamIn* in );
104void      TY_(UngetChar)( uint c, StreamIn* in );
105Bool      TY_(IsEOF)( StreamIn* in );
106
107
/************************
** Sink
************************/
111
112struct _StreamOut
113{
114    int   encoding;
115    ISO2022State   state;     /* for ISO 2022 */
116    uint  nl;
117
118#ifdef TIDY_WIN32_MLANG_SUPPORT
119    ulong mlang;
120#endif
121
122    IOType iotype;
123    TidyOutputSink sink;
124};
125
126StreamOut* TY_(FileOutput)( FILE* fp, int encoding, uint newln );
127StreamOut* TY_(BufferOutput)( TidyBuffer* buf, int encoding, uint newln );
128StreamOut* TY_(UserOutput)( TidyOutputSink* sink, int encoding, uint newln );
129
130StreamOut* TY_(StdErrOutput)(void);
131/* StreamOut* StdOutOutput(void); */
132void       TY_(ReleaseStreamOut)( StreamOut* out );
133
134void TY_(WriteChar)( uint c, StreamOut* out );
135void TY_(outBOM)( StreamOut *out );
136
137ctmbstr TY_(GetEncodingNameFromTidyId)(uint id);
138ctmbstr TY_(GetEncodingOptNameFromTidyId)(uint id);
139int TY_(GetCharEncodingFromOptName)(ctmbstr charenc);
140
/************************
** Misc
************************/
144
/* character encodings
**
** Integer IDs.  0..8 are always defined; the UTF-16 and Asian IDs are
** only defined when the corresponding SUPPORT_* macro is non-zero.
*/
#define RAW         0
#define ASCII       1
#define LATIN0      2
#define LATIN1      3
#define UTF8        4
#define ISO2022     5
#define MACROMAN    6
#define WIN1252     7
#define IBM858      8

#if SUPPORT_UTF16_ENCODINGS
#define UTF16LE     9
#define UTF16BE     10
#define UTF16       11
#endif

/* Note that Big5 and SHIFTJIS are not converted to ISO 10646 codepoints
** (i.e., to Unicode) before being recoded into UTF-8. This may be
** confusing: usually UTF-8 implies ISO10646 codepoints.
**
** The IDs continue directly after the last ID defined above, so their
** values depend on whether the UTF-16 IDs (9..11) are present.
*/
#if SUPPORT_ASIAN_ENCODINGS
#if SUPPORT_UTF16_ENCODINGS
#define BIG5        12
#define SHIFTJIS    13
#else
#define BIG5        9
#define SHIFTJIS    10
#endif
#endif

#ifdef TIDY_WIN32_MLANG_SUPPORT
/* hack: windows code page numbers start at 37 */
#define WIN32MLANG  36
#endif
181
182
/* char encoding used when replacing illegal SGML chars,
** regardless of specified encoding.  Set at compile time
** to either Windows or Mac.
**
** Declared here only; the single definition lives in a source file.
*/
extern const int TY_(ReplacementCharEncoding);
188
189/* Function for conversion from Windows-1252 to Unicode */
190uint TY_(DecodeWin1252)(uint c);
191
192/* Function to convert from MacRoman to Unicode */
193uint TY_(DecodeMacRoman)(uint c);
194
195#ifdef __cplusplus
196}
197#endif
198
199
/* Use numeric constants as opposed to escape chars (\r, \n)
** to avoid conflict with Mac compilers that may re-define these.
*/
#define CR    0xD
#define LF    0xA
205
/* Platform default for the newline configuration:
**   MAC_OS_CLASSIC        -> TidyCR
**   _WIN32 or OS2_OS      -> TidyCRLF
**   anything else         -> TidyLF
** The Tidy* names are not declared in this header; they must be in
** scope wherever DEFAULT_NL_CONFIG is expanded.
*/
#if   defined(MAC_OS_CLASSIC)
#define DEFAULT_NL_CONFIG TidyCR
#elif defined(_WIN32) || defined(OS2_OS)
#define DEFAULT_NL_CONFIG TidyCRLF
#else
#define DEFAULT_NL_CONFIG TidyLF
#endif
213
214
215#endif /* __STREAMIO_H__ */
216