1#ifndef __STREAMIO_H__ 2#define __STREAMIO_H__ 3 4/* streamio.h -- handles character stream I/O 5 6 (c) 1998-2006 (W3C) MIT, ERCIM, Keio University 7 See tidy.h for the copyright notice. 8 9 CVS Info : 10 11 $Author$ 12 $Date$ 13 $Revision$ 14 15 Wrapper around Tidy input source and output sink 16 that calls appropriate interfaces, and applies 17 necessary char encoding transformations: to/from 18 ISO-10646 and/or UTF-8. 19 20*/ 21 22#include "forward.h" 23#include "buffio.h" 24#include "fileio.h" 25 26#ifdef __cplusplus 27extern "C" 28{ 29#endif 30typedef enum 31{ 32 FileIO, 33 BufferIO, 34 UserIO 35} IOType; 36 37/* states for ISO 2022 38 39 A document in ISO-2022 based encoding uses some ESC sequences called 40 "designator" to switch character sets. The designators defined and 41 used in ISO-2022-JP are: 42 43 "ESC" + "(" + ? for ISO646 variants 44 45 "ESC" + "$" + ? and 46 "ESC" + "$" + "(" + ? for multibyte character sets 47*/ 48typedef enum 49{ 50 FSM_ASCII, 51 FSM_ESC, 52 FSM_ESCD, 53 FSM_ESCDP, 54 FSM_ESCP, 55 FSM_NONASCII 56} ISO2022State; 57 58/************************ 59** Source 60************************/ 61 62#define CHARBUF_SIZE 5 63 64/* non-raw input is cleaned up*/ 65struct _StreamIn 66{ 67 ISO2022State state; /* FSM for ISO2022 */ 68 Bool pushed; 69 tchar* charbuf; 70 uint bufpos; 71 uint bufsize; 72 int tabs; 73 int lastcol; 74 int curcol; 75 int curline; 76 int encoding; 77 IOType iotype; 78 79 TidyInputSource source; 80 81#ifdef TIDY_WIN32_MLANG_SUPPORT 82 ulong mlang; 83#endif 84 85#ifdef TIDY_STORE_ORIGINAL_TEXT 86 tmbstr otextbuf; 87 size_t otextsize; 88 uint otextlen; 89#endif 90 91 /* Pointer back to document for error reporting */ 92 TidyDocImpl* doc; 93}; 94 95StreamIn* TY_(initStreamIn)( TidyDocImpl* doc, int encoding ); 96void TY_(freeStreamIn)(StreamIn* in); 97 98StreamIn* TY_(FileInput)( TidyDocImpl* doc, FILE* fp, int encoding ); 99StreamIn* TY_(BufferInput)( TidyDocImpl* doc, TidyBuffer* content, int encoding ); 100StreamIn* TY_(UserInput)( TidyDocImpl* doc, TidyInputSource* source, int encoding ); 101 102int TY_(ReadBOMEncoding)(StreamIn *in); 103uint TY_(ReadChar)( StreamIn* in ); 104void TY_(UngetChar)( uint c, StreamIn* in ); 105Bool TY_(IsEOF)( StreamIn* in ); 106 107 108/************************ 109** Sink 110************************/ 111 112struct _StreamOut 113{ 114 int encoding; 115 ISO2022State state; /* for ISO 2022 */ 116 uint nl; 117 118#ifdef TIDY_WIN32_MLANG_SUPPORT 119 ulong mlang; 120#endif 121 122 IOType iotype; 123 TidyOutputSink sink; 124}; 125 126StreamOut* TY_(FileOutput)( FILE* fp, int encoding, uint newln ); 127StreamOut* TY_(BufferOutput)( TidyBuffer* buf, int encoding, uint newln ); 128StreamOut* TY_(UserOutput)( TidyOutputSink* sink, int encoding, uint newln ); 129 130StreamOut* TY_(StdErrOutput)(void); 131/* StreamOut* StdOutOutput(void); */ 132void TY_(ReleaseStreamOut)( StreamOut* out ); 133 134void TY_(WriteChar)( uint c, StreamOut* out ); 135void TY_(outBOM)( StreamOut *out ); 136 137ctmbstr TY_(GetEncodingNameFromTidyId)(uint id); 138ctmbstr TY_(GetEncodingOptNameFromTidyId)(uint id); 139int TY_(GetCharEncodingFromOptName)(ctmbstr charenc); 140 141/************************ 142** Misc 143************************/ 144 145/* character encodings 146*/ 147#define RAW 0 148#define ASCII 1 149#define LATIN0 2 150#define LATIN1 3 151#define UTF8 4 152#define ISO2022 5 153#define MACROMAN 6 154#define WIN1252 7 155#define IBM858 8 156 157#if SUPPORT_UTF16_ENCODINGS 158#define UTF16LE 9 159#define UTF16BE 10 160#define UTF16 11 161#endif 162 163/* Note that Big5 and SHIFTJIS are not converted to ISO 10646 codepoints 164** (i.e., to Unicode) before being recoded into UTF-8. This may be 165** confusing: usually UTF-8 implies ISO10646 codepoints. 166*/ 167#if SUPPORT_ASIAN_ENCODINGS 168#if SUPPORT_UTF16_ENCODINGS 169#define BIG5 12 170#define SHIFTJIS 13 171#else 172#define BIG5 9 173#define SHIFTJIS 10 174#endif 175#endif 176 177#ifdef TIDY_WIN32_MLANG_SUPPORT 178/* hack: windows code page numbers start at 37 */ 179#define WIN32MLANG 36 180#endif 181 182 183/* char encoding used when replacing illegal SGML chars, 184** regardless of specified encoding. Set at compile time 185** to either Windows or Mac. 186*/ 187extern const int TY_(ReplacementCharEncoding); 188 189/* Function for conversion from Windows-1252 to Unicode */ 190uint TY_(DecodeWin1252)(uint c); 191 192/* Function to convert from MacRoman to Unicode */ 193uint TY_(DecodeMacRoman)(uint c); 194 195#ifdef __cplusplus 196} 197#endif 198 199 200/* Use numeric constants as opposed to escape chars (\r, \n) 201** to avoid conflict Mac compilers that may re-define these. 202*/ 203#define CR 0xD 204#define LF 0xA 205 206#if defined(MAC_OS_CLASSIC) 207#define DEFAULT_NL_CONFIG TidyCR 208#elif defined(_WIN32) || defined(OS2_OS) 209#define DEFAULT_NL_CONFIG TidyCRLF 210#else 211#define DEFAULT_NL_CONFIG TidyLF 212#endif 213 214 215#endif /* __STREAMIO_H__ */ 216