1///////////////////////////////////////////////////////////////////////////// 2// Name: htmlpars.h 3// Purpose: wxHtmlParser class (generic parser) 4// Author: Vaclav Slavik 5// RCS-ID: $Id: htmlpars.h 49563 2007-10-31 20:46:21Z VZ $ 6// Copyright: (c) 1999 Vaclav Slavik 7// Licence: wxWindows licence 8///////////////////////////////////////////////////////////////////////////// 9 10#ifndef _WX_HTMLPARS_H_ 11#define _WX_HTMLPARS_H_ 12 13#include "wx/defs.h" 14#if wxUSE_HTML 15 16#include "wx/html/htmltag.h" 17#include "wx/filesys.h" 18#include "wx/hash.h" 19#include "wx/fontenc.h" 20 21class WXDLLIMPEXP_FWD_BASE wxMBConv; 22class WXDLLIMPEXP_FWD_HTML wxHtmlParser; 23class WXDLLIMPEXP_FWD_HTML wxHtmlTagHandler; 24class WXDLLIMPEXP_FWD_HTML wxHtmlEntitiesParser; 25 26class wxHtmlTextPieces; 27class wxHtmlParserState; 28 29 30enum wxHtmlURLType 31{ 32 wxHTML_URL_PAGE, 33 wxHTML_URL_IMAGE, 34 wxHTML_URL_OTHER 35}; 36 37// This class handles generic parsing of HTML document : it scans 38// the document and divides it into blocks of tags (where one block 39// consists of starting and ending tag and of text between these 40// 2 tags. 41class WXDLLIMPEXP_HTML wxHtmlParser : public wxObject 42{ 43 DECLARE_ABSTRACT_CLASS(wxHtmlParser) 44 45public: 46 wxHtmlParser(); 47 virtual ~wxHtmlParser(); 48 49 // Sets the class which will be used for opening files 50 void SetFS(wxFileSystem *fs) { m_FS = fs; } 51 52 wxFileSystem* GetFS() const { return m_FS; } 53 54 // Opens file if the parser is allowed to open given URL (may be forbidden 55 // for security reasons) 56 virtual wxFSFile *OpenURL(wxHtmlURLType type, const wxString& url) const; 57 58 // You can simply call this method when you need parsed output. 59 // This method does these things: 60 // 1. call InitParser(source); 61 // 2. call DoParsing(); 62 // 3. call GetProduct(); (its return value is then returned) 63 // 4. call DoneParser(); 64 wxObject* Parse(const wxString& source); 65 66 // Sets the source. This must be called before running Parse() method. 67 virtual void InitParser(const wxString& source); 68 // This must be called after Parse(). 69 virtual void DoneParser(); 70 71 // May be called during parsing to immediately return from Parse(). 72 virtual void StopParsing() { m_stopParsing = true; } 73 74 // Parses the m_Source from begin_pos to end_pos-1. 75 // (in noparams version it parses whole m_Source) 76 void DoParsing(int begin_pos, int end_pos); 77 void DoParsing(); 78 79 // Returns pointer to the tag at parser's current position 80 wxHtmlTag *GetCurrentTag() const { return m_CurTag; } 81 82 // Returns product of parsing 83 // Returned value is result of parsing of the part. The type of this result 84 // depends on internal representation in derived parser 85 // (see wxHtmlWinParser for details). 86 virtual wxObject* GetProduct() = 0; 87 88 // adds handler to the list & hash table of handlers. 89 virtual void AddTagHandler(wxHtmlTagHandler *handler); 90 91 // Forces the handler to handle additional tags (not returned by GetSupportedTags). 92 // The handler should already be in use by this parser. 93 // Example: you want to parse following pseudo-html structure: 94 // <myitems> 95 // <it name="one" value="1"> 96 // <it name="two" value="2"> 97 // </myitems> 98 // <it> This last it has different meaning, we don't want it to be parsed by myitems handler! 99 // handler can handle only 'myitems' (e.g. its GetSupportedTags returns "MYITEMS") 100 // you can call PushTagHandler(handler, "IT") when you find <myitems> 101 // and call PopTagHandler() when you find </myitems> 102 void PushTagHandler(wxHtmlTagHandler *handler, const wxString& tags); 103 104 // Restores state before last call to PushTagHandler 105 void PopTagHandler(); 106 107 wxString* GetSource() {return &m_Source;} 108 void SetSource(const wxString& src); 109 110 // Sets HTML source and remembers current parser's state so that it can 111 // later be restored. This is useful for on-line modifications of 112 // HTML source (for example, <pre> handler replaces spaces with 113 // and newlines with <br>) 114 virtual void SetSourceAndSaveState(const wxString& src); 115 // Restores parser's state from stack or returns false if the stack is 116 // empty 117 virtual bool RestoreState(); 118 119 // Returns HTML source inside the element (i.e. between the starting 120 // and ending tag) 121 wxString GetInnerSource(const wxHtmlTag& tag); 122 123 // Parses HTML string 'markup' and extracts charset info from <meta> tag 124 // if present. Returns empty string if the tag is missing. 125 // For wxHTML's internal use. 126 static wxString ExtractCharsetInformation(const wxString& markup); 127 128 // Returns entity parser object, used to substitute HTML &entities; 129 wxHtmlEntitiesParser *GetEntitiesParser() const { return m_entitiesParser; } 130 131protected: 132 // DOM structure 133 void CreateDOMTree(); 134 void DestroyDOMTree(); 135 void CreateDOMSubTree(wxHtmlTag *cur, 136 int begin_pos, int end_pos, 137 wxHtmlTagsCache *cache); 138 139 // Adds text to the output. 140 // This is called from Parse() and must be overriden in derived classes. 141 // txt is not guaranteed to be only one word. It is largest continuous part of text 142 // (= not broken by tags) 143 // NOTE : using char* because of speed improvements 144 virtual void AddText(const wxChar* txt) = 0; 145 146 // Adds tag and proceeds it. Parse() may (and usually is) called from this method. 147 // This is called from Parse() and may be overriden. 148 // Default behavior is that it looks for proper handler in m_Handlers. The tag is 149 // ignored if no hander is found. 150 // Derived class is *responsible* for filling in m_Handlers table. 151 virtual void AddTag(const wxHtmlTag& tag); 152 153protected: 154 // DOM tree: 155 wxHtmlTag *m_CurTag; 156 wxHtmlTag *m_Tags; 157 wxHtmlTextPieces *m_TextPieces; 158 size_t m_CurTextPiece; 159 160 wxString m_Source; 161 162 wxHtmlParserState *m_SavedStates; 163 164 // handlers that handle particular tags. The table is accessed by 165 // key = tag's name. 166 // This attribute MUST be filled by derived class otherwise it would 167 // be empty and no tags would be recognized 168 // (see wxHtmlWinParser for details about filling it) 169 // m_HandlersHash is for random access based on knowledge of tag name (BR, P, etc.) 170 // it may (and often does) contain more references to one object 171 // m_HandlersList is list of all handlers and it is guaranteed to contain 172 // only one reference to each handler instance. 173 wxList m_HandlersList; 174 wxHashTable m_HandlersHash; 175 176 DECLARE_NO_COPY_CLASS(wxHtmlParser) 177 178 // class for opening files (file system) 179 wxFileSystem *m_FS; 180 // handlers stack used by PushTagHandler and PopTagHandler 181 wxList *m_HandlersStack; 182 183 // entity parse 184 wxHtmlEntitiesParser *m_entitiesParser; 185 186 // flag indicating that the parser should stop 187 bool m_stopParsing; 188}; 189 190 191 192// This class (and derived classes) cooperates with wxHtmlParser. 193// Each recognized tag is passed to handler which is capable 194// of handling it. Each tag is handled in 3 steps: 195// 1. Handler will modifies state of parser 196// (using its public methods) 197// 2. Parser parses source between starting and ending tag 198// 3. Handler restores original state of the parser 199class WXDLLIMPEXP_HTML wxHtmlTagHandler : public wxObject 200{ 201 DECLARE_ABSTRACT_CLASS(wxHtmlTagHandler) 202 203public: 204 wxHtmlTagHandler() : wxObject () { m_Parser = NULL; } 205 206 // Sets the parser. 207 // NOTE : each _instance_ of handler is guaranteed to be called 208 // only by one parser. This means you don't have to care about 209 // reentrancy. 210 virtual void SetParser(wxHtmlParser *parser) 211 { m_Parser = parser; } 212 213 // Returns list of supported tags. The list is in uppercase and 214 // tags are delimited by ','. 215 // Example : "I,B,FONT,P" 216 // is capable of handling italic, bold, font and paragraph tags 217 virtual wxString GetSupportedTags() = 0; 218 219 // This is hadling core method. It does all the Steps 1-3. 220 // To process step 2, you can call ParseInner() 221 // returned value : true if it called ParseInner(), 222 // false etherwise 223 virtual bool HandleTag(const wxHtmlTag& tag) = 0; 224 225protected: 226 // parses input between beginning and ending tag. 227 // m_Parser must be set. 228 void ParseInner(const wxHtmlTag& tag) 229 { m_Parser->DoParsing(tag.GetBeginPos(), tag.GetEndPos1()); } 230 231 // Parses given source as if it was tag's inner code (see 232 // wxHtmlParser::GetInnerSource). Unlike ParseInner(), this method lets 233 // you specify the source code to parse. This is useful when you need to 234 // modify the inner text before parsing. 235 void ParseInnerSource(const wxString& source); 236 237 wxHtmlParser *m_Parser; 238 239 DECLARE_NO_COPY_CLASS(wxHtmlTagHandler) 240}; 241 242 243// This class is used to parse HTML entities in strings. It can handle 244// both named entities and &#xxxx entries where xxxx is Unicode code. 245class WXDLLIMPEXP_HTML wxHtmlEntitiesParser : public wxObject 246{ 247 DECLARE_DYNAMIC_CLASS(wxHtmlEntitiesParser) 248 249public: 250 wxHtmlEntitiesParser(); 251 virtual ~wxHtmlEntitiesParser(); 252 253 // Sets encoding of output string. 254 // Has no effect if wxUSE_WCHAR_T==0 or wxUSE_UNICODE==1 255 void SetEncoding(wxFontEncoding encoding); 256 257 // Parses entities in input and replaces them with respective characters 258 // (with respect to output encoding) 259 wxString Parse(const wxString& input); 260 261 // Returns character for given entity or 0 if the enity is unknown 262 wxChar GetEntityChar(const wxString& entity); 263 264 // Returns character that represents given Unicode code 265#if wxUSE_UNICODE 266 wxChar GetCharForCode(unsigned code) { return (wxChar)code; } 267#else 268 wxChar GetCharForCode(unsigned code); 269#endif 270 271protected: 272#if wxUSE_WCHAR_T && !wxUSE_UNICODE 273 wxMBConv *m_conv; 274 wxFontEncoding m_encoding; 275#endif 276 277 DECLARE_NO_COPY_CLASS(wxHtmlEntitiesParser) 278}; 279 280 281#endif 282 283#endif // _WX_HTMLPARS_H_ 284