1# -*- coding: iso-8859-1 -*-
""" A SAX2 driver for libxml2, on top of its XmlReader API
3
4USAGE
5    # put this file (drv_libxml2.py) in PYTHONPATH
6    import xml.sax
7    reader = xml.sax.make_parser(["drv_libxml2"])
8    # ...and the rest is standard python sax.
9
10CAVEATS
11    - Lexical handlers are supported, except for start/endEntity
12      (waiting for XmlReader.ResolveEntity) and start/endDTD
13    - Error callbacks are not exactly synchronous, they tend
14      to be invoked before the corresponding content callback,
15      because the underlying reader interface parses
16      data by chunks of 512 bytes
17
18TODO
19    - search for TODO
20    - some ErrorHandler events (warning)
21    - some ContentHandler events (setDocumentLocator, skippedEntity)
22    - EntityResolver (using libxml2.?)
23    - DTDHandler (if/when libxml2 exposes such node types)
24    - DeclHandler (if/when libxml2 exposes such node types)
25    - property_xml_string?
26    - feature_string_interning?
27    - Incremental parser
28    - additional performance tuning:
29      - one might cache callbacks to avoid some name lookups
30      - one might implement a smarter way to pass attributes to startElement
31        (some kind of lazy evaluation?)
32      - there might be room for improvement in start/endPrefixMapping
33      - other?
34
35"""
36
# The accented letter is written as an escape so the value does not depend
# on how this source file is (re-)encoded.
__author__  = u"St\xe9phane Bidoul <sbi@skynet.be>"
__version__ = "0.3"
39
40import codecs
41from types import StringType, UnicodeType
42StringTypes = (StringType,UnicodeType)
43
44from xml.sax._exceptions import *
45from xml.sax import xmlreader, saxutils
46from xml.sax.handler import \
47     feature_namespaces, \
48     feature_namespace_prefixes, \
49     feature_string_interning, \
50     feature_validation, \
51     feature_external_ges, \
52     feature_external_pes, \
53     property_lexical_handler, \
54     property_declaration_handler, \
55     property_dom_node, \
56     property_xml_string
57
58# libxml2 returns strings as UTF8
59_decoder = codecs.lookup("utf8")[1]
60def _d(s):
61    if s is None:
62        return s
63    else:
64        return _decoder(s)[0]
65
# Report a missing libxml2 binding with the SAX exception type meant for
# unavailable drivers rather than a bare ImportError.
try:
    import libxml2
except ImportError as e:
    raise SAXReaderNotAvailable("libxml2 not available: " \
                                "import error was: %s" % e)
71
class Locator(xmlreader.Locator):
    """Expose a libxml2.xmlTextReaderLocator through the SAX Locator API."""

    def __init__(self, locator):
        # wrapped libxml2 locator, queried on demand by the getters
        self.__locator = locator

    def getLineNumber(self):
        """Line number, as given by the wrapped locator's LineNumber()."""
        wrapped = self.__locator
        return wrapped.LineNumber()

    def getColumnNumber(self):
        """Always -1: this adapter does not report a column."""
        return -1

    def getSystemId(self):
        """System identifier, taken from the wrapped locator's BaseURI()."""
        wrapped = self.__locator
        return wrapped.BaseURI()

    def getPublicId(self):
        """Always None: this adapter does not report a public identifier."""
        return None
93
class LibXml2Reader(xmlreader.XMLReader):
    """SAX2 XMLReader that pulls events from a libxml2 text reader.

    parse() drives the libxml2 reader node by node and translates each
    node into the matching ContentHandler / LexicalHandler callback.
    Errors signalled by libxml2 are accumulated by _errorHandler() and
    forwarded to the SAX ErrorHandler after each Read() call.
    """

    def __init__(self):
        xmlreader.XMLReader.__init__(self)
        # features
        self.__ns = 0
        self.__nspfx = 0
        self.__validate = 0
        self.__extparams = 1
        # parsing flag
        self.__parsing = 0
        # additional handlers
        self.__lex_handler = None
        self.__decl_handler = None
        # error messages accumulator (None when nothing is pending)
        self.__errors = None

    def _errorHandler(self,arg,msg,severity,locator):
        """Callback given to reader.SetErrorHandler: queue one error.

        The error is stored as (severity, SAXParseException) and reported
        later by _reportErrors().
        """
        if self.__errors is None:
            self.__errors = []
        self.__errors.append((severity,
                              SAXParseException(msg,None,
                                                Locator(locator))))

    def _reportErrors(self,fatal):
        """Forward the queued errors to the SAX error handler.

        Warnings go to warning(); everything else goes to error(), except
        that when `fatal` is true the last queued error is reported through
        fatalError().
        """
        # Detach the queue first: an error handler is allowed to raise, and
        # the pending errors must not survive into a later parse.
        errors = self.__errors
        self.__errors = None
        for severity,exception in errors:
            if severity in (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
                            libxml2.PARSER_SEVERITY_WARNING):
                self._err_handler.warning(exception)
            else:
                # when fatal is set, the parse will stop;
                # we consider that the last error reported
                # is the fatal one.
                if fatal and exception is errors[-1][1]:
                    self._err_handler.fatalError(exception)
                else:
                    self._err_handler.error(exception)

    def _xmlnsPrefix(qname):
        """Classify an attribute qname as a namespace declaration or not.

        Returns (1, prefix) for "xmlns:prefix", (1, None) for the default
        namespace declaration "xmlns", and (0, None) for any other name
        (including ordinary attributes that merely start with "xmlns").
        """
        if qname == "xmlns":
            return 1, None
        if qname.startswith("xmlns:"):
            return 1, qname[6:]
        return 0, None
    _xmlnsPrefix = staticmethod(_xmlnsPrefix)

    def _startElementNS(self,reader,attributes,prefixes):
        """Emit namespace-aware events for the element the reader is on.

        Fires startPrefixMapping for each xmlns declaration, then
        startElementNS; for an empty element the matching endElementNS and
        endPrefixMapping calls follow immediately, otherwise the declared
        prefixes are pushed on `prefixes` for the later EndElement node.
        """
        eltName = (_d(reader.NamespaceUri()),
                   _d(reader.LocalName()))
        eltQName = _d(reader.Name())
        # the attribute map object is reused; only its dicts are replaced
        attributes._attrs = attrs = {}
        attributes._qnames = qnames = {}
        newPrefixes = []
        while reader.MoveToNextAttribute():
            qname = _d(reader.Name())
            value = _d(reader.Value())
            isDeclaration, newPrefix = self._xmlnsPrefix(qname)
            if isDeclaration:
                newPrefixes.append(newPrefix)
                self._cont_handler.startPrefixMapping(newPrefix,value)
                if not self.__nspfx:
                    continue # don't report xmlns attribute
            attName = (_d(reader.NamespaceUri()),
                       _d(reader.LocalName()))
            qnames[attName] = qname
            attrs[attName] = value
        reader.MoveToElement()
        self._cont_handler.startElementNS(eltName,eltQName,attributes)
        if reader.IsEmptyElement():
            self._cont_handler.endElementNS(eltName,eltQName)
            for newPrefix in newPrefixes:
                self._cont_handler.endPrefixMapping(newPrefix)
        else:
            prefixes.append(newPrefixes)

    def _startElement(self,reader,attributes):
        """Emit non-namespace events for the element the reader is on.

        Fires startElement with all attributes, followed at once by
        endElement when the element is empty.
        """
        eltName = _d(reader.Name())
        # the attribute map object is reused; only its dict is replaced
        attributes._attrs = attrs = {}
        while reader.MoveToNextAttribute():
            attName = _d(reader.Name())
            attrs[attName] = _d(reader.Value())
        reader.MoveToElement()
        self._cont_handler.startElement(eltName,attributes)
        if reader.IsEmptyElement():
            self._cont_handler.endElement(eltName)

    def parse(self, source):
        """Parse `source` and fire the SAX events on the installed handlers.

        `source` is either a string, handed to
        libxml2.newTextReaderFilename(), or anything accepted by
        saxutils.prepare_input_source().

        endDocument() is only fired when the document was read to its end.
        The reader is closed and the parsing flag reset even when a handler
        raises.
        """
        self.__parsing = 1
        # drop errors that may be left over from an earlier, aborted parse
        self.__errors = None
        reader = None
        try:
            # prepare source and create reader
            if isinstance(source, StringTypes):
                reader = libxml2.newTextReaderFilename(source)
            else:
                source = saxutils.prepare_input_source(source)
                inputBuffer = libxml2.inputBuffer(source.getByteStream())
                reader = inputBuffer.newTextReader(source.getSystemId())
            reader.SetErrorHandler(self._errorHandler,None)
            # configure reader
            if self.__extparams:
                reader.SetParserProp(libxml2.PARSER_LOADDTD,1)
                reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS,1)
                reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES,1)
                reader.SetParserProp(libxml2.PARSER_VALIDATE,self.__validate)
            else:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)
            # we reuse attribute maps (for a slight performance gain)
            if self.__ns:
                attributes = xmlreader.AttributesNSImpl({},{})
            else:
                attributes = xmlreader.AttributesImpl({})
            # prefixes to pop (for endPrefixMapping)
            prefixes = []
            # start loop
            self._cont_handler.startDocument()
            while 1:
                r = reader.Read()
                # check for errors
                if r == 1 or r == 0:
                    if self.__errors is not None:
                        self._reportErrors(0)
                    if r == 0:
                        break # end of parse
                else:
                    if self.__errors is not None:
                        self._reportErrors(1)
                    else:
                        self._err_handler.fatalError(\
                            SAXException("Read failed (no details available)"))
                    break # fatal parse error
                # get node type
                nodeType = reader.NodeType()
                # Element
                if nodeType == 1:
                    if self.__ns:
                        self._startElementNS(reader,attributes,prefixes)
                    else:
                        self._startElement(reader,attributes)
                # EndElement
                elif nodeType == 15:
                    if self.__ns:
                        self._cont_handler.endElementNS( \
                             (_d(reader.NamespaceUri()),_d(reader.LocalName())),
                             _d(reader.Name()))
                        for prefix in prefixes.pop():
                            self._cont_handler.endPrefixMapping(prefix)
                    else:
                        self._cont_handler.endElement(_d(reader.Name()))
                # Text
                elif nodeType == 3:
                    self._cont_handler.characters(_d(reader.Value()))
                # Whitespace
                elif nodeType == 13:
                    self._cont_handler.ignorableWhitespace(_d(reader.Value()))
                # SignificantWhitespace
                elif nodeType == 14:
                    self._cont_handler.characters(_d(reader.Value()))
                # CDATA
                elif nodeType == 4:
                    if self.__lex_handler is not None:
                        self.__lex_handler.startCDATA()
                    self._cont_handler.characters(_d(reader.Value()))
                    if self.__lex_handler is not None:
                        self.__lex_handler.endCDATA()
                # EntityReference
                elif nodeType == 5:
                    # entity events belong to the lexical handler; the
                    # reader itself has no startEntity/endEntity methods
                    if self.__lex_handler is not None:
                        self.__lex_handler.startEntity(_d(reader.Name()))
                    # NOTE(review): the module docstring says
                    # XmlReader.ResolveEntity is still awaited -- confirm
                    # that the installed binding provides it.
                    reader.ResolveEntity()
                # EndEntity
                elif nodeType == 16:
                    if self.__lex_handler is not None:
                        self.__lex_handler.endEntity(_d(reader.Name()))
                # ProcessingInstruction
                elif nodeType == 7:
                    self._cont_handler.processingInstruction( \
                        _d(reader.Name()),_d(reader.Value()))
                # Comment
                elif nodeType == 8:
                    if self.__lex_handler is not None:
                        self.__lex_handler.comment(_d(reader.Value()))
                # DocumentType
                elif nodeType == 10:
                    pass # TODO (how to detect endDTD? on first non-dtd event?)
                # XmlDeclaration
                elif nodeType == 17:
                    pass # TODO
                # Entity
                elif nodeType == 6:
                    pass # TODO (entity decl)
                # Notation (decl)
                elif nodeType == 12:
                    pass # TODO
                # Attribute (2), Document (9), DocumentFragment (11) and
                # None (0) are not expected in this loop and end up here.
                else:
                    raise SAXException("Unexpected node type %d" % nodeType)
            if r == 0:
                self._cont_handler.endDocument()
        finally:
            self.__parsing = 0
            # release the reader on every exit path, not only on success
            if reader is not None:
                reader.Close()

    def setDTDHandler(self, handler):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("DTDHandler not supported")

    def setEntityResolver(self, resolver):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("EntityResolver not supported")

    def getFeature(self, name):
        """Return the current state of the SAX feature `name`.

        Raises SAXNotRecognizedException for a feature this driver does
        not know.
        """
        if name == feature_namespaces:
            return self.__ns
        elif name == feature_namespace_prefixes:
            return self.__nspfx
        elif name == feature_validation:
            return self.__validate
        elif name == feature_external_ges:
            return 1 # TODO (does that relate to PARSER_LOADDTD)?
        elif name == feature_external_pes:
            return self.__extparams
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" % \
                                            name)

    def setFeature(self, name, state):
        """Set the SAX feature `name` to `state`.

        Raises SAXNotSupportedException while a parse is running or when
        asked to turn feature_external_ges off, and
        SAXNotRecognizedException for a feature this driver does not know.
        """
        if self.__parsing:
            raise SAXNotSupportedException("Cannot set feature %s " \
                                           "while parsing" % name)
        if name == feature_namespaces:
            self.__ns = state
        elif name == feature_namespace_prefixes:
            self.__nspfx = state
        elif name == feature_validation:
            self.__validate = state
        elif name == feature_external_ges:
            if state == 0:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException("Feature '%s' not supported" % \
                                               name)
        elif name == feature_external_pes:
            self.__extparams = state
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" % \
                                            name)

    def getProperty(self, name):
        """Return the lexical or declaration handler stored for `name`.

        Raises SAXNotRecognizedException for any other property.
        """
        if name == property_lexical_handler:
            return self.__lex_handler
        elif name == property_declaration_handler:
            return self.__decl_handler
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" % \
                                            name)

    def setProperty(self, name, value):
        """Install `value` as the lexical handler.

        The declaration handler property is recognized but cannot be set
        (SAXNotSupportedException); any other property raises
        SAXNotRecognizedException.
        """
        if name == property_lexical_handler:
            self.__lex_handler = value
        elif name == property_declaration_handler:
            # TODO: store the handler in self.__decl_handler instead
            # if/when libxml2 supports dtd events
            raise SAXNotSupportedException("Property '%s' not supported" % \
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" % \
                                            name)
368
def create_parser():
    """Build and return a fresh LibXml2Reader (driver factory function)."""
    parser = LibXml2Reader()
    return parser
371
372