1require 'rexml/parseexception'
2require 'rexml/undefinednamespaceexception'
3require 'rexml/source'
4require 'set'
5
6module REXML
7  module Parsers
8    # = Using the Pull Parser
9    # <em>This API is experimental, and subject to change.</em>
10    #  parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
11    #  while parser.has_next?
12    #    res = parser.next
13    #    puts res[1]['att'] if res.start_tag? and res[0] == 'b'
14    #  end
15    # See the PullEvent class for information on the content of the results.
16    # The data is identical to the arguments passed for the various events to
17    # the StreamListener API.
18    #
19    # Notice that:
20    #  parser = PullParser.new( "<a>BAD DOCUMENT" )
21    #  while parser.has_next?
22    #    res = parser.next
23    #    raise res[1] if res.error?
24    #  end
25    #
26    # Nat Price gave me some good ideas for the API.
27    class BaseParser
      # POSIX bracket-class names; they are interpolated inside the character
      # classes of the name patterns below.
      LETTER = '[:alpha:]'
      DIGIT = '[:digit:]'

      COMBININGCHAR = '' # TODO
      EXTENDER = ''      # TODO

      # Name patterns (regex source strings, not compiled).  NAME_STR captures
      # an optional prefix and the local part in two groups; UNAME_STR matches
      # the same text without capturing anything.
      NCNAME_STR= "[#{LETTER}_:][-[:alnum:]._:#{COMBININGCHAR}#{EXTENDER}]*"
      NAME_STR= "(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})"
      UNAME_STR= "(?:#{NCNAME_STR}:)?#{NCNAME_STR}"

      # NAME wraps the name in a single capture group.  REFERENCE matches a
      # named entity reference or a decimal/hexadecimal character reference;
      # only the named form fills the capture group.
      NAMECHAR = '[\-\w\.:]'
      NAME = "([\\w:]#{NAMECHAR}*)"
      NMTOKEN = "(?:#{NAMECHAR})+"
      NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*"
      REFERENCE = "&(?:#{NAME};|#\\d+;|#x[0-9a-fA-F]+;)"
      REFERENCE_RE = /#{REFERENCE}/

      # Document-level markup.  In pull_event the *_START patterns classify
      # the text at the head of the buffer and the *_PATTERN patterns are then
      # matched against the source to consume the construct.  CDATA_END is
      # used there to recognise the "]>" that closes a DOCTYPE internal subset.
      DOCTYPE_START = /\A\s*<!DOCTYPE\s/um
      DOCTYPE_PATTERN = /\s*<!DOCTYPE\s+(.*?)(\[|>)/um
      ATTRIBUTE_PATTERN = /\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\4/um
      COMMENT_START = /\A<!--/u
      COMMENT_PATTERN = /<!--(.*?)-->/um
      CDATA_START = /\A<!\[CDATA\[/u
      CDATA_END = /^\s*\]\s*>/um
      CDATA_PATTERN = /<!\[CDATA\[(.*?)\]\]>/um
      XMLDECL_START = /\A<\?xml\s/u;
      XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um
      INSTRUCTION_START = /\A<\?/u
      INSTRUCTION_PATTERN = /<\?(.*?)(\s+.*?)?\?>/um
      # TAG_MATCH groups: 1 = qualified name, 2 = prefix, 3 = local part,
      # 4 = raw attribute text, 5 = quote character, 6 = "/" of an empty tag.
      TAG_MATCH = /^<((?>#{NAME_STR}))\s*((?>\s+#{UNAME_STR}\s*=\s*(["']).*?\5)*)\s*(\/)?>/um
      CLOSE_MATCH = /^\s*<\/(#{NAME_STR})\s*>/um

      # Pseudo-attributes extracted from the body of the XML declaration.
      VERSION = /\bversion\s*=\s*["'](.*?)['"]/um
      ENCODING = /\bencoding\s*=\s*["'](.*?)['"]/um
      STANDALONE = /\bstandalone\s*=\s*["'](.*?)['"]/um

      # Declarations that may appear inside a DOCTYPE internal subset.
      # IDENTITY splits the text following "<!DOCTYPE" into name, keyword and
      # up to two quoted literals.
      ENTITY_START = /^\s*<!ENTITY/
      IDENTITY = /^([!\*\w\-]+)(\s+#{NCNAME_STR})?(\s+["'](.*?)['"])?(\s+['"](.*?)["'])?/u
      ELEMENTDECL_START = /^\s*<!ELEMENT/um
      ELEMENTDECL_PATTERN = /^\s*(<!ELEMENT.*?)>/um
      SYSTEMENTITY = /^\s*(%.*?;)\s*$/um
      ENUMERATION = "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)"
      NOTATIONTYPE = "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)"
      ENUMERATEDTYPE = "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))"
      ATTTYPE = "(CDATA|ID|IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS|#{ENUMERATEDTYPE})"
      ATTVALUE = "(?:\"((?:[^<&\"]|#{REFERENCE})*)\")|(?:'((?:[^<&']|#{REFERENCE})*)')"
      DEFAULTDECL = "(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))"
      ATTDEF = "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}"
      ATTDEF_RE = /#{ATTDEF}/
      ATTLISTDECL_START = /^\s*<!ATTLIST/um
      ATTLISTDECL_PATTERN = /^\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um
      NOTATIONDECL_START = /^\s*<!NOTATION/um
      # The two supported forms of a NOTATION declaration.
      PUBLIC = /^\s*<!NOTATION\s+(\w[\-\w]*)\s+(PUBLIC)\s+(["'])(.*?)\3(?:\s+(["'])(.*?)\5)?\s*>/um
      SYSTEM = /^\s*<!NOTATION\s+(\w[\-\w]*)\s+(SYSTEM)\s+(["'])(.*?)\3\s*>/um

      # Character data: everything up to, but not including, the next "<".
      TEXT_PATTERN = /\A([^<]*)/um

      # Entity constants
      PUBIDCHAR = "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#"
      SYSTEMLITERAL = %Q{((?:"[^"]*")|(?:'[^']*'))}
      PUBIDLITERAL = %Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')}
      EXTERNALID = "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))"
      NDATADECL = "\\s+NDATA\\s+#{NAME}"
      PEREFERENCE = "%#{NAME};"
      ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))}
      PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})"
      ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
      PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
      GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
      ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um

      # Matches an "&" that does not start a named entity reference.
      EREFERENCE = /&(?!#{NAME};)/

      # Predefined entities.  Each value is
      #   [ pattern matching the reference, reference text,
      #     literal character, pattern matching the literal character ].
      # 'amp' is not listed here; normalize and unnormalize handle "&" with
      # their own substitutions.
      DEFAULT_ENTITIES = {
        'gt' => [/&gt;/, '&gt;', '>', />/],
        'lt' => [/&lt;/, '&lt;', '<', /</],
        'quot' => [/&quot;/, '&quot;', '"', /"/],
        "apos" => [/&apos;/, "&apos;", "'", /'/]
      }


      ######################################################################
      # These are patterns to identify common markup errors, to make the
      # error messages more informative.
      ######################################################################
      MISSING_ATTRIBUTE_QUOTES = /^<#{NAME_STR}\s+#{NAME_STR}\s*=\s*[^"']/um
114
115      def initialize( source )
116        self.stream = source
117        @listeners = []
118      end
119
120      def add_listener( listener )
121        @listeners << listener
122      end
123
      # The source object that stream= created from the parser input.
      attr_reader :source
125
126      def stream=( source )
127        @source = SourceFactory.create_from( source )
128        @closed = nil
129        @document_status = nil
130        @tags = []
131        @stack = []
132        @entities = []
133        @nsstack = []
134      end
135
136      def position
137        if @source.respond_to? :position
138          @source.position
139        else
140          # FIXME
141          0
142        end
143      end
144
145      # Returns true if there are no more events
146      def empty?
147        return (@source.empty? and @stack.empty?)
148      end
149
150      # Returns true if there are more events.  Synonymous with !empty?
151      def has_next?
152        return !(@source.empty? and @stack.empty?)
153      end
154
155      # Push an event back on the head of the stream.  This method
156      # has (theoretically) infinite depth.
157      def unshift token
158        @stack.unshift(token)
159      end
160
161      # Peek at the +depth+ event in the stack.  The first element on the stack
162      # is at depth 0.  If +depth+ is -1, will parse to the end of the input
163      # stream and return the last event, which is always :end_document.
164      # Be aware that this causes the stream to be parsed up to the +depth+
165      # event, so you can effectively pre-parse the entire document (pull the
166      # entire thing into memory) using this method.
167      def peek depth=0
168        raise %Q[Illegal argument "#{depth}"] if depth < -1
169        temp = []
170        if depth == -1
171          temp.push(pull()) until empty?
172        else
173          while @stack.size+temp.size < depth+1
174            temp.push(pull())
175          end
176        end
177        @stack += temp if temp.size > 0
178        @stack[depth]
179      end
180
181      # Returns the next event.  This is a +PullEvent+ object.
182      def pull
183        pull_event.tap do |event|
184          @listeners.each do |listener|
185            listener.receive event
186          end
187        end
188      end
189
190      def pull_event
191        if @closed
192          x, @closed = @closed, nil
193          return [ :end_element, x ]
194        end
195        return [ :end_document ] if empty?
196        return @stack.shift if @stack.size > 0
197        #STDERR.puts @source.encoding
198        @source.read if @source.buffer.size<2
199        #STDERR.puts "BUFFER = #{@source.buffer.inspect}"
200        if @document_status == nil
201          #@source.consume( /^\s*/um )
202          word = @source.match( /^((?:\s+)|(?:<[^>]*>))/um )
203          word = word[1] unless word.nil?
204          #STDERR.puts "WORD = #{word.inspect}"
205          case word
206          when COMMENT_START
207            return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
208          when XMLDECL_START
209            #STDERR.puts "XMLDECL"
210            results = @source.match( XMLDECL_PATTERN, true )[1]
211            version = VERSION.match( results )
212            version = version[1] unless version.nil?
213            encoding = ENCODING.match(results)
214            encoding = encoding[1] unless encoding.nil?
215            if need_source_encoding_update?(encoding)
216              @source.encoding = encoding
217            end
218            if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
219              encoding = "UTF-16"
220            end
221            standalone = STANDALONE.match(results)
222            standalone = standalone[1] unless standalone.nil?
223            return [ :xmldecl, version, encoding, standalone ]
224          when INSTRUCTION_START
225            return [ :processing_instruction, *@source.match(INSTRUCTION_PATTERN, true)[1,2] ]
226          when DOCTYPE_START
227            md = @source.match( DOCTYPE_PATTERN, true )
228            @nsstack.unshift(curr_ns=Set.new)
229            identity = md[1]
230            close = md[2]
231            identity =~ IDENTITY
232            name = $1
233            raise REXML::ParseException.new("DOCTYPE is missing a name") if name.nil?
234            pub_sys = $2.nil? ? nil : $2.strip
235            long_name = $4.nil? ? nil : $4.strip
236            uri = $6.nil? ? nil : $6.strip
237            args = [ :start_doctype, name, pub_sys, long_name, uri ]
238            if close == ">"
239              @document_status = :after_doctype
240              @source.read if @source.buffer.size<2
241              md = @source.match(/^\s*/um, true)
242              @stack << [ :end_doctype ]
243            else
244              @document_status = :in_doctype
245            end
246            return args
247          when /^\s+/
248          else
249            @document_status = :after_doctype
250            @source.read if @source.buffer.size<2
251            md = @source.match(/\s*/um, true)
252            if @source.encoding == "UTF-8"
253              @source.buffer.force_encoding(::Encoding::UTF_8)
254            end
255          end
256        end
257        if @document_status == :in_doctype
258          md = @source.match(/\s*(.*?>)/um)
259          case md[1]
260          when SYSTEMENTITY
261            match = @source.match( SYSTEMENTITY, true )[1]
262            return [ :externalentity, match ]
263
264          when ELEMENTDECL_START
265            return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
266
267          when ENTITY_START
268            match = @source.match( ENTITYDECL, true ).to_a.compact
269            match[0] = :entitydecl
270            ref = false
271            if match[1] == '%'
272              ref = true
273              match.delete_at 1
274            end
275            # Now we have to sort out what kind of entity reference this is
276            if match[2] == 'SYSTEM'
277              # External reference
278              match[3] = match[3][1..-2] # PUBID
279              match.delete_at(4) if match.size > 4 # Chop out NDATA decl
280              # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
281            elsif match[2] == 'PUBLIC'
282              # External reference
283              match[3] = match[3][1..-2] # PUBID
284              match[4] = match[4][1..-2] # HREF
285              # match is [ :entity, name, PUBLIC, pubid, href ]
286            else
287              match[2] = match[2][1..-2]
288              match.pop if match.size == 4
289              # match is [ :entity, name, value ]
290            end
291            match << '%' if ref
292            return match
293          when ATTLISTDECL_START
294            md = @source.match( ATTLISTDECL_PATTERN, true )
295            raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
296            element = md[1]
297            contents = md[0]
298
299            pairs = {}
300            values = md[0].scan( ATTDEF_RE )
301            values.each do |attdef|
302              unless attdef[3] == "#IMPLIED"
303                attdef.compact!
304                val = attdef[3]
305                val = attdef[4] if val == "#FIXED "
306                pairs[attdef[0]] = val
307                if attdef[0] =~ /^xmlns:(.*)/
308                  @nsstack[0] << $1
309                end
310              end
311            end
312            return [ :attlistdecl, element, pairs, contents ]
313          when NOTATIONDECL_START
314            md = nil
315            if @source.match( PUBLIC )
316              md = @source.match( PUBLIC, true )
317              vals = [md[1],md[2],md[4],md[6]]
318            elsif @source.match( SYSTEM )
319              md = @source.match( SYSTEM, true )
320              vals = [md[1],md[2],nil,md[4]]
321            else
322              raise REXML::ParseException.new( "error parsing notation: no matching pattern", @source )
323            end
324            return [ :notationdecl, *vals ]
325          when CDATA_END
326            @document_status = :after_doctype
327            @source.match( CDATA_END, true )
328            return [ :end_doctype ]
329          end
330        end
331        begin
332          if @source.buffer[0] == ?<
333            if @source.buffer[1] == ?/
334              @nsstack.shift
335              last_tag = @tags.pop
336              #md = @source.match_to_consume( '>', CLOSE_MATCH)
337              md = @source.match( CLOSE_MATCH, true )
338              raise REXML::ParseException.new( "Missing end tag for "+
339                "'#{last_tag}' (got \"#{md[1]}\")",
340                @source) unless last_tag == md[1]
341              return [ :end_element, last_tag ]
342            elsif @source.buffer[1] == ?!
343              md = @source.match(/\A(\s*[^>]*>)/um)
344              #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
345              raise REXML::ParseException.new("Malformed node", @source) unless md
346              if md[0][2] == ?-
347                md = @source.match( COMMENT_PATTERN, true )
348
349                case md[1]
350                when /--/, /-\z/
351                  raise REXML::ParseException.new("Malformed comment", @source)
352                end
353
354                return [ :comment, md[1] ] if md
355              else
356                md = @source.match( CDATA_PATTERN, true )
357                return [ :cdata, md[1] ] if md
358              end
359              raise REXML::ParseException.new( "Declarations can only occur "+
360                "in the doctype declaration.", @source)
361            elsif @source.buffer[1] == ??
362              md = @source.match( INSTRUCTION_PATTERN, true )
363              return [ :processing_instruction, md[1], md[2] ] if md
364              raise REXML::ParseException.new( "Bad instruction declaration",
365                @source)
366            else
367              # Get the next tag
368              md = @source.match(TAG_MATCH, true)
369              unless md
370                # Check for missing attribute quotes
371                raise REXML::ParseException.new("missing attribute quote", @source) if @source.match(MISSING_ATTRIBUTE_QUOTES )
372                raise REXML::ParseException.new("malformed XML: missing tag start", @source)
373              end
374              attributes = {}
375              prefixes = Set.new
376              prefixes << md[2] if md[2]
377              @nsstack.unshift(curr_ns=Set.new)
378              if md[4].size > 0
379                attrs = md[4].scan( ATTRIBUTE_PATTERN )
380                raise REXML::ParseException.new( "error parsing attributes: [#{attrs.join ', '}], excess = \"#$'\"", @source) if $' and $'.strip.size > 0
381                attrs.each do |attr_name, prefix, local_part, quote, value|
382                  if prefix == "xmlns"
383                    if local_part == "xml"
384                      if value != "http://www.w3.org/XML/1998/namespace"
385                        msg = "The 'xml' prefix must not be bound to any other namespace "+
386                        "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
387                        raise REXML::ParseException.new( msg, @source, self )
388                      end
389                    elsif local_part == "xmlns"
390                      msg = "The 'xmlns' prefix must not be declared "+
391                      "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
392                      raise REXML::ParseException.new( msg, @source, self)
393                    end
394                    curr_ns << local_part
395                  elsif prefix
396                    prefixes << prefix unless prefix == "xml"
397                  end
398
399                  if attributes.has_key?(attr_name)
400                    msg = "Duplicate attribute #{attr_name.inspect}"
401                    raise REXML::ParseException.new(msg, @source, self)
402                  end
403
404                  attributes[attr_name] = value
405                end
406              end
407
408              # Verify that all of the prefixes have been defined
409              for prefix in prefixes
410                unless @nsstack.find{|k| k.member?(prefix)}
411                  raise UndefinedNamespaceException.new(prefix,@source,self)
412                end
413              end
414
415              if md[6]
416                @closed = md[1]
417                @nsstack.shift
418              else
419                @tags.push( md[1] )
420              end
421              return [ :start_element, md[1], attributes ]
422            end
423          else
424            md = @source.match( TEXT_PATTERN, true )
425            if md[0].length == 0
426              @source.match( /(\s+)/, true )
427            end
428            #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
429            #return [ :text, "" ] if md[0].length == 0
430            # unnormalized = Text::unnormalize( md[1], self )
431            # return PullEvent.new( :text, md[1], unnormalized )
432            return [ :text, md[1] ]
433          end
434        rescue REXML::UndefinedNamespaceException
435          raise
436        rescue REXML::ParseException
437          raise
438        rescue Exception, NameError => error
439          raise REXML::ParseException.new( "Exception parsing",
440            @source, self, (error ? error : $!) )
441        end
442        return [ :dummy ]
443      end
444      private :pull_event
445
446      def entity( reference, entities )
447        value = nil
448        value = entities[ reference ] if entities
449        if not value
450          value = DEFAULT_ENTITIES[ reference ]
451          value = value[2] if value
452        end
453        unnormalize( value, entities ) if value
454      end
455
456      # Escapes all possible entities
457      def normalize( input, entities=nil, entity_filter=nil )
458        copy = input.clone
459        # Doing it like this rather than in a loop improves the speed
460        copy.gsub!( EREFERENCE, '&amp;' )
461        entities.each do |key, value|
462          copy.gsub!( value, "&#{key};" ) unless entity_filter and
463                                      entity_filter.include?(entity)
464        end if entities
465        copy.gsub!( EREFERENCE, '&amp;' )
466        DEFAULT_ENTITIES.each do |key, value|
467          copy.gsub!( value[3], value[1] )
468        end
469        copy
470      end
471
472      # Unescapes all possible entities
473      def unnormalize( string, entities=nil, filter=nil )
474        rv = string.clone
475        rv.gsub!( /\r\n?/, "\n" )
476        matches = rv.scan( REFERENCE_RE )
477        return rv if matches.size == 0
478        rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
479          m=$1
480          m = "0#{m}" if m[0] == ?x
481          [Integer(m)].pack('U*')
482        }
483        matches.collect!{|x|x[0]}.compact!
484        if matches.size > 0
485          matches.each do |entity_reference|
486            unless filter and filter.include?(entity_reference)
487              entity_value = entity( entity_reference, entities )
488              if entity_value
489                re = /&#{entity_reference};/
490                rv.gsub!( re, entity_value )
491              else
492                er = DEFAULT_ENTITIES[entity_reference]
493                rv.gsub!( er[0], er[2] ) if er
494              end
495            end
496          end
497          rv.gsub!( /&amp;/, '&' )
498        end
499        rv
500      end
501
502      private
503      def need_source_encoding_update?(xml_declaration_encoding)
504        return false if xml_declaration_encoding.nil?
505        return false if /\AUTF-16\z/i =~ xml_declaration_encoding
506        true
507      end
508    end
509  end
510end
511
512=begin
513  case event[0]
514  when :start_element
515  when :text
516  when :end_element
517  when :processing_instruction
518  when :cdata
519  when :comment
520  when :xmldecl
521  when :start_doctype
522  when :end_doctype
523  when :externalentity
524  when :elementdecl
525  when :entity
526  when :attlistdecl
527  when :notationdecl
528  when :end_doctype
529  end
530=end
531