1require "forwardable"
2require "open-uri"
3
4require "rss/rss"
5require "rss/xml"
6
7module RSS
8
9  class NotWellFormedError < Error
10    attr_reader :line, :element
11
12    # Create a new NotWellFormedError for an error at +line+
13    # in +element+.  If a block is given the return value of
14    # the block ends up in the error message.
15    def initialize(line=nil, element=nil)
16      message = "This is not well formed XML"
17      if element or line
18        message << "\nerror occurred"
19        message << " in #{element}" if element
20        message << " at about #{line} line" if line
21      end
22      message << "\n#{yield}" if block_given?
23      super(message)
24    end
25  end
26
27  class XMLParserNotFound < Error
28    def initialize
29      super("available XML parser was not found in " <<
30            "#{AVAILABLE_PARSER_LIBRARIES.inspect}.")
31    end
32  end
33
34  class NotValidXMLParser < Error
35    def initialize(parser)
36      super("#{parser} is not an available XML parser. " <<
37            "Available XML parser" <<
38            (AVAILABLE_PARSERS.size > 1 ? "s are " : " is ") <<
39            "#{AVAILABLE_PARSERS.inspect}.")
40    end
41  end
42
43  class NSError < InvalidRSSError
44    attr_reader :tag, :prefix, :uri
45    def initialize(tag, prefix, require_uri)
46      @tag, @prefix, @uri = tag, prefix, require_uri
47      super("prefix <#{prefix}> doesn't associate uri " <<
48            "<#{require_uri}> in tag <#{tag}>")
49    end
50  end
51
52  class Parser
53
54    extend Forwardable
55
56    class << self
57
58      @@default_parser = nil
59
60      def default_parser
61        @@default_parser || AVAILABLE_PARSERS.first
62      end
63
64      # Set @@default_parser to new_value if it is one of the
65      # available parsers. Else raise NotValidXMLParser error.
66      def default_parser=(new_value)
67        if AVAILABLE_PARSERS.include?(new_value)
68          @@default_parser = new_value
69        else
70          raise NotValidXMLParser.new(new_value)
71        end
72      end
73
74      def parse(rss, do_validate=true, ignore_unknown_element=true,
75                parser_class=default_parser)
76        parser = new(rss, parser_class)
77        parser.do_validate = do_validate
78        parser.ignore_unknown_element = ignore_unknown_element
79        parser.parse
80      end
81    end
82
83    def_delegators(:@parser, :parse, :rss,
84                   :ignore_unknown_element,
85                   :ignore_unknown_element=, :do_validate,
86                   :do_validate=)
87
88    def initialize(rss, parser_class=self.class.default_parser)
89      @parser = parser_class.new(normalize_rss(rss))
90    end
91
92    private
93
94    # Try to get the XML associated with +rss+.
95    # Return +rss+ if it already looks like XML, or treat it as a URI,
96    # or a file to get the XML,
97    def normalize_rss(rss)
98      return rss if maybe_xml?(rss)
99
100      uri = to_uri(rss)
101
102      if uri.respond_to?(:read)
103        uri.read
104      elsif !rss.tainted? and File.readable?(rss)
105        File.open(rss) {|f| f.read}
106      else
107        rss
108      end
109    end
110
111    # maybe_xml? tests if source is a string that looks like XML.
112    def maybe_xml?(source)
113      source.is_a?(String) and /</ =~ source
114    end
115
116    # Attempt to convert rss to a URI, but just return it if
117    # there's a ::URI::Error
118    def to_uri(rss)
119      return rss if rss.is_a?(::URI::Generic)
120
121      begin
122        ::URI.parse(rss)
123      rescue ::URI::Error
124        rss
125      end
126    end
127  end
128
129  class BaseParser
130
131    class << self
132      def raise_for_undefined_entity?
133        listener.raise_for_undefined_entity?
134      end
135    end
136
137    def initialize(rss)
138      @listener = self.class.listener.new
139      @rss = rss
140    end
141
142    def rss
143      @listener.rss
144    end
145
146    def ignore_unknown_element
147      @listener.ignore_unknown_element
148    end
149
150    def ignore_unknown_element=(new_value)
151      @listener.ignore_unknown_element = new_value
152    end
153
154    def do_validate
155      @listener.do_validate
156    end
157
158    def do_validate=(new_value)
159      @listener.do_validate = new_value
160    end
161
162    def parse
163      if @listener.rss.nil?
164        _parse
165      end
166      @listener.rss
167    end
168
169  end
170
171  class BaseListener
172
173    extend Utils
174
175    class << self
176
177      @@accessor_bases = {}
178      @@registered_uris = {}
179      @@class_names = {}
180
181      # return the setter for the uri, tag_name pair, or nil.
182      def setter(uri, tag_name)
183        _getter = getter(uri, tag_name)
184        if _getter
185          "#{_getter}="
186        else
187          nil
188        end
189      end
190
191      def getter(uri, tag_name)
192        (@@accessor_bases[uri] || {})[tag_name]
193      end
194
195      # return the tag_names for setters associated with uri
196      def available_tags(uri)
197        (@@accessor_bases[uri] || {}).keys
198      end
199
200      # register uri against this name.
201      def register_uri(uri, name)
202        @@registered_uris[name] ||= {}
203        @@registered_uris[name][uri] = nil
204      end
205
206      # test if this uri is registered against this name
207      def uri_registered?(uri, name)
208        @@registered_uris[name].has_key?(uri)
209      end
210
211      # record class_name for the supplied uri and tag_name
212      def install_class_name(uri, tag_name, class_name)
213        @@class_names[uri] ||= {}
214        @@class_names[uri][tag_name] = class_name
215      end
216
217      # retrieve class_name for the supplied uri and tag_name
218      # If it doesn't exist, capitalize the tag_name
219      def class_name(uri, tag_name)
220        name = (@@class_names[uri] || {})[tag_name]
221        return name if name
222
223        tag_name = tag_name.gsub(/[_\-]([a-z]?)/) {$1.upcase}
224        tag_name[0, 1].upcase + tag_name[1..-1]
225      end
226
227      def install_get_text_element(uri, name, accessor_base)
228        install_accessor_base(uri, name, accessor_base)
229        def_get_text_element(uri, name, *get_file_and_line_from_caller(1))
230      end
231
232      def raise_for_undefined_entity?
233        true
234      end
235
236      private
237      # set the accessor for the uri, tag_name pair
238      def install_accessor_base(uri, tag_name, accessor_base)
239        @@accessor_bases[uri] ||= {}
240        @@accessor_bases[uri][tag_name] = accessor_base.chomp("=")
241      end
242
243      def def_get_text_element(uri, element_name, file, line)
244        register_uri(uri, element_name)
245        method_name = "start_#{element_name}"
246        unless private_method_defined?(method_name)
247          define_method(method_name) do |name, prefix, attrs, ns|
248            uri = _ns(ns, prefix)
249            if self.class.uri_registered?(uri, element_name)
250              start_get_text_element(name, prefix, ns, uri)
251            else
252              start_else_element(name, prefix, attrs, ns)
253            end
254          end
255          private(method_name)
256        end
257      end
258    end
259  end
260
261  module ListenerMixin
262    attr_reader :rss
263
264    attr_accessor :ignore_unknown_element
265    attr_accessor :do_validate
266
267    def initialize
268      @rss = nil
269      @ignore_unknown_element = true
270      @do_validate = true
271      @ns_stack = [{"xml" => :xml}]
272      @tag_stack = [[]]
273      @text_stack = ['']
274      @proc_stack = []
275      @last_element = nil
276      @version = @encoding = @standalone = nil
277      @xml_stylesheets = []
278      @xml_child_mode = false
279      @xml_element = nil
280      @last_xml_element = nil
281    end
282
283    # set instance vars for version, encoding, standalone
284    def xmldecl(version, encoding, standalone)
285      @version, @encoding, @standalone = version, encoding, standalone
286    end
287
288    def instruction(name, content)
289      if name == "xml-stylesheet"
290        params = parse_pi_content(content)
291        if params.has_key?("href")
292          @xml_stylesheets << XMLStyleSheet.new(params)
293        end
294      end
295    end
296
297    def tag_start(name, attributes)
298      @text_stack.push('')
299
300      ns = @ns_stack.last.dup
301      attrs = {}
302      attributes.each do |n, v|
303        if /\Axmlns(?:\z|:)/ =~ n
304          ns[$POSTMATCH] = v
305        else
306          attrs[n] = v
307        end
308      end
309      @ns_stack.push(ns)
310
311      prefix, local = split_name(name)
312      @tag_stack.last.push([_ns(ns, prefix), local])
313      @tag_stack.push([])
314      if @xml_child_mode
315        previous = @last_xml_element
316        element_attrs = attributes.dup
317        unless previous
318          ns.each do |ns_prefix, value|
319            next if ns_prefix == "xml"
320            key = ns_prefix.empty? ? "xmlns" : "xmlns:#{ns_prefix}"
321            element_attrs[key] ||= value
322          end
323        end
324        next_element = XML::Element.new(local,
325                                        prefix.empty? ? nil : prefix,
326                                        _ns(ns, prefix),
327                                        element_attrs)
328        previous << next_element if previous
329        @last_xml_element = next_element
330        pr = Proc.new do |text, tags|
331          if previous
332            @last_xml_element = previous
333          else
334            @xml_element = @last_xml_element
335            @last_xml_element = nil
336          end
337        end
338        @proc_stack.push(pr)
339      else
340        if @rss.nil? and respond_to?("initial_start_#{local}", true)
341          __send__("initial_start_#{local}", local, prefix, attrs, ns.dup)
342        elsif respond_to?("start_#{local}", true)
343          __send__("start_#{local}", local, prefix, attrs, ns.dup)
344        else
345          start_else_element(local, prefix, attrs, ns.dup)
346        end
347      end
348    end
349
350    def tag_end(name)
351      if DEBUG
352        p "end tag #{name}"
353        p @tag_stack
354      end
355      text = @text_stack.pop
356      tags = @tag_stack.pop
357      pr = @proc_stack.pop
358      pr.call(text, tags) unless pr.nil?
359      @ns_stack.pop
360    end
361
362    def text(data)
363      if @xml_child_mode
364        @last_xml_element << data if @last_xml_element
365      else
366        @text_stack.last << data
367      end
368    end
369
370    private
371    def _ns(ns, prefix)
372      ns.fetch(prefix, "")
373    end
374
375    CONTENT_PATTERN = /\s*([^=]+)=(["'])([^\2]+?)\2/
376    # Extract the first name="value" pair from content.
377    # Works with single quotes according to the constant
378    # CONTENT_PATTERN. Return a Hash.
379    def parse_pi_content(content)
380      params = {}
381      content.scan(CONTENT_PATTERN) do |name, quote, value|
382        params[name] = value
383      end
384      params
385    end
386
387    def start_else_element(local, prefix, attrs, ns)
388      class_name = self.class.class_name(_ns(ns, prefix), local)
389      current_class = @last_element.class
390      if known_class?(current_class, class_name)
391        next_class = current_class.const_get(class_name)
392        start_have_something_element(local, prefix, attrs, ns, next_class)
393      else
394        if !@do_validate or @ignore_unknown_element
395          @proc_stack.push(setup_next_element_in_unknown_element)
396        else
397          parent = "ROOT ELEMENT???"
398          if current_class.tag_name
399            parent = current_class.tag_name
400          end
401          raise NotExpectedTagError.new(local, _ns(ns, prefix), parent)
402        end
403      end
404    end
405
406    if Module.method(:const_defined?).arity == -1
407      def known_class?(target_class, class_name)
408        class_name and
409          (target_class.const_defined?(class_name, false) or
410           target_class.constants.include?(class_name.to_sym))
411      end
412    else
413      def known_class?(target_class, class_name)
414        class_name and
415          (target_class.const_defined?(class_name) or
416           target_class.constants.include?(class_name))
417      end
418    end
419
420    NAMESPLIT = /^(?:([\w:][-\w.]*):)?([\w:][-\w.]*)/
421    def split_name(name)
422      name =~ NAMESPLIT
423      [$1 || '', $2]
424    end
425
426    def check_ns(tag_name, prefix, ns, require_uri, ignore_unknown_element=nil)
427      if _ns(ns, prefix) == require_uri
428        true
429      else
430        if ignore_unknown_element.nil?
431          ignore_unknown_element = @ignore_unknown_element
432        end
433
434        if ignore_unknown_element
435          false
436        elsif @do_validate
437          raise NSError.new(tag_name, prefix, require_uri)
438        else
439          # Force bind required URI with prefix
440          @ns_stack.last[prefix] = require_uri
441          true
442        end
443      end
444    end
445
446    def start_get_text_element(tag_name, prefix, ns, required_uri)
447      pr = Proc.new do |text, tags|
448        setter = self.class.setter(required_uri, tag_name)
449        if setter and @last_element.respond_to?(setter)
450          if @do_validate
451            getter = self.class.getter(required_uri, tag_name)
452            if @last_element.__send__(getter)
453              raise TooMuchTagError.new(tag_name, @last_element.tag_name)
454            end
455          end
456          @last_element.__send__(setter, text.to_s)
457        else
458          if @do_validate and !@ignore_unknown_element
459            raise NotExpectedTagError.new(tag_name, _ns(ns, prefix),
460                                          @last_element.tag_name)
461          end
462        end
463      end
464      @proc_stack.push(pr)
465    end
466
467    def start_have_something_element(tag_name, prefix, attrs, ns, klass)
468      if check_ns(tag_name, prefix, ns, klass.required_uri)
469        attributes = collect_attributes(tag_name, prefix, attrs, ns, klass)
470        @proc_stack.push(setup_next_element(tag_name, klass, attributes))
471      else
472        @proc_stack.push(setup_next_element_in_unknown_element)
473      end
474    end
475
476    def collect_attributes(tag_name, prefix, attrs, ns, klass)
477      attributes = {}
478      klass.get_attributes.each do |a_name, a_uri, required, element_name|
479        if a_uri.is_a?(String) or !a_uri.respond_to?(:include?)
480          a_uri = [a_uri]
481        end
482        unless a_uri == [""]
483          for prefix, uri in ns
484            if a_uri.include?(uri)
485              val = attrs["#{prefix}:#{a_name}"]
486              break if val
487            end
488          end
489        end
490        if val.nil? and a_uri.include?("")
491          val = attrs[a_name]
492        end
493
494        if @do_validate and required and val.nil?
495          unless a_uri.include?("")
496            for prefix, uri in ns
497              if a_uri.include?(uri)
498                a_name = "#{prefix}:#{a_name}"
499              end
500            end
501          end
502          raise MissingAttributeError.new(tag_name, a_name)
503        end
504
505        attributes[a_name] = val
506      end
507      attributes
508    end
509
510    def setup_next_element(tag_name, klass, attributes)
511      previous = @last_element
512      next_element = klass.new(@do_validate, attributes)
513      previous.set_next_element(tag_name, next_element)
514      @last_element = next_element
515      @last_element.parent = previous if klass.need_parent?
516      @xml_child_mode = @last_element.have_xml_content?
517
518      Proc.new do |text, tags|
519        p(@last_element.class) if DEBUG
520        if @xml_child_mode
521          @last_element.content = @xml_element.to_s
522          xml_setter = @last_element.class.xml_setter
523          @last_element.__send__(xml_setter, @xml_element)
524          @xml_element = nil
525          @xml_child_mode = false
526        else
527          if klass.have_content?
528            if @last_element.need_base64_encode?
529              text = text.lstrip.unpack("m").first
530            end
531            @last_element.content = text
532          end
533        end
534        if @do_validate
535          @last_element.validate_for_stream(tags, @ignore_unknown_element)
536        end
537        @last_element = previous
538      end
539    end
540
541    def setup_next_element_in_unknown_element
542      current_element, @last_element = @last_element, nil
543      Proc.new {@last_element = current_element}
544    end
545  end
546
547  unless const_defined? :AVAILABLE_PARSER_LIBRARIES
548    AVAILABLE_PARSER_LIBRARIES = [
549      ["rss/xmlparser", :XMLParserParser],
550      ["rss/xmlscanner", :XMLScanParser],
551      ["rss/rexmlparser", :REXMLParser],
552    ]
553  end
554
555  AVAILABLE_PARSERS = []
556
557  AVAILABLE_PARSER_LIBRARIES.each do |lib, parser|
558    begin
559      require lib
560      AVAILABLE_PARSERS.push(const_get(parser))
561    rescue LoadError
562    end
563  end
564
565  if AVAILABLE_PARSERS.empty?
566    raise XMLParserNotFound
567  end
568end
569