# frozen_string_literal: false
#
# NOTE: this file builds messages and text buffers by appending to string
# literals with <<, so string literals must stay mutable here.

require "forwardable"
require "open-uri"

require "rss/rss"
require "rss/xml"

module RSS

  # Error whose message reports XML that is not well formed, optionally
  # naming the element and the approximate line of the problem.
  class NotWellFormedError < Error
    attr_reader :line, :element

    # Create a new NotWellFormedError for an error at +line+
    # in +element+.  If a block is given the return value of
    # the block ends up in the error message.
    def initialize(line=nil, element=nil)
      # Keep the arguments so that the +line+ and +element+ readers
      # declared above actually return them.
      @line = line
      @element = element
      message = "This is not well formed XML"
      if element or line
        message << "\nerror occurred"
        message << " in #{element}" if element
        message << " at about #{line} line" if line
      end
      message << "\n#{yield}" if block_given?
      super(message)
    end
  end

  # Raised at load time (see the bottom of this file) when none of the
  # libraries in AVAILABLE_PARSER_LIBRARIES could be loaded.
  class XMLParserNotFound < Error
    def initialize
      super("available XML parser was not found in " <<
            "#{AVAILABLE_PARSER_LIBRARIES.inspect}.")
    end
  end

  # Raised by Parser.default_parser= when the given parser is not one
  # of AVAILABLE_PARSERS.
  class NotValidXMLParser < Error
    def initialize(parser)
      super("#{parser} is not an available XML parser. " <<
            "Available XML parser" <<
            (AVAILABLE_PARSERS.size > 1 ? "s are " : " is ") <<
            "#{AVAILABLE_PARSERS.inspect}.")
    end
  end

  # Raised by ListenerMixin#check_ns when +prefix+ of +tag+ is not bound
  # to the required namespace URI.
  class NSError < InvalidRSSError
    attr_reader :tag, :prefix, :uri
    def initialize(tag, prefix, require_uri)
      @tag, @prefix, @uri = tag, prefix, require_uri
      super("prefix <#{prefix}> doesn't associate uri " <<
            "<#{require_uri}> in tag <#{tag}>")
    end
  end

  # Front end for the concrete XML parser classes.  It normalizes the
  # source (XML text, URI or file name) and delegates the actual work to
  # an instance of the selected parser class.
  class Parser

    extend Forwardable

    class << self

      @@default_parser = nil

      # Return the explicitly configured parser class, or the first
      # entry of AVAILABLE_PARSERS when none was configured.
      def default_parser
        @@default_parser || AVAILABLE_PARSERS.first
      end

      # Set @@default_parser to new_value if it is one of the
      # available parsers. Else raise NotValidXMLParser error.
      def default_parser=(new_value)
        if AVAILABLE_PARSERS.include?(new_value)
          @@default_parser = new_value
        else
          raise NotValidXMLParser.new(new_value)
        end
      end

      # Build a parser for +rss+ with +parser_class+, apply the
      # +do_validate+ and +ignore_unknown_element+ settings and return
      # the result of parsing.
      def parse(rss, do_validate=true, ignore_unknown_element=true,
                parser_class=default_parser)
        parser = new(rss, parser_class)
        parser.do_validate = do_validate
        parser.ignore_unknown_element = ignore_unknown_element
        parser.parse
      end
    end

    def_delegators(:@parser, :parse, :rss,
                   :ignore_unknown_element,
                   :ignore_unknown_element=, :do_validate,
                   :do_validate=)

    # +rss+ may be XML text, a URI (object or string) or a file name;
    # see #normalize_rss.  +parser_class+ is instantiated with the
    # normalized source.
    def initialize(rss, parser_class=self.class.default_parser)
      @parser = parser_class.new(normalize_rss(rss))
    end

    private

    # Try to get the XML associated with +rss+.
    # Return +rss+ if it already looks like XML, or treat it as a URI,
    # or a file to get the XML.  When none of these apply, +rss+ is
    # returned unchanged.
    def normalize_rss(rss)
      return rss if maybe_xml?(rss)

      uri = to_uri(rss)

      if uri.respond_to?(:read)
        uri.read
      elsif !tainted_source?(rss) and File.readable?(rss)
        File.open(rss) {|f| f.read}
      else
        rss
      end
    end

    # Object#tainted? was removed in Ruby 3.2, so only consult it on
    # interpreters that still provide it; otherwise nothing is tainted.
    def tainted_source?(source)
      source.respond_to?(:tainted?) and source.tainted?
    end

    # maybe_xml? tests if source is a string that looks like XML.
    def maybe_xml?(source)
      source.is_a?(String) and /</ =~ source
    end

    # Attempt to convert rss to a URI, but just return it if
    # there's a ::URI::Error
    def to_uri(rss)
      return rss if rss.is_a?(::URI::Generic)

      begin
        ::URI.parse(rss)
      rescue ::URI::Error
        rss
      end
    end
  end

  # Base class of the concrete parsers.  This class calls
  # +self.class.listener+ and +_parse+, neither of which is defined in
  # this file; presumably subclasses provide them.
  class BaseParser

    class << self
      def raise_for_undefined_entity?
        listener.raise_for_undefined_entity?
      end
    end

    # +rss+ is the source to parse.  A fresh listener is created from
    # the class returned by +self.class.listener+.
    def initialize(rss)
      @listener = self.class.listener.new
      @rss = rss
    end

    # The object built by the listener (nil before parsing).
    def rss
      @listener.rss
    end

    def ignore_unknown_element
      @listener.ignore_unknown_element
    end

    def ignore_unknown_element=(new_value)
      @listener.ignore_unknown_element = new_value
    end

    def do_validate
      @listener.do_validate
    end

    def do_validate=(new_value)
      @listener.do_validate = new_value
    end

    # Run +_parse+ unless the listener already holds a result, then
    # return the listener's result.
    def parse
      if @listener.rss.nil?
        _parse
      end
      @listener.rss
    end

  end

  # Holds the class-level registries that map (uri, tag_name) pairs to
  # accessor names and class names, and generates the +start_<tag>+
  # handlers for simple text elements.
  class BaseListener

    extend Utils

    class << self

      @@accessor_bases = {}
      @@registered_uris = {}
      @@class_names = {}

      # return the setter for the uri, tag_name pair, or nil.
      def setter(uri, tag_name)
        _getter = getter(uri, tag_name)
        if _getter
          "#{_getter}="
        else
          nil
        end
      end

      # return the getter for the uri, tag_name pair, or nil.
      def getter(uri, tag_name)
        (@@accessor_bases[uri] || {})[tag_name]
      end

      # return the tag_names for setters associated with uri
      def available_tags(uri)
        (@@accessor_bases[uri] || {}).keys
      end

      # register uri against this name.
      def register_uri(uri, name)
        @@registered_uris[name] ||= {}
        @@registered_uris[name][uri] = nil
      end

      # test if this uri is registered against this name.
      # A name that was never registered yields false, in line with
      # the nil-tolerant lookups in #getter and #available_tags.
      def uri_registered?(uri, name)
        (@@registered_uris[name] || {}).has_key?(uri)
      end

      # record class_name for the supplied uri and tag_name
      def install_class_name(uri, tag_name, class_name)
        @@class_names[uri] ||= {}
        @@class_names[uri][tag_name] = class_name
      end

      # retrieve class_name for the supplied uri and tag_name
      # If it doesn't exist, derive it from the tag_name: "_" and "-"
      # separators are removed, the letter following each one is
      # upcased, and the first character is upcased.
      def class_name(uri, tag_name)
        name = (@@class_names[uri] || {})[tag_name]
        return name if name

        tag_name = tag_name.gsub(/[_\-]([a-z]?)/) {$1.upcase}
        tag_name[0, 1].upcase + tag_name[1..-1]
      end

      # Record +accessor_base+ for the uri, name pair and make sure a
      # +start_<name>+ handler exists for it.
      # NOTE(review): get_file_and_line_from_caller is not defined in
      # this file; presumably it comes from Utils - confirm.
      def install_get_text_element(uri, name, accessor_base)
        install_accessor_base(uri, name, accessor_base)
        def_get_text_element(uri, name, *get_file_and_line_from_caller(1))
      end

      def raise_for_undefined_entity?
        true
      end

      private
      # set the accessor for the uri, tag_name pair
      # (a trailing "=" on accessor_base is dropped).
      def install_accessor_base(uri, tag_name, accessor_base)
        @@accessor_bases[uri] ||= {}
        @@accessor_bases[uri][tag_name] = accessor_base.chomp("=")
      end

      # Register +uri+ for +element_name+ and, unless a private
      # +start_<element_name>+ method already exists, define one.
      # +file+ and +line+ are accepted but not used here.
      def def_get_text_element(uri, element_name, file, line)
        register_uri(uri, element_name)
        method_name = "start_#{element_name}"
        unless private_method_defined?(method_name)
          define_method(method_name) do |name, prefix, attrs, ns|
            # Use a dedicated local so the +uri+ argument captured by
            # this closure is not overwritten on every call.
            element_uri = _ns(ns, prefix)
            if self.class.uri_registered?(element_uri, element_name)
              start_get_text_element(name, prefix, ns, element_uri)
            else
              start_else_element(name, prefix, attrs, ns)
            end
          end
          private(method_name)
        end
      end
    end
  end

  # Event handlers (xmldecl, instruction, tag_start, tag_end, text) and
  # the bookkeeping stacks shared by listeners.  Each tag_start is
  # expected to leave one entry on @proc_stack that the matching
  # tag_end pops and calls.
  module ListenerMixin
    attr_reader :rss

    attr_accessor :ignore_unknown_element
    attr_accessor :do_validate

    def initialize
      @rss = nil
      @ignore_unknown_element = true
      @do_validate = true
      @ns_stack = [{"xml" => :xml}]
      @tag_stack = [[]]
      @text_stack = ['']
      @proc_stack = []
      @last_element = nil
      @version = @encoding = @standalone = nil
      @xml_stylesheets = []
      @xml_child_mode = false
      @xml_element = nil
      @last_xml_element = nil
    end

    # set instance vars for version, encoding, standalone
    def xmldecl(version, encoding, standalone)
      @version, @encoding, @standalone = version, encoding, standalone
    end

    # Handle a processing instruction.  Only "xml-stylesheet"
    # instructions that carry an "href" parameter are recorded.
    def instruction(name, content)
      if name == "xml-stylesheet"
        params = parse_pi_content(content)
        if params.has_key?("href")
          @xml_stylesheets << XMLStyleSheet.new(params)
        end
      end
    end

    # Handle the start of element +name+ with +attributes+: push a new
    # text buffer and namespace scope, record the tag, then either
    # build an XML::Element (XML child mode) or dispatch to the
    # matching start handler.
    def tag_start(name, attributes)
      @text_stack.push('')

      ns = @ns_stack.last.dup
      attrs = {}
      attributes.each do |n, v|
        if /\Axmlns(?:\z|:)/ =~ n
          # The text after "xmlns" / "xmlns:" is the declared prefix
          # ("" for the default namespace).  Regexp.last_match is used
          # rather than $POSTMATCH so this does not depend on another
          # file having loaded the English library.
          ns[Regexp.last_match.post_match] = v
        else
          attrs[n] = v
        end
      end
      @ns_stack.push(ns)

      prefix, local = split_name(name)
      @tag_stack.last.push([_ns(ns, prefix), local])
      @tag_stack.push([])
      if @xml_child_mode
        previous = @last_xml_element
        element_attrs = attributes.dup
        unless previous
          # Outermost XML child: copy the namespace bindings in scope
          # onto the element unless it already declares them itself.
          ns.each do |ns_prefix, value|
            next if ns_prefix == "xml"
            key = ns_prefix.empty? ? "xmlns" : "xmlns:#{ns_prefix}"
            element_attrs[key] ||= value
          end
        end
        next_element = XML::Element.new(local,
                                        prefix.empty? ? nil : prefix,
                                        _ns(ns, prefix),
                                        element_attrs)
        previous << next_element if previous
        @last_xml_element = next_element
        pr = Proc.new do |text, tags|
          if previous
            @last_xml_element = previous
          else
            @xml_element = @last_xml_element
            @last_xml_element = nil
          end
        end
        @proc_stack.push(pr)
      else
        if @rss.nil? and respond_to?("initial_start_#{local}", true)
          __send__("initial_start_#{local}", local, prefix, attrs, ns.dup)
        elsif respond_to?("start_#{local}", true)
          __send__("start_#{local}", local, prefix, attrs, ns.dup)
        else
          start_else_element(local, prefix, attrs, ns.dup)
        end
      end
    end

    # Handle the end of element +name+: pop the text buffer, the child
    # tag list, the pending proc (called with both, if present) and the
    # namespace scope.
    def tag_end(name)
      if DEBUG
        p "end tag #{name}"
        p @tag_stack
      end
      text = @text_stack.pop
      tags = @tag_stack.pop
      pr = @proc_stack.pop
      pr.call(text, tags) unless pr.nil?
      @ns_stack.pop
    end

    # Handle character data: append it to the current XML element in
    # XML child mode, otherwise to the current text buffer.
    def text(data)
      if @xml_child_mode
        @last_xml_element << data if @last_xml_element
      else
        @text_stack.last << data
      end
    end

    private
    # URI bound to +prefix+ in +ns+, or "" when the prefix is unbound.
    def _ns(ns, prefix)
      ns.fetch(prefix, "")
    end

    CONTENT_PATTERN = /\s*([^=]+)=(["'])([^\2]+?)\2/
    # Extract every name="value" pair from content.
    # Values may be quoted with single or double quotes, according to
    # the constant CONTENT_PATTERN.  Return a Hash of name => value;
    # a later pair with the same name replaces an earlier one.
    def parse_pi_content(content)
      params = {}
      content.scan(CONTENT_PATTERN) do |name, quote, value|
        params[name] = value
      end
      params
    end

    # Fallback start handler: look for a class named after the tag
    # among the constants of the current element's class.  When there
    # is none, either skip the element or raise NotExpectedTagError,
    # depending on @do_validate / @ignore_unknown_element.
    def start_else_element(local, prefix, attrs, ns)
      class_name = self.class.class_name(_ns(ns, prefix), local)
      current_class = @last_element.class
      if known_class?(current_class, class_name)
        next_class = current_class.const_get(class_name)
        start_have_something_element(local, prefix, attrs, ns, next_class)
      else
        if !@do_validate or @ignore_unknown_element
          @proc_stack.push(setup_next_element_in_unknown_element)
        else
          parent = "ROOT ELEMENT???"
          # With no current element, current_class is NilClass, which
          # has no tag_name; keep the placeholder in that case so the
          # intended NotExpectedTagError is raised.
          if current_class.respond_to?(:tag_name) and current_class.tag_name
            parent = current_class.tag_name
          end
          raise NotExpectedTagError.new(local, _ns(ns, prefix), parent)
        end
      end
    end

    # Two variants, chosen by whether Module#const_defined? accepts the
    # optional second (inherit) argument on this interpreter.
    if Module.method(:const_defined?).arity == -1
      def known_class?(target_class, class_name)
        class_name and
          (target_class.const_defined?(class_name, false) or
           target_class.constants.include?(class_name.to_sym))
      end
    else
      def known_class?(target_class, class_name)
        class_name and
          (target_class.const_defined?(class_name) or
           target_class.constants.include?(class_name))
      end
    end

    NAMESPLIT = /^(?:([\w:][-\w.]*):)?([\w:][-\w.]*)/
    # Split a qualified name into [prefix, local]; prefix is '' when
    # the name has none.
    def split_name(name)
      name =~ NAMESPLIT
      [$1 || '', $2]
    end

    # Return true when +prefix+ is bound to +require_uri+ in +ns+.
    # Otherwise: return false when unknown elements are ignored, raise
    # NSError when validating, or else bind the prefix to the required
    # URI in the current namespace scope and return true.
    # +ignore_unknown_element+ defaults to @ignore_unknown_element.
    def check_ns(tag_name, prefix, ns, require_uri, ignore_unknown_element=nil)
      if _ns(ns, prefix) == require_uri
        true
      else
        if ignore_unknown_element.nil?
          ignore_unknown_element = @ignore_unknown_element
        end

        if ignore_unknown_element
          false
        elsif @do_validate
          raise NSError.new(tag_name, prefix, require_uri)
        else
          # Force bind required URI with prefix
          @ns_stack.last[prefix] = require_uri
          true
        end
      end
    end

    # Push a proc that, at the end of the element, stores the collected
    # text on the current element through the registered setter.  When
    # validating, a value that is already set raises TooMuchTagError,
    # and an element without a usable setter raises NotExpectedTagError
    # unless unknown elements are ignored.
    def start_get_text_element(tag_name, prefix, ns, required_uri)
      pr = Proc.new do |text, tags|
        setter = self.class.setter(required_uri, tag_name)
        if setter and @last_element.respond_to?(setter)
          if @do_validate
            getter = self.class.getter(required_uri, tag_name)
            if @last_element.__send__(getter)
              raise TooMuchTagError.new(tag_name, @last_element.tag_name)
            end
          end
          @last_element.__send__(setter, text.to_s)
        else
          if @do_validate and !@ignore_unknown_element
            raise NotExpectedTagError.new(tag_name, _ns(ns, prefix),
                                          @last_element.tag_name)
          end
        end
      end
      @proc_stack.push(pr)
    end

    # Start an element backed by +klass+ when its namespace checks out;
    # otherwise treat it as an unknown element.
    def start_have_something_element(tag_name, prefix, attrs, ns, klass)
      if check_ns(tag_name, prefix, ns, klass.required_uri)
        attributes = collect_attributes(tag_name, prefix, attrs, ns, klass)
        @proc_stack.push(setup_next_element(tag_name, klass, attributes))
      else
        @proc_stack.push(setup_next_element_in_unknown_element)
      end
    end

    # Build a Hash of attribute name => value for the attributes that
    # +klass+ declares via get_attributes, looking each one up in
    # +attrs+ under every prefix bound to one of its URIs (and without
    # a prefix when "" is among its URIs).  When validating, a missing
    # required attribute raises MissingAttributeError.
    def collect_attributes(tag_name, prefix, attrs, ns, klass)
      attributes = {}
      klass.get_attributes.each do |a_name, a_uri, required, element_name|
        if a_uri.is_a?(String) or !a_uri.respond_to?(:include?)
          a_uri = [a_uri]
        end
        val = nil
        unless a_uri == [""]
          # Block-local names are used so the +prefix+ argument of this
          # method is not overwritten by the iteration.
          ns.each do |ns_prefix, ns_uri|
            next unless a_uri.include?(ns_uri)
            val = attrs["#{ns_prefix}:#{a_name}"]
            break if val
          end
        end
        if val.nil? and a_uri.include?("")
          val = attrs[a_name]
        end

        if @do_validate and required and val.nil?
          unless a_uri.include?("")
            # Qualify the reported name with the first matching prefix
            # only; prepending every match would produce names such as
            # "b:a:name".
            bound_prefix, = ns.find {|_, ns_uri| a_uri.include?(ns_uri)}
            a_name = "#{bound_prefix}:#{a_name}" if bound_prefix
          end
          raise MissingAttributeError.new(tag_name, a_name)
        end

        attributes[a_name] = val
      end
      attributes
    end

    # Create the +klass+ element, attach it to the current element and
    # make it current.  Return the proc to run at the end of the
    # element: it stores the content (XML or text), validates when
    # requested, and restores the previous current element.
    def setup_next_element(tag_name, klass, attributes)
      previous = @last_element
      next_element = klass.new(@do_validate, attributes)
      previous.set_next_element(tag_name, next_element)
      @last_element = next_element
      @last_element.parent = previous if klass.need_parent?
      @xml_child_mode = @last_element.have_xml_content?

      Proc.new do |text, tags|
        p(@last_element.class) if DEBUG
        if @xml_child_mode
          @last_element.content = @xml_element.to_s
          xml_setter = @last_element.class.xml_setter
          @last_element.__send__(xml_setter, @xml_element)
          @xml_element = nil
          @xml_child_mode = false
        else
          if klass.have_content?
            if @last_element.need_base64_encode?
              # unpack("m") decodes the base64 text.
              text = text.lstrip.unpack("m").first
            end
            @last_element.content = text
          end
        end
        if @do_validate
          @last_element.validate_for_stream(tags, @ignore_unknown_element)
        end
        @last_element = previous
      end
    end

    # Clear the current element while inside an unknown element and
    # return a proc that restores it at the end of that element.
    def setup_next_element_in_unknown_element
      current_element, @last_element = @last_element, nil
      Proc.new {@last_element = current_element}
    end
  end

  # [library to require, parser class name] pairs, tried in order.
  # Only defined here when nothing defined it before this file loaded.
  unless const_defined? :AVAILABLE_PARSER_LIBRARIES
    AVAILABLE_PARSER_LIBRARIES = [
      ["rss/xmlparser", :XMLParserParser],
      ["rss/xmlscanner", :XMLScanParser],
      ["rss/rexmlparser", :REXMLParser],
    ]
  end

  AVAILABLE_PARSERS = []

  AVAILABLE_PARSER_LIBRARIES.each do |lib, parser|
    begin
      require lib
      AVAILABLE_PARSERS.push(const_get(parser))
    rescue LoadError
      # Best effort: a backend that cannot be loaded is simply skipped.
    end
  end

  if AVAILABLE_PARSERS.empty?
    raise XMLParserNotFound
  end
end