require 'rexml/parseexception'
require 'rexml/undefinednamespaceexception'
require 'rexml/source'
require 'set'

module REXML
  module Parsers
    # = Using the Pull Parser
    # <em>This API is experimental, and subject to change.</em>
    #  parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
    #  while parser.has_next?
    #    res = parser.next
    #    puts res[1]['att'] if res.start_tag? and res[0] == 'b'
    #  end
    # See the PullEvent class for information on the content of the results.
    # The data is identical to the arguments passed for the various events to
    # the StreamListener API.
    #
    # Notice that:
    #  parser = PullParser.new( "<a>BAD DOCUMENT" )
    #  while parser.has_next?
    #    res = parser.next
    #    raise res[1] if res.error?
    #  end
    #
    # Nat Price gave me some good ideas for the API.
    class BaseParser
      LETTER = '[:alpha:]'
      DIGIT = '[:digit:]'

      COMBININGCHAR = '' # TODO
      EXTENDER = ''      # TODO

      NCNAME_STR= "[#{LETTER}_:][-[:alnum:]._:#{COMBININGCHAR}#{EXTENDER}]*"
      NAME_STR= "(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})"
      UNAME_STR= "(?:#{NCNAME_STR}:)?#{NCNAME_STR}"

      NAMECHAR = '[\-\w\.:]'
      NAME = "([\\w:]#{NAMECHAR}*)"
      NMTOKEN = "(?:#{NAMECHAR})+"
      NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*"
      REFERENCE = "&(?:#{NAME};|#\\d+;|#x[0-9a-fA-F]+;)"
      REFERENCE_RE = /#{REFERENCE}/

      DOCTYPE_START = /\A\s*<!DOCTYPE\s/um
      DOCTYPE_PATTERN = /\s*<!DOCTYPE\s+(.*?)(\[|>)/um
      # Captures: 1 = qualified name, 2 = prefix, 3 = local part,
      # 4 = quote character, 5 = value.
      ATTRIBUTE_PATTERN = /\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\4/um
      COMMENT_START = /\A<!--/u
      COMMENT_PATTERN = /<!--(.*?)-->/um
      CDATA_START = /\A<!\[CDATA\[/u
      CDATA_END = /^\s*\]\s*>/um
      CDATA_PATTERN = /<!\[CDATA\[(.*?)\]\]>/um
      XMLDECL_START = /\A<\?xml\s/u
      XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um
      INSTRUCTION_START = /\A<\?/u
      INSTRUCTION_PATTERN = /<\?(.*?)(\s+.*?)?\?>/um
      # Captures: 1 = qualified name, 2 = prefix, 3 = local part,
      # 4 = raw attribute text, 5 = quote character, 6 = "/" when self-closing.
      TAG_MATCH = /^<((?>#{NAME_STR}))\s*((?>\s+#{UNAME_STR}\s*=\s*(["']).*?\5)*)\s*(\/)?>/um
      CLOSE_MATCH = /^\s*<\/(#{NAME_STR})\s*>/um

      VERSION = /\bversion\s*=\s*["'](.*?)['"]/um
      ENCODING = /\bencoding\s*=\s*["'](.*?)['"]/um
      STANDALONE = /\bstandalone\s*=\s*["'](.*?)['"]/um

      ENTITY_START = /^\s*<!ENTITY/
      IDENTITY = /^([!\*\w\-]+)(\s+#{NCNAME_STR})?(\s+["'](.*?)['"])?(\s+['"](.*?)["'])?/u
      ELEMENTDECL_START = /^\s*<!ELEMENT/um
      ELEMENTDECL_PATTERN = /^\s*(<!ELEMENT.*?)>/um
      SYSTEMENTITY = /^\s*(%.*?;)\s*$/um
      ENUMERATION = "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)"
      NOTATIONTYPE = "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)"
      ENUMERATEDTYPE = "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))"
      ATTTYPE = "(CDATA|ID|IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS|#{ENUMERATEDTYPE})"
      ATTVALUE = "(?:\"((?:[^<&\"]|#{REFERENCE})*)\")|(?:'((?:[^<&']|#{REFERENCE})*)')"
      DEFAULTDECL = "(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))"
      ATTDEF = "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}"
      ATTDEF_RE = /#{ATTDEF}/
      ATTLISTDECL_START = /^\s*<!ATTLIST/um
      ATTLISTDECL_PATTERN = /^\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um
      NOTATIONDECL_START = /^\s*<!NOTATION/um
      PUBLIC = /^\s*<!NOTATION\s+(\w[\-\w]*)\s+(PUBLIC)\s+(["'])(.*?)\3(?:\s+(["'])(.*?)\5)?\s*>/um
      SYSTEM = /^\s*<!NOTATION\s+(\w[\-\w]*)\s+(SYSTEM)\s+(["'])(.*?)\3\s*>/um

      TEXT_PATTERN = /\A([^<]*)/um

      # Entity constants
      PUBIDCHAR = "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#"
      SYSTEMLITERAL = %Q{((?:"[^"]*")|(?:'[^']*'))}
      PUBIDLITERAL = %Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')}
      EXTERNALID = "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))"
      NDATADECL = "\\s+NDATA\\s+#{NAME}"
      PEREFERENCE = "%#{NAME};"
      ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))}
      PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})"
      ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
      PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
      GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
      ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um

      # An ampersand that does not begin a named entity reference.
      EREFERENCE = /&(?!#{NAME};)/

      # Predefined entities. Each value is
      #   [ pattern matching the escaped form, escaped form,
      #     literal character, pattern matching the literal character ]
      # #unnormalize uses slots 0 and 2, #normalize uses slots 3 and 1.
      DEFAULT_ENTITIES = {
        'gt' => [/&gt;/, '&gt;', '>', />/],
        'lt' => [/&lt;/, '&lt;', '<', /</],
        'quot' => [/&quot;/, '&quot;', '"', /"/],
        "apos" => [/&apos;/, "&apos;", "'", /'/]
      }


      ######################################################################
      # These are patterns to identify common markup errors, to make the
      # error messages more informative.
      ######################################################################
      MISSING_ATTRIBUTE_QUOTES = /^<#{NAME_STR}\s+#{NAME_STR}\s*=\s*[^"']/um

      # Creates a parser reading from +source+; +source+ is handed to
      # SourceFactory.create_from (see #stream=).
      def initialize( source )
        self.stream = source
        @listeners = []
      end

      # Registers +listener+; every event returned by #pull is also passed to
      # <tt>listener.receive</tt>.
      def add_listener( listener )
        @listeners << listener
      end

      attr_reader :source

      # Replaces the input source and resets all parser state (open tags,
      # pushed-back events, namespace stack, document status).
      def stream=( source )
        @source = SourceFactory.create_from( source )
        @closed = nil
        @document_status = nil
        @tags = []
        @stack = []
        @entities = []
        @nsstack = []
      end

      # Returns the source's position when the source exposes one, else 0.
      def position
        if @source.respond_to? :position
          @source.position
        else
          # FIXME
          0
        end
      end

      # Returns true if there are no more events
      def empty?
        @source.empty? && @stack.empty?
      end

      # Returns true if there are more events.  Synonymous with !empty?
      def has_next?
        !empty?
      end

      # Push an event back on the head of the stream.  This method
      # has (theoretically) infinite depth.
      def unshift token
        @stack.unshift(token)
      end

      # Peek at the +depth+ event in the stack.  The first element on the stack
      # is at depth 0.  If +depth+ is -1, will parse to the end of the input
      # stream and return the last event, which is always :end_document.
      # Be aware that this causes the stream to be parsed up to the +depth+
      # event, so you can effectively pre-parse the entire document (pull the
      # entire thing into memory) using this method.
      def peek depth=0
        raise %Q[Illegal argument "#{depth}"] if depth < -1
        temp = []
        if depth == -1
          temp.push(pull()) until empty?
        else
          while @stack.size+temp.size < depth+1
            temp.push(pull())
          end
        end
        @stack += temp if temp.size > 0
        @stack[depth]
      end

      # Returns the next event as an Array whose first element is the event
      # type symbol, after notifying every registered listener.
      def pull
        pull_event.tap do |event|
          @listeners.each do |listener|
            listener.receive event
          end
        end
      end

      # Produces the next raw event array.  Handles, in order: a pending
      # :end_element for a self-closing tag, end of input, pushed-back events,
      # the document prolog, the DOCTYPE internal subset, and finally regular
      # element/text content.
      #
      # Raises REXML::ParseException (or UndefinedNamespaceException) on
      # malformed content.
      def pull_event
        if @closed
          # The previous event was a self-closing start tag; emit its end.
          x, @closed = @closed, nil
          return [ :end_element, x ]
        end
        return [ :end_document ] if empty?
        return @stack.shift if @stack.size > 0
        #STDERR.puts @source.encoding
        @source.read if @source.buffer.size<2
        #STDERR.puts "BUFFER = #{@source.buffer.inspect}"
        if @document_status == nil
          #@source.consume( /^\s*/um )
          word = @source.match( /^((?:\s+)|(?:<[^>]*>))/um )
          word = word[1] unless word.nil?
          #STDERR.puts "WORD = #{word.inspect}"
          case word
          when COMMENT_START
            return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
          when XMLDECL_START
            #STDERR.puts "XMLDECL"
            results = @source.match( XMLDECL_PATTERN, true )[1]
            version = VERSION.match( results )
            version = version[1] unless version.nil?
            encoding = ENCODING.match(results)
            encoding = encoding[1] unless encoding.nil?
            if need_source_encoding_update?(encoding)
              @source.encoding = encoding
            end
            if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
              encoding = "UTF-16"
            end
            standalone = STANDALONE.match(results)
            standalone = standalone[1] unless standalone.nil?
            return [ :xmldecl, version, encoding, standalone ]
          when INSTRUCTION_START
            return [ :processing_instruction, *@source.match(INSTRUCTION_PATTERN, true)[1,2] ]
          when DOCTYPE_START
            md = @source.match( DOCTYPE_PATTERN, true )
            @nsstack.unshift( Set.new )
            identity = md[1]
            close = md[2]
            identity =~ IDENTITY
            name = $1
            raise REXML::ParseException.new("DOCTYPE is missing a name") if name.nil?
            pub_sys = $2.nil? ? nil : $2.strip
            long_name = $4.nil? ? nil : $4.strip
            uri = $6.nil? ? nil : $6.strip
            args = [ :start_doctype, name, pub_sys, long_name, uri ]
            if close == ">"
              # No internal subset: queue the matching :end_doctype now.
              @document_status = :after_doctype
              @source.read if @source.buffer.size<2
              @source.match(/^\s*/um, true)
              @stack << [ :end_doctype ]
            else
              @document_status = :in_doctype
            end
            return args
          when /^\s+/
            # Leading whitespace: stay in the prolog and fall through.
          else
            @document_status = :after_doctype
            @source.read if @source.buffer.size<2
            @source.match(/\s*/um, true)
            if @source.encoding == "UTF-8"
              @source.buffer.force_encoding(::Encoding::UTF_8)
            end
          end
        end
        if @document_status == :in_doctype
          md = @source.match(/\s*(.*?>)/um)
          case md[1]
          when SYSTEMENTITY
            match = @source.match( SYSTEMENTITY, true )[1]
            return [ :externalentity, match ]

          when ELEMENTDECL_START
            return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]

          when ENTITY_START
            match = @source.match( ENTITYDECL, true ).to_a.compact
            match[0] = :entitydecl
            ref = false
            if match[1] == '%'
              ref = true
              match.delete_at 1
            end
            # Now we have to sort out what kind of entity reference this is
            if match[2] == 'SYSTEM'
              # External reference
              match[3] = match[3][1..-2] # PUBID
              match.delete_at(4) if match.size > 4 # Chop out NDATA decl
              # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
            elsif match[2] == 'PUBLIC'
              # External reference
              match[3] = match[3][1..-2] # PUBID
              match[4] = match[4][1..-2] # HREF
              # match is [ :entity, name, PUBLIC, pubid, href ]
            else
              match[2] = match[2][1..-2]
              match.pop if match.size == 4
              # match is [ :entity, name, value ]
            end
            match << '%' if ref
            return match
          when ATTLISTDECL_START
            md = @source.match( ATTLISTDECL_PATTERN, true )
            raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
            element = md[1]
            contents = md[0]

            pairs = {}
            values = md[0].scan( ATTDEF_RE )
            values.each do |attdef|
              unless attdef[3] == "#IMPLIED"
                attdef.compact!
                val = attdef[3]
                val = attdef[4] if val == "#FIXED "
                pairs[attdef[0]] = val
                if attdef[0] =~ /^xmlns:(.*)/
                  @nsstack[0] << $1
                end
              end
            end
            return [ :attlistdecl, element, pairs, contents ]
          when NOTATIONDECL_START
            md = nil
            if @source.match( PUBLIC )
              md = @source.match( PUBLIC, true )
              vals = [md[1],md[2],md[4],md[6]]
            elsif @source.match( SYSTEM )
              md = @source.match( SYSTEM, true )
              vals = [md[1],md[2],nil,md[4]]
            else
              raise REXML::ParseException.new( "error parsing notation: no matching pattern", @source )
            end
            return [ :notationdecl, *vals ]
          when CDATA_END
            @document_status = :after_doctype
            @source.match( CDATA_END, true )
            return [ :end_doctype ]
          end
        end
        begin
          if @source.buffer[0] == ?<
            if @source.buffer[1] == ?/
              @nsstack.shift
              last_tag = @tags.pop
              #md = @source.match_to_consume( '>', CLOSE_MATCH)
              md = @source.match( CLOSE_MATCH, true )
              raise REXML::ParseException.new( "Missing end tag for "+
                "'#{last_tag}' (got \"#{md[1]}\")",
                @source) unless last_tag == md[1]
              return [ :end_element, last_tag ]
            elsif @source.buffer[1] == ?!
              md = @source.match(/\A(\s*[^>]*>)/um)
              #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
              raise REXML::ParseException.new("Malformed node", @source) unless md
              if md[0][2] == ?-
                md = @source.match( COMMENT_PATTERN, true )
                # Report an unterminated comment explicitly instead of
                # dereferencing a nil match below.
                raise REXML::ParseException.new("Unclosed comment", @source) if md.nil?

                case md[1]
                when /--/, /-\z/
                  raise REXML::ParseException.new("Malformed comment", @source)
                end

                return [ :comment, md[1] ]
              else
                md = @source.match( CDATA_PATTERN, true )
                return [ :cdata, md[1] ] if md
              end
              raise REXML::ParseException.new( "Declarations can only occur "+
                "in the doctype declaration.", @source)
            elsif @source.buffer[1] == ??
              md = @source.match( INSTRUCTION_PATTERN, true )
              return [ :processing_instruction, md[1], md[2] ] if md
              raise REXML::ParseException.new( "Bad instruction declaration",
                @source)
            else
              # Get the next tag
              md = @source.match(TAG_MATCH, true)
              unless md
                # Check for missing attribute quotes
                raise REXML::ParseException.new("missing attribute quote", @source) if @source.match(MISSING_ATTRIBUTE_QUOTES )
                raise REXML::ParseException.new("malformed XML: missing tag start", @source)
              end
              attributes = {}
              prefixes = Set.new
              prefixes << md[2] if md[2]
              @nsstack.unshift(curr_ns=Set.new)
              if md[4].size > 0
                attrs = md[4].scan( ATTRIBUTE_PATTERN )
                raise REXML::ParseException.new( "error parsing attributes: [#{attrs.join ', '}], excess = \"#$'\"", @source) if $' and $'.strip.size > 0
                attrs.each do |attr_name, prefix, local_part, quote, value|
                  if prefix == "xmlns"
                    if local_part == "xml"
                      if value != "http://www.w3.org/XML/1998/namespace"
                        msg = "The 'xml' prefix must not be bound to any other namespace "+
                        "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
                        raise REXML::ParseException.new( msg, @source, self )
                      end
                    elsif local_part == "xmlns"
                      msg = "The 'xmlns' prefix must not be declared "+
                      "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
                      raise REXML::ParseException.new( msg, @source, self)
                    end
                    curr_ns << local_part
                  elsif prefix
                    prefixes << prefix unless prefix == "xml"
                  end

                  if attributes.has_key?(attr_name)
                    msg = "Duplicate attribute #{attr_name.inspect}"
                    raise REXML::ParseException.new(msg, @source, self)
                  end

                  attributes[attr_name] = value
                end
              end

              # Verify that all of the prefixes have been defined
              prefixes.each do |prefix|
                unless @nsstack.find{|k| k.member?(prefix)}
                  raise UndefinedNamespaceException.new(prefix,@source,self)
                end
              end

              if md[6]
                # Self-closing tag: remember it so the next call emits
                # the matching :end_element.
                @closed = md[1]
                @nsstack.shift
              else
                @tags.push( md[1] )
              end
              return [ :start_element, md[1], attributes ]
            end
          else
            md = @source.match( TEXT_PATTERN, true )
            if md[0].length == 0
              @source.match( /(\s+)/, true )
            end
            #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
            #return [ :text, "" ] if md[0].length == 0
            # unnormalized = Text::unnormalize( md[1], self )
            # return PullEvent.new( :text, md[1], unnormalized )
            return [ :text, md[1] ]
          end
        rescue REXML::UndefinedNamespaceException
          raise
        rescue REXML::ParseException
          raise
        rescue => error
          # Only StandardError is wrapped; Interrupt, SystemExit and other
          # non-StandardError exceptions propagate untouched.
          raise REXML::ParseException.new( "Exception parsing",
            @source, self, error )
        end
        return [ :dummy ]
      end
      private :pull_event

      # Looks up +reference+ (an entity name without "&" and ";") first in
      # +entities+ (when given), then among the predefined entities, and
      # returns its unnormalized value, or nil when the name is unknown.
      def entity( reference, entities )
        value = nil
        value = entities[ reference ] if entities
        if not value
          value = DEFAULT_ENTITIES[ reference ]
          value = value[2] if value
        end
        unnormalize( value, entities ) if value
      end

      # Escapes all possible entities
      #
      # input::         the text to escape; it is not modified.
      # entities::      optional name => value map; each occurrence of a value
      #                 is replaced by "&name;".
      # entity_filter:: optional collection of entity names from +entities+
      #                 that must NOT be substituted.
      #
      # Returns a new String.
      def normalize( input, entities=nil, entity_filter=nil )
        # dup (not clone) so a frozen input still yields a mutable copy.
        copy = input.dup
        # Doing it like this rather than in a loop improves the speed
        copy.gsub!( EREFERENCE, '&amp;' )
        entities.each do |key, value|
          copy.gsub!( value, "&#{key};" ) unless entity_filter and
                                      entity_filter.include?(key)
        end if entities
        copy.gsub!( EREFERENCE, '&amp;' )
        DEFAULT_ENTITIES.each do |key, value|
          copy.gsub!( value[3], value[1] )
        end
        copy
      end

      # Unescapes all possible entities
      #
      # string::   the text to unescape; it is not modified.
      # entities:: optional name => value map consulted before the
      #            predefined entities.
      # filter::   optional collection of entity names to leave untouched.
      #
      # Returns a new String with line endings converted to "\n", numeric
      # character references decoded, and known entity references expanded.
      def unnormalize( string, entities=nil, filter=nil )
        # dup (not clone) so a frozen input still yields a mutable copy.
        rv = string.dup
        rv.gsub!( /\r\n?/, "\n" )
        matches = rv.scan( REFERENCE_RE )
        return rv if matches.size == 0
        # Decimal (&#65;) and hexadecimal (&#x41;) character references.
        rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
          m=$1
          m = "0#{m}" if m[0] == ?x
          [Integer(m)].pack('U*')
        }
        # Keep only named references; numeric ones captured nil.
        matches.collect!{|x|x[0]}.compact!
        if matches.size > 0
          matches.each do |entity_reference|
            unless filter and filter.include?(entity_reference)
              entity_value = entity( entity_reference, entities )
              if entity_value
                re = /&#{entity_reference};/
                rv.gsub!( re, entity_value )
              else
                er = DEFAULT_ENTITIES[entity_reference]
                rv.gsub!( er[0], er[2] ) if er
              end
            end
          end
          # Ampersands are restored last so they cannot form new references.
          rv.gsub!( /&amp;/, '&' )
        end
        rv
      end

      private
      # Returns true when the encoding named in the XML declaration should be
      # applied to the source: false for nil and for plain "UTF-16"
      # (case-insensitive), true otherwise.
      def need_source_encoding_update?(xml_declaration_encoding)
        return false if xml_declaration_encoding.nil?
        return false if /\AUTF-16\z/i =~ xml_declaration_encoding
        true
      end
    end
  end
end

=begin
  case event[0]
  when :start_element
  when :text
  when :end_element
  when :processing_instruction
  when :cdata
  when :comment
  when :xmldecl
  when :start_doctype
  when :end_doctype
  when :externalentity
  when :elementdecl
  when :entity
  when :attlistdecl
  when :notationdecl
  when :end_doctype
  end
=end