# coding: US-ASCII
# frozen_string_literal: false
require 'rexml/encoding'

module REXML
  # Generates Source-s.  USE THIS CLASS.
  class SourceFactory
    # Generates a Source object
    # @param arg Either a String, an IO-like object (responds to read,
    #   readline and eof?), or an existing Source
    # @return an IOSource wrapping the String/IO, or +arg+ itself when it
    #   already is a Source
    # @raise [RuntimeError] if +arg+ is none of the above
    def self.create_from(arg)
      if arg.respond_to?(:read) &&
          arg.respond_to?(:readline) &&
          arg.respond_to?(:nil?) &&
          arg.respond_to?(:eof?)
        IOSource.new(arg)
      elsif arg.respond_to?(:to_str)
        # Loaded lazily: only needed when a String has to be wrapped.
        require 'stringio'
        IOSource.new(StringIO.new(arg))
      elsif arg.kind_of?(Source)
        arg
      else
        raise "#{arg.class} is not a valid input stream. It must walk \n"+
          "like either a String, an IO, or a Source."
      end
    end
  end

  # A Source can be searched for patterns, and wraps buffers and other
  # objects and provides consumption of text
  class Source
    include Encoding
    # The current buffer (what we're going to read next)
    attr_reader :buffer
    # The line number of the last consumed text
    attr_reader :line
    attr_reader :encoding

    # Constructor
    # @param arg must be a String, and should be a valid XML document.
    #   The string is modified in place (its encoding tag is changed and a
    #   leading byte order mark, if any, is removed).
    # @param encoding if non-null, sets the encoding of the source to this
    #   value, overriding all encoding detection
    def initialize(arg, encoding=nil)
      @orig = @buffer = arg
      if encoding
        self.encoding = encoding
      else
        detect_encoding
      end
      @line = 0
    end


    # Inherited from Encoding
    # Overridden to support optimized en/decoding
    def encoding=(enc)
      # The inherited setter returns a false value when nothing changed.
      return unless super
      encoding_updated
    end

    # Scans the source for a given pattern.  Note, that this is not your
    # usual scan() method.  For one thing, the pattern argument has some
    # requirements; for another, the source can be consumed.  You can easily
    # confuse this method.  Originally, the patterns were easier
    # to construct and this method more robust, because this method
    # generated search regexes on the fly; however, this was
    # computationally expensive and slowed down the entire REXML package
    # considerably, since this is by far the most commonly called method.
    # @param pattern must be a Regexp, and must be in the form of
    #   /^\s*(#{your pattern, with no groups})(.*)/.  The first group
    #   will be returned; the second group is used if the consume flag is
    #   set.
    # @param consume if true, the pattern returned will be consumed, leaving
    #   everything after it in the Source.
    # @return the pattern, if found, or nil if the Source is empty or the
    #   pattern is not found.
    def scan(pattern, cons=false)
      return nil if @buffer.nil?
      rv = @buffer.scan(pattern)
      # $' is the text following the last match made by String#scan above.
      @buffer = $' if cons && rv.size > 0
      rv
    end

    # No-op here; IOSource overrides it to pull more data into the buffer.
    def read
    end

    # Drops everything up to and including the first match of +pattern+.
    # The buffer is left untouched when the pattern does not match.
    def consume( pattern )
      md = pattern.match(@buffer)
      @buffer = md.post_match if md
    end

    # @param char unused by this implementation
    # @return the MatchData for +pattern+ against the buffer, or nil
    def match_to( char, pattern )
      pattern.match(@buffer)
    end

    # Like match_to, but consumes the matched text on success.
    # @param char unused by this implementation
    # @return the MatchData, or nil if +pattern+ did not match
    def match_to_consume( char, pattern )
      md = pattern.match(@buffer)
      # Only consume on success; assigning the post-match of a failed
      # match would replace the buffer with nil.
      @buffer = md.post_match if md
      md
    end

    # @param cons if true, the matched text and everything before it is
    #   removed from the buffer
    # @return the MatchData for +pattern+ against the buffer, or nil
    def match(pattern, cons=false)
      md = pattern.match(@buffer)
      @buffer = md.post_match if cons && md
      md
    end

    # @return true if the Source is exhausted
    def empty?
      @buffer == ""
    end

    # @return the index at which the current buffer occurs in the
    #   original string (nil if it cannot be found)
    def position
      @orig.index( @buffer )
    end

    # @return the current line in the source
    # NOTE(review): String#split without arguments splits on whitespace,
    # not on line breaks, and grep with a String matches by equality, so
    # this looks like it locates a whitespace-separated token rather than
    # a line -- confirm the intended behaviour before relying on it.
    def current_line
      lines = @orig.split
      res = lines.grep @buffer[0..30]
      res = res[-1] if res.kind_of? Array
      lines.index( res ) if res
    end

    private
    # Looks for a byte order mark at the start of the buffer, strips it,
    # and sets the encoding accordingly (UTF-8 when no mark is present).
    def detect_encoding
      buffer_encoding = @buffer.encoding
      detected_encoding = "UTF-8"
      begin
        # Inspect raw bytes; the original encoding tag is restored below.
        @buffer.force_encoding("ASCII-8BIT")
        # Compare byte values instead of "\x.." literals so that detection
        # does not depend on the encoding of this source file.
        head = @buffer[0, 3].unpack("C*")
        if head[0, 2] == [0xfe, 0xff]
          @buffer[0, 2] = ""
          detected_encoding = "UTF-16BE"
        elsif head[0, 2] == [0xff, 0xfe]
          @buffer[0, 2] = ""
          detected_encoding = "UTF-16LE"
        elsif head == [0xef, 0xbb, 0xbf]
          @buffer[0, 3] = ""
          detected_encoding = "UTF-8"
        end
      ensure
        @buffer.force_encoding(buffer_encoding)
      end
      self.encoding = detected_encoding
    end

    # Called after the encoding changed: converts the buffer to UTF-8, or
    # simply tags it as UTF-8 when that already is the source encoding.
    def encoding_updated
      if @encoding != 'UTF-8'
        @buffer = decode(@buffer)
        @to_utf = true
      else
        @to_utf = false
        @buffer.force_encoding ::Encoding::UTF_8
      end
    end
  end

  # A Source that wraps an IO.  See the Source class for method
  # documentation
  class IOSource < Source
    #attr_reader :block_size

    # block_size has been deprecated
    # @param arg the IO-like object to read from
    # @param block_size ignored
    # @param encoding if given, used instead of sniffing the first bytes
    def initialize(arg, block_size=500, encoding=nil)
      @er_source = @source = arg
      @to_utf = false
      @pending_buffer = nil

      if encoding
        super("", encoding)
      else
        # Read just enough bytes to recognise a byte order mark.
        super(@source.read(3) || "")
      end

      if !@to_utf &&
          @buffer.respond_to?(:force_encoding) &&
          @source.respond_to?(:external_encoding) &&
          @source.external_encoding != ::Encoding::UTF_8
        @force_utf8 = true
      else
        @force_utf8 = false
      end
    end

    def scan(pattern, cons=false)
      rv = super
      # You'll notice that this next section is very similar to the same
      # section in match(), but just a liiittle different.  This is
      # because it is a touch faster to do it this way with scan()
      # than the way match() does it; enough faster to warrant duplicating
      # some code
      if rv.size == 0
        until @buffer =~ pattern || @source.nil?
          begin
            @buffer << readline
          rescue
            # Any failure to read more data (typically EOFError) ends the
            # input, exactly as in match().  No Iconv constant is named
            # here: nothing in this file loads Iconv, so referring to it
            # would raise NameError as soon as readline failed.
            @source = nil
          end
        end
        rv = super
      end
      # Object#taint no longer exists on current Rubies.
      rv.taint if RUBY_VERSION < '2.7'
      rv
    end

    # Appends the next chunk of input to the buffer; once reading fails
    # the underlying source is dropped.
    def read
      begin
        @buffer << readline
      rescue StandardError
        @source = nil
      end
    end

    def consume( pattern )
      match( pattern, true )
    end

    # Matches +pattern+ against the buffer, reading more input until it
    # matches or the input is exhausted.
    def match( pattern, cons=false )
      rv = pattern.match(@buffer)
      @buffer = rv.post_match if cons && rv
      while !rv && @source
        begin
          @buffer << readline
          rv = pattern.match(@buffer)
          @buffer = rv.post_match if cons && rv
        rescue
          @source = nil
        end
      end
      # Object#taint no longer exists on current Rubies.
      rv.taint if RUBY_VERSION < '2.7'
      rv
    end

    def empty?
      super && ( @source.nil? || @source.eof? )
    end

    # @return the position reported by the wrapped IO, or 0 if it cannot
    #   be determined
    def position
      @er_source.pos rescue 0
    end

    # @return the current line in the source, as [pos, lineno, line]
    def current_line
      begin
        pos = @er_source.pos        # The byte position in the source
        lineno = @er_source.lineno  # The XML < position in the source
        @er_source.rewind
        line = 0                    # The \r\n position in the source
        begin
          while @er_source.pos < pos
            @er_source.readline
            line += 1
          end
        rescue
        end
      rescue IOError
        pos = -1
        line = -1
      end
      [pos, lineno, line]
    end

    private
    # Reads up to and including the next @line_break from the source,
    # prefixed with any bytes that were read while sniffing the encoding.
    # @return the chunk, converted to or tagged as UTF-8 as configured
    # @raise [EOFError] at end of input when nothing is pending
    def readline
      begin
        str = @source.readline(@line_break)
      rescue EOFError
        # readline raises at end of input; the bytes consumed during
        # encoding detection must still be handed out once.
        raise unless @pending_buffer
        str = nil
      end
      if @pending_buffer
        if str.nil?
          str = @pending_buffer
        else
          str = @pending_buffer + str
        end
        @pending_buffer = nil
      end
      return nil if str.nil?

      if @to_utf
        decode(str)
      else
        str.force_encoding(::Encoding::UTF_8) if @force_utf8
        str
      end
    end

    def encoding_updated
      case @encoding
      when "UTF-16BE", "UTF-16LE"
        @source.binmode
        @source.set_encoding(@encoding)
      end
      # Input is read in chunks ending at ">" in the source's encoding.
      @line_break = encode(">")
      # What has been buffered so far is still in the source encoding;
      # park it so that readline prepends it to the next chunk.
      @pending_buffer, @buffer = @buffer, ""
      @pending_buffer.force_encoding(@encoding)
      super
    end
  end
end