1# coding: US-ASCII
2require 'rexml/encoding'
3
4module REXML
5  # Generates Source-s.  USE THIS CLASS.
6  class SourceFactory
7    # Generates a Source object
8    # @param arg Either a String, or an IO
9    # @return a Source, or nil if a bad argument was given
10    def SourceFactory::create_from(arg)
11      if arg.respond_to? :read and
12          arg.respond_to? :readline and
13          arg.respond_to? :nil? and
14          arg.respond_to? :eof?
15        IOSource.new(arg)
16      elsif arg.respond_to? :to_str
17        require 'stringio'
18        IOSource.new(StringIO.new(arg))
19      elsif arg.kind_of? Source
20        arg
21      else
22        raise "#{arg.class} is not a valid input stream.  It must walk \n"+
23          "like either a String, an IO, or a Source."
24      end
25    end
26  end
27
28  # A Source can be searched for patterns, and wraps buffers and other
29  # objects and provides consumption of text
30  class Source
31    include Encoding
32    # The current buffer (what we're going to read next)
33    attr_reader :buffer
34    # The line number of the last consumed text
35    attr_reader :line
36    attr_reader :encoding
37
38    # Constructor
39    # @param arg must be a String, and should be a valid XML document
40    # @param encoding if non-null, sets the encoding of the source to this
41    # value, overriding all encoding detection
42    def initialize(arg, encoding=nil)
43      @orig = @buffer = arg
44      if encoding
45        self.encoding = encoding
46      else
47        detect_encoding
48      end
49      @line = 0
50    end
51
52
53    # Inherited from Encoding
54    # Overridden to support optimized en/decoding
55    def encoding=(enc)
56      return unless super
57      encoding_updated
58    end
59
60    # Scans the source for a given pattern.  Note, that this is not your
61    # usual scan() method.  For one thing, the pattern argument has some
62    # requirements; for another, the source can be consumed.  You can easily
63    # confuse this method.  Originally, the patterns were easier
64    # to construct and this method more robust, because this method
65    # generated search regexes on the fly; however, this was
66    # computationally expensive and slowed down the entire REXML package
67    # considerably, since this is by far the most commonly called method.
68    # @param pattern must be a Regexp, and must be in the form of
69    # /^\s*(#{your pattern, with no groups})(.*)/.  The first group
70    # will be returned; the second group is used if the consume flag is
71    # set.
72    # @param consume if true, the pattern returned will be consumed, leaving
73    # everything after it in the Source.
74    # @return the pattern, if found, or nil if the Source is empty or the
75    # pattern is not found.
76    def scan(pattern, cons=false)
77      return nil if @buffer.nil?
78      rv = @buffer.scan(pattern)
79      @buffer = $' if cons and rv.size>0
80      rv
81    end
82
83    def read
84    end
85
86    def consume( pattern )
87      @buffer = $' if pattern.match( @buffer )
88    end
89
90    def match_to( char, pattern )
91      return pattern.match(@buffer)
92    end
93
94    def match_to_consume( char, pattern )
95      md = pattern.match(@buffer)
96      @buffer = $'
97      return md
98    end
99
100    def match(pattern, cons=false)
101      md = pattern.match(@buffer)
102      @buffer = $' if cons and md
103      return md
104    end
105
106    # @return true if the Source is exhausted
107    def empty?
108      @buffer == ""
109    end
110
111    def position
112      @orig.index( @buffer )
113    end
114
115    # @return the current line in the source
116    def current_line
117      lines = @orig.split
118      res = lines.grep @buffer[0..30]
119      res = res[-1] if res.kind_of? Array
120      lines.index( res ) if res
121    end
122
123    private
124    def detect_encoding
125      buffer_encoding = @buffer.encoding
126      detected_encoding = "UTF-8"
127      begin
128        @buffer.force_encoding("ASCII-8BIT")
129        if @buffer[0, 2] == "\xfe\xff"
130          @buffer[0, 2] = ""
131          detected_encoding = "UTF-16BE"
132        elsif @buffer[0, 2] == "\xff\xfe"
133          @buffer[0, 2] = ""
134          detected_encoding = "UTF-16LE"
135        elsif @buffer[0, 3] == "\xef\xbb\xbf"
136          @buffer[0, 3] = ""
137          detected_encoding = "UTF-8"
138        end
139      ensure
140        @buffer.force_encoding(buffer_encoding)
141      end
142      self.encoding = detected_encoding
143    end
144
145    def encoding_updated
146      if @encoding != 'UTF-8'
147        @buffer = decode(@buffer)
148        @to_utf = true
149      else
150        @to_utf = false
151        @buffer.force_encoding ::Encoding::UTF_8
152      end
153    end
154  end
155
156  # A Source that wraps an IO.  See the Source class for method
157  # documentation
158  class IOSource < Source
159    #attr_reader :block_size
160
161    # block_size has been deprecated
162    def initialize(arg, block_size=500, encoding=nil)
163      @er_source = @source = arg
164      @to_utf = false
165      @pending_buffer = nil
166
167      if encoding
168        super("", encoding)
169      else
170        super(@source.read(3) || "")
171      end
172
173      if !@to_utf and
174          @buffer.respond_to?(:force_encoding) and
175          @source.respond_to?(:external_encoding) and
176          @source.external_encoding != ::Encoding::UTF_8
177        @force_utf8 = true
178      else
179        @force_utf8 = false
180      end
181    end
182
183    def scan(pattern, cons=false)
184      rv = super
185      # You'll notice that this next section is very similar to the same
186      # section in match(), but just a liiittle different.  This is
187      # because it is a touch faster to do it this way with scan()
188      # than the way match() does it; enough faster to warrent duplicating
189      # some code
190      if rv.size == 0
191        until @buffer =~ pattern or @source.nil?
192          begin
193            @buffer << readline
194          rescue Iconv::IllegalSequence
195            raise
196          rescue
197            @source = nil
198          end
199        end
200        rv = super
201      end
202      rv.taint
203      rv
204    end
205
206    def read
207      begin
208        @buffer << readline
209      rescue Exception, NameError
210        @source = nil
211      end
212    end
213
214    def consume( pattern )
215      match( pattern, true )
216    end
217
218    def match( pattern, cons=false )
219      rv = pattern.match(@buffer)
220      @buffer = $' if cons and rv
221      while !rv and @source
222        begin
223          @buffer << readline
224          rv = pattern.match(@buffer)
225          @buffer = $' if cons and rv
226        rescue
227          @source = nil
228        end
229      end
230      rv.taint
231      rv
232    end
233
234    def empty?
235      super and ( @source.nil? || @source.eof? )
236    end
237
238    def position
239      @er_source.pos rescue 0
240    end
241
242    # @return the current line in the source
243    def current_line
244      begin
245        pos = @er_source.pos        # The byte position in the source
246        lineno = @er_source.lineno  # The XML < position in the source
247        @er_source.rewind
248        line = 0                    # The \r\n position in the source
249        begin
250          while @er_source.pos < pos
251            @er_source.readline
252            line += 1
253          end
254        rescue
255        end
256      rescue IOError
257        pos = -1
258        line = -1
259      end
260      [pos, lineno, line]
261    end
262
263    private
264    def readline
265      str = @source.readline(@line_break)
266      if @pending_buffer
267        if str.nil?
268          str = @pending_buffer
269        else
270          str = @pending_buffer + str
271        end
272        @pending_buffer = nil
273      end
274      return nil if str.nil?
275
276      if @to_utf
277        decode(str)
278      else
279        str.force_encoding(::Encoding::UTF_8) if @force_utf8
280        str
281      end
282    end
283
284    def encoding_updated
285      case @encoding
286      when "UTF-16BE", "UTF-16LE"
287        @source.binmode
288        @source.set_encoding(@encoding)
289      end
290      @line_break = encode(">")
291      @pending_buffer, @buffer = @buffer, ""
292      @pending_buffer.force_encoding(@encoding)
293      super
294    end
295  end
296end
297