1require 'rexml/rexml'
2require 'rexml/entity'
3require 'rexml/doctype'
4require 'rexml/child'
5require 'rexml/doctype'
6require 'rexml/parseexception'
7
8module REXML
9  # Represents text nodes in an XML document
10  class Text < Child
11    include Comparable
12    # The order in which the substitutions occur
13    SPECIALS = [ /&(?!#?[\w-]+;)/u, /</u, />/u, /"/u, /'/u, /\r/u ]
14    SUBSTITUTES = ['&amp;', '&lt;', '&gt;', '&quot;', '&apos;', '&#13;']
15    # Characters which are substituted in written strings
16    SLAICEPS = [ '<', '>', '"', "'", '&' ]
17    SETUTITSBUS = [ /&lt;/u, /&gt;/u, /&quot;/u, /&apos;/u, /&amp;/u ]
18
    # If +raw+ is true, the text is taken to be already escaped and REXML
    # leaves it alone: #to_s returns the stored string unchanged.
20    attr_accessor :raw
21
    # Default +illegal+ pattern used by the constructor and by #parent= when
    # validating raw strings.  It matches a '<', or an '&' together with the
    # named or numeric reference (through its closing ';') that follows it,
    # when there is one.  Text.check rejects any match that does not end in ';'.
    NEEDS_A_SECOND_CHECK = /(<|&((#{Entity::NAME});|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));)?)/um
    # Matches a numeric character reference such as "&#65;" or "&#x41;".  The
    # capture holds the digits (or 'x' plus hex digits) after any leading zeros.
    NUMERICENTITY = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/
24    VALID_CHAR = [
25      0x9, 0xA, 0xD,
26      (0x20..0xD7FF),
27      (0xE000..0xFFFD),
28      (0x10000..0x10FFFF)
29    ]
30
31    if String.method_defined? :encode
32      VALID_XML_CHARS = Regexp.new('^['+
33        VALID_CHAR.map { |item|
34          case item
35          when Fixnum
36            [item].pack('U').force_encoding('utf-8')
37          when Range
38            [item.first, '-'.ord, item.last].pack('UUU').force_encoding('utf-8')
39          end
40        }.join +
41      ']*$')
42    else
43      VALID_XML_CHARS = /^(
44           [\x09\x0A\x0D\x20-\x7E]            # ASCII
45         | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
46         |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
47         | [\xE1-\xEC\xEE][\x80-\xBF]{2}      # straight 3-byte
48         |  \xEF[\x80-\xBE]{2}                #
49         |  \xEF\xBF[\x80-\xBD]               # excluding U+fffe and U+ffff
50         |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
51         |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
52         | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
53         |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
54       )*$/nx;
55    end
56
57    # Constructor
58    # +arg+ if a String, the content is set to the String.  If a Text,
59    # the object is shallowly cloned.
60    #
61    # +respect_whitespace+ (boolean, false) if true, whitespace is
62    # respected
63    #
64    # +parent+ (nil) if this is a Parent object, the parent
65    # will be set to this.
66    #
67    # +raw+ (nil) This argument can be given three values.
    # If true, then the value used to construct this object is expected to
69    # contain no unescaped XML markup, and REXML will not change the text. If
70    # this value is false, the string may contain any characters, and REXML will
71    # escape any and all defined entities whose values are contained in the
72    # text.  If this value is nil (the default), then the raw value of the
73    # parent will be used as the raw value for this node.  If there is no raw
74    # value for the parent, and no value is supplied, the default is false.
75    # Use this field if you have entities defined for some text, and you don't
76    # want REXML to escape that text in output.
77    #   Text.new( "<&", false, nil, false ) #-> "&lt;&amp;"
78    #   Text.new( "&lt;&amp;", false, nil, false ) #-> "&amp;lt;&amp;amp;"
79    #   Text.new( "<&", false, nil, true )  #-> Parse exception
80    #   Text.new( "&lt;&amp;", false, nil, true )  #-> "&lt;&amp;"
81    #   # Assume that the entity "s" is defined to be "sean"
82    #   # and that the entity    "r" is defined to be "russell"
83    #   Text.new( "sean russell" )          #-> "&s; &r;"
84    #   Text.new( "sean russell", false, nil, true ) #-> "sean russell"
85    #
86    # +entity_filter+ (nil) This can be an array of entities to match in the
87    # supplied text.  This argument is only useful if +raw+ is set to false.
88    #   Text.new( "sean russell", false, nil, false, ["s"] ) #-> "&s; russell"
89    #   Text.new( "sean russell", false, nil, true, ["s"] ) #-> "sean russell"
90    # In the last example, the +entity_filter+ argument is ignored.
91    #
92    # +illegal+ INTERNAL USE ONLY
93    def initialize(arg, respect_whitespace=false, parent=nil, raw=nil,
94      entity_filter=nil, illegal=NEEDS_A_SECOND_CHECK )
95
96      @raw = false
97      @parent = nil
98
99      if parent
100        super( parent )
101        @raw = parent.raw
102      end
103
104      @raw = raw unless raw.nil?
105      @entity_filter = entity_filter
106      @normalized = @unnormalized = nil
107
108      if arg.kind_of? String
109        @string = arg.dup
110        @string.squeeze!(" \n\t") unless respect_whitespace
111      elsif arg.kind_of? Text
112        @string = arg.to_s
113        @raw = arg.raw
114      elsif
115        raise "Illegal argument of type #{arg.type} for Text constructor (#{arg})"
116      end
117
118      @string.gsub!( /\r\n?/, "\n" )
119
120      Text.check(@string, illegal, doctype) if @raw
121    end
122
123    def parent= parent
124      super(parent)
125      Text.check(@string, NEEDS_A_SECOND_CHECK, doctype) if @raw and @parent
126    end
127
128    # check for illegal characters
129    def Text.check string, pattern, doctype
130
131      # illegal anywhere
132      if string !~ VALID_XML_CHARS
133        if String.method_defined? :encode
134          string.chars.each do |c|
135            case c.ord
136            when *VALID_CHAR
137            else
138              raise "Illegal character #{c.inspect} in raw string \"#{string}\""
139            end
140          end
141        else
142          string.scan(/[\x00-\x7F]|[\x80-\xBF][\xC0-\xF0]*|[\xC0-\xF0]/n) do |c|
143            case c.unpack('U')
144            when *VALID_CHAR
145            else
146              raise "Illegal character #{c.inspect} in raw string \"#{string}\""
147            end
148          end
149        end
150      end
151
152      # context sensitive
153      string.scan(pattern) do
154        if $1[-1] != ?;
155          raise "Illegal character '#{$1}' in raw string \"#{string}\""
156        elsif $1[0] == ?&
157          if $5 and $5[0] == ?#
158            case ($5[1] == ?x ? $5[2..-1].to_i(16) : $5[1..-1].to_i)
159            when *VALID_CHAR
160            else
161              raise "Illegal character '#{$1}' in raw string \"#{string}\""
162            end
163          # FIXME: below can't work but this needs API change.
164          # elsif @parent and $3 and !SUBSTITUTES.include?($1)
165          #   if !doctype or !doctype.entities.has_key?($3)
166          #     raise "Undeclared entity '#{$1}' in raw string \"#{string}\""
167          #   end
168          end
169        end
170      end
171    end
172
173    def node_type
174      :text
175    end
176
177    def empty?
178      @string.size==0
179    end
180
181
182    def clone
183      return Text.new(self)
184    end
185
186
187    # Appends text to this text node.  The text is appended in the +raw+ mode
188    # of this text node.
189    def <<( to_append )
190      @string << to_append.gsub( /\r\n?/, "\n" )
191    end
192
193
194    # +other+ a String or a Text
195    # +returns+ the result of (to_s <=> arg.to_s)
196    def <=>( other )
197      to_s() <=> other.to_s
198    end
199
200    def doctype
201      if @parent
202        doc = @parent.document
203        doc.doctype if doc
204      end
205    end
206
    # Pattern for a single reference, built from Entity::REFERENCE (defined
    # outside this file).  Text.unnormalize passes each match to Text.expand.
    REFERENCE = /#{Entity::REFERENCE}/
208    # Returns the string value of this text node.  This string is always
209    # escaped, meaning that it is a valid XML text node string, and all
210    # entities that can be escaped, have been inserted.  This method respects
211    # the entity filter set in the constructor.
212    #
213    #   # Assume that the entity "s" is defined to be "sean", and that the
214    #   # entity "r" is defined to be "russell"
215    #   t = Text.new( "< & sean russell", false, nil, false, ['s'] )
216    #   t.to_s   #-> "&lt; &amp; &s; russell"
217    #   t = Text.new( "< & &s; russell", false, nil, false )
218    #   t.to_s   #-> "&lt; &amp; &s; russell"
219    #   u = Text.new( "sean russell", false, nil, true )
220    #   u.to_s   #-> "sean russell"
221    def to_s
222      return @string if @raw
223      return @normalized if @normalized
224
225      @normalized = Text::normalize( @string, doctype, @entity_filter )
226    end
227
228    def inspect
229      @string.inspect
230    end
231
232    # Returns the string value of this text.  This is the text without
233    # entities, as it might be used programmatically, or printed to the
234    # console.  This ignores the 'raw' attribute setting, and any
235    # entity_filter.
236    #
237    #   # Assume that the entity "s" is defined to be "sean", and that the
238    #   # entity "r" is defined to be "russell"
239    #   t = Text.new( "< & sean russell", false, nil, false, ['s'] )
240    #   t.value   #-> "< & sean russell"
241    #   t = Text.new( "< & &s; russell", false, nil, false )
242    #   t.value   #-> "< & sean russell"
243    #   u = Text.new( "sean russell", false, nil, true )
244    #   u.value   #-> "sean russell"
245    def value
246      return @unnormalized if @unnormalized
247      @unnormalized = Text::unnormalize( @string, doctype )
248    end
249
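    # Sets the contents of this text node.  This expects the text to be
    # unnormalized.  Assigning a value also clears the +raw+ flag; as with any
    # Ruby assignment, the expression evaluates to the assigned value.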
252    #
253    #   e = Element.new( "a" )
254    #   e.add_text( "foo" )   # <a>foo</a>
255    #   e[0].value = "bar"    # <a>bar</a>
256    #   e[0].value = "<a>"    # <a>&lt;a&gt;</a>
257    def value=( val )
258      @string = val.gsub( /\r\n?/, "\n" )
259      @unnormalized = nil
260      @normalized = nil
261      @raw = false
262    end
263
264     def wrap(string, width, addnewline=false)
265       # Recursively wrap string at width.
266       return string if string.length <= width
267       place = string.rindex(' ', width) # Position in string with last ' ' before cutoff
268       if addnewline then
269         return "\n" + string[0,place] + "\n" + wrap(string[place+1..-1], width)
270       else
271         return string[0,place] + "\n" + wrap(string[place+1..-1], width)
272       end
273     end
274
275    def indent_text(string, level=1, style="\t", indentfirstline=true)
276      return string if level < 0
277      new_string = ''
278      string.each_line { |line|
279        indent_string = style * level
280        new_line = (indent_string + line).sub(/[\s]+$/,'')
281        new_string << new_line
282      }
283      new_string.strip! unless indentfirstline
284      return new_string
285    end
286
287    # == DEPRECATED
288    # See REXML::Formatters
289    #
290    def write( writer, indent=-1, transitive=false, ie_hack=false )
291      Kernel.warn("#{self.class.name}.write is deprecated.  See REXML::Formatters")
292      formatter = if indent > -1
293          REXML::Formatters::Pretty.new( indent )
294        else
295          REXML::Formatters::Default.new
296        end
297      formatter.write( self, writer )
298    end
299
300    # FIXME
301    # This probably won't work properly
302    def xpath
303      path = @parent.xpath
304      path += "/text()"
305      return path
306    end
307
308    # Writes out text, substituting special characters beforehand.
309    # +out+ A String, IO, or any other object supporting <<( String )
    # +input+ the text to substitute and then write out
311    #
312    #   z=utf8.unpack("U*")
313    #   ascOut=""
314    #   z.each{|r|
315    #     if r <  0x100
316    #       ascOut.concat(r.chr)
317    #     else
318    #       ascOut.concat(sprintf("&#x%x;", r))
319    #     end
320    #   }
321    #   puts ascOut
322    def write_with_substitution out, input
323      copy = input.clone
324      # Doing it like this rather than in a loop improves the speed
325      copy.gsub!( SPECIALS[0], SUBSTITUTES[0] )
326      copy.gsub!( SPECIALS[1], SUBSTITUTES[1] )
327      copy.gsub!( SPECIALS[2], SUBSTITUTES[2] )
328      copy.gsub!( SPECIALS[3], SUBSTITUTES[3] )
329      copy.gsub!( SPECIALS[4], SUBSTITUTES[4] )
330      copy.gsub!( SPECIALS[5], SUBSTITUTES[5] )
331      out << copy
332    end
333
334    # Reads text, substituting entities
335    def Text::read_with_substitution( input, illegal=nil )
336      copy = input.clone
337
338      if copy =~ illegal
339        raise ParseException.new( "malformed text: Illegal character #$& in \"#{copy}\"" )
340      end if illegal
341
342      copy.gsub!( /\r\n?/, "\n" )
343      if copy.include? ?&
344        copy.gsub!( SETUTITSBUS[0], SLAICEPS[0] )
345        copy.gsub!( SETUTITSBUS[1], SLAICEPS[1] )
346        copy.gsub!( SETUTITSBUS[2], SLAICEPS[2] )
347        copy.gsub!( SETUTITSBUS[3], SLAICEPS[3] )
348        copy.gsub!( SETUTITSBUS[4], SLAICEPS[4] )
349        copy.gsub!( /&#0*((?:\d+)|(?:x[a-f0-9]+));/ ) {
350          m=$1
351          #m='0' if m==''
352          m = "0#{m}" if m[0] == ?x
353          [Integer(m)].pack('U*')
354        }
355      end
356      copy
357    end
358
    # An '&' that is not immediately followed by a name and ';'.  The only use
    # visible in this file is the commented-out gsub in Text.normalize.
    EREFERENCE = /&(?!#{Entity::NAME};)/
360    # Escapes all possible entities
361    def Text::normalize( input, doctype=nil, entity_filter=nil )
362      copy = input.to_s
363      # Doing it like this rather than in a loop improves the speed
364      #copy = copy.gsub( EREFERENCE, '&amp;' )
365      copy = copy.gsub( "&", "&amp;" )
366      if doctype
367        # Replace all ampersands that aren't part of an entity
368        doctype.entities.each_value do |entity|
369          copy = copy.gsub( entity.value,
370            "&#{entity.name};" ) if entity.value and
371              not( entity_filter and entity_filter.include?(entity) )
372        end
373      else
374        # Replace all ampersands that aren't part of an entity
375        DocType::DEFAULT_ENTITIES.each_value do |entity|
376          copy = copy.gsub(entity.value, "&#{entity.name};" )
377        end
378      end
379      copy
380    end
381
382    # Unescapes all possible entities
383    def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil )
384      sum = 0
385      string.gsub( /\r\n?/, "\n" ).gsub( REFERENCE ) {
386        s = Text.expand($&, doctype, filter)
387        if sum + s.bytesize > REXML.entity_expansion_text_limit
388          raise "entity expansion has grown too large"
389        else
390          sum += s.bytesize
391        end
392        s
393      }
394    end
395
396    def Text.expand(ref, doctype, filter)
397      if ref[1] == ?#
398        if ref[2] == ?x
399          [ref[3...-1].to_i(16)].pack('U*')
400        else
401          [ref[2...-1].to_i].pack('U*')
402        end
403      elsif ref == '&amp;'
404        '&'
405      elsif filter and filter.include?( ref[1...-1] )
406        ref
407      elsif doctype
408        doctype.entity( ref[1...-1] ) or ref
409      else
410        entity_value = DocType::DEFAULT_ENTITIES[ ref[1...-1] ]
411        entity_value ? entity_value.value : ref
412      end
413    end
414  end
415end
416