require 'rexml/rexml'
require 'rexml/entity'
require 'rexml/doctype'
require 'rexml/child'
require 'rexml/doctype'
require 'rexml/parseexception'

module REXML
  # Represents text nodes in an XML document
  class Text < Child
    include Comparable

    # Patterns for the characters that must be escaped when text is written.
    # SPECIALS[i] is replaced by SUBSTITUTES[i]; the order matters because the
    # ampersand has to be handled before any replacement introduces new ones.
    SPECIALS = [ /&(?!#?[\w-]+;)/u, /</u, />/u, /"/u, /'/u, /\r/u ]
    SUBSTITUTES = ['&amp;', '&lt;', '&gt;', '&quot;', '&apos;', '&#13;']

    # The reverse mapping, used when reading: SETUTITSBUS[i] is replaced by
    # SLAICEPS[i].  '&amp;' is last so that a freshly produced '&' cannot be
    # mistaken for the start of one of the other named references.
    SLAICEPS = [ '<', '>', '"', "'", '&' ]
    SETUTITSBUS = [ /&lt;/u, /&gt;/u, /&quot;/u, /&apos;/u, /&amp;/u ]

    # If +raw+ is true, then REXML leaves the value alone
    attr_accessor :raw

    # Matches a '<', or an '&' optionally followed by a complete named or
    # numeric reference.  Text.check relies on the capture group numbering of
    # this pattern ($1 = whole match, $5 = numeric part including the '#').
    NEEDS_A_SECOND_CHECK = /(<|&((#{Entity::NAME});|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));)?)/um
    # Matches a numeric character reference; $1 is the number without
    # leading zeros (prefixed by 'x' when hexadecimal).
    NUMERICENTITY = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/

    # Code points (single values and ranges) accepted by Text.check.
    VALID_CHAR = [
      0x9, 0xA, 0xD,
      (0x20..0xD7FF),
      (0xE000..0xFFFD),
      (0x10000..0x10FFFF)
    ]

    # Fast-path pattern: matches only if the WHOLE string consists of valid
    # characters.  \A and \z are used instead of ^ and $ because the latter
    # match per line, which would let an illegal character on a second line
    # slip through.
    if String.method_defined? :encode
      VALID_XML_CHARS = Regexp.new('\A['+
        VALID_CHAR.map { |item|
          case item
          when Integer   # not Fixnum: that constant no longer exists on current Rubies
            [item].pack('U').force_encoding('utf-8')
          when Range
            [item.first, '-'.ord, item.last].pack('UUU').force_encoding('utf-8')
          end
        }.join +
      ']*\z')
    else
      # Byte-oriented equivalent for Rubies whose String has no #encode.
      VALID_XML_CHARS = /\A(
           [\x09\x0A\x0D\x20-\x7E]            # ASCII
         | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
         |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
         | [\xE1-\xEC\xEE][\x80-\xBF]{2}      # straight 3-byte
         |  \xEF[\x80-\xBE]{2}                #
         |  \xEF\xBF[\x80-\xBD]               # excluding U+fffe and U+ffff
         |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
         |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
         | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
         |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
       )*\z/nx;
    end

    # Constructor
    # +arg+ if a String, the content is set to (a copy of) the String.  If a
    # Text, its underlying string and its +raw+ flag are copied.  Anything
    # else raises a RuntimeError.
    #
    # +respect_whitespace+ (boolean, false) if true, whitespace is
    # respected; otherwise runs of the same whitespace character (space,
    # newline, tab) in a String argument are squeezed to one.
    #
    # +parent+ (nil) if given, it is passed to Child#initialize and its +raw+
    # value becomes the default for this node.
    #
    # +raw+ (nil) This argument can be given three values.
    # If true, then the value used to construct this object is expected to
    # contain no unescaped XML markup, and REXML will not change the text; the
    # string is validated with Text.check and an exception is raised if it
    # contains illegal content.  If this value is false, the string may
    # contain any characters, and REXML will escape them on output.  If this
    # value is nil (the default), then the raw value of the parent will be
    # used as the raw value for this node.  If there is no parent, and no
    # value is supplied, the default is false.
    #   Text.new( "<&", false, nil, false ).to_s         #-> "&lt;&amp;"
    #   Text.new( "&lt;&amp;", false, nil, false ).to_s  #-> "&amp;lt;&amp;amp;"
    #   Text.new( "<&", false, nil, true )               #-> raises (illegal character)
    #   Text.new( "&lt;&amp;", false, nil, true ).to_s   #-> "&lt;&amp;"
    #
    # +entity_filter+ (nil) An array of entity names.  It is handed to
    # Text.normalize by #to_s, which leaves the listed entities alone.  It has
    # no effect when +raw+ is true, because #to_s then returns the string
    # untouched.
    #
    # +illegal+ INTERNAL USE ONLY (the pattern handed to Text.check)
    def initialize(arg, respect_whitespace=false, parent=nil, raw=nil,
      entity_filter=nil, illegal=NEEDS_A_SECOND_CHECK )

      @raw = false
      @parent = nil

      if parent
        super( parent )
        @raw = parent.raw
      end

      @raw = raw unless raw.nil?
      @entity_filter = entity_filter
      @normalized = @unnormalized = nil

      if arg.kind_of? String
        @string = arg.dup
        @string.squeeze!(" \n\t") unless respect_whitespace
      elsif arg.kind_of? Text
        # Copy the underlying string, not arg.to_s: to_s is already escaped
        # for non-raw nodes, and escaping it again in the copy would turn
        # "<" into "&amp;lt;".
        @string = arg.instance_variable_get(:@string).dup
        @raw = arg.raw
      else
        raise "Illegal argument of type #{arg.class} for Text constructor (#{arg})"
      end

      # Normalize line endings: CRLF and lone CR both become LF.
      @string.gsub!( /\r\n?/, "\n" )

      Text.check(@string, illegal, doctype) if @raw
    end

    # Sets the parent and, for raw text that now has a parent, re-validates
    # the string against the parent's document type.
    def parent= parent
      super(parent)
      Text.check(@string, NEEDS_A_SECOND_CHECK, doctype) if @raw and @parent
    end

    # Check +string+ for illegal content and raise a RuntimeError describing
    # the first problem found.
    #
    # +string+  the text to validate
    # +pattern+ a Regexp with the group layout of NEEDS_A_SECOND_CHECK
    # +doctype+ currently unused (see the FIXME below)
    def Text.check string, pattern, doctype

      # illegal anywhere
      if string !~ VALID_XML_CHARS
        # Slow path, only taken to find the offending character for the
        # error message.
        if String.method_defined? :encode
          string.chars.each do |c|
            case c.ord
            when *VALID_CHAR
            else
              raise "Illegal character #{c.inspect} in raw string \"#{string}\""
            end
          end
        else
          string.scan(/[\x00-\x7F]|[\x80-\xBF][\xC0-\xF0]*|[\xC0-\xF0]/n) do |c|
            # unpack returns an Array; compare the code point itself.
            case c.unpack('U').first
            when *VALID_CHAR
            else
              raise "Illegal character #{c.inspect} in raw string \"#{string}\""
            end
          end
        end
      end

      # context sensitive
      string.scan(pattern) do
        if $1[-1] != ?;
          # A bare '<', or an '&' that does not start a complete reference.
          raise "Illegal character '#{$1}' in raw string \"#{string}\""
        elsif $1[0] == ?&
          if $5 and $5[0] == ?#
            # Numeric reference: the referenced code point must be valid too.
            case ($5[1] == ?x ? $5[2..-1].to_i(16) : $5[1..-1].to_i)
            when *VALID_CHAR
            else
              raise "Illegal character '#{$1}' in raw string \"#{string}\""
            end
          # FIXME: below can't work but this needs API change.
          # elsif @parent and $3 and !SUBSTITUTES.include?($1)
          #   if !doctype or !doctype.entities.has_key?($3)
          #     raise "Undeclared entity '#{$1}' in raw string \"#{string}\""
          #   end
          end
        end
      end
    end

    # Returns :text
    def node_type
      :text
    end

    # Returns true if the underlying string has no characters.
    def empty?
      @string.size==0
    end


    # Returns a new Text built from this one.
    def clone
      return Text.new(self)
    end


    # Appends text to this text node.  The text is appended in the +raw+ mode
    # of this text node; no validation is performed.  Line endings in
    # +to_append+ are normalized to "\n".  Returns the underlying string.
    def <<( to_append )
      @string << to_append.gsub( /\r\n?/, "\n" )
      # The cached results of #to_s and #value no longer match the content.
      @normalized = @unnormalized = nil
      @string
    end


    # +other+ a String or a Text
    # +returns+ the result of (to_s <=> arg.to_s)
    def <=>( other )
      to_s() <=> other.to_s
    end

    # Returns the doctype of the parent's document, or nil if this node has
    # no parent or the parent has no document.
    def doctype
      if @parent
        doc = @parent.document
        doc.doctype if doc
      end
    end

    REFERENCE = /#{Entity::REFERENCE}/
    # Returns the string value of this text node.  For raw nodes this is the
    # stored string, unchanged.  Otherwise the string is escaped with
    # Text.normalize (using this node's doctype and entity filter) and the
    # result is cached.
    #
    #   t = Text.new( "< & x", false, nil, false )
    #   t.to_s   #-> "&lt; &amp; x"
    #   u = Text.new( "&lt; x", false, nil, true )
    #   u.to_s   #-> "&lt; x"
    def to_s
      return @string if @raw
      return @normalized if @normalized

      @normalized = Text::normalize( @string, doctype, @entity_filter )
    end

    # Returns the inspect form of the underlying (unescaped) string.
    def inspect
      @string.inspect
    end

    # Returns the string value of this text.  This is the text without
    # entities, as it might be used programmatically, or printed to the
    # console.  This ignores the 'raw' attribute setting, and any
    # entity_filter.  The result is cached.
    #
    #   t = Text.new( "a &lt; b", false, nil, true )
    #   t.value   #-> "a < b"
    def value
      return @unnormalized if @unnormalized
      @unnormalized = Text::unnormalize( @string, doctype )
    end

    # Sets the contents of this text node.  This expects the text to be
    # unnormalized.  Line endings are normalized to "\n", cached values are
    # dropped, and the node is switched to non-raw mode.
    #
    #   e = Element.new( "a" )
    #   e.add_text( "foo" )   # <a>foo</a>
    #   e[0].value = "bar"    # <a>bar</a>
    #   e[0].value = "<a>"    # <a>&lt;a&gt;</a>
    def value=( val )
      @string = val.gsub( /\r\n?/, "\n" )
      @unnormalized = nil
      @normalized = nil
      @raw = false
    end

    # Recursively wraps +string+ by replacing spaces with newlines so that
    # lines are at most about +width+ characters long.  A word with no space
    # before the cutoff is kept whole and the break happens at the next
    # space; if there is no space at all the string is returned unchanged.
    #
    # +addnewline+ if true, a newline is prepended to the wrapped result
    # (only when wrapping actually takes place).
    def wrap(string, width, addnewline=false)
      return string if string.length <= width
      # Last ' ' at or before the cutoff; otherwise the first one after it.
      place = string.rindex(' ', width) || string.index(' ', width)
      return string unless place
      wrapped = string[0,place] + "\n" + wrap(string[place+1..-1], width)
      addnewline ? "\n" + wrapped : wrapped
    end

    # Prefixes every line of +string+ with +style+ repeated +level+ times and
    # removes the trailing whitespace of each line (line terminator
    # included) before concatenating the lines.
    #
    # +indentfirstline+ if false, the result is stripped of leading and
    # trailing whitespace.
    #
    # Returns +string+ itself when +level+ is negative.
    def indent_text(string, level=1, style="\t", indentfirstline=true)
      return string if level < 0
      indent_string = style * level
      new_string = ''
      string.each_line { |line|
        new_string << (indent_string + line).sub(/[\s]+$/,'')
      }
      new_string.strip! unless indentfirstline
      return new_string
    end

    # == DEPRECATED
    # See REXML::Formatters
    #
    # Writes this node to +writer+ with the Pretty formatter when +indent+
    # is greater than -1, and with the Default formatter otherwise.
    # +transitive+ and +ie_hack+ are accepted but not used.
    def write( writer, indent=-1, transitive=false, ie_hack=false )
      Kernel.warn("#{self.class.name}.write is deprecated.  See REXML::Formatters")
      formatter = if indent > -1
          REXML::Formatters::Pretty.new( indent )
        else
          REXML::Formatters::Default.new
        end
      formatter.write( self, writer )
    end

    # FIXME
    # This probably won't work properly
    #
    # Returns the parent's xpath with "/text()" appended.  Requires a parent.
    def xpath
      path = @parent.xpath
      path += "/text()"
      return path
    end

    # Writes out text, substituting special characters beforehand.
    # +out+ A String, IO, or any other object supporting <<( String )
    # +input+ the text to substitute and the write out
    #
    #   z=utf8.unpack("U*")
    #   ascOut=""
    #   z.each{|r|
    #     if r < 0x100
    #       ascOut.concat(r.chr)
    #     else
    #       ascOut.concat(sprintf("&#x%x;", r))
    #     end
    #   }
    #   puts ascOut
    def write_with_substitution out, input
      copy = input.clone
      # Doing it like this rather than in a loop improves the speed
      copy.gsub!( SPECIALS[0], SUBSTITUTES[0] )
      copy.gsub!( SPECIALS[1], SUBSTITUTES[1] )
      copy.gsub!( SPECIALS[2], SUBSTITUTES[2] )
      copy.gsub!( SPECIALS[3], SUBSTITUTES[3] )
      copy.gsub!( SPECIALS[4], SUBSTITUTES[4] )
      copy.gsub!( SPECIALS[5], SUBSTITUTES[5] )
      out << copy
    end

    # Reads text, substituting entities.  Returns a new string in which line
    # endings are normalized and the five predefined named references and
    # numeric character references are replaced by their characters.
    #
    # +illegal+ (nil) an optional pattern; if the input matches it a
    # ParseException is raised.
    #
    # NOTE(review): '&amp;' is replaced before numeric references are
    # decoded, so "&amp;#65;" ends up as "A" rather than "&#65;".
    def Text::read_with_substitution( input, illegal=nil )
      copy = input.clone

      if copy =~ illegal
        raise ParseException.new( "malformed text: Illegal character #$& in \"#{copy}\"" )
      end if illegal

      copy.gsub!( /\r\n?/, "\n" )
      if copy.include? ?&
        copy.gsub!( SETUTITSBUS[0], SLAICEPS[0] )
        copy.gsub!( SETUTITSBUS[1], SLAICEPS[1] )
        copy.gsub!( SETUTITSBUS[2], SLAICEPS[2] )
        copy.gsub!( SETUTITSBUS[3], SLAICEPS[3] )
        copy.gsub!( SETUTITSBUS[4], SLAICEPS[4] )
        copy.gsub!( /&#0*((?:\d+)|(?:x[a-f0-9]+));/ ) {
          m=$1
          #m='0' if m==''
          # Turn "x41" into "0x41" so that Integer() parses it as hex.
          m = "0#{m}" if m[0] == ?x
          [Integer(m)].pack('U*')
        }
      end
      copy
    end

    EREFERENCE = /&(?!#{Entity::NAME};)/
    # Escapes all possible entities.
    #
    # Every '&' in +input+ becomes '&amp;'.  Then every occurrence of an
    # entity's value is replaced by a reference to that entity: the entities
    # of +doctype+ when one is given, DocType::DEFAULT_ENTITIES otherwise.
    #
    # +entity_filter+ (nil) entities that must NOT be substituted.  Names are
    # matched, as in Text.expand; Entity objects are accepted as well.  Only
    # consulted when a +doctype+ is given.
    def Text::normalize( input, doctype=nil, entity_filter=nil )
      copy = input.to_s
      # Doing it like this rather than in a loop improves the speed
      #copy = copy.gsub( EREFERENCE, '&amp;' )
      copy = copy.gsub( "&", "&amp;" )
      if doctype
        # Replace the values of all declared entities with references
        doctype.entities.each_value do |entity|
          next unless entity.value
          next if entity_filter &&
                  (entity_filter.include?(entity.name) || entity_filter.include?(entity))
          copy = copy.gsub( entity.value, "&#{entity.name};" )
        end
      else
        # Replace the values of the predefined entities with references
        DocType::DEFAULT_ENTITIES.each_value do |entity|
          copy = copy.gsub(entity.value, "&#{entity.name};" )
        end
      end
      copy
    end

    # Unescapes all possible entities.  Line endings are normalized and every
    # reference is replaced by the result of Text.expand.  Raises a
    # RuntimeError when the expansions add up to more bytes than
    # REXML.entity_expansion_text_limit.  +illegal+ is accepted but not used.
    def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil )
      sum = 0
      string.gsub( /\r\n?/, "\n" ).gsub( REFERENCE ) {
        s = Text.expand($&, doctype, filter)
        if sum + s.bytesize > REXML.entity_expansion_text_limit
          raise "entity expansion has grown too large"
        else
          sum += s.bytesize
        end
        s
      }
    end

    # Expands a single reference (+ref+, including the leading '&' and the
    # trailing ';') to its replacement text.
    #
    # Numeric references are decoded, '&amp;' becomes '&', names listed in
    # +filter+ are returned unexpanded, and other names are looked up in
    # +doctype+ (or in DocType::DEFAULT_ENTITIES when +doctype+ is nil).
    # References that cannot be resolved are returned unchanged.
    def Text.expand(ref, doctype, filter)
      if ref[1] == ?#
        if ref[2] == ?x
          [ref[3...-1].to_i(16)].pack('U*')
        else
          [ref[2...-1].to_i].pack('U*')
        end
      elsif ref == '&amp;'
        '&'
      elsif filter and filter.include?( ref[1...-1] )
        ref
      elsif doctype
        doctype.entity( ref[1...-1] ) or ref
      else
        entity_value = DocType::DEFAULT_ENTITIES[ ref[1...-1] ]
        entity_value ? entity_value.value : ref
      end
    end
  end
end