require 'test/unit'

# Tests for String, Symbol and Regexp behaviour with the UTF-16BE and
# UTF-16LE encodings.
#
# Many tests call force_encoding (or <<) directly on string literals, which
# mutates them, so this file must not enable frozen string literals.
class TestUTF16 < Test::Unit::TestCase
  # Renders +obj+ as source-like text that includes its encoding, for use in
  # assertion messages.
  #
  # obj - a String or a Regexp.
  #
  # Returns a String.
  # Raises ArgumentError when +obj+ is neither a String nor a Regexp.
  def encdump(obj)
    case obj
    when String
      d = obj.dump
      # Keep the dump untouched when it already ends with a force_encoding
      # call, so the encoding is not appended a second time.
      if /\.force_encoding\("[A-Za-z0-9.:_+-]*"\)\z/ =~ d
        d
      else
        "#{d}.force_encoding(#{obj.encoding.name.dump})"
      end
    when Regexp
      "Regexp.new(#{encdump(obj.source)}, #{obj.options})"
    else
      raise ArgumentError, "unexpected: #{obj.inspect}"
    end
  end

  # Sends +meth+ to +recv+ with +args+ and asserts that nothing is raised.
  # The assertion message is a textual rendering of the call in which String
  # operands are shown through encdump and everything else through inspect.
  #
  # Returns the value returned by the call.
  def enccall(recv, meth, *args)
    desc = ''
    if String === recv
      desc << encdump(recv)
    else
      desc << recv.inspect
    end
    desc << '.' << meth.to_s
    unless args.empty?
      desc << '('
      args.each_with_index {|a, i|
        desc << ',' if 0 < i
        if String === a
          desc << encdump(a)
        else
          desc << a.inspect
        end
      }
      desc << ')'
    end
    result = nil
    assert_nothing_raised(desc) {
      result = recv.send(meth, *args)
    }
    result
  end

  # assert_equal for strings, with a failure message that shows both
  # operands through encdump so their encodings are visible.
  def assert_str_equal(expected, actual, message=nil)
    full_message = build_message(message, <<EOT)
#{encdump expected} expected but not equal to
#{encdump actual}.
EOT
    assert_equal(expected, actual, full_message)
  end

  # tests start

  # Byte sequences that must / must not be reported as valid UTF-16BE.
  def test_utf16be_valid_encoding
    [
      "\x00\x00",
      "\xd7\xff",
      "\xd8\x00\xdc\x00",
      "\xdb\xff\xdf\xff",
      "\xe0\x00",
      "\xff\xff",
    ].each {|s|
      s.force_encoding("utf-16be")
      assert_equal(true, s.valid_encoding?, "#{encdump s}.valid_encoding?")
    }
    [
      "\x00",
      "\xd7",
      "\xd8\x00",
      "\xd8\x00\xd8\x00",
      "\xdc\x00",
      "\xdc\x00\xd8\x00",
      "\xdc\x00\xdc\x00",
      "\xe0",
      "\xff",
    ].each {|s|
      s.force_encoding("utf-16be")
      assert_equal(false, s.valid_encoding?, "#{encdump s}.valid_encoding?")
    }
  end

  # Byte sequences that must / must not be reported as valid UTF-16LE.
  def test_utf16le_valid_encoding
    [
      "\x00\x00",
      "\xff\xd7",
      "\x00\xd8\x00\xdc",
      "\xff\xdb\xff\xdf",
      "\x00\xe0",
      "\xff\xff",
    ].each {|s|
      s.force_encoding("utf-16le")
      assert_equal(true, s.valid_encoding?, "#{encdump s}.valid_encoding?")
    }
    [
      "\x00",
      "\xd7",
      "\x00\xd8",
      "\x00\xd8\x00\xd8",
      "\x00\xdc",
      "\x00\xdc\x00\xd8",
      "\x00\xdc\x00\xdc",
      "\xe0",
      "\xff",
    ].each {|s|
      s.force_encoding("utf-16le")
      assert_equal(false, s.valid_encoding?, "#{encdump s}.valid_encoding?")
    }
  end

  # Time#strftime must reject a UTF-16BE format string.
  def test_strftime
    s = "aa".force_encoding("utf-16be")
    assert_raise(ArgumentError, "Time.now.strftime(#{encdump s})") { Time.now.strftime(s) }
  end

  # Interning and converting back must keep the string's encoding.
  def test_intern
    s = "aaaa".force_encoding("utf-16be")
    assert_equal(s.encoding, s.intern.to_s.encoding, "#{encdump s}.intern.to_s.encoding")
  end

  # A symbol made from UTF-16LE bytes "aa" must differ from :aa.
  def test_sym_eq
    s = "aa".force_encoding("utf-16le")
    assert(s.intern != :aa, "#{encdump s}.intern != :aa")
  end

  # A UTF-16BE string and a US-ASCII string must not be compatible.
  def test_compatible
    s1 = "aa".force_encoding("utf-16be")
    s2 = "z".force_encoding("us-ascii")
    assert_nil(Encoding.compatible?(s1, s2), "Encoding.compatible?(#{encdump s1}, #{encdump s2})")
  end

  # casecmp between a UTF-16BE string and a plain "AA" literal must not
  # return 0.
  def test_casecmp
    s1 = "aa".force_encoding("utf-16be")
    s2 = "AA"
    assert_not_equal(0, s1.casecmp(s2), "#{encdump s1}.casecmp(#{encdump s2})")
  end

  # The single byte "b" tagged as UTF-16BE must not count as a suffix of the
  # bytes "ab" tagged as UTF-16BE.
  def test_end_with
    s1 = "ab".force_encoding("utf-16be")
    s2 = "b".force_encoding("utf-16be")
    assert_equal(false, s1.end_with?(s2), "#{encdump s1}.end_with?(#{encdump s2})")
  end

  # String#hex must raise on UTF-16 strings.
  def test_hex
    assert_raise(Encoding::CompatibilityError) {
      "ff".encode("utf-16le").hex
    }
    assert_raise(Encoding::CompatibilityError) {
      "ff".encode("utf-16be").hex
    }
  end

  # String#oct must raise on UTF-16 strings.
  def test_oct
    assert_raise(Encoding::CompatibilityError) {
      "77".encode("utf-16le").oct
    }
    assert_raise(Encoding::CompatibilityError) {
      "77".encode("utf-16be").oct
    }
  end

  # String#count with an argument in a different encoding must raise.
  def test_count
    s1 = "aa".force_encoding("utf-16be")
    s2 = "aa"
    assert_raise(Encoding::CompatibilityError, "#{encdump s1}.count(#{encdump s2})") {
      s1.count(s2)
    }
  end

  # US-ASCII + UTF-16BE (both non-empty) must raise.
  def test_plus
    s1 = "a".force_encoding("us-ascii")
    s2 = "aa".force_encoding("utf-16be")
    assert_raise(Encoding::CompatibilityError, "#{encdump s1} + #{encdump s2}") {
      s1 + s2
    }
  end

  # Encoding.find must reject a name given as a UTF-16BE-tagged string.
  def test_encoding_find
    assert_raise(ArgumentError) {
      Encoding.find("utf-8".force_encoding("utf-16be"))
    }
  end

  # Interpolating a UTF-16BE string into a literal containing "a" must raise.
  def test_interpolation
    s = "aa".force_encoding("utf-16be")
    assert_raise(Encoding::CompatibilityError, "\"a\#{#{encdump s}}\"") {
      "a#{s}"
    }
  end

  # slice!(-1) on a UTF-16BE string must not raise.
  def test_slice!
    enccall("aa".force_encoding("UTF-16BE"), :slice!, -1)
  end

  # "" + UTF-16BE string must not raise.
  def test_plus_empty1
    s1 = ""
    s2 = "aa".force_encoding("utf-16be")
    assert_nothing_raised("#{encdump s1} + #{encdump s2}") {
      s1 + s2
    }
  end

  # "aa" + empty UTF-16BE string must not raise.
  def test_plus_empty2
    s1 = "aa"
    s2 = "".force_encoding("utf-16be")
    assert_nothing_raised("#{encdump s1} + #{encdump s2}") {
      s1 + s2
    }
  end

  # "aa" + non-empty UTF-16BE string must raise.
  def test_plus_nonempty
    s1 = "aa"
    s2 = "bb".force_encoding("utf-16be")
    assert_raise(Encoding::CompatibilityError, "#{encdump s1} + #{encdump s2}") {
      s1 + s2
    }
  end

  # "" << UTF-16BE string must not raise.
  def test_concat_empty1
    s1 = ""
    s2 = "aa".force_encoding("utf-16be")
    assert_nothing_raised("#{encdump s1} << #{encdump s2}") {
      s1 << s2
    }
  end

  # "aa" << empty UTF-16BE string must not raise.
  def test_concat_empty2
    s1 = "aa"
    s2 = "".force_encoding("utf-16be")
    assert_nothing_raised("#{encdump s1} << #{encdump s2}") {
      s1 << s2
    }
  end

  # "aa" << non-empty UTF-16BE string must raise.
  def test_concat_nonempty
    s1 = "aa"
    s2 = "bb".force_encoding("utf-16be")
    assert_raise(Encoding::CompatibilityError, "#{encdump s1} << #{encdump s2}") {
      s1 << s2
    }
  end

  # chomp on UTF-16BE: the bytes "\1\n" are left alone, while "\0\n" and
  # "\0\r\0\n" are removed entirely.
  def test_chomp
    s = "\1\n".force_encoding("utf-16be")
    assert_equal(s, s.chomp, "#{encdump s}.chomp")
    s = "\0\n".force_encoding("utf-16be")
    assert_equal("", s.chomp, "#{encdump s}.chomp")
    s = "\0\r\0\n".force_encoding("utf-16be")
    assert_equal("", s.chomp, "#{encdump s}.chomp")
  end

  # succ must yield a validly encoded string for these UTF-16BE inputs.
  def test_succ
    s = "\xff\xff".force_encoding("utf-16be")
    assert(s.succ.valid_encoding?, "#{encdump s}.succ.valid_encoding?")

    s = "\xdb\xff\xdf\xff".force_encoding("utf-16be")
    assert(s.succ.valid_encoding?, "#{encdump s}.succ.valid_encoding?")
  end

  # Regexp.union of two UTF-16BE strings must not raise.
  def test_regexp_union
    enccall(Regexp, :union, "aa".force_encoding("utf-16be"), "bb".force_encoding("utf-16be"))
  end

  # A Regexp built from an empty UTF-16BE string must be UTF-16BE.
  def test_empty_regexp
    s = "".force_encoding("utf-16be")
    assert_equal(Encoding.find("utf-16be"), Regexp.new(s).encoding,
                 "Regexp.new(#{encdump s}).encoding")
  end

  # Matching a UTF-16BE regexp against a plain "aa" literal must raise.
  def test_regexp_match
    assert_raise(Encoding::CompatibilityError) { Regexp.new("aa".force_encoding("utf-16be")) =~ "aa" }
  end

  # gsub with a UTF-16BE regexp and a plain "xy" replacement: the first
  # subject must not raise, the second must raise CompatibilityError.
  def test_gsub
    s = "abcd".force_encoding("utf-16be")
    assert_nothing_raised {
      s.gsub(Regexp.new(".".encode("utf-16be")), "xy")
    }
    s = "ab\0\ncd".force_encoding("utf-16be")
    assert_raise(Encoding::CompatibilityError) {
      s.gsub(Regexp.new(".".encode("utf-16be")), "xy")
    }
  end

  # Splitting on a single UTF-16BE space must drop the surrounding spaces
  # and yield exactly "ab" and "cd".
  def test_split_awk
    s = " ab cd ".encode("utf-16be")
    r = s.split(" ".encode("utf-16be"))
    assert_equal(2, r.length)
    assert_str_equal("ab".encode("utf-16be"), r[0])
    assert_str_equal("cd".encode("utf-16be"), r[1])
  end

  # count with a negated set must give the same result in UTF-16 as for the
  # unencoded literals.
  def test_count2
    e = "abc".count("^b")
    assert_equal(e, "abc".encode("utf-16be").count("^b".encode("utf-16be")))
    assert_equal(e, "abc".encode("utf-16le").count("^b".encode("utf-16le")))
  end

  # A magic comment naming a UTF-16 source encoding must be rejected.
  def test_header
    assert_raise(ArgumentError) { eval("# encoding:utf-16le\nfoo") }
    assert_raise(ArgumentError) { eval("# encoding:utf-16be\nfoo") }
  end


  # Line splitting and chomp must recognise the two-byte UTF-16 newline and
  # must not treat a lone "\n" byte as one.
  def test_is_mbc_newline
    sl = "f\0o\0o\0\n\0b\0a\0r\0\n\0b\0a\0z\0\n\0".force_encoding("utf-16le")
    sb = "\0f\0o\0o\0\n\0b\0a\0r\0\n\0b\0a\0z\0\n".force_encoding("utf-16be")
    al = sl.lines.to_a
    ab = sb.lines.to_a
    assert_equal("f\0o\0o\0\n\0".force_encoding("utf-16le"), al.shift)
    assert_equal("b\0a\0r\0\n\0".force_encoding("utf-16le"), al.shift)
    assert_equal("b\0a\0z\0\n\0".force_encoding("utf-16le"), al.shift)
    assert_equal("\0f\0o\0o\0\n".force_encoding("utf-16be"), ab.shift)
    assert_equal("\0b\0a\0r\0\n".force_encoding("utf-16be"), ab.shift)
    assert_equal("\0b\0a\0z\0\n".force_encoding("utf-16be"), ab.shift)

    sl = "f\0o\0o\0\n\0".force_encoding("utf-16le")
    sb = "\0f\0o\0o\0\n".force_encoding("utf-16be")
    sl2 = "f\0o\0o\0".force_encoding("utf-16le")
    sb2 = "\0f\0o\0o".force_encoding("utf-16be")
    assert_equal(sl2, sl.chomp)
    assert_equal(sl2, sl.chomp.chomp)
    assert_equal(sb2, sb.chomp)
    assert_equal(sb2, sb.chomp.chomp)

    # Here the trailing "\n" is a single byte, so chomp must leave the
    # strings unchanged.
    sl = "f\0o\0o\0\n".force_encoding("utf-16le")
    sb = "\0f\0o\0o\n".force_encoding("utf-16be")
    assert_equal(sl, sl.chomp)
    assert_equal(sb, sb.chomp)
  end

  # Integer#chr must produce the expected two-byte UTF-16 sequences.
  def test_code_to_mbc
    assert_equal("a\0".force_encoding("utf-16le"), "a".ord.chr("utf-16le"))
    assert_equal("\0a".force_encoding("utf-16be"), "a".ord.chr("utf-16be"))
  end

  # Helper: rebuilds +s+ character by character in encoding +e+ via
  # Integer#chr and returns the joined result.
  def utf8_to_utf16(s, e)
    s.chars.map {|c| c.ord.chr(e) }.join
  end

  # Case-insensitive UTF-16 regexps with back-references must keep their
  # encoding and match the expected subject.
  def test_mbc_case_fold
    rl = Regexp.new(utf8_to_utf16("^(\u3042)(a)\\1\\2$", "utf-16le"), "i")
    rb = Regexp.new(utf8_to_utf16("^(\u3042)(a)\\1\\2$", "utf-16be"), "i")
    assert_equal(Encoding.find("utf-16le"), rl.encoding)
    assert_equal(Encoding.find("utf-16be"), rb.encoding)
    assert_match(rl, utf8_to_utf16("\u3042a\u3042a", "utf-16le"))
    assert_match(rb, utf8_to_utf16("\u3042a\u3042a", "utf-16be"))
  end

  # A surrogate pair must count as one character (code point 0x20bb7) for
  # size, ord, chr and chop.
  def test_surrogate_pair
    sl = "\x42\xd8\xb7\xdf".force_encoding("utf-16le")
    sb = "\xd8\x42\xdf\xb7".force_encoding("utf-16be")

    assert_equal(1, sl.size)
    assert_equal(1, sb.size)
    assert_equal(0x20bb7, sl.ord)
    assert_equal(0x20bb7, sb.ord)
    assert_equal(sl, 0x20bb7.chr("utf-16le"))
    assert_equal(sb, 0x20bb7.chr("utf-16be"))
    assert_equal("", sl.chop)
    assert_equal("", sb.chop)
  end

  # A regexp built from Regexp.escape of a UTF-16BE string must match that
  # string.
  def test_regexp_escape
    s = "\0*".force_encoding("UTF-16BE")
    r = Regexp.new(Regexp.escape(s))
    assert(r =~ s, "#{encdump(r)} =~ #{encdump(s)}")
  end

  # casecmp must return 0 for "\0A"/"\0a" in UTF-16BE and "A\0"/"a\0" in
  # UTF-16LE, but not for the same bytes in the opposite byte order. Sorting
  # with casecmp must also agree with sorting with <=> for the digit strings
  # below.
  def test_casecmp2
    assert_equal(0, "\0A".force_encoding("UTF-16BE").casecmp("\0a".force_encoding("UTF-16BE")))
    assert_not_equal(0, "\0A".force_encoding("UTF-16LE").casecmp("\0a".force_encoding("UTF-16LE")))
    assert_not_equal(0, "A\0".force_encoding("UTF-16BE").casecmp("a\0".force_encoding("UTF-16BE")))
    assert_equal(0, "A\0".force_encoding("UTF-16LE").casecmp("a\0".force_encoding("UTF-16LE")))

    ary = ["01".force_encoding("UTF-16LE"),
           "10".force_encoding("UTF-16LE")]
    e = ary.sort {|x,y| x <=> y }
    a = ary.sort {|x,y| x.casecmp(y) }
    assert_equal(e, a)
  end
end