1require 'test/unit'
2
# Tests for String, Symbol and Regexp behaviour when the data is encoded as
# UTF-16BE or UTF-16LE (encodings that are not ASCII-compatible).
class TestUTF16 < Test::Unit::TestCase
  # Renders +obj+ as Ruby source text that also shows its encoding, for use in
  # assertion failure messages.
  #
  # obj:: a String or a Regexp.
  #
  # Returns a String such as <tt>"abc".force_encoding("UTF-8")</tt> or
  # <tt>Regexp.new(<dumped source>, <options>)</tt>.
  # Raises ArgumentError for any other kind of object.
  def encdump(obj)
    case obj
    when String
      d = obj.dump
      # String#dump may already end with a force_encoding call; only append
      # one when it is missing so the encoding is never shown twice.
      if /\.force_encoding\("[A-Za-z0-9.:_+-]*"\)\z/ =~ d
        d
      else
        "#{d}.force_encoding(#{obj.encoding.name.dump})"
      end
    when Regexp
      "Regexp.new(#{encdump(obj.source)}, #{obj.options})"
    else
      raise ArgumentError, "unexpected: #{obj.inspect}"
    end
  end

  # Calls <tt>recv.send(meth, *args)</tt> inside assert_nothing_raised, using
  # a textual rendering of the call as the failure message.
  #
  # recv:: the receiver; Strings are rendered with encdump, everything else
  #        with inspect.
  # meth:: the method name.
  # args:: the arguments, rendered the same way as the receiver.
  #
  # Returns the value returned by the call.
  def enccall(recv, meth, *args)
    # Start from an unfrozen copy so the description can be appended to.
    desc = ''.dup
    desc << (String === recv ? encdump(recv) : recv.inspect)
    desc << '.' << meth.to_s
    unless args.empty?
      rendered = args.map {|a| String === a ? encdump(a) : a.inspect }
      desc << '(' << rendered.join(',') << ')'
    end
    result = nil
    assert_nothing_raised(desc) {
      result = recv.send(meth, *args)
    }
    result
  end

  # assert_equal for strings, with a failure message that shows both values
  # through encdump so that their encodings are visible.
  #
  # expected:: the expected String.
  # actual::   the String under test.
  # message::  optional user message passed on to build_message.
  def assert_str_equal(expected, actual, message=nil)
    full_message = build_message(message, <<EOT)
#{encdump expected} expected but not equal to
#{encdump actual}.
EOT
    assert_equal(expected, actual, full_message)
  end

  # tests start

  # Complete code units and high+low surrogate pairs are valid UTF-16BE;
  # odd-length data and lone or mis-ordered surrogates are not.
  def test_utf16be_valid_encoding
    [
      "\x00\x00",
      "\xd7\xff",
      "\xd8\x00\xdc\x00",
      "\xdb\xff\xdf\xff",
      "\xe0\x00",
      "\xff\xff",
    ].each {|s|
      s.force_encoding("utf-16be")
      assert_equal(true, s.valid_encoding?, "#{encdump s}.valid_encoding?")
    }
    [
      "\x00",
      "\xd7",
      "\xd8\x00",
      "\xd8\x00\xd8\x00",
      "\xdc\x00",
      "\xdc\x00\xd8\x00",
      "\xdc\x00\xdc\x00",
      "\xe0",
      "\xff",
    ].each {|s|
      s.force_encoding("utf-16be")
      assert_equal(false, s.valid_encoding?, "#{encdump s}.valid_encoding?")
    }
  end

  # Little-endian counterpart of test_utf16be_valid_encoding.
  def test_utf16le_valid_encoding
    [
      "\x00\x00",
      "\xff\xd7",
      "\x00\xd8\x00\xdc",
      "\xff\xdb\xff\xdf",
      "\x00\xe0",
      "\xff\xff",
    ].each {|s|
      s.force_encoding("utf-16le")
      assert_equal(true, s.valid_encoding?, "#{encdump s}.valid_encoding?")
    }
    [
      "\x00",
      "\xd7",
      "\x00\xd8",
      "\x00\xd8\x00\xd8",
      "\x00\xdc",
      "\x00\xdc\x00\xd8",
      "\x00\xdc\x00\xdc",
      "\xe0",
      "\xff",
    ].each {|s|
      s.force_encoding("utf-16le")
      assert_equal(false, s.valid_encoding?, "#{encdump s}.valid_encoding?")
    }
  end

  # Time#strftime must reject a UTF-16BE format string with ArgumentError.
  def test_strftime
    s = "aa".force_encoding("utf-16be")
    assert_raise(ArgumentError, "Time.now.strftime(#{encdump s})") { Time.now.strftime(s) }
  end

  # A symbol made from a UTF-16BE string converts back to a string with the
  # same encoding.
  def test_intern
    s = "aaaa".force_encoding("utf-16be")
    assert_equal(s.encoding, s.intern.to_s.encoding, "#{encdump s}.intern.to_s.encoding")
  end

  # The symbol interned from the bytes "aa" tagged as UTF-16LE is not :aa.
  def test_sym_eq
    s = "aa".force_encoding("utf-16le")
    assert(s.intern != :aa, "#{encdump s}.intern != :aa")
  end

  # A UTF-16BE string and a US-ASCII string have no compatible encoding.
  def test_compatible
    s1 = "aa".force_encoding("utf-16be")
    s2 = "z".force_encoding("us-ascii")
    assert_nil(Encoding.compatible?(s1, s2), "Encoding.compatible?(#{encdump s1}, #{encdump s2})")
  end

  # casecmp between a UTF-16BE string and a plain "AA" literal must not
  # report equality.
  def test_casecmp
    s1 = "aa".force_encoding("utf-16be")
    s2 = "AA"
    assert_not_equal(0, s1.casecmp(s2), "#{encdump s1}.casecmp(#{encdump s2})")
  end

  # end_with? works on characters: the single byte "b" tagged as UTF-16BE is
  # not a suffix of the two-byte character "ab".
  def test_end_with
    s1 = "ab".force_encoding("utf-16be")
    s2 = "b".force_encoding("utf-16be")
    assert_equal(false, s1.end_with?(s2), "#{encdump s1}.end_with?(#{encdump s2})")
  end

  # String#hex raises Encoding::CompatibilityError on UTF-16 strings.
  def test_hex
    assert_raise(Encoding::CompatibilityError) {
      "ff".encode("utf-16le").hex
    }
    assert_raise(Encoding::CompatibilityError) {
      "ff".encode("utf-16be").hex
    }
  end

  # String#oct raises Encoding::CompatibilityError on UTF-16 strings.
  def test_oct
    assert_raise(Encoding::CompatibilityError) {
      "77".encode("utf-16le").oct
    }
    assert_raise(Encoding::CompatibilityError) {
      "77".encode("utf-16be").oct
    }
  end

  # String#count with a character set in a different encoding raises
  # Encoding::CompatibilityError.
  def test_count
    s1 = "aa".force_encoding("utf-16be")
    s2 = "aa"
    assert_raise(Encoding::CompatibilityError, "#{encdump s1}.count(#{encdump s2})") {
      s1.count(s2)
    }
  end

  # Adding a US-ASCII string and a UTF-16BE string raises
  # Encoding::CompatibilityError.
  def test_plus
    s1 = "a".force_encoding("us-ascii")
    s2 = "aa".force_encoding("utf-16be")
    assert_raise(Encoding::CompatibilityError, "#{encdump s1} + #{encdump s2}") {
      s1 + s2
    }
  end

  # Encoding.find rejects a name given as a UTF-16BE string.
  def test_encoding_find
    assert_raise(ArgumentError) {
      Encoding.find("utf-8".force_encoding("utf-16be"))
    }
  end

  # Interpolating a UTF-16BE string into a literal with ASCII content raises
  # Encoding::CompatibilityError.
  def test_interpolation
    s = "aa".force_encoding("utf-16be")
    assert_raise(Encoding::CompatibilityError, "\"a\#{#{encdump s}}\"") {
      "a#{s}"
    }
  end

  # slice!(-1) on a UTF-16BE string must not raise.
  def test_slice!
    enccall("aa".force_encoding("UTF-16BE"), :slice!, -1)
  end

  # An empty string plus a UTF-16BE string must not raise.
  def test_plus_empty1
    s1 = ""
    s2 = "aa".force_encoding("utf-16be")
    assert_nothing_raised("#{encdump s1} + #{encdump s2}") {
      s1 + s2
    }
  end

  # A non-empty string plus an empty UTF-16BE string must not raise.
  def test_plus_empty2
    s1 = "aa"
    s2 = "".force_encoding("utf-16be")
    assert_nothing_raised("#{encdump s1} + #{encdump s2}") {
      s1 + s2
    }
  end

  # Two non-empty strings in incompatible encodings cannot be added.
  def test_plus_nonempty
    s1 = "aa"
    s2 = "bb".force_encoding("utf-16be")
    assert_raise(Encoding::CompatibilityError, "#{encdump s1} + #{encdump s2}") {
      s1 + s2
    }
  end

  # Appending a UTF-16BE string to an empty string must not raise.
  def test_concat_empty1
    s1 = ""
    s2 = "aa".force_encoding("utf-16be")
    assert_nothing_raised("#{encdump s1} << #{encdump s2}") {
      s1 << s2
    }
  end

  # Appending an empty UTF-16BE string to a non-empty string must not raise.
  def test_concat_empty2
    s1 = "aa"
    s2 = "".force_encoding("utf-16be")
    assert_nothing_raised("#{encdump s1} << #{encdump s2}") {
      s1 << s2
    }
  end

  # Two non-empty strings in incompatible encodings cannot be concatenated.
  def test_concat_nonempty
    s1 = "aa"
    s2 = "bb".force_encoding("utf-16be")
    assert_raise(Encoding::CompatibilityError, "#{encdump s1} << #{encdump s2}") {
      s1 << s2
    }
  end

  # chomp works on UTF-16BE characters, not on raw bytes.
  def test_chomp
    # Bytes 01 0A form the single character U+010A, which is not a newline.
    s = "\1\n".force_encoding("utf-16be")
    assert_equal(s, s.chomp, "#{encdump s}.chomp")
    # Bytes 00 0A form U+000A, a real newline.
    s = "\0\n".force_encoding("utf-16be")
    assert_equal("", s.chomp, "#{encdump s}.chomp")
    # A CR LF pair encoded in UTF-16BE is removed as a whole.
    s = "\0\r\0\n".force_encoding("utf-16be")
    assert_equal("", s.chomp, "#{encdump s}.chomp")
  end

  # succ must return a validly encoded string, even from the largest code
  # unit and the largest surrogate pair.
  def test_succ
    s = "\xff\xff".force_encoding("utf-16be")
    assert(s.succ.valid_encoding?, "#{encdump s}.succ.valid_encoding?")

    s = "\xdb\xff\xdf\xff".force_encoding("utf-16be")
    assert(s.succ.valid_encoding?, "#{encdump s}.succ.valid_encoding?")
  end

  # Regexp.union of two UTF-16BE strings must not raise.
  def test_regexp_union
    enccall(Regexp, :union, "aa".force_encoding("utf-16be"), "bb".force_encoding("utf-16be"))
  end

  # A Regexp built from an empty UTF-16BE string keeps that encoding.
  def test_empty_regexp
    s = "".force_encoding("utf-16be")
    assert_equal(Encoding.find("utf-16be"), Regexp.new(s).encoding,
                "Regexp.new(#{encdump s}).encoding")
  end

  # Matching a UTF-16BE regexp against a plain "aa" literal raises
  # Encoding::CompatibilityError.
  def test_regexp_match
    assert_raise(Encoding::CompatibilityError) { Regexp.new("aa".force_encoding("utf-16be")) =~ "aa" }
  end

  # gsub with a UTF-16BE pattern and a plain "xy" replacement: no error for
  # "abcd", Encoding::CompatibilityError for input containing "\0\n".
  def test_gsub
    s = "abcd".force_encoding("utf-16be")
    assert_nothing_raised {
      s.gsub(Regexp.new(".".encode("utf-16be")), "xy")
    }
    s = "ab\0\ncd".force_encoding("utf-16be")
    assert_raise(Encoding::CompatibilityError) {
      s.gsub(Regexp.new(".".encode("utf-16be")), "xy")
    }
  end

  # Splitting on a single UTF-16BE space drops the leading and trailing
  # blanks and yields exactly the two words.
  def test_split_awk
    s = " ab cd ".encode("utf-16be")
    r = s.split(" ".encode("utf-16be"))
    assert_equal(2, r.length)
    assert_str_equal("ab".encode("utf-16be"), r[0])
    assert_str_equal("cd".encode("utf-16be"), r[1])
  end

  # A negated character set ("^b") counts the same in UTF-16 as in the
  # source encoding.
  def test_count2
    e = "abc".count("^b")
    assert_equal(e, "abc".encode("utf-16be").count("^b".encode("utf-16be")))
    assert_equal(e, "abc".encode("utf-16le").count("^b".encode("utf-16le")))
  end

  # A magic comment selecting UTF-16 as the source encoding is rejected with
  # ArgumentError.
  def test_header
    assert_raise(ArgumentError) { eval("# encoding:utf-16le\nfoo") }
    assert_raise(ArgumentError) { eval("# encoding:utf-16be\nfoo") }
  end


  # Newline detection in lines and chomp must use whole UTF-16 code units.
  def test_is_mbc_newline
    sl = "f\0o\0o\0\n\0b\0a\0r\0\n\0b\0a\0z\0\n\0".force_encoding("utf-16le")
    sb = "\0f\0o\0o\0\n\0b\0a\0r\0\n\0b\0a\0z\0\n".force_encoding("utf-16be")
    al = sl.lines.to_a
    ab = sb.lines.to_a
    assert_equal("f\0o\0o\0\n\0".force_encoding("utf-16le"), al.shift)
    assert_equal("b\0a\0r\0\n\0".force_encoding("utf-16le"), al.shift)
    assert_equal("b\0a\0z\0\n\0".force_encoding("utf-16le"), al.shift)
    assert_equal("\0f\0o\0o\0\n".force_encoding("utf-16be"), ab.shift)
    assert_equal("\0b\0a\0r\0\n".force_encoding("utf-16be"), ab.shift)
    assert_equal("\0b\0a\0z\0\n".force_encoding("utf-16be"), ab.shift)

    # chomp removes one encoded newline; a second chomp changes nothing.
    sl = "f\0o\0o\0\n\0".force_encoding("utf-16le")
    sb = "\0f\0o\0o\0\n".force_encoding("utf-16be")
    sl2 = "f\0o\0o\0".force_encoding("utf-16le")
    sb2 = "\0f\0o\0o".force_encoding("utf-16be")
    assert_equal(sl2, sl.chomp)
    assert_equal(sl2, sl.chomp.chomp)
    assert_equal(sb2, sb.chomp)
    assert_equal(sb2, sb.chomp.chomp)

    # A trailing "\n" byte that is not a complete code unit is left alone.
    sl = "f\0o\0o\0\n".force_encoding("utf-16le")
    sb = "\0f\0o\0o\n".force_encoding("utf-16be")
    assert_equal(sl, sl.chomp)
    assert_equal(sb, sb.chomp)
  end

  # Integer#chr with a UTF-16 encoding produces the two-byte code unit in
  # the matching byte order.
  def test_code_to_mbc
    assert_equal("a\0".force_encoding("utf-16le"), "a".ord.chr("utf-16le"))
    assert_equal("\0a".force_encoding("utf-16be"), "a".ord.chr("utf-16be"))
  end

  # Rebuilds +s+ character by character in encoding +e+ using Integer#chr.
  #
  # s:: the source String.
  # e:: the target encoding name, e.g. "utf-16le".
  #
  # Returns the joined String.
  def utf8_to_utf16(s, e)
    s.chars.map {|c| c.ord.chr(e) }.join
  end

  # Case-insensitive UTF-16 regexps with back-references keep their encoding
  # and match a UTF-16 subject.
  def test_mbc_case_fold
    rl = Regexp.new(utf8_to_utf16("^(\u3042)(a)\\1\\2$", "utf-16le"), "i")
    rb = Regexp.new(utf8_to_utf16("^(\u3042)(a)\\1\\2$", "utf-16be"), "i")
    assert_equal(Encoding.find("utf-16le"), rl.encoding)
    assert_equal(Encoding.find("utf-16be"), rb.encoding)
    assert_match(rl, utf8_to_utf16("\u3042a\u3042a", "utf-16le"))
    assert_match(rb, utf8_to_utf16("\u3042a\u3042a", "utf-16be"))
  end

  # A surrogate pair (U+20BB7) is one character for size, ord, chr and chop.
  def test_surrogate_pair
    sl = "\x42\xd8\xb7\xdf".force_encoding("utf-16le")
    sb = "\xd8\x42\xdf\xb7".force_encoding("utf-16be")

    assert_equal(1, sl.size)
    assert_equal(1, sb.size)
    assert_equal(0x20bb7, sl.ord)
    assert_equal(0x20bb7, sb.ord)
    assert_equal(sl, 0x20bb7.chr("utf-16le"))
    assert_equal(sb, 0x20bb7.chr("utf-16be"))
    assert_equal("", sl.chop)
    assert_equal("", sb.chop)
  end

  # A regexp built from Regexp.escape of a UTF-16BE "*" matches the original
  # string.
  def test_regexp_escape
    s = "\0*".force_encoding("UTF-16BE")
    r = Regexp.new(Regexp.escape(s))
    assert(r =~ s, "#{encdump(r)} =~ #{encdump(s)}")
  end

  # casecmp folds case only when the bytes really encode ASCII letters in
  # the given byte order, and it orders strings the same way <=> does.
  def test_casecmp2
    assert_equal(0, "\0A".force_encoding("UTF-16BE").casecmp("\0a".force_encoding("UTF-16BE")))
    assert_not_equal(0, "\0A".force_encoding("UTF-16LE").casecmp("\0a".force_encoding("UTF-16LE")))
    assert_not_equal(0, "A\0".force_encoding("UTF-16BE").casecmp("a\0".force_encoding("UTF-16BE")))
    assert_equal(0, "A\0".force_encoding("UTF-16LE").casecmp("a\0".force_encoding("UTF-16LE")))

    ary = ["01".force_encoding("UTF-16LE"),
           "10".force_encoding("UTF-16LE")]
    e = ary.sort {|x,y| x <=> y }
    a = ary.sort {|x,y| x.casecmp(y) }
    assert_equal(e, a)
  end
end
385