1#!/usr/bin/tclsh
2
3# gen_unicode_test.tcl --
4#
5#	This program parses the RFC 3454 file and generates the
6#	corresponding unicode.test file with unicode package tests.
7#	The input to this program should be NormalizationTest.txt.
8#	It can be downloaded from:
9#	ftp://ftp.unicode.org/Public/UNIDATA/NormalizationTest.txt
10#	Short test suite is generated by default. If you want to generate
11#	all tests (more than 300000 test cases) add suffix 'full' as the
12#	third argument.
13#
14# Usage: gen_unicode_test.tcl infile outdir ?full?
15#
16# RCS: @(#) $Id: gen_unicode_test.tcl,v 1.1 2008/01/29 02:18:10 patthoyts Exp $
17
18package require struct::list
19
20set short_test_list [list \
21    "LATIN CAPITAL LETTER D, COMBINING DOT ABOVE, COMBINING DOT BELOW" \
22    "NO-BREAK SPACE" \
23    "VULGAR FRACTION ONE HALF" \
24    "ORIYA LETTER RRA" \
25    "KANNADA VOWEL SIGN EE" \
26    "TIBETAN LETTER GHA" \
27    "MODIFIER LETTER CAPITAL A" \
28    "GREEK SMALL LETTER EPSILON WITH PSILI AND OXIA" \
29    "KANGXI RADICAL SPROUT" \
30    "HIRAGANA LETTER DE" \
31    "KATAKANA LETTER PA" \
32    "HANGUL LETTER SIOS-PIEUP" \
33    "HANGUL SYLLABLE GYANG" \
34    "CJK COMPATIBILITY IDEOGRAPH-F98E" \
35    "ARABIC LETTER HEH DOACHASHMEE ISOLATED FORM" \
36    "ARABIC LIGATURE AIN WITH JEEM ISOLATED FORM" \
37    "FULLWIDTH DIGIT THREE" \
38    "LATIN SMALL LETTER A, COMBINING CYRILLIC TITLO, COMBINING COMMA ABOVE RIGHT, COMBINING GRAVE ACCENT, HEBREW ACCENT ZINOR, LATIN SMALL LETTER B" \
39    "LATIN SMALL LETTER A, DEVANAGARI SIGN NUKTA, COMBINING TILDE OVERLAY, COMBINING RING OVERLAY, LATIN SMALL LETTER B" \
40    "HANGUL SYLLABLE BYO, COMBINING TILDE OVERLAY, HANGUL JONGSEONG TIKEUT"]
41
42set fd [open [lindex $argv 0]]
43
44set all_tests {}
45set n 0
46while {[gets $fd line] >= 0} {
47    set line [string trim $line]
48    if {![regexp \
49	      {^([ [:xdigit:]]+);([ [:xdigit:]]+);([ [:xdigit:]]+);([ [:xdigit:]]+);([ [:xdigit:]]+);.*\) (.*)} \
50	      $line -> c(1) c(2) c(3) c(4) c(5) title]} continue
51
52    set q 1
53    foreach i {1 2 3 4 5} {
54	set s($i) {}
55	set us($i) ""
56	foreach xnum $c($i) {
57	    set uc [scan $xnum %x]
58	    if {$uc > 0xffff} {
59		set q 0
60	    }
61	    lappend s($i) $uc
62	    append us($i) \\u$xnum
63	}
64    }
65    if {!$q} {
66	# Test case contains character which is greater than 0xFFFF and can't
67	# be represented in Tcl
68	continue
69    }
70    set test($n) [list $s(1) $s(2) $s(3) $s(4) $s(5) $title]
71    set test1($n) [list $us(1) $us(2) $us(3) $us(4) $us(5) $title]
72    if {[lsearch $short_test_list $title] >= 0} {
73	lappend all_tests $n
74    }
75    incr n
76}
77
78close $fd
79
80if {[string equal [lindex $argv 2] full]} {
81    set all_tests [struct::list iota $n]
82}
83
84set f [open [file join [lindex $argv 1] unicode.test] w]
85fconfigure $f -translation lf
86puts $f \
87"# unicode.test
88#
89# Tests for the unicode package. This file is automatically generated by
90# the gen_unicode_test.tcl script. Do not modify this file by hands.
91#
92# RCS: @(#) \$Id\$
93
94# -------------------------------------------------------------------------
95
96source \[file join \\
97	\[file dirname \[file dirname \[file join \[pwd\] \[info script\]\]\]\] \\
98	devtools testutilities.tcl\]
99
100testsNeedTcl     8.3
101testsNeedTcltest 1.0
102
103testing {
104    useLocalFile unicode_data.tcl
105    useLocalFile unicode.tcl
106}
107
108# -------------------------------------------------------------------------
109"
110
111set j 0
112foreach i $all_tests {
113    puts $f \
114"
115test unicode-1.[incr j] {normalizeS D: [lindex $test1($i) 5]} {
116    unicode::normalizeS D \"[lindex $test1($i) 0]\"
117} \"[lindex $test1($i) 2]\"
118
119test unicode-1.[incr j] {normalize D: [lindex $test($i) 5]} {
120    unicode::normalize D [list [lindex $test($i) 1]]
121} {[lindex $test($i) 2]}
122
123test unicode-1.[incr j] {normalize D: [lindex $test($i) 5]} {
124    unicode::normalize D [list [lindex $test($i) 2]]
125} {[lindex $test($i) 2]}
126
127test unicode-1.[incr j] {normalize D: [lindex $test($i) 5]} {
128    unicode::normalize D [list [lindex $test($i) 3]]
129} {[lindex $test($i) 4]}
130
131test unicode-1.[incr j] {normalize D: [lindex $test($i) 5]} {
132    unicode::normalize D [list [lindex $test($i) 4]]
133} {[lindex $test($i) 4]}
134"
135}
136
137set j 0
138foreach i $all_tests {
139    puts $f \
140"
141test unicode-2.[incr j] {normalize C: [lindex $test($i) 5]} {
142    unicode::normalize C [list [lindex $test($i) 0]]
143} {[lindex $test($i) 1]}
144
145test unicode-2.[incr j] {normalizeS C: [lindex $test1($i) 5]} {
146    unicode::normalizeS C \"[lindex $test1($i) 1]\"
147} \"[lindex $test1($i) 1]\"
148
149test unicode-2.[incr j] {normalize C: [lindex $test($i) 5]} {
150    unicode::normalize C [list [lindex $test($i) 2]]
151} {[lindex $test($i) 1]}
152
153test unicode-2.[incr j] {normalize C: [lindex $test($i) 5]} {
154    unicode::normalize C [list [lindex $test($i) 3]]
155} {[lindex $test($i) 3]}
156
157test unicode-2.[incr j] {normalize C: [lindex $test($i) 5]} {
158    unicode::normalize C [list [lindex $test($i) 4]]
159} {[lindex $test($i) 3]}
160"
161}
162
163set j 0
164foreach i $all_tests {
165    puts $f \
166"
167test unicode-3.[incr j] {normalize KD: [lindex $test($i) 5]} {
168    unicode::normalize KD [list [lindex $test($i) 0]]
169} {[lindex $test($i) 4]}
170
171test unicode-3.[incr j] {normalize KD: [lindex $test($i) 5]} {
172    unicode::normalize KD [list [lindex $test($i) 1]]
173} {[lindex $test($i) 4]}
174
175test unicode-3.[incr j] {normalizeS KD: [lindex $test1($i) 5]} {
176    unicode::normalizeS KD \"[lindex $test1($i) 2]\"
177} \"[lindex $test1($i) 4]\"
178
179test unicode-3.[incr j] {normalize KD: [lindex $test($i) 5]} {
180    unicode::normalize KD [list [lindex $test($i) 3]]
181} {[lindex $test($i) 4]}
182
183test unicode-1.[incr j] {normalize KD: [lindex $test($i) 5]} {
184    unicode::normalize KD [list [lindex $test($i) 4]]
185} {[lindex $test($i) 4]}
186"
187}
188
189set j 0
190foreach i $all_tests {
191    puts $f \
192"
193test unicode-4.[incr j] {normalize KC: [lindex $test($i) 5]} {
194    unicode::normalize KC [list [lindex $test($i) 0]]
195} {[lindex $test($i) 3]}
196
197test unicode-4.[incr j] {normalize KC: [lindex $test($i) 5]} {
198    unicode::normalize KC [list [lindex $test($i) 1]]
199} {[lindex $test($i) 3]}
200
201test unicode-4.[incr j] {normalize KC: [lindex $test($i) 5]} {
202    unicode::normalize KC [list [lindex $test($i) 2]]
203} {[lindex $test($i) 3]}
204
205test unicode-4.[incr j] {normalizeS KC: [lindex $test1($i) 5]} {
206    unicode::normalizeS KC \"[lindex $test1($i) 3]\"
207} \"[lindex $test1($i) 3]\"
208
209test unicode-4.[incr j] {normalize KC: [lindex $test($i) 5]} {
210    unicode::normalize KC [list [lindex $test($i) 4]]
211} {[lindex $test($i) 3]}
212"
213}
214
215puts $f \
216"
217test unicode-5.1 {fromstring} {
218    unicode::fromstring \"\\u0403\\u0405\\u0406\\u041f\\u0034\"
219} {1027 1029 1030 1055 52}
220
221test unicode-5.2 {fromstring} {
222    unicode::fromstring \"\\u0001\\u0002\\u0003\\u0004\\u0005\\u0006\\u0007\\u0008\\u0009\\u000a\\u000b\\u000c\\u000d\"
223} {1 2 3 4 5 6 7 8 9 10 11 12 13}
224
225test unicode-6.1 {tostring} {
226    unicode::tostring {16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1}
227} \"\\u0010\\u000f\\u000e\\u000d\\u000c\\u000b\\u000a\\u0009\\u0008\\u0007\\u0006\\u0005\\u0004\\u0003\\u0002\\u0001\"
228
229test unicode-6.2 {tostring} {
230    unicode::tostring {12345 12346 12347 12348 12349 12350 12351}
231} \"\\u3039\\u303a\\u303b\\u303c\\u303d\\u303e\\u303f\"
232
233test unicode-7.1 {normalize bad form} {
234    catch {unicode::normalize S \"\"} result
235    set result
236} \"::unicode::normalize: Only D, C, KD and KC forms are allowed\"
237
238test unicode-8.1 {normalizeS bad form} {
239    catch {unicode::normalizeS S \"\"} result
240    set result
241} \"::unicode::normalizeS: Only D, C, KD and KC forms are allowed\"
242
243::tcltest::cleanupTests
244"
245
246close $f
247
248