stringprep/tools/gen_unicode_test.tcl

#!/usr/bin/tclsh

# gen_unicode_test.tcl --
#
#	This program parses the RFC 3454 file and generates the
#	corresponding unicode.test file with unicode package tests.
#	The input to this program should be NormalizationTest.txt.
#	It can be downloaded from:
#	ftp://ftp.unicode.org/Public/UNIDATA/NormalizationTest.txt
#	Short test suite is generated by default. If you want to generate
#	all tests (more than 300000 test cases) add suffix 'full' as the
#	third argument.
#
# Usage: gen_unicode_test.tcl infile outdir ?full?
#
# RCS: @(#) $Id: gen_unicode_test.tcl,v 1.1 2008/01/29 02:18:10 patthoyts Exp $

package require struct::list

set short_test_list [list \
    "LATIN CAPITAL LETTER D, COMBINING DOT ABOVE, COMBINING DOT BELOW" \
    "NO-BREAK SPACE" \
    "VULGAR FRACTION ONE HALF" \
    "ORIYA LETTER RRA" \
    "KANNADA VOWEL SIGN EE" \
    "TIBETAN LETTER GHA" \
    "MODIFIER LETTER CAPITAL A" \
    "GREEK SMALL LETTER EPSILON WITH PSILI AND OXIA" \
    "KANGXI RADICAL SPROUT" \
    "HIRAGANA LETTER DE" \
    "KATAKANA LETTER PA" \
    "HANGUL LETTER SIOS-PIEUP" \
    "HANGUL SYLLABLE GYANG" \
    "CJK COMPATIBILITY IDEOGRAPH-F98E" \
    "ARABIC LETTER HEH DOACHASHMEE ISOLATED FORM" \
    "ARABIC LIGATURE AIN WITH JEEM ISOLATED FORM" \
    "FULLWIDTH DIGIT THREE" \
    "LATIN SMALL LETTER A, COMBINING CYRILLIC TITLO, COMBINING COMMA ABOVE RIGHT, COMBINING GRAVE ACCENT, HEBREW ACCENT ZINOR, LATIN SMALL LETTER B" \
    "LATIN SMALL LETTER A, DEVANAGARI SIGN NUKTA, COMBINING TILDE OVERLAY, COMBINING RING OVERLAY, LATIN SMALL LETTER B" \
    "HANGUL SYLLABLE BYO, COMBINING TILDE OVERLAY, HANGUL JONGSEONG TIKEUT"]

set fd [open [lindex $argv 0]]

set all_tests {}
set n 0
while {[gets $fd line] >= 0} {
    set line [string trim $line]
    if {![regexp \
	      {^([ [:xdigit:]]+);([ [:xdigit:]]+);([ [:xdigit:]]+);([ [:xdigit:]]+);([ [:xdigit:]]+);.*\) (.*)} \
	      $line -> c(1) c(2) c(3) c(4) c(5) title]} continue

    set q 1
    foreach i {1 2 3 4 5} {
	set s($i) {}
	set us($i) ""
	foreach xnum $c($i) {
	    set uc [scan $xnum %x]
	    if {$uc > 0xffff} {
		set q 0
	    }
	    lappend s($i) $uc
	    append us($i) \\u$xnum
	}
    }
    if {!$q} {
	# Test case contains character which is greater than 0xFFFF and can't
	# be represented in Tcl
	continue
    }
    set test($n) [list $s(1) $s(2) $s(3) $s(4) $s(5) $title]
    set test1($n) [list $us(1) $us(2) $us(3) $us(4) $us(5) $title]
    if {[lsearch $short_test_list $title] >= 0} {
	lappend all_tests $n
    }
    incr n
}

close $fd

if {[string equal [lindex $argv 2] full]} {
    set all_tests [struct::list iota $n]
}

set f [open [file join [lindex $argv 1] unicode.test] w]
fconfigure $f -translation lf
puts $f \
"# unicode.test
#
# Tests for the unicode package. This file is automatically generated by
# the gen_unicode_test.tcl script. Do not modify this file by hands.
#
# RCS: @(#) \$Id\$

# -------------------------------------------------------------------------

source \[file join \\
	\[file dirname \[file dirname \[file join \[pwd\] \[info script\]\]\]\] \\
	devtools testutilities.tcl\]

testsNeedTcl     8.3
testsNeedTcltest 1.0

testing {
    useLocalFile unicode_data.tcl
    useLocalFile unicode.tcl
}

# -------------------------------------------------------------------------
"

set j 0
foreach i $all_tests {
    puts $f \
"
test unicode-1.[incr j] {normalizeS D: [lindex $test1($i) 5]} {
    unicode::normalizeS D \"[lindex $test1($i) 0]\"
} \"[lindex $test1($i) 2]\"

test unicode-1.[incr j] {normalize D: [lindex $test($i) 5]} {
    unicode::normalize D [list [lindex $test($i) 1]]
} {[lindex $test($i) 2]}

test unicode-1.[incr j] {normalize D: [lindex $test($i) 5]} {
    unicode::normalize D [list [lindex $test($i) 2]]
} {[lindex $test($i) 2]}

test unicode-1.[incr j] {normalize D: [lindex $test($i) 5]} {
    unicode::normalize D [list [lindex $test($i) 3]]
} {[lindex $test($i) 4]}

test unicode-1.[incr j] {normalize D: [lindex $test($i) 5]} {
    unicode::normalize D [list [lindex $test($i) 4]]
} {[lindex $test($i) 4]}
"
}

set j 0
foreach i $all_tests {
    puts $f \
"
test unicode-2.[incr j] {normalize C: [lindex $test($i) 5]} {
    unicode::normalize C [list [lindex $test($i) 0]]
} {[lindex $test($i) 1]}

test unicode-2.[incr j] {normalizeS C: [lindex $test1($i) 5]} {
    unicode::normalizeS C \"[lindex $test1($i) 1]\"
} \"[lindex $test1($i) 1]\"

test unicode-2.[incr j] {normalize C: [lindex $test($i) 5]} {
    unicode::normalize C [list [lindex $test($i) 2]]
} {[lindex $test($i) 1]}

test unicode-2.[incr j] {normalize C: [lindex $test($i) 5]} {
    unicode::normalize C [list [lindex $test($i) 3]]
} {[lindex $test($i) 3]}

test unicode-2.[incr j] {normalize C: [lindex $test($i) 5]} {
    unicode::normalize C [list [lindex $test($i) 4]]
} {[lindex $test($i) 3]}
"
}

set j 0
foreach i $all_tests {
    puts $f \
"
test unicode-3.[incr j] {normalize KD: [lindex $test($i) 5]} {
    unicode::normalize KD [list [lindex $test($i) 0]]
} {[lindex $test($i) 4]}

test unicode-3.[incr j] {normalize KD: [lindex $test($i) 5]} {
    unicode::normalize KD [list [lindex $test($i) 1]]
} {[lindex $test($i) 4]}

test unicode-3.[incr j] {normalizeS KD: [lindex $test1($i) 5]} {
    unicode::normalizeS KD \"[lindex $test1($i) 2]\"
} \"[lindex $test1($i) 4]\"

test unicode-3.[incr j] {normalize KD: [lindex $test($i) 5]} {
    unicode::normalize KD [list [lindex $test($i) 3]]
} {[lindex $test($i) 4]}

test unicode-1.[incr j] {normalize KD: [lindex $test($i) 5]} {
    unicode::normalize KD [list [lindex $test($i) 4]]
} {[lindex $test($i) 4]}
"
}

set j 0
foreach i $all_tests {
    puts $f \
"
test unicode-4.[incr j] {normalize KC: [lindex $test($i) 5]} {
    unicode::normalize KC [list [lindex $test($i) 0]]
} {[lindex $test($i) 3]}

test unicode-4.[incr j] {normalize KC: [lindex $test($i) 5]} {
    unicode::normalize KC [list [lindex $test($i) 1]]
} {[lindex $test($i) 3]}

test unicode-4.[incr j] {normalize KC: [lindex $test($i) 5]} {
    unicode::normalize KC [list [lindex $test($i) 2]]
} {[lindex $test($i) 3]}

test unicode-4.[incr j] {normalizeS KC: [lindex $test1($i) 5]} {
    unicode::normalizeS KC \"[lindex $test1($i) 3]\"
} \"[lindex $test1($i) 3]\"

test unicode-4.[incr j] {normalize KC: [lindex $test($i) 5]} {
    unicode::normalize KC [list [lindex $test($i) 4]]
} {[lindex $test($i) 3]}
"
}

puts $f \
"
test unicode-5.1 {fromstring} {
    unicode::fromstring \"\\u0403\\u0405\\u0406\\u041f\\u0034\"
} {1027 1029 1030 1055 52}

test unicode-5.2 {fromstring} {
    unicode::fromstring \"\\u0001\\u0002\\u0003\\u0004\\u0005\\u0006\\u0007\\u0008\\u0009\\u000a\\u000b\\u000c\\u000d\"
} {1 2 3 4 5 6 7 8 9 10 11 12 13}

test unicode-6.1 {tostring} {
    unicode::tostring {16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1}
} \"\\u0010\\u000f\\u000e\\u000d\\u000c\\u000b\\u000a\\u0009\\u0008\\u0007\\u0006\\u0005\\u0004\\u0003\\u0002\\u0001\"

test unicode-6.2 {tostring} {
    unicode::tostring {12345 12346 12347 12348 12349 12350 12351}
} \"\\u3039\\u303a\\u303b\\u303c\\u303d\\u303e\\u303f\"

test unicode-7.1 {normalize bad form} {
    catch {unicode::normalize S \"\"} result
    set result
} \"::unicode::normalize: Only D, C, KD and KC forms are allowed\"

test unicode-8.1 {normalizeS bad form} {
    catch {unicode::normalizeS S \"\"} result
    set result
} \"::unicode::normalizeS: Only D, C, KD and KC forms are allowed\"

::tcltest::cleanupTests
"

close $f