1#!/usr/bin/env tclsh 2 3# gen_stringprep_data.tcl -- 4# 5# This program parses the RFC 3454 file and generates the 6# corresponding stringprep_data.tcl file with compressed character 7# data tables. The input to this program should be rfc3454.txt. 8# It can be downloaded from http://www.ietf.org/rfc/rfc3454.txt 9# 10# Copyright (c) 1998-1999 by Scriptics Corporation. 11# All rights reserved. 12# 13# Modified for ejabberd by Alexey Shchepin 14# Modified for Tcl stringprep by Sergei Golovan 15# 16# Usage: gen_stringprep_data.tcl infile outdir 17# 18# RCS: @(#) $Id: gen_stringprep_data.tcl,v 1.2 2009/11/02 00:26:44 patthoyts Exp $ 19 20 21namespace eval uni { 22 set shift 7; # number of bits of data within a page 23 # This value can be adjusted to find the 24 # best split to minimize table size 25 26 variable pMap; # map from page to page index, each entry is 27 # an index into the pages table, indexed by 28 # page number 29 variable pages; # map from page index to page info, each 30 # entry is a list of indices into the groups 31 # table, the list is indexed by the offset 32 variable groups; # list of character info values, indexed by 33 # group number, initialized with the 34 # unassigned character group 35} 36 37proc uni::getValue {i} { 38 variable casemap 39 variable casemap2 40 variable tablemap 41 42 if {[info exists tablemap($i)]} { 43 set tables $tablemap($i) 44 } else { 45 set tables {} 46 } 47 48 if {[info exists casemap2($i)]} { 49 set multicase 1 50 set delta $casemap2($i) 51 } else { 52 set multicase 0 53 if {[info exists casemap($i)]} { 54 set delta $casemap($i) 55 } else { 56 set delta 0 57 } 58 } 59 60 if {abs($delta) > 0xFFFFF} { 61 puts "delta must be less than 22 bits wide" 62 exit 63 } 64 65 set a1 0 66 set b1 0 67 set b2 0 68 set b3 0 69 set c11 0 70 set c12 0 71 set c21 0 72 set c22 0 73 set c3 0 74 set c4 0 75 set c5 0 76 set c6 0 77 set c7 0 78 set c8 0 79 set c9 0 80 set d1 0 81 set d2 0 82 83 foreach tab $tables { 84 switch -glob -- $tab { 85 A.1 {set a1 1} 86 B.1 {set b1 1} 87 B.2 {set b2 1} 88 B.3 {set b3 1} 89 C.1.1 {set c11 1} 90 C.1.2 {set c12 1} 91 C.2.1 {set c21 1} 92 C.2.2 {set c22 1} 93 C.3 {set c3 1} 94 C.4 {set c4 1} 95 C.5 {set c5 1} 96 C.6 {set c6 1} 97 C.7 {set c7 1} 98 C.8 {set c8 1} 99 C.9 {set c9 1} 100 D.1 {set d1 1} 101 D.2 {set d2 1} 102 } 103 } 104 105 set val [expr {($a1 << 0) | 106 ($b1 << 1) | 107 ($b3 << 2) | 108 ($c11 << 3) | 109 ($c12 << 4) | 110 ($c21 << 5) | 111 ($c22 << 6) | 112 (($c3 | $c4 | $c5 | $c6 | $c7 | $c8 | $c9) << 7) | 113 ($d1 << 8) | 114 ($d2 << 9) | 115 ($multicase << 10) | 116 ($delta << 11)}] 117 118 return $val 119} 120 121proc uni::getGroup {value} { 122 variable groups 123 124 set gIndex [lsearch -exact $groups $value] 125 if {$gIndex == -1} { 126 set gIndex [llength $groups] 127 lappend groups $value 128 } 129 return $gIndex 130} 131 132proc uni::addPage {info} { 133 variable pMap 134 variable pages 135 variable pages_map 136 137 if {[info exists pages_map($info)]} { 138 lappend pMap $pages_map($info) 139 } else { 140 set pIndex [llength $pages] 141 lappend pages $info 142 set pages_map($info) $pIndex 143 lappend pMap $pIndex 144 } 145 return 146} 147 148 149proc uni::load_tables {data} { 150 variable casemap 151 variable casemap2 152 variable multicasemap 153 variable tablemap 154 155 set multicasemap {} 156 set table "" 157 158 foreach line [split $data \n] { 159 if {$table == ""} { 160 if {[regexp { ----- Start Table (.*) -----} $line temp table]} { 161 #puts "Start table '$table'" 162 } 163 } else { 164 if {[regexp { ----- End Table (.*) -----} $line temp table1]} { 165 set table "" 166 } else { 167 if {$table == "B.1"} { 168 if {[regexp {^ ([[:xdigit:]]+); ;} $line \ 169 temp val]} { 170 scan $val %x val 171 if {$val <= 0x10ffff} { 172 lappend tablemap($val) $table 173 } 174 } 175 } elseif {$table == "B.2"} { 176 # B.2 table is used for mapping with normalisation 177 if {[regexp {^ ([[:xdigit:]]+); ([[:xdigit:]]+);} $line \ 178 temp from to]} { 179 scan $from %x from 180 scan $to %x to 181 if {$from <= 0x10ffff && $to <= 0x10ffff} { 182 set casemap($from) [expr {$to - $from}] 183 } 184 } elseif {[regexp {^ ([[:xdigit:]]+); ([[:xdigit:]]+) ([[:xdigit:]]+);} $line \ 185 temp from to1 to2]} { 186 scan $from %x from 187 scan $to1 %x to1 188 scan $to2 %x to2 189 if {$from <= 0x10ffff && \ 190 $to1 <= 0x10ffff && $to2 <= 0x10ffff} { 191 set casemap2($from) [llength $multicasemap] 192 lappend multicasemap [list $to1 $to2] 193 } 194 } elseif {[regexp {^ ([[:xdigit:]]+); ([[:xdigit:]]+) ([[:xdigit:]]+) ([[:xdigit:]]+);} $line \ 195 temp from to1 to2 to3]} { 196 scan $from %x from 197 scan $to1 %x to1 198 scan $to2 %x to2 199 scan $to3 %x to3 200 if {$from <= 0x10ffff && \ 201 $to1 <= 0x10ffff && $to2 <= 0x10ffff && \ 202 $to3 <= 0x10ffff} { 203 set casemap2($from) [llength $multicasemap] 204 lappend multicasemap [list $to1 $to2 $to3] 205 } 206 } elseif {[regexp {^ ([[:xdigit:]]+); ([[:xdigit:]]+) ([[:xdigit:]]+) ([[:xdigit:]]+) ([[:xdigit:]]+);} $line \ 207 temp from to1 to2 to3 to4]} { 208 scan $from %x from 209 scan $to1 %x to1 210 scan $to2 %x to2 211 scan $to3 %x to3 212 scan $to4 %x to4 213 if {$from <= 0x10ffff && \ 214 $to1 <= 0x10ffff && $to2 <= 0x10ffff && \ 215 $to3 <= 0x10ffff && $to4 <= 0x10ffff} { 216 set casemap2($from) [llength $multicasemap] 217 lappend multicasemap [list $to1 $to2 $to3 $to4] 218 } 219 } else { 220 #puts "missed: $line" 221 } 222 223 } elseif {$table == "B.3"} { 224 # B.3 table is used for mapping without normalisation (B.3 is a subset of B.2) 225 if {[regexp {^ ([[:xdigit:]]+);} $line temp from]} { 226 scan $from %x from 227 if {$from <= 0x10ffff} { 228 lappend tablemap($from) $table 229 } 230 } 231 } else { 232 if {[regexp {^ ([[:xdigit:]]+)-([[:xdigit:]]+)} $line \ 233 temp from to]} { 234 scan $from %x from 235 scan $to %x to 236 for {set i $from} {$i <= $to && $i <= 0x10ffff} {incr i} { 237 lappend tablemap($i) $table 238 } 239 } elseif {[regexp {^ ([[:xdigit:]]+)} $line \ 240 temp val]} { 241 scan $val %x val 242 if {$val <= 0x10ffff} { 243 lappend tablemap($val) $table 244 } 245 } 246 } 247 } 248 } 249 } 250} 251 252proc uni::buildTables {} { 253 variable shift 254 255 variable casemap 256 variable tablemap 257 258 variable pMap {} 259 variable pages {} 260 variable groups {} 261 set info {} ;# temporary page info 262 263 set mask [expr {(1 << $shift) - 1}] 264 265 set next 0 266 267 for {set i 0} {$i <= 0x10ffff} {incr i} { 268 set gIndex [getGroup [getValue $i]] 269 270 # Split character index into offset and page number 271 set offset [expr {$i & $mask}] 272 set page [expr {($i >> $shift)}] 273 274 # Add the group index to the info for the current page 275 lappend info $gIndex 276 277 # If this is the last entry in the page, add the page 278 if {$offset == $mask} { 279 addPage $info 280 set info {} 281 } 282 } 283 return 284} 285 286proc uni::main {} { 287 global argc argv0 argv 288 variable pMap 289 variable pages 290 variable groups 291 variable shift 292 variable multicasemap 293 294 if {$argc != 2} { 295 puts stderr "\nusage: $argv0 <datafile> <outdir>\n" 296 exit 1 297 } 298 set f [open [lindex $argv 0] r] 299 set data [read $f] 300 close $f 301 302 load_tables $data 303 buildTables 304 #puts "X = [llength $pMap] Y= [llength $pages] A= [llength $groups]" 305 #set size [expr {[llength $pMap] + [llength $pages]*(1<<$shift)}] 306 #puts "shift = $shift, space = $size" 307 308 set f [open [file join [lindex $argv 1] stringprep_data.tcl] w] 309 fconfigure $f -translation lf 310 puts $f \ 311"# stringprep_data.tcl -- 312# 313# Declarations of Unicode character information tables. This file is 314# automatically generated by the gen_stringprep_data.tcl script. Do not 315# modify this file by hand. 316# 317# Copyright (c) 1998 Scriptics Corporation. 318# Copyright (c) 2007 Alexey Shchepin 319# Copyright (c) 2008 Sergei Golovan 320# 321# RCS: @(#) \$Id\$ 322# 323 324package provide stringprep::data 1.0.1 325 326namespace eval ::stringprep::data { 327 328# 329# A 16-bit Unicode character is split into two parts in order to index 330# into the following tables. The lower OFFSET_BITS comprise an offset 331# into a page of characters. The upper bits comprise the page number. 332# 333 334set OFFSET_BITS $shift 335 336# 337# The pageMap is indexed by page number and returns an alternate page number 338# that identifies a unique page of characters. Many Unicode characters map 339# to the same alternate page number. 340# 341 342array unset pageMap 343array set pageMap \[list \\" 344 array unset tmp 345 foreach idx $pMap { 346 if {![info exists tmp($idx)]} { 347 set tmp($idx) 1 348 } else { 349 incr tmp($idx) 350 } 351 } 352 set max 0 353 set max_id 0 354 foreach idx [array names tmp] { 355 if {$tmp($idx) > $max} { 356 set max $tmp($idx) 357 set max_id $idx 358 } 359 } 360 set line " " 361 set last [expr {[llength $pMap] - 1}] 362 for {set i 0} {$i <= $last} {incr i} { 363 set num [lindex $pMap $i] 364 if {$num != $max_id} { 365 append line " $i $num" 366 } 367 if {[string length $line] > 70} { 368 puts $f "$line \\" 369 set line " " 370 } 371 } 372 puts $f "$line\] 373 374set COMMON_PAGE_MAP $max_id 375 376# 377# The groupMap is indexed by combining the alternate page number with 378# the page offset and returns a group number that identifies a unique 379# set of character attributes. 380# 381 382set groupMap \[list \\" 383 set line " " 384 set lasti [expr {[llength $pages] - 1}] 385 for {set i 0} {$i <= $lasti} {incr i} { 386 set page [lindex $pages $i] 387 set lastj [expr {[llength $page] - 1}] 388 for {set j 0} {$j <= $lastj} {incr j} { 389 append line [lindex $page $j] 390 if {$j != $lastj || $i != $lasti} { 391 append line " " 392 } 393 if {[string length $line] > 70} { 394 puts $f "$line\\" 395 set line " " 396 } 397 } 398 } 399 puts $f "$line\] 400 401# 402# Each group represents a unique set of character attributes. The attributes 403# are encoded into a 32-bit value as follows: 404# 405# Bit 0 A.1 406# 407# Bit 1 B.1 408# 409# Bit 2 B.3 410# 411# Bit 3 C.1.1 412# 413# Bit 4 C.1.2 414# 415# Bit 5 C.2.1 416# 417# Bit 6 C.2.2 418# 419# Bit 7 C.3--C.9 420# 421# Bit 8 D.1 422# 423# Bit 9 D.2 424# 425# Bit 10 Case maps to several characters 426# 427# Bits 11-31 Case delta: delta for case conversions. This should be the 428# highest field so we can easily sign extend. 429# 430 431set groups \[list \\" 432 set line " " 433 set last [expr {[llength $groups] - 1}] 434 for {set i 0} {$i <= $last} {incr i} { 435 set val [lindex $groups $i] 436 437 append line [format "%d" $val] 438 if {$i != $last} { 439 append line " " 440 } 441 if {[string length $line] > 65} { 442 puts $f "$line\\" 443 set line " " 444 } 445 } 446 puts $f "$line\] 447 448# 449# Table for characters that lowercased to multiple ones 450# 451 452set multiCaseTable \[list \\" 453 set last [expr {[llength $multicasemap] - 1}] 454 for {set i 0} {$i <= $last} {incr i} { 455 set val [lindex $multicasemap $i] 456 457 set line " " 458 append line "{" [join $val " "] "}" 459 puts $f "$line \\" 460 } 461 puts $f "\] 462 463# 464# The following constants are used to determine the category of a 465# Unicode character. 466# 467 468set A1Mask \[expr {1 << 0}\] 469set B1Mask \[expr {1 << 1}\] 470set B3Mask \[expr {1 << 2}\] 471set C11Mask \[expr {1 << 3}\] 472set C12Mask \[expr {1 << 4}\] 473set C21Mask \[expr {1 << 5}\] 474set C22Mask \[expr {1 << 6}\] 475set C39Mask \[expr {1 << 7}\] 476set D1Mask \[expr {1 << 8}\] 477set D2Mask \[expr {1 << 9}\] 478set MCMask \[expr {1 << 10}\] 479 480# 481# The following procs extract the fields of the character info. 482# 483 484proc GetCaseType {info} {expr {(\$info & 0xE0) >> 5}} 485proc GetCategory {info} {expr {\$info & 0x1F}} 486proc GetDelta {info} {expr {\$info >> 11}} 487proc GetMC {info} { 488 variable multiCaseTable 489 lindex \$multiCaseTable \[GetDelta \$info\] 490} 491 492# 493# This proc extracts the information about a character from the 494# Unicode character tables. 495# 496 497proc GetUniCharInfo {uc} { 498 variable OFFSET_BITS 499 variable COMMON_PAGE_MAP 500 variable pageMap 501 variable groupMap 502 variable groups 503 504 set page \[expr {(\$uc & 0x1fffff) >> \$OFFSET_BITS}\] 505 if {\[info exists pageMap(\$page)\]} { 506 set apage \$pageMap(\$page) 507 } else { 508 set apage \$COMMON_PAGE_MAP 509 } 510 511 lindex \$groups \\ 512 \[lindex \$groupMap \\ 513 \[expr {(\$apage << \$OFFSET_BITS) | \\ 514 (\$uc & ((1 << \$OFFSET_BITS) - 1))}\]\] 515} 516 517} ; # namespace eval ::stringprep::data 518" 519 close $f 520} 521 522uni::main 523 524return 525