1# uniParse.tcl -- 2# 3# This program parses the UnicodeData file and generates the 4# corresponding tclUniData.c file with compressed character 5# data tables. The input to this program should be the latest 6# UnicodeData file from: 7# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData-Latest.txt 8# 9# Copyright (c) 1998-1999 by Scriptics Corporation. 10# All rights reserved. 11# 12# RCS: @(#) $Id: uniParse.tcl,v 1.4 2001/05/28 04:37:57 hobbs Exp $ 13 14 15namespace eval uni { 16 set shift 5; # number of bits of data within a page 17 # This value can be adjusted to find the 18 # best split to minimize table size 19 20 variable pMap; # map from page to page index, each entry is 21 # an index into the pages table, indexed by 22 # page number 23 variable pages; # map from page index to page info, each 24 # entry is a list of indices into the groups 25 # table, the list is indexed by the offset 26 variable groups; # list of character info values, indexed by 27 # group number, initialized with the 28 # unassigned character group 29 30 variable categories { 31 Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp 32 Cc Cf Co Cs Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So 33 }; # Ordered list of character categories, must 34 # match the enumeration in the header file. 35 36 variable titleCount 0; # Count of the number of title case 37 # characters. This value is used in the 38 # regular expression code to allocate enough 39 # space for the title case variants. 40} 41 42proc uni::getValue {items index} { 43 variable categories 44 variable titleCount 45 46 # Extract character info 47 48 set category [lindex $items 2] 49 if {[scan [lindex $items 12] %4x toupper] == 1} { 50 set toupper [expr {$index - $toupper}] 51 } else { 52 set toupper {} 53 } 54 if {[scan [lindex $items 13] %4x tolower] == 1} { 55 set tolower [expr {$tolower - $index}] 56 } else { 57 set tolower {} 58 } 59 if {[scan [lindex $items 14] %4x totitle] == 1} { 60 set totitle [expr {$index - $totitle}] 61 } else { 62 set totitle {} 63 } 64 65 set categoryIndex [lsearch -exact $categories $category] 66 if {$categoryIndex < 0} { 67 puts "Unexpected character category: $index($category)" 68 set categoryIndex 0 69 } elseif {$category == "Lt"} { 70 incr titleCount 71 } 72 73 return "$categoryIndex,$toupper,$tolower,$totitle" 74} 75 76proc uni::getGroup {value} { 77 variable groups 78 79 set gIndex [lsearch -exact $groups $value] 80 if {$gIndex == -1} { 81 set gIndex [llength $groups] 82 lappend groups $value 83 } 84 return $gIndex 85} 86 87proc uni::addPage {info} { 88 variable pMap 89 variable pages 90 91 set pIndex [lsearch -exact $pages $info] 92 if {$pIndex == -1} { 93 set pIndex [llength $pages] 94 lappend pages $info 95 } 96 lappend pMap $pIndex 97 return 98} 99 100proc uni::buildTables {data} { 101 variable shift 102 103 variable pMap {} 104 variable pages {} 105 variable groups {{0,,,}} 106 set info {} ;# temporary page info 107 108 set mask [expr {(1 << $shift) - 1}] 109 110 set next 0 111 112 foreach line [split $data \n] { 113 if {$line == ""} { 114 set line "FFFF;;Cn;0;ON;;;;;N;;;;;\n" 115 } 116 117 set items [split $line \;] 118 119 scan [lindex $items 0] %4x index 120 set index [format 0x%0.4x $index] 121 122 set gIndex [getGroup [getValue $items $index]] 123 124 # Since the input table omits unassigned characters, these will 125 # show up as gaps in the index sequence. There are a few special cases 126 # where the gaps correspond to a uniform block of assigned characters. 127 # These are indicated as such in the character name. 128 129 # Enter all unassigned characters up to the current character. 130 if {($index > $next) \ 131 && ![regexp "Last>$" [lindex $items 1]]} { 132 for {} {$next < $index} {incr next} { 133 lappend info 0 134 if {($next & $mask) == $mask} { 135 addPage $info 136 set info {} 137 } 138 } 139 } 140 141 # Enter all assigned characters up to the current character 142 for {set i $next} {$i <= $index} {incr i} { 143 # Split character index into offset and page number 144 set offset [expr {$i & $mask}] 145 set page [expr {($i >> $shift)}] 146 147 # Add the group index to the info for the current page 148 lappend info $gIndex 149 150 # If this is the last entry in the page, add the page 151 if {$offset == $mask} { 152 addPage $info 153 set info {} 154 } 155 } 156 set next [expr {$index + 1}] 157 } 158 return 159} 160 161proc uni::main {} { 162 global argc argv0 argv 163 variable pMap 164 variable pages 165 variable groups 166 variable shift 167 variable titleCount 168 169 if {$argc != 2} { 170 puts stderr "\nusage: $argv0 <datafile> <outdir>\n" 171 exit 1 172 } 173 set f [open [lindex $argv 0] r] 174 set data [read $f] 175 close $f 176 177 buildTables $data 178 puts "X = [llength $pMap] Y= [llength $pages] A= [llength $groups]" 179 set size [expr {[llength $pMap] + [llength $pages]*(1<<$shift)}] 180 puts "shift = 6, space = $size" 181 puts "title case count = $titleCount" 182 183 set f [open [file join [lindex $argv 1] tclUniData.c] w] 184 fconfigure $f -translation lf 185 puts $f "/* 186 * tclUniData.c -- 187 * 188 * Declarations of Unicode character information tables. This file is 189 * automatically generated by the tools/uniParse.tcl script. Do not 190 * modify this file by hand. 191 * 192 * Copyright (c) 1998 by Scriptics Corporation. 193 * All rights reserved. 194 * 195 * RCS: @(#) \$Id\$ 196 */ 197 198/* 199 * A 16-bit Unicode character is split into two parts in order to index 200 * into the following tables. The lower OFFSET_BITS comprise an offset 201 * into a page of characters. The upper bits comprise the page number. 202 */ 203 204#define OFFSET_BITS $shift 205 206/* 207 * The pageMap is indexed by page number and returns an alternate page number 208 * that identifies a unique page of characters. Many Unicode characters map 209 * to the same alternate page number. 210 */ 211 212static unsigned char pageMap\[\] = {" 213 set line " " 214 set last [expr {[llength $pMap] - 1}] 215 for {set i 0} {$i <= $last} {incr i} { 216 append line [lindex $pMap $i] 217 if {$i != $last} { 218 append line ", " 219 } 220 if {[string length $line] > 70} { 221 puts $f $line 222 set line " " 223 } 224 } 225 puts $f $line 226 puts $f "}; 227 228/* 229 * The groupMap is indexed by combining the alternate page number with 230 * the page offset and returns a group number that identifies a unique 231 * set of character attributes. 232 */ 233 234static unsigned char groupMap\[\] = {" 235 set line " " 236 set lasti [expr {[llength $pages] - 1}] 237 for {set i 0} {$i <= $lasti} {incr i} { 238 set page [lindex $pages $i] 239 set lastj [expr {[llength $page] - 1}] 240 for {set j 0} {$j <= $lastj} {incr j} { 241 append line [lindex $page $j] 242 if {$j != $lastj || $i != $lasti} { 243 append line ", " 244 } 245 if {[string length $line] > 70} { 246 puts $f $line 247 set line " " 248 } 249 } 250 } 251 puts $f $line 252 puts $f "}; 253 254/* 255 * Each group represents a unique set of character attributes. The attributes 256 * are encoded into a 32-bit value as follows: 257 * 258 * Bits 0-4 Character category: see the constants listed below. 259 * 260 * Bits 5-7 Case delta type: 000 = identity 261 * 010 = add delta for lower 262 * 011 = add delta for lower, add 1 for title 263 * 100 = sutract delta for title/upper 264 * 101 = sub delta for upper, sub 1 for title 265 * 110 = sub delta for upper, add delta for lower 266 * 267 * Bits 8-21 Reserved for future use. 268 * 269 * Bits 22-31 Case delta: delta for case conversions. This should be the 270 * highest field so we can easily sign extend. 271 */ 272 273static int groups\[\] = {" 274 set line " " 275 set last [expr {[llength $groups] - 1}] 276 for {set i 0} {$i <= $last} {incr i} { 277 foreach {type toupper tolower totitle} [split [lindex $groups $i] ,] {} 278 279 # Compute the case conversion type and delta 280 281 if {$totitle != ""} { 282 if {$totitle == $toupper} { 283 # subtract delta for title or upper 284 set case 4 285 set delta $toupper 286 } elseif {$toupper != ""} { 287 # subtract delta for upper, subtract 1 for title 288 set case 5 289 set delta $toupper 290 } else { 291 # add delta for lower, add 1 for title 292 set case 3 293 set delta $tolower 294 } 295 } elseif {$toupper != ""} { 296 # subtract delta for upper, add delta for lower 297 set case 6 298 set delta $toupper 299 } elseif {$tolower != ""} { 300 # add delta for lower 301 set case 2 302 set delta $tolower 303 } else { 304 # noop 305 set case 0 306 set delta 0 307 } 308 309 set val [expr {($delta << 22) | ($case << 5) | $type}] 310 311 append line [format "%d" $val] 312 if {$i != $last} { 313 append line ", " 314 } 315 if {[string length $line] > 65} { 316 puts $f $line 317 set line " " 318 } 319 } 320 puts $f $line 321 puts $f "}; 322 323/* 324 * The following constants are used to determine the category of a 325 * Unicode character. 326 */ 327 328#define UNICODE_CATEGORY_MASK 0X1F 329 330enum { 331 UNASSIGNED, 332 UPPERCASE_LETTER, 333 LOWERCASE_LETTER, 334 TITLECASE_LETTER, 335 MODIFIER_LETTER, 336 OTHER_LETTER, 337 NON_SPACING_MARK, 338 ENCLOSING_MARK, 339 COMBINING_SPACING_MARK, 340 DECIMAL_DIGIT_NUMBER, 341 LETTER_NUMBER, 342 OTHER_NUMBER, 343 SPACE_SEPARATOR, 344 LINE_SEPARATOR, 345 PARAGRAPH_SEPARATOR, 346 CONTROL, 347 FORMAT, 348 PRIVATE_USE, 349 SURROGATE, 350 CONNECTOR_PUNCTUATION, 351 DASH_PUNCTUATION, 352 OPEN_PUNCTUATION, 353 CLOSE_PUNCTUATION, 354 INITIAL_QUOTE_PUNCTUATION, 355 FINAL_QUOTE_PUNCTUATION, 356 OTHER_PUNCTUATION, 357 MATH_SYMBOL, 358 CURRENCY_SYMBOL, 359 MODIFIER_SYMBOL, 360 OTHER_SYMBOL 361}; 362 363/* 364 * The following macros extract the fields of the character info. The 365 * GetDelta() macro is complicated because we can't rely on the C compiler 366 * to do sign extension on right shifts. 367 */ 368 369#define GetCaseType(info) (((info) & 0xE0) >> 5) 370#define GetCategory(info) ((info) & 0x1F) 371#define GetDelta(info) (((info) > 0) ? ((info) >> 22) : (~(~((info)) >> 22))) 372 373/* 374 * This macro extracts the information about a character from the 375 * Unicode character tables. 376 */ 377 378#define GetUniCharInfo(ch) (groups\[groupMap\[(pageMap\[(((int)(ch)) & 0xffff) >> OFFSET_BITS\] << OFFSET_BITS) | ((ch) & ((1 << OFFSET_BITS)-1))\]\]) 379" 380 381 close $f 382} 383 384uni::main 385 386return 387