1#!/usr/bin/env tclsh
2
3# gen_stringprep_data.tcl --
4#
5#	This program parses the RFC 3454 file and generates the
6#	corresponding stringprep_data.tcl file with compressed character
7#	data tables.  The input to this program should be rfc3454.txt.
8#	It can be downloaded from http://www.ietf.org/rfc/rfc3454.txt
9#
10# Copyright (c) 1998-1999 by Scriptics Corporation.
11# All rights reserved.
12#
13# Modified for ejabberd by Alexey Shchepin
14# Modified for Tcl stringprep by Sergei Golovan
15#
16# Usage: gen_stringprep_data.tcl infile outdir
17#
18# RCS: @(#) $Id: gen_stringprep_data.tcl,v 1.2 2009/11/02 00:26:44 patthoyts Exp $
19
20
21namespace eval uni {
22    set shift 7;		# number of bits of data within a page
23				# This value can be adjusted to find the
24				# best split to minimize table size
25
26    variable pMap;		# map from page to page index, each entry is
27				# an index into the pages table, indexed by
28				# page number
29    variable pages;		# map from page index to page info, each
30				# entry is a list of indices into the groups
31				# table, the list is indexed by the offset
32    variable groups;		# list of character info values, indexed by
33				# group number, initialized with the
34				# unassigned character group
35}
36
37proc uni::getValue {i} {
38    variable casemap
39    variable casemap2
40    variable tablemap
41
42    if {[info exists tablemap($i)]} {
43	set tables $tablemap($i)
44    } else {
45	set tables {}
46    }
47
48    if {[info exists casemap2($i)]} {
49	set multicase 1
50	set delta $casemap2($i)
51    } else {
52	set multicase 0
53	if {[info exists casemap($i)]} {
54	    set delta $casemap($i)
55	} else {
56	    set delta 0
57	}
58    }
59
60    if {abs($delta) > 0xFFFFF} {
61	puts "delta must be less than 22 bits wide"
62	exit
63    }
64
65    set a1 0
66    set b1 0
67    set b2 0
68    set b3 0
69    set c11 0
70    set c12 0
71    set c21 0
72    set c22 0
73    set c3 0
74    set c4 0
75    set c5 0
76    set c6 0
77    set c7 0
78    set c8 0
79    set c9 0
80    set d1 0
81    set d2 0
82
83    foreach tab $tables {
84	switch -glob -- $tab {
85	    A.1   {set a1 1}
86	    B.1   {set b1 1}
87	    B.2   {set b2 1}
88	    B.3   {set b3 1}
89	    C.1.1 {set c11 1}
90	    C.1.2 {set c12 1}
91	    C.2.1 {set c21 1}
92	    C.2.2 {set c22 1}
93	    C.3   {set c3 1}
94	    C.4   {set c4 1}
95	    C.5   {set c5 1}
96	    C.6   {set c6 1}
97	    C.7   {set c7 1}
98	    C.8   {set c8 1}
99	    C.9   {set c9 1}
100	    D.1   {set d1 1}
101	    D.2   {set d2 1}
102	}
103    }
104
105    set val [expr {($a1  << 0) |
106		   ($b1  << 1) |
107		   ($b3  << 2) |
108		   ($c11 << 3) |
109		   ($c12 << 4) |
110		   ($c21 << 5) |
111		   ($c22 << 6) |
112		   (($c3 | $c4 | $c5 | $c6 | $c7 | $c8 | $c9) << 7) |
113		   ($d1  << 8) |
114		   ($d2  << 9) |
115		   ($multicase << 10) |
116		   ($delta << 11)}]
117
118    return $val
119}
120
121proc uni::getGroup {value} {
122    variable groups
123
124    set gIndex [lsearch -exact $groups $value]
125    if {$gIndex == -1} {
126	set gIndex [llength $groups]
127	lappend groups $value
128    }
129    return $gIndex
130}
131
132proc uni::addPage {info} {
133    variable pMap
134    variable pages
135    variable pages_map
136
137    if {[info exists pages_map($info)]} {
138	lappend pMap $pages_map($info)
139    } else {
140	set pIndex [llength $pages]
141	lappend pages $info
142	set pages_map($info) $pIndex
143	lappend pMap $pIndex
144    }
145    return
146}
147
148
149proc uni::load_tables {data} {
150    variable casemap
151    variable casemap2
152    variable multicasemap
153    variable tablemap
154
155    set multicasemap {}
156    set table ""
157
158    foreach line [split $data \n] {
159	if {$table == ""} {
160	    if {[regexp {   ----- Start Table (.*) -----} $line temp table]} {
161		#puts "Start table '$table'"
162	    }
163	} else {
164	    if {[regexp {   ----- End Table (.*) -----} $line temp table1]} {
165		set table ""
166	    } else {
167		if {$table == "B.1"} {
168		    if {[regexp {^   ([[:xdigit:]]+); ;} $line \
169			     temp val]} {
170			scan $val %x val
171			if {$val <= 0x10ffff} {
172			    lappend tablemap($val) $table
173			}
174		    }
175		} elseif {$table == "B.2"} {
176		    # B.2 table is used for mapping with normalisation
177		    if {[regexp {^   ([[:xdigit:]]+); ([[:xdigit:]]+);} $line \
178			     temp from to]} {
179			scan $from %x from
180			scan $to %x to
181			if {$from <= 0x10ffff && $to <= 0x10ffff} {
182			    set casemap($from) [expr {$to - $from}]
183			}
184		    } elseif {[regexp {^   ([[:xdigit:]]+); ([[:xdigit:]]+) ([[:xdigit:]]+);} $line \
185			     temp from to1 to2]} {
186			scan $from %x from
187			scan $to1 %x to1
188			scan $to2 %x to2
189			if {$from <= 0x10ffff && \
190				$to1 <= 0x10ffff && $to2 <= 0x10ffff} {
191			    set casemap2($from) [llength $multicasemap]
192			    lappend multicasemap [list $to1 $to2]
193			}
194		    } elseif {[regexp {^   ([[:xdigit:]]+); ([[:xdigit:]]+) ([[:xdigit:]]+) ([[:xdigit:]]+);} $line \
195			     temp from to1 to2 to3]} {
196			scan $from %x from
197			scan $to1 %x to1
198			scan $to2 %x to2
199			scan $to3 %x to3
200			if {$from <= 0x10ffff && \
201				$to1 <= 0x10ffff && $to2 <= 0x10ffff && \
202				$to3 <= 0x10ffff} {
203			    set casemap2($from) [llength $multicasemap]
204			    lappend multicasemap [list $to1 $to2 $to3]
205			}
206		    } elseif {[regexp {^   ([[:xdigit:]]+); ([[:xdigit:]]+) ([[:xdigit:]]+) ([[:xdigit:]]+) ([[:xdigit:]]+);} $line \
207			     temp from to1 to2 to3 to4]} {
208			scan $from %x from
209			scan $to1 %x to1
210			scan $to2 %x to2
211			scan $to3 %x to3
212			scan $to4 %x to4
213			if {$from <= 0x10ffff && \
214				$to1 <= 0x10ffff && $to2 <= 0x10ffff && \
215				$to3 <= 0x10ffff && $to4 <= 0x10ffff} {
216			    set casemap2($from) [llength $multicasemap]
217			    lappend multicasemap [list $to1 $to2 $to3 $to4]
218			}
219		    } else {
220			#puts "missed: $line"
221		    }
222
223		} elseif {$table == "B.3"} {
224		    # B.3 table is used for mapping without normalisation (B.3 is a subset of B.2)
225		    if {[regexp {^   ([[:xdigit:]]+);} $line temp from]} {
226			scan $from %x from
227			if {$from <= 0x10ffff} {
228			    lappend tablemap($from) $table
229			}
230		    }
231		} else {
232		    if {[regexp {^   ([[:xdigit:]]+)-([[:xdigit:]]+)} $line \
233			     temp from to]} {
234			scan $from %x from
235			scan $to %x to
236			for {set i $from} {$i <= $to && $i <= 0x10ffff} {incr i} {
237			    lappend tablemap($i) $table
238			}
239		    } elseif {[regexp {^   ([[:xdigit:]]+)} $line \
240			     temp val]} {
241			scan $val %x val
242			if {$val <= 0x10ffff} {
243			    lappend tablemap($val) $table
244			}
245		    }
246		}
247	    }
248	}
249    }
250}
251
252proc uni::buildTables {} {
253    variable shift
254
255    variable casemap
256    variable tablemap
257
258    variable pMap {}
259    variable pages {}
260    variable groups {}
261    set info {}			;# temporary page info
262
263    set mask [expr {(1 << $shift) - 1}]
264
265    set next 0
266
267    for {set i 0} {$i <= 0x10ffff} {incr i} {
268	set gIndex [getGroup [getValue $i]]
269
270	# Split character index into offset and page number
271	set offset [expr {$i & $mask}]
272	set page [expr {($i >> $shift)}]
273
274	# Add the group index to the info for the current page
275	lappend info $gIndex
276
277	# If this is the last entry in the page, add the page
278	if {$offset == $mask} {
279	    addPage $info
280	    set info {}
281	}
282    }
283    return
284}
285
286proc uni::main {} {
287    global argc argv0 argv
288    variable pMap
289    variable pages
290    variable groups
291    variable shift
292    variable multicasemap
293
294    if {$argc != 2} {
295	puts stderr "\nusage: $argv0 <datafile> <outdir>\n"
296	exit 1
297    }
298    set f [open [lindex $argv 0] r]
299    set data [read $f]
300    close $f
301
302    load_tables $data
303    buildTables
304    #puts "X = [llength $pMap]  Y= [llength $pages]  A= [llength $groups]"
305    #set size [expr {[llength $pMap] + [llength $pages]*(1<<$shift)}]
306    #puts "shift = $shift, space = $size"
307
308    set f [open [file join [lindex $argv 1] stringprep_data.tcl] w]
309    fconfigure $f -translation lf
310    puts $f \
311"# stringprep_data.tcl --
312#
313#	Declarations of Unicode character information tables.  This file is
314#	automatically generated by the gen_stringprep_data.tcl script.  Do not
315#	modify this file by hand.
316#
317# Copyright (c) 1998 Scriptics Corporation.
318# Copyright (c) 2007 Alexey Shchepin
319# Copyright (c) 2008 Sergei Golovan
320#
321# RCS: @(#) \$Id\$
322#
323
324package provide stringprep::data 1.0.1
325
326namespace eval ::stringprep::data {
327
328#
329# A 16-bit Unicode character is split into two parts in order to index
330# into the following tables.  The lower OFFSET_BITS comprise an offset
331# into a page of characters.  The upper bits comprise the page number.
332#
333
334set OFFSET_BITS $shift
335
336#
337# The pageMap is indexed by page number and returns an alternate page number
338# that identifies a unique page of characters.  Many Unicode characters map
339# to the same alternate page number.
340#
341
342array unset pageMap
343array set pageMap \[list \\"
344    array unset tmp
345    foreach idx $pMap {
346	if {![info exists tmp($idx)]} {
347	    set tmp($idx) 1
348	} else {
349	    incr tmp($idx)
350	}
351    }
352    set max 0
353    set max_id 0
354    foreach idx [array names tmp] {
355	if {$tmp($idx) > $max} {
356	    set max $tmp($idx)
357	    set max_id $idx
358	}
359    }
360    set line "   "
361    set last [expr {[llength $pMap] - 1}]
362    for {set i 0} {$i <= $last} {incr i} {
363	set num [lindex $pMap $i]
364	if {$num != $max_id} {
365	    append line " $i $num"
366	}
367	if {[string length $line] > 70} {
368	    puts $f "$line \\"
369	    set line "   "
370	}
371    }
372    puts $f "$line\]
373
374set COMMON_PAGE_MAP $max_id
375
376#
377# The groupMap is indexed by combining the alternate page number with
378# the page offset and returns a group number that identifies a unique
379# set of character attributes.
380#
381
382set groupMap \[list \\"
383    set line "    "
384    set lasti [expr {[llength $pages] - 1}]
385    for {set i 0} {$i <= $lasti} {incr i} {
386	set page [lindex $pages $i]
387	set lastj [expr {[llength $page] - 1}]
388	for {set j 0} {$j <= $lastj} {incr j} {
389	    append line [lindex $page $j]
390	    if {$j != $lastj || $i != $lasti} {
391		append line " "
392	    }
393	    if {[string length $line] > 70} {
394		puts $f "$line\\"
395		set line "    "
396	    }
397	}
398    }
399    puts $f "$line\]
400
401#
402# Each group represents a unique set of character attributes.  The attributes
403# are encoded into a 32-bit value as follows:
404#
405# Bit  0	A.1
406#
407# Bit  1	B.1
408#
409# Bit  2	B.3
410#
411# Bit  3	C.1.1
412#
413# Bit  4	C.1.2
414#
415# Bit  5	C.2.1
416#
417# Bit  6	C.2.2
418#
419# Bit  7	C.3--C.9
420#
421# Bit  8	D.1
422#
423# Bit  9	D.2
424#
425# Bit  10	Case maps to several characters
426#
427# Bits 11-31	Case delta: delta for case conversions.  This should be the
428#		highest field so we can easily sign extend.
429#
430
431set groups \[list \\"
432    set line "    "
433    set last [expr {[llength $groups] - 1}]
434    for {set i 0} {$i <= $last} {incr i} {
435	set val [lindex $groups $i]
436
437	append line [format "%d" $val]
438	if {$i != $last} {
439	    append line " "
440	}
441	if {[string length $line] > 65} {
442	    puts $f "$line\\"
443	    set line "    "
444	}
445    }
446    puts $f "$line\]
447
448#
449# Table for characters that lowercased to multiple ones
450#
451
452set multiCaseTable \[list \\"
453    set last [expr {[llength $multicasemap] - 1}]
454    for {set i 0} {$i <= $last} {incr i} {
455	set val [lindex $multicasemap $i]
456
457	set line "    "
458	append line "{" [join $val " "] "}"
459	puts $f "$line \\"
460    }
461    puts $f "\]
462
463#
464# The following constants are used to determine the category of a
465# Unicode character.
466#
467
468set A1Mask  \[expr {1 << 0}\]
469set B1Mask  \[expr {1 << 1}\]
470set B3Mask  \[expr {1 << 2}\]
471set C11Mask \[expr {1 << 3}\]
472set C12Mask \[expr {1 << 4}\]
473set C21Mask \[expr {1 << 5}\]
474set C22Mask \[expr {1 << 6}\]
475set C39Mask \[expr {1 << 7}\]
476set D1Mask  \[expr {1 << 8}\]
477set D2Mask  \[expr {1 << 9}\]
478set MCMask  \[expr {1 << 10}\]
479
480#
481# The following procs extract the fields of the character info.
482#
483
484proc GetCaseType {info} {expr {(\$info & 0xE0) >> 5}}
485proc GetCategory {info} {expr {\$info & 0x1F}}
486proc GetDelta {info} {expr {\$info >> 11}}
487proc GetMC {info} {
488    variable multiCaseTable
489    lindex \$multiCaseTable \[GetDelta \$info\]
490}
491
492#
493# This proc extracts the information about a character from the
494# Unicode character tables.
495#
496
497proc GetUniCharInfo {uc} {
498    variable OFFSET_BITS
499    variable COMMON_PAGE_MAP
500    variable pageMap
501    variable groupMap
502    variable groups
503
504    set page \[expr {(\$uc & 0x1fffff) >> \$OFFSET_BITS}\]
505    if {\[info exists pageMap(\$page)\]} {
506	set apage \$pageMap(\$page)
507    } else {
508	set apage \$COMMON_PAGE_MAP
509    }
510
511    lindex \$groups \\
512	   \[lindex \$groupMap \\
513		   \[expr {(\$apage << \$OFFSET_BITS) | \\
514			   (\$uc & ((1 << \$OFFSET_BITS) - 1))}\]\]
515}
516
517} ; # namespace eval ::stringprep::data
518"
519    close $f
520}
521
522uni::main
523
524return
525