1#!perl
2#
3# This auxiliary script makes five header files
4# used for building XSUB of Unicode::Normalize.
5#
6# Usage:
7#    <do 'mkheader'> in perl, or <perl mkheader> in command line
8#
9# Input files:
10#    unicore/CombiningClass.pl (or unicode/CombiningClass.pl)
11#    unicore/Decomposition.pl (or unicode/Decomposition.pl)
12#
13# Output files:
14#    unfcan.h
15#    unfcpt.h
16#    unfcmb.h
17#    unfcmp.h
18#    unfexc.h
19#
20use 5.006;
21use strict;
22use warnings;
23use Carp;
24use File::Spec;
25use SelectSaver;
26
27our $PACKAGE = 'Unicode::Normalize, mkheader';
28
29our $prefix = "UNF_";
30our $structname = "${prefix}complist";
31
32# Starting in v5.20, the tables in lib/unicore are built using the platform's
33# native character set for code points 0-255.  But in v5.35, pack U stopped
34# trying to compensate
35*pack_U = ($] ge 5.020 && $] lt 5.035)
36          ? sub { return pack('U*', map { utf8::unicode_to_native($_) } @_); }
37          : sub { return pack('U*', @_); };
38
39# %Canon and %Compat will be ($codepoint => $hexstring) after _U_stringify()
40our %Comp1st;	# $codepoint => $listname  : may be composed with a next char.
41our %CompList;	# $listname,$2nd  => $codepoint : composite
42
43##### The below part is common to mkheader and PP #####
44
45our %Combin;	# $codepoint => $number    : combination class
46our %Canon;	# $codepoint => \@codepoints : canonical decomp.
47our %Compat;	# $codepoint => \@codepoints : compat. decomp.
48our %Compos;	# $1st,$2nd  => $codepoint : composite
49our %Exclus;	# $codepoint => 1          : composition exclusions
50our %Single;	# $codepoint => 1          : singletons
51our %NonStD;	# $codepoint => 1          : non-starter decompositions
52our %Comp2nd;	# $codepoint => 1          : may be composed with a prev char.
53
54# from core Unicode database
55our $Combin = do "unicore/CombiningClass.pl"
56    || do "unicode/CombiningClass.pl"
57    || croak "$PACKAGE: CombiningClass.pl not found";
58our $Decomp = do "unicore/Decomposition.pl"
59    || do "unicode/Decomposition.pl"
60    || croak "$PACKAGE: Decomposition.pl not found";
61
62# CompositionExclusions.txt since Unicode 3.2.0.  If this ever changes, it
63# would be better to get the values from Unicode::UCD rather than hard-code
64# them here, as that will protect from having to make fixes for future
65# changes.
66our @CompEx = qw(
67    0958 0959 095A 095B 095C 095D 095E 095F 09DC 09DD 09DF 0A33 0A36
68    0A59 0A5A 0A5B 0A5E 0B5C 0B5D 0F43 0F4D 0F52 0F57 0F5C 0F69 0F76
69    0F78 0F93 0F9D 0FA2 0FA7 0FAC 0FB9 FB1D FB1F FB2A FB2B FB2C FB2D
70    FB2E FB2F FB30 FB31 FB32 FB33 FB34 FB35 FB36 FB38 FB39 FB3A FB3B
71    FB3C FB3E FB40 FB41 FB43 FB44 FB46 FB47 FB48 FB49 FB4A FB4B FB4C
72    FB4D FB4E 2ADC 1D15E 1D15F 1D160 1D161 1D162 1D163 1D164 1D1BB
73    1D1BC 1D1BD 1D1BE 1D1BF 1D1C0
74);
75
76# definition of Hangul constants
77use constant SBase  => 0xAC00;
78use constant SFinal => 0xD7A3; # SBase -1 + SCount
79use constant SCount =>  11172; # LCount * NCount
80use constant NCount =>    588; # VCount * TCount
81use constant LBase  => 0x1100;
82use constant LFinal => 0x1112;
83use constant LCount =>     19;
84use constant VBase  => 0x1161;
85use constant VFinal => 0x1175;
86use constant VCount =>     21;
87use constant TBase  => 0x11A7;
88use constant TFinal => 0x11C2;
89use constant TCount =>     28;
90
91sub decomposeHangul {
92    my $sindex = $_[0] - SBase;
93    my $lindex = int( $sindex / NCount);
94    my $vindex = int(($sindex % NCount) / TCount);
95    my $tindex =      $sindex % TCount;
96    my @ret = (
97       LBase + $lindex,
98       VBase + $vindex,
99      $tindex ? (TBase + $tindex) : (),
100    );
101    return wantarray ? @ret : pack_U(@ret);
102}
103
104########## getting full decomposition ##########
105
106## converts string "hhhh hhhh hhhh" to a numeric list
107## (hex digits separated by spaces)
108sub _getHexArray { map hex, $_[0] =~ /\G *([0-9A-Fa-f]+)/g }
109
110while ($Combin =~ /(.+)/g) {
111    my @tab = split /\t/, $1;
112    my $ini = hex $tab[0];
113    if ($tab[1] eq '') {
114	$Combin{$ini} = $tab[2];
115    } else {
116	$Combin{$_} = $tab[2] foreach $ini .. hex($tab[1]);
117    }
118}
119
120while ($Decomp =~ /(.+)/g) {
121    my @tab = split /\t/, $1;
122    my $compat = $tab[2] =~ s/<[^>]+>//;
123    my $dec = [ _getHexArray($tab[2]) ]; # decomposition
124    my $ini = hex($tab[0]); # initial decomposable character
125    my $end = $tab[1] eq '' ? $ini : hex($tab[1]);
126    # ($ini .. $end) is the range of decomposable characters.
127
128    foreach my $u ($ini .. $end) {
129	$Compat{$u} = $dec;
130	$Canon{$u} = $dec if ! $compat;
131    }
132}
133
134for my $s (@CompEx) {
135    my $u = hex $s;
136    next if !$Canon{$u}; # not assigned
137    next if $u == 0xFB1D && !$Canon{0x1D15E}; # 3.0.1 before Corrigendum #2
138    $Exclus{$u} = 1;
139}
140
141foreach my $u (keys %Canon) {
142    my $dec = $Canon{$u};
143
144    if (@$dec == 2) {
145	if ($Combin{ $dec->[0] }) {
146	    $NonStD{$u} = 1;
147	} else {
148	    $Compos{ $dec->[0] }{ $dec->[1] } = $u;
149	    $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u};
150	}
151    } elsif (@$dec == 1) {
152	$Single{$u} = 1;
153    } else {
154	my $h = sprintf '%04X', $u;
155	croak("Weird Canonical Decomposition of U+$h");
156    }
157}
158
159# modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo
160foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) {
161    $Comp2nd{$j} = 1;
162}
163
164sub getCanonList {
165    my @src = @_;
166    my @dec = map {
167	(SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
168	    : $Canon{$_} ? @{ $Canon{$_} } : $_
169		} @src;
170    return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec);
171    # condition @src == @dec is not ok.
172}
173
174sub getCompatList {
175    my @src = @_;
176    my @dec = map {
177	(SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
178	    : $Compat{$_} ? @{ $Compat{$_} } : $_
179		} @src;
180    return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec);
181    # condition @src == @dec is not ok.
182}
183
184# exhaustive decomposition
185foreach my $key (keys %Canon) {
186    $Canon{$key}  = [ getCanonList($key) ];
187}
188
189# exhaustive decomposition
190foreach my $key (keys %Compat) {
191    $Compat{$key} = [ getCompatList($key) ];
192}
193
194##### The above part is common to mkheader and PP #####
195
196foreach my $comp1st (keys %Compos) {
197    my $listname = sprintf("${structname}_%06x", $comp1st);
198		# %04x is bad since it'd place _3046 after _1d157.
199    $Comp1st{$comp1st} = $listname;
200    my $rh1st = $Compos{$comp1st};
201
202    foreach my $comp2nd (keys %$rh1st) {
203	my $uc = $rh1st->{$comp2nd};
204	$CompList{$listname}{$comp2nd} = $uc;
205    }
206}
207
208sub split_into_char {
209    use bytes;
210    my $uni = shift;
211    my $len = length($uni);
212    my @ary;
213    for(my $i = 0; $i < $len; ++$i) {
214	push @ary, ord(substr($uni,$i,1));
215    }
216    return @ary;
217}
218
219sub _U_stringify {
220    sprintf '"%s"', join '',
221	map sprintf("\\x%02x", $_), split_into_char(pack_U(@_));
222}
223
224foreach my $hash (\%Canon, \%Compat) {
225    foreach my $key (keys %$hash) {
226	$hash->{$key} = _U_stringify( @{ $hash->{$key} } );
227    }
228}
229
230########## writing header files ##########
231
232my @boolfunc = (
233    {
234	name => "Exclusion",
235	type => "bool",
236	hash => \%Exclus,
237    },
238    {
239	name => "Singleton",
240	type => "bool",
241	hash => \%Single,
242    },
243    {
244	name => "NonStDecomp",
245	type => "bool",
246	hash => \%NonStD,
247    },
248    {
249	name => "Comp2nd",
250	type => "bool",
251	hash => \%Comp2nd,
252    },
253);
254
255my $orig_fh = SelectSaver->new;
256{
257
258my $file = "unfexc.h";
259open FH, ">$file" or croak "$PACKAGE: $file can't be made";
260binmode FH; select FH;
261
262    print << 'EOF';
263/*
264 * This file is auto-generated by mkheader.
265 * Any changes here will be lost!
266 */
267EOF
268
269foreach my $tbl (@boolfunc) {
270    my @temp = sort {$a <=> $b} keys %{$tbl->{hash}};
271    my $type = $tbl->{type};
272    my $name = $tbl->{name};
273    print "$type is$name (UV uv)\n{\nreturn\n\t";
274
275    while (@temp) {
276	my $cur = shift @temp;
277	if (@temp && $cur + 1 == $temp[0]) {
278	    print "($cur <= uv && uv <= ";
279	    while (@temp && $cur + 1 == $temp[0]) {
280		$cur = shift @temp;
281	    }
282	    print "$cur)";
283	    print "\n\t|| " if @temp;
284	} else {
285	    print "uv == $cur";
286	    print "\n\t|| " if @temp;
287	}
288    }
289    print "\n\t? TRUE : FALSE;\n}\n\n";
290}
291
292close FH;
293
294####################################
295
296my $compinit =
297    "typedef struct { UV nextchar; UV composite; } $structname;\n\n";
298
299foreach my $i (sort keys %CompList) {
300    $compinit .= "$structname $i [] = {\n";
301    $compinit .= join ",\n",
302	map sprintf("\t{ %d, %d }", $_, $CompList{$i}{$_}),
303	    sort {$a <=> $b } keys %{ $CompList{$i} };
304    $compinit .= ",\n{0,0}\n};\n\n"; # with sentinel
305}
306
307my @tripletable = (
308    {
309	file => "unfcmb",
310	name => "combin",
311	type => "STDCHAR",
312	hash => \%Combin,
313	null =>  0,
314    },
315    {
316	file => "unfcan",
317	name => "canon",
318	type => "char*",
319	hash => \%Canon,
320	null => "NULL",
321    },
322    {
323	file => "unfcpt",
324	name => "compat",
325	type => "char*",
326	hash => \%Compat,
327	null => "NULL",
328    },
329    {
330	file => "unfcmp",
331	name => "compos",
332	type => "$structname *",
333	hash => \%Comp1st,
334	null => "NULL",
335	init => $compinit,
336    },
337);
338
339foreach my $tbl (@tripletable) {
340    my $file = "$tbl->{file}.h";
341    my $head = "${prefix}$tbl->{name}";
342    my $type = $tbl->{type};
343    my $hash = $tbl->{hash};
344    my $null = $tbl->{null};
345    my $init = $tbl->{init};
346
347    open FH, ">$file" or croak "$PACKAGE: $file can't be made";
348    binmode FH; select FH;
349    my %val;
350
351    print FH << 'EOF';
352/*
353 * This file is auto-generated by mkheader.
354 * Any changes here will be lost!
355 */
356EOF
357
358    print $init if defined $init;
359
360    foreach my $uv (keys %$hash) {
361	croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv)
362	    unless $uv <= 0x10FFFF;
363	my @c = unpack 'CCCC', pack 'N', $uv;
364	$val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv};
365    }
366
367    foreach my $p (sort { $a <=> $b } keys %val) {
368	next if ! $val{ $p };
369	for (my $r = 0; $r < 256; $r++) {
370	    next if ! $val{ $p }{ $r };
371	    printf "static $type ${head}_%02x_%02x [256] = {\n", $p, $r;
372	    for (my $c = 0; $c < 256; $c++) {
373		print "\t", defined $val{$p}{$r}{$c}
374		    ? "($type)".$val{$p}{$r}{$c}
375		    : $null;
376		print ','  if $c != 255;
377		print "\n" if $c % 8 == 7;
378	    }
379	    print "};\n\n";
380	}
381    }
382    foreach my $p (sort { $a <=> $b } keys %val) {
383	next if ! $val{ $p };
384	printf "static $type* ${head}_%02x [256] = {\n", $p;
385	for (my $r = 0; $r < 256; $r++) {
386	    print $val{ $p }{ $r }
387		? sprintf("${head}_%02x_%02x", $p, $r)
388		: "NULL";
389	    print ','  if $r != 255;
390	    print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0;
391	}
392	print "};\n\n";
393    }
394    print "static $type** $head [] = {\n";
395    for (my $p = 0; $p <= 0x10; $p++) {
396	print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL";
397	print ','  if $p != 0x10;
398	print "\n";
399    }
400    print "};\n\n";
401    close FH;
402}
403
404}   # End of block for SelectSaver
405
4061;
407__END__
408