1289260Sbapt#!/usr/local/bin/perl -wC
2289260Sbapt
3289260Sbaptuse strict;
4289260Sbapt#use File::Copy;
5289260Sbapt#use XML::Parser;
6289260Sbaptuse Tie::IxHash;
7289260Sbapt#use Data::Dumper;
8289260Sbaptuse Getopt::Long;
9289260Sbapt#use Digest::SHA qw(sha1_hex);
10289260Sbapt#require "charmaps.pm";
11289260Sbapt
12289260Sbapt
# Command-line handling.  Both --cldr=<dir> and --etc=<dir> are mandatory:
# $CLDRDIR and $ETCDIR are interpolated into the input and output paths
# used by the rest of the script.
my $CLDRDIR = undef;
my $ETCDIR = undef;

# Parse first and validate afterwards.  Counting raw @ARGV entries up front
# would reject the equally valid "--cldr DIR --etc DIR" spelling and would
# accept two arbitrary tokens that leave one of the directories undefined.
my $result = GetOptions (
		"cldr=s"	=> \$CLDRDIR,
		"etc=s"		=> \$ETCDIR,
	    );

if (!$result || !defined($CLDRDIR) || !defined($ETCDIR)) {
	print STDERR "Usage: $0 --cldr=<cldrdir> --etc=<etcdir>\n";
	exit(1);
}
25289260Sbapt
# Ordered list of [locale, banner] pairs driving generate_sections.
#   locale - basename of the "<locale>.UTF-8.src" file whose LC_CTYPE
#            section is read by compress_ctype.
#   banner - text printed between two star lines ahead of that locale's
#            data.  Every row starts with "*", the comment_char declared
#            in the generated header, and names a Unicode block range.
# Order matters: characters emitted for an earlier entry are recorded as
# seen and are skipped when a later entry lists them again.
my @SECTIONS = (
	["en_US",       "* 0x0000 - 0x007F Basic Latin\n" .
	                "* 0x0080 - 0x00FF Latin-1 Supplement\n" .
	                "* 0x0100 - 0x017F Latin Extended-A\n" .
	                "* 0x0180 - 0x024F Latin Extended-B\n" .
	                "* 0x0250 - 0x02AF IPA Extensions\n" .
	                "* 0x1D00 - 0x1D7F Phonetic Extensions\n" .
	                "* 0x1D80 - 0x1DBF Phonetic Extensions Supplement\n" .
	                "* 0x1E00 - 0x1EFF Latin Extended Additional\n" .
	                "* 0x2150 - 0x218F Number Forms (partial - Roman Numerals)\n".
	                "* 0x2C60 - 0x2C7F Latin Extended-C\n" .
	                "* 0xA720 - 0xA7FF Latin Extended-D\n" .
	                "* 0xAB30 - 0xAB6F Latin Extended-E\n" .
	                "* 0xFB00 - 0xFB4F Alphabetic Presentation Forms (partial)\n".
	                "* 0xFF00 - 0xFFEF Halfwidth and Fullwidth Forms (partial)\n"],
	["el_GR",       "* 0x0370 - 0x03FF Greek (No Coptic!)\n" .
	                "* 0x1F00 - 0x1FFF Greek Extended\n"],
	["ru_RU",       "* 0x0400 - 0x04FF Cyrillic\n" .
	                "* 0x0500 - 0x052F Cyrillic Supplementary\n" .
	                "* 0x2DE0 - 0x2DFF Cyrillic Extended-A\n" .
	                "* 0xA640 - 0xA69F Cyrillic Extended-B\n"],
	["hy_AM",       "* 0x0530 - 0x058F Armenian\n" .
	                "* 0xFB00 - 0xFB4F Alphabetic Presentation Forms (partial)\n"],
	["he_IL",       "* 0x0590 - 0x05FF Hebrew\n" .
	                "* 0xFB00 - 0xFB4F Alphabetic Presentation Forms (partial)\n"],
	["ar_SA",       "* 0x0600 - 0x06FF Arabic\n" .
	                "* 0x0750 - 0x077F Arabic Supplement\n" .
	                "* 0x08A0 - 0x08FF Arabic Extended-A\n" .
	                "* 0xFB50 - 0xFDFF Arabic Presentation Forms (partial)\n" .
	                "* 0xFE70 - 0xFEFF Arabic Presentation Forms-B (partial)\n"],
	["hi_IN",       "* 0x0900 - 0x097F Devanagari\n" .
	                "* 0xA8E0 - 0xA8FF Devanagari Extended\n"],
	["bn_IN",       "* 0x0980 - 0x09FF Bengali\n"],
	["pa_Guru_IN",  "* 0x0A00 - 0x0A7F Gurmukhi\n"],
	["gu_IN",       "* 0x0A80 - 0x0AFF Gujarati\n"],
	["or_IN",       "* 0x0B00 - 0x0B7F Oriya\n"],
	["ta_IN",       "* 0x0B80 - 0x0BFF Tamil\n"],
	["te_IN",       "* 0x0C00 - 0x0C7F Telugu\n"],
	["kn_IN",       "* 0x0C80 - 0x0CFF Kannada\n"],
	["ml_IN",       "* 0x0D00 - 0x0D7F Malayalam\n"],
	["si_LK",       "* 0x0D80 - 0x0DFF Sinhala\n"],
	["th_TH",       "* 0x0E00 - 0x0E7F Thai\n"],
	["lo_LA",       "* 0x0E80 - 0x0EFF Lao\n"],
	["bo_IN",       "* 0x0F00 - 0x0FFF Tibetan\n"],
	["my_MM",       "* 0x1000 - 0x109F Myanmar\n" .
	                "* 0xA9E0 - 0xA9FF Myanmar Extended-B\n" .
	                "* 0xAA60 - 0xAA7F Myanmar Extended-A\n"],
	["ka_GE",       "* 0x10A0 - 0x10FF Georgian\n" .
	                "* 0x2D00 - 0x2D2F Georgian Supplement\n"],
	["ja_JP",       "* 0x1100 - 0x11FF Hangul Jamo\n" .
	                "* 0x3000 - 0x303F CJK Symbols and Punctuation (partial)\n" .
	                "* 0x3040 - 0x309F Hiragana\n" .
	                "* 0x30A0 - 0x30FF Katakana\n" .
	                "* 0x31F0 - 0x31FF Katakana Phonetic Extensions\n" .
	                "* 0x3130 - 0x318F Hangul Compatibility Jamo (partial)\n" .
	                "* 0x3200 - 0x32FF Enclosed CJK Letters and Months (partial)\n" .
	                "* 0x3300 - 0x33FF CJK Compatibility\n" .
	                "* 0x3400 - 0x4DB5 CJK Unified Ideographs Extension-A (added)\n" .
	                "* 0x4E00 - 0x9FCC CJK Unified Ideographs (overridden)\n" .
	                "* 0xAC00 - 0xD7A3 Hangul Syllables (partial)\n" .
	                "* 0xD7B0 - 0xD7FF Hangul Jamo Extended-B\n" .
	                "* 0xF900 - 0xFAFF CJK Compatibility Ideographs (partial)\n" .
	                "* 0xFF00 - 0xFFEF Halfwidth and Fullwidth Forms (partial)\n"],
	["am_ET",       "* 0x1200 - 0x137F Ethiopic\n" .
	                "* 0x1380 - 0x139F Ethiopic Supplement\n" .
	                "* 0x2D80 - 0x2DDF Ethiopic Extended\n" .
	                "* 0xAB00 - 0xAB2F Ethiopic Extended-A\n"],
	["chr_US",      "* 0x13A0 - 0x13FF Cherokee\n"],
	["km_KH",       "* 0x1780 - 0x17FF Khmer\n" .
	                "* 0x19E0 - 0x19FF Khmer Symbols\n"],
	["shi_Tfng_MA", "* 0x2D30 - 0x2D7F Tifinagh\n"],
	["ii_CN",       "* 0xA000 - 0xA48F Yi Syllables\n" .
	                "* 0xA490 - 0xA4CF Yi Radicals\n"],
	["vai_Vaii_LR", "* 0xA500 - 0xA63F Vai\n"],
	["ko_KR",       "* 0x3130 - 0x318F Hangul Compatibility Jamo (partial)\n" .
	                "* 0xA960 - 0xA97F Hangul Jamo Extended-A\n" .
	                "* 0xAC00 - 0xD7A3 Hangul Syllables (partial)\n" .
	                "* 0xFF00 - 0xFFEF Halfwidth and Fullwidth Forms (partial)\n"],
);
105289260Sbapt
#	["zh_Hans_CN",  "* 0x2E80 - 0x2EFF CJK Radicals Supplement\n" .
#	                "* 0x2F00 - 0x2FDF Kangxi Radicals\n" .
#	                "* 0x3000 - 0x303F CJK Symbols and Punctuation (partial)\n" .
#	                "* 0x3200 - 0x32FF Enclosed CJK Letters and Months (partial)\n" .
#			"* 0x3400 - 0x4DB5 CJK Unified Ideographs Extension A\n" .
#	                "* 0xF900 - 0xFAFF CJK Compatibility Ideographs (partial)\n"],
112289260Sbapt
# Character IDs (numeric value of the hex string held in %utf8map) that
# were emitted by a finished section, and IDs claimed since the last
# merge_seen call that have not been folded into %seen yet.
my (%seen, %pending_seen);

# Filled by get_utf8map: character name => hex string, and character
# name => name of the preceding entry that carried the same hex string.
my (%utf8map, %utf8aliases);

# Separator line printed above and below each section banner.
my $stars = "**********************************************************************\n";

# Hand-maintained input appended after the generated sections, and the
# file this script writes.
my $manual_file = "$ETCDIR/manual-input.UTF-8";
my $outfilename = "$ETCDIR/common.UTF-8.src";

# Main sequence: load the charmap, then write header, sections and footer.
get_utf8map("$CLDRDIR/posix/UTF-8.cm");
generate_header();
generate_sections();
generate_footer();
125289260Sbapt
126289260Sbapt############################
127289260Sbapt
# Load a charmap file into the file-level hashes %utf8map and %utf8aliases.
#
# Parameters:
#   $file - path of the charmap to read.
#
# Only lines after a line equal to "CHARMAP" and before a line equal to
# "END CHARMAP" are used; blank lines and lines starting with "#" are
# ignored everywhere.  For every "<NAME> value" line:
#   - underscores in NAME become spaces and every "\x" is removed from the
#     value, then $utf8map{NAME} is set to the result;
#   - when the value equals the one on the previously accepted line,
#     $utf8aliases{NAME} is set to that previous name.
#
# Dies when the file cannot be opened: continuing with an empty map would
# make every later lookup in %utf8map come back undefined.
sub get_utf8map {
	my $file = shift;

	# Three-argument open so that characters in $file can never be taken
	# as an open mode, and a lexical handle so nothing else can clobber it.
	open(my $fin, "<", $file)
		or die ("can't read $file: $!\n");
	my @lines = <$fin>;
	close($fin);
	chomp(@lines);

	my $prev_k = undef;
	my $prev_v = "";
	my $incharmap = 0;
	foreach my $l (@lines) {
		$l =~ s/\r//;
		next if ($l =~ /^\#/);
		next if ($l eq "");

		if ($l eq "CHARMAP") {
			$incharmap = 1;
			next;
		}

		next if (!$incharmap);
		last if ($l eq "END CHARMAP");

		# Ignore anything that is not a "<NAME> value" line; using $1/$2
		# after a failed match would store stale or undefined captures.
		next unless ($l =~ /^<([^\s]+)>\s+(.*)/);
		my $k = $1;
		my $v = $2;
		$k =~ s/_/ /g;		# unicode char string
		$v =~ s/\\x//g;		# UTF-8 char code
		$utf8map{$k} = $v;

		$utf8aliases{$k} = $prev_k if ($prev_v eq $v);

		$prev_v = $v;
		$prev_k = $k;
	}
}
165289260Sbapt
# Create (truncating) the output file named by the file-level $outfilename,
# leave it open on the FOUT handle used by the other generate_* routines,
# and write the fixed preamble: a "do not edit" notice, the comment_char
# and escape_char declarations, and the opening "LC_CTYPE" line.
#
# Dies, reporting the system error, when the file cannot be opened.
sub generate_header {
	open(FOUT, ">", "$outfilename")
		or die ("can't write to $outfilename: $!\n");
	print FOUT <<EOF;
# Warning: Do not edit. This file is automatically generated from the
# tools in /usr/src/tools/tools/locale. The data is obtained from the
# CLDR project, obtained from http://cldr.unicode.org/
# -----------------------------------------------------------------------------

comment_char *
escape_char /

LC_CTYPE
EOF
}
181289260Sbapt
# Write the closing "END LC_CTYPE" line to FOUT and close the handle.
#
# Dies when close fails: output is buffered, so a write error (for example
# a full disk) may only be reported at this point, and ignoring it would
# leave a truncated file behind without any diagnostic.
sub generate_footer {
	print FOUT "\nEND LC_CTYPE\n";
	close (FOUT)
		or die ("can't finish writing $outfilename: $!\n");
}
186289260Sbapt
# Report whether a character ID was emitted by an earlier, merged section.
# Returns 1 when the ID has a defined entry in %seen.  Otherwise records
# the ID in %pending_seen (so that the next merge_seen marks it as seen)
# and returns 0.
sub already_seen {
	my ($id) = @_;
	return 1 if (defined($seen{$id}));
	$pending_seen{$id} = 1;
	return 0;
}
195289260Sbapt
# Read-only variant of already_seen: returns 1 when the character ID has a
# defined entry in %seen and 0 otherwise, without touching %pending_seen.
sub already_seen_RO {
	my ($id) = @_;
	return defined($seen{$id}) ? 1 : 0;
}
203289260Sbapt
# Promote every ID collected in %pending_seen to %seen (value 1), then
# empty %pending_seen.
sub merge_seen {
	my @claimed = keys %pending_seen;
	@seen{@claimed} = (1) x @claimed;
	%pending_seen = ();
}
210289260Sbapt
# Build the synthetic LC_CTYPE lines that are processed ahead of a
# territory's own data.
#
# Parameters:
#   territory name, e.g. "ja_JP".
# Returns:
#   A list of lines.  It is empty for every territory except "ja_JP".  For
#   "ja_JP" it lists, once for class "graph" and once for class "alpha",
#   <CJK_UNIFIED_IDEOGRAPH-XXXX> for 3400..4DB5 and for 4E00..9FCC.  The
#   class name is on the first line of each range, every line but the last
#   of a range ends in the ";/" continuation, and a final "merge\tnow"
#   line closes the list.
sub initialize_lines {
	my ($territory) = @_;
	my @out;

	if ($territory eq "ja_JP") {
		# [first, last] code point of each ideograph range.
		my @ranges = ([0x3400, 0x4DB5], [0x4E00, 0x9FCC]);

		for my $class ("graph", "alpha") {
			for my $range (@ranges) {
				my ($first, $last) = @$range;
				push @out, sprintf("%s\t<CJK_UNIFIED_IDEOGRAPH-%X>;/\n",
					$class, $first);
				push @out,
					map { sprintf("\t<CJK_UNIFIED_IDEOGRAPH-%X>;/\n", $_) }
					($first + 1 .. $last - 1);
				push @out, sprintf("\t<CJK_UNIFIED_IDEOGRAPH-%X>\n", $last);
			}
		}
		push @out, "merge\tnow\n";
	}
	return @out;
}
238289260Sbapt
# Copy one territory's LC_CTYPE data to FOUT, dropping characters that an
# earlier section already emitted and collapsing runs of consecutive
# character IDs into "<first>;...;<last>" ranges.
#
# Parameters:
#   territory name; the data is read from
#   "$CLDRDIR/posix/<territory>.UTF-8.src".
#
# Input lines are those returned by initialize_lines for the territory,
# followed by the file's lines from "LC_CTYPE" through "END LC_CTYPE"
# (the two marker lines, blank lines and "*************" lines excluded).
# A line starting with a lowercase word of three or more letters and a tab
# opens a category:
#   merge            - calls merge_seen; nothing is printed.
#   print            - the whole category is skipped.
#   toupper/tolower  - lines containing "<NAME>," are printed unless NAME's
#                      ID is already in %seen; other lines are skipped.
#   anything else    - character lines are range-compressed; lines that do
#                      not end in "<NAME>" or "<NAME>;/" are copied as-is.
# A category name is printed lazily, just before the first line that is
# actually emitted for it.
#
# A character's ID is hex() of the string stored for its name in %utf8map.
#
# Prints "Cannot open <file>" on STDERR and returns without writing
# anything when the file is missing or cannot be opened.
#
# NOTE(review): a run is only flushed by a later non-adjacent character or
# by the category's unterminated last line; when that last line is skipped
# as already seen the run stays open into the next category.  This is the
# original behaviour and is left unchanged here.
sub compress_ctype {
	my $territory = shift;
	my $term;		# 1 when the current line ends its category
	my $active = 0;		# 1 while a run is open
	my $cat_loaded = 0;	# 1 while the category name is still unprinted
	my $lock_ID;		# first ID / name of the open run
	my $prev_ID;		# last ID / name accepted into the open run
	my $curr_ID;
	my $lock_name;
	my $prev_name;
	my $curr_name;
	my $key_name;
	my $category = '';

	my @lines = initialize_lines ($territory);

	my $filename = "$CLDRDIR/posix/$territory.UTF-8.src";
	if (! -f $filename) {
		print STDERR "Cannot open $filename\n";
		return;
	}
	# The -f test alone does not guarantee readability, so the open itself
	# is checked too; three-argument form with a lexical handle.
	my $fin;
	if (!open($fin, "<", $filename)) {
		print STDERR "Cannot open $filename: $!\n";
		return;
	}
	print "Reading from $filename\n";
	while (my $raw = <$fin>) {
		# Scalar-context range operator: true from the LC_CTYPE line
		# through the END LC_CTYPE line, inclusive.
		if (($raw =~ /^LC_CTYPE/) .. ($raw =~ /^END LC_CTYPE/)) {
			if ($raw ne "LC_CTYPE\n" && $raw ne "END LC_CTYPE\n" &&
				$raw ne "*************\n" && $raw ne "\n") {
				push @lines, $raw;
			}
		}
	}
	close($fin);
	foreach my $line (@lines) {
		if ($line =~ m/^([a-z]{3,})\t/) {
			$category = $1;
			if ($category eq 'merge') {
				merge_seen ();
				next;
			}
			if ($category ne 'print') {
				$cat_loaded = 1;
			}
		}
		next if ($category eq 'print');
		if ($category eq 'toupper' || $category eq 'tolower') {
			if ($line =~ m/<([-_A-Za-z0-9]+)>,/) {
				$key_name = $1;
				$key_name =~ s/_/ /g;
				# Read-only check: case mappings never claim an ID.
				if (already_seen_RO (hex($utf8map{$key_name}))) {
					next;
				}
				if ($cat_loaded) { print FOUT $category; }
				$cat_loaded = 0;
				# The category name, if any, was printed above.
				$line =~ s/^[a-z]{3,}\t/\t/;
				print FOUT $line;
			}
			next;
		}
		if ($line =~ m/<([-_A-Za-z0-9]+)>(;.|)$/) {
			# No ";x" suffix means this is the category's last line.
			$term = ($2 eq '') ? 1 : 0;
			$curr_name = $1;
			$key_name = $1;
			$key_name =~ s/_/ /g;
			$curr_ID = hex($utf8map{$key_name});
			# already_seen also records the ID as pending.
			if (already_seen ($curr_ID)) {
				next;
			}
			if ($active) {
				if ($curr_ID == $prev_ID + 1) {
					# Adjacent: extend the open run.
					$prev_ID = $curr_ID;
					$prev_name = $curr_name;
				} else {
					# Gap: flush the open run as one name, two
					# names or a range, then start a new run.
					if ($cat_loaded) { print FOUT $category; }
					$cat_loaded = 0;
					if ($prev_ID == $lock_ID) {
						print FOUT "\t<" . $prev_name . ">;/\n";
					} elsif ($prev_ID - 1 == $lock_ID) {
						print FOUT "\t<" . $lock_name . ">;/\n";
						print FOUT "\t<" . $prev_name . ">;/\n";
					} else {
						print FOUT "\t<" . $lock_name .
						       ">;...;<" . $prev_name . ">;/\n";
					}
					$lock_ID = $curr_ID;
					$prev_ID = $curr_ID;
					$lock_name = $curr_name;
					$prev_name = $curr_name;
				}
			} else {
				$active = 1;
				$lock_ID = $curr_ID;
				$prev_ID = $curr_ID;
				$lock_name = $curr_name;
				$prev_name = $curr_name;
			}
			if ($term) {
				# Final flush: same three shapes, but without the
				# ";/" continuation on the last line.
				if ($cat_loaded) { print FOUT $category; }
				$cat_loaded = 0;
				if ($curr_ID == $lock_ID) {
					print FOUT "\t<" . $curr_name . ">\n";
				} elsif ($curr_ID == $lock_ID + 1) {
					print FOUT "\t<" . $lock_name . ">;/\n";
					print FOUT "\t<" . $curr_name . ">\n";
				} else {
					print FOUT "\t<" . $lock_name .
					       ">;...;<" . $curr_name . ">\n";
				}
				$active = 0;
			}
		} else {
			print FOUT $line;
		}
	}
}
353289260Sbapt
# Write the body of the output file to FOUT.
#
# For every entry of @SECTIONS, in order: a blank line, the banner framed
# by two $stars lines, then the territory's compressed LC_CTYPE data; the
# IDs claimed by the section are merged into %seen before the next one
# starts.  Afterwards the contents of $manual_file are appended verbatim.
#
# Dies when $manual_file cannot be opened, because carrying on would
# silently produce output without the manual entries.
sub generate_sections {
	foreach my $section (@SECTIONS) {
		print FOUT "\n";
		print FOUT $stars;
		print FOUT $section->[1];
		print FOUT $stars;
		compress_ctype ($section->[0]);
		merge_seen ();
	}
	# Three-argument open with a lexical handle; failure is fatal.
	open(my $fin, "<", $manual_file)
		or die ("can't read $manual_file: $!\n");
	print "Reading from $manual_file\n";
	while (my $line = <$fin>) {
		print FOUT $line;
	}
	close($fin);
}
374