1#!/usr/local/bin/perl -wC
2
3use strict;
4#use File::Copy;
5#use XML::Parser;
6use Tie::IxHash;
7#use Data::Dumper;
8use Getopt::Long;
9#use Digest::SHA qw(sha1_hex);
10#require "charmaps.pm";
11
12
# Command-line handling.
#
# Both --cldr and --etc are mandatory.  The options are parsed first and
# validated afterwards, so that "--cldr=DIR --etc=DIR" as well as
# "--cldr DIR --etc DIR" are accepted, while unknown options, missing
# directories and stray positional arguments all produce the usage text.
my $CLDRDIR = undef;
my $ETCDIR = undef;

my $result = GetOptions (
		"cldr=s"	=> \$CLDRDIR,
		"etc=s"		=> \$ETCDIR,
	    );

# GetOptions removes everything it recognised from @ARGV; whatever is left
# over is an argument this script has no use for.
if (!$result || !defined $CLDRDIR || !defined $ETCDIR || @ARGV) {
	print "Usage: $0 --cldr=<cldrdir> --etc=<etcdir>\n";
	exit(1);
}
25
# Table driving the generated file: one entry per CLDR locale whose
# LC_CTYPE data is rolled up, in output order.
#   [0] locale name, used to locate <cldrdir>/posix/<name>.UTF-8.src
#   [1] banner text written above that locale's data; every line starts
#       with '*', the comment_char of the generated file, and names a
#       Unicode block together with its code point range.
my @SECTIONS = (
	["en_US",       "* 0x0000 - 0x007F Basic Latin\n" .
	                "* 0x0080 - 0x00FF Latin-1 Supplement\n" .
	                "* 0x0100 - 0x017F Latin Extended-A\n" .
	                "* 0x0180 - 0x024F Latin Extended-B\n" .
	                "* 0x0250 - 0x02AF IPA Extensions\n" .
	                "* 0x1D00 - 0x1D7F Phonetic Extensions\n" .
	                "* 0x1D80 - 0x1DBF Phonetic Extensions Supplement\n" .
	                "* 0x1E00 - 0x1EFF Latin Extended Additional\n" .
	                "* 0x2150 - 0x218F Number Forms (partial - Roman Numerals)\n".
	                "* 0x2C60 - 0x2C7F Latin Extended-C\n" .
	                "* 0xA720 - 0xA7FF Latin Extended-D\n" .
	                "* 0xAB30 - 0xAB6F Latin Extended-E\n" .
	                "* 0xFB00 - 0xFB4F Alphabetic Presentation Forms (partial)\n".
	                "* 0xFF00 - 0xFFEF Halfwidth and Fullwidth Forms (partial)\n"],
	["el_GR",       "* 0x0370 - 0x03FF Greek (No Coptic!)\n" .
	                "* 0x1F00 - 0x1FFF Greek Extended\n"],
	["ru_RU",       "* 0x0400 - 0x04FF Cyrillic\n" .
	                "* 0x0500 - 0x052F Cyrillic Supplementary\n" .
	                "* 0x2DE0 - 0x2DFF Cyrillic Extended-A\n" .
	                "* 0xA640 - 0xA69F Cyrillic Extended-B\n"],
	["hy_AM",       "* 0x0530 - 0x058F Armenian\n" .
	                "* 0xFB00 - 0xFB4F Alphabetic Presentation Forms (partial)\n"],
	["he_IL",       "* 0x0590 - 0x05FF Hebrew\n" .
	                "* 0xFB00 - 0xFB4F Alphabetic Presentation Forms (partial)\n"],
	["ar_SA",       "* 0x0600 - 0x06FF Arabic\n" .
	                "* 0x0750 - 0x077F Arabic Supplement\n" .
	                "* 0x08A0 - 0x08FF Arabic Extended-A\n" .
	                "* 0xFB50 - 0xFDFF Arabic Presentation Forms (partial)\n" .
	                "* 0xFE70 - 0xFEFF Arabic Presentation Forms-B (partial)\n"],
	["hi_IN",       "* 0x0900 - 0x097F Devanagari\n" .
	                "* 0xA8E0 - 0xA8FF Devanagari Extended\n"],
	["bn_IN",       "* 0x0980 - 0x09FF Bengali\n"],
	["pa_Guru_IN",  "* 0x0A00 - 0x0A7F Gurmukhi\n"],
	["gu_IN",       "* 0x0A80 - 0x0AFF Gujarati\n"],
	["or_IN",       "* 0x0B00 - 0x0B7F Oriya\n"],
	["ta_IN",       "* 0x0B80 - 0x0BFF Tamil\n"],
	["te_IN",       "* 0x0C00 - 0x0C7F Telugu\n"],
	["kn_IN",       "* 0x0C80 - 0x0CFF Kannada\n"],
	["ml_IN",       "* 0x0D00 - 0x0D7F Malayalam\n"],
	["si_LK",       "* 0x0D80 - 0x0DFF Sinhala\n"],
	["th_TH",       "* 0x0E00 - 0x0E7F Thai\n"],
	["lo_LA",       "* 0x0E80 - 0x0EFF Lao\n"],
	["bo_IN",       "* 0x0F00 - 0x0FFF Tibetan\n"],
	["my_MM",       "* 0x1000 - 0x109F Myanmar\n" .
	                "* 0xA9E0 - 0xA9FF Myanmar Extended-B\n" .
	                "* 0xAA60 - 0xAA7F Myanmar Extended-A\n"],
	["ka_GE",       "* 0x10A0 - 0x10FF Georgian\n" .
	                "* 0x2D00 - 0x2D2F Georgian Supplement\n"],
	["ja_JP",       "* 0x1100 - 0x11FF Hangul Jamo\n" .
	                "* 0x3000 - 0x303F CJK Symbols and Punctuation (partial)\n" .
	                "* 0x3040 - 0x309F Hiragana\n" .
	                "* 0x30A0 - 0x30FF Katakana\n" .
	                "* 0x31F0 - 0x31FF Katakana Phonetic Extensions\n" .
	                "* 0x3130 - 0x318F Hangul Compatibility Jamo (partial)\n" .
	                "* 0x3200 - 0x32FF Enclosed CJK Letters and Months (partial)\n" .
	                "* 0x3300 - 0x33FF CJK Compatibility\n" .
	                "* 0x3400 - 0x4DB5 CJK Unified Ideographs Extension-A (added)\n" .
	                "* 0x4E00 - 0x9FCC CJK Unified Ideographs (overridden)\n" .
	                "* 0xAC00 - 0xD7A3 Hangul Syllables (partial)\n" .
	                "* 0xD7B0 - 0xD7FF Hangul Jamo Extended-B\n" .
	                "* 0xF900 - 0xFAFF CJK Compatibility Ideographs (partial)\n" .
	                "* 0xFF00 - 0xFFEF Halfwidth and Fullwidth Forms (partial)\n"],
	["am_ET",       "* 0x1200 - 0x137F Ethiopic\n" .
	                "* 0x1380 - 0x139F Ethiopic Supplement\n" .
	                "* 0x2D80 - 0x2DDF Ethiopic Extended\n" .
	                "* 0xAB00 - 0xAB2F Ethiopic Extended-A\n"],
	["chr_US",      "* 0x13A0 - 0x13FF Cherokee\n"],
	["km_KH",       "* 0x1780 - 0x17FF Khmer\n" .
	                "* 0x19E0 - 0x19FF Khmer Symbols\n"],
	["shi_Tfng_MA", "* 0x2D30 - 0x2D7F Tifinagh\n"],
	["ii_CN",       "* 0xA000 - 0xA48F Yi Syllables\n" .
	                "* 0xA490 - 0xA4CF Yi Radicals\n"],
	["vai_Vaii_LR", "* 0xA500 - 0xA63F Vai\n"],
	["ko_KR",       "* 0x3130 - 0x318F Hangul Compatibility Jamo (partial)\n" .
	                "* 0xA960 - 0xA97F Hangul Jamo Extended-A\n" .
	                "* 0xAC00 - 0xD7A3 Hangul Syllables (partial)\n" .
	                "* 0xFF00 - 0xFFEF Halfwidth and Fullwidth Forms (partial)\n"],
);
105
#	["zh_Hans_CN",  "* 0x2E80 - 0x2EFF CJK Radicals Supplement\n" .
#	                "* 0x2F00 - 0x2FDF Kangxi Radicals\n" .
#	                "* 0x3000 - 0x303F CJK Symbols and Punctuation (partial)\n" .
#	                "* 0x3200 - 0x32FF Enclosed CJK Letters and Months (partial)\n" .
#			"* 0x3400 - 0x4DB5 CJK Unified Ideographs Extension A\n" .
#	                "* 0xF900 - 0xFAFF CJK Compatibility Ideographs (partial)\n"],
112
# State shared by the subroutines below.
#   %seen          character codes already written to the output
#   %pending_seen  codes met since the last merge_seen call
#   %utf8map       charmap: character name => hex string of its encoding
#   %utf8aliases   later character name => earlier name with the same value
my (%seen, %pending_seen);
my (%utf8map, %utf8aliases);

# Files handled below the --etc directory, and the banner separator line.
my $outfilename = $ETCDIR . "/common.UTF-8.src";
my $manual_file = $ETCDIR . "/manual-input.UTF-8";
my $stars = "**********************************************************************\n";

# Main sequence: load the charmap, then write header, sections and footer.
get_utf8map($CLDRDIR . "/posix/UTF-8.cm");
generate_header();
generate_sections();
generate_footer();
125
126############################
127
# get_utf8map FILE
#
# Load the charmap in FILE into the file-level hashes %utf8map and
# %utf8aliases.
#
# Only the lines between "CHARMAP" and "END CHARMAP" are used; lines that
# start with '#' and empty lines are ignored everywhere.  For each entry of
# the form "<NAME> value":
#   - underscores in NAME are replaced by spaces,
#   - every literal "\x" is removed from the value,
#   - the result is stored as $utf8map{NAME} = value.
# When an entry has the same value as the entry right before it, its name
# is recorded in %utf8aliases, pointing at the previous entry's name.
#
# Dies when FILE cannot be opened.  Returns nothing useful.
sub get_utf8map {
	my $file = shift;

	open(my $fin, '<', $file)
		or die ("can't read $file: $!\n");
	my @lines = <$fin>;
	close($fin);
	chomp(@lines);

	my $prev_k = undef;
	my $prev_v = "";
	my $incharmap = 0;
	foreach my $l (@lines) {
		$l =~ s/\r//;
		next if ($l =~ /^\#/);
		next if ($l eq "");

		if ($l eq "CHARMAP") {
			$incharmap = 1;
			next;
		}

		next if (!$incharmap);
		last if ($l eq "END CHARMAP");

		# A failed match leaves $1/$2 holding the captures of the
		# previous successful match, so a malformed line must be
		# skipped rather than processed with stale values.
		next unless ($l =~ /^<([^\s]+)>\s+(.*)/);
		my ($k, $v) = ($1, $2);
		$k =~ s/_/ /g;		# unicode char string
		$v =~ s/\\x//g;		# UTF-8 char code
		$utf8map{$k} = $v;

		$utf8aliases{$k} = $prev_k if ($prev_v eq $v);

		$prev_v = $v;
		$prev_k = $k;
	}
}
165
# generate_header
#
# Create (or truncate) $outfilename on the shared FOUT handle and write the
# fixed preamble of the generated file: the "do not edit" notice, the
# comment_char/escape_char declarations and the opening LC_CTYPE keyword.
# Dies when the output file cannot be opened.  FOUT is left open for the
# subroutines that write the rest of the file.
sub generate_header {
	my $preamble = <<EOF;
# Warning: Do not edit. This file is automatically generated from the
# tools in /usr/src/tools/tools/locale. The data is obtained from the
# CLDR project, obtained from http://cldr.unicode.org/
# -----------------------------------------------------------------------------

comment_char *
escape_char /

LC_CTYPE
EOF

	open(FOUT, ">", $outfilename)
		or die ("can't write to $outfilename\n");
	print FOUT $preamble;
}
181
# generate_footer
#
# Write the closing "END LC_CTYPE" keyword to the shared FOUT handle and
# close it.  Output is buffered, so a failed write may only be reported
# when the handle is closed; the close is therefore checked and the script
# dies instead of silently leaving a truncated file behind.
sub generate_footer {
	print FOUT "\nEND LC_CTYPE\n";
	close (FOUT)
		or die ("can't close output file: $!\n");
}
186
# already_seen CODE
#
# Returns 1 when CODE is present in %seen.  Otherwise CODE is noted in
# %pending_seen (it only reaches %seen at the next merge_seen call) and 0
# is returned.
sub already_seen {
	my ($ucode) = @_;

	return 1 if (defined $seen{$ucode});

	$pending_seen{$ucode} = 1;
	return 0;
}
195
# already_seen_RO CODE
#
# Read-only variant of already_seen: returns 1 when CODE is present in
# %seen and 0 otherwise, without recording anything.
sub already_seen_RO {
	my ($ucode) = @_;

	return (defined $seen{$ucode}) ? 1 : 0;
}
203
# merge_seen
#
# Promote every code collected in %pending_seen into %seen, then empty
# %pending_seen.
sub merge_seen {
	$seen{$_} = 1 for (keys %pending_seen);
	%pending_seen = ();
}
210
# initialize_lines TERRITORY
#
# Returns the synthetic input lines that are placed in front of the data
# read for TERRITORY.  The list is empty for every territory except
# "ja_JP".  For "ja_JP" it holds, once for "graph" and once for "alpha",
# one line per code point of U+3400..U+4DB5 and of U+4E00..U+9FCC, named
# <CJK_UNIFIED_IDEOGRAPH-XXXX>: the first line of each range carries the
# category keyword, all lines but the last end in ";/".  A final
# "merge\tnow" line closes the list.
sub initialize_lines {
	my ($terr) = @_;
	my @out = ();

	return @out if ($terr ne "ja_JP");

	my @ranges = ([0x3400, 0x4DB5], [0x4E00, 0x9FCC]);
	foreach my $class ("graph", "alpha") {
		foreach my $range (@ranges) {
			my ($first, $last) = @$range;
			push @out, sprintf("%s\t<CJK_UNIFIED_IDEOGRAPH-%X>;/\n",
				$class, $first);
			foreach my $code ($first + 1 .. $last - 1) {
				push @out, sprintf("\t<CJK_UNIFIED_IDEOGRAPH-%X>;/\n",
					$code);
			}
			push @out, sprintf("\t<CJK_UNIFIED_IDEOGRAPH-%X>\n", $last);
		}
	}
	push @out, "merge\tnow\n";
	return @out;
}
238
# compress_ctype TERRITORY
#
# Read the LC_CTYPE part of <cldrdir>/posix/TERRITORY.UTF-8.src (preceded
# by the lines from initialize_lines) and write a condensed version of it
# to the shared FOUT handle:
#
#   - characters already emitted for an earlier section are dropped;
#   - inside a character class, runs of entries whose codes (hex() of the
#     %utf8map value) are consecutive are collapsed to "<first>;...;<last>";
#   - a category keyword is only written once something of that category
#     is actually emitted;
#   - the "print" category is skipped entirely;
#   - toupper/tolower lines are copied unless their first character has
#     been seen already;
#   - a "merge" pseudo-category line triggers merge_seen.
#
# Prints a message on STDERR and returns without writing anything when the
# source file does not exist or cannot be opened.
sub compress_ctype {
	my $territory = shift;
	my $term;		# true when the entry is the last of its class
	my $active = 0;		# true while a run is being accumulated
	my $cat_loaded = 0;	# true while the category keyword is unwritten
	my $lock_ID;		# code of the first entry of the current run
	my $prev_ID;		# code of the last entry of the current run
	my $curr_ID;
	my $lock_name;
	my $prev_name;
	my $curr_name;
	my $key_name;
	my $category = '';

	my @lines = initialize_lines ($territory);

	my $filename = "$CLDRDIR/posix/$territory.UTF-8.src";
	if (! -f $filename) {
		print STDERR "Cannot open $filename\n";
		return;
	}
	my $fin;
	if (!open($fin, '<', $filename)) {
		# The file exists but is unreadable: report it the same way
		# as a missing file instead of reading from a dead handle.
		print STDERR "Cannot open $filename: $!\n";
		return;
	}
	print "Reading from $filename\n";
	while (<$fin>) {
		# Keep the body of the LC_CTYPE section, minus its delimiter
		# lines, separator lines and empty lines.
		if (/^LC_CTYPE/../^END LC_CTYPE/) {
			if ($_ ne "LC_CTYPE\n" && $_ ne "END LC_CTYPE\n" &&
				$_ ne "*************\n" && $_ ne "\n") {
				push @lines, $_;
			}
		}
	}
	close($fin);
	foreach my $line (@lines) {
		# A lower-case keyword followed by a tab opens a new category.
		if ($line =~ m/^([a-z]{3,})\t/) {
			$category = $1;
			if ($category eq 'merge') {
				merge_seen ();
				next;
			}
			if ($category ne 'print') {
				$cat_loaded = 1;
			}
		}
		next if ($category eq 'print');
		if ($category eq 'toupper' || $category eq 'tolower') {
			# Case mappings: decide on the first name of the line
			# only, without marking it as seen.
			if ($line =~ m/<([-_A-Za-z0-9]+)>,/) {
				$key_name = $1;
				$key_name =~ s/_/ /g;
				if (already_seen_RO (hex($utf8map{$key_name}))) {
					next;
				}
				if ($cat_loaded) { print FOUT $category; }
				$cat_loaded = 0;
				$line =~ s/^[a-z]{3,}\t/\t/;
				print FOUT $line;
			}
			next;
		}
		# Character class entry: "<NAME>" followed either by ';' plus
		# one more character (more entries follow) or by nothing (last
		# entry of the class).
		if ($line =~ m/<([-_A-Za-z0-9]+)>(;.|)$/) {
			$term = ($2 eq '') ? 1 : 0;
			$curr_name = $1;
			$key_name = $1;
			$key_name =~ s/_/ /g;
			$curr_ID = hex($utf8map{$key_name});
			# NOTE(review): when the skipped entry is the last one
			# of its class, a run that is still being accumulated
			# is not flushed here and $active stays set - confirm
			# whether that can happen with real input.
			if (already_seen ($curr_ID)) {
				next;
			}
			if ($active) {
				if ($curr_ID == $prev_ID + 1) {
					# Consecutive code: extend the run.
					$prev_ID = $curr_ID;
					$prev_name = $curr_name;
				} else {
					# Gap: write the finished run (one
					# entry, two entries, or a range) and
					# start a new run at this entry.
					if ($cat_loaded) { print FOUT $category; }
					$cat_loaded = 0;
					if ($prev_ID == $lock_ID) {
						print FOUT "\t<" . $prev_name . ">;/\n";
					} elsif ($prev_ID - 1 == $lock_ID) {
						print FOUT "\t<" . $lock_name . ">;/\n";
						print FOUT "\t<" . $prev_name . ">;/\n";
					} else {
						print FOUT "\t<" . $lock_name .
						       ">;...;<" . $prev_name . ">;/\n";
					}
					$lock_ID = $curr_ID;
					$prev_ID = $curr_ID;
					$lock_name = $curr_name;
					$prev_name = $curr_name;
				}
			} else {
				# First entry of a new run.
				$active = 1;
				$lock_ID = $curr_ID;
				$prev_ID = $curr_ID;
				$lock_name = $curr_name;
				$prev_name = $curr_name;
			}
			if ($term) {
				# Last entry of the class: write the current
				# run without the ";/" continuation.
				if ($cat_loaded) { print FOUT $category; }
				$cat_loaded = 0;
				if ($curr_ID == $lock_ID) {
					print FOUT "\t<" . $curr_name . ">\n";
				} elsif ($curr_ID == $lock_ID + 1) {
					print FOUT "\t<" . $lock_name . ">;/\n";
					print FOUT "\t<" . $curr_name . ">\n";
				} else {
					print FOUT "\t<" . $lock_name .
					       ">;...;<" . $curr_name . ">\n";
				}
				$active = 0;
			}
		} else {
			# Anything else is copied through unchanged.
			print FOUT $line;
		}
	}
}
353
# generate_sections
#
# Write the body of the generated file to the shared FOUT handle.  For
# every entry of @SECTIONS, in order: an empty line, the banner text framed
# by two $stars lines, then the condensed LC_CTYPE data of that locale
# (compress_ctype).  After each section merge_seen is called, so later
# sections leave out the characters written so far.  Finally the content
# of $manual_file is appended verbatim.
#
# A manual file that cannot be opened is reported on STDERR and skipped.
sub generate_sections {
	foreach my $section (@SECTIONS) {
		my ($territory, $banner) = @$section;
		print FOUT "\n";
		print FOUT $stars;
		print FOUT $banner;
		print FOUT $stars;
		compress_ctype ($territory);
		merge_seen ();
	}

	my $fin;
	if (!open($fin, '<', $manual_file)) {
		print STDERR "Cannot open $manual_file: $!\n";
		return;
	}
	print "Reading from $manual_file\n";
	while (my $line = <$fin>) {
		print FOUT $line;
	}
	close($fin);
}
374