#!/usr/local/bin/perl -wC

#
# Build "$ETCDIR/common.UTF-8.src" from CLDR POSIX data.
#
# Inputs (all below the directories given on the command line):
#   $CLDRDIR/posix/UTF-8.cm               charmap: symbolic name -> UTF-8 bytes
#   $CLDRDIR/posix/<locale>.UTF-8.src     one file per entry of @SECTIONS
#   $ETCDIR/manual-input.UTF-8            appended verbatim after all sections
#
# For every locale listed in @SECTIONS the LC_CTYPE section of its source
# file is copied into the output.  Characters already emitted by an earlier
# locale are dropped, and runs of consecutive character codes are collapsed
# into "<first>;...;<last>" ranges.
#

use strict;
use warnings;
#use File::Copy;
#use XML::Parser;
use Tie::IxHash;
#use Data::Dumper;
use Getopt::Long;
#use Digest::SHA qw(sha1_hex);
#require "charmaps.pm";


my $CLDRDIR = undef;
my $ETCDIR = undef;

my $result = GetOptions (
	"cldr=s" => \$CLDRDIR,
	"etc=s" => \$ETCDIR,
);

# Validate after parsing so that both "--cldr=DIR" and "--cldr DIR" work,
# and so that two arbitrary non-option arguments are not mistaken for a
# valid invocation.
if (!$result || !defined($CLDRDIR) || !defined($ETCDIR) || @ARGV) {
	print "Usage: $0 --cldr=<cldrdir> --etc=<etcdir>\n";
	exit(1);
}

# Each entry is [ locale name, description ].  The description is written
# to the output file as "*" comment lines ahead of that locale's data.
# Ranges that ended before they started (Arabic Supplement, Tifinagh,
# Hangul Syllables) and the Bengali range (a copy of Devanagari's) have been
# corrected to the Unicode block ranges.
my @SECTIONS = (
	["en_US", "* 0x0000 - 0x007F Basic Latin\n" .
	    "* 0x0080 - 0x00FF Latin-1 Supplement\n" .
	    "* 0x0100 - 0x017F Latin Extended-A\n" .
	    "* 0x0180 - 0x024F Latin Extended-B\n" .
	    "* 0x0250 - 0x02AF IPA Extensions\n" .
	    "* 0x1D00 - 0x1D7F Phonetic Extensions\n" .
	    "* 0x1D80 - 0x1DBF Phonetic Extensions Supplement\n" .
	    "* 0x1E00 - 0x1EFF Latin Extended Additional\n" .
	    "* 0x2150 - 0x218F Number Forms (partial - Roman Numerals)\n".
	    "* 0x2C60 - 0x2C7F Latin Extended-C\n" .
	    "* 0xA720 - 0xA7FF Latin Extended-D\n" .
	    "* 0xAB30 - 0xAB6F Latin Extended-E\n" .
	    "* 0xFB00 - 0xFF4F Alphabetic Presentation Forms (partial)\n".
	    "* 0xFF00 - 0xFFEF Halfwidth and Fullwidth Forms (partial)\n"],
	["el_GR", "* 0x0370 - 0x03FF Greek (No Coptic!)\n" .
	    "* 0x1F00 - 0x1FFF Greek Extended\n"],
	["ru_RU", "* 0x0400 - 0x04FF Cyrillic\n" .
	    "* 0x0500 - 0x052F Cyrillic Supplementary\n" .
	    "* 0x2DE0 - 0x2DFF Cyrillic Extended-A\n" .
	    "* 0xA640 - 0xA69F Cyrillic Extended-B\n"],
	["hy_AM", "* 0x0530 - 0x058F Armenian\n" .
	    "* 0xFB00 - 0xFF4F Alphabetic Presentation Forms (partial)\n"],
	["he_IL", "* 0x0590 - 0x05FF Hebrew\n" .
	    "* 0xFB00 - 0xFF4F Alphabetic Presentation Forms (partial)\n"],
	["ar_SA", "* 0x0600 - 0x06FF Arabic\n" .
	    "* 0x0750 - 0x077F Arabic Supplement\n" .
	    "* 0x08A0 - 0x08FF Arabic Extended-A\n" .
	    "* 0xFB50 - 0xFDFF Arabic Presentation Forms (partial)\n" .
	    "* 0xFE70 - 0xFEFF Arabic Presentation Forms-B (partial)\n"],
	["hi_IN", "* 0x0900 - 0x097F Devanagari\n" .
	    "* 0xA8E0 - 0xA8FF Devanagari Extended\n"],
	["bn_IN", "* 0x0980 - 0x09FF Bengali\n"],
	["pa_Guru_IN", "* 0x0A00 - 0x0A7F Gurmukhi\n"],
	["gu_IN", "* 0x0A80 - 0x0AFF Gujarati\n"],
	["or_IN", "* 0x0B00 - 0x0B7F Oriya\n"],
	["ta_IN", "* 0x0B80 - 0x0BFF Tamil\n"],
	["te_IN", "* 0x0C00 - 0x0C7F Telugu\n"],
	["kn_IN", "* 0x0C80 - 0x0CFF Kannada\n"],
	["ml_IN", "* 0x0D00 - 0x0D7F Malayalam\n"],
	["si_LK", "* 0x0D80 - 0x0DFF Sinhala\n"],
	["th_TH", "* 0x0E00 - 0x0E7F Thai\n"],
	["lo_LA", "* 0x0E80 - 0x0EFF Lao\n"],
	["bo_IN", "* 0x0F00 - 0x0FFF Tibetan\n"],
	["my_MM", "* 0x1000 - 0x109F Myanmar\n" .
	    "* 0xA9E0 - 0xA9FF Myanmar Extended-B\n" .
	    "* 0xAA60 - 0xAA7F Myanmar Extended-A\n"],
	["ka_GE", "* 0x10A0 - 0x10FF Georgia\n" .
	    "* 0x2D00 - 0x2D2F Georgian Supplement\n"],
	["ja_JP", "* 0x1100 - 0x11FF Hangul Jamo\n" .
	    "* 0x3000 - 0x30FF CJK Symbols and Punctuation (partial)\n" .
	    "* 0x3040 - 0x309F Hiragana\n" .
	    "* 0x30A0 - 0x30FF Katakana\n" .
	    "* 0x31F0 - 0x31FF Katakana Phonetic Extensions\n" .
	    "* 0x3130 - 0x318F Hangul Compatibility Jamo (partial)\n" .
	    "* 0x3200 - 0x32FF Enclosed CJK Letters and Months (partial)\n" .
	    "* 0x3300 - 0x33FF CJK Compatibility\n" .
	    "* 0x3400 - 0x4DB5 CJK Unified Ideographs Extension-A (added)\n" .
	    "* 0x4E00 - 0x9FCC CJK Unified Ideographs (overridden)\n" .
	    "* 0xAC00 - 0xD7A3 Hangul Syllables (partial)\n" .
	    "* 0xD7B0 - 0xD7FF Hangul Jamo Extended-B\n" .
	    "* 0xF900 - 0xFAFF CJK Compatibility Ideographs (partial)\n" .
	    "* 0xFF00 - 0xFFEF Halfwidth and Fullwidth Forms (partial)\n"],
	["am_ET", "* 0x1200 - 0x137F Ethiopic\n" .
	    "* 0x1380 - 0x139F Ethiopic Supplement\n" .
	    "* 0x2D80 - 0x2DDF Ethiopic Extended\n" .
	    "* 0xAB00 - 0xAB2F Ethiopic Extended-A\n"],
	["chr_US", "* 0x13A0 - 0x13FF Cherokee\n"],
	["km_KH", "* 0x1780 - 0x17FF Khmer\n" .
	    "* 0x19E0 - 0x19FF Khmer Symbols\n"],
	["shi_Tfng_MA", "* 0x2D30 - 0x2D7F Tifinagh\n"],
	["ii_CN", "* 0xA000 - 0xA48F Yi Syllables\n" .
	    "* 0xA490 - 0xA4CF Yi Radicals\n"],
	["vai_Vaii_LR", "* 0xA500 - 0xA63F Vai\n"],
	["ko_KR", "* 0x3130 - 0x318F Hangul Compatibility Jamo (partial)\n" .
	    "* 0xA960 - 0xA97F Hangul Jamo Extended-A\n" .
	    "* 0xAC00 - 0xD7A3 Hangul Syllables (partial)\n" .
	    "* 0xFF00 - 0xFFEF Halfwidth and Fullwidth Forms (partial)\n"],
);

# Disabled entry, kept for reference:
#	["zh_Hans_CN", "* 0x2E80 - 0x2EFF CJK Radicals Supplement\n" .
#	    "* 0x2F00 - 0x2FDF Kangxi Radicals\n" .
#	    "* 0x3000 - 0x30FF CJK Symbols and Punctuation (partial)\n" .
#	    "* 0x3200 - 0x32FF Enclosed CJK Letters and Months (partial)\n" .
#	    "* 0x3400 - 0x4DB5 CJK Unified Ideographs Extension A\n" .
#	    "* 0xF900 - 0xFAFF CJK Compatibility Ideographs (partial)\n"],

my %seen = ();		# char codes emitted by sections already merged
my %pending_seen = ();	# char codes emitted since the last merge_seen()
my %utf8map = ();	# char name (spaces, not "_") -> hex UTF-8 string
my %utf8aliases = ();	# char name -> previous name with the same code
my $outfilename = "$ETCDIR/common.UTF-8.src";
my $manual_file = "$ETCDIR/manual-input.UTF-8";
my $stars = "**********************************************************************\n";
my $out_fh;		# output handle; opened by generate_header()

get_utf8map("$CLDRDIR/posix/UTF-8.cm");
generate_header ();
generate_sections ();
generate_footer ();

############################

# get_utf8map(FILE)
#
# Load the CHARMAP ... END CHARMAP part of FILE into %utf8map.  Each entry
# line has the form "<NAME> \xHH\xHH..."; NAME is stored with "_" replaced
# by spaces and the value with every "\x" removed.  When two consecutive
# entries carry the same value, the later name is recorded in %utf8aliases
# as an alias of the earlier one.  Dies if FILE cannot be opened.
sub get_utf8map {
	my $file = shift;

	open(my $fin, "<", $file)
	    or die ("can't read $file: $!\n");
	my @lines = <$fin>;
	close($fin);
	chomp(@lines);

	my $prev_k = undef;
	my $prev_v = "";
	my $incharmap = 0;
	foreach my $line (@lines) {
		$line =~ s/\r//;
		next if ($line =~ /^\#/);
		next if ($line eq "");

		if ($line eq "CHARMAP") {
			$incharmap = 1;
			next;
		}

		next if (!$incharmap);
		last if ($line eq "END CHARMAP");

		# Skip anything that is not a "<NAME> value" entry instead
		# of reusing stale capture variables from an earlier line.
		next unless ($line =~ /^<([^\s]+)>\s+(.*)/);
		my $k = $1;
		my $v = $2;
		$k =~ s/_/ /g;		# unicode char string
		$v =~ s/\\x//g;		# UTF-8 char code
		$utf8map{$k} = $v;

		$utf8aliases{$k} = $prev_k if ($prev_v eq $v);

		$prev_v = $v;
		$prev_k = $k;
	}
}

# generate_header()
#
# Create (truncate) the output file and write the fixed preamble that
# opens the LC_CTYPE section.  Dies if the file cannot be opened.
sub generate_header {
	open($out_fh, ">", "$outfilename")
	    or die ("can't write to $outfilename\n");
	print {$out_fh} <<EOF;
# Warning: Do not edit. This file is automatically generated from the
# tools in /usr/src/tools/tools/locale. The data is obtained from the
# CLDR project, obtained from http://cldr.unicode.org/
# -----------------------------------------------------------------------------

comment_char *
escape_char /

LC_CTYPE
EOF
}

# generate_footer()
#
# Terminate the LC_CTYPE section and close the output file.  Buffered
# write errors surface at close time, so the close is checked.
sub generate_footer {
	print {$out_fh} "\nEND LC_CTYPE\n";
	close ($out_fh)
	    or die ("can't close $outfilename: $!\n");
}

# already_seen(CODE)
#
# Return 1 if CODE was emitted by an already merged batch.  Otherwise
# remember CODE in %pending_seen (it becomes "seen" at the next
# merge_seen()) and return 0.
sub already_seen {
	my $ucode = shift;
	if (defined $seen{$ucode}) {
		return 1;
	}
	$pending_seen{$ucode} = 1;
	return 0;
}

# already_seen_RO(CODE)
#
# Read-only variant of already_seen(): report whether CODE is in %seen
# without recording anything.
sub already_seen_RO {
	my $ucode = shift;
	if (defined $seen{$ucode}) {
		return 1;
	}
	return 0;
}

# merge_seen()
#
# Move every code collected in %pending_seen into %seen and empty
# %pending_seen.
sub merge_seen {
	foreach my $sn (keys %pending_seen) {
		$seen{$sn} = 1;
	}
	%pending_seen = ();
}

# initialize_lines(LOCALE)
#
# Return the synthetic input lines that precede the locale's own data.
# Only "ja_JP" gets any: for both "graph" and "alpha" the ranges
# CJK_UNIFIED_IDEOGRAPH-3400..4DB5 and -4E00..9FCC are listed one name per
# line, followed by a "merge\tnow" line that makes compress_ctype() call
# merge_seen().  Every other locale gets an empty list.
sub initialize_lines {
	my @result = ();
	my $terr = shift;
	my $n;
	my $back2hex;
	my @types = ("graph", "alpha");
	if ($terr eq "ja_JP") {
		foreach my $T (@types) {
			push @result, "$T\t<CJK_UNIFIED_IDEOGRAPH-3400>;/\n";
			for ($n = hex("3401"); $n <= hex("4DB4"); $n++) {
				$back2hex=sprintf("%X", $n);
				push @result, "\t<CJK_UNIFIED_IDEOGRAPH-" .
				    $back2hex . ">;/\n";
			}
			push @result, "\t<CJK_UNIFIED_IDEOGRAPH-4DB5>\n";
			push @result, "$T\t<CJK_UNIFIED_IDEOGRAPH-4E00>;/\n";
			for ($n = hex("4E01"); $n <= hex("9FCB"); $n++) {
				$back2hex=sprintf("%X", $n);
				push @result, "\t<CJK_UNIFIED_IDEOGRAPH-" .
				    $back2hex . ">;/\n";
			}
			push @result, "\t<CJK_UNIFIED_IDEOGRAPH-9FCC>\n";
		}
		push @result, "merge\tnow\n";
	}
	return @result;
}

# compress_ctype(LOCALE)
#
# Append the LC_CTYPE data of "$CLDRDIR/posix/LOCALE.UTF-8.src" to the
# output file.  If that file is missing or unreadable a message goes to
# STDERR and nothing is written for the locale.
#
# Lines are processed as follows:
#   - "print" categories are dropped entirely;
#   - "toupper"/"tolower" lines are copied unless the first name of the
#     pair is already in %seen;
#   - for every other category, names whose code is already in %seen are
#     dropped and the remaining ones are folded into ranges whenever their
#     codes are consecutive;
#   - a line starting with "merge\t" triggers merge_seen();
#   - any other line is copied unchanged.
# Codes are hex() of the %utf8map value, i.e. the UTF-8 byte string read
# as one number.
sub compress_ctype {
	my $territory = shift;
	my $term;		# true when the current line ends its list
	my $active = 0;		# true while a run of codes is being collected
	my $cat_loaded = 0;	# category keyword still has to be printed
	my $lock_ID;		# code/name of the first element of the run
	my $prev_ID;		# code/name of the last element of the run
	my $curr_ID;
	my $lock_name;
	my $prev_name;
	my $curr_name;
	my $key_name;
	my $category = '';

	my @lines = initialize_lines ($territory);

	my $filename = "$CLDRDIR/posix/$territory.UTF-8.src";
	if (! -f $filename) {
		print STDERR "Cannot open $filename\n";
		return;
	}
	my $fin;
	if (!open($fin, "<", $filename)) {
		print STDERR "Cannot open $filename: $!\n";
		return;
	}
	print "Reading from $filename\n";
	while (<$fin>) {
		# Keep only the body of the LC_CTYPE section, without its
		# delimiters, separator rows and blank lines.
		if (/^LC_CTYPE/../^END LC_CTYPE/) {
			if ($_ ne "LC_CTYPE\n" && $_ ne "END LC_CTYPE\n" &&
			    $_ ne "*************\n" && $_ ne "\n") {
				push @lines, $_;
			}
		}
	}
	close($fin);
	foreach my $line (@lines) {
		if ($line =~ m/^([a-z]{3,})\t/) {
			$category = $1;
			if ($category eq 'merge') {
				merge_seen();
				next;
			}
			if ($category ne 'print') {
				$cat_loaded = 1;
			}
		}
		next if ($category eq 'print');
		if ($category eq 'toupper' || $category eq 'tolower') {
			if ($line =~ m/<([-_A-Za-z0-9]+)>,/) {
				$key_name = $1;
				$key_name =~ s/_/ /g;
				if (already_seen_RO (hex($utf8map{$key_name}))) {
					next;
				}
				if ($cat_loaded) { print {$out_fh} $category; }
				$cat_loaded = 0;
				# The keyword was printed above (if needed);
				# keep only the tab-indented remainder.
				$line =~ s/^[a-z]{3,}\t/\t/;
				print {$out_fh} $line;
			}
			next;
		}
		if ($line =~ m/<([-_A-Za-z0-9]+)>(;.|)$/) {
			# No ";x" continuation after the name means this is
			# the last element of the category's list.
			$term = ($2 eq '') ? 1 : 0;
			$curr_name = $1;
			$key_name = $1;
			$key_name =~ s/_/ /g;
			$curr_ID = hex($utf8map{$key_name});
			if (already_seen ($curr_ID)) {
				next;
			}
			if ($active) {
				if ($curr_ID == $prev_ID + 1) {
					# Still consecutive: extend the run.
					$prev_ID = $curr_ID;
					$prev_name = $curr_name;
				} else {
					# Gap: flush the finished run, then
					# start a new one at the current code.
					if ($cat_loaded) { print {$out_fh} $category; }
					$cat_loaded = 0;
					if ($prev_ID == $lock_ID) {
						print {$out_fh} "\t<" . $prev_name . ">;/\n";
					} elsif ($prev_ID - 1 == $lock_ID) {
						print {$out_fh} "\t<" . $lock_name . ">;/\n";
						print {$out_fh} "\t<" . $prev_name . ">;/\n";
					} else {
						print {$out_fh} "\t<" . $lock_name .
						    ">;...;<" . $prev_name . ">;/\n";
					}
					$lock_ID = $curr_ID;
					$prev_ID = $curr_ID;
					$lock_name = $curr_name;
					$prev_name = $curr_name;
				}
			} else {
				$active = 1;
				$lock_ID = $curr_ID;
				$prev_ID = $curr_ID;
				$lock_name = $curr_name;
				$prev_name = $curr_name;
			}
			if ($term) {
				# End of list: flush the open run without a
				# trailing continuation marker.
				if ($cat_loaded) { print {$out_fh} $category; }
				$cat_loaded = 0;
				if ($curr_ID == $lock_ID) {
					print {$out_fh} "\t<" . $curr_name . ">\n";
				} elsif ($curr_ID == $lock_ID + 1) {
					print {$out_fh} "\t<" . $lock_name . ">;/\n";
					print {$out_fh} "\t<" . $curr_name . ">\n";
				} else {
					print {$out_fh} "\t<" . $lock_name .
					    ">;...;<" . $curr_name . ">\n";
				}
				$active = 0;
			}
		} else {
			print {$out_fh} $line;
		}
	}
}

# generate_sections()
#
# For each entry of @SECTIONS write its description between two rows of
# stars, emit the compressed LC_CTYPE data for the locale, and merge the
# codes it produced into %seen.  Finally append the contents of
# $manual_file verbatim; if that file cannot be opened a message goes to
# STDERR and nothing is appended.
sub generate_sections {
	foreach my $section (@SECTIONS ) {
		print {$out_fh} "\n";
		print {$out_fh} $stars;
		print {$out_fh} $section->[1];
		print {$out_fh} $stars;
		compress_ctype ($section->[0]);
		merge_seen();
	}
	my $fin;
	if (!open($fin, "<", $manual_file)) {
		print STDERR "Cannot open $manual_file: $!\n";
		return;
	}
	print "Reading from $manual_file\n";
	while (my $line = <$fin>) {
		print {$out_fh} $line;
	}
	close($fin);
}