#!/usr/local/bin/perl -wC

# Builds the shared LC_CTYPE source "$ETCDIR/common.UTF-8.src".
#
# Inputs:
#   $CLDRDIR/posix/UTF-8.cm                 character name -> UTF-8 bytes
#   $CLDRDIR/posix/<territory>.UTF-8.src    one per entry in @SECTIONS
#   $ETCDIR/manual-input.UTF-8              appended verbatim at the end
#
# For every territory the LC_CTYPE part of its source file is copied to
# the output, skipping characters that an earlier territory already
# emitted and collapsing runs of consecutive characters into
# "<first>;...;<last>" ranges.

use strict;
#use File::Copy;
#use XML::Parser;
use Tie::IxHash;
#use Data::Dumper;
use Getopt::Long;
#use Digest::SHA qw(sha1_hex);
#require "charmaps.pm";


my $CLDRDIR = undef;
my $ETCDIR = undef;

my $result = GetOptions (
		"cldr=s"	=> \$CLDRDIR,
		"etc=s"		=> \$ETCDIR,
	    );

# Validate after parsing so that both "--cldr=DIR" and "--cldr DIR" are
# accepted, and so that a missing option cannot leave a directory undefined.
if (!$result || !defined $CLDRDIR || !defined $ETCDIR || @ARGV) {
	print "Usage: $0 --cldr=<cldrdir> --etc=<etcdir>\n";
	exit(1);
}

# Each entry is [territory, banner].  The banner is written verbatim to the
# output ahead of the territory's data; its lines start with "* ", the
# comment_char declared in the generated header.
#
# NOTE(review): a few ranges below end before they start (0x0750 - 0x074F,
# 0x2D30 - 0x2D2F, 0xAC00 - 0xA7A3) and the bn_IN line repeats the range used
# for Devanagari.  They look mistyped, but they are part of the generated
# output, so they are left untouched here - confirm against the Unicode block
# list before correcting them.
my @SECTIONS = (
	["en_US",	"* 0x0000 - 0x007F Basic Latin\n" .
			"* 0x0080 - 0x00FF Latin-1 Supplement\n" .
			"* 0x0100 - 0x017F Latin Extended-A\n" .
			"* 0x0180 - 0x024F Latin Extended-B\n" .
			"* 0x0250 - 0x02AF IPA Extensions\n" .
			"* 0x1D00 - 0x1D7F Phonetic Extensions\n" .
			"* 0x1D80 - 0x1DBF Phonetic Extensions Supplement\n" .
			"* 0x1E00 - 0x1EFF Latin Extended Additional\n" .
			"* 0x2150 - 0x218F Number Forms (partial - Roman Numerals)\n".
			"* 0x2C60 - 0x2C7F Latin Extended-C\n" .
			"* 0xA720 - 0xA7FF Latin Extended-D\n" .
			"* 0xAB30 - 0xAB6F Latin Extended-E\n" .
			"* 0xFB00 - 0xFF4F Alphabetic Presentation Forms (partial)\n".
			"* 0xFF00 - 0xFFEF Halfwidth and Fullwidth Forms (partial)\n"],
	["el_GR",	"* 0x0370 - 0x03FF Greek (No Coptic!)\n" .
			"* 0x1F00 - 0x1FFF Greek Extended\n"],
	["ru_RU",	"* 0x0400 - 0x04FF Cyrillic\n" .
			"* 0x0500 - 0x052F Cyrillic Supplementary\n" .
			"* 0x2DE0 - 0x2DFF Cyrillic Extended-A\n" .
			"* 0xA640 - 0xA69F Cyrillic Extended-B\n"],
	["hy_AM",	"* 0x0530 - 0x058F Armenian\n" .
			"* 0xFB00 - 0xFF4F Alphabetic Presentation Forms (partial)\n"],
	["he_IL",	"* 0x0590 - 0x05FF Hebrew\n" .
			"* 0xFB00 - 0xFF4F Alphabetic Presentation Forms (partial)\n"],
	["ar_SA",	"* 0x0600 - 0x06FF Arabic\n" .
			"* 0x0750 - 0x074F Arabic Supplement\n" .
			"* 0x08A0 - 0x08FF Arabic Extended-A\n" .
			"* 0xFB50 - 0xFDFF Arabic Presentation Forms (partial)\n" .
			"* 0xFE70 - 0xFEFF Arabic Presentation Forms-B (partial)\n"],
	["hi_IN",	"* 0x0900 - 0x097F Devanagari\n" .
			"* 0xA8E0 - 0xA8FF Devanagari Extended\n"],
	["bn_IN",	"* 0x0900 - 0x097F Bengali\n"],
	["pa_Guru_IN",	"* 0x0A00 - 0x0A7F Gurmukhi\n"],
	["gu_IN",	"* 0x0A80 - 0x0AFF Gujarati\n"],
	["or_IN",	"* 0x0B00 - 0x0B7F Oriya\n"],
	["ta_IN",	"* 0x0B80 - 0x0BFF Tamil\n"],
	["te_IN",	"* 0x0C00 - 0x0C7F Telugu\n"],
	["kn_IN",	"* 0x0C80 - 0x0CFF Kannada\n"],
	["ml_IN",	"* 0x0D00 - 0x0D7F Malayalam\n"],
	["si_LK",	"* 0x0D80 - 0x0DFF Sinhala\n"],
	["th_TH",	"* 0x0E00 - 0x0E7F Thai\n"],
	["lo_LA",	"* 0x0E80 - 0x0EFF Lao\n"],
	["bo_IN",	"* 0x0F00 - 0x0FFF Tibetan\n"],
	["my_MM",	"* 0x1000 - 0x109F Myanmar\n" .
			"* 0xA9E0 - 0xA9FF Myanmar Extended-B\n" .
			"* 0xAA60 - 0xAA7F Myanmar Extended-A\n"],
	["ka_GE",	"* 0x10A0 - 0x10FF Georgia\n" .
			"* 0x2D00 - 0x2D2F Georgian Supplement\n"],
	["ja_JP",	"* 0x1100 - 0x11FF Hangul Jamo\n" .
			"* 0x3000 - 0x30FF CJK Symbols and Punctuation (partial)\n" .
			"* 0x3040 - 0x309F Hiragana\n" .
			"* 0x30A0 - 0x30FF Katakana\n" .
			"* 0x31F0 - 0x31FF Katakana Phonetic Extensions\n" .
			"* 0x3130 - 0x318F Hangul Compatibility Jamo (partial)\n" .
			"* 0x3200 - 0x32FF Enclosed CJK Letters and Months (partial)\n" .
			"* 0x3300 - 0x33FF CJK Compatibility\n" .
			"* 0x3400 - 0x4DB5 CJK Unified Ideographs Extension-A (added)\n" .
			"* 0x4E00 - 0x9FCC CJK Unified Ideographs (overridden)\n" .
			"* 0xAC00 - 0xA7A3 Hangul Syllables (partial)\n" .
			"* 0xD7B0 - 0xD7FF Hangul Jamo Extended-B\n" .
			"* 0xF900 - 0xFAFF CJK Compatibility Ideographs (partial)\n" .
			"* 0xFF00 - 0xFFEF Halfwidth and Fullwidth Forms (partial)\n"],
	["am_ET",	"* 0x1200 - 0x137F Ethiopic\n" .
			"* 0x1380 - 0x139F Ethiopic Supplement\n" .
			"* 0x2D80 - 0x2DDF Ethiopic Extended\n" .
			"* 0xAB00 - 0xAB2F Ethiopic Extended-A\n"],
	["chr_US",	"* 0x13A0 - 0x13FF Cherokee\n"],
	["km_KH",	"* 0x1780 - 0x17FF Khmer\n" .
			"* 0x19E0 - 0x19FF Khmer Symbols\n"],
	["shi_Tfng_MA",	"* 0x2D30 - 0x2D2F Tifinagh\n"],
	["ii_CN",	"* 0xA000 - 0xA48F Yi Syllables\n" .
			"* 0xA490 - 0xA4CF Yi Radicals\n"],
	["vai_Vaii_LR",	"* 0xA500 - 0xA63F Vai\n"],
	["ko_KR",	"* 0x3130 - 0x318F Hangul Compatibility Jamo (partial)\n" .
			"* 0xA960 - 0xA97F Hangul Jamo Extended-A\n" .
			"* 0xAC00 - 0xA7A3 Hangul Syllables (partial)\n" .
			"* 0xFF00 - 0xFFEF Halfwidth and Fullwidth Forms (partial)\n"],
);

#	["zh_Hans_CN",	"* 0x2E80 - 0x2EFF CJK Radicals Supplement\n" .
#			"* 0x2F00 - 0x2FDF Kangxi Radicals\n" .
#			"* 0x3000 - 0x30FF CJK Symbols and Punctuation (partial)\n" .
#			"* 0x3200 - 0x32FF Enclosed CJK Letters and Months (partial)\n" .
#			"* 0x3400 - 0x4DB5 CJK Unified Ideographs Extension A\n" .
#			"* 0xF900 - 0xFAFF CJK Compatibility Ideographs (partial)\n"],

my %seen = ();		# codes emitted by already-merged sections
my %pending_seen = ();	# codes emitted since the last merge_seen()
my %utf8map = ();	# character name (spaces, not "_") -> hex UTF-8 bytes
my %utf8aliases = ();	# name -> previous name carrying the same bytes
my $outfilename = "$ETCDIR/common.UTF-8.src";
my $manual_file = "$ETCDIR/manual-input.UTF-8";
my $stars = "**********************************************************************\n";
my $FOUT;		# output handle, opened by generate_header()

get_utf8map("$CLDRDIR/posix/UTF-8.cm");
generate_header ();
generate_sections ();
generate_footer ();

############################

# Load the CHARMAP section of a charmap file into %utf8map/%utf8aliases.
#
#   $file - path of the charmap file.
#
# Dies when the file cannot be read: every later lookup depends on it.
sub get_utf8map {
	my $file = shift;

	open(my $fin, '<', $file)
	    or die ("can't read $file: $!\n");
	my @lines = <$fin>;
	close($fin);
	chomp(@lines);

	my $prev_k = undef;
	my $prev_v = "";
	my $incharmap = 0;
	foreach my $line (@lines) {
		$line =~ s/\r//;
		next if ($line =~ /^\#/);
		next if ($line eq "");

		if ($line eq "CHARMAP") {
			$incharmap = 1;
			next;
		}

		next if (!$incharmap);
		last if ($line eq "END CHARMAP");

		# Skip anything that is not "<NAME>  bytes"; using $1/$2
		# after a failed match would silently reuse the captures of
		# the previous line.
		next unless ($line =~ /^<([^\s]+)>\s+(.*)/);
		my $k = $1;
		my $v = $2;
		$k =~ s/_/ /g;		# unicode char string
		$v =~ s/\\x//g;		# UTF-8 char code
		$utf8map{$k} = $v;

		# Adjacent entries with identical bytes are aliases.
		$utf8aliases{$k} = $prev_k if ($prev_v eq $v);

		$prev_v = $v;
		$prev_k = $k;
	}
}

# Open the output file and write the fixed preamble up to "LC_CTYPE".
sub generate_header {
	open($FOUT, ">", "$outfilename")
	    or die ("can't write to $outfilename: $!\n");
	print {$FOUT} <<EOF;
# Warning: Do not edit. This file is automatically generated from the
# tools in /usr/src/tools/tools/locale. The data is obtained from the
# CLDR project, obtained from http://cldr.unicode.org/
# -----------------------------------------------------------------------------

comment_char *
escape_char /

LC_CTYPE
EOF
}

# Terminate the LC_CTYPE section and close the output file.
sub generate_footer {
	print {$FOUT} "\nEND LC_CTYPE\n";
	# Buffered write errors only surface at close time.
	close ($FOUT)
	    or die ("can't close $outfilename: $!\n");
}

# Return 1 if $ucode was emitted by an already-merged section.  Otherwise
# record it as pending (it becomes "seen" at the next merge_seen) and
# return 0.
sub already_seen {
	my $ucode = shift;
	return 1 if (defined $seen{$ucode});
	$pending_seen{$ucode} = 1;
	return 0;
}

# Read-only variant of already_seen: query %seen without recording anything.
sub already_seen_RO {
	my $ucode = shift;
	return (defined $seen{$ucode}) ? 1 : 0;
}

# Promote every pending code to %seen and clear the pending set.
sub merge_seen {
	foreach my $sn (keys %pending_seen) {
		$seen{$sn} = 1;
	}
	%pending_seen = ();
}

# Return the synthetic lines that are processed before a territory's own
# file.  Only ja_JP has any: the two CJK Unified Ideograph ranges, once for
# "graph" and once for "alpha", followed by a "merge" pseudo-category that
# makes compress_ctype() call merge_seen().
#
#   $terr - territory name, e.g. "ja_JP".
sub initialize_lines {
	my @result = ();
	my $terr = shift;
	my @types = ("graph", "alpha");
	if ($terr eq "ja_JP") {
		foreach my $T (@types) {
			push @result, "$T\t<CJK_UNIFIED_IDEOGRAPH-3400>;/\n";
			foreach my $n (0x3401 .. 0x4DB4) {
				push @result, sprintf(
				    "\t<CJK_UNIFIED_IDEOGRAPH-%X>;/\n", $n);
			}
			push @result, "\t<CJK_UNIFIED_IDEOGRAPH-4DB5>\n";
			push @result, "$T\t<CJK_UNIFIED_IDEOGRAPH-4E00>;/\n";
			foreach my $n (0x4E01 .. 0x9FCB) {
				push @result, sprintf(
				    "\t<CJK_UNIFIED_IDEOGRAPH-%X>;/\n", $n);
			}
			push @result, "\t<CJK_UNIFIED_IDEOGRAPH-9FCC>\n";
		}
		push @result, "merge\tnow\n";
	}
	return @result;
}

# Copy the LC_CTYPE data of one territory to the output.
#
#   $territory - name used to build "$CLDRDIR/posix/$territory.UTF-8.src".
#
# Characters already emitted by a previous section are dropped, "print"
# categories are dropped entirely, toupper/tolower pairs are copied as-is,
# and for the remaining categories runs of consecutive codes are collapsed
# into "<first>;...;<last>".  A missing or unreadable source file is
# reported on STDERR and the territory is skipped.
sub compress_ctype {
	my $territory = shift;
	my $term;		# 1 when the current entry ends its category
	my $active = 0;		# 1 while a run is open and not yet printed
	my $cat_loaded = 0;	# 1 while the category name is still unprinted
	my $lock_ID;		# first code/name of the open run
	my $prev_ID;		# last code/name of the open run
	my $curr_ID;
	my $lock_name;
	my $prev_name;
	my $curr_name;
	my $key_name;
	my $category = '';

	my @lines = initialize_lines ($territory);

	my $filename = "$CLDRDIR/posix/$territory.UTF-8.src";
	if (! -f $filename) {
		print STDERR "Cannot open $filename\n";
		return;
	}
	my $fin;
	if (!open($fin, '<', $filename)) {
		print STDERR "Cannot open $filename\n";
		return;
	}
	print "Reading from $filename\n";
	while (my $row = <$fin>) {
		# Keep only what lies between the LC_CTYPE markers, minus
		# the markers themselves, separator lines and blank lines.
		if ($row =~ /^LC_CTYPE/ .. $row =~ /^END LC_CTYPE/) {
			if ($row ne "LC_CTYPE\n" && $row ne "END LC_CTYPE\n" &&
			    $row ne "*************\n" && $row ne "\n") {
				push @lines, $row;
			}
		}
	}
	close($fin);

	foreach my $line (@lines) {
		if ($line =~ m/^([a-z]{3,})\t/) {
			$category = $1;
			if ($category eq 'merge') {
				merge_seen ();
				next;
			}
			if ($category ne 'print') {
				$cat_loaded = 1;
			}
		}
		next if ($category eq 'print');
		if ($category eq 'toupper' || $category eq 'tolower') {
			if ($line =~ m/<([-_A-Za-z0-9]+)>,/) {
				$key_name = $1;
				$key_name =~ s/_/ /g;
				if (already_seen_RO (hex($utf8map{$key_name}))) {
					next;
				}
				if ($cat_loaded) { print {$FOUT} $category; }
				$cat_loaded = 0;
				$line =~ s/^[a-z]{3,}\t/\t/;
				print {$FOUT} $line;
			}
			next;
		}
		if ($line =~ m/<([-_A-Za-z0-9]+)>(;.|)$/) {
			$term = ($2 eq '') ? 1 : 0;
			$curr_name = $1;
			$key_name = $1;
			$key_name =~ s/_/ /g;
			# The "code" is the hex UTF-8 byte string read as a
			# number, not the code point.
			$curr_ID = hex($utf8map{$key_name});
			# NOTE(review): when the skipped entry is the one
			# that terminates its category ($term == 1), the
			# open run is not flushed here and stays pending
			# until the next unseen entry - confirm whether the
			# input can trigger this.
			if (already_seen ($curr_ID)) {
				next;
			}
			if ($active) {
				if ($curr_ID == $prev_ID + 1) {
					# Extends the open run.
					$prev_ID = $curr_ID;
					$prev_name = $curr_name;
				} else {
					# Gap: print the open run with a
					# continuation and start a new one.
					if ($cat_loaded) { print {$FOUT} $category; }
					$cat_loaded = 0;
					if ($prev_ID == $lock_ID) {
						print {$FOUT} "\t<" . $prev_name . ">;/\n";
					} elsif ($prev_ID - 1 == $lock_ID) {
						print {$FOUT} "\t<" . $lock_name . ">;/\n";
						print {$FOUT} "\t<" . $prev_name . ">;/\n";
					} else {
						print {$FOUT} "\t<" . $lock_name .
						    ">;...;<" . $prev_name . ">;/\n";
					}
					$lock_ID = $curr_ID;
					$prev_ID = $curr_ID;
					$lock_name = $curr_name;
					$prev_name = $curr_name;
				}
			} else {
				$active = 1;
				$lock_ID = $curr_ID;
				$prev_ID = $curr_ID;
				$lock_name = $curr_name;
				$prev_name = $curr_name;
			}
			if ($term) {
				# Last entry of the category: print the open
				# run without a continuation.
				if ($cat_loaded) { print {$FOUT} $category; }
				$cat_loaded = 0;
				if ($curr_ID == $lock_ID) {
					print {$FOUT} "\t<" . $curr_name . ">\n";
				} elsif ($curr_ID == $lock_ID + 1) {
					print {$FOUT} "\t<" . $lock_name . ">;/\n";
					print {$FOUT} "\t<" . $curr_name . ">\n";
				} else {
					print {$FOUT} "\t<" . $lock_name .
					    ">;...;<" . $curr_name . ">\n";
				}
				$active = 0;
			}
		} else {
			print {$FOUT} $line;
		}
	}
}

# Write every section of @SECTIONS (banner between two rows of stars, then
# the compressed territory data), then append the manual input file
# verbatim.  A missing manual file is reported on STDERR.
sub generate_sections {
	foreach my $section (@SECTIONS) {
		print {$FOUT} "\n";
		print {$FOUT} $stars;
		print {$FOUT} $section->[1];
		print {$FOUT} $stars;
		compress_ctype ($section->[0]);
		merge_seen ();
	}

	my $fin;
	if (!open($fin, '<', $manual_file)) {
		print STDERR "Cannot open $manual_file\n";
		return;
	}
	print "Reading from $manual_file\n";
	while (my $line = <$fin>) {
		print {$FOUT} $line;
	}
	close($fin);
}