1#!/usr/bin/perl -w 2 3use strict; 4use Getopt::Std; 5use Fcntl qw(O_TRUNC O_CREAT O_WRONLY SEEK_SET); 6use File::Temp qw(tempfile); 7use IO::File; 8 9my %opt; 10getopts("cf:u:i:", \%opt); 11 12my $comment_char = "#"; 13my $escape_char = "\\"; 14my $val_match = undef; # set in set_escape 15my %sym = (); 16my %width = (); 17my %ctype_classes = ( 18 # there are the charactors that get automagically included, there is no 19 # standard way to avoid them. XXX even if you have a charset without 20 # some of these charactors defined! 21 22 # They are accessable in a regex via [:classname:], and libc has a 23 # isX() for most of these. 24 upper => {map { ($_, 1); } qw(A B C D E F G H I J K L M N O P Q R S T U V W X Y Z)}, 25 lower => {map { ($_, 1); } qw(a b c d e f g h i j k l m n o p q r s t u v w x y z)}, 26 alpha => {}, 27 #alnum => {}, 28 digit => {map { ($_, 1); } qw(0 1 2 3 4 5 6 7 8 9)}, 29 space => {}, 30 cntrl => {}, 31 punct => {}, 32 graph => {}, 33 print => {}, 34 xdigit => {map { ($_, 1); } qw(0 1 2 3 4 5 6 7 8 9 A B C D E F a b c d e f)}, 35 blank => {" " => 1, "\t" => 1}, 36 37 toupper => {map { ($_, "\U$_"); } qw(a b c d e f g h i j k l m n o p q r s t u v w x y z)}, 38 tolower => {map { ($_, "\L$_"); } qw(A B C D E F G H I J K L M N O P Q R S T U V W X Y Z)}, 39); 40 41my %cele = ( 42 # collating-elements -- these are a lot like %sym that only works 43 # in LC_COLLATE, can also be accessed in a regex via [.element.] 44); 45 46my %csym = ( 47 # collating-symbols -- these are used to define a set of charactors 48 # that compare as equals (in one or more passes), can also be accessed 49 # in a regex via [=symbol=] 50); 51 52my @corder = (); # collating order 53my @corder_weights = (); # collating directions (forward, backward, position) 54 55my @colldef = (); 56 57my(%monetary, %numeric, %time, %messages); 58 59# This is the default charmap, unlike %ctype_classes you _can_ avoid this 60# merely by having your own charmap definition file 61my $default_charmap = <<EOT; 62CHARMAP 63<NUL> \\000 64<alert> \\007 65<backspace> \\010 66<tab> \\011 67<newline> \\012 68<vertical-tab> \\013 69<form-feed> \\014 70<carriage-return> \\015 71<space> \\040 72<exclamation-mark> \\041 73<quotation-mark> \\042 74<number-sign> \\043 75<dollar-sign> \\044 76<percent-sign> \\045 77<ampersand> \\046 78<apostrophe> \\047 79<left-parenthesis> \\050 80<right-parenthesis> \\051 81<asterisk> \\052 82<plus-sign> \\053 83<comma> \\054 84<hyphen> \\055 85<hyphen-minus> \\055 86<period> \\056 87<full-stop> \\056 88<slash> \\057 89<solidus> \\057 90<zero> \\060 91<one> \\061 92<two> \\062 93<three> \\063 94<four> \\064 95<five> \\065 96<six> \\066 97<seven> \\067 98<eight> \\070 99<nine> \\071 100<colon> \\072 101<semicolon> \\073 102<less-then-sign> \\074 103<less-than-sign> \\074 104<equals-sign> \\075 105<greater-then-sign> \\076 106<greater-than-sign> \\076 107<question-mark> \\077 108<commercial-at> \\100 109<A> \\101 110<B> \\102 111<C> \\103 112<D> \\104 113<E> \\105 114<F> \\106 115<G> \\107 116<H> \\110 117<I> \\111 118<J> \\112 119<K> \\113 120<L> \\114 121<M> \\115 122<N> \\116 123<O> \\117 124<P> \\120 125<Q> \\121 126<R> \\122 127<S> \\123 128<T> \\124 129<U> \\125 130<V> \\126 131<W> \\127 132<X> \\130 133<Y> \\131 134<Z> \\132 135<left-square-bracket> \\133 136<backslash> \\134 137<reverse-solidus> \\134 138<right-square-bracket> \\135 139<circumflex> \\136 140<circumflex-accent> \\136 141<underscore> \\137 142<underline> \\137 143<low-line> \\137 144<grave-accent> \\140 145<a> \\141 146<b> \\142 147<c> \\143 148<d> \\144 149<e> \\145 150<f> \\146 151<g> \\147 152<h> \\150 153<i> \\151 154<j> \\152 155<k> \\153 156<l> \\154 157<m> \\155 158<n> \\156 159<o> \\157 160<p> \\160 161<q> \\161 162<r> \\162 163<s> \\163 164<t> \\164 165<u> \\165 166<v> \\166 167<w> \\167 168<x> \\170 169<y> \\171 170<z> \\172 171<left-brace> \\173 172<left-curly-bracket> \\173 173<vertical-line> \\174 174<right-brace> \\175 175<right-curly-bracket> \\175 176<tilde> \\176 177<DEL> \\177 178 179<SOH> \\x01 180<STX> \\x02 181<ETX> \\x03 182<EOT> \\x04 183<ENQ> \\x05 184<ACK> \\x06 185<BEL> \\x07 186<BS> \\x08 187<HT> \\x09 188<NL> \\x0a 189<VT> \\x0b 190<NP> \\x0c 191<CR> \\x0d 192<SO> \\x0e 193<SI> \\x0f 194<DLE> \\x10 195<DC1> \\x11 196<DC2> \\x12 197<DC3> \\x13 198<DC4> \\x14 199<NAK> \\x15 200<SYN> \\x16 201<ETB> \\x17 202<CAN> \\x18 203<EM> \\x19 204<SUB> \\x1a 205<ESC> \\x1b 206<FS> \\x1c 207<IS4> \\x1c 208<GS> \\x1d 209<IS3> \\x1d 210<RS> \\x1e 211<IS2> \\x1e 212<US> \\x1f 213<IS1> \\x1f 214END CHARMAP 215EOT 216 217&set_escape($escape_char); 218 219use strict qw(vars); 220 221if (@ARGV != 1) { 222 &exit(4, "usage: $0 [-c] [-f charmap-file] [-u codesetname] [-i localdef-file] LOCALENAME\n"); 223} 224 225my $locale_dir = $ARGV[0]; 226$locale_dir = "/usr/share/locale/$locale_dir" unless ($locale_dir =~ m{/}); 227 228my $CMAP; 229if (defined($opt{'f'})) { 230 # Using new IO::File $opt{'f'}, "r" runs into problems with long path names 231 sysopen(CMAP_KLUDGE, $opt{'f'}, O_RDONLY) || &exit(4, "Can't open $opt{f}: $!\n"); 232 $CMAP = new IO::Handle; 233 $CMAP->fdopen(fileno(CMAP_KLUDGE), "r") || &exit(4, "Can't fdopen $opt{f}: $!\n"); 234} else { 235 # er, not everyone gets IO::Scalar, so use an unamed tmp file 236 # $CMAP = new IO::Scalar \$default_charmap; 237 $CMAP = new_tmpfile IO::File; 238 print $CMAP $default_charmap; 239 seek $CMAP, 0, SEEK_SET; 240} 241 242while(<$CMAP>) { 243 if (m/^\s*CHARMAP\s*$/) { 244 &parse_charmaps(); 245 } elsif (m/^\s*WIDTH\s*$/) { 246 &parse_widths(); 247 } elsif (m/^\s*($comment_char.*)?$/) { 248 } else { 249 chomp; 250 &exit(4, "syntax error on line $. ($_)"); 251 } 252} 253&parse_widths() if (0 == %width); 254 255if (defined($opt{'i'})) { 256 sysopen(STDIN, $opt{'i'}, 0) || &exit(4, "Can't open localdef file $opt{i}: $!"); 257} else { 258 $opt{'i'} = "/dev/stdin"; 259} 260 261my %LC_parsers = ( 262 NONE => [\&parse_LC_NONE, qr/^\s*((escape|comment)_char\s+$val_match\s*)?$/], 263 CTYPE => [\&parse_LC_CTYPE, qr/^\s*(\S+)\s+(\S+.*?)\s*$/], 264 COLLATE => [\&parse_LC_COLLATE, qr/^\s*(<[^>\s]+>|order_end|END|(\S*)\s+(\S+.*?)|collating[_-]element\s*<[^>]+>\s+from\s+$val_match)\s*$/, 1], 265 TIME => [\&parse_LC_TIME, qr/^\s*(ab_?day|day|abmon|mon|d_t_fmt|d_fmt|t_fmt|am_pm|t_fmt_ampm|era|era_d_fmt|era_t_fmt|era_d_t_fmt|alt_digits|copy|END)\s+(\S+.*?)\s*$/], 266 NUMERIC => [\&parse_LC_NUMERIC, qr/^\s*(decimal_point|thousands_sep|grouping|END|copy)\s+(\S+.*?)\s*$/], 267 MONETARY => [\&parse_LC_MONETARY, qr/^\s*(int_curr_symbol|currency_symbol|mon_decimal_point|mon_thousands_sep|mon_grouping|positive_sign|negative_sign|int_frac_digits|frac_digits|p_cs_precedes|p_sep_by_space|n_cs_precedes|n_sep_by_space|p_sign_posn|n_sign_posn|int_p_cs_precedes|int_n_cs_precedes|int_p_sep_by_space|int_n_sep_by_space|int_p_sign_posn|int_n_sign_posn|copy|END)\s+(\S+.*?)\s*$/], 268 MESSAGES => [\&parse_LC_MESSAGES, qr/^\s*(END|yesexpr|noexpr|yesstr|nostr|copy)\s+(\S+.*?)\s*$/], 269 "COLLATE order" => [\&parse_collate_order, qr/^\s*(order_end|(<[^>\s]+>|UNDEFINED|\Q...\E)(\s+\S+.*)?)\s*$/], 270); 271my($current_LC, $parse_func, $validate_line, $call_parse_on_END) 272 = ("NONE", $LC_parsers{"NONE"}->[0], $LC_parsers{"NONE"}->[1], undef); 273 274while(<STDIN>) { 275 next if (m/^\s*($comment_char.*)?\s*$/); 276 if (m/\Q$escape_char\E$/) { 277 chomp; 278 chop; 279 my $tmp = <STDIN>; 280 if (!defined($tmp)) { 281 &exit(4, "Syntax error, last line ($.) of $opt{i} is marked as a continued line\n"); 282 } 283 $tmp =~ s/^\s*//; 284 $_ .= $tmp; 285 redo; 286 } 287 288 if ($current_LC eq "NONE" && m/^\s*LC_([A-Z]+)\s*$/) { 289 &set_parser($1); 290 next; 291 } 292 293 unless (m/$validate_line/) { 294 &exit(4, "Syntax error on line $. of $opt{i}\n"); 295 } 296 297 my($action, $args); 298 if (m/^\s*(\S*)(\s+(\S+.*?))?\s*$/) { 299 ($action, $args) = ($1, $3); 300 } else { 301 $action = $_; 302 chomp $action; 303 } 304 305 if ($action eq "END") { 306 if ($args ne "LC_$current_LC" || $current_LC eq "NONE") { 307 &exit(4, "Syntax error on line $. of $opt{i} attempting to end $args when LC_$current_LC is open\n"); 308 } 309 if ($call_parse_on_END) { 310 &{$parse_func}($action, $args); 311 } 312 &set_parser("NONE"); 313 } else { 314 &{$parse_func}($action, $args); 315 } 316} 317 318mkdir($locale_dir); 319&run_mklocale(); 320&write_lc_money(); 321&write_lc_time(); 322&write_lc_messages(); 323&write_lc_collate(); 324exit 0; 325 326sub parse_charmaps { 327 while(<$CMAP>) { 328 # XXX need to parse out <code_set_name>, <mb_cur_max>, <mb_cur_min>, 329 # <escape_char>, and <comment_char> before the generic "<sym> val" 330 if (m/^\s*<([\w\-]+)>\s+($val_match+)\s*$/) { 331 my($sym, $val) = ($1, $2); 332 $val = &parse_value_double_backwhack($val); 333 $sym{$sym} = $val; 334 } elsif (m/^\s*<([\w\-]*\d)>\s*\Q...\E\s*<([\w\-]*\d)>\s+($val_match+)\s*$/) { 335 # We don't deal with $se < $ss, or overflow of the last byte of $vs 336 # then again the standard doesn't say anything in particular needs 337 # to happen for those cases 338 my($ss, $se, $vs) = ($1, $2, $3); 339 $vs = &parse_value_double_backwhack($vs); 340 my $vlast = length($vs) -1; 341 for(my($s, $v) = ($ss, $vs); $s cmp $se; $s++) { 342 $sym{$s} = $v; 343 substr($v, $vlast) = chr(ord(substr($v, $vlast)) +1) 344 } 345 } elsif (m/^\s*END\s+CHARMAP\s*$/) { 346 return; 347 } elsif (m/^\s*($comment_char.*)?$/) { 348 } else { 349 &exit(4, "syntax error on line $."); 350 } 351 } 352} 353 354sub parse_widths { 355 my $default = 1; 356 my @syms; 357 358 while(<$CMAP>) { 359 if (m/^\s*<([\w\-]+)>\s+(\d+)\s*$/) { 360 my($sym, $w) = ($1, $2); 361 print "$sym width $w\n"; 362 if (!defined($sym{$sym})) { 363 warn "localedef: can't set width of unknown symbol $sym on line $.\n"; 364 } else { 365 $width{$sym} = $w; 366 } 367 } elsif (m/^\s*<([\w\-]+)>\s*\Q...\E\s*<([\w\-]+)>\s+(\d+)\s*$/) { 368 my($ss, $se, $w) = ($1, $2, $3); 369 if (!@syms) { 370 @syms = sort { $a cmp $b } keys(%sym); 371 } 372 373 # Yes, we could do a binary search for find $ss in @syms 374 foreach my $s (@syms) { 375 if (($s cmp $ss) >= 0) { 376 last if (($s cmp $se) > 0); 377 } 378 } 379 } elsif (m/^\s*WIDTH_DEFAULT\s+(\d+)\s*$/) { 380 $default = $1; 381 } elsif (m/^\s*END\s+WIDTH\s*$/) { 382 last; 383 } elsif (m/^\s*($comment_char.*)?$/) { 384 } else { 385 &exit(4, "syntax error on line $."); 386 } 387 } 388 389 foreach my $s (keys(%sym)) { 390 if (!defined($width{$s})) { 391 $width{$s} = $default; 392 } 393 } 394} 395 396# This parses a single value in any of the 7 forms it can appear in, 397# returns [0] the parsed value and [1] the remander of the string 398sub parse_value_return_extra { 399 my $val = ""; 400 local($_) = $_[0]; 401 402 while(1) { 403 $val .= &unsym($1), next 404 if (m/\G"((?:[^"\Q$escape_char\E]+|\Q$escape_char\E.)*)"/gc); 405 $val .= chr(oct($1)), next 406 if (m/\G\Q$escape_char\E([0-7]+)/gc); 407 $val .= chr(0+$1), next 408 if (m/\G\Q$escape_char\Ed([0-9]+)/gc); 409 $val .= pack("H*", $1), next 410 if (m/\G\Q$escape_char\Ex([0-9a-fA-F]+)/gc); 411 $val .= $1, next 412 if (m/\G([^,;<>\s\Q$escape_char()\E])/gc); 413 $val .= $1 414 if (m/\G(?:\Q$escape_char\E)([,;<>\Q$escape_char()\E])/gc); 415 $val .= &unsym($1), next 416 if (m/\G(<[^>]+>)/gc); 417 418 m/\G(.*)$/; 419 420 return ($val, $1); 421 } 422} 423 424# Parse one value, if there is more then one value alert the media 425sub parse_value { 426 my ($ret, $err) = &parse_value_return_extra($_[0]); 427 if ($err ne "") { 428 &exit(4, "Syntax error, unexpected '$err' in value (after '$ret') on line $.\n"); 429 } 430 431 return $ret; 432} 433 434sub parse_value_double_backwhack { 435 my($val) = @_; 436 437 my ($ret, $err) = &parse_value_return_extra($val); 438 return $ret if ($err eq ""); 439 440 $val =~ s{\\\\}{\\}g; 441 ($ret, $err) = &parse_value_return_extra($val); 442 if ($err ne "") { 443 &exit(4, "Syntax error, unexpected '$err' in value (after '$ret') on line $.\n"); 444 } 445 446 return $ret; 447} 448# $values is the string to parse, $dot_expand is a function ref that will 449# return an array to insert when "X;...;Y" is parsed (undef means that 450# construct is a syntax error), $nest is true if parens indicate a nested 451# value string should be parsed and put in an array ref, $return_extra 452# is true if any unparsable trailing junk should be returned as the last 453# element (otherwise it is a syntax error). Any text matching the regex 454# $specials is returned as an hash. 455sub parse_values { 456 my($values, $sep, $dot_expand, $nest, $return_extra, $specials) = @_; 457 my(@ret, $live_dots); 458 459 while($values ne "") { 460 if (defined($specials) && $values =~ s/^($specials)($sep|$)//) { 461 push(@ret, { $1, undef }); 462 next; 463 } 464 if ($nest && $values =~ s/^\(//) { 465 my @subret = &parse_values($values, ',', $dot_expand, $nest, 1, $specials); 466 $values = pop(@subret); 467 push(@ret, [@subret]); 468 unless ($values =~ s/^\)($sep)?//) { 469 &exit(4, "Syntax error, unmatched open paren on line $. of $opt{i}\n"); 470 } 471 next; 472 } 473 474 my($v, $l) = &parse_value_return_extra($values); 475 $values = $l; 476 477 if ($live_dots) { 478 splice(@ret, -1, 1, &{$dot_expand}($ret[$#ret], $v)); 479 $live_dots = 0; 480 } else { 481 push(@ret, $v); 482 } 483 484 if (defined($dot_expand) && $values =~ s/^$sep\Q...\E$sep//) { 485 $live_dots = 1; 486 } elsif($values =~ s/^$sep//) { 487 # Normal case 488 } elsif($values =~ m/^$/) { 489 last; 490 } else { 491 last if ($return_extra); 492 &exit(4, "Syntax error parsing arguments on line $. of $opt{i}\n"); 493 } 494 } 495 496 if ($live_dots) { 497 splice(@ret, -1, 1, &{$dot_expand}($ret[$#ret], undef)); 498 } 499 if ($return_extra) { 500 push(@ret, $values); 501 } 502 503 return @ret; 504} 505 506sub parse_LC_NONE { 507 my($cmd, $arg) = @_; 508 509 if ($cmd eq "comment_char") { 510 $comment_char = &parse_value($arg); 511 } elsif($cmd eq "escape_char") { 512 &set_escape_char(&parse_value($arg)); 513 } elsif($cmd eq "") { 514 } else { 515 &exit(4, "Syntax error on line $. of $opt{i}\n"); 516 } 517} 518 519sub parse_LC_CTYPE { 520 my($cmd, $arg) = @_; 521 522 my $ctype_classes = join("|", keys(%ctype_classes)); 523 if ($cmd eq "copy") { 524 # XXX -- the locale command line utility doesn't currently 525 # output any LC_CTYPE info, so there isn't much of a way 526 # to implent copy yet 527 &exit(2, "copy not supported on line $. of $opt{i}\n"); 528 } elsif($cmd eq "charclass") { 529 my $cc = &parse_value($arg); 530 if (!defined($ctype_classes{$cc})) { 531 $ctype_classes{$cc} = []; 532 } else { 533 warn "charclass $cc defined more then once\n"; 534 } 535 } elsif($cmd =~ m/^to(upper|lower)$/) { 536 my @arg = &parse_values($arg, ';', undef, 1); 537 foreach my $p (@arg) { 538 &exit(4, "Syntax error on line $. of $opt{i} ${cmd}'s arguments must be character pairs like (a,A);(b,B)\n") if ("ARRAY" ne ref $p || 2 != @$p); 539 } 540 foreach my $pair (@arg) { 541 $ctype_classes{$cmd}{$pair->[0]} = $pair->[1]; 542 } 543 } elsif($cmd =~ m/^($ctype_classes)$/) { 544 my @arg = &parse_values($arg, ';', \&dot_expand, 0); 545 foreach my $c (@arg) { 546 $ctype_classes{$1}->{$c} = 1; 547 } 548 } elsif($cmd =~ "END") { 549 &add_to_ctype_class('alpha', keys(%{$ctype_classes{'lower'}})); 550 &add_to_ctype_class('alpha', keys(%{$ctype_classes{'upper'}})); 551 foreach my $c (qw(alpha lower upper)) { 552 foreach my $d (qw(cntrl digit punct space)) { 553 &deny_in_ctype_class($c, $d, keys(%{$ctype_classes{$d}})); 554 } 555 } 556 557 &add_to_ctype_class('space', keys(%{$ctype_classes{'blank'}})); 558 foreach my $d (qw(upper lower alpha digit graph xdigit)) { 559 &deny_in_ctype_class('space', $d, keys(%{$ctype_classes{$d}})); 560 } 561 562 foreach my $d (qw(upper lower alpha digit punct graph print xdigit)) { 563 &deny_in_ctype_class('cntrl', $d, keys(%{$ctype_classes{$d}})); 564 } 565 566 foreach my $d (qw(upper lower alpha digit cntrl xdigit space)) { 567 &deny_in_ctype_class('punct', $d, keys(%{$ctype_classes{$d}})); 568 } 569 570 foreach my $c (qw(graph print)) { 571 foreach my $a (qw(upper lower alpha digit xdigit punct)) { 572 &add_to_ctype_class($c, keys(%{$ctype_classes{$a}})); 573 } 574 foreach my $d (qw(cntrl)) { 575 &deny_in_ctype_class($c, $d, keys(%{$ctype_classes{$d}})); 576 } 577 } 578 &add_to_ctype_class('print', keys(%{$ctype_classes{'space'}})); 579 580 # Yes, this is a requirment of the standard 581 &exit(2, "The digit class must have exactly 10 elements\n") if (10 != values(%{$ctype_classes{'digit'}})); 582 foreach my $d (values %{$ctype_classes{'digit'}}) { 583 if (!defined $ctype_classes{'xdigits'}->{$d}) { 584 &exit(4, "$d isn't in class xdigits, but all digits must appaer in xdigits\n"); 585 } 586 } 587 588 $ctype_classes{'alnum'} = {} unless defined $ctype_classes{'alnum'}; 589 foreach my $a (qw(alpha digit)) { 590 &add_to_ctype_class('alnum', keys(%{$ctype_classes{$a}})); 591 } 592 593 } else { 594 &exit(4, "Syntax error on line $. of $opt{i}\n"); 595 } 596} 597 598sub parse_LC_COLLATE { 599 my ($cmd, $arg) = @_; 600 if (defined($arg) && $arg ne "") { 601 push(@colldef, "$cmd $arg"); 602 } else { 603 push(@colldef, "$cmd"); 604 } 605} 606 607sub parse_collate_order { 608 my($cmd, $arg) = @_; 609 610 if ($cmd =~ m/order[-_]end/) { 611 # restore the parent parser 612 &set_parser("COLLATE"); 613 my $undef_at; 614 for(my $i = 0; $i <= $#corder; ++$i) { 615 next unless "ARRAY" eq ref($corder[$i]); 616 # If ... appears as the "key" for a order entry it means the 617 # rest of the line is duplicated once for everything in the 618 # open ended range (key-pev-line, key-next-line). Any ... 619 # in the weight fields are delt with by &fixup_collate_order_args 620 if ($corder[$i]->[0] eq "...") { 621 my(@sym, $from, $to); 622 623 my @charset = sort { $sym{$a} cmp $sym{$b} } keys(%sym); 624 if ($i != 0) { 625 $from = $corder[$i -1]->[0]; 626 } else { 627 $from = $charset[0]; 628 } 629 if ($i != $#corder) { 630 $to = $corder[$i +1]->[0]; 631 } else { 632 $to = $charset[$#charset]; 633 } 634 635 my @expand; 636 my($s, $e) = (&parse_value($from), &parse_value($to)); 637 foreach my $c (@charset) { 638 if (($sym{$c} cmp $s) > 0) { 639 last if (($sym{$c} cmp $e) >= 0); 640 my @entry = @{$corder[$i]}; 641 $entry[0] = "<$c>"; 642 push(@expand, \@entry); 643 } 644 } 645 splice(@corder, $i, 1, @expand); 646 } elsif($corder[$i]->[0] eq "UNDEFINED") { 647 $undef_at = $i; 648 next; 649 } 650 &fixup_collate_order_args($corder[$i]); 651 } 652 653 if ($undef_at) { 654 my @insert; 655 my %cused = map { ("ARRAY" eq ref $_) ? ($_->[0], undef) : () } @corder; 656 foreach my $s (keys(%sym)) { 657 next if (exists $cused{"<$s>"}); 658 my @entry = @{$corder[$undef_at]}; 659 $entry[0] = "<$s>"; 660 &fixup_collate_order_args(\@entry); 661 push(@insert, \@entry); 662 } 663 splice(@corder, $undef_at, 1, @insert); 664 } 665 } elsif((!defined $arg) || $arg eq "") { 666 if (!exists($csym{$cmd})) { 667 my($decode, $was_sym) = &unsym_with_check($cmd); 668 if ($was_sym) { 669 my %dots = ( "..." => undef ); 670 my @dots = (\%dots) x (0+@corder_weights); 671 push(@corder, [$cmd, @dots]); 672 } else { 673 warn "Undefined collation symbol $cmd used on line $. of $opt{i}\n"; 674 } 675 } else { 676 push(@corder, $cmd); 677 } 678 } else { 679 unless (defined($cele{$cmd} || defined $sym{$cmd})) { 680 warn "Undefined collation element or charset sym $cmd used on line $. of $opt{i}\n"; 681 } else { 682 # This expands all the symbols (but not colating elements), which 683 # makes life easier for dealing with ..., but harder for 684 # outputing the actual table at the end where we end up 685 # converting literal sequences back into symbols in some cases 686 my @args = &parse_values($arg, ';', undef, 0, 0, 687 qr/IGNORE|\Q...\E/); 688 689 if (@args != @corder_weights) { 690 if (@args < @corder_weights) { 691 &exit(4, "Only " . (0 + @args) 692 . " weights supplied on line $. of $opt{i}, needed " 693 . (0 + @corder_weights) 694 . "\n"); 695 } else { 696 &exit(4, "Too many weights supplied on line $. of $opt{i}," 697 . " wanted " . (0 + @corder_weights) . " but had " 698 . (0 + @args) 699 . "\n"); 700 } 701 } 702 703 push(@corder, [$cmd, @args]); 704 } 705 } 706} 707 708sub parse_LC_MONETARY { 709 my($cmd, $arg) = @_; 710 711 if ($cmd eq "copy") { 712 &do_copy(&parse_value($arg)); 713 } elsif($cmd eq "END") { 714 } elsif($cmd eq "mon_grouping") { 715 my @v = &parse_values($arg, ';', undef, 0); 716 $monetary{$cmd} = \@v; 717 } else { 718 my $v = &parse_value($arg); 719 $monetary{$cmd} = $v; 720 } 721} 722 723sub parse_LC_MESSAGES { 724 my($cmd, $arg) = @_; 725 726 if ($cmd eq "copy") { 727 &do_copy(&parse_value($arg)); 728 } elsif($cmd eq "END") { 729 } else { 730 my $v = &parse_value($arg); 731 $messages{$cmd} = $v; 732 } 733} 734 735sub parse_LC_NUMERIC { 736 my($cmd, $arg) = @_; 737 738 if ($cmd eq "copy") { 739 &do_copy(&parse_value($arg)); 740 } elsif($cmd eq "END") { 741 } elsif($cmd eq "grouping") { 742 my @v = &parse_values($arg, ';', undef, 0); 743 $numeric{$cmd} = \@v; 744 } else { 745 my $v = &parse_value($arg); 746 $numeric{$cmd} = $v; 747 } 748} 749 750sub parse_LC_TIME { 751 my($cmd, $arg) = @_; 752 753 $cmd =~ s/^ab_day$/abday/; 754 755 if ($cmd eq "copy") { 756 &do_copy(&parse_value($arg)); 757 } elsif($cmd eq "END") { 758 } elsif($cmd =~ m/abday|day|mon|abmon|am_pm|alt_digits/) { 759 my @v = &parse_values($arg, ';', undef, 0); 760 $time{$cmd} = \@v; 761 } elsif($cmd eq "era") { 762 my @v = &parse_values($arg, ':', undef, 0); 763 $time{$cmd} = \@v; 764 } else { 765 my $v = &parse_value($arg); 766 $time{$cmd} = $v; 767 } 768} 769 770 771############################################################################### 772 773sub run_mklocale { 774 my $L = (new IO::File "|/usr/bin/mklocale -o $locale_dir/LC_CTYPE") || &exit(5, "$0: Can't start mklocale $!\n"); 775 if (defined($opt{'u'})) { 776 $L->print(qq{ENCODING "$opt{u}"\n}); 777 } else { 778 if ($ARGV[0] =~ m/(big5|euc|gb18030|gb2312|gbk|mskanji|utf-8)/i) { 779 my $enc = uc($1); 780 $L->print(qq{ENCODING "$enc"\n}); 781 } elsif($ARGV[0] =~ m/utf8/) { 782 $L->print(qq{ENCODING "UTF-8"\n}); 783 } else { 784 $L->print(qq{ENCODING "NONE"\n}); 785 } 786 } 787 foreach my $class (keys(%ctype_classes)) { 788 unless ($class =~ m/^(tolower|toupper|alpha|control|digit|grah|lower|space|upper|xdigit|blank|print|ideogram|special|phonogram)$/) { 789 $L->print("# skipping $class\n"); 790 next; 791 } 792 793 if (!%{$ctype_classes{$class}}) { 794 $L->print("# Nothing in \U$class\n"); 795 next; 796 } 797 798 if ($class =~ m/^to/) { 799 my $t = $class; 800 $t =~ s/^to/map/; 801 $L->print("\U$t "); 802 803 foreach my $from (keys(%{$ctype_classes{$class}})) { 804 $L->print("[", &hexchars($from), " ", 805 &hexchars($ctype_classes{$class}->{$from}), "] "); 806 } 807 } else { 808 $L->print("\U$class "); 809 810 foreach my $rune (keys(%{$ctype_classes{$class}})) { 811 $L->print(&hexchars($rune), " "); 812 } 813 } 814 $L->print("\n"); 815 } 816 817 my @width; 818 foreach my $s (keys(%width)) { 819 my $w = $width{$s}; 820 $w = 3 if ($w > 3); 821 push(@{$width[$w]}, &hexchars($sym{$s})); 822 } 823 for(my $w = 0; $w <= $#width; ++$w) { 824 next if (!defined $width[$w]); 825 next if (0 == @{$width[$w]}); 826 $L->print("SWIDTH$w ", join(" ", @{$width[$w]}), "\n"); 827 } 828 829 if (!$L->close()) { 830 if (0 == $!) { 831 &exit(5, "Bad return from mklocale $?"); 832 } else { 833 &exit(5, "Couldn't close mklocale pipe: $!"); 834 } 835 } 836} 837 838############################################################################### 839 840sub hexchars { 841 my($str) = $_[0]; 842 my($ret); 843 844 $ret = unpack "H*", $str; 845 &exit(2, "Rune >4 bytes ($ret; for $str)") if (length($ret) > 8); 846 847 return "0x" . $ret; 848} 849 850sub hexseq { 851 my($str) = $_[0]; 852 my($ret); 853 854 $ret = unpack "H*", $str; 855 $ret =~ s/(..)/\\x$1/g; 856 857 return $ret; 858} 859 860# dot_expand in the target charset 861sub dot_expand { 862 my($s, $e) = @_; 863 my(@ret); 864 865 my @charset = sort { $a cmp $b } values(%sym); 866 foreach my $c (@charset) { 867 if (($c cmp $s) >= 0) { 868 last if (($c cmp $e) > 0); 869 push(@ret, $c); 870 } 871 } 872 873 return @ret; 874} 875 876# Convert symbols into literal values 877sub unsym { 878 my @ret = &unsym_with_check(@_); 879 return $ret[0]; 880} 881 882# Convert symbols into literal values (return[0]), and a count of how 883# many symbols were converted (return[1]). 884sub unsym_with_check { 885 my($str) = $_[0]; 886 887 my $rx = join("|", keys(%sym)); 888 return ($str, 0) if ($rx eq ""); 889 my $found = $str =~ s/<($rx)>/$sym{$1}/eg; 890 891 return ($str, $found); 892} 893 894# Convert a string of literals back into symbols. It is an error 895# for there to be literal values that can't be mapped back. The 896# converter uses a gredy algo. It is likely this could be done 897# more efficently with a regex ctrated at runtime. It would also be 898# a good idea to only create %rsym if %sym changes, but that isn't 899# the simplest thing to do in perl5. 900sub resym { 901 my($str) = $_[0]; 902 my(%rsym, $k, $v); 903 my $max_len = 0; 904 my $ret = ""; 905 906 while(($k, $v) = each(%sym)) { 907 # Collisions in $v are ok, we merely need a mapping, not the 908 # identical mapping 909 $rsym{$v} = $k; 910 $max_len = length($v) if (length($v) > $max_len); 911 } 912 913 SYM: while("" ne $str) { 914 foreach my $l ($max_len .. 1) { 915 next if ($l > length($str)); 916 my $s = substr($str, 0, $l); 917 if (defined($rsym{$s})) { 918 $ret .= "<" . $rsym{$s} . ">"; 919 substr($str, 0, $l) = ""; 920 next SYM; 921 } 922 } 923 &exit(4, "Can't convert $str ($_[0]) back into symbolic form\n"); 924 } 925 926 return $ret; 927} 928 929sub set_escape { 930 $escape_char = $_[0]; 931 $val_match = qr/"(?:[^"\Q$escape_char\E]+|\Q$escape_char\E")+"|(?:\Q$escape_char\E(?:[0-7]+|d[0-9]+|x[0-9a-fA-F]+))|[^,;<>\s\Q$escape_char\E]|(?:\Q$escape_char\E)[,;<>\Q$escape_char\E]/; 932} 933 934sub set_parser { 935 my $section = $_[0]; 936 ($current_LC, $parse_func, $validate_line, $call_parse_on_END) 937 = ($section, $LC_parsers{$section}->[0], $LC_parsers{$section}->[1], 938 $LC_parsers{$section}->[2]); 939 unless (defined $parse_func) { 940 &exit(4, "Unknown section name LC_$section on line $. of $opt{i}\n"); 941 } 942} 943 944sub do_copy { 945 my($from) = @_; 946 local($ENV{LC_ALL}) = $from; 947 948 my $C = (new IO::File "/usr/bin/locale -k LC_$current_LC |") || &exit(5, "can't fork locale during copy of LC_$current_LC"); 949 while(<$C>) { 950 if (s/=\s*$/ ""/ || s/=/ /) { 951 if (m/$validate_line/ && m/^\s*(\S*)(\s+(\S+.*?))?\s*$/) { 952 my($action, $args) = ($1, $3); 953 &{$parse_func}($action, $args); 954 } else { 955 &exit(4, "Syntax error on line $. of locale -k output" 956 . " during copy $current_LC\n"); 957 } 958 } else { 959 &exit(4, "Ill-formed line $. from locale -k during copy $current_LC\n"); 960 } 961 } 962 $C->close() || &exit(5, "copying LC_$current_LC from $from failed"); 963} 964 965sub fixup_collate_order_args { 966 my $co = $_[0]; 967 968 foreach my $s (@{$co}[1..$#{$co}]) { 969 if ("HASH" eq ref($s) && exists($s->{"..."})) { 970 $s = $co->[0]; 971 } 972 } 973} 974 975sub add_to_ctype_class { 976 my($class, @runes) = @_; 977 978 my $c = $ctype_classes{$class}; 979 foreach my $r (@runes) { 980 $c->{$r} = 2 unless exists $c->{$r}; 981 } 982} 983 984sub deny_in_ctype_class { 985 my($class, $deny_reason, @runes) = @_; 986 987 my $c = $ctype_classes{$class}; 988 foreach my $r (@runes) { 989 next unless exists $c->{$r}; 990 $deny_reason =~ s/^(\S+)$/can't belong in class $class and in class $1 at the same time/; 991 &exit(4, &hexchars($r) . " " . $deny_reason . "\n"); 992 } 993} 994 995# write_lc_{money,time,messages} all use the existing Libc format, which 996# is raw text with each record terminated by a newline, and records 997# in a predetermined order. 998 999sub write_lc_money { 1000 my $F = (new IO::File "$locale_dir/LC_MONETARY", O_TRUNC|O_WRONLY|O_CREAT, 0666) || &exit(4, "$0 can't create $locale_dir/LC_MONETARY: $!"); 1001 foreach my $s (qw(int_curr_symbol currency_symbol mon_decimal_point mon_thousands_sep mon_grouping positive_sign negative_sign int_frac_digits frac_digits p_cs_precedes p_sep_by_space n_cs_precedes n_sep_by_space p_sign_posn n_sign_posn int_p_cs_precedes int_n_cs_precedes int_p_sep_by_space int_n_sep_by_space int_p_sign_posn int_n_sign_posn)) { 1002 if (exists $monetary{$s}) { 1003 my $v = $monetary{$s}; 1004 if ("ARRAY" eq ref $v) { 1005 $F->print(join(";", @$v), "\n"); 1006 } else { 1007 $F->print("$v\n"); 1008 } 1009 } else { 1010 if ($s =~ m/^(int_curr_symbol|currency_symbol|mon_decimal_point|mon_thousands_sep|positive_sign|negative_sign)$/) { 1011 $F->print("\n"); 1012 } else { 1013 $F->print("-1\n"); 1014 } 1015 } 1016 } 1017} 1018 1019sub write_lc_time { 1020 my $F = (new IO::File "$locale_dir/LC_DATE", O_TRUNC|O_WRONLY|O_CREAT, 0666) || &exit(4, "$0 can't create $locale_dir/LC_DATE: $!"); 1021 my %array_cnt = (abmon => 12, mon => 12, abday => 7, day => 7, alt_month => 12, am_pm => 2); 1022 1023 $time{"md_order"} = "md" unless defined $time{"md_order"}; 1024 1025 foreach my $s (qw(abmon mon abday day t_fmt d_fmt d_t_fmt am_pm d_t_fmt mon md_order t_fmt_ampm)) { 1026 my $cnt = $array_cnt{$s}; 1027 my $v = $time{$s}; 1028 1029 if (defined $v) { 1030 if (defined $cnt) { 1031 my @a = @{$v}; 1032 &exit(4, "$0: $s has " . (0 + @a) 1033 . " elements, it needs to have exactly $cnt\n") 1034 unless (@a == $cnt); 1035 $F->print(join("\n", @a), "\n"); 1036 } else { 1037 $F->print("$v\n"); 1038 } 1039 } else { 1040 $cnt = 1 if !defined $cnt; 1041 $F->print("\n" x $cnt); 1042 } 1043 } 1044} 1045 1046sub write_lc_messages { 1047 mkdir("$locale_dir/LC_MESSAGES"); 1048 my $F = (new IO::File "$locale_dir/LC_MESSAGES/LC_MESSAGES", O_TRUNC|O_WRONLY|O_CREAT, 0666) || &exit(4, "$0 can't create $locale_dir/LC_DATE: $!"); 1049 1050 foreach my $s (qw(yesexpr noexpr yesstr nostr)) { 1051 my $v = $messages{$s}; 1052 1053 if (defined $v) { 1054 $F->print("$v\n"); 1055 } else { 1056 $F->print("\n"); 1057 } 1058 } 1059} 1060 1061sub bylenval { 1062 return 0 if ("ARRAY" ne ref $a || "ARRAY" ne ref $b); 1063 1064 my($aval, $af) = &unsym_with_check($a->[0]); 1065 $aval = $cele{$a->[0]} unless $af; 1066 my($bval, $bf) = &unsym_with_check($b->[0]); 1067 $bval = $cele{$b->[0]} unless $bf; 1068 1069 my $r = length($aval) - length($bval); 1070 return $r if $r; 1071 return $aval cmp $bval; 1072} 1073 1074sub write_lc_collate { 1075 return unless @colldef; 1076 1077 # colldef doesn't parse the whole glory of SuSv3 charmaps, and we 1078 # already have, so we cna spit out a simplifyed one; unfortunitly 1079 # it doesn't like "/dev/fd/N" so we need a named tmp file 1080 my($CMAP, $cmapname) = tempfile(DIR => "/tmp"); 1081 foreach my $s (keys(%sym)) { 1082 $CMAP->print("<$s>\t", sprintf "\\x%02x\n", ord($sym{$s})); 1083 } 1084 $CMAP->flush(); 1085 unshift(@colldef, qq{charmap $cmapname}); 1086 unshift(@colldef, "LC_COLLATE"); 1087 $colldef[$#colldef] = "END LC_COLLATE"; 1088 1089 # Can't just use /dev/stdin, colldef appears to use seek, 1090 # and even seems to need a named temp file (re-open?) 1091 my($COL, $colname) = tempfile(DIR => "/tmp"); 1092 $COL->print(join("\n", @colldef), "\n"); 1093 $COL->flush(); 1094 1095 my $rc = system( 1096 "/usr/bin/colldef -o $locale_dir/LC_COLLATE $colname"); 1097 unlink $colname, $cmapname; 1098 if ($rc) { 1099 &exit(1, "Bad return from colldef $rc"); 1100 } 1101} 1102 1103# Pack an int of unknown size into a series of bytes, each of which 1104# contains 7 bits of data, and the top bit is clear on the last 1105# byte of data. Also works on arrays -- does not encode the size of 1106# the array. This format is great for data that tends to have fewer 1107# then 21 bits. 1108sub pack_p_int { 1109 if (@_ > 1) { 1110 my $ret = ""; 1111 foreach my $v (@_) { 1112 $ret .= &pack_p_int($v); 1113 } 1114 1115 return $ret; 1116 } 1117 1118 my $v = $_[0]; 1119 my $b; 1120 1121 &exit(4, "pack_p_int only works on positive values") if ($v < 0); 1122 if ($v < 128) { 1123 $b = chr($v); 1124 } else { 1125 $b = chr(($v & 0x7f) | 0x80); 1126 $b .= pack_p_int($v >> 7); 1127 } 1128 return $b; 1129} 1130 1131sub strip_angles { 1132 my $s = $_[0]; 1133 $s =~ s/^<(.*)>$/$1/; 1134 return $s; 1135} 1136 1137# For localedef 1138# xc=0 "no warnings, locale defined" 1139# xc=1 "warnings, locale defined" 1140# xc=2 "implmentation limits or unsupported charactor sets, no locale defined" 1141# xc=3 "can't create new locales" 1142# xc=4+ "wornings or errors, no locale defined" 1143sub exit { 1144 my($xc, $message) = @_; 1145 1146 print STDERR $message; 1147 exit $xc; 1148} 1149