1#!/usr/bin/perl -w 2 3use strict; 4use Getopt::Std; 5use Fcntl qw(O_TRUNC O_CREAT O_WRONLY SEEK_SET); 6use File::Temp qw(tempfile); 7use IO::File; 8 9my %opt; 10getopts("cf:u:i:", \%opt); 11 12my $comment_char = "#"; 13my $escape_char = "\\"; 14my $val_match = undef; # set in set_escape 15my %sym = (); 16my %width = (); 17my %ctype_classes = ( 18 # there are the charactors that get automagically included, there is no 19 # standard way to avoid them. XXX even if you have a charset without 20 # some of these charactors defined! 21 22 # They are accessable in a regex via [:classname:], and libc has a 23 # isX() for most of these. 24 upper => {map { ($_, 1); } qw(A B C D E F G H I J K L M N O P Q R S T U V W X Y Z)}, 25 lower => {map { ($_, 1); } qw(a b c d e f g h i j k l m n o p q r s t u v w x y z)}, 26 alpha => {}, 27 #alnum => {}, 28 digit => {map { ($_, 1); } qw(0 1 2 3 4 5 6 7 8 9)}, 29 space => {}, 30 cntrl => {}, 31 punct => {}, 32 graph => {}, 33 print => {}, 34 xdigit => {map { ($_, 1); } qw(0 1 2 3 4 5 6 7 8 9 A B C D E F a b c d e f)}, 35 blank => {" " => 1, "\t" => 1}, 36 37 toupper => {map { ($_, "\U$_"); } qw(a b c d e f g h i j k l m n o p q r s t u v w x y z)}, 38 tolower => {map { ($_, "\L$_"); } qw(A B C D E F G H I J K L M N O P Q R S T U V W X Y Z)}, 39); 40 41my %cele = ( 42 # collating-elements -- these are a lot like %sym that only works 43 # in LC_COLLATE, can also be accessed in a regex via [.element.] 44); 45 46my %csym = ( 47 # collating-symbols -- these are used to define a set of charactors 48 # that compare as equals (in one or more passes), can also be accessed 49 # in a regex via [=symbol=] 50); 51 52my @corder = (); # collating order 53my @corder_weights = (); # collating directions (forward, backward, position) 54 55my @colldef = (); 56 57my(%monetary, %numeric, %time, %messages); 58 59# This is the default charmap, unlike %ctype_classes you _can_ avoid this 60# merely by having your own charmap definition file 61my $default_charmap = <<EOT; 62CHARMAP 63<NUL> \\000 64<alert> \\007 65<backspace> \\010 66<tab> \\011 67<newline> \\012 68<vertical-tab> \\013 69<form-feed> \\014 70<carriage-return> \\015 71<space> \\040 72<exclamation-mark> \\041 73<quotation-mark> \\042 74<number-sign> \\043 75<dollar-sign> \\044 76<percent-sign> \\045 77<ampersand> \\046 78<apostrophe> \\047 79<left-parenthesis> \\050 80<right-parenthesis> \\051 81<asterisk> \\052 82<plus-sign> \\053 83<comma> \\054 84<hyphen> \\055 85<hyphen-minus> \\055 86<period> \\056 87<full-stop> \\056 88<slash> \\057 89<solidus> \\057 90<zero> \\060 91<one> \\061 92<two> \\062 93<three> \\063 94<four> \\064 95<five> \\065 96<six> \\066 97<seven> \\067 98<eight> \\070 99<nine> \\071 100<colon> \\072 101<semicolon> \\073 102<less-then-sign> \\074 103<less-than-sign> \\074 104<equals-sign> \\075 105<greater-then-sign> \\076 106<greater-than-sign> \\076 107<question-mark> \\077 108<commercial-at> \\100 109<A> \\101 110<B> \\102 111<C> \\103 112<D> \\104 113<E> \\105 114<F> \\106 115<G> \\107 116<H> \\110 117<I> \\111 118<J> \\112 119<K> \\113 120<L> \\114 121<M> \\115 122<N> \\116 123<O> \\117 124<P> \\120 125<Q> \\121 126<R> \\122 127<S> \\123 128<T> \\124 129<U> \\125 130<V> \\126 131<W> \\127 132<X> \\130 133<Y> \\131 134<Z> \\132 135<left-square-bracket> \\133 136<backslash> \\134 137<reverse-solidus> \\134 138<right-square-bracket> \\135 139<circumflex> \\136 140<circumflex-accent> \\136 141<underscore> \\137 142<underline> \\137 143<low-line> \\137 144<grave-accent> \\140 145<a> \\141 146<b> \\142 147<c> \\143 148<d> \\144 149<e> \\145 150<f> \\146 151<g> \\147 152<h> \\150 153<i> \\151 154<j> \\152 155<k> \\153 156<l> \\154 157<m> \\155 158<n> \\156 159<o> \\157 160<p> \\160 161<q> \\161 162<r> \\162 163<s> \\163 164<t> \\164 165<u> \\165 166<v> \\166 167<w> \\167 168<x> \\170 169<y> \\171 170<z> \\172 171<left-brace> \\173 172<left-curly-bracket> \\173 173<vertical-line> \\174 174<right-brace> \\175 175<right-curly-bracket> \\175 176<tilde> \\176 177<DEL> \\177 178 179<SOH> \\x01 180<STX> \\x02 181<ETX> \\x03 182<EOT> \\x04 183<ENQ> \\x05 184<ACK> \\x06 185<BEL> \\x07 186<BS> \\x08 187<HT> \\x09 188<NL> \\x0a 189<VT> \\x0b 190<NP> \\x0c 191<CR> \\x0d 192<SO> \\x0e 193<SI> \\x0f 194<DLE> \\x10 195<DC1> \\x11 196<DC2> \\x12 197<DC3> \\x13 198<DC4> \\x14 199<NAK> \\x15 200<SYN> \\x16 201<ETB> \\x17 202<CAN> \\x18 203<EM> \\x19 204<SUB> \\x1a 205<ESC> \\x1b 206<FS> \\x1c 207<IS4> \\x1c 208<GS> \\x1d 209<IS3> \\x1d 210<RS> \\x1e 211<IS2> \\x1e 212<US> \\x1f 213<IS1> \\x1f 214END CHARMAP 215EOT 216 217&set_escape($escape_char); 218 219use strict qw(vars); 220 221if (@ARGV != 1) { 222 &exit(4, "usage: $0 [-c] [-f charmap-file] [-u codesetname] [-i localdef-file] LOCALENAME\n"); 223} 224 225my $locale_dir = $ARGV[0]; 226$locale_dir = "/usr/share/locale/$locale_dir" unless ($locale_dir =~ m{/}); 227 228my $CMAP; 229if (defined($opt{'f'})) { 230 # Using new IO::File $opt{'f'}, "r" runs into problems with long path names 231 sysopen(CMAP_KLUDGE, $opt{'f'}, O_RDONLY) || &exit(4, "Can't open $opt{f}: $!\n"); 232 $CMAP = new IO::Handle; 233 $CMAP->fdopen(fileno(CMAP_KLUDGE), "r") || &exit(4, "Can't fdopen $opt{f}: $!\n"); 234} else { 235 # er, not everyone gets IO::Scalar, so use an unamed tmp file 236 # $CMAP = new IO::Scalar \$default_charmap; 237 $CMAP = new_tmpfile IO::File; 238 print $CMAP $default_charmap; 239 seek $CMAP, 0, SEEK_SET; 240} 241 242while(<$CMAP>) { 243 if (m/^\s*CHARMAP\s*$/) { 244 &parse_charmaps(); 245 } elsif (m/^\s*WIDTH\s*$/) { 246 &parse_widths(); 247 } elsif (m/^\s*($comment_char.*)?$/) { 248 } else { 249 chomp; 250 &exit(4, "syntax error on line $. ($_)"); 251 } 252} 253&parse_widths() if (0 == %width); 254 255if (defined($opt{'i'})) { 256 sysopen(STDIN, $opt{'i'}, 0) || &exit(4, "Can't open localdef file $opt{i}: $!"); 257} else { 258 $opt{'i'} = "/dev/stdin"; 259} 260 261my %LC_parsers = ( 262 NONE => [\&parse_LC_NONE, qr/^\s*((escape|comment)_char\s+$val_match\s*)?$/], 263 CTYPE => [\&parse_LC_CTYPE, qr/^\s*(\S+)\s+(\S+.*?)\s*$/], 264 COLLATE => [\&parse_LC_COLLATE, qr/^\s*(<[^>\s]+>|order_end|END|(\S*)\s+(\S+.*?)|collating[_-]element\s*<[^>]+>\s+from\s+$val_match)\s*$/, 1], 265 TIME => [\&parse_LC_TIME, qr/^\s*(ab_?day|day|abmon|mon|d_t_fmt|d_fmt|t_fmt|am_pm|t_fmt_ampm|era|era_d_fmt|era_t_fmt|era_d_t_fmt|alt_digits|copy|END)\s+(\S+.*?)\s*$/], 266 NUMERIC => [\&parse_LC_NUMERIC, qr/^\s*(decimal_point|thousands_sep|grouping|END|copy)\s+(\S+.*?)\s*$/], 267 MONETARY => [\&parse_LC_MONETARY, qr/^\s*(int_curr_symbol|currency_symbol|mon_decimal_point|mon_thousands_sep|mon_grouping|positive_sign|negative_sign|int_frac_digits|frac_digits|p_cs_precedes|p_sep_by_space|n_cs_precedes|n_sep_by_space|p_sign_posn|n_sign_posn|int_p_cs_precedes|int_n_cs_precedes|int_p_sep_by_space|int_n_sep_by_space|int_p_sign_posn|int_n_sign_posn|copy|END)\s+(\S+.*?)\s*$/], 268 MESSAGES => [\&parse_LC_MESSAGES, qr/^\s*(END|yesexpr|noexpr|yesstr|nostr|copy)\s+(\S+.*?)\s*$/], 269 "COLLATE order" => [\&parse_collate_order, qr/^\s*(order_end|(<[^>\s]+>|UNDEFINED|\Q...\E)(\s+\S+.*)?)\s*$/], 270); 271my($current_LC, $parse_func, $validate_line, $call_parse_on_END) 272 = ("NONE", $LC_parsers{"NONE"}->[0], $LC_parsers{"NONE"}->[1], undef); 273 274while(<STDIN>) { 275 next if (m/^\s*($comment_char.*)?\s*$/); 276 if (m/\Q$escape_char\E$/) { 277 chomp; 278 chop; 279 my $tmp = <STDIN>; 280 if (!defined($tmp)) { 281 &exit(4, "Syntax error, last line ($.) of $opt{i} is marked as a continued line\n"); 282 } 283 $tmp =~ s/^\s*//; 284 $_ .= $tmp; 285 redo; 286 } 287 288 if ($current_LC eq "NONE" && m/^\s*LC_([A-Z]+)\s*$/) { 289 &set_parser($1); 290 next; 291 } 292 293 unless (m/$validate_line/) { 294 &exit(4, "Syntax error on line $. of $opt{i}\n"); 295 } 296 297 my($action, $args); 298 if (m/^\s*(\S*)(\s+(\S+.*?))?\s*$/) { 299 ($action, $args) = ($1, $3); 300 } else { 301 $action = $_; 302 chomp $action; 303 } 304 305 if ($action eq "END") { 306 if ($args ne "LC_$current_LC" || $current_LC eq "NONE") { 307 &exit(4, "Syntax error on line $. of $opt{i} attempting to end $args when LC_$current_LC is open\n"); 308 } 309 if ($call_parse_on_END) { 310 &{$parse_func}($action, $args); 311 } 312 &set_parser("NONE"); 313 } else { 314 &{$parse_func}($action, $args); 315 } 316} 317 318mkdir($locale_dir); 319&run_mklocale(); 320&write_lc_money(); 321&write_lc_time(); 322&write_lc_messages(); 323&write_lc_numeric(); 324&write_lc_collate(); 325exit 0; 326 327sub parse_charmaps { 328 while(<$CMAP>) { 329 # XXX need to parse out <code_set_name>, <mb_cur_max>, <mb_cur_min>, 330 # <escape_char>, and <comment_char> before the generic "<sym> val" 331 if (m/^\s*<([\w\-]+)>\s+($val_match+)\s*$/) { 332 my($sym, $val) = ($1, $2); 333 $val = &parse_value_double_backwhack($val); 334 $sym{$sym} = $val; 335 } elsif (m/^\s*<([\w\-]*\d)>\s*\Q...\E\s*<([\w\-]*\d)>\s+($val_match+)\s*$/) { 336 # We don't deal with $se < $ss, or overflow of the last byte of $vs 337 # then again the standard doesn't say anything in particular needs 338 # to happen for those cases 339 my($ss, $se, $vs) = ($1, $2, $3); 340 $vs = &parse_value_double_backwhack($vs); 341 my $vlast = length($vs) -1; 342 for(my($s, $v) = ($ss, $vs); $s cmp $se; $s++) { 343 $sym{$s} = $v; 344 substr($v, $vlast) = chr(ord(substr($v, $vlast)) +1) 345 } 346 } elsif (m/^\s*END\s+CHARMAP\s*$/) { 347 return; 348 } elsif (m/^\s*($comment_char.*)?$/) { 349 } else { 350 &exit(4, "syntax error on line $."); 351 } 352 } 353} 354 355sub parse_widths { 356 my $default = 1; 357 my @syms; 358 359 while(<$CMAP>) { 360 if (m/^\s*<([\w\-]+)>\s+(\d+)\s*$/) { 361 my($sym, $w) = ($1, $2); 362 print "$sym width $w\n"; 363 if (!defined($sym{$sym})) { 364 warn "localedef: can't set width of unknown symbol $sym on line $.\n"; 365 } else { 366 $width{$sym} = $w; 367 } 368 } elsif (m/^\s*<([\w\-]+)>\s*\Q...\E\s*<([\w\-]+)>\s+(\d+)\s*$/) { 369 my($ss, $se, $w) = ($1, $2, $3); 370 if (!@syms) { 371 @syms = sort { $a cmp $b } keys(%sym); 372 } 373 374 # Yes, we could do a binary search for find $ss in @syms 375 foreach my $s (@syms) { 376 if (($s cmp $ss) >= 0) { 377 last if (($s cmp $se) > 0); 378 } 379 } 380 } elsif (m/^\s*WIDTH_DEFAULT\s+(\d+)\s*$/) { 381 $default = $1; 382 } elsif (m/^\s*END\s+WIDTH\s*$/) { 383 last; 384 } elsif (m/^\s*($comment_char.*)?$/) { 385 } else { 386 &exit(4, "syntax error on line $."); 387 } 388 } 389 390 foreach my $s (keys(%sym)) { 391 if (!defined($width{$s})) { 392 $width{$s} = $default; 393 } 394 } 395} 396 397# This parses a single value in any of the 7 forms it can appear in, 398# returns [0] the parsed value and [1] the remander of the string 399sub parse_value_return_extra { 400 my $val = ""; 401 local($_) = $_[0]; 402 403 while(1) { 404 $val .= &unsym($1), next 405 if (m/\G"((?:[^"\Q$escape_char\E]+|\Q$escape_char\E.)*)"/gc); 406 $val .= chr(oct($1)), next 407 if (m/\G\Q$escape_char\E([0-7]+)/gc); 408 $val .= chr(0+$1), next 409 if (m/\G\Q$escape_char\Ed([0-9]+)/gc); 410 $val .= pack("H*", $1), next 411 if (m/\G\Q$escape_char\Ex([0-9a-fA-F]+)/gc); 412 $val .= $1, next 413 if (m/\G([^,;<>\s\Q$escape_char()\E])/gc); 414 $val .= $1 415 if (m/\G(?:\Q$escape_char\E)([,;<>\Q$escape_char()\E])/gc); 416 $val .= &unsym($1), next 417 if (m/\G(<[^>]+>)/gc); 418 419 m/\G(.*)$/; 420 421 return ($val, $1); 422 } 423} 424 425# Parse one value, if there is more then one value alert the media 426sub parse_value { 427 my ($ret, $err) = &parse_value_return_extra($_[0]); 428 if ($err ne "") { 429 &exit(4, "Syntax error, unexpected '$err' in value (after '$ret') on line $.\n"); 430 } 431 432 return $ret; 433} 434 435sub parse_value_double_backwhack { 436 my($val) = @_; 437 438 my ($ret, $err) = &parse_value_return_extra($val); 439 return $ret if ($err eq ""); 440 441 $val =~ s{\\\\}{\\}g; 442 ($ret, $err) = &parse_value_return_extra($val); 443 if ($err ne "") { 444 &exit(4, "Syntax error, unexpected '$err' in value (after '$ret') on line $.\n"); 445 } 446 447 return $ret; 448} 449# $values is the string to parse, $dot_expand is a function ref that will 450# return an array to insert when "X;...;Y" is parsed (undef means that 451# construct is a syntax error), $nest is true if parens indicate a nested 452# value string should be parsed and put in an array ref, $return_extra 453# is true if any unparsable trailing junk should be returned as the last 454# element (otherwise it is a syntax error). Any text matching the regex 455# $specials is returned as an hash. 456sub parse_values { 457 my($values, $sep, $dot_expand, $nest, $return_extra, $specials) = @_; 458 my(@ret, $live_dots); 459 460 while($values ne "") { 461 if (defined($specials) && $values =~ s/^($specials)($sep|$)//) { 462 push(@ret, { $1, undef }); 463 next; 464 } 465 if ($nest && $values =~ s/^\(//) { 466 my @subret = &parse_values($values, ',', $dot_expand, $nest, 1, $specials); 467 $values = pop(@subret); 468 push(@ret, [@subret]); 469 unless ($values =~ s/^\)($sep)?//) { 470 &exit(4, "Syntax error, unmatched open paren on line $. of $opt{i}\n"); 471 } 472 next; 473 } 474 475 my($v, $l) = &parse_value_return_extra($values); 476 $values = $l; 477 478 if ($live_dots) { 479 splice(@ret, -1, 1, &{$dot_expand}($ret[$#ret], $v)); 480 $live_dots = 0; 481 } else { 482 push(@ret, $v); 483 } 484 485 if (defined($dot_expand) && $values =~ s/^$sep\Q...\E$sep//) { 486 $live_dots = 1; 487 } elsif($values =~ s/^$sep//) { 488 # Normal case 489 } elsif($values =~ m/^$/) { 490 last; 491 } else { 492 last if ($return_extra); 493 &exit(4, "Syntax error parsing arguments on line $. of $opt{i}\n"); 494 } 495 } 496 497 if ($live_dots) { 498 splice(@ret, -1, 1, &{$dot_expand}($ret[$#ret], undef)); 499 } 500 if ($return_extra) { 501 push(@ret, $values); 502 } 503 504 return @ret; 505} 506 507sub parse_LC_NONE { 508 my($cmd, $arg) = @_; 509 510 if ($cmd eq "comment_char") { 511 $comment_char = &parse_value($arg); 512 } elsif($cmd eq "escape_char") { 513 &set_escape_char(&parse_value($arg)); 514 } elsif($cmd eq "") { 515 } else { 516 &exit(4, "Syntax error on line $. of $opt{i}\n"); 517 } 518} 519 520sub parse_LC_CTYPE { 521 my($cmd, $arg) = @_; 522 523 my $ctype_classes = join("|", keys(%ctype_classes)); 524 if ($cmd eq "copy") { 525 # XXX -- the locale command line utility doesn't currently 526 # output any LC_CTYPE info, so there isn't much of a way 527 # to implent copy yet 528 &exit(2, "copy not supported on line $. of $opt{i}\n"); 529 } elsif($cmd eq "charclass") { 530 my $cc = &parse_value($arg); 531 if (!defined($ctype_classes{$cc})) { 532 $ctype_classes{$cc} = []; 533 } else { 534 warn "charclass $cc defined more then once\n"; 535 } 536 } elsif($cmd =~ m/^to(upper|lower)$/) { 537 my @arg = &parse_values($arg, ';', undef, 1); 538 foreach my $p (@arg) { 539 &exit(4, "Syntax error on line $. of $opt{i} ${cmd}'s arguments must be character pairs like (a,A);(b,B)\n") if ("ARRAY" ne ref $p || 2 != @$p); 540 } 541 foreach my $pair (@arg) { 542 $ctype_classes{$cmd}{$pair->[0]} = $pair->[1]; 543 } 544 } elsif($cmd =~ m/^($ctype_classes)$/) { 545 my @arg = &parse_values($arg, ';', \&dot_expand, 0); 546 foreach my $c (@arg) { 547 $ctype_classes{$1}->{$c} = 1; 548 } 549 } elsif($cmd =~ "END") { 550 &add_to_ctype_class('alpha', keys(%{$ctype_classes{'lower'}})); 551 &add_to_ctype_class('alpha', keys(%{$ctype_classes{'upper'}})); 552 foreach my $c (qw(alpha lower upper)) { 553 foreach my $d (qw(cntrl digit punct space)) { 554 &deny_in_ctype_class($c, $d, keys(%{$ctype_classes{$d}})); 555 } 556 } 557 558 &add_to_ctype_class('space', keys(%{$ctype_classes{'blank'}})); 559 foreach my $d (qw(upper lower alpha digit graph xdigit)) { 560 &deny_in_ctype_class('space', $d, keys(%{$ctype_classes{$d}})); 561 } 562 563 foreach my $d (qw(upper lower alpha digit punct graph print xdigit)) { 564 &deny_in_ctype_class('cntrl', $d, keys(%{$ctype_classes{$d}})); 565 } 566 567 foreach my $d (qw(upper lower alpha digit cntrl xdigit space)) { 568 &deny_in_ctype_class('punct', $d, keys(%{$ctype_classes{$d}})); 569 } 570 571 foreach my $c (qw(graph print)) { 572 foreach my $a (qw(upper lower alpha digit xdigit punct)) { 573 &add_to_ctype_class($c, keys(%{$ctype_classes{$a}})); 574 } 575 foreach my $d (qw(cntrl)) { 576 &deny_in_ctype_class($c, $d, keys(%{$ctype_classes{$d}})); 577 } 578 } 579 &add_to_ctype_class('print', keys(%{$ctype_classes{'space'}})); 580 581 # Yes, this is a requirment of the standard 582 &exit(2, "The digit class must have exactly 10 elements\n") if (10 != values(%{$ctype_classes{'digit'}})); 583 foreach my $d (values %{$ctype_classes{'digit'}}) { 584 if (!defined $ctype_classes{'xdigits'}->{$d}) { 585 &exit(4, "$d isn't in class xdigits, but all digits must appaer in xdigits\n"); 586 } 587 } 588 589 $ctype_classes{'alnum'} = {} unless defined $ctype_classes{'alnum'}; 590 foreach my $a (qw(alpha digit)) { 591 &add_to_ctype_class('alnum', keys(%{$ctype_classes{$a}})); 592 } 593 594 } else { 595 &exit(4, "Syntax error on line $. of $opt{i}\n"); 596 } 597} 598 599sub parse_LC_COLLATE { 600 my ($cmd, $arg) = @_; 601 if (defined($arg) && $arg ne "") { 602 push(@colldef, "$cmd $arg"); 603 } else { 604 push(@colldef, "$cmd"); 605 } 606} 607 608sub parse_collate_order { 609 my($cmd, $arg) = @_; 610 611 if ($cmd =~ m/order[-_]end/) { 612 # restore the parent parser 613 &set_parser("COLLATE"); 614 my $undef_at; 615 for(my $i = 0; $i <= $#corder; ++$i) { 616 next unless "ARRAY" eq ref($corder[$i]); 617 # If ... appears as the "key" for a order entry it means the 618 # rest of the line is duplicated once for everything in the 619 # open ended range (key-pev-line, key-next-line). Any ... 620 # in the weight fields are delt with by &fixup_collate_order_args 621 if ($corder[$i]->[0] eq "...") { 622 my(@sym, $from, $to); 623 624 my @charset = sort { $sym{$a} cmp $sym{$b} } keys(%sym); 625 if ($i != 0) { 626 $from = $corder[$i -1]->[0]; 627 } else { 628 $from = $charset[0]; 629 } 630 if ($i != $#corder) { 631 $to = $corder[$i +1]->[0]; 632 } else { 633 $to = $charset[$#charset]; 634 } 635 636 my @expand; 637 my($s, $e) = (&parse_value($from), &parse_value($to)); 638 foreach my $c (@charset) { 639 if (($sym{$c} cmp $s) > 0) { 640 last if (($sym{$c} cmp $e) >= 0); 641 my @entry = @{$corder[$i]}; 642 $entry[0] = "<$c>"; 643 push(@expand, \@entry); 644 } 645 } 646 splice(@corder, $i, 1, @expand); 647 } elsif($corder[$i]->[0] eq "UNDEFINED") { 648 $undef_at = $i; 649 next; 650 } 651 &fixup_collate_order_args($corder[$i]); 652 } 653 654 if ($undef_at) { 655 my @insert; 656 my %cused = map { ("ARRAY" eq ref $_) ? ($_->[0], undef) : () } @corder; 657 foreach my $s (keys(%sym)) { 658 next if (exists $cused{"<$s>"}); 659 my @entry = @{$corder[$undef_at]}; 660 $entry[0] = "<$s>"; 661 &fixup_collate_order_args(\@entry); 662 push(@insert, \@entry); 663 } 664 splice(@corder, $undef_at, 1, @insert); 665 } 666 } elsif((!defined $arg) || $arg eq "") { 667 if (!exists($csym{$cmd})) { 668 my($decode, $was_sym) = &unsym_with_check($cmd); 669 if ($was_sym) { 670 my %dots = ( "..." => undef ); 671 my @dots = (\%dots) x (0+@corder_weights); 672 push(@corder, [$cmd, @dots]); 673 } else { 674 warn "Undefined collation symbol $cmd used on line $. of $opt{i}\n"; 675 } 676 } else { 677 push(@corder, $cmd); 678 } 679 } else { 680 unless (defined($cele{$cmd} || defined $sym{$cmd})) { 681 warn "Undefined collation element or charset sym $cmd used on line $. of $opt{i}\n"; 682 } else { 683 # This expands all the symbols (but not colating elements), which 684 # makes life easier for dealing with ..., but harder for 685 # outputing the actual table at the end where we end up 686 # converting literal sequences back into symbols in some cases 687 my @args = &parse_values($arg, ';', undef, 0, 0, 688 qr/IGNORE|\Q...\E/); 689 690 if (@args != @corder_weights) { 691 if (@args < @corder_weights) { 692 &exit(4, "Only " . (0 + @args) 693 . " weights supplied on line $. of $opt{i}, needed " 694 . (0 + @corder_weights) 695 . "\n"); 696 } else { 697 &exit(4, "Too many weights supplied on line $. of $opt{i}," 698 . " wanted " . (0 + @corder_weights) . " but had " 699 . (0 + @args) 700 . "\n"); 701 } 702 } 703 704 push(@corder, [$cmd, @args]); 705 } 706 } 707} 708 709sub parse_LC_MONETARY { 710 my($cmd, $arg) = @_; 711 712 if ($cmd eq "copy") { 713 &do_copy(&parse_value($arg)); 714 } elsif($cmd eq "END") { 715 } elsif($cmd eq "mon_grouping") { 716 my @v = &parse_values($arg, ';', undef, 0); 717 $monetary{$cmd} = \@v; 718 } else { 719 my $v = &parse_value($arg); 720 $monetary{$cmd} = $v; 721 } 722} 723 724sub parse_LC_MESSAGES { 725 my($cmd, $arg) = @_; 726 727 if ($cmd eq "copy") { 728 &do_copy(&parse_value($arg)); 729 } elsif($cmd eq "END") { 730 } else { 731 my $v = &parse_value($arg); 732 $messages{$cmd} = $v; 733 } 734} 735 736sub parse_LC_NUMERIC { 737 my($cmd, $arg) = @_; 738 739 if ($cmd eq "copy") { 740 &do_copy(&parse_value($arg)); 741 } elsif($cmd eq "END") { 742 } elsif($cmd eq "grouping") { 743 my @v = &parse_values($arg, ';', undef, 0); 744 $numeric{$cmd} = \@v; 745 } else { 746 my $v = &parse_value($arg); 747 $numeric{$cmd} = $v; 748 } 749} 750 751sub parse_LC_TIME { 752 my($cmd, $arg) = @_; 753 754 $cmd =~ s/^ab_day$/abday/; 755 756 if ($cmd eq "copy") { 757 &do_copy(&parse_value($arg)); 758 } elsif($cmd eq "END") { 759 } elsif($cmd =~ m/abday|day|mon|abmon|am_pm|alt_digits/) { 760 my @v = &parse_values($arg, ';', undef, 0); 761 $time{$cmd} = \@v; 762 } elsif($cmd eq "era") { 763 my @v = &parse_values($arg, ':', undef, 0); 764 $time{$cmd} = \@v; 765 } else { 766 my $v = &parse_value($arg); 767 $time{$cmd} = $v; 768 } 769} 770 771 772############################################################################### 773 774sub run_mklocale { 775 my $L = (new IO::File "|/usr/bin/mklocale -o $locale_dir/LC_CTYPE") || &exit(5, "$0: Can't start mklocale $!\n"); 776 if (defined($opt{'u'})) { 777 $L->print(qq{ENCODING "$opt{u}"\n}); 778 } else { 779 if ($ARGV[0] =~ m/(big5|euc|gb18030|gb2312|gbk|mskanji|utf-8)/i) { 780 my $enc = uc($1); 781 $L->print(qq{ENCODING "$enc"\n}); 782 } elsif($ARGV[0] =~ m/utf8/) { 783 $L->print(qq{ENCODING "UTF-8"\n}); 784 } else { 785 $L->print(qq{ENCODING "NONE"\n}); 786 } 787 } 788 foreach my $class (keys(%ctype_classes)) { 789 unless ($class =~ m/^(tolower|toupper|alpha|control|digit|grah|lower|space|upper|xdigit|blank|print|ideogram|special|phonogram)$/) { 790 $L->print("# skipping $class\n"); 791 next; 792 } 793 794 if (!%{$ctype_classes{$class}}) { 795 $L->print("# Nothing in \U$class\n"); 796 next; 797 } 798 799 if ($class =~ m/^to/) { 800 my $t = $class; 801 $t =~ s/^to/map/; 802 $L->print("\U$t "); 803 804 foreach my $from (keys(%{$ctype_classes{$class}})) { 805 $L->print("[", &hexchars($from), " ", 806 &hexchars($ctype_classes{$class}->{$from}), "] "); 807 } 808 } else { 809 $L->print("\U$class "); 810 811 foreach my $rune (keys(%{$ctype_classes{$class}})) { 812 $L->print(&hexchars($rune), " "); 813 } 814 } 815 $L->print("\n"); 816 } 817 818 my @width; 819 foreach my $s (keys(%width)) { 820 my $w = $width{$s}; 821 $w = 3 if ($w > 3); 822 push(@{$width[$w]}, &hexchars($sym{$s})); 823 } 824 for(my $w = 0; $w <= $#width; ++$w) { 825 next if (!defined $width[$w]); 826 next if (0 == @{$width[$w]}); 827 $L->print("SWIDTH$w ", join(" ", @{$width[$w]}), "\n"); 828 } 829 830 if (!$L->close()) { 831 if (0 == $!) { 832 &exit(5, "Bad return from mklocale $?"); 833 } else { 834 &exit(5, "Couldn't close mklocale pipe: $!"); 835 } 836 } 837} 838 839############################################################################### 840 841sub hexchars { 842 my($str) = $_[0]; 843 my($ret); 844 845 $ret = unpack "H*", $str; 846 &exit(2, "Rune >4 bytes ($ret; for $str)") if (length($ret) > 8); 847 848 return "0x" . $ret; 849} 850 851sub hexseq { 852 my($str) = $_[0]; 853 my($ret); 854 855 $ret = unpack "H*", $str; 856 $ret =~ s/(..)/\\x$1/g; 857 858 return $ret; 859} 860 861# dot_expand in the target charset 862sub dot_expand { 863 my($s, $e) = @_; 864 my(@ret); 865 866 my @charset = sort { $a cmp $b } values(%sym); 867 foreach my $c (@charset) { 868 if (($c cmp $s) >= 0) { 869 last if (($c cmp $e) > 0); 870 push(@ret, $c); 871 } 872 } 873 874 return @ret; 875} 876 877# Convert symbols into literal values 878sub unsym { 879 my @ret = &unsym_with_check(@_); 880 return $ret[0]; 881} 882 883# Convert symbols into literal values (return[0]), and a count of how 884# many symbols were converted (return[1]). 885sub unsym_with_check { 886 my($str) = $_[0]; 887 888 my $rx = join("|", keys(%sym)); 889 return ($str, 0) if ($rx eq ""); 890 my $found = $str =~ s/<($rx)>/$sym{$1}/eg; 891 892 return ($str, $found); 893} 894 895# Convert a string of literals back into symbols. It is an error 896# for there to be literal values that can't be mapped back. The 897# converter uses a gredy algo. It is likely this could be done 898# more efficently with a regex ctrated at runtime. It would also be 899# a good idea to only create %rsym if %sym changes, but that isn't 900# the simplest thing to do in perl5. 901sub resym { 902 my($str) = $_[0]; 903 my(%rsym, $k, $v); 904 my $max_len = 0; 905 my $ret = ""; 906 907 while(($k, $v) = each(%sym)) { 908 # Collisions in $v are ok, we merely need a mapping, not the 909 # identical mapping 910 $rsym{$v} = $k; 911 $max_len = length($v) if (length($v) > $max_len); 912 } 913 914 SYM: while("" ne $str) { 915 foreach my $l ($max_len .. 1) { 916 next if ($l > length($str)); 917 my $s = substr($str, 0, $l); 918 if (defined($rsym{$s})) { 919 $ret .= "<" . $rsym{$s} . ">"; 920 substr($str, 0, $l) = ""; 921 next SYM; 922 } 923 } 924 &exit(4, "Can't convert $str ($_[0]) back into symbolic form\n"); 925 } 926 927 return $ret; 928} 929 930sub set_escape { 931 $escape_char = $_[0]; 932 $val_match = qr/"(?:[^"\Q$escape_char\E]+|\Q$escape_char\E")+"|(?:\Q$escape_char\E(?:[0-7]+|d[0-9]+|x[0-9a-fA-F]+))|[^,;<>\s\Q$escape_char\E]|(?:\Q$escape_char\E)[,;<>\Q$escape_char\E]/; 933} 934 935sub set_parser { 936 my $section = $_[0]; 937 ($current_LC, $parse_func, $validate_line, $call_parse_on_END) 938 = ($section, $LC_parsers{$section}->[0], $LC_parsers{$section}->[1], 939 $LC_parsers{$section}->[2]); 940 unless (defined $parse_func) { 941 &exit(4, "Unknown section name LC_$section on line $. of $opt{i}\n"); 942 } 943} 944 945sub do_copy { 946 my($from) = @_; 947 local($ENV{LC_ALL}) = $from; 948 949 my $C = (new IO::File "/usr/bin/locale -k LC_$current_LC |") || &exit(5, "can't fork locale during copy of LC_$current_LC"); 950 while(<$C>) { 951 if (s/=\s*$/ ""/ || s/=/ /) { 952 if (m/$validate_line/ && m/^\s*(\S*)(\s+(\S+.*?))?\s*$/) { 953 my($action, $args) = ($1, $3); 954 &{$parse_func}($action, $args); 955 } else { 956 &exit(4, "Syntax error on line $. of locale -k output" 957 . " during copy $current_LC\n"); 958 } 959 } else { 960 &exit(4, "Ill-formed line $. from locale -k during copy $current_LC\n"); 961 } 962 } 963 $C->close() || &exit(5, "copying LC_$current_LC from $from failed"); 964} 965 966sub fixup_collate_order_args { 967 my $co = $_[0]; 968 969 foreach my $s (@{$co}[1..$#{$co}]) { 970 if ("HASH" eq ref($s) && exists($s->{"..."})) { 971 $s = $co->[0]; 972 } 973 } 974} 975 976sub add_to_ctype_class { 977 my($class, @runes) = @_; 978 979 my $c = $ctype_classes{$class}; 980 foreach my $r (@runes) { 981 $c->{$r} = 2 unless exists $c->{$r}; 982 } 983} 984 985sub deny_in_ctype_class { 986 my($class, $deny_reason, @runes) = @_; 987 988 my $c = $ctype_classes{$class}; 989 foreach my $r (@runes) { 990 next unless exists $c->{$r}; 991 $deny_reason =~ s/^(\S+)$/can't belong in class $class and in class $1 at the same time/; 992 &exit(4, &hexchars($r) . " " . $deny_reason . "\n"); 993 } 994} 995 996# write_lc_{money,time,messages} all use the existing Libc format, which 997# is raw text with each record terminated by a newline, and records 998# in a predetermined order. 999 1000sub write_lc_money { 1001 my $F = (new IO::File "$locale_dir/LC_MONETARY", O_TRUNC|O_WRONLY|O_CREAT, 0666) || &exit(4, "$0 can't create $locale_dir/LC_MONETARY: $!"); 1002 foreach my $s (qw(int_curr_symbol currency_symbol mon_decimal_point mon_thousands_sep mon_grouping positive_sign negative_sign int_frac_digits frac_digits p_cs_precedes p_sep_by_space n_cs_precedes n_sep_by_space p_sign_posn n_sign_posn int_p_cs_precedes int_n_cs_precedes int_p_sep_by_space int_n_sep_by_space int_p_sign_posn int_n_sign_posn)) { 1003 if (exists $monetary{$s}) { 1004 my $v = $monetary{$s}; 1005 if ("ARRAY" eq ref $v) { 1006 $F->print(join(";", @$v), "\n"); 1007 } else { 1008 $F->print("$v\n"); 1009 } 1010 } else { 1011 if ($s =~ m/^(int_curr_symbol|currency_symbol|mon_decimal_point|mon_thousands_sep|positive_sign|negative_sign)$/) { 1012 $F->print("\n"); 1013 } else { 1014 $F->print("-1\n"); 1015 } 1016 } 1017 } 1018} 1019 1020sub write_lc_time { 1021 my $F = (new IO::File "$locale_dir/LC_TIME", O_TRUNC|O_WRONLY|O_CREAT, 0666) || &exit(4, "$0 can't create $locale_dir/LC_TIME: $!"); 1022 my %array_cnt = (abmon => 12, mon => 12, abday => 7, day => 7, alt_month => 12, am_pm => 2); 1023 1024 $time{"md_order"} = "md" unless defined $time{"md_order"}; 1025 1026 foreach my $s (qw(abmon mon abday day t_fmt d_fmt d_t_fmt am_pm d_t_fmt mon md_order t_fmt_ampm)) { 1027 my $cnt = $array_cnt{$s}; 1028 my $v = $time{$s}; 1029 1030 if (defined $v) { 1031 if (defined $cnt) { 1032 my @a = @{$v}; 1033 &exit(4, "$0: $s has " . (0 + @a) 1034 . " elements, it needs to have exactly $cnt\n") 1035 unless (@a == $cnt); 1036 $F->print(join("\n", @a), "\n"); 1037 } else { 1038 $F->print("$v\n"); 1039 } 1040 } else { 1041 $cnt = 1 if !defined $cnt; 1042 $F->print("\n" x $cnt); 1043 } 1044 } 1045} 1046 1047sub write_lc_messages { 1048 mkdir("$locale_dir/LC_MESSAGES"); 1049 my $F = (new IO::File "$locale_dir/LC_MESSAGES/LC_MESSAGES", O_TRUNC|O_WRONLY|O_CREAT, 0666) || &exit(4, "$0 can't create $locale_dir/LC_MESSAGES/LC_MESSAGES: $!"); 1050 1051 foreach my $s (qw(yesexpr noexpr yesstr nostr)) { 1052 my $v = $messages{$s}; 1053 1054 if (defined $v) { 1055 $F->print("$v\n"); 1056 } else { 1057 $F->print("\n"); 1058 } 1059 } 1060} 1061 1062sub write_lc_numeric { 1063 my $F = (new IO::File "$locale_dir/LC_NUMERIC", O_TRUNC|O_WRONLY|O_CREAT, 0666) || &exit(4, "$0 can't create $locale_dir/LC_NUMERIC: $!"); 1064 1065 foreach my $s (qw(decimal_point thousands_sep grouping)) { 1066 if (exists $numeric{$s}) { 1067 my $v = $numeric{$s}; 1068 if ("ARRAY" eq ref $v) { 1069 $F->print(join(";", @$v), "\n"); 1070 } else { 1071 $F->print("$v\n"); 1072 } 1073 } else { 1074 $F->print("\n"); 1075 } 1076 } 1077} 1078 1079sub bylenval { 1080 return 0 if ("ARRAY" ne ref $a || "ARRAY" ne ref $b); 1081 1082 my($aval, $af) = &unsym_with_check($a->[0]); 1083 $aval = $cele{$a->[0]} unless $af; 1084 my($bval, $bf) = &unsym_with_check($b->[0]); 1085 $bval = $cele{$b->[0]} unless $bf; 1086 1087 my $r = length($aval) - length($bval); 1088 return $r if $r; 1089 return $aval cmp $bval; 1090} 1091 1092sub write_lc_collate { 1093 return unless @colldef; 1094 1095 # colldef doesn't parse the whole glory of SuSv3 charmaps, and we 1096 # already have, so we cna spit out a simplifyed one; unfortunitly 1097 # it doesn't like "/dev/fd/N" so we need a named tmp file 1098 my($CMAP, $cmapname) = tempfile(DIR => "/tmp"); 1099 foreach my $s (keys(%sym)) { 1100 $CMAP->print("<$s>\t", sprintf "\\x%02x\n", ord($sym{$s})); 1101 } 1102 $CMAP->flush(); 1103 unshift(@colldef, qq{charmap $cmapname}); 1104 unshift(@colldef, "LC_COLLATE"); 1105 $colldef[$#colldef] = "END LC_COLLATE"; 1106 1107 # Can't just use /dev/stdin, colldef appears to use seek, 1108 # and even seems to need a named temp file (re-open?) 1109 my($COL, $colname) = tempfile(DIR => "/tmp"); 1110 $COL->print(join("\n", @colldef), "\n"); 1111 $COL->flush(); 1112 1113 my $rc = system( 1114 "/usr/bin/colldef -o $locale_dir/LC_COLLATE $colname"); 1115 unlink $colname, $cmapname; 1116 if ($rc) { 1117 &exit(1, "Bad return from colldef $rc"); 1118 } 1119} 1120 1121# Pack an int of unknown size into a series of bytes, each of which 1122# contains 7 bits of data, and the top bit is clear on the last 1123# byte of data. Also works on arrays -- does not encode the size of 1124# the array. This format is great for data that tends to have fewer 1125# then 21 bits. 1126sub pack_p_int { 1127 if (@_ > 1) { 1128 my $ret = ""; 1129 foreach my $v (@_) { 1130 $ret .= &pack_p_int($v); 1131 } 1132 1133 return $ret; 1134 } 1135 1136 my $v = $_[0]; 1137 my $b; 1138 1139 &exit(4, "pack_p_int only works on positive values") if ($v < 0); 1140 if ($v < 128) { 1141 $b = chr($v); 1142 } else { 1143 $b = chr(($v & 0x7f) | 0x80); 1144 $b .= pack_p_int($v >> 7); 1145 } 1146 return $b; 1147} 1148 1149sub strip_angles { 1150 my $s = $_[0]; 1151 $s =~ s/^<(.*)>$/$1/; 1152 return $s; 1153} 1154 1155# For localedef 1156# xc=0 "no warnings, locale defined" 1157# xc=1 "warnings, locale defined" 1158# xc=2 "implmentation limits or unsupported charactor sets, no locale defined" 1159# xc=3 "can't create new locales" 1160# xc=4+ "wornings or errors, no locale defined" 1161sub exit { 1162 my($xc, $message) = @_; 1163 1164 print STDERR $message; 1165 exit $xc; 1166} 1167