1#!/usr/bin/perl -w
2
3use strict;
4use Getopt::Std;
5use Fcntl qw(O_TRUNC O_CREAT O_WRONLY SEEK_SET);
6use File::Temp qw(tempfile);
7use IO::File;
8
9my %opt;
10getopts("cf:u:i:", \%opt);
11
12my $comment_char = "#";
13my $escape_char = "\\";
14my $val_match = undef;  # set in set_escape
15my %sym = ();
16my %width = ();
17my %ctype_classes = (
18	# there are the charactors that get automagically included, there is no
19	# standard way to avoid them.  XXX even if you have a charset without
20	# some of these charactors defined!
21
22	# They are accessable in a regex via [:classname:], and libc has a
23	# isX() for most of these.
24	upper => {map { ($_, 1); } qw(A B C D E F G H I J K L M N O P Q R S T U V W X Y Z)},
25	lower => {map { ($_, 1); } qw(a b c d e f g h i j k l m n o p q r s t u v w x y z)},
26	alpha => {},
27	#alnum => {},
28	digit => {map { ($_, 1); } qw(0 1 2 3 4 5 6 7 8 9)},
29	space => {},
30	cntrl => {},
31	punct => {},
32	graph => {},
33	print => {},
34	xdigit => {map { ($_, 1); } qw(0 1 2 3 4 5 6 7 8 9 A B C D E F a b c d e f)},
35	blank => {" " => 1, "\t" => 1},
36
37	toupper => {map { ($_, "\U$_"); } qw(a b c d e f g h i j k l m n o p q r s t u v w x y z)},
38	tolower => {map { ($_, "\L$_"); } qw(A B C D E F G H I J K L M N O P Q R S T U V W X Y Z)},
39);
40
41my %cele = (
42	# collating-elements  -- these are a lot like %sym that only works
43	# in LC_COLLATE, can also be accessed in a regex via [.element.]
44);
45
46my %csym = (
47	# collating-symbols -- these are used to define a set of charactors
48	# that compare as equals (in one or more passes), can also be accessed
49	# in a regex via [=symbol=]
50);
51
52my @corder = (); # collating order
53my @corder_weights = (); # collating directions (forward, backward, position)
54
55my @colldef = ();
56
57my(%monetary, %numeric, %time, %messages);
58
59# This is the default charmap, unlike %ctype_classes you _can_ avoid this
60# merely by having your own charmap definition file
61my $default_charmap = <<EOT;
62CHARMAP
63<NUL>	 \\000
64<alert>	 \\007
65<backspace>	 \\010
66<tab>	 \\011
67<newline>	 \\012
68<vertical-tab>	 \\013
69<form-feed>	 \\014
70<carriage-return>	 \\015
71<space>	 \\040
72<exclamation-mark>	 \\041
73<quotation-mark>	 \\042
74<number-sign>	 \\043
75<dollar-sign>	 \\044
76<percent-sign>	 \\045
77<ampersand>	 \\046
78<apostrophe>	 \\047
79<left-parenthesis>	 \\050
80<right-parenthesis>	 \\051
81<asterisk>	 \\052
82<plus-sign>	 \\053
83<comma>	 \\054
84<hyphen>	 \\055
85<hyphen-minus>	 \\055
86<period>	 \\056
87<full-stop>	 \\056
88<slash>	 \\057
89<solidus>	 \\057
90<zero>	 \\060
91<one>	 \\061
92<two>	 \\062
93<three>	 \\063
94<four>	 \\064
95<five>	 \\065
96<six>	 \\066
97<seven>	 \\067
98<eight>	 \\070
99<nine>	 \\071
100<colon>	 \\072
101<semicolon>	 \\073
102<less-then-sign>	 \\074
103<less-than-sign>	 \\074
104<equals-sign>	 \\075
105<greater-then-sign>	 \\076
106<greater-than-sign>	 \\076
107<question-mark>	 \\077
108<commercial-at>	 \\100
109<A>	 \\101
110<B>	 \\102
111<C>	 \\103
112<D>	 \\104
113<E>	 \\105
114<F>	 \\106
115<G>	 \\107
116<H>	 \\110
117<I>	 \\111
118<J>	 \\112
119<K>	 \\113
120<L>	 \\114
121<M>	 \\115
122<N>	 \\116
123<O>	 \\117
124<P>	 \\120
125<Q>	 \\121
126<R>	 \\122
127<S>	 \\123
128<T>	 \\124
129<U>	 \\125
130<V>	 \\126
131<W>	 \\127
132<X>	 \\130
133<Y>	 \\131
134<Z>	 \\132
135<left-square-bracket>	 \\133
136<backslash>	 \\134
137<reverse-solidus>	 \\134
138<right-square-bracket>	 \\135
139<circumflex>	 \\136
140<circumflex-accent>	 \\136
141<underscore>	 \\137
142<underline>	 \\137
143<low-line>	 \\137
144<grave-accent>	 \\140
145<a>	 \\141
146<b>	 \\142
147<c>	 \\143
148<d>	 \\144
149<e>	 \\145
150<f>	 \\146
151<g>	 \\147
152<h>	 \\150
153<i>	 \\151
154<j>	 \\152
155<k>	 \\153
156<l>	 \\154
157<m>	 \\155
158<n>	 \\156
159<o>	 \\157
160<p>	 \\160
161<q>	 \\161
162<r>	 \\162
163<s>	 \\163
164<t>	 \\164
165<u>	 \\165
166<v>	 \\166
167<w>	 \\167
168<x>	 \\170
169<y>	 \\171
170<z>	 \\172
171<left-brace>	 \\173
172<left-curly-bracket>	 \\173
173<vertical-line>	 \\174
174<right-brace>	 \\175
175<right-curly-bracket>	 \\175
176<tilde>	 \\176
177<DEL>	 \\177
178
179<SOH> \\x01
180<STX> \\x02
181<ETX> \\x03
182<EOT> \\x04
183<ENQ> \\x05
184<ACK> \\x06
185<BEL> \\x07
186<BS> \\x08
187<HT> \\x09
188<NL> \\x0a
189<VT> \\x0b
190<NP> \\x0c
191<CR> \\x0d
192<SO> \\x0e
193<SI> \\x0f
194<DLE> \\x10
195<DC1> \\x11
196<DC2> \\x12
197<DC3> \\x13
198<DC4> \\x14
199<NAK> \\x15
200<SYN> \\x16
201<ETB> \\x17
202<CAN> \\x18
203<EM> \\x19
204<SUB> \\x1a
205<ESC> \\x1b
206<FS> \\x1c
207<IS4> \\x1c
208<GS> \\x1d
209<IS3> \\x1d
210<RS> \\x1e
211<IS2> \\x1e
212<US> \\x1f
213<IS1> \\x1f
214END CHARMAP
215EOT
216
217&set_escape($escape_char);
218
219use strict qw(vars);
220
221if (@ARGV != 1) {
222	&exit(4, "usage: $0 [-c] [-f charmap-file] [-u codesetname] [-i localdef-file] LOCALENAME\n");
223}
224
225my $locale_dir = $ARGV[0];
226$locale_dir = "/usr/share/locale/$locale_dir" unless ($locale_dir =~ m{/});
227
228my $CMAP;
229if (defined($opt{'f'})) {
230	# Using new IO::File $opt{'f'}, "r" runs into problems with long path names
231	sysopen(CMAP_KLUDGE, $opt{'f'}, O_RDONLY) || &exit(4, "Can't open $opt{f}: $!\n");
232	$CMAP = new IO::Handle;
233	$CMAP->fdopen(fileno(CMAP_KLUDGE), "r") || &exit(4, "Can't fdopen $opt{f}: $!\n");
234} else {
235	# er, not everyone gets IO::Scalar, so use an unamed tmp file
236	# $CMAP = new IO::Scalar \$default_charmap;
237	$CMAP = new_tmpfile IO::File;
238	print $CMAP $default_charmap;
239	seek $CMAP, 0, SEEK_SET;
240}
241
242while(<$CMAP>) {
243	if (m/^\s*CHARMAP\s*$/) {
244		&parse_charmaps();
245	} elsif (m/^\s*WIDTH\s*$/) {
246		&parse_widths();
247	} elsif (m/^\s*($comment_char.*)?$/) {
248	} else {
249		chomp;
250		&exit(4, "syntax error on line $. ($_)");
251	}
252}
253&parse_widths() if (0 == %width);
254
255if (defined($opt{'i'})) {
256	sysopen(STDIN, $opt{'i'}, 0) || &exit(4, "Can't open localdef file $opt{i}: $!");
257} else {
258	$opt{'i'} = "/dev/stdin";
259}
260
261my %LC_parsers = (
262	NONE => [\&parse_LC_NONE, qr/^\s*((escape|comment)_char\s+$val_match\s*)?$/],
263	CTYPE => [\&parse_LC_CTYPE, qr/^\s*(\S+)\s+(\S+.*?)\s*$/],
264	COLLATE => [\&parse_LC_COLLATE, qr/^\s*(<[^>\s]+>|order_end|END|(\S*)\s+(\S+.*?)|collating[_-]element\s*<[^>]+>\s+from\s+$val_match)\s*$/, 1],
265	TIME => [\&parse_LC_TIME, qr/^\s*(ab_?day|day|abmon|mon|d_t_fmt|d_fmt|t_fmt|am_pm|t_fmt_ampm|era|era_d_fmt|era_t_fmt|era_d_t_fmt|alt_digits|copy|END)\s+(\S+.*?)\s*$/],
266	NUMERIC => [\&parse_LC_NUMERIC, qr/^\s*(decimal_point|thousands_sep|grouping|END|copy)\s+(\S+.*?)\s*$/],
267	MONETARY => [\&parse_LC_MONETARY, qr/^\s*(int_curr_symbol|currency_symbol|mon_decimal_point|mon_thousands_sep|mon_grouping|positive_sign|negative_sign|int_frac_digits|frac_digits|p_cs_precedes|p_sep_by_space|n_cs_precedes|n_sep_by_space|p_sign_posn|n_sign_posn|int_p_cs_precedes|int_n_cs_precedes|int_p_sep_by_space|int_n_sep_by_space|int_p_sign_posn|int_n_sign_posn|copy|END)\s+(\S+.*?)\s*$/],
268	MESSAGES => [\&parse_LC_MESSAGES, qr/^\s*(END|yesexpr|noexpr|yesstr|nostr|copy)\s+(\S+.*?)\s*$/],
269	"COLLATE order" => [\&parse_collate_order, qr/^\s*(order_end|(<[^>\s]+>|UNDEFINED|\Q...\E)(\s+\S+.*)?)\s*$/],
270);
271my($current_LC, $parse_func, $validate_line, $call_parse_on_END)
272  = ("NONE", $LC_parsers{"NONE"}->[0], $LC_parsers{"NONE"}->[1], undef);
273
274while(<STDIN>) {
275	next if (m/^\s*($comment_char.*)?\s*$/);
276	if (m/\Q$escape_char\E$/) {
277		chomp;
278		chop;
279		my $tmp = <STDIN>;
280		if (!defined($tmp)) {
281			&exit(4, "Syntax error, last line ($.) of $opt{i} is marked as a continued line\n");
282		}
283		$tmp =~ s/^\s*//;
284		$_ .= $tmp;
285		redo;
286	}
287
288	if ($current_LC eq "NONE" && m/^\s*LC_([A-Z]+)\s*$/) {
289		&set_parser($1);
290		next;
291	}
292
293	unless (m/$validate_line/) {
294		&exit(4, "Syntax error on line $. of $opt{i}\n");
295	}
296
297	my($action, $args);
298	if (m/^\s*(\S*)(\s+(\S+.*?))?\s*$/) {
299		($action, $args) = ($1, $3);
300	} else {
301		$action = $_;
302		chomp $action;
303	}
304
305	if ($action eq "END") {
306		if ($args ne "LC_$current_LC" || $current_LC eq "NONE") {
307			&exit(4, "Syntax error on line $. of $opt{i} attempting to end $args when LC_$current_LC is open\n");
308		}
309		if ($call_parse_on_END) {
310		    &{$parse_func}($action, $args);
311		}
312		&set_parser("NONE");
313	} else {
314		&{$parse_func}($action, $args);
315	}
316}
317
318mkdir($locale_dir);
319&run_mklocale();
320&write_lc_money();
321&write_lc_time();
322&write_lc_messages();
323&write_lc_numeric();
324&write_lc_collate();
325exit 0;
326
327sub parse_charmaps {
328	while(<$CMAP>) {
329		# XXX need to parse out <code_set_name>, <mb_cur_max>, <mb_cur_min>,
330		# <escape_char>, and <comment_char> before the generic "<sym> val"
331		if (m/^\s*<([\w\-]+)>\s+($val_match+)\s*$/) {
332			my($sym, $val) = ($1, $2);
333			$val = &parse_value_double_backwhack($val);
334			$sym{$sym} = $val;
335		} elsif (m/^\s*<([\w\-]*\d)>\s*\Q...\E\s*<([\w\-]*\d)>\s+($val_match+)\s*$/) {
336			# We don't deal with $se < $ss, or overflow of the last byte of $vs
337			# then again the standard doesn't say anything in particular needs
338			# to happen for those cases
339			my($ss, $se, $vs) = ($1, $2, $3);
340			$vs = &parse_value_double_backwhack($vs);
341			my $vlast = length($vs) -1;
342			for(my($s, $v) = ($ss, $vs); $s cmp $se; $s++) {
343				$sym{$s} = $v;
344				substr($v, $vlast) = chr(ord(substr($v, $vlast)) +1)
345			}
346		} elsif (m/^\s*END\s+CHARMAP\s*$/) {
347			return;
348		} elsif (m/^\s*($comment_char.*)?$/) {
349		} else {
350			&exit(4, "syntax error on line $.");
351		}
352	}
353}
354
355sub parse_widths {
356	my $default = 1;
357	my @syms;
358
359	while(<$CMAP>) {
360		if (m/^\s*<([\w\-]+)>\s+(\d+)\s*$/) {
361			my($sym, $w) = ($1, $2);
362			print "$sym width $w\n";
363			if (!defined($sym{$sym})) {
364				warn "localedef: can't set width of unknown symbol $sym on line $.\n";
365			} else {
366				$width{$sym} = $w;
367			}
368		} elsif (m/^\s*<([\w\-]+)>\s*\Q...\E\s*<([\w\-]+)>\s+(\d+)\s*$/) {
369			my($ss, $se, $w) = ($1, $2, $3);
370			if (!@syms) {
371				@syms = sort { $a cmp $b } keys(%sym);
372			}
373
374			# Yes, we could do a binary search for find $ss in @syms
375			foreach my $s (@syms) {
376				if (($s cmp $ss) >= 0) {
377					last if (($s cmp $se) > 0);
378				}
379			}
380		} elsif (m/^\s*WIDTH_DEFAULT\s+(\d+)\s*$/) {
381			$default = $1;
382		} elsif (m/^\s*END\s+WIDTH\s*$/) {
383			last;
384		} elsif (m/^\s*($comment_char.*)?$/) {
385		} else {
386			&exit(4, "syntax error on line $.");
387		}
388	}
389
390	foreach my $s (keys(%sym)) {
391		if (!defined($width{$s})) {
392			$width{$s} = $default;
393		}
394	}
395}
396
397# This parses a single value in any of the 7 forms it can appear in,
398# returns [0] the parsed value and [1] the remander of the string
399sub parse_value_return_extra {
400	my $val = "";
401	local($_) = $_[0];
402
403	while(1) {
404		$val .= &unsym($1), next
405		  if (m/\G"((?:[^"\Q$escape_char\E]+|\Q$escape_char\E.)*)"/gc);
406		$val .= chr(oct($1)), next
407		  if (m/\G\Q$escape_char\E([0-7]+)/gc);
408		$val .= chr(0+$1), next
409		  if (m/\G\Q$escape_char\Ed([0-9]+)/gc);
410		$val .= pack("H*", $1), next
411		  if (m/\G\Q$escape_char\Ex([0-9a-fA-F]+)/gc);
412		$val .= $1, next
413		  if (m/\G([^,;<>\s\Q$escape_char()\E])/gc);
414		$val .= $1
415		  if (m/\G(?:\Q$escape_char\E)([,;<>\Q$escape_char()\E])/gc);
416		$val .= &unsym($1), next
417		  if (m/\G(<[^>]+>)/gc);
418
419		m/\G(.*)$/;
420
421		return ($val, $1);
422	}
423}
424
425# Parse one value, if there is more then one value alert the media
426sub parse_value {
427	my ($ret, $err) = &parse_value_return_extra($_[0]);
428	if ($err ne "") {
429		&exit(4, "Syntax error, unexpected '$err' in value (after '$ret') on line $.\n");
430	}
431
432	return $ret;
433}
434
435sub parse_value_double_backwhack {
436	my($val) = @_;
437
438	my ($ret, $err) = &parse_value_return_extra($val);
439	return $ret if ($err eq "");
440
441	$val =~ s{\\\\}{\\}g;
442	($ret, $err) = &parse_value_return_extra($val);
443	if ($err ne "") {
444		&exit(4, "Syntax error, unexpected '$err' in value (after '$ret') on line $.\n");
445	}
446
447	return $ret;
448}
449# $values is the string to parse, $dot_expand is a function ref that will
450# return an array to insert when "X;...;Y" is parsed (undef means that
451# construct is a syntax error), $nest is true if parens indicate a nested
452# value string should be parsed and put in an array ref, $return_extra
453# is true if any unparsable trailing junk should be returned as the last
454# element (otherwise it is a syntax error).  Any text matching the regex
455# $specials is returned as an hash.
456sub parse_values {
457	my($values, $sep, $dot_expand, $nest, $return_extra, $specials) = @_;
458	my(@ret, $live_dots);
459
460	while($values ne "") {
461		if (defined($specials) && $values =~ s/^($specials)($sep|$)//) {
462			push(@ret, { $1, undef });
463			next;
464		}
465		if ($nest && $values =~ s/^\(//) {
466			my @subret = &parse_values($values, ',', $dot_expand, $nest, 1, $specials);
467			$values = pop(@subret);
468			push(@ret, [@subret]);
469			unless ($values =~ s/^\)($sep)?//) {
470				&exit(4, "Syntax error, unmatched open paren on line $. of $opt{i}\n");
471			}
472			next;
473		}
474
475		my($v, $l) = &parse_value_return_extra($values);
476		$values = $l;
477
478		if ($live_dots) {
479			splice(@ret, -1, 1, &{$dot_expand}($ret[$#ret], $v));
480			$live_dots = 0;
481		} else {
482			push(@ret, $v);
483		}
484
485		if (defined($dot_expand) && $values =~ s/^$sep\Q...\E$sep//) {
486			$live_dots = 1;
487		} elsif($values =~ s/^$sep//) {
488			# Normal case
489		} elsif($values =~ m/^$/) {
490			last;
491		} else {
492			last if ($return_extra);
493			&exit(4, "Syntax error parsing arguments on line $. of $opt{i}\n");
494		}
495	}
496
497	if ($live_dots) {
498		splice(@ret, -1, 1, &{$dot_expand}($ret[$#ret], undef));
499	}
500	if ($return_extra) {
501		push(@ret, $values);
502	}
503
504	return @ret;
505}
506
507sub parse_LC_NONE {
508	my($cmd, $arg) = @_;
509
510	if ($cmd eq "comment_char") {
511		$comment_char = &parse_value($arg);
512	} elsif($cmd eq "escape_char") {
513		&set_escape_char(&parse_value($arg));
514	} elsif($cmd eq "") {
515	} else {
516		&exit(4, "Syntax error on line $. of $opt{i}\n");
517	}
518}
519
520sub parse_LC_CTYPE {
521	my($cmd, $arg) = @_;
522
523	my $ctype_classes = join("|", keys(%ctype_classes));
524	if ($cmd eq "copy") {
525		# XXX -- the locale command line utility doesn't currently
526		# output any LC_CTYPE info, so there isn't much of a way
527		# to implent copy yet
528		&exit(2, "copy not supported on line $. of $opt{i}\n");
529	} elsif($cmd eq "charclass") {
530		my $cc = &parse_value($arg);
531		if (!defined($ctype_classes{$cc})) {
532			$ctype_classes{$cc} = [];
533		} else {
534			warn "charclass $cc defined more then once\n";
535		}
536	} elsif($cmd =~ m/^to(upper|lower)$/) {
537		my @arg = &parse_values($arg, ';', undef, 1);
538		foreach my $p (@arg) {
539			&exit(4, "Syntax error on line $. of $opt{i} ${cmd}'s arguments must be character pairs like (a,A);(b,B)\n") if ("ARRAY" ne ref $p || 2 != @$p);
540		}
541		foreach my $pair (@arg) {
542			$ctype_classes{$cmd}{$pair->[0]} = $pair->[1];
543		}
544	} elsif($cmd =~ m/^($ctype_classes)$/) {
545		my @arg = &parse_values($arg, ';', \&dot_expand, 0);
546		foreach my $c (@arg) {
547			$ctype_classes{$1}->{$c} = 1;
548		}
549	} elsif($cmd =~ "END") {
550		&add_to_ctype_class('alpha', keys(%{$ctype_classes{'lower'}}));
551		&add_to_ctype_class('alpha', keys(%{$ctype_classes{'upper'}}));
552		foreach my $c (qw(alpha lower upper)) {
553			foreach my $d (qw(cntrl digit punct space)) {
554				&deny_in_ctype_class($c, $d, keys(%{$ctype_classes{$d}}));
555			}
556		}
557
558		&add_to_ctype_class('space', keys(%{$ctype_classes{'blank'}}));
559		foreach my $d (qw(upper lower alpha digit graph xdigit)) {
560			&deny_in_ctype_class('space', $d, keys(%{$ctype_classes{$d}}));
561		}
562
563		foreach my $d (qw(upper lower alpha digit punct graph print xdigit)) {
564			&deny_in_ctype_class('cntrl', $d, keys(%{$ctype_classes{$d}}));
565		}
566
567		foreach my $d (qw(upper lower alpha digit cntrl xdigit space)) {
568			&deny_in_ctype_class('punct', $d, keys(%{$ctype_classes{$d}}));
569		}
570
571		foreach my $c (qw(graph print)) {
572			foreach my $a (qw(upper lower alpha digit xdigit punct)) {
573				&add_to_ctype_class($c, keys(%{$ctype_classes{$a}}));
574			}
575			foreach my $d (qw(cntrl)) {
576				&deny_in_ctype_class($c, $d, keys(%{$ctype_classes{$d}}));
577			}
578		}
579		&add_to_ctype_class('print', keys(%{$ctype_classes{'space'}}));
580
581		# Yes, this is a requirment of the standard
582		&exit(2, "The digit class must have exactly 10 elements\n") if (10 != values(%{$ctype_classes{'digit'}}));
583		foreach my $d (values %{$ctype_classes{'digit'}}) {
584			if (!defined $ctype_classes{'xdigits'}->{$d}) {
585				&exit(4, "$d isn't in class xdigits, but all digits must appaer in xdigits\n");
586			}
587		}
588
589		$ctype_classes{'alnum'} = {} unless defined $ctype_classes{'alnum'};
590		foreach my $a (qw(alpha digit)) {
591			&add_to_ctype_class('alnum', keys(%{$ctype_classes{$a}}));
592		}
593
594	} else {
595		&exit(4, "Syntax error on line $. of $opt{i}\n");
596	}
597}
598
599sub parse_LC_COLLATE {
600    my ($cmd, $arg) = @_;
601    if (defined($arg) && $arg ne "") {
602	push(@colldef, "$cmd $arg");
603    } else {
604	push(@colldef, "$cmd");
605    }
606}
607
608sub parse_collate_order {
609	my($cmd, $arg) = @_;
610
611	if ($cmd =~ m/order[-_]end/) {
612		# restore the parent parser
613		&set_parser("COLLATE");
614		my $undef_at;
615		for(my $i = 0; $i <= $#corder; ++$i) {
616			next unless "ARRAY" eq ref($corder[$i]);
617			# If ... appears as the "key" for a order entry it means the
618			# rest of the line is duplicated once for everything in the
619			# open ended range (key-pev-line, key-next-line).  Any ...
620			# in the weight fields are delt with by &fixup_collate_order_args
621			if ($corder[$i]->[0] eq "...") {
622				my(@sym, $from, $to);
623
624				my @charset = sort { $sym{$a} cmp $sym{$b} } keys(%sym);
625				if ($i != 0) {
626					$from = $corder[$i -1]->[0];
627				} else {
628					$from = $charset[0];
629				}
630				if ($i != $#corder) {
631					$to = $corder[$i +1]->[0];
632				} else {
633					$to = $charset[$#charset];
634				}
635
636				my @expand;
637				my($s, $e) = (&parse_value($from), &parse_value($to));
638				foreach my $c (@charset) {
639					if (($sym{$c} cmp $s) > 0) {
640						last if (($sym{$c} cmp $e) >= 0);
641						my @entry = @{$corder[$i]};
642						$entry[0] = "<$c>";
643						push(@expand, \@entry);
644					}
645				}
646				splice(@corder, $i, 1, @expand);
647			} elsif($corder[$i]->[0] eq "UNDEFINED") {
648				$undef_at = $i;
649				next;
650			}
651			&fixup_collate_order_args($corder[$i]);
652		}
653
654		if ($undef_at) {
655			my @insert;
656			my %cused = map { ("ARRAY" eq ref $_) ? ($_->[0], undef) : () } @corder;
657			foreach my $s (keys(%sym)) {
658				next if (exists $cused{"<$s>"});
659				my @entry = @{$corder[$undef_at]};
660				$entry[0] = "<$s>";
661				&fixup_collate_order_args(\@entry);
662				push(@insert, \@entry);
663			}
664			splice(@corder, $undef_at, 1, @insert);
665		}
666	} elsif((!defined $arg) || $arg eq "") {
667		if (!exists($csym{$cmd})) {
668			my($decode, $was_sym) = &unsym_with_check($cmd);
669			if ($was_sym) {
670				my %dots = ( "..." => undef );
671				my @dots = (\%dots) x (0+@corder_weights);
672				push(@corder, [$cmd, @dots]);
673			} else {
674				warn "Undefined collation symbol $cmd used on line $. of $opt{i}\n";
675			}
676		} else {
677			push(@corder, $cmd);
678		}
679	} else {
680		unless (defined($cele{$cmd} || defined $sym{$cmd})) {
681			warn "Undefined collation element or charset sym $cmd used on line $. of $opt{i}\n";
682		} else {
683			# This expands all the symbols (but not colating elements), which
684			# makes life easier for dealing with ..., but harder for
685			# outputing the actual table at the end where we end up
686			# converting literal sequences back into symbols in some cases
687			my @args = &parse_values($arg, ';', undef, 0, 0,
688			  qr/IGNORE|\Q...\E/);
689
690			if (@args != @corder_weights) {
691				if (@args < @corder_weights) {
692					&exit(4, "Only " . (0 + @args)
693					  . " weights supplied on line $. of $opt{i}, needed "
694					  . (0 + @corder_weights)
695					  . "\n");
696				} else {
697					&exit(4,  "Too many weights supplied on line $. of $opt{i},"
698					  . " wanted " . (0 + @corder_weights) . " but had "
699					  . (0 + @args)
700					  . "\n");
701				}
702			}
703
704			push(@corder, [$cmd, @args]);
705		}
706	}
707}
708
709sub parse_LC_MONETARY {
710	my($cmd, $arg) = @_;
711
712	if ($cmd eq "copy") {
713		&do_copy(&parse_value($arg));
714	} elsif($cmd eq "END") {
715	} elsif($cmd eq "mon_grouping") {
716		my @v = &parse_values($arg, ';', undef, 0);
717		$monetary{$cmd} = \@v;
718	} else {
719		my $v = &parse_value($arg);
720		$monetary{$cmd} = $v;
721	}
722}
723
724sub parse_LC_MESSAGES {
725	my($cmd, $arg) = @_;
726
727	if ($cmd eq "copy") {
728		&do_copy(&parse_value($arg));
729	} elsif($cmd eq "END") {
730	} else {
731		my $v = &parse_value($arg);
732		$messages{$cmd} = $v;
733	}
734}
735
736sub parse_LC_NUMERIC {
737	my($cmd, $arg) = @_;
738
739	if ($cmd eq "copy") {
740		&do_copy(&parse_value($arg));
741	} elsif($cmd eq "END") {
742	} elsif($cmd eq "grouping") {
743		my @v = &parse_values($arg, ';', undef, 0);
744		$numeric{$cmd} = \@v;
745	} else {
746		my $v = &parse_value($arg);
747		$numeric{$cmd} = $v;
748	}
749}
750
751sub parse_LC_TIME {
752	my($cmd, $arg) = @_;
753
754	$cmd =~ s/^ab_day$/abday/;
755
756	if ($cmd eq "copy") {
757		&do_copy(&parse_value($arg));
758	} elsif($cmd eq "END") {
759	} elsif($cmd =~ m/abday|day|mon|abmon|am_pm|alt_digits/) {
760		my @v = &parse_values($arg, ';', undef, 0);
761		$time{$cmd} = \@v;
762	} elsif($cmd eq "era") {
763		my @v = &parse_values($arg, ':', undef, 0);
764		$time{$cmd} = \@v;
765	} else {
766		my $v = &parse_value($arg);
767		$time{$cmd} = $v;
768	}
769}
770
771
772###############################################################################
773
774sub run_mklocale {
775	my $L = (new IO::File "|/usr/bin/mklocale -o $locale_dir/LC_CTYPE") || &exit(5, "$0: Can't start mklocale $!\n");
776	if (defined($opt{'u'})) {
777		$L->print(qq{ENCODING "$opt{u}"\n});
778	} else {
779		if ($ARGV[0] =~ m/(big5|euc|gb18030|gb2312|gbk|mskanji|utf-8)/i) {
780		    my $enc = uc($1);
781		    $L->print(qq{ENCODING "$enc"\n});
782		} elsif($ARGV[0] =~ m/utf8/) {
783		    $L->print(qq{ENCODING "UTF-8"\n});
784		} else {
785		    $L->print(qq{ENCODING "NONE"\n});
786		}
787	}
788	foreach my $class (keys(%ctype_classes)) {
789		unless ($class =~ m/^(tolower|toupper|alpha|control|digit|grah|lower|space|upper|xdigit|blank|print|ideogram|special|phonogram)$/) {
790			$L->print("# skipping $class\n");
791			next;
792		}
793
794		if (!%{$ctype_classes{$class}}) {
795			$L->print("# Nothing in \U$class\n");
796			next;
797		}
798
799		if ($class =~ m/^to/) {
800			my $t = $class;
801			$t =~ s/^to/map/;
802			$L->print("\U$t ");
803
804			foreach my $from (keys(%{$ctype_classes{$class}})) {
805				$L->print("[", &hexchars($from), " ",
806				  &hexchars($ctype_classes{$class}->{$from}), "] ");
807			}
808		} else {
809			$L->print("\U$class ");
810
811			foreach my $rune (keys(%{$ctype_classes{$class}})) {
812				$L->print(&hexchars($rune), " ");
813			}
814		}
815		$L->print("\n");
816	}
817
818	my @width;
819	foreach my $s (keys(%width)) {
820		my $w = $width{$s};
821		$w = 3 if ($w > 3);
822		push(@{$width[$w]}, &hexchars($sym{$s}));
823	}
824	for(my $w = 0; $w <= $#width; ++$w) {
825		next if (!defined $width[$w]);
826		next if (0 == @{$width[$w]});
827		$L->print("SWIDTH$w ", join(" ", @{$width[$w]}), "\n");
828	}
829
830	if (!$L->close()) {
831		if (0 == $!) {
832			&exit(5, "Bad return from mklocale $?");
833		} else {
834			&exit(5, "Couldn't close mklocale pipe: $!");
835		}
836	}
837}
838
839###############################################################################
840
841sub hexchars {
842	my($str) = $_[0];
843	my($ret);
844
845	$ret = unpack "H*", $str;
846	&exit(2, "Rune >4 bytes ($ret; for $str)") if (length($ret) > 8);
847
848	return "0x" . $ret;
849}
850
851sub hexseq {
852	my($str) = $_[0];
853	my($ret);
854
855	$ret = unpack "H*", $str;
856	$ret =~ s/(..)/\\x$1/g;
857
858	return $ret;
859}
860
861# dot_expand in the target charset
862sub dot_expand {
863	my($s, $e) = @_;
864	my(@ret);
865
866	my @charset = sort { $a cmp $b } values(%sym);
867	foreach my $c (@charset) {
868		if (($c cmp $s) >= 0) {
869			last if (($c cmp $e) > 0);
870			push(@ret, $c);
871		}
872	}
873
874	return @ret;
875}
876
877# Convert symbols into literal values
878sub unsym {
879	my @ret = &unsym_with_check(@_);
880	return $ret[0];
881}
882
883# Convert symbols into literal values (return[0]), and a count of how
884# many symbols were converted (return[1]).
885sub unsym_with_check {
886	my($str) = $_[0];
887
888	my $rx = join("|", keys(%sym));
889	return ($str, 0) if ($rx eq "");
890	my $found = $str =~ s/<($rx)>/$sym{$1}/eg;
891
892	return ($str, $found);
893}
894
895# Convert a string of literals back into symbols.  It is an error
896# for there to be literal values that can't be mapped back.  The
897# converter uses a gredy algo.  It is likely this could be done
898# more efficently with a regex ctrated at runtime.  It would also be
899# a good idea to only create %rsym if %sym changes, but that isn't
900# the simplest thing to do in perl5.
901sub resym {
902	my($str) = $_[0];
903	my(%rsym, $k, $v);
904	my $max_len = 0;
905	my $ret = "";
906
907	while(($k, $v) = each(%sym)) {
908		# Collisions in $v are ok, we merely need a mapping, not the
909		# identical mapping
910		$rsym{$v} = $k;
911		$max_len = length($v) if (length($v) > $max_len);
912	}
913
914	SYM: while("" ne $str) {
915		foreach my $l ($max_len .. 1) {
916			next if ($l > length($str));
917			my $s = substr($str, 0, $l);
918			if (defined($rsym{$s})) {
919				$ret .= "<" . $rsym{$s} . ">";
920				substr($str, 0, $l) = "";
921				next SYM;
922			}
923		}
924		&exit(4, "Can't convert $str ($_[0]) back into symbolic form\n");
925	}
926
927	return $ret;
928}
929
930sub set_escape {
931	$escape_char = $_[0];
932	$val_match = qr/"(?:[^"\Q$escape_char\E]+|\Q$escape_char\E")+"|(?:\Q$escape_char\E(?:[0-7]+|d[0-9]+|x[0-9a-fA-F]+))|[^,;<>\s\Q$escape_char\E]|(?:\Q$escape_char\E)[,;<>\Q$escape_char\E]/;
933}
934
935sub set_parser {
936	my $section = $_[0];
937	($current_LC, $parse_func, $validate_line, $call_parse_on_END)
938	  = ($section, $LC_parsers{$section}->[0], $LC_parsers{$section}->[1],
939	  $LC_parsers{$section}->[2]);
940	unless (defined $parse_func) {
941		&exit(4, "Unknown section name LC_$section on line $. of $opt{i}\n");
942	}
943}
944
945sub do_copy {
946	my($from) = @_;
947	local($ENV{LC_ALL}) = $from;
948
949	my $C = (new IO::File "/usr/bin/locale -k LC_$current_LC |") || &exit(5, "can't fork locale during copy of LC_$current_LC");
950	while(<$C>) {
951		if (s/=\s*$/ ""/ || s/=/ /) {
952			if (m/$validate_line/ && m/^\s*(\S*)(\s+(\S+.*?))?\s*$/) {
953				my($action, $args) = ($1, $3);
954				&{$parse_func}($action, $args);
955			} else {
956				&exit(4, "Syntax error on line $. of locale -k output"
957				  . " during copy $current_LC\n");
958			}
959		} else {
960			&exit(4, "Ill-formed line $. from locale -k during copy $current_LC\n");
961		}
962	}
963	$C->close() || &exit(5, "copying LC_$current_LC from $from failed");
964}
965
966sub fixup_collate_order_args {
967	my $co = $_[0];
968
969	foreach my $s (@{$co}[1..$#{$co}]) {
970		if ("HASH" eq ref($s) && exists($s->{"..."})) {
971			$s = $co->[0];
972		}
973	}
974}
975
976sub add_to_ctype_class {
977	my($class, @runes) = @_;
978
979	my $c = $ctype_classes{$class};
980	foreach my $r (@runes) {
981		$c->{$r} = 2 unless exists $c->{$r};
982	}
983}
984
985sub deny_in_ctype_class {
986	my($class, $deny_reason, @runes) = @_;
987
988	my $c = $ctype_classes{$class};
989	foreach my $r (@runes) {
990		next unless exists $c->{$r};
991		$deny_reason =~ s/^(\S+)$/can't belong in class $class and in class $1 at the same time/;
992		&exit(4, &hexchars($r) . " " . $deny_reason . "\n");
993	}
994}
995
996# write_lc_{money,time,messages} all use the existing Libc format, which
997# is raw text with each record terminated by a newline, and records
998# in a predetermined order.
999
1000sub write_lc_money {
1001	my $F = (new IO::File "$locale_dir/LC_MONETARY", O_TRUNC|O_WRONLY|O_CREAT, 0666) || &exit(4, "$0 can't create $locale_dir/LC_MONETARY: $!");
1002	foreach my $s (qw(int_curr_symbol currency_symbol mon_decimal_point mon_thousands_sep mon_grouping positive_sign negative_sign int_frac_digits frac_digits p_cs_precedes p_sep_by_space n_cs_precedes n_sep_by_space p_sign_posn n_sign_posn int_p_cs_precedes int_n_cs_precedes int_p_sep_by_space int_n_sep_by_space int_p_sign_posn int_n_sign_posn)) {
1003		if (exists $monetary{$s}) {
1004			my $v = $monetary{$s};
1005			if ("ARRAY" eq ref $v) {
1006				$F->print(join(";", @$v), "\n");
1007			} else {
1008				$F->print("$v\n");
1009			}
1010		} else {
1011			if ($s =~ m/^(int_curr_symbol|currency_symbol|mon_decimal_point|mon_thousands_sep|positive_sign|negative_sign)$/) {
1012				$F->print("\n");
1013			} else {
1014				$F->print("-1\n");
1015			}
1016		}
1017	}
1018}
1019
1020sub write_lc_time {
1021	my $F = (new IO::File "$locale_dir/LC_TIME", O_TRUNC|O_WRONLY|O_CREAT, 0666) || &exit(4, "$0 can't create $locale_dir/LC_TIME: $!");
1022	my %array_cnt = (abmon => 12, mon => 12, abday => 7, day => 7, alt_month => 12, am_pm => 2);
1023
1024	$time{"md_order"} = "md" unless defined $time{"md_order"};
1025
1026	foreach my $s (qw(abmon mon abday day t_fmt d_fmt d_t_fmt am_pm d_t_fmt mon md_order t_fmt_ampm)) {
1027		my $cnt = $array_cnt{$s};
1028		my $v = $time{$s};
1029
1030		if (defined $v) {
1031			if (defined $cnt) {
1032				my @a = @{$v};
1033				&exit(4, "$0: $s has " . (0 + @a)
1034				  . " elements, it needs to have exactly $cnt\n")
1035				  unless (@a == $cnt);
1036				$F->print(join("\n", @a), "\n");
1037			} else {
1038				$F->print("$v\n");
1039			}
1040		} else {
1041			$cnt = 1 if !defined $cnt;
1042			$F->print("\n" x $cnt);
1043		}
1044	}
1045}
1046
1047sub write_lc_messages {
1048	mkdir("$locale_dir/LC_MESSAGES");
1049	my $F = (new IO::File "$locale_dir/LC_MESSAGES/LC_MESSAGES", O_TRUNC|O_WRONLY|O_CREAT, 0666) || &exit(4, "$0 can't create $locale_dir/LC_MESSAGES/LC_MESSAGES: $!");
1050
1051	foreach my $s (qw(yesexpr noexpr yesstr nostr)) {
1052		my $v = $messages{$s};
1053
1054		if (defined $v) {
1055			$F->print("$v\n");
1056		} else {
1057			$F->print("\n");
1058		}
1059	}
1060}
1061
1062sub write_lc_numeric {
1063	my $F = (new IO::File "$locale_dir/LC_NUMERIC", O_TRUNC|O_WRONLY|O_CREAT, 0666) || &exit(4, "$0 can't create $locale_dir/LC_NUMERIC: $!");
1064
1065	foreach my $s (qw(decimal_point thousands_sep grouping)) {
1066		if (exists $numeric{$s}) {
1067			my $v = $numeric{$s};
1068			if ("ARRAY" eq ref $v) {
1069				$F->print(join(";", @$v), "\n");
1070			} else {
1071				$F->print("$v\n");
1072			}
1073		} else {
1074			$F->print("\n");
1075		}
1076	}
1077}
1078
1079sub bylenval {
1080	return 0 if ("ARRAY" ne ref $a || "ARRAY" ne ref $b);
1081
1082	my($aval, $af) = &unsym_with_check($a->[0]);
1083	$aval = $cele{$a->[0]} unless $af;
1084	my($bval, $bf) = &unsym_with_check($b->[0]);
1085	$bval = $cele{$b->[0]} unless $bf;
1086
1087	my $r = length($aval) - length($bval);
1088	return $r if $r;
1089	return $aval cmp $bval;
1090}
1091
1092sub write_lc_collate {
1093    return unless @colldef;
1094
1095    # colldef doesn't parse the whole glory of SuSv3 charmaps, and we
1096    # already have, so we cna spit out a simplifyed one; unfortunitly
1097    # it doesn't like "/dev/fd/N" so we need a named tmp file
1098    my($CMAP, $cmapname) = tempfile(DIR => "/tmp");
1099    foreach my $s (keys(%sym)) {
1100	$CMAP->print("<$s>\t", sprintf "\\x%02x\n", ord($sym{$s}));
1101    }
1102    $CMAP->flush();
1103    unshift(@colldef, qq{charmap $cmapname});
1104    unshift(@colldef, "LC_COLLATE");
1105    $colldef[$#colldef] = "END LC_COLLATE";
1106
1107    # Can't just use /dev/stdin, colldef appears to use seek,
1108    # and even seems to need a named temp file (re-open?)
1109    my($COL, $colname) = tempfile(DIR => "/tmp");
1110    $COL->print(join("\n", @colldef), "\n");
1111    $COL->flush();
1112
1113    my $rc = system(
1114      "/usr/bin/colldef -o $locale_dir/LC_COLLATE $colname");
1115    unlink $colname, $cmapname;
1116    if ($rc) {
1117	&exit(1, "Bad return from colldef $rc");
1118    }
1119}
1120
1121# Pack an int of unknown size into a series of bytes, each of which
1122# contains 7 bits of data, and the top bit is clear on the last
1123# byte of data.  Also works on arrays -- does not encode the size of
1124# the array.  This format is great for data that tends to have fewer
1125# then 21 bits.
1126sub pack_p_int {
1127	if (@_ > 1) {
1128		my $ret = "";
1129		foreach my $v (@_) {
1130			$ret .= &pack_p_int($v);
1131		}
1132
1133		return $ret;
1134	}
1135
1136	my $v = $_[0];
1137	my $b;
1138
1139	&exit(4, "pack_p_int only works on positive values") if ($v < 0);
1140	if ($v < 128) {
1141		$b = chr($v);
1142	} else {
1143		$b = chr(($v & 0x7f) | 0x80);
1144		$b .= pack_p_int($v >> 7);
1145	}
1146	return $b;
1147}
1148
1149sub strip_angles {
1150	my $s = $_[0];
1151	$s =~ s/^<(.*)>$/$1/;
1152	return $s;
1153}
1154
1155# For localedef
1156#  xc=0 "no warnings, locale defined"
1157#  xc=1 "warnings, locale defined"
1158#  xc=2 "implmentation limits or unsupported charactor sets, no locale defined"
1159#  xc=3 "can't create new locales"
1160#  xc=4+ "wornings or errors, no locale defined"
1161sub exit {
1162    my($xc, $message) = @_;
1163
1164    print STDERR $message;
1165    exit $xc;
1166}
1167