1#!/usr/bin/env perl
2#	$OpenBSD: gen_ctype_utf8.pl,v 1.8 2023/02/16 01:06:01 afresh1 Exp $	#
3use 5.022;
4use warnings;
5
6# Copyright (c) 2015 Andrew Fresh <afresh1@openbsd.org>
7#
8# Permission to use, copy, modify, and distribute this software for any
9# purpose with or without fee is hereby granted, provided that the above
10# copyright notice and this permission notice appear in all copies.
11#
12# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
13# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
14# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
15# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
16# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
17# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
18# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
19
20use Unicode::UCD v0.610 qw( charinfo charprop prop_invmap );
21
22my @lists = qw(
23    ALPHA
24    CONTROL
25    DIGIT
26    GRAPH
27    LOWER
28    PUNCT
29    SPACE
30    UPPER
31    XDIGIT
32    BLANK
33    PRINT
34    SPECIAL
35    PHONOGRAM
36
37    SWIDTH0
38    SWIDTH1
39    SWIDTH2
40);
41
42my @maps = qw(
43    MAPUPPER
44    MAPLOWER
45    TODIGIT
46);
47
48my ( $blocks_ranges_ref, $blocks_maps_ref ) = prop_invmap("Block");
49
50print "/*\t\$" . 'OpenBSD' . "\$\t*/\n";
51print <<'EOL';
52
53/*
54 * COPYRIGHT AND PERMISSION NOTICE
55 *
56 * Copyright (c) 1991-2021 Unicode, Inc. All rights reserved.
57 * Distributed under the Terms of Use in
58 * https://www.unicode.org/copyright.html.
59 *
60 * Permission is hereby granted, free of charge, to any person obtaining
61 * a copy of the Unicode data files and any associated documentation
62 * (the "Data Files") or Unicode software and any associated documentation
63 * (the "Software") to deal in the Data Files or Software
64 * without restriction, including without limitation the rights to use,
65 * copy, modify, merge, publish, distribute, and/or sell copies of
66 * the Data Files or Software, and to permit persons to whom the Data Files
67 * or Software are furnished to do so, provided that either
68 * (a) this copyright and permission notice appear with all copies
69 * of the Data Files or Software, or
70 * (b) this copyright and permission notice appear in associated
71 * Documentation.
72 *
73 * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
74 * ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
75 * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
76 * NONINFRINGEMENT OF THIRD PARTY RIGHTS.
77 * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
78 * NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
79 * DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
80 * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
81 * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
82 * PERFORMANCE OF THE DATA FILES OR SOFTWARE.
83 *
84 * Except as contained in this notice, the name of a copyright holder
85 * shall not be used in advertising or otherwise to promote the sale,
86 * use or other dealings in these Data Files or Software without prior
87 * written authorization of the copyright holder.
88 */
89
90ENCODING        "UTF8"
91VARIABLE        CODESET=UTF-8
92
93EOL
94
95print "/* Unicode Version " . Unicode::UCD::UnicodeVersion() . " */\n";
96
97for my $i ( 0 .. $#{ $blocks_ranges_ref } ) {
98	my $start = $blocks_ranges_ref->[ $i ];
99	my $end   = ( $blocks_ranges_ref->[ $i + 1 ] || 0 ) - 1;
100
101	my $descr = sprintf "U+%04X - U+%04X : %s",
102	     $start, $end, $blocks_maps_ref->[$i];
103
104	warn "$descr\n";
105	print "\n/*\n * $descr\n */\n\n";
106
107	last if $end == -1;
108	next if $blocks_maps_ref->[$i] eq 'No_Block';
109
110	my %info;
111	categorize( $_, \%info ) for $start .. $end;
112	print_info(%info);
113}
114
115# http://www.unicode.org/reports/tr44/tr44-16.html#General_Category_Values
116# Table 12. General_Category Values
117#
118# Abbr         Long                             Description
119# Lu   Uppercase_Letter      an uppercase letter
120# Ll   Lowercase_Letter      a lowercase letter
121# Lt   Titlecase_Letter      a digraphic character, with first part uppercase
122# LC   Cased_Letter          Lu | Ll | Lt
123# Lm   Modifier_Letter       a modifier letter
124# Lo   Other_Letter          other letters, including syllables and ideographs
125# L    Letter                Lu | Ll | Lt | Lm | Lo
126# Mn   Nonspacing_Mark       a nonspacing combining mark (zero advance width)
127# Mc   Spacing_Mark          a spacing combining mark (positive advance width)
128# Me   Enclosing_Mark        an enclosing combining mark
129# M    Mark                  Mn | Mc | Me
130# Nd   Decimal_Number        a decimal digit
131# Nl   Letter_Number         a letterlike numeric character
132# No   Other_Number          a numeric character of other type
133# N    Number                Nd | Nl | No
134# Pc   Connector_Punctuation a connecting punctuation mark, like a tie
135# Pd   Dash_Punctuation      a dash or hyphen punctuation mark
136# Ps   Open_Punctuation      an opening punctuation mark (of a pair)
137# Pe   Close_Punctuation     a closing punctuation mark (of a pair)
138# Pi   Initial_Punctuation   an initial quotation mark
139# Pf   Final_Punctuation     a final quotation mark
140# Po   Other_Punctuation     a punctuation mark of other type
141# P    Punctuation           Pc | Pd | Ps | Pe | Pi | Pf | Po
142# Sm   Math_Symbol           a symbol of mathematical use
143# Sc   Currency_Symbol       a currency sign
144# Sk   Modifier_Symbol       a non-letterlike modifier symbol
145# So   Other_Symbol          a symbol of other type
146# S    Symbol                Sm | Sc | Sk | So
147# Zs   Space_Separator       a space character (of various non-zero widths)
148# Zl   Line_Separator        U+2028 LINE SEPARATOR only
149# Zp   Paragraph_Separator   U+2029 PARAGRAPH SEPARATOR only
150# Z    Separator             Zs | Zl | Zp
151# Cc   Control               a C0 or C1 control code
152# Cf   Format                a format control character
153# Cs   Surrogate             a surrogate code point
154# Co   Private_Use           a private-use character
155# Cn   Unassigned            a reserved unassigned code point or a noncharacter
156# C    Other                 Cc | Cf | Cs | Co | Cn
157
158sub categorize
159{
160	my ( $code, $info ) = @_;
161
162	# http://www.unicode.org/L2/L2003/03139-posix-classes.htm
163	my $charinfo = charinfo($code);
164	return unless $charinfo;
165	my $general_category = $charinfo->{category};
166	my $gc = substr $general_category, 0, 1;
167
168	my $is_upper = $general_category eq 'Lu';
169	my $is_lower = $general_category eq 'Ll';
170	my $is_space = charprop( $code, 'Sentence_Break' ) eq 'Sp';
171
172	my $is_print;
173	my $matched;
174	if ( $general_category eq 'Nd' ) {
175		push @{ $info->{DIGIT} }, $code;
176		$is_print = 1;
177		$matched  = 1;
178	} elsif ( $gc eq 'P' or $gc eq 'S' ) {
179		push @{ $info->{PUNCT} }, $code;
180		$is_print = 1;
181		$matched  = 1;
182	} elsif ( charprop( $code, 'White_Space' ) eq 'Yes' ) {
183		push @{ $info->{SPACE} }, $code;
184		$is_print = 1 if charprop( $code, 'Grapheme_Base' ) eq 'Yes';
185		$matched = 1;
186	} elsif ( charprop( $code, 'Alphabetic' ) eq 'Yes' ) {
187		push @{ $info->{ALPHA} }, $code
188		    if charprop( $code, 'Numeric_Type' ) eq 'None';
189		push @{ $info->{LOWER} }, $code if $is_lower;
190		push @{ $info->{UPPER} }, $code if $is_upper;
191		push @{ $info->{PHONOGRAM} }, $code
192		    if $charinfo->{name} =~ /SYLLABLE/
193		    or $charinfo->{block} =~ /Syllable/i;
194		$is_print = 1;
195		$matched  = 1;
196	}
197
198	if ( $general_category eq 'Cc'
199		or charprop( $code, 'Grapheme_Cluster_Break' ) eq 'Control' )
200	{
201		push @{ $info->{CONTROL} }, $code;
202		$matched = 1;
203	}
204
205	push @{ $info->{BLANK} }, $code if $is_space;
206
207	if (
208		not(
209			   $is_space or $general_category eq 'Cc',
210			or $general_category eq 'Ss',
211			or $general_category eq 'Cn',
212		)
213	    )
214	{
215		push @{ $info->{GRAPH} }, $code;
216		push @{ $info->{SPECIAL} }, $code unless $matched;
217		$is_print = 1;
218	}
219	push @{ $info->{PRINT} }, $code if $is_print;
220
221	if ( charprop( $code, 'Hex_Digit' ) eq 'Yes' ) {
222		push @{ $info->{XDIGIT} }, $code;
223		$info->{TODIGIT}{$code} = hex chr $code
224		    if charprop( $code, 'ASCII_Hex_Digit' ) eq 'Yes';
225		$matched = 1;
226	}
227
228	if ($is_lower) {
229		my $mapping = ord charprop( $code, 'Simple_Uppercase_Mapping' );
230		$info->{MAPUPPER}{$code} = $mapping if $mapping != $code;
231	}
232
233	if ($is_upper) {
234		my $mapping = ord charprop( $code, 'Simple_Lowercase_Mapping' );
235		$info->{MAPLOWER}{$code} = $mapping if $mapping != $code;
236	}
237
238	{
239		my $mapping = charprop( $code, 'Numeric_Value' );
240		$info->{TODIGIT}{$code} = $mapping
241		    if $mapping =~ /^[0-9]+$/ and chr($code) ne $mapping;
242	}
243
244	if ($is_print) {
245		my $columns = codepoint_columns( $code, $charinfo );
246		push @{ $info->{"SWIDTH$columns"} }, $code if defined $columns;
247	}
248}
249
250sub print_info
251{
252	my (%info) = @_;
253
254	my $printed = 0;
255
256	foreach my $list (@lists) {
257		next unless $info{$list};
258		$printed = 1;
259		print_list( $list => $info{$list} );
260	}
261
262	print "\n" if $printed;
263
264	foreach my $map (@maps) {
265		next unless $info{$map};
266		print_map( $map => $info{$map} );
267	}
268}
269
270sub print_list
271{
272	my ( $list, $points ) = @_;
273
274	my @squished = reverse @{ squish_points($points) };
275	my $line = sprintf "%-10s%s", $list, pop @squished;
276
277	while (@squished) {
278		my $item = pop @squished;
279
280		if ( length("$line  $item") > 80 ) {
281			say $line;
282			$line = sprintf "%-10s%s", $list, $item;
283		} else {
284			$line .= "  $item";    # two leading spaces on purpose
285		}
286	}
287
288	say $line;
289}
290
291sub print_map
292{
293	my ( $map, $points ) = @_;
294	my $single = '< %s %s >';
295	my $range  = '< %s : %s >';
296
297	my %map;
298
299	my $adjustment;
300	my $last_diff = 0;
301	my $first_point;
302	my $prev_point;
303	foreach my $point ( sort { $a <=> $b } keys %{$points} ) {
304		my $diff = $point - $points->{$point};
305
306		if ( $diff != $last_diff
307			or
308			( defined $prev_point and $point - 1 != $prev_point ) )
309		{
310			$first_point = undef;
311			$adjustment  = undef;
312			$last_diff   = undef;
313		}
314
315		$first_point //= $point;
316		$adjustment  //= $points->{$point};
317		$last_diff   //= $diff;
318
319		$prev_point = $point;
320
321		push @{ $map{$first_point}{$adjustment} }, $point;
322	}
323
324	my @ranges;
325
326	foreach my $point ( keys %map ) {
327		foreach my $adjustment ( keys %{ $map{$point} } ) {
328			my $adj =
329			    $map eq 'TODIGIT'
330			    ? ( $adjustment || '0x0000' )
331			    : format_point($adjustment);
332			foreach (
333				@{ squish_points( $map{$point}{$adjustment} ) }
334			    )
335			{
336				my $format = / - / ? $range : $single;
337				my $formatted = sprintf $format, $_, $adj;
338				push @ranges, $formatted;
339			}
340		}
341	}
342
343	printf "%-10s%s\n", $map, $_ for sort @ranges;
344}
345
346sub squish_points
347{
348	my ($points) = @_;
349	my @squished;
350
351	my $start;
352	my $last_point = 0;
353
354	foreach my $i ( 0 .. $#{$points} + 1 ) {
355
356		my $point = $points->[$i];
357
358		if ( defined $point and $point - 1 == $last_point ) {
359			$last_point = $point;
360			next;
361		}
362
363		if ( defined $start ) {
364			if ( $start == $i - 1 ) {
365				push @squished,
366				    format_point( $points->[$start] );
367			}
368
369			# TODO: This is nice, but breaks print_map
370			#elsif ( $start == $i - 2 ) {
371			#    push @squished, format_point( $points->[$start] ),
372			#        format_point( $points->[ $i - 1 ] );
373			#}
374			else {
375				push @squished, join ' - ',
376				    format_point( $points->[$start] ),
377				    format_point( $points->[ $i - 1 ] );
378			}
379		}
380
381		$start      = $i;
382		$last_point = $point;
383	}
384
385	return \@squished;
386}
387
388sub format_point
389{
390	my ($point) = @_;
391	state %make_chr;
392	%make_chr = map { $_ => 1 } ( 0 .. 9, 'a' .. 'z', 'A' .. 'Z' )
393	    unless %make_chr;
394
395	my $chr = chr $point;
396	return "'$chr'" if $make_chr{$chr};
397	return sprintf "0x%04x", $point;
398}
399
400sub codepoint_columns
401{
402	my ( $code, $charinfo ) = @_;
403	return undef unless defined $code;
404
405	# Private use areas are _most likely_ used by one column glyphs
406	return 1 if $charinfo->{category} eq 'Co';
407
408	return 0 if $charinfo->{category} eq 'Mn';
409	return 0 if $charinfo->{category} eq 'Me';
410	return 0 if index( $charinfo->{category}, 'C' ) == 0;
411
412	return 2 if $charinfo->{block} eq 'Hangul Jamo';
413	return 2 if $charinfo->{block} eq 'Hangul Jamo Extended-B';
414
415	{
416		my $eaw = charprop( $code, 'East_Asian_Width' );
417		return 2 if $eaw eq 'Wide' or $eaw eq 'Fullwidth';
418	}
419
420	return 1;
421}
422
423__END__
424=head1 NAME
425
426gen_ctype_utf8.pl - rebuild  src/share/locale/ctype/en_US.UTF-8.src
427
428=head1 SYNOPSIS
429
430gen_ctype_utf8.pl > en_US.UTF-8.src
431
432=head1 DESCRIPTION
433
434The perl community does a good job of keeping their Unicode tables up to date
435we can reuse their hard work to rebuild our tables.
436
437We don't directly use the files from the Unicode Consortium instead we use
438summary files that we generate.
439See L<mklocale(1)> for more information about these files.
440
441=head1 CAVEATS
442
443Requires perl 5.22 or newer.
444
445=head1 AUTHOR
446
447Andrew Fresh <afresh1@openbsd.org>
448