1#!/usr/bin/env perl 2# $OpenBSD: gen_ctype_utf8.pl,v 1.8 2023/02/16 01:06:01 afresh1 Exp $ # 3use 5.022; 4use warnings; 5 6# Copyright (c) 2015 Andrew Fresh <afresh1@openbsd.org> 7# 8# Permission to use, copy, modify, and distribute this software for any 9# purpose with or without fee is hereby granted, provided that the above 10# copyright notice and this permission notice appear in all copies. 11# 12# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 13# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 14# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 15# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 16# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 17# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 18# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 19 20use Unicode::UCD v0.610 qw( charinfo charprop prop_invmap ); 21 22my @lists = qw( 23 ALPHA 24 CONTROL 25 DIGIT 26 GRAPH 27 LOWER 28 PUNCT 29 SPACE 30 UPPER 31 XDIGIT 32 BLANK 33 PRINT 34 SPECIAL 35 PHONOGRAM 36 37 SWIDTH0 38 SWIDTH1 39 SWIDTH2 40); 41 42my @maps = qw( 43 MAPUPPER 44 MAPLOWER 45 TODIGIT 46); 47 48my ( $blocks_ranges_ref, $blocks_maps_ref ) = prop_invmap("Block"); 49 50print "/*\t\$" . 'OpenBSD' . "\$\t*/\n"; 51print <<'EOL'; 52 53/* 54 * COPYRIGHT AND PERMISSION NOTICE 55 * 56 * Copyright (c) 1991-2021 Unicode, Inc. All rights reserved. 57 * Distributed under the Terms of Use in 58 * https://www.unicode.org/copyright.html. 59 * 60 * Permission is hereby granted, free of charge, to any person obtaining 61 * a copy of the Unicode data files and any associated documentation 62 * (the "Data Files") or Unicode software and any associated documentation 63 * (the "Software") to deal in the Data Files or Software 64 * without restriction, including without limitation the rights to use, 65 * copy, modify, merge, publish, distribute, and/or sell copies of 66 * the Data Files or Software, and to permit persons to whom the Data Files 67 * or Software are furnished to do so, provided that either 68 * (a) this copyright and permission notice appear with all copies 69 * of the Data Files or Software, or 70 * (b) this copyright and permission notice appear in associated 71 * Documentation. 72 * 73 * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF 74 * ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 75 * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 76 * NONINFRINGEMENT OF THIRD PARTY RIGHTS. 77 * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS 78 * NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL 79 * DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, 80 * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER 81 * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 82 * PERFORMANCE OF THE DATA FILES OR SOFTWARE. 83 * 84 * Except as contained in this notice, the name of a copyright holder 85 * shall not be used in advertising or otherwise to promote the sale, 86 * use or other dealings in these Data Files or Software without prior 87 * written authorization of the copyright holder. 88 */ 89 90ENCODING "UTF8" 91VARIABLE CODESET=UTF-8 92 93EOL 94 95print "/* Unicode Version " . Unicode::UCD::UnicodeVersion() . " */\n"; 96 97for my $i ( 0 .. $#{ $blocks_ranges_ref } ) { 98 my $start = $blocks_ranges_ref->[ $i ]; 99 my $end = ( $blocks_ranges_ref->[ $i + 1 ] || 0 ) - 1; 100 101 my $descr = sprintf "U+%04X - U+%04X : %s", 102 $start, $end, $blocks_maps_ref->[$i]; 103 104 warn "$descr\n"; 105 print "\n/*\n * $descr\n */\n\n"; 106 107 last if $end == -1; 108 next if $blocks_maps_ref->[$i] eq 'No_Block'; 109 110 my %info; 111 categorize( $_, \%info ) for $start .. $end; 112 print_info(%info); 113} 114 115# http://www.unicode.org/reports/tr44/tr44-16.html#General_Category_Values 116# Table 12. General_Category Values 117# 118# Abbr Long Description 119# Lu Uppercase_Letter an uppercase letter 120# Ll Lowercase_Letter a lowercase letter 121# Lt Titlecase_Letter a digraphic character, with first part uppercase 122# LC Cased_Letter Lu | Ll | Lt 123# Lm Modifier_Letter a modifier letter 124# Lo Other_Letter other letters, including syllables and ideographs 125# L Letter Lu | Ll | Lt | Lm | Lo 126# Mn Nonspacing_Mark a nonspacing combining mark (zero advance width) 127# Mc Spacing_Mark a spacing combining mark (positive advance width) 128# Me Enclosing_Mark an enclosing combining mark 129# M Mark Mn | Mc | Me 130# Nd Decimal_Number a decimal digit 131# Nl Letter_Number a letterlike numeric character 132# No Other_Number a numeric character of other type 133# N Number Nd | Nl | No 134# Pc Connector_Punctuation a connecting punctuation mark, like a tie 135# Pd Dash_Punctuation a dash or hyphen punctuation mark 136# Ps Open_Punctuation an opening punctuation mark (of a pair) 137# Pe Close_Punctuation a closing punctuation mark (of a pair) 138# Pi Initial_Punctuation an initial quotation mark 139# Pf Final_Punctuation a final quotation mark 140# Po Other_Punctuation a punctuation mark of other type 141# P Punctuation Pc | Pd | Ps | Pe | Pi | Pf | Po 142# Sm Math_Symbol a symbol of mathematical use 143# Sc Currency_Symbol a currency sign 144# Sk Modifier_Symbol a non-letterlike modifier symbol 145# So Other_Symbol a symbol of other type 146# S Symbol Sm | Sc | Sk | So 147# Zs Space_Separator a space character (of various non-zero widths) 148# Zl Line_Separator U+2028 LINE SEPARATOR only 149# Zp Paragraph_Separator U+2029 PARAGRAPH SEPARATOR only 150# Z Separator Zs | Zl | Zp 151# Cc Control a C0 or C1 control code 152# Cf Format a format control character 153# Cs Surrogate a surrogate code point 154# Co Private_Use a private-use character 155# Cn Unassigned a reserved unassigned code point or a noncharacter 156# C Other Cc | Cf | Cs | Co | Cn 157 158sub categorize 159{ 160 my ( $code, $info ) = @_; 161 162 # http://www.unicode.org/L2/L2003/03139-posix-classes.htm 163 my $charinfo = charinfo($code); 164 return unless $charinfo; 165 my $general_category = $charinfo->{category}; 166 my $gc = substr $general_category, 0, 1; 167 168 my $is_upper = $general_category eq 'Lu'; 169 my $is_lower = $general_category eq 'Ll'; 170 my $is_space = charprop( $code, 'Sentence_Break' ) eq 'Sp'; 171 172 my $is_print; 173 my $matched; 174 if ( $general_category eq 'Nd' ) { 175 push @{ $info->{DIGIT} }, $code; 176 $is_print = 1; 177 $matched = 1; 178 } elsif ( $gc eq 'P' or $gc eq 'S' ) { 179 push @{ $info->{PUNCT} }, $code; 180 $is_print = 1; 181 $matched = 1; 182 } elsif ( charprop( $code, 'White_Space' ) eq 'Yes' ) { 183 push @{ $info->{SPACE} }, $code; 184 $is_print = 1 if charprop( $code, 'Grapheme_Base' ) eq 'Yes'; 185 $matched = 1; 186 } elsif ( charprop( $code, 'Alphabetic' ) eq 'Yes' ) { 187 push @{ $info->{ALPHA} }, $code 188 if charprop( $code, 'Numeric_Type' ) eq 'None'; 189 push @{ $info->{LOWER} }, $code if $is_lower; 190 push @{ $info->{UPPER} }, $code if $is_upper; 191 push @{ $info->{PHONOGRAM} }, $code 192 if $charinfo->{name} =~ /SYLLABLE/ 193 or $charinfo->{block} =~ /Syllable/i; 194 $is_print = 1; 195 $matched = 1; 196 } 197 198 if ( $general_category eq 'Cc' 199 or charprop( $code, 'Grapheme_Cluster_Break' ) eq 'Control' ) 200 { 201 push @{ $info->{CONTROL} }, $code; 202 $matched = 1; 203 } 204 205 push @{ $info->{BLANK} }, $code if $is_space; 206 207 if ( 208 not( 209 $is_space or $general_category eq 'Cc', 210 or $general_category eq 'Ss', 211 or $general_category eq 'Cn', 212 ) 213 ) 214 { 215 push @{ $info->{GRAPH} }, $code; 216 push @{ $info->{SPECIAL} }, $code unless $matched; 217 $is_print = 1; 218 } 219 push @{ $info->{PRINT} }, $code if $is_print; 220 221 if ( charprop( $code, 'Hex_Digit' ) eq 'Yes' ) { 222 push @{ $info->{XDIGIT} }, $code; 223 $info->{TODIGIT}{$code} = hex chr $code 224 if charprop( $code, 'ASCII_Hex_Digit' ) eq 'Yes'; 225 $matched = 1; 226 } 227 228 if ($is_lower) { 229 my $mapping = ord charprop( $code, 'Simple_Uppercase_Mapping' ); 230 $info->{MAPUPPER}{$code} = $mapping if $mapping != $code; 231 } 232 233 if ($is_upper) { 234 my $mapping = ord charprop( $code, 'Simple_Lowercase_Mapping' ); 235 $info->{MAPLOWER}{$code} = $mapping if $mapping != $code; 236 } 237 238 { 239 my $mapping = charprop( $code, 'Numeric_Value' ); 240 $info->{TODIGIT}{$code} = $mapping 241 if $mapping =~ /^[0-9]+$/ and chr($code) ne $mapping; 242 } 243 244 if ($is_print) { 245 my $columns = codepoint_columns( $code, $charinfo ); 246 push @{ $info->{"SWIDTH$columns"} }, $code if defined $columns; 247 } 248} 249 250sub print_info 251{ 252 my (%info) = @_; 253 254 my $printed = 0; 255 256 foreach my $list (@lists) { 257 next unless $info{$list}; 258 $printed = 1; 259 print_list( $list => $info{$list} ); 260 } 261 262 print "\n" if $printed; 263 264 foreach my $map (@maps) { 265 next unless $info{$map}; 266 print_map( $map => $info{$map} ); 267 } 268} 269 270sub print_list 271{ 272 my ( $list, $points ) = @_; 273 274 my @squished = reverse @{ squish_points($points) }; 275 my $line = sprintf "%-10s%s", $list, pop @squished; 276 277 while (@squished) { 278 my $item = pop @squished; 279 280 if ( length("$line $item") > 80 ) { 281 say $line; 282 $line = sprintf "%-10s%s", $list, $item; 283 } else { 284 $line .= " $item"; # two leading spaces on purpose 285 } 286 } 287 288 say $line; 289} 290 291sub print_map 292{ 293 my ( $map, $points ) = @_; 294 my $single = '< %s %s >'; 295 my $range = '< %s : %s >'; 296 297 my %map; 298 299 my $adjustment; 300 my $last_diff = 0; 301 my $first_point; 302 my $prev_point; 303 foreach my $point ( sort { $a <=> $b } keys %{$points} ) { 304 my $diff = $point - $points->{$point}; 305 306 if ( $diff != $last_diff 307 or 308 ( defined $prev_point and $point - 1 != $prev_point ) ) 309 { 310 $first_point = undef; 311 $adjustment = undef; 312 $last_diff = undef; 313 } 314 315 $first_point //= $point; 316 $adjustment //= $points->{$point}; 317 $last_diff //= $diff; 318 319 $prev_point = $point; 320 321 push @{ $map{$first_point}{$adjustment} }, $point; 322 } 323 324 my @ranges; 325 326 foreach my $point ( keys %map ) { 327 foreach my $adjustment ( keys %{ $map{$point} } ) { 328 my $adj = 329 $map eq 'TODIGIT' 330 ? ( $adjustment || '0x0000' ) 331 : format_point($adjustment); 332 foreach ( 333 @{ squish_points( $map{$point}{$adjustment} ) } 334 ) 335 { 336 my $format = / - / ? $range : $single; 337 my $formatted = sprintf $format, $_, $adj; 338 push @ranges, $formatted; 339 } 340 } 341 } 342 343 printf "%-10s%s\n", $map, $_ for sort @ranges; 344} 345 346sub squish_points 347{ 348 my ($points) = @_; 349 my @squished; 350 351 my $start; 352 my $last_point = 0; 353 354 foreach my $i ( 0 .. $#{$points} + 1 ) { 355 356 my $point = $points->[$i]; 357 358 if ( defined $point and $point - 1 == $last_point ) { 359 $last_point = $point; 360 next; 361 } 362 363 if ( defined $start ) { 364 if ( $start == $i - 1 ) { 365 push @squished, 366 format_point( $points->[$start] ); 367 } 368 369 # TODO: This is nice, but breaks print_map 370 #elsif ( $start == $i - 2 ) { 371 # push @squished, format_point( $points->[$start] ), 372 # format_point( $points->[ $i - 1 ] ); 373 #} 374 else { 375 push @squished, join ' - ', 376 format_point( $points->[$start] ), 377 format_point( $points->[ $i - 1 ] ); 378 } 379 } 380 381 $start = $i; 382 $last_point = $point; 383 } 384 385 return \@squished; 386} 387 388sub format_point 389{ 390 my ($point) = @_; 391 state %make_chr; 392 %make_chr = map { $_ => 1 } ( 0 .. 9, 'a' .. 'z', 'A' .. 'Z' ) 393 unless %make_chr; 394 395 my $chr = chr $point; 396 return "'$chr'" if $make_chr{$chr}; 397 return sprintf "0x%04x", $point; 398} 399 400sub codepoint_columns 401{ 402 my ( $code, $charinfo ) = @_; 403 return undef unless defined $code; 404 405 # Private use areas are _most likely_ used by one column glyphs 406 return 1 if $charinfo->{category} eq 'Co'; 407 408 return 0 if $charinfo->{category} eq 'Mn'; 409 return 0 if $charinfo->{category} eq 'Me'; 410 return 0 if index( $charinfo->{category}, 'C' ) == 0; 411 412 return 2 if $charinfo->{block} eq 'Hangul Jamo'; 413 return 2 if $charinfo->{block} eq 'Hangul Jamo Extended-B'; 414 415 { 416 my $eaw = charprop( $code, 'East_Asian_Width' ); 417 return 2 if $eaw eq 'Wide' or $eaw eq 'Fullwidth'; 418 } 419 420 return 1; 421} 422 423__END__ 424=head1 NAME 425 426gen_ctype_utf8.pl - rebuild src/share/locale/ctype/en_US.UTF-8.src 427 428=head1 SYNOPSIS 429 430gen_ctype_utf8.pl > en_US.UTF-8.src 431 432=head1 DESCRIPTION 433 434The perl community does a good job of keeping their Unicode tables up to date 435we can reuse their hard work to rebuild our tables. 436 437We don't directly use the files from the Unicode Consortium instead we use 438summary files that we generate. 439See L<mklocale(1)> for more information about these files. 440 441=head1 CAVEATS 442 443Requires perl 5.22 or newer. 444 445=head1 AUTHOR 446 447Andrew Fresh <afresh1@openbsd.org> 448